From ea3f04195ba4a5034c9c8e9b726d4f7ce96f1832 Mon Sep 17 00:00:00 2001 From: Apple Date: Fri, 1 May 2020 21:28:03 +0000 Subject: [PATCH] xnu-6153.101.6.tar.gz --- bsd/bsm/audit_kevents.h | 2 +- bsd/dev/dtrace/dtrace.c | 8 +- bsd/kern/kern_backtrace.c | 4 +- bsd/kern/kern_control.c | 54 +- bsd/kern/kern_core.c | 17 - bsd/kern/kern_descrip.c | 4 +- bsd/kern/kern_exec.c | 50 +- bsd/kern/kern_malloc.c | 13 +- bsd/kern/kern_memorystatus_freeze.c | 295 +- bsd/kern/kern_mib.c | 2 - bsd/kern/kern_mman.c | 14 +- bsd/kern/kern_overrides.c | 10 +- bsd/kern/kern_proc.c | 79 +- bsd/kern/kern_sysctl.c | 81 +- bsd/kern/policy_check.c | 4 +- bsd/kern/subr_log.c | 12 +- bsd/kern/subr_prf.c | 3 +- bsd/kern/syscalls.master | 9 +- bsd/kern/trace_codes | 1 + bsd/kern/uipc_socket.c | 46 +- bsd/man/man2/getattrlist.2 | 37 + bsd/man/man4/random.4 | 4 +- bsd/miscfs/devfs/devfs_vfsops.c | 4 +- bsd/miscfs/devfs/devfs_vnops.c | 2 +- bsd/miscfs/routefs/routefs_ops.c | 4 +- bsd/net/content_filter.c | 493 ++- bsd/net/content_filter.h | 30 + bsd/net/dlil.c | 6 +- bsd/net/if_bridge.c | 2342 ++++++++++- bsd/net/if_bridgevar.h | 84 +- bsd/net/kpi_interface.h | 268 +- bsd/net/kpi_interfacefilter.h | 13 +- bsd/net/kpi_protocol.h | 19 +- bsd/net/necp.c | 111 +- bsd/net/necp.h | 33 +- bsd/net/necp_client.c | 171 +- bsd/net/network_agent.c | 54 +- bsd/netinet/dhcp.h | 2 +- bsd/netinet/flow_divert.c | 373 +- bsd/netinet/flow_divert_proto.h | 8 +- bsd/netinet/in_pcb.c | 13 +- bsd/netinet/ip_output.c | 3 + bsd/netinet/tcp_input.c | 18 +- bsd/netinet/tcp_output.c | 68 +- bsd/netinet/tcp_timer.c | 2 +- bsd/netinet6/ip6_output.c | 3 + bsd/netinet6/nd6_prproxy.c | 2 +- bsd/netkey/key.c | 195 +- bsd/netkey/key.h | 4 +- bsd/netkey/keydb.h | 6 + bsd/nfs/krpc_subr.c | 5 + bsd/nfs/nfs.h | 2 + bsd/nfs/nfs4_subs.c | 5 + bsd/nfs/nfs4_vnops.c | 13 + bsd/nfs/nfs_bio.c | 6 + bsd/nfs/nfs_boot.c | 5 + bsd/nfs/nfs_conf.h | 45 + bsd/nfs/nfs_gss.c | 45 +- bsd/nfs/nfs_lock.c | 5 + bsd/nfs/nfs_node.c | 4 + bsd/nfs/nfs_serv.c | 8 +- bsd/nfs/nfs_socket.c | 17 +- bsd/nfs/nfs_srvcache.c | 6 +- bsd/nfs/nfs_subs.c | 45 +- bsd/nfs/nfs_syscalls.c | 72 +- bsd/nfs/nfs_upcall.c | 6 + bsd/nfs/nfs_vfsops.c | 15 +- bsd/nfs/nfs_vnops.c | 87 +- bsd/nfs/nfsm_subs.h | 6 +- bsd/nfs/nfsnode.h | 2 + bsd/sys/_types/_fd_def.h | 75 +- bsd/sys/attr.h | 4 +- bsd/sys/dtrace.h | 1 + bsd/sys/imgact.h | 2 + bsd/sys/kdebug.h | 1 + bsd/sys/kern_memorystatus_freeze.h | 4 +- bsd/sys/kpi_mbuf.h | 232 +- bsd/sys/mount_internal.h | 2 - bsd/sys/proc.h | 14 +- bsd/sys/socketvar.h | 6 + bsd/sys/spawn_internal.h | 1 + bsd/sys/stat.h | 11 + bsd/sys/vnode.h | 30 +- bsd/sys/vnode_internal.h | 1 - bsd/vfs/vfs_attrlist.c | 26 +- bsd/vfs/vfs_cache.c | 2 +- bsd/vfs/vfs_conf.c | 10 +- bsd/vfs/vfs_fsevents.c | 72 +- bsd/vfs/vfs_lookup.c | 6 +- bsd/vfs/vfs_subr.c | 34 +- bsd/vfs/vfs_syscalls.c | 74 +- bsd/vm/vnode_pager.c | 9 +- config/BSDKernel.exports | 1 + config/IOKit.exports | 5 +- config/MACFramework.exports | 1 + config/MasterVersion | 2 +- config/Private.exports | 4 + iokit/DriverKit/IOBufferMemoryDescriptor.iig | 3 +- iokit/DriverKit/IOKitKeys.h | 232 ++ iokit/DriverKit/IOMemoryDescriptor.iig | 4 +- iokit/DriverKit/IOMemoryMap.iig | 4 +- iokit/DriverKit/IOReturn.h | 12 +- iokit/DriverKit/IOService.iig | 79 +- .../IOServiceNotificationDispatchSource.iig | 131 + iokit/DriverKit/Makefile | 2 +- iokit/DriverKit/OSAction.iig | 53 +- iokit/DriverKit/OSObject.iig | 93 +- iokit/IOKit/IOCatalogue.h | 4 +- iokit/IOKit/IOKitKeys.h | 6 + iokit/IOKit/IOKitServer.h | 3 + iokit/IOKit/IORegistryEntry.h | 5 + 
iokit/IOKit/IOReturn.h | 8 + iokit/IOKit/IOService.h | 3 + iokit/IOKit/pwr_mgt/IOPM.h | 2 - iokit/IOKit/pwr_mgt/IOPMPrivate.h | 3 - iokit/IOKit/pwr_mgt/RootDomain.h | 2 - iokit/Kernel/IOCatalogue.cpp | 78 +- iokit/Kernel/IODeviceTreeSupport.cpp | 2 - iokit/Kernel/IOKitDebug.cpp | 2 +- iokit/Kernel/IOPMrootDomain.cpp | 72 +- iokit/Kernel/IORegistryEntry.cpp | 19 + iokit/Kernel/IOService.cpp | 5 + iokit/Kernel/IOServicePM.cpp | 14 - iokit/Kernel/IOUserClient.cpp | 49 +- iokit/Kernel/IOUserServer.cpp | 556 ++- iokit/conf/files | 1 + libkern/c++/OSMetaClass.cpp | 9 +- libkern/libkern/OSKextLib.h | 9 + libkern/libkern/c++/OSMetaClass.h | 2 + libsyscall/mach/mach_port.c | 15 + libsyscall/wrappers/_libc_funcptr.c | 12 + libsyscall/wrappers/_libkernel_init.h | 3 + libsyscall/wrappers/spawn/posix_spawn.c | 21 + libsyscall/wrappers/spawn/spawn.h | 2 + libsyscall/wrappers/terminate_with_reason.c | 30 + .../UserNotification/KUNCUserNotifications.c | 2 +- osfmk/arm/arm_init.c | 7 +- osfmk/arm/cswitch.s | 2 + osfmk/arm/locks.h | 2 + osfmk/arm/locks_arm.c | 353 +- osfmk/arm/machine_routines.c | 12 + osfmk/arm/machine_routines_asm.s | 2 + osfmk/arm/pcb.c | 9 + osfmk/arm/pmap.c | 59 +- osfmk/arm64/cswitch.s | 2 + osfmk/arm64/kpc.c | 38 +- osfmk/arm64/locore.s | 8 +- osfmk/arm64/machine_routines.c | 14 +- osfmk/arm64/monotonic_arm64.c | 94 +- osfmk/arm64/pcb.c | 9 + osfmk/arm64/proc_reg.h | 39 +- osfmk/arm64/start.s | 8 +- osfmk/bank/bank.c | 7 +- osfmk/conf/files | 1 + osfmk/device/device_types.h | 5 + osfmk/device/iokit_rpc.c | 19 +- osfmk/i386/AT386/model_dep.c | 25 +- osfmk/i386/cpu_data.h | 10 +- osfmk/i386/cpuid.c | 2 - osfmk/i386/cpuid.h | 2 - osfmk/i386/fpu.c | 129 +- osfmk/i386/i386_init.c | 3 +- osfmk/i386/locks.h | 11 +- osfmk/i386/locks_i386.c | 1370 ++++--- osfmk/i386/locks_i386_opt.c | 4 + osfmk/i386/machine_routines.c | 9 + osfmk/i386/machine_routines.h | 2 - osfmk/i386/pcb.c | 10 + osfmk/i386/proc_reg.h | 4 - osfmk/i386/user_ldt.c | 15 - osfmk/ipc/ipc_importance.c | 4 +- osfmk/ipc/ipc_init.c | 6 +- osfmk/ipc/ipc_kmsg.c | 3 +- osfmk/ipc/ipc_object.c | 44 + osfmk/ipc/ipc_object.h | 5 +- osfmk/ipc/ipc_port.h | 6 + osfmk/ipc/ipc_space.c | 64 + osfmk/ipc/ipc_space.h | 14 +- osfmk/ipc/ipc_types.h | 8 + osfmk/ipc/ipc_voucher.c | 27 +- osfmk/ipc/mach_debug.c | 47 +- osfmk/kern/arcade.c | 2 +- osfmk/kern/audit_sessionport.c | 4 +- osfmk/kern/backtrace.c | 17 +- osfmk/kern/backtrace.h | 15 +- osfmk/kern/block_hint.h | 1 + osfmk/kern/circle_queue.h | 18 + osfmk/kern/clock.c | 29 +- osfmk/kern/host_notify.c | 4 +- osfmk/kern/ipc_clock.c | 6 +- osfmk/kern/ipc_host.c | 10 +- osfmk/kern/ipc_kobject.c | 268 +- osfmk/kern/ipc_kobject.h | 51 +- osfmk/kern/ipc_mig.c | 44 +- osfmk/kern/ipc_mig.h | 19 +- osfmk/kern/ipc_misc.c | 4 +- osfmk/kern/ipc_sync.c | 4 +- osfmk/kern/ipc_tt.c | 18 +- osfmk/kern/kalloc.c | 1 + osfmk/kern/kern_stackshot.c | 4 + osfmk/kern/mk_timer.c | 61 +- osfmk/kern/sched_clutch.c | 240 +- osfmk/kern/sched_clutch.h | 17 +- osfmk/kern/sched_clutch.md | 4 +- osfmk/kern/startup.c | 3 + osfmk/kern/suid_cred.c | 240 ++ osfmk/kern/suid_cred.h | 51 + osfmk/kern/sysdiagnose.c | 2 +- osfmk/kern/task.c | 37 +- osfmk/kern/telemetry.c | 7 +- osfmk/kern/thread.c | 10 +- osfmk/kern/thread.h | 8 +- osfmk/kern/work_interval.c | 4 +- osfmk/kperf/callstack.c | 28 +- osfmk/mach/i386/_structs.h | 10 - osfmk/mach/i386/fp_reg.h | 2 - osfmk/mach/i386/thread_state.h | 4 - osfmk/mach/mach_port.defs | 15 + osfmk/mach/mach_types.defs | 10 + osfmk/mach/mach_types.h | 11 +- osfmk/mach/sysdiagnose_notification.defs | 5 
+ osfmk/mach/task.defs | 6 + osfmk/mach_debug/mach_debug_types.h | 2 + osfmk/vm/memory_object.c | 4 +- osfmk/vm/vm_compressor.c | 65 + osfmk/vm/vm_compressor.h | 8 +- osfmk/vm/vm_fault.c | 4 +- osfmk/vm/vm_map.c | 39 +- osfmk/vm/vm_map.h | 6 +- osfmk/vm/vm_shared_region.c | 18 +- osfmk/vm/vm_user.c | 19 +- osfmk/x86_64/monotonic_x86_64.c | 9 +- security/mac_base.c | 9 + security/mac_framework.h | 3 + security/mac_policy.h | 30 +- security/mac_vfs.c | 25 +- tests/Makefile | 22 +- tests/bpflib.c | 207 + tests/bpflib.h | 39 + tests/fcntl.c | 41 + tests/in_cksum.c | 101 + tests/in_cksum.h | 27 + tests/iokit/io_catalog_send_data.m | 136 + tests/kpc.c | 517 ++- tests/kperf.c | 1 + tests/kperf_helpers.h | 3 + ...apple.xnu.test.task_create_suid_cred.plist | 24 + tests/memorystatus_freeze_test.c | 48 +- tests/net_bridge.c | 3587 +++++++++++++++++ tests/netagent_race_infodisc_56244905.c | 198 + tests/socket_0byte_udp_poll_58140856.c | 108 + tests/stackshot_accuracy.m | 14 +- tests/stackshot_tests.m | 49 + tests/task_create_suid_cred.c | 326 ++ tests/task_create_suid_cred_entitlement.plist | 10 + tools/lldbmacros/core/kernelcore.py | 24 + tools/lldbmacros/core/operating_system.py | 26 - tools/lldbmacros/ipc.py | 12 +- tools/lldbmacros/kcdata.py | 3 + tools/lldbmacros/scheduler.py | 20 +- 260 files changed, 14712 insertions(+), 2541 deletions(-) create mode 100644 bsd/nfs/nfs_conf.h create mode 100644 iokit/DriverKit/IOKitKeys.h create mode 100644 iokit/DriverKit/IOServiceNotificationDispatchSource.iig create mode 100644 osfmk/kern/suid_cred.c create mode 100644 osfmk/kern/suid_cred.h create mode 100644 tests/bpflib.c create mode 100644 tests/bpflib.h create mode 100644 tests/fcntl.c create mode 100644 tests/in_cksum.c create mode 100644 tests/in_cksum.h create mode 100644 tests/iokit/io_catalog_send_data.m create mode 100644 tests/launchd_plists/com.apple.xnu.test.task_create_suid_cred.plist create mode 100644 tests/net_bridge.c create mode 100644 tests/netagent_race_infodisc_56244905.c create mode 100644 tests/socket_0byte_udp_poll_58140856.c create mode 100644 tests/task_create_suid_cred.c create mode 100644 tests/task_create_suid_cred_entitlement.plist diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h index 37dc16b53..a484e8528 100644 --- a/bsd/bsm/audit_kevents.h +++ b/bsd/bsm/audit_kevents.h @@ -447,7 +447,6 @@ #define AUE_PIDFORTASK 43049 /* Darwin-specific. */ #define AUE_SYSCTL_NONADMIN 43050 #define AUE_COPYFILE 43051 /* Darwin-specific. */ -#define AUE_DBGPORTFORPID 43052 /* Darwin-specific. */ /* * Events added to OpenBSM for FreeBSD and Linux; may also be used by Darwin * in the future. @@ -615,6 +614,7 @@ #define AUE_SETATTRLISTAT 43212 /* Darwin. */ #define AUE_FMOUNT 43213 /* Darwin. */ #define AUE_FSGETPATH_EXTENDED 43214 /* Darwin. */ +#define AUE_DBGPORTFORPID 43215 /* Darwin-specific. */ #define AUE_SESSION_START 44901 /* Darwin. */ #define AUE_SESSION_UPDATE 44902 /* Darwin. */ diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index a48f1e6f7..8b315b4b2 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -18660,7 +18660,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv * Range check the count. How much data can we pass around? * FIX ME! 
*/ - if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) { + if (dtmodsyms_count == 0) { cmn_err(CE_WARN, "dtmodsyms_count is not valid"); return (EINVAL); } @@ -18669,6 +18669,12 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv * Allocate a correctly sized structure and copyin the data. */ module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count); + if (module_symbols_size > (size_t)dtrace_copy_maxsize()) { + size_t dtmodsyms_max = DTRACE_MODULE_SYMBOLS_COUNT(dtrace_copy_maxsize()); + cmn_err(CE_WARN, "dtmodsyms_count %ld is too high, maximum is %ld", dtmodsyms_count, dtmodsyms_max); + return (ENOBUFS); + } + if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL) return (ENOMEM); diff --git a/bsd/kern/kern_backtrace.c b/bsd/kern/kern_backtrace.c index d5b5ca727..f51656aa1 100644 --- a/bsd/kern/kern_backtrace.c +++ b/bsd/kern/kern_backtrace.c @@ -73,8 +73,8 @@ backtrace_sysctl SYSCTL_HANDLER_ARGS return ENOBUFS; } memset(bt, 0, bt_size); - error = backtrace_user(bt, bt_len, &bt_filled, NULL, NULL); - if (error) { + bt_filled = backtrace_user(bt, bt_len, &error, NULL, NULL); + if (error != 0) { goto out; } bt_filled = min(bt_filled, bt_len); diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index 3142cbda6..5430ff820 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -102,6 +102,7 @@ struct ctl_cb { struct sockaddr_ctl sac; u_int32_t usecount; u_int32_t kcb_usecount; + u_int32_t require_clearing_count; #if DEVELOPMENT || DEBUG enum ctl_status status; #endif /* DEVELOPMENT || DEBUG */ @@ -370,24 +371,45 @@ ctl_sofreelastref(struct socket *so) } /* - * Use this function to serialize calls into the kctl subsystem + * Use this function and ctl_kcb_require_clearing to serialize + * critical calls into the kctl subsystem */ static void ctl_kcb_increment_use_count(struct ctl_cb *kcb, lck_mtx_t *mutex_held) { LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); - while (kcb->kcb_usecount > 0) { + while (kcb->require_clearing_count > 0) { + msleep(&kcb->require_clearing_count, mutex_held, PSOCK | PCATCH, "kcb_require_clearing", NULL); + } + kcb->kcb_usecount++; +} + +static void +ctl_kcb_require_clearing(struct ctl_cb *kcb, lck_mtx_t *mutex_held) +{ + assert(kcb->kcb_usecount != 0); + kcb->require_clearing_count++; + kcb->kcb_usecount--; + while (kcb->kcb_usecount > 0) { // we need to wait until no one else is running msleep(&kcb->kcb_usecount, mutex_held, PSOCK | PCATCH, "kcb_usecount", NULL); } kcb->kcb_usecount++; } static void -clt_kcb_decrement_use_count(struct ctl_cb *kcb) +ctl_kcb_done_clearing(struct ctl_cb *kcb) +{ + assert(kcb->require_clearing_count != 0); + kcb->require_clearing_count--; + wakeup((caddr_t)&kcb->require_clearing_count); +} + +static void +ctl_kcb_decrement_use_count(struct ctl_cb *kcb) { assert(kcb->kcb_usecount != 0); kcb->kcb_usecount--; - wakeup_one((caddr_t)&kcb->kcb_usecount); + wakeup((caddr_t)&kcb->kcb_usecount); } static int @@ -401,6 +423,7 @@ ctl_detach(struct socket *so) lck_mtx_t *mtx_held = socket_getlock(so, PR_F_WILLUNLOCK); ctl_kcb_increment_use_count(kcb, mtx_held); + ctl_kcb_require_clearing(kcb, mtx_held); if (kcb->kctl != NULL && kcb->kctl->bind != NULL && kcb->userdata != NULL && !(so->so_state & SS_ISCONNECTED)) { @@ -419,7 +442,8 @@ ctl_detach(struct socket *so) kcb->status = KCTL_DISCONNECTED; #endif /* DEVELOPMENT || DEBUG */ so->so_flags |= SOF_PCBCLEARING; - clt_kcb_decrement_use_count(kcb); + ctl_kcb_done_clearing(kcb); + 
ctl_kcb_decrement_use_count(kcb); return 0; } @@ -573,6 +597,7 @@ ctl_bind(struct socket *so, struct sockaddr *nam, struct proc *p) lck_mtx_t *mtx_held = socket_getlock(so, PR_F_WILLUNLOCK); ctl_kcb_increment_use_count(kcb, mtx_held); + ctl_kcb_require_clearing(kcb, mtx_held); error = ctl_setup_kctl(so, nam, p); if (error) { @@ -593,7 +618,8 @@ ctl_bind(struct socket *so, struct sockaddr *nam, struct proc *p) socket_lock(so, 0); out: - clt_kcb_decrement_use_count(kcb); + ctl_kcb_done_clearing(kcb); + ctl_kcb_decrement_use_count(kcb); return error; } @@ -609,6 +635,7 @@ ctl_connect(struct socket *so, struct sockaddr *nam, struct proc *p) lck_mtx_t *mtx_held = socket_getlock(so, PR_F_WILLUNLOCK); ctl_kcb_increment_use_count(kcb, mtx_held); + ctl_kcb_require_clearing(kcb, mtx_held); #if DEVELOPMENT || DEBUG if (kcb->status != KCTL_DISCONNECTED && ctl_panic_debug) { @@ -668,7 +695,8 @@ end: lck_mtx_unlock(ctl_mtx); } out: - clt_kcb_decrement_use_count(kcb); + ctl_kcb_done_clearing(kcb); + ctl_kcb_decrement_use_count(kcb); return error; } @@ -680,6 +708,7 @@ ctl_disconnect(struct socket *so) if ((kcb = (struct ctl_cb *)so->so_pcb)) { lck_mtx_t *mtx_held = socket_getlock(so, PR_F_WILLUNLOCK); ctl_kcb_increment_use_count(kcb, mtx_held); + ctl_kcb_require_clearing(kcb, mtx_held); struct kctl *kctl = kcb->kctl; if (kctl && kctl->disconnect) { @@ -706,7 +735,8 @@ ctl_disconnect(struct socket *so) kctlstat.kcs_gencnt++; lck_mtx_unlock(ctl_mtx); socket_lock(so, 0); - clt_kcb_decrement_use_count(kcb); + ctl_kcb_done_clearing(kcb); + ctl_kcb_decrement_use_count(kcb); } return 0; } @@ -798,7 +828,7 @@ ctl_usr_rcvd(struct socket *so, int flags) ctl_sbrcv_trim(so); out: - clt_kcb_decrement_use_count(kcb); + ctl_kcb_decrement_use_count(kcb); return error; } @@ -842,7 +872,7 @@ ctl_send(struct socket *so, int flags, struct mbuf *m, if (error != 0) { OSIncrementAtomic64((SInt64 *)&kctlstat.kcs_send_fail); } - clt_kcb_decrement_use_count(kcb); + ctl_kcb_decrement_use_count(kcb); return error; } @@ -906,7 +936,7 @@ ctl_send_list(struct socket *so, int flags, struct mbuf *m, if (error != 0) { OSIncrementAtomic64((SInt64 *)&kctlstat.kcs_send_list_fail); } - clt_kcb_decrement_use_count(kcb); + ctl_kcb_decrement_use_count(kcb); return error; } @@ -1415,7 +1445,7 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) } out: - clt_kcb_decrement_use_count(kcb); + ctl_kcb_decrement_use_count(kcb); return error; } diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index 46fbd3ee5..bbf2fcac5 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -70,11 +70,6 @@ #include #endif /* CONFIG_MACF */ -#if CONFIG_CSR -#include -#include -#endif - typedef struct { int flavor; /* the number for this flavor */ mach_msg_type_number_t count; /* count of ints in this flavor */ @@ -291,18 +286,6 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) } #endif -#if CONFIG_CSR - /* If the process is restricted, CSR isn't configured to allow - * restricted processes to be debugged, and CSR isn't configured in - * AppleInternal mode, then don't dump core. 
*/ - if (cs_restricted(core_proc) && - csr_check(CSR_ALLOW_TASK_FOR_PID) && - csr_check(CSR_ALLOW_APPLE_INTERNAL)) { - error = EPERM; - goto out2; - } -#endif - if (IS_64BIT_PROCESS(core_proc)) { is_64 = 1; mach_header_sz = sizeof(struct mach_header_64); diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 320c27b2c..8e7a7db74 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -1972,7 +1972,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) kernel_blob_size = CAST_DOWN(vm_size_t, fs.fs_blob_size); kr = ubc_cs_blob_allocate(&kernel_blob_addr, &kernel_blob_size); - if (kr != KERN_SUCCESS) { + if (kr != KERN_SUCCESS || kernel_blob_size < fs.fs_blob_size) { error = ENOMEM; vnode_put(vp); goto outdrop; @@ -1981,7 +1981,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) if (uap->cmd == F_ADDSIGS) { error = copyin(fs.fs_blob_start, (void *) kernel_blob_addr, - kernel_blob_size); + fs.fs_blob_size); } else { /* F_ADDFILESIGS || F_ADDFILESIGS_RETURN || F_ADDFILESIGS_FOR_DYLD_SIM */ int resid; diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index e4ec2a210..afa0cb820 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -1863,6 +1863,7 @@ exec_handle_port_actions(struct image_params *imgp, kern_return_t kr; boolean_t task_has_watchport_boost = task_has_watchports(current_task()); boolean_t in_exec = (imgp->ip_flags & IMGPF_EXEC); + boolean_t suid_cred_specified = FALSE; for (i = 0; i < pacts->pspa_count; i++) { act = &pacts->pspa_actions[i]; @@ -1886,6 +1887,16 @@ exec_handle_port_actions(struct image_params *imgp, goto done; } break; + + case PSPA_SUID_CRED: + /* Only a single suid credential can be specified. */ + if (suid_cred_specified) { + ret = EINVAL; + goto done; + } + suid_cred_specified = TRUE; + break; + default: ret = EINVAL; goto done; @@ -1973,6 +1984,11 @@ exec_handle_port_actions(struct image_params *imgp, /* hold on to this till end of spawn */ actions->registered_array[registered_i++] = port; break; + + case PSPA_SUID_CRED: + imgp->ip_sc_port = port; + break; + default: ret = EINVAL; break; @@ -3748,6 +3764,10 @@ bad: imgp->ip_cs_error = OS_REASON_NULL; } #endif + if (imgp->ip_sc_port != NULL) { + ipc_port_release_send(imgp->ip_sc_port); + imgp->ip_sc_port = NULL; + } } #if CONFIG_DTRACE @@ -5381,7 +5401,8 @@ exec_handle_sugid(struct image_params *imgp) kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) || ((imgp->ip_origvattr->va_mode & VSGID) != 0 && ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &leave_sugid_clear) || !leave_sugid_clear) || - (kauth_cred_getgid(cred) != imgp->ip_origvattr->va_gid)))) { + (kauth_cred_getgid(cred) != imgp->ip_origvattr->va_gid))) || + (imgp->ip_sc_port != NULL)) { #if CONFIG_MACF /* label for MAC transition and neither VSUID nor VSGID */ handle_mac_transition: @@ -5408,6 +5429,33 @@ handle_mac_transition: * proc's ucred lock. This prevents others from accessing * a garbage credential. */ + + if (imgp->ip_sc_port != NULL) { + extern int suid_cred_verify(ipc_port_t, vnode_t, uint32_t *); + int ret = -1; + uid_t uid = UINT32_MAX; + + /* + * Check that the vnodes match. If a script is being + * executed check the script's vnode rather than the + * interpreter's. + */ + struct vnode *vp = imgp->ip_scriptvp != NULL ? 
imgp->ip_scriptvp : imgp->ip_vp; + + ret = suid_cred_verify(imgp->ip_sc_port, vp, &uid); + if (ret == 0) { + apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) { + return kauth_cred_setresuid(my_cred, + KAUTH_UID_NONE, + uid, + uid, + KAUTH_UID_NONE); + }); + } else { + error = EPERM; + } + } + + if (imgp->ip_origvattr->va_mode & VSUID) { apply_kauth_cred_update(p, ^kauth_cred_t (kauth_cred_t my_cred) { return kauth_cred_setresuid(my_cred, diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index 51fbadb8b..c9c87bc16 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -101,6 +101,7 @@ #include +#include #include #include #include @@ -145,7 +146,7 @@ const char *memname[] = { "iov32", /* 19 M_IOV32 */ "mount", /* 20 M_MOUNT */ "fhandle", /* 21 M_FHANDLE */ -#if (NFSCLIENT || NFSSERVER) +#if CONFIG_NFS "NFS req", /* 22 M_NFSREQ */ "NFS mount", /* 23 M_NFSMNT */ "NFS node", /* 24 M_NFSNODE */ @@ -187,7 +188,7 @@ const char *memname[] = { "NQNFS Lease", /* 47 M_NQLEASE */ "NQNFS Host", /* 48 M_NQMHOST */ "Export Host", /* 49 M_NETADDR */ -#if (NFSCLIENT || NFSSERVER) +#if CONFIG_NFS "NFS srvsock", /* 50 M_NFSSVC */ "NFS uid", /* 51 M_NFSUID */ "NFS daemon", /* 52 M_NFSD */ @@ -202,7 +203,7 @@ const char *memname[] = { "mrt", /* 56 M_MRTABLE */ "", /* 57 unused entry */ "", /* 58 unused entry */ -#if (NFSCLIENT || NFSSERVER) +#if CONFIG_NFS "NFSV3 srvdesc",/* 59 M_NFSRVDESC */ "NFSV3 diroff", /* 60 M_NFSDIROFF */ "NFSV3 bigfh", /* 61 M_NFSBIGFH */ @@ -343,7 +344,7 @@ struct kmzones { { SOS(user32_iovec), KMZ_LOOKUPZONE, FALSE }, /* 19 M_IOV32 */ { SOS(mount), KMZ_CREATEZONE, FALSE }, /* 20 M_MOUNT */ { 0, KMZ_MALLOC, FALSE }, /* 21 M_FHANDLE */ -#if (NFSCLIENT || NFSSERVER) +#if CONFIG_NFS { SOS(nfsreq), KMZ_CREATEZONE, FALSE }, /* 22 M_NFSREQ */ { SOS(nfsmount), KMZ_CREATEZONE, FALSE }, /* 23 M_NFSMNT */ { SOS(nfsnode), KMZ_CREATEZONE, FALSE }, /* 24 M_NFSNODE */ @@ -381,7 +382,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 47 M_NQLEASE */ { 0, KMZ_MALLOC, FALSE }, /* 48 M_NQMHOST */ { 0, KMZ_MALLOC, FALSE }, /* 49 M_NETADDR */ -#if (NFSCLIENT || NFSSERVER) +#if CONFIG_NFS { SOX(nfsrv_sock), KMZ_CREATEZONE_ACCT, FALSE }, /* 50 M_NFSSVC */ { 0, KMZ_MALLOC, FALSE }, /* 51 M_NFSUID */ @@ -400,7 +401,7 @@ struct kmzones { { SOX(mrt), KMZ_CREATEZONE, TRUE }, /* 56 M_MRTABLE */ { 0, KMZ_MALLOC, FALSE }, /* 57 unused entry */ { 0, KMZ_MALLOC, FALSE }, /* 58 unused entry */ -#if (NFSCLIENT || NFSSERVER) +#if CONFIG_NFS { SOS(nfsrv_descript), KMZ_CREATEZONE_ACCT, FALSE }, /* 59 M_NFSRVDESC */ { SOS(nfsdmap), KMZ_CREATEZONE, FALSE }, /* 60 M_NFSDIROFF */ diff --git a/bsd/kern/kern_memorystatus_freeze.c b/bsd/kern/kern_memorystatus_freeze.c index c83a80d72..a6e720285 100644 --- a/bsd/kern/kern_memorystatus_freeze.c +++ b/bsd/kern/kern_memorystatus_freeze.c @@ -128,6 +128,53 @@ unsigned int memorystatus_freeze_private_shared_pages_ratio = 2; /* Ratio of pri unsigned int memorystatus_thaw_count = 0; unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. have state on disk & in-memory */ +/* Freezer counters collected for telemetry */ +static struct memorystatus_freezer_stats_t { + /* + * # of processes that we've considered freezing. + * Used to normalize the error reasons below. + */ + uint64_t mfs_process_considered_count; + + /* + * The following counters track how many times we've failed to freeze + * a process because of a specific FREEZER_ERROR.
+ */ + /* EXCESS_SHARED_MEMORY */ + uint64_t mfs_error_excess_shared_memory_count; + /* LOW_PRIVATE_SHARED_RATIO */ + uint64_t mfs_error_low_private_shared_ratio_count; + /* NO_COMPRESSOR_SPACE */ + uint64_t mfs_error_no_compressor_space_count; + /* NO_SWAP_SPACE */ + uint64_t mfs_error_no_swap_space_count; + /* pages < memorystatus_freeze_pages_min */ + uint64_t mfs_error_below_min_pages_count; + /* dasd determined it was unlikely to be relaunched. */ + uint64_t mfs_error_low_probability_of_use_count; + /* transient reasons (like inability to acquire a lock). */ + uint64_t mfs_error_other_count; + + /* + * # of times that we saw memorystatus_available_pages <= memorystatus_freeze_threshold. + * Used to normalize skipped_full_count and shared_mb_high_count. + */ + uint64_t mfs_below_threshold_count; + + /* Skipped running the freezer because we were out of slots */ + uint64_t mfs_skipped_full_count; + + /* Skipped running the freezer because we were over the shared mb limit*/ + uint64_t mfs_skipped_shared_mb_high_count; + + /* + * How many pages have not been sent to swap because they were in a shared object? + * This is being used to gather telemetry so we can understand the impact we'd have + * on our NAND budget if we did swap out these pages. + */ + uint64_t mfs_shared_pages_skipped; +} memorystatus_freezer_stats = {0}; + #endif /* XNU_KERNEL_PRIVATE */ static inline boolean_t memorystatus_can_freeze_processes(void); @@ -144,6 +191,7 @@ static uint64_t memorystatus_freeze_pageouts = 0; #define DEGRADED_WINDOW_MINS (30) #define NORMAL_WINDOW_MINS (24 * 60) +/* Protected by the freezer_mutex */ static throttle_interval_t throttle_intervals[] = { { DEGRADED_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, { NORMAL_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, @@ -166,6 +214,52 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD | CTLFLAG_LOC SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, ""); SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_budget_pages_remaining, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_budget_pages_remaining, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_error_excess_shared_memory_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_error_excess_shared_memory_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_error_low_private_shared_ratio_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_error_low_private_shared_ratio_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_error_no_compressor_space_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_error_no_compressor_space_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_error_no_swap_space_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_error_no_swap_space_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_error_below_min_pages_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_error_below_min_pages_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_error_low_probability_of_use_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_error_low_probability_of_use_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_error_other_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_error_other_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO,
memorystatus_freezer_process_considered_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_process_considered_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_below_threshold_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_below_threshold_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_skipped_full_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_skipped_full_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_skipped_shared_mb_high_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_skipped_shared_mb_high_count, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freezer_shared_pages_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freezer_stats.mfs_shared_pages_skipped, ""); + +/* + * Calculates the hit rate for the freezer. + * The hit rate is defined as the percentage of procs that are currently in the + * freezer which we have thawed. + * A low hit rate means we're freezing bad candidates since they're not re-used. + */ +static int sysctl_memorystatus_freezer_thaw_percentage SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + size_t thaw_count = 0, frozen_count = 0; + int thaw_percentage = 100; + unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band; + proc_t p = PROC_NULL; + proc_list_lock(); + + p = memorystatus_get_first_proc_locked(&band, FALSE); + + while (p) { + if (p->p_memstat_state & P_MEMSTAT_FROZEN) { + if (p->p_memstat_thaw_count > 0) { + thaw_count++; + } + frozen_count++; + } + p = memorystatus_get_next_proc_locked(&band, p, FALSE); + } + proc_list_unlock(); + if (frozen_count > 0) { + thaw_percentage = 100 * thaw_count / frozen_count; + } + return sysctl_handle_int(oidp, &thaw_percentage, 0, req); +} +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freezer_thaw_percentage, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, &sysctl_memorystatus_freezer_thaw_percentage, "I", ""); #if DEVELOPMENT || DEBUG @@ -248,6 +342,7 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS again: p = proc_find(pid); if (p != NULL) { + memorystatus_freezer_stats.mfs_process_considered_count++; uint32_t purgeable, wired, clean, dirty, shared; uint32_t max_pages = 0, state = 0; @@ -297,18 +392,24 @@ again: } error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + if (!error || freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + memorystatus_freezer_stats.mfs_shared_pages_skipped += shared; + } if (error) { char reason[128]; if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + memorystatus_freezer_stats.mfs_error_excess_shared_memory_count++; strlcpy(reason, "too much shared memory", 128); } if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + memorystatus_freezer_stats.mfs_error_low_private_shared_ratio_count++; strlcpy(reason, "low private-shared pages ratio", 128); } if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + memorystatus_freezer_stats.mfs_error_no_compressor_space_count++; strlcpy(reason, "no compressor space", 128); } @@ -402,11 +503,20 @@ static int sysctl_memorystatus_demote_frozen_process SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2, oidp, req) + int error, val; + /* + * Only demote on write to prevent demoting during `sysctl -a`. + * The actual value written doesn't matter. 
+ */ + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) { + return error; + } memorystatus_demote_frozen_processes(false); return 0; } -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_demote_frozen_processes, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_memorystatus_demote_frozen_process, "I", ""); +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_demote_frozen_processes, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_memorystatus_demote_frozen_process, "I", ""); static int sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS @@ -1081,10 +1191,17 @@ memorystatus_is_process_eligible_for_freeze(proc_t p) } } + /* + * This proc is a suspended application. + * We're interested in tracking what percentage of these + * actually get frozen. + */ + memorystatus_freezer_stats.mfs_process_considered_count++; /* Only freeze applications meeting our minimum resident page criteria */ memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); if (pages < memorystatus_freeze_pages_min) { + memorystatus_freezer_stats.mfs_error_below_min_pages_count++; goto out; } @@ -1094,6 +1211,7 @@ memorystatus_is_process_eligible_for_freeze(proc_t p) * memorystatus_freeze_top_process holds the proc_list_lock while it traverses the bands. */ if ((p->p_listflag & P_LIST_EXITED) != 0) { + memorystatus_freezer_stats.mfs_error_other_count++; goto out; } @@ -1110,6 +1228,7 @@ memorystatus_is_process_eligible_for_freeze(proc_t p) } if (probability_of_use == 0) { + memorystatus_freezer_stats.mfs_error_low_probability_of_use_count++; goto out; } } @@ -1196,6 +1315,9 @@ memorystatus_freeze_process_sync(proc_t p) memorystatus_available_pages, 0, 0, 0, 0); ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + if (ret == KERN_SUCCESS || freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + memorystatus_freezer_stats.mfs_shared_pages_skipped += shared; + } KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, memorystatus_available_pages, aPid, 0, 0, 0); @@ -1241,15 +1363,17 @@ memorystatus_freeze_process_sync(proc_t p) ret = 0; } - proc_list_lock(); /* Update stats */ for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { throttle_intervals[i].pageouts += dirty; } - } else { - proc_list_lock(); } + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s] done memorystatus_freeze_budget_pages_remaining %llu froze %u pages", + aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, dirty); + + proc_list_lock(); memorystatus_freeze_pageouts += dirty; @@ -1260,25 +1384,25 @@ memorystatus_freeze_process_sync(proc_t p) * can freeze a more eligible process at this moment in time? */ } - - memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s] done memorystatus_freeze_budget_pages_remaining %llu froze %u pages", - aPid, ((p && *p->p_name) ? 
p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, dirty); } else { char reason[128]; if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + memorystatus_freezer_stats.mfs_error_excess_shared_memory_count++; strlcpy(reason, "too much shared memory", 128); } if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + memorystatus_freezer_stats.mfs_error_low_private_shared_ratio_count++; strlcpy(reason, "low private-shared pages ratio", 128); } if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + memorystatus_freezer_stats.mfs_error_no_compressor_space_count++; strlcpy(reason, "no compressor space", 128); } if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + memorystatus_freezer_stats.mfs_error_no_swap_space_count++; strlcpy(reason, "no swap space", 128); } @@ -1298,6 +1422,9 @@ exit: return ret; } +/* + * Caller must hold the freezer_mutex and it will be locked on return. + */ static int memorystatus_freeze_top_process(void) { @@ -1311,6 +1438,7 @@ memorystatus_freeze_top_process(void) coalition_t coal = COALITION_NULL; pid_t pid_list[MAX_XPC_SERVICE_PIDS]; unsigned int ntasks = 0; + LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED); KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE_SCAN) | DBG_FUNC_START, memorystatus_available_pages, 0, 0, 0, 0); @@ -1432,6 +1560,7 @@ freeze_process: p = proc_ref_locked(p); if (!p) { + memorystatus_freezer_stats.mfs_error_other_count++; break; } @@ -1441,6 +1570,9 @@ freeze_process: memorystatus_available_pages, 0, 0, 0, 0); kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + if (kr == KERN_SUCCESS || freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + memorystatus_freezer_stats.mfs_shared_pages_skipped += shared; + } KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, memorystatus_available_pages, aPid, 0, 0, 0); @@ -1484,15 +1616,16 @@ freeze_process: ret = 0; } - proc_list_lock(); - /* Update stats */ for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { throttle_intervals[i].pageouts += dirty; } - } else { - proc_list_lock(); } + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: %sfreezing (%s) pid %d [%s] done, memorystatus_freeze_budget_pages_remaining %llu %sfroze %u pages\n", + refreeze_processes? "re" : "", (coal == NULL ? "general" : "coalition-driven"), aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, refreeze_processes? "Re" : "", dirty); + + proc_list_lock(); memorystatus_freeze_pageouts += dirty; @@ -1504,10 +1637,6 @@ freeze_process: */ } - memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: %sfreezing (%s) pid %d [%s] done, memorystatus_freeze_budget_pages_remaining %llu %sfroze %u pages\n", - refreeze_processes? "re" : "", (coal == NULL ? "general" : "coalition-driven"), aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_freeze_budget_pages_remaining, refreeze_processes? 
"Re" : "", dirty); - /* Return KERN_SUCCESS */ ret = kr; @@ -1603,18 +1732,22 @@ freeze_process: char reason[128]; if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + memorystatus_freezer_stats.mfs_error_excess_shared_memory_count++; strlcpy(reason, "too much shared memory", 128); } if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + memorystatus_freezer_stats.mfs_error_low_private_shared_ratio_count++; strlcpy(reason, "low private-shared pages ratio", 128); } if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + memorystatus_freezer_stats.mfs_error_no_compressor_space_count++; strlcpy(reason, "no compressor space", 128); } if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + memorystatus_freezer_stats.mfs_error_no_swap_space_count++; strlcpy(reason, "no swap space", 128); } @@ -1836,8 +1969,8 @@ memorystatus_demote_frozen_processes(boolean_t force_one) if (force_one == FALSE) { /* - * We use this counter to track daily thaws. - * So we only reset it to 0 under the normal + * We use these counters to track daily hit rates. + * So we only reset them to 0 under the normal * mode. */ memorystatus_thaw_count = 0; @@ -1846,6 +1979,72 @@ memorystatus_demote_frozen_processes(boolean_t force_one) proc_list_unlock(); } +/* + * Calculate a new freezer budget. + * @param time_since_last_interval_expired_sec How long has it been (in seconds) since the previous interval expired. + * @param burst_multiple The burst_multiple for the new period + * @param interval_duration_min How many minutes will the new interval be? + * @param rollover The amount to rollover from the previous budget. + * + * @return A budget for the new interval. + */ +static uint32_t +memorystatus_freeze_calculate_new_budget( + unsigned int time_since_last_interval_expired_sec, + unsigned int burst_multiple, + unsigned int interval_duration_min, + uint32_t rollover) +{ + uint64_t freeze_daily_budget = 0; + unsigned int daily_budget_pageouts = 0; + unsigned int freeze_daily_pageouts_max = 0; + const static unsigned int kNumSecondsInDay = 60 * 60 * 24; + /* Precision factor for days_missed. 2 decimal points. */ + const static unsigned int kFixedPointFactor = 100; + unsigned int days_missed, budget_missed; + + /* Get the daily budget from the storage layer */ + if (vm_swap_max_budget(&freeze_daily_budget)) { + memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024)); + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max); + } + /* Calculate the daily pageout budget */ + freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE); + + daily_budget_pageouts = (burst_multiple * (((uint64_t) interval_duration_min * freeze_daily_pageouts_max) / (kNumSecondsInDay / 60))); + + /* + * Add additional budget for time since the interval expired. + * For example, if the interval expired n days ago, we should get an additional n days + * of budget since we didn't use any budget during those n days. 
+ */ + days_missed = time_since_last_interval_expired_sec * kFixedPointFactor / kNumSecondsInDay; + budget_missed = days_missed * freeze_daily_pageouts_max / kFixedPointFactor; + return rollover + daily_budget_pageouts + budget_missed; +} + +#if DEVELOPMENT || DEBUG + +static int +sysctl_memorystatus_freeze_calculate_new_budget SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0; + unsigned int time_since_last_interval_expired_sec = 0; + unsigned int new_budget; + + error = sysctl_handle_int(oidp, &time_since_last_interval_expired_sec, 0, req); + if (error || !req->newptr) { + return error; + } + new_budget = memorystatus_freeze_calculate_new_budget(time_since_last_interval_expired_sec, 1, NORMAL_WINDOW_MINS, 0); + return copyout(&new_budget, req->oldptr, MIN(sizeof(req->oldlen), sizeof(new_budget))); +} + +SYSCTL_PROC(_vm, OID_AUTO, memorystatus_freeze_calculate_new_budget, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_freeze_calculate_new_budget, "I", ""); + +#endif /* DEVELOPMENT || DEBUG */ /* * This function will do 4 things: @@ -1861,6 +2060,9 @@ memorystatus_demote_frozen_processes(boolean_t force_one) * 4) calculate the current rate of pageouts for DEGRADED_WINDOW_MINS duration. If that rate is below * what we would normally expect, then we are running low on our daily budget and need to enter * degraded perf. mode. + * + * Caller must hold the freezer mutex + * Caller must not hold the proc_list lock */ static void @@ -1868,7 +2070,9 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) { clock_sec_t sec; clock_nsec_t nsec; - mach_timespec_t ts; + mach_timespec_t now_ts; + LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); unsigned int freeze_daily_pageouts_max = 0; @@ -1883,15 +2087,15 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) #endif clock_get_system_nanotime(&sec, &nsec); - ts.tv_sec = sec; - ts.tv_nsec = nsec; + now_ts.tv_sec = sec; + now_ts.tv_nsec = nsec; struct throttle_interval_t *interval = NULL; if (memorystatus_freeze_degradation == TRUE) { interval = degraded_throttle_window; - if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { + if (CMP_MACH_TIMESPEC(&now_ts, &interval->ts) >= 0) { memorystatus_freeze_degradation = FALSE; interval->pageouts = 0; interval->max_pageouts = 0; @@ -1902,28 +2106,17 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) interval = normal_throttle_window; - if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { - /* - * New throttle window. - * Rollover any unused budget. - * Also ask the storage layer what the new budget needs to be. - */ - uint64_t freeze_daily_budget = 0; - unsigned int daily_budget_pageouts = 0; - - if (vm_swap_max_budget(&freeze_daily_budget)) { - memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024)); - os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max); - } - - freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE); - - daily_budget_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS)); - interval->max_pageouts = (interval->max_pageouts - interval->pageouts) + daily_budget_pageouts; + if (CMP_MACH_TIMESPEC(&now_ts, &interval->ts) >= 0) { + /* How long has it been since the previous interval expired? 
*/ + mach_timespec_t expiration_period_ts = now_ts; + SUB_MACH_TIMESPEC(&expiration_period_ts, &interval->ts); + interval->max_pageouts = memorystatus_freeze_calculate_new_budget( + expiration_period_ts.tv_sec, interval->burst_multiple, + interval->mins, interval->max_pageouts - interval->pageouts); interval->ts.tv_sec = interval->mins * 60; interval->ts.tv_nsec = 0; - ADD_MACH_TIMESPEC(&interval->ts, &ts); + ADD_MACH_TIMESPEC(&interval->ts, &now_ts); /* Since we update the throttle stats pre-freeze, adjust for overshoot here */ if (interval->pageouts > interval->max_pageouts) { interval->pageouts -= interval->max_pageouts; @@ -1931,6 +2124,7 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) interval->pageouts = 0; } *budget_pages_allowed = interval->max_pageouts; + memorystatus_freezer_stats.mfs_shared_pages_skipped = 0; memorystatus_demote_frozen_processes(FALSE); /* normal mode...don't force a demotion */ } else { @@ -1968,7 +2162,7 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) time_left.tv_sec = interval->ts.tv_sec; time_left.tv_nsec = 0; - SUB_MACH_TIMESPEC(&time_left, &ts); + SUB_MACH_TIMESPEC(&time_left, &now_ts); if (budget_left <= budget_threshold) { /* @@ -2004,7 +2198,7 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) } MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n", - interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60, + interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - now_ts->tv_sec) / 60, interval->throttle ? "on" : "off"); } @@ -2063,12 +2257,21 @@ memorystatus_freeze_thread_should_run(void) goto out; } - if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max) && - (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD)) { - goto out; + memorystatus_freezer_stats.mfs_below_threshold_count++; + + if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max)) { + /* + * Consider this as a skip even if we wake up to refreeze because + * we won't freeze any new procs. 
+ */ + memorystatus_freezer_stats.mfs_skipped_full_count++; + if (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD) { + goto out; + } } if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) { + memorystatus_freezer_stats.mfs_skipped_shared_mb_high_count++; goto out; } diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 667b17a7d..c09b4217b 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -586,7 +586,6 @@ SYSCTL_PROC(_hw_optional, OID_AUTO, hle, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN SYSCTL_PROC(_hw_optional, OID_AUTO, adx, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasADX, 0, sysctl_cpu_capability, "I", ""); SYSCTL_PROC(_hw_optional, OID_AUTO, mpx, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasMPX, 0, sysctl_cpu_capability, "I", ""); SYSCTL_PROC(_hw_optional, OID_AUTO, sgx, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasSGX, 0, sysctl_cpu_capability, "I", ""); -#if !defined(RC_HIDE_XNU_J137) SYSCTL_PROC(_hw_optional, OID_AUTO, avx512f, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX512F, 0, sysctl_cpu_capability, "I", ""); SYSCTL_PROC(_hw_optional, OID_AUTO, avx512cd, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX512CD, 0, sysctl_cpu_capability, "I", ""); SYSCTL_PROC(_hw_optional, OID_AUTO, avx512dq, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX512DQ, 0, sysctl_cpu_capability, "I", ""); @@ -594,7 +593,6 @@ SYSCTL_PROC(_hw_optional, OID_AUTO, avx512bw, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG SYSCTL_PROC(_hw_optional, OID_AUTO, avx512vl, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX512VL, 0, sysctl_cpu_capability, "I", ""); SYSCTL_PROC(_hw_optional, OID_AUTO, avx512ifma, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX512IFMA, 0, sysctl_cpu_capability, "I", ""); SYSCTL_PROC(_hw_optional, OID_AUTO, avx512vbmi, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (void *) kHasAVX512VBMI, 0, sysctl_cpu_capability, "I", ""); -#endif /* not RC_HIDE_XNU_J137 */ #elif defined (__arm__) || defined (__arm64__) int watchpoint_flag = -1; int breakpoint_flag = -1; diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 0fd0cc336..de0e20667 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -133,12 +133,6 @@ #endif #include -#ifndef CONFIG_EMBEDDED -#include /* for IOTaskHasEntitlement */ -#include /* for csr_check */ -#define MAP_32BIT_ENTITLEMENT "com.apple.security.mmap-map-32bit" -#endif - /* * XXX Internally, we use VM_PROT_* somewhat interchangeably, but the correct * XXX usage is PROT_* from an interface perspective. 
Thus the values of @@ -566,13 +560,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) #ifndef CONFIG_EMBEDDED if (flags & MAP_32BIT) { - if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) == 0 || - IOTaskHasEntitlement(current_task(), MAP_32BIT_ENTITLEMENT)) { - vmk_flags.vmkf_32bit_map_va = TRUE; - } else { - error = EPERM; - goto bad; - } + vmk_flags.vmkf_32bit_map_va = TRUE; } #endif diff --git a/bsd/kern/kern_overrides.c b/bsd/kern/kern_overrides.c index 04c70d47a..695d335b7 100644 --- a/bsd/kern/kern_overrides.c +++ b/bsd/kern/kern_overrides.c @@ -95,7 +95,7 @@ static void system_override_begin(uint64_t flags); static void system_override_end(uint64_t flags); static void system_override_abort(uint64_t flags); static void system_override_callouts(uint64_t flags, boolean_t enable_override); -static __attribute__((noinline)) void PROCESS_OVERRIDING_SYSTEM_DEFAULTS(uint64_t timeout); +static __attribute__((noinline)) int PROCESS_OVERRIDING_SYSTEM_DEFAULTS(uint64_t timeout); void init_system_override() @@ -140,7 +140,7 @@ system_override(__unused struct proc *p, struct system_override_args * uap, __un system_override_abort(flags); } else { system_override_begin(flags); - PROCESS_OVERRIDING_SYSTEM_DEFAULTS(timeout); + error = PROCESS_OVERRIDING_SYSTEM_DEFAULTS(timeout); system_override_end(flags); } @@ -307,11 +307,13 @@ system_override_abort(uint64_t flags) } } -static __attribute__((noinline)) void +static __attribute__((noinline)) int PROCESS_OVERRIDING_SYSTEM_DEFAULTS(uint64_t timeout) { struct timespec ts; ts.tv_sec = timeout / NSEC_PER_SEC; ts.tv_nsec = timeout - ((long)ts.tv_sec * NSEC_PER_SEC); - msleep((caddr_t)&sys_override_wait, &sys_override_lock, PRIBIO | PCATCH, "system_override", &ts); + int error = msleep((caddr_t)&sys_override_wait, &sys_override_lock, PRIBIO | PCATCH, "system_override", &ts); + /* msleep returns EWOULDBLOCK if timeout expires, treat that as success */ + return (error == EWOULDBLOCK) ? 
0 : error; } diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index c074ae1c6..c5ea090ce 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -836,10 +836,20 @@ proc_selfppid(void) return current_proc()->p_ppid; } -int +uint64_t proc_selfcsflags(void) { - return current_proc()->p_csflags; + return (uint64_t)current_proc()->p_csflags; +} + +int +proc_csflags(proc_t p, uint64_t *flags) +{ + if (p && flags) { + *flags = (uint64_t)p->p_csflags; + return 0; + } + return EINVAL; } uint32_t @@ -936,6 +946,12 @@ proc_name(int pid, char * buf, int size) { proc_t p; + if (size <= 0) { + return; + } + + bzero(buf, size); + if ((p = proc_find(pid)) != PROC_NULL) { strlcpy(buf, &p->p_comm[0], size); proc_rele(p); @@ -1267,6 +1283,63 @@ proc_getexecutablevnode(proc_t p) return NULLVP; } +int +proc_gettty(proc_t p, vnode_t *vp) +{ + if (!p || !vp) { + return EINVAL; + } + + struct session *procsp = proc_session(p); + int err = EINVAL; + + if (procsp != SESSION_NULL) { + session_lock(procsp); + vnode_t ttyvp = procsp->s_ttyvp; + int ttyvid = procsp->s_ttyvid; + session_unlock(procsp); + + if (ttyvp) { + if (vnode_getwithvid(ttyvp, ttyvid) == 0) { + *vp = procsp->s_ttyvp; + err = 0; + } + } else { + err = ENOENT; + } + + session_rele(procsp); + } + + return err; +} + +int +proc_gettty_dev(proc_t p, dev_t *dev) +{ + struct session *procsp = proc_session(p); + boolean_t has_tty = FALSE; + + if (procsp != SESSION_NULL) { + session_lock(procsp); + + struct tty * tp = SESSION_TP(procsp); + if (tp != TTY_NULL) { + *dev = tp->t_dev; + has_tty = TRUE; + } + + session_unlock(procsp); + session_rele(procsp); + } + + if (has_tty) { + return 0; + } else { + return EINVAL; + } +} + int proc_selfexecutableargs(uint8_t *buf, size_t *buflen) { @@ -2429,7 +2502,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user */ if (forself == 1 && IOTaskHasEntitlement(pt->task, CLEAR_LV_ENTITLEMENT)) { proc_lock(pt); - pt->p_csflags &= (~(CS_REQUIRE_LV & CS_FORCED_LV)); + pt->p_csflags &= (~(CS_REQUIRE_LV | CS_FORCED_LV)); proc_unlock(pt); error = 0; } else { diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 5d2fcee09..0e55c4445 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -139,6 +139,8 @@ #include #include +#include + #include #include #include @@ -232,7 +234,7 @@ fill_user32_proc(proc_t, struct user32_kinfo_proc *__restrict); extern int kdbg_control(int *name, u_int namelen, user_addr_t where, size_t * sizep); -#if NFSCLIENT +#if CONFIG_NFS_CLIENT extern int netboot_root(void); #endif @@ -282,7 +284,7 @@ STATIC int sysctl_hostname(struct sysctl_oid *oidp, void *arg1, int arg2, struct STATIC int sysctl_procname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_boottime(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_symfile(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); -#if NFSCLIENT +#if CONFIG_NFS_CLIENT STATIC int sysctl_netboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); #endif #ifdef CONFIG_IMGSRC_ACCESS @@ -2347,7 +2349,7 @@ SYSCTL_PROC(_kern, KERN_SYMFILE, symfile, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_symfile, "A", ""); -#if NFSCLIENT +#if CONFIG_NFS_CLIENT STATIC int sysctl_netboot (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -4440,6 +4442,41 @@ SYSCTL_PROC(_kern, OID_AUTO, grade_cputype, #if DEVELOPMENT || DEBUG +extern 
void do_cseg_wedge_thread(void); +extern void do_cseg_unwedge_thread(void); + +static int +cseg_wedge_thread SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + + do_cseg_wedge_thread(); + return 0; +} +SYSCTL_PROC(_kern, OID_AUTO, cseg_wedge_thread, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, cseg_wedge_thread, "I", "wedge c_seg thread"); + +static int +cseg_unwedge_thread SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + + do_cseg_unwedge_thread(); + return 0; +} +SYSCTL_PROC(_kern, OID_AUTO, cseg_unwedge_thread, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, cseg_unwedge_thread, "I", "unstuck c_seg thread"); + static atomic_int wedge_thread_should_wake = 0; static int @@ -4792,8 +4829,42 @@ SYSCTL_PROC(_kern, OID_AUTO, test_mtx_uncontended, CTLTYPE_STRING | CTLFLAG_MASK extern uint64_t MutexSpin; -SYSCTL_QUAD(_kern, OID_AUTO, mutex_spin_us, CTLFLAG_RW, &MutexSpin, - "Spin time for acquiring a kernel mutex"); +SYSCTL_QUAD(_kern, OID_AUTO, mutex_spin_abs, CTLFLAG_RW, &MutexSpin, + "Spin time in abs for acquiring a kernel mutex"); + +extern uint64_t low_MutexSpin; +extern int64_t high_MutexSpin; +extern unsigned int real_ncpus; + +SYSCTL_QUAD(_kern, OID_AUTO, low_mutex_spin_abs, CTLFLAG_RW, &low_MutexSpin, + "Low spin threshold in abs for acquiring a kernel mutex"); + +static int +sysctl_high_mutex_spin_ns SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error; + int64_t val = 0; + int64_t res; + + /* Check if the user is writing to high_MutexSpin, or just reading it */ + if (req->newptr) { + error = SYSCTL_IN(req, &val, sizeof(val)); + if (error || (val < 0 && val != -1)) { + return error; + } + high_MutexSpin = val; + } + + if (high_MutexSpin >= 0) { + res = high_MutexSpin; + } else { + res = low_MutexSpin * real_ncpus; + } + return SYSCTL_OUT(req, &res, sizeof(res)); +} +SYSCTL_PROC(_kern, OID_AUTO, high_mutex_spin_abs, CTLFLAG_RW | CTLTYPE_QUAD, 0, 0, sysctl_high_mutex_spin_ns, "I", + "High spin threshold in abs for acquiring a kernel mutex"); #if defined (__x86_64__) diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index ba02e1540..83581807f 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -121,7 +121,7 @@ common_hook(void) return rv; } -#if (MAC_POLICY_OPS_VERSION != 59) +#if (MAC_POLICY_OPS_VERSION != 62) # error "struct mac_policy_ops doesn't match definition in mac_policy.h" #endif /* @@ -285,7 +285,7 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(vnode_check_trigger_resolve) CHECK_SET_HOOK(mount_check_mount_late) - .mpo_reserved1 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(mount_check_snapshot_mount) .mpo_reserved2 = (mpo_reserved_hook_t *)common_hook, CHECK_SET_HOOK(skywalk_flow_check_connect) CHECK_SET_HOOK(skywalk_flow_check_listen) diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index b5a78b6b9..4e1a1c4fb 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -332,19 +332,24 @@ oslog_streamopen(__unused dev_t dev, __unused int flags, __unused int mode, stru if (!oslog_stream_msg_bufc) { return ENOMEM; } + /* Zeroing to avoid copying uninitialized struct padding to userspace. 
*/ + bzero(oslog_stream_msg_bufc, oslog_stream_buf_size); /* entries to support kernel logging in stream mode */ - entries = kalloc(oslog_stream_num_entries * sizeof(struct oslog_stream_buf_entry_s)); + size_t entries_size = oslog_stream_num_entries * sizeof(struct oslog_stream_buf_entry_s); + entries = kalloc(entries_size); if (!entries) { kfree(oslog_stream_msg_bufc, oslog_stream_buf_size); return ENOMEM; } + /* Zeroing to avoid copying uninitialized struct padding to userspace. */ + bzero(entries, entries_size); stream_lock(); if (oslog_stream_open) { stream_unlock(); kfree(oslog_stream_msg_bufc, oslog_stream_buf_size); - kfree(entries, oslog_stream_num_entries * sizeof(struct oslog_stream_buf_entry_s)); + kfree(entries, entries_size); return EBUSY; } @@ -359,9 +364,6 @@ oslog_streamopen(__unused dev_t dev, __unused int flags, __unused int mode, stru for (int i = 0; i < oslog_stream_num_entries; i++) { oslog_stream_buf_entries[i].type = oslog_stream_link_type_log; - oslog_stream_buf_entries[i].offset = 0; - oslog_stream_buf_entries[i].size = 0; - oslog_stream_buf_entries[i].timestamp = 0; STAILQ_INSERT_TAIL(&oslog_stream_free_head, &oslog_stream_buf_entries[i], buf_entries); } diff --git a/bsd/kern/subr_prf.c b/bsd/kern/subr_prf.c index ddf8e5db4..0d7973826 100644 --- a/bsd/kern/subr_prf.c +++ b/bsd/kern/subr_prf.c @@ -122,7 +122,6 @@ struct snprintf_arg { extern const char *debugger_panic_str; extern void cnputc(char); /* standard console putc */ -void (*v_putc)(char) = cnputc; /* routine to putc on virtual console */ extern struct tty cons; /* standard console tty */ extern struct tty *constty; /* pointer to console "window" tty */ @@ -385,7 +384,7 @@ putchar(int c, void *arg) log_putc_locked(msgbufp, c); } if ((pca->flags & TOCONS) && constty == 0 && c != '\0') { - (*v_putc)(c); + cnputc(c); } if (pca->flags & TOSTR) { **sp = c; diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index 6878545e7..240bae020 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -37,6 +37,7 @@ #include #include #include +#include 0 AUE_NULL ALL { int nosys(void); } { indirect syscall } 1 AUE_EXIT ALL { void exit(int rval) NO_SYSCALL_STUB; } @@ -229,7 +230,7 @@ 153 AUE_PREAD ALL { user_ssize_t pread(int fd, user_addr_t buf, user_size_t nbyte, off_t offset); } 154 AUE_PWRITE ALL { user_ssize_t pwrite(int fd, user_addr_t buf, user_size_t nbyte, off_t offset); } -#if NFSSERVER +#if NFSSERVER /* XXX */ 155 AUE_NFS_SVC ALL { int nfssvc(int flag, caddr_t argp); } #else 155 AUE_NULL ALL { int nosys(void); } @@ -241,7 +242,7 @@ 159 AUE_UNMOUNT ALL { int unmount(user_addr_t path, int flags); } 160 AUE_NULL ALL { int nosys(void); } { old async_daemon } -#if NFSSERVER +#if NFSSERVER /* XXX */ 161 AUE_NFS_GETFH ALL { int getfh(char *fname, fhandle_t *fhp); } #else 161 AUE_NULL ALL { int nosys(void); } @@ -345,12 +346,12 @@ 245 AUE_FFSCTL ALL { int ffsctl(int fd, u_long cmd, caddr_t data, u_int options); } 246 AUE_NULL ALL { int nosys(void); } -#if NFSCLIENT +#if NFSCLIENT /* XXX */ 247 AUE_NULL ALL { int nfsclnt(int flag, caddr_t argp); } #else 247 AUE_NULL ALL { int nosys(void); } #endif -#if NFSSERVER +#if NFSSERVER /* XXX */ 248 AUE_FHOPEN ALL { int fhopen(const struct fhandle *u_fhp, int flags); } #else 248 AUE_NULL ALL { int nosys(void); } diff --git a/bsd/kern/trace_codes b/bsd/kern/trace_codes index fa3e01f84..842ff323b 100644 --- a/bsd/kern/trace_codes +++ b/bsd/kern/trace_codes @@ -992,6 +992,7 @@ 0x3130164 VFS_devfs_label_associate_device 0x3130168 
VFS_devfs_label_associate_directory 0x313016C VFS_label_associate_fdesc +0x3130170 VFS_mount_check_snapshot_mount 0x3CF0000 CP_OFFSET_IO 0x4010004 proc_exit 0x4010008 force_exit diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 4ab504c31..b94476d05 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -667,6 +667,9 @@ socreate_internal(int dom, struct socket **aso, int type, int proto, struct protosw *prp; struct socket *so; int error = 0; +#if defined(XNU_TARGET_OS_OSX) + pid_t rpid = -1; +#endif #if TCPDEBUG extern int tcpconsdebug; @@ -757,7 +760,29 @@ socreate_internal(int dom, struct socket **aso, int type, int proto, so->e_pid = proc_pid(ep); proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid)); so->so_flags |= SOF_DELEGATED; +#if defined(XNU_TARGET_OS_OSX) + if (ep->p_responsible_pid != so->e_pid) { + rpid = ep->p_responsible_pid; + } +#endif + } + +#if defined(XNU_TARGET_OS_OSX) + if (rpid < 0 && p->p_responsible_pid != so->last_pid) { + rpid = p->p_responsible_pid; + } + + so->so_rpid = -1; + uuid_clear(so->so_ruuid); + if (rpid >= 0) { + proc_t rp = proc_find(rpid); + if (rp != PROC_NULL) { + proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid)); + so->so_rpid = rpid; + proc_rele(rp); + } } +#endif so->so_cred = kauth_cred_proc_ref(p); if (!suser(kauth_cred_get(), NULL)) { @@ -6532,7 +6557,12 @@ filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so } } - retval = (data >= lowwat); + /* + * While the `data` field is the amount of data to read, + * 0-sized packets need to wake up the kqueue, see 58140856, + * so we need to take control bytes into account too. + */ + retval = (so->so_rcv.sb_cc >= lowwat); out: if (retval && kev) { @@ -7857,6 +7887,20 @@ so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t chec so->e_upid = proc_uniqueid(ep); so->e_pid = proc_pid(ep); proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid)); + +#if defined(XNU_TARGET_OS_OSX) + if (ep->p_responsible_pid != so->e_pid) { + proc_t rp = proc_find(ep->p_responsible_pid); + if (rp != PROC_NULL) { + proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid)); + so->so_rpid = ep->p_responsible_pid; + proc_rele(rp); + } else { + uuid_clear(so->so_ruuid); + so->so_rpid = -1; + } + } +#endif } if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { (*so->so_proto->pr_update_last_owner)(so, NULL, ep); diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 index f37a137aa..d0d8daec9 100644 --- a/bsd/man/man2/getattrlist.2 +++ b/bsd/man/man2/getattrlist.2 @@ -1222,7 +1222,44 @@ field of the .Vt statfs structure returned by .Xr statfs 2 . +. +.It ATTR_CMNEXT_CLONEID +A +.Vt u_int64_t +that uniquely identifies the data stream associated with the file +system object. Useful for finding which files are pure clones of each +other (as they will have the same clone-id). +. +.It ATTR_CMNEXT_EXT_FLAGS +A +.Vt u_int64_t +that contains additional flags with information about the file. The +flags are: +. +.Bl -tag -width EF_MAY_SHARE_BLOCKS +. +.It EF_MAY_SHARE_BLOCKS +If this bit is set then the file may share blocks with another file +(i.e. it is a clone of another file). +. +.It EF_NO_XATTRS +If this bit is set then the file has no extended attributes. Useful +for avoiding a call to listxattr(). +. +.It EF_IS_SYNC_ROOT +If this bit is set the directory is a "sync root". This bit will +never be set for regular files. +. 
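An illustrative aside on consuming these flags from userspace (placed here, between the flag descriptions): the sketch below assumes the standard getattrlist(2) conventions, namely that ATTR_CMNEXT_* attributes are requested through the forkattr field of struct attrlist and require the FSOPT_ATTR_CMN_EXTENDED option; the helper name check_no_xattrs is hypothetical. It tests EF_NO_XATTRS so a caller can skip listxattr(2) entirely:

#include <sys/attr.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct ext_flags_reply {
        uint32_t length;        /* returned length, including this field */
        uint64_t ext_flags;     /* ATTR_CMNEXT_EXT_FLAGS value */
} __attribute__((aligned(4), packed));

/* Returns 1 if the file has no xattrs, 0 if it may have some, -1 on error. */
int
check_no_xattrs(const char *path)
{
        struct attrlist al;
        struct ext_flags_reply reply;

        memset(&al, 0, sizeof(al));
        al.bitmapcount = ATTR_BIT_MAP_COUNT;
        al.forkattr = ATTR_CMNEXT_EXT_FLAGS;    /* CMNEXT attrs travel in forkattr */

        if (getattrlist(path, &al, &reply, sizeof(reply),
            FSOPT_ATTR_CMN_EXTENDED) != 0) {
                perror("getattrlist");
                return -1;
        }
        /* EF_NO_XATTRS set: no need to call listxattr(2) at all */
        return (reply.ext_flags & EF_NO_XATTRS) != 0;
}

The same pattern reads ATTR_CMNEXT_CLONEID or any other single CMNEXT attribute by swapping the forkattr bit and the reply field.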
+.It EF_IS_PURGEABLE +If this bit is set the item is a "purgeable" item that can be deleted +by the file system when asked to free space. +. +.It EF_IS_SPARSE +If this bit is set the item has sparse regions. +. +.El .El +.Pp . .Sh VOLUME CAPABILITIES . diff --git a/bsd/man/man4/random.4 b/bsd/man/man4/random.4 index 3c36e6317..4ba8415d2 100644 --- a/bsd/man/man4/random.4 +++ b/bsd/man/man4/random.4 @@ -36,13 +36,13 @@ is a compatibility nod to Linux. On Linux, will produce lower quality output if the entropy pool drains, while .Nm /dev/random will prefer to block and wait for additional entropy to be collected. -With Yarrow, this choice and distinction is not necessary, and +With Fortuna, this choice and distinction is not necessary, and the two devices behave identically. You may use either. .Pp The .Nm device implements the -.Nm Yarrow +.Nm Fortuna pseudo random number generator algorithm and maintains its entropy pool. The kernel automatically seeds the algorithm with additional entropy during normal execution. .Sh FILES diff --git a/bsd/miscfs/devfs/devfs_vfsops.c b/bsd/miscfs/devfs/devfs_vfsops.c index a1392ce2a..3498ffc04 100644 --- a/bsd/miscfs/devfs/devfs_vfsops.c +++ b/bsd/miscfs/devfs/devfs_vfsops.c @@ -202,7 +202,7 @@ devfs_mount(struct mount *mp, __unused vnode_t devvp, __unused user_addr_t data, * Fill out some fields */ __IGNORE_WCASTALIGN(mp->mnt_data = (qaddr_t)devfs_mp_p); - mp->mnt_vfsstat.f_fsid.val[0] = (int32_t)(uintptr_t)devfs_mp_p; + mp->mnt_vfsstat.f_fsid.val[0] = (int32_t)VM_KERNEL_ADDRHASH(devfs_mp_p); mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp); mp->mnt_flag |= MNT_LOCAL; @@ -308,7 +308,7 @@ devfs_statfs( struct mount *mp, struct vfsstatfs *sbp, __unused vfs_context_t ct sbp->f_bavail = 0; sbp->f_files = devfs_stats.nodes; sbp->f_ffree = 0; - sbp->f_fsid.val[0] = (int32_t)(uintptr_t)devfs_mp_p; + sbp->f_fsid.val[0] = (int32_t)VM_KERNEL_ADDRHASH(devfs_mp_p); sbp->f_fsid.val[1] = vfs_typenum(mp); return 0; diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index b9de4b101..322f40823 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -483,7 +483,7 @@ devfs_getattr(struct vnop_getattr_args *ap) VATTR_RETURN(vap, va_nlink, file_node->dn_links); VATTR_RETURN(vap, va_uid, file_node->dn_uid); VATTR_RETURN(vap, va_gid, file_node->dn_gid); - VATTR_RETURN(vap, va_fsid, (uintptr_t)file_node->dn_dvm); + VATTR_RETURN(vap, va_fsid, (uint32_t)VM_KERNEL_ADDRHASH(file_node->dn_dvm)); VATTR_RETURN(vap, va_fileid, (uintptr_t)file_node->dn_ino); VATTR_RETURN(vap, va_data_size, file_node->dn_len); diff --git a/bsd/miscfs/routefs/routefs_ops.c b/bsd/miscfs/routefs/routefs_ops.c index 664ae0e16..42e082203 100644 --- a/bsd/miscfs/routefs/routefs_ops.c +++ b/bsd/miscfs/routefs/routefs_ops.c @@ -154,7 +154,7 @@ routefs_mount(struct mount *mp, __unused vnode_t devvp, user_addr_t data, vfs_co * Fill out some fields */ __IGNORE_WCASTALIGN(mp->mnt_data = (qaddr_t)routefs_mp_p); - mp->mnt_vfsstat.f_fsid.val[0] = (int32_t)(uintptr_t)routefs_mp_p; + mp->mnt_vfsstat.f_fsid.val[0] = (int32_t)VM_KERNEL_ADDRHASH(routefs_mp_p); mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp); mp->mnt_flag |= MNT_LOCAL; @@ -283,7 +283,7 @@ routefs_statfs( struct mount *mp, struct vfsstatfs *sbp, __unused vfs_context_t sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; - sbp->f_fsid.val[0] = (int32_t)(uintptr_t)routefs_mp_p; + sbp->f_fsid.val[0] = (int32_t)VM_KERNEL_ADDRHASH(routefs_mp_p); sbp->f_fsid.val[1] = vfs_typenum(mp); return 0; diff --git 
a/bsd/net/content_filter.c b/bsd/net/content_filter.c index 62988b66b..626c2b2bf 100644 --- a/bsd/net/content_filter.c +++ b/bsd/net/content_filter.c @@ -359,6 +359,7 @@ struct content_filter **content_filters = NULL; uint32_t cfil_active_count = 0; /* Number of active content filters */ uint32_t cfil_sock_attached_count = 0; /* Number of sockets attachements */ uint32_t cfil_sock_udp_attached_count = 0; /* Number of UDP sockets attachements */ +uint32_t cfil_sock_attached_stats_count = 0; /* Number of sockets requested periodic stats report */ uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */ static kern_ctl_ref cfil_kctlref = NULL; @@ -408,6 +409,11 @@ struct cfil_entry { uint32_t cfe_necp_control_unit; struct timeval cfe_last_event; /* To user space */ struct timeval cfe_last_action; /* From user space */ + uint64_t cfe_byte_inbound_count_reported; /* stats already been reported */ + uint64_t cfe_byte_outbound_count_reported; /* stats already been reported */ + struct timeval cfe_stats_report_ts; /* Timestamp for last stats report */ + uint32_t cfe_stats_report_frequency; /* Interval for stats report in msecs */ + boolean_t cfe_laddr_sent; struct cfe_buf { /* @@ -455,6 +461,7 @@ struct cfil_hash_entry; */ struct cfil_info { TAILQ_ENTRY(cfil_info) cfi_link; + TAILQ_ENTRY(cfil_info) cfi_link_stats; struct socket *cfi_so; uint64_t cfi_flags; uint64_t cfi_sock_id; @@ -517,6 +524,7 @@ struct cfil_info { #define CFI_ENTRY_KCUNIT(i, e) (((e) - &((i)->cfi_entries[0])) + 1) TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head; +TAILQ_HEAD(cfil_sock_head_stats, cfil_info) cfil_sock_head_stats; #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x) #define CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x) @@ -538,6 +546,23 @@ LIST_HEAD(cfilhashhead, cfil_hash_entry); (addr.sa.sa_family == AF_INET && addr.sin.sin_addr.s_addr == 0) || \ (addr.sa.sa_family == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&addr.sin6.sin6_addr))) +/* + * Periodic Statistics Report: + */ +static struct thread *cfil_stats_report_thread; +#define CFIL_STATS_REPORT_INTERVAL_MIN_MSEC 500 // Highest report frequency +#define CFIL_STATS_REPORT_RUN_INTERVAL_NSEC (CFIL_STATS_REPORT_INTERVAL_MIN_MSEC * NSEC_PER_MSEC) +#define CFIL_STATS_REPORT_MAX_COUNT 50 // Max stats to be reported per run + +/* This buffer must have same layout as struct cfil_msg_stats_report */ +struct cfil_stats_report_buffer { + struct cfil_msg_hdr msghdr; + uint32_t count; + struct cfil_msg_sock_stats stats[CFIL_STATS_REPORT_MAX_COUNT]; +}; +static struct cfil_stats_report_buffer *global_cfil_stats_report_buffers[MAX_CONTENT_FILTER]; +static uint32_t global_cfil_stats_counts[MAX_CONTENT_FILTER]; + /* * UDP Garbage Collection: */ @@ -571,7 +596,7 @@ struct cfil_hash_entry { u_short cfentry_lport; sa_family_t cfentry_family; u_int32_t cfentry_flowhash; - u_int32_t cfentry_lastused; + u_int64_t cfentry_lastused; union { /* foreign host table entry */ struct in_addr_4in6 addr46; @@ -637,6 +662,7 @@ int cfil_debug = 1; #define DATA_DEBUG 0 #define SHOW_DEBUG 0 #define GC_DEBUG 0 +#define STATS_DEBUG 0 /* * Sysctls for logs and statistics @@ -754,6 +780,8 @@ static void cfil_sock_received_verdict(struct socket *so); static void cfil_fill_event_msg_addresses(struct cfil_hash_entry *, struct inpcb *, union sockaddr_in_4_6 *, union sockaddr_in_4_6 *, boolean_t, boolean_t); +static void cfil_stats_report_thread_func(void *, wait_result_t); +static void cfil_stats_report(void *v, wait_result_t w); bool check_port(struct sockaddr *, u_short); @@ 
-1190,6 +1218,34 @@ cfil_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, *unitinfo = cfc; cfil_active_count++; + + // Allocate periodic stats buffer for this filter + if (global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] == NULL) { + cfil_rw_unlock_exclusive(&cfil_lck_rw); + + struct cfil_stats_report_buffer *buf; + + MALLOC(buf, + struct cfil_stats_report_buffer *, + sizeof(struct cfil_stats_report_buffer), + M_TEMP, + M_WAITOK | M_ZERO); + + cfil_rw_lock_exclusive(&cfil_lck_rw); + + if (buf == NULL) { + error = ENOMEM; + cfil_rw_unlock_exclusive(&cfil_lck_rw); + goto done; + } + + /* Another thread may have won the race */ + if (global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] != NULL) { + FREE(buf, M_TEMP); + } else { + global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] = buf; + } + } } cfil_rw_unlock_exclusive(&cfil_lck_rw); done: @@ -1334,6 +1390,11 @@ release: } verify_content_filter(cfc); + /* Free the stats buffer for this filter */ + if (global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] != NULL) { + FREE(global_cfil_stats_report_buffers[cfc->cf_kcunit - 1], M_TEMP); + global_cfil_stats_report_buffers[cfc->cf_kcunit - 1] = NULL; + } VERIFY(cfc->cf_sock_count == 0); /* @@ -1593,6 +1654,90 @@ done: return so; } +static void +cfil_info_stats_toggle(struct cfil_info *cfil_info, struct cfil_entry *entry, uint32_t report_frequency) +{ + struct cfil_info *cfil = NULL; + Boolean found = FALSE; + int kcunit; + + if (cfil_info == NULL) { + return; + } + + if (report_frequency) { + if (entry == NULL) { + return; + } + + // Update stats reporting frequency. + if (entry->cfe_stats_report_frequency != report_frequency) { + entry->cfe_stats_report_frequency = report_frequency; + if (entry->cfe_stats_report_frequency < CFIL_STATS_REPORT_INTERVAL_MIN_MSEC) { + entry->cfe_stats_report_frequency = CFIL_STATS_REPORT_INTERVAL_MIN_MSEC; + } + microuptime(&entry->cfe_stats_report_ts); + + // Insert cfil_info into list only if it is not in yet. + TAILQ_FOREACH(cfil, &cfil_sock_head_stats, cfi_link_stats) { + if (cfil == cfil_info) { + return; + } + } + + TAILQ_INSERT_TAIL(&cfil_sock_head_stats, cfil_info, cfi_link_stats); + + // Wake up stats thread if this is first flow added + if (cfil_sock_attached_stats_count == 0) { + thread_wakeup((caddr_t)&cfil_sock_attached_stats_count); + } + cfil_sock_attached_stats_count++; +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED - STATS FLOW INSERTED: stats frequency %d msecs", + cfil_info->cfi_so ? (uint64_t)VM_KERNEL_ADDRPERM(cfil_info->cfi_so) : 0, + cfil_info->cfi_sock_id, + entry->cfe_stats_report_frequency); +#endif + } + } else { + // Turn off stats reporting for this filter. + if (entry != NULL) { + // Already off, no change. + if (entry->cfe_stats_report_frequency == 0) { + return; + } + + entry->cfe_stats_report_frequency = 0; + // If cfil_info still has filter(s) asking for stats, no need to remove from list. + for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + if (cfil_info->cfi_entries[kcunit - 1].cfe_stats_report_frequency > 0) { + return; + } + } + } + + // No more filter asking for stats for this cfil_info, remove from list. 
+ if (!TAILQ_EMPTY(&cfil_sock_head_stats)) { + found = FALSE; + TAILQ_FOREACH(cfil, &cfil_sock_head_stats, cfi_link_stats) { + if (cfil == cfil_info) { + found = TRUE; + break; + } + } + if (found) { + cfil_sock_attached_stats_count--; + TAILQ_REMOVE(&cfil_sock_head_stats, cfil_info, cfi_link_stats); +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED - STATS FLOW DELETED: stats frequency reset", + cfil_info->cfi_so ? (uint64_t)VM_KERNEL_ADDRPERM(cfil_info->cfi_so) : 0, + cfil_info->cfi_sock_id); +#endif + } + } + } +} + static errno_t cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, int flags) @@ -1787,6 +1932,12 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, if (error == EJUSTRETURN) { error = 0; } + + // Toggle stats reporting according to received verdict. + cfil_rw_lock_exclusive(&cfil_lck_rw); + cfil_info_stats_toggle(cfil_info, entry, action_msg->cfa_stats_frequency); + cfil_rw_unlock_exclusive(&cfil_lck_rw); + break; case CFM_OP_DROP: @@ -2247,6 +2398,7 @@ cfil_init(void) lck_rw_init(&cfil_lck_rw, cfil_lck_grp, cfil_lck_attr); TAILQ_INIT(&cfil_sock_head); + TAILQ_INIT(&cfil_sock_head_stats); /* * Register kernel control @@ -2278,10 +2430,21 @@ cfil_init(void) /* this must not fail */ VERIFY(cfil_udp_gc_thread != NULL); + // Spawn thread for statistics reporting + if (kernel_thread_start(cfil_stats_report_thread_func, NULL, + &cfil_stats_report_thread) != KERN_SUCCESS) { + panic_plain("%s: Can't create statistics report thread", __func__); + /* NOTREACHED */ + } + /* this must not fail */ + VERIFY(cfil_stats_report_thread != NULL); + // Set UDP per-flow mbuf thresholds to 1/32 of platform max mbuf_limit = MAX(UDP_FLOW_GC_MBUF_CNT_MAX, (nmbclusters << MCLSHIFT) >> UDP_FLOW_GC_MBUF_SHIFT); cfil_udp_gc_mbuf_num_max = (mbuf_limit >> MCLSHIFT); cfil_udp_gc_mbuf_cnt_max = mbuf_limit; + + memset(&global_cfil_stats_report_buffers, 0, sizeof(global_cfil_stats_report_buffers)); } struct cfil_info * @@ -2486,6 +2649,9 @@ cfil_info_free(struct cfil_info *cfil_info) cfil_sock_attached_count--; TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link); + // Turn off stats reporting for cfil_info. 
+ cfil_info_stats_toggle(cfil_info, NULL, 0); + out_drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q); in_drain += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q); @@ -3258,6 +3424,10 @@ static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *sin46, struct in6_addr *ip6, u_int16_t port) { + if (sin46 == NULL) { + return; + } + struct sockaddr_in6 *sin6 = &sin46->sin6; sin6->sin6_family = AF_INET6; @@ -3274,6 +3444,10 @@ static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46, struct in_addr ip, u_int16_t port) { + if (sin46 == NULL) { + return; + } + struct sockaddr_in *sin = &sin46->sin; sin->sin_family = AF_INET; @@ -6548,7 +6722,7 @@ cfil_info_udp_expire(void *v, wait_result_t w) struct cfil_hash_entry *hash_entry; struct cfil_db *db; struct socket *so; - u_int32_t current_time = 0; + u_int64_t current_time = 0; current_time = net_uptime(); @@ -6699,3 +6873,318 @@ cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *opt } return NULL; } + +static int +cfil_dispatch_stats_event_locked(int kcunit, struct cfil_stats_report_buffer *buffer, uint32_t stats_count) +{ + struct content_filter *cfc = NULL; + errno_t error = 0; + size_t msgsize = 0; + + if (buffer == NULL || stats_count == 0) { + return error; + } + + if (content_filters == NULL || kcunit > MAX_CONTENT_FILTER) { + return error; + } + + cfc = content_filters[kcunit - 1]; + if (cfc == NULL) { + return error; + } + + /* Would be wasteful to try */ + if (cfc->cf_flags & CFF_FLOW_CONTROLLED) { + error = ENOBUFS; + goto done; + } + + msgsize = sizeof(struct cfil_msg_stats_report) + (sizeof(struct cfil_msg_sock_stats) * stats_count); + buffer->msghdr.cfm_len = msgsize; + buffer->msghdr.cfm_version = 1; + buffer->msghdr.cfm_type = CFM_TYPE_EVENT; + buffer->msghdr.cfm_op = CFM_OP_STATS; + buffer->msghdr.cfm_sock_id = 0; + buffer->count = stats_count; + +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "STATS (kcunit %d): msg size %lu - %lu %lu %lu", + kcunit, + (unsigned long)msgsize, + (unsigned long)sizeof(struct cfil_msg_stats_report), + (unsigned long)sizeof(struct cfil_msg_sock_stats), + (unsigned long)stats_count); +#endif + + error = ctl_enqueuedata(cfc->cf_kcref, cfc->cf_kcunit, + buffer, + msgsize, + CTL_DATA_EOR); + if (error != 0) { + CFIL_LOG(LOG_ERR, "ctl_enqueuedata() failed: %d", error); + goto done; + } + OSIncrementAtomic(&cfil_stats.cfs_stats_event_ok); + +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT: send msg to %d", kcunit); +#endif + +done: + + if (error == ENOBUFS) { + OSIncrementAtomic( + &cfil_stats.cfs_stats_event_flow_control); + + if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) { + cfil_rw_lock_exclusive(&cfil_lck_rw); + } + + cfc->cf_flags |= CFF_FLOW_CONTROLLED; + + cfil_rw_unlock_exclusive(&cfil_lck_rw); + } else if (error != 0) { + OSIncrementAtomic(&cfil_stats.cfs_stats_event_fail); + } + + return error; +} + +static void +cfil_stats_report_thread_sleep(bool forever) +{ +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS COLLECTION SLEEP"); +#endif + + if (forever) { + (void) assert_wait((event_t) &cfil_sock_attached_stats_count, + THREAD_INTERRUPTIBLE); + } else { + uint64_t deadline = 0; + nanoseconds_to_absolutetime(CFIL_STATS_REPORT_RUN_INTERVAL_NSEC, &deadline); + clock_absolutetime_interval_to_deadline(deadline, &deadline); + + (void) assert_wait_deadline(&cfil_sock_attached_stats_count, + THREAD_INTERRUPTIBLE, deadline); + } +} + +static void +cfil_stats_report_thread_func(void *v, wait_result_t w) +{ +#pragma unused(v, w) + + 
ASSERT(cfil_stats_report_thread == current_thread()); + thread_set_thread_name(current_thread(), "CFIL_STATS_REPORT"); + + // Kick off stats reporting shortly + cfil_stats_report_thread_sleep(false); + thread_block_parameter((thread_continue_t) cfil_stats_report, NULL); + /* NOTREACHED */ +} + +static bool +cfil_stats_collect_flow_stats_for_filter(int kcunit, + struct cfil_info *cfil_info, + struct cfil_entry *entry, + struct timeval current_tv) +{ + struct cfil_stats_report_buffer *buffer = NULL; + struct cfil_msg_sock_stats *flow_array = NULL; + struct cfil_msg_sock_stats *stats = NULL; + struct inpcb *inp = NULL; + struct timeval diff_time; + uint64_t diff_time_usecs; + int index = 0; + + if (entry->cfe_stats_report_frequency == 0) { + return false; + } + + buffer = global_cfil_stats_report_buffers[kcunit - 1]; + if (buffer == NULL) { +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS: no buffer"); +#endif + return false; + } + + timersub(&current_tv, &entry->cfe_stats_report_ts, &diff_time); + diff_time_usecs = (diff_time.tv_sec * USEC_PER_SEC) + diff_time.tv_usec; + +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT - elapsed time - ts %llu %llu cur ts %llu %llu diff %llu %llu (usecs %llu) @freq %llu usecs sockID %llu", + (unsigned long long)entry->cfe_stats_report_ts.tv_sec, + (unsigned long long)entry->cfe_stats_report_ts.tv_usec, + (unsigned long long)current_tv.tv_sec, + (unsigned long long)current_tv.tv_usec, + (unsigned long long)diff_time.tv_sec, + (unsigned long long)diff_time.tv_usec, + (unsigned long long)diff_time_usecs, + (unsigned long long)((entry->cfe_stats_report_frequency * NSEC_PER_MSEC) / NSEC_PER_USEC), + cfil_info->cfi_sock_id); +#endif + + // Compare elapsed time in usecs + if (diff_time_usecs >= (entry->cfe_stats_report_frequency * NSEC_PER_MSEC) / NSEC_PER_USEC) { +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT - in %llu reported %llu", + cfil_info->cfi_byte_inbound_count, + entry->cfe_byte_inbound_count_reported); + CFIL_LOG(LOG_ERR, "CFIL: STATS REPORT - out %llu reported %llu", + cfil_info->cfi_byte_outbound_count, + entry->cfe_byte_outbound_count_reported); +#endif + // Check if flow has new bytes that have not been reported + if (entry->cfe_byte_inbound_count_reported < cfil_info->cfi_byte_inbound_count || + entry->cfe_byte_outbound_count_reported < cfil_info->cfi_byte_outbound_count) { + flow_array = (struct cfil_msg_sock_stats *)&buffer->stats; + index = global_cfil_stats_counts[kcunit - 1]; + + stats = &flow_array[index]; + stats->cfs_sock_id = cfil_info->cfi_sock_id; + stats->cfs_byte_inbound_count = cfil_info->cfi_byte_inbound_count; + stats->cfs_byte_outbound_count = cfil_info->cfi_byte_outbound_count; + + if (entry->cfe_laddr_sent == false) { + /* cache it if necessary */ + if (cfil_info->cfi_so_attach_laddr.sa.sa_len == 0) { + inp = cfil_info->cfi_so ? sotoinpcb(cfil_info->cfi_so) : NULL; + if (inp != NULL) { + boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT); + union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL; + union sockaddr_in_4_6 *dst = outgoing ?
NULL : &cfil_info->cfi_so_attach_laddr; + cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp, + src, dst, inp->inp_vflag & INP_IPV4, outgoing); + } + } + + if (cfil_info->cfi_so_attach_laddr.sa.sa_len != 0) { + stats->cfs_laddr.sin6 = cfil_info->cfi_so_attach_laddr.sin6; + entry->cfe_laddr_sent = true; + } + } + + global_cfil_stats_counts[kcunit - 1]++; + + entry->cfe_stats_report_ts = current_tv; + entry->cfe_byte_inbound_count_reported = cfil_info->cfi_byte_inbound_count; + entry->cfe_byte_outbound_count_reported = cfil_info->cfi_byte_outbound_count; +#if STATS_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: STATS COLLECTED"); +#endif + CFI_ADD_TIME_LOG(cfil_info, &current_tv, &cfil_info->cfi_first_event, CFM_OP_STATS); + return true; + } + } + return false; +} + +static void +cfil_stats_report(void *v, wait_result_t w) +{ +#pragma unused(v, w) + + struct cfil_info *cfil_info = NULL; + struct cfil_entry *entry = NULL; + struct timeval current_tv; + uint32_t flow_count = 0; + uint64_t saved_next_sock_id = 0; // Next sock id to be reported for next loop + bool flow_reported = false; + +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS COLLECTION RUNNING"); +#endif + + do { + // Collect all sock ids of flows that have new stats + cfil_rw_lock_shared(&cfil_lck_rw); + + if (cfil_sock_attached_stats_count == 0) { +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS: no flow"); +#endif + cfil_rw_unlock_shared(&cfil_lck_rw); + goto go_sleep; + } + + for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + if (global_cfil_stats_report_buffers[kcunit - 1] != NULL) { + memset(global_cfil_stats_report_buffers[kcunit - 1], 0, sizeof(struct cfil_stats_report_buffer)); + } + global_cfil_stats_counts[kcunit - 1] = 0; + } + + microuptime(&current_tv); + flow_count = 0; + + TAILQ_FOREACH(cfil_info, &cfil_sock_head_stats, cfi_link_stats) { + if (saved_next_sock_id != 0 && + saved_next_sock_id == cfil_info->cfi_sock_id) { + // Here is where we left off previously, start accumulating + saved_next_sock_id = 0; + } + + if (saved_next_sock_id == 0) { + if (flow_count >= CFIL_STATS_REPORT_MAX_COUNT) { + // Examine a fixed number of flows each round. Remember the current flow + // so we can start from here for the next loop + saved_next_sock_id = cfil_info->cfi_sock_id; + break; + } + + flow_reported = false; + for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + entry = &cfil_info->cfi_entries[kcunit - 1]; + if (entry->cfe_filter == NULL) { +#if STATS_DEBUG + CFIL_LOG(LOG_NOTICE, "CFIL: STATS REPORT - so %llx no filter", + cfil_info->cfi_so ?
(uint64_t)VM_KERNEL_ADDRPERM(cfil_info->cfi_so) : 0); +#endif + continue; + } + + if ((entry->cfe_stats_report_frequency > 0) && + cfil_stats_collect_flow_stats_for_filter(kcunit, cfil_info, entry, current_tv) == true) { + flow_reported = true; + } + } + if (flow_reported == true) { + flow_count++; + } + } + } + + if (flow_count > 0) { +#if STATS_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: STATS reporting for %d flows", flow_count); +#endif + for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + if (global_cfil_stats_report_buffers[kcunit - 1] != NULL && + global_cfil_stats_counts[kcunit - 1] > 0) { + cfil_dispatch_stats_event_locked(kcunit, + global_cfil_stats_report_buffers[kcunit - 1], + global_cfil_stats_counts[kcunit - 1]); + } + } + } else { + cfil_rw_unlock_shared(&cfil_lck_rw); + goto go_sleep; + } + + cfil_rw_unlock_shared(&cfil_lck_rw); + + // Loop again if we haven't finished the whole cfil_info list + } while (saved_next_sock_id != 0); + +go_sleep: + + // Sleep forever (until waken up) if no more flow to report + cfil_rw_lock_shared(&cfil_lck_rw); + cfil_stats_report_thread_sleep(cfil_sock_attached_stats_count == 0 ? true : false); + cfil_rw_unlock_shared(&cfil_lck_rw); + thread_block_parameter((thread_continue_t) cfil_stats_report, NULL); + /* NOTREACHED */ +} diff --git a/bsd/net/content_filter.h b/bsd/net/content_filter.h index b4f4485c5..e3829bf02 100644 --- a/bsd/net/content_filter.h +++ b/bsd/net/content_filter.h @@ -149,6 +149,7 @@ typedef struct cfil_crypto_data { #define CFM_OP_DATA_IN 4 /* data being received */ #define CFM_OP_DISCONNECT_OUT 5 /* no more outgoing data */ #define CFM_OP_DISCONNECT_IN 6 /* no more incoming data */ +#define CFM_OP_STATS 7 /* periodic stats report(s) */ /* * Operations associated with action from filter to kernel @@ -262,6 +263,30 @@ struct cfil_msg_sock_closed { uint32_t cfc_signature_length; } __attribute__((aligned(8))); +/* + * struct cfil_msg_stats_report + * + * Statistics report for flow(s). + * + * Action: No reply is expected. 
+ * + * Valid Types: CFM_TYPE_EVENT + * + * Valid Op: CFM_OP_STATS + */ +struct cfil_msg_sock_stats { + cfil_sock_id_t cfs_sock_id; + uint64_t cfs_byte_inbound_count; + uint64_t cfs_byte_outbound_count; + union sockaddr_in_4_6 cfs_laddr; +} __attribute__((aligned(8))); + +struct cfil_msg_stats_report { + struct cfil_msg_hdr cfr_msghdr; + uint32_t cfr_count; + struct cfil_msg_sock_stats cfr_stats[]; +} __attribute__((aligned(8))); + /* * struct cfil_msg_action * @@ -285,6 +310,7 @@ struct cfil_msg_action { uint64_t cfa_in_peek_offset; uint64_t cfa_out_pass_offset; uint64_t cfa_out_peek_offset; + uint32_t cfa_stats_frequency; // Statistics frequency in milliseconds }; /* @@ -409,6 +435,10 @@ struct cfil_stats { int32_t cfs_data_event_flow_control; int32_t cfs_data_event_fail; + int32_t cfs_stats_event_ok; + int32_t cfs_stats_event_flow_control; + int32_t cfs_stats_event_fail; + int32_t cfs_disconnect_in_event_ok; int32_t cfs_disconnect_out_event_ok; int32_t cfs_disconnect_event_flow_control; diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index d38af044c..4a703aee2 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -2275,7 +2275,8 @@ dlil_input_thread_cont(void *v, wait_result_t wres) lck_mtx_lock_spin(&inp->input_lck); VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING); - if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { + if (!(inp->input_waiting & ~(DLIL_INPUT_RUNNING | + DLIL_INPUT_TERMINATE))) { break; } } @@ -2579,7 +2580,8 @@ skip: lck_mtx_lock_spin(&inp->input_lck); VERIFY(inp->input_waiting & DLIL_INPUT_RUNNING); - if (!(inp->input_waiting & ~DLIL_INPUT_RUNNING)) { + if (!(inp->input_waiting & ~(DLIL_INPUT_RUNNING | + DLIL_INPUT_TERMINATE))) { break; } } diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c index a3c90194d..80bc27d78 100644 --- a/bsd/net/if_bridge.c +++ b/bsd/net/if_bridge.c @@ -188,6 +188,7 @@ #define BR_DBGF_MCAST 0x0080 #define BR_DBGF_HOSTFILTER 0x0100 #define BR_DBGF_CHECKSUM 0x0200 +#define BR_DBGF_MAC_NAT 0x0400 #endif /* BRIDGE_DEBUG */ #define _BRIDGE_LOCK(_sc) lck_mtx_lock(&(_sc)->sc_mtx) @@ -207,6 +208,7 @@ #define BRIDGE_UNREF(_sc) bridge_unref(_sc) #define BRIDGE_XLOCK(_sc) bridge_xlock(_sc) #define BRIDGE_XDROP(_sc) bridge_xdrop(_sc) +#define IF_BRIDGE_DEBUG(f) bridge_debug_flag_is_set(f) #else /* !BRIDGE_DEBUG */ @@ -241,12 +243,14 @@ (_sc)->sc_iflist_xcnt--; \ } while (0) +#define IF_BRIDGE_DEBUG(f) FALSE + #endif /* BRIDGE_DEBUG */ #if NBPFILTER > 0 #define BRIDGE_BPF_MTAP_INPUT(sc, m) \ - if (sc->sc_bpf_input) \ - bridge_bpf_input(sc->sc_ifp, m) + if (sc->sc_bpf_input != NULL) \ + bridge_bpf_input(sc->sc_ifp, m, __func__, __LINE__) #else /* NBPFILTER */ #define BRIDGE_BPF_MTAP_INPUT(ifp, m) #endif /* NBPFILTER */ @@ -287,6 +291,15 @@ #define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60) #endif +/* + * Number of MAC NAT entries + * - sized based on 16 clients (including MAC NAT interface) + * each with 4 addresses + */ +#ifndef BRIDGE_MAC_NAT_ENTRY_MAX +#define BRIDGE_MAC_NAT_ENTRY_MAX 64 +#endif /* BRIDGE_MAC_NAT_ENTRY_MAX */ + /* * List of capabilities to possibly mask on the member interface. 
*/ @@ -326,6 +339,66 @@ struct bridge_iflist { #define BIFF_HF_IPSRC 0x40 /* host filter source IP is set */ #define BIFF_INPUT_BROADCAST 0x80 /* send broadcast packets in */ +/* + * mac_nat_entry + * - translates between an IP address and MAC address on a specific + * bridge interface member + */ +struct mac_nat_entry { + LIST_ENTRY(mac_nat_entry) mne_list; /* list linkage */ + struct bridge_iflist *mne_bif; /* originating interface */ + unsigned long mne_expire; /* expiration time */ + union { + struct in_addr mneu_ip; /* originating IPv4 address */ + struct in6_addr mneu_ip6; /* originating IPv6 address */ + } mne_u; + uint8_t mne_mac[ETHER_ADDR_LEN]; + uint8_t mne_flags; + uint8_t mne_reserved; +}; +#define mne_ip mne_u.mneu_ip +#define mne_ip6 mne_u.mneu_ip6 + +#define MNE_FLAGS_IPV6 0x01 /* IPv6 address */ + +LIST_HEAD(mac_nat_entry_list, mac_nat_entry); + +/* + * mac_nat_record + * - used by bridge_mac_nat_output() to convey the translation that needs + * to take place in bridge_mac_nat_translate + * - holds enough information so that the translation can be done later without + * holding the bridge lock + */ +struct mac_nat_record { + uint16_t mnr_ether_type; + union { + uint16_t mnru_arp_offset; + struct { + uint16_t mnruip_dhcp_flags; + uint16_t mnruip_udp_csum; + uint8_t mnruip_header_len; + } mnru_ip; + struct { + uint16_t mnruip6_icmp6_len; + uint16_t mnruip6_lladdr_offset; + uint8_t mnruip6_icmp6_type; + uint8_t mnruip6_header_len; + } mnru_ip6; + } mnr_u; +}; + +#define mnr_arp_offset mnr_u.mnru_arp_offset + +#define mnr_ip_header_len mnr_u.mnru_ip.mnruip_header_len +#define mnr_ip_dhcp_flags mnr_u.mnru_ip.mnruip_dhcp_flags +#define mnr_ip_udp_csum mnr_u.mnru_ip.mnruip_udp_csum + +#define mnr_ip6_icmp6_len mnr_u.mnru_ip6.mnruip6_icmp6_len +#define mnr_ip6_icmp6_type mnr_u.mnru_ip6.mnruip6_icmp6_type +#define mnr_ip6_header_len mnr_u.mnru_ip6.mnruip6_header_len +#define mnr_ip6_lladdr_offset mnr_u.mnru_ip6.mnruip6_lladdr_offset + /* * Bridge route node. */ @@ -390,6 +463,12 @@ struct bridge_softc { u_char sc_defaddr[6]; /* Default MAC address */ char sc_if_xname[IFNAMSIZ]; + struct bridge_iflist *sc_mac_nat_bif; /* single MAC NAT interface */ + struct mac_nat_entry_list sc_mne_list; /* MAC NAT IPv4 */ + struct mac_nat_entry_list sc_mne_list_v6;/* MAC NAT IPv6 */ + uint32_t sc_mne_max; /* max # of entries */ + uint32_t sc_mne_count; /* cur. 
# of entries */ + uint32_t sc_mne_allocation_failures; #if BRIDGE_DEBUG /* * Locking and unlocking calling history @@ -401,9 +480,9 @@ struct bridge_softc { #endif /* BRIDGE_DEBUG */ }; -#define SCF_DETACHING 0x01 -#define SCF_RESIZING 0x02 -#define SCF_MEDIA_ACTIVE 0x04 +#define SCF_DETACHING 0x01 +#define SCF_RESIZING 0x02 +#define SCF_MEDIA_ACTIVE 0x04 typedef enum { kChecksumOperationNone = 0, @@ -419,6 +498,7 @@ decl_lck_mtx_data(static, bridge_list_mtx); static int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; static zone_t bridge_rtnode_pool = NULL; +static zone_t bridge_mne_pool = NULL; static int bridge_clone_create(struct if_clone *, uint32_t, void *); static int bridge_clone_destroy(struct ifnet *); @@ -440,13 +520,15 @@ static void bridge_ifstop(struct ifnet *, int); static int bridge_output(struct ifnet *, struct mbuf *); static void bridge_finalize_cksum(struct ifnet *, struct mbuf *); static void bridge_start(struct ifnet *); -__private_extern__ errno_t bridge_input(struct ifnet *, struct mbuf *, void *); +static errno_t bridge_input(struct ifnet *, mbuf_t *); +static errno_t bridge_iff_input(void *, ifnet_t, protocol_family_t, + mbuf_t *, char **); static errno_t bridge_iff_output(void *, ifnet_t, protocol_family_t, mbuf_t *); static errno_t bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, - mbuf_t m); + mbuf_t *m); -static int bridge_enqueue(struct bridge_softc *, struct ifnet *, +static int bridge_enqueue(ifnet_t, struct ifnet *, struct ifnet *, struct mbuf *, ChecksumOperation); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); @@ -539,20 +621,24 @@ static int bridge_ioctl_gfilt(struct bridge_softc *, void *); static int bridge_ioctl_sfilt(struct bridge_softc *, void *); static int bridge_ioctl_ghostfilter(struct bridge_softc *, void *); static int bridge_ioctl_shostfilter(struct bridge_softc *, void *); +static int bridge_ioctl_gmnelist32(struct bridge_softc *, void *); +static int bridge_ioctl_gmnelist64(struct bridge_softc *, void *); #ifdef PFIL_HOOKS static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, int); -static int bridge_ip_checkbasic(struct mbuf **); -#ifdef INET6 -static int bridge_ip6_checkbasic(struct mbuf **); -#endif /* INET6 */ static int bridge_fragment(struct ifnet *, struct mbuf *, struct ether_header *, int, struct llc *); #endif /* PFIL_HOOKS */ +static int bridge_ip_checkbasic(struct mbuf **); +#ifdef INET6 +static int bridge_ip6_checkbasic(struct mbuf **); +#endif /* INET6 */ + +static int bridge_pf(struct mbuf **, struct ifnet *, uint32_t sc_filter_flags, int input); static errno_t bridge_set_bpf_tap(ifnet_t, bpf_tap_mode, bpf_packet_func); -__private_extern__ errno_t bridge_bpf_input(ifnet_t, struct mbuf *); -__private_extern__ errno_t bridge_bpf_output(ifnet_t, struct mbuf *); +static errno_t bridge_bpf_input(ifnet_t, struct mbuf *, const char *, int); +static errno_t bridge_bpf_output(ifnet_t, struct mbuf *); static void bridge_detach(ifnet_t); static void bridge_link_event(struct ifnet *, u_int32_t); @@ -562,7 +648,21 @@ static int interface_media_active(struct ifnet *); static void bridge_schedule_delayed_call(struct bridge_delayed_call *); static void bridge_cancel_delayed_call(struct bridge_delayed_call *); static void bridge_cleanup_delayed_call(struct bridge_delayed_call *); -static int bridge_host_filter(struct bridge_iflist *, struct mbuf *); +static int bridge_host_filter(struct bridge_iflist *, mbuf_t *); + +static errno_t bridge_mac_nat_enable(struct bridge_softc *, + 
struct bridge_iflist *); +static void bridge_mac_nat_disable(struct bridge_softc *sc); +static void bridge_mac_nat_age_entries(struct bridge_softc *sc, unsigned long); +static void bridge_mac_nat_populate_entries(struct bridge_softc *sc); +static void bridge_mac_nat_flush_entries(struct bridge_softc *sc, + struct bridge_iflist *); +static ifnet_t bridge_mac_nat_input(struct bridge_softc *, mbuf_t *, + boolean_t *); +static boolean_t bridge_mac_nat_output(struct bridge_softc *, + struct bridge_iflist *, mbuf_t *, struct mac_nat_record *); +static void bridge_mac_nat_translate(mbuf_t *, struct mac_nat_record *, + const caddr_t); #define m_copypacket(m, how) m_copym(m, 0, M_COPYALL, how) @@ -748,6 +848,9 @@ static const struct bridge_control bridge_control_table32[] = { .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, { .bc_func = bridge_ioctl_shostfilter, .bc_argsize = sizeof(struct ifbrhostfilter), .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + + { .bc_func = bridge_ioctl_gmnelist32, .bc_argsize = sizeof(struct ifbrmnelist32), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, }; static const struct bridge_control bridge_control_table64[] = { @@ -846,6 +949,9 @@ static const struct bridge_control bridge_control_table64[] = { .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, { .bc_func = bridge_ioctl_shostfilter, .bc_argsize = sizeof(struct ifbrhostfilter), .bc_flags = BC_F_COPYIN | BC_F_SUSER }, + + { .bc_func = bridge_ioctl_gmnelist64, .bc_argsize = sizeof(struct ifbrmnelist64), + .bc_flags = BC_F_COPYIN | BC_F_COPYOUT }, }; static const unsigned int bridge_control_table_size = @@ -1096,6 +1202,12 @@ link_print(struct bridge_softc * sc) printf("\n"); } +static boolean_t +bridge_debug_flag_is_set(uint32_t flag) +{ + return (if_bridge_debug & flag) != 0; +} + #endif /* BRIDGE_DEBUG */ /* @@ -1114,6 +1226,10 @@ bridgeattach(int n) 1024 * sizeof(struct bridge_rtnode), 0, "bridge_rtnode"); zone_change(bridge_rtnode_pool, Z_CALLERACCT, FALSE); + bridge_mne_pool = zinit(sizeof(struct mac_nat_entry), + 256 * sizeof(struct mac_nat_entry), 0, "bridge_mac_nat_entry"); + zone_change(bridge_mne_pool, Z_CALLERACCT, FALSE); + lck_grp_attr = lck_grp_attr_alloc_init(); bridge_lock_grp = lck_grp_alloc_init("if_bridge", lck_grp_attr); @@ -1237,16 +1353,9 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) lck_mtx_init(&sc->sc_mtx, bridge_lock_grp, bridge_lock_attr); sc->sc_brtmax = BRIDGE_RTABLE_MAX; + sc->sc_mne_max = BRIDGE_MAC_NAT_ENTRY_MAX; sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT; - sc->sc_filter_flags = IFBF_FILT_DEFAULT; -#ifndef BRIDGE_IPF - /* - * For backwards compatibility with previous behaviour... - * Switch off filtering on the bridge itself if BRIDGE_IPF is - * not defined. 
- */ - sc->sc_filter_flags &= ~IFBF_FILT_USEIPF; -#endif + sc->sc_filter_flags = 0; TAILQ_INIT(&sc->sc_iflist); @@ -1295,6 +1404,8 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) __func__, error); goto done; } + LIST_INIT(&sc->sc_mne_list); + LIST_INIT(&sc->sc_mne_list_v6); sc->sc_ifp = ifp; error = bridge_ifnet_set_attrs(ifp); if (error != 0) { @@ -1353,7 +1464,7 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) sc->sc_flags &= ~SCF_MEDIA_ACTIVE; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { link_print(sc); } #endif @@ -1512,7 +1623,7 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) BRIDGE_LOCK_ASSERT_NOTHELD(sc); #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_IOCTL) { + if (IF_BRIDGE_DEBUG(BR_DBGF_IOCTL)) { printf("%s: ifp %s cmd 0x%08lx (%c%c [%lu] %c %lu)\n", __func__, ifp->if_xname, cmd, (cmd & IOC_IN) ? 'I' : ' ', (cmd & IOC_OUT) ? 'O' : ' ', IOCPARM_LEN(cmd), @@ -1778,7 +1889,7 @@ bridge_set_tso(struct bridge_softc *sc) error = ifnet_set_offload(sc->sc_ifp, offload); if (error != 0) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: ifnet_set_offload(%s, 0x%x) " "failed %d\n", __func__, sc->sc_ifp->if_xname, offload, error); @@ -1798,7 +1909,7 @@ bridge_set_tso(struct bridge_softc *sc) tso_v4_mtu); if (error != 0) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: ifnet_set_tso_mtu(%s, " "AF_INET, %u) failed %d\n", __func__, sc->sc_ifp->if_xname, @@ -1816,7 +1927,7 @@ bridge_set_tso(struct bridge_softc *sc) tso_v6_mtu); if (error != 0) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: ifnet_set_tso_mtu(%s, " "AF_INET6, %u) failed %d\n", __func__, sc->sc_ifp->if_xname, @@ -1897,7 +2008,7 @@ bridge_iff_input(void *cookie, ifnet_t ifp, protocol_family_t protocol, frmlen = (char *)mbuf_data(m) - *frame_ptr; } #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_INPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { printf("%s: %s from %s m 0x%llx data 0x%llx frame 0x%llx %s " "frmlen %lu\n", __func__, sc->sc_ifp->if_xname, ifp->if_xname, (uint64_t)VM_KERNEL_ADDRPERM(m), @@ -1905,7 +2016,7 @@ bridge_iff_input(void *cookie, ifnet_t ifp, protocol_family_t protocol, (uint64_t)VM_KERNEL_ADDRPERM(*frame_ptr), included ? 
"inside" : "outside", frmlen); - if (if_bridge_debug & BR_DBGF_MBUF) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MBUF)) { printf_mbuf(m, "bridge_iff_input[", "\n"); printf_ether_header((struct ether_header *) (void *)*frame_ptr); @@ -1914,28 +2025,59 @@ bridge_iff_input(void *cookie, ifnet_t ifp, protocol_family_t protocol, } } #endif /* BRIDGE_DEBUG */ + if (included == 0) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { + printf("%s: frame_ptr outside mbuf\n", __func__); + } + goto out; + } /* Move data pointer to start of frame to the link layer header */ - if (included) { - (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, - mbuf_len(m) + frmlen); - (void) mbuf_pkthdr_adjustlen(m, frmlen); - } else { - printf("%s: frame_ptr outside mbuf\n", __func__); + (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, + mbuf_len(m) + frmlen); + (void) mbuf_pkthdr_adjustlen(m, frmlen); + + /* make sure we can access the ethernet header */ + if (mbuf_pkthdr_len(m) < sizeof(struct ether_header)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { + printf("%s: short frame %lu < %lu\n", __func__, + mbuf_pkthdr_len(m), sizeof(struct ether_header)); + } goto out; } + if (mbuf_len(m) < sizeof(struct ether_header)) { + error = mbuf_pullup(data, sizeof(struct ether_header)); + if (error != 0) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { + printf("%s: mbuf_pullup(%lu) failed %d\n", + __func__, sizeof(struct ether_header), + error); + } + error = EJUSTRETURN; + goto out; + } + if (m != *data) { + m = *data; + *frame_ptr = mbuf_data(m); + } + } - error = bridge_input(ifp, m, *frame_ptr); + error = bridge_input(ifp, data); /* Adjust packet back to original */ if (error == 0) { + /* bridge_input might have modified *data */ + if (*data != m) { + m = *data; + *frame_ptr = mbuf_data(m); + } (void) mbuf_setdata(m, (char *)mbuf_data(m) + frmlen, mbuf_len(m) - frmlen); (void) mbuf_pkthdr_adjustlen(m, -frmlen); } #if BRIDGE_DEBUG - if ((if_bridge_debug & BR_DBGF_INPUT) && - (if_bridge_debug & BR_DBGF_MBUF)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT) && + IF_BRIDGE_DEBUG(BR_DBGF_MBUF)) { printf("\n"); printf_mbuf(m, "bridge_iff_input]", "\n"); } @@ -1962,7 +2104,7 @@ bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol, } #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_OUTPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_OUTPUT)) { printf("%s: %s from %s m 0x%llx data 0x%llx\n", __func__, sc->sc_ifp->if_xname, ifp->if_xname, (uint64_t)VM_KERNEL_ADDRPERM(m), @@ -1970,7 +2112,7 @@ bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol, } #endif /* BRIDGE_DEBUG */ - error = bridge_member_output(sc, ifp, m); + error = bridge_member_output(sc, ifp, data); if (error != 0 && error != EJUSTRETURN) { printf("%s: bridge_member_output failed error %d\n", __func__, error); @@ -1993,7 +2135,7 @@ bridge_iff_event(void *cookie, ifnet_t ifp, protocol_family_t protocol, event_msg->kev_class == KEV_NETWORK_CLASS && event_msg->kev_subclass == KEV_DL_SUBCLASS) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s event_code %u - %s\n", __func__, ifp->if_xname, event_msg->event_code, dlil_kev_dl_code_str(event_msg->event_code)); @@ -2061,7 +2203,7 @@ bridge_iff_detached(void *cookie, ifnet_t ifp) struct bridge_iflist *bif = (struct bridge_iflist *)cookie; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s\n", __func__, ifp->if_xname); } #endif /* BRIDGE_DEBUG */ @@ -2090,7 +2232,7 @@ 
bridge_attach_protocol(struct ifnet *ifp) struct ifnet_attach_proto_param reg; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s\n", __func__, ifp->if_xname); } #endif /* BRIDGE_DEBUG */ @@ -2113,7 +2255,7 @@ bridge_detach_protocol(struct ifnet *ifp) int error; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s\n", __func__, ifp->if_xname); } #endif /* BRIDGE_DEBUG */ @@ -2144,13 +2286,21 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, VERIFY(ifs != NULL); /* - * First, remove the member from the list first so it cannot be found anymore + * Remove the member from the list first so it cannot be found anymore * when we release the bridge lock below */ BRIDGE_XLOCK(sc); TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next); BRIDGE_XDROP(sc); + if (sc->sc_mac_nat_bif != NULL) { + if (bif == sc->sc_mac_nat_bif) { + bridge_mac_nat_disable(sc); + } else { + bridge_mac_nat_flush_entries(sc, bif); + } + } + if (!gone) { switch (ifs->if_type) { case IFT_ETHER: @@ -2293,6 +2443,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) uint8_t eaddr[ETHER_ADDR_LEN]; struct iff_filter iff; u_int32_t event_code = 0; + boolean_t mac_nat = FALSE; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) { @@ -2307,9 +2458,10 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) } /* If it's in the span list, it can't be a member. */ - TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) - if (ifs == bif->bif_ifp) { - return EBUSY; + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { + if (ifs == bif->bif_ifp) { + return EBUSY; + } } if (ifs->if_bridge == sc) { @@ -2322,6 +2474,11 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) switch (ifs->if_type) { case IFT_ETHER: + if (strcmp(ifs->if_name, "en") == 0 && + ifs->if_subfamily == IFNET_SUBFAMILY_WIFI) { + /* XXX is there a better way to identify Wi-Fi STA? 
*/ + mac_nat = TRUE; + } case IFT_L2VLAN: /* permitted interface types */ break; @@ -2332,27 +2489,36 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) return EINVAL; } + /* fail to add the interface if the MTU doesn't match */ + if (!TAILQ_EMPTY(&sc->sc_iflist) && sc->sc_ifp->if_mtu != ifs->if_mtu) { + printf("%s: %s: invalid MTU for %s", __func__, + sc->sc_ifp->if_xname, + ifs->if_xname); + return EINVAL; + } + + /* there's already an interface that's doing MAC NAT */ + if (mac_nat && sc->sc_mac_nat_bif != NULL) { + return EBUSY; + } bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_WAITOK | M_ZERO); if (bif == NULL) { return ENOMEM; } - bif->bif_ifp = ifs; ifnet_reference(ifs); - bif->bif_ifflags = IFBIF_LEARNING | IFBIF_DISCOVER; + bif->bif_ifflags |= IFBIF_LEARNING | IFBIF_DISCOVER; #if HAS_IF_CAP bif->bif_savedcaps = ifs->if_capenable; #endif /* HAS_IF_CAP */ bif->bif_sc = sc; + if (mac_nat) { + (void)bridge_mac_nat_enable(sc, bif); + } /* Allow the first Ethernet member to define the MTU */ if (TAILQ_EMPTY(&sc->sc_iflist)) { sc->sc_ifp->if_mtu = ifs->if_mtu; - } else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { - printf("%s: %s: invalid MTU for %s", __func__, - sc->sc_ifp->if_xname, - ifs->if_xname); - return EINVAL; } /* @@ -2523,6 +2689,8 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) req->ifbr_proto = bp->bp_protover; req->ifbr_role = bp->bp_role; req->ifbr_stpflags = bp->bp_flags; + req->ifbr_ifsflags = bif->bif_ifflags; + /* Copy STP state options as flags */ if (bp->bp_operedge) { req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; @@ -2543,7 +2711,6 @@ bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; } - req->ifbr_ifsflags = bif->bif_ifflags; req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; req->ifbr_addrcnt = bif->bif_addrcnt; req->ifbr_addrmax = bif->bif_addrmax; @@ -2571,6 +2738,15 @@ bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) /* SPAN is readonly */ return EINVAL; } + if ((req->ifbr_ifsflags & IFBIF_MAC_NAT) != 0) { + errno_t error; + error = bridge_mac_nat_enable(sc, bif); + if (error != 0) { + return error; + } + } else if (sc->sc_mac_nat_bif != NULL) { + bridge_mac_nat_disable(sc); + } #if BRIDGESTP @@ -3020,11 +3196,9 @@ bridge_ioctl_sfilt(struct bridge_softc *sc, void *arg) return EINVAL; } -#ifndef BRIDGE_IPF if (param->ifbrp_filter & IFBF_FILT_USEIPF) { return EINVAL; } -#endif sc->sc_filter_flags = param->ifbrp_filter; @@ -3343,6 +3517,113 @@ bridge_ioctl_shostfilter(struct bridge_softc *sc, void *arg) return 0; } +static char * +bridge_mac_nat_entry_out(struct mac_nat_entry_list * list, + unsigned int * count_p, char *buf, unsigned int *len_p) +{ + unsigned int count = *count_p; + struct ifbrmne ifbmne; + unsigned int len = *len_p; + struct mac_nat_entry *mne; + unsigned long now; + + bzero(&ifbmne, sizeof(ifbmne)); + LIST_FOREACH(mne, list, mne_list) { + if (len < sizeof(ifbmne)) { + break; + } + snprintf(ifbmne.ifbmne_ifname, sizeof(ifbmne.ifbmne_ifname), + "%s", mne->mne_bif->bif_ifp->if_xname); + memcpy(ifbmne.ifbmne_mac, mne->mne_mac, + sizeof(ifbmne.ifbmne_mac)); + now = (unsigned long) net_uptime(); + if (now < mne->mne_expire) { + ifbmne.ifbmne_expire = mne->mne_expire - now; + } else { + ifbmne.ifbmne_expire = 0; + } + if ((mne->mne_flags & MNE_FLAGS_IPV6) != 0) { + ifbmne.ifbmne_af = AF_INET6; + ifbmne.ifbmne_ip6_addr = mne->mne_ip6; + } else { + ifbmne.ifbmne_af = AF_INET; + ifbmne.ifbmne_ip_addr = mne->mne_ip; + } + memcpy(buf, &ifbmne, sizeof(ifbmne)); + count++; + buf += sizeof(ifbmne); + 
len -= sizeof(ifbmne); + } + *count_p = count; + *len_p = len; + return buf; +} + +/* + * bridge_ioctl_gmnelist() + * Perform the get mac_nat_entry list ioctl. + * + * Note: + * The struct ifbrmnelist32 and struct ifbrmnelist64 have the same + * field size/layout except for the last field ifbml_buf, the user-supplied + * buffer pointer. That is passed in separately via the 'user_addr' + * parameter from the respective 32-bit or 64-bit ioctl routine. + */ +static int +bridge_ioctl_gmnelist(struct bridge_softc *sc, struct ifbrmnelist32 *mnl, + user_addr_t user_addr) +{ + unsigned int count; + char *buf; + int error = 0; + char *outbuf = NULL; + struct mac_nat_entry *mne; + unsigned int buflen; + unsigned int len; + + mnl->ifbml_elsize = sizeof(struct ifbrmne); + count = 0; + LIST_FOREACH(mne, &sc->sc_mne_list, mne_list) + count++; + LIST_FOREACH(mne, &sc->sc_mne_list_v6, mne_list) + count++; + buflen = sizeof(struct ifbrmne) * count; + if (buflen == 0 || mnl->ifbml_len == 0) { + mnl->ifbml_len = buflen; + return error; + } + BRIDGE_UNLOCK(sc); + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); + BRIDGE_LOCK(sc); + count = 0; + buf = outbuf; + len = min(mnl->ifbml_len, buflen); + buf = bridge_mac_nat_entry_out(&sc->sc_mne_list, &count, buf, &len); + buf = bridge_mac_nat_entry_out(&sc->sc_mne_list_v6, &count, buf, &len); + mnl->ifbml_len = count * sizeof(struct ifbrmne); + BRIDGE_UNLOCK(sc); + error = copyout(outbuf, user_addr, mnl->ifbml_len); + _FREE(outbuf, M_TEMP); + BRIDGE_LOCK(sc); + return error; +} + +static int +bridge_ioctl_gmnelist64(struct bridge_softc *sc, void *arg) +{ + struct ifbrmnelist64 *mnl = arg; + + return bridge_ioctl_gmnelist(sc, arg, mnl->ifbml_buf); +} + +static int +bridge_ioctl_gmnelist32(struct bridge_softc *sc, void *arg) +{ + struct ifbrmnelist32 *mnl = arg; + + return bridge_ioctl_gmnelist(sc, arg, + CAST_USER_ADDR_T(mnl->ifbml_buf)); +} /* * bridge_ifdetach: @@ -3357,7 +3638,7 @@ bridge_ifdetach(struct ifnet *ifp) struct bridge_softc *sc = ifp->if_bridge; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s\n", __func__, ifp->if_xname); } #endif /* BRIDGE_DEBUG */ @@ -3400,7 +3681,7 @@ bridge_proto_attach_changed(struct ifnet *ifp) struct bridge_softc *sc = ifp->if_bridge; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s\n", __func__, ifp->if_xname); } #endif /* BRIDGE_DEBUG */ @@ -3429,7 +3710,7 @@ bridge_proto_attach_changed(struct ifnet *ifp) } BRIDGE_UNLOCK(sc); #if BRIDGE_DEBUG - if ((if_bridge_debug & BR_DBGF_LIFECYCLE) != 0 && changed) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: input broadcast %s", ifp->if_xname, input_broadcast ? 
"ENABLED" : "DISABLED"); } @@ -3506,7 +3787,7 @@ bridge_iflinkevent(struct ifnet *ifp) u_int32_t event_code = 0; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s\n", __func__, ifp->if_xname); } #endif /* BRIDGE_DEBUG */ @@ -3524,6 +3805,9 @@ bridge_iflinkevent(struct ifnet *ifp) } else { bif->bif_flags &= ~BIFF_MEDIA_ACTIVE; } + if (sc->sc_mac_nat_bif != NULL) { + bridge_mac_nat_flush_entries(sc, bif); + } event_code = bridge_updatelinkstatus(sc); } @@ -3565,7 +3849,7 @@ bridge_delayed_callback(void *param) BRIDGE_LOCK(sc); #if BRIDGE_DEBUG_DELAYED_CALLBACK - if (if_bridge_debug & BR_DBGF_DELAYED_CALL) { + if (IF_BRIDGE_DEBUG(BR_DBGF_DELAYED_CALL)) { printf("%s: %s call 0x%llx flags 0x%x\n", __func__, sc->sc_if_xname, (uint64_t)VM_KERNEL_ADDRPERM(call), call->bdc_flags); @@ -3612,7 +3896,7 @@ bridge_schedule_delayed_call(struct bridge_delayed_call *call) call->bdc_flags = BDCF_OUTSTANDING; #if BRIDGE_DEBUG_DELAYED_CALLBACK - if (if_bridge_debug & BR_DBGF_DELAYED_CALL) { + if (IF_BRIDGE_DEBUG(BR_DBGF_DELAYED_CALL)) { printf("%s: %s call 0x%llx flags 0x%x\n", __func__, sc->sc_if_xname, (uint64_t)VM_KERNEL_ADDRPERM(call), call->bdc_flags); @@ -3659,7 +3943,7 @@ bridge_cancel_delayed_call(struct bridge_delayed_call *call) while (call->bdc_flags & BDCF_OUTSTANDING) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_DELAYED_CALL) { + if (IF_BRIDGE_DEBUG(BR_DBGF_DELAYED_CALL)) { printf("%s: %s call 0x%llx flags 0x%x\n", __func__, sc->sc_if_xname, (uint64_t)VM_KERNEL_ADDRPERM(call), call->bdc_flags); @@ -3821,7 +4105,7 @@ bridge_compute_cksum(struct ifnet *src_if, struct ifnet *dst_if, struct mbuf *m) #endif /* INET6 */ } #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_CHECKSUM) { + if (IF_BRIDGE_DEBUG(BR_DBGF_CHECKSUM)) { printf("%s: [%s -> %s] before 0x%x did 0x%x after 0x%x\n", __func__, src_if->if_xname, dst_if->if_xname, csum_flags, did_sw, @@ -3837,7 +4121,7 @@ bridge_compute_cksum(struct ifnet *src_if, struct ifnet *dst_if, struct mbuf *m) * */ static int -bridge_enqueue(struct bridge_softc *sc, struct ifnet *src_ifp, +bridge_enqueue(ifnet_t bridge_ifp, struct ifnet *src_ifp, struct ifnet *dst_ifp, struct mbuf *m, ChecksumOperation cksum_op) { int len, error = 0; @@ -3907,9 +4191,9 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *src_ifp, } if (_error == 0) { - (void) ifnet_stat_increment_out(sc->sc_ifp, 1, len, 0); + (void) ifnet_stat_increment_out(bridge_ifp, 1, len, 0); } else { - (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + (void) ifnet_stat_increment_out(bridge_ifp, 0, 0, 1); } } @@ -3950,8 +4234,7 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) return; } } - - (void) bridge_enqueue(sc, NULL, ifp, m, kChecksumOperationNone); + (void) bridge_enqueue(sc->sc_ifp, NULL, ifp, m, kChecksumOperationNone); } #endif /* HAS_BRIDGE_DUMMYNET */ @@ -3965,14 +4248,18 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) * The mbuf has the Ethernet header already attached. 
*/ static errno_t -bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) +bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t *data) { + ifnet_t bridge_ifp; struct ether_header *eh; struct ifnet *dst_if; uint16_t vlan; + struct bridge_iflist *mac_nat_bif; + ifnet_t mac_nat_ifp; + mbuf_t m = *data; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_OUTPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_OUTPUT)) { printf("%s: ifp %s\n", __func__, ifp->if_xname); } #endif /* BRIDGE_DEBUG */ @@ -3980,7 +4267,8 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) if (m->m_len < ETHER_HDR_LEN) { m = m_pullup(m, ETHER_HDR_LEN); if (m == NULL) { - return ENOBUFS; + *data = NULL; + return EJUSTRETURN; } } @@ -3988,6 +4276,19 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) vlan = VLANTAGOF(m); BRIDGE_LOCK(sc); + mac_nat_bif = sc->sc_mac_nat_bif; + mac_nat_ifp = (mac_nat_bif != NULL) ? mac_nat_bif->bif_ifp : NULL; + if (mac_nat_ifp == ifp) { + /* record the IP address used by the MAC NAT interface */ + (void)bridge_mac_nat_output(sc, mac_nat_bif, data, NULL); + m = *data; + if (m == NULL) { + /* packet was deallocated */ + BRIDGE_UNLOCK(sc); + return EJUSTRETURN; + } + } + bridge_ifp = sc->sc_ifp; /* * APPLE MODIFICATION @@ -4004,7 +4305,7 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) * go ahead and send out that interface. Otherwise, the packet * is dropped below. */ - if ((sc->sc_ifp->if_flags & IFF_RUNNING) == 0) { + if ((bridge_ifp->if_flags & IFF_RUNNING) == 0) { dst_if = ifp; goto sendunicast; } @@ -4030,7 +4331,7 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) BRIDGE_LOCK2REF(sc, error); if (error != 0) { m_freem(m); - return error; + return EJUSTRETURN; } TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { @@ -4046,16 +4347,26 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) if ((dst_if->if_flags & IFF_RUNNING) == 0) { continue; } - - /* - * If this is not the original output interface, - * and the interface is participating in spanning - * tree, make sure the port is in a state that - * allows forwarding. - */ - if (dst_if != ifp && (bif->bif_ifflags & IFBIF_STP) && - bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { - continue; + if (dst_if != ifp) { + /* + * If this is not the original output interface, + * and the interface is participating in spanning + * tree, make sure the port is in a state that + * allows forwarding. + */ + if ((bif->bif_ifflags & IFBIF_STP) && + bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { + continue; + } + /* + * If this is not the original output interface, + * and the destination is the MAC NAT interface, + * drop the packet. The packet can't be sent + * if the source MAC is incorrect. 
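+ * (The MAC NAT interface, e.g. a Wi-Fi STA, can only transmit + * frames bearing its own source MAC address, so such a frame + * would be rejected upstream anyway.)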
+ */ + if (dst_if == mac_nat_ifp) { + continue; + } } if (TAILQ_NEXT(bif, bif_next) == NULL) { used = 1; @@ -4064,12 +4375,12 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m) mc = m_dup(m, M_DONTWAIT); if (mc == NULL) { (void) ifnet_stat_increment_out( - sc->sc_ifp, 0, 0, 1); + bridge_ifp, 0, 0, 1); continue; } } - (void) bridge_enqueue(sc, ifp, dst_if, mc, - kChecksumOperationCompute); + (void) bridge_enqueue(bridge_ifp, ifp, dst_if, + mc, kChecksumOperationCompute); } if (used == 0) { m_freem(m); @@ -4095,8 +4406,18 @@ sendunicast: /* just let the packet continue on its way */ return 0; } - (void) bridge_enqueue(sc, ifp, dst_if, m, - kChecksumOperationCompute); + if (dst_if != mac_nat_ifp) { + (void) bridge_enqueue(bridge_ifp, ifp, dst_if, m, + kChecksumOperationCompute); + } else { + /* + * This is not the original output interface + * and the destination is the MAC NAT interface. + * Drop the packet because the packet can't be sent + * if the source MAC is incorrect. + */ + m_freem(m); + } return EJUSTRETURN; } @@ -4111,11 +4432,10 @@ bridge_output(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc = ifnet_softc(ifp); struct ether_header *eh; - struct ifnet *dst_if; + struct ifnet *dst_if = NULL; int error = 0; eh = mtod(m, struct ether_header *); - dst_if = NULL; BRIDGE_LOCK(sc); @@ -4135,8 +4455,11 @@ bridge_output(struct ifnet *ifp, struct mbuf *m) /* callee will unlock */ bridge_broadcast(sc, NULL, m, 0); } else { + ifnet_t bridge_ifp; + + bridge_ifp = sc->sc_ifp; BRIDGE_UNLOCK(sc); - error = bridge_enqueue(sc, NULL, dst_if, m, + error = bridge_enqueue(bridge_ifp, NULL, dst_if, m, kChecksumOperationFinalize); } @@ -4244,25 +4567,29 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, struct mbuf *m) { struct bridge_iflist *dbif; - struct ifnet *src_if, *dst_if, *ifp; + ifnet_t bridge_ifp; + struct ifnet *src_if, *dst_if; struct ether_header *eh; uint16_t vlan; uint8_t *dst; int error; + struct mac_nat_record mnr; + boolean_t translate_mac = FALSE; + uint32_t sc_filter_flags = 0; BRIDGE_LOCK_ASSERT_HELD(sc); + bridge_ifp = sc->sc_ifp; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_OUTPUT) { - printf("%s: %s m 0x%llx\n", __func__, sc->sc_ifp->if_xname, + if (IF_BRIDGE_DEBUG(BR_DBGF_OUTPUT)) { + printf("%s: %s m 0x%llx\n", __func__, bridge_ifp->if_xname, (uint64_t)VM_KERNEL_ADDRPERM(m)); } #endif /* BRIDGE_DEBUG */ src_if = m->m_pkthdr.rcvif; - ifp = sc->sc_ifp; - (void) ifnet_stat_increment_in(ifp, 1, m->m_pkthdr.len, 0); + (void) ifnet_stat_increment_in(bridge_ifp, 1, m->m_pkthdr.len, 0); vlan = VLANTAGOF(m); @@ -4324,7 +4651,7 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, /* ...forward it to all interfaces. 
*/ - atomic_add_64(&ifp->if_imcasts, 1); + atomic_add_64(&bridge_ifp->if_imcasts, 1); dst_if = NULL; } @@ -4340,10 +4667,8 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, #if NBPFILTER > 0 if (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH) || dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) { - m->m_pkthdr.rcvif = ifp; - if (sc->sc_bpf_input) { - bridge_bpf_input(ifp, m); - } + m->m_pkthdr.rcvif = bridge_ifp; + BRIDGE_BPF_MTAP_INPUT(sc, m); } #endif /* NBPFILTER */ @@ -4351,7 +4676,7 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, /* run the packet filter */ if (PFIL_HOOKED(&inet_pfil_hook) || PFIL_HOOKED_INET6) { BRIDGE_UNLOCK(sc); - if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0) { + if (bridge_pfil(&m, bridge_ifp, src_if, PFIL_IN) != 0) { return; } if (m == NULL) { @@ -4362,6 +4687,7 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, #endif /* PFIL_HOOKS */ if (dst_if == NULL) { + /* bridge_broadcast will unlock */ bridge_broadcast(sc, src_if, m, 1); return; } @@ -4398,17 +4724,27 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, if ((dst_if->if_extflags & IFEXTF_DHCPRA_MASK) != 0) { m = ip_xdhcpra_output(dst_if, m); if (!m) { - ++sc->sc_sc.sc_ifp.if_xdhcpra; + ++bridge_ifp->if_xdhcpra; + BRIDGE_UNLOCK(sc); return; } } #endif /* HAS_DHCPRA_MASK */ - BRIDGE_UNLOCK(sc); + if (dbif == sc->sc_mac_nat_bif) { + /* determine how to translate the packet */ + translate_mac + = bridge_mac_nat_output(sc, sbif, &m, &mnr); + if (m == NULL) { + /* packet was deallocated */ + BRIDGE_UNLOCK(sc); + return; + } + } #if defined(PFIL_HOOKS) if (PFIL_HOOKED(&inet_pfil_hook) || PFIL_HOOKED_INET6) { - if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) { + if (bridge_pfil(&m, bridge_ifp, dst_if, PFIL_OUT) != 0) { return; } if (m == NULL) { @@ -4417,6 +4753,21 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, } #endif /* PFIL_HOOKS */ + sc_filter_flags = sc->sc_filter_flags; + BRIDGE_UNLOCK(sc); + if (PF_IS_ENABLED && (sc_filter_flags & IFBF_FILT_MEMBER)) { + if (bridge_pf(&m, dst_if, sc_filter_flags, FALSE) != 0) { + return; + } + if (m == NULL) { + return; + } + } + + /* if we need to, translate the MAC address */ + if (translate_mac) { + bridge_mac_nat_translate(&m, &mnr, IF_LLADDR(dst_if)); + } /* * This is an inbound packet where the checksum * (if applicable) is already present/valid. Since * we are just doing link-level addressing (and * forwarding), there's no need to validate the checksum. * Clear the checksum offload flags and send it along.
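 * * The checksum operations used throughout this file, for reference: * kChecksumOperationClear - forwarded packets (as here): clear the * offload flags and trust the checksum * kChecksumOperationCompute - locally originated frames sent through * a member (bridge_member_output) * kChecksumOperationFinalize - frames output by the bridge interface * itself * kChecksumOperationNone - span ports and dummynet re-injection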
*/ - (void) bridge_enqueue(sc, NULL, dst_if, m, kChecksumOperationClear); + if (m != NULL) { + (void) bridge_enqueue(bridge_ifp, NULL, dst_if, m, + kChecksumOperationClear); + } return; drop: @@ -4434,9 +4788,7 @@ drop: #if BRIDGE_DEBUG -char *ether_ntop(char *, size_t, const u_char *); - -__private_extern__ char * +static char * ether_ntop(char *buf, size_t len, const u_char *ap) { snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x", @@ -4447,27 +4799,44 @@ ether_ntop(char *buf, size_t len, const u_char *ap) #endif /* BRIDGE_DEBUG */ +static void +inject_input_packet(ifnet_t ifp, mbuf_t m) +{ + mbuf_pkthdr_setrcvif(m, ifp); + mbuf_pkthdr_setheader(m, mbuf_data(m)); + mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, + mbuf_len(m) - ETHER_HDR_LEN); + mbuf_pkthdr_adjustlen(m, -ETHER_HDR_LEN); + m->m_flags |= M_PROTO1; /* set to avoid loops */ + dlil_input_packet_list(ifp, m); + return; +} + /* * bridge_input: * * Filter input from a member interface. Queue the packet for * bridging if it is not for us. */ -__private_extern__ errno_t -bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) +errno_t +bridge_input(struct ifnet *ifp, mbuf_t *data) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif, *bif2; - struct ifnet *bifp; + ifnet_t bridge_ifp; struct ether_header *eh; struct mbuf *mc, *mc2; uint16_t vlan; - int error; + errno_t error; + boolean_t is_ifp_mac = FALSE; + mbuf_t m = *data; + uint32_t sc_filter_flags = 0; + bridge_ifp = sc->sc_ifp; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_INPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { printf("%s: %s from %s m 0x%llx data 0x%llx\n", __func__, - sc->sc_ifp->if_xname, ifp->if_xname, + bridge_ifp->if_xname, ifp->if_xname, (uint64_t)VM_KERNEL_ADDRPERM(m), (uint64_t)VM_KERNEL_ADDRPERM(mbuf_data(m))); } @@ -4475,15 +4844,14 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) if ((sc->sc_ifp->if_flags & IFF_RUNNING) == 0) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_INPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { printf("%s: %s not running passing along\n", - __func__, sc->sc_ifp->if_xname); + __func__, bridge_ifp->if_xname); } #endif /* BRIDGE_DEBUG */ return 0; } - bifp = sc->sc_ifp; vlan = VLANTAGOF(m); #ifdef IFF_MONITOR @@ -4493,10 +4861,10 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) * the bpf(4) machinery, but before we do, increment the byte and * packet counters associated with this interface. */ - if ((bifp->if_flags & IFF_MONITOR) != 0) { - m->m_pkthdr.rcvif = bifp; + if ((bridge_ifp->if_flags & IFF_MONITOR) != 0) { + m->m_pkthdr.rcvif = bridge_ifp; BRIDGE_BPF_MTAP_INPUT(sc, m); - (void) ifnet_stat_increment_in(bifp, 1, m->m_pkthdr.len, 0); + (void) ifnet_stat_increment_in(bridge_ifp, 1, m->m_pkthdr.len, 0); m_freem(m); return EJUSTRETURN; } @@ -4510,29 +4878,47 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) mbuf_setflags_mask(m, 0, MBUF_PROMISC); } + sc_filter_flags = sc->sc_filter_flags; + if (PF_IS_ENABLED && (sc_filter_flags & IFBF_FILT_MEMBER)) { + error = bridge_pf(&m, ifp, sc_filter_flags, TRUE); + if (error != 0) { + return EJUSTRETURN; + } + if (m == NULL) { + return EJUSTRETURN; + } + /* + * On success, bridge_pf may have modified the mbuf pointer + * while doing its processing. Update *data so that we don't + * use a stale pointer.
+ */ + *data = m; + } + BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { BRIDGE_UNLOCK(sc); #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_INPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { printf("%s: %s bridge_lookup_member_if failed\n", - __func__, sc->sc_ifp->if_xname); + __func__, bridge_ifp->if_xname); } #endif /* BRIDGE_DEBUG */ return 0; } if (bif->bif_flags & BIFF_HOST_FILTER) { - error = bridge_host_filter(bif, m); + error = bridge_host_filter(bif, data); if (error != 0) { - if (if_bridge_debug & BR_DBGF_INPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { printf("%s: %s bridge_host_filter failed\n", __func__, bif->bif_ifp->if_xname); } BRIDGE_UNLOCK(sc); return EJUSTRETURN; } + m = *data; } eh = mtod(m, struct ether_header *); @@ -4541,7 +4927,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) if (m->m_flags & (M_BCAST | M_MCAST)) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_MCAST) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MCAST)) { if ((m->m_flags & M_MCAST)) { printf("%s: multicast: " "%02x:%02x:%02x:%02x:%02x:%02x\n", @@ -4598,7 +4984,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) * here from ether_input as a bridge is never a member of a * bridge. */ - VERIFY(bifp->if_bridge == NULL); + VERIFY(bridge_ifp->if_bridge == NULL); mc2 = m_dup(m, M_DONTWAIT); if (mc2 != NULL) { /* Keep the layer3 header aligned */ @@ -4607,30 +4993,27 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) } if (mc2 != NULL) { /* mark packet as arriving on the bridge */ - mc2->m_pkthdr.rcvif = bifp; + mc2->m_pkthdr.rcvif = bridge_ifp; mc2->m_pkthdr.pkt_hdr = mbuf_data(mc2); -#if NBPFILTER > 0 - if (sc->sc_bpf_input) { - bridge_bpf_input(bifp, mc2); - } -#endif /* NBPFILTER */ + BRIDGE_BPF_MTAP_INPUT(sc, m); + (void) mbuf_setdata(mc2, (char *)mbuf_data(mc2) + ETHER_HDR_LEN, mbuf_len(mc2) - ETHER_HDR_LEN); (void) mbuf_pkthdr_adjustlen(mc2, -ETHER_HDR_LEN); - (void) ifnet_stat_increment_in(bifp, 1, + (void) ifnet_stat_increment_in(bridge_ifp, 1, mbuf_pkthdr_len(mc2), 0); #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_MCAST) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MCAST)) { printf("%s: %s mcast for us\n", __func__, - sc->sc_ifp->if_xname); + bridge_ifp->if_xname); } #endif /* BRIDGE_DEBUG */ - dlil_input_packet_list(bifp, mc2); + dlil_input_packet_list(bridge_ifp, mc2); } /* Return the original packet for local processing. */ @@ -4699,13 +5082,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) } \ } \ BRIDGE_UNLOCK(sc); \ - mbuf_pkthdr_setrcvif(m, iface); \ - mbuf_pkthdr_setheader(m, mbuf_data(m)); \ - mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, \ - mbuf_len(m) - ETHER_HDR_LEN); \ - mbuf_pkthdr_adjustlen(m, -ETHER_HDR_LEN); \ - m->m_flags |= M_PROTO1; /* set to avoid loops */ \ - dlil_input_packet_list(iface, m); \ + inject_input_packet(iface, m); \ return (EJUSTRETURN); \ } \ \ @@ -4720,16 +5097,44 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) /* * Unicast. 
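 * A unicast frame is handled in one of three ways: it is addressed * to the receiving member itself (where MAC NAT may redirect it), to * the bridge interface (delivered locally), or to another station * (looked up in the forwarding table and forwarded).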
*/ + if (memcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) { + is_ifp_mac = TRUE; + } + + /* handle MAC-NAT if enabled */ + if (is_ifp_mac && sc->sc_mac_nat_bif == bif) { + ifnet_t dst_if; + boolean_t is_input = FALSE; + + dst_if = bridge_mac_nat_input(sc, data, &is_input); + m = *data; + if (dst_if == ifp) { + /* our input packet */ + } else if (dst_if != NULL || m == NULL) { + BRIDGE_UNLOCK(sc); + if (dst_if != NULL) { + ASSERT(m != NULL); + if (is_input) { + inject_input_packet(dst_if, m); + } else { + (void)bridge_enqueue(bridge_ifp, NULL, + dst_if, m, + kChecksumOperationClear); + } + } + return EJUSTRETURN; + } + } + /* - * If the packet is for us, set the packets source as the - * bridge, and return the packet back to ether_input for - * local processing. + * If the packet is for the bridge, set the packet's source interface + * and return the packet back to ether_input for local processing. */ - if (memcmp(eh->ether_dhost, IF_LLADDR(bifp), - ETHER_ADDR_LEN) == 0 || CARP_CHECK_WE_ARE_DST(bifp)) { + if (memcmp(eh->ether_dhost, IF_LLADDR(bridge_ifp), + ETHER_ADDR_LEN) == 0 || CARP_CHECK_WE_ARE_DST(bridge_ifp)) { /* Mark the packet as arriving on the bridge interface */ - (void) mbuf_pkthdr_setrcvif(m, bifp); - mbuf_pkthdr_setheader(m, frame_header); + (void) mbuf_pkthdr_setrcvif(m, bridge_ifp); + mbuf_pkthdr_setheader(m, mbuf_data(m)); /* * If the interface is learning, and the source @@ -4747,18 +5152,18 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) mbuf_len(m) - ETHER_HDR_LEN); (void) mbuf_pkthdr_adjustlen(m, -ETHER_HDR_LEN); - (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0); + (void) ifnet_stat_increment_in(bridge_ifp, 1, mbuf_pkthdr_len(m), 0); BRIDGE_UNLOCK(sc); #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_INPUT) { + if (IF_BRIDGE_DEBUG(BR_DBGF_INPUT)) { printf("%s: %s packet for bridge\n", __func__, - sc->sc_ifp->if_xname); + bridge_ifp->if_xname); } #endif /* BRIDGE_DEBUG */ - dlil_input_packet_list(bifp, m); + dlil_input_packet_list(bridge_ifp, m); return EJUSTRETURN; } @@ -4776,7 +5181,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m, void *frame_header) * bridge's own MAC address, because the bridge may be * using the SAME MAC address as one of its interfaces */ - if (memcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) { + if (is_ifp_mac) { #ifdef VERY_VERY_VERY_DIAGNOSTIC printf("%s: not forwarding packet bound for member " @@ -4821,27 +5226,40 @@ static void bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, struct mbuf *m, int runfilt) { -#ifndef PFIL_HOOKS -#pragma unused(runfilt) -#endif + ifnet_t bridge_ifp; struct bridge_iflist *dbif, *sbif; struct mbuf *mc; struct mbuf *mc_in; struct ifnet *dst_if; int error = 0, used = 0; - boolean_t is_output; + boolean_t bridge_if_out; ChecksumOperation cksum_op; + struct mac_nat_record mnr; + struct bridge_iflist *mac_nat_bif = sc->sc_mac_nat_bif; + boolean_t translate_mac = FALSE; + uint32_t sc_filter_flags = 0; + bridge_ifp = sc->sc_ifp; if (src_if != NULL) { - is_output = FALSE; + bridge_if_out = FALSE; cksum_op = kChecksumOperationClear; sbif = bridge_lookup_member_if(sc, src_if); + if (sbif != NULL && mac_nat_bif != NULL && sbif != mac_nat_bif) { + /* get the translation record while holding the lock */ + translate_mac + = bridge_mac_nat_output(sc, sbif, &m, &mnr); + if (m == NULL) { + /* packet was deallocated */ + BRIDGE_UNLOCK(sc); + return; + } + } } else { /* * src_if is NULL when the bridge interface calls * bridge_broadcast(). 
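 * (bridge_output passes src_if == NULL for locally originated * frames; bridge_forward passes the receiving member instead, which * selects the kChecksumOperationClear case above.)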
*/ - is_output = TRUE; + bridge_if_out = TRUE; cksum_op = kChecksumOperationFinalize; sbif = NULL; } @@ -4855,7 +5273,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, #ifdef PFIL_HOOKS /* Filter on the bridge interface before broadcasting */ if (runfilt && (PFIL_HOOKED(&inet_pfil_hook) || PFIL_HOOKED_INET6)) { - if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) { + if (bridge_pfil(&m, bridge_ifp, NULL, PFIL_OUT) != 0) { goto out; } if (m == NULL) { @@ -4863,7 +5281,6 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, } } #endif /* PFIL_HOOKS */ - TAILQ_FOREACH(dbif, &sc->sc_iflist, bif_next) { dst_if = dbif->bif_ifp; if (dst_if == src_if) { @@ -4901,7 +5318,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, } else { mc = m_dup(m, M_DONTWAIT); if (mc == NULL) { - (void) ifnet_stat_increment_out(sc->sc_ifp, + (void) ifnet_stat_increment_out(bridge_ifp, 0, 0, 1); continue; } @@ -4911,7 +5328,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, * If broadcast input is enabled, do so only if this * is an input packet. */ - if (!is_output && + if (!bridge_if_out && (dbif->bif_flags & BIFF_INPUT_BROADCAST) != 0) { mc_in = m_dup(mc, M_DONTWAIT); /* this could fail, but we continue anyways */ @@ -4933,7 +5350,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, mc = m_copyup(mc, i, ETHER_ALIGN); if (mc == NULL) { (void) ifnet_stat_increment_out( - sc->sc_ifp, 0, 0, 1); + bridge_ifp, 0, 0, 1); if (mc_in != NULL) { m_freem(mc_in); } @@ -4956,7 +5373,48 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, #endif /* PFIL_HOOKS */ /* out */ - (void) bridge_enqueue(sc, NULL, dst_if, mc, cksum_op); + if (translate_mac && mac_nat_bif == dbif) { + /* translate the packet without holding the lock */ + bridge_mac_nat_translate(&mc, &mnr, IF_LLADDR(dst_if)); + } + + sc_filter_flags = sc->sc_filter_flags; + if (runfilt && + PF_IS_ENABLED && (sc_filter_flags & IFBF_FILT_MEMBER)) { + if (used == 0) { + /* Keep the layer3 header aligned */ + int i = min(mc->m_pkthdr.len, max_protohdr); + mc = m_copyup(mc, i, ETHER_ALIGN); + if (mc == NULL) { + (void) ifnet_stat_increment_out( + sc->sc_ifp, 0, 0, 1); + if (mc_in != NULL) { + m_freem(mc_in); + mc_in = NULL; + } + continue; + } + } + if (bridge_pf(&mc, dst_if, sc_filter_flags, FALSE) != 0) { + if (mc_in != NULL) { + m_freem(mc_in); + mc_in = NULL; + } + continue; + } + if (mc == NULL) { + if (mc_in != NULL) { + m_freem(mc_in); + mc_in = NULL; + } + continue; + } + } + + if (mc != NULL) { + (void) bridge_enqueue(bridge_ifp, + NULL, dst_if, mc, cksum_op); + } /* in */ if (mc_in == NULL) { @@ -5012,7 +5470,7 @@ bridge_span(struct bridge_softc *sc, struct mbuf *m) continue; } - (void) bridge_enqueue(sc, NULL, dst_if, mc, + (void) bridge_enqueue(sc->sc_ifp, NULL, dst_if, mc, kChecksumOperationNone); } } @@ -5067,6 +5525,9 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, */ brt = zalloc_noblock(bridge_rtnode_pool); if (brt == NULL) { + if (IF_BRIDGE_DEBUG(BR_DBGF_RT_TABLE)) { + printf("%s: zalloc_noblock failed\n", __func__); + } return ENOMEM; } bzero(brt, sizeof(struct bridge_rtnode)); @@ -5088,7 +5549,7 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, brt->brt_dst = bif; bif->bif_addrcnt++; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_RT_TABLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_RT_TABLE)) { printf("%s: added %02x:%02x:%02x:%02x:%02x:%02x " "on %s count %u hashsize %u\n", __func__, dst[0], dst[1], dst[2],
dst[3], dst[4], dst[5], @@ -5184,7 +5645,6 @@ bridge_aging_timer(struct bridge_softc *sc) BRIDGE_LOCK_ASSERT_HELD(sc); bridge_rtage(sc); - if ((sc->sc_ifp->if_flags & IFF_RUNNING) && (sc->sc_flags & SCF_DETACHING) == 0) { sc->sc_aging_timer.bdc_sc = sc; @@ -5216,6 +5676,9 @@ bridge_rtage(struct bridge_softc *sc) } } } + if (sc->sc_mac_nat_bif != NULL) { + bridge_mac_nat_age_entries(sc, now); + } } /* @@ -5265,7 +5728,7 @@ bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) /* * bridge_rtdelete: * - * Delete routes to a speicifc member interface. + * Delete routes to a specific member interface. */ static void bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full) @@ -5382,7 +5845,7 @@ bridge_rthash_delayed_resize(struct bridge_softc *sc) out: if (error == 0) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_RT_TABLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_RT_TABLE)) { printf("%s: %s new size %u\n", __func__, sc->sc_ifp->if_xname, sc->sc_rthash_size); } @@ -5555,7 +6018,7 @@ bridge_rtnode_hash(struct bridge_softc *sc, struct bridge_rtnode *brt) dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) { #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_RT_TABLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_RT_TABLE)) { printf("%s: %s EEXIST " "%02x:%02x:%02x:%02x:%02x:%02x\n", __func__, sc->sc_ifp->if_xname, @@ -5578,7 +6041,7 @@ bridge_rtnode_hash(struct bridge_softc *sc, struct bridge_rtnode *brt) } while (lbrt != NULL); #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_RT_TABLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_RT_TABLE)) { printf("%s: %s impossible %02x:%02x:%02x:%02x:%02x:%02x\n", __func__, sc->sc_ifp->if_xname, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2], @@ -6004,6 +6467,7 @@ bad: *mp = NULL; return error; } +#endif /* PFIL_HOOKS */ /* * Perform basic checks on header size since @@ -6037,7 +6501,7 @@ bridge_ip_checkbasic(struct mbuf **mp) ipstat.ips_toosmall++; goto bad; } - } else if (__predict_false(m->m_len < sizeof(struct ip))) { + } else if (OS_EXPECT((size_t)m->m_len < sizeof(struct ip), 0)) { if ((m = m_pullup(m, sizeof(struct ip))) == NULL) { ipstat.ips_toosmall++; goto bad; @@ -6048,12 +6512,12 @@ bridge_ip_checkbasic(struct mbuf **mp) goto bad; } - if (ip->ip_v != IPVERSION) { + if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { ipstat.ips_badvers++; goto bad; } - hlen = ip->ip_hl << 2; - if (hlen < sizeof(struct ip)) { /* minimum header length */ + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + if (hlen < (int)sizeof(struct ip)) { /* minimum header length */ ipstat.ips_badhlen++; goto bad; } @@ -6140,7 +6604,7 @@ bridge_ip6_checkbasic(struct mbuf **mp) in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } - } else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) { + } else if (OS_EXPECT((size_t)m->m_len < sizeof(struct ip6_hdr), 0)) { struct ifnet *inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { ip6stat.ip6s_toosmall++; @@ -6167,6 +6631,7 @@ bad: } #endif /* INET6 */ +#ifdef PFIL_HOOKS /* * bridge_fragment: * @@ -6299,18 +6764,19 @@ bridge_detach(ifnet_t ifp) * * Invoke the input BPF callback if enabled */ -__private_extern__ errno_t -bridge_bpf_input(ifnet_t ifp, struct mbuf *m) +static errno_t +bridge_bpf_input(ifnet_t ifp, struct mbuf *m, const char * func, int line) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + bpf_packet_func input_func = sc->sc_bpf_input; - if (sc->sc_bpf_input) { + if (input_func != NULL) { if (mbuf_pkthdr_rcvif(m) != ifp) { - 
printf("%s: rcvif: 0x%llx != ifp 0x%llx\n", __func__, + printf("%s.%d: rcvif: 0x%llx != ifp 0x%llx\n", func, line, (uint64_t)VM_KERNEL_ADDRPERM(mbuf_pkthdr_rcvif(m)), (uint64_t)VM_KERNEL_ADDRPERM(ifp)); } - (*sc->sc_bpf_input)(ifp, m); + (*input_func)(ifp, m); } return 0; } @@ -6320,13 +6786,14 @@ bridge_bpf_input(ifnet_t ifp, struct mbuf *m) * * Invoke the output BPF callback if enabled */ -__private_extern__ errno_t +static errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m) { struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + bpf_packet_func output_func = sc->sc_bpf_output; - if (sc->sc_bpf_output) { - (*sc->sc_bpf_output)(ifp, m); + if (output_func != NULL) { + (*output_func)(ifp, m); } return 0; } @@ -6346,7 +6813,7 @@ bridge_link_event(struct ifnet *ifp, u_int32_t event_code) } event; #if BRIDGE_DEBUG - if (if_bridge_debug & BR_DBGF_LIFECYCLE) { + if (IF_BRIDGE_DEBUG(BR_DBGF_LIFECYCLE)) { printf("%s: %s event_code %u - %s\n", __func__, ifp->if_xname, event_code, dlil_kev_dl_code_str(event_code)); } @@ -6364,12 +6831,13 @@ bridge_link_event(struct ifnet *ifp, u_int32_t event_code) ifnet_event(ifp, &event.header); } -#define BRIDGE_HF_DROP(reason, func, line) { \ - bridge_hostfilter_stats.reason++; \ - if (if_bridge_debug & BR_DBGF_HOSTFILTER) \ - printf("%s.%d" #reason, func, line); \ - error = EINVAL; \ -} +#define BRIDGE_HF_DROP(reason, func, line) { \ + bridge_hostfilter_stats.reason++; \ + if (IF_BRIDGE_DEBUG(BR_DBGF_HOSTFILTER)) { \ + printf("%s.%d" #reason, func, line); \ + error = EINVAL; \ + } \ + } /* * Make sure this is a DHCP or Bootp request that match the host filter @@ -6423,24 +6891,13 @@ done: } static int -bridge_host_filter(struct bridge_iflist *bif, struct mbuf *m) +bridge_host_filter(struct bridge_iflist *bif, mbuf_t *data) { int error = EINVAL; struct ether_header *eh; static struct in_addr inaddr_any = { .s_addr = INADDR_ANY }; + mbuf_t m = *data; - /* - * Check the Ethernet header is large enough - */ - if (mbuf_pkthdr_len(m) < sizeof(struct ether_header)) { - BRIDGE_HF_DROP(brhf_ether_too_small, __func__, __LINE__); - goto done; - } - if (mbuf_len(m) < sizeof(struct ether_header) && - mbuf_pullup(&m, sizeof(struct ether_header)) != 0) { - BRIDGE_HF_DROP(brhf_ether_pullup_failed, __func__, __LINE__); - goto done; - } eh = mtod(m, struct ether_header *); /* @@ -6468,11 +6925,13 @@ bridge_host_filter(struct bridge_iflist *bif, struct mbuf *m) BRIDGE_HF_DROP(brhf_arp_too_small, __func__, __LINE__); goto done; } - if (mbuf_len(m) < minlen && mbuf_pullup(&m, minlen) != 0) { + if (mbuf_len(m) < minlen && mbuf_pullup(data, minlen) != 0) { BRIDGE_HF_DROP(brhf_arp_pullup_failed, __func__, __LINE__); goto done; } + m = *data; + /* * Verify this is an ethernet/ip arp */ @@ -6528,9 +6987,6 @@ bridge_host_filter(struct bridge_iflist *bif, struct mbuf *m) BRIDGE_HF_DROP(brhf_arp_bad_spa, __func__, __LINE__); goto done; } - /* - * - */ bridge_hostfilter_stats.brhf_arp_ok += 1; error = 0; } else if (eh->ether_type == htons(ETHERTYPE_IP)) { @@ -6629,7 +7085,7 @@ bridge_host_filter(struct bridge_iflist *bif, struct mbuf *m) } done: if (error != 0) { - if (if_bridge_debug & BR_DBGF_HOSTFILTER) { + if (IF_BRIDGE_DEBUG(BR_DBGF_HOSTFILTER)) { if (m) { printf_mbuf_data(m, 0, sizeof(struct ether_header) + @@ -6644,3 +7100,1453 @@ done: } return error; } + +/* + * MAC NAT + */ + +static errno_t +bridge_mac_nat_enable(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + errno_t error = 0; + + BRIDGE_LOCK_ASSERT_HELD(sc); + + if (sc->sc_mac_nat_bif != NULL) { + if 
(sc->sc_mac_nat_bif != bif) { + error = EBUSY; + } + goto done; + } + sc->sc_mac_nat_bif = bif; + bif->bif_ifflags |= IFBIF_MAC_NAT; + bridge_mac_nat_populate_entries(sc); + +done: + return error; +} + +static void +bridge_mac_nat_disable(struct bridge_softc *sc) +{ + struct bridge_iflist *mac_nat_bif = sc->sc_mac_nat_bif; + + assert(mac_nat_bif != NULL); + bridge_mac_nat_flush_entries(sc, mac_nat_bif); + mac_nat_bif->bif_ifflags &= ~IFBIF_MAC_NAT; + sc->sc_mac_nat_bif = NULL; + return; +} + +static void +mac_nat_entry_print2(struct mac_nat_entry *mne, + char *ifname, const char *msg1, const char *msg2) +{ + int af; + char etopbuf[24]; + char ntopbuf[MAX_IPv6_STR_LEN]; + const char *space; + + af = ((mne->mne_flags & MNE_FLAGS_IPV6) != 0) ? AF_INET6 : AF_INET; + ether_ntop(etopbuf, sizeof(etopbuf), mne->mne_mac); + (void)inet_ntop(af, &mne->mne_u, ntopbuf, sizeof(ntopbuf)); + if (msg2 == NULL) { + msg2 = ""; + space = ""; + } else { + space = " "; + } + printf("%s %s%s%s %p (%s, %s, %s)\n", + ifname, msg1, space, msg2, mne, mne->mne_bif->bif_ifp->if_xname, + ntopbuf, etopbuf); +} + +static void +mac_nat_entry_print(struct mac_nat_entry *mne, + char *ifname, const char *msg) +{ + mac_nat_entry_print2(mne, ifname, msg, NULL); +} + +static struct mac_nat_entry * +bridge_lookup_mac_nat_entry(struct bridge_softc *sc, int af, void * ip) +{ + struct mac_nat_entry *mne; + struct mac_nat_entry *ret_mne = NULL; + + if (af == AF_INET) { + in_addr_t s_addr = ((struct in_addr *)ip)->s_addr; + + LIST_FOREACH(mne, &sc->sc_mne_list, mne_list) { + if (mne->mne_ip.s_addr == s_addr) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + mac_nat_entry_print(mne, sc->sc_if_xname, + "found"); + } + ret_mne = mne; + break; + } + } + } else { + const struct in6_addr *ip6 = (const struct in6_addr *)ip; + + LIST_FOREACH(mne, &sc->sc_mne_list_v6, mne_list) { + if (IN6_ARE_ADDR_EQUAL(&mne->mne_ip6, ip6)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + mac_nat_entry_print(mne, sc->sc_if_xname, + "found"); + } + ret_mne = mne; + break; + } + } + } + return ret_mne; +} + +static void +bridge_destroy_mac_nat_entry(struct bridge_softc *sc, + struct mac_nat_entry *mne, const char *reason) +{ + LIST_REMOVE(mne, mne_list); + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + mac_nat_entry_print(mne, sc->sc_if_xname, reason); + } + zfree(bridge_mne_pool, mne); + sc->sc_mne_count--; +} + +static struct mac_nat_entry * +bridge_create_mac_nat_entry(struct bridge_softc *sc, + struct bridge_iflist *bif, int af, const void *ip, uint8_t *eaddr) +{ + struct mac_nat_entry_list *list; + struct mac_nat_entry *mne; + + if (sc->sc_mne_count >= sc->sc_mne_max) { + sc->sc_mne_allocation_failures++; + return NULL; + } + mne = zalloc_noblock(bridge_mne_pool); + if (mne == NULL) { + sc->sc_mne_allocation_failures++; + return NULL; + } + sc->sc_mne_count++; + bzero(mne, sizeof(*mne)); + bcopy(eaddr, mne->mne_mac, sizeof(mne->mne_mac)); + mne->mne_bif = bif; + if (af == AF_INET) { + bcopy(ip, &mne->mne_ip, sizeof(mne->mne_ip)); + list = &sc->sc_mne_list; + } else { + bcopy(ip, &mne->mne_ip6, sizeof(mne->mne_ip6)); + mne->mne_flags |= MNE_FLAGS_IPV6; + list = &sc->sc_mne_list_v6; + } + LIST_INSERT_HEAD(list, mne, mne_list); + mne->mne_expire = (unsigned long)net_uptime() + sc->sc_brttimeout; + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + mac_nat_entry_print(mne, sc->sc_if_xname, "created"); + } + return mne; +} + +static struct mac_nat_entry * +bridge_update_mac_nat_entry(struct bridge_softc *sc, + struct bridge_iflist *bif, int af, void *ip, uint8_t *eaddr) +{ + 
struct mac_nat_entry *mne; + + mne = bridge_lookup_mac_nat_entry(sc, af, ip); + if (mne != NULL) { + struct bridge_iflist *mac_nat_bif = sc->sc_mac_nat_bif; + + if (mne->mne_bif == mac_nat_bif) { + /* the MAC NAT interface takes precedence */ + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + if (mne->mne_bif != bif) { + mac_nat_entry_print2(mne, + sc->sc_if_xname, "reject", + bif->bif_ifp->if_xname); + } + } + } else if (mne->mne_bif != bif) { + const char *old_if = mne->mne_bif->bif_ifp->if_xname; + + mne->mne_bif = bif; + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + mac_nat_entry_print2(mne, + sc->sc_if_xname, "replaced", + old_if); + } + bcopy(eaddr, mne->mne_mac, sizeof(mne->mne_mac)); + } + mne->mne_expire = (unsigned long)net_uptime() + + sc->sc_brttimeout; + } else { + mne = bridge_create_mac_nat_entry(sc, bif, af, ip, eaddr); + } + return mne; +} + +static void +bridge_mac_nat_flush_entries_common(struct bridge_softc *sc, + struct mac_nat_entry_list *list, struct bridge_iflist *bif) +{ + struct mac_nat_entry *mne; + struct mac_nat_entry *tmne; + + LIST_FOREACH_SAFE(mne, list, mne_list, tmne) { + if (bif != NULL && mne->mne_bif != bif) { + continue; + } + bridge_destroy_mac_nat_entry(sc, mne, "flushed"); + } +} + +/* + * bridge_mac_nat_flush_entries: + * + * Flush MAC NAT entries for the specified member. Flush all entries if + * the member is the one that requires MAC NAT, otherwise just flush the + * ones for the specified member. + */ +static void +bridge_mac_nat_flush_entries(struct bridge_softc *sc, struct bridge_iflist * bif) +{ + struct bridge_iflist *flush_bif; + + flush_bif = (bif == sc->sc_mac_nat_bif) ? NULL : bif; + bridge_mac_nat_flush_entries_common(sc, &sc->sc_mne_list, flush_bif); + bridge_mac_nat_flush_entries_common(sc, &sc->sc_mne_list_v6, flush_bif); +} + +static void +bridge_mac_nat_populate_entries(struct bridge_softc *sc) +{ + errno_t error; + ifnet_t ifp; + ifaddr_t *list; + struct bridge_iflist *mac_nat_bif = sc->sc_mac_nat_bif; + + assert(mac_nat_bif != NULL); + ifp = mac_nat_bif->bif_ifp; + error = ifnet_get_address_list(ifp, &list); + if (error != 0) { + printf("%s: ifnet_get_address_list(%s) failed %d\n", + __func__, ifp->if_xname, error); + return; + } + for (ifaddr_t *scan = list; *scan != NULL; scan++) { + sa_family_t af; + void *ip; + + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + af = ifaddr_address_family(*scan); + switch (af) { + case AF_INET: + case AF_INET6: + error = ifaddr_address(*scan, &u.sa, sizeof(u)); + if (error != 0) { + printf("%s: ifaddr_address failed %d\n", + __func__, error); + break; + } + if (af == AF_INET) { + ip = (void *)&u.sin.sin_addr; + } else { + if (IN6_IS_ADDR_LINKLOCAL(&u.sin6.sin6_addr)) { + /* remove scope ID */ + u.sin6.sin6_addr.s6_addr16[1] = 0; + } + ip = (void *)&u.sin6.sin6_addr; + } + bridge_create_mac_nat_entry(sc, mac_nat_bif, af, ip, + (uint8_t *)IF_LLADDR(ifp)); + break; + default: + break; + } + } + ifnet_free_address_list(list); + return; +} + +static void +bridge_mac_nat_age_entries_common(struct bridge_softc *sc, + struct mac_nat_entry_list *list, unsigned long now) +{ + struct mac_nat_entry *mne; + struct mac_nat_entry *tmne; + + LIST_FOREACH_SAFE(mne, list, mne_list, tmne) { + if (now >= mne->mne_expire) { + bridge_destroy_mac_nat_entry(sc, mne, "aged out"); + } + } +} + +static void +bridge_mac_nat_age_entries(struct bridge_softc *sc, unsigned long now) +{ + if (sc->sc_mac_nat_bif == NULL) { + return; + } + bridge_mac_nat_age_entries_common(sc, &sc->sc_mne_list, 
now); + bridge_mac_nat_age_entries_common(sc, &sc->sc_mne_list_v6, now); +} + +static const char * +get_in_out_string(boolean_t is_output) +{ + return is_output ? "OUT" : "IN"; +} + +/* + * is_valid_arp_packet: + * Verify that this is a valid ARP packet. + * + * Returns TRUE if the packet is valid, FALSE otherwise. + */ +static boolean_t +is_valid_arp_packet(mbuf_t *data, boolean_t is_output, + struct ether_header **eh_p, struct ether_arp **ea_p) +{ + struct ether_arp *ea; + struct ether_header *eh; + size_t minlen = sizeof(struct ether_header) + sizeof(struct ether_arp); + boolean_t is_valid = FALSE; + int flags = is_output ? BR_DBGF_OUTPUT : BR_DBGF_INPUT; + + if (mbuf_pkthdr_len(*data) < minlen) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: ARP %s short frame %lu < %lu\n", + __func__, + get_in_out_string(is_output), + mbuf_pkthdr_len(*data), minlen); + } + goto done; + } + if (mbuf_len(*data) < minlen && mbuf_pullup(data, minlen) != 0) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: ARP %s size %lu mbuf_pullup fail\n", + __func__, + get_in_out_string(is_output), + minlen); + } + *data = NULL; + goto done; + } + + /* validate ARP packet */ + eh = mtod(*data, struct ether_header *); + ea = (struct ether_arp *)(eh + 1); + if (ntohs(ea->arp_hrd) != ARPHRD_ETHER) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: ARP %s htype not ethernet\n", + __func__, + get_in_out_string(is_output)); + } + goto done; + } + if (ea->arp_hln != ETHER_ADDR_LEN) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: ARP %s hlen not ethernet\n", + __func__, + get_in_out_string(is_output)); + } + goto done; + } + if (ntohs(ea->arp_pro) != ETHERTYPE_IP) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: ARP %s ptype not IP\n", + __func__, + get_in_out_string(is_output)); + } + goto done; + } + if (ea->arp_pln != sizeof(struct in_addr)) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: ARP %s plen not IP\n", + __func__, + get_in_out_string(is_output)); + } + goto done; + } + is_valid = TRUE; + *ea_p = ea; + *eh_p = eh; +done: + return is_valid; +} + +static struct mac_nat_entry * +bridge_mac_nat_arp_input(struct bridge_softc *sc, mbuf_t *data) +{ + struct ether_arp *ea; + struct ether_header *eh; + struct mac_nat_entry *mne = NULL; + u_short op; + struct in_addr tpa; + + if (!is_valid_arp_packet(data, FALSE, &eh, &ea)) { + goto done; + } + op = ntohs(ea->arp_op); + switch (op) { + case ARPOP_REQUEST: + case ARPOP_REPLY: + /* only care about REQUEST and REPLY */ + break; + default: + goto done; + } + + /* check the target IP address for a NAT entry */ + bcopy(ea->arp_tpa, &tpa, sizeof(tpa)); + if (tpa.s_addr != 0) { + mne = bridge_lookup_mac_nat_entry(sc, AF_INET, &tpa); + } + if (mne != NULL) { + if (op == ARPOP_REPLY) { + /* translate the MAC address */ + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + char mac_src[24]; + char mac_dst[24]; + + ether_ntop(mac_src, sizeof(mac_src), + ea->arp_tha); + ether_ntop(mac_dst, sizeof(mac_dst), + mne->mne_mac); + printf("%s %s ARP %s -> %s\n", + sc->sc_if_xname, + mne->mne_bif->bif_ifp->if_xname, + mac_src, mac_dst); + } + bcopy(mne->mne_mac, ea->arp_tha, sizeof(ea->arp_tha)); + } + } else { + /* handle conflicting ARP (sender matches mne) */ + struct in_addr spa; + + bcopy(ea->arp_spa, &spa, sizeof(spa)); + if (spa.s_addr != 0 && spa.s_addr != tpa.s_addr) { + /* check the source IP for a NAT entry */ + mne = bridge_lookup_mac_nat_entry(sc, AF_INET, &spa); + } + } + +done: + return mne; +} + +static boolean_t +bridge_mac_nat_arp_output(struct bridge_softc *sc, + struct bridge_iflist *bif, 
mbuf_t *data, struct mac_nat_record *mnr) +{ + struct ether_arp *ea; + struct ether_header *eh; + struct in_addr ip; + struct mac_nat_entry *mne = NULL; + u_short op; + boolean_t translate = FALSE; + + if (!is_valid_arp_packet(data, TRUE, &eh, &ea)) { + goto done; + } + op = ntohs(ea->arp_op); + switch (op) { + case ARPOP_REQUEST: + case ARPOP_REPLY: + /* only care about REQUEST and REPLY */ + break; + default: + goto done; + } + + bcopy(ea->arp_spa, &ip, sizeof(ip)); + if (ip.s_addr == 0) { + goto done; + } + /* XXX validate IP address: no multicast/broadcast */ + mne = bridge_update_mac_nat_entry(sc, bif, AF_INET, &ip, ea->arp_sha); + if (mnr != NULL && mne != NULL) { + /* record the offset to do the replacement */ + translate = TRUE; + mnr->mnr_arp_offset = (char *)ea->arp_sha - (char *)eh; + } + +done: + return translate; +} + +#define ETHER_IPV4_HEADER_LEN (sizeof(struct ether_header) + \ + + sizeof(struct ip)) +static struct ether_header * +get_ether_ip_header(mbuf_t *data, boolean_t is_output) +{ + struct ether_header *eh = NULL; + int flags = is_output ? BR_DBGF_OUTPUT : BR_DBGF_INPUT; + size_t minlen = ETHER_IPV4_HEADER_LEN; + + if (mbuf_pkthdr_len(*data) < minlen) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: IP %s short frame %lu < %lu\n", + __func__, + get_in_out_string(is_output), + mbuf_pkthdr_len(*data), minlen); + } + goto done; + } + if (mbuf_len(*data) < minlen && mbuf_pullup(data, minlen) != 0) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: IP %s size %lu mbuf_pullup fail\n", + __func__, + get_in_out_string(is_output), + minlen); + } + *data = NULL; + goto done; + } + eh = mtod(*data, struct ether_header *); +done: + return eh; +} + +static struct mac_nat_entry * +bridge_mac_nat_ip_input(struct bridge_softc *sc, mbuf_t *data) +{ + struct in_addr dst; + struct ether_header *eh; + struct ip *iphdr; + struct mac_nat_entry *mne = NULL; + + eh = get_ether_ip_header(data, FALSE); + if (eh == NULL) { + goto done; + } + iphdr = (struct ip *)(void *)(eh + 1); + bcopy(&iphdr->ip_dst, &dst, sizeof(dst)); + /* XXX validate IP address */ + if (dst.s_addr == 0) { + goto done; + } + mne = bridge_lookup_mac_nat_entry(sc, AF_INET, &dst); +done: + return mne; +} + +static void +bridge_mac_nat_udp_output(struct bridge_softc *sc, + struct bridge_iflist *bif, mbuf_t m, + uint8_t ip_header_len, struct mac_nat_record *mnr) +{ + uint16_t dp_flags; + errno_t error; + size_t offset; + struct udphdr udphdr; + + /* copy the UDP header */ + offset = sizeof(struct ether_header) + ip_header_len; + error = mbuf_copydata(m, offset, sizeof(struct udphdr), &udphdr); + if (error != 0) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: mbuf_copydata udphdr failed %d", + __func__, error); + } + return; + } + if (ntohs(udphdr.uh_sport) != IPPORT_BOOTPC || + ntohs(udphdr.uh_dport) != IPPORT_BOOTPS) { + /* not a BOOTP/DHCP packet */ + return; + } + /* check whether the broadcast bit is already set */ + offset += sizeof(struct udphdr) + offsetof(struct dhcp, dp_flags); + error = mbuf_copydata(m, offset, sizeof(dp_flags), &dp_flags); + if (error != 0) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: mbuf_copydata dp_flags failed %d", + __func__, error); + } + return; + } + if ((ntohs(dp_flags) & DHCP_FLAGS_BROADCAST) != 0) { + /* it's already set, nothing to do */ + return; + } + /* broadcast bit needs to be set */ + mnr->mnr_ip_dhcp_flags = dp_flags | htons(DHCP_FLAGS_BROADCAST); + mnr->mnr_ip_header_len = ip_header_len; + if (udphdr.uh_sum != 0) { + uint16_t delta; + + /* adjust checksum to 
take modified dp_flags into account */ + delta = dp_flags - mnr->mnr_ip_dhcp_flags; + mnr->mnr_ip_udp_csum = udphdr.uh_sum + delta; + } + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s %s DHCP dp_flags 0x%x UDP cksum 0x%x\n", + sc->sc_if_xname, + bif->bif_ifp->if_xname, + ntohs(mnr->mnr_ip_dhcp_flags), + ntohs(mnr->mnr_ip_udp_csum)); + } + return; +} + +static boolean_t +bridge_mac_nat_ip_output(struct bridge_softc *sc, + struct bridge_iflist *bif, mbuf_t *data, struct mac_nat_record *mnr) +{ +#pragma unused(mnr) + struct ether_header *eh; + struct in_addr ip; + struct ip *iphdr; + uint8_t ip_header_len; + struct mac_nat_entry *mne = NULL; + boolean_t translate = FALSE; + + eh = get_ether_ip_header(data, TRUE); + if (eh == NULL) { + goto done; + } + iphdr = (struct ip *)(void *)(eh + 1); + ip_header_len = IP_VHL_HL(iphdr->ip_vhl) << 2; + if (ip_header_len < sizeof(ip)) { + /* bogus IP header */ + goto done; + } + bcopy(&iphdr->ip_src, &ip, sizeof(ip)); + /* XXX validate the source address */ + if (ip.s_addr != 0) { + mne = bridge_update_mac_nat_entry(sc, bif, AF_INET, &ip, + eh->ether_shost); + } + if (mnr != NULL) { + if (iphdr->ip_p == IPPROTO_UDP) { + /* handle DHCP must broadcast */ + bridge_mac_nat_udp_output(sc, bif, *data, + ip_header_len, mnr); + } + translate = TRUE; + } +done: + return translate; +} + +#define ETHER_IPV6_HEADER_LEN (sizeof(struct ether_header) + \ + + sizeof(struct ip6_hdr)) +static struct ether_header * +get_ether_ipv6_header(mbuf_t *data, boolean_t is_output) +{ + struct ether_header *eh = NULL; + int flags = is_output ? BR_DBGF_OUTPUT : BR_DBGF_INPUT; + size_t minlen = ETHER_IPV6_HEADER_LEN; + + if (mbuf_pkthdr_len(*data) < minlen) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: IP %s short frame %lu < %lu\n", + __func__, + get_in_out_string(is_output), + mbuf_pkthdr_len(*data), minlen); + } + goto done; + } + if (mbuf_len(*data) < minlen && mbuf_pullup(data, minlen) != 0) { + if (IF_BRIDGE_DEBUG(flags)) { + printf("%s: IP %s size %lu mbuf_pullup fail\n", + __func__, + get_in_out_string(is_output), + minlen); + } + *data = NULL; + goto done; + } + eh = mtod(*data, struct ether_header *); +done: + return eh; +} + +#if 0 +static void +bridge_mac_nat_icmpv6_input(struct bridge_softc *sc, mbuf_t *data, + struct ether_header *eh, struct ip6_hdr *hdr) +{ +#pragma unused(sc) +#pragma unused(data) +#pragma unused(eh) +#pragma unused(hdr) + return; +} +#endif + +#include +#include + +#define ETHER_ND_LLADDR_LEN (ETHER_ADDR_LEN + sizeof(struct nd_opt_hdr)) + +static void +bridge_mac_nat_icmpv6_output(struct bridge_softc *sc, struct bridge_iflist *bif, + mbuf_t *data, struct ether_header *eh, + struct ip6_hdr *ip6h, struct in6_addr *saddrp, struct mac_nat_record *mnr) +{ + struct icmp6_hdr *icmp6; + unsigned int icmp6len; + int lladdrlen = 0; + char *lladdr = NULL; + mbuf_t m = *data; + unsigned int off = sizeof(*ip6h); + + icmp6len = m->m_pkthdr.len - sizeof(*eh) - off; + if (icmp6len < sizeof(*icmp6)) { + printf("%s: short packet %d < %lu\n", __func__, + icmp6len, sizeof(*icmp6)); + return; + } + icmp6 = (struct icmp6_hdr *)((caddr_t)ip6h + off); + switch (icmp6->icmp6_type) { + case ND_NEIGHBOR_SOLICIT: { + struct nd_neighbor_solicit *nd_ns; + union nd_opts ndopts; + boolean_t is_dad_probe; + struct in6_addr taddr; + + if (icmp6len < sizeof(*nd_ns)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: short nd_ns %d < %lu\n", __func__, + icmp6len, sizeof(*nd_ns)); + } + return; + } + + nd_ns = (struct nd_neighbor_solicit *)(void *)icmp6; + 
bcopy(&nd_ns->nd_ns_target, &taddr, sizeof(taddr)); + if (IN6_IS_ADDR_MULTICAST(&taddr) || + IN6_IS_ADDR_UNSPECIFIED(&taddr)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: invalid target ignored\n", __func__); + } + return; + } + /* parse options */ + nd6_option_init(nd_ns + 1, icmp6len - sizeof(*nd_ns), &ndopts); + if (nd6_options(&ndopts) < 0) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: invalid ND6 NS option\n", __func__); + } + return; + } + if (ndopts.nd_opts_src_lladdr != NULL) { + lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); + lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; + } + is_dad_probe = IN6_IS_ADDR_UNSPECIFIED(saddrp); + if (lladdr != NULL) { + if (is_dad_probe) { + printf("%s: bad ND6 DAD packet\n", __func__); + return; + } + if (lladdrlen != ETHER_ND_LLADDR_LEN) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: source lladdrlen %d != %lu\n", + __func__, + lladdrlen, ETHER_ND_LLADDR_LEN); + } + return; + } + mnr->mnr_ip6_lladdr_offset = (void *)lladdr - + (void *)eh; + mnr->mnr_ip6_icmp6_len = icmp6len; + mnr->mnr_ip6_icmp6_type = icmp6->icmp6_type; + mnr->mnr_ip6_header_len = off; + } + if (is_dad_probe) { + /* node is trying to use taddr; create an mne using taddr */ + *saddrp = taddr; + } + break; + } + case ND_NEIGHBOR_ADVERT: { + struct nd_neighbor_advert *nd_na; + union nd_opts ndopts; + struct in6_addr taddr; + + nd_na = (struct nd_neighbor_advert *)(void *)icmp6; + + if (icmp6len < sizeof(*nd_na)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: short nd_na %d < %lu\n", __func__, + icmp6len, sizeof(*nd_na)); + } + return; + } + + bcopy(&nd_na->nd_na_target, &taddr, sizeof(taddr)); + if (IN6_IS_ADDR_MULTICAST(&taddr) || + IN6_IS_ADDR_UNSPECIFIED(&taddr)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: invalid target ignored\n", __func__); + } + return; + } + /* parse options */ + nd6_option_init(nd_na + 1, icmp6len - sizeof(*nd_na), &ndopts); + if (nd6_options(&ndopts) < 0) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: invalid ND6 NA option\n", __func__); + } + return; + } + if (ndopts.nd_opts_tgt_lladdr == NULL) { + /* no target linklayer option, nothing to do */ + return; + } + lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); + lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; + if (lladdrlen != ETHER_ND_LLADDR_LEN) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: target lladdrlen %d != %lu\n", + __func__, lladdrlen, ETHER_ND_LLADDR_LEN); + } + return; + } + mnr->mnr_ip6_lladdr_offset = (void *)lladdr - (void *)eh; + mnr->mnr_ip6_icmp6_len = icmp6len; + mnr->mnr_ip6_header_len = off; + mnr->mnr_ip6_icmp6_type = icmp6->icmp6_type; + break; + } + case ND_ROUTER_SOLICIT: { + struct nd_router_solicit *nd_rs; + union nd_opts ndopts; + + if (icmp6len < sizeof(*nd_rs)) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: short nd_rs %d < %lu\n", __func__, + icmp6len, sizeof(*nd_rs)); + } + return; + } + nd_rs = (struct nd_router_solicit *)(void *)icmp6; + + /* parse options */ + nd6_option_init(nd_rs + 1, icmp6len - sizeof(*nd_rs), &ndopts); + if (nd6_options(&ndopts) < 0) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: invalid ND6 RS option\n", __func__); + } + return; + } + if (ndopts.nd_opts_src_lladdr != NULL) { + lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); + lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; + } + if (lladdr != NULL) { + if (lladdrlen != ETHER_ND_LLADDR_LEN) { + if (IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + printf("%s: source lladdrlen %d != 
%lu\n", + __func__, + lladdrlen, ETHER_ND_LLADDR_LEN); + } + return; + } + mnr->mnr_ip6_lladdr_offset = (void *)lladdr - + (void *)eh; + mnr->mnr_ip6_icmp6_len = icmp6len; + mnr->mnr_ip6_icmp6_type = icmp6->icmp6_type; + mnr->mnr_ip6_header_len = off; + } + break; + } + default: + break; + } + if (mnr->mnr_ip6_lladdr_offset != 0 && + IF_BRIDGE_DEBUG(BR_DBGF_MAC_NAT)) { + const char *str; + + switch (mnr->mnr_ip6_icmp6_type) { + case ND_ROUTER_SOLICIT: + str = "ROUTER SOLICIT"; + break; + case ND_NEIGHBOR_ADVERT: + str = "NEIGHBOR ADVERT"; + break; + case ND_NEIGHBOR_SOLICIT: + str = "NEIGHBOR SOLICIT"; + break; + default: + str = ""; + break; + } + printf("%s %s %s ip6len %d icmp6len %d lladdr offset %d\n", + sc->sc_if_xname, bif->bif_ifp->if_xname, str, + mnr->mnr_ip6_header_len, + mnr->mnr_ip6_icmp6_len, mnr->mnr_ip6_lladdr_offset); + } +} + +static struct mac_nat_entry * +bridge_mac_nat_ipv6_input(struct bridge_softc *sc, mbuf_t *data) +{ + struct in6_addr dst; + struct ether_header *eh; + struct ip6_hdr *ip6h; + struct mac_nat_entry *mne = NULL; + + eh = get_ether_ipv6_header(data, FALSE); + if (eh == NULL) { + goto done; + } + ip6h = (struct ip6_hdr *)(void *)(eh + 1); +#if 0 + if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { + bridge_mac_nat_icmpv6_input(sc, data, eh, ip6h); + } +#endif + bcopy(&ip6h->ip6_dst, &dst, sizeof(dst)); + /* XXX validate IPv6 address */ + if (IN6_IS_ADDR_UNSPECIFIED(&dst)) { + goto done; + } + mne = bridge_lookup_mac_nat_entry(sc, AF_INET6, &dst); + +done: + return mne; +} + +static boolean_t +bridge_mac_nat_ipv6_output(struct bridge_softc *sc, + struct bridge_iflist *bif, mbuf_t *data, struct mac_nat_record *mnr) +{ + struct ether_header *eh; + struct ip6_hdr *ip6h; + struct in6_addr saddr; + boolean_t translate; + + translate = (bif == sc->sc_mac_nat_bif) ? FALSE : TRUE; + eh = get_ether_ipv6_header(data, TRUE); + if (eh == NULL) { + translate = FALSE; + goto done; + } + ip6h = (struct ip6_hdr *)(void *)(eh + 1); + bcopy(&ip6h->ip6_src, &saddr, sizeof(saddr)); + if (mnr != NULL && ip6h->ip6_nxt == IPPROTO_ICMPV6) { + bridge_mac_nat_icmpv6_output(sc, bif, data, + eh, ip6h, &saddr, mnr); + } + if (IN6_IS_ADDR_UNSPECIFIED(&saddr)) { + goto done; + } + (void)bridge_update_mac_nat_entry(sc, bif, AF_INET6, &saddr, + eh->ether_shost); + +done: + return translate; +} + +/* + * bridge_mac_nat_input: + * Process a packet arriving on the MAC NAT interface (sc_mac_nat_bif). + * This interface is the "external" interface with respect to NAT. + * The interface is only capable of receiving a single MAC address + * (e.g. a Wi-Fi STA interface). + * + * When a packet arrives on the external interface, look up the destination + * IP address in the mac_nat_entry table. If there is a match, *is_input + * is set to TRUE if it's for the MAC NAT interface, otherwise *is_input + * is set to FALSE and translate the MAC address if necessary. + * + * Returns: + * The internal interface to direct the packet to, or NULL if the packet + * should not be redirected. + * + * *data may be updated to point at a different mbuf chain, or set to NULL + * if the chain was deallocated during processing. 
+ */ +static ifnet_t +bridge_mac_nat_input(struct bridge_softc *sc, mbuf_t *data, + boolean_t *is_input) +{ + ifnet_t dst_if = NULL; + struct ether_header *eh; + uint16_t ether_type; + boolean_t is_unicast; + mbuf_t m = *data; + struct mac_nat_entry *mne = NULL; + + BRIDGE_LOCK_ASSERT_HELD(sc); + *is_input = FALSE; + assert(sc->sc_mac_nat_bif != NULL); + is_unicast = ((m->m_flags & (M_BCAST | M_MCAST)) == 0); + eh = mtod(m, struct ether_header *); + ether_type = ntohs(eh->ether_type); + switch (ether_type) { + case ETHERTYPE_ARP: + mne = bridge_mac_nat_arp_input(sc, data); + break; + case ETHERTYPE_IP: + if (is_unicast) { + mne = bridge_mac_nat_ip_input(sc, data); + } + break; + case ETHERTYPE_IPV6: + if (is_unicast) { + mne = bridge_mac_nat_ipv6_input(sc, data); + } + break; + default: + break; + } + if (mne != NULL) { + if (is_unicast) { + if (m != *data) { + /* it may have changed */ + eh = mtod(*data, struct ether_header *); + } + bcopy(mne->mne_mac, eh->ether_dhost, + sizeof(eh->ether_dhost)); + } + dst_if = mne->mne_bif->bif_ifp; + *is_input = (mne->mne_bif == sc->sc_mac_nat_bif); + } + return dst_if; +} + +/* + * bridge_mac_nat_output: + * Process a packet destined to the MAC NAT interface (sc_mac_nat_bif) + * from the interface 'bif'. + * + * Create a mac_nat_entry containing the source IP address and MAC address + * from the packet. Populate a mac_nat_record with information detailing + * how to translate the packet. Translation takes place later when + * the bridge lock is no longer held. + * + * If 'bif' == sc_mac_nat_bif, the stack over the MAC NAT + * interface is generating an output packet. No translation is required in this + * case, we just record the IP address used to prevent another bif from + * claiming our IP address. + * + * Returns: + * TRUE if the packet should be translated (*mnr updated as well), + * FALSE otherwise. + * + * *data may be updated to point at a different mbuf chain or NULL if + * the chain was deallocated during processing. 
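+ * + * Typical two-phase use under the bridge lock (cf. bridge_forward; + * illustrative): + * + * translate_mac = bridge_mac_nat_output(sc, sbif, &m, &mnr); + * BRIDGE_UNLOCK(sc); + * if (translate_mac) { + * bridge_mac_nat_translate(&m, &mnr, IF_LLADDR(dst_if)); + * }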
+ */ + +static boolean_t +bridge_mac_nat_output(struct bridge_softc *sc, + struct bridge_iflist *bif, mbuf_t *data, struct mac_nat_record *mnr) +{ + struct ether_header *eh; + uint16_t ether_type; + boolean_t translate = FALSE; + + BRIDGE_LOCK_ASSERT_HELD(sc); + assert(sc->sc_mac_nat_bif != NULL); + + eh = mtod(*data, struct ether_header *); + ether_type = ntohs(eh->ether_type); + if (mnr != NULL) { + bzero(mnr, sizeof(*mnr)); + mnr->mnr_ether_type = ether_type; + } + switch (ether_type) { + case ETHERTYPE_ARP: + translate = bridge_mac_nat_arp_output(sc, bif, data, mnr); + break; + case ETHERTYPE_IP: + translate = bridge_mac_nat_ip_output(sc, bif, data, mnr); + break; + case ETHERTYPE_IPV6: + translate = bridge_mac_nat_ipv6_output(sc, bif, data, mnr); + break; + default: + break; + } + return translate; +} + +static void +bridge_mac_nat_arp_translate(mbuf_t *data, struct mac_nat_record *mnr, + const caddr_t eaddr) +{ + errno_t error; + + if (mnr->mnr_arp_offset == 0) { + return; + } + /* replace the source hardware address */ + error = mbuf_copyback(*data, mnr->mnr_arp_offset, + ETHER_ADDR_LEN, eaddr, + MBUF_DONTWAIT); + if (error != 0) { + printf("%s: mbuf_copyback failed\n", + __func__); + m_freem(*data); + *data = NULL; + } + return; +} + +static void +bridge_mac_nat_ip_translate(mbuf_t *data, struct mac_nat_record *mnr) +{ + errno_t error; + size_t offset; + + if (mnr->mnr_ip_header_len == 0) { + return; + } + /* update the UDP checksum */ + offset = sizeof(struct ether_header) + mnr->mnr_ip_header_len; + error = mbuf_copyback(*data, offset + offsetof(struct udphdr, uh_sum), + sizeof(mnr->mnr_ip_udp_csum), + &mnr->mnr_ip_udp_csum, + MBUF_DONTWAIT); + if (error != 0) { + printf("%s: mbuf_copyback uh_sum failed\n", + __func__); + m_freem(*data); + *data = NULL; + } + /* update the DHCP must broadcast flag */ + offset += sizeof(struct udphdr); + error = mbuf_copyback(*data, offset + offsetof(struct dhcp, dp_flags), + sizeof(mnr->mnr_ip_dhcp_flags), + &mnr->mnr_ip_dhcp_flags, + MBUF_DONTWAIT); + if (error != 0) { + printf("%s: mbuf_copyback dp_flags failed\n", + __func__); + m_freem(*data); + *data = NULL; + } +} + +static void +bridge_mac_nat_ipv6_translate(mbuf_t *data, struct mac_nat_record *mnr, + const caddr_t eaddr) +{ + uint16_t cksum; + errno_t error; + mbuf_t m = *data; + + if (mnr->mnr_ip6_header_len == 0) { + return; + } + switch (mnr->mnr_ip6_icmp6_type) { + case ND_ROUTER_SOLICIT: + case ND_NEIGHBOR_SOLICIT: + case ND_NEIGHBOR_ADVERT: + if (mnr->mnr_ip6_lladdr_offset == 0) { + /* nothing to do */ + return; + } + break; + default: + return; + } + + /* + * replace the lladdr + */ + error = mbuf_copyback(m, mnr->mnr_ip6_lladdr_offset, + ETHER_ADDR_LEN, eaddr, + MBUF_DONTWAIT); + if (error != 0) { + printf("%s: mbuf_copyback lladdr failed\n", + __func__); + m_freem(m); + *data = NULL; + return; + } + + /* + * recompute the icmp6 checksum + */ + + /* skip past the ethernet header */ + mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, + mbuf_len(m) - ETHER_HDR_LEN); + mbuf_pkthdr_adjustlen(m, -ETHER_HDR_LEN); + +#define CKSUM_OFFSET_ICMP6 offsetof(struct icmp6_hdr, icmp6_cksum) + /* set the checksum to zero */ + cksum = 0; + error = mbuf_copyback(m, mnr->mnr_ip6_header_len + CKSUM_OFFSET_ICMP6, + sizeof(cksum), &cksum, MBUF_DONTWAIT); + if (error != 0) { + printf("%s: mbuf_copyback cksum=0 failed\n", + __func__); + m_freem(m); + *data = NULL; + return; + } + /* compute and set the new checksum */ + cksum = in6_cksum(m, IPPROTO_ICMPV6, mnr->mnr_ip6_header_len, + 
mnr->mnr_ip6_icmp6_len); + error = mbuf_copyback(m, mnr->mnr_ip6_header_len + CKSUM_OFFSET_ICMP6, + sizeof(cksum), &cksum, MBUF_DONTWAIT); + if (error != 0) { + printf("%s: mbuf_copyback cksum failed\n", + __func__); + m_freem(m); + *data = NULL; + return; + } + /* restore the ethernet header */ + mbuf_setdata(m, (char *)mbuf_data(m) - ETHER_HDR_LEN, + mbuf_len(m) + ETHER_HDR_LEN); + mbuf_pkthdr_adjustlen(m, ETHER_HDR_LEN); + return; +} + +static void +bridge_mac_nat_translate(mbuf_t *data, struct mac_nat_record *mnr, + const caddr_t eaddr) +{ + struct ether_header *eh; + + /* replace the source ethernet address with the single MAC */ + eh = mtod(*data, struct ether_header *); + bcopy(eaddr, eh->ether_shost, sizeof(eh->ether_shost)); + switch (mnr->mnr_ether_type) { + case ETHERTYPE_ARP: + bridge_mac_nat_arp_translate(data, mnr, eaddr); + break; + + case ETHERTYPE_IP: + bridge_mac_nat_ip_translate(data, mnr); + break; + + case ETHERTYPE_IPV6: + bridge_mac_nat_ipv6_translate(data, mnr, eaddr); + break; + + default: + break; + } + return; +} + +/* + * bridge packet filtering + */ + +/* + * the PF routines expect to be called from ip_input, so we + * need to do, and later undo, some of the same processing here. + * + * XXX : this is heavily inspired by bridge_pfil() + */ +static +int +bridge_pf(struct mbuf **mp, struct ifnet *ifp, uint32_t sc_filter_flags, int input) +{ + /* + * XXX : mpetit : heavily inspired by bridge_pfil() + */ + + int snap, error, i, hlen; + struct ether_header *eh1, eh2; + struct ip *ip; + struct llc llc1; + u_int16_t ether_type; + + snap = 0; + error = -1; /* default to an error result unless explicitly set to 0 */ + + if ((sc_filter_flags & IFBF_FILT_MEMBER) == 0) { + return 0; /* filtering is disabled */ + } + i = min((*mp)->m_pkthdr.len, max_protohdr); + if ((*mp)->m_len < i) { + *mp = m_pullup(*mp, i); + if (*mp == NULL) { + printf("%s: m_pullup failed\n", __func__); + return -1; + } + } + + eh1 = mtod(*mp, struct ether_header *); + ether_type = ntohs(eh1->ether_type); + + /* + * Check for SNAP/LLC. + */ + if (ether_type < ETHERMTU) { + struct llc *llc2 = (struct llc *)(eh1 + 1); + + if ((*mp)->m_len >= ETHER_HDR_LEN + 8 && + llc2->llc_dsap == LLC_SNAP_LSAP && + llc2->llc_ssap == LLC_SNAP_LSAP && + llc2->llc_control == LLC_UI) { + ether_type = htons(llc2->llc_un.type_snap.ether_type); + snap = 1; + } + } + + /* + * If we're trying to filter bridge traffic, don't look at anything + * other than IP and ARP traffic. If the filter doesn't understand + * IPv6, don't allow IPv6 through the bridge either. This is lame + * since if we really wanted, say, an AppleTalk filter, we are hosed, + * but of course we don't have an AppleTalk filter to begin with. + * (Note that since pfil doesn't understand ARP it will pass *ALL* + * ARP traffic.) + */ + switch (ether_type) { + case ETHERTYPE_ARP: + case ETHERTYPE_REVARP: + return 0; /* Automatically pass */ + + case ETHERTYPE_IP: + case ETHERTYPE_IPV6: + break; + default: + /* + * Check to see if the user wants to pass non-IP + * packets; these would not be checked by pf and would + * be passed unconditionally, so the default is to drop. + */ + if ((sc_filter_flags & IFBF_FILT_ONLYIP)) { + goto bad; + } + break; + } + + /* Strip off the Ethernet header and keep a copy.
*/ + m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t)&eh2); + m_adj(*mp, ETHER_HDR_LEN); + + /* Strip off snap header, if present */ + if (snap) { + m_copydata(*mp, 0, sizeof(struct llc), (caddr_t)&llc1); + m_adj(*mp, sizeof(struct llc)); + } + + /* + * Check the IP header for alignment and errors + */ + switch (ether_type) { + case ETHERTYPE_IP: + error = bridge_ip_checkbasic(mp); + break; + case ETHERTYPE_IPV6: + error = bridge_ip6_checkbasic(mp); + break; + default: + error = 0; + break; + } + if (error) { + goto bad; + } + + error = 0; + + /* + * Run the packet through pf rules + */ + switch (ether_type) { + case ETHERTYPE_IP: + /* + * before calling the firewall, swap fields the same as + * IP does. here we assume the header is contiguous + */ + ip = mtod(*mp, struct ip *); + + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + if (ifp != NULL) { + error = pf_af_hook(ifp, 0, mp, AF_INET, input, NULL); + } + + if (*mp == NULL || error != 0) { /* filter may consume */ + break; + } + + /* Recalculate the ip checksum and restore byte ordering */ + ip = mtod(*mp, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + if (hlen < (int)sizeof(struct ip)) { + goto bad; + } + if (hlen > (*mp)->m_len) { + if ((*mp = m_pullup(*mp, hlen)) == 0) { + goto bad; + } + ip = mtod(*mp, struct ip *); + if (ip == NULL) { + goto bad; + } + } + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(*mp, hlen); + } + break; + + case ETHERTYPE_IPV6: + if (ifp != NULL) { + error = pf_af_hook(ifp, 0, mp, AF_INET6, input, NULL); + } + + if (*mp == NULL || error != 0) { /* filter may consume */ + break; + } + break; + default: + error = 0; + break; + } + + if (*mp == NULL) { + return error; + } + if (error != 0) { + goto bad; + } + + error = -1; + + /* + * Finally, put everything back the way it was and return + */ + if (snap) { + M_PREPEND(*mp, sizeof(struct llc), M_DONTWAIT, 0); + if (*mp == NULL) { + return error; + } + bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); + } + + M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT, 0); + if (*mp == NULL) { + return error; + } + bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); + + return 0; + +bad: + m_freem(*mp); + *mp = NULL; + return error; +} diff --git a/bsd/net/if_bridgevar.h b/bsd/net/if_bridgevar.h index ae7754b7b..48a6555fc 100644 --- a/bsd/net/if_bridgevar.h +++ b/bsd/net/if_bridgevar.h @@ -109,6 +109,7 @@ #include #include +#include /* * Commands used in the SIOCSDRVSPEC ioctl. Note the lookup of the @@ -149,8 +150,10 @@ #define BRDGSPROTO 31 /* set protocol (ifbrparam) */ #define BRDGSTXHC 32 /* set tx hold count (ifbrparam) */ #define BRDGSIFAMAX 33 /* set max interface addrs (ifbreq) */ -#define BRDGGHOSTFILTER 34 /* set max interface addrs (ifbrhostfilter) */ -#define BRDGSHOSTFILTER 35 /* set max interface addrs (ifbrhostfilter) */ +#define BRDGGHOSTFILTER 34 /* get host filter (ifbrhostfilter) */ +#define BRDGSHOSTFILTER 35 /* set host filter (ifbrhostfilter) */ +#define BRDGGMACNATLIST 36 /* get MAC NAT list */ + /* * Generic bridge control request. @@ -175,7 +178,7 @@ struct ifbreq { #pragma pack() -/* BRDGGIFFLAGS, BRDGSIFFLAGS */ +/* BRDGGIFFLGS, BRDGSIFFLGS */ #define IFBIF_LEARNING 0x0001 /* if can learn */ #define IFBIF_DISCOVER 0x0002 /* if sends packets w/ unknown dest. 
*/ #define IFBIF_STP 0x0004 /* if participates in spanning tree */ @@ -188,10 +191,13 @@ struct ifbreq { #define IFBIF_BSTP_ADMEDGE 0x0200 /* member stp admin edge enabled */ #define IFBIF_BSTP_ADMCOST 0x0400 /* member stp admin path cost */ #define IFBIF_PRIVATE 0x0800 /* if is a private segment */ +#define IFBIF_MAC_NAT 0x8000 /* member requires MAC NAT */ #define IFBIFBITS "\020\001LEARNING\002DISCOVER\003STP\004SPAN" \ - "\005STICKY\014PRIVATE\006EDGE\007AUTOEDGE\010PTP" \ - "\011AUTOPTP" + "\005STICKY\006EDGE\007AUTOEDGE\010PTP" \ + "\011AUTOPTP\014PRIVATE" \ + "\020MACNAT" + #define IFBIFMASK ~(IFBIF_BSTP_EDGE|IFBIF_BSTP_AUTOEDGE|IFBIF_BSTP_PTP| \ IFBIF_BSTP_AUTOPTP|IFBIF_BSTP_ADMEDGE| \ IFBIF_BSTP_ADMCOST) /* not saved */ @@ -201,23 +207,14 @@ struct ifbreq { #define IFBF_FLUSHALL 0x01 /* flush all addresses */ /* BRDGSFILT */ -#define IFBF_FILT_USEIPF 0x00000001 /* run pfil hooks on the bridge +#define IFBF_FILT_USEIPF 0x00000001 /* run pf hooks on the bridge * interface */ -#define IFBF_FILT_MEMBER 0x00000002 /* run pfil hooks on the member +#define IFBF_FILT_MEMBER 0x00000002 /* run pf hooks on the member * interfaces */ #define IFBF_FILT_ONLYIP 0x00000004 /* only pass IP[46] packets when - * pfil is enabled */ + * pf is enabled */ #define IFBF_FILT_MASK 0x00000007 /* mask of valid values */ - -/* APPLE MODIFICATION : Default is to pass non-IP packets. */ -#define IFBF_FILT_DEFAULT ( IFBF_FILT_USEIPF | IFBF_FILT_MEMBER ) -#if 0 -#define IFBF_FILT_DEFAULT (IFBF_FILT_USEIPF | \ -IFBF_FILT_MEMBER | \ -IFBF_FILT_ONLYIP) -#endif - /* * Interface list structure. */ @@ -551,5 +548,58 @@ extern u_int8_t bstp_etheraddr[ETHER_ADDR_LEN]; int bridgeattach(int); #endif /* XNU_KERNEL_PRIVATE */ + + +/* + * MAC NAT entry list + */ + +#pragma pack(4) + +union ifbrip { + struct in_addr ifbrip_addr; + struct in6_addr ifbrip_addr6; +}; + +struct ifbrmne { + char ifbmne_ifname[IFNAMSIZ]; /* member if name */ + uint64_t ifbmne_expire; /* expiration time */ + uint8_t ifbmne_mac[ETHER_ADDR_LEN];/* MAC address */ + uint8_t ifbmne_reserved; + uint8_t ifbmne_af; /* AF_INET or AF_INET6 */ + union ifbrip ifbmne_ip; +}; +#define ifbmne_ip_addr ifbmne_ip.ifbrip_addr +#define ifbmne_ip6_addr ifbmne_ip.ifbrip_addr6 + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbrmnelist { + uint32_t ifbml_len; /* buffer size (multiple of elsize) */ + uint16_t ifbml_elsize; /* sizeof(ifbrmacnatent) */ + uint16_t ifbml_pad; + caddr_t ifbml_buf; +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbrmnelist32 { + uint32_t ifbml_len; /* buffer size */ + uint16_t ifbml_elsize; /* sizeof(ifbrmacnatent) */ + uint16_t ifbml_pad; + user32_addr_t ifbml_buf; +}; + +struct ifbrmnelist64 { + uint32_t ifbml_len; /* buffer size */ + uint16_t ifbml_elsize; /* sizeof(ifbrmacnatent) */ + uint16_t ifbml_pad; + user64_addr_t ifbml_buf; +}; + +#endif /* XNU_KERNEL_PRIVATE */ + +#pragma pack() + #endif /* PRIVATE */ #endif /* !_NET_IF_BRIDGEVAR_H_ */ diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index da7f7579d..291aea2d7 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -49,6 +49,13 @@ struct ifnet_interface_advisory; #include +#ifndef PRIVATE +#include +#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15.4)) +#else +#define __NKE_API_DEPRECATED +#endif /* PRIVATE */ + #ifdef XNU_KERNEL_PRIVATE #if CONFIG_EMBEDDED #define KPI_INTERFACE_EMBEDDED 1 @@ -1259,7 +1266,8 @@ extern errno_t ifnet_allocate_internal(const struct ifnet_init_params *init, 
ifnet_allocate_internal((init), (interface)) #else extern errno_t ifnet_allocate(const struct ifnet_init_params *init, - ifnet_t *interface); + ifnet_t *interface) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -1664,7 +1672,8 @@ extern errno_t ifnet_disable_output(ifnet_t interface); * @param interface The interface to increment the reference count of. * @result May return EINVAL if the interface is not valid. */ -extern errno_t ifnet_reference(ifnet_t interface); +extern errno_t ifnet_reference(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_release @@ -1674,7 +1683,8 @@ extern errno_t ifnet_reference(ifnet_t interface); * and possibly free. * @result May return EINVAL if the interface is not valid. */ -extern errno_t ifnet_release(ifnet_t interface); +extern errno_t ifnet_release(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_attach @@ -1695,7 +1705,8 @@ extern errno_t ifnet_release(ifnet_t interface); * interface. */ extern errno_t ifnet_attach(ifnet_t interface, - const struct sockaddr_dl *ll_addr); + const struct sockaddr_dl *ll_addr) +__NKE_API_DEPRECATED; /*! * @function ifnet_detach @@ -1721,7 +1732,8 @@ extern errno_t ifnet_attach(ifnet_t interface, * @param interface The interface to detach. * @result 0 on success, otherwise errno error. */ -extern errno_t ifnet_detach(ifnet_t interface); +extern errno_t ifnet_detach(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_interface_family_find @@ -1740,7 +1752,8 @@ extern errno_t ifnet_detach(ifnet_t interface); * is rebooted. * @result 0 on success, otherwise errno error. */ -extern errno_t ifnet_interface_family_find(const char *module_string, ifnet_family_t *family_id); +extern errno_t ifnet_interface_family_find(const char *module_string, ifnet_family_t *family_id) +__NKE_API_DEPRECATED; /* * Interface manipulation. @@ -1752,7 +1765,8 @@ extern errno_t ifnet_interface_family_find(const char *module_string, ifnet_fami * @param interface Interface to retrieve the storage from. * @result Driver's private storage. */ -extern void *ifnet_softc(ifnet_t interface); +extern void *ifnet_softc(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_name @@ -1760,7 +1774,8 @@ extern void *ifnet_softc(ifnet_t interface); * @param interface Interface to retrieve the name from. * @result Pointer to the name. */ -extern const char *ifnet_name(ifnet_t interface); +extern const char *ifnet_name(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_family @@ -1768,7 +1783,8 @@ extern const char *ifnet_name(ifnet_t interface); * @param interface Interface to retrieve the family from. * @result Interface family type. */ -extern ifnet_family_t ifnet_family(ifnet_t interface); +extern ifnet_family_t ifnet_family(ifnet_t interface) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* @@ -1786,7 +1802,9 @@ extern ifnet_subfamily_t ifnet_subfamily(ifnet_t interface); * @param interface Interface to retrieve the unit number from. * @result Unit number. */ -extern u_int32_t ifnet_unit(ifnet_t interface); +extern u_int32_t ifnet_unit(ifnet_t interface) +__NKE_API_DEPRECATED; + /*! * @function ifnet_index @@ -1798,7 +1816,8 @@ extern u_int32_t ifnet_unit(ifnet_t interface); * @param interface Interface to retrieve the index of. * @result Index. */ -extern u_int32_t ifnet_index(ifnet_t interface); +extern u_int32_t ifnet_index(ifnet_t interface) +__NKE_API_DEPRECATED; /*! 
* @function ifnet_set_flags @@ -1813,7 +1832,8 @@ extern u_int32_t ifnet_index(ifnet_t interface); * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_set_flags(ifnet_t interface, u_int16_t new_flags, - u_int16_t mask); + u_int16_t mask) +__NKE_API_DEPRECATED; /*! * @function ifnet_flags @@ -1821,8 +1841,8 @@ extern errno_t ifnet_set_flags(ifnet_t interface, u_int16_t new_flags, * @param interface Interface to retrieve the flags from. * @result Flags. These flags are defined in net/if.h */ -extern u_int16_t ifnet_flags(ifnet_t interface); - +extern u_int16_t ifnet_flags(ifnet_t interface) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* @@ -2016,7 +2036,8 @@ extern errno_t ifnet_inet6_defrouter_llreachinfo(ifnet_t interface, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_set_capabilities_supported(ifnet_t interface, u_int32_t new_caps, - u_int32_t mask); + u_int32_t mask) +__NKE_API_DEPRECATED; /*! * @function ifnet_capabilities_supported @@ -2024,7 +2045,8 @@ extern errno_t ifnet_set_capabilities_supported(ifnet_t interface, u_int32_t new * @param interface Interface to retrieve the capabilities from. * @result Flags. Capabilities flags are defined in net/if.h */ -extern u_int32_t ifnet_capabilities_supported(ifnet_t interface); +extern u_int32_t ifnet_capabilities_supported(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_capabilities_enabled @@ -2056,7 +2078,8 @@ extern u_int32_t ifnet_capabilities_supported(ifnet_t interface); * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_set_capabilities_enabled(ifnet_t interface, u_int32_t new_caps, - u_int32_t mask); + u_int32_t mask) +__NKE_API_DEPRECATED; /*! * @function ifnet_capabilities_enabled @@ -2064,8 +2087,8 @@ extern errno_t ifnet_set_capabilities_enabled(ifnet_t interface, u_int32_t new_c * @param interface Interface to retrieve the capabilities from. * @result Flags. Capabilities flags are defined in net/if.h */ -extern u_int32_t ifnet_capabilities_enabled(ifnet_t interface); - +extern u_int32_t ifnet_capabilities_enabled(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_offload @@ -2083,7 +2106,8 @@ extern u_int32_t ifnet_capabilities_enabled(ifnet_t interface); * the device supports. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload); +extern errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload) +__NKE_API_DEPRECATED; /*! * @function ifnet_offload @@ -2092,7 +2116,8 @@ extern errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload); * @param interface Interface to retrieve the offload from. * @result Abilities flags, see ifnet_offload_t. */ -extern ifnet_offload_t ifnet_offload(ifnet_t interface); +extern ifnet_offload_t ifnet_offload(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_tso_mtu @@ -2105,7 +2130,8 @@ extern ifnet_offload_t ifnet_offload(ifnet_t interface); * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_set_tso_mtu(ifnet_t interface, sa_family_t family, - u_int32_t mtuLen); + u_int32_t mtuLen) +__NKE_API_DEPRECATED; /*! * @function ifnet_get_tso_mtu @@ -2119,7 +2145,8 @@ extern errno_t ifnet_set_tso_mtu(ifnet_t interface, sa_family_t family, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_get_tso_mtu(ifnet_t interface, sa_family_t family, - u_int32_t *mtuLen); + u_int32_t *mtuLen) +__NKE_API_DEPRECATED; /*! 
* @enum Interface wake properties @@ -2139,7 +2166,8 @@ enum { * @param mask Mask of the properties to set or unset. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask); +extern errno_t ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) +__NKE_API_DEPRECATED; /*! * @function ifnet_get_wake_flags @@ -2147,7 +2175,8 @@ extern errno_t ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_i * @param interface The interface. * @result The wake properties */ -extern u_int32_t ifnet_get_wake_flags(ifnet_t interface); +extern u_int32_t ifnet_get_wake_flags(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_link_mib_data @@ -2166,7 +2195,8 @@ extern u_int32_t ifnet_get_wake_flags(ifnet_t interface); * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_set_link_mib_data(ifnet_t interface, void *mibData, - u_int32_t mibLen); + u_int32_t mibLen) +__NKE_API_DEPRECATED; /*! * @function ifnet_get_link_mib_data @@ -2183,7 +2213,8 @@ extern errno_t ifnet_set_link_mib_data(ifnet_t interface, void *mibData, * no data. */ extern errno_t ifnet_get_link_mib_data(ifnet_t interface, void *mibData, - u_int32_t *mibLen); + u_int32_t *mibLen) +__NKE_API_DEPRECATED; /*! * @function ifnet_get_link_mib_data_length @@ -2192,7 +2223,8 @@ extern errno_t ifnet_get_link_mib_data(ifnet_t interface, void *mibData, * @result Returns the number of bytes of mib data associated with the * interface. */ -extern u_int32_t ifnet_get_link_mib_data_length(ifnet_t interface); +extern u_int32_t ifnet_get_link_mib_data_length(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_attach_protocol @@ -2205,7 +2237,8 @@ extern u_int32_t ifnet_get_link_mib_data_length(ifnet_t interface); */ extern errno_t ifnet_attach_protocol(ifnet_t interface, protocol_family_t protocol_family, - const struct ifnet_attach_proto_param *proto_details); + const struct ifnet_attach_proto_param *proto_details) +__NKE_API_DEPRECATED; /*! * @function ifnet_attach_protocol_v2 @@ -2220,7 +2253,8 @@ extern errno_t ifnet_attach_protocol(ifnet_t interface, */ extern errno_t ifnet_attach_protocol_v2(ifnet_t interface, protocol_family_t protocol_family, - const struct ifnet_attach_proto_param_v2 *proto_details); + const struct ifnet_attach_proto_param_v2 *proto_details) +__NKE_API_DEPRECATED; /*! * @function ifnet_detach_protocol @@ -2231,7 +2265,8 @@ extern errno_t ifnet_attach_protocol_v2(ifnet_t interface, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_detach_protocol(ifnet_t interface, - protocol_family_t protocol_family); + protocol_family_t protocol_family) +__NKE_API_DEPRECATED; /*! * @function ifnet_output @@ -2257,7 +2292,8 @@ extern errno_t ifnet_detach_protocol(ifnet_t interface, */ extern errno_t ifnet_output(ifnet_t interface, protocol_family_t protocol_family, mbuf_t packet, void *route, - const struct sockaddr *dest); + const struct sockaddr *dest) +__NKE_API_DEPRECATED; /*! * @function ifnet_output_raw @@ -2277,7 +2313,8 @@ extern errno_t ifnet_output(ifnet_t interface, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_output_raw(ifnet_t interface, - protocol_family_t protocol_family, mbuf_t packet); + protocol_family_t protocol_family, mbuf_t packet) +__NKE_API_DEPRECATED; /*! * @function ifnet_input @@ -2294,7 +2331,8 @@ extern errno_t ifnet_output_raw(ifnet_t interface, * @result 0 on success otherwise the errno error.
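*
* Illustrative usage sketch (editorial addition, not from this patch;
* ifp and m are assumed to come from a driver's receive path, and the
* stats fields shown are those of ifnet_stat_increment_param):
*
*	struct ifnet_stat_increment_param stats;
*
*	bzero(&stats, sizeof(stats));
*	stats.packets_in = 1;
*	stats.bytes_in = (u_int32_t)mbuf_pkthdr_len(m);
*	(void)ifnet_input(ifp, m, &stats);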
*/ extern errno_t ifnet_input(ifnet_t interface, mbuf_t first_packet, - const struct ifnet_stat_increment_param *stats); + const struct ifnet_stat_increment_param *stats) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* @@ -2335,7 +2373,8 @@ extern errno_t ifnet_input_extended(ifnet_t interface, mbuf_t first_packet, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_ioctl(ifnet_t interface, protocol_family_t protocol, - unsigned long ioctl_code, void *ioctl_arg); + unsigned long ioctl_code, void *ioctl_arg) +__NKE_API_DEPRECATED; /*! * @function ifnet_event @@ -2345,7 +2384,8 @@ extern errno_t ifnet_ioctl(ifnet_t interface, protocol_family_t protocol, * event. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_event(ifnet_t interface, struct kern_event_msg *event_ptr); +extern errno_t ifnet_event(ifnet_t interface, struct kern_event_msg *event_ptr) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_mtu @@ -2360,21 +2400,24 @@ extern errno_t ifnet_event(ifnet_t interface, struct kern_event_msg *event_ptr); * @param mtu The new MTU. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_mtu(ifnet_t interface, u_int32_t mtu); +extern errno_t ifnet_set_mtu(ifnet_t interface, u_int32_t mtu) +__NKE_API_DEPRECATED; /*! * @function ifnet_mtu * @param interface The interface. * @result The MTU. */ -extern u_int32_t ifnet_mtu(ifnet_t interface); +extern u_int32_t ifnet_mtu(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_type * @param interface The interface. * @result The type. See net/if_types.h. */ -extern u_int8_t ifnet_type(ifnet_t interface); +extern u_int8_t ifnet_type(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_addrlen @@ -2386,14 +2429,16 @@ extern u_int8_t ifnet_type(ifnet_t interface); * @param addrlen The new address length. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_addrlen(ifnet_t interface, u_int8_t addrlen); +extern errno_t ifnet_set_addrlen(ifnet_t interface, u_int8_t addrlen) +__NKE_API_DEPRECATED; /*! * @function ifnet_addrlen * @param interface The interface. * @result The address length. */ -extern u_int8_t ifnet_addrlen(ifnet_t interface); +extern u_int8_t ifnet_addrlen(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_hdrlen @@ -2405,14 +2450,16 @@ extern u_int8_t ifnet_addrlen(ifnet_t interface); * @param hdrlen The new header length. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_hdrlen(ifnet_t interface, u_int8_t hdrlen); +extern errno_t ifnet_set_hdrlen(ifnet_t interface, u_int8_t hdrlen) +__NKE_API_DEPRECATED; /*! * @function ifnet_hdrlen * @param interface The interface. * @result The header length. */ -extern u_int8_t ifnet_hdrlen(ifnet_t interface); +extern u_int8_t ifnet_hdrlen(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_metric @@ -2424,14 +2471,16 @@ extern u_int8_t ifnet_hdrlen(ifnet_t interface); * @param metric The new metric. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_metric(ifnet_t interface, u_int32_t metric); +extern errno_t ifnet_set_metric(ifnet_t interface, u_int32_t metric) +__NKE_API_DEPRECATED; /*! * @function ifnet_metric * @param interface The interface. * @result The metric. */ -extern u_int32_t ifnet_metric(ifnet_t interface); +extern u_int32_t ifnet_metric(ifnet_t interface) +__NKE_API_DEPRECATED; /*! 
* @function ifnet_set_baudrate @@ -2443,14 +2492,16 @@ extern u_int32_t ifnet_metric(ifnet_t interface); * @param baudrate The new baudrate. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_baudrate(ifnet_t interface, u_int64_t baudrate); +extern errno_t ifnet_set_baudrate(ifnet_t interface, u_int64_t baudrate) +__NKE_API_DEPRECATED; /*! * @function ifnet_baudrate * @param interface The interface. * @result The baudrate. */ -extern u_int64_t ifnet_baudrate(ifnet_t interface); +extern u_int64_t ifnet_baudrate(ifnet_t interface) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE typedef struct if_bandwidths if_bandwidths_t; @@ -2535,7 +2586,8 @@ extern errno_t ifnet_latencies(ifnet_t interface, if_latencies_t *output_lt, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_stat_increment(ifnet_t interface, - const struct ifnet_stat_increment_param *counts); + const struct ifnet_stat_increment_param *counts) +__NKE_API_DEPRECATED; /*! * @function ifnet_stat_increment_in @@ -2555,7 +2607,8 @@ extern errno_t ifnet_stat_increment(ifnet_t interface, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_stat_increment_in(ifnet_t interface, - u_int32_t packets_in, u_int32_t bytes_in, u_int32_t errors_in); + u_int32_t packets_in, u_int32_t bytes_in, u_int32_t errors_in) +__NKE_API_DEPRECATED; /*! * @function ifnet_stat_increment_out @@ -2574,7 +2627,8 @@ extern errno_t ifnet_stat_increment_in(ifnet_t interface, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_stat_increment_out(ifnet_t interface, - u_int32_t packets_out, u_int32_t bytes_out, u_int32_t errors_out); + u_int32_t packets_out, u_int32_t bytes_out, u_int32_t errors_out) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_stat @@ -2590,7 +2644,8 @@ extern errno_t ifnet_stat_increment_out(ifnet_t interface, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_set_stat(ifnet_t interface, - const struct ifnet_stats_param *stats); + const struct ifnet_stats_param *stats) +__NKE_API_DEPRECATED; /*! * @function ifnet_stat @@ -2599,7 +2654,8 @@ extern errno_t ifnet_set_stat(ifnet_t interface, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_stat(ifnet_t interface, - struct ifnet_stats_param *out_stats); + struct ifnet_stats_param *out_stats) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_promiscuous @@ -2616,7 +2672,8 @@ extern errno_t ifnet_stat(ifnet_t interface, * zero, promiscuous mode will be disabled. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_set_promiscuous(ifnet_t interface, int on); +extern errno_t ifnet_set_promiscuous(ifnet_t interface, int on) +__NKE_API_DEPRECATED; /*! * @function ifnet_touch_lastchange @@ -2624,7 +2681,8 @@ extern errno_t ifnet_set_promiscuous(ifnet_t interface, int on); * @param interface The interface. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_touch_lastchange(ifnet_t interface); +extern errno_t ifnet_touch_lastchange(ifnet_t interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_lastchange @@ -2632,7 +2690,8 @@ extern errno_t ifnet_touch_lastchange(ifnet_t interface); * @param last_change A timeval struct to copy the last time changed in * to. */ -extern errno_t ifnet_lastchange(ifnet_t interface, struct timeval *last_change); +extern errno_t ifnet_lastchange(ifnet_t interface, struct timeval *last_change) +__NKE_API_DEPRECATED; /*! 
* @function ifnet_get_address_list @@ -2647,7 +2706,8 @@ extern errno_t ifnet_lastchange(ifnet_t interface, struct timeval *last_change); * @param addresses A pointer to a NULL terminated array of ifaddr_ts. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses); +extern errno_t ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses) +__NKE_API_DEPRECATED; /*! * @function ifnet_get_address_list_family @@ -2665,7 +2725,8 @@ extern errno_t ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses); * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_get_address_list_family(ifnet_t interface, - ifaddr_t **addresses, sa_family_t family); + ifaddr_t **addresses, sa_family_t family) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /*! @@ -2693,7 +2754,8 @@ __private_extern__ errno_t ifnet_get_address_list_family_internal(ifnet_t, * memory used for the array of references. * @param addresses An array of ifaddr_ts. */ -extern void ifnet_free_address_list(ifaddr_t *addresses); +extern void ifnet_free_address_list(ifaddr_t *addresses) +__NKE_API_DEPRECATED; /*! * @function ifnet_set_lladdr @@ -2705,7 +2767,8 @@ extern void ifnet_free_address_list(ifaddr_t *addresses); * @param lladdr_len The length, in bytes, of the link layer address. */ extern errno_t ifnet_set_lladdr(ifnet_t interface, const void *lladdr, - size_t lladdr_len); + size_t lladdr_len) +__NKE_API_DEPRECATED; /*! * @function ifnet_lladdr_copy_bytes @@ -2717,7 +2780,8 @@ extern errno_t ifnet_set_lladdr(ifnet_t interface, const void *lladdr, * length of the link-layer address. */ extern errno_t ifnet_lladdr_copy_bytes(ifnet_t interface, void *lladdr, - size_t length); + size_t length) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /*! @@ -2739,6 +2803,7 @@ extern errno_t ifnet_guarded_lladdr_copy_bytes(ifnet_t interface, void *lladdr, * @param interface The interface the link-layer address is on. */ extern void *ifnet_lladdr(ifnet_t interface); + #endif /* KERNEL_PRIVATE */ /*! @@ -2751,7 +2816,8 @@ extern void *ifnet_lladdr(ifnet_t interface); * @param out_len On return, the length of the broadcast address. */ extern errno_t ifnet_llbroadcast_copy_bytes(ifnet_t interface, void *addr, - size_t bufferlen, size_t *out_len); + size_t bufferlen, size_t *out_len) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /*! @@ -2767,7 +2833,8 @@ extern errno_t ifnet_llbroadcast_copy_bytes(ifnet_t interface, void *addr, * @param type The link-layer address type. */ extern errno_t ifnet_set_lladdr_and_type(ifnet_t interface, const void *lladdr, - size_t length, u_char type); + size_t length, u_char type) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ /*! @@ -2785,7 +2852,8 @@ extern errno_t ifnet_set_lladdr_and_type(ifnet_t interface, const void *lladdr, * indicate other failures. */ extern errno_t ifnet_resolve_multicast(ifnet_t ifp, - const struct sockaddr *proto_addr, struct sockaddr *ll_addr, size_t ll_len); + const struct sockaddr *proto_addr, struct sockaddr *ll_addr, size_t ll_len) +__NKE_API_DEPRECATED; /*! * @function ifnet_add_multicast @@ -2802,7 +2870,8 @@ extern errno_t ifnet_resolve_multicast(ifnet_t ifp, * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_add_multicast(ifnet_t interface, - const struct sockaddr *maddr, ifmultiaddr_t *multicast); + const struct sockaddr *maddr, ifmultiaddr_t *multicast) +__NKE_API_DEPRECATED; /*! 
* @function ifnet_remove_multicast @@ -2828,7 +2897,8 @@ extern errno_t ifnet_add_multicast(ifnet_t interface, * @param multicast The multicast to be removed. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_remove_multicast(ifmultiaddr_t multicast); +extern errno_t ifnet_remove_multicast(ifmultiaddr_t multicast) +__NKE_API_DEPRECATED; /*! * @function ifnet_get_multicast_list @@ -2844,7 +2914,8 @@ extern errno_t ifnet_remove_multicast(ifmultiaddr_t multicast); * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_get_multicast_list(ifnet_t interface, - ifmultiaddr_t **addresses); + ifmultiaddr_t **addresses) +__NKE_API_DEPRECATED; /*! * @function ifnet_free_multicast_list @@ -2853,7 +2924,8 @@ extern errno_t ifnet_get_multicast_list(ifnet_t interface, * multicast address and frees the array. * @param multicasts An array of references to the multicast addresses. */ -extern void ifnet_free_multicast_list(ifmultiaddr_t *multicasts); +extern void ifnet_free_multicast_list(ifmultiaddr_t *multicasts) +__NKE_API_DEPRECATED; /*! * @function ifnet_find_by_name @@ -2866,7 +2938,8 @@ extern void ifnet_free_multicast_list(ifmultiaddr_t *multicasts); * filled in if a matching interface is found. * @result 0 on success otherwise the errno error. */ -extern errno_t ifnet_find_by_name(const char *ifname, ifnet_t *interface); +extern errno_t ifnet_find_by_name(const char *ifname, ifnet_t *interface) +__NKE_API_DEPRECATED; /*! * @function ifnet_list_get @@ -2883,7 +2956,8 @@ extern errno_t ifnet_find_by_name(const char *ifname, ifnet_t *interface); * @result 0 on success otherwise the errno error. */ extern errno_t ifnet_list_get(ifnet_family_t family, ifnet_t **interfaces, - u_int32_t *count); + u_int32_t *count) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /*! @@ -2903,6 +2977,7 @@ extern errno_t ifnet_list_get(ifnet_family_t family, ifnet_t **interfaces, */ extern errno_t ifnet_list_get_all(ifnet_family_t family, ifnet_t **interfaces, u_int32_t *count); + #endif /* KERNEL_PRIVATE */ /*! @@ -2914,7 +2989,8 @@ extern errno_t ifnet_list_get_all(ifnet_family_t family, ifnet_t **interfaces, * ifnet_list_free. * @param interfaces An array of interface references from ifnet_list_get. */ -extern void ifnet_list_free(ifnet_t *interfaces); +extern void ifnet_list_free(ifnet_t *interfaces) +__NKE_API_DEPRECATED; /******************************************************************************/ /* ifaddr_t accessors */ @@ -2927,7 +3003,8 @@ extern void ifnet_list_free(ifnet_t *interfaces); * @param ifaddr The interface address. * @result 0 upon success */ -extern errno_t ifaddr_reference(ifaddr_t ifaddr); +extern errno_t ifaddr_reference(ifaddr_t ifaddr) +__NKE_API_DEPRECATED; /*! * @function ifaddr_release @@ -2936,7 +3013,8 @@ extern errno_t ifaddr_reference(ifaddr_t ifaddr); * @param ifaddr The interface address. * @result 0 upon success */ -extern errno_t ifaddr_release(ifaddr_t ifaddr); +extern errno_t ifaddr_release(ifaddr_t ifaddr) +__NKE_API_DEPRECATED; /*! * @function ifaddr_address @@ -2947,7 +3025,8 @@ extern errno_t ifaddr_release(ifaddr_t ifaddr); * @param ifaddr The interface address. * @result 0 upon success */ extern errno_t ifaddr_address(ifaddr_t ifaddr, struct sockaddr *out_addr, - u_int32_t addr_size); + u_int32_t addr_size) +__NKE_API_DEPRECATED; /*! * @function ifaddr_address_family @@ -2955,7 +3034,8 @@ extern errno_t ifaddr_address(ifaddr_t ifaddr, struct sockaddr *out_addr, * @param ifaddr The interface address. * @result 0 on failure, address family on success.
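*
* Illustrative usage sketch (editorial addition; ifaddr is assumed to
* be a valid reference, e.g. one entry from ifnet_get_address_list()
* above):
*
*	struct sockaddr_storage ss;
*
*	if (ifaddr_address_family(ifaddr) == AF_INET &&
*	    ifaddr_address(ifaddr, (struct sockaddr *)&ss,
*	    sizeof(ss)) == 0) {
*		... use the IPv4 address in ss ...
*	}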
*/ -extern sa_family_t ifaddr_address_family(ifaddr_t ifaddr); +extern sa_family_t ifaddr_address_family(ifaddr_t ifaddr) +__NKE_API_DEPRECATED; /*! * @function ifaddr_dstaddress @@ -2966,7 +3046,8 @@ extern sa_family_t ifaddr_address_family(ifaddr_t ifaddr); * @result 0 upon success */ extern errno_t ifaddr_dstaddress(ifaddr_t ifaddr, struct sockaddr *out_dstaddr, - u_int32_t dstaddr_size); + u_int32_t dstaddr_size) +__NKE_API_DEPRECATED; /*! * @function ifaddr_netmask @@ -2977,7 +3058,8 @@ extern errno_t ifaddr_dstaddress(ifaddr_t ifaddr, struct sockaddr *out_dstaddr, * @result 0 upon success */ extern errno_t ifaddr_netmask(ifaddr_t ifaddr, struct sockaddr *out_netmask, - u_int32_t netmask_size); + u_int32_t netmask_size) +__NKE_API_DEPRECATED; /*! * @function ifaddr_ifnet @@ -2989,7 +3071,8 @@ extern errno_t ifaddr_netmask(ifaddr_t ifaddr, struct sockaddr *out_netmask, * @param ifaddr The interface address. * @result A reference to the interface the address is attached to. */ -extern ifnet_t ifaddr_ifnet(ifaddr_t ifaddr); +extern ifnet_t ifaddr_ifnet(ifaddr_t ifaddr) +__NKE_API_DEPRECATED; /*! * @function ifaddr_withaddr @@ -2999,7 +3082,8 @@ extern ifnet_t ifaddr_ifnet(ifaddr_t ifaddr); * @param address The address to search for. * @result A reference to the interface address. */ -extern ifaddr_t ifaddr_withaddr(const struct sockaddr *address); +extern ifaddr_t ifaddr_withaddr(const struct sockaddr *address) +__NKE_API_DEPRECATED; /*! * @function ifaddr_withdstaddr @@ -3010,8 +3094,8 @@ extern ifaddr_t ifaddr_withaddr(const struct sockaddr *address); * @param destination The destination to search for. * @result A reference to the interface address. */ -extern ifaddr_t ifaddr_withdstaddr(const struct sockaddr *destination); - +extern ifaddr_t ifaddr_withdstaddr(const struct sockaddr *destination) +__NKE_API_DEPRECATED; /*! * @function ifaddr_withnet * @discussion Returns an interface address for the interface with the @@ -3021,7 +3105,8 @@ extern ifaddr_t ifaddr_withdstaddr(const struct sockaddr *destination); * @param net The network to search for. * @result A reference to the interface address. */ -extern ifaddr_t ifaddr_withnet(const struct sockaddr *net); +extern ifaddr_t ifaddr_withnet(const struct sockaddr *net) +__NKE_API_DEPRECATED; /*! * @function ifaddr_withroute @@ -3035,7 +3120,8 @@ extern ifaddr_t ifaddr_withnet(const struct sockaddr *net); * @result A reference to the interface address. */ extern ifaddr_t ifaddr_withroute(int flags, const struct sockaddr *destination, - const struct sockaddr *gateway); + const struct sockaddr *gateway) +__NKE_API_DEPRECATED; /*! * @function ifaddr_findbestforaddr @@ -3048,7 +3134,8 @@ extern ifaddr_t ifaddr_withroute(int flags, const struct sockaddr *destination, * @result A reference to the interface address. */ extern ifaddr_t ifaddr_findbestforaddr(const struct sockaddr *addr, - ifnet_t interface); + ifnet_t interface) +__NKE_API_DEPRECATED; /******************************************************************************/ /* ifmultiaddr_t accessors */ @@ -3061,7 +3148,8 @@ extern ifaddr_t ifaddr_findbestforaddr(const struct sockaddr *addr, * @param ifmaddr The interface multicast address. * @result 0 on success. Only error will be EINVAL if ifmaddr is not valid. */ -extern errno_t ifmaddr_reference(ifmultiaddr_t ifmaddr); +extern errno_t ifmaddr_reference(ifmultiaddr_t ifmaddr) +__NKE_API_DEPRECATED; /*! 
* @function ifmaddr_release @@ -3072,7 +3160,8 @@ extern errno_t ifmaddr_reference(ifmultiaddr_t ifmaddr); * @param ifmaddr The interface multicast address. * @result 0 on success. Only error will be EINVAL if ifmaddr is not valid. */ -extern errno_t ifmaddr_release(ifmultiaddr_t ifmaddr); +extern errno_t ifmaddr_release(ifmultiaddr_t ifmaddr) +__NKE_API_DEPRECATED; /*! * @function ifmaddr_address @@ -3082,7 +3171,8 @@ extern errno_t ifmaddr_release(ifmultiaddr_t ifmaddr); * @result 0 on success. */ extern errno_t ifmaddr_address(ifmultiaddr_t ifmaddr, - struct sockaddr *out_multicast, u_int32_t addr_size); + struct sockaddr *out_multicast, u_int32_t addr_size) +__NKE_API_DEPRECATED; /*! * @function ifmaddr_lladdress @@ -3093,7 +3183,8 @@ extern errno_t ifmaddr_address(ifmultiaddr_t ifmaddr, * @result 0 on success. */ extern errno_t ifmaddr_lladdress(ifmultiaddr_t ifmaddr, - struct sockaddr *out_link_layer_multicast, u_int32_t addr_size); + struct sockaddr *out_link_layer_multicast, u_int32_t addr_size) +__NKE_API_DEPRECATED; /*! * @function ifmaddr_ifnet @@ -3106,7 +3197,8 @@ extern errno_t ifmaddr_lladdress(ifmultiaddr_t ifmaddr, * @param ifmaddr The interface multicast address. * @result A reference to the interface. */ -extern ifnet_t ifmaddr_ifnet(ifmultiaddr_t ifmaddr); +extern ifnet_t ifmaddr_ifnet(ifmultiaddr_t ifmaddr) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /******************************************************************************/ diff --git a/bsd/net/kpi_interfacefilter.h b/bsd/net/kpi_interfacefilter.h index dd16bd7d4..819112d7a 100644 --- a/bsd/net/kpi_interfacefilter.h +++ b/bsd/net/kpi_interfacefilter.h @@ -39,6 +39,13 @@ #include #include +#ifndef PRIVATE +#include +#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15.4)) +#else +#define __NKE_API_DEPRECATED +#endif /* PRIVATE */ + struct kev_msg; __BEGIN_DECLS @@ -212,7 +219,8 @@ extern errno_t iflt_attach_internal(ifnet_t interface, const struct iff_filter * iflt_attach_internal((interface), (filter), (filter_ref)) #else extern errno_t iflt_attach(ifnet_t interface, const struct iff_filter *filter, - interface_filter_t *filter_ref); + interface_filter_t *filter_ref) +__NKE_API_DEPRECATED; #endif /* KERNEL_PRIVATE */ /*! @@ -220,7 +228,8 @@ extern errno_t iflt_attach(ifnet_t interface, const struct iff_filter *filter, * @discussion Detaches an interface filter from an interface. * @param filter_ref The reference to the filter from iflt_attach. */ -extern void iflt_detach(interface_filter_t filter_ref); +extern void iflt_detach(interface_filter_t filter_ref) +__NKE_API_DEPRECATED; __END_DECLS #endif /* __KPI_INTERFACEFILTER__ */ diff --git a/bsd/net/kpi_protocol.h b/bsd/net/kpi_protocol.h index f8b2ee8a4..f7ba31c2d 100644 --- a/bsd/net/kpi_protocol.h +++ b/bsd/net/kpi_protocol.h @@ -39,6 +39,13 @@ #include #include +#ifndef PRIVATE +#include +#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15.4)) +#else +#define __NKE_API_DEPRECATED +#endif /* PRIVATE */ + __BEGIN_DECLS /******************************************************************************/ @@ -103,7 +110,8 @@ extern void proto_unregister_input(protocol_family_t protocol); * @result A errno error on failure. Unless proto_input returns zero, * the caller is responsible for freeing the mbuf. 
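*
* Illustrative usage sketch (editorial addition; m is assumed to hold
* a fully-formed IPv4 packet). Per the result contract above, the
* caller frees the mbuf on a non-zero return:
*
*	errno_t err = proto_input(PF_INET, m);
*	if (err != 0) {
*		mbuf_freem(m);
*	}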
*/ -extern errno_t proto_input(protocol_family_t protocol, mbuf_t packet); +extern errno_t proto_input(protocol_family_t protocol, mbuf_t packet) +__NKE_API_DEPRECATED; /*! * @function proto_inject @@ -115,7 +123,8 @@ extern errno_t proto_input(protocol_family_t protocol, mbuf_t packet); * @result A errno error on failure. Unless proto_inject returns zero, * the caller is responsible for freeing the mbuf. */ -extern errno_t proto_inject(protocol_family_t protocol, mbuf_t packet); +extern errno_t proto_inject(protocol_family_t protocol, mbuf_t packet) +__NKE_API_DEPRECATED; /******************************************************************************/ @@ -164,7 +173,8 @@ typedef void (*proto_unplumb_handler)(ifnet_t ifp, protocol_family_t protocol); */ extern errno_t proto_register_plumber(protocol_family_t proto_fam, ifnet_family_t if_fam, proto_plumb_handler plumb, - proto_unplumb_handler unplumb); + proto_unplumb_handler unplumb) +__NKE_API_DEPRECATED; /*! * @function proto_unregister_plumber @@ -174,7 +184,8 @@ extern errno_t proto_register_plumber(protocol_family_t proto_fam, * @param if_fam The interface family these plumbing functions handle. */ extern void proto_unregister_plumber(protocol_family_t proto_fam, - ifnet_family_t if_fam); + ifnet_family_t if_fam) +__NKE_API_DEPRECATED; #ifdef BSD_KERNEL_PRIVATE /* diff --git a/bsd/net/necp.c b/bsd/net/necp.c index 2133f31f3..d410484ca 100644 --- a/bsd/net/necp.c +++ b/bsd/net/necp.c @@ -286,7 +286,8 @@ struct necp_socket_info { errno_t cred_result; unsigned has_client : 1; unsigned is_platform_binary : 1; - unsigned __pad_bits : 6; + unsigned used_responsible_pid : 1; + unsigned __pad_bits : 5; }; static lck_grp_attr_t *necp_kernel_policy_grp_attr = NULL; @@ -956,7 +957,8 @@ necp_session_set_session_priority(struct necp_session *session, struct necp_sess // Enforce special session priorities with entitlements if (requested_session_priority == NECP_SESSION_PRIORITY_CONTROL || - requested_session_priority == NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL) { + requested_session_priority == NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL || + requested_session_priority == NECP_SESSION_PRIORITY_HIGH_RESTRICTED) { errno_t cred_result = priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NECP_POLICIES, 0); if (cred_result != 0) { NECPLOG(LOG_ERR, "Session does not hold necessary entitlement to claim priority level %d", requested_session_priority); @@ -1788,6 +1790,10 @@ necp_policy_result_is_valid(u_int8_t *buffer, u_int32_t length) u_int32_t parameter_length = necp_policy_result_get_parameter_length_from_buffer(buffer, length); switch (type) { case NECP_POLICY_RESULT_PASS: + if (parameter_length == 0 || parameter_length == sizeof(u_int32_t)) { + validated = TRUE; + } + break; case NECP_POLICY_RESULT_DROP: case NECP_POLICY_RESULT_ROUTE_RULES: case NECP_POLICY_RESULT_SCOPED_DIRECT: @@ -3507,6 +3513,12 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli ultimate_result = necp_policy_get_result_type(policy); switch (ultimate_result) { case NECP_POLICY_RESULT_PASS: { + u_int32_t pass_flags = 0; + if (necp_policy_result_get_parameter_length_from_buffer(policy->result, policy->result_size) > 0) { + if (necp_policy_get_result_parameter(policy, (u_int8_t *)&pass_flags, sizeof(pass_flags))) { + ultimate_result_parameter.pass_flags = pass_flags; + } + } if (socket_only_conditions) { // socket_ip_conditions can be TRUE or FALSE socket_layer_non_id_conditions = TRUE; ip_output_layer_id_condition = TRUE; @@ -4016,7 +4028,7 @@ 
necp_get_result_description(char *result_string, necp_kernel_policy_result resul break; } case NECP_KERNEL_POLICY_RESULT_PASS: { - snprintf(result_string, MAX_RESULT_STRING_LEN, "Pass"); + snprintf(result_string, MAX_RESULT_STRING_LEN, "Pass (%X)", result_parameter.pass_flags); break; } case NECP_KERNEL_POLICY_RESULT_SKIP: { @@ -5928,7 +5940,7 @@ necp_get_parent_cred_result(proc_t proc, struct necp_socket_info *info) #define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_LOCAL_NETWORKS) static void -necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info) +necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, uuid_t responsible_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info) { memset(info, 0, sizeof(struct necp_socket_info)); @@ -5971,6 +5983,15 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic } } + if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_APP_ID && !uuid_is_null(responsible_application_uuid)) { + struct necp_uuid_id_mapping *existing_mapping = necp_uuid_lookup_app_id_locked(responsible_application_uuid); + if (existing_mapping != NULL) { + info->real_application_id = info->application_id; + info->application_id = existing_mapping->id; + info->used_responsible_pid = true; + } + } + if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && account != NULL) { struct necp_string_id_mapping *existing_mapping = necp_lookup_string_to_id_locked(&necp_account_id_list, account); if (existing_mapping) { @@ -6046,7 +6067,8 @@ necp_application_find_policy_match_internal(proc_t proc, struct necp_client_endpoint *returned_v4_gateway, struct necp_client_endpoint *returned_v6_gateway, struct rtentry **returned_route, bool ignore_address, - bool has_client) + bool has_client, + uuid_t *returned_override_euuid) { int error = 0; size_t offset = 0; @@ -6090,6 +6112,8 @@ necp_application_find_policy_match_internal(proc_t proc, uuid_clear(real_application_uuid); proc_getexecutableuuid(proc, real_application_uuid, sizeof(real_application_uuid)); uuid_copy(application_uuid, real_application_uuid); + uuid_t responsible_application_uuid; + uuid_clear(responsible_application_uuid); char *domain = NULL; char *account = NULL; @@ -6108,6 +6132,11 @@ necp_application_find_policy_match_internal(proc_t proc, bool has_checked_delegation_entitlement = FALSE; bool has_delegation_entitlement = FALSE; +#if defined(XNU_TARGET_OS_OSX) + proc_t effective_proc = proc; + bool 
release_eproc = false; +#endif /* defined(XNU_TARGET_OS_OSX) */ + if (returned_result == NULL) { return EINVAL; } @@ -6120,6 +6149,10 @@ necp_application_find_policy_match_internal(proc_t proc, memset(returned_v6_gateway, 0, sizeof(struct necp_client_endpoint)); } + if (returned_override_euuid != NULL) { + uuid_clear(*returned_override_euuid); + } + memset(returned_result, 0, sizeof(struct necp_aggregate_result)); u_int32_t drop_order = necp_process_drop_order(proc_ucred(proc)); @@ -6331,17 +6364,40 @@ necp_application_find_policy_match_internal(proc_t proc, return 0; } +#if defined(XNU_TARGET_OS_OSX) + if (proc_pid(effective_proc) != pid) { + proc_t found_proc = proc_find(pid); + if (found_proc != PROC_NULL) { + effective_proc = found_proc; + release_eproc = true; + } + } + if (effective_proc->p_responsible_pid > 0 && effective_proc->p_responsible_pid != pid) { + proc_t responsible_proc = proc_find(effective_proc->p_responsible_pid); + if (responsible_proc != PROC_NULL) { + proc_getexecutableuuid(responsible_proc, responsible_application_uuid, sizeof(responsible_application_uuid)); + proc_rele(responsible_proc); + } + } + if (release_eproc && effective_proc != PROC_NULL) { + proc_rele(effective_proc); + } +#endif /* defined(XNU_TARGET_OS_OSX) */ + // Lock lck_rw_lock_shared(&necp_kernel_policy_lock); u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; size_t route_rule_id_array_count = 0; - necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, proc, drop_order, client_flags, &info); + necp_application_fillout_info_locked(application_uuid, real_application_uuid, responsible_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, proc, drop_order, client_flags, &info); matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, proc, NULL, NULL, &drop_dest_policy_result, &drop_all_bypass); if (matched_policy) { returned_result->policy_id = matched_policy->id; returned_result->routing_result = matched_policy->result; memcpy(&returned_result->routing_result_parameter, &matched_policy->result_parameter, sizeof(returned_result->routing_result_parameter)); + if (returned_override_euuid != NULL && info.used_responsible_pid && !(matched_policy->condition_mask & NECP_KERNEL_CONDITION_REAL_APP_ID)) { + uuid_copy(*returned_override_euuid, responsible_application_uuid); + } } else { bool drop_all = false; if (necp_drop_all_order > 0 || info.drop_order > 0 || drop_dest_policy_result == NECP_KERNEL_POLICY_RESULT_DROP) { @@ -7184,12 +7240,27 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc } if (inp->inp_flags2 & INP2_WANT_APP_POLICY && necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_APP_ID) { + u_int32_t responsible_application_id = 0; + struct necp_uuid_id_mapping *existing_mapping = necp_uuid_lookup_app_id_locked(((so->so_flags & SOF_DELEGATED) ? 
so->e_uuid : so->last_uuid)); if (existing_mapping) { info->application_id = existing_mapping->id; } - if (!(so->so_flags & SOF_DELEGATED)) { +#if defined(XNU_TARGET_OS_OSX) + if (so->so_rpid > 0) { + existing_mapping = necp_uuid_lookup_app_id_locked(so->so_ruuid); + if (existing_mapping != NULL) { + responsible_application_id = existing_mapping->id; + } + } +#endif + + if (responsible_application_id > 0) { + info->real_application_id = info->application_id; + info->application_id = responsible_application_id; + info->used_responsible_pid = true; + } else if (!(so->so_flags & SOF_DELEGATED)) { info->real_application_id = info->application_id; } else if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_REAL_APP_ID) { struct necp_uuid_id_mapping *real_existing_mapping = necp_uuid_lookup_app_id_locked(so->last_uuid); @@ -7438,7 +7509,7 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) { skip_order = policy_search_array[i]->result_parameter.skip_policy_order; skip_session_order = policy_search_array[i]->session_order + 1; - if (skip_policy_id) { + if (skip_policy_id && *skip_policy_id == NECP_KERNEL_POLICY_ID_NONE) { *skip_policy_id = policy_search_array[i]->id; } continue; @@ -7588,6 +7659,11 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local // Check for loopback exception if (necp_socket_bypass(override_local_addr, override_remote_addr, inp)) { + if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) { + // If the previous policy result was "socket scoped", un-scope the socket. + inp->inp_flags &= ~INP_BOUND_IF; + inp->inp_boundifp = NULL; + } // Mark socket as a pass inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; @@ -7604,7 +7680,6 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local lck_rw_lock_shared(&necp_kernel_policy_lock); necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, drop_order, &info); - inp->inp_policyresult.app_id = info.application_id; // Check info u_int32_t flowhash = necp_socket_calc_flowhash_locked(&info); @@ -7619,8 +7694,10 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local return inp->inp_policyresult.policy_id; } + inp->inp_policyresult.app_id = info.application_id; + // Match socket to policy - necp_kernel_policy_id skip_policy_id; + necp_kernel_policy_id skip_policy_id = NECP_KERNEL_POLICY_ID_NONE; u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; size_t route_rule_id_array_count = 0; matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), &skip_policy_id, inp->inp_route.ro_rt, &drop_dest_policy_result, &drop_all_bypass); @@ -7721,6 +7798,10 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local inp->inp_policyresult.results.result = matched_policy->result; memcpy(&inp->inp_policyresult.results.result_parameter, &matched_policy->result_parameter, sizeof(matched_policy->result_parameter)); + if (info.used_responsible_pid && (matched_policy->condition_mask 
& NECP_KERNEL_CONDITION_REAL_APP_ID)) { + inp->inp_policyresult.app_id = info.real_application_id; + } + if (necp_socket_is_connected(inp) && (matched_policy->result == NECP_KERNEL_POLICY_RESULT_DROP || (matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && !necp_socket_uses_interface(inp, matched_policy->result_parameter.tunnel_interface_index)))) { @@ -9210,7 +9291,7 @@ necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel } else { packet->m_pkthdr.necp_mtag.necp_route_rule_id = inp->inp_policyresult.results.route_rule_id; } - packet->m_pkthdr.necp_mtag.necp_app_id = inp->inp_policyresult.app_id; + packet->m_pkthdr.necp_mtag.necp_app_id = (inp->inp_policyresult.app_id > UINT16_MAX ? (inp->inp_policyresult.app_id - UINT16_MAX) : inp->inp_policyresult.app_id); if (skip_policy_id != NECP_KERNEL_POLICY_ID_NONE && skip_policy_id != NECP_KERNEL_POLICY_ID_NO_MATCH) { @@ -9344,7 +9425,8 @@ necp_get_app_uuid_from_packet(struct mbuf *packet, bool found_mapping = FALSE; if (packet->m_pkthdr.necp_mtag.necp_app_id != 0) { lck_rw_lock_shared(&necp_kernel_policy_lock); - struct necp_uuid_id_mapping *entry = necp_uuid_lookup_uuid_with_app_id_locked(packet->m_pkthdr.necp_mtag.necp_app_id); + necp_app_id app_id = (packet->m_pkthdr.necp_mtag.necp_app_id < UINT16_MAX ? (packet->m_pkthdr.necp_mtag.necp_app_id + UINT16_MAX) : packet->m_pkthdr.necp_mtag.necp_app_id); + struct necp_uuid_id_mapping *entry = necp_uuid_lookup_uuid_with_app_id_locked(app_id); if (entry != NULL) { uuid_copy(app_uuid, entry->uuid); found_mapping = true; @@ -9746,6 +9828,11 @@ sysctl_handle_necp_drop_dest_level SYSCTL_HANDLER_ARGS case NECP_SESSION_PRIORITY_CONTROL: case NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL: case NECP_SESSION_PRIORITY_HIGH: + case NECP_SESSION_PRIORITY_HIGH_1: + case NECP_SESSION_PRIORITY_HIGH_2: + case NECP_SESSION_PRIORITY_HIGH_3: + case NECP_SESSION_PRIORITY_HIGH_4: + case NECP_SESSION_PRIORITY_HIGH_RESTRICTED: case NECP_SESSION_PRIORITY_DEFAULT: case NECP_SESSION_PRIORITY_LOW: if (tmp_drop_dest_policy.entry_count == 0) { diff --git a/bsd/net/necp.h b/bsd/net/necp.h index 5bf7a9de1..f658ad1ed 100644 --- a/bsd/net/necp.h +++ b/bsd/net/necp.h @@ -169,9 +169,14 @@ struct necp_packet_header { #define NECP_POLICY_RESULT_USE_NETAGENT 14 // netagent uuid_t #define NECP_POLICY_RESULT_NETAGENT_SCOPED 15 // netagent uuid_t #define NECP_POLICY_RESULT_SCOPED_DIRECT 16 // N/A, scopes to primary physical interface -#define NECP_POLICY_RESULT_ALLOW_UNENTITLED 17 // N/A +#define NECP_POLICY_RESULT_ALLOW_UNENTITLED 17 // N/A -#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_ALLOW_UNENTITLED +#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_ALLOW_UNENTITLED + +/* + * PASS Result Flags + */ +#define NECP_POLICY_PASS_NO_SKIP_IPSEC 0x01 /* * Route Rules @@ -238,11 +243,15 @@ struct necp_policy_condition_agent_type { #define NECP_SESSION_PRIORITY_UNKNOWN 0 #define NECP_SESSION_PRIORITY_CONTROL 1 -#define NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL 2 -#define NECP_SESSION_PRIORITY_HIGH 3 -#define NECP_SESSION_PRIORITY_DEFAULT 4 -#define NECP_SESSION_PRIORITY_LOW 5 - +#define NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL 2 +#define NECP_SESSION_PRIORITY_HIGH 3 +#define NECP_SESSION_PRIORITY_HIGH_1 4 +#define NECP_SESSION_PRIORITY_HIGH_2 5 +#define NECP_SESSION_PRIORITY_HIGH_3 6 +#define NECP_SESSION_PRIORITY_HIGH_4 7 +#define NECP_SESSION_PRIORITY_HIGH_RESTRICTED 8 +#define NECP_SESSION_PRIORITY_DEFAULT 9 +#define NECP_SESSION_PRIORITY_LOW 10 #define NECP_SESSION_NUM_PRIORITIES NECP_SESSION_PRIORITY_LOW 
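/*
 * Editorial sketch, not part of this change: with the priority levels
 * renumbered (DEFAULT moves from 4 to 9, LOW from 5 to 10), validity
 * checks should use the symbolic bounds rather than the old literal
 * range. The helper name below is hypothetical:
 *
 *	static bool
 *	necp_session_priority_is_valid(u_int32_t priority)
 *	{
 *		return (priority >= NECP_SESSION_PRIORITY_CONTROL &&
 *		    priority <= NECP_SESSION_PRIORITY_LOW);
 *	}
 */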
typedef u_int32_t necp_policy_id; @@ -685,6 +694,8 @@ struct necp_client_result_interface { u_int32_t index; }; +#define NECP_USES_INTERFACE_OPTIONS_FOR_BROWSE 1 + struct necp_client_interface_option { u_int32_t interface_index; u_int32_t interface_generation; @@ -846,7 +857,8 @@ extern int necp_application_find_policy_match_internal(proc_t proc, u_int8_t *pa struct necp_client_endpoint *returned_v4_gateway, struct necp_client_endpoint *returned_v6_gateway, struct rtentry **returned_route, bool ignore_address, - bool has_client); + bool has_client, + uuid_t *returned_override_euuid); /* * TLV utilities * @@ -922,7 +934,9 @@ typedef u_int32_t necp_app_id; #define NECP_KERNEL_POLICY_RESULT_USE_NETAGENT NECP_POLICY_RESULT_USE_NETAGENT #define NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED NECP_POLICY_RESULT_NETAGENT_SCOPED #define NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT NECP_POLICY_RESULT_SCOPED_DIRECT -#define NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED NECP_POLICY_RESULT_ALLOW_UNENTITLED +#define NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED NECP_POLICY_RESULT_ALLOW_UNENTITLED + +#define NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC NECP_POLICY_PASS_NO_SKIP_IPSEC typedef struct { u_int32_t identifier; @@ -937,6 +951,7 @@ typedef union { u_int32_t skip_policy_order; u_int32_t route_rule_id; u_int32_t netagent_id; + u_int32_t pass_flags; necp_kernel_policy_service service; } necp_kernel_policy_result_parameter; diff --git a/bsd/net/necp_client.c b/bsd/net/necp_client.c index 0e3dd1f47..0b3ef6782 100644 --- a/bsd/net/necp_client.c +++ b/bsd/net/necp_client.c @@ -382,6 +382,8 @@ struct necp_client { void *agent_handle; + uuid_t override_euuid; + size_t parameters_length; u_int8_t parameters[0]; @@ -1482,7 +1484,7 @@ necp_client_flow_is_viable(proc_t proc, struct necp_client *client, &result, &flow->necp_flow_flags, NULL, flow->interface_index, &flow->local_addr, &flow->remote_addr, NULL, NULL, - NULL, ignore_address, true); + NULL, ignore_address, true, NULL); // Check for blocking agents for (int i = 0; i < NECP_MAX_NETAGENTS; i++) { @@ -1634,6 +1636,60 @@ necp_client_mark_all_nonsocket_flows_as_invalid(struct necp_client *client) client->interface_option_count = 0; } +static inline bool +necp_netagent_is_required(const struct necp_client_parsed_parameters *parameters, + uuid_t *netagent_uuid) +{ + // Specific use agents only apply when required + bool required = false; + if (parameters != NULL) { + // Check required agent UUIDs + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { + if (uuid_is_null(parameters->required_netagents[i])) { + break; + } + if (uuid_compare(parameters->required_netagents[i], *netagent_uuid) == 0) { + required = true; + break; + } + } + + if (!required) { + // Check required agent types + bool fetched_type = false; + char netagent_domain[NETAGENT_DOMAINSIZE]; + char netagent_type[NETAGENT_TYPESIZE]; + memset(&netagent_domain, 0, NETAGENT_DOMAINSIZE); + memset(&netagent_type, 0, NETAGENT_TYPESIZE); + + for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { + if (strlen(parameters->required_netagent_types[i].netagent_domain) == 0 || + strlen(parameters->required_netagent_types[i].netagent_type) == 0) { + break; + } + + if (!fetched_type) { + if (netagent_get_agent_domain_and_type(*netagent_uuid, netagent_domain, netagent_type)) { + fetched_type = TRUE; + } else { + break; + } + } + + if ((strlen(parameters->required_netagent_types[i].netagent_domain) == 0 || + strncmp(netagent_domain, parameters->required_netagent_types[i].netagent_domain, NETAGENT_DOMAINSIZE) == 0) && + 
(strlen(parameters->required_netagent_types[i].netagent_type) == 0 || + strncmp(netagent_type, parameters->required_netagent_types[i].netagent_type, NETAGENT_TYPESIZE) == 0)) { + required = true; + break; + } + } + } + } + + return required; +} + static bool necp_netagent_applies_to_client(struct necp_client *client, const struct necp_client_parsed_parameters *parameters, @@ -1701,53 +1757,7 @@ necp_netagent_applies_to_client(struct necp_client *client, if (flags & NETAGENT_FLAG_SPECIFIC_USE_ONLY) { // Specific use agents only apply when required - bool required = FALSE; - if (parameters != NULL) { - // Check required agent UUIDs - for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { - if (uuid_is_null(parameters->required_netagents[i])) { - break; - } - if (uuid_compare(parameters->required_netagents[i], *netagent_uuid) == 0) { - required = TRUE; - break; - } - } - - if (!required) { - // Check required agent types - bool fetched_type = FALSE; - char netagent_domain[NETAGENT_DOMAINSIZE]; - char netagent_type[NETAGENT_TYPESIZE]; - memset(&netagent_domain, 0, NETAGENT_DOMAINSIZE); - memset(&netagent_type, 0, NETAGENT_TYPESIZE); - - for (int i = 0; i < NECP_MAX_AGENT_PARAMETERS; i++) { - if (strlen(parameters->required_netagent_types[i].netagent_domain) == 0 || - strlen(parameters->required_netagent_types[i].netagent_type) == 0) { - break; - } - - if (!fetched_type) { - if (netagent_get_agent_domain_and_type(*netagent_uuid, netagent_domain, netagent_type)) { - fetched_type = TRUE; - } else { - break; - } - } - - if ((strlen(parameters->required_netagent_types[i].netagent_domain) == 0 || - strncmp(netagent_domain, parameters->required_netagent_types[i].netagent_domain, NETAGENT_DOMAINSIZE) == 0) && - (strlen(parameters->required_netagent_types[i].netagent_type) == 0 || - strncmp(netagent_type, parameters->required_netagent_types[i].netagent_type, NETAGENT_TYPESIZE) == 0)) { - required = TRUE; - break; - } - } - } - } - - applies = required; + applies = necp_netagent_is_required(parameters, netagent_uuid); } else { applies = TRUE; } @@ -1773,6 +1783,32 @@ necp_client_add_agent_interface_options(struct necp_client *client, } } +static void +necp_client_add_browse_interface_options(struct necp_client *client, + const struct necp_client_parsed_parameters *parsed_parameters, + ifnet_t ifp) +{ + if (ifp != NULL && ifp->if_agentids != NULL) { + for (u_int32_t i = 0; i < ifp->if_agentcount; i++) { + if (uuid_is_null(ifp->if_agentids[i])) { + continue; + } + + u_int32_t flags = netagent_get_flags(ifp->if_agentids[i]); + if ((flags & NETAGENT_FLAG_REGISTERED) && + (flags & NETAGENT_FLAG_ACTIVE) && + (flags & NETAGENT_FLAG_SUPPORTS_BROWSE) && + (!(flags & NETAGENT_FLAG_SPECIFIC_USE_ONLY) || + necp_netagent_is_required(parsed_parameters, &ifp->if_agentids[i]))) { + necp_client_add_interface_option_if_needed(client, ifp->if_index, ifnet_get_generation(ifp), &ifp->if_agentids[i]); + + // Finding one is enough + break; + } + } + } +} + static inline bool necp_client_address_is_valid(struct sockaddr *address) { @@ -2418,7 +2454,7 @@ necp_client_lookup_bb_radio_manager(struct necp_client *client, } error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length, - &result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, true, true); + &result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, true, true, NULL); proc_rele(proc); proc = PROC_NULL; @@ -3069,7 +3105,8 @@ necp_calculate_client_result(proc_t proc, u_int32_t *flags, u_int32_t *reason, struct 
necp_client_endpoint *v4_gateway, - struct necp_client_endpoint *v6_gateway) + struct necp_client_endpoint *v6_gateway, + uuid_t *override_euuid) { struct rtentry *route = NULL; @@ -3087,7 +3124,8 @@ necp_calculate_client_result(proc_t proc, result, flags, reason, matching_if_index, NULL, NULL, v4_gateway, v6_gateway, - &route, false, true); + &route, false, true, + override_euuid); if (error != 0) { if (route != NULL) { rtfree(route); @@ -3220,14 +3258,16 @@ necp_update_client_result(proc_t proc, // Calculate the policy result struct necp_client_endpoint v4_gateway = {}; struct necp_client_endpoint v6_gateway = {}; - if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway)) { + uuid_t override_euuid; + uuid_clear(override_euuid); + if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway, &override_euuid)) { FREE(parsed_parameters, M_NECP); return FALSE; } if (necp_update_parsed_parameters(parsed_parameters, &result)) { // Changed the parameters based on result, try again (only once) - if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway)) { + if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags, &reason, &v4_gateway, &v6_gateway, &override_euuid)) { FREE(parsed_parameters, M_NECP); return FALSE; } @@ -3242,8 +3282,10 @@ necp_update_client_result(proc_t proc, // Save the last policy id on the client client->policy_id = result.policy_id; + uuid_copy(client->override_euuid, override_euuid); if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_MULTIPATH) || + (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_BROWSE) || ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) && result.routing_result != NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED)) { client->allow_multiple_flows = TRUE; @@ -3485,6 +3527,21 @@ necp_update_client_result(proc_t proc, } } } + } else if (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_BROWSE) { + if (result.routing_result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) { + if (direct_interface != NULL) { + // Add browse option if it has an agent + necp_client_add_browse_interface_options(client, parsed_parameters, direct_interface); + } + } else { + // Get browse interface options from global list + struct ifnet *browse_interface = NULL; + TAILQ_FOREACH(browse_interface, &ifnet_head, if_link) { + if (necp_ifnet_matches_parameters(browse_interface, parsed_parameters, 0, NULL, true, false)) { + necp_client_add_browse_interface_options(client, parsed_parameters, browse_interface); + } + } + } } // Add agents @@ -5277,7 +5334,11 @@ necp_client_copy_parameters_locked(struct necp_client *client, } parameters->ethertype = parsed_parameters.ethertype; parameters->traffic_class = parsed_parameters.traffic_class; - uuid_copy(parameters->euuid, parsed_parameters.effective_uuid); + if (uuid_is_null(client->override_euuid)) { + uuid_copy(parameters->euuid, parsed_parameters.effective_uuid); + } else { + uuid_copy(parameters->euuid, client->override_euuid); + } parameters->is_listener = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) ? 1 : 0; parameters->is_interpose = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) ? 1 : 0; parameters->is_custom_ether = (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_CUSTOM_ETHER) ? 
1 : 0; @@ -6188,7 +6249,7 @@ necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *r } error = necp_application_find_policy_match_internal(p, parameters, uap->parameters_size, - &returned_result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, false, false); + &returned_result, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, false, false, NULL); if (error) { goto done; } diff --git a/bsd/net/network_agent.c b/bsd/net/network_agent.c index a7d27aed8..0072bfaca 100644 --- a/bsd/net/network_agent.c +++ b/bsd/net/network_agent.c @@ -732,6 +732,11 @@ netagent_handle_register_inner(struct netagent_session *session, struct netagent { lck_rw_lock_exclusive(&netagent_lock); + if (session->wrapper != NULL) { + lck_rw_done(&netagent_lock); + return EINVAL; + } + new_wrapper->control_unit = session->control_unit; new_wrapper->event_handler = session->event_handler; new_wrapper->event_context = session->event_context; @@ -757,6 +762,7 @@ netagent_register(netagent_session_t _session, struct netagent *agent) { int data_size = 0; struct netagent_wrapper *new_wrapper = NULL; + uuid_t registered_uuid; struct netagent_session *session = (struct netagent_session *)_session; if (session == NULL) { @@ -790,6 +796,8 @@ netagent_register(netagent_session_t _session, struct netagent *agent) memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); __nochk_memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size); + uuid_copy(registered_uuid, new_wrapper->netagent.netagent_uuid); + int error = netagent_handle_register_inner(session, new_wrapper); if (error != 0) { FREE(new_wrapper, M_NETAGENT); @@ -797,7 +805,7 @@ netagent_register(netagent_session_t _session, struct netagent *agent) } NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); + netagent_post_event(registered_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); return 0; } @@ -810,6 +818,7 @@ netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payl struct netagent_wrapper *new_wrapper = NULL; u_int32_t response_error = 0; struct netagent *register_netagent = (struct netagent *)(void *)payload; + uuid_t registered_uuid; if (session == NULL) { NETAGENTLOG0(LOG_ERR, "Failed to find session"); @@ -859,6 +868,8 @@ netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payl memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); __nochk_memcpy(&new_wrapper->netagent, register_netagent, sizeof(struct netagent) + data_size); + uuid_copy(registered_uuid, new_wrapper->netagent.netagent_uuid); + response_error = netagent_handle_register_inner(session, new_wrapper); if (response_error != 0) { FREE(new_wrapper, M_NETAGENT); @@ -866,7 +877,7 @@ netagent_handle_register_setopt(struct netagent_session *session, u_int8_t *payl } NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); + netagent_post_event(registered_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); done: return response_error; @@ -880,8 +891,7 @@ netagent_handle_register_message(struct netagent_session *session, u_int32_t mes int data_size = 0; struct netagent_wrapper *new_wrapper = NULL; u_int32_t response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; - uuid_t netagent_uuid; - uuid_clear(netagent_uuid); + uuid_t registered_uuid; if (session == NULL) { NETAGENTLOG0(LOG_ERR, "Failed to find session"); @@ -928,11 +938,19 @@ 
netagent_handle_register_message(struct netagent_session *session, u_int32_t mes goto fail; } - (void)netagent_handle_register_inner(session, new_wrapper); + uuid_copy(registered_uuid, new_wrapper->netagent.netagent_uuid); + + error = netagent_handle_register_inner(session, new_wrapper); + if (error) { + NETAGENTLOG(LOG_ERR, "Failed to register agent: %d", error); + FREE(new_wrapper, M_NETAGENT); + response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; + goto fail; + } NETAGENTLOG0(LOG_DEBUG, "Registered new agent"); netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id); - netagent_post_event(new_wrapper->netagent.netagent_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); + netagent_post_event(registered_uuid, KEV_NETAGENT_REGISTERED, TRUE, false); return; fail: netagent_send_error_response(session, NETAGENT_MESSAGE_TYPE_REGISTER, message_id, response_error); @@ -1102,6 +1120,8 @@ netagent_update(netagent_session_t _session, struct netagent *agent) u_int8_t agent_changed; int data_size = 0; struct netagent_wrapper *new_wrapper = NULL; + bool should_update_immediately; + uuid_t updated_uuid; struct netagent_session *session = (struct netagent_session *)_session; if (session == NULL) { @@ -1134,10 +1154,12 @@ netagent_update(netagent_session_t _session, struct netagent *agent) memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); __nochk_memcpy(&new_wrapper->netagent, agent, sizeof(struct netagent) + data_size); + uuid_copy(updated_uuid, new_wrapper->netagent.netagent_uuid); + should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); + int error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainPOSIX); if (error == 0) { - bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); - netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); + netagent_post_event(updated_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); if (agent_changed == FALSE) { // The session wrapper does not need the "new_wrapper" as nothing changed FREE(new_wrapper, M_NETAGENT); @@ -1158,6 +1180,8 @@ netagent_handle_update_setopt(struct netagent_session *session, u_int8_t *payloa errno_t response_error = 0; struct netagent *update_netagent = (struct netagent *)(void *)payload; u_int8_t agent_changed; + bool should_update_immediately; + uuid_t updated_uuid; if (session == NULL) { NETAGENTLOG0(LOG_ERR, "Failed to find session"); @@ -1207,10 +1231,12 @@ netagent_handle_update_setopt(struct netagent_session *session, u_int8_t *payloa memset(new_wrapper, 0, sizeof(*new_wrapper) + data_size); __nochk_memcpy(&new_wrapper->netagent, update_netagent, sizeof(struct netagent) + data_size); + uuid_copy(updated_uuid, new_wrapper->netagent.netagent_uuid); + should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); + response_error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainPOSIX); if (response_error == 0) { - bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); - netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, 
should_update_immediately); + netagent_post_event(updated_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); if (agent_changed == FALSE) { // The session wrapper does not need the "new_wrapper" as nothing changed FREE(new_wrapper, M_NETAGENT); @@ -1232,6 +1258,8 @@ netagent_handle_update_message(struct netagent_session *session, u_int32_t messa struct netagent_wrapper *new_wrapper = NULL; u_int32_t response_error = NETAGENT_MESSAGE_ERROR_INTERNAL; u_int8_t agent_changed; + uuid_t updated_uuid; + bool should_update_immediately; if (session == NULL) { NETAGENTLOG0(LOG_ERR, "Failed to find session"); @@ -1277,6 +1305,9 @@ netagent_handle_update_message(struct netagent_session *session, u_int32_t messa goto fail; } + uuid_copy(updated_uuid, new_wrapper->netagent.netagent_uuid); + should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (new_wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); + response_error = netagent_handle_update_inner(session, new_wrapper, data_size, &agent_changed, kNetagentErrorDomainUserDefined); if (response_error != 0) { FREE(new_wrapper, M_NETAGENT); @@ -1284,8 +1315,7 @@ netagent_handle_update_message(struct netagent_session *session, u_int32_t messa } netagent_send_success_response(session, NETAGENT_MESSAGE_TYPE_UPDATE, message_id); - bool should_update_immediately = (NETAGENT_FLAG_UPDATE_IMMEDIATELY == (session->wrapper->netagent.netagent_flags & NETAGENT_FLAG_UPDATE_IMMEDIATELY)); - netagent_post_event(session->wrapper->netagent.netagent_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); + netagent_post_event(updated_uuid, KEV_NETAGENT_UPDATED, agent_changed, should_update_immediately); if (agent_changed == FALSE) { // The session wrapper does not need the "new_wrapper" as nothing changed diff --git a/bsd/netinet/dhcp.h b/bsd/netinet/dhcp.h index 29c1ee818..5ab4ab095 100644 --- a/bsd/netinet/dhcp.h +++ b/bsd/netinet/dhcp.h @@ -101,6 +101,6 @@ typedef int32_t dhcp_lease_t; /* relative time */ #define DHCP_INFINITE_LEASE ((dhcp_lease_t)-1) #define DHCP_INFINITE_TIME ((dhcp_time_secs_t)-1) -#define DHCP_FLAGS_BROADCAST ((u_short)0x0001) +#define DHCP_FLAGS_BROADCAST ((u_int16_t)0x8000) #endif /* _NETINET_DHCP_H */ diff --git a/bsd/netinet/flow_divert.c b/bsd/netinet/flow_divert.c index 5a5ab0961..1a405129f 100644 --- a/bsd/netinet/flow_divert.c +++ b/bsd/netinet/flow_divert.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -832,7 +833,7 @@ flow_divert_trie_insert(struct flow_divert_trie *trie, uint16_t string_start, si #define APPLE_WEBCLIP_ID_PREFIX "com.apple.webapp" static uint16_t -flow_divert_trie_search(struct flow_divert_trie *trie, uint8_t *string_bytes) +flow_divert_trie_search(struct flow_divert_trie *trie, const uint8_t *string_bytes) { uint16_t current = trie->root; uint16_t string_idx = 0; @@ -853,7 +854,6 @@ flow_divert_trie_search(struct flow_divert_trie *trie, uint8_t *string_bytes) return current; /* Got an exact match */ } else if (string_idx == strlen(APPLE_WEBCLIP_ID_PREFIX) && 0 == strncmp((const char *)string_bytes, APPLE_WEBCLIP_ID_PREFIX, string_idx)) { - string_bytes[string_idx] = '\0'; return current; /* Got an apple webclip id prefix match */ } else if (TRIE_NODE(trie, current).child_map != NULL_TRIE_IDX) { next = TRIE_CHILD(trie, current, string_bytes[string_idx]); @@ -953,33 +953,174 @@ flow_divert_find_proc_by_uuid(uuid_t uuid) } static int -flow_divert_get_src_proc(struct socket *so, proc_t *proc) +flow_divert_add_proc_info(struct 
flow_divert_pcb *fd_cb, proc_t proc, const char *signing_id, mbuf_t connect_packet, bool is_effective) { - int release = 0; + int error = 0; + int cdhash_error = 0; + unsigned char cdhash[SHA1_RESULTLEN] = { 0 }; + audit_token_t audit_token = {}; + const char *proc_cs_id = signing_id; - if (so->so_flags & SOF_DELEGATED) { - if ((*proc)->p_pid != so->e_pid) { - *proc = proc_find(so->e_pid); - release = 1; - } else if (uuid_compare((*proc)->p_uuid, so->e_uuid)) { - *proc = flow_divert_find_proc_by_uuid(so->e_uuid); - release = 1; + proc_lock(proc); + + if (proc_cs_id == NULL) { + if (proc->p_csflags & (CS_VALID | CS_DEBUGGED)) { + proc_cs_id = cs_identity_get(proc); + } else { + FDLOG0(LOG_ERR, fd_cb, "Signature of proc is invalid"); + } + } + + if (is_effective) { + lck_rw_lock_shared(&fd_cb->group->lck); + if (!(fd_cb->group->flags & FLOW_DIVERT_GROUP_FLAG_NO_APP_MAP)) { + if (proc_cs_id != NULL) { + uint16_t result = flow_divert_trie_search(&fd_cb->group->signing_id_trie, (const uint8_t *)proc_cs_id); + if (result == NULL_TRIE_IDX) { + FDLOG(LOG_WARNING, fd_cb, "%s did not match", proc_cs_id); + error = EPERM; + } else { + FDLOG(LOG_INFO, fd_cb, "%s matched", proc_cs_id); + } + } else { + error = EPERM; + } + } + lck_rw_done(&fd_cb->group->lck); + } + + if (error != 0) { + goto done; + } + + /* + * If signing_id is not NULL then it came from the flow divert token and will be added + * as part of the token, so there is no need to add it here. + */ + if (signing_id == NULL && proc_cs_id != NULL) { + error = flow_divert_packet_append_tlv(connect_packet, + (is_effective ? FLOW_DIVERT_TLV_SIGNING_ID : FLOW_DIVERT_TLV_APP_REAL_SIGNING_ID), + strlen(proc_cs_id), + proc_cs_id); + if (error != 0) { + FDLOG(LOG_ERR, fd_cb, "failed to append the signing ID: %d", error); + goto done; } - } else if (*proc == PROC_NULL) { - *proc = current_proc(); } - if (*proc != PROC_NULL) { - if ((*proc)->p_pid == 0) { - if (release) { - proc_rele(*proc); + cdhash_error = proc_getcdhash(proc, cdhash); + if (cdhash_error == 0) { + error = flow_divert_packet_append_tlv(connect_packet, + (is_effective ? FLOW_DIVERT_TLV_CDHASH : FLOW_DIVERT_TLV_APP_REAL_CDHASH), + sizeof(cdhash), + cdhash); + if (error) { + FDLOG(LOG_ERR, fd_cb, "failed to append the cdhash: %d", error); + goto done; + } + } else { + FDLOG(LOG_ERR, fd_cb, "failed to get the cdhash: %d", cdhash_error); + } + + task_t task = proc_task(proc); + if (task != TASK_NULL) { + mach_msg_type_number_t count = TASK_AUDIT_TOKEN_COUNT; + kern_return_t rc = task_info(task, TASK_AUDIT_TOKEN, (task_info_t)&audit_token, &count); + if (rc == KERN_SUCCESS) { + int append_error = flow_divert_packet_append_tlv(connect_packet, + (is_effective ? 
FLOW_DIVERT_TLV_APP_AUDIT_TOKEN : FLOW_DIVERT_TLV_APP_REAL_AUDIT_TOKEN), + sizeof(audit_token_t), + &audit_token); + if (append_error) { + FDLOG(LOG_ERR, fd_cb, "failed to append app audit token: %d", append_error); } - release = 0; - *proc = PROC_NULL; } } - return release; +done: + proc_unlock(proc); + + return error; +} + +static int +flow_divert_add_all_proc_info(struct flow_divert_pcb *fd_cb, struct socket *so, proc_t proc, const char *signing_id, mbuf_t connect_packet) +{ + int error = 0; + proc_t effective_proc = PROC_NULL; + proc_t responsible_proc = PROC_NULL; + proc_t real_proc = proc_find(so->last_pid); + bool release_real_proc = true; + + proc_t src_proc = PROC_NULL; + proc_t real_src_proc = PROC_NULL; + + if (real_proc == PROC_NULL) { + FDLOG(LOG_ERR, fd_cb, "failed to find the real proc record for %d", so->last_pid); + release_real_proc = false; + real_proc = proc; + if (real_proc == PROC_NULL) { + real_proc = current_proc(); + } + } + + if (so->so_flags & SOF_DELEGATED) { + if (real_proc->p_pid != so->e_pid) { + effective_proc = proc_find(so->e_pid); + } else if (uuid_compare(real_proc->p_uuid, so->e_uuid)) { + effective_proc = flow_divert_find_proc_by_uuid(so->e_uuid); + } + } + +#if defined(XNU_TARGET_OS_OSX) + lck_rw_lock_shared(&fd_cb->group->lck); + if (!(fd_cb->group->flags & FLOW_DIVERT_GROUP_FLAG_NO_APP_MAP)) { + if (so->so_rpid > 0) { + responsible_proc = proc_find(so->so_rpid); + } + } + lck_rw_done(&fd_cb->group->lck); +#endif + + real_src_proc = real_proc; + + if (responsible_proc != PROC_NULL) { + src_proc = responsible_proc; + if (effective_proc != NULL) { + real_src_proc = effective_proc; + } + } else if (effective_proc != PROC_NULL) { + src_proc = effective_proc; + } else { + src_proc = real_proc; + } + + error = flow_divert_add_proc_info(fd_cb, src_proc, signing_id, connect_packet, true); + if (error != 0) { + goto done; + } + + if (real_src_proc != NULL && real_src_proc != src_proc) { + error = flow_divert_add_proc_info(fd_cb, real_src_proc, NULL, connect_packet, false); + if (error != 0) { + goto done; + } + } + +done: + if (responsible_proc != PROC_NULL) { + proc_rele(responsible_proc); + } + + if (effective_proc != PROC_NULL) { + proc_rele(effective_proc); + } + + if (real_proc != PROC_NULL && release_real_proc) { + proc_rele(real_proc); + } + + return error; } static int @@ -1020,20 +1161,21 @@ flow_divert_send_packet(struct flow_divert_pcb *fd_cb, mbuf_t packet, Boolean en static int flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr *to, struct socket *so, proc_t p, mbuf_t *out_connect_packet) { - int error = 0; - int flow_type = 0; + int error = 0; + int flow_type = 0; char *signing_id = NULL; - int free_signing_id = 0; mbuf_t connect_packet = NULL; - proc_t src_proc = p; - int release_proc = 0; + cfil_sock_id_t cfil_sock_id = CFIL_SOCK_ID_NONE; + const void *cfil_id = NULL; + size_t cfil_id_size = 0; + struct inpcb *inp = sotoinpcb(so); + struct ifnet *ifp = NULL; error = flow_divert_packet_init(fd_cb, FLOW_DIVERT_PKT_CONNECT, &connect_packet); if (error) { goto done; } - error = EPERM; if (fd_cb->connect_token != NULL && (fd_cb->flags & FLOW_DIVERT_HAS_HMAC)) { uint32_t sid_size = 0; @@ -1043,103 +1185,22 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr if (signing_id != NULL) { flow_divert_packet_get_tlv(fd_cb->connect_token, 0, FLOW_DIVERT_TLV_SIGNING_ID, sid_size, signing_id, NULL); FDLOG(LOG_INFO, fd_cb, "Got %s from token", signing_id); - free_signing_id = 1; } } } 
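A pattern shared by the network_agent.c hunks above deserves a note: netagent_register, netagent_update, and their setopt/message variants now copy the agent UUID (and, for updates, the NETAGENT_FLAG_UPDATE_IMMEDIATELY bit) out of new_wrapper before calling the _inner helpers. On success those helpers hand ownership of the wrapper to the session, after which a concurrent update may free or replace it, so the old code's reads through new_wrapper or session->wrapper when posting the KEV event were potential use-after-frees; the register message handler also now propagates _inner failures instead of discarding them. A self-contained sketch of the copy-before-publish idiom, with illustrative names (publish, session_wrapper) standing in for the kernel's session machinery:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <uuid/uuid.h>

struct wrapper {
    uuid_t   uuid;
    uint32_t flags;
};

static struct wrapper *session_wrapper; /* stands in for session->wrapper */

/* Mirrors netagent_handle_register_inner: rejects double registration,
 * otherwise takes ownership of 'w'. */
static int
publish(struct wrapper *w)
{
    if (session_wrapper != NULL) {
        return EINVAL;
    }
    session_wrapper = w;
    return 0;
}

static int
register_agent(struct wrapper *w, uuid_t event_uuid)
{
    /* Copy out everything needed after publication *before* handing the
     * wrapper off: once publish() succeeds, another thread may free or
     * replace it, so reading 'w' (or session_wrapper) afterwards would
     * reproduce the bug the patch fixes. */
    uuid_copy(event_uuid, w->uuid);

    int error = publish(w);
    if (error != 0) {
        free(w); /* on failure the caller still owns the wrapper */
        return error;
    }
    /* Post the event from the saved copy, never from 'w'. */
    return 0;
}

int
main(void)
{
    struct wrapper *w = calloc(1, sizeof(*w));
    if (w == NULL) {
        return 1;
    }
    uuid_generate(w->uuid);

    uuid_t registered;
    if (register_agent(w, registered) == 0) {
        char s[37];
        uuid_unparse(registered, s);
        printf("registered %s\n", s);
    }
    return 0;
}

The flow-divert refactor above applies the same attribution precedence as the NECP change (responsible process first, then the delegated/effective process, then the socket's owning process) when choosing whose signing ID, cdhash, and audit-token TLVs to append to the connect packet, and appends the "real" process's info separately whenever it differs from the chosen source.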
socket_unlock(so, 0); - release_proc = flow_divert_get_src_proc(so, &src_proc); - if (src_proc != PROC_NULL) { - proc_lock(src_proc); - if (signing_id == NULL) { - if (src_proc->p_csflags & (CS_VALID | CS_DEBUGGED)) { - const char * cs_id; - cs_id = cs_identity_get(src_proc); - signing_id = __DECONST(char *, cs_id); - } else { - FDLOG0(LOG_WARNING, fd_cb, "Signature is invalid"); - } - } - } else { - FDLOG0(LOG_WARNING, fd_cb, "Failed to determine the current proc"); - } - - if (signing_id != NULL) { - uint16_t result = NULL_TRIE_IDX; - lck_rw_lock_shared(&fd_cb->group->lck); - if (fd_cb->group->flags & FLOW_DIVERT_GROUP_FLAG_NO_APP_MAP) { - result = 1; - } else { - result = flow_divert_trie_search(&fd_cb->group->signing_id_trie, (uint8_t *)signing_id); - } - lck_rw_done(&fd_cb->group->lck); - if (result != NULL_TRIE_IDX) { - error = 0; - FDLOG(LOG_INFO, fd_cb, "%s matched", signing_id); - - error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_SIGNING_ID, strlen(signing_id), signing_id); - if (error == 0) { - if (src_proc != PROC_NULL) { - unsigned char cdhash[SHA1_RESULTLEN]; - error = proc_getcdhash(src_proc, cdhash); - if (error == 0) { - error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_CDHASH, sizeof(cdhash), cdhash); - if (error) { - FDLOG(LOG_ERR, fd_cb, "failed to append the cdhash: %d", error); - } - } else { - FDLOG(LOG_ERR, fd_cb, "failed to get the cdhash: %d", error); - } - } - } else { - FDLOG(LOG_ERR, fd_cb, "failed to append the signing ID: %d", error); - } - } else { - FDLOG(LOG_WARNING, fd_cb, "%s did not match", signing_id); - } - } else { - FDLOG0(LOG_WARNING, fd_cb, "Failed to get the code signing identity"); - if (fd_cb->group->flags & FLOW_DIVERT_GROUP_FLAG_NO_APP_MAP) { - error = 0; - } - } + error = flow_divert_add_all_proc_info(fd_cb, so, p, signing_id, connect_packet); - if (error == 0 && src_proc != PROC_NULL) { - task_t task = proc_task(src_proc); - if (task != TASK_NULL) { - audit_token_t audit_token; - mach_msg_type_number_t count = TASK_AUDIT_TOKEN_COUNT; - kern_return_t rc = task_info(task, TASK_AUDIT_TOKEN, (task_info_t)&audit_token, &count); - if (rc == KERN_SUCCESS) { - error = flow_divert_packet_append_tlv(connect_packet, - FLOW_DIVERT_TLV_APP_AUDIT_TOKEN, - sizeof(audit_token_t), - &audit_token); - if (error) { - FDLOG(LOG_ERR, fd_cb, "failed to append app audit token: %d", error); - error = 0; /* do not treat this as fatal error, proceed */ - } - } else { - FDLOG(LOG_ERR, fd_cb, "failed to retrieve app audit token: %d", rc); - } - } - } - - if (src_proc != PROC_NULL) { - proc_unlock(src_proc); - if (release_proc) { - proc_rele(src_proc); - } - } socket_lock(so, 0); - if (free_signing_id) { + if (signing_id != NULL) { FREE(signing_id, M_TEMP); } if (error) { + FDLOG(LOG_ERR, fd_cb, "Failed to add source proc info: %d", error); goto done; } @@ -1168,40 +1229,6 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr goto done; } - if (fd_cb->so->so_flags & SOF_DELEGATED) { - error = flow_divert_packet_append_tlv(connect_packet, - FLOW_DIVERT_TLV_PID, - sizeof(fd_cb->so->e_pid), - &fd_cb->so->e_pid); - if (error) { - goto done; - } - - error = flow_divert_packet_append_tlv(connect_packet, - FLOW_DIVERT_TLV_UUID, - sizeof(fd_cb->so->e_uuid), - &fd_cb->so->e_uuid); - if (error) { - goto done; - } - } else { - error = flow_divert_packet_append_tlv(connect_packet, - FLOW_DIVERT_TLV_PID, - sizeof(fd_cb->so->e_pid), - &fd_cb->so->last_pid); - if (error) { - goto done; - } - - error = 
flow_divert_packet_append_tlv(connect_packet, - FLOW_DIVERT_TLV_UUID, - sizeof(fd_cb->so->e_uuid), - &fd_cb->so->last_uuid); - if (error) { - goto done; - } - } - if (fd_cb->connect_token != NULL) { unsigned int token_len = m_length(fd_cb->connect_token); mbuf_concatenate(connect_packet, fd_cb->connect_token); @@ -1225,7 +1252,6 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr error = EALREADY; goto done; } else { - struct inpcb *inp = sotoinpcb(so); if (flow_divert_has_pcb_local_address(inp)) { error = flow_divert_inp_to_sockaddr(inp, &fd_cb->local_address); if (error) { @@ -1244,6 +1270,21 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr } } + if ((inp->inp_flags | INP_BOUND_IF) && inp->inp_boundifp != NULL) { + ifp = inp->inp_boundifp; + } else if (inp->inp_last_outifp != NULL) { + ifp = inp->inp_last_outifp; + } + + if (ifp != NULL) { + uint32_t flow_if_index = ifp->if_index; + error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_OUT_IF_INDEX, + sizeof(flow_if_index), &flow_if_index); + if (error) { + goto done; + } + } + if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) { uint32_t flags = FLOW_DIVERT_TOKEN_FLAG_TFO; error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_FLAGS, sizeof(flags), &flags); @@ -1252,6 +1293,22 @@ flow_divert_create_connect_packet(struct flow_divert_pcb *fd_cb, struct sockaddr } } + cfil_sock_id = cfil_sock_id_from_socket(so); + if (cfil_sock_id != CFIL_SOCK_ID_NONE) { + cfil_id = &cfil_sock_id; + cfil_id_size = sizeof(cfil_sock_id); + } else if (so->so_flags1 & SOF1_CONTENT_FILTER_SKIP) { + cfil_id = &inp->necp_client_uuid; + cfil_id_size = sizeof(inp->necp_client_uuid); + } + + if (cfil_id != NULL && cfil_id_size > 0 && cfil_id_size <= sizeof(uuid_t)) { + error = flow_divert_packet_append_tlv(connect_packet, FLOW_DIVERT_TLV_CFIL_ID, cfil_id_size, cfil_id); + if (error) { + goto done; + } + } + done: if (!error) { *out_connect_packet = connect_packet; @@ -2071,15 +2128,14 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off flow_divert_disconnect_socket(fd_cb->so); } else if (!(fd_cb->so->so_state & SS_CANTRCVMORE)) { if (SOCK_TYPE(fd_cb->so) == SOCK_STREAM) { - if (sbappendstream(&fd_cb->so->so_rcv, data)) { - fd_cb->bytes_received += data_size; - flow_divert_add_data_statistics(fd_cb, data_size, FALSE); - fd_cb->sb_size = fd_cb->so->so_rcv.sb_cc; + int appended = sbappendstream(&fd_cb->so->so_rcv, data); + fd_cb->bytes_received += data_size; + flow_divert_add_data_statistics(fd_cb, data_size, FALSE); + fd_cb->sb_size += data_size; + if (appended) { sorwakeup(fd_cb->so); - data = NULL; - } else { - FDLOG0(LOG_ERR, fd_cb, "received data, but appendstream failed"); } + data = NULL; } else if (SOCK_TYPE(fd_cb->so) == SOCK_DGRAM) { struct sockaddr *append_sa; mbuf_t mctl; @@ -2097,14 +2153,14 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off mctl = flow_divert_get_control_mbuf(fd_cb); int append_error = 0; - if (sbappendaddr(&fd_cb->so->so_rcv, append_sa, data, mctl, &append_error)) { + if (sbappendaddr(&fd_cb->so->so_rcv, append_sa, data, mctl, &append_error) || append_error == EJUSTRETURN) { fd_cb->bytes_received += data_size; flow_divert_add_data_statistics(fd_cb, data_size, FALSE); - fd_cb->sb_size = fd_cb->so->so_rcv.sb_cc; - sorwakeup(fd_cb->so); + fd_cb->sb_size += data_size; + if (append_error == 0) { + sorwakeup(fd_cb->so); + } data = NULL; - } else if (append_error != EJUSTRETURN) { - 
FDLOG0(LOG_ERR, fd_cb, "received data, but sbappendaddr failed"); } if (!error) { FREE(append_sa, M_TEMP); @@ -2760,8 +2816,7 @@ flow_divert_inp_to_sockaddr(const struct inpcb *inp, struct sockaddr **local_soc static boolean_t flow_divert_has_pcb_local_address(const struct inpcb *inp) { - return inp->inp_lport != 0 - && (inp->inp_laddr.s_addr != INADDR_ANY || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)); + return inp->inp_lport != 0; } static errno_t diff --git a/bsd/netinet/flow_divert_proto.h b/bsd/netinet/flow_divert_proto.h index 5961653b9..705fa6b7e 100644 --- a/bsd/netinet/flow_divert_proto.h +++ b/bsd/netinet/flow_divert_proto.h @@ -63,13 +63,17 @@ #define FLOW_DIVERT_TLV_TARGET_PORT 23 #define FLOW_DIVERT_TLV_CDHASH 24 #define FLOW_DIVERT_TLV_SIGNING_ID 25 -#define FLOW_DIVERT_TLV_PID 26 -#define FLOW_DIVERT_TLV_UUID 27 + + #define FLOW_DIVERT_TLV_PREFIX_COUNT 28 #define FLOW_DIVERT_TLV_FLAGS 29 #define FLOW_DIVERT_TLV_FLOW_TYPE 30 #define FLOW_DIVERT_TLV_APP_DATA 31 #define FLOW_DIVERT_TLV_APP_AUDIT_TOKEN 32 +#define FLOW_DIVERT_TLV_APP_REAL_SIGNING_ID 33 +#define FLOW_DIVERT_TLV_APP_REAL_CDHASH 34 +#define FLOW_DIVERT_TLV_APP_REAL_AUDIT_TOKEN 35 +#define FLOW_DIVERT_TLV_CFIL_ID 36 #define FLOW_DIVERT_FLOW_TYPE_TCP 1 #define FLOW_DIVERT_FLOW_TYPE_UDP 3 diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index d1627fb39..d097b293f 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -3502,6 +3502,7 @@ inp_update_policy(struct inpcb *inp) uint32_t pflags = 0; int32_t ogencnt; int err = 0; + uint8_t *lookup_uuid = NULL; if (!net_io_policy_uuid || so == NULL || inp->inp_state == INPCB_STATE_DEAD) { @@ -3516,9 +3517,17 @@ inp_update_policy(struct inpcb *inp) return 0; } +#if defined(XNU_TARGET_OS_OSX) + if (so->so_rpid > 0) { + lookup_uuid = so->so_ruuid; + } +#endif + if (lookup_uuid == NULL) { + lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid); + } + ogencnt = so->so_policy_gencnt; - err = proc_uuid_policy_lookup(((so->so_flags & SOF_DELEGATED) ? 
- so->e_uuid : so->last_uuid), &pflags, &so->so_policy_gencnt); + err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt); /* * Discard cached generation count if the entry is gone (ENOENT), diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index 38a45abfb..3e92c7b94 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1261,6 +1261,9 @@ sendit: necp_mark_packet_from_ip(m, necp_matched_policy_id); switch (necp_result) { case NECP_KERNEL_POLICY_RESULT_PASS: + if (necp_result_parameter.pass_flags & NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC) { + break; + } /* Check if the interface is allowed */ if (!necp_packet_is_allowed_over_interface(m, ifp)) { error = EHOSTUNREACH; diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index b8fbb62d4..9803eaf09 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -236,6 +236,10 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautorcvbuf, CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_do_autorcvbuf, 1, "Enable automatic socket buffer tuning"); +SYSCTL_SKMEM_TCP_INT(OID_AUTO, autotunereorder, + CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autotune_reorder, 1, + "Enable automatic socket buffer tuning even when reordering is present"); + SYSCTL_SKMEM_TCP_INT(OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autorcvbuf_max, 512 * 1024, "Maximum receive socket buffer size"); @@ -1159,10 +1163,6 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, * - the high water mark already reached the maximum * - the stream is in background and receive side is being * throttled - * - if there are segments in reassembly queue indicating loss, - * do not need to increase recv window during recovery as more - * data is not going to be sent. A duplicate ack sent during - * recovery should not change the receive window */ if (tcp_do_autorcvbuf == 0 || (sbrcv->sb_flags & SB_AUTOSIZE) == 0 || @@ -1170,7 +1170,7 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, sbrcv->sb_hiwat >= rcvbuf_max || (tp->t_flagsext & TF_RECV_THROTTLE) || (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) || - !LIST_EMPTY(&tp->t_segq)) { + (!tcp_autotune_reorder && !LIST_EMPTY(&tp->t_segq))) { /* Can not resize the socket buffer, just return */ goto out; } @@ -1215,8 +1215,9 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) { if (tp->rfbuf_cnt + pktlen > (sbrcv->sb_hiwat - (sbrcv->sb_hiwat >> 1))) { - tp->rfbuf_cnt += pktlen; int32_t rcvbuf_inc, min_incr; + + tp->rfbuf_cnt += pktlen; /* * Increment the receive window by a * multiple of maximum sized segments. @@ -5401,6 +5402,11 @@ dodata: memcpy(&saved_hdr, ip, ip->ip_hl << 2); ip = (struct ip *)&saved_hdr[0]; } + + if (tcp_autotune_reorder) { + tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen, TCP_AUTORCVBUF_MAX(ifp)); + } + memcpy(&saved_tcphdr, th, sizeof(struct tcphdr)); thflags = tcp_reass(tp, th, &tlen, m, ifp, &read_wakeup); th = &saved_tcphdr; diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 6f63e40f7..7c1988f1f 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -978,39 +978,47 @@ after_sack_rexmit: * know that foreign host supports TAO, suppress sending segment. 
*/ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { - if (tp->t_state != TCPS_SYN_RECEIVED || tfo_enabled(tp)) - flags &= ~TH_SYN; - off--; - len++; - if (len > 0 && tp->t_state == TCPS_SYN_SENT) { - while (inp->inp_sndinprog_cnt == 0 && - tp->t_pktlist_head != NULL) { - packetlist = tp->t_pktlist_head; - packchain_listadd = tp->t_lastchain; - packchain_sent++; - TCP_PKTLIST_CLEAR(tp); - - error = tcp_ip_output(so, tp, packetlist, - packchain_listadd, tp_inp_options, - (so_options & SO_DONTROUTE), - (sack_rxmit || (sack_bytes_rxmt != 0)), - isipv6); + if (tp->t_state == TCPS_SYN_RECEIVED && tfo_enabled(tp) && tp->snd_nxt == tp->snd_una + 1) { + /* We are sending the SYN again! */ + off--; + len++; + } else { + if (tp->t_state != TCPS_SYN_RECEIVED || tfo_enabled(tp)) { + flags &= ~TH_SYN; } - /* - * tcp was closed while we were in ip, - * resume close - */ - if (inp->inp_sndinprog_cnt == 0 && - (tp->t_flags & TF_CLOSING)) { - tp->t_flags &= ~TF_CLOSING; - (void) tcp_close(tp); - } else { - tcp_check_timer_state(tp); + off--; + len++; + if (len > 0 && tp->t_state == TCPS_SYN_SENT) { + while (inp->inp_sndinprog_cnt == 0 && + tp->t_pktlist_head != NULL) { + packetlist = tp->t_pktlist_head; + packchain_listadd = tp->t_lastchain; + packchain_sent++; + TCP_PKTLIST_CLEAR(tp); + + error = tcp_ip_output(so, tp, packetlist, + packchain_listadd, tp_inp_options, + (so_options & SO_DONTROUTE), + (sack_rxmit || (sack_bytes_rxmt != 0)), + isipv6); + } + + /* + * tcp was closed while we were in ip, + * resume close + */ + if (inp->inp_sndinprog_cnt == 0 && + (tp->t_flags & TF_CLOSING)) { + tp->t_flags &= ~TF_CLOSING; + (void) tcp_close(tp); + } else { + tcp_check_timer_state(tp); + } + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, + 0,0,0,0,0); + return 0; } - KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, - 0,0,0,0,0); - return 0; } } diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index c19dc1706..df9551689 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -143,7 +143,7 @@ sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS return error; } - tt = s * TCP_RETRANSHZ / 1000; + tt = temp * TCP_RETRANSHZ / 1000; if (tt < 1 || tt > INT_MAX) { return EINVAL; } diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index b87f6ba7b..1d2f1a9a4 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -580,6 +580,9 @@ loopit: switch (necp_result) { case NECP_KERNEL_POLICY_RESULT_PASS: + if (necp_result_parameter.pass_flags & NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC) { + break; + } goto skip_ipsec; case NECP_KERNEL_POLICY_RESULT_DROP: error = EHOSTUNREACH; diff --git a/bsd/netinet6/nd6_prproxy.c b/bsd/netinet6/nd6_prproxy.c index c3cb5ecd4..100b4c482 100644 --- a/bsd/netinet6/nd6_prproxy.c +++ b/bsd/netinet6/nd6_prproxy.c @@ -279,7 +279,7 @@ nd6_ndprl_free(struct nd6_prproxy_prelist *ndprl) * Apply routing function on the affected upstream and downstream prefixes, * i.e. either set or clear RTF_PROXY on the cloning prefix route; all route * entries that were cloned off these prefixes will be blown away. Caller - * must have acquried proxy6_lock and must not be holding nd6_mutex. + * must have acquired proxy6_lock and must not be holding nd6_mutex. 
*/ static void nd6_prproxy_prelist_setroute(boolean_t enable, diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index b7c473e6a..87d880897 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -174,6 +174,7 @@ __private_extern__ u_int64_t natt_now = 0; static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1]; +static LIST_HEAD(_custom_sahtree, secashead) custom_sahtree; /* registed list */ #define SPIHASHSIZE 128 @@ -470,11 +471,11 @@ static struct mbuf *key_setdumpsp(struct secpolicy *, u_int8_t, u_int32_t, u_int32_t); static u_int key_getspreqmsglen(struct secpolicy *); static int key_spdexpire(struct secpolicy *); -static struct secashead *key_newsah(struct secasindex *, ifnet_t, u_int, u_int8_t); +static struct secashead *key_newsah(struct secasindex *, ifnet_t, u_int, u_int8_t, u_int16_t); static struct secasvar *key_newsav(struct mbuf *, const struct sadb_msghdr *, struct secashead *, int *, struct socket *); -static struct secashead *key_getsah(struct secasindex *); +static struct secashead *key_getsah(struct secasindex *, u_int16_t); static struct secasvar *key_checkspidup(struct secasindex *, u_int32_t); static void key_setspi __P((struct secasvar *, u_int32_t)); static struct secasvar *key_getsavbyspi(struct secashead *, u_int32_t); @@ -640,6 +641,7 @@ key_init(struct protosw *pp, struct domain *dp) ipsec_policy_count = 0; LIST_INIT(&sahtree); + LIST_INIT(&custom_sahtree); for (i = 0; i <= SADB_SATYPE_MAX; i++) { LIST_INIT(®tree[i]); @@ -3808,7 +3810,8 @@ static struct secashead * key_newsah(struct secasindex *saidx, ifnet_t ipsec_if, u_int outgoing_if, - u_int8_t dir) + u_int8_t dir, + u_int16_t flags) { struct secashead *newsah; @@ -3817,6 +3820,8 @@ key_newsah(struct secasindex *saidx, panic("key_newsaidx: NULL pointer is passed.\n"); } + VERIFY(flags == SECURITY_ASSOCIATION_PFKEY || flags == SECURITY_ASSOCIATION_CUSTOM_IPSEC); + newsah = keydb_newsecashead(); if (newsah == NULL) { return NULL; @@ -3854,7 +3859,13 @@ key_newsah(struct secasindex *saidx, newsah->dir = dir; /* add to saidxtree */ newsah->state = SADB_SASTATE_MATURE; - LIST_INSERT_HEAD(&sahtree, newsah, chain); + newsah->flags = flags; + + if (flags == SECURITY_ASSOCIATION_PFKEY) { + LIST_INSERT_HEAD(&sahtree, newsah, chain); + } else { + LIST_INSERT_HEAD(&custom_sahtree, newsah, chain); + } key_start_timehandler(); return newsah; @@ -4296,18 +4307,33 @@ key_delsav( * others : found, pointer to a SA. 
*/ static struct secashead * -key_getsah(struct secasindex *saidx) +key_getsah(struct secasindex *saidx, u_int16_t flags) { struct secashead *sah; LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED); - LIST_FOREACH(sah, &sahtree, chain) { - if (sah->state == SADB_SASTATE_DEAD) { - continue; + if ((flags & SECURITY_ASSOCIATION_ANY) == SECURITY_ASSOCIATION_ANY || + (flags & SECURITY_ASSOCIATION_PFKEY) == SECURITY_ASSOCIATION_PFKEY) { + LIST_FOREACH(sah, &sahtree, chain) { + if (sah->state == SADB_SASTATE_DEAD) { + continue; + } + if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID)) { + return sah; + } } - if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID)) { - return sah; + } + + if ((flags & SECURITY_ASSOCIATION_ANY) == SECURITY_ASSOCIATION_ANY || + (flags & SECURITY_ASSOCIATION_PFKEY) == SECURITY_ASSOCIATION_CUSTOM_IPSEC) { + LIST_FOREACH(sah, &custom_sahtree, chain) { + if (sah->state == SADB_SASTATE_DEAD) { + continue; + } + if (key_cmpsaidx(&sah->saidx, saidx, 0)) { + return sah; + } } } @@ -4322,9 +4348,9 @@ key_newsah2(struct secasindex *saidx, LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED); - sah = key_getsah(saidx); + sah = key_getsah(saidx, SECURITY_ASSOCIATION_ANY); if (!sah) { - return key_newsah(saidx, NULL, 0, dir); + return key_newsah(saidx, NULL, 0, dir, SECURITY_ASSOCIATION_PFKEY); } return sah; } @@ -7024,15 +7050,21 @@ key_getspi( } /* get a SA index */ - if ((newsah = key_getsah(&saidx)) == NULL) { + if ((newsah = key_getsah(&saidx, SECURITY_ASSOCIATION_ANY)) == NULL) { /* create a new SA index: key_addspi is always used for inbound spi */ - if ((newsah = key_newsah(&saidx, ipsec_if, key_get_outgoing_ifindex_from_message(mhp, SADB_X_EXT_IPSECIF), IPSEC_DIR_INBOUND)) == NULL) { + if ((newsah = key_newsah(&saidx, ipsec_if, key_get_outgoing_ifindex_from_message(mhp, SADB_X_EXT_IPSECIF), IPSEC_DIR_INBOUND, SECURITY_ASSOCIATION_PFKEY)) == NULL) { lck_mtx_unlock(sadb_mutex); ipseclog((LOG_DEBUG, "key_getspi: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } } + if ((newsah->flags & SECURITY_ASSOCIATION_CUSTOM_IPSEC) == SECURITY_ASSOCIATION_CUSTOM_IPSEC) { + lck_mtx_unlock(sadb_mutex); + ipseclog((LOG_ERR, "key_getspi: custom ipsec exists\n")); + return key_senderror(so, m, EEXIST); + } + /* get a new SA */ /* XXX rewrite */ newsav = key_newsav(m, mhp, newsah, &error, so); @@ -7348,7 +7380,7 @@ key_update( lck_mtx_lock(sadb_mutex); /* get a SA header */ - if ((sah = key_getsah(&saidx)) == NULL) { + if ((sah = key_getsah(&saidx, SECURITY_ASSOCIATION_PFKEY)) == NULL) { lck_mtx_unlock(sadb_mutex); ipseclog((LOG_DEBUG, "key_update: no SA index found.\n")); return key_senderror(so, m, ENOENT); @@ -7546,14 +7578,20 @@ key_migrate(struct socket *so, /* Find or create new SAH */ KEY_SETSECASIDX(proto, sah->saidx.mode, sah->saidx.reqid, src1 + 1, dst1 + 1, ipsec_if1 ? 
ipsec_if1->if_index : 0, &saidx1); - if ((newsah = key_getsah(&saidx1)) == NULL) { - if ((newsah = key_newsah(&saidx1, ipsec_if1, key_get_outgoing_ifindex_from_message(mhp, SADB_X_EXT_MIGRATE_IPSECIF), sah->dir)) == NULL) { + if ((newsah = key_getsah(&saidx1, SECURITY_ASSOCIATION_ANY)) == NULL) { + if ((newsah = key_newsah(&saidx1, ipsec_if1, key_get_outgoing_ifindex_from_message(mhp, SADB_X_EXT_MIGRATE_IPSECIF), sah->dir, SECURITY_ASSOCIATION_PFKEY)) == NULL) { lck_mtx_unlock(sadb_mutex); ipseclog((LOG_DEBUG, "key_migrate: No more memory.\n")); return key_senderror(so, m, ENOBUFS); } } + if ((newsah->flags & SECURITY_ASSOCIATION_CUSTOM_IPSEC) == SECURITY_ASSOCIATION_CUSTOM_IPSEC) { + lck_mtx_unlock(sadb_mutex); + ipseclog((LOG_ERR, "key_migrate: custom ipsec exists\n")); + return key_senderror(so, m, EEXIST); + } + /* Migrate SAV in to new SAH */ if (key_migratesav(sav, newsah) != 0) { lck_mtx_unlock(sadb_mutex); @@ -7738,9 +7776,9 @@ key_add( lck_mtx_lock(sadb_mutex); /* get a SA header */ - if ((newsah = key_getsah(&saidx)) == NULL) { + if ((newsah = key_getsah(&saidx, SECURITY_ASSOCIATION_ANY)) == NULL) { /* create a new SA header: key_addspi is always used for outbound spi */ - if ((newsah = key_newsah(&saidx, ipsec_if, key_get_outgoing_ifindex_from_message(mhp, SADB_X_EXT_IPSECIF), IPSEC_DIR_OUTBOUND)) == NULL) { + if ((newsah = key_newsah(&saidx, ipsec_if, key_get_outgoing_ifindex_from_message(mhp, SADB_X_EXT_IPSECIF), IPSEC_DIR_OUTBOUND, SECURITY_ASSOCIATION_PFKEY)) == NULL) { lck_mtx_unlock(sadb_mutex); ipseclog((LOG_DEBUG, "key_add: No more memory.\n")); bzero_keys(mhp); @@ -7748,6 +7786,13 @@ key_add( } } + if ((newsah->flags & SECURITY_ASSOCIATION_CUSTOM_IPSEC) == SECURITY_ASSOCIATION_CUSTOM_IPSEC) { + lck_mtx_unlock(sadb_mutex); + ipseclog((LOG_ERR, "key_add: custom ipsec exists\n")); + bzero_keys(mhp); + return key_senderror(so, m, EEXIST); + } + /* set spidx if there */ /* XXX rewrite */ error = key_setident(newsah, m, mhp); @@ -10885,3 +10930,115 @@ key_fill_offload_frames_for_savs(ifnet_t ifp, return frame_index; } + +#pragma mark Custom IPsec + +__private_extern__ bool +key_custom_ipsec_token_is_valid(void *ipsec_token) +{ + if (ipsec_token == NULL) { + return false; + } + + struct secashead *sah = (struct secashead *)ipsec_token; + + return (sah->flags & SECURITY_ASSOCIATION_CUSTOM_IPSEC) == SECURITY_ASSOCIATION_CUSTOM_IPSEC; +} + +__private_extern__ int +key_reserve_custom_ipsec(void **ipsec_token, union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst, + u_int8_t proto) +{ + if (src == NULL || dst == NULL) { + ipseclog((LOG_ERR, "register custom ipsec: invalid address\n")); + return EINVAL; + } + + if (src->sa.sa_family != dst->sa.sa_family) { + ipseclog((LOG_ERR, "register custom ipsec: address family mismatched\n")); + return EINVAL; + } + + if (src->sa.sa_len != dst->sa.sa_len) { + ipseclog((LOG_ERR, "register custom ipsec: address struct size mismatched\n")); + return EINVAL; + } + + if (ipsec_token == NULL) { + ipseclog((LOG_ERR, "register custom ipsec: invalid ipsec token\n")); + return EINVAL; + } + + switch (src->sa.sa_family) { + case AF_INET: + if (src->sa.sa_len != sizeof(struct sockaddr_in)) { + ipseclog((LOG_ERR, "register custom esp: invalid address length\n")); + return EINVAL; + } + break; + case AF_INET6: + if (src->sa.sa_len != sizeof(struct sockaddr_in6)) { + ipseclog((LOG_ERR, "register custom esp: invalid address length\n")); + return EINVAL; + } + break; + default: + ipseclog((LOG_ERR, "register custom esp: invalid address length\n")); + return 
EAFNOSUPPORT; + } + + if (proto != IPPROTO_ESP && proto != IPPROTO_AH) { + ipseclog((LOG_ERR, "register custom esp: invalid proto %u\n", proto)); + return EINVAL; + } + + struct secasindex saidx = {}; + KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, &src->sa, &dst->sa, 0, &saidx); + + lck_mtx_lock(sadb_mutex); + + struct secashead *sah = NULL; + if ((sah = key_getsah(&saidx, SECURITY_ASSOCIATION_ANY)) != NULL) { + lck_mtx_unlock(sadb_mutex); + ipseclog((LOG_ERR, "register custom esp: SA exists\n")); + return EEXIST; + } + + if ((sah = key_newsah(&saidx, NULL, 0, IPSEC_DIR_ANY, SECURITY_ASSOCIATION_CUSTOM_IPSEC)) == NULL) { + lck_mtx_unlock(sadb_mutex); + ipseclog((LOG_DEBUG, "register custom esp: No more memory.\n")); + return ENOBUFS; + } + + *ipsec_token = (void *)sah; + + lck_mtx_unlock(sadb_mutex); + return 0; +} + +__private_extern__ void +key_release_custom_ipsec(void **ipsec_token) +{ + struct secashead *sah = *ipsec_token; + VERIFY(sah != NULL); + + lck_mtx_lock(sadb_mutex); + + VERIFY((sah->flags & SECURITY_ASSOCIATION_CUSTOM_IPSEC) == SECURITY_ASSOCIATION_CUSTOM_IPSEC); + + bool sa_present = true; + if (LIST_FIRST(&sah->savtree[SADB_SASTATE_LARVAL]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_MATURE]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_DYING]) == NULL && + LIST_FIRST(&sah->savtree[SADB_SASTATE_DEAD]) == NULL) { + sa_present = false; + } + VERIFY(sa_present == false); + + key_delsah(sah); + + lck_mtx_unlock(sadb_mutex); + + *ipsec_token = NULL; + return; +} diff --git a/bsd/netkey/key.h b/bsd/netkey/key.h index 418f9792d..d07289b62 100644 --- a/bsd/netkey/key.h +++ b/bsd/netkey/key.h @@ -119,7 +119,9 @@ struct ifnet_keepalive_offload_frame; extern u_int32_t key_fill_offload_frames_for_savs(struct ifnet *, struct ifnet_keepalive_offload_frame *frames_array, u_int32_t, size_t); - +extern bool key_custom_ipsec_token_is_valid(void *); +extern int key_reserve_custom_ipsec(void **, union sockaddr_in_4_6 *, union sockaddr_in_4_6 *, u_int8_t proto); +extern void key_release_custom_ipsec(void **); #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETKEY_KEY_H_ */ diff --git a/bsd/netkey/keydb.h b/bsd/netkey/keydb.h index db7a04ef3..92e7e9655 100644 --- a/bsd/netkey/keydb.h +++ b/bsd/netkey/keydb.h @@ -49,6 +49,10 @@ struct secasindex { u_int ipsec_ifindex; }; +#define SECURITY_ASSOCIATION_ANY 0x0000 +#define SECURITY_ASSOCIATION_PFKEY 0x0001 +#define SECURITY_ASSOCIATION_CUSTOM_IPSEC 0x0010 + /* Security Association Data Base */ struct secashead { LIST_ENTRY(secashead) chain; @@ -68,6 +72,8 @@ struct secashead { /* The first of this list is newer SA */ struct route_in6 sa_route; /* route cache */ + + uint16_t flags; }; #define MAX_REPLAY_WINDOWS 4 diff --git a/bsd/nfs/krpc_subr.c b/bsd/nfs/krpc_subr.c index 50e547fb1..345762a0b 100644 --- a/bsd/nfs/krpc_subr.c +++ b/bsd/nfs/krpc_subr.c @@ -65,6 +65,9 @@ * */ +#include +#if CONFIG_NFS_CLIENT + #include #include #include @@ -604,3 +607,5 @@ out1: } return error; } + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index dcb9647e8..60828d4c0 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -844,6 +844,7 @@ struct nfs_location_index; struct nfs_socket; struct nfs_socket_search; struct nfsrv_uc_arg; +struct direntry; /* * The set of signals the interrupt an I/O in progress for NFSMNT_INT mounts. 
@@ -1288,6 +1289,7 @@ void nfs_vattr_set_bitmap(struct nfsmount *, uint32_t *, struct vnode_attr *) void nfs3_pathconf_cache(struct nfsmount *, struct nfs_fsattr *); int nfs3_mount_rpc(struct nfsmount *, struct sockaddr *, int, int, char *, vfs_context_t, int, fhandle_t *, struct nfs_sec *); void nfs3_umount_rpc(struct nfsmount *, vfs_context_t, int); +void nfs_rdirplus_update_node_attrs(nfsnode_t, struct direntry *, fhandle_t *, struct nfs_vattr *, uint64_t *); int nfs_node_access_slot(nfsnode_t, uid_t, int); void nfs_vnode_notify(nfsnode_t, uint32_t); diff --git a/bsd/nfs/nfs4_subs.c b/bsd/nfs/nfs4_subs.c index ffb82cb06..4d15ab0e4 100644 --- a/bsd/nfs/nfs4_subs.c +++ b/bsd/nfs/nfs4_subs.c @@ -26,6 +26,9 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include +#if CONFIG_NFS_CLIENT + /* * miscellaneous support functions for NFSv4 */ @@ -3032,3 +3035,5 @@ recheckdeleg: vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); } } + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs4_vnops.c b/bsd/nfs/nfs4_vnops.c index 261da73e2..f6619bfe2 100644 --- a/bsd/nfs/nfs4_vnops.c +++ b/bsd/nfs/nfs4_vnops.c @@ -26,6 +26,9 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include +#if CONFIG_NFS_CLIENT + /* * vnode op calls for NFS version 4 */ @@ -961,6 +964,10 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) if (rdirplus) { microuptime(&now); + if (lastcookie == 0) { + dnp->n_rdirplusstamp_sof = now.tv_sec; + dnp->n_rdirplusstamp_eof = 0; + } } /* loop through the entries packing them into the buffer */ @@ -1096,6 +1103,7 @@ nextbuffer: } *(time_t*)(&dp->d_name[dp->d_namlen + 1 + fhlen]) = now.tv_sec; dp->d_reclen = reclen; + nfs_rdirplus_update_node_attrs(dnp, dp, &fh, nvattrp, &savedxid); } padstart = dp->d_name + dp->d_namlen + 1 + xlen; ndbhp->ndbh_count++; @@ -1117,6 +1125,9 @@ nextbuffer: ndbhp->ndbh_flags |= (NDB_FULL | NDB_EOF); nfs_node_lock_force(dnp); dnp->n_eofcookie = lastcookie; + if (rdirplus) { + dnp->n_rdirplusstamp_eof = now.tv_sec; + } nfs_node_unlock(dnp); } else { more_entries = 1; @@ -8952,3 +8963,5 @@ nfs4_vnop_removenamedstream( #endif #endif /* CONFIG_NFS4 */ + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 2e2dec099..e7ddfaaa5 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -64,6 +64,10 @@ * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $ */ + +#include +#if CONFIG_NFS_CLIENT + #include #include #include @@ -4152,3 +4156,5 @@ nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx) } return error; } + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs_boot.c b/bsd/nfs/nfs_boot.c index 9f5ec1030..65728d3b1 100644 --- a/bsd/nfs/nfs_boot.c +++ b/bsd/nfs/nfs_boot.c @@ -92,6 +92,9 @@ * - replaced big automatic arrays with MALLOC'd data */ +#include +#if CONFIG_NFS_CLIENT + #include #include #include @@ -826,3 +829,5 @@ out: } #endif /* NETHER */ + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs_conf.h b/bsd/nfs/nfs_conf.h new file mode 100644 index 000000000..7b6662e46 --- /dev/null +++ b/bsd/nfs/nfs_conf.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _BSD_NFS_NFS_CONF_H_ +#define _BSD_NFS_NFS_CONF_H_ + +#if NFSCLIENT +#define CONFIG_NFS_CLIENT 1 +#endif /* NFSCLIENT */ + +#if NFSSERVER +#define CONFIG_NFS_SERVER 1 +#endif /* NFSSERVER */ + + +#if CONFIG_NFS_CLIENT || CONFIG_NFS_SERVER +#define CONFIG_NFS 1 +#endif /* CONFIG_NFS_CLIENT || CONFIG_NFS_SERVER */ + +#endif /* _BSD_NFS_NFS_CONF_H_ */ diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index 95d21f6c6..71188d12a 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -26,6 +26,9 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include +#if CONFIG_NFS + /************* * These functions implement RPCSEC_GSS security for the NFS client and server. * The code is specific to the use of Kerberos v5 and the use of DES MAC MD5 @@ -120,24 +123,24 @@ #define NFS_GSS_ISDBG (NFS_DEBUG_FACILITY & NFS_FAC_GSS) -#if NFSSERVER +#if CONFIG_NFS_SERVER u_long nfs_gss_svc_ctx_hash; struct nfs_gss_svc_ctx_hashhead *nfs_gss_svc_ctx_hashtbl; lck_mtx_t *nfs_gss_svc_ctx_mutex; lck_grp_t *nfs_gss_svc_grp; uint32_t nfsrv_gss_context_ttl = GSS_CTX_EXPIRE; #define GSS_SVC_CTX_TTL ((uint64_t)max(2*GSS_CTX_PEND, nfsrv_gss_context_ttl) * NSEC_PER_SEC) -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ -#if NFSCLIENT +#if CONFIG_NFS_CLIENT lck_grp_t *nfs_gss_clnt_grp; -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ #define KRB5_MAX_MIC_SIZE 128 uint8_t krb5_mech_oid[11] = { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 }; static uint8_t xdrpad[] = { 0x00, 0x00, 0x00, 0x00}; -#if NFSCLIENT +#if CONFIG_NFS_CLIENT static int nfs_gss_clnt_ctx_find(struct nfsreq *); static int nfs_gss_clnt_ctx_init(struct nfsreq *, struct nfs_gss_clnt_ctx *); static int nfs_gss_clnt_ctx_init_retry(struct nfsreq *, struct nfs_gss_clnt_ctx *); @@ -149,9 +152,9 @@ static void nfs_gss_clnt_ctx_clean(struct nfs_gss_clnt_ctx *); static int nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *, struct nfs_gss_clnt_ctx **); static void nfs_gss_clnt_ctx_destroy(struct nfs_gss_clnt_ctx *); static void nfs_gss_clnt_log_error(struct nfsreq *, struct nfs_gss_clnt_ctx *, uint32_t, uint32_t); -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ -#if NFSSERVER +#if CONFIG_NFS_SERVER static struct nfs_gss_svc_ctx *nfs_gss_svc_ctx_find(uint32_t); static void nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *); static void nfs_gss_svc_ctx_timer(void *, void *); @@ -160,7 +163,7 @@ static int nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *, uint32_t); /* This is only used by server code */ static void nfs_gss_nfsm_chain(struct nfsm_chain *, 
mbuf_t); -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ static void host_release_special_port(mach_port_t); static mach_port_t host_copy_special_port(mach_port_t); @@ -170,12 +173,12 @@ static int nfs_gss_mach_vmcopyout(vm_map_copy_t, uint32_t, u_char *); static int nfs_gss_mchain_length(mbuf_t); static int nfs_gss_append_chain(struct nfsm_chain *, mbuf_t); -#if NFSSERVER +#if CONFIG_NFS_SERVER thread_call_t nfs_gss_svc_ctx_timer_call; int nfs_gss_timer_on = 0; uint32_t nfs_gss_ctx_count = 0; const uint32_t nfs_gss_ctx_max = GSS_SVC_MAXCONTEXTS; -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ /* * Initialization when NFS starts @@ -183,18 +186,18 @@ const uint32_t nfs_gss_ctx_max = GSS_SVC_MAXCONTEXTS; void nfs_gss_init(void) { -#if NFSCLIENT +#if CONFIG_NFS_CLIENT nfs_gss_clnt_grp = lck_grp_alloc_init("rpcsec_gss_clnt", LCK_GRP_ATTR_NULL); -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ -#if NFSSERVER +#if CONFIG_NFS_SERVER nfs_gss_svc_grp = lck_grp_alloc_init("rpcsec_gss_svc", LCK_GRP_ATTR_NULL); nfs_gss_svc_ctx_hashtbl = hashinit(SVC_CTX_HASHSZ, M_TEMP, &nfs_gss_svc_ctx_hash); nfs_gss_svc_ctx_mutex = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); nfs_gss_svc_ctx_timer_call = thread_call_allocate(nfs_gss_svc_ctx_timer, NULL); -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ } /* @@ -389,7 +392,7 @@ rpc_gss_priv_data_create(gss_ctx_id_t ctx, mbuf_t *mb_head, uint32_t seqnum, uin return error; } -#if NFSCLIENT +#if CONFIG_NFS_CLIENT /* * Restore the argument or result from an rpc_gss_integ_data mbuf chain @@ -2818,14 +2821,14 @@ out: nfs_gss_clnt_ctx_unref(&req); return error; } -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ /************* * * Server functions */ -#if NFSSERVER +#if CONFIG_NFS_SERVER /* * Find a server context based on a handle value received @@ -3842,7 +3845,7 @@ nfs_gss_svc_cleanup(void) lck_mtx_unlock(nfs_gss_svc_ctx_mutex); } -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ /************* @@ -4013,7 +4016,7 @@ nfs_gss_append_chain(struct nfsm_chain *nmc, mbuf_t mc) return 0; } -#if NFSSERVER /* Only used by NFSSERVER */ +#if CONFIG_NFS_SERVER /* Only used by CONFIG_NFS_SERVER */ /* * Convert an mbuf chain to an NFS mbuf chain */ @@ -4034,7 +4037,7 @@ nfs_gss_nfsm_chain(struct nfsm_chain *nmc, mbuf_t mc) nmc->nmc_left = mbuf_trailingspace(tail); nmc->nmc_flags = 0; } -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ #if 0 @@ -4061,3 +4064,5 @@ hexdump(const char *msg, void *data, size_t len) } } #endif + +#endif /* CONFIG_NFS */ diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index ab5f4f4c9..352e1b61e 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -55,6 +55,9 @@ * from BSDI nfs_lock.c,v 2.4 1998/12/14 23:49:56 jch Exp */ +#include +#if CONFIG_NFS_CLIENT + #include #include #include @@ -1050,3 +1053,5 @@ nfslockdnotify(proc_t p, user_addr_t argp) return error; } + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index 90400cfa5..60bd5609f 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -65,6 +65,8 @@ * FreeBSD-Id: nfs_node.c,v 1.22 1997/10/28 14:06:20 bde Exp $ */ +#include +#if CONFIG_NFS_CLIENT #include #include @@ -1451,3 +1453,5 @@ out: return i <= nfsnodehash; } + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index 2ebb8994b..189978adf 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -65,6 +65,9 @@ * FreeBSD-Id: nfs_serv.c,v 1.52 1997/10/28 15:59:05 bde Exp $ */ +#include +#if 
CONFIG_NFS_SERVER + #include #include #include @@ -107,8 +110,6 @@ #include #endif -#if NFSSERVER - /* * NFS server globals */ @@ -4898,6 +4899,7 @@ nfsrv_statfs( VFSATTR_INIT(&va); VFSATTR_WANTED(&va, f_blocks); + VFSATTR_WANTED(&va, f_bfree); VFSATTR_WANTED(&va, f_bavail); VFSATTR_WANTED(&va, f_files); VFSATTR_WANTED(&va, f_ffree); @@ -5284,4 +5286,4 @@ nfsrv_authorize( return error; } -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index d0a6fd327..e5c2a590e 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -65,6 +65,9 @@ * FreeBSD-Id: nfs_socket.c,v 1.30 1997/10/28 15:59:07 bde Exp $ */ +#include +#if CONFIG_NFS + /* * Socket operations for use by nfs */ @@ -116,13 +119,13 @@ boolean_t current_thread_aborted(void); kern_return_t thread_terminate(thread_t); -#if NFSSERVER +#if CONFIG_NFS_SERVER int nfsrv_sock_max_rec_queue_length = 128; /* max # RPC records queued on (UDP) socket */ int nfsrv_getstream(struct nfsrv_sock *, int); int nfsrv_getreq(struct nfsrv_descript *); extern int nfsv3_procid[NFS_NPROCS]; -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ /* * compare two sockaddr structures @@ -153,7 +156,7 @@ nfs_sockaddr_cmp(struct sockaddr *sa1, struct sockaddr *sa2) return -1; } -#if NFSCLIENT +#if CONFIG_NFS_CLIENT int nfs_connect_search_new_socket(struct nfsmount *, struct nfs_socket_search *, struct timeval *); int nfs_connect_search_socket_connect(struct nfsmount *, struct nfs_socket *, int); @@ -6324,9 +6327,9 @@ nfs_up(struct nfsmount *nmp, thread_t thd, int flags, const char *msg) } -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ -#if NFSSERVER +#if CONFIG_NFS_SERVER /* * Generate the rpc reply header @@ -7046,4 +7049,6 @@ nfsrv_wakenfsd(struct nfsrv_sock *slp) wakeup(nd); } -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ + +#endif /* CONFIG_NFS */ diff --git a/bsd/nfs/nfs_srvcache.c b/bsd/nfs/nfs_srvcache.c index 639cca075..5addbf6fb 100644 --- a/bsd/nfs/nfs_srvcache.c +++ b/bsd/nfs/nfs_srvcache.c @@ -65,7 +65,9 @@ * FreeBSD-Id: nfs_srvcache.c,v 1.15 1997/10/12 20:25:46 phk Exp $ */ -#if NFSSERVER +#include +#if CONFIG_NFS_SERVER + /* * Reference: Chet Juszczak, "Improving the Performance and Correctness * of an NFS Server", in Proc. Winter 1989 USENIX Conference, @@ -455,4 +457,4 @@ nfsrv_cleancache(void) lck_mtx_unlock(nfsrv_reqcache_mutex); } -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index 9c061a432..b16d31846 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -65,6 +65,9 @@ * FreeBSD-Id: nfs_subs.c,v 1.47 1997/11/07 08:53:24 phk Exp $ */ +#include +#if CONFIG_NFS + /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and @@ -101,7 +104,7 @@ #include #include #include -#if NFSCLIENT +#if CONFIG_NFS_CLIENT #define _NFS_XDR_SUBS_FUNCS_ /* define this to get xdrbuf function definitions */ #endif #include @@ -217,7 +220,7 @@ vtonfsv2_mode(enum vtype vtype, mode_t m) } } -#if NFSSERVER +#if CONFIG_NFS_SERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
@@ -248,7 +251,7 @@ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NOOP }; -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers @@ -293,7 +296,7 @@ nfs_mbuf_init(void) nfs_mbuf_minclsize = ms.minclsize; } -#if NFSSERVER +#if CONFIG_NFS_SERVER /* * allocate a list of mbufs to hold the given amount of data @@ -338,7 +341,7 @@ nfsm_mbuf_get_list(size_t size, mbuf_t *mp, int *mbcnt) return error; } -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ /* * nfsm_chain_new_mbuf() @@ -830,7 +833,7 @@ nfsm_chain_get_uio(struct nfsm_chain *nmc, uint32_t len, uio_t uio) return error; } -#if NFSCLIENT +#if CONFIG_NFS_CLIENT int nfsm_chain_add_string_nfc(struct nfsm_chain *nmc, const uint8_t *s, uint32_t slen) @@ -2232,7 +2235,7 @@ nfs_mountopts(struct nfsmount *nmp, char *buf, int buflen) return c > buflen ? ENOMEM : 0; } -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ /* * Schedule a callout thread to run an NFS timer function @@ -2248,7 +2251,7 @@ nfs_interval_timer_start(thread_call_t call, int interval) } -#if NFSSERVER +#if CONFIG_NFS_SERVER int nfsrv_cmp_secflavs(struct nfs_sec *, struct nfs_sec *); int nfsrv_hang_addrlist(struct nfs_export *, struct user_nfs_export_args *); @@ -3027,6 +3030,8 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) vnode_t mvp = NULL, xvp = NULL; mount_t mp = NULL; char path[MAXPATHLEN]; + char fl_pathbuff[MAXPATHLEN]; + int fl_pathbuff_len = MAXPATHLEN; int expisroot; if (unxa->nxa_flags == NXA_CHECK) { @@ -3134,12 +3139,6 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) goto unlock_out; } if ((unxa->nxa_flags & (NXA_ADD | NXA_OFFLINE)) == NXA_ADD) { - /* if adding, verify that the mount is still what we expect */ - mp = vfs_getvfs_by_mntonname(nxfs->nxfs_path); - if (mp) { - mount_ref(mp, 0); - mount_iterdrop(mp); - } /* find exported FS root vnode */ NDINIT(&mnd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(nxfs->nxfs_path), ctx); @@ -3153,6 +3152,20 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) error = EINVAL; goto out; } + /* if adding, verify that the mount is still what we expect */ + mp = vfs_getvfs_by_mntonname(nxfs->nxfs_path); + if (!mp) { + /* check for firmlink-free path */ + if (vn_getpath_no_firmlink(mvp, fl_pathbuff, &fl_pathbuff_len) == 0 && + fl_pathbuff_len > 0 && + !strncmp(nxfs->nxfs_path, fl_pathbuff, MAXPATHLEN)) { + mp = vfs_getvfs_by_mntonname(vnode_mount(mvp)->mnt_vfsstat.f_mntonname); + } + } + if (mp) { + mount_ref(mp, 0); + mount_iterdrop(mp); + } /* sanity check: this should be same mount */ if (mp != vnode_mount(mvp)) { error = EINVAL; @@ -4507,4 +4520,6 @@ nfsrv_errmap(struct nfsrv_descript *nd, int err) return (int)*defaulterrp; } -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ + +#endif /* CONFIG_NFS */ diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index fe4bb37cf..adb45d85d 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -64,6 +64,9 @@ * @(#)nfs_syscalls.c 8.5 (Berkeley) 3/30/95 * FreeBSD-Id: nfs_syscalls.c,v 1.32 1997/11/07 08:53:25 phk Exp $ */ + +#include + /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce * support for mandatory and extensible security protections. 
This notice @@ -121,7 +124,7 @@ kern_return_t thread_terminate(thread_t); /* XXX */ -#if NFSSERVER +#if CONFIG_NFS_SERVER extern const nfsrv_proc_t nfsrv_procs[NFS_NPROCS]; @@ -141,15 +144,17 @@ void nfsrv_zapsock(struct nfsrv_sock *); void nfsrv_slpderef(struct nfsrv_sock *); void nfsrv_slpfree(struct nfsrv_sock *); -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ +#if CONFIG_NFS /* * sysctl stuff */ SYSCTL_DECL(_vfs_generic); SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "nfs hinge"); +#endif /* CONFIG_NFS */ -#if NFSCLIENT +#if CONFIG_NFS_CLIENT SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "nfs client hinge"); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_initial_delay, 0, ""); SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_delay, 0, ""); @@ -176,9 +181,9 @@ SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, root_steals_gss_context, CTLFLAG_R #if CONFIG_NFS4 SYSCTL_STRING(_vfs_generic_nfs_client, OID_AUTO, default_nfs4domain, CTLFLAG_RW | CTLFLAG_LOCKED, nfs4_default_domain, sizeof(nfs4_default_domain), ""); #endif -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ -#if NFSSERVER +#if CONFIG_NFS_SERVER SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, server, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "nfs server hinge"); SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay, 0, ""); SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay_v3, 0, ""); @@ -202,12 +207,9 @@ SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_limit, CTLFLAG_RW | C SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_max_seen, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_uc_queue_max_seen, 0, ""); SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, upcall_queue_count, CTLFLAG_RD | CTLFLAG_LOCKED, __DECONST(int *, &nfsrv_uc_queue_count), 0, ""); #endif -#endif /* NFSSERVER */ - +#endif /* CONFIG_NFS_SERVER */ -#if NFSCLIENT - -#if CONFIG_NFS4 +#if CONFIG_NFS_CLIENT && CONFIG_NFS4 static int mapname2id(struct nfs_testmapid *map) { @@ -287,11 +289,21 @@ nfsclnt_testidmap(proc_t p, user_addr_t argp) return error ? error : coerror; } +#endif /* CONFIG_NFS_CLIENT && CONFIG_NFS4 */ + +#if !CONFIG_NFS_CLIENT +#define __no_nfs_client_unused __unused +#else +#define __no_nfs_client_unused /* nothing */ #endif int -nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) +nfsclnt( + proc_t p __no_nfs_client_unused, + struct nfsclnt_args *uap __no_nfs_client_unused, + __unused int *retval) { +#if CONFIG_NFS_CLIENT struct lockd_ans la; int error; @@ -314,8 +326,12 @@ nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) error = EINVAL; } return error; +#else + return ENOSYS; +#endif /* CONFIG_NFS_CLIENT */ } +#if CONFIG_NFS_CLIENT /* * Asynchronous I/O threads for client NFS. 
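The new bsd/nfs/nfs_conf.h header introduced earlier in this patch maps the old NFSCLIENT/NFSSERVER build options onto CONFIG_NFS_CLIENT, CONFIG_NFS_SERVER, and their union CONFIG_NFS, and each NFS translation unit is now bracketed by the matching guard. Syscall entry points such as nfsclnt(), getfh(), fhopen(), and nfssvc() must stay linkable even when the component is compiled out, which is what the __no_nfs_client_unused/__no_nfs_server_unused parameter markers and the ENOSYS stubs above accomplish. A condensed sketch of the pattern, assuming <nfs/nfs_conf.h> is the include target elided by the diff rendering and using a hypothetical syscall name:

#include <nfs/nfs_conf.h>
#include <sys/errno.h>
#include <sys/proc.h>

#if !CONFIG_NFS_CLIENT
#define __no_nfs_client_unused  __unused
#else
#define __no_nfs_client_unused  /* nothing */
#endif

int
example_nfs_syscall(proc_t p __no_nfs_client_unused,
    struct example_args *uap __no_nfs_client_unused,
    __unused int *retval)
{
#if CONFIG_NFS_CLIENT
        /* ... real implementation, as in nfsclnt() above ... */
        return 0;
#else
        return ENOSYS;  /* client support compiled out of this kernel */
#endif
}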
@@ -512,16 +528,20 @@ worktodo: return 0; } -#endif /* NFSCLIENT */ - +#endif /* CONFIG_NFS_CLIENT */ -#if NFSSERVER +#if !CONFIG_NFS_SERVER +#define __no_nfs_server_unused __unused +#else +#define __no_nfs_server_unused /* nothing */ +#endif /* * NFS server system calls * getfh() lives here too, but maybe should move to kern/vfs_syscalls.c */ +#if CONFIG_NFS_SERVER static struct nfs_exportfs * nfsrv_find_exportfs(const char *ptr) { @@ -543,7 +563,10 @@ nfsrv_find_exportfs(const char *ptr) * Get file handle system call */ int -getfh(proc_t p, struct getfh_args *uap, __unused int *retval) +getfh( + proc_t p __no_nfs_server_unused, + struct getfh_args *uap __no_nfs_server_unused, + __unused int *retval) { vnode_t vp; struct nfs_filehandle nfh; @@ -665,7 +688,9 @@ out: error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t)); return error; } +#endif /* CONFIG_NFS_SERVER */ +#if CONFIG_NFS_SERVER extern const struct fileops vnops; /* @@ -676,9 +701,9 @@ extern const struct fileops vnops; * security hole. */ int -fhopen( proc_t p, - struct fhopen_args *uap, - int32_t *retval) +fhopen(proc_t p __no_nfs_server_unused, + struct fhopen_args *uap __no_nfs_server_unused, + int32_t *retval __no_nfs_server_unused) { vnode_t vp; struct nfs_filehandle nfh; @@ -835,12 +860,16 @@ bad: vnode_put(vp); return error; } +#endif /* CONFIG_NFS_SERVER */ +#if CONFIG_NFS_SERVER /* * NFS server pseudo system call */ int -nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval) +nfssvc(proc_t p __no_nfs_server_unused, + struct nfssvc_args *uap __no_nfs_server_unused, + __unused int *retval) { mbuf_t nam; struct user_nfsd_args user_nfsdarg; @@ -916,6 +945,9 @@ nfssvc(proc_t p, struct nfssvc_args *uap, __unused int *retval) } return error; } +#endif /* CONFIG_NFS_SERVER */ + +#if CONFIG_NFS_SERVER /* * Adds a socket to the list for servicing by nfsds. @@ -1831,4 +1863,4 @@ nfsrv_cleanup(void) nfsrv_udp6sock = NULL; } -#endif /* NFS_NOSERVER */ +#endif /* CONFIG_NFS_SERVER */ diff --git a/bsd/nfs/nfs_upcall.c b/bsd/nfs/nfs_upcall.c index 9b83d3fc6..b719f88a0 100644 --- a/bsd/nfs/nfs_upcall.c +++ b/bsd/nfs/nfs_upcall.c @@ -25,6 +25,10 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + +#include +#if CONFIG_NFS_SERVER + #include #include #include @@ -398,3 +402,5 @@ direct: return; } + +#endif /* CONFIG_NFS_SERVER */ diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 902680c68..ef5457410 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -64,6 +64,10 @@ * @(#)nfs_vfsops.c 8.12 (Berkeley) 5/20/95 * FreeBSD-Id: nfs_vfsops.c,v 1.52 1997/11/12 05:42:21 julian Exp $ */ + +#include +#if CONFIG_NFS_CLIENT + /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce * support for mandatory and extensible security protections. This notice @@ -3041,6 +3045,7 @@ mountnfs( nmp->nm_iodlink.tqe_next = NFSNOLIST; nmp->nm_deadtimeout = 0; nmp->nm_curdeadtimeout = 0; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_RDIRPLUS); /* enable RDIRPLUS by default. 
It will be reverted later in case NFSv2 is used */ NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_NOACL); nmp->nm_realm = NULL; nmp->nm_principal = NULL; @@ -6182,7 +6187,7 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, struct netfs_status *nsp = NULL; int timeoutmask; uint totlen, count, numThreads; -#if NFSSERVER +#if CONFIG_NFS_SERVER uint pos; struct nfs_exportfs *nxfs; struct nfs_export *nx; @@ -6195,7 +6200,7 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, struct nfs_user_stat_path_rec upath_rec; uint bytes_avail, bytes_total, recs_copied; uint numExports, numRecs; -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ /* * All names at this level are terminal. @@ -6303,7 +6308,7 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, *oldlenp = xb.xb_u.xb_buffer.xbb_len; xb_cleanup(&xb); break; -#if NFSSERVER +#if CONFIG_NFS_SERVER case NFS_EXPORTSTATS: /* setup export stat descriptor */ stat_desc.rec_vers = NFS_EXPORT_STAT_REC_VERSION; @@ -6549,7 +6554,7 @@ ustat_skip: error = copyout(&nfsrv_user_stat_node_count, oldp, sizeof(nfsrv_user_stat_node_count)); break; -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ case VFS_CTL_NOLOCKS: if (req->oldptr != USER_ADDR_NULL) { lck_mtx_lock(&nmp->nm_lock); @@ -6727,3 +6732,5 @@ ustat_skip: } return error; } + +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 1df01abc0..4f9208b25 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -65,6 +65,8 @@ * FreeBSD-Id: nfs_vnops.c,v 1.72 1997/11/07 09:20:48 phk Exp $ */ +#include +#if CONFIG_NFS_CLIENT /* * vnode op calls for Sun NFS version 2 and 3 @@ -462,6 +464,35 @@ int nfs_getattr_internal(nfsnode_t, struct nfs_vattr *, vfs_context_t, int); int nfs_refresh_fh(nfsnode_t, vfs_context_t); +/* + * Update nfsnode attributes to avoid extra getattr calls for each direntry. + * This function should be called only if RDIRPLUS flag is enabled. + */ +void +nfs_rdirplus_update_node_attrs(nfsnode_t dnp, struct direntry *dp, fhandle_t *fhp, struct nfs_vattr *nvattrp, uint64_t *savedxidp) +{ + nfsnode_t np; + struct componentname cn; + int isdot = (dp->d_namlen == 1) && (dp->d_name[0] == '.'); + int isdotdot = (dp->d_namlen == 2) && (dp->d_name[0] == '.') && (dp->d_name[1] == '.'); + + if (isdot || isdotdot) { + return; + } + + np = NULL; + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = dp->d_name; + cn.cn_namelen = dp->d_namlen; + cn.cn_nameiop = LOOKUP; + + nfs_nget(NFSTOMP(dnp), dnp, &cn, fhp->fh_data, fhp->fh_len, nvattrp, savedxidp, RPCAUTH_UNKNOWN, NG_NOCREATE, &np); + if (np) { + nfs_node_unlock(np); + vnode_put(NFSTOV(np)); + } +} + /* * Find the slot in the access cache for this UID. * If adding and no existing slot is found, reuse slots in FIFO order. @@ -1864,6 +1895,8 @@ nfs3_vnop_getattr( * } */*ap) { int error; + nfsnode_t np; + uint64_t supported_attrs; struct nfs_vattr nva; struct vnode_attr *vap = ap->a_vap; struct nfsmount *nmp; @@ -1878,7 +1911,9 @@ nfs3_vnop_getattr( /* Return the io size no matter what, since we don't go over the wire for this */ VATTR_RETURN(vap, va_iosize, nfs_iosize); - if ((vap->va_active & NFS3_SUPPORTED_VATTRS) == 0) { + supported_attrs = NFS3_SUPPORTED_VATTRS; + + if ((vap->va_active & supported_attrs) == 0) { return 0; } @@ -1887,6 +1922,18 @@ nfs3_vnop_getattr( (uint64_t)VM_KERNEL_ADDRPERM(ap->a_vp), ap->a_vp->v_name ? 
ap->a_vp->v_name : "empty"); } + + /* + * We should not go over the wire if only fileid was requested and has ever been populated. + */ + if ((vap->va_active & supported_attrs) == VNODE_ATTR_va_fileid) { + np = VTONFS(ap->a_vp); + if (np->n_attrstamp) { + VATTR_RETURN(vap, va_fileid, np->n_vattr.nva_fileid); + return 0; + } + } + error = nfs_getattr(VTONFS(ap->a_vp), &nva, ap->a_context, NGA_CACHED); if (error) { return error; @@ -3617,6 +3664,9 @@ skipread: out: nfs_node_lock_force(np); np->n_wrbusy--; + if ((ioflag & IO_SYNC) && !np->n_wrbusy && !np->n_numoutput) { + np->n_flag &= ~NMODIFIED; + } nfs_node_unlock(np); nfs_data_unlock(np); FSDBG_BOT(515, np, uio_offset(uio), uio_resid(uio), error); @@ -5441,7 +5491,7 @@ nfs_vnop_readdir( nfsnode_t dnp = VTONFS(dvp); struct nfsmount *nmp; uio_t uio = ap->a_uio; - int error, nfsvers, extended, numdirent, bigcookies, ptc, done; + int error, nfsvers, extended, numdirent, bigcookies, ptc, done, attrcachetimeout; uint16_t i, iptc, rlen, nlen; uint64_t cookie, nextcookie, lbn = 0; struct nfsbuf *bp = NULL; @@ -5449,6 +5499,7 @@ nfs_vnop_readdir( struct direntry *dp, *dpptc; struct dirent dent; char *cp = NULL; + struct timeval now; thread_t thd; nmp = VTONMP(dvp); @@ -5498,6 +5549,23 @@ nfs_vnop_readdir( } } + if (dnp->n_rdirplusstamp_eof && dnp->n_rdirplusstamp_sof) { + attrcachetimeout = nfs_attrcachetimeout(dnp); + microuptime(&now); + if (attrcachetimeout && (now.tv_sec - dnp->n_rdirplusstamp_sof > attrcachetimeout - 1)) { + dnp->n_rdirplusstamp_eof = dnp->n_rdirplusstamp_sof = 0; + nfs_invaldir(dnp); + nfs_node_unlock(dnp); + error = nfs_vinvalbuf(dvp, 0, ctx, 1); + if (!error) { + error = nfs_node_lock(dnp); + } + if (error) { + goto out; + } + } + } + /* * check for need to invalidate when (re)starting at beginning */ @@ -6021,6 +6089,8 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn struct nfsbuflists blist; daddr64_t lbn, nextlbn; int dotunder = (cnp->cn_namelen > 2) && (cnp->cn_nameptr[0] == '.') && (cnp->cn_nameptr[1] == '_'); + int isdot = (cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.'); + int isdotdot = (cnp->cn_namelen == 2) && (cnp->cn_nameptr[0] == '.') && (cnp->cn_nameptr[1] == '.'); nmp = NFSTONMP(dnp); if (nfs_mount_gone(nmp)) { @@ -6030,6 +6100,10 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn *npp = NULL; } + if (isdot || isdotdot) { + return 0; + } + /* first check most recent buffer (and next one too) */ lbn = dnp->n_lastdbl; for (i = 0; i < 2; i++) { @@ -6266,6 +6340,10 @@ noplus: if (rdirplus) { microuptime(&now); + if (lastcookie == 0) { + dnp->n_rdirplusstamp_sof = now.tv_sec; + dnp->n_rdirplusstamp_eof = 0; + } } /* loop through the entries packing them into the buffer */ @@ -6391,6 +6469,7 @@ nextbuffer: } *(time_t*)(&dp->d_name[dp->d_namlen + 1 + fhlen]) = now.tv_sec; dp->d_reclen = reclen; + nfs_rdirplus_update_node_attrs(dnp, dp, &fh, nvattrp, &savedxid); } padstart = dp->d_name + dp->d_namlen + 1 + xlen; ndbhp->ndbh_count++; @@ -6414,6 +6493,9 @@ nextbuffer: ndbhp->ndbh_flags |= (NDB_FULL | NDB_EOF); nfs_node_lock_force(dnp); dnp->n_eofcookie = lastcookie; + if (rdirplus) { + dnp->n_rdirplusstamp_eof = now.tv_sec; + } nfs_node_unlock(dnp); } else { more_entries = 1; @@ -8574,3 +8656,4 @@ nfs_vnode_notify(nfsnode_t np, uint32_t events) vnode_notify(NFSTOV(np), events, vap); } +#endif /* CONFIG_NFS_CLIENT */ diff --git a/bsd/nfs/nfsm_subs.h b/bsd/nfs/nfsm_subs.h index b16669fc5..4c6c8f56f 100644 --- a/bsd/nfs/nfsm_subs.h +++ 
b/bsd/nfs/nfsm_subs.h @@ -73,6 +73,8 @@ #ifdef __APPLE_API_PRIVATE +#include + int nfsm_rpchead(struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); int nfsm_rpchead2(struct nfsmount *, int, int, int, int, int, kauth_cred_t, struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); @@ -96,7 +98,7 @@ int nfsm_chain_get_fh_attr(struct nfsmount *, struct nfsm_chain *, nfsnode_t, int nfsm_chain_get_wcc_data_f(struct nfsm_chain *, nfsnode_t, struct timespec *, int *, u_int64_t *); int nfsm_chain_get_secinfo(struct nfsm_chain *, uint32_t *, int *); -#if NFSSERVER +#if CONFIG_NFS_SERVER void nfsm_adj(mbuf_t, int, int); int nfsm_mbuf_get_list(size_t, mbuf_t *, int *); @@ -106,7 +108,7 @@ int nfsm_chain_add_wcc_data_f(struct nfsrv_descript *, struct nfsm_chain *, int, int nfsm_chain_get_path_namei(struct nfsm_chain *, uint32_t, struct nameidata *); int nfsm_chain_get_sattr(struct nfsrv_descript *, struct nfsm_chain *, struct vnode_attr *); int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); -#endif /* NFSSERVER */ +#endif /* CONFIG_NFS_SERVER */ /* check name length */ #define nfsm_name_len_check(E, ND, LEN) \ diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index 9562d6144..83fa44505 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -574,6 +574,8 @@ struct nfsnode { u_int8_t n_access[NFS_ACCESS_CACHE_SIZE + 1]; /* ACCESS cache */ uid_t n_accessuid[NFS_ACCESS_CACHE_SIZE]; /* credentials having access */ time_t n_accessstamp[NFS_ACCESS_CACHE_SIZE]; /* access cache timestamp */ + time_t n_rdirplusstamp_sof; /* Readdirplus sof timestamp */ + time_t n_rdirplusstamp_eof; /* Readdirplus eof timestamp */ union { struct { struct timespec n3_mtime; /* Prev modify time. */ diff --git a/bsd/sys/_types/_fd_def.h b/bsd/sys/_types/_fd_def.h index 13137df5f..d32ee5153 100644 --- a/bsd/sys/_types/_fd_def.h +++ b/bsd/sys/_types/_fd_def.h @@ -28,7 +28,10 @@ #ifndef _FD_SET #define _FD_SET -#include /* __int32_t */ +#include /* __int32_t and uintptr_t */ +#if !KERNEL +#include +#endif /* * Select uses bit masks of file descriptors in longs. 
These macros @@ -49,17 +52,77 @@ __BEGIN_DECLS typedef struct fd_set { __int32_t fds_bits[__DARWIN_howmany(__DARWIN_FD_SETSIZE, __DARWIN_NFDBITS)]; } fd_set; + +#if !KERNEL +int __darwin_check_fd_set_overflow(int, const void *, int) __attribute__((__weak_import__)); +#endif __END_DECLS +#if !KERNEL +__header_always_inline int +__darwin_check_fd_set(int _a, const void *_b) +{ + if ((uintptr_t)&__darwin_check_fd_set_overflow != (uintptr_t) 0) { +#if defined(_DARWIN_UNLIMITED_SELECT) || defined(_DARWIN_C_SOURCE) + return __darwin_check_fd_set_overflow(_a, _b, 1); +#else + return __darwin_check_fd_set_overflow(_a, _b, 0); +#endif + } else { + return 1; + } +} + /* This inline avoids argument side-effect issues with FD_ISSET() */ -static __inline int -__darwin_fd_isset(int _n, const struct fd_set *_p) +__header_always_inline int +__darwin_fd_isset(int _fd, const struct fd_set *_p) +{ + if (__darwin_check_fd_set(_fd, (const void *) _p)) { + return _p->fds_bits[(unsigned long)_fd / __DARWIN_NFDBITS] & ((__int32_t)(((unsigned long)1) << ((unsigned long)_fd % __DARWIN_NFDBITS))); + } + + return 0; +} + +__header_always_inline void +__darwin_fd_set(int _fd, struct fd_set *const _p) +{ + if (__darwin_check_fd_set(_fd, (const void *) _p)) { + (_p->fds_bits[(unsigned long)_fd / __DARWIN_NFDBITS] |= ((__int32_t)(((unsigned long)1) << ((unsigned long)_fd % __DARWIN_NFDBITS)))); + } +} + +__header_always_inline void +__darwin_fd_clr(int _fd, struct fd_set *const _p) +{ + if (__darwin_check_fd_set(_fd, (const void *) _p)) { + (_p->fds_bits[(unsigned long)_fd / __DARWIN_NFDBITS] &= ~((__int32_t)(((unsigned long)1) << ((unsigned long)_fd % __DARWIN_NFDBITS)))); + } +} + +#else /* KERNEL */ + +__header_always_inline int +__darwin_fd_isset(int _fd, const struct fd_set *_p) +{ + return _p->fds_bits[(unsigned long)_fd / __DARWIN_NFDBITS] & ((__int32_t)(((unsigned long)1) << ((unsigned long)_fd % __DARWIN_NFDBITS))); +} + +__header_always_inline void +__darwin_fd_set(int _fd, struct fd_set *const _p) +{ + (_p->fds_bits[(unsigned long)_fd / __DARWIN_NFDBITS] |= ((__int32_t)(((unsigned long)1) << ((unsigned long)_fd % __DARWIN_NFDBITS)))); +} + +__header_always_inline void +__darwin_fd_clr(int _fd, struct fd_set *const _p) { - return _p->fds_bits[(unsigned long)_n / __DARWIN_NFDBITS] & ((__int32_t)(((unsigned long)1) << ((unsigned long)_n % __DARWIN_NFDBITS))); + (_p->fds_bits[(unsigned long)_fd / __DARWIN_NFDBITS] &= ~((__int32_t)(((unsigned long)1) << ((unsigned long)_fd % __DARWIN_NFDBITS)))); } +#endif /* KERNEL */ -#define __DARWIN_FD_SET(n, p) do { int __fd = (n); ((p)->fds_bits[(unsigned long)__fd/__DARWIN_NFDBITS] |= ((__int32_t)(((unsigned long)1)<<((unsigned long)__fd % __DARWIN_NFDBITS)))); } while(0) -#define __DARWIN_FD_CLR(n, p) do { int __fd = (n); ((p)->fds_bits[(unsigned long)__fd/__DARWIN_NFDBITS] &= ~((__int32_t)(((unsigned long)1)<<((unsigned long)__fd % __DARWIN_NFDBITS)))); } while(0) +#define __DARWIN_FD_SET(n, p) __darwin_fd_set((n), (p)) +#define __DARWIN_FD_CLR(n, p) __darwin_fd_clr((n), (p)) #define __DARWIN_FD_ISSET(n, p) __darwin_fd_isset((n), (p)) #if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 3 diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index 5b4f4c133..d5eecd682 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -531,8 +531,10 @@ typedef struct vol_attributes_attr { #define ATTR_CMNEXT_NOFIRMLINKPATH 0x00000020 #define ATTR_CMNEXT_REALDEVID 0x00000040 #define ATTR_CMNEXT_REALFSID 0x00000080 +#define ATTR_CMNEXT_CLONEID 0x00000100 +#define ATTR_CMNEXT_EXT_FLAGS 0x00000200 
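The _fd_def.h rework above routes FD_SET, FD_CLR, and FD_ISSET through always-inline helpers. In userspace builds the helpers first consult the weak-imported libc hook __darwin_check_fd_set_overflow (so binaries keep running on older systems where the symbol is absent), while kernel builds keep the original unchecked fast path. A userspace sketch of what this buys a caller; the out-of-range descriptor in the comment is deliberately invalid:

#include <sys/select.h>
#include <unistd.h>

int
example_wait_stdin(void)
{
        fd_set rfds;

        FD_ZERO(&rfds);
        FD_SET(STDIN_FILENO, &rfds);    /* in range: bit 0 is set */

        /*
         * With the new inlines, an out-of-range descriptor such as
         * FD_SET(FD_SETSIZE + 100, &rfds) is diverted to
         * __darwin_check_fd_set_overflow, which can refuse the access
         * (or abort, depending on system policy) instead of letting the
         * macro scribble past the end of rfds.
         */
        return select(STDIN_FILENO + 1, &rfds, NULL, NULL, NULL);
}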
-#define ATTR_CMNEXT_VALIDMASK 0x000000fc +#define ATTR_CMNEXT_VALIDMASK 0x000003fc #define ATTR_CMNEXT_SETMASK 0x00000000 /* Deprecated fork attributes */ diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index 98d3628dd..a5e6aae66 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -1460,6 +1460,7 @@ typedef struct dtrace_module_symbols { } dtrace_module_symbols_t; #define DTRACE_MODULE_SYMBOLS_SIZE(count) (sizeof(dtrace_module_symbols_t) + ((count - 1) * sizeof(dtrace_symbol_t))) +#define DTRACE_MODULE_SYMBOLS_COUNT(size) ((size - sizeof(dtrace_module_symbols_t)) / sizeof(dtrace_symbol_t) + 1) typedef struct dtrace_module_uuids_list { uint64_t dtmul_count; diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index a0138830e..ca681fc2b 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -124,6 +124,8 @@ struct image_params { uint64_t ip_dyld_fsid; uint64_t ip_dyld_fsobjid; unsigned int ip_simulator_binary; /* simulator binary flags */ + + ipc_port_t ip_sc_port; /* SUID port. */ }; /* diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index c30f0ba3d..a61252890 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -646,6 +646,7 @@ __BEGIN_DECLS /* The Kernel Debug Sub Classes for DBG_MISC */ #define DBG_MISC_COREBRIGHTNESS 0x01 +#define DBG_MISC_VIDEOENG 0x02 #define DBG_EVENT 0x10 #define DBG_MISC_INSTRUMENTS 0x11 #define DBG_MISC_INSTRUMENTSBT 0x12 diff --git a/bsd/sys/kern_memorystatus_freeze.h b/bsd/sys/kern_memorystatus_freeze.h index 6c5a8b6b6..c3894b394 100644 --- a/bsd/sys/kern_memorystatus_freeze.h +++ b/bsd/sys/kern_memorystatus_freeze.h @@ -44,14 +44,14 @@ typedef struct memorystatus_freeze_entry { #ifdef XNU_KERNEL_PRIVATE extern unsigned long freeze_threshold_percentage; -extern unsigned int memorystatus_frozen_count; +extern unsigned int memorystatus_frozen_count; /* # of processes that are currently frozen. */ extern unsigned int memorystatus_frozen_processes_max; extern unsigned int memorystatus_frozen_shared_mb; extern unsigned int memorystatus_frozen_shared_mb_max; extern unsigned int memorystatus_freeze_shared_mb_per_process_max; /* Max. MB allowed per process to be freezer-eligible. */ extern unsigned int memorystatus_freeze_private_shared_pages_ratio; /* Ratio of private:shared pages for a process to be freezer-eligible. */ extern unsigned int memorystatus_suspended_count; -extern unsigned int memorystatus_thaw_count; +extern unsigned int memorystatus_thaw_count; /* # of processes that have been thawed in the current interval. */ extern unsigned int memorystatus_refreeze_eligible_count; /* # of processes currently thawed i.e. have state on disk & in-memory */ void memorystatus_freeze_init(void); diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index 3e1429761..bfeafb186 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -47,6 +47,14 @@ #define __KPI_MBUF__ #include #include + +#ifndef PRIVATE +#include +#define __NKE_API_DEPRECATED __API_DEPRECATED("Network Kernel Extension KPI is deprecated", macos(10.4, 10.15.4)) +#else +#define __NKE_API_DEPRECATED +#endif /* PRIVATE */ + #ifdef KERNEL_PRIVATE #include #endif /* KERNEL_PRIVATE */ @@ -294,7 +302,8 @@ __BEGIN_DECLS * @param mbuf The mbuf. * @result A pointer to the data in the mbuf. */ -extern void *mbuf_data(mbuf_t mbuf); +extern void *mbuf_data(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_datastart @@ -307,7 +316,8 @@ extern void *mbuf_data(mbuf_t mbuf); * @param mbuf The mbuf. * @result A pointer to smallest possible value for data. 
*/ -extern void *mbuf_datastart(mbuf_t mbuf); +extern void *mbuf_datastart(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_setdata @@ -323,7 +333,8 @@ extern void *mbuf_datastart(mbuf_t mbuf); * @param len The new length of data in the mbuf. * @result 0 on success, errno error on failure. */ -extern errno_t mbuf_setdata(mbuf_t mbuf, void *data, size_t len); +extern errno_t mbuf_setdata(mbuf_t mbuf, void *data, size_t len) +__NKE_API_DEPRECATED; /*! * @function mbuf_align_32 @@ -336,7 +347,8 @@ extern errno_t mbuf_setdata(mbuf_t mbuf, void *data, size_t len); * data location. * @result 0 on success, errno error on failure. */ -extern errno_t mbuf_align_32(mbuf_t mbuf, size_t len); +extern errno_t mbuf_align_32(mbuf_t mbuf, size_t len) +__NKE_API_DEPRECATED; /*! * @function mbuf_data_to_physical @@ -355,7 +367,8 @@ extern errno_t mbuf_align_32(mbuf_t mbuf, size_t len); * @result The 64 bit physical address of the mbuf data or NULL if ptr * does not point to data stored in an mbuf. */ -extern addr64_t mbuf_data_to_physical(void *ptr); +extern addr64_t mbuf_data_to_physical(void *ptr) +__NKE_API_DEPRECATED; /* Allocation */ @@ -368,7 +381,8 @@ extern addr64_t mbuf_data_to_physical(void *ptr); * @param mbuf The mbuf. * @result 0 on success, errno error on failure. */ -extern errno_t mbuf_get(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf); +extern errno_t mbuf_get(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_gethdr @@ -380,7 +394,8 @@ extern errno_t mbuf_get(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf); * @param mbuf The mbuf. * @result 0 on success, errno error on failure. */ -extern errno_t mbuf_gethdr(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf); +extern errno_t mbuf_gethdr(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_attachcluster @@ -410,7 +425,8 @@ extern errno_t mbuf_gethdr(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf); */ extern errno_t mbuf_attachcluster(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf, caddr_t extbuf, void (*extfree)(caddr_t, u_int, caddr_t), - size_t extsize, caddr_t extarg); + size_t extsize, caddr_t extarg) +__NKE_API_DEPRECATED; /*! * @function mbuf_alloccluster @@ -435,7 +451,8 @@ extern errno_t mbuf_attachcluster(mbuf_how_t how, mbuf_type_t type, * In this case, the caller is advised to use 4096 bytes or * smaller during subseqent requests. */ -extern errno_t mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr); +extern errno_t mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr) +__NKE_API_DEPRECATED; /*! * @function mbuf_freecluster @@ -446,7 +463,8 @@ extern errno_t mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr); * @param addr The address of the cluster. * @param size The actual size of the cluster. */ -extern void mbuf_freecluster(caddr_t addr, size_t size); +extern void mbuf_freecluster(caddr_t addr, size_t size) +__NKE_API_DEPRECATED; #ifdef BSD_KERNEL_PRIVATE /* @@ -491,6 +509,7 @@ extern errno_t mbuf_cluster_get_prop(mbuf_t mbuf, u_int32_t *prop); */ extern errno_t mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t *mbuf); +__NKE_API_DEPRECATED; /*! * @function mbuf_mclget @@ -507,7 +526,8 @@ extern errno_t mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, * will be freed. If you specify an mbuf value in *mbuf, * mbuf_mclget will not free it. 
*/ -extern errno_t mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf); +extern errno_t mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_allocpacket @@ -545,7 +565,8 @@ extern errno_t mbuf_mclget(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf); * chunks requested */ extern errno_t mbuf_allocpacket(mbuf_how_t how, size_t packetlen, - unsigned int * maxchunks, mbuf_t *mbuf); + unsigned int * maxchunks, mbuf_t *mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_allocpacket_list @@ -584,8 +605,8 @@ extern errno_t mbuf_allocpacket(mbuf_how_t how, size_t packetlen, * chunks requested */ extern errno_t mbuf_allocpacket_list(unsigned int numpkts, mbuf_how_t how, - size_t packetlen, unsigned int * maxchunks, mbuf_t *mbuf); - + size_t packetlen, unsigned int * maxchunks, mbuf_t *mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_getpacket @@ -595,7 +616,8 @@ extern errno_t mbuf_allocpacket_list(unsigned int numpkts, mbuf_how_t how, * @param mbuf Upon success, *mbuf will be a reference to the new mbuf. * @result 0 on success, errno error on failure. */ -extern errno_t mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf); +extern errno_t mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_free @@ -604,14 +626,16 @@ extern errno_t mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf); * @param mbuf The mbuf to free. * @result The next mbuf in the chain. */ -extern mbuf_t mbuf_free(mbuf_t mbuf); +extern mbuf_t mbuf_free(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_freem * @discussion Frees a chain of mbufs link through mnext. * @param mbuf The first mbuf in the chain to free. */ -extern void mbuf_freem(mbuf_t mbuf); +extern void mbuf_freem(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_freem_list @@ -620,7 +644,8 @@ extern void mbuf_freem(mbuf_t mbuf); * @param mbuf The first mbuf in the linked list to free. * @result The number of mbufs freed. */ -extern int mbuf_freem_list(mbuf_t mbuf); +extern int mbuf_freem_list(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_leadingspace @@ -629,7 +654,8 @@ extern int mbuf_freem_list(mbuf_t mbuf); * @param mbuf The mbuf. * @result The number of unused bytes at the start of the mbuf. */ -extern size_t mbuf_leadingspace(const mbuf_t mbuf); +extern size_t mbuf_leadingspace(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_trailingspace @@ -638,7 +664,8 @@ extern size_t mbuf_leadingspace(const mbuf_t mbuf); * @param mbuf The mbuf. * @result The number of unused bytes following the current data. */ -extern size_t mbuf_trailingspace(const mbuf_t mbuf); +extern size_t mbuf_trailingspace(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /* Manipulation */ @@ -657,7 +684,8 @@ extern size_t mbuf_trailingspace(const mbuf_t mbuf); * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_copym(const mbuf_t src, size_t offset, size_t len, - mbuf_how_t how, mbuf_t *new_mbuf); + mbuf_how_t how, mbuf_t *new_mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_dup @@ -670,7 +698,8 @@ extern errno_t mbuf_copym(const mbuf_t src, size_t offset, size_t len, * @param new_mbuf Upon success, the newly allocated mbuf. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_dup(const mbuf_t src, mbuf_how_t how, mbuf_t *new_mbuf); +extern errno_t mbuf_dup(const mbuf_t src, mbuf_how_t how, mbuf_t *new_mbuf) +__NKE_API_DEPRECATED; /*! 
* @function mbuf_prepend @@ -685,7 +714,8 @@ extern errno_t mbuf_dup(const mbuf_t src, mbuf_how_t how, mbuf_t *new_mbuf); * @param how Blocking or non-blocking. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_prepend(mbuf_t *mbuf, size_t len, mbuf_how_t how); +extern errno_t mbuf_prepend(mbuf_t *mbuf, size_t len, mbuf_how_t how) +__NKE_API_DEPRECATED; /*! * @function mbuf_split @@ -701,7 +731,8 @@ extern errno_t mbuf_prepend(mbuf_t *mbuf, size_t len, mbuf_how_t how); * preserved. */ extern errno_t mbuf_split(mbuf_t src, size_t offset, mbuf_how_t how, - mbuf_t *new_mbuf); + mbuf_t *new_mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_pullup @@ -714,7 +745,8 @@ extern errno_t mbuf_split(mbuf_t src, size_t offset, mbuf_how_t how, * @result 0 upon success otherwise the errno error. In the case of an * error, the mbuf chain has been freed. */ -extern errno_t mbuf_pullup(mbuf_t *mbuf, size_t len); +extern errno_t mbuf_pullup(mbuf_t *mbuf, size_t len) +__NKE_API_DEPRECATED; /*! * @function mbuf_pulldown @@ -735,7 +767,8 @@ extern errno_t mbuf_pullup(mbuf_t *mbuf, size_t len); * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_pulldown(mbuf_t src, size_t *offset, size_t length, - mbuf_t *location); + mbuf_t *location) +__NKE_API_DEPRECATED; /*! * @function mbuf_adj @@ -746,7 +779,8 @@ extern errno_t mbuf_pulldown(mbuf_t src, size_t *offset, size_t length, * @param mbuf The mbuf chain to trim. * @param len The number of bytes to trim from the mbuf chain. */ -extern void mbuf_adj(mbuf_t mbuf, int len); +extern void mbuf_adj(mbuf_t mbuf, int len) +__NKE_API_DEPRECATED; /*! * @function mbuf_adjustlen @@ -759,7 +793,8 @@ extern void mbuf_adj(mbuf_t mbuf, int len); * @param amount The number of bytes increment the length by. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_adjustlen(mbuf_t mbuf, int amount); +extern errno_t mbuf_adjustlen(mbuf_t mbuf, int amount) +__NKE_API_DEPRECATED; /*! * @function mbuf_concatenate @@ -778,7 +813,8 @@ extern errno_t mbuf_adjustlen(mbuf_t mbuf, int amount); * chain. Otherwise it returns NULL if the original dst mbuf * chain is NULL. */ -extern mbuf_t mbuf_concatenate(mbuf_t dst, mbuf_t src); +extern mbuf_t mbuf_concatenate(mbuf_t dst, mbuf_t src) +__NKE_API_DEPRECATED; /*! * @function mbuf_copydata @@ -793,7 +829,8 @@ extern mbuf_t mbuf_concatenate(mbuf_t dst, mbuf_t src); * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_copydata(const mbuf_t mbuf, size_t offset, size_t length, - void *out_data); + void *out_data) +__NKE_API_DEPRECATED; /*! * @function mbuf_copyback @@ -818,7 +855,8 @@ extern errno_t mbuf_copydata(const mbuf_t mbuf, size_t offset, size_t length, * @result 0 upon success, EINVAL or ENOBUFS upon failure. */ extern errno_t mbuf_copyback(mbuf_t mbuf, size_t offset, size_t length, - const void *data, mbuf_how_t how); + const void *data, mbuf_how_t how) +__NKE_API_DEPRECATED; /*! * @function mbuf_mclhasreference @@ -828,7 +866,8 @@ extern errno_t mbuf_copyback(mbuf_t mbuf, size_t offset, size_t length, * @param mbuf The mbuf with the cluster to test. * @result 0 if there is no reference by another mbuf, 1 otherwise. */ -extern int mbuf_mclhasreference(mbuf_t mbuf); +extern int mbuf_mclhasreference(mbuf_t mbuf) +__NKE_API_DEPRECATED; /* mbuf header */ @@ -839,7 +878,8 @@ extern int mbuf_mclhasreference(mbuf_t mbuf); * @param mbuf The mbuf. * @result The next mbuf in the chain. 
*/ -extern mbuf_t mbuf_next(const mbuf_t mbuf); +extern mbuf_t mbuf_next(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_setnext @@ -848,7 +888,8 @@ extern mbuf_t mbuf_next(const mbuf_t mbuf); * @param next The new next mbuf. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_setnext(mbuf_t mbuf, mbuf_t next); +extern errno_t mbuf_setnext(mbuf_t mbuf, mbuf_t next) +__NKE_API_DEPRECATED; /*! * @function mbuf_nextpkt @@ -856,7 +897,8 @@ extern errno_t mbuf_setnext(mbuf_t mbuf, mbuf_t next); * @param mbuf The mbuf. * @result The nextpkt. */ -extern mbuf_t mbuf_nextpkt(const mbuf_t mbuf); +extern mbuf_t mbuf_nextpkt(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_setnextpkt @@ -864,7 +906,8 @@ extern mbuf_t mbuf_nextpkt(const mbuf_t mbuf); * @param mbuf The mbuf. * @param nextpkt The new next packet. */ -extern void mbuf_setnextpkt(mbuf_t mbuf, mbuf_t nextpkt); +extern void mbuf_setnextpkt(mbuf_t mbuf, mbuf_t nextpkt) +__NKE_API_DEPRECATED; /*! * @function mbuf_len @@ -872,7 +915,8 @@ extern void mbuf_setnextpkt(mbuf_t mbuf, mbuf_t nextpkt); * @param mbuf The mbuf. * @result The length. */ -extern size_t mbuf_len(const mbuf_t mbuf); +extern size_t mbuf_len(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_setlen @@ -881,7 +925,8 @@ extern size_t mbuf_len(const mbuf_t mbuf); * @param mbuf The mbuf. * @param len The new length. */ -extern void mbuf_setlen(mbuf_t mbuf, size_t len); +extern void mbuf_setlen(mbuf_t mbuf, size_t len) +__NKE_API_DEPRECATED; /*! * @function mbuf_maxlen @@ -892,7 +937,8 @@ extern void mbuf_setlen(mbuf_t mbuf, size_t len); * @param mbuf The mbuf. * @result The maximum lenght of data for this mbuf. */ -extern size_t mbuf_maxlen(const mbuf_t mbuf); +extern size_t mbuf_maxlen(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_type @@ -900,7 +946,8 @@ extern size_t mbuf_maxlen(const mbuf_t mbuf); * @param mbuf The mbuf. * @result The type. */ -extern mbuf_type_t mbuf_type(const mbuf_t mbuf); +extern mbuf_type_t mbuf_type(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_settype @@ -909,7 +956,8 @@ extern mbuf_type_t mbuf_type(const mbuf_t mbuf); * @param new_type The new type. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_settype(mbuf_t mbuf, mbuf_type_t new_type); +extern errno_t mbuf_settype(mbuf_t mbuf, mbuf_type_t new_type) +__NKE_API_DEPRECATED; /*! * @function mbuf_flags @@ -917,7 +965,8 @@ extern errno_t mbuf_settype(mbuf_t mbuf, mbuf_type_t new_type); * @param mbuf The mbuf. * @result The flags. */ -extern mbuf_flags_t mbuf_flags(const mbuf_t mbuf); +extern mbuf_flags_t mbuf_flags(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_setflags @@ -927,7 +976,8 @@ extern mbuf_flags_t mbuf_flags(const mbuf_t mbuf); * cleared. Certain flags such as MBUF_EXT cannot be altered. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_setflags(mbuf_t mbuf, mbuf_flags_t flags); +extern errno_t mbuf_setflags(mbuf_t mbuf, mbuf_flags_t flags) +__NKE_API_DEPRECATED; /*! * @function mbuf_setflags_mask @@ -940,7 +990,8 @@ extern errno_t mbuf_setflags(mbuf_t mbuf, mbuf_flags_t flags); * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_setflags_mask(mbuf_t mbuf, mbuf_flags_t flags, - mbuf_flags_t mask); + mbuf_flags_t mask) +__NKE_API_DEPRECATED; /*! 
* @function mbuf_copy_pkthdr @@ -949,7 +1000,8 @@ extern errno_t mbuf_setflags_mask(mbuf_t mbuf, mbuf_flags_t flags, * @param dest The mbuf to which the packet header will be copied. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_copy_pkthdr(mbuf_t dest, const mbuf_t src); +extern errno_t mbuf_copy_pkthdr(mbuf_t dest, const mbuf_t src) +__NKE_API_DEPRECATED; /*! * @function mbuf_pkthdr_len @@ -957,7 +1009,8 @@ extern errno_t mbuf_copy_pkthdr(mbuf_t dest, const mbuf_t src); * @param mbuf The mbuf containing the packet header * @result The length, in bytes, of the packet. */ -extern size_t mbuf_pkthdr_len(const mbuf_t mbuf); +extern size_t mbuf_pkthdr_len(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_pkthdr_setlen @@ -965,7 +1018,8 @@ extern size_t mbuf_pkthdr_len(const mbuf_t mbuf); * @param mbuf The mbuf containing the packet header. * @param len The new length of the packet. */ -extern void mbuf_pkthdr_setlen(mbuf_t mbuf, size_t len); +extern void mbuf_pkthdr_setlen(mbuf_t mbuf, size_t len) +__NKE_API_DEPRECATED; #ifdef XNU_KERNEL_PRIVATE /*! @@ -987,7 +1041,8 @@ extern size_t mbuf_pkthdr_maxlen(const mbuf_t mbuf); * @param amount The number of bytes to adjust the packet header length * field by. */ -extern void mbuf_pkthdr_adjustlen(mbuf_t mbuf, int amount); +extern void mbuf_pkthdr_adjustlen(mbuf_t mbuf, int amount) +__NKE_API_DEPRECATED; /*! * @function mbuf_pkthdr_rcvif @@ -1001,7 +1056,8 @@ extern void mbuf_pkthdr_adjustlen(mbuf_t mbuf, int amount); * @param mbuf The mbuf containing the packet header. * @result A reference to the interface. */ -extern ifnet_t mbuf_pkthdr_rcvif(const mbuf_t mbuf); +extern ifnet_t mbuf_pkthdr_rcvif(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_pkthdr_setrcvif @@ -1010,7 +1066,8 @@ extern ifnet_t mbuf_pkthdr_rcvif(const mbuf_t mbuf); * @param ifp A reference to an interface. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifp); +extern errno_t mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifp) +__NKE_API_DEPRECATED; /*! * @function mbuf_pkthdr_header @@ -1018,7 +1075,8 @@ extern errno_t mbuf_pkthdr_setrcvif(mbuf_t mbuf, ifnet_t ifp); * @param mbuf The mbuf containing the packet header. * @result A pointer to the packet header. */ -extern void *mbuf_pkthdr_header(const mbuf_t mbuf); +extern void *mbuf_pkthdr_header(const mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_pkthdr_setheader @@ -1026,7 +1084,8 @@ extern void *mbuf_pkthdr_header(const mbuf_t mbuf); * @param mbuf The mbuf containing the packet header. * @param header A pointer to the header. */ -extern void mbuf_pkthdr_setheader(mbuf_t mbuf, void *header); +extern void mbuf_pkthdr_setheader(mbuf_t mbuf, void *header) +__NKE_API_DEPRECATED; /* Checksums */ @@ -1043,7 +1102,8 @@ extern void mbuf_pkthdr_setheader(mbuf_t mbuf, void *header); * original checksum was valid. * @param mbuf The mbuf that has been modified. */ -extern void mbuf_inbound_modified(mbuf_t mbuf); +extern void mbuf_inbound_modified(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_outbound_finalize @@ -1081,7 +1141,8 @@ extern void mbuf_inbound_modified(mbuf_t mbuf); * would be the length of an ethernet header. */ extern void mbuf_outbound_finalize(mbuf_t mbuf, u_int32_t protocol_family, - size_t protocol_offset); + size_t protocol_offset) +__NKE_API_DEPRECATED; /*! 
* @function mbuf_set_vlan_tag @@ -1092,7 +1153,8 @@ extern void mbuf_outbound_finalize(mbuf_t mbuf, u_int32_t protocol_family, * @param vlan The protocol family of the aux data to add. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_set_vlan_tag(mbuf_t mbuf, u_int16_t vlan); +extern errno_t mbuf_set_vlan_tag(mbuf_t mbuf, u_int16_t vlan) +__NKE_API_DEPRECATED; /*! * @function mbuf_get_vlan_tag @@ -1106,7 +1168,8 @@ extern errno_t mbuf_set_vlan_tag(mbuf_t mbuf, u_int16_t vlan); * @result 0 upon success otherwise the errno error. ENXIO indicates * that the vlan tag is not set. */ -extern errno_t mbuf_get_vlan_tag(mbuf_t mbuf, u_int16_t *vlan); +extern errno_t mbuf_get_vlan_tag(mbuf_t mbuf, u_int16_t *vlan) +__NKE_API_DEPRECATED; /*! * @function mbuf_clear_vlan_tag @@ -1115,7 +1178,8 @@ extern errno_t mbuf_get_vlan_tag(mbuf_t mbuf, u_int16_t *vlan); * @param mbuf The mbuf containing the packet. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_clear_vlan_tag(mbuf_t mbuf); +extern errno_t mbuf_clear_vlan_tag(mbuf_t mbuf) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /*! @@ -1147,7 +1211,8 @@ extern errno_t mbuf_set_csum_requested(mbuf_t mbuf, * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_get_csum_requested(mbuf_t mbuf, - mbuf_csum_request_flags_t *request, u_int32_t *value); + mbuf_csum_request_flags_t *request, u_int32_t *value) +__NKE_API_DEPRECATED; /*! * @function mbuf_get_tso_requested @@ -1160,7 +1225,8 @@ extern errno_t mbuf_get_csum_requested(mbuf_t mbuf, * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_get_tso_requested(mbuf_t mbuf, - mbuf_tso_request_flags_t *request, u_int32_t *value); + mbuf_tso_request_flags_t *request, u_int32_t *value) +__NKE_API_DEPRECATED; /*! * @function mbuf_clear_csum_requested @@ -1168,7 +1234,8 @@ extern errno_t mbuf_get_tso_requested(mbuf_t mbuf, * @param mbuf The mbuf containing the packet. * @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_clear_csum_requested(mbuf_t mbuf); +extern errno_t mbuf_clear_csum_requested(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_set_csum_performed @@ -1183,7 +1250,8 @@ extern errno_t mbuf_clear_csum_requested(mbuf_t mbuf); * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_set_csum_performed(mbuf_t mbuf, - mbuf_csum_performed_flags_t flags, u_int32_t value); + mbuf_csum_performed_flags_t flags, u_int32_t value) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /* @@ -1210,7 +1278,8 @@ extern errno_t mbuf_get_csum_performed(mbuf_t mbuf, * legacy MLEN macro. * @result The number of bytes of available data. */ -extern u_int32_t mbuf_get_mlen(void); +extern u_int32_t mbuf_get_mlen(void) +__NKE_API_DEPRECATED; /*! * @function mbuf_get_mhlen @@ -1218,7 +1287,8 @@ extern u_int32_t mbuf_get_mlen(void); * header mbuf. This is equivalent to the legacy MHLEN macro. * @result The number of bytes of available data. */ -extern u_int32_t mbuf_get_mhlen(void); +extern u_int32_t mbuf_get_mhlen(void) +__NKE_API_DEPRECATED; /*! * @function mbuf_get_minclsize @@ -1227,7 +1297,8 @@ extern u_int32_t mbuf_get_mhlen(void); * legacy MINCLSIZE macro. * @result The minimum number of bytes before a cluster will be used. */ -extern u_int32_t mbuf_get_minclsize(void); +extern u_int32_t mbuf_get_minclsize(void) +__NKE_API_DEPRECATED; /*! * @function mbuf_clear_csum_performed @@ -1235,7 +1306,8 @@ extern u_int32_t mbuf_get_minclsize(void); * @param mbuf The mbuf containing the packet. 
* @result 0 upon success otherwise the errno error. */ -extern errno_t mbuf_clear_csum_performed(mbuf_t mbuf); +extern errno_t mbuf_clear_csum_performed(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_inet_cksum @@ -1264,7 +1336,8 @@ extern errno_t mbuf_clear_csum_performed(mbuf_t mbuf); * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_inet_cksum(mbuf_t mbuf, int protocol, u_int32_t offset, - u_int32_t length, u_int16_t *csum); + u_int32_t length, u_int16_t *csum) +__NKE_API_DEPRECATED; /*! * @function mbuf_inet6_cksum @@ -1293,7 +1366,8 @@ extern errno_t mbuf_inet_cksum(mbuf_t mbuf, int protocol, u_int32_t offset, * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_inet6_cksum(mbuf_t mbuf, int protocol, u_int32_t offset, - u_int32_t length, u_int16_t *csum); + u_int32_t length, u_int16_t *csum) +__NKE_API_DEPRECATED; /* mbuf tags */ @@ -1316,7 +1390,8 @@ extern errno_t mbuf_inet6_cksum(mbuf_t mbuf, int protocol, u_int32_t offset, * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_tag_id_find(const char *module_string, - mbuf_tag_id_t *module_id); + mbuf_tag_id_t *module_id) +__NKE_API_DEPRECATED; /*! * @function mbuf_tag_allocate @@ -1340,7 +1415,8 @@ extern errno_t mbuf_tag_id_find(const char *module_string, * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_tag_allocate(mbuf_t mbuf, mbuf_tag_id_t module_id, - mbuf_tag_type_t type, size_t length, mbuf_how_t how, void **data_p); + mbuf_tag_type_t type, size_t length, mbuf_how_t how, void **data_p) +__NKE_API_DEPRECATED; /*! * @function mbuf_tag_find @@ -1355,7 +1431,8 @@ extern errno_t mbuf_tag_allocate(mbuf_t mbuf, mbuf_tag_id_t module_id, * @result 0 upon success otherwise the errno error. */ extern errno_t mbuf_tag_find(mbuf_t mbuf, mbuf_tag_id_t module_id, - mbuf_tag_type_t type, size_t *length, void **data_p); + mbuf_tag_type_t type, size_t *length, void **data_p) +__NKE_API_DEPRECATED; /*! * @function mbuf_tag_free @@ -1365,7 +1442,8 @@ extern errno_t mbuf_tag_find(mbuf_t mbuf, mbuf_tag_id_t module_id, * @param type The type of the tag to free. */ extern void mbuf_tag_free(mbuf_t mbuf, mbuf_tag_id_t module_id, - mbuf_tag_type_t type); + mbuf_tag_type_t type) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE /*! @@ -1431,7 +1509,8 @@ extern void mbuf_del_drvaux(mbuf_t mbuf); * @discussion Get the mbuf statistics. * @param stats Storage to copy the stats in to. */ -extern void mbuf_stats(struct mbuf_stat *stats); +extern void mbuf_stats(struct mbuf_stat *stats) +__NKE_API_DEPRECATED; /*! @@ -1464,7 +1543,8 @@ typedef enum { * @param mbuf The mbuf to get the traffic class of. * @result The traffic class */ -extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); +extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf) +__NKE_API_DEPRECATED; /*! * @function mbuf_set_traffic_class @@ -1473,7 +1553,8 @@ extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); * @param tc The traffic class * @result 0 on success, EINVAL if bad parameter is passed */ -extern errno_t mbuf_set_traffic_class(mbuf_t mbuf, mbuf_traffic_class_t tc); +extern errno_t mbuf_set_traffic_class(mbuf_t mbuf, mbuf_traffic_class_t tc) +__NKE_API_DEPRECATED; /*! * @function mbuf_is_traffic_class_privileged @@ -1482,7 +1563,8 @@ extern errno_t mbuf_set_traffic_class(mbuf_t mbuf, mbuf_traffic_class_t tc); * @param mbuf The mbuf to retrieve the status from. * @result Non-zero if privileged, 0 otherwise. 
*/ -extern int mbuf_is_traffic_class_privileged(mbuf_t mbuf); +extern int mbuf_is_traffic_class_privileged(mbuf_t mbuf) +__NKE_API_DEPRECATED; #ifdef KERNEL_PRIVATE diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 05e522c99..a8010e620 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -247,9 +247,7 @@ extern struct mount * dead_mountp; #define MNTK_SWAP_MOUNT 0x00000100 /* we are swapping to this mount */ #define MNTK_DENY_READDIREXT 0x00000200 /* Deny Extended-style readdir's for this volume */ #define MNTK_PERMIT_UNMOUNT 0x00000400 /* Allow (non-forced) unmounts by UIDs other than the one that mounted the volume */ -#ifdef NFSCLIENT #define MNTK_TYPENAME_OVERRIDE 0x00000800 /* override the fstypename for statfs() */ -#endif /* NFSCLIENT */ #define MNTK_KERNEL_MOUNT 0x00001000 /* mount came from kernel side */ #ifdef CONFIG_IMGSRC_ACCESS #define MNTK_HAS_MOVED 0x00002000 diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 87a39398b..e427f2ebe 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -245,7 +245,9 @@ extern int proc_selfpid(void); /* this routine returns the pid of the parent of the current process */ extern int proc_selfppid(void); /* this routine returns the csflags of the current process */ -extern int proc_selfcsflags(void); +extern uint64_t proc_selfcsflags(void); +/* this routine populates the given flags param with the csflags of the given process. Returns 0 on success, -1 on error. */ +extern int proc_csflags(proc_t p, uint64_t* flags); /* this routine sends a signal signum to the process identified by the pid */ extern void proc_signal(int pid, int signum); /* this routine checks whether any signal identified by the mask are pending in the process identified by the pid. The check is on all threads of the process. */ @@ -304,6 +306,16 @@ extern int proc_issetugid(proc_t p); extern int proc_tbe(proc_t); +/*! + * @function proc_gettty + * @abstract Copies the associated tty vnode for a given process if it exists. The caller needs to decrement the iocount of the vnode. + * @return 0 on success. ENOENT if the process has no associated TTY. EINVAL if arguments are NULL or vnode_getwithvid fails. + */ +extern int proc_gettty(proc_t p, vnode_t *vp); + +/* this routine populates the associated tty device for a given process if it exists, returns 0 on success or else returns EINVAL */ +extern int proc_gettty_dev(proc_t p, dev_t *dev); + /*! * @function proc_selfpgrpid * @abstract Get the process group id for the current process, as with proc_pgrpid().
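The proc.h changes above widen proc_selfcsflags() to the full 64-bit csflags and add three new KPIs (proc_csflags, proc_gettty, proc_gettty_dev), which are also exported to kexts via Private.exports later in this patch. A minimal sketch of a caller, assuming it already holds a valid proc_t; note that proc_gettty() returns the tty vnode with an iocount the caller must drop with vnode_put():

#include <sys/proc.h>
#include <sys/vnode.h>
#include <libkern/libkern.h>

/* Illustrative only; not part of the patch. */
static void
log_proc_tty_and_csflags(proc_t p)
{
	uint64_t csflags = 0;
	vnode_t tty_vp = NULLVP;
	dev_t tty_dev = 0;

	/* proc_csflags() populates the 64-bit codesigning flags; 0 on success. */
	if (proc_csflags(p, &csflags) == 0) {
		printf("pid %d csflags 0x%llx\n", proc_pid(p), csflags);
	}

	/* proc_gettty() copies the tty vnode and takes an iocount on it. */
	if (proc_gettty(p, &tty_vp) == 0) {
		/* ... inspect tty_vp ... */
		vnode_put(tty_vp); /* caller must drop the iocount */
	}

	/* proc_gettty_dev() yields only the dev_t; no vnode lifetime to manage. */
	if (proc_gettty_dev(p, &tty_dev) == 0) {
		printf("pid %d tty dev 0x%x\n", proc_pid(p), (unsigned int)tty_dev);
	}
}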
diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 2e30057f1..3611e2c6a 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -320,10 +320,16 @@ struct socket { pid_t e_pid; /* pid of the effective owner */ u_int64_t e_upid; /* upid of the effective owner */ +#if defined(XNU_TARGET_OS_OSX) + pid_t so_rpid; /* pid of the responsible process */ +#endif uuid_t last_uuid; /* uuid of most recent accessor */ uuid_t e_uuid; /* uuid of effective owner */ uuid_t so_vuuid; /* UUID of the Voucher originator */ +#if defined(XNU_TARGET_OS_OSX) + uuid_t so_ruuid; /* UUID of the responsible process */ +#endif int32_t so_policy_gencnt; /* UUID policy gencnt */ diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index d963cfdb9..f0b7866f1 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -77,6 +77,7 @@ typedef enum { PSPA_AU_SESSION = 2, PSPA_IMP_WATCHPORTS = 3, PSPA_REGISTERED_PORTS = 4, + PSPA_SUID_CRED = 6, } pspa_t; /* diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index 18c9ad950..ca2a54e5b 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -540,6 +540,17 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); #endif #endif +/* + * Extended flags ("EF") returned by ATTR_CMNEXT_EXT_FLAGS from getattrlist/getattrlistbulk + */ +#define EF_MAY_SHARE_BLOCKS 0x00000001 /* file may share blocks with another file */ +#define EF_NO_XATTRS 0x00000002 /* file has no xattrs at all */ +#define EF_IS_SYNC_ROOT 0x00000004 /* file is a sync root for iCloud */ +#define EF_IS_PURGEABLE 0x00000008 /* file is purgeable */ +#define EF_IS_SPARSE 0x00000010 /* file has at least one sparse region */ + + + #ifndef KERNEL __BEGIN_DECLS diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index e5263caf4..7dfb01ef2 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -559,6 +559,8 @@ struct vnode_trigger_param { #define VNODE_ATTR_va_fsid64 (1LL<<41) /* 20000000000 */ #define VNODE_ATTR_va_write_gencount (1LL<<42) /* 40000000000 */ #define VNODE_ATTR_va_private_size (1LL<<43) /* 80000000000 */ +#define VNODE_ATTR_va_clone_id (1LL<<44) /* 100000000000 */ +#define VNODE_ATTR_va_extflags (1LL<<45) /* 200000000000 */ #define VNODE_ATTR_BIT(n) (VNODE_ATTR_ ## n) @@ -608,7 +610,9 @@ struct vnode_trigger_param { VNODE_ATTR_BIT(va_rsrc_alloc) | \ VNODE_ATTR_BIT(va_fsid64) | \ VNODE_ATTR_BIT(va_write_gencount) | \ - VNODE_ATTR_BIT(va_private_size)) + VNODE_ATTR_BIT(va_private_size) | \ + VNODE_ATTR_BIT(va_clone_id) | \ + VNODE_ATTR_BIT(va_extflags)) /* * Read-only attributes. @@ -637,8 +641,11 @@ struct vnode_trigger_param { VNODE_ATTR_BIT(va_rsrc_length) | \ VNODE_ATTR_BIT(va_rsrc_alloc) | \ VNODE_ATTR_BIT(va_fsid64) | \ - VNODE_ATTR_BIT(va_write_gencount) | \ - VNODE_ATTR_BIT(va_private_size)) + VNODE_ATTR_BIT(va_write_gencount) | \ + VNODE_ATTR_BIT(va_private_size) | \ + VNODE_ATTR_BIT(va_clone_id) | \ + VNODE_ATTR_BIT(va_extflags)) + /* * Attributes that can be applied to a new file object. 
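The new VNODE_ATTR_va_clone_id and VNODE_ATTR_va_extflags bits above are requested like any other vnode attribute (the backing fields land in struct vnode_attr just below), and the EF_* values defined in sys/stat.h are what va_extflags carries. A minimal kernel-side sketch, assuming the caller holds an iocount on vp:

#include <sys/vnode.h>
#include <sys/errno.h>

/* Sketch: ask the filesystem for the new extended-flags attribute.
 * Filesystems that do not implement it simply leave the bit out of
 * va_supported, which VATTR_IS_SUPPORTED() checks. */
static int
vnode_ext_flags(vnode_t vp, vfs_context_t ctx, uint64_t *extflags)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_extflags);
	error = vnode_getattr(vp, &va, ctx);
	if (error == 0 && !VATTR_IS_SUPPORTED(&va, va_extflags)) {
		error = ENOTSUP;
	}
	if (error == 0) {
		*extflags = va.va_extflags;
	}
	return error;
}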
*/ @@ -742,6 +749,8 @@ struct vnode_attr { uint32_t va_write_gencount; /* counter that increments each time the file changes */ uint64_t va_private_size; /* If the file were deleted, how many bytes would be freed immediately */ + uint64_t va_clone_id; /* If a file is cloned this is a unique id shared by all "perfect" clones */ + uint64_t va_extflags; /* extended file/directory flags */ /* add new fields here only */ }; @@ -1689,6 +1698,19 @@ int vnode_isdyldsharedcache(vnode_t vp); */ int vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); + +/*! + * @function vn_authorize_rmdir + * @abstract Authorize an rmdir operation given the vfs_context_t + * @discussion Check if the context associated with vfs_context_t is allowed to rmdir the vnode vp in directory dvp. + * @param dvp Parent vnode of the directory to be rmdir'ed + * @param vp The vnode to be rmdir'ed + * @param cnp A componentname containing the name of the directory to be rmdir'ed. May be NULL. + * @param reserved Pass NULL + * @return Zero if the operation is allowed; non-zero indicates the rmdir is not authorized. + */ +int vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); + /*! * @function vn_getpath_fsenter * @abstract Attempt to get a vnode's path, willing to enter the filesystem. @@ -1751,6 +1773,7 @@ int vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, int * #define VN_GETPATH_FSENTER 0x0001 /* Can re-enter filesystem */ #define VN_GETPATH_NO_FIRMLINK 0x0002 #define VN_GETPATH_VOLUME_RELATIVE 0x0004 /* also implies VN_GETPATH_NO_FIRMLINK */ +#define VN_GETPATH_NO_PROCROOT 0x0008 /* Give the non chrooted path for a process */ #endif /* KERNEL_PRIVATE */ @@ -2379,6 +2402,7 @@ void vnode_clearnoflush(vnode_t); #define BUILDPATH_CHECK_MOVED 0x4 /* Return EAGAIN if the parent hierarchy is modified */ #define BUILDPATH_VOLUME_RELATIVE 0x8 /* Return path relative to the nearest mount point */ #define BUILDPATH_NO_FIRMLINK 0x10 /* Return non-firmlinked path */ +#define BUILDPATH_NO_PROCROOT 0x20 /* Return path relative to system root, not the process root */ int build_path(vnode_t first_vp, char *buff, int buflen, int *outlen, int flags, vfs_context_t ctx); diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index 4e271502f..29d6d9f72 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -449,7 +449,6 @@ int vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct component int vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path, vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved); -int vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); typedef int (*vn_create_authorizer_t)(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); int vn_authorize_mkdir(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index c344bef00..d7ba26600 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -545,6 +545,8 @@ static struct getattrlist_attrtab getattrlist_common_extended_tab[] = { {.attr = ATTR_CMNEXT_NOFIRMLINKPATH, .bits = 0, .size = sizeof(struct attrreference), .action = KAUTH_VNODE_READ_ATTRIBUTES}, {.attr = ATTR_CMNEXT_REALDEVID,
.bits = VATTR_BIT(va_devid), .size = sizeof(uint32_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, {.attr = ATTR_CMNEXT_REALFSID, .bits = VATTR_BIT(va_fsid64), .size = sizeof(fsid_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMNEXT_CLONEID, .bits = VATTR_BIT(va_clone_id), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, + {.attr = ATTR_CMNEXT_EXT_FLAGS, .bits = VATTR_BIT(va_extflags), .size = sizeof(uint64_t), .action = KAUTH_VNODE_READ_ATTRIBUTES}, {.attr = 0, .bits = 0, .size = 0, .action = 0} }; @@ -607,7 +609,8 @@ static struct getattrlist_attrtab getattrlistbulk_common_extended_tab[] = { #define VFS_DFLT_ATTR_CMN_EXT (ATTR_CMNEXT_PRIVATESIZE | ATTR_CMNEXT_LINKID | \ ATTR_CMNEXT_NOFIRMLINKPATH | ATTR_CMNEXT_REALDEVID | \ - ATTR_CMNEXT_REALFSID) + ATTR_CMNEXT_REALFSID | ATTR_CMNEXT_CLONEID | \ + ATTR_CMNEXT_EXT_FLAGS) #define VFS_DFLT_ATTR_DIR (ATTR_DIR_LINKCOUNT | ATTR_DIR_MOUNTSTATUS) @@ -984,6 +987,7 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: could not allocate f_vol_name buffer"); goto out; } + vs.f_vol_name[0] = '\0'; } VFS_DEBUG(ctx, vp, "ATTRLIST - calling to get %016llx with supported %016llx", vs.f_active, vs.f_supported); @@ -2357,6 +2361,26 @@ attr_pack_common_extended(mount_t mp, struct vnode *vp, struct attrlist *alp, } } + if (alp->forkattr & ATTR_CMNEXT_CLONEID) { + if (VATTR_IS_SUPPORTED(vap, va_clone_id)) { + ATTR_PACK8((*abp), vap->va_clone_id); + abp->actual.forkattr |= ATTR_CMNEXT_CLONEID; + } else if (!return_valid || pack_invalid) { + uint64_t zero_val = 0; + ATTR_PACK8((*abp), zero_val); + } + } + + if (alp->forkattr & ATTR_CMNEXT_EXT_FLAGS) { + if (VATTR_IS_SUPPORTED(vap, va_extflags)) { + ATTR_PACK8((*abp), vap->va_extflags); + abp->actual.forkattr |= ATTR_CMNEXT_EXT_FLAGS; + } else if (!return_valid || pack_invalid) { + uint64_t zero_val = 0; + ATTR_PACK8((*abp), zero_val); + } + } + return 0; } diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index 18a0906b8..b027e9535 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -450,7 +450,7 @@ build_path_with_parent(vnode_t first_vp, vnode_t parent_vp, char *buff, int bufl /* * Grab the process fd so we can evaluate fd_rdir. */ - if (vfs_context_proc(ctx)->p_fd) { + if (vfs_context_proc(ctx)->p_fd && !(flags & BUILDPATH_NO_PROCROOT)) { proc_root_dir_vp = vfs_context_proc(ctx)->p_fd->fd_rdir; } else { proc_root_dir_vp = NULL; diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index 1d61ed284..fb97b1864 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ -72,6 +72,8 @@ #include #include +#include + /* * These define the root filesystem, device, and root filesystem type. 
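With the attribute-table entries and the attr_pack_common_extended() packing above, userspace can retrieve the new values through getattrlist(2). A hedged userspace sketch: ATTR_CMNEXT_* attributes are requested via the forkattr field together with FSOPT_ATTR_CMN_EXTENDED, and the reply is packed in attribute order behind the leading length word (layout per getattrlist(2); building this needs the updated headers for the EF_* bits):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/attr.h>
#include <sys/stat.h>

/* Reply: leading u_int32_t length, then the packed attribute values. */
typedef struct {
	uint32_t length;
	uint64_t ext_flags; /* ATTR_CMNEXT_EXT_FLAGS */
} __attribute__((packed)) ext_flags_reply_t;

int
main(int argc, char *argv[])
{
	struct attrlist al;
	ext_flags_reply_t reply;

	if (argc < 2) {
		fprintf(stderr, "usage: %s path\n", argv[0]);
		return 1;
	}
	memset(&al, 0, sizeof(al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.forkattr = ATTR_CMNEXT_EXT_FLAGS; /* CMNEXT attrs travel in forkattr */
	if (getattrlist(argv[1], &al, &reply, sizeof(reply),
	    FSOPT_ATTR_CMN_EXTENDED) != 0) {
		perror("getattrlist");
		return 1;
	}
	if (reply.ext_flags & EF_IS_SPARSE) {
		printf("%s has at least one sparse region\n", argv[1]);
	}
	if (reply.ext_flags & EF_NO_XATTRS) {
		printf("%s has no xattrs\n", argv[1]);
	}
	return 0;
}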
*/ @@ -122,7 +124,7 @@ enum fs_type_num { */ static struct vfstable vfstbllist[] = { /* Sun-compatible Network Filesystem */ -#if NFSCLIENT +#if CONFIG_NFS_CLIENT { .vfc_vfsops = &nfs_vfsops, .vfc_name = "nfs", @@ -138,7 +140,7 @@ static struct vfstable vfstbllist[] = { .vfc_descsize = 0, .vfc_sysctl = NULL }, -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ /* Device Filesystem */ #if DEVFS @@ -321,7 +323,7 @@ const struct vnodeopv_desc *vfs_opv_descs[] = { #if MFS &mfs_vnodeop_opv_desc, #endif -#if NFSCLIENT +#if CONFIG_NFS_CLIENT &nfsv2_vnodeop_opv_desc, &spec_nfsv2nodeop_opv_desc, #if CONFIG_NFS4 @@ -334,7 +336,7 @@ const struct vnodeopv_desc *vfs_opv_descs[] = { &fifo_nfsv4nodeop_opv_desc, #endif /* CONFIG_NFS4 */ #endif /* FIFO */ -#endif /* NFSCLIENT */ +#endif /* CONFIG_NFS_CLIENT */ #if DEVFS &devfs_vnodeop_opv_desc, &devfs_spec_vnodeop_opv_desc, diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index f7916db48..5b5455b4b 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -263,7 +263,7 @@ unlock_fs_event_list(void) // forward prototype static void release_event_ref(kfs_event *kfse); -static int +static boolean_t watcher_cares_about_dev(fs_event_watcher *watcher, dev_t dev) { unsigned int i; @@ -271,20 +271,20 @@ watcher_cares_about_dev(fs_event_watcher *watcher, dev_t dev) // if devices_not_to_watch is NULL then we care about all // events from all devices if (watcher->devices_not_to_watch == NULL) { - return 1; + return true; } for (i = 0; i < watcher->num_devices; i++) { if (dev == watcher->devices_not_to_watch[i]) { // found a match! that means we do not // want events from this device. - return 0; + return false; } } // if we're here it's not in the devices_not_to_watch[] // list so that means we do care about it - return 1; + return true; } @@ -1564,35 +1564,47 @@ restart_watch: break; } - if (watcher->event_list[kfse->type] == FSE_REPORT && watcher_cares_about_dev(watcher, kfse->dev)) { - if (!(watcher->flags & WATCHER_APPLE_SYSTEM_SERVICE) && kfse->type != FSE_DOCID_CREATED && kfse->type != FSE_DOCID_CHANGED && is_ignored_directory(kfse->str)) { - // If this is not an Apple System Service, skip specified directories - // radar://12034844 - error = 0; - skipped = 1; + if (watcher->event_list[kfse->type] == FSE_REPORT) { + boolean_t watcher_cares; + + if (watcher->devices_not_to_watch == NULL) { + watcher_cares = true; } else { - skipped = 0; - if (last_event_ptr == kfse) { - last_event_ptr = NULL; - last_event_type = -1; - last_coalesced_time = 0; - } - error = copy_out_kfse(watcher, kfse, uio); - if (error != 0) { - // if an event won't fit or encountered an error while - // we were copying it out, then backup to the last full - // event and just bail out. if the error was ENOENT - // then we can continue regular processing, otherwise - // we should unlock things and return. 
- uio_setresid(uio, last_full_event_resid); - if (error != ENOENT) { - lck_rw_unlock_shared(&event_handling_lock); - error = 0; - goto get_out; + lock_watch_table(); + watcher_cares = watcher_cares_about_dev(watcher, kfse->dev); + unlock_watch_table(); + } + + if (watcher_cares) { + if (!(watcher->flags & WATCHER_APPLE_SYSTEM_SERVICE) && kfse->type != FSE_DOCID_CREATED && kfse->type != FSE_DOCID_CHANGED && is_ignored_directory(kfse->str)) { + // If this is not an Apple System Service, skip specified directories + // radar://12034844 + error = 0; + skipped = 1; + } else { + skipped = 0; + if (last_event_ptr == kfse) { + last_event_ptr = NULL; + last_event_type = -1; + last_coalesced_time = 0; + } + error = copy_out_kfse(watcher, kfse, uio); + if (error != 0) { + // if an event won't fit or encountered an error while + // we were copying it out, then backup to the last full + // event and just bail out. if the error was ENOENT + // then we can continue regular processing, otherwise + // we should unlock things and return. + uio_setresid(uio, last_full_event_resid); + if (error != ENOENT) { + lck_rw_unlock_shared(&event_handling_lock); + error = 0; + goto get_out; + } } - } - last_full_event_resid = uio_resid(uio); + last_full_event_resid = uio_resid(uio); + } } } diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 85d47741e..77c525baa 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -1261,13 +1261,15 @@ dirloop: tdp = dp; dp = tdp->v_mount->mnt_vnodecovered; - vnode_put(tdp); - if ((vnode_getwithref(dp))) { + vnode_put(tdp); dp = NULLVP; error = ENOENT; goto bad; } + + vnode_put(tdp); + ndp->ni_dvp = dp; dp_authorized = 0; } diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index f8304f9ad..866780991 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -109,6 +109,8 @@ #include #include +#include + #include #include @@ -2408,11 +2410,11 @@ vclean(vnode_t vp, int flags) * Clean out any buffers associated with the vnode. 
*/ if (flags & DOCLOSE) { -#if NFSCLIENT +#if CONFIG_NFS_CLIENT if (vp->v_tag == VT_NFS) { nfs_vinvalbuf(vp, V_SAVE, ctx, 0); } else -#endif +#endif /* CONFIG_NFS_CLIENT */ { VNOP_FSYNC(vp, MNT_WAIT, ctx); @@ -2903,6 +2905,9 @@ vn_getpath_ext(struct vnode *vp, struct vnode *dvp, char *pathbuf, int *len, int if (flags & VN_GETPATH_VOLUME_RELATIVE) { bpflags |= (BUILDPATH_VOLUME_RELATIVE | BUILDPATH_NO_FIRMLINK); } + if (flags & VN_GETPATH_NO_PROCROOT) { + bpflags |= BUILDPATH_NO_PROCROOT; + } } return build_path_with_parent(vp, dvp, pathbuf, *len, len, bpflags, vfs_context_current()); @@ -3925,11 +3930,11 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_ffree = (user64_long_t)sp->f_ffree; sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; -#ifdef NFSCLIENT +#ifdef CONFIG_NFS_CLIENT if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN); } else -#endif +#endif /* CONFIG_NFS_CLIENT */ { strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); } @@ -3987,11 +3992,11 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; -#ifdef NFSCLIENT +#ifdef CONFIG_NFS_CLIENT if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN); } else -#endif +#endif /* CONFIG_NFS_CLIENT */ { strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); } @@ -6227,26 +6232,21 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *v vp = *vpp; old_error = error; -#if CONFIG_MACF - if (!(flags & VN_CREATE_NOLABEL)) { - error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx); - if (error) { - goto error; - } - } -#endif - /* * If some of the requested attributes weren't handled by the VNOP, * use our fallback code. */ - if (!VATTR_ALL_SUPPORTED(vap) && *vpp) { + if ((error == 0) && !VATTR_ALL_SUPPORTED(vap) && *vpp) { KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl); error = vnode_setattr_fallback(*vpp, vap, ctx); } + #if CONFIG_MACF -error: + if ((error == 0) && !(flags & VN_CREATE_NOLABEL)) { + error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx); + } #endif + if ((error != 0) && (vp != (vnode_t)0)) { /* If we've done a compound open, close */ if (batched && (old_error == 0) && (vap->va_type == VREG)) { diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 84627d65e..a0a04deb8 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -133,6 +133,8 @@ #include #include +#include + #if ROUTEFS #include #endif /* ROUTEFS */ @@ -266,7 +268,7 @@ extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); * Virtual File System System Calls */ -#if NFSCLIENT || DEVFS || ROUTEFS +#if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS /* * Private in-kernel mounting spi (NFS only, not exported) */ @@ -322,7 +324,7 @@ kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path, return error; } -#endif /* NFSCLIENT || DEVFS */ +#endif /* CONFIG_NFS_CLIENT || DEVFS */ /* * Mount a file system. 
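vn_getpath_ext() above now translates VN_GETPATH_NO_PROCROOT (declared in vnode.h earlier in this patch) into BUILDPATH_NO_PROCROOT, so a vnode's path can be resolved against the system root even when the calling process is chrooted. A minimal sketch, assuming the caller holds an iocount on vp:

#include <sys/vnode.h>

/* Sketch: resolve vp's path relative to the system root, ignoring any
 * chroot of the current process. */
static int
path_ignoring_chroot(vnode_t vp, char *buf, int buflen)
{
	int len = buflen;

	/* NULLVP parent: let VFS walk the name cache / filesystem as usual. */
	return vn_getpath_ext(vp, NULLVP, buf, &len, VN_GETPATH_NO_PROCROOT);
}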
@@ -829,14 +831,14 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */ vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE); -#if NFSCLIENT || DEVFS || ROUTEFS +#if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS if (kernelmount) { mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT; } if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) { mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT; } -#endif /* NFSCLIENT || DEVFS */ +#endif /* CONFIG_NFS_CLIENT || DEVFS */ update: @@ -4781,6 +4783,9 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, error = nameiat(&nd, fd1); if (error) { + if (error == EPERM) { + printf("XXX 54841485: nameiat() src EPERM\n"); + } return error; } vp = nd.ni_vp; @@ -4794,6 +4799,7 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, if (vp->v_type == VDIR) { if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) { error = EPERM; /* POSIX */ + printf("XXX 54841485: VDIR EPERM\n"); goto out; } @@ -4821,6 +4827,9 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, nd.ni_dirp = link; error = nameiat(&nd, fd2); if (error != 0) { + if (error == EPERM) { + printf("XXX 54841485: nameiat() dst EPERM\n"); + } goto out; } dvp = nd.ni_dvp; @@ -4828,12 +4837,18 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, #if CONFIG_MACF if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) { + if (error == EPERM) { + printf("XXX 54841485: mac_vnode_check_link() EPERM\n"); + } goto out2; } #endif /* or to anything that kauth doesn't want us to (eg. immutable items) */ if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) { + if (error == EPERM) { + printf("XXX 54841485: vnode_authorize() LINKTARGET EPERM\n"); + } goto out2; } @@ -4850,12 +4865,18 @@ linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2, /* authorize creation of the target note */ if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) { + if (error == EPERM) { + printf("XXX 54841485: vnode_authorize() ADD_FILE EPERM\n"); + } goto out2; } /* and finally make the link */ error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx); if (error) { + if (error == EPERM) { + printf("XXX 54841485: VNOP_LINK() EPERM\n"); + } goto out2; } @@ -5056,16 +5077,16 @@ symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd, error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx); } -#if CONFIG_MACF + /* do fallback attribute handling */ if (error == 0 && vp) { - error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx); + error = vnode_setattr_fallback(vp, &va, ctx); } -#endif - /* do fallback attribute handling */ +#if CONFIG_MACF if (error == 0 && vp) { - error = vnode_setattr_fallback(vp, &va, ctx); + error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx); } +#endif if (error == 0) { int update_flags = 0; @@ -7784,10 +7805,6 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd, int fsevent; #endif /* CONFIG_FSE */ -#if CONFIG_MACF - (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp, - VNODE_LABEL_CREATE, ctx); -#endif /* * If some of the requested attributes weren't handled by the * VNOP, use our fallback code. 
@@ -7796,6 +7813,11 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd, (void)vnode_setattr_fallback(tvp, &nva, ctx); } +#if CONFIG_MACF + (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp, + VNODE_LABEL_CREATE, ctx); +#endif + // Make sure the name & parent pointers are hooked up if (tvp->v_name == NULL) { update_flags |= VNODE_UPDATE_NAME; @@ -12720,7 +12742,9 @@ static int snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory, __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx) { + mount_t mp; vnode_t rvp, snapdvp, snapvp, vp, pvp; + struct fs_snapshot_mount_args smnt_data; int error; struct nameidata *snapndp, *dirndp; /* carving out a chunk for structs that are too big to be on stack. */ @@ -12756,20 +12780,28 @@ snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory, vp = dirndp->ni_vp; pvp = dirndp->ni_dvp; + mp = vnode_mount(rvp); if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) { error = EINVAL; - } else { - mount_t mp = vnode_mount(rvp); - struct fs_snapshot_mount_args smnt_data; + goto out2; + } - smnt_data.sm_mp = mp; - smnt_data.sm_cnp = &snapndp->ni_cnd; - error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp, - &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE, - KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx); +#if CONFIG_MACF + error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr, + mp->mnt_vfsstat.f_fstypename); + if (error) { + goto out2; } +#endif + smnt_data.sm_mp = mp; + smnt_data.sm_cnp = &snapndp->ni_cnd; + error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp, + &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE, + KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx); + +out2: vnode_put(vp); vnode_put(pvp); nameidone(dirndp); diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index 2fae8525c..0a9075c05 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -73,6 +73,7 @@ #include #include +#include #include #include #include @@ -430,12 +431,12 @@ vnode_pageout(struct vnode *vp, * of it's pages */ for (offset = upl_offset; isize; isize -= PAGE_SIZE, offset += PAGE_SIZE) { -#if NFSCLIENT +#if CONFIG_NFS_CLIENT if (vp->v_tag == VT_NFS) { /* check with nfs if page is OK to drop */ error = nfs_buf_page_inval(vp, (off_t)f_offset); } else -#endif +#endif /* CONFIG_NFS_CLIENT */ { blkno = ubc_offtoblk(vp, (off_t)f_offset); error = buf_invalblkno(vp, blkno, 0); @@ -487,12 +488,12 @@ vnode_pageout(struct vnode *vp, * Note we must not sleep here if the buffer is busy - that is * a lock inversion which causes deadlock. 
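snapshot_mount() above now consults MAC via mac_mount_check_snapshot_mount() before calling mount_common() (the symbol is added to MACFramework.exports below). A hedged sketch of a policy handler for the new check; the entry-point name and argument order are inferred from the call site, so treat mpo_mount_check_snapshot_mount and this signature as assumptions to verify against security/mac_policy.h:

#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/vnode.h>

/* Assumed shape, mirroring the call site: (cred, snapshot root vnode,
 * covered vnode, componentname, snapshot name, filesystem type name). */
static int
example_mount_check_snapshot_mount(kauth_cred_t cred,
    __unused struct vnode *rvp, __unused struct vnode *vp,
    __unused struct componentname *cnp, __unused const char *name,
    __unused const char *vfc_name)
{
	/* Example policy: only root may mount snapshots. */
	if (kauth_cred_getuid(cred) != 0) {
		return EPERM;
	}
	return 0;
}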
*/ -#if NFSCLIENT +#if CONFIG_NFS_CLIENT if (vp->v_tag == VT_NFS) { /* check with nfs if page is OK to drop */ error = nfs_buf_page_inval(vp, (off_t)f_offset); } else -#endif +#endif /* CONFIG_NFS_CLIENT */ { blkno = ubc_offtoblk(vp, (off_t)f_offset); error = buf_invalblkno(vp, blkno, 0); diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index 4e8d858f5..f2ce580fc 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -696,6 +696,7 @@ _vfs_sysctl _vfs_typenum _vfs_unbusy _vfs_unmountbyfsid +_vn_authorize_rmdir _vn_authorize_unlink _vn_bwrite _vn_default_error diff --git a/config/IOKit.exports b/config/IOKit.exports index 0010db9f1..d29f1f01c 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -1453,6 +1453,7 @@ _gIOResourceMatchKey _gIOResourcesKey _gIOServiceKey _gIOServicePlane +_gIOSupportedPropertiesKey _gIOTerminatedNotification _gIOUserClientClassKey _gIOWillTerminateNotification @@ -1678,7 +1679,7 @@ __ZTVN25IODataQueueDispatchSource9MetaClassE __ZN25IODataQueueDispatchSource19DequeueWithCoalesceEPbU13block_pointerFvPKvmE __ZN25IODataQueueDispatchSource19EnqueueWithCoalesceEjPbU13block_pointerFvPvmE -__ZN11IOMemoryMap17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P23IOMemoryMapPrivateStateE +__ZN11IOMemoryMap17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P24_IOMemoryMapPrivateStateE __ZN12IOUserClient22AsyncCompletion_InvokeE5IORPCP15OSMetaClassBasePFvS2_P8OSActioniPKyjE __ZN12IOUserClient22_ExternalMethod_InvokeE5IORPCP15OSMetaClassBasePFiS2_yPKyjP6OSDataP18IOMemoryDescriptorPyPjyPS6_S8_P8OSActionE __ZN12IOUserClient30CopyClientMemoryForType_InvokeE5IORPCP15OSMetaClassBasePFiS2_yPyPP18IOMemoryDescriptorE @@ -1691,7 +1692,7 @@ __ZN16IODispatchSource13Cancel_InvokeE5IORPCP15OSMetaClassBasePFiS2_U13block_poi __ZN16IODispatchSource16SetEnable_InvokeE5IORPCP15OSMetaClassBasePFiS2_bE __ZN16IODispatchSource19CheckForWork_InvokeE5IORPCP15OSMetaClassBasePFiS2_S0_bE __ZN16IODispatchSource30SetEnableWithCompletion_InvokeE5IORPCP15OSMetaClassBasePFiS2_bU13block_pointerFvvEE -__ZN18IOMemoryDescriptor17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P16IOMDPrivateStateE +__ZN18IOMemoryDescriptor17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P17_IOMDPrivateStateE __ZN18IOMemoryDescriptor20PrepareForDMA_InvokeE5IORPCP15OSMetaClassBasePFiS2_yP9IOServiceyyPyS5_PjP16IOAddressSegmentE __ZN24IOBufferMemoryDescriptor13Create_InvokeE5IORPCPFiyyyPPS_E __ZN24IOBufferMemoryDescriptor16SetLength_InvokeE5IORPCP15OSMetaClassBasePFiS2_yE diff --git a/config/MACFramework.exports b/config/MACFramework.exports index e274ed3e1..afe5c8e1d 100644 --- a/config/MACFramework.exports +++ b/config/MACFramework.exports @@ -12,6 +12,7 @@ _mac_label_set _mac_audit_text _mac_iokit_check_hid_control +_mac_mount_check_snapshot_mount _mac_vnode_check_trigger_resolve _sbuf_cat diff --git a/config/MasterVersion b/config/MasterVersion index 3a06932c5..441ebcc08 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -19.3.0 +19.4.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
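With _vn_authorize_rmdir added to BSDKernel.exports above (its declaration moved into the public sys/vnode.h earlier in this patch), kexts can authorize directory removals the same way vn_authorize_unlink() is already used. A minimal sketch, assuming the caller holds iocounts on dvp and vp and that cnp names vp within dvp:

#include <sys/vnode.h>
#include <sys/vnode_if.h>

/* Sketch: authorize, then perform, a directory removal. */
static int
remove_dir_checked(vnode_t dvp, vnode_t vp, struct componentname *cnp,
    vfs_context_t ctx)
{
	int error;

	error = vn_authorize_rmdir(dvp, vp, cnp, ctx, NULL);
	if (error != 0) {
		return error; /* non-zero: the rmdir is not authorized */
	}
	return VNOP_RMDIR(dvp, vp, cnp, ctx);
}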
diff --git a/config/Private.exports b/config/Private.exports index 5447e64e6..0436ff54f 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -348,6 +348,7 @@ _pffindproto:_pffindproto_old _port_name_to_task _port_name_to_thread _post_sys_powersource +_proc_csflags _proc_get_syscall_filter_mask_size _proc_getexecutableoffset _proc_getexecutablevnode @@ -357,9 +358,12 @@ _proc_pidbackgrounded _proc_pidversion _proc_set_responsible_pid _proc_set_syscall_filter_mask +_proc_selfcsflags _proc_task _proc_uniqueid _proc_puniqueid +_proc_gettty +_proc_gettty_dev _proc_exitstatus _priv_check_cred _pru_abort_notsupp diff --git a/iokit/DriverKit/IOBufferMemoryDescriptor.iig b/iokit/DriverKit/IOBufferMemoryDescriptor.iig index 449d66ea5..9975a16e5 100644 --- a/iokit/DriverKit/IOBufferMemoryDescriptor.iig +++ b/iokit/DriverKit/IOBufferMemoryDescriptor.iig @@ -45,8 +45,7 @@ * IOBufferMemoryDescriptor describes a memory buffer allocated in the callers address space. * * @discussion - * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create() - * Methods in this class are used for memory that was supplied as a parameter. + * To allocate memory for I/O or sharing, use IOBufferMemoryDescriptor::Create(). * IOBufferMemoryDescriptor can be handed to any API that expects an IOMemoryDescriptor. */ diff --git a/iokit/DriverKit/IOKitKeys.h b/iokit/DriverKit/IOKitKeys.h new file mode 100644 index 000000000..758ac24be --- /dev/null +++ b/iokit/DriverKit/IOKitKeys.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * Common symbol definitions for IOKit. 
+ * + * HISTORY + * + */ + + +#ifndef _IOKIT_IOKITKEYS_H +#define _IOKIT_IOKITKEYS_H + +// properties found in the registry root +#define kIOKitBuildVersionKey "IOKitBuildVersion" +#define kIOKitDiagnosticsKey "IOKitDiagnostics" +// a dictionary keyed by plane name +#define kIORegistryPlanesKey "IORegistryPlanes" +#define kIOCatalogueKey "IOCatalogue" + +// registry plane names +#define kIOServicePlane "IOService" +#define kIOPowerPlane "IOPower" +#define kIODeviceTreePlane "IODeviceTree" +#define kIOAudioPlane "IOAudio" +#define kIOFireWirePlane "IOFireWire" +#define kIOUSBPlane "IOUSB" + +// registry ID number +#define kIORegistryEntryIDKey "IORegistryEntryID" +// property name to get array of property names +#define kIORegistryEntryPropertyKeysKey "IORegistryEntryPropertyKeys" + +// IOService class name +#define kIOServiceClass "IOService" + +// IOResources class name +#define kIOResourcesClass "IOResources" + +// IOService driver probing property names +#define kIOClassKey "IOClass" +#define kIOProbeScoreKey "IOProbeScore" +#define kIOKitDebugKey "IOKitDebug" + +// Properties to be supported as API +#define kIOSupportedPropertiesKey "IOSupportedProperties" +// Properties writable by dexts +#define kIOUserServicePropertiesKey "IOUserServiceProperties" + + +// IOService matching property names +#define kIOProviderClassKey "IOProviderClass" +#define kIONameMatchKey "IONameMatch" +#define kIOPropertyMatchKey "IOPropertyMatch" +#define kIOPropertyExistsMatchKey "IOPropertyExistsMatch" +#define kIOPathMatchKey "IOPathMatch" +#define kIOLocationMatchKey "IOLocationMatch" +#define kIOParentMatchKey "IOParentMatch" +#define kIOResourceMatchKey "IOResourceMatch" +#define kIOResourceMatchedKey "IOResourceMatched" +#define kIOMatchedServiceCountKey "IOMatchedServiceCountMatch" + +#define kIONameMatchedKey "IONameMatched" + +#define kIOMatchCategoryKey "IOMatchCategory" +#define kIODefaultMatchCategoryKey "IODefaultMatchCategory" + +#define kIOMatchedPersonalityKey "IOMatchedPersonality" +#define kIORematchPersonalityKey "IORematchPersonality" +#define kIORematchCountKey "IORematchCount" +#define kIODEXTMatchCountKey "IODEXTMatchCount" + +// Entitlements to check against dext process +// Property is an array, one or more of which may match, of: +// an array of entitlement strings, all must be present +// Any array can be a single string. 
+#define kIOServiceDEXTEntitlementsKey "IOServiceDEXTEntitlements" + +// Entitlement required to open dext connection +#define kIODriverKitEntitlementKey "com.apple.developer.driverkit" + +// Entitlements required to open dext IOUserClient +// Property is an array of strings containing CFBundleIdentifiers of service being opened +#define kIODriverKitUserClientEntitlementsKey "com.apple.developer.driverkit.userclient-access" + +// Other DriverKit entitlements +#define kIODriverKitUSBTransportEntitlementKey "com.apple.developer.driverkit.transport.usb" +#define kIODriverKitHIDTransportEntitlementKey "com.apple.developer.driverkit.transport.hid" +#define kIODriverKitHIDFamilyDeviceEntitlementKey "com.apple.developer.driverkit.family.hid.device" +#define kIODriverKitHIDFamilyEventServiceEntitlementKey "com.apple.developer.driverkit.family.hid.eventservice" +#define kIODriverKitTransportBuiltinEntitlementKey "com.apple.developer.driverkit.builtin" + +// Entitlement required to read nvram root-only properties as non-root user +#define kIONVRAMReadAccessKey "com.apple.private.iokit.nvram-read-access" +// Entitlement required to write nvram properties as non-root user +#define kIONVRAMWriteAccessKey "com.apple.private.iokit.nvram-write-access" + +// When possible, defer matching of this driver until kextd has started. +#define kIOMatchDeferKey "IOMatchDefer" + +// Published after processor_start() has been called on all CPUs at boot time. +#define kIOAllCPUInitializedKey "IOAllCPUInitialized" + +// IOService default user client class, for loadable user clients +#define kIOUserClientClassKey "IOUserClientClass" + +// key to find IOMappers +#define kIOMapperIDKey "IOMapperID" + +#define kIOUserClientCrossEndianKey "IOUserClientCrossEndian" +#define kIOUserClientCrossEndianCompatibleKey "IOUserClientCrossEndianCompatible" +#define kIOUserClientSharedInstanceKey "IOUserClientSharedInstance" +#if KERNEL_PRIVATE +#define kIOUserClientMessageAppSuspendedKey "IOUserClientMessageAppSuspended" +#endif +// diagnostic string describing the creating task +#define kIOUserClientCreatorKey "IOUserClientCreator" +// the expected cdhash value of the userspace driver executable +#define kIOUserServerCDHashKey "IOUserServerCDHash" + +#define kIOUserUserClientKey "IOUserUserClient" + + +// IOService notification types +#define kIOPublishNotification "IOServicePublish" +#define kIOFirstPublishNotification "IOServiceFirstPublish" +#define kIOMatchedNotification "IOServiceMatched" +#define kIOFirstMatchNotification "IOServiceFirstMatch" +#define kIOTerminatedNotification "IOServiceTerminate" +#define kIOWillTerminateNotification "IOServiceWillTerminate" + +// IOService interest notification types +#define kIOGeneralInterest "IOGeneralInterest" +#define kIOBusyInterest "IOBusyInterest" +#define kIOAppPowerStateInterest "IOAppPowerStateInterest" +#define kIOPriorityPowerStateInterest "IOPriorityPowerStateInterest" + +#define kIOPlatformDeviceMessageKey "IOPlatformDeviceMessage" + +// IOService interest notification types +#define kIOCFPlugInTypesKey "IOCFPlugInTypes" + +#define kIOCompatibilityMatchKey "IOCompatibilityMatch" +#define kIOCompatibilityPropertiesKey "IOCompatibilityProperties" +#define kIOPathKey "IOPath" + + +// properties found in services that implement command pooling +#define kIOCommandPoolSizeKey "IOCommandPoolSize" // (OSNumber) + +// properties found in services that implement priority +#define kIOMaximumPriorityCountKey "IOMaximumPriorityCount" // (OSNumber) + +// properties found in services that 
have transfer constraints +#define kIOMaximumBlockCountReadKey "IOMaximumBlockCountRead" // (OSNumber) +#define kIOMaximumBlockCountWriteKey "IOMaximumBlockCountWrite" // (OSNumber) +#define kIOMaximumByteCountReadKey "IOMaximumByteCountRead" // (OSNumber) +#define kIOMaximumByteCountWriteKey "IOMaximumByteCountWrite" // (OSNumber) +#define kIOMaximumSegmentCountReadKey "IOMaximumSegmentCountRead" // (OSNumber) +#define kIOMaximumSegmentCountWriteKey "IOMaximumSegmentCountWrite" // (OSNumber) +#define kIOMaximumSegmentByteCountReadKey "IOMaximumSegmentByteCountRead" // (OSNumber) +#define kIOMaximumSegmentByteCountWriteKey "IOMaximumSegmentByteCountWrite" // (OSNumber) +#define kIOMinimumSegmentAlignmentByteCountKey "IOMinimumSegmentAlignmentByteCount" // (OSNumber) +#define kIOMaximumSegmentAddressableBitCountKey "IOMaximumSegmentAddressableBitCount" // (OSNumber) +#define kIOMinimumSaturationByteCountKey "IOMinimumSaturationByteCount" // (OSNumber) +#define kIOMaximumSwapWriteKey "IOMaximumSwapWrite" // (OSNumber) + +// properties found in services that wish to describe an icon +// +// IOIcon = +// { +// CFBundleIdentifier = "com.example.driver.example"; +// IOBundleResourceFile = "example.icns"; +// }; +// +// where IOBundleResourceFile is the filename of the resource + +#define kIOIconKey "IOIcon" // (OSDictionary) +#define kIOBundleResourceFileKey "IOBundleResourceFile" // (OSString) + +#define kIOBusBadgeKey "IOBusBadge" // (OSDictionary) +#define kIODeviceIconKey "IODeviceIcon" // (OSDictionary) + +// property of root that describes the machine's serial number as a string +#define kIOPlatformSerialNumberKey "IOPlatformSerialNumber" // (OSString) + +// property of root that describes the machine's UUID as a string +#define kIOPlatformUUIDKey "IOPlatformUUID" // (OSString) + +// IODTNVRAM property keys +#define kIONVRAMBootArgsKey "boot-args" +#define kIONVRAMDeletePropertyKey "IONVRAM-DELETE-PROPERTY" +#define kIONVRAMSyncNowPropertyKey "IONVRAM-SYNCNOW-PROPERTY" +#define kIONVRAMActivateCSRConfigPropertyKey "IONVRAM-ARMCSR-PROPERTY" +#define kIODTNVRAMPanicInfoKey "aapl,panic-info" + +// keys for complex boot information +#define kIOBootDeviceKey "IOBootDevice" // dict | array of dicts +#define kIOBootDevicePathKey "IOBootDevicePath" // arch-neutral OSString +#define kIOBootDeviceSizeKey "IOBootDeviceSize" // OSNumber of bytes + +// keys for OS Version information +#define kOSBuildVersionKey "OS Build Version" + +#endif /* ! 
_IOKIT_IOKITKEYS_H */ diff --git a/iokit/DriverKit/IOMemoryDescriptor.iig b/iokit/DriverKit/IOMemoryDescriptor.iig index 760d48cb1..c2c12063b 100644 --- a/iokit/DriverKit/IOMemoryDescriptor.iig +++ b/iokit/DriverKit/IOMemoryDescriptor.iig @@ -64,7 +64,7 @@ struct IOAddressSegment { uint64_t length; }; -struct IOMDPrivateState { +struct _IOMDPrivateState { uint64_t length; uint64_t options; }; @@ -159,7 +159,7 @@ class EXTENDS (IOMemoryDescriptor) IOMemoryDescriptorPrivate { virtual kern_return_t _CopyState( - IOMDPrivateState * state); + _IOMDPrivateState * state); }; diff --git a/iokit/DriverKit/IOMemoryMap.iig b/iokit/DriverKit/IOMemoryMap.iig index 716c87f09..4e49f8de1 100644 --- a/iokit/DriverKit/IOMemoryMap.iig +++ b/iokit/DriverKit/IOMemoryMap.iig @@ -37,7 +37,7 @@ #include -struct IOMemoryMapPrivateState { +struct _IOMemoryMapPrivateState { uint64_t length; uint64_t offset; uint64_t options; @@ -91,7 +91,7 @@ class EXTENDS (IOMemoryMap) IOMemoryMapPrivate { virtual kern_return_t _CopyState( - IOMemoryMapPrivateState * state); + _IOMemoryMapPrivateState * state); }; #endif /* ! _IOKIT_UIOMEMORYMAP_H */ diff --git a/iokit/DriverKit/IOReturn.h b/iokit/DriverKit/IOReturn.h index 5175ee7d0..13eb9b25d 100644 --- a/iokit/DriverKit/IOReturn.h +++ b/iokit/DriverKit/IOReturn.h @@ -46,8 +46,11 @@ extern "C" { #else /* PLATFORM_DriverKit */ -#ifndef _MACH_ERROR_H_ -#define _MACH_ERROR_H_ +#ifdef DRIVERKIT_PRIVATE + +#include + +#else /* DRIVERKIT_PRIVATE */ typedef int kern_return_t; @@ -76,7 +79,7 @@ typedef int kern_return_t; #define sub_emask (err_sub(0xfff)) #define code_emask (0x3fff) -#endif /* _MACH_ERROR_H_ */ +#endif /* DRIVERKIT_PRIVATE */ #endif /* PLATFORM_DriverKit */ @@ -112,6 +115,9 @@ typedef kern_return_t IOReturn; #define sub_iokit_smc err_sub(32) #endif #define sub_iokit_apfs err_sub(33) +#define sub_iokit_acpiec err_sub(34) +#define sub_iokit_timesync_avb err_sub(35) + #define sub_iokit_platform err_sub(0x2A) #define sub_iokit_audio_video err_sub(0x45) #define sub_iokit_cec err_sub(0x46) diff --git a/iokit/DriverKit/IOService.iig b/iokit/DriverKit/IOService.iig index 5885850dc..274950e8a 100644 --- a/iokit/DriverKit/IOService.iig +++ b/iokit/DriverKit/IOService.iig @@ -40,6 +40,7 @@ class IOMemoryDescriptor; class IOBufferMemoryDescriptor; class IOUserClient; +class OSAction; typedef char IOServiceName[128]; typedef char IOPropertyName[128]; @@ -245,7 +246,83 @@ public: Create( IOService * provider, const IOPropertyName propertiesKey, - IOService ** result); + IOService ** result) LOCAL; + + /*! + * @brief Start an IOService termination. + * @discussion An IOService object created with Create() may be removed by calling Terminate(). + * The termination is asynchronous and will later call Stop() on the service. + * @param options No options are currently defined, pass zero. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + Terminate( + uint64_t options); + + /*! + * @brief Obtain supportable properties describing the provider chain. + * @discussion Obtain supportable properties describing the provider chain. This will be a subset of registry + * properties the OS considers supportable. + * The array is ordered with a dictionary of properties for each entry in the provider chain from this + * service towards the root. + * @param propertyKeys If only certain property values are needed, they may be passed in this array. + * @param properties Returned, retained array of dictionaries of properties or NULL.
The caller should release + * this array. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + CopyProviderProperties( + OSArray * propertyKeys, + OSArray ** properties); + + + /*! @function IOCreatePropertyMatchingDictionary + * @abstract Construct a matching dictionary for property matching. + */ + static OSDictionary * + CreatePropertyMatchingDictionary(const char * key, OSObjectPtr value, OSDictionary * matching) LOCALONLY; + + /*! @function IOCreatePropertyMatchingDictionary + * @abstract Construct a matching dictionary for property matching. + */ + static OSDictionary * + CreatePropertyMatchingDictionary(const char * key, const char * stringValue, OSDictionary * matching) LOCALONLY; + + /*! @function IOCreateKernelClassMatchingDictionary + * @abstract Construct a matching dictionary for kernel class matching. + */ + static OSDictionary * + CreateKernelClassMatchingDictionary(OSString * className, OSDictionary * matching) LOCALONLY; + + /*! @function IOCreateKernelClassMatchingDictionary + * @abstract Construct a matching dictionary for kernel class matching. + */ + static OSDictionary * + CreateKernelClassMatchingDictionary(const char * className, OSDictionary * matching) LOCALONLY; + + /*! @function IOCreateUserClassMatchingDictionary + * @abstract Construct a matching dictionary for user class matching. + */ + static OSDictionary * + CreateUserClassMatchingDictionary(OSString * className, OSDictionary * matching) LOCALONLY; + + /*! @function IOCreateUserClassMatchingDictionary + * @abstract Construct a matching dictionary for user class matching. + */ + static OSDictionary * + CreateUserClassMatchingDictionary(const char * className, OSDictionary * matching) LOCALONLY; + + /*! @function IOCreateNameMatchingDictionary + * @abstract Construct a matching dictionary for IOService name matching. + */ + static OSDictionary * + CreateNameMatchingDictionary(OSString * serviceName, OSDictionary * matching) LOCALONLY; + + /*! @function IOCreateNameMatchingDictionary + * @abstract Construct a matching dictionary for IOService name matching. + */ + static OSDictionary * + CreateNameMatchingDictionary(const char * serviceName, OSDictionary * matching) LOCALONLY; }; #endif /* ! _IOKIT_UIOSERVICE_H */ diff --git a/iokit/DriverKit/IOServiceNotificationDispatchSource.iig b/iokit/DriverKit/IOServiceNotificationDispatchSource.iig new file mode 100644 index 000000000..babc74c6c --- /dev/null +++ b/iokit/DriverKit/IOServiceNotificationDispatchSource.iig @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2019-2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_UIOSERVICEDISPATCHSOURCE_H +#define _IOKIT_UIOSERVICEDISPATCHSOURCE_H + +#include +#include +#include + + +typedef void (^IOServiceNotificationBlock)(uint64_t type, IOService * service, uint64_t options); + +enum { + kIOServiceNotificationTypeTerminated = 0x00000000, + kIOServiceNotificationTypeMatched = 0x00000001, + kIOServiceNotificationTypeLast = kIOServiceNotificationTypeMatched, + kIOServiceNotificationTypeNone = 0xFFFFFFFF, +}; + +class NATIVE KERNEL IOServiceNotificationDispatchSource : public IODispatchSource +{ +public: + + /*! + * @brief Create an IOServiceNotificationDispatchSource for IOService matching and termination events. + * @param matching An IOService matching dictionary. + * @param options None defined, pass zero. + * @param queue IODispatchQueue the source is attached to. Note that the ServiceNotificationReady + * handler is invoked on the queue set for the target method + * of the OSAction, not this queue. + * @param source Created source with +1 retain count to be released by the caller. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + static kern_return_t + Create( + OSDictionary * matching, + uint64_t options, + IODispatchQueue * queue, + IOServiceNotificationDispatchSource ** notification) LOCAL; + + virtual bool + init() override; + + virtual void + free() override; + + /*! + * @brief Control the enable state of the notification. + * @param enable Pass true to enable the source or false to disable. + * @param handler Optional block to be executed after the interrupt has been disabled and any pending + * interrupt handlers completed. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetEnableWithCompletion( + bool enable, + IODispatchSourceCancelHandler handler) override LOCAL; + + /*! + * @brief Cancel all callbacks from the event source. + * @discussion After cancellation, the source can only be freed. It cannot be reactivated. + * @param handler Handler block to be invoked after any callbacks have completed. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + Cancel(IODispatchSourceCancelHandler handler) override LOCAL; + + /*! + * @brief Set the handler block to run when the notification has become ready. + * @param action OSAction instance specifying the callback method. The OSAction object will be retained + * until SetHandler is called again or the event source is cancelled. + * The ServiceNotificationReady handler is invoked on the queue set for the target method of the + * OSAction. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + SetHandler( + OSAction * action TYPE(ServiceNotificationReady)); + + /*! + * @brief Invoke a block for each notification available in response to ServiceNotificationReady. + * @discussion The IOService object passed to the notification is only retained for the duration of the block. 
+ * It should be retained by the block code if used beyond the invocation. + * @param block to be invoked with each notification + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + kern_return_t + DeliverNotifications(IOServiceNotificationBlock block) LOCALONLY; + +private: + virtual kern_return_t + CheckForWork(bool synchronous) override LOCAL; + + virtual void + ServiceNotificationReady( + OSAction * action TARGET) LOCAL = 0; + + virtual kern_return_t + CopyNextNotification( + uint64_t * type, + IOService ** service, + uint64_t * options); +}; + +#endif /* ! _IOKIT_UIOSERVICEDISPATCHSOURCE_H */ diff --git a/iokit/DriverKit/Makefile b/iokit/DriverKit/Makefile index 62ab74bc2..e797df2d2 100644 --- a/iokit/DriverKit/Makefile +++ b/iokit/DriverKit/Makefile @@ -29,7 +29,7 @@ GENERATED_IMPL = $(patsubst %.iig,%.iig.cpp,$(ALL_DEFS)) INSTALL_MI_LIST = $(ALL_DEFS) INSTALL_DRIVERKIT_MI_LIST = $(ALL_DEFS) -OTHER_HEADERS = IOTypes.h IOReturn.h IORPC.h +OTHER_HEADERS = IOTypes.h IOReturn.h IORPC.h IOKitKeys.h EXPORT_MI_GEN_LIST = $(GENERATED_HEADERS) $(OTHER_HEADERS) INSTALL_MI_GEN_LIST = $(GENERATED_HEADERS) $(OTHER_HEADERS) diff --git a/iokit/DriverKit/OSAction.iig b/iokit/DriverKit/OSAction.iig index 999205c4e..ddb3b2e17 100644 --- a/iokit/DriverKit/OSAction.iig +++ b/iokit/DriverKit/OSAction.iig @@ -33,6 +33,7 @@ typedef void (^OSActionCancelHandler)(void); typedef void (^OSActionAbortedHandler)(void); +struct OSActionWaitToken; /*! * @class OSAction @@ -44,13 +45,22 @@ typedef void (^OSActionAbortedHandler)(void); * The callback is specified as a method and object pair. * State associated with the callback may be allocated and stored for the creator of the object. * Methods to allocate an OSAction instance are generated for each method defined in a class with - * a TYPE attribute, so there should not be any need to directly call OSAction::Create(). + * a TYPE attribute. The generated methods are named CreateAction{name of method with type attribute} + * and have the following declaration: + * + * kern_return_t CreateActionNameOfMethod(size_t referenceSize, OSAction **action); + * + * referenceSize refers to the size of additional state structure available to the creator of the OSAction + * with GetReference. If successful, the generated method returns kIOReturnSuccess and a created OSAction + * through the 'action' parameter with a +1 retain count to be released by the caller. See IOReturn.h for + * error codes. */ class NATIVE KERNEL OSAction : public OSObject { public: +#if DRIVERKIT_PRIVATE /*! * @brief Create an instance of OSAction. * @discussion Methods to allocate an OSAction instance are generated for each method defined in a class with @@ -72,6 +82,7 @@ public: uint64_t msgid, size_t referenceSize, OSAction ** action) LOCAL; +#endif virtual void free() override; @@ -105,6 +116,46 @@ public: kern_return_t SetAbortedHandler(OSActionAbortedHandler handler) LOCALONLY; + /*! + * @brief Mark this OSAction to be waited for later with Wait(). + * @discussion This call should be made before any possible invocation of the action. + * An OSAction instance only supports one waiter and WillWait() will return an error if already called. + * @param token Opaque value to be passed to a later call to Wait() and EndWait(). + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + kern_return_t + WillWait(OSActionWaitToken ** token) LOCALONLY; + + /*! + * @brief Discard the OSActionWaitToken for the action. 
+ * @discussion Free any resources needed to wait for the action allocated by WillWait(). + * There should be no outstanding invocations of the action when EndWait is called, + * if necessary the action should be canceled before calling EndWait(). + * @param token Opaque value to be passed from an earlier call to WillWait(). + * @return kIOReturnSuccess on success. kIOReturnAborted if aborted or canceled. + kIOReturnTimeout if the deadline was passed. See IOReturn.h for error codes. + */ + kern_return_t + EndWait( + OSActionWaitToken * token) LOCALONLY; + + /*! + * @brief Wait for the action to be invoked. + * @discussion The current thread is blocked until the action invocation has completed, the action canceled + or aborted, or the deadline passed. + * @param token Opaque value to be passed from an earlier call to WillWait(). + * @param options Pass one of the kIOTimerClock* options to specify the timebase for the + * deadline. + * @param deadline Pass the time the wait should timeout, or zero for no timeout. + * @return kIOReturnSuccess on success. kIOReturnAborted if aborted or canceled. + kIOReturnTimeout if the deadline was passed. See IOReturn.h for error codes. + */ + kern_return_t + Wait( + OSActionWaitToken * token, + uint64_t options, + uint64_t deadline) LOCALONLY; + virtual void Aborted(void) LOCAL; }; diff --git a/iokit/DriverKit/OSObject.iig b/iokit/DriverKit/OSObject.iig index 38b55fab3..f97de5aa9 100644 --- a/iokit/DriverKit/OSObject.iig +++ b/iokit/DriverKit/OSObject.iig @@ -50,42 +50,75 @@ class OSObject; typedef OSObject * OSObjectPtr; #endif -#if __IIG && !__IIG_ATTRIBUTES_DEFINED__ +#if !__IIG_ATTRIBUTES_DEFINED__ #define __IIG_ATTRIBUTES_DEFINED__ 1 -#define KERNEL __attribute__((annotate("kernel"))) -#define NATIVE __attribute__((annotate("native"))) -#define LOCAL __attribute__((annotate("local"))) -#define LOCALONLY __attribute__((annotate("localonly"))) -#define REMOTE __attribute__((annotate("remote"))) - -#define LOCALHOST __attribute__((annotate("localhost"))) - -#define INVOKEREPLY __attribute__((annotate("invokereply"))) -#define REPLY __attribute__((annotate("reply"))) - -#define PORTMAKESEND __attribute__((annotate("MACH_MSG_TYPE_MAKE_SEND"))) -#define PORTCOPYSEND __attribute__((annotate("MACH_MSG_TYPE_COPY_SEND"))) - -#define TARGET __attribute__((annotate("target"))) -#define TYPE(p) __attribute__((annotate("type=" # p))) - -//#define ARRAY(maxcount) __attribute__((annotate(# maxcount), annotate("array"))) -#define EXTENDS(cls) __attribute__((annotate("extends=" # cls))) - -//#define INTERFACE __attribute__((annotate("interface"))) -//#define IMPLEMENTS(i) void implements(i *); - -#define QUEUENAME(name) __attribute__((annotate("queuename=" # name))) - -#define IIG_SERIALIZABLE __attribute__((annotate("serializable"))) - -#else +#if __IIG || __DOCUMENTATION__ + +#define IIG_KERNEL __attribute__((annotate("kernel"))) +#define IIG_NATIVE __attribute__((annotate("native"))) +#define IIG_LOCAL __attribute__((annotate("local"))) +#define IIG_LOCALONLY __attribute__((annotate("localonly"))) +#define IIG_REMOTE __attribute__((annotate("remote"))) +#define IIG_LOCALHOST __attribute__((annotate("localhost"))) +#define IIG_INVOKEREPLY __attribute__((annotate("invokereply"))) +#define IIG_REPLY __attribute__((annotate("reply"))) +#define IIG_PORTMAKESEND __attribute__((annotate("MACH_MSG_TYPE_MAKE_SEND"))) +#define IIG_PORTCOPYSEND __attribute__((annotate("MACH_MSG_TYPE_COPY_SEND"))) +#define IIG_TARGET __attribute__((annotate("target"))) +#define IIG_TYPE(p) 
__attribute__((annotate("type=" # p))) +//#define IIG_ARRAY(maxcount) __attribute__((annotate(# maxcount), annotate("array"))) +#define IIG_EXTENDS(cls) __attribute__((annotate("extends=" # cls))) +//#define IIG_INTERFACE __attribute__((annotate("interface"))) +//#define IIG_IMPLEMENTS(i) void __implements(i *); +#define IIG_QUEUENAME(name) __attribute__((annotate("queuename=" # name))) +#define IIG_SERIALIZABLE __attribute__((annotate("serializable"))) +#if __IIG +#define KERNEL IIG_KERNEL +#endif /* __IIG */ +#define NATIVE IIG_NATIVE +#define LOCAL IIG_LOCAL +#define LOCALONLY IIG_LOCALONLY +#define REMOTE IIG_REMOTE +#define LOCALHOST IIG_LOCALHOST +#define INVOKEREPLY IIG_INVOKEREPLY +#define REPLY IIG_REPLY +#define PORTMAKESEND IIG_PORTMAKESEND +#define PORTCOPYSEND IIG_PORTCOPYSEND +#define TARGET IIG_TARGET +#define TYPE(p) IIG_TYPE(p) +//#define ARRAY(maxcount) IIG_ARRAY(maxcount) +#define EXTENDS(cls) IIG_EXTENDS(cls) +//#define INTERFACE IIG_INTERFACE +//#define IMPLEMENTS(i) IIG_IMPLEMENTS(i) +#define QUEUENAME(name) IIG_QUEUENAME(name) + +#else /* __IIG || __DOCUMENTATION__ */ + +#define IIG_KERNEL +#define IIG_NATIVE +#define IIG_LOCAL +#define IIG_LOCALONLY +#define IIG_REMOTE +#define IIG_LOCALHOST +#define IIG_INVOKEREPLY +#define IIG_REPLY +#define IIG_PORTMAKESEND +#define IIG_PORTCOPYSEND +#define IIG_TARGET +#define IIG_TYPE(p) +//#define IIG_ARRAY(maxcount) +#define IIG_EXTENDS(cls) +//#define IIG_INTERFACE +//#define IIG_IMPLEMENTS(i) +#define IIG_QUEUENAME(name) #define IIG_SERIALIZABLE -#endif /* __IIG */ +#endif /* __IIG || __DOCUMENTATION__ */ + +#endif /* __IIG_ATTRIBUTES_DEFINED__ */ #if !__IIG diff --git a/iokit/IOKit/IOCatalogue.h b/iokit/IOKit/IOCatalogue.h index 682625f43..01f8f295f 100644 --- a/iokit/IOKit/IOCatalogue.h +++ b/iokit/IOKit/IOCatalogue.h @@ -174,6 +174,9 @@ public: * @param unload Flag to cause the actual unloading of the module. */ IOReturn terminateDriversForModule( const char * moduleName, bool unload = true); +#if XNU_KERNEL_PRIVATE + IOReturn terminateDrivers(OSDictionary * matching, io_name_t className); +#endif /* XNU_KERNEL_PRIVATE */ /*! 
* @function startMatching @@ -227,7 +230,6 @@ private: IOReturn unloadModule( OSString * moduleName ) const; IOReturn _removeDrivers(OSDictionary * matching); - IOReturn _terminateDrivers(OSDictionary * matching); }; extern const OSSymbol * gIOClassKey; diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h index e8db76ced..34da31735 100644 --- a/iokit/IOKit/IOKitKeys.h +++ b/iokit/IOKit/IOKitKeys.h @@ -69,6 +69,12 @@ #define kIOProbeScoreKey "IOProbeScore" #define kIOKitDebugKey "IOKitDebug" +// Properties to be supported as API +#define kIOSupportedPropertiesKey "IOSupportedProperties" +// Properties writable by dexts +#define kIOUserServicePropertiesKey "IOUserServiceProperties" + + // IOService matching property names #define kIOProviderClassKey "IOProviderClass" #define kIONameMatchKey "IONameMatch" diff --git a/iokit/IOKit/IOKitServer.h b/iokit/IOKit/IOKitServer.h index f7cc9eae6..55973bad6 100644 --- a/iokit/IOKit/IOKitServer.h +++ b/iokit/IOKit/IOKitServer.h @@ -155,6 +155,7 @@ extern mach_port_name_t iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type ); extern mach_port_t ipc_port_make_send(mach_port_t); +extern mach_port_t ipc_port_copy_send(mach_port_t); extern void ipc_port_release_send(ipc_port_t port); extern io_object_t iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type); @@ -185,6 +186,8 @@ extern mach_msg_header_t * ipc_kmsg_msg_header(ipc_kmsg_t); extern kern_return_t uext_server(ipc_kmsg_t request, ipc_kmsg_t * preply); +extern kern_return_t +iokit_label_dext_task(task_t task); /* * Functions imported by iokit:IOMemoryDescriptor.cpp diff --git a/iokit/IOKit/IORegistryEntry.h b/iokit/IOKit/IORegistryEntry.h index c9e059654..af3b946ea 100644 --- a/iokit/IOKit/IORegistryEntry.h +++ b/iokit/IOKit/IORegistryEntry.h @@ -184,6 +184,11 @@ public: void *arg0 = NULL, void *arg1 = NULL, void *arg2 = NULL, void *arg3 = NULL); +#ifdef __BLOCKS__ + typedef IOReturn (^ActionBlock)(void); + IOReturn runPropertyActionBlock(ActionBlock block); +#endif /* __BLOCKS__ */ + private: #if __LP64__ OSMetaClassDeclareReservedUnused(IORegistryEntry, 0); diff --git a/iokit/IOKit/IOReturn.h b/iokit/IOKit/IOReturn.h index d93203222..13eb9b25d 100644 --- a/iokit/IOKit/IOReturn.h +++ b/iokit/IOKit/IOReturn.h @@ -46,6 +46,12 @@ extern "C" { #else /* PLATFORM_DriverKit */ +#ifdef DRIVERKIT_PRIVATE + +#include + +#else /* DRIVERKIT_PRIVATE */ + typedef int kern_return_t; #define KERN_SUCCESS 0 @@ -73,6 +79,8 @@ typedef int kern_return_t; #define sub_emask (err_sub(0xfff)) #define code_emask (0x3fff) +#endif /* DRIVERKIT_PRIVATE */ + #endif /* PLATFORM_DriverKit */ typedef kern_return_t IOReturn; diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index fdfded661..28c99e74c 100644 --- a/iokit/IOKit/IOService.h +++ b/iokit/IOKit/IOService.h @@ -162,6 +162,9 @@ extern const OSSymbol * gIODeviceMemoryKey; extern const OSSymbol * gIOInterruptControllersKey; extern const OSSymbol * gIOInterruptSpecifiersKey; +extern const OSSymbol * gIOSupportedPropertiesKey; +extern const OSSymbol * gIOUserServicePropertiesKey; + extern const OSSymbol * gIOBSDKey; extern const OSSymbol * gIOBSDNameKey; extern const OSSymbol * gIOBSDMajorKey; diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index 76d6a4ef3..6db286316 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -103,10 +103,8 @@ enum { kIOPMDeviceUsable = 0x00008000, kIOPMLowPower = 0x00010000, #if PRIVATE -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) 
kIOPMAOTPower = 0x00020000, kIOPMAOTCapability = kIOPMAOTPower, -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ #endif /* PRIVATE */ kIOPMPreventIdleSleep = 0x00000040, kIOPMSleepCapability = 0x00000004, diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index ee04a5072..fd91d71e9 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -693,7 +693,6 @@ enum { #define kIOPMWakeEventSource 0x00000001 -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) /***************************************************************************** * * AOT defs @@ -769,8 +768,6 @@ struct IOPMAOTMetrics #define kIOPMAOTPowerKey "aot-power" -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - /***************************************************************************** * * System Sleep Policy diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h index 61334b1cd..22e38474e 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -808,7 +808,6 @@ private: OSArray * _systemWakeEventsArray; bool _acceptSystemWakeEvents; -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) // AOT -- IOPMCalendarStruct _aotWakeTimeCalendar; IOTimerEventSource * _aotTimerES; @@ -838,7 +837,6 @@ public: bool isAOTMode(void); private: // -- AOT -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ void updateTasksSuspend(void); int findSuspendedPID(uint32_t pid, uint32_t *outRefCount); diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 7c0201e4d..6f8f07890 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -557,17 +557,13 @@ IOCatalogue::unloadModule(OSString * moduleName) const } IOReturn -IOCatalogue::_terminateDrivers(OSDictionary * matching) +IOCatalogue::terminateDrivers(OSDictionary * matching, io_name_t className) { OSDictionary * dict; OSIterator * iter; IOService * service; IOReturn ret; - if (!matching) { - return kIOReturnBadArgument; - } - ret = kIOReturnSuccess; dict = NULL; iter = IORegistryIterator::iterateOver(gIOServicePlane, @@ -576,25 +572,70 @@ IOCatalogue::_terminateDrivers(OSDictionary * matching) return kIOReturnNoMemory; } - OSKext::uniquePersonalityProperties( matching ); + if (matching) { + OSKext::uniquePersonalityProperties( matching ); + } // terminate instances. do { iter->reset(); while ((service = (IOService *)iter->getNextObject())) { - dict = service->getPropertyTable(); - if (!dict) { + if (className && !service->metaCast(className)) { continue; } - - /* Terminate only for personalities that match the matching dictionary. - * This comparison must be done with only the keys in the - * "matching" dict to enable general matching. - */ - if (!dict->isEqualTo(matching, matching)) { - continue; + if (matching) { + /* Terminate only for personalities that match the matching dictionary. + * This comparison must be done with only the keys in the + * "matching" dict to enable general matching. 
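+				 * For example (hypothetical values): matching = { "IOProviderClass":
+				 * "IOPCIDevice" } compares equal to a personality { "IOProviderClass":
+				 * "IOPCIDevice", "IOClass": "AppleSamplePCI", "IOProbeScore": 1000 },
+				 * since isEqualTo(matching, matching) restricts the comparison to the
+				 * keys of its second argument.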
+ */ + dict = service->getPropertyTable(); + if (!dict) { + continue; + } + if (!dict->isEqualTo(matching, matching)) { + continue; + } } + OSKext * kext; + const char * bundleIDStr; + OSObject * prop; + bool okToTerminate; + for (okToTerminate = true;;) { + kext = service->getMetaClass()->getKext(); + if (!kext) { + break; + } + bundleIDStr = kext->getIdentifierCString(); + if (!bundleIDStr) { + break; + } + prop = kext->getPropertyForHostArch(kOSBundleAllowUserTerminateKey); + if (prop) { + okToTerminate = (kOSBooleanTrue == prop); + break; + } + if (!strcmp(kOSKextKernelIdentifier, bundleIDStr)) { + okToTerminate = false; + break; + } + if (!strncmp("com.apple.", bundleIDStr, strlen("com.apple."))) { + okToTerminate = false; + break; + } + break; + } + if (!okToTerminate) { +#if DEVELOPMENT || DEBUG + okToTerminate = true; +#endif /* DEVELOPMENT || DEBUG */ + IOLog("%sallowing kextunload terminate for bundleID %s\n", + okToTerminate ? "" : "dis", bundleIDStr ? bundleIDStr : "?"); + if (!okToTerminate) { + ret = kIOReturnUnsupported; + break; + } + } if (!service->terminate(kIOServiceRequired | kIOServiceSynchronous)) { ret = kIOReturnUnsupported; break; @@ -649,7 +690,10 @@ IOCatalogue::terminateDrivers(OSDictionary * matching) { IOReturn ret; - ret = _terminateDrivers(matching); + if (!matching) { + return kIOReturnBadArgument; + } + ret = terminateDrivers(matching, NULL); IORWLockWrite(lock); if (kIOReturnSuccess == ret) { ret = _removeDrivers(matching); @@ -695,7 +739,7 @@ IOCatalogue::terminateDriversForModule( dict->setObject(gIOModuleIdentifierKey, moduleName); - ret = _terminateDrivers(dict); + ret = terminateDrivers(dict, NULL); /* No goto between IOLock calls! */ diff --git a/iokit/Kernel/IODeviceTreeSupport.cpp b/iokit/Kernel/IODeviceTreeSupport.cpp index f219e2f1d..1077c7920 100644 --- a/iokit/Kernel/IODeviceTreeSupport.cpp +++ b/iokit/Kernel/IODeviceTreeSupport.cpp @@ -221,14 +221,12 @@ IODeviceTreeAlloc( void * dtTop ) if (!intMap && child->getProperty( gIODTInterruptParentKey)) { intMap = true; } -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (!strcmp("sep", child->getName()) || !strcmp("aop", child->getName()) || !strcmp("disp0", child->getName())) { uint32_t aotFlags = 1; child->setProperty("aot-power", &aotFlags, sizeof(aotFlags)); } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } regIter->release(); } diff --git a/iokit/Kernel/IOKitDebug.cpp b/iokit/Kernel/IOKitDebug.cpp index 9cf95e501..7c1a6a141 100644 --- a/iokit/Kernel/IOKitDebug.cpp +++ b/iokit/Kernel/IOKitDebug.cpp @@ -493,7 +493,7 @@ IOTrackingAddUser(IOTrackingQueue * queue, IOTrackingUser * mem, vm_size_t size) if ((kernel_task != current_task()) && (self = proc_self())) { bool user_64 = false; mem->btPID = proc_pid(self); - (void)backtrace_user(&mem->btUser[0], kIOTrackingCallSiteBTs - 1, &num, + num = backtrace_user(&mem->btUser[0], kIOTrackingCallSiteBTs - 1, NULL, &user_64, NULL); mem->user32 = !user_64; proc_rele(self); diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index c1114f9ec..d72b90091 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -324,15 +324,9 @@ static IOPMPowerState .outputPowerCharacter = kIOPMSleep, .inputPowerRequirement = SLEEP_POWER }, { .version = 1, -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) .capabilityFlags = kIOPMAOTCapability, .outputPowerCharacter = kIOPMAOTPower, .inputPowerRequirement = ON_POWER }, -#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - 
.capabilityFlags = 0, - .outputPowerCharacter = 0, - .inputPowerRequirement = 0xFFFFFFFF }, -#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ { .version = 1, .capabilityFlags = kIOPMPowerOn, .outputPowerCharacter = kIOPMPowerOn, @@ -983,11 +977,7 @@ IOPMrootDomain::updateTasksSuspend(void) { bool newSuspend; -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) newSuspend = (tasksSuspended || _aotTasksSuspended); -#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - newSuspend = tasksSuspended; -#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ if (newSuspend == tasksSuspendState) { return; } @@ -1234,7 +1224,6 @@ static SYSCTL_INT(_debug, OID_AUTO, swd_panic, CTLFLAG_RW, &gSwdPanic, 0, ""); static SYSCTL_INT(_debug, OID_AUTO, swd_panic_phase, CTLFLAG_RW, &swd_panic_phase, 0, ""); #endif -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) //****************************************************************************** // AOT @@ -1329,8 +1318,6 @@ static SYSCTL_PROC(_kern, OID_AUTO, aotmode, NULL, 0, sysctl_aotmode, "I", ""); //****************************************************************************** -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - static const OSSymbol * gIOPMSettingAutoWakeCalendarKey; static const OSSymbol * gIOPMSettingAutoWakeSecondsKey; @@ -1546,13 +1533,11 @@ IOPMrootDomain::start( IOService * nub ) &IOPMrootDomain::dispatchPowerEvent)); gIOPMWorkLoop->addEventSource(pmPowerStateQueue); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) _aotMode = 0; _aotTimerES = IOTimerEventSource::timerEventSource(this, OSMemberFunctionCast(IOTimerEventSource::Action, this, &IOPMrootDomain::aotEvaluate)); gIOPMWorkLoop->addEventSource(_aotTimerES); -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ // create our power parent patriarch = new IORootParent; @@ -1634,11 +1619,9 @@ IOPMrootDomain::start( IOService * nub ) sysctl_register_oid(&sysctl__kern_consoleoptions); sysctl_register_oid(&sysctl__kern_progressoptions); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) sysctl_register_oid(&sysctl__kern_aotmode); sysctl_register_oid(&sysctl__kern_aotmodebits); sysctl_register_oid(&sysctl__kern_aotmetrics); -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ #if HIBERNATION IOHibernateSystemInit(this); @@ -2647,10 +2630,8 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) unsigned long newState; clock_sec_t secs; clock_usec_t microsecs; -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) clock_sec_t adjWakeTime; IOPMCalendarStruct nowCalendar; -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ ASSERT_GATED(); newState = getPowerState(); @@ -2664,7 +2645,6 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) notifierThread = current_thread(); switch (getPowerState()) { case SLEEP_STATE: { -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (kPMCalendarTypeInvalid != _aotWakeTimeCalendar.selector) { secs = 0; microsecs = 0; @@ -2700,7 +2680,6 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) } } _aotPendingFlags &= ~kIOPMWakeEventAOTPerCycleFlags; -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ acceptSystemWakeEvents(true); // re-enable this timer for next sleep @@ -2710,13 +2689,9 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) logtime(secs); gIOLastSleepTime.tv_sec = secs; gIOLastSleepTime.tv_usec = microsecs; -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if 
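	/* A reading of the surrounding AOT changes, not a documented contract:
	 * the user-visible sleep time below is recorded only when no AOT wake
	 * occurred in this cycle. */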
(!_aotLastWakeTime) { gIOLastUserSleepTime = gIOLastSleepTime; } -#else - gIOLastUserSleepTime = gIOLastSleepTime; -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ gIOLastWakeTime.tv_sec = 0; gIOLastWakeTime.tv_usec = 0; @@ -2793,7 +2768,6 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) gIOLastWakeTime.tv_sec = secs; gIOLastWakeTime.tv_usec = microsecs; -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) // aot if (_aotWakeTimeCalendar.selector != kPMCalendarTypeInvalid) { _aotWakeTimeCalendar.selector = kPMCalendarTypeInvalid; @@ -2818,7 +2792,6 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) setWakeTime(_aotTestTime); } } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ #if HIBERNATION LOG("System %sWake\n", gIOHibernateState ? "SafeSleep " : ""); @@ -3356,7 +3329,6 @@ IOPMrootDomain::askChangeDownDone( *cancel = true; DLOG("cancel dark->sleep\n"); } -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (_aotMode && (kPMCalendarTypeInvalid != _aotWakeTimeCalendar.selector)) { uint64_t now = mach_continuous_time(); if (((now + _aotWakePreWindow) >= _aotWakeTimeContinuous) @@ -3365,7 +3337,6 @@ IOPMrootDomain::askChangeDownDone( IOLog("AOT wake window cancel: %qd, %qd\n", now, _aotWakeTimeContinuous); } } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } } @@ -3856,7 +3827,6 @@ IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ) #endif /* !CONFIG_EMBEDDED */ } -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) _aotReadyToFullWake = false; #if 0 if (_aotLingerTime) { @@ -3892,7 +3862,6 @@ IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ) clock_interval_to_absolutetime_interval(2000, kMillisecondScale, &_aotWakePreWindow); clock_interval_to_absolutetime_interval(1100, kMillisecondScale, &_aotWakePostWindow); } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ #if HIBERNATION IOHibernateSystemSleep(); @@ -5629,13 +5598,11 @@ IOPMrootDomain::overrideOurPowerChange( uint32_t changeFlags = *inOutChangeFlags; uint32_t currentPowerState = (uint32_t) getPowerState(); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if ((AOT_STATE == powerState) && (ON_STATE == currentPowerState)) { // Assertion may have been taken in AOT leading to changePowerStateTo(AOT) *inOutChangeFlags |= kIOPMNotDone; return; } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ if (changeFlags & kIOPMParentInitiated) { // Root parent is permanently pegged at max power, @@ -5901,7 +5868,6 @@ IOPMrootDomain::handleOurPowerChangeStart( _desiredCapability, _currentCapability, _pendingCapability); } -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if ((AOT_STATE == powerState) && (SLEEP_STATE != currentPowerState)) { panic("illegal AOT entry from %s", getPowerStateString(currentPowerState)); } @@ -5909,7 +5875,6 @@ IOPMrootDomain::handleOurPowerChangeStart( aotShouldExit(false, true); aotExit(false); } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } void @@ -7118,8 +7083,6 @@ IOPMConvertCalendarToSeconds(const IOPMCalendarStruct * dt) return secs; } -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) - unsigned long IOPMrootDomain::getRUN_STATE(void) { @@ -7271,22 +7234,6 @@ IOPMrootDomain::aotEvaluate(IOTimerEventSource * timer) } } -#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - -unsigned long -IOPMrootDomain::getRUN_STATE(void) -{ - return ON_STATE; -} - -IOReturn -IOPMrootDomain::setWakeTime(uint64_t 
wakeContinuousTime) -{ - return kIOReturnUnsupported; -} - -#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - //****************************************************************************** // adjustPowerState // @@ -7305,7 +7252,6 @@ IOPMrootDomain::adjustPowerState( bool sleepASAP ) ASSERT_GATED(); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (_aotNow) { bool exitNow; @@ -7335,7 +7281,6 @@ IOPMrootDomain::adjustPowerState( bool sleepASAP ) } return; } -#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ if ((!idleSleepEnabled) || !checkSystemSleepEnabled()) { changePowerStateToPriv(getRUN_STATE()); @@ -8253,7 +8198,6 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) unsigned int IOPMrootDomain::idleSleepPreventersCount() { -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (_aotMode) { unsigned int count __block; count = 0; @@ -8264,7 +8208,6 @@ IOPMrootDomain::idleSleepPreventersCount() }); return count; } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ return preventIdleSleepList->getCount(); } @@ -8438,14 +8381,10 @@ IOPMrootDomain::evaluateAssertions(IOPMDriverAssertionType newAssertions, IOPMDr } if (changedBits & kIOPMDriverAssertionCPUBit) { -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (_aotNow) { IOLog("CPU assertions %d\n", (0 != (kIOPMDriverAssertionCPUBit & newAssertions))); } evaluatePolicy(_aotNow ? kStimulusNoIdleSleepPreventers : kStimulusDarkWakeEvaluate); -#else - evaluatePolicy(kStimulusDarkWakeEvaluate); -#endif if (!assertOnWakeSecs && gIOLastWakeAbsTime) { AbsoluteTime now; clock_usec_t microsecs; @@ -9882,10 +9821,7 @@ IOPMrootDomain::acceptSystemWakeEvents( bool accept ) _systemWakeEventsArray = OSArray::withCapacity(4); } _acceptSystemWakeEvents = (_systemWakeEventsArray != NULL); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) - if (!(_aotNow && (kIOPMWakeEventAOTExitFlags & _aotPendingFlags))) -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - { + if (!(_aotNow && (kIOPMWakeEventAOTExitFlags & _aotPendingFlags))) { gWakeReasonString[0] = '\0'; if (_systemWakeEventsArray) { _systemWakeEventsArray->flushCollection(); @@ -9954,7 +9890,6 @@ IOPMrootDomain::claimSystemWakeEvent( return; } -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) IOOptionBits aotFlags = 0; bool needAOTEvaluate = FALSE; @@ -9978,7 +9913,6 @@ IOPMrootDomain::claimSystemWakeEvent( flags |= kIOPMWakeEventAOTPossibleExit; } #endif /* DEVELOPMENT || DEBUG */ -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ deviceName = device->copyName(gIOServicePlane); deviceRegId = OSNumber::withNumber(device->getRegistryEntryID(), 64); @@ -10001,7 +9935,6 @@ IOPMrootDomain::claimSystemWakeEvent( WAKEEVENT_LOCK(); addWakeReason = _acceptSystemWakeEvents; -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (_aotMode) { IOLog("claimSystemWakeEvent(%s, %s, 0x%x) 0x%x %d\n", reason, deviceName->getCStringNoCopy(), (int)flags, _aotPendingFlags, _aotReadyToFullWake); } @@ -10026,7 +9959,6 @@ IOPMrootDomain::claimSystemWakeEvent( addWakeReason = _aotNow && _systemWakeEventsArray && ((kIOPMWakeEventAOTExitFlags & aotFlags)); needAOTEvaluate = _aotReadyToFullWake; } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ if (!gWakeReasonSysctlRegistered) { // Lazy registration until the platform driver stops registering @@ -10045,11 +9977,9 @@ IOPMrootDomain::claimSystemWakeEvent( } WAKEEVENT_UNLOCK(); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (needAOTEvaluate) { 
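	// Runs after WAKEEVENT_UNLOCK() above; aotEvaluate() may take other
	// locks, so it is not invoked with the wake-event lock held.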
aotEvaluate(NULL); } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ done: if (deviceName) { diff --git a/iokit/Kernel/IORegistryEntry.cpp b/iokit/Kernel/IORegistryEntry.cpp index 45b3f42e6..456f93729 100644 --- a/iokit/Kernel/IORegistryEntry.cpp +++ b/iokit/Kernel/IORegistryEntry.cpp @@ -664,6 +664,25 @@ runPropertyAction(Action inAction, OSObject *target, return res; } +static IOReturn +IORegistryEntryActionToBlock(OSObject *target, + void *arg0, void *arg1, + void *arg2, void *arg3) +{ + IORegistryEntry::ActionBlock block = (typeof(block))arg0; + return block(); +} + +IOReturn +IORegistryEntry::runPropertyActionBlock(ActionBlock block) +{ + IOReturn res; + + res = runPropertyAction(&IORegistryEntryActionToBlock, this, block); + + return res; +} + OSObject * IORegistryEntry::getProperty( const OSString * aKey) const { diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index bd20598ef..a8387bf2a 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -134,6 +134,8 @@ const OSSymbol * gIOMatchedPersonalityKey; const OSSymbol * gIORematchPersonalityKey; const OSSymbol * gIORematchCountKey; const OSSymbol * gIODEXTMatchCountKey; +const OSSymbol * gIOSupportedPropertiesKey; +const OSSymbol * gIOUserServicePropertiesKey; #if !CONFIG_EMBEDDED const OSSymbol * gIOServiceLegacyMatchingRegistryIDKey; #endif @@ -429,6 +431,9 @@ IOService::initialize( void ) gIOInterruptSpecifiersKey = OSSymbol::withCStringNoCopy("IOInterruptSpecifiers"); + gIOSupportedPropertiesKey = OSSymbol::withCStringNoCopy(kIOSupportedPropertiesKey); + gIOUserServicePropertiesKey = OSSymbol::withCStringNoCopy(kIOUserServicePropertiesKey); + gIOMapperIDKey = OSSymbol::withCStringNoCopy(kIOMapperIDKey); gIOKitDebugKey = OSSymbol::withCStringNoCopy( kIOKitDebugKey ); diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index 5cdc56a78..5ef79da20 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -1222,7 +1222,6 @@ IOService::handleRegisterPowerDriver( IOPMRequest * request ) lowestPowerState = fPowerStates[0].stateOrderToIndex; fHighestPowerState = fPowerStates[numberOfStates - 1].stateOrderToIndex; -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) { uint32_t aotFlags; IOService * service; @@ -1254,7 +1253,6 @@ IOService::handleRegisterPowerDriver( IOPMRequest * request ) } } } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ // OR'in all the output power flags fMergedOutputPowerFlags = 0; @@ -1975,11 +1973,9 @@ IOService::handlePowerDomainDidChangeTo( IOPMRequest * request ) myChangeFlags = kIOPMParentInitiated | kIOPMDomainDidChange | (parentChangeFlags & kIOPMRootBroadcastFlags); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (kIOPMAOTPower & fPowerStates[maxPowerState].inputPowerFlags) { IOLog("aotPS %s0x%qx[%ld]\n", getName(), getRegistryEntryID(), maxPowerState); } -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ result = startPowerChange( /* flags */ myChangeFlags, @@ -2747,14 +2743,12 @@ IOService::computeDesiredState( unsigned long localClamp, bool computeOnly ) newPowerState = fHighestPowerState; } -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) if (getPMRootDomain()->isAOTMode()) { if ((kIOPMPreventIdleSleep & fPowerStates[newPowerState].capabilityFlags) && !(kIOPMPreventIdleSleep & fPowerStates[fDesiredPowerState].capabilityFlags)) { getPMRootDomain()->claimSystemWakeEvent(this, kIOPMWakeEventAOTExit, getName(), NULL); } } -#endif /* !(defined(RC_HIDE_N144) || 
defined(RC_HIDE_N146)) */ fDesiredPowerState = newPowerState; @@ -5998,11 +5992,9 @@ IOService::pmTellAppWithResponse( OSObject * object, void * arg ) proc_suspended = get_task_pidsuspended((task_t) proc->task); if (proc_suspended) { logClientIDForNotification(object, context, "PMTellAppWithResponse - Suspended"); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) } else if (getPMRootDomain()->isAOTMode() && get_task_suspended((task_t) proc->task)) { proc_suspended = true; context->skippedInDark++; -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } proc_rele(proc); if (proc_suspended) { @@ -6213,11 +6205,7 @@ IOService::pmTellCapabilityAppWithResponse( OSObject * object, void * arg ) } if (context->us == getPMRootDomain() && -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) getPMRootDomain()->isAOTMode() -#else /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ - false -#endif /* (defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ ) { OSNumber *clientID = NULL; boolean_t proc_suspended = FALSE; @@ -6605,11 +6593,9 @@ tellAppClientApplier( OSObject * object, void * arg ) proc_suspended = get_task_pidsuspended((task_t) proc->task); if (proc_suspended) { logClientIDForNotification(object, context, "tellAppClientApplier - Suspended"); -#if !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) } else if (IOService::getPMRootDomain()->isAOTMode() && get_task_suspended((task_t) proc->task)) { proc_suspended = true; context->skippedInDark++; -#endif /* !(defined(RC_HIDE_N144) || defined(RC_HIDE_N146)) */ } proc_rele(proc); if (proc_suspended) { diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index 9b3cef8ce..e98f41f36 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -548,7 +548,31 @@ extern "C" { // functions called from osfmk/device/iokit_rpc.c void -iokit_add_reference( io_object_t obj, ipc_kobject_type_t type ) +iokit_port_object_description(io_object_t obj, kobject_description_t desc) +{ + IORegistryEntry * regEntry; + IOUserNotification * __unused noti; + _IOServiceNotifier * __unused serviceNoti; + OSSerialize * __unused s; + + if ((regEntry = OSDynamicCast(IORegistryEntry, obj))) { + snprintf(desc, KOBJECT_DESCRIPTION_LENGTH, "%s(0x%qx)", obj->getMetaClass()->getClassName(), regEntry->getRegistryEntryID()); +#if DEVELOPMENT || DEBUG + } else if ((noti = OSDynamicCast(IOUserNotification, obj)) + && ((serviceNoti = OSDynamicCast(_IOServiceNotifier, noti->holdNotify)))) { + s = OSSerialize::withCapacity(page_size); + if (s && serviceNoti->matching->serialize(s)) { + snprintf(desc, KOBJECT_DESCRIPTION_LENGTH, "%s(%s)", obj->getMetaClass()->getClassName(), s->text()); + } + OSSafeReleaseNULL(s); +#endif /* DEVELOPMENT || DEBUG */ + } else { + snprintf(desc, KOBJECT_DESCRIPTION_LENGTH, "%s", obj->getMetaClass()->getClassName()); + } +} + +void +iokit_add_reference( io_object_t obj, natural_t type ) { IOUserClient * uc; @@ -5645,28 +5669,7 @@ is_io_catalog_terminate( switch (flag) { #if !defined(SECURE_KERNEL) case kIOCatalogServiceTerminate: - OSIterator * iter; - IOService * service; - - iter = IORegistryIterator::iterateOver(gIOServicePlane, - kIORegistryIterateRecursively); - if (!iter) { - return kIOReturnNoMemory; - } - - do { - iter->reset(); - while ((service = (IOService *)iter->getNextObject())) { - if (service->metaCast(name)) { - if (!service->terminate( kIOServiceRequired - | kIOServiceSynchronous)) { - kr = kIOReturnUnsupported; - break; - } - } - } - } while (!service && !iter->isValid()); - 
iter->release(); + kr = gIOCatalogue->terminateDrivers(NULL, name); break; case kIOCatalogModuleUnload: diff --git a/iokit/Kernel/IOUserServer.cpp b/iokit/Kernel/IOUserServer.cpp index a436a8cb2..52508a761 100644 --- a/iokit/Kernel/IOUserServer.cpp +++ b/iokit/Kernel/IOUserServer.cpp @@ -60,6 +60,7 @@ #include #include #include +#include #include /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -104,8 +105,6 @@ class IOUserService : public IOService virtual bool start(IOService * provider) APPLE_KEXT_OVERRIDE; - virtual IOReturn - setProperties(OSObject * props) APPLE_KEXT_OVERRIDE; }; OSDefineMetaClassAndStructors(IOUserService, IOService) @@ -147,13 +146,6 @@ IOUserService::start(IOService * provider) return ok; } -IOReturn -IOUserService::setProperties(OSObject * properties) -{ - setProperty("USER", properties); - return kIOReturnSuccess; -} - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #undef super @@ -285,10 +277,41 @@ IMPL(IOService, SetDispatchQueue) kern_return_t IMPL(IOService, SetProperties) { - IOReturn ret = kIOReturnUnsupported; + IOUserServer * us; + OSDictionary * dict; + IOReturn ret; ret = setProperties(properties); + if (kIOReturnUnsupported == ret) { + dict = OSDynamicCast(OSDictionary, properties); + us = (typeof(us))thread_iokit_tls_get(0); + if (dict && reserved->uvars && (reserved->uvars->userServer == us)) { + ret = runPropertyActionBlock(^IOReturn (void) { + OSDictionary * userProps; + IOReturn ret; + + userProps = OSDynamicCast(OSDictionary, getProperty(gIOUserServicePropertiesKey)); + if (userProps) { + userProps = (typeof(userProps))userProps->copyCollection(); + } else { + userProps = OSDictionary::withCapacity(4); + } + if (!userProps) { + ret = kIOReturnNoMemory; + } else { + bool ok = userProps->merge(dict); + if (ok) { + ok = setProperty(gIOUserServicePropertiesKey, userProps); + } + OSSafeReleaseNULL(userProps); + ret = ok ? 
kIOReturnSuccess : kIOReturnNotWritable; + } + return ret; + }); + } + } + return ret; } @@ -585,6 +608,12 @@ IODispatchSource::free() super::free(); } +kern_return_t +IMPL(IODispatchSource, SetEnable) +{ + return SetEnableWithCompletion(enable, NULL); +} + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ struct IOInterruptDispatchSource_IVars { @@ -662,6 +691,10 @@ IOInterruptDispatchSource::free() assert(kIOReturnSuccess == ret); } + if (ivars && ivars->lock) { + IOSimpleLockFree(ivars->lock); + } + IOSafeDeleteNULL(ivars, IOInterruptDispatchSource_IVars, 1); super::free(); @@ -711,9 +744,9 @@ IMPL(IOInterruptDispatchSource, SetEnableWithCompletion) } kern_return_t -IMPL(IODispatchSource, SetEnable) +IMPL(IOInterruptDispatchSource, Cancel) { - return SetEnableWithCompletion(enable, NULL); + return kIOReturnUnsupported; } kern_return_t @@ -762,6 +795,266 @@ IMPL(IOInterruptDispatchSource, InterruptOccurred) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +enum { + kIOServiceNotificationTypeCount = kIOServiceNotificationTypeLast + 1, +}; + +struct IOServiceNotificationDispatchSource_IVars { + OSObject * serverName; + OSAction * action; + IOLock * lock; + IONotifier * notifier; + OSDictionary * interestNotifiers; + OSArray * pending[kIOServiceNotificationTypeCount]; + bool enable; +}; + +kern_return_t +IMPL(IOServiceNotificationDispatchSource, Create) +{ + IOUserServer * us; + IOReturn ret; + IOServiceNotificationDispatchSource * inst; + + inst = OSTypeAlloc(IOServiceNotificationDispatchSource); + if (!inst->init()) { + OSSafeReleaseNULL(inst); + return kIOReturnNoMemory; + } + + us = (typeof(us))thread_iokit_tls_get(0); + assert(OSDynamicCast(IOUserServer, us)); + if (!us) { + OSSafeReleaseNULL(inst); + return kIOReturnError; + } + inst->ivars->serverName = us->copyProperty(gIOUserServerNameKey); + if (!inst->ivars->serverName) { + OSSafeReleaseNULL(inst); + return kIOReturnNoMemory; + } + + inst->ivars->lock = IOLockAlloc(); + if (!inst->ivars->lock) { + OSSafeReleaseNULL(inst); + return kIOReturnNoMemory; + } + for (uint32_t idx = 0; idx < kIOServiceNotificationTypeCount; idx++) { + inst->ivars->pending[idx] = OSArray::withCapacity(4); + if (!inst->ivars->pending[idx]) { + OSSafeReleaseNULL(inst); + return kIOReturnNoMemory; + } + } + inst->ivars->interestNotifiers = OSDictionary::withCapacity(4); + if (!inst->ivars->interestNotifiers) { + OSSafeReleaseNULL(inst); + return kIOReturnNoMemory; + } + + inst->ivars->notifier = IOService::addMatchingNotification(gIOMatchedNotification, matching, 0 /*priority*/, + ^bool (IOService * newService, IONotifier * notifier) { + bool notifyReady = false; + IONotifier * interest; + OSObject * serverName; + bool okToUse; + + serverName = newService->copyProperty(gIOUserServerNameKey); + okToUse = (serverName && inst->ivars->serverName->isEqualTo(serverName)); + OSSafeReleaseNULL(serverName); + if (!okToUse) { + return false; + } + + IOLockLock(inst->ivars->lock); + notifyReady = (0 == inst->ivars->pending[kIOServiceNotificationTypeMatched]->getCount()); + inst->ivars->pending[kIOServiceNotificationTypeMatched]->setObject(newService); + IOLockUnlock(inst->ivars->lock); + + interest = newService->registerInterest(gIOGeneralInterest, + ^IOReturn (uint32_t messageType, IOService * provider, + void * messageArgument, size_t argSize) { + IONotifier * interest; + bool notifyReady = false; + + switch (messageType) { + 
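+			// Only termination matters to this source; every other interest
+			// message falls through to the default arm below and is ignored.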
case kIOMessageServiceIsTerminated: + IOLockLock(inst->ivars->lock); + notifyReady = (0 == inst->ivars->pending[kIOServiceNotificationTypeTerminated]->getCount()); + inst->ivars->pending[kIOServiceNotificationTypeTerminated]->setObject(provider); + interest = (typeof(interest))inst->ivars->interestNotifiers->getObject((const OSSymbol *) newService); + assert(interest); + interest->remove(); + inst->ivars->interestNotifiers->removeObject((const OSSymbol *) newService); + IOLockUnlock(inst->ivars->lock); + break; + default: + break; + } + if (notifyReady && inst->ivars->action) { + inst->ServiceNotificationReady(inst->ivars->action); + } + return kIOReturnSuccess; + }); + if (interest) { + IOLockLock(inst->ivars->lock); + inst->ivars->interestNotifiers->setObject((const OSSymbol *) newService, interest); + IOLockUnlock(inst->ivars->lock); + } + if (notifyReady) { + if (inst->ivars->action) { + inst->ServiceNotificationReady(inst->ivars->action); + } + } + return false; + }); + + if (!inst->ivars->notifier) { + OSSafeReleaseNULL(inst); + ret = kIOReturnError; + } + + *notification = inst; + ret = kIOReturnSuccess; + + return ret; +} + +kern_return_t +IMPL(IOServiceNotificationDispatchSource, CopyNextNotification) +{ + IOService * next; + uint32_t idx; + + IOLockLock(ivars->lock); + for (idx = 0; idx < kIOServiceNotificationTypeCount; idx++) { + next = (IOService *) ivars->pending[idx]->getObject(0); + if (next) { + next->retain(); + ivars->pending[idx]->removeObject(0); + break; + } + } + IOLockUnlock(ivars->lock); + + if (idx == kIOServiceNotificationTypeCount) { + idx = kIOServiceNotificationTypeNone; + } + *type = idx; + *service = next; + *options = 0; + + return kIOReturnSuccess; +} + +bool +IOServiceNotificationDispatchSource::init() +{ + if (!super::init()) { + return false; + } + ivars = IONewZero(IOServiceNotificationDispatchSource_IVars, 1); + if (!ivars) { + return false; + } + + return true; +} + +void +IOServiceNotificationDispatchSource::free() +{ + if (ivars) { + OSSafeReleaseNULL(ivars->serverName); + if (ivars->interestNotifiers) { + ivars->interestNotifiers->iterateObjects(^bool (const OSSymbol * key, OSObject * object) { + IONotifier * interest = (typeof(interest))object; + interest->remove(); + return false; + }); + OSSafeReleaseNULL(ivars->interestNotifiers); + } + for (uint32_t idx = 0; idx < kIOServiceNotificationTypeCount; idx++) { + OSSafeReleaseNULL(ivars->pending[idx]); + } + if (ivars->lock) { + IOLockFree(ivars->lock); + ivars->lock = NULL; + } + if (ivars->notifier) { + ivars->notifier->remove(); + ivars->notifier = NULL; + } + IOSafeDeleteNULL(ivars, IOServiceNotificationDispatchSource_IVars, 1); + } + + super::free(); +} + +kern_return_t +IMPL(IOServiceNotificationDispatchSource, SetHandler) +{ + IOReturn ret; + bool notifyReady; + + notifyReady = false; + + IOLockLock(ivars->lock); + OSSafeReleaseNULL(ivars->action); + action->retain(); + ivars->action = action; + if (action) { + for (uint32_t idx = 0; idx < kIOServiceNotificationTypeCount; idx++) { + notifyReady = (ivars->pending[idx]->getCount()); + if (notifyReady) { + break; + } + } + } + IOLockUnlock(ivars->lock); + + if (notifyReady) { + ServiceNotificationReady(action); + } + ret = kIOReturnSuccess; + + return ret; +} + +kern_return_t +IMPL(IOServiceNotificationDispatchSource, SetEnableWithCompletion) +{ + if (enable == ivars->enable) { + return kIOReturnSuccess; + } + + IOLockLock(ivars->lock); + ivars->enable = enable; + IOLockUnlock(ivars->lock); + + return kIOReturnSuccess; +} + +kern_return_t 
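+// The remaining kernel-side entry points are stubs: Cancel() reports
+// kIOReturnUnsupported and CheckForWork() reports kIOReturnNotReady, and
+// DeliverNotifications(), being LOCALONLY in the .iig, presumably runs in the
+// user-space DriverKit runtime, so its kernel stub is unsupported as well.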
+IMPL(IOServiceNotificationDispatchSource, Cancel) +{ + return kIOReturnUnsupported; +} + +kern_return_t +IMPL(IOServiceNotificationDispatchSource, CheckForWork) +{ + return kIOReturnNotReady; +} + +kern_return_t +IOServiceNotificationDispatchSource::DeliverNotifications(IOServiceNotificationBlock block) +{ + return kIOReturnUnsupported; +} + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + kern_return_t IOUserServer::waitInterruptTrap(void * p1, void * p2, void * p3, void * p4, void * p5, void * p6) { @@ -907,6 +1200,10 @@ IMPL(IODispatchQueue, Create) kern_return_t IMPL(IODispatchQueue, SetPort) { + if (MACH_PORT_NULL != ivars->serverPort) { + return kIOReturnNotReady; + } + ivars->serverPort = port; return kIOReturnSuccess; } @@ -926,6 +1223,10 @@ IODispatchQueue::init() void IODispatchQueue::free() { + if (ivars && ivars->serverPort) { + ipc_port_release_send(ivars->serverPort); + ivars->serverPort = MACH_PORT_NULL; + } IOSafeDeleteNULL(ivars, IODispatchQueue_IVars, 1); super::free(); } @@ -1007,7 +1308,11 @@ struct IOPStrings { kern_return_t OSUserMetaClass::Dispatch(IORPC rpc) { - return const_cast(meta)->Dispatch(rpc); + if (meta) { + return const_cast(meta)->Dispatch(rpc); + } else { + return kIOReturnUnsupported; + } } void @@ -1290,7 +1595,7 @@ IOUserServer::copyInStringArray(const char * string, uint32_t userSize) array->count = 0; cstr = &array->strings[0]; end = &array->strings[array->dataSize]; - while ((len = cstr[0])) { + while ((len = (unsigned char)cstr[0])) { cstr++; if ((cstr + len) >= end) { break; @@ -1318,7 +1623,7 @@ IOUserServer::stringArrayIndex(IOPStrings * array, const char * look) cstr = &array->strings[0]; end = &array->strings[array->dataSize]; llen = strlen(look); - while ((len = cstr[0])) { + while ((len = (unsigned char)cstr[0])) { cstr++; if ((cstr + len) >= end) { break; @@ -1433,12 +1738,15 @@ IOUserServer::objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * messag resultFlags |= kOSObjectRPCRemote; } if (service->reserved->uvars && service->reserved->uvars->userServer) { + IOLockLock(service->reserved->uvars->userServer->fLock); userMeta = (typeof(userMeta))service->reserved->uvars->userServer->fClasses->getObject(str); + IOLockUnlock(service->reserved->uvars->userServer->fLock); } } if (!str && !userMeta) { const OSMetaClass * meta; meta = obj->getMetaClass(); + IOLockLock(fLock); while (meta && !userMeta) { str = (OSString *) meta->getClassNameSymbol(); userMeta = (typeof(userMeta))fClasses->getObject(str); @@ -1446,10 +1754,13 @@ IOUserServer::objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * messag meta = meta->getSuperClass(); } } + IOLockUnlock(fLock); } if (str) { if (!userMeta) { + IOLockLock(fLock); userMeta = (typeof(userMeta))fClasses->getObject(str); + IOLockUnlock(fLock); } if (kIODKLogSetup & gIODKDebug) { DKLOG("userMeta %s %p\n", str->getCStringNoCopy(), userMeta); @@ -1495,7 +1806,7 @@ IOUserServer::objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * messag idx = 0; sendPort = NULL; if (queue && (kIODispatchQueueStopped != queue)) { - sendPort = ipc_port_make_send(queue->ivars->serverPort); + sendPort = ipc_port_copy_send(queue->ivars->serverPort); } replySize = sizeof(OSObject_Instantiate_Rpl) + queueCount * sizeof(machReply->objects[0]) @@ -1530,7 +1841,7 @@ IOUserServer::objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * messag queue = uvars->queueArray[idx]; sendPort = NULL; if (queue) { - sendPort = 
ipc_port_make_send(queue->ivars->serverPort); + sendPort = ipc_port_copy_send(queue->ivars->serverPort); } machReply->objects[idx].type = MACH_MSG_PORT_DESCRIPTOR; machReply->objects[idx].disposition = MACH_MSG_TYPE_MOVE_SEND; @@ -1692,6 +2003,9 @@ IOUserServer::server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * pReply) if (!message) { return kIOReturnIPCError; } + if (message->objectRefs == 0) { + return kIOReturnIPCError; + } ret = copyInObjects(msgin, message, msgin->msgh.msgh_size, true, false); if (kIOReturnSuccess != ret) { if (kIODKLogIPC & gIODKDebug) { @@ -1737,7 +2051,7 @@ IOUserServer::server(ipc_kmsg_t requestkmsg, ipc_kmsg_t * pReply) bzero((void *)msgout, replyAlloc); } - IORPC rpc = { .message = msgin, .sendSize = msgin->msgh.msgh_size, .reply = msgout, .replySize = replyAlloc }; + IORPC rpc = { .message = msgin, .reply = msgout, .sendSize = msgin->msgh.msgh_size, .replySize = replyAlloc }; if (object) { thread_iokit_tls_set(0, this); @@ -1978,7 +2292,7 @@ IOUserServer::rpc(IORPC rpc) port = queue->ivars->serverPort; } if (port) { - sendPort = ipc_port_make_send(port); + sendPort = ipc_port_copy_send(port); } IOLockUnlock(gIOUserServerLock); if (!sendPort) { @@ -1990,46 +2304,63 @@ IOUserServer::rpc(IORPC rpc) ret = copyOutObjects(mach, message, sendSize, false); mach->msgh.msgh_bits = MACH_MSGH_BITS_COMPLEX | - MACH_MSGH_BITS(MACH_MSG_TYPE_MOVE_SEND, (oneway ? 0 : MACH_MSG_TYPE_MAKE_SEND_ONCE)); + MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, (oneway ? 0 : MACH_MSG_TYPE_MAKE_SEND_ONCE)); mach->msgh.msgh_remote_port = sendPort; mach->msgh.msgh_local_port = (oneway ? MACH_PORT_NULL : mig_get_reply_port()); mach->msgh.msgh_id = kIORPCVersionCurrent; mach->msgh.msgh_reserved = 0; + boolean_t message_moved; + if (oneway) { - ret = mach_msg_send_from_kernel(&mach->msgh, sendSize); + ret = kernel_mach_msg_send(&mach->msgh, sendSize, + MACH_SEND_MSG | MACH_SEND_ALWAYS | MACH_SEND_NOIMPORTANCE, + 0, &message_moved); } else { assert(replySize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))); - ret = mach_msg_rpc_from_kernel(&mach->msgh, sendSize, replySize); - if (KERN_SUCCESS == ret) { - if (kIORPCVersionCurrentReply != mach->msgh.msgh_id) { - ret = (MACH_NOTIFY_SEND_ONCE == mach->msgh.msgh_id) ? MIG_SERVER_DIED : MIG_REPLY_MISMATCH; - } else if ((replySize = mach->msgh.msgh_size) < (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))) { + ret = kernel_mach_msg_rpc(&mach->msgh, sendSize, replySize, FALSE, &message_moved); + } + + ipc_port_release_send(sendPort); + + if (MACH_MSG_SUCCESS != ret) { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("mach_msg() failed 0x%x\n", ret); + } + if (!message_moved) { + // release ports + copyInObjects(mach, message, sendSize, false, true); + } + } + + if ((KERN_SUCCESS == ret) && !oneway) { + if (kIORPCVersionCurrentReply != mach->msgh.msgh_id) { + ret = (MACH_NOTIFY_SEND_ONCE == mach->msgh.msgh_id) ? 
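+			                               // a send-once notification arriving in
+			                               // the reply slot means the server port
+			                               // died before it could reply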
MIG_SERVER_DIED : MIG_REPLY_MISMATCH; + } else if ((replySize = mach->msgh.msgh_size) < (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))) { // printf("BAD REPLY SIZE\n"); + ret = MIG_BAD_ARGUMENTS; + } else { + if (!(MACH_MSGH_BITS_COMPLEX & mach->msgh.msgh_bits)) { + mach->msgh_body.msgh_descriptor_count = 0; + } + message = IORPCMessageFromMach(mach, true); + if (!message) { + ret = kIOReturnIPCError; + } else if (message->msgid != msgid) { +// printf("BAD REPLY ID\n"); ret = MIG_BAD_ARGUMENTS; } else { - if (!(MACH_MSGH_BITS_COMPLEX & mach->msgh.msgh_bits)) { - mach->msgh_body.msgh_descriptor_count = 0; - } - message = IORPCMessageFromMach(mach, true); - if (!message) { - ret = kIOReturnIPCError; - } else if (message->msgid != msgid) { -// printf("BAD REPLY ID\n"); - ret = MIG_BAD_ARGUMENTS; - } else { - bool isError = (0 != (kIORPCMessageError & message->flags)); - ret = copyInObjects(mach, message, replySize, !isError, true); - if (kIOReturnSuccess != ret) { - if (kIODKLogIPC & gIODKDebug) { - DKLOG("rpc copyin(0x%x) %x\n", ret, mach->msgh.msgh_id); - } - return KERN_NOT_SUPPORTED; - } - if (isError) { - IORPCMessageErrorReturnContent * errorMsg = (typeof(errorMsg))message; - ret = errorMsg->result; + bool isError = (0 != (kIORPCMessageError & message->flags)); + ret = copyInObjects(mach, message, replySize, !isError, true); + if (kIOReturnSuccess != ret) { + if (kIODKLogIPC & gIODKDebug) { + DKLOG("rpc copyin(0x%x) %x\n", ret, mach->msgh.msgh_id); } + return KERN_NOT_SUPPORTED; + } + if (isError) { + IORPCMessageErrorReturnContent * errorMsg = (typeof(errorMsg))message; + ret = errorMsg->result; } } } @@ -2471,6 +2802,9 @@ IOUserClient * IOUserServer::withTask(task_t owningTask) } } + /* Mark the current task's space as eligible for uext object ports */ + iokit_label_dext_task(inst->fOwningTask); + inst->fLock = IOLockAlloc(); inst->fServices = OSArray::withCapacity(4); inst->fClasses = OSDictionary::withCapacity(16); @@ -2622,12 +2956,21 @@ IOUserServer::registerClass(OSClassDescription * desc, uint32_t size, OSUserMeta cls->name = sym; cls->meta = OSMetaClass::copyMetaClassWithName(sym); + IOLockLock(fLock); cls->superMeta = OSDynamicCast(OSUserMetaClass, fClasses->getObject(desc->superName)); - fClasses->setObject(sym, cls); + if (fClasses->getObject(sym) != NULL) { + /* class with this name exists */ + ret = kIOReturnBadArgument; + } else { + if (fClasses->setObject(sym, cls)) { + *pCls = cls; + } else { + /* could not add class to fClasses */ + ret = kIOReturnNoMemory; + } + } + IOLockUnlock(fLock); cls->release(); - - *pCls = cls; - return ret; } @@ -2807,13 +3150,6 @@ IOUserServer::serviceNewUserClient(IOService * service, task_t owningTask, void } } - ret = userUC->Start(service); - if (kIOReturnSuccess != ret) { - userUC->detach(this); - userUC->release(); - return ret; - } - *handler = userUC; return ret; @@ -2971,6 +3307,7 @@ IMPL(IOService, Create) service = OSDynamicCast(IOService, inst); if (service && service->init(properties) && service->attach(this)) { reserved->uvars->userServer->serviceAttach(service, this); + service->reserved->uvars->started = true; ret = kIOReturnSuccess; *result = service; } @@ -2986,6 +3323,25 @@ IMPL(IOService, Create) return ret; } +kern_return_t +IMPL(IOService, Terminate) +{ + IOUserServer * us; + + if (options) { + return kIOReturnUnsupported; + } + + us = (typeof(us))thread_iokit_tls_get(0); + if (!reserved->uvars + || (reserved->uvars->userServer != us)) { + return kIOReturnNotPermitted; + } + 
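+	// Only the owning user server reaches this point (checked above).
+	// kIOServiceTerminateNeedWillTerminate requests the willTerminate/
+	// didTerminate phases, so the dext observes its own teardown.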
terminate(kIOServiceTerminateNeedWillTerminate); + + return kIOReturnSuccess; +} + kern_return_t IMPL(IOService, NewUserClient) { @@ -3009,6 +3365,80 @@ IMPL(IOService, SearchProperty) return object ? kIOReturnSuccess : kIOReturnNotFound; } +kern_return_t +IMPL(IOService, CopyProviderProperties) +{ + IOReturn ret; + OSArray * result; + IOService * provider; + + result = OSArray::withCapacity(8); + if (!result) { + return kIOReturnNoMemory; + } + + ret = kIOReturnSuccess; + for (provider = this; provider; provider = provider->getProvider()) { + OSObject * obj; + OSDictionary * props; + + obj = provider->copyProperty(gIOSupportedPropertiesKey); + props = OSDynamicCast(OSDictionary, obj); + if (!props) { + OSSafeReleaseNULL(obj); + props = provider->dictionaryWithProperties(); + } + if (!props) { + ret = kIOReturnNoMemory; + break; + } + bool __block addClass = true; + if (propertyKeys) { + OSDictionary * retProps; + retProps = OSDictionary::withCapacity(4); + addClass = false; + if (!retProps) { + ret = kIOReturnNoMemory; + break; + } + propertyKeys->iterateObjects(^bool (OSObject * _key) { + OSString * key = OSDynamicCast(OSString, _key); + if (gIOClassKey->isEqualTo(key)) { + addClass = true; + return false; + } + retProps->setObject(key, props->getObject(key)); + return false; + }); + OSSafeReleaseNULL(props); + props = retProps; + } + if (addClass) { + OSArray * classes = OSArray::withCapacity(8); + if (!classes) { + ret = kIOReturnNoMemory; + break; + } + for (const OSMetaClass * meta = provider->getMetaClass(); meta; meta = meta->getSuperClass()) { + classes->setObject(meta->getClassNameSymbol()); + } + props->setObject(gIOClassKey, classes); + OSSafeReleaseNULL(classes); + } + bool ok = result->setObject(props); + props->release(); + if (!ok) { + ret = kIOReturnNoMemory; + break; + } + } + if (kIOReturnSuccess != ret) { + OSSafeReleaseNULL(result); + } + *properties = result; + return ret; +} + void IOUserServer::systemPower(bool powerOff) { @@ -3182,8 +3612,6 @@ IOUserServer::serviceStop(IOService * service, IOService *) return kIOReturnSuccess; } - IOMachPortDestroyUserReferences(service, IKOT_UEXT_OBJECT); - if (uvars->queueArray && uvars->userMeta) { queueAlloc = 1; if (uvars->userMeta->queueNames) { @@ -3303,12 +3731,6 @@ IMPL(IOService, Stop) return kIOReturnSuccess; } -kern_return_t -IMPL(IOInterruptDispatchSource, Cancel) -{ - return kIOReturnUnsupported; -} - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #undef super @@ -3340,7 +3762,7 @@ IOUserUserClient::stop(IOService * provider) IOReturn IOUserUserClient::clientClose(void) { - terminate(); + terminate(kIOServiceTerminateNeedWillTerminate); return kIOReturnSuccess; } diff --git a/iokit/conf/files b/iokit/conf/files index 9b2578710..77b47578c 100644 --- a/iokit/conf/files +++ b/iokit/conf/files @@ -21,6 +21,7 @@ OPTIONS/mach_assert optional mach_assert ./DriverKit/IODispatchQueue.iig.cpp optional iokitcpp ./DriverKit/IOInterruptDispatchSource.iig.cpp optional iokitcpp ./DriverKit/IODataQueueDispatchSource.iig.cpp optional iokitcpp +./DriverKit/IOServiceNotificationDispatchSource.iig.cpp optional iokitcpp ./DriverKit/IOUserServer.iig.cpp optional iokitcpp # libIOKit diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index 0d564de95..6015d4683 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -1391,7 +1391,6 @@ OSMetaClass::getSuperClass() const } /********************************************************************* -* xxx - I want to 
rename this :-/ *********************************************************************/ const OSSymbol * OSMetaClass::getKmodName() const @@ -1403,6 +1402,14 @@ OSMetaClass::getKmodName() const return OSSymbol::withCStringNoCopy("unknown"); } +/********************************************************************* +*********************************************************************/ +OSKext * +OSMetaClass::getKext() const +{ + return reserved ? reserved->kext : NULL; +} + /********************************************************************* *********************************************************************/ unsigned int diff --git a/libkern/libkern/OSKextLib.h b/libkern/libkern/OSKextLib.h index b9011d072..f602dfd7d 100644 --- a/libkern/libkern/OSKextLib.h +++ b/libkern/libkern/OSKextLib.h @@ -335,6 +335,15 @@ __BEGIN_DECLS */ #define kOSBundleAllowUserLoadKey "OSBundleAllowUserLoad" +/*! + * @define kOSBundleAllowUserTerminateKey + * @abstract A boolean value indicating whether the kextunload tool + * is allowed to issue IOService terminate to classes defined in this kext. + * @discussion A boolean value indicating whether the kextunload tool + * is allowed to issue IOService terminate to classes defined in this kext. + */ +#define kOSBundleAllowUserTerminateKey "OSBundleAllowUserTerminate" + /*! * @define kOSKernelResourceKey * @abstract A boolean value indicating whether the kext represents a built-in diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index 03da0e6c2..b9688fd51 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -136,6 +136,7 @@ class OSSerialize; #ifdef XNU_KERNEL_PRIVATE class OSOrderedSet; class OSCollection; +class OSKext; #endif /* XNU_KERNEL_PRIVATE */ struct IORPC; class OSInterface @@ -1729,6 +1730,7 @@ public: virtual OSObject * alloc() const = 0; #ifdef XNU_KERNEL_PRIVATE + OSKext * getKext() const; void addInstance(const OSObject * instance, bool super = false) const; void removeInstance(const OSObject * instance, bool super = false) const; void applyToInstances(OSMetaClassInstanceApplierFunction applier, diff --git a/libsyscall/mach/mach_port.c b/libsyscall/mach/mach_port.c index 57d675094..62585338d 100644 --- a/libsyscall/mach/mach_port.c +++ b/libsyscall/mach/mach_port.c @@ -638,6 +638,21 @@ mach_port_kobject( return rv; } +kern_return_t +mach_port_kobject_description( + ipc_space_t task, + mach_port_name_t name, + natural_t *object_type, + mach_vm_address_t *object_addr, + kobject_description_t desc) +{ + kern_return_t rv; + + rv = _kernelrpc_mach_port_kobject_description(task, name, object_type, object_addr, desc); + + return rv; +} + kern_return_t mach_port_construct( ipc_space_t task, diff --git a/libsyscall/wrappers/_libc_funcptr.c b/libsyscall/wrappers/_libc_funcptr.c index 30a31c68c..8ebcc87c8 100644 --- a/libsyscall/wrappers/_libc_funcptr.c +++ b/libsyscall/wrappers/_libc_funcptr.c @@ -87,6 +87,18 @@ _pthread_clear_qos_tsd(mach_port_t thread_port) } } +__attribute__((visibility("hidden"))) +int +pthread_current_stack_contains_np(const void *addr, size_t len) +{ + if (_libkernel_functions->version >= 4 && + _libkernel_functions->pthread_current_stack_contains_np) { + return _libkernel_functions->pthread_current_stack_contains_np(addr, len); + } + + return 0; +} + /* * Upcalls to optimized libplatform string functions */ diff --git a/libsyscall/wrappers/_libkernel_init.h b/libsyscall/wrappers/_libkernel_init.h index 514afef02..42aba7be4 100644 --- 
a/libsyscall/wrappers/_libkernel_init.h +++ b/libsyscall/wrappers/_libkernel_init.h @@ -64,6 +64,9 @@ typedef const struct _libkernel_functions { /* The following functions are included in version 3 of this structure */ void (*pthread_clear_qos_tsd)(mach_port_t); + /* The following functions are included in version 4 of this structure */ + int (*pthread_current_stack_contains_np)(const void *, size_t); + /* Subsequent versions must only add pointers! */ } *_libkernel_functions_t; diff --git a/libsyscall/wrappers/spawn/posix_spawn.c b/libsyscall/wrappers/spawn/posix_spawn.c index ae5585b04..73dfc1a3e 100644 --- a/libsyscall/wrappers/spawn/posix_spawn.c +++ b/libsyscall/wrappers/spawn/posix_spawn.c @@ -943,6 +943,27 @@ posix_spawnattr_setspecialport_np( return posix_spawn_appendportaction_np(attr, &action); } +/* + * posix_spawnattr_setsuidcredport_np + * + * Description: Set an suid cred port to be used to execute with a different UID. + * + * Parameters: attr The spawn attributes object for the + * new process + * port The suid cred port + * + * Returns: 0 Success + */ +int +posix_spawnattr_setsuidcredport_np(posix_spawnattr_t *attr, mach_port_t port) +{ + _ps_port_action_t action = { + .port_type = PSPA_SUID_CRED, + .new_port = port, + }; + return posix_spawn_appendportaction_np(attr, &action); +} + /* * posix_spawnattr_setexceptionports_np * diff --git a/libsyscall/wrappers/spawn/spawn.h b/libsyscall/wrappers/spawn/spawn.h index 1b83c9d96..1bc1171fd 100644 --- a/libsyscall/wrappers/spawn/spawn.h +++ b/libsyscall/wrappers/spawn/spawn.h @@ -148,6 +148,8 @@ int posix_spawnattr_setexceptionports_np(posix_spawnattr_t * __restrict, int posix_spawnattr_setspecialport_np(posix_spawnattr_t * __restrict, mach_port_t, int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); +int posix_spawnattr_setsuidcredport_np(posix_spawnattr_t * __restrict, mach_port_t) __SPI_AVAILABLE(ios(13.0), macos(10.15)); + int posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *, int) __API_AVAILABLE(macos(10.7), ios(4.3)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); diff --git a/libsyscall/wrappers/terminate_with_reason.c b/libsyscall/wrappers/terminate_with_reason.c index dd7719b47..9b46ed940 100644 --- a/libsyscall/wrappers/terminate_with_reason.c +++ b/libsyscall/wrappers/terminate_with_reason.c @@ -24,8 +24,38 @@ #include #include #include +#include #include +/* Crash simulation */ + +extern int pthread_current_stack_contains_np(const void *, unsigned long); +int +__darwin_check_fd_set_overflow(int n, const void *fd_set, int unlimited_select) +{ + if (n < 0) { + os_fault_with_payload(OS_REASON_LIBSYSTEM, OS_REASON_LIBSYSTEM_CODE_FAULT, + &n, sizeof(n), "FD_SET underflow", 0); + return 0; + } + + if (n >= __DARWIN_FD_SETSIZE) { + if (pthread_current_stack_contains_np((const void *) fd_set, sizeof(struct fd_set))) { + if (!unlimited_select) { + os_fault_with_payload(OS_REASON_LIBSYSTEM, OS_REASON_LIBSYSTEM_CODE_FAULT, + &n, sizeof(n), "FD_SET overflow", 0); + return 0; + } else { + return 1; + } + } else { + return 1; + } + } + + return 1; +} + /* System call entry points */ int __terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, diff --git a/osfmk/UserNotification/KUNCUserNotifications.c b/osfmk/UserNotification/KUNCUserNotifications.c index b5ddcad61..ae727ba02 100644 --- a/osfmk/UserNotification/KUNCUserNotifications.c +++ 
b/osfmk/UserNotification/KUNCUserNotifications.c @@ -402,7 +402,7 @@ convert_port_to_UNDReply( ip_unlock(port); return UND_REPLY_NULL; } - reply = (UNDReplyRef) port->ip_kobject; + reply = (UNDReplyRef) ip_get_kobject(port); assert(reply != UND_REPLY_NULL); ip_unlock(port); return reply; diff --git a/osfmk/arm/arm_init.c b/osfmk/arm/arm_init.c index 965ba291d..4c25645de 100644 --- a/osfmk/arm/arm_init.c +++ b/osfmk/arm/arm_init.c @@ -375,8 +375,10 @@ arm_init( + ((uintptr_t)&BootCpuData - (uintptr_t)(args->virtBase))); - thread_bootstrap(); - thread = current_thread(); + thread = thread_bootstrap(); + thread->machine.CpuDatap = &BootCpuData; + machine_set_current_thread(thread); + /* * Preemption is enabled for this thread so that it can lock mutexes without * tripping the preemption check. In reality scheduling is not enabled until @@ -384,7 +386,6 @@ arm_init( * preemption level is not really meaningful for the bootstrap thread. */ thread->machine.preemption_count = 0; - thread->machine.CpuDatap = &BootCpuData; #if __arm__ && __ARM_USER_PROTECT__ { unsigned int ttbr0_val, ttbr1_val, ttbcr_val; diff --git a/osfmk/arm/cswitch.s b/osfmk/arm/cswitch.s index 6f4d332fc..f7c8dcbd6 100644 --- a/osfmk/arm/cswitch.s +++ b/osfmk/arm/cswitch.s @@ -155,6 +155,8 @@ LEXT(Switch_context) add r3, r3, SS_R4 stmia r3!, {r4-r14} // Save general registers to pcb switch_threads: + ldr r3, [r2, ACT_CPUDATAP] + str r2, [r3, CPU_ACTIVE_THREAD] ldr r3, [r2, TH_KSTACKPTR] // get kernel stack top mcr p15, 0, r2, c13, c0, 4 // Write TPIDRPRW ldr r6, [r2, TH_CTH_SELF] diff --git a/osfmk/arm/locks.h b/osfmk/arm/locks.h index ce0e69e42..1fbddf86d 100644 --- a/osfmk/arm/locks.h +++ b/osfmk/arm/locks.h @@ -111,6 +111,8 @@ typedef struct _lck_mtx_ { #define LCK_FRAMES_MAX 8 extern uint64_t MutexSpin; +extern uint64_t low_MutexSpin; +extern int64_t high_MutexSpin; typedef struct { unsigned int type; diff --git a/osfmk/arm/locks_arm.c b/osfmk/arm/locks_arm.c index 49a261f31..7fc463e63 100644 --- a/osfmk/arm/locks_arm.c +++ b/osfmk/arm/locks_arm.c @@ -73,6 +73,9 @@ #include #include #include +#include +#include +#include #include #include @@ -117,7 +120,10 @@ int lck_mtx_adaptive_spin_mode = 0; typedef enum { SPINWAIT_ACQUIRED, /* Got the lock. */ SPINWAIT_INTERLOCK, /* Got the interlock, no owner, but caller must finish acquiring the lock. */ - SPINWAIT_DID_SPIN, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_HIGH_THR, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_OWNER_NOT_CORE, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */ + SPINWAIT_DID_SPIN_SLIDING_THR,/* Got the interlock, spun, but failed to get the lock. */ SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. 
*/ } spinwait_result_t; @@ -428,32 +434,6 @@ get_preemption_level(void) return current_thread()->machine.preemption_count; } -#if __SMP__ -static inline boolean_t -interlock_try_disable_interrupts( - lck_mtx_t *mutex, - boolean_t *istate) -{ - *istate = ml_set_interrupts_enabled(FALSE); - - if (interlock_try(mutex)) { - return 1; - } else { - ml_set_interrupts_enabled(*istate); - return 0; - } -} - -static inline void -interlock_unlock_enable_interrupts( - lck_mtx_t *mutex, - boolean_t istate) -{ - interlock_unlock(mutex); - ml_set_interrupts_enabled(istate); -} -#endif /* __SMP__ */ - /* * Routine: lck_spin_alloc_init */ @@ -2293,14 +2273,15 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t int has_interlock = (int)interlocked; #if __SMP__ __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock); - thread_t holder; - uint64_t overall_deadline; - uint64_t check_owner_deadline; - uint64_t cur_time; - spinwait_result_t retval = SPINWAIT_DID_SPIN; - int loopcount = 0; - uintptr_t state; - boolean_t istate; + thread_t owner, prev_owner; + uint64_t window_deadline, sliding_deadline, high_deadline; + uint64_t start_time, cur_time, avg_hold_time, bias, delta; + int loopcount = 0; + uint i, prev_owner_cpu; + int total_hold_time_samples, window_hold_time_samples, unfairness; + bool owner_on_core, adjust; + uintptr_t state, new_state, waiters; + spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR; if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) { if (!has_interlock) { @@ -2310,101 +2291,290 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t return SPINWAIT_DID_NOT_SPIN; } - state = ordered_load_mtx(lock); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0); - cur_time = mach_absolute_time(); - overall_deadline = cur_time + MutexSpin; - check_owner_deadline = cur_time; - - if (has_interlock) { - istate = ml_get_interrupts_enabled(); + start_time = mach_absolute_time(); + /* + * window_deadline represents the "learning" phase. + * The thread collects statistics about the lock during + * window_deadline and then it makes a decision on whether to spin more + * or block according to the concurrency behavior + * observed. + * + * Every thread can spin at least low_MutexSpin. + */ + window_deadline = start_time + low_MutexSpin; + /* + * Sliding_deadline is the adjusted spin deadline + * computed after the "learning" phase. + */ + sliding_deadline = window_deadline; + /* + * High_deadline is a hard deadline. No thread + * can spin more than this deadline. + */ + if (high_MutexSpin >= 0) { + high_deadline = start_time + high_MutexSpin; + } else { + high_deadline = start_time + low_MutexSpin * real_ncpus; } + /* + * Do not know yet which is the owner cpu. + * Initialize prev_owner_cpu with next cpu. + */ + prev_owner_cpu = (cpu_number() + 1) % real_ncpus; + total_hold_time_samples = 0; + window_hold_time_samples = 0; + avg_hold_time = 0; + adjust = TRUE; + bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus; + /* Snoop the lock state */ state = ordered_load_mtx(lock); + owner = LCK_MTX_STATE_TO_THREAD(state); + prev_owner = owner; + + if (has_interlock) { + if (owner == NULL) { + retval = SPINWAIT_INTERLOCK; + goto done_spinning; + } else { + /* + * We are holding the interlock, so + * we can safely dereference owner. 
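+ * (Holding the interlock keeps the owner field stable, and a thread
+ * cannot be destroyed while it still owns a mutex, so reading
+ * owner->machine and owner->state below cannot race with the thread
+ * going away — a hedged reading of the invariant asserted above.)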
+ */
+ if (!(owner->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU) ||
+ (owner->state & TH_IDLE)) {
+ retval = SPINWAIT_DID_NOT_SPIN;
+ goto done_spinning;
+ }
+ }
+ interlock_unlock(lock);
+ has_interlock = 0;
+ }

 /*
 * Spin while:
 * - mutex is locked, and
 * - it's locked as a spin lock, and
 * - owner is running on another processor, and
- * - owner (processor) is not idling, and
 * - we haven't spun for long enough.
 */
 do {
- if (!(state & LCK_ILOCK) || has_interlock) {
- if (!has_interlock) {
- has_interlock = interlock_try_disable_interrupts(lock, &istate);
+ /*
+ * Try to acquire the lock.
+ */
+ owner = LCK_MTX_STATE_TO_THREAD(state);
+ if (owner == NULL) {
+ waiters = state & ARM_LCK_WAITERS;
+ if (waiters) {
+ /*
+ * preserve the waiter bit
+ * and try to acquire the interlock.
+ * Note: we will successfully acquire
+ * the interlock only if we can also
+ * acquire the lock.
+ */
+ new_state = ARM_LCK_WAITERS | LCK_ILOCK;
+ has_interlock = 1;
+ retval = SPINWAIT_INTERLOCK;
+ disable_preemption();
+ } else {
+ new_state = LCK_MTX_THREAD_TO_STATE(thread);
+ retval = SPINWAIT_ACQUIRED;
+ }
+
+ /*
+ * The cmpxchg will succeed only if the lock
+ * is not owned (doesn't have an owner set)
+ * and it is not interlocked.
+ * It will not fail if there are waiters.
+ */
+ if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
+ waiters, new_state, &state, acquire)) {
+ goto done_spinning;
+ } else {
+ if (waiters) {
+ has_interlock = 0;
+ enable_preemption();
+ }
 }
+ }

- if (has_interlock) {
- state = ordered_load_mtx(lock);
- holder = LCK_MTX_STATE_TO_THREAD(state);
+ cur_time = mach_absolute_time();

- if (holder == NULL) {
- retval = SPINWAIT_INTERLOCK;
+ /*
+ * Never spin past high_deadline.
+ */
+ if (cur_time >= high_deadline) {
+ retval = SPINWAIT_DID_SPIN_HIGH_THR;
+ break;
+ }

- if (istate) {
- ml_set_interrupts_enabled(istate);
- }
+ /*
+ * Check if owner is on core. If not, block.
+ */
+ owner = LCK_MTX_STATE_TO_THREAD(state);
+ if (owner) {
+ i = prev_owner_cpu;
+ owner_on_core = FALSE;

- break;
- }
+ disable_preemption();
+ state = ordered_load_mtx(lock);
+ owner = LCK_MTX_STATE_TO_THREAD(state);

- if (!(holder->machine.machine_thread_flags & MACHINE_THREAD_FLAGS_ON_CPU) ||
- (holder->state & TH_IDLE)) {
- if (loopcount == 0) {
- retval = SPINWAIT_DID_NOT_SPIN;
+ /*
+ * For scalability we want to check if the owner is on core
+ * without locking the mutex interlock.
+ * If we do not lock the mutex interlock, the owner that we see might be
+ * invalid, so we cannot dereference it. Therefore we cannot check
+ * any field of the thread to tell us if it is on core.
+ * Check whether the thread running on each of the other cpus matches the owner.
+ */
+ if (owner) {
+ do {
+ cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
+ if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
+ owner_on_core = TRUE;
+ break;
 }
-
- if (istate) {
- ml_set_interrupts_enabled(istate);
+ if (++i >= real_ncpus) {
+ i = 0;
 }
-
- break;
+ } while (i != prev_owner_cpu);
+ enable_preemption();
+
+ if (owner_on_core) {
+ prev_owner_cpu = i;
+ } else {
+ prev_owner = owner;
+ state = ordered_load_mtx(lock);
+ owner = LCK_MTX_STATE_TO_THREAD(state);
+ if (owner == prev_owner) {
+ /*
+ * Owner is not on core.
+ * Stop spinning.
+ */
+ if (loopcount == 0) {
+ retval = SPINWAIT_DID_NOT_SPIN;
+ } else {
+ retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
+ }
+ break;
+ }
+ /*
+ * Fall through if the owner changed while we were scanning.
+ * The new owner could potentially be on core, so loop
+ * again.
+ */
 }
-
- interlock_unlock_enable_interrupts(lock, istate);
- has_interlock = 0;
+ } else {
+ enable_preemption();
+ }
 }

- cur_time = mach_absolute_time();
-
- if (cur_time >= overall_deadline) {
- break;
+ /*
+ * Save how many times we see the owner changing.
+ * We can roughly estimate the mutex hold
+ * time and the fairness with that.
+ */
+ if (owner != prev_owner) {
+ prev_owner = owner;
+ total_hold_time_samples++;
+ window_hold_time_samples++;
 }

- check_owner_deadline = cur_time + (MutexSpin / SPINWAIT_OWNER_CHECK_COUNT);
+ /*
+ * Learning window expired.
+ * Try to adjust the sliding_deadline.
+ */
+ if (cur_time >= window_deadline) {
+ /*
+ * If there was no contention during the window,
+ * stop spinning.
+ */
+ if (window_hold_time_samples < 1) {
+ retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
+ break;
+ }
+
+ if (adjust) {
+ /*
+ * For a fair lock, we'd wait for at most (NCPU-1) periods,
+ * but the lock is unfair, so let's try to estimate by how much.
+ */
+ unfairness = total_hold_time_samples / real_ncpus;
+
+ if (unfairness == 0) {
+ /*
+ * We observed the owner changing `total_hold_time_samples` times, which
+ * lets us estimate the average hold time of this mutex for the duration
+ * of the spin time.
+ * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
+ *
+ * In this case spin at most avg_hold_time * (real_ncpus - 1)
+ */
+ delta = cur_time - start_time;
+ sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
+ } else {
+ /*
+ * In this case at least one of the other cpus was able to get the lock twice
+ * while I was spinning.
+ * We could spin longer but it won't necessarily help if the system is unfair.
+ * Try to randomize the wait to reduce contention.
+ *
+ * We compute how much time we could potentially spin
+ * and distribute it over the cpus.
+ *
+ * bias is an integer between 0 and real_ncpus - 1.
+ * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
+ */
+ delta = high_deadline - cur_time;
+ sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
+ adjust = FALSE;
+ }
+ }

- if (cur_time < check_owner_deadline) {
- machine_delay_until(check_owner_deadline - cur_time, check_owner_deadline);
+ window_deadline += low_MutexSpin;
+ window_hold_time_samples = 0;
 }

- /* Snoop the lock state */
- state = ordered_load_mtx(lock);
+ /*
+ * Stop spinning if we are past
+ * the adjusted deadline.
+ */
+ if (cur_time >= sliding_deadline) {
+ retval = SPINWAIT_DID_SPIN_SLIDING_THR;
+ break;
+ }

- if (state == 0) {
- /* Try to grab the lock. */
- if (os_atomic_cmpxchg(&lock->lck_mtx_data,
- 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
- retval = SPINWAIT_ACQUIRED;
- break;
- }
+ /*
+ * We want to arm the monitor for wfe,
+ * so load the lock exclusively.
+ *
+ * NOTE:
+ * we rely on the fact that wfe will
+ * eventually return even if the cache line
+ * is not modified. This way we will keep
+ * looping and checking if the deadlines expired.
+ */
+ state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
+ owner = LCK_MTX_STATE_TO_THREAD(state);
+ if (owner != NULL) {
+ wait_for_event();
+ state = ordered_load_mtx(lock);
+ } else {
+ atomic_exchange_abort();
 }

 loopcount++;
 } while (TRUE);

+done_spinning:
 #if CONFIG_DTRACE
 /*
- * We've already kept a count via overall_deadline of how long we spun.
 * If dtrace is active, then we compute backwards to decide how
 * long we spun.
- * * Note that we record a different probe id depending on whether * this is a direct or indirect mutex. This allows us to * penalize only lock groups that have debug/stats enabled @@ -2412,10 +2582,10 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t */ if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - start_time); } else { LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - start_time); } /* The lockstat acquire event is recorded by the caller. */ #endif @@ -2437,6 +2607,7 @@ lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t return retval; } + /* * Common code for mutex locking as spinlock */ diff --git a/osfmk/arm/machine_routines.c b/osfmk/arm/machine_routines.c index df89b7500..1a21043f4 100644 --- a/osfmk/arm/machine_routines.c +++ b/osfmk/arm/machine_routines.c @@ -68,6 +68,9 @@ uint32_t LockTimeOut; uint32_t LockTimeOutUsec; uint64_t TLockTimeOut; uint64_t MutexSpin; +uint64_t low_MutexSpin; +int64_t high_MutexSpin; + boolean_t is_clock_configured = FALSE; #if CONFIG_NONFATAL_ASSERTS @@ -218,6 +221,15 @@ ml_init_lock_timeout(void) nanoseconds_to_absolutetime(10 * NSEC_PER_USEC, &abstime); } MutexSpin = abstime; + low_MutexSpin = MutexSpin; + /* + * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but + * real_ncpus is not set at this time + * + * NOTE: active spinning is disabled in arm. It can be activated + * by setting high_MutexSpin through the sysctl. + */ + high_MutexSpin = low_MutexSpin; } /* diff --git a/osfmk/arm/machine_routines_asm.s b/osfmk/arm/machine_routines_asm.s index 9d1896393..5b475d4fd 100644 --- a/osfmk/arm/machine_routines_asm.s +++ b/osfmk/arm/machine_routines_asm.s @@ -35,6 +35,8 @@ .align 2 .globl EXT(machine_set_current_thread) LEXT(machine_set_current_thread) + ldr r1, [r0, ACT_CPUDATAP] + str r0, [r1, CPU_ACTIVE_THREAD] mcr p15, 0, r0, c13, c0, 4 // Write TPIDRPRW ldr r1, [r0, TH_CTH_SELF] mrc p15, 0, r2, c13, c0, 3 // Read TPIDRURO diff --git a/osfmk/arm/pcb.c b/osfmk/arm/pcb.c index b78a1be8e..f42c4f4e1 100644 --- a/osfmk/arm/pcb.c +++ b/osfmk/arm/pcb.c @@ -193,6 +193,15 @@ machine_thread_init(void) "arm debug state"); } +/* + * Routine: machine_thread_template_init + * + */ +void +machine_thread_template_init(thread_t __unused thr_template) +{ + /* Nothing to do on this platform. 
*/
+}

 /*
 * Routine: get_useraddr
diff --git a/osfmk/arm/pmap.c b/osfmk/arm/pmap.c
index 003be491f..2f01b4681 100644
--- a/osfmk/arm/pmap.c
+++ b/osfmk/arm/pmap.c
@@ -3923,7 +3923,7 @@ __unused pte_to_xprr_perm(pt_entry_t pte)
 case APRR_USER_RW_INDEX: return XPRR_USER_RW_PERM;
 case APRR_PPL_RX_INDEX: return XPRR_PPL_RX_PERM;
 case APRR_KERN_RX_INDEX: return XPRR_KERN_RX_PERM;
- case APRR_PPL_RO_INDEX: return XPRR_PPL_RO_PERM;
+ case APRR_USER_XO_INDEX: return XPRR_USER_XO_PERM;
 case APRR_KERN_RO_INDEX: return XPRR_KERN_RO_PERM;
 case APRR_KERN0_RX_INDEX: return XPRR_KERN0_RO_PERM;
 case APRR_KERN0_RO_INDEX: return XPRR_KERN0_RO_PERM;
@@ -3951,7 +3951,7 @@ xprr_perm_to_aprr_index(uint64_t perm)
 case XPRR_USER_RW_PERM: return APRR_USER_RW_INDEX;
 case XPRR_PPL_RX_PERM: return APRR_PPL_RX_INDEX;
 case XPRR_KERN_RX_PERM: return APRR_KERN_RX_INDEX;
- case XPRR_PPL_RO_PERM: return APRR_PPL_RO_INDEX;
+ case XPRR_USER_XO_PERM: return APRR_USER_XO_INDEX;
 case XPRR_KERN_RO_PERM: return APRR_KERN_RO_INDEX;
 case XPRR_KERN0_RX_PERM: return APRR_KERN0_RO_INDEX;
 case XPRR_KERN0_RO_PERM: return APRR_KERN0_RO_INDEX;
@@ -4643,8 +4643,18 @@ pmap_static_allocations_done(void)
 monitor_start_pa = BootArgs->topOfKernelData;
 monitor_end_pa = BootArgs->topOfKernelData + BOOTSTRAP_TABLE_SIZE;
- /* The bootstrap page tables are mapped RO at boostrap. */
- pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_PPL_RO_PERM);
+ /*
+ * The bootstrap page tables are mapped RO at bootstrap.
+ *
+ * Note that this function call requests switching XPRR permissions from
+ * XPRR_KERN_RO_PERM to XPRR_KERN_RO_PERM. Whilst this may seem redundant,
+ * pa_set_range_xprr_perm() does other things too, such as calling
+ * pa_set_range_monitor() on the requested address range and performing a number
+ * of integrity checks on the PTEs. We should still
+ * call this function for all PPL-owned memory, regardless of whether
+ * permissions are required to be changed or not.
+ */
+ pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
 monitor_start_pa = BootArgs->topOfKernelData + BOOTSTRAP_TABLE_SIZE;
 monitor_end_pa = avail_start;
@@ -4652,10 +4662,20 @@ pmap_static_allocations_done(void)
 /* The other bootstrap allocations are mapped RW at bootstrap. */
 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
- /* The RO page tables are mapped RW at bootstrap. */
+ /*
+ * The RO page tables are mapped RW at bootstrap and remain RW after the call
+ * to pa_set_range_xprr_perm(). We do this, as opposed to using XPRR_PPL_RW_PERM,
+ * to work around a functional issue on H11 devices where CTRR shifts the APRR
+ * lookup table index to USER_XO before APRR is applied, hence causing the hardware
+ * to believe we are dealing with a user XO page upon performing a translation.
+ *
+ * Note that this workaround does not pose a security risk, because the RO
+ * page tables still remain read-only, due to KTRR/CTRR, and further protecting
+ * them at the APRR level would be unnecessary.
+ */ monitor_start_pa = kvtophys((vm_offset_t)&ropagetable_begin); monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin); - pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM); + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RW_PERM); monitor_start_pa = kvtophys(segPPLDATAB); monitor_end_pa = monitor_start_pa + segSizePPLDATA; @@ -4701,14 +4721,14 @@ pmap_static_allocations_done(void) monitor_start_pa = kvtophys(segPPLDATACONSTB); monitor_end_pa = monitor_start_pa + segSizePPLDATACONST; - pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_PPL_RO_PERM); + pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM); } /* * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security * precaution. The real RW mappings are at a different location with guard pages. */ - pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_PPL_RO_PERM); + pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM); } @@ -5150,6 +5170,11 @@ pmap_create_options_internal( if ((p = pmap_alloc_pmap()) == PMAP_NULL) { return PMAP_NULL; } + + if (ledger) { + pmap_ledger_validate(ledger); + pmap_ledger_retain(ledger); + } #else /* * Allocate a pmap struct from the pmap_zone. Then allocate @@ -5160,6 +5185,8 @@ pmap_create_options_internal( } #endif + p->ledger = ledger; + if (flags & PMAP_CREATE_64BIT) { p->min = MACH_VM_MIN_ADDRESS; p->max = MACH_VM_MAX_ADDRESS; @@ -5192,14 +5219,6 @@ pmap_create_options_internal( } -#if XNU_MONITOR - if (ledger) { - pmap_ledger_validate(ledger); - pmap_ledger_retain(ledger); - } -#endif /* XNU_MONITOR */ - - p->ledger = ledger; PMAP_LOCK_INIT(p); memset((void *) &p->stats, 0, sizeof(p->stats)); @@ -7294,13 +7313,14 @@ pmap_protect_options_internal( pte_set_was_writeable(tmplate, false); #if __APRR_SUPPORTED__ - if (__improbable(is_pte_xprr_protected(spte) && (pte_to_xprr_perm(spte) != XPRR_USER_JIT_PERM))) { + if (__improbable(is_pte_xprr_protected(spte) && (pte_to_xprr_perm(spte) != XPRR_USER_JIT_PERM) + && (pte_to_xprr_perm(spte) != XPRR_USER_XO_PERM))) { /* Only test for PPL protection here, User-JIT mappings may be mutated by this function. 
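 * User-XO mappings are likewise exempted above, since the former
 * PPL_RO APRR index is repurposed as USER_XO elsewhere in this
 * change (see the proc_reg.h hunk below).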
*/ panic("%s: modifying a PPL mapping pte_p=%p pmap=%p prot=%d options=%u, pte=0x%llx, tmplate=0x%llx", __func__, pte_p, pmap, prot, options, (uint64_t)spte, (uint64_t)tmplate); } - if (__improbable(is_pte_xprr_protected(tmplate))) { + if (__improbable(is_pte_xprr_protected(tmplate) && (pte_to_xprr_perm(tmplate) != XPRR_USER_XO_PERM))) { panic("%s: creating an xPRR mapping pte_p=%p pmap=%p prot=%d options=%u, pte=0x%llx, tmplate=0x%llx", __func__, pte_p, pmap, prot, options, (uint64_t)spte, (uint64_t)tmplate); } @@ -8087,12 +8107,11 @@ Pmap_enter_loop: #if XNU_MONITOR if (!pmap_ppl_disable && (wimg_bits & PP_ATTR_MONITOR)) { uint64_t xprr_perm = pte_to_xprr_perm(pte); - pte &= ~ARM_PTE_XPRR_MASK; switch (xprr_perm) { case XPRR_KERN_RO_PERM: - pte |= xprr_perm_to_pte(XPRR_PPL_RO_PERM); break; case XPRR_KERN_RW_PERM: + pte &= ~ARM_PTE_XPRR_MASK; pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM); break; default: diff --git a/osfmk/arm64/cswitch.s b/osfmk/arm64/cswitch.s index 48e0879b1..c9b1893a1 100644 --- a/osfmk/arm64/cswitch.s +++ b/osfmk/arm64/cswitch.s @@ -146,6 +146,8 @@ */ .macro set_thread_registers msr TPIDR_EL1, $0 // Write new thread pointer to TPIDR_EL1 + ldr $1, [$0, ACT_CPUDATAP] + str $0, [$1, CPU_ACTIVE_THREAD] ldr $1, [$0, TH_CTH_SELF] // Get cthread pointer mrs $2, TPIDRRO_EL0 // Extract cpu number from TPIDRRO_EL0 and $2, $2, #(MACHDEP_CPUNUM_MASK) diff --git a/osfmk/arm64/kpc.c b/osfmk/arm64/kpc.c index 3a5a4d444..f19b8696b 100644 --- a/osfmk/arm64/kpc.c +++ b/osfmk/arm64/kpc.c @@ -62,21 +62,6 @@ void kpc_pmi_handler(unsigned int ctr); #define PMCR0_PMC_ENABLE_MASK(PMC) (UINT64_C(0x1) << PMCR_PMC_SHIFT(PMC)) #define PMCR0_PMC_DISABLE_MASK(PMC) (~PMCR0_PMC_ENABLE_MASK(PMC)) -/* how interrupts are generated on PMIs */ -#define PMCR0_INTGEN_SHIFT (8) -#define PMCR0_INTGEN_MASK (UINT64_C(0x7) << PMCR0_INTGEN_SHIFT) -#define PMCR0_INTGEN_OFF (UINT64_C(0) << PMCR0_INTGEN_SHIFT) -#define PMCR0_INTGEN_PMI (UINT64_C(1) << PMCR0_INTGEN_SHIFT) -#define PMCR0_INTGEN_AIC (UINT64_C(2) << PMCR0_INTGEN_SHIFT) -#define PMCR0_INTGEN_DBG_HLT (UINT64_C(3) << PMCR0_INTGEN_SHIFT) -#define PMCR0_INTGEN_FIQ (UINT64_C(4) << PMCR0_INTGEN_SHIFT) - -/* 10 unused */ - -/* set by hardware if PMI was generated */ -#define PMCR0_PMAI_SHIFT (11) -#define PMCR0_PMAI_MASK (UINT64_C(1) << PMCR0_PMAI_SHIFT) - /* overflow on a PMC generates an interrupt */ #define PMCR0_PMI_OFFSET (12) #define PMCR0_PMI_SHIFT(PMC) (PMCR0_PMI_OFFSET + PMCR_PMC_SHIFT(PMC)) @@ -360,33 +345,22 @@ dump_regs(void) static boolean_t enable_counter(uint32_t counter) { - int cpuid = cpu_number(); - uint64_t pmcr0 = 0, intgen_type; - boolean_t counter_running, pmi_enabled, intgen_correct, enabled; + uint64_t pmcr0 = 0; + boolean_t counter_running, pmi_enabled, enabled; pmcr0 = SREG_READ(SREG_PMCR0) | 0x3 /* leave the fixed counters enabled for monotonic */; counter_running = (pmcr0 & PMCR0_PMC_ENABLE_MASK(counter)) != 0; pmi_enabled = (pmcr0 & PMCR0_PMI_ENABLE_MASK(counter)) != 0; - /* TODO this should use the PMI path rather than AIC for the interrupt - * as it is faster - */ - intgen_type = PMCR0_INTGEN_AIC; - intgen_correct = (pmcr0 & PMCR0_INTGEN_MASK) == intgen_type; - - enabled = counter_running && pmi_enabled && intgen_correct; + enabled = counter_running && pmi_enabled; if (!enabled) { pmcr0 |= PMCR0_PMC_ENABLE_MASK(counter); pmcr0 |= PMCR0_PMI_ENABLE_MASK(counter); - pmcr0 &= ~PMCR0_INTGEN_MASK; - pmcr0 |= intgen_type; - SREG_WRITE(SREG_PMCR0, pmcr0); } - saved_PMCR[cpuid][0] = pmcr0; return enabled; } @@ -395,7 +369,6 @@ 
disable_counter(uint32_t counter) { uint64_t pmcr0; boolean_t enabled; - int cpuid = cpu_number(); if (counter < 2) { return true; @@ -409,7 +382,6 @@ disable_counter(uint32_t counter) SREG_WRITE(SREG_PMCR0, pmcr0); } - saved_PMCR[cpuid][0] = pmcr0; return enabled; } @@ -547,9 +519,6 @@ save_regs(void) assert(ml_get_interrupts_enabled() == FALSE); - /* Save current PMCR0/1 values. PMCR2-4 are in the RAWPMU set. */ - saved_PMCR[cpuid][0] = SREG_READ(SREG_PMCR0) | 0x3; - /* Save event selections. */ saved_PMESR[cpuid][0] = SREG_READ(SREG_PMESR0); saved_PMESR[cpuid][1] = SREG_READ(SREG_PMESR1); @@ -583,7 +552,6 @@ restore_regs(void) /* Restore PMCR0/1 values (with PMCR0 last to enable). */ SREG_WRITE(SREG_PMCR1, saved_PMCR[cpuid][1] | 0x30303); - SREG_WRITE(SREG_PMCR0, saved_PMCR[cpuid][0] | 0x3); } static uint64_t diff --git a/osfmk/arm64/locore.s b/osfmk/arm64/locore.s index 660a59f1b..875ddb7a5 100644 --- a/osfmk/arm64/locore.s +++ b/osfmk/arm64/locore.s @@ -725,7 +725,7 @@ check_ktrr_sctlr_trap: add sp, sp, ARM_CONTEXT_SIZE // Clean up stack b.ne Lel1_sp1_synchronous_vector_continue msr ELR_EL1, lr // Return to caller - eret + ERET_CONTEXT_SYNCHRONIZING #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ /* 64-bit first level exception handler dispatcher. @@ -1170,8 +1170,8 @@ Lexception_return_restore_registers: and x1, x4, BA_BOOT_FLAGS_DISABLE_USER_JOP cbnz x1, Ldisable_jop // if global user JOP disabled, always turn off JOP regardless of thread flag (kernel running with JOP on) mrs x2, TPIDR_EL1 - ldr x2, [x2, TH_DISABLE_USER_JOP] - cbz x2, Lskip_disable_jop // if thread has JOP enabled, leave it on (kernel running with JOP on) + ldr w2, [x2, TH_DISABLE_USER_JOP] + cbz w2, Lskip_disable_jop // if thread has JOP enabled, leave it on (kernel running with JOP on) Ldisable_jop: MOV64 x1, SCTLR_JOP_KEYS_ENABLED mrs x4, SCTLR_EL1 @@ -1257,7 +1257,7 @@ Lskip_disable_jop: Lskip_ttbr1_switch: #endif /* __ARM_KERNEL_PROTECT__ */ - eret + ERET_CONTEXT_SYNCHRONIZING user_take_ast: PUSH_FRAME diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c index 037f34c13..b1901d145 100644 --- a/osfmk/arm64/machine_routines.c +++ b/osfmk/arm64/machine_routines.c @@ -79,6 +79,9 @@ uint32_t LockTimeOut; uint32_t LockTimeOutUsec; uint64_t TLockTimeOut; uint64_t MutexSpin; +uint64_t low_MutexSpin; +int64_t high_MutexSpin; + boolean_t is_clock_configured = FALSE; uint32_t yield_delay_us = 0; /* Must be less than cpu_idle_latency to ensure ml_delay_should_spin is true */ @@ -115,7 +118,7 @@ lockdown_handler_t lockdown_handler; void *lockdown_this; lck_mtx_t lockdown_handler_lck; lck_grp_t *lockdown_handler_grp; -int lockdown_done; +uint32_t lockdown_done; void ml_lockdown_init(void); void ml_lockdown_run_handler(void); @@ -841,6 +844,15 @@ ml_init_lock_timeout(void) nanoseconds_to_absolutetime(10 * NSEC_PER_USEC, &abstime); } MutexSpin = abstime; + low_MutexSpin = MutexSpin; + /* + * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but + * real_ncpus is not set at this time + * + * NOTE: active spinning is disabled in arm. It can be activated + * by setting high_MutexSpin through the sysctl. + */ + high_MutexSpin = low_MutexSpin; } /* diff --git a/osfmk/arm64/monotonic_arm64.c b/osfmk/arm64/monotonic_arm64.c index 51361f693..963af5e56 100644 --- a/osfmk/arm64/monotonic_arm64.c +++ b/osfmk/arm64/monotonic_arm64.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Apple Inc. All rights reserved. + * Copyright (c) 2017-2020 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,6 +31,7 @@ #include #include #include /* panic */ +#include #include #include #include /* CHAR_BIT */ @@ -84,8 +85,19 @@ bool mt_core_supported = true; #define PMC5 "s3_2_c15_c5_0" #define PMC6 "s3_2_c15_c6_0" #define PMC7 "s3_2_c15_c7_0" + +#define PMC_0_7(X, A) X(0, A); X(1, A); X(2, A); X(3, A); X(4, A); X(5, A); \ + X(6, A); X(7, A) + +#if CORE_NCTRS > 8 #define PMC8 "s3_2_c15_c9_0" #define PMC9 "s3_2_c15_c10_0" +#define PMC_8_9(X, A) X(8, A); X(9, A) +#else // CORE_NCTRS > 8 +#define PMC_8_9(X, A) +#endif // CORE_NCTRS > 8 + +#define PMC_ALL(X, A) PMC_0_7(X, A); PMC_8_9(X, A) #define CTR_MAX ((UINT64_C(1) << 47) - 1) @@ -125,7 +137,7 @@ enum { PMCR0_INTGEN_HALT = 3, PMCR0_INTGEN_FIQ = 4, }; -#define PMCR0_INTGEN_SET(INT) ((uint64_t)(INT) << 8) +#define PMCR0_INTGEN_SET(X) ((uint64_t)(X) << 8) #if CPMU_AIC_PMI #define PMCR0_INTGEN_INIT PMCR0_INTGEN_SET(PMCR0_INTGEN_AIC) @@ -133,7 +145,9 @@ enum { #define PMCR0_INTGEN_INIT PMCR0_INTGEN_SET(PMCR0_INTGEN_FIQ) #endif /* !CPMU_AIC_PMI */ -#define PMCR0_PMI_EN(CTR) (UINT64_C(1) << (12 + CTR_POS(CTR))) +#define PMCR0_PMI_SHIFT (12) +#define PMCR0_CTR_GE8_PMI_SHIFT (44) +#define PMCR0_PMI_EN(CTR) (UINT64_C(1) << (PMCR0_PMI_SHIFT + CTR_POS(CTR))) /* fixed counters are always counting */ #define PMCR0_PMI_INIT (PMCR0_PMI_EN(CYCLES) | PMCR0_PMI_EN(INSTRS)) /* disable counting on a PMI */ @@ -144,8 +158,9 @@ enum { #define PMCR0_L2CGLOBAL_EN (UINT64_C(1) << 23) /* user mode access to configuration registers */ #define PMCR0_USEREN_EN (UINT64_C(1) << 30) +#define PMCR0_CTR_GE8_EN_SHIFT (32) -#define PMCR0_INIT (PMCR0_INTGEN_INIT | PMCR0_PMI_INIT | PMCR0_DISCNT_EN) +#define PMCR0_INIT (PMCR0_INTGEN_INIT | PMCR0_PMI_INIT) /* * PMCR1 controls which execution modes count events. @@ -194,6 +209,9 @@ core_init_execution_modes(void) #define PMSR_OVF(CTR) (1ULL << (CTR)) +#define PMESR0 "S3_1_c15_c5_0" +#define PMESR1 "S3_1_c15_c6_0" + static int core_init(__unused mt_device_t dev) { @@ -211,10 +229,9 @@ uint64_t mt_core_snap(unsigned int ctr) { switch (ctr) { - case 0: - return __builtin_arm_rsr64(PMC0); - case 1: - return __builtin_arm_rsr64(PMC1); +#define PMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(PMC ## CTR) + PMC_ALL(PMC_RD, 0); +#undef PMC_RD default: panic("monotonic: invalid core counter read: %u", ctr); __builtin_unreachable(); @@ -242,16 +259,29 @@ core_set_enabled(void) { uint64_t pmcr0 = __builtin_arm_rsr64(PMCR0); pmcr0 |= PMCR0_INIT | PMCR0_FIXED_EN; - pmcr0 &= ~PMCR0_PMAI; + + if (kpc_get_running() & KPC_CLASS_CONFIGURABLE_MASK) { + uint64_t kpc_ctrs = kpc_get_configurable_pmc_mask( + KPC_CLASS_CONFIGURABLE_MASK) << MT_CORE_NFIXED; +#if KPC_ARM64_CONFIGURABLE_COUNT > 6 + uint64_t ctrs_ge8 = kpc_ctrs >> 8; + pmcr0 |= ctrs_ge8 << PMCR0_CTR_GE8_EN_SHIFT; + pmcr0 |= ctrs_ge8 << PMCR0_CTR_GE8_PMI_SHIFT; + kpc_ctrs &= (1ULL << 8) - 1; +#endif /* KPC_ARM64_CONFIGURABLE_COUNT > 6 */ + kpc_ctrs |= kpc_ctrs << PMCR0_PMI_SHIFT; + pmcr0 |= kpc_ctrs; + } + __builtin_arm_wsr64(PMCR0, pmcr0); #if MACH_ASSERT /* * Only check for the values that were ORed in. 
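 * The comparison below requires every requested bit to read back as
 * set; the previous test only panicked when all of them were dropped.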
*/ uint64_t pmcr0_check = __builtin_arm_rsr64(PMCR0); - if (!(pmcr0_check & (PMCR0_INIT | PMCR0_FIXED_EN))) { - panic("monotonic: hardware ignored enable (read %llx)", - pmcr0_check); + if ((pmcr0_check & (PMCR0_INIT | PMCR0_FIXED_EN)) != (PMCR0_INIT | PMCR0_FIXED_EN)) { + panic("monotonic: hardware ignored enable (read %llx, wrote %llx)", + pmcr0_check, pmcr0); } #endif /* MACH_ASSERT */ } @@ -1293,6 +1323,13 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) assert(cpu != NULL); assert(ml_get_interrupts_enabled() == FALSE); + __builtin_arm_wsr64(PMCR0, PMCR0_INIT); + /* + * Ensure the CPMU has flushed any increments at this point, so PMSR is up + * to date. + */ + __builtin_arm_isb(ISB_SY); + cpu->cpu_monotonic.mtc_npmis += 1; cpu->cpu_stat.pmi_cnt_wake += 1; @@ -1308,10 +1345,14 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) uint64_t pmsr = __builtin_arm_rsr64(PMSR); #if MONOTONIC_DEBUG - kprintf("monotonic: cpu = %d, PMSR = 0x%llx, PMCR0 = 0x%llx", + printf("monotonic: cpu = %d, PMSR = 0x%llx, PMCR0 = 0x%llx\n", cpu_number(), pmsr, pmcr0); #endif /* MONOTONIC_DEBUG */ +#if MACH_ASSERT + uint64_t handled = 0; +#endif /* MACH_ASSERT */ + /* * monotonic handles any fixed counter PMIs. */ @@ -1320,6 +1361,9 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) continue; } +#if MACH_ASSERT + handled |= 1ULL << i; +#endif /* MACH_ASSERT */ uint64_t count = mt_cpu_update_count(cpu, i); cpu->cpu_monotonic.mtc_counts[i] += count; mt_core_set_snap(i, mt_core_reset_values[i]); @@ -1334,6 +1378,9 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 1), mt_microstackshot_ctr, user_mode); mt_microstackshot_pmi_handler(user_mode, mt_microstackshot_ctx); + } else if (mt_debug) { + KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 2), + i, count); } } @@ -1342,14 +1389,31 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) */ for (unsigned int i = MT_CORE_NFIXED; i < CORE_NCTRS; i++) { if (pmsr & PMSR_OVF(i)) { +#if MACH_ASSERT + handled |= 1ULL << i; +#endif /* MACH_ASSERT */ extern void kpc_pmi_handler(unsigned int ctr); kpc_pmi_handler(i); } } #if MACH_ASSERT - pmsr = __builtin_arm_rsr64(PMSR); - assert(pmsr == 0); + uint64_t pmsr_after_handling = __builtin_arm_rsr64(PMSR); + if (pmsr_after_handling != 0) { + unsigned int first_ctr_ovf = __builtin_ffsll(pmsr_after_handling) - 1; + uint64_t count = 0; + const char *extra = ""; + if (first_ctr_ovf >= CORE_NCTRS) { + extra = " (invalid counter)"; + } else { + count = mt_core_snap(first_ctr_ovf); + } + + panic("monotonic: PMI status not cleared on exit from handler, " + "PMSR = 0x%llx HANDLE -> -> 0x%llx, handled 0x%llx, " + "PMCR0 = 0x%llx, PMC%d = 0x%llx%s", pmsr, pmsr_after_handling, + handled, __builtin_arm_rsr64(PMCR0), first_ctr_ovf, count, extra); + } #endif /* MACH_ASSERT */ core_set_enabled(); diff --git a/osfmk/arm64/pcb.c b/osfmk/arm64/pcb.c index 3bf15f95a..ff4efbfdd 100644 --- a/osfmk/arm64/pcb.c +++ b/osfmk/arm64/pcb.c @@ -271,6 +271,15 @@ machine_thread_init(void) } +/* + * Routine: machine_thread_template_init + * + */ +void +machine_thread_template_init(thread_t __unused thr_template) +{ + /* Nothing to do on this platform. 
*/ +} /* * Routine: get_useraddr diff --git a/osfmk/arm64/proc_reg.h b/osfmk/arm64/proc_reg.h index f4d967d14..69533e29e 100644 --- a/osfmk/arm64/proc_reg.h +++ b/osfmk/arm64/proc_reg.h @@ -1641,7 +1641,7 @@ typedef enum { #define XPRR_KERN0_RW_PERM (6ULL) #define XPRR_USER_RW_PERM (7ULL) #define XPRR_PPL_RX_PERM (8ULL) -#define XPRR_PPL_RO_PERM (9ULL) +#define XPRR_USER_XO_PERM (9ULL) #define XPRR_KERN_RX_PERM (10ULL) #define XPRR_KERN_RO_PERM (11ULL) #define XPRR_KERN0_RX_PERM (12ULL) @@ -1668,7 +1668,7 @@ typedef enum { #define APRR_USER_RW_INDEX (7ULL) /* AP_RWRW, PXN, XN */ #define APRR_PPL_RX_INDEX (8ULL) /* AP_RONA, PX, X */ #define APRR_KERN_RX_INDEX (9ULL) /* AP_RONA, PX, XN */ -#define APRR_PPL_RO_INDEX (10ULL) /* AP_RONA, PXN, X */ +#define APRR_USER_XO_INDEX (10ULL) /* AP_RONA, PXN, X */ #define APRR_KERN_RO_INDEX (11ULL) /* AP_RONA, PXN, XN */ #define APRR_KERN0_RX_INDEX (12ULL) /* AP_RORO, PX, X */ #define APRR_KERN0_RO_INDEX (13ULL) /* AP_RORO, PX, XN */ @@ -1693,7 +1693,7 @@ typedef enum { #define APRR_USER_RW_SHIFT (28ULL) /* AP_RWRW, PXN, XN */ #define APRR_PPL_RX_SHIFT (32ULL) /* AP_RONA, PX, X */ #define APRR_KERN_RX_SHIFT (36ULL) /* AP_RONA, PX, XN */ -#define APRR_PPL_RO_SHIFT (40ULL) /* AP_RONA, PXN, X */ +#define APRR_USER_XO_SHIFT (40ULL) /* AP_RONA, PXN, X */ #define APRR_KERN_RO_SHIFT (44ULL) /* AP_RONA, PXN, XN */ #define APRR_KERN0_RX_SHIFT (48ULL) /* AP_RORO, PX, X */ #define APRR_KERN0_RO_SHIFT (52ULL) /* AP_RORO, PX, XN */ @@ -1731,20 +1731,25 @@ typedef enum { #define APRR_EL1_RESET \ APRR_EL1_UNRESTRICTED +/* + * XO mappings bypass PAN protection (rdar://58360875) + * Revoke ALL kernel access permissions for XO mappings. + */ #define APRR_EL1_BASE \ - APRR_EL1_UNRESTRICTED + (APRR_EL1_UNRESTRICTED & \ + APRR_REMOVE(APRR_ATTR_R << APRR_USER_XO_SHIFT)) #if XNU_MONITOR #define APRR_EL1_DEFAULT \ (APRR_EL1_BASE & \ (APRR_REMOVE((APRR_ATTR_WX << APRR_PPL_RW_SHIFT) | \ - (APRR_ATTR_WX << APRR_PPL_RO_SHIFT) | \ + (APRR_ATTR_WX << APRR_USER_XO_SHIFT) | \ (APRR_ATTR_WX << APRR_PPL_RX_SHIFT)))) #define APRR_EL1_PPL \ (APRR_EL1_BASE & \ (APRR_REMOVE((APRR_ATTR_X << APRR_PPL_RW_SHIFT) | \ - (APRR_ATTR_WX << APRR_PPL_RO_SHIFT) | \ + (APRR_ATTR_WX << APRR_USER_XO_SHIFT) | \ (APRR_ATTR_W << APRR_PPL_RX_SHIFT)))) #else #define APRR_EL1_DEFAULT \ @@ -1761,7 +1766,7 @@ typedef enum { (APRR_EL0_UNRESTRICTED & \ (APRR_REMOVE((APRR_ATTR_RWX << APRR_PPL_RW_SHIFT) | \ (APRR_ATTR_RWX << APRR_PPL_RX_SHIFT) | \ - (APRR_ATTR_RWX << APRR_PPL_RO_SHIFT)))) + (APRR_ATTR_RWX << APRR_USER_XO_SHIFT)))) #else #define APRR_EL0_BASE \ APRR_EL0_UNRESTRICTED @@ -1910,6 +1915,26 @@ cmp $0, $1 b.mi $2 // Unsigned "strictly less than" .endmacro +/* + * Macro intended to be used as a replacement for ERET. + * It prevents speculation past ERET instructions by padding + * up to the decoder width. 
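+ * A call site simply substitutes the macro for a bare eret, as in
+ * the locore.s hunks above, e.g.:
+ *
+ *     msr ELR_EL1, lr // Return to caller
+ *     ERET_CONTEXT_SYNCHRONIZING // was: eret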
+ */ +.macro ERET_CONTEXT_SYNCHRONIZING +eret +#if __ARM_SB_AVAILABLE__ +sb // Technically unnecessary on Apple micro-architectures, may restrict mis-speculation on other architectures +#else /* __ARM_SB_AVAILABLE__ */ +isb // ISB technically unnecessary on Apple micro-architectures, may restrict mis-speculation on other architectures +nop // Sequence of six NOPs to pad out and terminate instruction decode group */ +nop +nop +nop +nop +nop +#endif /* !__ARM_SB_AVAILABLE__ */ +.endmacro + #endif /* __ASSEMBLER__ */ #define MSR(reg, src) __asm__ volatile ("msr " reg ", %0" :: "r" (src)) diff --git a/osfmk/arm64/start.s b/osfmk/arm64/start.s index a5d29d6c6..00cba8194 100644 --- a/osfmk/arm64/start.s +++ b/osfmk/arm64/start.s @@ -195,8 +195,8 @@ LEXT(reset_vector) /* spin until bootstrap core has completed machine lockdown */ adrp x17, EXT(lockdown_done)@page 1: - ldr x18, [x17, EXT(lockdown_done)@pageoff] - cbz x18, 1b + ldr w18, [x17, EXT(lockdown_done)@pageoff] + cbz w18, 1b // load stashed rorgn_begin adrp x17, EXT(rorgn_begin)@page @@ -254,8 +254,8 @@ Lfound_cpu_data_entry: /* spin until bootstrap core has completed machine lockdown */ adrp x17, EXT(lockdown_done)@page 1: - ldr x18, [x17, EXT(lockdown_done)@pageoff] - cbz x18, 1b + ldr w18, [x17, EXT(lockdown_done)@pageoff] + cbz w18, 1b // load stashed rorgn_begin adrp x17, EXT(rorgn_begin)@page diff --git a/osfmk/bank/bank.c b/osfmk/bank/bank.c index a281a029a..5b4ba01d1 100644 --- a/osfmk/bank/bank.c +++ b/osfmk/bank/bank.c @@ -424,11 +424,12 @@ bank_get_value( panic("Bogus bank type: %d passed in get_value\n", bank_element->be_type); } - /* Change the persona-id to holder task's persona-id if the task is not spawned in system persona */ + /* Do not replace persona id if the task is not spawned in system persona */ if (unique_persona && bank_merchant->bt_persona_id != persona_get_id(system_persona) && - bank_merchant->bt_persona_id != persona_get_id(proxy_system_persona)) { - persona_id = bank_merchant->bt_persona_id; + bank_merchant->bt_persona_id != persona_get_id(proxy_system_persona) && + bank_merchant->bt_persona_id != persona_id) { + return KERN_INVALID_ARGUMENT; } if (bank_holder->bt_persona_id == persona_id) { diff --git a/osfmk/conf/files b/osfmk/conf/files index 31d2cbbbd..4d3aec134 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -201,6 +201,7 @@ osfmk/kern/hibernate.c optional hibernation osfmk/kern/remote_time.c standard osfmk/kern/memset_s.c standard osfmk/kern/copyout_shim.c optional copyout_shim +osfmk/kern/suid_cred.c standard ./mach/clock_server.c standard ./mach/clock_priv_server.c standard diff --git a/osfmk/device/device_types.h b/osfmk/device/device_types.h index 91d1b4770..2eb80ba77 100644 --- a/osfmk/device/device_types.h +++ b/osfmk/device/device_types.h @@ -70,6 +70,9 @@ #include #include #include +#ifdef MACH_KERNEL_PRIVATE +#include +#endif #if PRIVATE #define IOKIT_SERVER_VERSION 20190926 @@ -123,8 +126,10 @@ typedef struct IOObject * io_object_t; typedef io_object_t io_connect_t; typedef io_object_t uext_object_t; +extern void iokit_add_reference( io_object_t obj, natural_t type ); extern void iokit_remove_reference( io_object_t obj ); extern void iokit_remove_connect_reference( io_object_t obj ); +extern void iokit_port_object_description(io_object_t obj, kobject_description_t desc); extern io_object_t iokit_lookup_object_port( ipc_port_t port ); extern io_connect_t iokit_lookup_connect_port( ipc_port_t port ); diff --git a/osfmk/device/iokit_rpc.c b/osfmk/device/iokit_rpc.c index 
15866381e..c4c0bce85 100644 --- a/osfmk/device/iokit_rpc.c +++ b/osfmk/device/iokit_rpc.c @@ -86,7 +86,7 @@ iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type) iokit_lock_port(port); if (ip_active(port) && (ip_kotype(port) == type)) { - obj = (io_object_t) port->ip_kobject; + obj = (io_object_t) ip_get_kobject(port); iokit_add_reference( obj, type ); } else { obj = NULL; @@ -137,7 +137,7 @@ iokit_lookup_object_in_space_with_port_name(mach_port_name_t name, ipc_kobject_t iokit_lock_port(port); if (ip_kotype(port) == type) { - obj = (io_object_t) port->ip_kobject; + obj = (io_object_t) ip_get_kobject(port); iokit_add_reference(obj, type); } iokit_unlock_port(port); @@ -252,7 +252,12 @@ iokit_alloc_object_port( io_object_t obj, ipc_kobject_type_t type ) if (type == IKOT_IOKIT_CONNECT) { options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; } - return ipc_kobject_alloc_port((ipc_kobject_t) obj, type, options); + if (type == IKOT_UEXT_OBJECT) { + ipc_label_t label = IPC_LABEL_DEXT; + return ipc_kobject_alloc_labeled_port((ipc_kobject_t) obj, type, label, options); + } else { + return ipc_kobject_alloc_port((ipc_kobject_t) obj, type, options); + } } EXTERN kern_return_t @@ -345,7 +350,7 @@ iokit_no_senders( mach_no_senders_notification_t * notification ) if (IP_VALID(port)) { iokit_lock_port(port); if (ip_active(port)) { - obj = (io_object_t) port->ip_kobject; + obj = (io_object_t) ip_get_kobject(port); type = ip_kotype( port ); if ((IKOT_IOKIT_OBJECT == type) || (IKOT_IOKIT_CONNECT == type) @@ -400,6 +405,12 @@ iokit_notify( mach_msg_header_t * msg ) } } +kern_return_t +iokit_label_dext_task(task_t task) +{ + return ipc_space_add_label(task->itk_space, IPC_LABEL_DEXT); +} + /* need to create a pmap function to generalize */ unsigned int IODefaultCacheBits(addr64_t pa) diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 5aae6aaee..267b6259e 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -130,9 +130,9 @@ #include #if DEBUG || DEVELOPMENT -#define DPRINTF(x...) kprintf(x) +#define DPRINTF(x ...) kprintf(x) #else -#define DPRINTF(x...) +#define DPRINTF(x ...) 
#endif #ifndef ROUNDUP @@ -411,7 +411,7 @@ efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table) } gPEEFIRuntimeServices = runtime; - }while (FALSE); + } while (FALSE); } static void @@ -489,7 +489,7 @@ efi_set_tables_32(EFI_SYSTEM_TABLE_32 * system_table) DPRINTF(" ResetSystem : 0x%x\n", runtime->ResetSystem); gPEEFIRuntimeServices = runtime; - }while (FALSE); + } while (FALSE); } @@ -501,7 +501,7 @@ efi_init(void) kprintf("Initializing EFI runtime services\n"); - do{ + do { vm_offset_t vm_size, vm_addr; vm_map_offset_t phys_addr; EfiMemoryRange *mptr; @@ -554,7 +554,7 @@ efi_init(void) } else { efi_set_tables_32((EFI_SYSTEM_TABLE_32 *) ml_static_ptovirt(args->efiSystemTable)); } - }while (FALSE); + } while (FALSE); return; } @@ -578,7 +578,7 @@ hibernate_newruntime_map(void * map, vm_size_t map_size, uint32_t system_table_o kprintf("Reinitializing EFI runtime services\n"); - do{ + do { vm_offset_t vm_size, vm_addr; vm_map_offset_t phys_addr; EfiMemoryRange *mptr; @@ -647,7 +647,7 @@ hibernate_newruntime_map(void * map, vm_size_t map_size, uint32_t system_table_o } else { efi_set_tables_32((EFI_SYSTEM_TABLE_32 *) ml_static_ptovirt(args->efiSystemTable)); } - }while (FALSE); + } while (FALSE); kprintf("Done reinitializing EFI runtime services\n"); @@ -956,7 +956,7 @@ SavePanicInfo( /* Special handling of launchd died panics */ print_launchd_info(); } else { - panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), debugger_msg, FALSE, NULL); + panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80 : 48), debugger_msg, FALSE, NULL); } if (panic_options & DEBUGGER_OPTION_COPROC_INITIATED_PANIC) { @@ -1248,6 +1248,11 @@ panic_i386_backtrace(void *_frame, int nframes, const char *msg, boolean_t regdu int cn = cpu_number(); boolean_t old_doprnt_hide_pointers = doprnt_hide_pointers; +#if DEVELOPMENT || DEBUG + /* Turn off I/O tracing now that we're panicking */ + mmiotrace_enabled = 0; +#endif + if (pbtcpu != cn) { os_atomic_inc(&pbtcnt, relaxed); /* Spin on print backtrace lock, which serializes output diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index 1f69f62f3..9f0e07fe8 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -201,18 +201,24 @@ typedef struct cpu_data { struct cpu_data *cpu_this; /* pointer to myself */ thread_t cpu_active_thread; thread_t cpu_nthread; - volatile int cpu_preemption_level; int cpu_number; /* Logical CPU */ void *cpu_int_state; /* interrupt state */ vm_offset_t cpu_active_stack; /* kernel stack base */ vm_offset_t cpu_kernel_stack; /* kernel stack top */ vm_offset_t cpu_int_stack_top; - int cpu_interrupt_level; volatile int cpu_signals; /* IPI events */ volatile int cpu_prior_signals; /* Last set of events, * debugging */ ast_t cpu_pending_ast; + /* + * Note if rearranging fields: + * We want cpu_preemption_level on a different + * cache line than cpu_active_thread + * for optimizing mtx_spin phase. 
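+ * (Remote CPUs poll cpu_active_thread while adaptively spinning on
+ * a mutex, whereas cpu_preemption_level is written constantly by
+ * its own CPU; sharing a cache line would make the spinners' reads
+ * ping-pong between caches.)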
+ */ + int cpu_interrupt_level; + volatile int cpu_preemption_level; volatile int cpu_running; #if !MONOTONIC boolean_t cpu_fixed_pmcs_enabled; diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index 25a26de3b..ff6c8c1fc 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -872,9 +872,7 @@ cpuid_set_cpufamily(i386_cpu_info_t *info_p) break; case CPUID_MODEL_SKYLAKE: case CPUID_MODEL_SKYLAKE_DT: -#if !defined(RC_HIDE_XNU_J137) case CPUID_MODEL_SKYLAKE_W: -#endif cpufamily = CPUFAMILY_INTEL_SKYLAKE; break; case CPUID_MODEL_KABYLAKE: diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index a3a6ad6ee..146e77b15 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -262,12 +262,10 @@ #define CPUID_MODEL_SKYLAKE_ULT 0x4E #define CPUID_MODEL_SKYLAKE_ULX 0x4E #define CPUID_MODEL_SKYLAKE_DT 0x5E -#if !defined(RC_HIDE_XNU_J137) #define CPUID_MODEL_SKYLAKE_W 0x55 #define PLATID_XEON_SP_1 0x00 #define PLATID_XEON_SP_2 0x07 #define PLATID_MAYBE_XEON_SP 0x01 -#endif /* not RC_HIDE_XNU_J137 */ #define CPUID_MODEL_KABYLAKE 0x8E #define CPUID_MODEL_KABYLAKE_ULT 0x8E #define CPUID_MODEL_KABYLAKE_ULX 0x8E diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index b05c65b9a..1d7475429 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2019 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -145,37 +145,26 @@ fxsave64(struct x86_fx_thread_state *a) __asm__ __volatile__ ("fxsave64 %0" : "=m" (*a)); } -#if !defined(RC_HIDE_XNU_J137) #define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX || (x) == AVX512) -#else -#define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX) -#endif zone_t ifps_zone[] = { [FP] = NULL, [AVX] = NULL, -#if !defined(RC_HIDE_XNU_J137) [AVX512] = NULL -#endif }; static uint32_t fp_state_size[] = { [FP] = sizeof(struct x86_fx_thread_state), [AVX] = sizeof(struct x86_avx_thread_state), -#if !defined(RC_HIDE_XNU_J137) [AVX512] = sizeof(struct x86_avx512_thread_state) -#endif }; static const char *xstate_name[] = { [UNDEFINED] = "UNDEFINED", [FP] = "FP", [AVX] = "AVX", -#if !defined(RC_HIDE_XNU_J137) [AVX512] = "AVX512" -#endif }; -#if !defined(RC_HIDE_XNU_J137) #define fpu_ZMM_capable (fpu_capability == AVX512) #define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512) /* @@ -205,26 +194,16 @@ static const char *xstate_name[] = { * Note the initial state value is an AVX512 object but that the AVX initial * value is a subset of it. 
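 * (Background on the subset relation: the XSAVE save-area layouts
 * nest, so the legacy FP/SSE image is a prefix of the AVX image,
 * which is itself a prefix of the AVX512 image; a single maximal
 * initial image can therefore seed a savearea of any type.)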
*/ -#else -#define fpu_YMM_capable (fpu_capability == AVX) -#endif static uint32_t cpuid_reevaluated = 0; static void fpu_store_registers(void *, boolean_t); static void fpu_load_registers(void *); -#if !defined(RC_HIDE_XNU_J137) static const uint32_t xstate_xmask[] = { [FP] = FP_XMASK, [AVX] = AVX_XMASK, [AVX512] = AVX512_XMASK }; -#else -static const uint32_t xstate_xmask[] = { - [FP] = FP_XMASK, - [AVX] = AVX_XMASK, -}; -#endif static inline void xsave(struct x86_fx_thread_state *a, uint32_t rfbm) @@ -250,7 +229,6 @@ xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm) __asm__ __volatile__ ("xrstor64 %0" :: "m" (*a), "a"(rfbm), "d"(0)); } -#if !defined(RC_HIDE_XNU_J137) __unused static inline void vzeroupper(void) { @@ -349,8 +327,6 @@ DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp) } #endif /* DEBUG_AVX512 */ -#endif - #if DEBUG static inline unsigned short fnstsw(void) @@ -389,13 +365,11 @@ configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps) /* Clear vector register store */ bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg)); bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128)); -#if !defined(RC_HIDE_XNU_J137) if (fpu_ZMM_capable) { bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256)); bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM)); bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask)); } -#endif fps->fx.fp_valid = TRUE; fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32; @@ -458,7 +432,6 @@ init_fpu(void) PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc)); -#if !defined(RC_HIDE_XNU_J137) static boolean_t is_avx512_enabled = TRUE; if (cpu_number() == master_cpu) { if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) { @@ -467,14 +440,12 @@ init_fpu(void) is_avx512_enabled ? "and enabled" : "but disabled"); } } -#endif /* Configure the XSAVE context mechanism if the processor supports * AVX/YMM registers */ if (cpuid_features() & CPUID_FEATURE_XSAVE) { cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0]; -#if !defined(RC_HIDE_XNU_J137) if (is_avx512_enabled && (xs0p->extended_state[eax] & XFEM_ZMM) == XFEM_ZMM) { assert(xs0p->extended_state[eax] & XFEM_SSE); @@ -495,9 +466,7 @@ init_fpu(void) */ xsetbv(0, AVX_XMASK); fpu_default = AVX; - } else -#endif - if (xs0p->extended_state[eax] & XFEM_YMM) { + } else if (xs0p->extended_state[eax] & XFEM_YMM) { assert(xs0p->extended_state[eax] & XFEM_SSE); fpu_capability = AVX; fpu_default = AVX; @@ -636,9 +605,7 @@ fpu_store_registers(void *fstate, boolean_t is64) } break; case AVX: -#if !defined(RC_HIDE_XNU_J137) case AVX512: -#endif if (is64) { xsave64(ifps, xstate_xmask[xs]); ifps->fp_save_layout = XSAVE64; @@ -679,7 +646,6 @@ fpu_module_init(void) */ zone_change(ifps_zone[fpu_default], Z_ALIGNMENT_REQUIRED, TRUE); -#if !defined(RC_HIDE_XNU_J137) /* * If AVX512 is supported, create a separate savearea zone. 
* with allocation size: 19 pages = 32 * 2668 @@ -691,7 +657,6 @@ fpu_module_init(void) "x86 avx512 save state"); zone_change(ifps_zone[AVX512], Z_ALIGNMENT_REQUIRED, TRUE); } -#endif /* Determine MXCSR reserved bits and configure initial FPU state*/ configure_mxcsr_capability_mask(&initial_fp_state); @@ -784,6 +749,7 @@ fpu_set_fxstate( x86_float_state64_t *state; pcb_t pcb; boolean_t old_valid, fresh_state = FALSE; + xstate_t thr_xstate; if (fpu_capability == UNDEFINED) { return KERN_FAILURE; @@ -794,18 +760,22 @@ fpu_set_fxstate( return KERN_FAILURE; } -#if !defined(RC_HIDE_XNU_J137) + assert(thr_act != THREAD_NULL); + + thr_xstate = thread_xstate(thr_act); + if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) && - thread_xstate(thr_act) == AVX) { + thr_xstate == AVX) { if (!fpu_thread_promote_avx512(thr_act)) { return KERN_FAILURE; + } else { + /* Reload thr_xstate after successful promotion */ + thr_xstate = thread_xstate(thr_act); } } -#endif state = (x86_float_state64_t *)tstate; - assert(thr_act != THREAD_NULL); pcb = THREAD_TO_PCB(thr_act); if (state == NULL) { @@ -821,7 +791,7 @@ fpu_set_fxstate( simple_unlock(&pcb->lock); if (ifps != 0) { - fp_state_free(ifps, thread_xstate(thr_act)); + fp_state_free(ifps, thr_xstate); } } else { /* @@ -835,13 +805,13 @@ Retry: if (ifps == 0) { if (new_ifps == 0) { simple_unlock(&pcb->lock); - new_ifps = fp_state_alloc(thread_xstate(thr_act)); + new_ifps = fp_state_alloc(thr_xstate); goto Retry; } ifps = new_ifps; new_ifps = 0; pcb->ifps = ifps; - pcb->xstate = thread_xstate(thr_act); + pcb->xstate = thr_xstate; fresh_state = TRUE; } @@ -865,12 +835,12 @@ Retry: __nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]); - switch (thread_xstate(thr_act)) { + switch (thr_xstate) { case UNDEFINED_FULL: case FP_FULL: case AVX_FULL: case AVX512_FULL: - panic("fpu_set_fxstate() INVALID xstate: 0x%x", thread_xstate(thr_act)); + panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate); break; case UNDEFINED: @@ -899,7 +869,6 @@ Retry: } break; } -#if !defined(RC_HIDE_XNU_J137) case AVX512: { struct x86_avx512_thread_state *iavx = (void *) ifps; union { @@ -938,7 +907,6 @@ Retry: } break; } -#endif } ifps->fp_valid = old_valid; @@ -957,7 +925,7 @@ Retry: simple_unlock(&pcb->lock); if (new_ifps != 0) { - fp_state_free(new_ifps, thread_xstate(thr_act)); + fp_state_free(new_ifps, thr_xstate); } } return KERN_SUCCESS; @@ -979,6 +947,7 @@ fpu_get_fxstate( x86_float_state64_t *state; kern_return_t ret = KERN_FAILURE; pcb_t pcb; + xstate_t thr_xstate = thread_xstate(thr_act); if (fpu_capability == UNDEFINED) { return KERN_FAILURE; @@ -989,12 +958,10 @@ fpu_get_fxstate( return KERN_FAILURE; } -#if !defined(RC_HIDE_XNU_J137) if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) && - thread_xstate(thr_act) != AVX512) { + thr_xstate != AVX512) { return KERN_FAILURE; } -#endif state = (x86_float_state64_t *)tstate; @@ -1033,12 +1000,12 @@ fpu_get_fxstate( } if (ifps->fp_valid) { __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]); - switch (thread_xstate(thr_act)) { + switch (thr_xstate) { case UNDEFINED_FULL: case FP_FULL: case AVX_FULL: case AVX512_FULL: - panic("fpu_get_fxstate() INVALID xstate: 0x%x", thread_xstate(thr_act)); + panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate); break; case UNDEFINED: @@ -1056,7 +1023,6 @@ fpu_get_fxstate( } break; } -#if !defined(RC_HIDE_XNU_J137) case AVX512: { struct x86_avx512_thread_state *iavx = (void *) ifps; union { @@ -1087,7 +1053,6 @@ fpu_get_fxstate( } break; } 
-#endif } ret = KERN_SUCCESS; @@ -1460,12 +1425,12 @@ fpSSEexterrflt(void) } -#if !defined(RC_HIDE_XNU_J137) /* * If a thread is using an AVX-sized savearea: * - allocate a new AVX512-sized area, * - copy the 256-bit state into the 512-bit area, * - deallocate the smaller area + * ASSUMES: thread is the current thread. */ static void fpu_savearea_promote_avx512(thread_t thread) @@ -1474,8 +1439,11 @@ fpu_savearea_promote_avx512(thread_t thread) struct x86_avx512_thread_state *ifps512 = NULL; pcb_t pcb = THREAD_TO_PCB(thread); boolean_t do_avx512_alloc = FALSE; + boolean_t intr; - DBG("fpu_upgrade_savearea(%p)\n", thread); + assert(thread == current_thread()); + + DBG("fpu_savearea_promote_avx512(%p)\n", thread); simple_lock(&pcb->lock, LCK_GRP_NULL); @@ -1483,11 +1451,21 @@ fpu_savearea_promote_avx512(thread_t thread) if (ifps == NULL) { pcb->xstate = AVX512; simple_unlock(&pcb->lock); - if (thread != current_thread()) { - /* nothing to be done */ + /* + * Now that the PCB xstate has been promoted, set XCR0 so + * that we don't re-trip #UD on the next AVX-512 instruction. + * + * Since this branch is taken when the first FP instruction + * attempted by this thread is an AVX-512 instruction, we + * call fpnoextflt() to allocate an appropriately-sized + * AVX-512 save-area, thereby avoiding the overhead of another + * fault that would be triggered immediately on return. + */ + intr = ml_set_interrupts_enabled(FALSE); + xsetbv(0, AVX512_XMASK); + current_cpu_datap()->cpu_xstate = AVX512; + (void)ml_set_interrupts_enabled(intr); - return; - } fpnoextflt(); return; } @@ -1495,6 +1473,7 @@ fpu_savearea_promote_avx512(thread_t thread) if (pcb->xstate != AVX512) { do_avx512_alloc = TRUE; } + simple_unlock(&pcb->lock); if (do_avx512_alloc == TRUE) { @@ -1502,19 +1481,17 @@ fpu_savearea_promote_avx512(thread_t thread) } simple_lock(&pcb->lock, LCK_GRP_NULL); - if (thread == current_thread()) { - boolean_t intr; - intr = ml_set_interrupts_enabled(FALSE); + intr = ml_set_interrupts_enabled(FALSE); - clear_ts(); - fp_save(thread); - clear_fpu(); + clear_ts(); + fp_save(thread); + clear_fpu(); + + xsetbv(0, AVX512_XMASK); + current_cpu_datap()->cpu_xstate = AVX512; + (void)ml_set_interrupts_enabled(intr); - xsetbv(0, AVX512_XMASK); - current_cpu_datap()->cpu_xstate = AVX512; - (void)ml_set_interrupts_enabled(intr); - } assert(ifps->fp.fp_valid); /* Allocate an AVX512 savearea and copy AVX state into it */ @@ -1568,9 +1545,10 @@ fpu_thread_promote_avx512(thread_t thread) * If the user is attempting an AVX512 instruction on a machine * that supports this, we switch the calling thread to use * a larger savearea, set its XCR0 bit mask to enable AVX512 and - * return directly via thread_exception_return(). - * Otherwise simply return. + * return to user_trap() with a 0 return value. + * Otherwise, simply return a nonzero value. 
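+ *
+ * A rough sketch of the successful path, using the names in this file:
+ *
+ *   fpUDflt(rip)                      // decode insn at rip: AVX-512?
+ *     fpu_thread_promote_avx512(current_thread())
+ *       fpu_savearea_promote_avx512() // larger savearea; xsetbv(0, AVX512_XMASK)
+ *   return 0                          // user_trap() retries the instruction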
*/ + #define MAX_X86_INSN_LENGTH (15) int fpUDflt(user_addr_t rip) @@ -1632,7 +1610,6 @@ fpUDflt(user_addr_t rip) return 0; } -#endif /* !defined(RC_HIDE_XNU_J137) */ void fp_setvalid(boolean_t value) @@ -1657,13 +1634,11 @@ ml_fpu_avx_enabled(void) return fpu_capability >= AVX; } -#if !defined(RC_HIDE_XNU_J137) boolean_t ml_fpu_avx512_enabled(void) { return fpu_capability == AVX512; } -#endif static xstate_t task_xstate(task_t task) diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 8f20ce6eb..70fa633d7 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -829,7 +829,8 @@ i386_init(void) #endif /* MONOTONIC */ processor_bootstrap(); - thread_bootstrap(); + thread_t thread = thread_bootstrap(); + machine_set_current_thread(thread); pstate_trace(); kernel_debug_string_early("machine_startup"); diff --git a/osfmk/i386/locks.h b/osfmk/i386/locks.h index e553bc4a0..760a167cf 100644 --- a/osfmk/i386/locks.h +++ b/osfmk/i386/locks.h @@ -115,11 +115,18 @@ typedef struct _lck_mtx_ { /* Adaptive spin before blocking */ extern uint64_t MutexSpin; +extern uint64_t low_MutexSpin; +extern int64_t high_MutexSpin; typedef enum lck_mtx_spinwait_ret_type { LCK_MTX_SPINWAIT_ACQUIRED = 0, - LCK_MTX_SPINWAIT_SPUN = 1, - LCK_MTX_SPINWAIT_NO_SPIN = 2, + + LCK_MTX_SPINWAIT_SPUN_HIGH_THR = 1, + LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE = 2, + LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION = 3, + LCK_MTX_SPINWAIT_SPUN_SLIDING_THR = 4, + + LCK_MTX_SPINWAIT_NO_SPIN = 5, } lck_mtx_spinwait_ret_type_t; extern lck_mtx_spinwait_ret_type_t lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex); diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index c5b0d3037..25b329345 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -84,33 +84,35 @@ #include #include #include +#include +#include -#if CONFIG_DTRACE -#define DTRACE_RW_SHARED 0x0 //reader -#define DTRACE_RW_EXCL 0x1 //writer -#define DTRACE_NO_FLAG 0x0 //not applicable +#if CONFIG_DTRACE +#define DTRACE_RW_SHARED 0x0 //reader +#define DTRACE_RW_EXCL 0x1 //writer +#define DTRACE_NO_FLAG 0x0 //not applicable #endif /* CONFIG_DTRACE */ -#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100 -#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101 -#define LCK_RW_LCK_SHARED_CODE 0x102 -#define LCK_RW_LCK_SH_TO_EX_CODE 0x103 -#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 -#define LCK_RW_LCK_EX_TO_SH_CODE 0x105 +#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100 +#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101 +#define LCK_RW_LCK_SHARED_CODE 0x102 +#define LCK_RW_LCK_SH_TO_EX_CODE 0x103 +#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104 +#define LCK_RW_LCK_EX_TO_SH_CODE 0x105 -#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106 -#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107 -#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108 -#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109 -#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110 -#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111 -#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112 -#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113 +#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106 +#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107 +#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108 +#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109 +#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110 +#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111 +#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112 +#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113 -#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) +#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG) -unsigned int LcksOpts=0; +unsigned int LcksOpts = 0; #if DEVELOPMENT || DEBUG 
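One behavioral note on the locks.h hunk above: LCK_MTX_SPINWAIT_SPUN is split into four reason-coded variants, and MutexSpin gains low_MutexSpin/high_MutexSpin companions for an adaptive spin window. Callers that only care about acquired versus spun-out versus never-spun can group the SPUN_* values, while tracing and statistics can now tell the spin-abort reasons apart. A simplified, compile-and-run rendition of the caller-side pattern; the handler and short names below are illustrative, and the real consumer is lck_mtx_lock_contended() further down in this patch:

#include <stdio.h>

typedef enum {
    SPINWAIT_ACQUIRED = 0,
    SPINWAIT_SPUN_HIGH_THR = 1,
    SPINWAIT_SPUN_OWNER_NOT_CORE = 2,
    SPINWAIT_SPUN_NO_WINDOW_CONTENTION = 3,
    SPINWAIT_SPUN_SLIDING_THR = 4,
    SPINWAIT_NO_SPIN = 5,
} spinwait_ret_t;

static void handle(spinwait_ret_t r)
{
    switch (r) {
    case SPINWAIT_ACQUIRED:
        puts("lock taken while spinning");
        break;
    case SPINWAIT_SPUN_HIGH_THR:
    case SPINWAIT_SPUN_OWNER_NOT_CORE:
    case SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
    case SPINWAIT_SPUN_SLIDING_THR:
        /* all SPUN_* variants still mean "spun without acquiring" */
        printf("spun out (reason %d), fall back to blocking\n", (int)r);
        break;
    case SPINWAIT_NO_SPIN:
        puts("did not spin at all, block immediately");
        break;
    }
}

int main(void)
{
    handle(SPINWAIT_SPUN_SLIDING_THR);
    return 0;
}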
unsigned int LckDisablePreemptCheck = 0; @@ -118,15 +120,15 @@ unsigned int LckDisablePreemptCheck = 0; /* Forwards */ -#if USLOCK_DEBUG +#if USLOCK_DEBUG /* * Perform simple lock checks. */ -int uslock_check = 1; -int max_lock_loops = 100000000; -decl_simple_lock_data(extern , printf_lock); -decl_simple_lock_data(extern , panic_lock); -#endif /* USLOCK_DEBUG */ +int uslock_check = 1; +int max_lock_loops = 100000000; +decl_simple_lock_data(extern, printf_lock); +decl_simple_lock_data(extern, panic_lock); +#endif /* USLOCK_DEBUG */ extern unsigned int not_in_kdp; @@ -135,23 +137,23 @@ extern unsigned int not_in_kdp; * of the various lock routines. However, this information * is only used for debugging and statistics. */ -typedef void *pc_t; -#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) -#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) -#if ANY_LOCK_DEBUG -#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC()) -#define DECL_PC(pc) pc_t pc; -#else /* ANY_LOCK_DEBUG */ +typedef void *pc_t; +#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS) +#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS) +#if ANY_LOCK_DEBUG +#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC()) +#define DECL_PC(pc) pc_t pc; +#else /* ANY_LOCK_DEBUG */ #define DECL_PC(pc) -#ifdef lint +#ifdef lint /* * Eliminate lint complaints about unused local pc variables. */ -#define OBTAIN_PC(pc) ++pc -#else /* lint */ -#define OBTAIN_PC(pc) -#endif /* lint */ -#endif /* USLOCK_DEBUG */ +#define OBTAIN_PC(pc) ++pc +#else /* lint */ +#define OBTAIN_PC(pc) +#endif /* lint */ +#endif /* USLOCK_DEBUG */ /* * atomic exchange API is a low level abstraction of the operations @@ -166,9 +168,9 @@ typedef void *pc_t; static uint32_t atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord) { - uint32_t val; + uint32_t val; - (void)ord; // Memory order not used + (void)ord; // Memory order not used val = os_atomic_load(target, relaxed); *previous = val; return val; @@ -181,25 +183,29 @@ atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, } static void -atomic_exchange_abort(void) { } +atomic_exchange_abort(void) +{ +} static boolean_t atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait) { - uint32_t value, prev; + uint32_t value, prev; - for ( ; ; ) { + for (;;) { value = atomic_exchange_begin32(target, &prev, ord); if (value & test_mask) { - if (wait) + if (wait) { cpu_pause(); - else + } else { atomic_exchange_abort(); + } return FALSE; } value |= set_mask; - if (atomic_exchange_complete32(target, prev, value, ord)) + if (atomic_exchange_complete32(target, prev, value, ord)) { return TRUE; + } } } @@ -213,18 +219,18 @@ hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask * Portable lock package implementation of usimple_locks. 
*/ -#if USLOCK_DEBUG -#define USLDBG(stmt) stmt -void usld_lock_init(usimple_lock_t, unsigned short); -void usld_lock_pre(usimple_lock_t, pc_t); -void usld_lock_post(usimple_lock_t, pc_t); -void usld_unlock(usimple_lock_t, pc_t); -void usld_lock_try_pre(usimple_lock_t, pc_t); -void usld_lock_try_post(usimple_lock_t, pc_t); -int usld_lock_common_checks(usimple_lock_t, char *); -#else /* USLOCK_DEBUG */ -#define USLDBG(stmt) -#endif /* USLOCK_DEBUG */ +#if USLOCK_DEBUG +#define USLDBG(stmt) stmt +void usld_lock_init(usimple_lock_t, unsigned short); +void usld_lock_pre(usimple_lock_t, pc_t); +void usld_lock_post(usimple_lock_t, pc_t); +void usld_unlock(usimple_lock_t, pc_t); +void usld_lock_try_pre(usimple_lock_t, pc_t); +void usld_lock_try_post(usimple_lock_t, pc_t); +int usld_lock_common_checks(usimple_lock_t, char *); +#else /* USLOCK_DEBUG */ +#define USLDBG(stmt) +#endif /* USLOCK_DEBUG */ /* * Forward definitions @@ -243,7 +249,6 @@ static boolean_t lck_rw_grab_shared(lck_rw_t *lock); static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect); static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state); static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state); -static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state); static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state); static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state); static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state); @@ -254,15 +259,16 @@ static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint3 */ lck_spin_t * lck_spin_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) + lck_grp_t *grp, + lck_attr_t *attr) { - lck_spin_t *lck; + lck_spin_t *lck; - if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) + if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) { lck_spin_init(lck, grp, attr); + } - return(lck); + return lck; } /* @@ -270,8 +276,8 @@ lck_spin_alloc_init( */ void lck_spin_free( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { lck_spin_destroy(lck, grp); kfree(lck, sizeof(lck_spin_t)); @@ -282,9 +288,9 @@ lck_spin_free( */ void lck_spin_init( - lck_spin_t *lck, - lck_grp_t *grp, - __unused lck_attr_t *attr) + lck_spin_t *lck, + lck_grp_t *grp, + __unused lck_attr_t *attr) { usimple_lock_init((usimple_lock_t) lck, 0); if (grp) { @@ -298,11 +304,12 @@ lck_spin_init( */ void lck_spin_destroy( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { - if (lck->interlock == LCK_SPIN_TAG_DESTROYED) + if (lck->interlock == LCK_SPIN_TAG_DESTROYED) { return; + } lck->interlock = LCK_SPIN_TAG_DESTROYED; if (grp) { lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN); @@ -316,8 +323,8 @@ lck_spin_destroy( */ void lck_spin_lock_grp( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { #pragma unused(grp) usimple_lock((usimple_lock_t) lck, grp); @@ -325,7 +332,7 @@ lck_spin_lock_grp( void lck_spin_lock( - lck_spin_t *lck) + lck_spin_t *lck) { usimple_lock((usimple_lock_t) lck, NULL); } @@ -335,24 +342,24 @@ lck_spin_lock( */ void lck_spin_unlock( - lck_spin_t *lck) + lck_spin_t *lck) { usimple_unlock((usimple_lock_t) lck); } boolean_t lck_spin_try_lock_grp( - lck_spin_t *lck, - lck_grp_t *grp) + lck_spin_t *lck, + lck_grp_t *grp) { #pragma unused(grp) boolean_t lrval = 
(boolean_t)usimple_lock_try((usimple_lock_t) lck, grp); -#if DEVELOPMENT || DEBUG +#if DEVELOPMENT || DEBUG if (lrval) { pltrace(FALSE); } #endif - return(lrval); + return lrval; } @@ -361,15 +368,15 @@ lck_spin_try_lock_grp( */ boolean_t lck_spin_try_lock( - lck_spin_t *lck) + lck_spin_t *lck) { boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL); -#if DEVELOPMENT || DEBUG +#if DEVELOPMENT || DEBUG if (lrval) { pltrace(FALSE); } #endif - return(lrval); + return lrval; } /* @@ -410,7 +417,8 @@ lck_spin_assert(lck_spin_t *lock, unsigned int type) * Returns: TRUE if lock is acquired. */ boolean_t -kdp_lck_spin_is_acquired(lck_spin_t *lck) { +kdp_lck_spin_is_acquired(lck_spin_t *lck) +{ if (not_in_kdp) { panic("panic: spinlock acquired check done outside of kernel debugger"); } @@ -424,21 +432,23 @@ kdp_lck_spin_is_acquired(lck_spin_t *lck) { */ void usimple_lock_init( - usimple_lock_t l, - __unused unsigned short tag) + usimple_lock_t l, + __unused unsigned short tag) { -#ifndef MACHINE_SIMPLE_LOCK +#ifndef MACHINE_SIMPLE_LOCK USLDBG(usld_lock_init(l, tag)); hw_lock_init(&l->interlock); #else - simple_lock_init((simple_lock_t)l,tag); + simple_lock_init((simple_lock_t)l, tag); #endif } volatile uint32_t spinlock_owner_cpu = ~0; volatile usimple_lock_t spinlock_timed_out; -uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { +uint32_t +spinlock_timeout_NMI(uintptr_t thread_addr) +{ uint32_t i; for (i = 0; i < real_ncpus; i++) { @@ -464,21 +474,22 @@ uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) { */ void (usimple_lock)( - usimple_lock_t l + usimple_lock_t l LCK_GRP_ARG(lck_grp_t *grp)) { -#ifndef MACHINE_SIMPLE_LOCK +#ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); OBTAIN_PC(pc); USLDBG(usld_lock_pre(l, pc)); - if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) { + if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) { boolean_t uslock_acquired = FALSE; while (machine_timeout_suspended()) { enable_preemption(); - if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) + if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) { break; + } } if (uslock_acquired == FALSE) { @@ -487,11 +498,11 @@ void spinlock_timed_out = l; lock_cpu = spinlock_timeout_NMI(lowner); panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu", - l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time()); + l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time()); } } #if DEVELOPMENT || DEBUG - pltrace(FALSE); + pltrace(FALSE); #endif USLDBG(usld_lock_post(l, pc)); @@ -513,15 +524,15 @@ void */ void usimple_unlock( - usimple_lock_t l) + usimple_lock_t l) { -#ifndef MACHINE_SIMPLE_LOCK +#ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); OBTAIN_PC(pc); USLDBG(usld_unlock(l, pc)); #if DEVELOPMENT || DEBUG - pltrace(TRUE); + pltrace(TRUE); #endif hw_lock_unlock(&l->interlock); #else @@ -544,11 +555,11 @@ usimple_unlock( */ unsigned int usimple_lock_try( - usimple_lock_t l, + usimple_lock_t l, lck_grp_t *grp) { -#ifndef MACHINE_SIMPLE_LOCK - unsigned int success; +#ifndef MACHINE_SIMPLE_LOCK + unsigned int success; DECL_PC(pc); OBTAIN_PC(pc); @@ -557,11 +568,11 @@ usimple_lock_try( #if DEVELOPMENT || DEBUG pltrace(FALSE); #endif - USLDBG(usld_lock_try_post(l, pc)); + USLDBG(usld_lock_try_post(l, pc)); } return success; #else - 
return(simple_lock_try((simple_lock_t)l, grp)); + return simple_lock_try((simple_lock_t)l, grp); #endif } @@ -570,10 +581,11 @@ usimple_lock_try( * and spinning on a lock. * */ -unsigned int +unsigned +int (usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l, - uint64_t deadline - LCK_GRP_ARG(lck_grp_t *grp)) + uint64_t deadline + LCK_GRP_ARG(lck_grp_t *grp)) { boolean_t istate = ml_get_interrupts_enabled(); @@ -582,9 +594,10 @@ unsigned int } while (!simple_lock_try(l, grp)) { - if (!istate) + if (!istate) { cpu_signal_handler(NULL); - + } + if (deadline < mach_absolute_time()) { return 0; } @@ -597,15 +610,16 @@ unsigned int void (usimple_lock_try_lock_loop)(usimple_lock_t l - LCK_GRP_ARG(lck_grp_t *grp)) + LCK_GRP_ARG(lck_grp_t *grp)) { usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp); } -unsigned int +unsigned +int (usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l, - uint64_t duration - LCK_GRP_ARG(lck_grp_t *grp)) + uint64_t duration + LCK_GRP_ARG(lck_grp_t *grp)) { uint64_t deadline; uint64_t base_at = mach_absolute_time(); @@ -621,17 +635,17 @@ unsigned int return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp); } -#if USLOCK_DEBUG +#if USLOCK_DEBUG /* * States of a usimple_lock. The default when initializing * a usimple_lock is setting it up for debug checking. */ -#define USLOCK_CHECKED 0x0001 /* lock is being checked */ -#define USLOCK_TAKEN 0x0002 /* lock has been taken */ -#define USLOCK_INIT 0xBAA0 /* lock has been initialized */ -#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED) -#define USLOCK_CHECKING(l) (uslock_check && \ - ((l)->debug.state & USLOCK_CHECKED)) +#define USLOCK_CHECKED 0x0001 /* lock is being checked */ +#define USLOCK_TAKEN 0x0002 /* lock has been taken */ +#define USLOCK_INIT 0xBAA0 /* lock has been initialized */ +#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED) +#define USLOCK_CHECKING(l) (uslock_check && \ + ((l)->debug.state & USLOCK_CHECKED)) /* * Initialize the debugging information contained @@ -639,11 +653,12 @@ unsigned int */ void usld_lock_init( - usimple_lock_t l, - __unused unsigned short tag) + usimple_lock_t l, + __unused unsigned short tag) { - if (l == USIMPLE_LOCK_NULL) + if (l == USIMPLE_LOCK_NULL) { panic("lock initialization: null lock pointer"); + } l->lock_type = USLOCK_TAG; l->debug.state = uslock_check ? 
USLOCK_INITIALIZED : 0; l->debug.lock_cpu = l->debug.unlock_cpu = 0; @@ -662,15 +677,18 @@ usld_lock_init( */ int usld_lock_common_checks( - usimple_lock_t l, - char *caller) + usimple_lock_t l, + char *caller) { - if (l == USIMPLE_LOCK_NULL) + if (l == USIMPLE_LOCK_NULL) { panic("%s: null lock pointer", caller); - if (l->lock_type != USLOCK_TAG) + } + if (l->lock_type != USLOCK_TAG) { panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type); - if (!(l->debug.state & USLOCK_INIT)) + } + if (!(l->debug.state & USLOCK_INIT)) { panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state); + } return USLOCK_CHECKING(l); } @@ -682,14 +700,15 @@ usld_lock_common_checks( /* ARGSUSED */ void usld_lock_pre( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - char caller[] = "usimple_lock"; + char caller[] = "usimple_lock"; - if (!usld_lock_common_checks(l, caller)) + if (!usld_lock_common_checks(l, caller)) { return; + } /* * Note that we have a weird case where we are getting a lock when we are] @@ -702,9 +721,9 @@ usld_lock_pre( if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread && l->debug.lock_thread == (void *) current_thread()) { printf("%s: lock %p already locked (at %p) by", - caller, l, l->debug.lock_pc); + caller, l, l->debug.lock_pc); printf(" current thread %p (new attempt at pc %p)\n", - l->debug.lock_thread, pc); + l->debug.lock_thread, pc); panic("%s", caller); } mp_disable_preemption(); @@ -720,22 +739,25 @@ usld_lock_pre( */ void usld_lock_post( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - int mycpu; - char caller[] = "successful usimple_lock"; + int mycpu; + char caller[] = "successful usimple_lock"; - if (!usld_lock_common_checks(l, caller)) + if (!usld_lock_common_checks(l, caller)) { return; + } - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) + if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { panic("%s: lock %p became uninitialized", - caller, l); - if ((l->debug.state & USLOCK_TAKEN)) + caller, l); + } + if ((l->debug.state & USLOCK_TAKEN)) { panic("%s: lock 0x%p became TAKEN by someone else", - caller, l); + caller, l); + } mycpu = cpu_number(); l->debug.lock_thread = (void *)current_thread(); @@ -755,27 +777,30 @@ usld_lock_post( */ void usld_unlock( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - int mycpu; - char caller[] = "usimple_unlock"; + int mycpu; + char caller[] = "usimple_unlock"; - if (!usld_lock_common_checks(l, caller)) + if (!usld_lock_common_checks(l, caller)) { return; + } mycpu = cpu_number(); - if (!(l->debug.state & USLOCK_TAKEN)) + if (!(l->debug.state & USLOCK_TAKEN)) { panic("%s: lock 0x%p hasn't been taken", - caller, l); - if (l->debug.lock_thread != (void *) current_thread()) + caller, l); + } + if (l->debug.lock_thread != (void *) current_thread()) { panic("%s: unlocking lock 0x%p, owned by thread %p", - caller, l, l->debug.lock_thread); + caller, l, l->debug.lock_thread); + } if (l->debug.lock_cpu != mycpu) { printf("%s: unlocking lock 0x%p on cpu 0x%x", - caller, l, mycpu); + caller, l, mycpu); printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu); panic("%s", caller); } @@ -796,13 +821,14 @@ usld_unlock( */ void usld_lock_try_pre( - usimple_lock_t l, - __unused pc_t pc) + usimple_lock_t l, + __unused pc_t pc) { - char caller[] = "usimple_lock_try"; + char caller[] = "usimple_lock_try"; - if (!usld_lock_common_checks(l, caller)) + if (!usld_lock_common_checks(l, caller)) { return; + } } @@ -816,21 +842,24 @@ 
usld_lock_try_pre( */ void usld_lock_try_post( - usimple_lock_t l, - pc_t pc) + usimple_lock_t l, + pc_t pc) { - int mycpu; - char caller[] = "successful usimple_lock_try"; + int mycpu; + char caller[] = "successful usimple_lock_try"; - if (!usld_lock_common_checks(l, caller)) + if (!usld_lock_common_checks(l, caller)) { return; + } - if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) + if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) { panic("%s: lock 0x%p became uninitialized", - caller, l); - if ((l->debug.state & USLOCK_TAKEN)) + caller, l); + } + if ((l->debug.state & USLOCK_TAKEN)) { panic("%s: lock 0x%p became TAKEN by someone else", - caller, l); + caller, l); + } mycpu = cpu_number(); l->debug.lock_thread = (void *) current_thread(); @@ -838,23 +867,24 @@ usld_lock_try_post( l->debug.lock_pc = pc; l->debug.lock_cpu = mycpu; } -#endif /* USLOCK_DEBUG */ +#endif /* USLOCK_DEBUG */ /* * Routine: lck_rw_alloc_init */ lck_rw_t * lck_rw_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) { - lck_rw_t *lck; + lck_grp_t *grp, + lck_attr_t *attr) +{ + lck_rw_t *lck; if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) { bzero(lck, sizeof(lck_rw_t)); lck_rw_init(lck, grp, attr); } - return(lck); + return lck; } /* @@ -862,8 +892,9 @@ lck_rw_alloc_init( */ void lck_rw_free( - lck_rw_t *lck, - lck_grp_t *grp) { + lck_rw_t *lck, + lck_grp_t *grp) +{ lck_rw_destroy(lck, grp); kfree(lck, sizeof(lck_rw_t)); } @@ -873,12 +904,12 @@ lck_rw_free( */ void lck_rw_init( - lck_rw_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) + lck_rw_t *lck, + lck_grp_t *grp, + lck_attr_t *attr) { - lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ? - attr : &LockDefaultLckAttr; + lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ? + attr : &LockDefaultLckAttr; hw_lock_byte_init(&lck->lck_rw_interlock); lck->lck_rw_want_write = FALSE; @@ -888,7 +919,7 @@ lck_rw_init( lck->lck_r_waiting = lck->lck_w_waiting = 0; lck->lck_rw_tag = 0; lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val & - LCK_ATTR_RW_SHARED_PRIORITY) == 0); + LCK_ATTR_RW_SHARED_PRIORITY) == 0); lck_grp_reference(grp); lck_grp_lckcnt_incr(grp, LCK_TYPE_RW); @@ -899,11 +930,12 @@ lck_rw_init( */ void lck_rw_destroy( - lck_rw_t *lck, - lck_grp_t *grp) + lck_rw_t *lck, + lck_grp_t *grp) { - if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) + if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) { return; + } #if MACH_LDEBUG lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD); #endif @@ -929,7 +961,7 @@ lck_rw_destroy( static inline boolean_t lck_interlock_lock(lck_rw_t *lck) { - boolean_t istate; + boolean_t istate; istate = ml_set_interrupts_enabled(FALSE); hw_lock_byte_lock(&lck->lck_rw_interlock); @@ -952,16 +984,18 @@ lck_interlock_unlock(lck_rw_t *lck, boolean_t istate) static inline void lck_rw_lock_pause(boolean_t interrupts_enabled) { - if (!interrupts_enabled) + if (!interrupts_enabled) { handle_pending_TLB_flushes(); + } cpu_pause(); } static inline boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock) { - if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) + if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) { return TRUE; + } return FALSE; } @@ -976,7 +1010,7 @@ lck_rw_deadline_for_spin(lck_rw_t *lck) if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) { /* * there are already threads waiting on this lock... 
this - * implies that they have spun beyond their deadlines waiting for + * implies that they have spun beyond their deadlines waiting for * the desired state to show up so we will not bother spinning at this time... * or * the current number of threads sharing this lock exceeds our capacity to run them @@ -984,11 +1018,12 @@ lck_rw_deadline_for_spin(lck_rw_t *lck) * to be at 0, we'll not bother spinning since the latency for this to happen is * unpredictable... */ - return (mach_absolute_time()); + return mach_absolute_time(); } - return (mach_absolute_time() + MutexSpin); - } else - return (mach_absolute_time() + (100000LL * 1000000000LL)); + return mach_absolute_time() + MutexSpin; + } else { + return mach_absolute_time() + (100000LL * 1000000000LL); + } } @@ -1007,12 +1042,13 @@ lck_rw_interlock_spin(lck_rw_t *lock) static boolean_t lck_rw_grab_want(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed); - if ((data & LCK_RW_INTERLOCK) == 0) + if ((data & LCK_RW_INTERLOCK) == 0) { break; + } atomic_exchange_abort(); lck_rw_interlock_spin(lock); } @@ -1027,12 +1063,13 @@ lck_rw_grab_want(lck_rw_t *lock) static boolean_t lck_rw_grab_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); - if ((data & LCK_RW_INTERLOCK) == 0) + if ((data & LCK_RW_INTERLOCK) == 0) { break; + } atomic_exchange_abort(); lck_rw_interlock_spin(lock); } @@ -1051,19 +1088,19 @@ lck_rw_grab_shared(lck_rw_t *lock) */ static void lck_rw_lock_exclusive_gen( - lck_rw_t *lck) + lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - uint64_t deadline = 0; - int slept = 0; - int gotlock = 0; - int lockheld = 0; - wait_result_t res = 0; - boolean_t istate = -1; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int slept = 0; + int gotlock = 0; + int lockheld = 0; + wait_result_t res = 0; + boolean_t istate = -1; -#if CONFIG_DTRACE +#if CONFIG_DTRACE boolean_t dtrace_ls_initialized = FALSE; - boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE; + boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE; uint64_t wait_interval = 0; int readers_at_sleep = 0; #endif @@ -1071,9 +1108,8 @@ lck_rw_lock_exclusive_gen( /* * Try to acquire the lck_rw_want_write bit. 
*/ - while ( !lck_rw_grab_want(lck)) { - -#if CONFIG_DTRACE + while (!lck_rw_grab_want(lck)) { +#if CONFIG_DTRACE if (dtrace_ls_initialized == FALSE) { dtrace_ls_initialized = TRUE; dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0); @@ -1089,38 +1125,39 @@ lck_rw_lock_exclusive_gen( } } #endif - if (istate == -1) + if (istate == -1) { istate = ml_get_interrupts_enabled(); + } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); - while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) + while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) { lck_rw_lock_pause(istate); + } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0); - if (gotlock) + if (gotlock) { break; + } /* * if we get here, the deadline has expired w/o us * being able to grab the lock exclusively * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { - istate = lck_interlock_lock(lck); if (lck->lck_rw_want_write) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); lck->lck_w_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1147,8 +1184,7 @@ lck_rw_lock_exclusive_gen( * and the interlock not held, we are safe to proceed */ while (lck_rw_held_read_or_upgrade(lck)) { - -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * Either sleeping or spinning is happening, start * a timing of our delay interval now. If we set it @@ -1170,27 +1206,29 @@ lck_rw_lock_exclusive_gen( } } #endif - if (istate == -1) + if (istate == -1) { istate = ml_get_interrupts_enabled(); + } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0); - while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) + while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) { lck_rw_lock_pause(istate); + } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0); - if ( !lockheld) + if (!lockheld) { break; + } /* * if we get here, the deadline has expired w/o us * being able to grab the lock exclusively * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { - istate = lck_interlock_lock(lck); if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) { @@ -1200,7 +1238,7 @@ lck_rw_lock_exclusive_gen( thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1220,7 +1258,7 @@ lck_rw_lock_exclusive_gen( } } -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * Decide what latencies we suffered that are Dtrace events. * If we have set wait_interval, then we either spun or slept. 
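lck_rw_lock_exclusive_gen() above follows a spin-then-block shape that recurs in all of these generic paths: attempt the grab, spin until the deadline computed by lck_rw_deadline_for_spin(), and only then take the interlock, re-check the state, publish a waiter bit, and sleep. A heavily condensed userspace analogue, with a pthread condvar standing in for assert_wait()/thread_block(), a fixed spin budget standing in for the time deadline, and the lck_rw_can_sleep check omitted; every name here is invented:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t interlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  writer_event = PTHREAD_COND_INITIALIZER;
static bool want_write;     /* the lck_rw_want_write analogue */
static bool w_waiting;      /* the lck_w_waiting analogue */

static bool grab_want(void)
{
    bool got = false;
    pthread_mutex_lock(&interlock);
    if (!want_write) {
        want_write = true;
        got = true;
    }
    pthread_mutex_unlock(&interlock);
    return got;
}

static void lock_exclusive_gen(void)
{
    while (!grab_want()) {
        /* spin phase: bounded budget instead of a time deadline */
        for (int spins = 0; spins < 1000; spins++) {
            if (grab_want()) {
                return;
            }
        }
        /* deadline expired: re-check under the interlock and block */
        pthread_mutex_lock(&interlock);
        if (want_write) {
            w_waiting = true;               /* publish before sleeping */
            while (w_waiting) {
                /* a real waker clears w_waiting and signals */
                pthread_cond_wait(&writer_event, &interlock);
            }
        }
        pthread_mutex_unlock(&interlock);
    }
}

int main(void)
{
    lock_exclusive_gen();                   /* uncontended: fast exit */
    puts("exclusive acquired");
    return 0;
}

The key ordering, preserved from the kernel code, is that the waiter bit is set and the wait is asserted while the interlock is still held, so a concurrent unlock cannot miss the waiter.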
@@ -1254,40 +1292,46 @@ lck_rw_lock_exclusive_gen( * Routine: lck_rw_done */ -lck_rw_type_t lck_rw_done(lck_rw_t *lock) +lck_rw_type_t +lck_rw_done(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); - if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */ + if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */ atomic_exchange_abort(); lck_rw_interlock_spin(lock); continue; } if (data & LCK_RW_SHARED_MASK) { data -= LCK_RW_SHARED_READER; - if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */ + if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */ goto check_waiters; - } else { /* if reader count == 0, must be exclusive lock */ + } + } else { /* if reader count == 0, must be exclusive lock */ if (data & LCK_RW_WANT_UPGRADE) { data &= ~(LCK_RW_WANT_UPGRADE); } else { - if (data & LCK_RW_WANT_WRITE) + if (data & LCK_RW_WANT_WRITE) { data &= ~(LCK_RW_WANT_EXCL); - else /* lock is not 'owned', panic */ + } else { /* lock is not 'owned', panic */ panic("Releasing non-exclusive RW lock without a reader refcount!"); + } } check_waiters: if (prev & LCK_RW_W_WAITING) { data &= ~(LCK_RW_W_WAITING); - if ((prev & LCK_RW_PRIV_EXCL) == 0) + if ((prev & LCK_RW_PRIV_EXCL) == 0) { data &= ~(LCK_RW_R_WAITING); - } else + } + } else { data &= ~(LCK_RW_R_WAITING); + } } - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) { break; + } cpu_pause(); } return lck_rw_done_gen(lock, prev); @@ -1298,13 +1342,13 @@ check_waiters: * * called from lck_rw_done() * prior_lock_state is the value in the 1st - * word of the lock at the time of a successful + * word of the lock at the time of a successful * atomic compare and exchange with the new value... - * it represents the state of the lock before we + * it represents the state of the lock before we * decremented the rw_shared_count or cleared either - * rw_want_upgrade or rw_want_write and + * rw_want_upgrade or rw_want_write and * the lck_x_waiting bits... since the wrapper - * routine has already changed the state atomically, + * routine has already changed the state atomically, * we just need to decide if we should * wake up anyone and what value to return... 
we do * this by examining the state of the lock before @@ -1372,15 +1416,16 @@ lck_rw_done_gen( */ void lck_rw_unlock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) + lck_rw_t *lck, + lck_rw_type_t lck_rw_type) { - if (lck_rw_type == LCK_RW_TYPE_SHARED) + if (lck_rw_type == LCK_RW_TYPE_SHARED) { lck_rw_unlock_shared(lck); - else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) + } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) { lck_rw_unlock_exclusive(lck); - else + } else { panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type); + } } @@ -1389,15 +1434,16 @@ lck_rw_unlock( */ void lck_rw_unlock_shared( - lck_rw_t *lck) + lck_rw_t *lck) { - lck_rw_type_t ret; + lck_rw_type_t ret; assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count); ret = lck_rw_done(lck); - if (ret != LCK_RW_TYPE_SHARED) + if (ret != LCK_RW_TYPE_SHARED) { panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret); + } } @@ -1406,14 +1452,15 @@ lck_rw_unlock_shared( */ void lck_rw_unlock_exclusive( - lck_rw_t *lck) + lck_rw_t *lck) { - lck_rw_type_t ret; + lck_rw_type_t ret; ret = lck_rw_done(lck); - if (ret != LCK_RW_TYPE_EXCLUSIVE) + if (ret != LCK_RW_TYPE_EXCLUSIVE) { panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret); + } } @@ -1422,15 +1469,16 @@ lck_rw_unlock_exclusive( */ void lck_rw_lock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) + lck_rw_t *lck, + lck_rw_type_t lck_rw_type) { - if (lck_rw_type == LCK_RW_TYPE_SHARED) + if (lck_rw_type == LCK_RW_TYPE_SHARED) { lck_rw_lock_shared(lck); - else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) + } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) { lck_rw_lock_exclusive(lck); - else + } else { panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type); + } } /* @@ -1439,10 +1487,10 @@ lck_rw_lock( void lck_rw_lock_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; current_thread()->rwlock_count++; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) { atomic_exchange_abort(); @@ -1455,13 +1503,14 @@ lck_rw_lock_shared(lck_rw_t *lock) break; } data += LCK_RW_SHARED_READER; - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { break; + } cpu_pause(); } -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); -#endif /* CONFIG_DTRACE */ +#endif /* CONFIG_DTRACE */ return; } @@ -1474,25 +1523,24 @@ lck_rw_lock_shared(lck_rw_t *lock) */ static void lck_rw_lock_shared_gen( - lck_rw_t *lck) + lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - uint64_t deadline = 0; - int gotlock = 0; - int slept = 0; - wait_result_t res = 0; - boolean_t istate = -1; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int gotlock = 0; + int slept = 0; + wait_result_t res = 0; + boolean_t istate = -1; -#if CONFIG_DTRACE +#if CONFIG_DTRACE uint64_t wait_interval = 0; int readers_at_sleep = 0; boolean_t dtrace_ls_initialized = FALSE; boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE; #endif - while ( !lck_rw_grab_shared(lck)) { - -#if CONFIG_DTRACE + while (!lck_rw_grab_shared(lck)) { +#if CONFIG_DTRACE if (dtrace_ls_initialized == FALSE) { dtrace_ls_initialized = TRUE; dtrace_rwl_shared_spin = 
(lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0); @@ -1508,42 +1556,43 @@ lck_rw_lock_shared_gen( } } #endif - if (istate == -1) + if (istate == -1) { istate = ml_get_interrupts_enabled(); + } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); - while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) + while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) { lck_rw_lock_pause(istate); + } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0); - if (gotlock) + if (gotlock) { break; + } /* * if we get here, the deadline has expired w/o us * being able to grab the lock for read * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { - istate = lck_interlock_lock(lck); if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); lck->lck_r_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead); res = assert_wait(RW_LOCK_READER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1551,7 +1600,7 @@ lck_rw_lock_shared_gen( slept++; } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END, - trace_lck, res, slept, 0, 0); + trace_lck, res, slept, 0, 0); } else { lck->lck_rw_shared_count++; lck_interlock_unlock(lck, istate); @@ -1560,7 +1609,7 @@ lck_rw_lock_shared_gen( } } -#if CONFIG_DTRACE +#if CONFIG_DTRACE if (dtrace_ls_enabled == TRUE) { if (slept == 0) { LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); @@ -1584,13 +1633,14 @@ lck_rw_lock_exclusive(lck_rw_t *lock) { current_thread()->rwlock_count++; if (atomic_test_and_set32(&lock->data, - (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), - LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) { -#if CONFIG_DTRACE + (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), + LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) { +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); -#endif /* CONFIG_DTRACE */ - } else +#endif /* CONFIG_DTRACE */ + } else { lck_rw_lock_exclusive_gen(lock); + } } @@ -1603,9 +1653,9 @@ lck_rw_lock_exclusive(lck_rw_t *lock) boolean_t lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); @@ -1614,22 +1664,26 @@ lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) } if (data & LCK_RW_WANT_UPGRADE) { data -= LCK_RW_SHARED_READER; - if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */ - data &= ~(LCK_RW_W_WAITING); /* so 
clear the wait indicator */ - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */ + data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */ + } + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { return lck_rw_lock_shared_to_exclusive_failure(lock, prev); + } } else { - data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */ - data -= LCK_RW_SHARED_READER; /* and shed our read count */ - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */ + data -= LCK_RW_SHARED_READER; /* and shed our read count */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { break; + } } cpu_pause(); } - /* we now own the WANT_UPGRADE */ - if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */ - lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */ -#if CONFIG_DTRACE + /* we now own the WANT_UPGRADE */ + if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */ + lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */ + } +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0); #endif return TRUE; @@ -1646,12 +1700,12 @@ lck_rw_lock_shared_to_exclusive(lck_rw_t *lock) */ static boolean_t lck_rw_lock_shared_to_exclusive_failure( - lck_rw_t *lck, - uint32_t prior_lock_state) + lck_rw_t *lck, + uint32_t prior_lock_state) { - lck_rw_t *fake_lck; - thread_t thread = current_thread(); - uint32_t rwlock_count; + lck_rw_t *fake_lck; + thread_t thread = current_thread(); + uint32_t rwlock_count; /* Check if dropping the lock means that we need to unpromote */ rwlock_count = thread->rwlock_count--; @@ -1677,9 +1731,9 @@ lck_rw_lock_shared_to_exclusive_failure( } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, - VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); + VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); - return (FALSE); + return FALSE; } @@ -1693,16 +1747,16 @@ lck_rw_lock_shared_to_exclusive_failure( */ static boolean_t lck_rw_lock_shared_to_exclusive_success( - lck_rw_t *lck) + lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - uint64_t deadline = 0; - int slept = 0; - int still_shared = 0; - wait_result_t res; - boolean_t istate = -1; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + uint64_t deadline = 0; + int slept = 0; + int still_shared = 0; + wait_result_t res; + boolean_t istate = -1; -#if CONFIG_DTRACE +#if CONFIG_DTRACE uint64_t wait_interval = 0; int readers_at_sleep = 0; boolean_t dtrace_ls_initialized = FALSE; @@ -1710,8 +1764,7 @@ lck_rw_lock_shared_to_exclusive_success( #endif while (lck->lck_rw_shared_count != 0) { - -#if CONFIG_DTRACE +#if CONFIG_DTRACE if (dtrace_ls_initialized == FALSE) { dtrace_ls_initialized = TRUE; dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0); @@ -1727,40 +1780,42 @@ lck_rw_lock_shared_to_exclusive_success( } } #endif - if (istate == -1) + if (istate == -1) { istate = ml_get_interrupts_enabled(); + } deadline = lck_rw_deadline_for_spin(lck); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_shared_count, 
0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); - while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) + while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) { lck_rw_lock_pause(istate); + } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END, - trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); - if ( !still_shared) + if (!still_shared) { break; + } /* * if we get here, the deadline has expired w/o * the rw_shared_count having drained to 0 * check to see if we're allowed to do a thread_block */ if (lck->lck_rw_can_sleep) { - istate = lck_interlock_lock(lck); if (lck->lck_rw_shared_count != 0) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START, - trace_lck, lck->lck_rw_shared_count, 0, 0, 0); + trace_lck, lck->lck_rw_shared_count, 0, 0, 0); lck->lck_w_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade); res = assert_wait(RW_LOCK_WRITER_EVENT(lck), - THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1768,14 +1823,14 @@ lck_rw_lock_shared_to_exclusive_success( slept++; } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END, - trace_lck, res, slept, 0, 0); + trace_lck, res, slept, 0, 0); } else { lck_interlock_unlock(lck, istate); break; } } } -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * We infer whether we took the sleep/spin path above by checking readers_at_sleep. */ @@ -1790,33 +1845,37 @@ lck_rw_lock_shared_to_exclusive_success( } LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); #endif - return (TRUE); + return TRUE; } /* * Routine: lck_rw_lock_exclusive_to_shared */ -void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) +void +lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); - lck_rw_interlock_spin(lock); /* wait for interlock to clear */ + lck_rw_interlock_spin(lock); /* wait for interlock to clear */ continue; } data += LCK_RW_SHARED_READER; - if (data & LCK_RW_WANT_UPGRADE) + if (data & LCK_RW_WANT_UPGRADE) { data &= ~(LCK_RW_WANT_UPGRADE); - else + } else { data &= ~(LCK_RW_WANT_EXCL); - if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) + } + if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) { data &= ~(LCK_RW_W_WAITING); - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) + } + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) { break; + } cpu_pause(); } return lck_rw_lock_exclusive_to_shared_gen(lock, prev); @@ -1825,7 +1884,7 @@ void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) /* * Routine: lck_rw_lock_exclusive_to_shared_gen - * Function: + * Function: * assembly fast path has already dropped * our exclusive state and bumped lck_rw_shared_count * all we need to do here is determine if anyone @@ -1833,16 +1892,16 @@ void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock) */ static void lck_rw_lock_exclusive_to_shared_gen( - lck_rw_t *lck, - uint32_t prior_lock_state) + lck_rw_t *lck, + uint32_t prior_lock_state) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); - lck_rw_t 
*fake_lck; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + lck_rw_t *fake_lck; fake_lck = (lck_rw_t *)&prior_lock_state; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, - trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); + trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0); /* * don't wake up anyone waiting to take the lock exclusively @@ -1852,11 +1911,12 @@ lck_rw_lock_exclusive_to_shared_gen( * wake up any waiting readers if we don't have any writers waiting, * or the lock is NOT marked as rw_priv_excl (writers have privilege) */ - if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) + if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) { thread_wakeup(RW_LOCK_READER_EVENT(lck)); + } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, - trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); + trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0); @@ -1869,27 +1929,29 @@ lck_rw_lock_exclusive_to_shared_gen( */ boolean_t lck_rw_try_lock( - lck_rw_t *lck, - lck_rw_type_t lck_rw_type) -{ - if (lck_rw_type == LCK_RW_TYPE_SHARED) - return(lck_rw_try_lock_shared(lck)); - else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) - return(lck_rw_try_lock_exclusive(lck)); - else + lck_rw_t *lck, + lck_rw_type_t lck_rw_type) +{ + if (lck_rw_type == LCK_RW_TYPE_SHARED) { + return lck_rw_try_lock_shared(lck); + } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) { + return lck_rw_try_lock_exclusive(lck); + } else { panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type); - return(FALSE); + } + return FALSE; } /* * Routine: lck_rw_try_lock_shared */ -boolean_t lck_rw_try_lock_shared(lck_rw_t *lock) +boolean_t +lck_rw_try_lock_shared(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); @@ -1898,18 +1960,19 @@ boolean_t lck_rw_try_lock_shared(lck_rw_t *lock) } if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { atomic_exchange_abort(); - return FALSE; /* lock is busy */ + return FALSE; /* lock is busy */ } - data += LCK_RW_SHARED_READER; /* Increment reader refcount */ - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + data += LCK_RW_SHARED_READER; /* Increment reader refcount */ + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { break; + } cpu_pause(); } current_thread()->rwlock_count++; /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */ -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED); -#endif /* CONFIG_DTRACE */ +#endif /* CONFIG_DTRACE */ return TRUE; } @@ -1918,11 +1981,12 @@ boolean_t lck_rw_try_lock_shared(lck_rw_t *lock) * Routine: lck_rw_try_lock_exclusive */ -boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock) +boolean_t +lck_rw_try_lock_exclusive(lck_rw_t *lock) { - uint32_t data, prev; + uint32_t data, prev; - for ( ; ; ) { + for (;;) { data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp); if (data & LCK_RW_INTERLOCK) { atomic_exchange_abort(); @@ -1931,26 
+1995,27 @@ boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock) } if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) { atomic_exchange_abort(); - return FALSE; /* can't get it */ + return FALSE; /* can't get it */ } data |= LCK_RW_WANT_EXCL; - if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) + if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) { break; + } cpu_pause(); } current_thread()->rwlock_count++; -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL); -#endif /* CONFIG_DTRACE */ +#endif /* CONFIG_DTRACE */ return TRUE; } void lck_rw_assert( - lck_rw_t *lck, - unsigned int type) + lck_rw_t *lck, + unsigned int type) { switch (type) { case LCK_RW_ASSERT_SHARED: @@ -1960,7 +2025,7 @@ lck_rw_assert( break; case LCK_RW_ASSERT_EXCLUSIVE: if ((lck->lck_rw_want_write || - lck->lck_rw_want_upgrade) && + lck->lck_rw_want_upgrade) && lck->lck_rw_shared_count == 0) { return; } @@ -1974,8 +2039,8 @@ lck_rw_assert( break; case LCK_RW_ASSERT_NOTHELD: if (!(lck->lck_rw_want_write || - lck->lck_rw_want_upgrade || - lck->lck_rw_shared_count != 0)) { + lck->lck_rw_want_upgrade || + lck->lck_rw_shared_count != 0)) { return; } break; @@ -2023,7 +2088,8 @@ lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield) * NOT SAFE: To be used only by kernel debugger to avoid deadlock. */ boolean_t -kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) { +kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) +{ if (not_in_kdp) { panic("panic: rw lock exclusive check done outside of kernel debugger"); } @@ -2072,7 +2138,7 @@ kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) { * on acquire. */ -#ifdef MUTEX_ZONE +#ifdef MUTEX_ZONE extern zone_t lck_mtx_zone; #endif @@ -2081,18 +2147,20 @@ extern zone_t lck_mtx_zone; */ lck_mtx_t * lck_mtx_alloc_init( - lck_grp_t *grp, - lck_attr_t *attr) + lck_grp_t *grp, + lck_attr_t *attr) { - lck_mtx_t *lck; -#ifdef MUTEX_ZONE - if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) + lck_mtx_t *lck; +#ifdef MUTEX_ZONE + if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) { lck_mtx_init(lck, grp, attr); + } #else - if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) + if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) { lck_mtx_init(lck, grp, attr); + } #endif - return(lck); + return lck; } /* @@ -2100,11 +2168,11 @@ lck_mtx_alloc_init( */ void lck_mtx_free( - lck_mtx_t *lck, - lck_grp_t *grp) + lck_mtx_t *lck, + lck_grp_t *grp) { lck_mtx_destroy(lck, grp); -#ifdef MUTEX_ZONE +#ifdef MUTEX_ZONE zfree(lck_mtx_zone, lck); #else kfree(lck, sizeof(lck_mtx_t)); @@ -2116,9 +2184,9 @@ lck_mtx_free( */ static void lck_mtx_ext_init( - lck_mtx_ext_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) + lck_mtx_ext_t *lck, + lck_grp_t *grp, + lck_attr_t *attr) { bzero((void *)lck, sizeof(lck_mtx_ext_t)); @@ -2129,8 +2197,9 @@ lck_mtx_ext_init( lck->lck_mtx_grp = grp; - if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) + if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) { lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT; + } lck->lck_mtx.lck_mtx_is_ext = 1; lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF; @@ -2141,17 +2210,18 @@ lck_mtx_ext_init( */ void lck_mtx_init( - lck_mtx_t *lck, - lck_grp_t *grp, - lck_attr_t *attr) + lck_mtx_t *lck, + lck_grp_t *grp, + lck_attr_t *attr) { - lck_mtx_ext_t *lck_ext; - lck_attr_t *lck_attr; + lck_mtx_ext_t *lck_ext; + lck_attr_t *lck_attr; - if (attr != LCK_ATTR_NULL) + if (attr != LCK_ATTR_NULL) { lck_attr = attr; - else + } else { 
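Both try-lock fast paths above share one shape: snapshot the 32-bit lock word, refuse immediately if any conflicting bit is set (no spinning, no blocking), otherwise publish the new state with a single compare-exchange and loop only on CAS races. A self-contained sketch of the shared flavor using C11 atomics; the bit positions are invented for illustration, and the interlock-spin case handled by the real code is omitted:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WANT_EXCL    (1u << 30)   /* illustrative bit layout */
#define WANT_UPGRADE (1u << 29)
#define READER       1u           /* reader count lives in the low bits */

static _Atomic uint32_t rw_word;

static bool try_lock_shared(void)
{
    for (;;) {
        uint32_t prev = atomic_load_explicit(&rw_word, memory_order_relaxed);
        if (prev & (WANT_EXCL | WANT_UPGRADE)) {
            return false;                    /* lock is busy: fail, don't wait */
        }
        if (atomic_compare_exchange_weak_explicit(&rw_word, &prev,
                prev + READER,               /* bump the reader refcount */
                memory_order_acquire, memory_order_relaxed)) {
            return true;
        }
        /* lost a race with another reader or writer: reload and retry */
    }
}

int main(void)
{
    printf("try #1: %d, try #2: %d, readers: %u\n",
        try_lock_shared(), try_lock_shared(),
        (unsigned)atomic_load(&rw_word));
    return 0;
}

The per-thread rwlock_count++ after the CAS in the kernel version is the bookkeeping that later drives priority unpromotion when the count returns to zero.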
lck_attr = &LockDefaultLckAttr; + } if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) { @@ -2173,17 +2243,18 @@ lck_mtx_init( */ void lck_mtx_init_ext( - lck_mtx_t *lck, - lck_mtx_ext_t *lck_ext, - lck_grp_t *grp, - lck_attr_t *attr) + lck_mtx_t *lck, + lck_mtx_ext_t *lck_ext, + lck_grp_t *grp, + lck_attr_t *attr) { - lck_attr_t *lck_attr; + lck_attr_t *lck_attr; - if (attr != LCK_ATTR_NULL) + if (attr != LCK_ATTR_NULL) { lck_attr = attr; - else + } else { lck_attr = &LockDefaultLckAttr; + } if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) { lck_mtx_ext_init(lck_ext, grp, lck_attr); @@ -2225,13 +2296,14 @@ lck_mtx_lock_mark_destroyed( */ void lck_mtx_destroy( - lck_mtx_t *lck, - lck_grp_t *grp) + lck_mtx_t *lck, + lck_grp_t *grp) { boolean_t indirect; - if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) + if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) { return; + } #if MACH_LDEBUG lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED); #endif @@ -2239,8 +2311,9 @@ lck_mtx_destroy( lck_mtx_lock_mark_destroyed(lck, indirect); - if (indirect) + if (indirect) { kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t)); + } lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX); lck_grp_deallocate(grp); return; @@ -2262,7 +2335,7 @@ __attribute__((always_inline)) static boolean_t get_indirect_mutex( lck_mtx_t **lock, - uint32_t *state) + uint32_t *state) { *lock = &((*lock)->lck_mtx_ptr->lck_mtx); *state = ordered_load_mtx_state(*lock); @@ -2270,7 +2343,7 @@ get_indirect_mutex( } /* - * Routine: lck_mtx_unlock_slow + * Routine: lck_mtx_unlock_slow * * Unlocks a mutex held by current thread. * @@ -2281,11 +2354,11 @@ get_indirect_mutex( __attribute__((noinline)) void lck_mtx_unlock_slow( - lck_mtx_t *lock) + lck_mtx_t *lock) { - thread_t thread; - uint32_t state, prev; - boolean_t indirect = FALSE; + thread_t thread; + uint32_t state, prev; + boolean_t indirect = FALSE; state = ordered_load_mtx_state(lock); @@ -2298,13 +2371,15 @@ lck_mtx_unlock_slow( #if DEVELOPMENT | DEBUG thread_t owner = (thread_t)lock->lck_mtx_owner; - if(__improbable(owner != thread)) + if (__improbable(owner != thread)) { lck_mtx_owner_check_panic(lock); + } #endif /* check if it is held as a spinlock */ - if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) + if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) { goto unlock; + } lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state); @@ -2318,21 +2393,23 @@ unlock: if (__improbable(state & LCK_MTX_WAITERS_MSK)) { #if MACH_LDEBUG - if (thread) + if (thread) { thread->mutex_count--; + } #endif return lck_mtx_unlock_wakeup_tail(lock, state, indirect); } /* release interlock, promotion and clear spin flag */ state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK)); - ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */ + ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */ -#if MACH_LDEBUG +#if MACH_LDEBUG /* perform lock statistics after drop to prevent delay */ - if (thread) - thread->mutex_count--; /* lock statistic */ -#endif /* MACH_LDEBUG */ + if (thread) { + thread->mutex_count--; /* lock statistic */ + } +#endif /* MACH_LDEBUG */ /* re-enable preemption */ lck_mtx_unlock_finish_inline(lock, FALSE); @@ -2340,11 +2417,11 @@ unlock: return; } -#define LCK_MTX_LCK_WAIT_CODE 0x20 -#define LCK_MTX_LCK_WAKEUP_CODE 0x21 -#define LCK_MTX_LCK_SPIN_CODE 0x22 -#define LCK_MTX_LCK_ACQUIRE_CODE 0x23 -#define LCK_MTX_LCK_DEMOTE_CODE 
0x24 +#define LCK_MTX_LCK_WAIT_CODE 0x20 +#define LCK_MTX_LCK_WAKEUP_CODE 0x21 +#define LCK_MTX_LCK_SPIN_CODE 0x22 +#define LCK_MTX_LCK_ACQUIRE_CODE 0x23 +#define LCK_MTX_LCK_DEMOTE_CODE 0x24 /* * Routine: lck_mtx_unlock_wakeup_tail @@ -2368,18 +2445,18 @@ unlock: */ __attribute__((noinline)) static void -lck_mtx_unlock_wakeup_tail ( - lck_mtx_t *mutex, +lck_mtx_unlock_wakeup_tail( + lck_mtx_t *mutex, uint32_t state, - boolean_t indirect) + boolean_t indirect) { struct turnstile *ts; - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); kern_return_t did_wake; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, - trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX); @@ -2396,7 +2473,7 @@ lck_mtx_unlock_wakeup_tail ( turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX); state -= LCK_MTX_WAITER; - state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK)); + state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK)); ordered_store_mtx_state_release(mutex, state); assert(current_thread()->turnstile != NULL); @@ -2404,13 +2481,13 @@ lck_mtx_unlock_wakeup_tail ( turnstile_cleanup(); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, - trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); lck_mtx_unlock_finish_inline(mutex, indirect); } /* - * Routine: lck_mtx_lock_acquire_x86 + * Routine: lck_mtx_lock_acquire_x86 * * Invoked on acquiring the mutex when there is * contention (i.e. the assembly routine sees that @@ -2421,13 +2498,13 @@ lck_mtx_unlock_wakeup_tail ( __attribute__((always_inline)) static void lck_mtx_lock_acquire_inline( - lck_mtx_t *mutex, + lck_mtx_t *mutex, struct turnstile *ts) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, - trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */ assert(thread->waiting_for_mutex == NULL); @@ -2448,12 +2525,12 @@ lck_mtx_lock_acquire_inline( assert(current_thread()->turnstile != NULL); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END, - trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); } void lck_mtx_lock_acquire_x86( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { return lck_mtx_lock_acquire_inline(mutex, NULL); } @@ -2467,8 +2544,8 @@ lck_mtx_lock_acquire_x86( __attribute__((noinline)) static void lck_mtx_lock_acquire_tail( - lck_mtx_t *mutex, - boolean_t indirect, + lck_mtx_t *mutex, + boolean_t indirect, struct turnstile *ts) { lck_mtx_lock_acquire_inline(mutex, ts); @@ -2478,7 +2555,7 @@ lck_mtx_lock_acquire_tail( __attribute__((noinline)) static boolean_t lck_mtx_try_lock_acquire_tail( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { lck_mtx_lock_acquire_inline(mutex, NULL); lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex)); @@ -2489,7 +2566,7 @@ lck_mtx_try_lock_acquire_tail( __attribute__((noinline)) static void lck_mtx_convert_spin_acquire_tail( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { lck_mtx_lock_acquire_inline(mutex, NULL); 
lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex)); @@ -2513,7 +2590,7 @@ lck_mtx_interlock_lock_set_and_clear_flags( uint32_t state, prev; state = *new_state; - for ( ; ; ) { + for (;;) { /* have to wait for interlock to clear */ while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) { cpu_pause(); @@ -2521,11 +2598,12 @@ lck_mtx_interlock_lock_set_and_clear_flags( } prev = state; /* prev contains snapshot for exchange */ state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */ - state &= ~and_flags; /* clear flags */ + state &= ~and_flags; /* clear flags */ disable_preemption(); - if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) + if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) { break; + } enable_preemption(); cpu_pause(); state = ordered_load_mtx_state(mutex); @@ -2564,53 +2642,18 @@ lck_mtx_interlock_try_lock_set_flags( if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) { return 0; } - prev = state; /* prev contains snapshot for exchange */ - state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */ + prev = state; /* prev contains snapshot for exchange */ + state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */ disable_preemption(); if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) { - *new_state = state; - return 1; + *new_state = state; + return 1; } enable_preemption(); return 0; } -static inline int -lck_mtx_interlock_try_lock( - lck_mtx_t *mutex, - uint32_t *new_state) -{ - return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state); -} - -static inline int -lck_mtx_interlock_try_lock_disable_interrupts( - lck_mtx_t *mutex, - boolean_t *istate) -{ - uint32_t state; - - *istate = ml_set_interrupts_enabled(FALSE); - state = ordered_load_mtx_state(mutex); - - if (lck_mtx_interlock_try_lock(mutex, &state)) { - return 1; - } else { - ml_set_interrupts_enabled(*istate); - return 0; - } -} - -static inline void -lck_mtx_interlock_unlock_enable_interrupts( - lck_mtx_t *mutex, - boolean_t istate) -{ - lck_mtx_ilk_unlock(mutex); - ml_set_interrupts_enabled(istate); -} - __attribute__((noinline)) static void lck_mtx_lock_contended( @@ -2641,8 +2684,11 @@ try_again: lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock); } - /* just fall through case LCK_MTX_SPINWAIT_SPUN */ - case LCK_MTX_SPINWAIT_SPUN: + /* just fall through case LCK_MTX_SPINWAIT_SPUN */ + case LCK_MTX_SPINWAIT_SPUN_HIGH_THR: + case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE: + case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION: + case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR: /* * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin * interlock not held @@ -2660,7 +2706,6 @@ try_again: */ goto try_again; } else { - /* grab the mutex */ state |= LCK_MTX_MLOCKED_MSK; ordered_store_mtx_state_release(lock, state); @@ -2740,7 +2785,7 @@ lck_mtx_lock_wait_interlock_to_clear( { uint32_t state; - for ( ; ; ) { + for (;;) { cpu_pause(); state = ordered_load_mtx_state(lock); if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) { @@ -2762,7 +2807,7 @@ lck_mtx_try_lock_wait_interlock_to_clear( { uint32_t state; - for ( ; ; ) { + for (;;) { cpu_pause(); state = ordered_load_mtx_state(lock); if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) { @@ -2790,9 +2835,9 @@ void lck_mtx_lock_slow( lck_mtx_t *lock) { - boolean_t indirect = FALSE; - uint32_t state; - int first_miss = 0; + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; state = ordered_load_mtx_state(lock); @@ -2806,7 +2851,7 @@ 
lck_mtx_lock_slow( /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { /* no, must have been the mutex */ return lck_mtx_lock_contended(lock, indirect, &first_miss); } @@ -2824,7 +2869,7 @@ lck_mtx_lock_slow( lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); if (state & LCK_MTX_SPIN_MSK) { - /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ assert(state & LCK_MTX_ILOCKED_MSK); lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); } @@ -2850,7 +2895,7 @@ lck_mtx_lock_slow( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif /* @@ -2887,7 +2932,7 @@ lck_mtx_try_lock_slow( */ /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { return FALSE; } @@ -2905,8 +2950,9 @@ lck_mtx_try_lock_slow( } if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) + if (indirect) { lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } return FALSE; } } @@ -2914,8 +2960,9 @@ lck_mtx_try_lock_slow( /* no - can't be INDIRECT, DESTROYED or locked */ while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) { if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) + if (indirect) { lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } return FALSE; } } @@ -2928,7 +2975,7 @@ lck_mtx_try_lock_slow( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif /* @@ -2943,13 +2990,12 @@ lck_mtx_try_lock_slow( lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock)); return TRUE; - } __attribute__((noinline)) void lck_mtx_lock_spin_slow( - lck_mtx_t *lock) + lck_mtx_t *lock) { boolean_t indirect = FALSE; uint32_t state; @@ -2967,7 +3013,7 @@ lck_mtx_lock_spin_slow( /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { /* no, must have been the mutex */ return lck_mtx_lock_contended(lock, indirect, &first_miss); } @@ -2985,7 +3031,7 @@ lck_mtx_lock_spin_slow( lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock); if (state & LCK_MTX_SPIN_MSK) { - /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ assert(state & LCK_MTX_ILOCKED_MSK); lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); } @@ -2997,7 +3043,7 @@ lck_mtx_lock_spin_slow( } /* no - can't be INDIRECT, DESTROYED or locked */ - while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) { + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) { if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { return lck_mtx_lock_contended(lock, indirect, &first_miss); } @@ -3015,7 +3061,7 @@ lck_mtx_lock_spin_slow( } #endif -#if CONFIG_DTRACE +#if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0); #endif /* return with the interlock held and preemption disabled */ @@ -3042,7 +3088,7 @@ lck_mtx_try_lock_spin_slow( */ /* is the mutex already held and not indirect */ - if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + if 
(__improbable(!(state & LCK_MTX_ILOCKED_MSK))) { return FALSE; } @@ -3060,8 +3106,9 @@ lck_mtx_try_lock_spin_slow( } if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) + if (indirect) { lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } return FALSE; } } @@ -3069,8 +3116,9 @@ lck_mtx_try_lock_spin_slow( /* no - can't be INDIRECT, DESTROYED or locked */ while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) { if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { - if (indirect) + if (indirect) { lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } return FALSE; } } @@ -3083,7 +3131,7 @@ lck_mtx_try_lock_spin_slow( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif @@ -3091,13 +3139,12 @@ lck_mtx_try_lock_spin_slow( LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0); #endif return TRUE; - } __attribute__((noinline)) void lck_mtx_convert_spin( - lck_mtx_t *lock) + lck_mtx_t *lock) { uint32_t state; @@ -3135,7 +3182,7 @@ lck_mtx_convert_spin( static inline boolean_t lck_mtx_lock_grab_mutex( - lck_mtx_t *lock) + lck_mtx_t *lock) { uint32_t state; @@ -3153,7 +3200,7 @@ lck_mtx_lock_grab_mutex( #if MACH_LDEBUG if (thread) { - thread->mutex_count++; /* lock statistic */ + thread->mutex_count++; /* lock statistic */ } #endif return TRUE; @@ -3162,8 +3209,8 @@ lck_mtx_lock_grab_mutex( __attribute__((noinline)) void lck_mtx_assert( - lck_mtx_t *lock, - unsigned int type) + lck_mtx_t *lock, + unsigned int type) { thread_t thread, owner; uint32_t state; @@ -3178,17 +3225,19 @@ lck_mtx_assert( owner = (thread_t)lock->lck_mtx_owner; if (type == LCK_MTX_ASSERT_OWNED) { - if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) + if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) { panic("mutex (%p) not owned\n", lock); + } } else { - assert (type == LCK_MTX_ASSERT_NOTOWNED); - if (owner == thread) + assert(type == LCK_MTX_ASSERT_NOTOWNED); + if (owner == thread) { panic("mutex (%p) owned\n", lock); + } } } /* - * Routine: lck_mtx_lock_spinwait_x86 + * Routine: lck_mtx_lock_spinwait_x86 * * Invoked trying to acquire a mutex when there is contention but * the holder is running on another processor. 
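Each of the slow paths above funnels through lck_mtx_interlock_try_lock_set_flags(), which takes the interlock and any extra flag bits in a single compare-exchange, failing fast when someone else holds the interlock. A minimal stand-alone sketch of that pattern, using C11 atomics in place of the kernel's os_atomic_cmpxchg and made-up mask values; the kernel version additionally brackets the exchange with disable_preemption()/enable_preemption(), omitted here:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define ILOCKED_MSK 0x01000000u         /* illustrative bit, not xnu's value */

/* Take the interlock and set or_flags in one atomic step; fail fast if
 * the interlock or any requested flag is already set, which is exactly
 * when the callers above wait for the interlock to clear or fall back
 * to lck_mtx_lock_contended(). */
static bool
interlock_try_lock_set_flags(_Atomic uint32_t *state_p, uint32_t or_flags,
    uint32_t *new_state)
{
    uint32_t prev = atomic_load_explicit(state_p, memory_order_relaxed);

    if (prev & (ILOCKED_MSK | or_flags)) {
        return false;
    }
    uint32_t next = prev | ILOCKED_MSK | or_flags;  /* prev is the snapshot */
    if (atomic_compare_exchange_strong_explicit(state_p, &prev, next,
        memory_order_acquire, memory_order_relaxed)) {
        *new_state = next;
        return true;
    }
    return false;       /* lost the race; caller re-reads state and retries */
}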
We spin for up to a maximum @@ -3202,104 +3251,245 @@ lck_mtx_assert( __attribute__((noinline)) lck_mtx_spinwait_ret_type_t lck_mtx_lock_spinwait_x86( - lck_mtx_t *mutex) + lck_mtx_t *mutex) { - __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); - thread_t holder; - uint64_t overall_deadline; - uint64_t check_owner_deadline; - uint64_t cur_time; - lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN; - int loopcount = 0; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + thread_t owner, prev_owner; + uint64_t window_deadline, sliding_deadline, high_deadline; + uint64_t start_time, cur_time, avg_hold_time, bias, delta; + lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR; + int loopcount = 0; + int total_hold_time_samples, window_hold_time_samples, unfairness; + uint i, prev_owner_cpu; + bool owner_on_core, adjust; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0); - cur_time = mach_absolute_time(); - overall_deadline = cur_time + MutexSpin; - check_owner_deadline = cur_time; + start_time = mach_absolute_time(); + /* + * window_deadline represents the "learning" phase. + * The thread collects statistics about the lock during + * window_deadline and then it makes a decision on whether to spin more + * or block according to the concurrency behavior + * observed. + * + * Every thread can spin at least low_MutexSpin. + */ + window_deadline = start_time + low_MutexSpin; + /* + * Sliding_deadline is the adjusted spin deadline + * computed after the "learning" phase. + */ + sliding_deadline = window_deadline; + /* + * High_deadline is a hard deadline. No thread + * can spin more than this deadline. + */ + if (high_MutexSpin >= 0) { + high_deadline = start_time + high_MutexSpin; + } else { + high_deadline = start_time + low_MutexSpin * real_ncpus; + } + /* + * Do not know yet which is the owner cpu. + * Initialize prev_owner_cpu with next cpu. + */ + prev_owner_cpu = (cpu_number() + 1) % real_ncpus; + total_hold_time_samples = 0; + window_hold_time_samples = 0; + avg_hold_time = 0; + adjust = TRUE; + bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus; + + prev_owner = (thread_t) mutex->lck_mtx_owner; /* * Spin while: * - mutex is locked, and - * - its locked as a spin lock, and + * - it's locked as a spin lock, and * - owner is running on another processor, and - * - owner (processor) is not idling, and * - we haven't spun for long enough. */ do { + /* + * Try to acquire the lock. + */ if (__probable(lck_mtx_lock_grab_mutex(mutex))) { retval = LCK_MTX_SPINWAIT_ACQUIRED; break; } + cur_time = mach_absolute_time(); - if (cur_time >= overall_deadline) + /* + * Never spin past high_deadline. + */ + if (cur_time >= high_deadline) { + retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR; break; + } - if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) { - boolean_t istate; + /* + * Check if owner is on core. If not block. + */ + owner = (thread_t) mutex->lck_mtx_owner; + if (owner) { + i = prev_owner_cpu; + owner_on_core = FALSE; + + disable_preemption(); + owner = (thread_t) mutex->lck_mtx_owner; /* - * We will repeatedly peek at the state of the lock while spinning, - * and we will acquire the interlock to do so. 
- * The thread that will unlock the mutex will also need to acquire - * the interlock, and we want to avoid to slow it down. - * To avoid to get an interrupt while holding the interlock - * and increase the time we are holding it, we - * will try to acquire the interlock with interrupts disabled. - * This is safe because it is a "try_lock", if we can't acquire - * the interlock we re-enable the interrupts and fail, so it is - * ok to call it even if the interlock was already held. - */ - if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) { - - if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { - - if ( !(holder->machine.specFlags & OnProc) || - (holder->state & TH_IDLE)) { - - lck_mtx_interlock_unlock_enable_interrupts(mutex, istate); - - if (loopcount == 0) + * For scalability we want to check if the owner is on core + * without locking the mutex interlock. + * If we do not lock the mutex interlock, the owner that we see might be + * invalid, so we cannot dereference it. Therefore we cannot check + * any field of the thread to tell us if it is on core. + * Check if the thread that is running on the other cpus matches the owner. + */ + if (owner) { + do { + if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) { + owner_on_core = TRUE; + break; + } + if (++i >= real_ncpus) { + i = 0; + } + } while (i != prev_owner_cpu); + enable_preemption(); + + if (owner_on_core) { + prev_owner_cpu = i; + } else { + prev_owner = owner; + owner = (thread_t) mutex->lck_mtx_owner; + if (owner == prev_owner) { + /* + * Owner is not on core. + * Stop spinning. + */ + if (loopcount == 0) { retval = LCK_MTX_SPINWAIT_NO_SPIN; + } else { + retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE; + } break; } + /* + * Fall through if the owner changed while we were scanning. + * The new owner could potentially be on core, so loop + * again. + */ } - lck_mtx_interlock_unlock_enable_interrupts(mutex, istate); + } else { + enable_preemption(); + } + } - check_owner_deadline = cur_time + (MutexSpin / 4); + /* + * Save how many times we see the owner changing. + * We can roughly estimate the mutex hold + * time and the fairness with that. + */ + if (owner != prev_owner) { + prev_owner = owner; + total_hold_time_samples++; + window_hold_time_samples++; + } + + /* + * Learning window expired. + * Try to adjust the sliding_deadline. + */ + if (cur_time >= window_deadline) { + /* + * If there was not contention during the window + * stop spinning. + */ + if (window_hold_time_samples < 1) { + retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION; + break; } + + if (adjust) { + /* + * For a fair lock, we'd wait for at most (NCPU-1) periods, + * but the lock is unfair, so let's try to estimate by how much. + */ + unfairness = total_hold_time_samples / real_ncpus; + + if (unfairness == 0) { + /* + * We observed the owner changing `total_hold_time_samples` times which + * let us estimate the average hold time of this mutex for the duration + * of the spin time. + * avg_hold_time = (cur_time - start_time) / total_hold_time_samples; + * + * In this case spin at max avg_hold_time * (real_ncpus - 1) + */ + delta = cur_time - start_time; + sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples; + } else { + /* + * In this case at least one of the other cpus was able to get the lock twice + * while I was spinning. + * We could spin longer but it won't necessarily help if the system is unfair. + * Try to randomize the wait to reduce contention. 
+ * + * We compute how much time we could potentially spin + * and distribute it over the cpus. + * + * bias is an integer between 0 and real_ncpus. + * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias + */ + delta = high_deadline - cur_time; + sliding_deadline = cur_time + ((delta * bias) / real_ncpus); + adjust = FALSE; + } + } + + window_deadline += low_MutexSpin; + window_hold_time_samples = 0; } - cpu_pause(); - loopcount++; + /* + * Stop spinning if we are past + * the adjusted deadline. + */ + if (cur_time >= sliding_deadline) { + retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR; + break; + } + + if ((thread_t) mutex->lck_mtx_owner != NULL) { + cpu_pause(); + } + loopcount++; } while (TRUE); -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* - * We've already kept a count via overall_deadline of how long we spun. - * If dtrace is active, then we compute backwards to decide how - * long we spun. - * * Note that we record a different probe id depending on whether - * this is a direct or indirect mutex. This allows us to + * this is a direct or indirect mutex. This allows us to * penalize only lock groups that have debug/stats enabled * with dtrace processing if desired. */ if (__probable(mutex->lck_mtx_is_ext == 0)) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - start_time); } else { LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex, - mach_absolute_time() - (overall_deadline - MutexSpin)); + mach_absolute_time() - start_time); } /* The lockstat acquire event is recorded by the assembly code beneath us. */ #endif KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0); return retval; } @@ -3307,7 +3497,7 @@ lck_mtx_lock_spinwait_x86( /* - * Routine: lck_mtx_lock_wait_x86 + * Routine: lck_mtx_lock_wait_x86 * * Invoked in order to wait on contention. * @@ -3334,13 +3524,13 @@ lck_mtx_lock_spinwait_x86( */ __attribute__((noinline)) void -lck_mtx_lock_wait_x86 ( - lck_mtx_t *mutex, +lck_mtx_lock_wait_x86( + lck_mtx_t *mutex, struct turnstile **ts) { thread_t self = current_thread(); -#if CONFIG_DTRACE +#if CONFIG_DTRACE uint64_t sleep_start = 0; if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { @@ -3350,8 +3540,8 @@ lck_mtx_lock_wait_x86 ( __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), - mutex->lck_mtx_waiters, 0, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, 0, 0); assert(self->waiting_for_mutex == NULL); self->waiting_for_mutex = mutex; @@ -3384,10 +3574,10 @@ lck_mtx_lock_wait_x86 ( self->waiting_for_mutex = NULL; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), - mutex->lck_mtx_waiters, 0, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, 0, 0); -#if CONFIG_DTRACE +#if CONFIG_DTRACE /* * Record the Dtrace lockstat probe for blocking, block time * measured from when we were entered. @@ -3410,7 +3600,7 @@ lck_mtx_lock_wait_x86 ( * Returns: TRUE if lock is acquired.
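The adjustment step in the spin loop above is compact; restated as a stand-alone helper it reads as follows. This is a sketch, not kernel code: all times are in mach_absolute_time() units, samples is total_hold_time_samples (at least 1 here, because the window-contention check already passed), and bias is the per-lock, per-cpu hash computed in the function prologue.

#include <stdbool.h>
#include <stdint.h>

/* Recompute sliding_deadline once a learning window expires (sketch). */
static uint64_t
adjust_sliding_deadline(uint64_t start_time, uint64_t cur_time,
    uint64_t high_deadline, int samples, uint32_t ncpus,
    uint64_t bias, bool *adjust)
{
    uint64_t delta;

    if (samples / (int)ncpus == 0) {
        /* Owner changed `samples` times in (cur_time - start_time), so
         * spin roughly avg_hold_time * (ncpus - 1) in total. */
        delta = cur_time - start_time;
        return start_time + (delta * (ncpus - 1)) / (uint64_t)samples;
    }
    /* Some other cpu already took the lock twice: the lock is unfair, so
     * spread the remaining budget across cpus and stop adjusting. */
    delta = high_deadline - cur_time;
    *adjust = false;
    return cur_time + (delta * bias) / ncpus;
}

The caller keeps spinning until the earlier of sliding_deadline and high_deadline, then reports why it gave up through the new LCK_MTX_SPINWAIT_SPUN_* reason codes.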
*/ boolean_t -kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck) +kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck) { if (not_in_kdp) { panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger"); @@ -3436,17 +3626,17 @@ void kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) { lck_rw_t *rwlck = NULL; - switch(waitinfo->wait_type) { - case kThreadWaitKernelRWLockRead: - rwlck = READ_EVENT_TO_RWLOCK(event); - break; - case kThreadWaitKernelRWLockWrite: - case kThreadWaitKernelRWLockUpgrade: - rwlck = WRITE_EVENT_TO_RWLOCK(event); - break; - default: - panic("%s was called with an invalid blocking type", __FUNCTION__); - break; + switch (waitinfo->wait_type) { + case kThreadWaitKernelRWLockRead: + rwlck = READ_EVENT_TO_RWLOCK(event); + break; + case kThreadWaitKernelRWLockWrite: + case kThreadWaitKernelRWLockUpgrade: + rwlck = WRITE_EVENT_TO_RWLOCK(event); + break; + default: + panic("%s was called with an invalid blocking type", __FUNCTION__); + break; } waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck); waitinfo->owner = 0; diff --git a/osfmk/i386/locks_i386_opt.c b/osfmk/i386/locks_i386_opt.c index 5720cf7e2..60fceb2a4 100644 --- a/osfmk/i386/locks_i386_opt.c +++ b/osfmk/i386/locks_i386_opt.c @@ -248,6 +248,10 @@ lck_mtx_lock_spin_always( * well as destroyed mutexes. */ + if (state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK)) { + return lck_mtx_lock_spin_slow(lock); + } + /* Note LCK_MTX_SPIN_MSK is set only if LCK_MTX_ILOCKED_MSK is set */ prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK); state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK; diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index 84bfb4c40..e27b01b22 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -78,6 +78,8 @@ uint64_t TLBTimeOut; uint64_t LockTimeOutTSC; uint32_t LockTimeOutUsec; uint64_t MutexSpin; +uint64_t low_MutexSpin; +int64_t high_MutexSpin; uint64_t LastDebuggerEntryAllowance; uint64_t delay_spin_threshold; @@ -788,6 +790,12 @@ ml_init_lock_timeout(void) nanoseconds_to_absolutetime(10 * NSEC_PER_USEC, &abstime); } MutexSpin = (unsigned int)abstime; + low_MutexSpin = MutexSpin; + /* + * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but + * real_ncpus is not set at this time + */ + high_MutexSpin = -1; nanoseconds_to_absolutetime(4ULL * NSEC_PER_SEC, &LastDebuggerEntryAllowance); if (PE_parse_boot_argn("panic_restart_timeout", &prt, sizeof(prt))) { @@ -823,6 +831,7 @@ MACRO_END VIRTUAL_TIMEOUT_INFLATE64(LockTimeOutTSC); VIRTUAL_TIMEOUT_INFLATE64(TLBTimeOut); VIRTUAL_TIMEOUT_INFLATE64(MutexSpin); + VIRTUAL_TIMEOUT_INFLATE64(low_MutexSpin); VIRTUAL_TIMEOUT_INFLATE64(reportphyreaddelayabs); } diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index b2f1e478f..4f605d378 100644 --- a/osfmk/i386/machine_routines.h +++ b/osfmk/i386/machine_routines.h @@ -372,9 +372,7 @@ pmap_verify_noncacheable(uintptr_t vaddr); #ifdef XNU_KERNEL_PRIVATE boolean_t ml_fpu_avx_enabled(void); -#if !defined(RC_HIDE_XNU_J137) boolean_t ml_fpu_avx512_enabled(void); -#endif void interrupt_latency_tracker_setup(void); void interrupt_reset_latency_stats(void); diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index 9ece881bd..1f154580b 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -1961,7 +1961,17 @@ machine_thread_init(void) fpu_module_init(); } +/* + * machine_thread_template_init: Initialize machine-specific portion of + * the thread template. 
+ */ +void +machine_thread_template_init(thread_t thr_template) +{ + assert(fpu_default != UNDEFINED); + THREAD_TO_PCB(thr_template)->xstate = fpu_default; +} user_addr_t get_useraddr(void) diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index 022491a89..4ff579713 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -173,22 +173,18 @@ #define XCR0_YMM (1ULL << 2) /* YMM state available */ #define XCR0_BNDREGS (1ULL << 3) /* MPX Bounds register state */ #define XCR0_BNDCSR (1ULL << 4) /* MPX Bounds configuration/state */ -#if !defined(RC_HIDE_XNU_J137) #define XCR0_OPMASK (1ULL << 5) /* Opmask register state */ #define XCR0_ZMM_HI256 (1ULL << 6) /* ZMM upper 256-bit state */ #define XCR0_HI16_ZMM (1ULL << 7) /* ZMM16..ZMM31 512-bit state */ -#endif /* not RC_HIDE_XNU_J137 */ #define XFEM_X87 XCR0_X87 #define XFEM_SSE XCR0_SSE #define XFEM_YMM XCR0_YMM #define XFEM_BNDREGS XCR0_BNDREGS #define XFEM_BNDCSR XCR0_BNDCSR -#if !defined(XNU_HODE_J137) #define XFEM_OPMASK XCR0_OPMASK #define XFEM_ZMM_HI256 XCR0_ZMM_HI256 #define XFEM_HI16_ZMM XCR0_HI16_ZMM #define XFEM_ZMM (XFEM_ZMM_HI256 | XFEM_HI16_ZMM | XFEM_OPMASK) -#endif /* not XNU_HODE_J137 */ #define XCR0 (0) #define PMAP_PCID_PRESERVE (1ULL << 63) diff --git a/osfmk/i386/user_ldt.c b/osfmk/i386/user_ldt.c index fa5c0ce23..14f681628 100644 --- a/osfmk/i386/user_ldt.c +++ b/osfmk/i386/user_ldt.c @@ -77,9 +77,6 @@ #include #include -#include /* for IOTaskHasEntitlement */ -#include /* for csr_check */ - #include static void user_ldt_set_action(void *); @@ -88,8 +85,6 @@ static int i386_set_ldt_impl(uint32_t *retval, uint64_t start_sel, uint64_t desc static int i386_get_ldt_impl(uint32_t *retval, uint64_t start_sel, uint64_t descs, uint64_t num_sels); -#define LDT_IN_64BITPROC_ENTITLEMENT "com.apple.security.ldt-in-64bit-process" - /* * Add the descriptors to the LDT, starting with * the descriptor for 'first_selector'. 
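Because ml_init_lock_timeout() above runs before real_ncpus is known, high_MutexSpin starts out as the sentinel -1 and lck_mtx_lock_spinwait_x86() resolves the hard spin cap lazily. A hedged restatement of that fallback; the extern declarations merely mirror the patch for illustration:

#include <stdint.h>

extern uint64_t low_MutexSpin;          /* set from MutexSpin at boot */
extern int64_t  high_MutexSpin;         /* -1 until explicitly configured */
extern unsigned int real_ncpus;

/* Hard cap on spinning, as the spin-wait path computes it. */
static uint64_t
high_spin_deadline(uint64_t start_time)
{
    if (high_MutexSpin >= 0) {
        return start_time + (uint64_t)high_MutexSpin;
    }
    /* sentinel: fall back to the per-cpu budget times the cpu count */
    return start_time + low_MutexSpin * real_ncpus;
}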
@@ -444,11 +439,6 @@ i386_set_ldt64( uint64_t descs, /* out */ uint64_t num_sels) { - if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) != 0 && - !IOTaskHasEntitlement(current_task(), LDT_IN_64BITPROC_ENTITLEMENT)) { - return EPERM; - } - return i386_set_ldt_impl(retval, start_sel, descs, num_sels); } @@ -472,10 +462,5 @@ i386_get_ldt64( uint64_t descs, /* out */ uint64_t num_sels) { - if (csr_check(CSR_ALLOW_UNTRUSTED_KEXTS) != 0 && - !IOTaskHasEntitlement(current_task(), LDT_IN_64BITPROC_ENTITLEMENT)) { - return EPERM; - } - return i386_get_ldt_impl(retval, start_sel, descs, num_sels); } diff --git a/osfmk/ipc/ipc_importance.c b/osfmk/ipc/ipc_importance.c index 44d1efed8..c403b9f46 100644 --- a/osfmk/ipc/ipc_importance.c +++ b/osfmk/ipc/ipc_importance.c @@ -2578,7 +2578,7 @@ ipc_importance_send( ipc_voucher_t voucher; assert(ip_kotype(kmsg->ikm_voucher) == IKOT_VOUCHER); - voucher = (ipc_voucher_t)kmsg->ikm_voucher->ip_kobject; + voucher = (ipc_voucher_t)ip_get_kobject(kmsg->ikm_voucher); /* check to see if the voucher has an importance attribute */ val_count = MACH_VOUCHER_ATTR_VALUE_MAX_NESTED; @@ -3190,7 +3190,7 @@ ipc_importance_receive( /* set up recipe to copy the old voucher */ if (IP_VALID(kmsg->ikm_voucher)) { - ipc_voucher_t sent_voucher = (ipc_voucher_t)kmsg->ikm_voucher->ip_kobject; + ipc_voucher_t sent_voucher = (ipc_voucher_t)ip_get_kobject(kmsg->ikm_voucher); recipe->key = MACH_VOUCHER_ATTR_KEY_ALL; recipe->command = MACH_VOUCHER_ATTR_COPY; diff --git a/osfmk/ipc/ipc_init.c b/osfmk/ipc/ipc_init.c index ca4bcee84..a03871cb8 100644 --- a/osfmk/ipc/ipc_init.c +++ b/osfmk/ipc/ipc_init.c @@ -82,10 +82,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -212,7 +214,7 @@ ipc_bootstrap(void) #if MACH_ASSERT ipc_port_debug_init(); #endif - mig_init(); + ipc_kobject_init(); ipc_table_init(); ipc_voucher_init(); @@ -228,6 +230,8 @@ ipc_bootstrap(void) arcade_init(); #endif + suid_cred_init(); + if (PE_parse_boot_argn("prioritize_launch", &prioritize_launch_bootarg, sizeof(prioritize_launch_bootarg))) { prioritize_launch = !!prioritize_launch_bootarg; } diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index f1611fc82..3a40a39ff 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -4270,7 +4270,8 @@ ipc_kmsg_copyout_header( assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); } else { ip_lock(reply); - if (!ip_active(reply)) { + /* Is the reply port still active and allowed to be copied out? 
*/ + if (!ip_active(reply) || !ip_label_check(space, reply, reply_type)) { /* clear the context value */ reply->ip_reply_context = 0; ip_unlock(reply); diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index 76fc96b8e..adeef2d72 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -945,6 +945,7 @@ ipc_object_copyout( break; } + name = CAST_MACH_PORT_TO_NAME(object); kr = ipc_entry_get(space, &name, &entry); if (kr != KERN_SUCCESS) { @@ -968,6 +969,30 @@ ipc_object_copyout( return KERN_INVALID_CAPABILITY; } + /* Don't actually copyout rights we aren't allowed to */ + if (!ip_label_check(space, ip_object_to_port(object), msgt_name)) { + io_unlock(object); + ipc_entry_dealloc(space, name, entry); + is_write_unlock(space); + + switch (msgt_name) { + case MACH_MSG_TYPE_PORT_SEND_ONCE: + ipc_port_release_sonce(ip_object_to_port(object)); + break; + case MACH_MSG_TYPE_PORT_SEND: + ipc_port_release_send(ip_object_to_port(object)); + break; + default: + /* + * We don't allow labeling of "kobjects" with receive + * rights at user-space or port-sets. So, if we get this far, + * something went VERY wrong. + */ + panic("ipc_object_copyout: bad port label check failure"); + } + return KERN_INVALID_CAPABILITY; + } + entry->ie_object = object; break; } @@ -1064,6 +1089,25 @@ ipc_object_copyout_name( return KERN_INVALID_CAPABILITY; } + /* Don't actually copyout rights we aren't allowed to */ + if (!ip_label_check(space, ip_object_to_port(object), msgt_name)) { + io_unlock(object); + ipc_entry_dealloc(space, name, entry); + is_write_unlock(space); + + switch (msgt_name) { + case MACH_MSG_TYPE_PORT_SEND_ONCE: + ipc_port_release_sonce(ip_object_to_port(object)); + break; + case MACH_MSG_TYPE_PORT_SEND: + ipc_port_release_send(ip_object_to_port(object)); + break; + default: + panic("ipc_object_copyout_name: bad port label check failure"); + } + return KERN_INVALID_CAPABILITY; + } + entry->ie_object = object; } diff --git a/osfmk/ipc/ipc_object.h b/osfmk/ipc/ipc_object.h index 77ddc1333..4ca1ad542 100644 --- a/osfmk/ipc/ipc_object.h +++ b/osfmk/ipc/ipc_object.h @@ -131,8 +131,9 @@ struct ipc_object_header { * definitions in ipc_port.h. */ #define IO_BITS_PORT_INFO 0x0000f000 /* stupid port tricks */ -#define IO_BITS_KOTYPE 0x000007ff /* used by the object */ +#define IO_BITS_KOTYPE 0x000003ff /* used by the object */ #define IO_BITS_KOBJECT 0x00000800 /* port belongs to a kobject */ +#define IO_BITS_KOLABEL 0x00000400 /* The kobject has a label */ #define IO_BITS_OTYPE 0x7fff0000 /* determines a zone */ #define IO_BITS_ACTIVE 0x80000000 /* is object alive? */ @@ -141,7 +142,7 @@ struct ipc_object_header { #define io_otype(io) (((io)->io_bits & IO_BITS_OTYPE) >> 16) #define io_kotype(io) ((io)->io_bits & IO_BITS_KOTYPE) #define io_is_kobject(io) (((io)->io_bits & IO_BITS_KOBJECT) != IKOT_NONE) - +#define io_is_kolabeled(io) (((io)->io_bits & IO_BITS_KOLABEL) != 0) #define io_makebits(active, otype, kotype) \ (((active) ? 
IO_BITS_ACTIVE : 0) | ((otype) << 16) | (kotype)) diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 413139c0e..b9acef764 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -129,6 +129,7 @@ struct ipc_port { union { ipc_kobject_t kobject; + ipc_kobject_label_t kolabel; ipc_importance_task_t imp_task; ipc_port_t sync_inheritor_port; struct knote *sync_inheritor_knote; @@ -190,6 +191,7 @@ struct ipc_port { #define ip_timestamp data.timestamp #define ip_kobject kdata.kobject +#define ip_kolabel kdata.kolabel #define ip_imp_task kdata.imp_task #define ip_sync_inheritor_port kdata.sync_inheritor_port #define ip_sync_inheritor_knote kdata.sync_inheritor_knote @@ -280,6 +282,10 @@ MACRO_END #define ip_kotype(port) io_kotype(ip_to_object(port)) #define ip_is_kobject(port) io_is_kobject(ip_to_object(port)) +#define ip_is_kolabeled(port) io_is_kolabeled(ip_to_object(port)) +#define ip_get_kobject(port) ipc_kobject_get(port) +#define ip_label_check(space, port, msgt_name) \ + (!ip_is_kolabeled(port) || ipc_kobject_label_check((space), (port), (msgt_name))) #define ip_full_kernel(port) imq_full_kernel(&(port)->ip_messages) #define ip_full(port) imq_full(&(port)->ip_messages) diff --git a/osfmk/ipc/ipc_space.c b/osfmk/ipc/ipc_space.c index 290c71673..3d2a0dc13 100644 --- a/osfmk/ipc/ipc_space.c +++ b/osfmk/ipc/ipc_space.c @@ -237,6 +237,7 @@ ipc_space_rand_freelist( kern_return_t ipc_space_create( ipc_table_size_t initial, + ipc_label_t label, ipc_space_t *spacep) { ipc_space_t space; @@ -271,6 +272,7 @@ ipc_space_create( space->is_table = table; space->is_table_next = initial + 1; space->is_task = NULL; + space->is_label = label; space->is_low_mod = new_size; space->is_high_mod = 0; space->is_node_id = HOST_LOCAL_NODE; /* HOST_LOCAL_NODE, except proxy spaces */ @@ -279,6 +281,67 @@ ipc_space_create( return KERN_SUCCESS; } +/* + * Routine: ipc_space_label + * Purpose: + * Modify the label on a space. The desired + * label must be a super-set of the current + * label for the space (as rights may already + * have been previously copied out under the + * old label value). + * Conditions: + * Nothing locked. + * Returns: + * KERN_SUCCESS Updated the label + * KERN_INVALID_VALUE label not a superset of old + */ +kern_return_t +ipc_space_label( + ipc_space_t space, + ipc_label_t label) +{ + is_write_lock(space); + if (!is_active(space)) { + is_write_unlock(space); + return KERN_SUCCESS; + } + + if ((space->is_label & label) != space->is_label) { + is_write_unlock(space); + return KERN_INVALID_VALUE; + } + space->is_label = label; + is_write_unlock(space); + return KERN_SUCCESS; +} + +/* + * Routine: ipc_space_add_label + * Purpose: + * Modify the label on a space. The desired + * label is added to the labels already set + * on the space. + * Conditions: + * Nothing locked.
+ * Returns: + * KERN_SUCCESS Updated the label + * (always; unlike ipc_space_label this cannot fail) + */ +kern_return_t +ipc_space_add_label( + ipc_space_t space, + ipc_label_t label) +{ + is_write_lock(space); + if (!is_active(space)) { + is_write_unlock(space); + return KERN_SUCCESS; + } + + space->is_label |= label; + is_write_unlock(space); + return KERN_SUCCESS; +} /* * Routine: ipc_space_create_special * Purpose: @@ -310,6 +373,7 @@ ipc_space_create_special( space->is_bits = IS_INACTIVE | 1; /* 1 ref, not active, not growing */ space->is_table = IE_NULL; space->is_task = TASK_NULL; + space->is_label = IPC_LABEL_SPECIAL; space->is_table_next = 0; space->is_low_mod = 0; space->is_high_mod = 0; diff --git a/osfmk/ipc/ipc_space.h b/osfmk/ipc/ipc_space.h index 161c55403..8a4466a4d 100644 --- a/osfmk/ipc/ipc_space.h +++ b/osfmk/ipc/ipc_space.h @@ -119,8 +119,9 @@ struct ipc_space { ipc_entry_num_t is_table_hashed;/* count of hashed elements */ ipc_entry_num_t is_table_free; /* count of free elements */ ipc_entry_t is_table; /* an array of entries */ - task_t is_task; /* associated task */ struct ipc_table_size *is_table_next; /* info for larger table */ + task_t is_task; /* associated task */ + ipc_label_t is_label; /* [private] mandatory access label */ ipc_entry_num_t is_low_mod; /* lowest modified entry during growth */ ipc_entry_num_t is_high_mod; /* highest modified entry during growth */ struct bool_gen bool_gen; /* state for boolean RNG */ @@ -225,8 +226,19 @@ extern kern_return_t ipc_space_create_special( /* Create a new IPC space */ extern kern_return_t ipc_space_create( ipc_table_size_t initial, + ipc_label_t label, ipc_space_t *spacep); +/* Change the label on an existing space */ +extern kern_return_t ipc_space_label( + ipc_space_t space, + ipc_label_t label); + +/* Add a label to an existing space */ +extern kern_return_t ipc_space_add_label( + ipc_space_t space, + ipc_label_t label); + /* Mark a space as dead and cleans up the entries*/ extern void ipc_space_terminate( ipc_space_t space); diff --git a/osfmk/ipc/ipc_types.h b/osfmk/ipc/ipc_types.h index eaf5a3798..f5fde0dae 100644 --- a/osfmk/ipc/ipc_types.h +++ b/osfmk/ipc/ipc_types.h @@ -62,6 +62,14 @@ typedef struct ipc_pset *ipc_pset_t; typedef struct ipc_kmsg *ipc_kmsg_t; typedef uint8_t sync_qos_count_t; +typedef uint64_t ipc_label_t; +#define IPC_LABEL_NONE ((ipc_label_t)0x0) +#define IPC_LABEL_DEXT ((ipc_label_t)0x1) +#define IPC_LABEL_PLATFORM ((ipc_label_t)0x2) +#define IPC_LABEL_SPECIAL ((ipc_label_t)0x3) + +typedef struct ipc_kobject_label *ipc_kobject_label_t; + #define IE_NULL ((ipc_entry_t) 0) #define ITS_NULL ((ipc_table_size_t) 0) diff --git a/osfmk/ipc/ipc_voucher.c b/osfmk/ipc/ipc_voucher.c index eeb226a87..bc739faa3 100644 --- a/osfmk/ipc/ipc_voucher.c +++ b/osfmk/ipc/ipc_voucher.c @@ -377,7 +377,12 @@ unsafe_convert_port_to_voucher( ipc_port_t port) { if (IP_VALID(port)) { - uintptr_t voucher = (uintptr_t) port->ip_kobject; + /* vouchers never labeled (they get transformed before use) */ + if (ip_is_kolabeled(port)) { + return (uintptr_t)IV_NULL; + } + + uintptr_t voucher = (uintptr_t)port->ip_kobject; /* * No need to lock because we have a reference on the @@ -407,7 +412,7 @@ convert_port_to_voucher( { if (IP_VALID(port)) { zone_require(port, ipc_object_zones[IOT_PORT]); - ipc_voucher_t voucher = (ipc_voucher_t) port->ip_kobject; + ipc_voucher_t voucher = (ipc_voucher_t) ip_get_kobject(port); /* * No need to lock because we have a reference on the @@ -487,13 +492,14 @@ ipc_voucher_notify(mach_msg_header_t
*msg) { mach_no_senders_notification_t *notification = (void *)msg; ipc_port_t port = notification->not_header.msgh_remote_port; + ipc_voucher_t voucher = (ipc_voucher_t)ip_get_kobject(port); require_ip_active(port); assert(IKOT_VOUCHER == ip_kotype(port)); /* consume the reference donated by convert_voucher_to_port */ - zone_require((ipc_voucher_t)port->ip_kobject, ipc_voucher_zone); - ipc_voucher_release((ipc_voucher_t)port->ip_kobject); + zone_require(voucher, ipc_voucher_zone); + ipc_voucher_release(voucher); } /* @@ -671,7 +677,7 @@ convert_port_to_voucher_attr_control( { if (IP_VALID(port)) { zone_require(port, ipc_object_zones[IOT_PORT]); - ipc_voucher_attr_control_t ivac = (ipc_voucher_attr_control_t) port->ip_kobject; + ipc_voucher_attr_control_t ivac = (ipc_voucher_attr_control_t) ip_get_kobject(port); /* * No need to lock because we have a reference on the @@ -702,12 +708,15 @@ ipc_voucher_attr_control_notify(mach_msg_header_t *msg) { mach_no_senders_notification_t *notification = (void *)msg; ipc_port_t port = notification->not_header.msgh_remote_port; + ipc_voucher_attr_control_t ivac; require_ip_active(port); assert(IKOT_VOUCHER_ATTR_CONTROL == ip_kotype(port)); /* release the reference donated by convert_voucher_attr_control_to_port */ - ivac_release((ipc_voucher_attr_control_t)port->ip_kobject); + ivac = (ipc_voucher_attr_control_t)ip_get_kobject(port); + zone_require(ivac, ipc_voucher_attr_control_zone); + ivac_release(ivac); } /* @@ -2638,7 +2647,7 @@ ipc_get_pthpriority_from_kmsg_voucher( return KERN_FAILURE; } - pthread_priority_voucher = (ipc_voucher_t)kmsg->ikm_voucher->ip_kobject; + pthread_priority_voucher = (ipc_voucher_t)ip_get_kobject(kmsg->ikm_voucher); kr = mach_voucher_extract_attr_recipe(pthread_priority_voucher, MACH_VOUCHER_ATTR_KEY_PTHPRIORITY, content_data, @@ -2683,7 +2692,7 @@ ipc_voucher_send_preprocessing(ipc_kmsg_t kmsg) } /* setup recipe for preprocessing of all the attributes. */ - pre_processed_voucher = (ipc_voucher_t)kmsg->ikm_voucher->ip_kobject; + pre_processed_voucher = (ipc_voucher_t)ip_get_kobject(kmsg->ikm_voucher); kr = ipc_voucher_prepare_processing_recipe(pre_processed_voucher, (mach_voucher_attr_raw_recipe_array_t)recipes, @@ -2732,7 +2741,7 @@ ipc_voucher_receive_postprocessing( } /* setup recipe for auto redeem of all the attributes. 
*/ - sent_voucher = (ipc_voucher_t)kmsg->ikm_voucher->ip_kobject; + sent_voucher = (ipc_voucher_t)ip_get_kobject(kmsg->ikm_voucher); kr = ipc_voucher_prepare_processing_recipe(sent_voucher, (mach_voucher_attr_raw_recipe_array_t)recipes, diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index cf1d90c0b..baa3b96d2 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -86,6 +86,7 @@ #include #include +#include #endif /* @@ -452,21 +453,23 @@ mach_port_dnrequest_info( #if !MACH_IPC_DEBUG kern_return_t -mach_port_kobject( +mach_port_kobject_description( __unused ipc_space_t space, __unused mach_port_name_t name, __unused natural_t *typep, - __unused mach_vm_address_t *addrp) + __unused mach_vm_address_t *addrp, + __unused kobject_description_t desc) { return KERN_FAILURE; } #else kern_return_t -mach_port_kobject( +mach_port_kobject_description( ipc_space_t space, mach_port_name_t name, natural_t *typep, - mach_vm_address_t *addrp) + mach_vm_address_t *addrp, + kobject_description_t desc) { ipc_entry_t entry; ipc_port_t port; @@ -500,19 +503,53 @@ mach_port_kobject( } *typep = (unsigned int) ip_kotype(port); - kaddr = (mach_vm_address_t)port->ip_kobject; + kaddr = (mach_vm_address_t)ip_get_kobject(port); *addrp = 0; #if (DEVELOPMENT || DEBUG) if (kaddr && ip_is_kobject(port)) { *addrp = VM_KERNEL_UNSLIDE_OR_PERM(kaddr); } #endif + + io_object_t obj = NULL; + natural_t kotype = ip_kotype(port); + if (desc) { + *desc = '\0'; + switch (kotype) { + case IKOT_IOKIT_OBJECT: + case IKOT_IOKIT_CONNECT: + case IKOT_IOKIT_IDENT: + case IKOT_UEXT_OBJECT: + obj = (io_object_t) kaddr; + iokit_add_reference(obj, IKOT_IOKIT_OBJECT); + break; + + default: + break; + } + } + ip_unlock(port); + if (obj) { + iokit_port_object_description(obj, desc); + iokit_remove_reference(obj); + } + return KERN_SUCCESS; } #endif /* MACH_IPC_DEBUG */ +kern_return_t +mach_port_kobject( + ipc_space_t space, + mach_port_name_t name, + natural_t *typep, + mach_vm_address_t *addrp) +{ + return mach_port_kobject_description(space, name, typep, addrp, NULL); +} + /* * Routine: mach_port_kernel_object [Legacy kernel call] * Purpose: diff --git a/osfmk/kern/arcade.c b/osfmk/kern/arcade.c index 48c4b6014..8bd54f11d 100644 --- a/osfmk/kern/arcade.c +++ b/osfmk/kern/arcade.c @@ -126,7 +126,7 @@ convert_port_to_arcade_register( /* No need to lock port because of how refs managed */ if (ip_kotype(port) == IKOT_ARCADE_REG) { assert(ip_active(port)); - arcade_reg = (arcade_register_t)port->ip_kobject; + arcade_reg = (arcade_register_t)ip_get_kobject(port); assert(arcade_reg == &arcade_register_global); assert(arcade_reg->ar_port == port); } diff --git a/osfmk/kern/audit_sessionport.c b/osfmk/kern/audit_sessionport.c index eb51597a2..a3e20dcc1 100644 --- a/osfmk/kern/audit_sessionport.c +++ b/osfmk/kern/audit_sessionport.c @@ -90,7 +90,7 @@ audit_session_porttoaia(ipc_port_t port) ip_lock(port); if (IKOT_AU_SESSIONPORT == ip_kotype(port)) { require_ip_active(port); - aia_p = (struct auditinfo_addr *)port->ip_kobject; + aia_p = (struct auditinfo_addr *)ip_get_kobject(port); } ip_unlock(port); } @@ -119,7 +119,7 @@ audit_session_nosenders(mach_msg_header_t *msg) require_ip_active(port); assert(IKOT_AU_SESSIONPORT == ip_kotype(port)); - port_aia_p = (struct auditinfo_addr *)port->ip_kobject; + port_aia_p = (struct auditinfo_addr *)ip_get_kobject(port); assert(NULL != port_aia_p); audit_session_aiaunref(port_aia_p); diff --git a/osfmk/kern/backtrace.c b/osfmk/kern/backtrace.c index 59667c828..3de320d9f 100644 --- 
a/osfmk/kern/backtrace.c +++ b/osfmk/kern/backtrace.c @@ -242,17 +242,17 @@ backtrace_interrupted(uintptr_t *bt, unsigned int max_frames, was_truncated_out) + 1; } -int +unsigned int backtrace_user(uintptr_t *bt, unsigned int max_frames, - unsigned int *frames_out, bool *user_64_out, bool *was_truncated_out) + int *error_out, bool *user_64_out, bool *was_truncated_out) { return backtrace_thread_user(current_thread(), bt, max_frames, - frames_out, user_64_out, was_truncated_out); + error_out, user_64_out, was_truncated_out); } -int +unsigned int backtrace_thread_user(void *thread, uintptr_t *bt, unsigned int max_frames, - unsigned int *frames_out, bool *user_64_out, bool *was_truncated_out) + int *error_out, bool *user_64_out, bool *was_truncated_out) { bool user_64; uintptr_t pc = 0, fp = 0, next_fp = 0; @@ -263,7 +263,6 @@ backtrace_thread_user(void *thread, uintptr_t *bt, unsigned int max_frames, assert(bt != NULL); assert(max_frames > 0); - assert(frames_out != NULL); #if defined(__x86_64__) @@ -405,8 +404,10 @@ out: if (user_64_out) { *user_64_out = user_64; } + if (error_out) { + *error_out = err; + } - *frames_out = frame_index; - return err; + return frame_index; #undef INVALID_USER_FP } diff --git a/osfmk/kern/backtrace.h b/osfmk/kern/backtrace.h index 8b56b26df..4123482e1 100644 --- a/osfmk/kern/backtrace.h +++ b/osfmk/kern/backtrace.h @@ -108,17 +108,20 @@ unsigned int backtrace_interrupted(uintptr_t *bt, unsigned int btlen, * thread, nor can it be called from interrupt context or with interrupts * disabled. * - * @param btwritten On success, the number of return addresses written is stored - * here. + * @param error The precise error code that occurred is stored here, or 0 if no + * error occurred. * * @param user64 On success, true is stored here if user space was running in * 64-bit mode, and false is stored otherwise. * - * @return Returns 0 on success and an errno value on error. + * @param was_truncated true is stored here if the full stack could not be written + * to bt. + * + * @return Returns the number of frames written to bt. 
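The signature change inverts the old contract: the frame count is now the return value and the errno travels through an out-parameter. A hypothetical kernel-side caller updated for the new API (the helper name is illustrative, only backtrace_user() comes from the header above):

#include <stdbool.h>
#include <kern/backtrace.h>

/* Capture up to `max` user frames for the current thread (sketch). */
static unsigned int
capture_user_stack(uintptr_t *frames, unsigned int max)
{
    int err = 0;
    bool user64 = false;
    bool truncated = false;
    unsigned int n = backtrace_user(frames, max, &err, &user64, &truncated);

    if (err != 0) {
        return 0;      /* e.g. a copyin fault; frames[] is not meaningful */
    }
    return n;          /* `truncated` reports whether the stack was cut off */
}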
* * @seealso backtrace */ -int backtrace_user(uintptr_t *bt, unsigned int btlen, unsigned int *btwritten, +unsigned int backtrace_user(uintptr_t *bt, unsigned int btlen, int *error, bool *user64, bool *was_truncated); /* @@ -134,8 +137,8 @@ int backtrace_user(uintptr_t *bt, unsigned int btlen, unsigned int *btwritten, * * @see backtrace_user */ -int backtrace_thread_user(void *thread, uintptr_t *bt, unsigned int btlen, - unsigned int *btwritten, bool *user64, bool *was_truncated); +unsigned int backtrace_thread_user(void *thread, uintptr_t *bt, + unsigned int btlen, int *error, bool *user64, bool *was_truncated); __END_DECLS diff --git a/osfmk/kern/block_hint.h b/osfmk/kern/block_hint.h index 25fb8477e..a28e09f22 100644 --- a/osfmk/kern/block_hint.h +++ b/osfmk/kern/block_hint.h @@ -49,6 +49,7 @@ typedef enum thread_snapshot_wait_flags { kThreadWaitWorkloopSyncWait = 0x10, kThreadWaitOnProcess = 0x11, kThreadWaitSleepWithInheritor = 0x12, + kThreadWaitCompressor = 0x14, } __attribute__((packed)) block_hint_t; _Static_assert(sizeof(block_hint_t) <= sizeof(short), diff --git a/osfmk/kern/circle_queue.h b/osfmk/kern/circle_queue.h index 4ec2af237..8d9453f5f 100644 --- a/osfmk/kern/circle_queue.h +++ b/osfmk/kern/circle_queue.h @@ -149,6 +149,24 @@ circle_dequeue_tail(circle_queue_t cq) return elt; } +static inline void +circle_queue_rotate_head_forward(circle_queue_t cq) +{ + queue_entry_t first = circle_queue_first(cq); + if (first != NULL) { + cq->head = first->next; + } +} + +static inline void +circle_queue_rotate_head_backward(circle_queue_t cq) +{ + queue_entry_t last = circle_queue_last(cq); + if (last != NULL) { + cq->head = last; + } +} + /* * Macro: cqe_element * Function: diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index e885cee9b..101678d7b 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -1619,7 +1619,9 @@ clock_interval_to_deadline( clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime); - *result = mach_absolute_time() + abstime; + if (os_add_overflow(mach_absolute_time(), abstime, result)) { + *result = UINT64_MAX; + } } void @@ -1627,7 +1629,9 @@ clock_absolutetime_interval_to_deadline( uint64_t abstime, uint64_t *result) { - *result = mach_absolute_time() + abstime; + if (os_add_overflow(mach_absolute_time(), abstime, result)) { + *result = UINT64_MAX; + } } void @@ -1635,7 +1639,9 @@ clock_continuoustime_interval_to_deadline( uint64_t conttime, uint64_t *result) { - *result = mach_continuous_time() + conttime; + if (os_add_overflow(mach_continuous_time(), conttime, result)) { + *result = UINT64_MAX; + } } void @@ -1653,14 +1659,23 @@ clock_deadline_for_periodic_event( { assert(interval != 0); - *deadline += interval; + // *deadline += interval; + if (os_add_overflow(*deadline, interval, deadline)) { + *deadline = UINT64_MAX; + } if (*deadline <= abstime) { - *deadline = abstime + interval; - abstime = mach_absolute_time(); + // *deadline = abstime + interval; + if (os_add_overflow(abstime, interval, deadline)) { + *deadline = UINT64_MAX; + } + abstime = mach_absolute_time(); if (*deadline <= abstime) { - *deadline = abstime + interval; + // *deadline = abstime + interval; + if (os_add_overflow(abstime, interval, deadline)) { + *deadline = UINT64_MAX; + } } } } diff --git a/osfmk/kern/host_notify.c b/osfmk/kern/host_notify.c index dfb2703b4..36175a0a2 100644 --- a/osfmk/kern/host_notify.c +++ b/osfmk/kern/host_notify.c @@ -135,7 +135,7 @@ host_notify_port_destroy( ip_lock(port); if (ip_kotype(port) == IKOT_HOST_NOTIFY) { - entry = 
(host_notify_t)port->ip_kobject; + entry = (host_notify_t)ip_get_kobject(port); assert(entry != NULL); ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); ip_unlock(port); @@ -187,7 +187,7 @@ host_notify_all( ip_lock(port); assert(ip_kotype(port) == IKOT_HOST_NOTIFY); - assert(port->ip_kobject == (ipc_kobject_t)entry); + assert(ip_get_kobject(port) == (ipc_kobject_t)entry); ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); ip_unlock(port); diff --git a/osfmk/kern/ipc_clock.c b/osfmk/kern/ipc_clock.c index 800c7b857..58c5a7387 100644 --- a/osfmk/kern/ipc_clock.c +++ b/osfmk/kern/ipc_clock.c @@ -105,7 +105,7 @@ convert_port_to_clock( if (ip_active(port) && ((ip_kotype(port) == IKOT_CLOCK) || (ip_kotype(port) == IKOT_CLOCK_CTRL))) { - clock = (clock_t) port->ip_kobject; + clock = (clock_t)ip_get_kobject(port); } ip_unlock(port); } @@ -131,7 +131,7 @@ convert_port_to_clock_ctrl( ip_lock(port); if (ip_active(port) && (ip_kotype(port) == IKOT_CLOCK_CTRL)) { - clock = (clock_t) port->ip_kobject; + clock = (clock_t) ip_get_kobject(port); } ip_unlock(port); } @@ -195,7 +195,7 @@ port_name_to_clock( return clock; } if (ip_kotype(port) == IKOT_CLOCK) { - clock = (clock_t) port->ip_kobject; + clock = (clock_t) ip_get_kobject(port); } ip_unlock(port); return clock; diff --git a/osfmk/kern/ipc_host.c b/osfmk/kern/ipc_host.c index 2b1b29008..7cf4b903b 100644 --- a/osfmk/kern/ipc_host.c +++ b/osfmk/kern/ipc_host.c @@ -281,7 +281,7 @@ convert_port_to_host( if (IP_VALID(port)) { if (ip_kotype(port) == IKOT_HOST || ip_kotype(port) == IKOT_HOST_PRIV) { - host = (host_t) port->ip_kobject; + host = (host_t) ip_get_kobject(port); require_ip_active(port); } } @@ -307,7 +307,7 @@ convert_port_to_host_priv( ip_lock(port); if (ip_active(port) && (ip_kotype(port) == IKOT_HOST_PRIV)) { - host = (host_t) port->ip_kobject; + host = (host_t) ip_get_kobject(port); } ip_unlock(port); } @@ -335,7 +335,7 @@ convert_port_to_processor( ip_lock(port); if (ip_active(port) && (ip_kotype(port) == IKOT_PROCESSOR)) { - processor = (processor_t) port->ip_kobject; + processor = (processor_t) ip_get_kobject(port); } ip_unlock(port); } @@ -404,7 +404,7 @@ ref_pset_port_locked(ipc_port_t port, boolean_t matchn, processor_set_t *ppset) if (ip_active(port) && ((ip_kotype(port) == IKOT_PSET) || (matchn && (ip_kotype(port) == IKOT_PSET_NAME)))) { - pset = (processor_set_t) port->ip_kobject; + pset = (processor_set_t) ip_get_kobject(port); } *ppset = pset; @@ -519,7 +519,7 @@ convert_port_to_host_security( ip_lock(port); if (ip_active(port) && (ip_kotype(port) == IKOT_HOST_SECURITY)) { - host = (host_t) port->ip_kobject; + host = (host_t) ip_get_kobject(port); } ip_unlock(port); } diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index d29c63124..c942f3141 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -144,6 +144,7 @@ #include #include #include +#include #include @@ -182,7 +183,7 @@ static mig_hash_t mig_buckets[MAX_MIG_ENTRIES]; static int mig_table_max_displ; static mach_msg_size_t mig_reply_size = sizeof(mig_reply_error_t); - +static zone_t ipc_kobject_label_zone; const struct mig_subsystem *mig_e[] = { (const struct mig_subsystem *)&mach_vm_subsystem, @@ -223,7 +224,7 @@ const struct mig_subsystem *mig_e[] = { #endif }; -void +static void mig_init(void) { unsigned int i, n = sizeof(mig_e) / sizeof(const struct mig_subsystem *); @@ -267,6 +268,24 @@ mig_init(void) printf("mig_table_max_displ = %d\n", mig_table_max_displ); } +/* + * Routine: ipc_kobject_init + * Purpose: + * Deliver 
notifications to kobjects that care about them. + */ +void +ipc_kobject_init(void) +{ + int label_max = CONFIG_TASK_MAX + CONFIG_THREAD_MAX + 1000 /* UEXT estimate */; + + mig_init(); + + ipc_kobject_label_zone = + zinit(sizeof(struct ipc_kobject_label), + label_max * sizeof(struct ipc_kobject_label), + sizeof(struct ipc_kobject_label), + "ipc kobject labels"); +} /* * Routine: ipc_kobject_server @@ -604,13 +623,49 @@ ipc_kobject_set_atomically( port->ip_spares[2] = (port->ip_object.io_bits & IO_BITS_KOTYPE); #endif /* MACH_ASSERT */ port->ip_object.io_bits = (port->ip_object.io_bits & ~IO_BITS_KOTYPE) | type; - port->ip_kobject = kobject; + if (ip_is_kolabeled(port)) { + ipc_kobject_label_t labelp = port->ip_kolabel; + labelp->ikol_kobject = kobject; + } else { + port->ip_kobject = kobject; + } if (type != IKOT_NONE) { /* Once set, this bit can never be unset */ port->ip_object.io_bits |= IO_BITS_KOBJECT; } } +/* + * Routine: ipc_kobject_init_port + * Purpose: + * Initialize a kobject port with the given types and options. + * + * This function never fails. + */ +static inline void +ipc_kobject_init_port( + ipc_port_t port, + ipc_kobject_t kobject, + ipc_kobject_type_t type, + ipc_kobject_alloc_options_t options) +{ + ipc_kobject_set_atomically(port, kobject, type); + + if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) { + ipc_port_make_send_locked(port); + } + if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { + ipc_port_make_sonce_locked(port); + port->ip_nsrequest = port; + } + if (options & IPC_KOBJECT_ALLOC_NO_GRANT) { + port->ip_no_grant = 1; + } + if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) { + port->ip_immovable_send = 1; + } +} + /* * Routine: ipc_kobject_alloc_port * Purpose: @@ -627,53 +682,52 @@ ipc_kobject_alloc_port( ipc_kobject_type_t type, ipc_kobject_alloc_options_t options) { - ipc_port_init_flags_t flags; - ipc_space_t space; - ipc_port_t port; + ipc_port_t port = ipc_port_alloc_kernel(); - if (options & IPC_KOBJECT_ALLOC_IN_TRANSIT) { - /* kobject port intended to be copied out to user-space */ - flags = IPC_PORT_INIT_MESSAGE_QUEUE; - space = IS_NULL; - } else { - /* true kernel-bound kobject port */ - flags = IPC_PORT_INIT_NONE; - space = ipc_space_kernel; - } - port = ipc_port_alloc_special(space, flags); if (port == IP_NULL) { panic("ipc_kobject_alloc_port(): failed to allocate port"); } - ipc_kobject_set_atomically(port, kobject, type); + ipc_kobject_init_port(port, kobject, type, options); + return port; +} - if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) { - ipc_port_make_send_locked(port); - } +/* + * Routine: ipc_kobject_alloc_labeled_port + * Purpose: + * Allocate a kobject port and associated mandatory access label + * in the kernel space of the specified type. + * + * This function never fails. 
+ * + * Conditions: + * No locks held (memory is allocated) + */ - if (options & IPC_KOBJECT_ALLOC_IN_TRANSIT) { - /* reset the port like it has been copied in circularity checked */ - if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { - panic("ipc_kobject_alloc_port(): invalid option for user-space port"); - } - port->ip_mscount = 0; - assert(port->ip_tempowner == 0); - assert(port->ip_receiver == IS_NULL); - port->ip_receiver = IS_NULL; - port->ip_receiver_name = MACH_PORT_NULL; - } else { - if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { - ipc_port_make_sonce_locked(port); - port->ip_nsrequest = port; - } - } - if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) { - port->ip_immovable_send = 1; +ipc_port_t +ipc_kobject_alloc_labeled_port( + ipc_kobject_t kobject, + ipc_kobject_type_t type, + ipc_label_t label, + ipc_kobject_alloc_options_t options) +{ + ipc_port_t port; + ipc_kobject_label_t labelp; + + port = ipc_port_alloc_kernel(); + if (port == IP_NULL) { + panic("ipc_kobject_alloc_port(): failed to allocate port"); } - if (options & IPC_KOBJECT_ALLOC_NO_GRANT) { - port->ip_no_grant = 1; + + labelp = (ipc_kobject_label_t)zalloc(ipc_kobject_label_zone); + if (labelp == NULL) { + panic("ipc_kobject_alloc_labeled_port(): failed to allocate label"); } + labelp->ikol_label = label; + port->ip_kolabel = labelp; + port->ip_object.io_bits |= IO_BITS_KOLABEL; + ipc_kobject_init_port(port, kobject, type, options); return port; } @@ -744,16 +798,92 @@ ipc_kobject_make_send_lazy_alloc_port( return rc; } +/* + * Routine: ipc_kobject_make_send_lazy_alloc_labeled_port + * Purpose: + * Make a send once for a kobject port. + * + * A location owning this port is passed in port_store. + * If no port exists, a port is made lazily. + * + * A send right is made for the port, and if this is the first one + * (possibly not for the first time), then the no-more-senders + * notification is rearmed. + * + * When a notification is armed, the kobject must donate + * one of its references to the port. It is expected + * the no-more-senders notification will consume this reference. + * + * Returns: + * TRUE if a notification was armed + * FALSE else + * + * Conditions: + * Nothing is locked, memory can be allocated. + * The caller must be able to donate a kobject reference to the port. 
+ */ +boolean_t +ipc_kobject_make_send_lazy_alloc_labeled_port( + ipc_port_t *port_store, + ipc_kobject_t kobject, + ipc_kobject_type_t type, + ipc_label_t label) +{ + ipc_port_t port, previous; + boolean_t rc = FALSE; + + port = os_atomic_load(port_store, dependency); + + if (!IP_VALID(port)) { + port = ipc_kobject_alloc_labeled_port(kobject, type, label, + IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + if (os_atomic_cmpxchgv(port_store, IP_NULL, port, &previous, release)) { + return TRUE; + } + + // undo what ipc_kobject_alloc_port() did above + port->ip_nsrequest = IP_NULL; + port->ip_mscount = 0; + port->ip_sorights = 0; + port->ip_srights = 0; + ip_release(port); + ip_release(port); + zfree(ipc_kobject_label_zone, port->ip_kolabel); + port->ip_object.io_bits &= ~IO_BITS_KOLABEL; + port->ip_kolabel = NULL; + ipc_port_dealloc_kernel(port); + + port = previous; + assert(ip_is_kolabeled(port)); + } + + ip_lock(port); + ipc_port_make_send_locked(port); + if (port->ip_srights == 1) { + ipc_port_make_sonce_locked(port); + assert(port->ip_nsrequest == IP_NULL); + port->ip_nsrequest = port; + rc = TRUE; + } + ip_unlock(port); + + return rc; +} + + /* * Routine: ipc_kobject_destroy * Purpose: * Release any kernel object resources associated * with the port, which is being destroyed. * - * This should only be needed when resources are - * associated with a user's port. In the normal case, - * when the kernel is the receiver, the code calling - * ipc_port_dealloc_kernel should clean up the resources. + * This path to free object resources should only be + * needed when resources are associated with a user's port. + * In the normal case, when the kernel is the receiver, + * the code calling ipc_port_dealloc_kernel should clean + * up the object resources. + * + * Cleans up any kobject label that might be present. * Conditions: * The port is not locked, but it is dead. */ @@ -775,11 +905,56 @@ ipc_kobject_destroy( host_notify_port_destroy(port); break; + case IKOT_SUID_CRED: + suid_cred_destroy(port); + break; + default: break; } + + if (ip_is_kolabeled(port)) { + ipc_kobject_label_t labelp = port->ip_kolabel; + + assert(labelp != NULL); + assert(ip_is_kobject(port)); + port->ip_kolabel = NULL; + port->ip_object.io_bits &= ~IO_BITS_KOLABEL; + zfree(ipc_kobject_label_zone, labelp); + } } +/* + * Routine: ipc_kobject_label_check + * Purpose: + * Check to see if the space is allowed to possess a + * right for the given port. In order to qualify, the + * space label must contain all the privileges listed + * in the port/kobject label. + * + * Conditions: + * Space is write locked and active. + * Port is locked and active. 
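+ *
+ *	Worked example (label bit values illustrative only): a port
+ *	labeled 0x3 may be copied out to a space labeled 0x7, since
+ *	(0x3 & 0x7) == 0x3; it may not be copied out to a space labeled
+ *	0x5, since (0x3 & 0x5) == 0x1 != 0x3.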
+ */
+boolean_t
+ipc_kobject_label_check(
+	ipc_space_t space,
+	ipc_port_t port,
+	__unused mach_msg_type_name_t msgt_name)
+{
+	ipc_kobject_label_t labelp;
+
+	assert(is_active(space));
+	assert(ip_active(port));
+
+	/* Unlabeled ports/kobjects are always allowed */
+	if (!ip_is_kolabeled(port)) {
+		return TRUE;
+	}
+
+	labelp = port->ip_kolabel;
+	return (labelp->ikol_label & space->is_label) == labelp->ikol_label;
+}

 boolean_t
 ipc_kobject_notify(
@@ -860,7 +1035,12 @@ ipc_kobject_notify(
 	case IKOT_WORK_INTERVAL:
 		work_interval_port_notify(request_header);
 		return TRUE;
+
+	case IKOT_SUID_CRED:
+		suid_cred_notify(request_header);
+		return TRUE;
 	}
+
 	break;

 	case MACH_NOTIFY_PORT_DELETED:
diff --git a/osfmk/kern/ipc_kobject.h b/osfmk/kern/ipc_kobject.h
index 24913d602..03014d1dc 100644
--- a/osfmk/kern/ipc_kobject.h
+++ b/osfmk/kern/ipc_kobject.h
@@ -133,19 +133,24 @@ typedef natural_t ipc_kobject_type_t;
 #define IKOT_UEXT_OBJECT                41
 #define IKOT_ARCADE_REG                 42

+#define IKOT_SUID_CRED                  48
+
 /*
  * Add new entries here and adjust IKOT_UNKNOWN.
  * Please keep ipc/ipc_object.c:ikot_print_array up to date.
  */
-#define IKOT_UNKNOWN                    43      /* magic catchall */
+#define IKOT_UNKNOWN                    49      /* magic catchall */
 #define IKOT_MAX_TYPE   (IKOT_UNKNOWN+1)        /* # of IKOT_ types */

 #ifdef MACH_KERNEL_PRIVATE

-/*
- * Define types of kernel objects that use page lists instead
- * of entry lists for copyin of out of line memory.
- */
+struct ipc_kobject_label {
+	ipc_label_t   ikol_label;       /* [private] mandatory access label */
+	ipc_kobject_t ikol_kobject;     /* actual kobject address */
+};
+
+/* initialization of kobject subsystem */
+extern void ipc_kobject_init(void);

 /* Dispatch a kernel server function */
 extern ipc_kmsg_t ipc_kobject_server(
@@ -174,8 +179,8 @@ __options_decl(ipc_kobject_alloc_options_t, uint32_t, {
 	IPC_KOBJECT_ALLOC_NO_GRANT       = 0x00000004,
 	/* Make all the send rights immovable */
 	IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008,
-	/* Make the port in-transit from the get-go */
-	IPC_KOBJECT_ALLOC_IN_TRANSIT     = 0x00000010,
+	/* Add a label structure to the port */
+	IPC_KOBJECT_ALLOC_LABEL          = 0x00000010,
 });

 /* Allocates a kobject port, never fails */
@@ -184,12 +189,44 @@ extern ipc_port_t ipc_kobject_alloc_port(
 	ipc_kobject_type_t      type,
 	ipc_kobject_alloc_options_t     options);

+/* Allocates a labeled kobject port, never fails */
+extern ipc_port_t ipc_kobject_alloc_labeled_port(
+	ipc_kobject_t           kobject,
+	ipc_kobject_type_t      type,
+	ipc_label_t             label,
+	ipc_kobject_alloc_options_t     options);
+
 /* Makes a send right, lazily allocating a kobject port, arming for no-senders, never fails */
 extern boolean_t ipc_kobject_make_send_lazy_alloc_port(
 	ipc_port_t              *port_store,
 	ipc_kobject_t           kobject,
 	ipc_kobject_type_t      type) __result_use_check;

+/* Makes a send right, lazily allocating a labeled kobject port, arming for no-senders, never fails */
+extern boolean_t ipc_kobject_make_send_lazy_alloc_labeled_port(
+	ipc_port_t              *port_store,
+	ipc_kobject_t           kobject,
+	ipc_kobject_type_t      type,
+	ipc_label_t             label) __result_use_check;
+
+/* Get the kobject address associated with a port */
+static inline ipc_kobject_t
+ipc_kobject_get(ipc_port_t port)
+{
+	if (ip_is_kobject(port)) {
+		if (ip_is_kolabeled(port)) {
+			return port->ip_kolabel->ikol_kobject;
+		}
+		return port->ip_kobject;
+	}
+	return 0;
+}
+
+/* Check if a kobject can be copied out to a given space */
+extern boolean_t ipc_kobject_label_check(
+	ipc_space_t             space,
+	ipc_port_t              port,
+	mach_msg_type_name_t    msgt_name);

 /* Release any kernel object resources associated with a port */
 extern
void ipc_kobject_destroy( diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index 722384a00..6896e3793 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -203,12 +203,27 @@ mach_msg_send_from_kernel_with_options( mach_msg_size_t send_size, mach_msg_option_t option, mach_msg_timeout_t timeout_val) +{ + return kernel_mach_msg_send(msg, send_size, option, timeout_val, NULL); +} + +mach_msg_return_t +kernel_mach_msg_send( + mach_msg_header_t *msg, + mach_msg_size_t send_size, + mach_msg_option_t option, + mach_msg_timeout_t timeout_val, + boolean_t *message_moved) { ipc_kmsg_t kmsg; mach_msg_return_t mr; KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + if (message_moved) { + *message_moved = FALSE; + } + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); if (mr != MACH_MSG_SUCCESS) { KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); @@ -222,6 +237,10 @@ mach_msg_send_from_kernel_with_options( return mr; } + if (message_moved) { + *message_moved = TRUE; + } + /* * Until we are sure of its effects, we are disabling * importance donation from the kernel-side of user @@ -313,9 +332,6 @@ mach_msg_send_from_kernel_with_options_legacy( * MACH_RCV_PORT_DIED The reply port was deallocated. */ -mach_msg_return_t mach_msg_rpc_from_kernel_body(mach_msg_header_t *msg, - mach_msg_size_t send_size, mach_msg_size_t rcv_size, boolean_t legacy); - #if IKM_SUPPORT_LEGACY #undef mach_msg_rpc_from_kernel @@ -331,9 +347,8 @@ mach_msg_rpc_from_kernel( mach_msg_size_t send_size, mach_msg_size_t rcv_size) { - return mach_msg_rpc_from_kernel_body(msg, send_size, rcv_size, TRUE); + return kernel_mach_msg_rpc(msg, send_size, rcv_size, TRUE, NULL); } - #endif /* IKM_SUPPORT_LEGACY */ mach_msg_return_t @@ -342,18 +357,19 @@ mach_msg_rpc_from_kernel_proper( mach_msg_size_t send_size, mach_msg_size_t rcv_size) { - return mach_msg_rpc_from_kernel_body(msg, send_size, rcv_size, FALSE); + return kernel_mach_msg_rpc(msg, send_size, rcv_size, FALSE, NULL); } mach_msg_return_t -mach_msg_rpc_from_kernel_body( +kernel_mach_msg_rpc( mach_msg_header_t *msg, mach_msg_size_t send_size, mach_msg_size_t rcv_size, #if !IKM_SUPPORT_LEGACY __unused #endif - boolean_t legacy) + boolean_t legacy, + boolean_t *message_moved) { thread_t self = current_thread(); ipc_port_t reply; @@ -365,6 +381,10 @@ mach_msg_rpc_from_kernel_body( KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_START); + if (message_moved) { + *message_moved = FALSE; + } + mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); if (mr != MACH_MSG_SUCCESS) { KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_END, mr); @@ -401,6 +421,10 @@ mach_msg_rpc_from_kernel_body( return mr; } + if (message_moved) { + *message_moved = TRUE; + } + /* * respect the thread's SEND_IMPORTANCE option to force importance * donation from the kernel-side of user threads @@ -1045,7 +1069,7 @@ convert_port_to_mig_object( * query it to get a reference to the desired interface. 
*/ ppv = NULL; - mig_object = (mig_object_t)port->ip_kobject; + mig_object = (mig_object_t) ip_get_kobject(port); mig_object->pVtbl->QueryInterface((IMIGObject *)mig_object, iid, &ppv); ip_unlock(port); return (mig_object_t)ppv; @@ -1068,7 +1092,7 @@ mig_object_no_senders( assert(IKOT_MIG == ip_kotype(port)); /* consume the reference donated by convert_mig_object_to_port */ - mig_object_deallocate((mig_object_t)port->ip_kobject); + mig_object_deallocate((mig_object_t) ip_get_kobject(port)); } /* diff --git a/osfmk/kern/ipc_mig.h b/osfmk/kern/ipc_mig.h index a4fad67e1..48abc2591 100644 --- a/osfmk/kern/ipc_mig.h +++ b/osfmk/kern/ipc_mig.h @@ -155,6 +155,15 @@ mach_msg_rpc_from_kernel_proper( #define mach_msg_rpc_from_kernel mach_msg_rpc_from_kernel_proper +#ifdef XNU_KERNEL_PRIVATE +mach_msg_return_t kernel_mach_msg_rpc( + mach_msg_header_t *msg, + mach_msg_size_t send_size, + mach_msg_size_t rcv_size, + boolean_t legacy, + boolean_t *message_moved); +#endif /* XNU_KERNEL_PRIVATE */ + extern void mach_msg_destroy_from_kernel_proper( mach_msg_header_t *msg); @@ -168,6 +177,13 @@ extern mach_msg_return_t mach_msg_send_from_kernel_with_options_legacy( mach_msg_option_t option, mach_msg_timeout_t timeout_val); +extern mach_msg_return_t kernel_mach_msg_send( + mach_msg_header_t *msg, + mach_msg_size_t send_size, + mach_msg_option_t option, + mach_msg_timeout_t timeout_val, + boolean_t *message_moved); + extern mach_msg_return_t mach_msg_send_from_kernel_with_options( mach_msg_header_t *msg, mach_msg_size_t send_size, @@ -188,9 +204,6 @@ __END_DECLS extern void mach_msg_receive_continue(void); -/* Initialize kernel server dispatch table */ -extern void mig_init(void); - /* * Kernel implementation of the MIG object base class * diff --git a/osfmk/kern/ipc_misc.c b/osfmk/kern/ipc_misc.c index 16c3c5a51..aaec28a5d 100644 --- a/osfmk/kern/ipc_misc.c +++ b/osfmk/kern/ipc_misc.c @@ -82,7 +82,7 @@ fileport_port_to_fileglob(ipc_port_t port) ip_lock(port); if (ip_active(port) && IKOT_FILEPORT == ip_kotype(port)) { - fg = (void *)port->ip_kobject; + fg = (void *) ip_get_kobject(port); } ip_unlock(port); @@ -112,7 +112,7 @@ fileport_notify(mach_msg_header_t *msg) ip_lock(port); - fg = (struct fileglob *)port->ip_kobject; + fg = (struct fileglob *) ip_get_kobject(port); if (!ip_active(port)) { panic("Inactive port passed to fileport_notify()\n"); diff --git a/osfmk/kern/ipc_sync.c b/osfmk/kern/ipc_sync.c index 7f65888d5..cd1dd1afd 100644 --- a/osfmk/kern/ipc_sync.c +++ b/osfmk/kern/ipc_sync.c @@ -108,7 +108,7 @@ convert_port_to_semaphore(ipc_port_t port) */ if (ip_kotype(port) == IKOT_SEMAPHORE) { require_ip_active(port); - semaphore = (semaphore_t) port->ip_kobject; + semaphore = (semaphore_t) ip_get_kobject(port); semaphore_reference(semaphore); return semaphore; } @@ -169,7 +169,7 @@ semaphore_notify(mach_msg_header_t *msg) require_ip_active(port); assert(IKOT_SEMAPHORE == ip_kotype(port)); - semaphore_dereference((semaphore_t)port->ip_kobject); + semaphore_dereference((semaphore_t) ip_get_kobject(port)); } lock_set_t diff --git a/osfmk/kern/ipc_tt.c b/osfmk/kern/ipc_tt.c index 7fa53a8aa..44c93bae2 100644 --- a/osfmk/kern/ipc_tt.c +++ b/osfmk/kern/ipc_tt.c @@ -136,7 +136,7 @@ ipc_task_init( int i; - kr = ipc_space_create(&ipc_table_entries[0], &space); + kr = ipc_space_create(&ipc_table_entries[0], IPC_LABEL_NONE, &space); if (kr != KERN_SUCCESS) { panic("ipc_task_init"); } @@ -1516,7 +1516,7 @@ convert_port_to_locked_task(ipc_port_t port) ip_unlock(port); return TASK_NULL; } - task = (task_t) 
port->ip_kobject; + task = (task_t) ip_get_kobject(port); assert(task != TASK_NULL); if (task_conversion_eval(ct, task)) { @@ -1562,7 +1562,7 @@ convert_port_to_locked_task_inspect(ipc_port_t port) ip_unlock(port); return TASK_INSPECT_NULL; } - task = (task_inspect_t)port->ip_kobject; + task = (task_inspect_t) ip_get_kobject(port); assert(task != TASK_INSPECT_NULL); /* * Normal lock ordering puts task_lock() before ip_lock(). @@ -1592,7 +1592,7 @@ convert_port_to_task_locked( if (ip_kotype(port) == IKOT_TASK) { task_t ct = current_task(); - task = (task_t)port->ip_kobject; + task = (task_t) ip_get_kobject(port); assert(task != TASK_NULL); if (task_conversion_eval(ct, task)) { @@ -1674,7 +1674,7 @@ convert_port_to_task_name( if (ip_active(port) && (ip_kotype(port) == IKOT_TASK || ip_kotype(port) == IKOT_TASK_NAME)) { - task = (task_name_t)port->ip_kobject; + task = (task_name_t) ip_get_kobject(port); assert(task != TASK_NAME_NULL); task_reference_internal(task); @@ -1696,7 +1696,7 @@ convert_port_to_task_inspect_locked( require_ip_active(port); if (ip_kotype(port) == IKOT_TASK) { - task = (task_inspect_t)port->ip_kobject; + task = (task_inspect_t) ip_get_kobject(port); assert(task != TASK_INSPECT_NULL); task_reference_internal(task); @@ -1751,7 +1751,7 @@ convert_port_to_task_suspension_token( if (ip_active(port) && ip_kotype(port) == IKOT_TASK_RESUME) { - task = (task_suspension_token_t)port->ip_kobject; + task = (task_suspension_token_t) ip_get_kobject(port); assert(task != TASK_NULL); task_reference_internal(task); @@ -1885,7 +1885,7 @@ convert_port_to_thread_locked( require_ip_active(port); if (ip_kotype(port) == IKOT_THREAD) { - thread = (thread_t)port->ip_kobject; + thread = (thread_t) ip_get_kobject(port); assert(thread != THREAD_NULL); if (options & PORT_TO_THREAD_NOT_CURRENT_THREAD) { @@ -1948,7 +1948,7 @@ convert_port_to_thread_inspect( if (ip_active(port) && ip_kotype(port) == IKOT_THREAD) { - thread = (thread_inspect_t)port->ip_kobject; + thread = (thread_inspect_t) ip_get_kobject(port); assert(thread != THREAD_INSPECT_NULL); thread_reference_internal((thread_t)thread); } diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index 26176bc36..31a5ec30f 100644 --- a/osfmk/kern/kalloc.c +++ b/osfmk/kern/kalloc.c @@ -859,6 +859,7 @@ void /* if size was too large for a zone, then use kmem_free */ vm_map_t alloc_map = kernel_map; + size = round_page(size); if ((((vm_offset_t) data) >= kalloc_map_min) && (((vm_offset_t) data) <= kalloc_map_max)) { alloc_map = kalloc_map; diff --git a/osfmk/kern/kern_stackshot.c b/osfmk/kern/kern_stackshot.c index 7ff3981a7..cdb62018c 100644 --- a/osfmk/kern/kern_stackshot.c +++ b/osfmk/kern/kern_stackshot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #if defined(__x86_64__) @@ -2752,6 +2753,9 @@ stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t *waitinfo) case kThreadWaitSleepWithInheritor: kdp_sleep_with_inheritor_find_owner(thread->waitq, thread->wait_event, waitinfo); break; + case kThreadWaitCompressor: + kdp_compressor_busy_find_owner(thread->wait_event, waitinfo); + break; default: waitinfo->owner = 0; waitinfo->context = 0; diff --git a/osfmk/kern/mk_timer.c b/osfmk/kern/mk_timer.c index e7780c2e3..730d876b1 100644 --- a/osfmk/kern/mk_timer.c +++ b/osfmk/kern/mk_timer.c @@ -45,6 +45,7 @@ #include #include #include +#include static zone_t mk_timer_zone; @@ -62,16 +63,22 @@ mach_port_name_t mk_timer_create_trap( __unused struct mk_timer_create_trap_args *args) { - mk_timer_t timer; - ipc_space_t myspace = 
current_space(); - mach_port_name_t name = MACH_PORT_NULL; - ipc_port_t port; - kern_return_t result; - + mk_timer_t timer; + ipc_space_t myspace = current_space(); + mach_port_name_t name = MACH_PORT_NULL; + ipc_port_init_flags_t init_flags; + ipc_port_t port; + kern_return_t result; + + /* Allocate and initialize local state of a timer object */ timer = (mk_timer_t)zalloc(mk_timer_zone); if (timer == NULL) { return MACH_PORT_NULL; } + simple_lock_init(&timer->lock, 0); + thread_call_setup(&timer->call_entry, mk_timer_expire, timer); + timer->is_armed = timer->is_dead = FALSE; + timer->active = 0; /* Pre-allocate a kmsg for the timer messages */ ipc_kmsg_t kmsg; @@ -81,32 +88,24 @@ mk_timer_create_trap( return MACH_PORT_NULL; } - /* Allocate an in-transit kobject port with a send right */ - ipc_kobject_alloc_options_t options; - options = (IPC_KOBJECT_ALLOC_IN_TRANSIT | IPC_KOBJECT_ALLOC_MAKE_SEND); - port = ipc_kobject_alloc_port((ipc_kobject_t)timer, IKOT_TIMER, options); - assert(port != IP_NULL); + init_flags = IPC_PORT_INIT_MESSAGE_QUEUE; + result = ipc_port_alloc(myspace, init_flags, &name, &port); + if (result != KERN_SUCCESS) { + zfree(mk_timer_zone, timer); + ipc_kmsg_free(kmsg); + return MACH_PORT_NULL; + } - /* Associate the kmsg */ + /* Associate the pre-allocated kmsg with the port */ ipc_kmsg_set_prealloc(kmsg, port); - /* Initialize the timer object and bind port to it */ - simple_lock_init(&timer->lock, 0); - thread_call_setup(&timer->call_entry, mk_timer_expire, timer); - timer->is_armed = timer->is_dead = FALSE; - timer->active = 0; - timer->port = port; + /* port locked, receive right at user-space */ + ipc_kobject_set_atomically(port, (ipc_kobject_t)timer, IKOT_TIMER); - /* Copyout the receive right for the timer port to user-space */ - current_thread()->ith_knote = ITH_KNOTE_NULL; - result = ipc_object_copyout(myspace, ip_to_object(port), - MACH_MSG_TYPE_MOVE_RECEIVE, - NULL, NULL, &name); - if (result != KERN_SUCCESS) { - ipc_object_destroy(ip_to_object(port), MACH_MSG_TYPE_MOVE_RECEIVE); - /* should trigger mk_timer_port_destroy() call */ - return MACH_PORT_NULL; - } + /* make a (naked) send right for the timer to keep */ + timer->port = ipc_port_make_send_locked(port); + + ip_unlock(port); return name; } @@ -119,7 +118,7 @@ mk_timer_port_destroy( ip_lock(port); if (ip_kotype(port) == IKOT_TIMER) { - timer = (mk_timer_t)port->ip_kobject; + timer = (mk_timer_t) ip_get_kobject(port); assert(timer != NULL); ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); simple_lock(&timer->lock, LCK_GRP_NULL); @@ -274,7 +273,7 @@ mk_timer_arm_trap_internal(mach_port_name_t name, uint64_t expire_time, uint64_t } if (ip_kotype(port) == IKOT_TIMER) { - timer = (mk_timer_t)port->ip_kobject; + timer = (mk_timer_t) ip_get_kobject(port); assert(timer != NULL); simple_lock(&timer->lock, LCK_GRP_NULL); @@ -358,7 +357,7 @@ mk_timer_cancel_trap( } if (ip_kotype(port) == IKOT_TIMER) { - timer = (mk_timer_t)port->ip_kobject; + timer = (mk_timer_t) ip_get_kobject(port); assert(timer != NULL); simple_lock(&timer->lock, LCK_GRP_NULL); assert(timer->port == port); diff --git a/osfmk/kern/sched_clutch.c b/osfmk/kern/sched_clutch.c index d8a808f60..172efd303 100644 --- a/osfmk/kern/sched_clutch.c +++ b/osfmk/kern/sched_clutch.c @@ -67,12 +67,23 @@ static uint64_t sched_clutch_root_bucket_deadline_calculate(sched_clutch_root_bu static void sched_clutch_root_bucket_deadline_update(sched_clutch_root_bucket_t, sched_clutch_root_t, uint64_t); static int 
sched_clutch_root_bucket_pri_compare(sched_clutch_root_bucket_t, sched_clutch_root_bucket_t);
+/* Options for clutch bucket ordering in the runq */
+__options_decl(sched_clutch_bucket_options_t, uint32_t, {
+	SCHED_CLUTCH_BUCKET_OPTIONS_NONE       = 0x0,
+	/* Round robin clutch bucket on thread removal */
+	SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR = 0x1,
+	/* Insert clutch bucket at head (for thread preemption) */
+	SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ      = 0x2,
+	/* Insert clutch bucket at tail (default) */
+	SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ      = 0x4,
+});
+
 /* Clutch bucket level hierarchy management */
-static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
-static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t);
-static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
-static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
-static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t);
+static void sched_clutch_bucket_hierarchy_insert(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
+static void sched_clutch_bucket_hierarchy_remove(sched_clutch_root_t, sched_clutch_bucket_t, sched_bucket_t, uint64_t, sched_clutch_bucket_options_t);
+static boolean_t sched_clutch_bucket_runnable(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
+static boolean_t sched_clutch_bucket_update(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);
+static void sched_clutch_bucket_empty(sched_clutch_bucket_t, sched_clutch_root_t, uint64_t, sched_clutch_bucket_options_t);

 static void sched_clutch_bucket_cpu_usage_update(sched_clutch_bucket_t, uint64_t);
 static void sched_clutch_bucket_cpu_blocked_update(sched_clutch_bucket_t, uint64_t);
@@ -87,7 +98,7 @@ static void sched_clutch_bucket_timeshare_update(sched_clutch_bucket_t);
 static boolean_t sched_thread_sched_pri_promoted(thread_t);
 /* Clutch membership management */
 static boolean_t sched_clutch_thread_insert(sched_clutch_root_t, thread_t, integer_t);
-static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t);
+static void sched_clutch_thread_remove(sched_clutch_root_t, thread_t, uint64_t, sched_clutch_bucket_options_t);
 static thread_t sched_clutch_thread_highest(sched_clutch_root_t);

 /* Clutch properties updates */
@@ -317,6 +328,133 @@ sched_clutch_root_init(
 	}
 }

+/*
+ * Clutch Bucket Runqueues
+ *
+ * The clutch buckets are maintained in a runq at the root bucket level. The
+ * runq organization allows clutch buckets to be ordered based on various
+ * factors such as:
+ *
+ * - Clutch buckets are round-robined at the same priority level when a
+ *   thread is selected from a clutch bucket. This prevents a clutch bucket
+ *   from starving out other clutch buckets at the same priority.
+ *
+ * - Clutch buckets are inserted at the head when they become runnable due to
+ *   thread preemption. This allows threads that were preempted to maintain
+ *   their order in the queue.
+ *
+ */
+
+/*
+ * sched_clutch_bucket_runq_init()
+ *
+ * Initialize a clutch bucket runq.
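+ *
+ * State sketch (values illustrative): after enqueueing one clutch
+ * bucket at priority 4 and another at priority 31, the runq holds
+ *
+ *	scbrq_highq  = 31
+ *	scbrq_bitmap = bits 4 and 31 set
+ *	scbrq_count  = 2
+ *
+ * with each bucket linked on scbrq_queues[pri] via its scb_runqlink.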
+ */
+static void
+sched_clutch_bucket_runq_init(
+	sched_clutch_bucket_runq_t clutch_buckets_rq)
+{
+	clutch_buckets_rq->scbrq_highq = NOPRI;
+	for (uint8_t i = 0; i < BITMAP_LEN(NRQS); i++) {
+		clutch_buckets_rq->scbrq_bitmap[i] = 0;
+	}
+	clutch_buckets_rq->scbrq_count = 0;
+	for (int i = 0; i < NRQS; i++) {
+		circle_queue_init(&clutch_buckets_rq->scbrq_queues[i]);
+	}
+}
+
+/*
+ * sched_clutch_bucket_runq_empty()
+ *
+ * Returns whether a clutch bucket runq is empty.
+ */
+static boolean_t
+sched_clutch_bucket_runq_empty(
+	sched_clutch_bucket_runq_t clutch_buckets_rq)
+{
+	return clutch_buckets_rq->scbrq_count == 0;
+}
+
+/*
+ * sched_clutch_bucket_runq_peek()
+ *
+ * Returns the highest priority clutch bucket in the runq.
+ */
+static sched_clutch_bucket_t
+sched_clutch_bucket_runq_peek(
+	sched_clutch_bucket_runq_t clutch_buckets_rq)
+{
+	if (clutch_buckets_rq->scbrq_count > 0) {
+		circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_buckets_rq->scbrq_highq];
+		return cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink);
+	} else {
+		return NULL;
+	}
+}
+
+/*
+ * sched_clutch_bucket_runq_enqueue()
+ *
+ * Enqueue a clutch bucket into the runq based on the options passed in.
+ */
+static void
+sched_clutch_bucket_runq_enqueue(
+	sched_clutch_bucket_runq_t clutch_buckets_rq,
+	sched_clutch_bucket_t clutch_bucket,
+	sched_clutch_bucket_options_t options)
+{
+	circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority];
+	if (circle_queue_empty(queue)) {
+		circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
+		bitmap_set(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority);
+		if (clutch_bucket->scb_priority > clutch_buckets_rq->scbrq_highq) {
+			clutch_buckets_rq->scbrq_highq = clutch_bucket->scb_priority;
+		}
+	} else {
+		if (options & SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ) {
+			circle_enqueue_head(queue, &clutch_bucket->scb_runqlink);
+		} else {
+			/*
+			 * Default behavior (handles SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ &
+			 * SCHED_CLUTCH_BUCKET_OPTIONS_NONE)
+			 */
+			circle_enqueue_tail(queue, &clutch_bucket->scb_runqlink);
+		}
+	}
+	clutch_buckets_rq->scbrq_count++;
+}
+
+/*
+ * sched_clutch_bucket_runq_remove()
+ *
+ * Remove a clutch bucket from the runq.
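+ *
+ * Continuing the sketch above (illustrative): removing the only
+ * bucket at priority 31 clears bit 31 in scbrq_bitmap, and
+ * bitmap_first() recomputes scbrq_highq as 4.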
+ */ +static void +sched_clutch_bucket_runq_remove( + sched_clutch_bucket_runq_t clutch_buckets_rq, + sched_clutch_bucket_t clutch_bucket) +{ + circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority]; + circle_dequeue(queue, &clutch_bucket->scb_runqlink); + assert(clutch_buckets_rq->scbrq_count > 0); + clutch_buckets_rq->scbrq_count--; + if (circle_queue_empty(queue)) { + bitmap_clear(clutch_buckets_rq->scbrq_bitmap, clutch_bucket->scb_priority); + clutch_buckets_rq->scbrq_highq = bitmap_first(clutch_buckets_rq->scbrq_bitmap, NRQS); + } +} + +static void +sched_clutch_bucket_runq_rotate( + sched_clutch_bucket_runq_t clutch_buckets_rq, + sched_clutch_bucket_t clutch_bucket) +{ + circle_queue_t queue = &clutch_buckets_rq->scbrq_queues[clutch_bucket->scb_priority]; + assert(clutch_bucket == cqe_queue_first(queue, struct sched_clutch_bucket, scb_runqlink)); + circle_queue_rotate_head_forward(queue); +} + /* * sched_clutch_root_bucket_init() * @@ -328,7 +466,7 @@ sched_clutch_root_bucket_init( sched_bucket_t bucket) { root_bucket->scrb_bucket = bucket; - priority_queue_init(&root_bucket->scrb_clutch_buckets, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP); + sched_clutch_bucket_runq_init(&root_bucket->scrb_clutch_buckets); priority_queue_entry_init(&root_bucket->scrb_pqlink); root_bucket->scrb_deadline = SCHED_CLUTCH_INVALID_TIME_64; root_bucket->scrb_warped_deadline = 0; @@ -738,7 +876,6 @@ sched_clutch_bucket_init( clutch_bucket->scb_interactivity_ts = 0; clutch_bucket->scb_blocked_ts = SCHED_CLUTCH_BUCKET_BLOCKED_TS_INVALID; - priority_queue_entry_init(&clutch_bucket->scb_pqlink); clutch_bucket->scb_clutch = clutch; clutch_bucket->scb_root = NULL; priority_queue_init(&clutch_bucket->scb_clutchpri_prioq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP); @@ -818,7 +955,8 @@ sched_clutch_bucket_hierarchy_insert( sched_clutch_root_t root_clutch, sched_clutch_bucket_t clutch_bucket, sched_bucket_t bucket, - uint64_t timestamp) + uint64_t timestamp, + sched_clutch_bucket_options_t options) { sched_clutch_hierarchy_locked_assert(root_clutch); if (bucket > TH_BUCKET_FIXPRI) { @@ -835,12 +973,12 @@ sched_clutch_bucket_hierarchy_insert( sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket]; /* If this is the first clutch bucket in the root bucket, insert the root bucket into the root priority queue */ - if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) { + if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) { sched_clutch_root_bucket_runnable(root_bucket, root_clutch, timestamp); } - /* Insert the clutch bucket into the root bucket priority queue */ - priority_queue_insert(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + /* Insert the clutch bucket into the root bucket run queue with order based on options */ + sched_clutch_bucket_runq_enqueue(&root_bucket->scrb_clutch_buckets, clutch_bucket, options); os_atomic_store(&clutch_bucket->scb_root, root_clutch, relaxed); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE, thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_RUNNABLE, clutch_bucket->scb_priority, 0); @@ -856,7 +994,8 @@ sched_clutch_bucket_hierarchy_remove( sched_clutch_root_t root_clutch, sched_clutch_bucket_t clutch_bucket, sched_bucket_t bucket, - uint64_t timestamp) + uint64_t timestamp, 
+	__unused sched_clutch_bucket_options_t options)
 {
 	sched_clutch_hierarchy_locked_assert(root_clutch);
 	if (bucket > TH_BUCKET_FIXPRI) {
@@ -873,14 +1012,14 @@
 	sched_clutch_root_bucket_t root_bucket = &root_clutch->scr_buckets[bucket];

 	/* Remove the clutch bucket from the root bucket priority queue */
-	priority_queue_remove(&root_bucket->scrb_clutch_buckets, &clutch_bucket->scb_pqlink, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+	sched_clutch_bucket_runq_remove(&root_bucket->scrb_clutch_buckets, clutch_bucket);
 	os_atomic_store(&clutch_bucket->scb_root, NULL, relaxed);
 	clutch_bucket->scb_blocked_ts = timestamp;
 	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_TG_BUCKET_STATE) | DBG_FUNC_NONE, thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, SCHED_CLUTCH_STATE_EMPTY, 0, 0);

 	/* If the root bucket priority queue is now empty, remove it from the root priority queue */
-	if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
+	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
 		sched_clutch_root_bucket_empty(root_bucket, root_clutch, timestamp);
 	}
 }
@@ -1030,10 +1169,10 @@ static sched_clutch_bucket_t
 sched_clutch_root_bucket_highest_clutch_bucket(
 	sched_clutch_root_bucket_t root_bucket)
 {
-	if (priority_queue_empty(&root_bucket->scrb_clutch_buckets)) {
+	if (sched_clutch_bucket_runq_empty(&root_bucket->scrb_clutch_buckets)) {
 		return NULL;
 	}
-	return priority_queue_max(&root_bucket->scrb_clutch_buckets, struct sched_clutch_bucket, scb_pqlink);
+	return sched_clutch_bucket_runq_peek(&root_bucket->scrb_clutch_buckets);
 }

 /*
@@ -1047,12 +1186,13 @@ static boolean_t
 sched_clutch_bucket_runnable(
 	sched_clutch_bucket_t clutch_bucket,
 	sched_clutch_root_t root_clutch,
-	uint64_t timestamp)
+	uint64_t timestamp,
+	sched_clutch_bucket_options_t options)
 {
 	sched_clutch_hierarchy_locked_assert(root_clutch);
 	sched_clutch_bucket_cpu_blocked_update(clutch_bucket, timestamp);
 	clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp);
-	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp);
+	sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options);
 	/* Update the timesharing properties of this clutch_bucket; also done every sched_tick */
 	sched_clutch_bucket_timeshare_update(clutch_bucket);
 	int16_t root_old_pri = root_clutch->scr_priority;
@@ -1063,32 +1203,35 @@
 /*
 * sched_clutch_bucket_update()
 *
- * Update the clutch_bucket's position in the hierarchy based on whether
- * the newly runnable thread changes its priority. Also update the root
- * priority accordingly.
+ * Update the clutch_bucket's position in the hierarchy. This routine is
+ * called when a thread is inserted into or removed from a runnable clutch
+ * bucket. The options specify how the clutch bucket should be ordered on
+ * (re)insertion into the clutch bucket runq.
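+ *
+ * Call-site sketch (illustrative; `cb`, `root`, and `now` are
+ * hypothetical locals):
+ *
+ *	sched_clutch_bucket_update(cb, root, now,
+ *	    SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR);
+ *
+ * If cb's recomputed priority is unchanged, the SAMEPRI_RR option
+ * rotates it to the tail of its same-priority queue so that a peer
+ * clutch bucket is picked next; otherwise cb is removed and
+ * re-enqueued at its new priority.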
*/ static boolean_t sched_clutch_bucket_update( sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, - uint64_t timestamp) + uint64_t timestamp, + sched_clutch_bucket_options_t options) { sched_clutch_hierarchy_locked_assert(root_clutch); uint64_t new_pri = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); + sched_clutch_bucket_runq_t bucket_runq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets; if (new_pri == clutch_bucket->scb_priority) { + /* + * If SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR is specified, move the clutch bucket + * to the end of the runq. Typically used when a thread is selected for execution + * from a clutch bucket. + */ + if (options & SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR) { + sched_clutch_bucket_runq_rotate(bucket_runq, clutch_bucket); + } return false; } - struct priority_queue *bucket_prioq = &root_clutch->scr_buckets[clutch_bucket->scb_bucket].scrb_clutch_buckets; - - if (new_pri < clutch_bucket->scb_priority) { - clutch_bucket->scb_priority = new_pri; - priority_queue_entry_decrease(bucket_prioq, &clutch_bucket->scb_pqlink, - clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); - } else { - clutch_bucket->scb_priority = new_pri; - priority_queue_entry_increase(bucket_prioq, &clutch_bucket->scb_pqlink, - clutch_bucket->scb_priority, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); - } + sched_clutch_bucket_runq_remove(bucket_runq, clutch_bucket); + clutch_bucket->scb_priority = new_pri; + sched_clutch_bucket_runq_enqueue(bucket_runq, clutch_bucket, options); int16_t root_old_pri = root_clutch->scr_priority; sched_clutch_root_pri_update(root_clutch); @@ -1106,10 +1249,11 @@ static void sched_clutch_bucket_empty( sched_clutch_bucket_t clutch_bucket, sched_clutch_root_t root_clutch, - uint64_t timestamp) + uint64_t timestamp, + sched_clutch_bucket_options_t options) { sched_clutch_hierarchy_locked_assert(root_clutch); - sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp); + sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options); clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); sched_clutch_root_pri_update(root_clutch); } @@ -1407,17 +1551,16 @@ sched_clutch_thread_insert( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_STATE) | DBG_FUNC_NONE, thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, thread_tid(thread), SCHED_CLUTCH_STATE_RUNNABLE, 0); - /* Enqueue the clutch into the hierarchy (if needed) and update properties */ + /* Enqueue the clutch into the hierarchy (if needed) and update properties; pick the insertion order based on thread options */ + sched_clutch_bucket_options_t scb_options = (options & SCHED_HEADQ) ? 
SCHED_CLUTCH_BUCKET_OPTIONS_HEADQ : SCHED_CLUTCH_BUCKET_OPTIONS_TAILQ; if (clutch_bucket->scb_thr_count == 0) { sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count); sched_clutch_thr_count_inc(&root_clutch->scr_thr_count); - /* Insert the newly runnable clutch bucket into the hierarchy */ - result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp); + result = sched_clutch_bucket_runnable(clutch_bucket, root_clutch, current_timestamp, scb_options); } else { sched_clutch_thr_count_inc(&clutch_bucket->scb_thr_count); sched_clutch_thr_count_inc(&root_clutch->scr_thr_count); - /* Update the position of the clutch bucket in the hierarchy */ - result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp); + result = sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, scb_options); } return result; } @@ -1433,7 +1576,8 @@ static void sched_clutch_thread_remove( sched_clutch_root_t root_clutch, thread_t thread, - uint64_t current_timestamp) + uint64_t current_timestamp, + sched_clutch_bucket_options_t options) { sched_clutch_hierarchy_locked_assert(root_clutch); sched_clutch_t clutch = sched_clutch_for_thread(thread); @@ -1460,9 +1604,9 @@ sched_clutch_thread_remove( /* Remove the clutch from hierarchy (if needed) and update properties */ if (clutch_bucket->scb_thr_count == 0) { - sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp); + sched_clutch_bucket_empty(clutch_bucket, root_clutch, current_timestamp, options); } else { - sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp); + sched_clutch_bucket_update(clutch_bucket, root_clutch, current_timestamp, options); } } @@ -1498,8 +1642,8 @@ sched_clutch_thread_highest( thread_t thread = run_queue_peek(&clutch_bucket->scb_runq); assert(thread != NULL); - /* Remove and return the thread from the hierarchy */ - sched_clutch_thread_remove(root_clutch, thread, current_timestamp); + /* Remove and return the thread from the hierarchy; also round robin the clutch bucket if the priority remains unchanged */ + sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_SAMEPRI_RR); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_CLUTCH_THREAD_SELECT) | DBG_FUNC_NONE, thread_tid(thread), thread_group_get_id(clutch_bucket->scb_clutch->sc_tg), clutch_bucket->scb_bucket, 0, 0); return thread; @@ -1978,7 +2122,7 @@ sched_clutch_processor_queue_remove( */ if (SCHED_CLUTCH_THREAD_ELIGIBLE(thread)) { sched_clutch_root_t pset_clutch_root = sched_clutch_processor_root_clutch(processor); - sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time()); + sched_clutch_thread_remove(pset_clutch_root, thread, mach_absolute_time(), SCHED_CLUTCH_BUCKET_OPTIONS_NONE); } else { rq = sched_clutch_thread_bound_runq(processor, thread); run_queue_remove(rq, thread); @@ -2722,7 +2866,7 @@ sched_clutch_bucket_threads_drain(sched_clutch_bucket_t clutch_bucket, sched_clu uint64_t current_timestamp = mach_approximate_time(); while (thread_count > 0) { thread = run_queue_peek(&clutch_bucket->scb_runq); - sched_clutch_thread_remove(root_clutch, thread, current_timestamp); + sched_clutch_thread_remove(root_clutch, thread, current_timestamp, SCHED_CLUTCH_BUCKET_OPTIONS_NONE); enqueue_tail(clutch_threads, &thread->runq_links); thread_count--; } diff --git a/osfmk/kern/sched_clutch.h b/osfmk/kern/sched_clutch.h index eef5bee4d..5600eb378 100644 --- a/osfmk/kern/sched_clutch.h +++ 
b/osfmk/kern/sched_clutch.h @@ -58,6 +58,17 @@ */ #define SCHED_CLUTCH_THREAD_ELIGIBLE(thread) ((thread->bound_processor) == PROCESSOR_NULL) +/* + * Clutch Bucket Runqueue Structure. + */ +struct sched_clutch_bucket_runq { + int scbrq_highq; + bitmap_t scbrq_bitmap[BITMAP_LEN(NRQS_MAX)]; + int scbrq_count; + circle_queue_head_t scbrq_queues[NRQS_MAX]; +}; +typedef struct sched_clutch_bucket_runq *sched_clutch_bucket_runq_t; + /* * * Clutch hierarchy locking protocol @@ -84,7 +95,7 @@ struct sched_clutch_root_bucket { /* (I) sched bucket represented by this root bucket */ uint8_t scrb_bucket; /* (P) priority queue for all clutch buckets in this sched bucket */ - struct priority_queue scrb_clutch_buckets; + struct sched_clutch_bucket_runq scrb_clutch_buckets; /* (P) priority queue entry to use for enqueueing root bucket into root prioq */ struct priority_queue_entry scrb_pqlink; /* (P) ageout deadline for this root bucket */ @@ -226,8 +237,8 @@ struct sched_clutch_bucket { /* (A) CPU usage information for the clutch bucket */ sched_clutch_bucket_cpu_data_t scb_cpu_data; - /* (P) linkage for clutch_bucket in root_bucket priority queue */ - struct priority_queue_entry scb_pqlink; + /* (P) linkage for clutch_bucket in root_bucket runqueue */ + queue_chain_t scb_runqlink; /* (I) clutch to which this clutch bucket belongs */ struct sched_clutch *scb_clutch; /* (A) pointer to the root of the hierarchy this bucket is in */ diff --git a/osfmk/kern/sched_clutch.md b/osfmk/kern/sched_clutch.md index 64da1a58f..c54237a2a 100644 --- a/osfmk/kern/sched_clutch.md +++ b/osfmk/kern/sched_clutch.md @@ -65,7 +65,7 @@ The second level is the “thread group” level which decides which thread grou **Implementation** -The thread group level implements a variation of the FreeBSD ULE scheduler to decide which clutch bucket should be selected next for execution. Each clutch bucket with runnable threads is represented as an entry in a priority queue which is ordered by clutch bucket priorities. The clutch bucket selection algorithm simply selects the clutch bucket with the highest priority in the priority queue. The priority calculation for the clutch buckets is based on the following factors: +The thread group level implements a variation of the FreeBSD ULE scheduler to decide which clutch bucket should be selected next for execution. Each clutch bucket with runnable threads is represented as an entry in a runqueue which is ordered by clutch bucket priorities. The clutch bucket selection algorithm simply selects the clutch bucket with the highest priority in the clutch bucket runqueue. The priority calculation for the clutch buckets is based on the following factors: * **Highest runnable thread in the clutch bucket**: The clutch bucket maintains a priority queue which contains threads ordered by their promoted or base priority (whichever property made the thread eligible to be part of that clutch bucket). It uses the highest of these threads to calculate the base priority of the clutch bucket. The use of both base and sched priority allows the scheduler to honor priority differences specified from userspace via SPIs, priority boosts due to priority inheritance mechanisms like turnstiles and other priority affecting mechanisms outside the core scheduler. * **Interactivity score**: The scheduler calculates an interactivity score based on the ratio of voluntary blocking time and CPU usage time for the clutch bucket as a whole. 
This score allows the scheduler to prefer highly interactive thread groups over batch-processing, compute-intensive thread groups.

@@ -77,6 +77,8 @@ The interactivity score based algorithm is well suited for this level due to th
 * Since the priority calculation is fairly cheap, the scheduler is able to maintain up-to-date information about all thread groups which leads to more optimal decisions.
 * Thread groups provide a convenient abstraction for groups of threads working together for a user workload. Basing scheduling decisions on this abstraction allows the system to make interesting choices such as preferring Apps over daemons which is typically better for system responsiveness.

+The clutch bucket runqueue data structure allows the clutch buckets to be inserted at the head of the queue when threads from that clutch bucket are preempted. The runqueues also rotate the clutch bucket to the end of the runqueue at the same priority level when a thread is selected for execution from the clutch bucket. This allows the system to round-robin efficiently among clutch buckets at the same priority value, especially on highly contended systems with a small number of CPUs.
+
 ### Thread Level

 At the lowest level the scheduler decides which thread within a clutch bucket should be selected next for execution. Each runnable thread in the clutch bucket is represented as an entry in a runqueue which is organized based on the schedpri of threads. The thread selection algorithm simply selects the highest priority thread in the runqueue. The schedpri calculation for the threads is based on the traditional Mach scheduling algorithm which uses load & CPU usage to decay priority for a thread. The thread decay model is more suited at this level as compared to the global scheduler because the load calculation only accounts for threads in the same clutch bucket. Since all threads in the same clutch bucket belong to the same thread group and scheduling bucket, this algorithm provides quick CPU access for latency sensitive threads within the clutch bucket without impacting other non-related threads in the system.

diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c
index 04e9db25e..36a97ec7d 100644
--- a/osfmk/kern/startup.c
+++ b/osfmk/kern/startup.c
@@ -352,6 +352,9 @@ kernel_bootstrap(void)
 	kernel_bootstrap_log("machine_init");
 	machine_init();

+	kernel_bootstrap_log("thread_machine_init_template");
+	thread_machine_init_template();
+
 	kernel_bootstrap_log("clock_init");
 	clock_init();

diff --git a/osfmk/kern/suid_cred.c b/osfmk/kern/suid_cred.c
new file mode 100644
index 000000000..b876d731c
--- /dev/null
+++ b/osfmk/kern/suid_cred.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *
+ * An SUID credential is a port type which allows a process to create a new
+ * process with a specific user id. It provides an alternative to the more
+ * traditional SUID-bit file permission for achieving this.
+ *
+ * To create a new SUID credential the process must be running as root and must
+ * have a special entitlement. When created, the credential is associated with a
+ * specific vnode and UID so the unprivileged owner of the credential may only
+ * create a new process from the file associated with that vnode, and the
+ * resulting effective UID will be the UID in the credential.
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+/* Declarations necessary to call vnode_lookup()/vnode_put(). */
+struct vnode;
+struct vfs_context;
+extern int vnode_lookup(const char *, int, struct vnode **,
+    struct vfs_context *);
+extern struct vfs_context * vfs_context_current(void);
+extern int vnode_put(struct vnode *);
+
+/* Declarations necessary to call kauth_cred_issuser(). */
+struct ucred;
+extern int kauth_cred_issuser(struct ucred *);
+extern struct ucred *kauth_cred_get(void);
+
+static struct zone *suid_cred_zone = NULL;
+
+/* Data associated with the suid cred port. Consumed during posix_spawn(). */
+struct suid_cred {
+	ipc_port_t port;
+	struct vnode *vnode;
+	uint32_t uid;
+};
+
+/* Allocates a new SUID credential. The vnode reference will be owned by the
+ * newly created suid_cred_t. */
+static suid_cred_t
+suid_cred_alloc(struct vnode *vnode, uint32_t uid)
+{
+	suid_cred_t sc = SUID_CRED_NULL;
+
+	assert(vnode != NULL);
+
+	sc = zalloc(suid_cred_zone);
+	if (sc != NULL) {
+		// Lazily allocated in convert_suid_cred_to_port().
+		sc->port = IP_NULL;
+		sc->vnode = vnode;
+		sc->uid = uid;
+	}
+
+	return sc;
+}
+
+static void
+suid_cred_free(suid_cred_t sc)
+{
+	assert(sc != NULL);
+	assert(sc->vnode != NULL);
+
+	vnode_put(sc->vnode);
+
+	sc->uid = UINT32_MAX;
+	sc->vnode = NULL;
+	sc->port = IP_NULL;
+
+	zfree(suid_cred_zone, sc);
+}
+
+void
+suid_cred_destroy(ipc_port_t port)
+{
+	suid_cred_t sc = NULL;
+
+	ip_lock(port);
+	assert(ip_kotype(port) == IKOT_SUID_CRED);
+	sc = (suid_cred_t)port->ip_kobject;
+	ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE);
+	ip_unlock(port);
+
+	assert(sc->port == port);
+
+	suid_cred_free(sc);
+}
+
+void
+suid_cred_notify(mach_msg_header_t *msg)
+{
+	assert(msg->msgh_id == MACH_NOTIFY_NO_SENDERS);
+
+	mach_no_senders_notification_t *not = (mach_no_senders_notification_t *)msg;
+	ipc_port_t port = not->not_header.msgh_remote_port;
+
+	if (IP_VALID(port)) {
+		ipc_port_dealloc_kernel(port);
+	}
+}
+
+ipc_port_t
+convert_suid_cred_to_port(suid_cred_t sc)
+{
+	if (sc == NULL) {
+		return IP_NULL;
+	}
+
+	if (!ipc_kobject_make_send_lazy_alloc_port(&sc->port,
+	    (ipc_kobject_t) sc, IKOT_SUID_CRED)) {
+		suid_cred_free(sc);
+		return IP_NULL;
+	}
+
+	return sc->port;
+}
+
+/*
+ * Verify the suid cred port. The cached vnode should match the passed vnode.
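+ * (Illustrative flow, per the file comment above: posix_spawn() resolves
+ * the image vnode, calls suid_cred_verify(port, vnode, &uid), and on
+ * success spawns with that effective UID; verification destroys the
+ * port, so a credential is single-use.)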
+ * The uid to be used to spawn the new process is returned in 'uid'. + */ +int +suid_cred_verify(ipc_port_t port, struct vnode *vnode, uint32_t *uid) +{ + suid_cred_t sc = NULL; + int ret = -1; + + if (!IP_VALID(port)) { + return -1; + } + + ip_lock(port); + + if (ip_kotype(port) != IKOT_SUID_CRED) { + ip_unlock(port); + return -1; + } + + if (!ip_active(port)) { + ip_unlock(port); + return -1; + } + + sc = (suid_cred_t)port->ip_kobject; + + if (vnode != sc->vnode) { + ip_unlock(port); + return -1; + } + + *uid = sc->uid; + ret = 0; + + ipc_port_destroy(port); + return ret; +} + +void +suid_cred_init(void) +{ + const size_t sc_size = sizeof(struct suid_cred); + suid_cred_zone = zinit(sc_size, 1024 * sc_size, 0, "suid_cred"); +} + +kern_return_t +task_create_suid_cred( + task_t task, + suid_cred_path_t path, + suid_cred_uid_t uid, + suid_cred_t *sc_p) +{ + suid_cred_t sc = NULL; + struct vnode *vnode; + int err = -1; + + if (task == TASK_NULL || task != current_task()) { + return KERN_INVALID_ARGUMENT; + } + + // Task must have entitlement. + if (!IOTaskHasEntitlement(task, "com.apple.private.suid_cred")) { + return KERN_NO_ACCESS; + } + + // Thread must be root owned. + if (!kauth_cred_issuser(kauth_cred_get())) { + return KERN_NO_ACCESS; + } + + // Find the vnode for the path. + err = vnode_lookup(path, 0, &vnode, vfs_context_current()); + if (err != 0) { + return KERN_INVALID_ARGUMENT; + } + + sc = suid_cred_alloc(vnode, uid); + if (sc == NULL) { + (void) vnode_put(vnode); + return KERN_RESOURCE_SHORTAGE; + } + + *sc_p = sc; + + return KERN_SUCCESS; +} diff --git a/osfmk/kern/suid_cred.h b/osfmk/kern/suid_cred.h new file mode 100644 index 000000000..ff057621a --- /dev/null +++ b/osfmk/kern/suid_cred.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2019 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_SUID_CRED_H +#define _KERN_SUID_CRED_H + +#if XNU_KERNEL_PRIVATE + +#include +#include + +struct vnode; + +extern ipc_port_t convert_suid_cred_to_port(suid_cred_t); + +extern void suid_cred_init(void); + +extern void suid_cred_notify(mach_msg_header_t *msg); + +extern int suid_cred_verify(ipc_port_t port, struct vnode *vnode, uint32_t *uid); + +extern void suid_cred_destroy(ipc_port_t port); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _KERN_SUID_CRED_H */ diff --git a/osfmk/kern/sysdiagnose.c b/osfmk/kern/sysdiagnose.c index a2a147207..75812e5c3 100644 --- a/osfmk/kern/sysdiagnose.c +++ b/osfmk/kern/sysdiagnose.c @@ -55,7 +55,7 @@ sysdiagnose_notify_user(uint32_t keycode) KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SYSDIAGNOSE, SYSDIAGNOSE_NOTIFY_USER) | DBG_FUNC_START, 0, 0, 0, 0, 0); - kr = send_sysdiagnose_notification(user_port, keycode); + kr = send_sysdiagnose_notification_with_audit_token(user_port, keycode); ipc_port_release_send(user_port); return kr; } diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index fd98be481..98d7250c0 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -2282,7 +2282,7 @@ task_port_notify(mach_msg_header_t *msg) require_ip_active(port); assert(IKOT_TASK == ip_kotype(port)); - task = (task_t) port->ip_kobject; + task = (task_t) ip_get_kobject(port); assert(task_is_a_corpse(task)); @@ -2682,18 +2682,7 @@ task_terminate_internal( pmap_set_process(task->map->pmap, pid, procname); #endif /* MACH_ASSERT */ - vm_map_remove(task->map, - task->map->min_offset, - task->map->max_offset, - /* - * Final cleanup: - * + no unnesting - * + remove immutable mappings - * + allow gaps in range - */ - (VM_MAP_REMOVE_NO_UNNESTING | - VM_MAP_REMOVE_IMMUTABLE | - VM_MAP_REMOVE_GAPS_OK)); + vm_map_terminate(task->map); /* release our shared region */ vm_shared_region_set(task, NULL); @@ -4267,6 +4256,7 @@ task_freeze( task_unlock(task); if (VM_CONFIG_COMPRESSOR_IS_PRESENT && + (kr == KERN_SUCCESS) && (eval_only == FALSE)) { vm_wake_compactor_swapper(); /* @@ -6735,6 +6725,7 @@ task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp) int64_t io_delta = 0; int64_t * global_counter_to_update; boolean_t needs_telemetry = FALSE; + boolean_t is_external_device = FALSE; int ledger_to_update = 0; struct task_writes_counters * writes_counters_to_update; @@ -6751,32 +6742,42 @@ task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp) global_counter_to_update = &global_logical_writes_count; ledger_to_update = task_ledgers.logical_writes; writes_counters_to_update = &task->task_writes_counters_internal; + is_external_device = FALSE; } else { global_counter_to_update = &global_logical_writes_to_external_count; ledger_to_update = task_ledgers.logical_writes_to_external; writes_counters_to_update = &task->task_writes_counters_external; + is_external_device = TRUE; } switch (flags) { case TASK_WRITE_IMMEDIATE: OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_immediate_writes)); ledger_credit(task->ledger, ledger_to_update, io_size); - coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); + if (!is_external_device) { + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); + } break; case TASK_WRITE_DEFERRED: OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_deferred_writes)); ledger_credit(task->ledger, ledger_to_update, io_size); - coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, 
io_size); + if (!is_external_device) { + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); + } break; case TASK_WRITE_INVALIDATED: OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_invalidated_writes)); ledger_debit(task->ledger, ledger_to_update, io_size); - coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, FALSE, io_size); + if (!is_external_device) { + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, FALSE, io_size); + } break; case TASK_WRITE_METADATA: OSAddAtomic64(io_size, (SInt64 *)&(writes_counters_to_update->task_metadata_writes)); ledger_credit(task->ledger, ledger_to_update, io_size); - coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); + if (!is_external_device) { + coalition_io_ledger_update(task, FLAVOR_IO_LOGICAL_WRITES, TRUE, io_size); + } break; } @@ -6784,7 +6785,7 @@ task_update_logical_writes(task_t task, uint32_t io_size, int flags, void *vp) if (io_telemetry_limit != 0) { /* If io_telemetry_limit is 0, disable global updates and I/O telemetry */ needs_telemetry = global_update_logical_writes(io_delta, global_counter_to_update); - if (needs_telemetry) { + if (needs_telemetry && !is_external_device) { act_set_io_telemetry_ast(current_thread()); } } diff --git a/osfmk/kern/telemetry.c b/osfmk/kern/telemetry.c index b723f0b7c..29595b721 100644 --- a/osfmk/kern/telemetry.c +++ b/osfmk/kern/telemetry.c @@ -486,9 +486,10 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro uintptr_t frames[128]; bool user64_regs = false; - int backtrace_error = backtrace_user(frames, - sizeof(frames) / sizeof(frames[0]), &btcount, &user64_regs, NULL); - if (backtrace_error) { + int bterror = 0; + btcount = backtrace_user(frames, + sizeof(frames) / sizeof(frames[0]), &bterror, &user64_regs, NULL); + if (bterror != 0) { return; } bool user64_va = task_has_64Bit_addr(task); diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index e26f4dced..fde3ef327 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -246,7 +246,7 @@ void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(t os_refgrp_decl(static, thread_refgrp, "thread", NULL); -void +thread_t thread_bootstrap(void) { /* @@ -418,7 +418,13 @@ thread_bootstrap(void) /* fiddle with init thread to skip asserts in set_sched_pri */ init_thread.sched_pri = MAXPRI_KERNEL; - machine_set_current_thread(&init_thread); + return &init_thread; +} + +void +thread_machine_init_template(void) +{ + machine_thread_template_init(&thread_template); } extern boolean_t allow_qos_policy_set; diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 673259a16..f5b7cf612 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -675,7 +675,9 @@ struct thread { #define assert_thread_magic(thread) do { (void)(thread); } while (0) #endif -extern void thread_bootstrap(void); +extern thread_t thread_bootstrap(void); + +extern void thread_machine_init_template(void); extern void thread_init(void); @@ -861,7 +863,9 @@ extern kern_return_t machine_thread_dup( thread_t target, boolean_t is_corpse); -extern void machine_thread_init(void); +extern void machine_thread_init(void); + +extern void machine_thread_template_init(thread_t thr_template); extern kern_return_t machine_thread_create( thread_t thread, diff --git a/osfmk/kern/work_interval.c b/osfmk/kern/work_interval.c index 5986b975d..ed14fe308 100644 --- a/osfmk/kern/work_interval.c +++ b/osfmk/kern/work_interval.c @@ -129,7 +129,7 @@ 
work_interval_port_convert_locked(ipc_port_t port) return NULL; } - work_interval = (struct work_interval *)port->ip_kobject; + work_interval = (struct work_interval *) ip_get_kobject(port); wi_retain(work_interval); @@ -228,7 +228,7 @@ work_interval_port_notify(mach_msg_header_t *msg) port, port->ip_srights); } - work_interval = (struct work_interval *)port->ip_kobject; + work_interval = (struct work_interval *) ip_get_kobject(port); if (work_interval == NULL) { panic("work_interval_port_notify(): missing kobject: %p", port); diff --git a/osfmk/kperf/callstack.c b/osfmk/kperf/callstack.c index 4a38dd7c5..b42389c91 100644 --- a/osfmk/kperf/callstack.c +++ b/osfmk/kperf/callstack.c @@ -335,24 +335,30 @@ kperf_ucallstack_sample(struct kp_ucallstack *cs, struct kperf_context *context) bool user64 = false; bool trunc = false; - int err = backtrace_thread_user(thread, cs->kpuc_frames, - cs->kpuc_nframes - 1, &cs->kpuc_nframes, &user64, &trunc); - cs->kpuc_flags = CALLSTACK_KERNEL_WORDS; - if (user64) { - cs->kpuc_flags |= CALLSTACK_64BIT; - } - if (trunc) { - cs->kpuc_flags |= CALLSTACK_TRUNCATED; - } + int error = 0; + /* + * Leave space for the fixup information. + */ + unsigned int maxnframes = cs->kpuc_nframes - 1; + unsigned int nframes = backtrace_thread_user(thread, cs->kpuc_frames, + maxnframes, &error, &user64, &trunc); + cs->kpuc_nframes = MIN(maxnframes, nframes); - if (!err || err == EFAULT) { + /* + * Ignore EFAULT to get as much of the stack as possible. It will be + * marked as truncated, below. + */ + if (error == 0 || error == EFAULT) { callstack_fixup_user(cs, thread); cs->kpuc_flags |= CALLSTACK_VALID; } else { cs->kpuc_nframes = 0; - BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK, err); + BUF_INFO(PERF_CS_ERROR, ERR_GETSTACK, error); } + cs->kpuc_flags |= CALLSTACK_KERNEL_WORDS | (user64 ? CALLSTACK_64BIT : 0) | + (trunc ? CALLSTACK_TRUNCATED : 0); + BUF_INFO(PERF_CS_USAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread), cs->kpuc_flags, cs->kpuc_nframes); } diff --git a/osfmk/mach/i386/_structs.h b/osfmk/mach/i386/_structs.h index b998ba056..c9cc8992c 100644 --- a/osfmk/mach/i386/_structs.h +++ b/osfmk/mach/i386/_structs.h @@ -222,7 +222,6 @@ _STRUCT_XMM_REG }; #endif /* !__DARWIN_UNIX03 */ -#if !defined(RC_HIDE_XNU_J137) /* defn of 256 bit YMM regs */ #if __DARWIN_UNIX03 @@ -268,7 +267,6 @@ _STRUCT_OPMASK_REG char opmask_reg[8]; }; #endif /* !__DARWIN_UNIX03 */ -#endif /* not RC_HIDE_XNU_J137 */ /* * Floating point state. 
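 *
 * (With the RC_HIDE_XNU_J137 guards deleted throughout this header, the
 * AVX-512 register-state structures were previously visible only on builds
 * for the hardware generation that introduced them; after this change the
 * avx512 save-state definitions below are declared unconditionally.)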
@@ -362,7 +360,6 @@ _STRUCT_X86_AVX_STATE32 _STRUCT_XMM_REG __fpu_ymmh7; /* YMMH 7 */ }; -#if !defined(RC_HIDE_XNU_J137) #define _STRUCT_X86_AVX512_STATE32 struct __darwin_i386_avx512_state _STRUCT_X86_AVX512_STATE32 { @@ -424,7 +421,6 @@ _STRUCT_X86_AVX512_STATE32 _STRUCT_YMM_REG __fpu_zmmh6; /* ZMMH 6 */ _STRUCT_YMM_REG __fpu_zmmh7; /* ZMMH 7 */ }; -#endif /* not RC_HIDE_XNU_J137 */ #else /* !__DARWIN_UNIX03 */ #define _STRUCT_X86_FLOAT_STATE32 struct i386_float_state @@ -510,7 +506,6 @@ _STRUCT_X86_AVX_STATE32 _STRUCT_XMM_REG fpu_ymmh7; /* YMMH 7 */ }; -#if !defined(RC_HIDE_XNU_J137) #define _STRUCT_X86_AVX512_STATE32 struct i386_avx512_state _STRUCT_X86_AVX512_STATE32 { @@ -572,7 +567,6 @@ _STRUCT_X86_AVX512_STATE32 _STRUCT_YMM_REG fpu_zmmh6; /* ZMMH 6 */ _STRUCT_YMM_REG fpu_zmmh7; /* ZMMH 7 */ }; -#endif /* not RC_HIDE_XNU_J137 */ #endif /* !__DARWIN_UNIX03 */ @@ -835,7 +829,6 @@ _STRUCT_X86_AVX_STATE64 _STRUCT_XMM_REG __fpu_ymmh15; /* YMMH 15 */ }; -#if !defined(RC_HIDE_XNU_J137) #define _STRUCT_X86_AVX512_STATE64 struct __darwin_x86_avx512_state64 _STRUCT_X86_AVX512_STATE64 { @@ -943,7 +936,6 @@ _STRUCT_X86_AVX512_STATE64 _STRUCT_ZMM_REG __fpu_zmm30; /* ZMM 30 */ _STRUCT_ZMM_REG __fpu_zmm31; /* ZMM 31 */ }; -#endif /* not RC_HIDE_XNU_J137 */ #else /* !__DARWIN_UNIX03 */ #define _STRUCT_X86_FLOAT_STATE64 struct x86_float_state64 @@ -1065,7 +1057,6 @@ _STRUCT_X86_AVX_STATE64 _STRUCT_XMM_REG fpu_ymmh15; /* YMMH 15 */ }; -#if !defined(RC_HIDE_XNU_J137) #define _STRUCT_X86_AVX512_STATE64 struct x86_avx512_state64 _STRUCT_X86_AVX512_STATE64 { @@ -1173,7 +1164,6 @@ _STRUCT_X86_AVX512_STATE64 _STRUCT_ZMM_REG fpu_zmm30; /* ZMM 30 */ _STRUCT_ZMM_REG fpu_zmm31; /* ZMM 31 */ }; -#endif /* not RC_HIDE_XNU_J137 */ #endif /* !__DARWIN_UNIX03 */ diff --git a/osfmk/mach/i386/fp_reg.h b/osfmk/mach/i386/fp_reg.h index 3d2c69302..c704e42c6 100644 --- a/osfmk/mach/i386/fp_reg.h +++ b/osfmk/mach/i386/fp_reg.h @@ -130,9 +130,7 @@ struct x86_avx512_thread_state { typedef union { struct x86_fx_thread_state fx; struct x86_avx_thread_state avx; -#if !defined(RC_HIDE_XNU_J137) struct x86_avx512_thread_state avx512; -#endif } x86_ext_thread_state_t; #define EVEX_PREFIX 0x62 /* AVX512's EVEX vector operation prefix */ diff --git a/osfmk/mach/i386/thread_state.h b/osfmk/mach/i386/thread_state.h index 69d1a03d9..759489dcf 100644 --- a/osfmk/mach/i386/thread_state.h +++ b/osfmk/mach/i386/thread_state.h @@ -33,11 +33,7 @@ #define _MACH_I386_THREAD_STATE_H_ /* Size of maximum exported thread state in words */ -#if !defined(RC_HIDE_XNU_J137) #define I386_THREAD_STATE_MAX (614) /* Size of biggest state possible */ -#else -#define I386_THREAD_STATE_MAX (224) /* Size of biggest state possible */ -#endif /* !defined(RC_HIDE_XNU_J137) */ #if defined (__i386__) || defined(__x86_64__) #define THREAD_STATE_MAX I386_THREAD_STATE_MAX diff --git a/osfmk/mach/mach_port.defs b/osfmk/mach/mach_port.defs index ea3328933..d62095ad7 100644 --- a/osfmk/mach/mach_port.defs +++ b/osfmk/mach/mach_port.defs @@ -76,6 +76,8 @@ subsystem #include #include +type kobject_description_t = c_string[*:512]; + /* * Returns the set of port and port set names * to which the target task has access, along with @@ -658,4 +660,17 @@ routine mach_port_swap_guard( new_guard : uint64_t); #endif +/* + * Return the type and address of the kernel object + * that the given send/receive right represents. + * This call is only valid on MACH_IPC_DEBUG kernels. + * Otherwise, KERN_FAILURE is returned. 
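+ *
+ * A sketch of calling the MIG-generated stub from user space (signature
+ * inferred from this definition; only meaningful on MACH_IPC_DEBUG
+ * kernels, where the call does not simply fail):
+ *
+ *     natural_t kotype = 0;
+ *     mach_vm_address_t koaddr = 0;
+ *     kobject_description_t desc = { 0 };
+ *     kern_return_t kr = mach_port_kobject_description(mach_task_self(),
+ *         name, &kotype, &koaddr, desc);
+ *     if (kr == KERN_SUCCESS) {
+ *         printf("type %u addr 0x%llx %s\n", kotype, (uint64_t)koaddr, desc);
+ *     }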
+ */ +routine mach_port_kobject_description( + task : ipc_space_inspect_t; + name : mach_port_name_t; + out object_type : natural_t; + out object_addr : mach_vm_address_t; + out description : kobject_description_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index d2e9fb0b4..09613b3a8 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -175,6 +175,15 @@ type thread_act_consume_ref_t = mach_port_move_send_t #endif /* KERNEL_SERVER */ ; +type suid_cred_path_t = c_string[*:1024]; +type suid_cred_uid_t = uint32_t; +type suid_cred_t = mach_port_t +#if KERNEL_SERVER + outtran: mach_port_t convert_suid_cred_to_port(suid_cred_t) +#endif /* KERNEL_SERVER */ + ; + + /* thread_state_t: This inline array can hold * a machine-dependent amount of data, defined in * mach/machine/???? (currently THREAD_STATE_MAX, @@ -642,6 +651,7 @@ simport ; /* for arcade_register conversions */ simport ; /* pick up kernel-specific MIG things */ +simport ; #endif /* KERNEL_SERVER */ import ; diff --git a/osfmk/mach/mach_types.h b/osfmk/mach/mach_types.h index 5430caaeb..e46370cc8 100644 --- a/osfmk/mach/mach_types.h +++ b/osfmk/mach/mach_types.h @@ -133,8 +133,8 @@ typedef struct ledger *ledger_t; typedef struct alarm *alarm_t; typedef struct clock *clock_serv_t; typedef struct clock *clock_ctrl_t; - typedef struct arcade_register *arcade_register_t; +typedef struct suid_cred *suid_cred_t; /* * OBSOLETE: lock_set interfaces are obsolete. @@ -155,8 +155,8 @@ struct semaphore; struct ledger; struct alarm; struct clock; - struct arcade_register; +struct suid_cred; __END_DECLS @@ -190,8 +190,9 @@ typedef mach_port_t ledger_t; typedef mach_port_t alarm_t; typedef mach_port_t clock_serv_t; typedef mach_port_t clock_ctrl_t; - typedef mach_port_t arcade_register_t; +typedef mach_port_t suid_cred_t; + #endif /* KERNEL */ /* @@ -260,6 +261,8 @@ typedef exception_handler_t exception_port_t; typedef exception_handler_array_t exception_port_arrary_t; typedef char vfs_path_t[4096]; typedef char nspace_path_t[1024]; /* 1024 == PATH_MAX */ +typedef char suid_cred_path_t[1024]; +typedef uint32_t suid_cred_uid_t; #ifdef KERNEL #define TASK_NULL ((task_t) NULL) @@ -284,6 +287,7 @@ typedef char nspace_path_t[1024]; /* 1024 == PATH_MAX */ #define CLOCK_NULL ((clock_t) NULL) #define UND_SERVER_NULL ((UNDServerRef) NULL) #define ARCADE_REG_NULL ((arcade_register_t) NULL) +#define SUID_CRED_NULL ((suid_cred_t) NULL) #else #define TASK_NULL ((task_t) 0) #define TASK_NAME_NULL ((task_name_t) 0) @@ -307,6 +311,7 @@ typedef char nspace_path_t[1024]; /* 1024 == PATH_MAX */ #define CLOCK_NULL ((clock_t) 0) #define UND_SERVER_NULL ((UNDServerRef) 0) #define ARCADE_REG_NULL ((arcade_register_t) 0) +#define SUID_CRED_NULL ((suid_cred_t) 0) #endif /* DEPRECATED */ diff --git a/osfmk/mach/sysdiagnose_notification.defs b/osfmk/mach/sysdiagnose_notification.defs index af048e2b0..c74c29f74 100644 --- a/osfmk/mach/sysdiagnose_notification.defs +++ b/osfmk/mach/sysdiagnose_notification.defs @@ -47,4 +47,9 @@ simpleroutine sysdiagnose_notification( sysdiagnose_port : mach_port_t; flags : uint32_t); +simpleroutine sysdiagnose_notification_with_audit_token( + sysdiagnose_port : mach_port_t; + flags : uint32_t; + ServerAuditToken atoken : audit_token_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach/task.defs b/osfmk/mach/task.defs index 378fe2039..8723a5255 100644 --- a/osfmk/mach/task.defs +++ b/osfmk/mach/task.defs @@ -512,5 +512,11 @@ routine task_set_exc_guard_behavior( task : task_t; 
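/*
 * For the task_create_suid_cred routine added below, a suitably entitled
 * caller would use the MIG-generated stub roughly as follows; the path and
 * uid here are purely illustrative:
 *
 *     suid_cred_t cred = SUID_CRED_NULL;
 *     kern_return_t kr = task_create_suid_cred(mach_task_self(),
 *         "/usr/local/bin/some_tool", 0, &cred);
 *     if (kr == KERN_SUCCESS) {
 *         // pass the send right to a spawned process, then drop our ref
 *         mach_port_deallocate(mach_task_self(), cred);
 *     }
 */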
behavior : task_exc_guard_behavior_t); +routine task_create_suid_cred( + task : task_t; + path : suid_cred_path_t; + uid : suid_cred_uid_t; + out delegation : suid_cred_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach_debug/mach_debug_types.h b/osfmk/mach_debug/mach_debug_types.h index 4ba2440df..8781b108e 100644 --- a/osfmk/mach_debug/mach_debug_types.h +++ b/osfmk/mach_debug/mach_debug_types.h @@ -89,5 +89,7 @@ struct mach_core_fileheader { struct mach_core_details files[MACH_CORE_FILEHEADER_MAXFILES]; }; +#define KOBJECT_DESCRIPTION_LENGTH 512 +typedef char kobject_description_t[KOBJECT_DESCRIPTION_LENGTH]; #endif /* _MACH_DEBUG_MACH_DEBUG_TYPES_H_ */ diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index db1574d06..207ee0fbc 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -1401,7 +1401,7 @@ memory_object_iopl_request( if (ip_kotype(port) == IKOT_NAMED_ENTRY) { vm_named_entry_t named_entry; - named_entry = (vm_named_entry_t)port->ip_kobject; + named_entry = (vm_named_entry_t) ip_get_kobject(port); /* a few checks to make sure user is obeying rules */ if (*upl_size == 0) { if (offset >= named_entry->size) { @@ -2340,7 +2340,7 @@ convert_port_to_upl( ip_unlock(port); return (upl_t)NULL; } - upl = (upl_t) port->ip_kobject; + upl = (upl_t) ip_get_kobject(port); ip_unlock(port); upl_lock(upl); upl->ref_count += 1; diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index c79a03e57..071a66d4e 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -55,6 +55,12 @@ extern boolean_t vm_darkwake_mode; +#if DEVELOPMENT || DEBUG +int do_cseg_wedge_thread(void); +int do_cseg_unwedge_thread(void); +static event_t debug_cseg_wait_event = NULL; +#endif /* DEVELOPMENT || DEBUG */ + #if POPCOUNT_THE_COMPRESSED_DATA boolean_t popcount_c_segs = TRUE; @@ -678,7 +684,29 @@ vm_compressor_init(void) compressor_pool_size = ((kernel_map->max_offset - kernel_map->min_offset) - kernel_map->size) - VM_RESERVE_SIZE; } compressor_pool_multiplier = 1; + +#elif defined(__arm64__) && defined(XNU_TARGET_OS_WATCH) + + /* + * On M9 watches the compressor can become big and can lead to + * churn in workingset resulting in audio drops. 
Setting a cap + * on the compressor size favors reclaiming unused memory + * sitting in idle band via jetsams + */ + +#define COMPRESSOR_CAP_PERCENTAGE 30ULL + + if (compressor_pool_max_size > max_mem) { + compressor_pool_max_size = max_mem; + } + + if (vm_compression_limit == 0) { + compressor_pool_size = (max_mem * COMPRESSOR_CAP_PERCENTAGE) / 100ULL; + } + compressor_pool_multiplier = 1; + #else + if (compressor_pool_max_size > max_mem) { compressor_pool_max_size = max_mem; } @@ -1074,11 +1102,48 @@ c_seg_do_minor_compaction_and_unlock(c_segment_t c_seg, boolean_t clear_busy, bo return c_seg_freed; } +void +kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo) +{ + c_segment_t c_seg = (c_segment_t) wait_event; + + waitinfo->owner = thread_tid(c_seg->c_busy_for_thread); + waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(c_seg); +} + +#if DEVELOPMENT || DEBUG +int +do_cseg_wedge_thread(void) +{ + struct c_segment c_seg; + c_seg.c_busy_for_thread = current_thread(); + + debug_cseg_wait_event = (event_t) &c_seg; + + thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor); + assert_wait((event_t) (&c_seg), THREAD_INTERRUPTIBLE); + + thread_block(THREAD_CONTINUE_NULL); + + return 0; +} + +int +do_cseg_unwedge_thread(void) +{ + thread_wakeup(debug_cseg_wait_event); + debug_cseg_wait_event = NULL; + + return 0; +} +#endif /* DEVELOPMENT || DEBUG */ void c_seg_wait_on_busy(c_segment_t c_seg) { c_seg->c_wanted = 1; + + thread_set_pending_block_hint(current_thread(), kThreadWaitCompressor); assert_wait((event_t) (c_seg), THREAD_UNINT); lck_mtx_unlock_always(&c_seg->c_lock); diff --git a/osfmk/vm/vm_compressor.h b/osfmk/vm/vm_compressor.h index c95cc518a..c3b722952 100644 --- a/osfmk/vm/vm_compressor.h +++ b/osfmk/vm/vm_compressor.h @@ -170,9 +170,7 @@ struct c_segment { unsigned int cseg_swap_size; #endif /* CHECKSUM_THE_SWAP */ -#if MACH_ASSERT thread_t c_busy_for_thread; -#endif /* MACH_ASSERT */ int c_slot_var_array_len; struct c_slot *c_slot_var_array; @@ -237,7 +235,7 @@ extern vm_offset_t c_buffers; assert((cseg)->c_busy); \ (cseg)->c_busy = 0; \ assert((cseg)->c_busy_for_thread != NULL); \ - assert((((cseg)->c_busy_for_thread = NULL), TRUE)); \ + (cseg)->c_busy_for_thread = NULL; \ if ((cseg)->c_wanted) { \ (cseg)->c_wanted = 0; \ thread_wakeup((event_t) (cseg)); \ @@ -249,7 +247,7 @@ extern vm_offset_t c_buffers; assert((cseg)->c_busy == 0); \ (cseg)->c_busy = 1; \ assert((cseg)->c_busy_for_thread == NULL); \ - assert((((cseg)->c_busy_for_thread = current_thread()), TRUE)); \ + (cseg)->c_busy_for_thread = current_thread(); \ MACRO_END @@ -373,6 +371,8 @@ extern uint32_t vm_compressor_catchup_threshold_divisor_overridden; extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, clock_sec_t, clock_nsec_t); +extern void kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo_t *waitinfo); + #define PAGE_REPLACEMENT_DISALLOWED(enable) (enable == TRUE ? lck_rw_lock_shared(&c_master_lock) : lck_rw_done(&c_master_lock)) #define PAGE_REPLACEMENT_ALLOWED(enable) (enable == TRUE ? 
lck_rw_lock_exclusive(&c_master_lock) : lck_rw_done(&c_master_lock)) diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index e3956937b..1622e5547 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -6682,7 +6682,7 @@ vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr uint64_t cupid = get_current_unique_pid(); uintptr_t bpc = 0; - uint32_t bfrs = 0; + int btr = 0; bool u64 = false; /* Capture a single-frame backtrace; this extracts just the program @@ -6690,7 +6690,7 @@ vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr * further user stack traversals, thus avoiding copyin()s and further * faults. */ - int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64, NULL); + unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL); if ((btr == 0) && (bfrs > 0)) { cfpc = bpc; diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 6146c8e40..ab0d87614 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -1197,6 +1197,7 @@ vm_map_create_options( result->map_disallow_data_exec = FALSE; result->is_nested_map = FALSE; result->map_disallow_new_exec = FALSE; + result->terminated = FALSE; result->highest_entry_end = 0; result->first_free = vm_map_to_entry(result); result->hint = vm_map_to_entry(result); @@ -4023,7 +4024,7 @@ vm_map_enter_mem_object_helper( } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) { vm_named_entry_t named_entry; - named_entry = (vm_named_entry_t) port->ip_kobject; + named_entry = (vm_named_entry_t) ip_get_kobject(port); if (flags & (VM_FLAGS_RETURN_DATA_ADDR | VM_FLAGS_RETURN_4K_DATA_ADDR)) { @@ -7612,7 +7613,7 @@ vm_map_delete( const vm_map_offset_t FIND_GAP = 1; /* a not page aligned value */ const vm_map_offset_t GAPS_OK = 2; /* a different not page aligned value */ - if (map != kernel_map && !(flags & VM_MAP_REMOVE_GAPS_OK)) { + if (map != kernel_map && !(flags & VM_MAP_REMOVE_GAPS_OK) && !map->terminated) { gap_start = FIND_GAP; } else { gap_start = GAPS_OK; @@ -8327,6 +8328,34 @@ vm_map_delete( return KERN_SUCCESS; } + +/* + * vm_map_terminate: + * + * Clean out a task's map. 
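+ *
+ * Setting map->terminated under the map lock before the final
+ * vm_map_remove() lets vm_map_delete() (above) skip its gap-detection
+ * pass for dying maps, which is why the task-termination path in
+ * kern/task.c now calls this routine instead of vm_map_remove() directly.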
+ */ +kern_return_t +vm_map_terminate( + vm_map_t map) +{ + vm_map_lock(map); + map->terminated = TRUE; + vm_map_unlock(map); + + return vm_map_remove(map, + map->min_offset, + map->max_offset, + /* + * Final cleanup: + * + no unnesting + * + remove immutable mappings + * + allow gaps in range + */ + (VM_MAP_REMOVE_NO_UNNESTING | + VM_MAP_REMOVE_IMMUTABLE | + VM_MAP_REMOVE_GAPS_OK)); +} + /* * vm_map_remove: * @@ -17809,7 +17838,7 @@ convert_port_entry_to_map( if (ip_active(port) && (ip_kotype(port) == IKOT_NAMED_ENTRY)) { named_entry = - (vm_named_entry_t)port->ip_kobject; + (vm_named_entry_t) ip_get_kobject(port); if (!(lck_mtx_try_lock(&(named_entry)->Lock))) { ip_unlock(port); @@ -17867,7 +17896,7 @@ try_again: ip_lock(port); if (ip_active(port) && (ip_kotype(port) == IKOT_NAMED_ENTRY)) { - named_entry = (vm_named_entry_t)port->ip_kobject; + named_entry = (vm_named_entry_t) ip_get_kobject(port); if (!(lck_mtx_try_lock(&(named_entry)->Lock))) { ip_unlock(port); try_failed_count++; @@ -18692,6 +18721,7 @@ again: } } + *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL)); if (evaluation_phase) { unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64; @@ -18724,7 +18754,6 @@ again: goto again; } else { kr = KERN_SUCCESS; - *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL)); } done: diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index 3360cdfb4..e49170c5d 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -516,7 +516,8 @@ struct _vm_map { /* boolean_t */ map_disallow_new_exec:1, /* Disallow new executable code */ /* boolean_t */ jit_entry_exists:1, /* boolean_t */ has_corpse_footprint:1, - /* reserved */ pad:20; + /* boolean_t */ terminated:1, + /* reserved */ pad:19; unsigned int timestamp; /* Version number */ }; @@ -1348,6 +1349,9 @@ extern kern_return_t vm_map_enter_mem_object_control( vm_prot_t max_protection, vm_inherit_t inheritance); +extern kern_return_t vm_map_terminate( + vm_map_t map); + #endif /* !XNU_KERNEL_PRIVATE */ /* Deallocate a region */ diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index d8befa53c..341ce4754 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -328,7 +328,7 @@ vm_shared_region_vm_map( assert(shared_region->sr_ref_count > 1); sr_handle = shared_region->sr_mem_entry; - sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject; + sr_mem_entry = (vm_named_entry_t) ip_get_kobject(sr_handle); sr_map = sr_mem_entry->backing.map; assert(sr_mem_entry->is_sub_map); @@ -912,7 +912,7 @@ vm_shared_region_destroy( assert(!shared_region->sr_persists); assert(!shared_region->sr_slid); - mem_entry = (vm_named_entry_t) shared_region->sr_mem_entry->ip_kobject; + mem_entry = (vm_named_entry_t) ip_get_kobject(shared_region->sr_mem_entry); assert(mem_entry->is_sub_map); assert(!mem_entry->internal); assert(!mem_entry->is_copy); @@ -1066,7 +1066,7 @@ vm_shared_region_undo_mappings( /* no need to lock because this data is never modified... */ sr_handle = shared_region->sr_mem_entry; - sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject; + sr_mem_entry = (vm_named_entry_t) ip_get_kobject(sr_handle); sr_map = sr_mem_entry->backing.map; sr_base_address = shared_region->sr_base_address; } @@ -1208,7 +1208,7 @@ vm_shared_region_map_file( /* no need to lock because this data is never modified... 
*/ sr_handle = shared_region->sr_mem_entry; - sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject; + sr_mem_entry = (vm_named_entry_t) ip_get_kobject(sr_handle); sr_map = sr_mem_entry->backing.map; sr_base_address = shared_region->sr_base_address; @@ -1593,7 +1593,7 @@ vm_shared_region_trim_and_get(task_t task) } sr_handle = shared_region->sr_mem_entry; - sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject; + sr_mem_entry = (vm_named_entry_t) ip_get_kobject(sr_handle); sr_map = sr_mem_entry->backing.map; /* Trim the pmap if possible. */ @@ -2749,14 +2749,14 @@ vm_commpage_text_init(void) /* create the 32 bit comm text page */ unsigned int offset = (random() % _PFZ32_SLIDE_RANGE) << PAGE_SHIFT; /* restricting to 32bMAX-2PAGE */ _vm_commpage_init(&commpage_text32_handle, _COMM_PAGE_TEXT_AREA_LENGTH); - commpage_text32_entry = (vm_named_entry_t) commpage_text32_handle->ip_kobject; + commpage_text32_entry = (vm_named_entry_t) ip_get_kobject(commpage_text32_handle); commpage_text32_map = commpage_text32_entry->backing.map; commpage_text32_location = (user32_addr_t) (_COMM_PAGE32_TEXT_START + offset); /* XXX if (cpu_is_64bit_capable()) ? */ /* create the 64-bit comm page */ offset = (random() % _PFZ64_SLIDE_RANGE) << PAGE_SHIFT; /* restricting sliding upto 2Mb range */ _vm_commpage_init(&commpage_text64_handle, _COMM_PAGE_TEXT_AREA_LENGTH); - commpage_text64_entry = (vm_named_entry_t) commpage_text64_handle->ip_kobject; + commpage_text64_entry = (vm_named_entry_t) ip_get_kobject(commpage_text64_handle); commpage_text64_map = commpage_text64_entry->backing.map; commpage_text64_location = (user64_addr_t) (_COMM_PAGE64_TEXT_START + offset); @@ -2782,13 +2782,13 @@ vm_commpage_init(void) #if defined(__i386__) || defined(__x86_64__) /* create the 32-bit comm page */ _vm_commpage_init(&commpage32_handle, _COMM_PAGE32_AREA_LENGTH); - commpage32_entry = (vm_named_entry_t) commpage32_handle->ip_kobject; + commpage32_entry = (vm_named_entry_t) ip_get_kobject(commpage32_handle); commpage32_map = commpage32_entry->backing.map; /* XXX if (cpu_is_64bit_capable()) ? 
*/ /* create the 64-bit comm page */ _vm_commpage_init(&commpage64_handle, _COMM_PAGE64_AREA_LENGTH); - commpage64_entry = (vm_named_entry_t) commpage64_handle->ip_kobject; + commpage64_entry = (vm_named_entry_t) ip_get_kobject(commpage64_handle); commpage64_map = commpage64_entry->backing.map; #endif /* __i386__ || __x86_64__ */ diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index ab106cb5a..027d0c992 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -2372,7 +2372,7 @@ mach_make_memory_entry_internal( if (IP_VALID(parent_handle) && ip_kotype(parent_handle) == IKOT_NAMED_ENTRY) { - parent_entry = (vm_named_entry_t) parent_handle->ip_kobject; + parent_entry = (vm_named_entry_t) ip_get_kobject(parent_handle); } else { parent_entry = NULL; } @@ -3646,7 +3646,7 @@ memory_entry_purgeable_control_internal( return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ip_get_kobject(entry_port); named_entry_lock(mem_entry); @@ -3709,7 +3709,7 @@ memory_entry_access_tracking_internal( return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ip_get_kobject(entry_port); named_entry_lock(mem_entry); @@ -3788,7 +3788,7 @@ mach_memory_entry_ownership( ip_kotype(entry_port) != IKOT_NAMED_ENTRY) { return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ip_get_kobject(entry_port); named_entry_lock(mem_entry); @@ -3842,7 +3842,7 @@ mach_memory_entry_get_page_counts( return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ip_get_kobject(entry_port); named_entry_lock(mem_entry); @@ -3907,7 +3907,7 @@ mach_destroy_memory_entry( #if MACH_ASSERT assert(ip_kotype(port) == IKOT_NAMED_ENTRY); #endif /* MACH_ASSERT */ - named_entry = (vm_named_entry_t)port->ip_kobject; + named_entry = (vm_named_entry_t) ip_get_kobject(port); named_entry_lock(named_entry); named_entry->ref_count -= 1; @@ -3934,8 +3934,7 @@ mach_destroy_memory_entry( lck_mtx_unlock(&vm_named_entry_list_lock_data); #endif /* VM_NAMED_ENTRY_LIST */ - kfree(port->ip_kobject, - sizeof(struct vm_named_entry)); + kfree(named_entry, sizeof(struct vm_named_entry)); } else { named_entry_unlock(named_entry); } @@ -3961,7 +3960,7 @@ mach_memory_entry_page_op( return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ip_get_kobject(entry_port); named_entry_lock(mem_entry); @@ -4014,7 +4013,7 @@ mach_memory_entry_range_op( return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ip_get_kobject(entry_port); named_entry_lock(mem_entry); diff --git a/osfmk/x86_64/monotonic_x86_64.c b/osfmk/x86_64/monotonic_x86_64.c index b182f653a..1e9a34313 100644 --- a/osfmk/x86_64/monotonic_x86_64.c +++ b/osfmk/x86_64/monotonic_x86_64.c @@ -37,6 +37,7 @@ #include #include #include +#include /* * Sanity check the compiler. 
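 *
 * (On the enable_counters() change that follows: IA32_PERF_GLOBAL_CTRL
 * carries one enable bit per programmable counter in its low bits, with
 * the fixed-counter enables up at bits 32+. On a part with four
 * configurable PMCs the write below would amount to, as a sketch:
 *
 *     wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN | 0xfULL);
 *
 * with kpc_get_configurable_pmc_mask() supplying the real low-bit mask
 * whenever kpc has configurable counters running.)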
@@ -164,7 +165,13 @@ static void enable_counters(void) { wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE); - wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN); + + uint64_t global_en = GLOBAL_CTRL_FIXED_EN; + if (kpc_get_running() & KPC_CLASS_CONFIGURABLE_MASK) { + global_en |= kpc_get_configurable_pmc_mask(KPC_CLASS_CONFIGURABLE_MASK); + } + + wrmsr64(GLOBAL_CTRL, global_en); } static void diff --git a/security/mac_base.c b/security/mac_base.c index 6a99e38fd..b4dad979c 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -2088,6 +2088,15 @@ mac_iokit_check_hid_control(kauth_cred_t cred __unused) return 0; } +int mac_mount_check_snapshot_mount(vfs_context_t ctx, struct vnode *rvp, struct vnode *vp, struct componentname *cnp, + const char *name, const char *vfc_name); +int +mac_mount_check_snapshot_mount(vfs_context_t ctx __unused, struct vnode *rvp __unused, struct vnode *vp __unused, + struct componentname *cnp __unused, const char *name __unused, const char *vfc_name __unused) +{ + return 0; +} + int mac_vnode_check_trigger_resolve(vfs_context_t ctx __unused, struct vnode *dvp __unused, struct componentname *cnp __unused); int mac_vnode_check_trigger_resolve(vfs_context_t ctx __unused, struct vnode *dvp __unused, struct componentname *cnp __unused) diff --git a/security/mac_framework.h b/security/mac_framework.h index e8c27a348..865dfaa7a 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -271,6 +271,9 @@ int mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp, const char *name); int mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp, const char *name); +int mac_mount_check_snapshot_mount(vfs_context_t ctx, struct vnode *rvp, + struct vnode *vp, struct componentname *cnp, const char *name, + const char *vfc_name); int mac_mount_check_snapshot_revert(vfs_context_t ctx, struct mount *mp, const char *name); int mac_mount_check_remount(vfs_context_t ctx, struct mount *mp); diff --git a/security/mac_policy.h b/security/mac_policy.h index 1b46adf7a..3f9ddbd52 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -1855,6 +1855,32 @@ typedef int mpo_mount_check_snapshot_delete_t( struct mount *mp, const char *name ); +/** + * @brief Access control check for fs_snapshot_mount + * @param cred Subject credential + * @param rvp Vnode of either the root directory of the + * filesystem to mount snapshot of, or the device from + * which to mount the snapshot. + * @param vp Vnode that is to be the mount point + * @param cnp Component name for vp + * @param name Name of snapshot to mount + * @param vfc_name Filesystem type name + * + * Determine whether the subject identified by the credential can + * mount the named snapshot from the filesystem at the given + * directory. + * + * @return Return 0 if access is granted, otherwise an appropriate value + * for errno should be returned. + */ +typedef int mpo_mount_check_snapshot_mount_t( + kauth_cred_t cred, + struct vnode *rvp, + struct vnode *vp, + struct componentname *cnp, + const char *name, + const char *vfc_name + ); /** * @brief Access control check for fs_snapshot_revert * @param cred Subject credential @@ -6296,7 +6322,7 @@ typedef void mpo_reserved_hook_t(void); * Please note that this should be kept in sync with the check assumptions * policy in bsd/kern/policy_check.c (policy_ops struct). 
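 *
 * A hypothetical policy filling the snapshot-mount slot added below would
 * look roughly like this (my_snapshot_mount_check is an illustrative name,
 * not part of this change; the signature comes from the typedef above):
 *
 *     static int
 *     my_snapshot_mount_check(kauth_cred_t cred, struct vnode *rvp,
 *         struct vnode *vp, struct componentname *cnp, const char *name,
 *         const char *vfc_name)
 *     {
 *         // e.g. only allow root to mount snapshots
 *         return kauth_cred_getuid(cred) == 0 ? 0 : EPERM;
 *     }
 *
 * wired up via
 *
 *     .mpo_mount_check_snapshot_mount = my_snapshot_mount_check,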
*/ -#define MAC_POLICY_OPS_VERSION 59 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 62 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -6450,7 +6476,7 @@ struct mac_policy_ops { mpo_vnode_check_trigger_resolve_t *mpo_vnode_check_trigger_resolve; mpo_mount_check_mount_late_t *mpo_mount_check_mount_late; - mpo_reserved_hook_t *mpo_reserved1; + mpo_mount_check_snapshot_mount_t *mpo_mount_check_snapshot_mount; mpo_reserved_hook_t *mpo_reserved2; mpo_skywalk_flow_check_connect_t *mpo_skywalk_flow_check_connect; mpo_skywalk_flow_check_listen_t *mpo_skywalk_flow_check_listen; diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 95afa830b..67452ded3 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -100,7 +100,7 @@ * KDBG_EVENTID(DBG_FSYSTEM, DBG_VFS, dcode) global event id, see bsd/sys/kdebug.h. * Note that dcode is multiplied by 4 and ORed as part of the construction. See bsd/kern/trace_codes * for list of system-wide {global event id, name} pairs. Currently DBG_VFS event ids are in range - * [0x3130000, 0x313016C]. + * [0x3130000, 0x3130170]. */ //#define VFS_TRACE_POLICY_OPS @@ -2337,6 +2337,29 @@ mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp, return error; } +int +mac_mount_check_snapshot_mount(vfs_context_t ctx, struct vnode *rvp, struct vnode *vp, struct componentname *cnp, + const char *name, const char *vfc_name) +{ + kauth_cred_t cred; + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return 0; + } +#endif + cred = vfs_context_ucred(ctx); + if (!mac_cred_check_enforce(cred)) { + return 0; + } + VFS_KERNEL_DEBUG_START1(92, vp); + MAC_CHECK(mount_check_snapshot_mount, cred, rvp, vp, cnp, name, vfc_name); + VFS_KERNEL_DEBUG_END1(92, vp); + return error; +} + int mac_mount_check_snapshot_revert(vfs_context_t ctx, struct mount *mp, const char *name) diff --git a/tests/Makefile b/tests/Makefile index 610cecb15..790dc11c1 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -77,7 +77,7 @@ install-immovable_send_client: immovable_send_client kdebug: INVALID_ARCHS = i386 kdebug: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils -framework kperf -EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c +EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c bpflib.c in_cksum.c ifneq ($(PLATFORM),iPhoneOS) EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c perf_compressor.c memorystatus_freeze_test.c @@ -112,7 +112,8 @@ memorystatus_zone_test: OTHER_CFLAGS += -isystem $(SDKROOT)/System/Library/Frame memorystatus_zone_test: OTHER_LDFLAGS += -framework ktrace memorystatus_zone_test: OTHER_LDFLAGS += -ldarwintest_utils -kpc: OTHER_LDFLAGS += -framework kperf +kpc: OTHER_LDFLAGS += -framework kperf -framework ktrace +kpc: INVALID_ARCHS = i386 kperf: INVALID_ARCHS = i386 kperf: OTHER_CFLAGS += kperf_helpers.c @@ -278,6 +279,9 @@ socket_bind_35685803: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist net_tuntests: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist +net_bridge: OTHER_CFLAGS += bpflib.c in_cksum.c +net_bridge: OTHER_LDFLAGS += -ldarwintest_utils + ifneq (osx,$(TARGET_NAME)) EXCLUDED_SOURCES += no32exec_35914211.c no32exec_35914211_helper.c else # target = osx @@ -352,4 +356,18 @@ debug_control_port_for_pid: 
CODE_SIGN_ENTITLEMENTS = ./debug_control_port_for_pi prng: OTHER_LDFLAGS += -ldarwintest_utils +OTHER_TEST_TARGETS += io_catalog_send_data + +io_catalog_send_data: INVALID_ARCHS = i386 +io_catalog_send_data: OTHER_CFLAGS += -DTEST_UNENTITLED -framework IOKit -framework CoreFoundation -framework Foundation +io_catalog_send_data: iokit/io_catalog_send_data.m + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + +task_create_suid_cred: CODE_SIGN_ENTITLEMENTS = ./task_create_suid_cred_entitlement.plist + +OTHER_TEST_TARGETS += task_create_suid_cred_unentitled +task_create_suid_cred_unentitled: OTHER_CFLAGS += -DUNENTITLED +task_create_suid_cred_unentitled: task_create_suid_cred.c + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets diff --git a/tests/bpflib.c b/tests/bpflib.c new file mode 100644 index 000000000..aa9b9133d --- /dev/null +++ b/tests/bpflib.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define PRIVATE_EXTERN __private_extern__ + +#include "bpflib.h" + +#ifdef TESTING +#include "util.h" +#endif /* TESTING */ + +PRIVATE_EXTERN int +bpf_set_timeout(int fd, struct timeval * tv_p) +{ + return ioctl(fd, BIOCSRTIMEOUT, tv_p); +} + +PRIVATE_EXTERN int +bpf_get_blen(int fd, int * blen) +{ + return ioctl(fd, BIOCGBLEN, blen); +} + +PRIVATE_EXTERN int +bpf_set_header_complete(int fd, u_int header_complete) +{ + return ioctl(fd, BIOCSHDRCMPLT, &header_complete); +} + +PRIVATE_EXTERN int +bpf_set_see_sent(int fd, u_int see_sent) +{ + return ioctl(fd, BIOCSSEESENT, &see_sent); +} + +PRIVATE_EXTERN int +bpf_dispose(int bpf_fd) +{ + if (bpf_fd >= 0) { + return close(bpf_fd); + } + return 0; +} + +PRIVATE_EXTERN int +bpf_new(void) +{ + char bpfdev[256]; + int i; + int fd = -1; + + for (i = 0; true; i++) { + snprintf(bpfdev, sizeof(bpfdev), "/dev/bpf%d", i); + fd = open(bpfdev, O_RDWR, 0); + if (fd >= 0) { +#ifdef SO_TC_CTL + int tc = SO_TC_CTL; + (void) ioctl(fd, BIOCSETTC, &tc); +#endif /* SO_TC_CTL */ + break; + } + if (errno != EBUSY) { + break; + } + } + return fd; +} + +PRIVATE_EXTERN int +bpf_setif(int fd, const char * en_name) +{ + struct ifreq ifr; + + strlcpy(ifr.ifr_name, en_name, sizeof(ifr.ifr_name)); + return ioctl(fd, BIOCSETIF, &ifr); +} + +PRIVATE_EXTERN int +bpf_set_immediate(int fd, u_int value) +{ + return ioctl(fd, BIOCIMMEDIATE, &value); +} + +PRIVATE_EXTERN int +bpf_filter_receive_none(int fd) +{ + struct bpf_insn insns[] = { + BPF_STMT(BPF_RET + BPF_K, 0), + }; + struct bpf_program prog; + + prog.bf_len = sizeof(insns) / sizeof(struct bpf_insn); + prog.bf_insns = insns; + return ioctl(fd, BIOCSETF, &prog); +} + +PRIVATE_EXTERN int +bpf_arp_filter(int fd, int type_offset, int type, u_int pkt_size) +{ + struct bpf_insn insns[] = { + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, type_offset), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, type, 0, 1), + BPF_STMT(BPF_RET + BPF_K, pkt_size), + BPF_STMT(BPF_RET + BPF_K, 0), + }; + struct bpf_program prog; + + prog.bf_len = sizeof(insns) / sizeof(struct bpf_insn); + prog.bf_insns = insns; + return ioctl(fd, BIOCSETF, &prog); +} + +#ifdef TESTING +#include +#include +#include + + +void +bpf_read_continuously(int fd, u_int blen) +{ + int n; + char * rxbuf = malloc(blen); + + printf("rx buf len is %d\n", blen); + while (1) { + n = read(fd, rxbuf, blen); + if (n < 0) { + perror("bpf_read_continuously"); + return; + } + if (n == 0) { + continue; + } + print_data(rxbuf, n); + } +} + +int +main(int argc, char * argv[]) +{ + int fd = bpf_new(); + char * en_name = "en0"; + u_int bpf_blen = 0; + + if (fd < 0) { + perror("no bpf devices"); + exit(1); + } + + if (argc > 1) { + en_name = argv[1]; + } + (void)bpf_set_immediate(fd, 1); + if (bpf_arp_filter(fd, 12, ETHERTYPE_ARP, + sizeof(struct ether_arp) + sizeof(struct ether_header)) + < 0) { + perror("bpf_arp_filter"); + } + if (bpf_setif(fd, en_name) < 0) { + perror("bpf_attach"); + exit(1); + } + + if (bpf_get_blen(fd, &bpf_blen) < 0) { + perror("bpf_get_blen"); + exit(1); + } + bpf_read_continuously(fd, bpf_blen); + exit(0); + return 0; +} +#endif /* TESTING */ diff --git a/tests/bpflib.h b/tests/bpflib.h new file mode 100644 index 000000000..84e21742d --- /dev/null +++ b/tests/bpflib.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2000 Apple Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + + +#ifndef _S_BPFLIB_H +#define _S_BPFLIB_H + +int bpf_get_blen(int fd, int * blen); +int bpf_new(void); +int bpf_dispose(int fd); +int bpf_setif(int fd, const char * en_name); +int bpf_set_immediate(int fd, u_int value); +int bpf_filter_receive_none(int fd); +int bpf_arp_filter(int fd, int type_offset, int type, u_int packet_size); +int bpf_set_timeout(int fd, struct timeval * tv_p); +int bpf_set_header_complete(int fd, u_int header_complete); +int bpf_set_see_sent(int fd, u_int see_send); + +#endif /* _S_BPFLIB_H */ diff --git a/tests/fcntl.c b/tests/fcntl.c new file mode 100644 index 000000000..877425f3d --- /dev/null +++ b/tests/fcntl.c @@ -0,0 +1,41 @@ +#include +#include +#include +#include +#include + +/** Verify that F_ADDSIGS does not page fault off the end of the user blob + * 1. Find VA space for 3 pages + * 2. Unmap the last page + * 3. Start fs_blob_start at PAGE_SIZE + 1 bytes away from the end of the + * VA region (such that any read of more than PAGE_SIZE + 1 bytes will fault) + * 4. Call fcntl with the arguments and verify the output is not EFAULT + */ +T_DECL(fcntl_addsig, "Verify that fcntl(F_ADDSIGS) doesn't EFAULT", T_META_NAMESPACE("xnu.vfs")) { + void* blob_space = mmap(NULL, vm_page_size * 3, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + T_ASSERT_NE(blob_space, MAP_FAILED, "Blob Region: %p [%zd]", blob_space, vm_page_size); + + T_ASSERT_POSIX_SUCCESS(munmap((char*)blob_space + (vm_page_size * 2), vm_page_size), NULL); + + size_t blob_size = vm_page_size + 1; + char* blob_start = ((char*)blob_space) + (vm_page_size * 2) - blob_size; + fsignatures_t args = { .fs_file_start = 0, .fs_blob_start = blob_start, .fs_blob_size = blob_size}; + + // Create test file to operate on + const char * tmp_dir = dt_tmpdir(); + char tmp_file_name[PATH_MAX]; + sprintf(tmp_file_name, "%s/foo", tmp_dir); + FILE* tmp_file = fopen(tmp_file_name, "wx"); + fprintf(tmp_file, "Just some random content"); + fclose(tmp_file); + + int fd = open(tmp_file_name, O_RDONLY); + T_ASSERT_POSIX_SUCCESS(fd, "tmp file: %s", tmp_file_name); + + // This command will fail, but should not fail with EFAULT + int result = fcntl(fd, F_ADDSIGS, &args); + int error = errno; + T_QUIET; T_EXPECT_EQ(result, -1, NULL); + // EBADEXEC is expected, but not required for success of this test + T_EXPECT_NE(error, EFAULT, "fcntl: %d (%d:%s)", result, error, strerror(error)); +} diff --git a/tests/in_cksum.c b/tests/in_cksum.c new file mode 100644 index 000000000..2dc3f49e3 --- /dev/null +++ b/tests/in_cksum.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include "in_cksum.h" + +typedef union { + char c[2]; + u_short s; +} short_union_t; + +typedef union { + u_short s[2]; + long l; +} long_union_t; + +static __inline__ void +reduce(int * sum) +{ + long_union_t l_util; + + l_util.l = *sum; + *sum = l_util.s[0] + l_util.s[1]; + if (*sum > 65535) { + *sum -= 65535; + } + return; +} + + +#include + +unsigned short +in_cksum(void * pkt, int len) +{ + u_short * w; + int sum = 0; + + w = (u_short *)pkt; + while ((len -= 32) >= 0) { + sum += w[0]; sum += w[1]; + sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; + sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; + sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; + sum += w[14]; sum += w[15]; + w += 16; + } + len += 32; + while ((len -= 8) >= 0) { + sum += w[0]; sum += w[1]; + sum += w[2]; sum += w[3]; + w += 4; + } + len += 8; + if (len) { + reduce(&sum); + while ((len -= 2) >= 0) { + sum += *w++; + } + } + if (len == -1) { /* odd-length packet */ + short_union_t s_util; + + s_util.s = 0; + s_util.c[0] = *((char *)w); + s_util.c[1] = 0; + sum += s_util.s; + } + reduce(&sum); + return ~sum & 0xffff; +} diff --git a/tests/in_cksum.h b/tests/in_cksum.h new file mode 100644 index 000000000..d84b916d1 --- /dev/null +++ b/tests/in_cksum.h @@ -0,0 +1,27 @@ +#ifndef _S_IN_CKSUM_H +#define _S_IN_CKSUM_H +/* + * Copyright (c) 2000 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +extern unsigned short in_cksum(void * pkt, int len); + +#endif /* _S_IN_CKSUM_H */ diff --git a/tests/iokit/io_catalog_send_data.m b/tests/iokit/io_catalog_send_data.m new file mode 100644 index 000000000..f7cf014a9 --- /dev/null +++ b/tests/iokit/io_catalog_send_data.m @@ -0,0 +1,136 @@ +/* + * io_catalog_send_data.m + * + * A regression test to build an IORegistry entry with mismatching + * IOService and IOUserClientClass via IOCatalogueSendData, to verify + * if exploit risk still exists in IOCatalogueSendData. + * + */ +#include + +#include +#include +#include + +#define kIOClassKey @"IOClass" +#define kIOProviderClassKey @"IOProviderClass" +#define kIOMatchCategoryKey @"IOMatchCategory" +#define kIOUserClientClassKey @"IOUserClientClass" +#define vIOProviderClassValue @"IOResources" + +T_GLOBAL_META(T_META_NAMESPACE("xnu.iokit"), + T_META_RUN_CONCURRENTLY(true)); + +kern_return_t +build_ioregistry_by_catalog_send_data(const char *match_name, + const char *userclient_name, const char *service_name) +{ + kern_return_t kret; + + NSArray *rootCatalogueArray = @[@{ + kIOProviderClassKey: vIOProviderClassValue, + kIOClassKey: @(service_name), + kIOUserClientClassKey: @(userclient_name), + kIOMatchCategoryKey: @(match_name) + }]; + + CFDataRef cfData = IOCFSerialize((__bridge CFTypeRef)rootCatalogueArray, + kIOCFSerializeToBinary); + + kret = IOCatalogueSendData(MACH_PORT_NULL, 1, CFDataGetBytePtr(cfData), + CFDataGetLength(cfData)); + + if (cfData) { + CFRelease(cfData); + } + + return kret; +} + +bool +test_open_ioregistry(const char *match_name, const char *service_name, + bool exploit) +{ + kern_return_t kret; + bool ioreg_found = false; + CFStringRef cfstrMatchName = NULL; + io_connect_t conn = IO_OBJECT_NULL; + io_iterator_t iter = IO_OBJECT_NULL, obj = IO_OBJECT_NULL; + CFMutableDictionaryRef service_info = NULL, properties = NULL; + + service_info = IOServiceMatching(service_name); + kret = IOServiceGetMatchingServices(kIOMasterPortDefault, service_info, &iter); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "IOServiceGetMatchingServices"); + cfstrMatchName = CFStringCreateWithCString(kCFAllocatorDefault, + match_name, kCFStringEncodingUTF8); + + while (obj = IOIteratorNext(iter)) { + kret = IORegistryEntryCreateCFProperties(obj, &properties, + kCFAllocatorDefault, kNilOptions); + if (kret != KERN_SUCCESS) { + T_LOG("IORegistryEntryCreateCFProperties fails, 0x%08X", + (uint32_t)kret); + IOObjectRelease(obj); + continue; + } + + CFStringRef value = CFDictionaryGetValue(properties, CFSTR("IOMatchCategory")); + if (value && CFGetTypeID(value) == CFStringGetTypeID() && + CFEqual(value, cfstrMatchName)) { + ioreg_found = true; + } else { + IOObjectRelease(obj); + continue; + } + + if (!exploit) { + goto bail; + } + + T_LOG("try to exploit by opening io service, possibly panic?"); + IOServiceOpen(obj, mach_task_self(), 0, &conn); + IOObjectRelease(obj); + + break; + } + +bail: + if (cfstrMatchName) { + CFRelease(cfstrMatchName); + } + + if (properties) { + CFRelease(properties); + } + + if (iter != IO_OBJECT_NULL) { + IOObjectRelease(iter); + } + + if (conn != IO_OBJECT_NULL) { + IOServiceClose(conn); + } + + return ioreg_found; +} + +T_DECL(io_catalog_send_data_test, "regression test to build an IORegistry entry" + " with mismatching IOService and IOUserClientClass by IOCatalogueSendData, " + "to verify if exploit risk still exists in IOCatalogueSendData for " + "potential DoS - ") +{ + kern_return_t kret; + + kret = 
build_ioregistry_by_catalog_send_data("fooBar", + "IOSurfaceRootUserClient", "IOReportHub"); +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) + /* this trick to build an entry by io_catalog_send_data should fail */ + T_EXPECT_EQ(kret, kIOReturnNotPrivileged, "build an entry with" + " mismatch IOService and IOUserClientClass by IOCatalogueSendData " + "should fail as kIOReturnNotPrivileged"); +#else + T_EXPECT_EQ(kret, KERN_SUCCESS, "IOCatalogueSendData should return success with kextd"); +#endif + T_EXPECT_FALSE(test_open_ioregistry("fooBar", "IOReportHub", false), + "Mismatched entry built by IOCatalogueSendData should not be opened"); +} diff --git a/tests/kpc.c b/tests/kpc.c index 62b87e68e..7e3236344 100644 --- a/tests/kpc.c +++ b/tests/kpc.c @@ -1,26 +1,298 @@ -/* Copyright (c) 2018 Apple Inc. All rights reserved. */ +// Copyright (c) 2018-2020 Apple Inc. All rights reserved. #include +#include +#include #include +#include +#include #include +#include #include #include +#include + +#include "ktrace_helpers.h" +#include "kperf_helpers.h" T_GLOBAL_META( T_META_NAMESPACE("xnu.ktrace"), T_META_ASROOT(true), T_META_CHECK_LEAKS(false)); -T_DECL(fixed_thread_counters, - "test that fixed thread counters return monotonically increasing values") +struct machine { + unsigned int ncpus; + unsigned int nfixed; + unsigned int nconfig; +}; + +static void +skip_if_unsupported(void) +{ + int r; + int supported = 0; + size_t supported_size = sizeof(supported); + + r = sysctlbyname("kern.monotonic.supported", &supported, &supported_size, + NULL, 0); + if (r < 0) { + T_WITH_ERRNO; + T_SKIP("could not find \"kern.monotonic.supported\" sysctl"); + } + + if (!supported) { + T_SKIP("PMCs are not supported on this platform"); + } +} + +static struct rusage_info_v4 pre_ru = {}; + +static void +start_kpc(void) +{ + T_SETUPBEGIN; + + kpc_classmask_t classes = KPC_CLASS_FIXED_MASK | + KPC_CLASS_CONFIGURABLE_MASK; + int ret = kpc_set_counting(classes); + T_ASSERT_POSIX_SUCCESS(ret, "started counting"); + + ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&pre_ru); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "got rusage information"); + + kpc_classmask_t classes_on = kpc_get_counting(); + T_QUIET; + T_ASSERT_EQ(classes, classes_on, "classes counting is correct"); + + T_SETUPEND; +} + +static void kpc_reset_atend(void); + +#if defined(__arm__) || defined(__arm64__) +#define CYCLES_EVENT 0x02 +#else // defined(__arm__) || defined(__arm64__) +#define CYCLES_EVENT (0x10000 | 0x20000 | 0x3c) +#endif // !defined(__arm__) && !defined(__arm64__) + +static void +prepare_kpc(struct machine *mch, bool config, bool reset) { + T_SETUPBEGIN; + + if (!reset) { + T_ATEND(kpc_reset_atend); + } + + size_t ncpus_sz = sizeof(mch->ncpus); + int ret = sysctlbyname("hw.logicalcpu_max", &mch->ncpus, &ncpus_sz, + NULL, 0); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname(hw.logicalcpu_max)"); + T_QUIET; + T_ASSERT_GT(mch->ncpus, 0, "must have some number of CPUs"); + + ret = kpc_force_all_ctrs_set(1); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kpc_force_all_ctrs_set(1)"); + + int forcing = 0; + ret = kpc_force_all_ctrs_get(&forcing); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kpc_force_all_ctrs_get"); + T_QUIET; T_ASSERT_EQ(forcing, 1, "counters must be forced"); + + mch->nfixed = kpc_get_counter_count(KPC_CLASS_FIXED_MASK); + mch->nconfig = kpc_get_counter_count(KPC_CLASS_CONFIGURABLE_MASK); + + T_LOG("machine: ncpus = %d, nfixed = %d, nconfig = %d", mch->ncpus, + mch->nfixed, mch->nconfig); + + if (config) { + uint32_t nconfigs 
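+	/*
+	 * CYCLES_EVENT (defined above) is meant to program a cycle counter:
+	 * on x86 it is event 0x3c (unhalted core cycles) with the
+	 * 0x10000/0x20000 USR/OS qualifier bits set; on Apple's ARM cores
+	 * event 0x02 appears to be the core-cycle event. Treat the exact
+	 * encodings as illustrative rather than documented.
+	 */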
= kpc_get_config_count( + KPC_CLASS_CONFIGURABLE_MASK); + uint64_t *configs = calloc(nconfigs, sizeof(*configs)); + T_QUIET; T_ASSERT_NOTNULL(configs, "allocated config words"); + + for (unsigned int i = 0; i < nconfigs; i++) { + configs[i] = reset ? 0 : CYCLES_EVENT; + } + + ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, configs); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kpc_set_config"); + } + + T_SETUPEND; +} + +static void +kpc_reset_atend(void) +{ + struct machine mch = {}; + prepare_kpc(&mch, true, true); + uint64_t *periods = calloc(mch.nconfig, sizeof(*periods)); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(periods, "allocate periods array"); + + int ret = kpc_set_period(KPC_CLASS_CONFIGURABLE_MASK, periods); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kpc_set_period"); + free(periods); +} + +static void * +spin(void *arg) +{ + while (*(volatile int *)arg == 0) { + ; + } + + return NULL; +} + +static pthread_t * +start_threads(const struct machine *mch, void *(*func)(void *), void *arg) +{ + T_SETUPBEGIN; + + pthread_t *threads = calloc((unsigned int)mch->ncpus, + sizeof(*threads)); + T_QUIET; T_ASSERT_NOTNULL(threads, "allocated array of threads"); + for (unsigned int i = 0; i < mch->ncpus; i++) { + int error = pthread_create(&threads[i], NULL, func, arg); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "pthread_create"); + } + + T_SETUPEND; + + return threads; +} + +static void +end_threads(const struct machine *mch, pthread_t *threads) +{ + for (unsigned int i = 0; i < mch->ncpus; i++) { + int error = pthread_join(threads[i], NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "joined thread %d", i); + } + free(threads); +} + +struct tally { + uint64_t firstvalue; + uint64_t lastvalue; + uint64_t nchecks; + uint64_t nzero; + uint64_t nstuck; + uint64_t ndecrease; +}; + +static void +check_counters(unsigned int ncpus, unsigned int nctrs, struct tally *tallies, + uint64_t *counts) +{ + for (unsigned int i = 0; i < ncpus; i++) { + for (unsigned int j = 0; j < nctrs; j++) { + unsigned int ctr = i * nctrs + j; + struct tally *tly = &tallies[ctr]; + uint64_t count = counts[ctr]; + + if (counts[ctr] == 0) { + tly->nzero++; + } + if (tly->lastvalue == count) { + tly->nstuck++; + } + if (tly->lastvalue > count) { + tly->ndecrease++; + } + tly->lastvalue = count; + if (tly->nchecks == 0) { + tly->firstvalue = count; + } + tly->nchecks++; + } + } +} +static void +check_tally(const char *name, unsigned int ncpus, unsigned int nctrs, + struct tally *tallies) +{ + for (unsigned int i = 0; i < ncpus; i++) { + for (unsigned int j = 0; j < nctrs; j++) { + unsigned int ctr = i * nctrs + j; + struct tally *tly = &tallies[ctr]; + + T_LOG("CPU %2u PMC %u: nchecks = %llu, last value = %llx, " + "delta = %llu, nstuck = %llu", i, j, + tly->nchecks, tly->lastvalue, tly->lastvalue - tly->firstvalue, + tly->nstuck); + T_QUIET; T_EXPECT_GT(tly->nchecks, 0ULL, + "checked that CPU %d %s counter %d values", i, name, j); + T_QUIET; T_EXPECT_EQ(tly->nzero, 0ULL, + "CPU %d %s counter %d value was zero", i, name, j); + T_QUIET; T_EXPECT_EQ(tly->nstuck, 0ULL, + "CPU %d %s counter %d value was stuck", i, name, j); + T_QUIET; T_EXPECT_EQ(tly->ndecrease, 0ULL, + "CPU %d %s counter %d value decreased", i, name, j); + } + } +} + +#define TESTDUR_NS (5 * NSEC_PER_SEC) + +T_DECL(kpc_cpu_direct_configurable, + "test that configurable counters return monotonically increasing values") +{ + skip_if_unsupported(); + + struct machine mch = {}; + prepare_kpc(&mch, true, false); + + int until = 0; + pthread_t *threads = start_threads(&mch, spin, 
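+	/*
+	 * `until` doubles as the stop flag polled by spin(): it stays 0 while
+	 * the spinner threads keep every CPU busy (so the counters advance),
+	 * and is set to 1 at the end of the test so the threads can exit.
+	 */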
&until); + start_kpc(); + + T_SETUPBEGIN; + + uint64_t startns = clock_gettime_nsec_np(CLOCK_MONOTONIC); + uint64_t *counts = kpc_counterbuf_alloc(); + T_QUIET; T_ASSERT_NOTNULL(counts, "allocated space for counter values"); + memset(counts, 0, sizeof(*counts) * mch.ncpus * (mch.nfixed + mch.nconfig)); + struct tally *tly = calloc(mch.ncpus * mch.nconfig, sizeof(*tly)); + T_QUIET; T_ASSERT_NOTNULL(tly, "allocated space for tallies"); + + T_SETUPEND; + + int n = 0; + while (clock_gettime_nsec_np(CLOCK_MONOTONIC) - startns < TESTDUR_NS) { + int ret = kpc_get_cpu_counters(true, + KPC_CLASS_CONFIGURABLE_MASK, NULL, counts); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kpc_get_cpu_counters"); + + check_counters(mch.ncpus, mch.nconfig, tly, counts); + + usleep(10000); + n++; + if (n % 100 == 0) { + T_LOG("checked 100 times"); + } + } + + check_tally("config", mch.ncpus, mch.nconfig, tly); + + until = 1; + end_threads(&mch, threads); +} + +T_DECL(kpc_thread_direct_instrs_cycles, + "test that fixed thread counters return monotonically increasing values") +{ int err; uint32_t ctrs_cnt; uint64_t *ctrs_a; uint64_t *ctrs_b; + skip_if_unsupported(); + T_SETUPBEGIN; ctrs_cnt = kpc_get_counter_count(KPC_CLASS_FIXED_MASK); @@ -68,10 +340,227 @@ T_DECL(fixed_thread_counters, free(ctrs_b); } +#define PMI_TEST_DURATION_NS (15 * NSEC_PER_SEC) +#define PERIODIC_CPU_COUNT_MS (250) +#define NTIMESLICES (72) +#define PMI_PERIOD (50ULL * 1000 * 1000) +#define END_EVENT KDBG_EVENTID(0xfe, 0xfe, 0) + +struct cpu { + uint64_t prev_count, max_skid; + unsigned int timeslices[NTIMESLICES]; +}; + +T_DECL(kpc_pmi_configurable, + "test that PMIs don't interfere with sampling counters in kperf") +{ + skip_if_unsupported(); + + start_controlling_ktrace(); + struct machine mch = {}; + prepare_kpc(&mch, true, false); + + T_SETUPBEGIN; + + uint64_t *periods = calloc(mch.nconfig, sizeof(*periods)); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(periods, "allocate periods array"); + periods[0] = PMI_PERIOD; + + int ret = kpc_set_period(KPC_CLASS_CONFIGURABLE_MASK, periods); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kpc_set_period"); + free(periods); + + int32_t *actions = calloc(mch.nconfig, sizeof(*actions)); + actions[0] = 1; + ret = kpc_set_actionid(KPC_CLASS_CONFIGURABLE_MASK, actions); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kpc_set_actionid"); + free(actions); + + (void)kperf_action_count_set(1); + ret = kperf_action_samplers_set(1, KPERF_SAMPLER_TINFO); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kperf_action_samplers_set"); + + ktrace_config_t ktconfig = ktrace_config_create_current(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(ktconfig, "create current config"); + ret = ktrace_config_print_description(ktconfig, stdout); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "print config description"); + + struct cpu *cpus = calloc(mch.ncpus, sizeof(*cpus)); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(cpus, "allocate CPUs array"); + + __block unsigned int nsamples = 0; + __block uint64_t first_ns = 0; + __block uint64_t last_ns = 0; + + ktrace_session_t sess = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(sess, "ktrace_session_create"); + + ktrace_events_single(sess, PERF_KPC_PMI, ^(struct trace_point *tp) { + if (tp->debugid & DBG_FUNC_END) { + return; + } + + uint64_t cur_ns = 0; + int cret = ktrace_convert_timestamp_to_nanoseconds(sess, + tp->timestamp, &cur_ns); + T_QUIET; T_ASSERT_POSIX_ZERO(cret, "convert timestamp"); + + uint64_t count = tp->arg2; + if (first_ns == 0) { + first_ns = cur_ns; + } + struct cpu *cpu = 
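+		/*
+		 * Skid accounting: consecutive PMI counter values should differ
+		 * by at least PMI_PERIOD; whatever exceeds the period is
+		 * interrupt-delivery skid, and the per-CPU maximum is tracked
+		 * for the report at the end of the test.
+		 */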
&cpus[tp->cpuid]; + + if (cpu->prev_count != 0) { + uint64_t delta = count - cpu->prev_count; + T_QUIET; T_EXPECT_GT(delta, PMI_PERIOD, + "counter delta should be greater than PMI period"); + uint64_t skid = delta - PMI_PERIOD; + if (skid > cpu->max_skid) { + cpu->max_skid = skid; + } + } + cpu->prev_count = count; + + double slice = (double)(cur_ns - first_ns) / PMI_TEST_DURATION_NS * + NTIMESLICES; + if (slice < NTIMESLICES) { + cpu->timeslices[(unsigned int)slice] += 1; + } + + nsamples++; + }); + + ktrace_events_single(sess, END_EVENT, ^(struct trace_point *tp __unused) { + int cret = ktrace_convert_timestamp_to_nanoseconds(sess, + tp->timestamp, &last_ns); + T_QUIET; T_ASSERT_POSIX_ZERO(cret, "convert timestamp"); + + ktrace_end(sess, 1); + }); + + uint64_t *counts = kpc_counterbuf_alloc(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(counts, + "allocated counter values array"); + memset(counts, 0, sizeof(*counts) * mch.ncpus * (mch.nfixed + mch.nconfig)); + struct tally *tly = calloc(mch.ncpus * (mch.nconfig + mch.nfixed), + sizeof(*tly)); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(tly, "allocated tallies array"); + + dispatch_source_t cpu_count_timer = dispatch_source_create( + DISPATCH_SOURCE_TYPE_TIMER, 0, 0, dispatch_get_main_queue()); + dispatch_source_set_timer(cpu_count_timer, dispatch_time(DISPATCH_TIME_NOW, + PERIODIC_CPU_COUNT_MS * NSEC_PER_MSEC), + PERIODIC_CPU_COUNT_MS * NSEC_PER_MSEC, 0); + dispatch_source_set_cancel_handler(cpu_count_timer, ^{ + dispatch_release(cpu_count_timer); + }); + + __block uint64_t first_check_ns = 0; + __block uint64_t last_check_ns = 0; + + dispatch_source_set_event_handler(cpu_count_timer, ^{ + int cret = kpc_get_cpu_counters(true, + KPC_CLASS_FIXED_MASK | KPC_CLASS_CONFIGURABLE_MASK, NULL, counts); + T_QUIET; T_ASSERT_POSIX_SUCCESS(cret, "kpc_get_cpu_counters"); + + if (!first_check_ns) { + first_check_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC); + } else { + last_check_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC); + } + check_counters(mch.ncpus, mch.nfixed + mch.nconfig, tly, counts); + }); + + int stop = 0; + (void)start_threads(&mch, spin, &stop); + + ktrace_set_completion_handler(sess, ^{ + dispatch_cancel(cpu_count_timer); + + check_tally("config", mch.ncpus, mch.nfixed + mch.nconfig, tly); + + struct rusage_info_v4 post_ru = {}; + int ruret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, + (rusage_info_t *)&post_ru); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ruret, "got rusage information"); + + T_LOG("saw %llu cycles in process", post_ru.ri_cycles - pre_ru.ri_cycles); + uint64_t total = 0; + + unsigned int nsamplecpus = 0; + char sample_slices[NTIMESLICES + 1]; + sample_slices[NTIMESLICES] = '\0'; + for (unsigned int i = 0; i < mch.ncpus; i++) { + memset(sample_slices, '.', sizeof(sample_slices) - 1); + + struct cpu *cpu = &cpus[i]; + unsigned int nsampleslices = 0, ncpusamples = 0, + last_contiguous = 0; + bool seen_empty = false; + for (unsigned int j = 0; j < NTIMESLICES; j++) { + unsigned int nslice = cpu->timeslices[j]; + nsamples += nslice; + ncpusamples += nslice; + if (nslice > 0) { + nsampleslices++; + sample_slices[j] = '*'; + } else { + seen_empty = true; + } + if (!seen_empty) { + last_contiguous = j; + } + } + unsigned int ctr = i * (mch.nfixed + mch.nconfig) + mch.nfixed; + uint64_t delta = tly[ctr].lastvalue - tly[ctr].firstvalue; + T_LOG("%g GHz", (double)delta / (last_check_ns - first_check_ns)); + total += delta; + T_LOG("CPU %2u: %4u/%u, %6u/%llu, max skid = %llu (%.1f%%), " + "last contiguous = %u", i, + nsampleslices, NTIMESLICES, 
ncpusamples, delta / PMI_PERIOD,
+		    cpu->max_skid, (double)cpu->max_skid / PMI_PERIOD * 100,
+		    last_contiguous);
+		T_LOG("%s", sample_slices);
+		if (nsampleslices > 0) {
+			nsamplecpus++;
+		}
+		T_EXPECT_EQ(last_contiguous, NTIMESLICES - 1,
+		    "CPU %2u: saw samples in each time slice", i);
+	}
+	T_LOG("kpc reported %llu total cycles", total);
+	T_LOG("saw %u sample events, across %u/%u cpus", nsamples, nsamplecpus,
+	    mch.ncpus);
+	T_END;
+});
+
+	int dbglvl = 3;
+	ret = sysctlbyname("kperf.debug_level", NULL, NULL, &dbglvl,
+	    sizeof(dbglvl));
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "set kperf debug level");
+	ret = kperf_sample_set(1);
+	T_ASSERT_POSIX_SUCCESS(ret, "kperf_sample_set");
+
+	start_kpc();
+
+	int error = ktrace_start(sess, dispatch_get_main_queue());
+	T_ASSERT_POSIX_ZERO(error, "started tracing");
+
+	dispatch_after(dispatch_time(DISPATCH_TIME_NOW, PMI_TEST_DURATION_NS),
+	    dispatch_get_main_queue(), ^{
+		T_LOG("ending tracing after timeout");
+		kdebug_trace(END_EVENT, 0, 0, 0, 0);
+	});
+
+	dispatch_activate(cpu_count_timer);
+
+	T_SETUPEND;
+
+	dispatch_main();
+}
+
 #if defined(__arm64__)
-/*
- * This policy only applies to arm64 devices.
- */
+// This policy only applies to arm64 devices.
 
 static int g_prev_disablewl = 0;
 
@@ -87,7 +576,7 @@ whitelist_atend(void)
 T_DECL(whitelist, "ensure kpc's whitelist is filled out")
 {
-	/* Start enforcing the whitelist. */
+	// Start enforcing the whitelist.
 	int set = 0;
 	size_t getsz = sizeof(g_prev_disablewl);
 	int ret = sysctlbyname("kpc.disable_whitelist", &g_prev_disablewl, &getsz,
@@ -102,31 +591,29 @@ T_DECL(whitelist, "ensure kpc's whitelist is filled out")
 	uint32_t nconfigs = kpc_get_config_count(KPC_CLASS_CONFIGURABLE_MASK);
 	uint64_t *config = calloc(nconfigs, sizeof(*config));
 
-	/*
-	 * Check that events in the whitelist are allowed. CORE_CYCLE (0x2) is
-	 * always present in the whitelist.
-	 */
+	// Check that events in the whitelist are allowed. CORE_CYCLE (0x2) is
+	// always present in the whitelist.
 	config[0] = 0x02;
 	ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
 	T_ASSERT_POSIX_SUCCESS(ret, "configured kpc to count cycles");
 
-	/* Check that non-event bits are ignored by the whitelist. */
+	// Check that non-event bits are ignored by the whitelist.
 	config[0] = 0x102;
 	ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
 	T_ASSERT_POSIX_SUCCESS(ret, "configured kpc to count cycles with non-event bits set");
 
-	/* Check that configurations of non-whitelisted events fail. */
+	// Check that configurations of non-whitelisted events fail.
 	config[0] = 0xfe;
 	ret = kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
 	T_ASSERT_POSIX_FAILURE(ret, EPERM, "shouldn't allow arbitrary events with whitelist enabled");
 
-	/* Clean up the configuration. */
+	// Clean up the configuration.
 	config[0] = 0;
 	(void)kpc_set_config(KPC_CLASS_CONFIGURABLE_MASK, config);
 	free(config);
 }
 
-#endif /* defined(__arm64__) */
+#endif // defined(__arm64__)
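The kpc_pmi_configurable test above goes through a fairly long setup sequence, but arming a PMI with kpc reduces to publishing a period and an action ID for each configurable counter. The following is a minimal sketch built only from the calls the diff itself uses (kpc_get_config_count, kpc_set_period, kpc_set_actionid); the helper name arm_pmi_on_first_counter and the private-header include path are illustrative assumptions, not part of the change:

#include <stdint.h>
#include <stdlib.h>
#include <kperf/kpc.h> /* assumed: private kpc interface used by the tests */

/*
 * Sketch of the PMI arming sequence: program a period into the first
 * configurable counter and attach a kperf action ID to run whenever
 * that counter overflows.
 */
static int
arm_pmi_on_first_counter(uint64_t period, int32_t action_id)
{
	uint32_t nconfigs = kpc_get_config_count(KPC_CLASS_CONFIGURABLE_MASK);
	uint64_t *periods = calloc(nconfigs, sizeof(*periods));
	int32_t *actions = calloc(nconfigs, sizeof(*actions));
	int ret = -1;

	if (periods != NULL && actions != NULL) {
		periods[0] = period;    /* 0 leaves a counter un-armed */
		actions[0] = action_id; /* 0 means "run no action" */
		ret = kpc_set_period(KPC_CLASS_CONFIGURABLE_MASK, periods);
		if (ret == 0) {
			ret = kpc_set_actionid(KPC_CLASS_CONFIGURABLE_MASK,
			    actions);
		}
	}
	free(periods);
	free(actions);
	return ret;
}

The test then attaches samplers to the action with kperf_action_samplers_set() and starts counting, which is the remainder of the setup shown above.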
diff --git a/tests/kperf.c b/tests/kperf.c
index 29ceeab7d..c74b9671a 100644
--- a/tests/kperf.c
+++ b/tests/kperf.c
@@ -52,6 +52,7 @@ spinning_thread(void *semp)
 #define PERF_KPC_REG KDBG_EVENTID(DBG_PERF, 6, 5)
 #define PERF_KPC_REG32 KDBG_EVENTID(DBG_PERF, 6, 7)
 #define PERF_INSTR_DATA KDBG_EVENTID(DBG_PERF, 1, 17)
+#define PERF_EVENT KDBG_EVENTID(DBG_PERF, 0, 0)
 
 #define SCHED_HANDOFF KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, \
     MACH_STACK_HANDOFF)
diff --git a/tests/kperf_helpers.h b/tests/kperf_helpers.h
index 466f3d9a7..b31cc4dad 100644
--- a/tests/kperf_helpers.h
+++ b/tests/kperf_helpers.h
@@ -5,4 +5,7 @@
 
 void configure_kperf_stacks_timer(pid_t pid, unsigned int period_ms);
 
+#define PERF_SAMPLE KDBG_EVENTID(DBG_PERF, 0, 0)
+#define PERF_KPC_PMI KDBG_EVENTID(DBG_PERF, 6, 0)
+
 #endif /* !defined(KPERF_HELPERS_H) */
diff --git a/tests/launchd_plists/com.apple.xnu.test.task_create_suid_cred.plist b/tests/launchd_plists/com.apple.xnu.test.task_create_suid_cred.plist
new file mode 100644
index 000000000..463d039bf
--- /dev/null
+++ b/tests/launchd_plists/com.apple.xnu.test.task_create_suid_cred.plist
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>Label</key>
+	<string>com.apple.xnu.test.task_create_suid_cred</string>
+	<key>MachServices</key>
+	<dict>
+		<key>com.apple.xnu.test.task_create_suid_cred</key>
+		<true/>
+	</dict>
+	<key>ThrottleInterval</key>
+	<integer>1</integer>
+	<key>UserName</key>
+	<string>root</string>
+	<key>ProcessType</key>
+	<string>Adaptive</string>
+	<key>EnvironmentVariables</key>
+	<dict>
+		<key>MallocNanoZone</key>
+		<string>1</string>
+	</dict>
+</dict>
+</plist>
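The plist above has launchd run a root helper that publishes a single Mach service. On the test side, checking in with such a service conventionally starts with a bootstrap lookup. A hypothetical sketch of that client step follows; the helper name suid_cred_service_port is invented for illustration, and the actual request protocol for the task_create_suid_cred test is not part of this hunk:

#include <darwintest.h>
#include <mach/mach.h>
#include <servers/bootstrap.h>

#define SUID_CRED_SERVICE "com.apple.xnu.test.task_create_suid_cred"

/* Hypothetical client-side lookup of the Mach service registered above. */
static mach_port_t
suid_cred_service_port(void)
{
	mach_port_t port = MACH_PORT_NULL;
	kern_return_t kr = bootstrap_look_up(bootstrap_port,
	    SUID_CRED_SERVICE, &port);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "bootstrap_look_up %s",
	    SUID_CRED_SERVICE);
	return port;
}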
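The memorystatus_freeze_test.c changes below add a budget_replenishment test around a fixed-point budget calculation. The expected value is easier to follow with concrete numbers; this worked example assumes a hypothetical 16 KiB page size and a 1024 MB daily freeze budget (the kernel supplies the real values via sysctl):

/*
 * Worked example of the replenishment arithmetic checked by the
 * budget_replenishment test below, with hypothetical inputs.
 */
unsigned int page_size = 16 * 1024;
unsigned int daily_pages_max = 1024u * 1024 * 1024 / page_size; /* 65536 pages */
unsigned int interval_secs = 60 * 60 * 32; /* interval expired 32 hours ago */
unsigned int secs_per_day = 60 * 60 * 24;
unsigned int factor = 100;                 /* two-digit fixed point */

/*
 * One fresh daily budget, plus 32 hours' worth at the daily rate:
 * 115200 * 100 / 86400 = 133; 133 * 65536 / 100 = 87162;
 * 65536 + 87162 = 152698 pages expected.
 */
unsigned int expected = daily_pages_max
    + ((interval_secs * factor) / secs_per_day * daily_pages_max) / factor;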
diff --git a/tests/memorystatus_freeze_test.c b/tests/memorystatus_freeze_test.c
index c9399519a..471312f80 100644
--- a/tests/memorystatus_freeze_test.c
+++ b/tests/memorystatus_freeze_test.c
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include /* Needed for vm_region info */
@@ -36,7 +37,7 @@ T_GLOBAL_META(
 	X(MEMORYSTATUS_CONTROL_FAILED) \
 	X(IS_FREEZABLE_NOT_AS_EXPECTED) \
 	X(MEMSTAT_PRIORITY_CHANGE_FAILED) \
-	X(INVALID_ALLOCATE_PAGES_ARGUMENTS) \
+	X(INVALID_ALLOCATE_PAGES_ARGUMENTS) \
 	X(EXIT_CODE_MAX)
 
 #define EXIT_CODES_ENUM(VAR) VAR,
@@ -599,6 +600,7 @@ memorystatus_assertion_test_demote_frozen()
 	/* these values will remain fixed during testing */
 	int active_limit_mb = 15; /* arbitrary */
 	int inactive_limit_mb = 7; /* arbitrary */
+	int demote_value = 1;
 
 	/* Launch the child process, and elevate its priority */
 	int requestedpriority;
 	dispatch_source_t ds_signal, ds_exit;
@@ -613,8 +615,8 @@ memorystatus_assertion_test_demote_frozen()
 		/* Freeze the process, trigger aggressive demotion, and check that it hasn't been demoted. */
 		freeze_process(child_pid);
 		/* Aggressive demotion */
-		sysctl_ret = sysctlbyname("kern.memorystatus_demote_frozen_processes", NULL, NULL, NULL, 0);
-		T_ASSERT_POSIX_SUCCESS(sysctl_ret, "sysctl kern.memorystatus_demote_frozen_processes failed");
+		sysctl_ret = sysctlbyname("kern.memorystatus_demote_frozen_processes", NULL, NULL, &demote_value, sizeof(demote_value));
+		T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctl_ret, "sysctl kern.memorystatus_demote_frozen_processes succeeded");
 		/* Check */
 		(void)check_properties(child_pid, requestedpriority, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_SET, "Priority was set");
 		T_LOG("Relinquishing our assertion.");
@@ -622,7 +624,7 @@ memorystatus_assertion_test_demote_frozen()
 		relinquish_assertion_priority(child_pid, 0x0);
 		(void)check_properties(child_pid, JETSAM_PRIORITY_AGING_BAND2, inactive_limit_mb, 0x0, ASSERTION_STATE_IS_RELINQUISHED, "Assertion was relinquished.");
 		/* Kill the child */
-		T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Unable to kill child process");
+		T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Killed child process");
 		T_END;
 	});
 
@@ -650,3 +652,41 @@ T_DECL(assertion_test_demote_frozen, "demoted frozen process goes to asserted priority.", T_META_ASROOT(true)) {
 	memorystatus_assertion_test_demote_frozen();
 }
+
+T_DECL(budget_replenishment, "budget replenishes properly") {
+	size_t length;
+	int ret;
+	static unsigned int kTestIntervalSecs = 60 * 60 * 32; // 32 Hours
+	unsigned int memorystatus_freeze_daily_mb_max, memorystatus_freeze_daily_pages_max;
+	static unsigned int kFixedPointFactor = 100;
+	static unsigned int kNumSecondsInDay = 60 * 60 * 24;
+	unsigned int new_budget, expected_new_budget_pages;
+	size_t new_budget_ln;
+	unsigned int page_size = (unsigned int) get_vmpage_size();
+
+	/*
+	 * Calculate a new budget as if the previous interval expired kTestIntervalSecs
+	 * ago and we used up its entire budget.
+	 */
+	length = sizeof(kTestIntervalSecs);
+	new_budget_ln = sizeof(new_budget);
+	ret = sysctlbyname("vm.memorystatus_freeze_calculate_new_budget", &new_budget, &new_budget_ln, &kTestIntervalSecs, length);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "vm.memorystatus_freeze_calculate_new_budget");
+
+	// Grab the daily budget.
+	length = sizeof(memorystatus_freeze_daily_mb_max);
+	ret = sysctlbyname("kern.memorystatus_freeze_daily_mb_max", &memorystatus_freeze_daily_mb_max, &length, NULL, 0);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.memorystatus_freeze_daily_mb_max");
+
+	memorystatus_freeze_daily_pages_max = memorystatus_freeze_daily_mb_max * 1024 * 1024 / page_size;
+
+	/*
+	 * We're kTestIntervalSecs past the start of a new interval, so we are
+	 * owed kTestIntervalSecs seconds' worth of budget at the daily rate,
+	 * on top of the fresh daily budget.
+	 */
+	expected_new_budget_pages = memorystatus_freeze_daily_pages_max;
+	expected_new_budget_pages += ((kTestIntervalSecs * kFixedPointFactor) / (kNumSecondsInDay)
+	    * memorystatus_freeze_daily_pages_max) / kFixedPointFactor;
+
+	T_QUIET; T_ASSERT_EQ(new_budget, expected_new_budget_pages, "Calculate new budget behaves correctly.");
+}
diff --git a/tests/net_bridge.c b/tests/net_bridge.c
new file mode 100644
index 000000000..54ad5b67c
--- /dev/null
+++ b/tests/net_bridge.c
@@ -0,0 +1,3587 @@
+/*
+ * Copyright (c) 2019 Apple Inc. All rights reserved.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * net_bridge.c + * - test if_bridge.c functionality + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpflib.h" +#include "in_cksum.h" + +static bool S_debug; +static bool S_cleaning_up; + +#define ALL_ADDRS (uint32_t)(-1) + +#define DHCP_PAYLOAD_MIN sizeof(struct bootp) +#define DHCP_FLAGS_BROADCAST ((u_short)0x8000) + +typedef union { + char bytes[DHCP_PAYLOAD_MIN]; + /* force 4-byte alignment */ + uint32_t words[DHCP_PAYLOAD_MIN / sizeof(uint32_t)]; +} dhcp_min_payload, *dhcp_min_payload_t; + +#define ETHER_PKT_LEN (ETHER_HDR_LEN + ETHERMTU) +typedef union { + char bytes[ETHER_PKT_LEN]; + /* force 4-byte aligment */ + uint32_t words[ETHER_PKT_LEN / sizeof(uint32_t)]; +} ether_packet, *ether_packet_t; + +typedef struct { + struct ip ip; + struct udphdr udp; +} ip_udp_header_t; + +typedef struct { + struct in_addr src_ip; + struct in_addr dst_ip; + char zero; + char proto; + unsigned short length; +} udp_pseudo_hdr_t; + +typedef struct { + struct ip ip; + struct tcphdr tcp; +} ip_tcp_header_t; + +typedef union { + ip_udp_header_t udp; + ip_tcp_header_t tcp; +} ip_udp_tcp_header_u; + +typedef struct { + struct in_addr src_ip; + struct in_addr dst_ip; + char zero; + char proto; + unsigned short length; +} tcp_pseudo_hdr_t; + +typedef struct { + struct ip6_hdr ip6; + struct udphdr udp; +} ip6_udp_header_t; + +typedef struct { + struct in6_addr src_ip; + struct in6_addr dst_ip; + char zero; + char proto; + unsigned short length; +} udp6_pseudo_hdr_t; + +typedef struct { + char ifname[IFNAMSIZ]; + char member_ifname[IFNAMSIZ]; /* member of bridge */ + ether_addr_t member_mac; + int fd; + u_int unit; + u_int num_addrs; + void * rx_buf; + int rx_buf_size; + bool mac_nat; + + u_int test_count; + u_int test_address_count; + uint64_t test_address_present; +} switch_port, *switch_port_t; + +typedef struct { + u_int size; + u_int count; + bool mac_nat; + switch_port list[1]; +} switch_port_list, * switch_port_list_t; + +static struct ifbareq * 
+bridge_rt_table_copy(u_int * ret_count); + +static void +bridge_rt_table_log(struct ifbareq *rt_table, u_int count); + +static struct ifbrmne * +bridge_mac_nat_entries_copy(u_int * ret_count); + +static void +bridge_mac_nat_entries_log(struct ifbrmne * entries, u_int count); + +static void +system_cmd(const char *cmd, bool fail_on_error); + +static int +inet_dgram_socket(void) +{ + int s; + + s = socket(AF_INET, SOCK_DGRAM, 0); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(s, "socket(AF_INET, SOCK_DGRAM, 0)"); + return s; +} + + +/** +** Packet creation/display +**/ +#define BOOTP_SERVER_PORT 67 +#define BOOTP_CLIENT_PORT 68 + +#define TEST_SOURCE_PORT 14 +#define TEST_DEST_PORT 15 + +#define EA_UNIT_INDEX 4 +#define EA_ADDR_INDEX 5 + +static void +set_ethernet_address(ether_addr_t *eaddr, u_int unit, u_int addr_index) +{ + u_char *a = eaddr->octet; + + a[0] = 0x02; + a[2] = 0x00; + a[3] = 0x00; + a[1] = 0x00; + a[EA_UNIT_INDEX] = (u_char)unit; + a[EA_ADDR_INDEX] = (u_char)addr_index; +} + +#define TEN_NET 0x0a000000 +#define TEN_1_NET (TEN_NET | 0x010000) + +static void +get_ipv4_address(u_int unit, u_int addr_index, struct in_addr *ip) +{ + /* up to 255 units, 255 addresses */ + ip->s_addr = htonl(TEN_1_NET | (unit << 8) | addr_index); + return; +} + +#define IN6ADDR_ULA_INIT \ + {{{ 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} + +static struct in6_addr ula_address = IN6ADDR_ULA_INIT; + +#define ULA_UNIT_INDEX 14 +#define ULA_ADDR_INDEX 15 + +static void +get_ipv6_address(u_int unit, u_int addr_index, struct in6_addr *ip) +{ + *ip = ula_address; + /* up to 255 units, 255 addresses */ + ip->s6_addr[ULA_UNIT_INDEX] = (uint8_t)unit; + ip->s6_addr[ULA_ADDR_INDEX] = (uint8_t)addr_index; +} + + +static void +get_ip_address(uint8_t af, u_int unit, u_int addr_index, union ifbrip *ip) +{ + switch (af) { + case AF_INET: + get_ipv4_address(unit, addr_index, &ip->ifbrip_addr); + break; + case AF_INET6: + get_ipv6_address(unit, addr_index, &ip->ifbrip_addr6); + break; + default: + T_FAIL("unrecognized address family %u", af); + break; + } +} + +static bool +ip_addresses_are_equal(uint8_t af, union ifbrip * ip1, union ifbrip * ip2) +{ + bool equal; + + switch (af) { + case AF_INET: + equal = (ip1->ifbrip_addr.s_addr == ip2->ifbrip_addr.s_addr); + break; + case AF_INET6: + equal = IN6_ARE_ADDR_EQUAL(&ip1->ifbrip_addr6, + &ip2->ifbrip_addr6); + break; + default: + T_FAIL("unrecognized address family %u", af); + equal = false; + break; + } + return equal; +} + +static ether_addr_t ether_broadcast = { + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } +}; + +static ether_addr_t ether_external = { + { 0x80, 0x00, 0x00, 0x00, 0x00, 0x01 } +}; + +static inline struct in_addr +get_external_ipv4_address(void) +{ + struct in_addr ip; + + /* IP 10.1.255.1 */ + ip.s_addr = htonl(TEN_1_NET | 0xff01); + return ip; +} + +static inline void +get_external_ip_address(uint8_t af, union ifbrip * ip) +{ + switch (af) { + case AF_INET: + /* IP 10.1.255.1 */ + ip->ifbrip_addr = get_external_ipv4_address(); + break; + case AF_INET6: + /* fd80::1 */ + ip->ifbrip_addr6 = ula_address; + ip->ifbrip_addr6.s6_addr[1] = 0x80; + ip->ifbrip_addr6.s6_addr[15] = 0x01; + break; + default: + T_FAIL("unrecognized address family %u", af); + break; + } +} + +static inline void +get_broadcast_ip_address(uint8_t af, union ifbrip * ip) +{ + switch (af) { + case AF_INET: + ip->ifbrip_addr.s_addr = INADDR_BROADCAST; + break; + case AF_INET6: + /* 0xff0e::0 linklocal scope multicast */ + ip->ifbrip_addr6 = 
in6addr_any; + ip->ifbrip_addr6.s6_addr[0] = 0xff; + ip->ifbrip_addr6.s6_addr[1] = __IPV6_ADDR_SCOPE_LINKLOCAL; + break; + default: + T_FAIL("unrecognized address family %u", af); + break; + } +} + + +#define ETHER_NTOA_BUFSIZE (ETHER_ADDR_LEN * 3) +static const char * +ether_ntoa_buf(const ether_addr_t *n, char * buf, int buf_size) +{ + char * str; + + str = ether_ntoa(n); + strlcpy(buf, str, buf_size); + return buf; +} + +static const char * +inet_ptrtop(int af, const void * ptr, char * buf, socklen_t buf_size) +{ + union { + struct in_addr ip; + struct in6_addr ip6; + } u; + + switch (af) { + case AF_INET: + bcopy(ptr, &u.ip, sizeof(u.ip)); + break; + case AF_INET6: + bcopy(ptr, &u.ip6, sizeof(u.ip6)); + break; + default: + return NULL; + } + return inet_ntop(af, &u, buf, buf_size); +} + +static __inline__ char * +arpop_name(u_int16_t op) +{ + switch (op) { + case ARPOP_REQUEST: + return "ARP REQUEST"; + case ARPOP_REPLY: + return "ARP REPLY"; + case ARPOP_REVREQUEST: + return "REVARP REQUEST"; + case ARPOP_REVREPLY: + return "REVARP REPLY"; + default: + break; + } + return ""; +} + +static void +arp_frame_validate(const struct ether_arp * earp, u_int len, bool dump) +{ + const struct arphdr * arp_p; + int arphrd; + char buf_sender_ether[ETHER_NTOA_BUFSIZE]; + char buf_sender_ip[INET_ADDRSTRLEN]; + char buf_target_ether[ETHER_NTOA_BUFSIZE]; + char buf_target_ip[INET_ADDRSTRLEN]; + + T_QUIET; + T_ASSERT_GE(len, (u_int)sizeof(*earp), + "%s ARP packet size %u need %u", + __func__, len, (u_int)sizeof(*earp)); + if (!dump) { + return; + } + arp_p = &earp->ea_hdr; + arphrd = ntohs(arp_p->ar_hrd); + T_LOG("%s type=0x%x proto=0x%x", arpop_name(ntohs(arp_p->ar_op)), + arphrd, ntohs(arp_p->ar_pro)); + if (arp_p->ar_hln == sizeof(earp->arp_sha)) { + ether_ntoa_buf((const ether_addr_t *)earp->arp_sha, + buf_sender_ether, + sizeof(buf_sender_ether)); + ether_ntoa_buf((const ether_addr_t *)earp->arp_tha, + buf_target_ether, + sizeof(buf_target_ether)); + T_LOG("Sender H/W\t%s", buf_sender_ether); + T_LOG("Target H/W\t%s", buf_target_ether); + } + inet_ptrtop(AF_INET, earp->arp_spa, + buf_sender_ip, sizeof(buf_sender_ip)); + inet_ptrtop(AF_INET, earp->arp_tpa, + buf_target_ip, sizeof(buf_target_ip)); + T_LOG("Sender IP\t%s", buf_sender_ip); + T_LOG("Target IP\t%s", buf_target_ip); + return; +} + +static void +ip_frame_validate(const void * buf, u_int buf_len, bool dump) +{ + char buf_dst[INET_ADDRSTRLEN]; + char buf_src[INET_ADDRSTRLEN]; + const ip_udp_header_t * ip_udp; + u_int ip_len; + + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)sizeof(struct ip), NULL); + ip_udp = (const ip_udp_header_t *)buf; + ip_len = ntohs(ip_udp->ip.ip_len); + inet_ptrtop(AF_INET, &ip_udp->ip.ip_src, + buf_src, sizeof(buf_src)); + inet_ptrtop(AF_INET, &ip_udp->ip.ip_dst, + buf_dst, sizeof(buf_dst)); + if (dump) { + T_LOG("ip src %s dst %s len %u id %d", + buf_src, buf_dst, ip_len, + ntohs(ip_udp->ip.ip_id)); + } + T_QUIET; + T_ASSERT_GE(buf_len, ip_len, NULL); + T_QUIET; + T_ASSERT_EQ(ip_udp->ip.ip_v, IPVERSION, NULL); + T_QUIET; + T_ASSERT_EQ((u_int)(ip_udp->ip.ip_hl << 2), + (u_int)sizeof(struct ip), NULL); + if (ip_udp->ip.ip_p == IPPROTO_UDP) { + u_int udp_len; + u_int data_len; + + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)sizeof(*ip_udp), NULL); + udp_len = ntohs(ip_udp->udp.uh_ulen); + T_QUIET; + T_ASSERT_GE(udp_len, (u_int)sizeof(ip_udp->udp), NULL); + data_len = udp_len - (u_int)sizeof(ip_udp->udp); + if (dump) { + T_LOG("udp src 0x%x dst 0x%x len %u" + " csum 0x%x datalen %u", + ntohs(ip_udp->udp.uh_sport), + 
ntohs(ip_udp->udp.uh_dport), + udp_len, + ntohs(ip_udp->udp.uh_sum), + data_len); + } + } +} + +static void +ip6_frame_validate(const void * buf, u_int buf_len, bool dump) +{ + char buf_dst[INET6_ADDRSTRLEN]; + char buf_src[INET6_ADDRSTRLEN]; + const struct ip6_hdr * ip6; + u_int ip6_len; + + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)sizeof(struct ip6_hdr), NULL); + ip6 = (const struct ip6_hdr *)buf; + ip6_len = ntohs(ip6->ip6_plen); + inet_ptrtop(AF_INET6, &ip6->ip6_src, buf_src, sizeof(buf_src)); + inet_ptrtop(AF_INET6, &ip6->ip6_dst, buf_dst, sizeof(buf_dst)); + if (dump) { + T_LOG("ip6 src %s dst %s len %u", buf_src, buf_dst, ip6_len); + } + T_QUIET; + T_ASSERT_GE(buf_len, ip6_len + (u_int)sizeof(struct ip6_hdr), NULL); + T_QUIET; + T_ASSERT_EQ((ip6->ip6_vfc & IPV6_VERSION_MASK), + IPV6_VERSION, NULL); + T_QUIET; + switch (ip6->ip6_nxt) { + case IPPROTO_UDP: { + u_int data_len; + const ip6_udp_header_t *ip6_udp; + u_int udp_len; + + ip6_udp = (const ip6_udp_header_t *)buf; + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)sizeof(*ip6_udp), NULL); + udp_len = ntohs(ip6_udp->udp.uh_ulen); + T_QUIET; + T_ASSERT_GE(udp_len, (u_int)sizeof(ip6_udp->udp), NULL); + data_len = udp_len - (u_int)sizeof(ip6_udp->udp); + if (dump) { + T_LOG("udp src 0x%x dst 0x%x len %u" + " csum 0x%x datalen %u", + ntohs(ip6_udp->udp.uh_sport), + ntohs(ip6_udp->udp.uh_dport), + udp_len, + ntohs(ip6_udp->udp.uh_sum), + data_len); + } + break; + } + case IPPROTO_ICMPV6: { + const struct icmp6_hdr *icmp6; + u_int icmp6_len; + + icmp6_len = buf_len - sizeof(*ip6); + T_QUIET; + T_ASSERT_GE(buf_len, icmp6_len, NULL); + icmp6 = (const struct icmp6_hdr *)(ip6 + 1); + switch (icmp6->icmp6_type) { + case ND_NEIGHBOR_SOLICIT: + if (dump) { + T_LOG("neighbor solicit"); + } + break; + case ND_NEIGHBOR_ADVERT: + if (dump) { + T_LOG("neighbor advert"); + } + break; + case ND_ROUTER_SOLICIT: + if (dump) { + T_LOG("router solicit"); + } + break; + default: + if (dump) { + T_LOG("icmp6 code 0x%x", icmp6->icmp6_type); + } + break; + } + break; + } + default: + break; + } +} + +static void +ethernet_frame_validate(const void * buf, u_int buf_len, bool dump) +{ + char ether_dst[ETHER_NTOA_BUFSIZE]; + char ether_src[ETHER_NTOA_BUFSIZE]; + uint16_t ether_type; + const ether_header_t * eh_p; + + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)sizeof(*eh_p), NULL); + eh_p = (const ether_header_t *)buf; + ether_type = ntohs(eh_p->ether_type); + ether_ntoa_buf((const ether_addr_t *)&eh_p->ether_dhost, + ether_dst, sizeof(ether_dst)); + ether_ntoa_buf((const ether_addr_t *)&eh_p->ether_shost, + ether_src, sizeof(ether_src)); + if (dump) { + T_LOG("ether dst %s src %s type 0x%x", + ether_dst, ether_src, ether_type); + } + switch (ether_type) { + case ETHERTYPE_IP: + ip_frame_validate(eh_p + 1, (u_int)(buf_len - sizeof(*eh_p)), + dump); + break; + case ETHERTYPE_ARP: + arp_frame_validate((const struct ether_arp *)(eh_p + 1), + (u_int)(buf_len - sizeof(*eh_p)), + dump); + break; + case ETHERTYPE_IPV6: + ip6_frame_validate(eh_p + 1, (u_int)(buf_len - sizeof(*eh_p)), + dump); + break; + default: + T_FAIL("unrecognized ethertype 0x%x", ether_type); + break; + } +} + +static u_int +ethernet_udp4_frame_populate(void * buf, size_t buf_len, + const ether_addr_t * src, + struct in_addr src_ip, + uint16_t src_port, + const ether_addr_t * dst, + struct in_addr dst_ip, + uint16_t dst_port, + const void * data, u_int data_len) +{ + ether_header_t * eh_p; + u_int frame_length; + static int ip_id; + ip_udp_header_t * ip_udp; + char * payload; + udp_pseudo_hdr_t * udp_pseudo; + + 
frame_length = (u_int)(sizeof(*eh_p) + sizeof(*ip_udp)) + data_len; + if (buf_len < frame_length) { + return 0; + } + + /* determine frame offsets */ + eh_p = (ether_header_t *)buf; + ip_udp = (ip_udp_header_t *)(void *)(eh_p + 1); + udp_pseudo = (udp_pseudo_hdr_t *)(void *) + (((char *)&ip_udp->udp) - sizeof(*udp_pseudo)); + payload = (char *)(eh_p + 1) + sizeof(*ip_udp); + + /* ethernet_header */ + bcopy(src, eh_p->ether_shost, ETHER_ADDR_LEN); + bcopy(dst, eh_p->ether_dhost, ETHER_ADDR_LEN); + eh_p->ether_type = htons(ETHERTYPE_IP); + + /* copy the data */ + bcopy(data, payload, data_len); + + /* fill in UDP pseudo header (gets overwritten by IP header below) */ + bcopy(&src_ip, &udp_pseudo->src_ip, sizeof(src_ip)); + bcopy(&dst_ip, &udp_pseudo->dst_ip, sizeof(dst_ip)); + udp_pseudo->zero = 0; + udp_pseudo->proto = IPPROTO_UDP; + udp_pseudo->length = htons(sizeof(ip_udp->udp) + data_len); + + /* fill in UDP header */ + ip_udp->udp.uh_sport = htons(src_port); + ip_udp->udp.uh_dport = htons(dst_port); + ip_udp->udp.uh_ulen = htons(sizeof(ip_udp->udp) + data_len); + ip_udp->udp.uh_sum = 0; + ip_udp->udp.uh_sum = in_cksum(udp_pseudo, (int)(sizeof(*udp_pseudo) + + sizeof(ip_udp->udp) + data_len)); + + /* fill in IP header */ + bzero(ip_udp, sizeof(ip_udp->ip)); + ip_udp->ip.ip_v = IPVERSION; + ip_udp->ip.ip_hl = sizeof(struct ip) >> 2; + ip_udp->ip.ip_ttl = MAXTTL; + ip_udp->ip.ip_p = IPPROTO_UDP; + bcopy(&src_ip, &ip_udp->ip.ip_src, sizeof(src_ip)); + bcopy(&dst_ip, &ip_udp->ip.ip_dst, sizeof(dst_ip)); + ip_udp->ip.ip_len = htons(sizeof(*ip_udp) + data_len); + ip_udp->ip.ip_id = htons(ip_id++); + + /* compute the IP checksum */ + ip_udp->ip.ip_sum = 0; /* needs to be zero for checksum */ + ip_udp->ip.ip_sum = in_cksum(&ip_udp->ip, sizeof(ip_udp->ip)); + + return frame_length; +} + +static u_int +ethernet_udp6_frame_populate(void * buf, size_t buf_len, + const ether_addr_t * src, + struct in6_addr *src_ip, + uint16_t src_port, + const ether_addr_t * dst, + struct in6_addr * dst_ip, + uint16_t dst_port, + const void * data, u_int data_len) +{ + ether_header_t * eh_p; + u_int frame_length; + ip6_udp_header_t * ip6_udp; + char * payload; + udp6_pseudo_hdr_t * udp6_pseudo; + + frame_length = (u_int)(sizeof(*eh_p) + sizeof(*ip6_udp)) + data_len; + if (buf_len < frame_length) { + return 0; + } + + /* determine frame offsets */ + eh_p = (ether_header_t *)buf; + ip6_udp = (ip6_udp_header_t *)(void *)(eh_p + 1); + udp6_pseudo = (udp6_pseudo_hdr_t *)(void *) + (((char *)&ip6_udp->udp) - sizeof(*udp6_pseudo)); + payload = (char *)(eh_p + 1) + sizeof(*ip6_udp); + + /* ethernet_header */ + bcopy(src, eh_p->ether_shost, ETHER_ADDR_LEN); + bcopy(dst, eh_p->ether_dhost, ETHER_ADDR_LEN); + eh_p->ether_type = htons(ETHERTYPE_IPV6); + + /* copy the data */ + bcopy(data, payload, data_len); + + /* fill in UDP pseudo header (gets overwritten by IP header below) */ + bcopy(src_ip, &udp6_pseudo->src_ip, sizeof(*src_ip)); + bcopy(dst_ip, &udp6_pseudo->dst_ip, sizeof(*dst_ip)); + udp6_pseudo->zero = 0; + udp6_pseudo->proto = IPPROTO_UDP; + udp6_pseudo->length = htons(sizeof(ip6_udp->udp) + data_len); + + /* fill in UDP header */ + ip6_udp->udp.uh_sport = htons(src_port); + ip6_udp->udp.uh_dport = htons(dst_port); + ip6_udp->udp.uh_ulen = htons(sizeof(ip6_udp->udp) + data_len); + ip6_udp->udp.uh_sum = 0; + ip6_udp->udp.uh_sum = in_cksum(udp6_pseudo, (int)(sizeof(*udp6_pseudo) + + sizeof(ip6_udp->udp) + data_len)); + + /* fill in IP header */ + bzero(&ip6_udp->ip6, sizeof(ip6_udp->ip6)); + ip6_udp->ip6.ip6_vfc = 
IPV6_VERSION; + ip6_udp->ip6.ip6_nxt = IPPROTO_UDP; + bcopy(src_ip, &ip6_udp->ip6.ip6_src, sizeof(*src_ip)); + bcopy(dst_ip, &ip6_udp->ip6.ip6_dst, sizeof(*dst_ip)); + ip6_udp->ip6.ip6_plen = htons(sizeof(struct udphdr) + data_len); + /* ip6_udp->ip6.ip6_flow = ? */ + return frame_length; +} + +static u_int +ethernet_udp_frame_populate(void * buf, size_t buf_len, + uint8_t af, + const ether_addr_t * src, + union ifbrip * src_ip, + uint16_t src_port, + const ether_addr_t * dst, + union ifbrip * dst_ip, + uint16_t dst_port, + const void * data, u_int data_len) +{ + u_int len; + + switch (af) { + case AF_INET: + len = ethernet_udp4_frame_populate(buf, buf_len, + src, + src_ip->ifbrip_addr, + src_port, + dst, + dst_ip->ifbrip_addr, + dst_port, + data, data_len); + break; + case AF_INET6: + len = ethernet_udp6_frame_populate(buf, buf_len, + src, + &src_ip->ifbrip_addr6, + src_port, + dst, + &dst_ip->ifbrip_addr6, + dst_port, + data, data_len); + break; + default: + T_FAIL("unrecognized address family %u", af); + len = 0; + break; + } + return len; +} + +static u_int +ethernet_arp_frame_populate(void * buf, u_int buf_len, + uint16_t op, + const ether_addr_t * sender_hw, + struct in_addr sender_ip, + const ether_addr_t * target_hw, + struct in_addr target_ip) +{ + ether_header_t * eh_p; + struct ether_arp * earp; + struct arphdr * arp_p; + u_int frame_length; + + frame_length = sizeof(*earp) + sizeof(*eh_p); + T_QUIET; + T_ASSERT_GE(buf_len, frame_length, + "%s buffer size %u needed %u", + __func__, buf_len, frame_length); + + /* ethernet_header */ + eh_p = (ether_header_t *)buf; + bcopy(sender_hw, eh_p->ether_shost, ETHER_ADDR_LEN); + if (target_hw != NULL) { + bcopy(target_hw, eh_p->ether_dhost, + sizeof(eh_p->ether_dhost)); + } else { + bcopy(ðer_broadcast, eh_p->ether_dhost, + sizeof(eh_p->ether_dhost)); + } + eh_p->ether_type = htons(ETHERTYPE_ARP); + + /* ARP payload */ + earp = (struct ether_arp *)(void *)(eh_p + 1); + arp_p = &earp->ea_hdr; + arp_p->ar_hrd = htons(ARPHRD_ETHER); + arp_p->ar_pro = htons(ETHERTYPE_IP); + arp_p->ar_hln = sizeof(earp->arp_sha); + arp_p->ar_pln = sizeof(struct in_addr); + arp_p->ar_op = htons(op); + bcopy(sender_hw, earp->arp_sha, sizeof(earp->arp_sha)); + bcopy(&sender_ip, earp->arp_spa, sizeof(earp->arp_spa)); + if (target_hw != NULL) { + bcopy(target_hw, earp->arp_tha, sizeof(earp->arp_tha)); + } else { + bzero(earp->arp_tha, sizeof(earp->arp_tha)); + } + bcopy(&target_ip, earp->arp_tpa, sizeof(earp->arp_tpa)); + return frame_length; +} + +static uint32_t G_generation; + +static uint32_t +next_generation(void) +{ + return G_generation++; +} + +static const void * +ethernet_frame_get_udp4_payload(void * buf, u_int buf_len, + u_int * ret_payload_length) +{ + ether_header_t * eh_p; + uint16_t ether_type; + ip_udp_header_t * ip_udp; + u_int ip_len; + u_int left; + const void * payload = NULL; + u_int payload_length = 0; + u_int udp_len; + + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)(sizeof(*eh_p) + sizeof(*ip_udp)), NULL); + left = buf_len; + eh_p = (ether_header_t *)buf; + ether_type = ntohs(eh_p->ether_type); + T_QUIET; + T_ASSERT_EQ((int)ether_type, ETHERTYPE_IP, NULL); + ip_udp = (ip_udp_header_t *)(void *)(eh_p + 1); + left -= sizeof(*eh_p); + ip_len = ntohs(ip_udp->ip.ip_len); + T_QUIET; + T_ASSERT_GE(left, ip_len, NULL); + T_QUIET; + T_ASSERT_EQ((int)ip_udp->ip.ip_v, IPVERSION, NULL); + T_QUIET; + T_ASSERT_EQ((u_int)ip_udp->ip.ip_hl << 2, (u_int)sizeof(struct ip), + NULL); + T_QUIET; + T_ASSERT_EQ((int)ip_udp->ip.ip_p, IPPROTO_UDP, NULL); + T_QUIET; + 
T_ASSERT_GE(buf_len, (u_int)sizeof(*ip_udp), NULL); + udp_len = ntohs(ip_udp->udp.uh_ulen); + T_QUIET; + T_ASSERT_GE(udp_len, (u_int)sizeof(ip_udp->udp), NULL); + payload_length = udp_len - (int)sizeof(ip_udp->udp); + if (payload_length > 0) { + payload = (ip_udp + 1); + } + if (payload == NULL) { + payload_length = 0; + } + *ret_payload_length = payload_length; + return payload; +} + +static const void * +ethernet_frame_get_udp6_payload(void * buf, u_int buf_len, + u_int * ret_payload_length) +{ + ether_header_t * eh_p; + uint16_t ether_type; + ip6_udp_header_t * ip6_udp; + u_int ip6_len; + u_int left; + const void * payload = NULL; + u_int payload_length = 0; + u_int udp_len; + + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)(sizeof(*eh_p) + sizeof(*ip6_udp)), NULL); + left = buf_len; + eh_p = (ether_header_t *)buf; + ether_type = ntohs(eh_p->ether_type); + T_QUIET; + T_ASSERT_EQ((int)ether_type, ETHERTYPE_IPV6, NULL); + ip6_udp = (ip6_udp_header_t *)(void *)(eh_p + 1); + left -= sizeof(*eh_p); + ip6_len = ntohs(ip6_udp->ip6.ip6_plen); + T_QUIET; + T_ASSERT_GE(left, ip6_len + (u_int)sizeof(struct ip6_hdr), NULL); + T_QUIET; + T_ASSERT_EQ((int)(ip6_udp->ip6.ip6_vfc & IPV6_VERSION_MASK), + IPV6_VERSION, NULL); + T_QUIET; + T_ASSERT_EQ((int)ip6_udp->ip6.ip6_nxt, IPPROTO_UDP, NULL); + T_QUIET; + T_ASSERT_GE(buf_len, (u_int)sizeof(*ip6_udp), NULL); + udp_len = ntohs(ip6_udp->udp.uh_ulen); + T_QUIET; + T_ASSERT_GE(udp_len, (u_int)sizeof(ip6_udp->udp), NULL); + payload_length = udp_len - (int)sizeof(ip6_udp->udp); + if (payload_length > 0) { + payload = (ip6_udp + 1); + } + if (payload == NULL) { + payload_length = 0; + } + *ret_payload_length = payload_length; + return payload; +} + +static const void * +ethernet_frame_get_udp_payload(uint8_t af, void * buf, u_int buf_len, + u_int * ret_payload_length) +{ + const void * payload; + + switch (af) { + case AF_INET: + payload = ethernet_frame_get_udp4_payload(buf, buf_len, + ret_payload_length); + break; + case AF_INET6: + payload = ethernet_frame_get_udp6_payload(buf, buf_len, + ret_payload_length); + break; + default: + T_FAIL("unrecognized address family %u", af); + payload = NULL; + break; + } + return payload; +} + +#define MIN_ICMP6_LEN ((u_int)(sizeof(ether_header_t) + \ + sizeof(struct ip6_hdr) + \ + sizeof(struct icmp6_hdr))) +#define ALIGNED_ND_OPT_LEN 8 +#define SET_ND_OPT_LEN(a) (u_int)((a) >> 3) +#define GET_ND_OPT_LEN(a) (u_int)((a) << 3) +#define ALIGN_ND_OPT(a) (u_int)roundup(a, ALIGNED_ND_OPT_LEN) +#define LINKADDR_OPT_LEN (ALIGN_ND_OPT(sizeof(struct nd_opt_hdr) + \ + sizeof(ether_addr_t))) +#define ETHER_IPV6_LEN (sizeof(*eh_p) + sizeof(*ip6)) + + + +static u_int +ethernet_nd6_frame_populate(void * buf, u_int buf_len, + uint8_t type, + const ether_addr_t * sender_hw, + struct in6_addr * sender_ip, + const ether_addr_t * dest_ether, + const ether_addr_t * target_hw, + struct in6_addr * target_ip) +{ + u_int data_len = 0; + ether_header_t * eh_p; + u_int frame_length; + struct icmp6_hdr * icmp6; + struct ip6_hdr * ip6; + struct nd_opt_hdr * nd_opt; + + switch (type) { + case ND_ROUTER_SOLICIT: + case ND_NEIGHBOR_ADVERT: + case ND_NEIGHBOR_SOLICIT: + break; + default: + T_FAIL("%s: unsupported type %u", __func__, type); + return 0; + } + + T_QUIET; + T_ASSERT_GE(buf_len, MIN_ICMP6_LEN, NULL); + + eh_p = (ether_header_t *)buf; + ip6 = (struct ip6_hdr *)(void *)(eh_p + 1); + icmp6 = (struct icmp6_hdr *)(void *)(ip6 + 1); + frame_length = sizeof(*eh_p) + sizeof(*ip6); + switch (type) { + case ND_NEIGHBOR_SOLICIT: { + struct 
nd_neighbor_solicit * nd_ns; + bool sender_is_specified; + + sender_is_specified = !IN6_IS_ADDR_UNSPECIFIED(sender_ip); + data_len = sizeof(*nd_ns); + if (sender_is_specified) { + data_len += LINKADDR_OPT_LEN; + } + frame_length += data_len; + T_QUIET; + T_ASSERT_GE(buf_len, frame_length, NULL); + nd_ns = (struct nd_neighbor_solicit *)(void *)icmp6; + if (sender_is_specified) { + /* add the source lladdr option */ + nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); + nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; + nd_opt->nd_opt_len = SET_ND_OPT_LEN(LINKADDR_OPT_LEN); + bcopy(sender_hw, (nd_opt + 1), sizeof(*sender_hw)); + } + bcopy(target_ip, &nd_ns->nd_ns_target, + sizeof(nd_ns->nd_ns_target)); + break; + } + case ND_NEIGHBOR_ADVERT: { + struct nd_neighbor_advert * nd_na; + + data_len = sizeof(*nd_na) + LINKADDR_OPT_LEN; + frame_length += data_len; + T_QUIET; + T_ASSERT_GE(buf_len, frame_length, NULL); + + nd_na = (struct nd_neighbor_advert *)(void *)icmp6; + bcopy(target_ip, &nd_na->nd_na_target, + sizeof(nd_na->nd_na_target)); + /* add the target lladdr option */ + nd_opt = (struct nd_opt_hdr *)(nd_na + 1); + nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + nd_opt->nd_opt_len = SET_ND_OPT_LEN(LINKADDR_OPT_LEN); + bcopy(target_hw, (nd_opt + 1), sizeof(*target_hw)); + break; + } + case ND_ROUTER_SOLICIT: { + struct nd_router_solicit * nd_rs; + + data_len = sizeof(*nd_rs) + LINKADDR_OPT_LEN; + frame_length += data_len; + T_QUIET; + T_ASSERT_GE(buf_len, frame_length, NULL); + + nd_rs = (struct nd_router_solicit *)(void *)icmp6; + + /* add the source lladdr option */ + nd_opt = (struct nd_opt_hdr *)(nd_rs + 1); + nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; + nd_opt->nd_opt_len = SET_ND_OPT_LEN(LINKADDR_OPT_LEN); + bcopy(sender_hw, (nd_opt + 1), sizeof(*sender_hw)); + break; + } + default: + T_FAIL("%s: unsupported type %u", __func__, type); + return 0; + } + /* icmp6 header */ + icmp6->icmp6_type = type; + icmp6->icmp6_code = 0; + icmp6->icmp6_cksum = 0; + icmp6->icmp6_data32[0] = 0; + + /* ethernet_header */ + bcopy(sender_hw, eh_p->ether_shost, ETHER_ADDR_LEN); + if (dest_ether != NULL) { + bcopy(dest_ether, eh_p->ether_dhost, + sizeof(eh_p->ether_dhost)); + } else { + /* XXX ether_dhost should be multicast */ + bcopy(ðer_broadcast, eh_p->ether_dhost, + sizeof(eh_p->ether_dhost)); + } + eh_p->ether_type = htons(ETHERTYPE_IPV6); + + /* IPv6 header */ + bzero(ip6, sizeof(*ip6)); + ip6->ip6_nxt = IPPROTO_ICMPV6; + ip6->ip6_vfc = IPV6_VERSION; + bcopy(sender_ip, &ip6->ip6_src, sizeof(ip6->ip6_src)); + /* XXX ip6_dst should be specific multicast */ + bcopy(&in6addr_linklocal_allnodes, &ip6->ip6_dst, sizeof(ip6->ip6_dst)); + ip6->ip6_plen = htons(data_len); + + return frame_length; +} + +/** +** Switch port +**/ +static void +switch_port_check_tx(switch_port_t port) +{ + int error; + struct kevent kev; + int kq; + struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 * 1000}; + + kq = kqueue(); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kq, "kqueue check_tx"); + EV_SET(&kev, port->fd, EVFILT_WRITE, EV_ADD | EV_ENABLE, 0, 0, NULL); + error = kevent(kq, &kev, 1, &kev, 1, &ts); + T_QUIET; + T_ASSERT_EQ(error, 1, "kevent"); + T_QUIET; + T_ASSERT_EQ((int)kev.filter, EVFILT_WRITE, NULL); + T_QUIET; + T_ASSERT_EQ((int)kev.ident, port->fd, NULL); + T_QUIET; + T_ASSERT_NULL(kev.udata, NULL); + close(kq); + return; +} + +static void +switch_port_send_arp(switch_port_t port, + uint16_t op, + const ether_addr_t * sender_hw, + struct in_addr sender_ip, + const ether_addr_t * target_hw, + struct in_addr target_ip) +{ + 
u_int frame_length; + ether_packet pkt; + ssize_t n; + + /* make sure we can send */ + switch_port_check_tx(port); + frame_length = ethernet_arp_frame_populate(&pkt, sizeof(pkt), + op, + sender_hw, + sender_ip, + target_hw, + target_ip); + T_QUIET; + T_ASSERT_GT(frame_length, 0, "%s: frame_length %u", + __func__, frame_length); + if (S_debug) { + T_LOG("Port %s -> %s transmitting %u bytes", + port->ifname, port->member_ifname, frame_length); + } + ethernet_frame_validate(&pkt, frame_length, S_debug); + n = write(port->fd, &pkt, frame_length); + if (n < 0) { + T_ASSERT_POSIX_SUCCESS(n, "%s write fd %d failed %ld", + port->ifname, port->fd, n); + } + T_QUIET; + T_ASSERT_EQ((u_int)n, frame_length, + "%s fd %d wrote %ld", + port->ifname, port->fd, n); +} + + +static void +switch_port_send_nd6(switch_port_t port, + uint8_t type, + const ether_addr_t * sender_hw, + struct in6_addr * sender_ip, + const ether_addr_t * dest_ether, + const ether_addr_t * target_hw, + struct in6_addr * target_ip) +{ + u_int frame_length; + ether_packet pkt; + ssize_t n; + + /* make sure we can send */ + switch_port_check_tx(port); + frame_length = ethernet_nd6_frame_populate(&pkt, sizeof(pkt), + type, + sender_hw, + sender_ip, + dest_ether, + target_hw, + target_ip); + T_QUIET; + T_ASSERT_GT(frame_length, 0, "%s: frame_length %u", + __func__, frame_length); + if (S_debug) { + T_LOG("Port %s -> %s transmitting %u bytes", + port->ifname, port->member_ifname, frame_length); + } + ethernet_frame_validate(&pkt, frame_length, S_debug); + n = write(port->fd, &pkt, frame_length); + if (n < 0) { + T_ASSERT_POSIX_SUCCESS(n, "%s write fd %d failed %ld", + port->ifname, port->fd, n); + } + T_QUIET; + T_ASSERT_EQ((u_int)n, frame_length, + "%s fd %d wrote %ld", + port->ifname, port->fd, n); +} + + +static void +switch_port_send_udp(switch_port_t port, + uint8_t af, + const ether_addr_t * src_eaddr, + union ifbrip * src_ip, + uint16_t src_port, + const ether_addr_t * dst_eaddr, + union ifbrip * dst_ip, + uint16_t dst_port, + const void * payload, u_int payload_length) +{ + u_int frame_length; + ether_packet pkt; + ssize_t n; + + /* make sure we can send */ + switch_port_check_tx(port); + + /* generate the packet */ + frame_length + = ethernet_udp_frame_populate((void *)&pkt, + (u_int)sizeof(pkt), + af, + src_eaddr, + src_ip, + src_port, + dst_eaddr, + dst_ip, + dst_port, + payload, + payload_length); + T_QUIET; + T_ASSERT_GT(frame_length, 0, NULL); + if (S_debug) { + T_LOG("Port %s transmitting %u bytes", + port->ifname, frame_length); + } + ethernet_frame_validate(&pkt, frame_length, S_debug); + n = write(port->fd, &pkt, frame_length); + if (n < 0) { + T_ASSERT_POSIX_SUCCESS(n, "%s write fd %d failed %ld", + port->ifname, port->fd, n); + } + T_QUIET; + T_ASSERT_EQ((u_int)n, frame_length, + "%s fd %d wrote %ld", + port->ifname, port->fd, n); +} + + + +static void +switch_port_send_udp_addr_index(switch_port_t port, + uint8_t af, + u_int addr_index, + const ether_addr_t * dst_eaddr, + union ifbrip * dst_ip, + const void * payload, u_int payload_length) +{ + ether_addr_t eaddr; + union ifbrip ip; + + /* generate traffic for the unit and address */ + set_ethernet_address(&eaddr, port->unit, addr_index); + get_ip_address(af, port->unit, addr_index, &ip); + switch_port_send_udp(port, af, + &eaddr, &ip, TEST_SOURCE_PORT, + dst_eaddr, dst_ip, TEST_DEST_PORT, + payload, payload_length); +} + +typedef void +(packet_validator)(switch_port_t port, const ether_header_t * eh_p, + u_int pkt_len, void * context); +typedef packet_validator * 
packet_validator_t; + +static void +switch_port_receive(switch_port_t port, + uint8_t af, + const void * payload, u_int payload_length, + packet_validator_t validator, + void * context) +{ + ether_header_t * eh_p; + ssize_t n; + char * offset; + + n = read(port->fd, port->rx_buf, (unsigned)port->rx_buf_size); + if (n < 0) { + if (errno == EAGAIN) { + return; + } + T_QUIET; + T_ASSERT_POSIX_SUCCESS(n, "read %s port %d fd %d", + port->ifname, port->unit, port->fd); + return; + } + for (offset = port->rx_buf; n > 0;) { + struct bpf_hdr * bpf = (struct bpf_hdr *)(void *)offset; + u_int pkt_len; + char * pkt; + u_int skip; + + pkt = offset + bpf->bh_hdrlen; + pkt_len = bpf->bh_caplen; + + eh_p = (ether_header_t *)(void *)pkt; + T_QUIET; + T_ASSERT_GE(pkt_len, (u_int)sizeof(*eh_p), + "short packet %ld", n); + + /* source shouldn't be broadcast/multicast */ + T_QUIET; + T_ASSERT_EQ(eh_p->ether_shost[0] & 0x01, 0, + "broadcast/multicast source"); + + if (S_debug) { + T_LOG("Port %s [unit %d] [fd %d] Received %u bytes", + port->ifname, port->unit, port->fd, pkt_len); + } + ethernet_frame_validate(pkt, pkt_len, S_debug); + + /* call the validation function */ + (*validator)(port, eh_p, pkt_len, context); + + if (payload != NULL) { + const void * p; + u_int p_len; + + p = ethernet_frame_get_udp_payload(af, pkt, pkt_len, + &p_len); + T_QUIET; + T_ASSERT_NOTNULL(p, "ethernet_frame_get_udp_payload"); + T_QUIET; + T_ASSERT_EQ(p_len, payload_length, + "payload length %u < expected %u", + p_len, payload_length); + T_QUIET; + T_ASSERT_EQ(bcmp(payload, p, payload_length), 0, + "unexpected payload"); + } + skip = BPF_WORDALIGN(pkt_len + bpf->bh_hdrlen); + if (skip == 0) { + break; + } + offset += skip; + n -= skip; + } + return; +} + +static void +switch_port_log(switch_port_t port) +{ + T_LOG("%s [unit %d] [member %s]%s bpf fd %d bufsize %d\n", + port->ifname, port->unit, + port->member_ifname, + port->mac_nat ? 
" [mac-nat]" : "", + port->fd, port->rx_buf_size); +} + +#define switch_port_list_size(port_count) \ + offsetof(switch_port_list, list[port_count]) + +static switch_port_list_t +switch_port_list_alloc(u_int port_count, bool mac_nat) +{ + switch_port_list_t list; + + list = (switch_port_list_t) + calloc(1, switch_port_list_size(port_count));; + list->size = port_count; + list->mac_nat = mac_nat; + return list; +} + +static void +switch_port_list_dealloc(switch_port_list_t list) +{ + u_int i; + switch_port_t port; + + for (i = 0, port = list->list; i < list->count; i++, port++) { + close(port->fd); + free(port->rx_buf); + } + free(list); + return; +} + +static errno_t +switch_port_list_add_port(switch_port_list_t port_list, u_int unit, + const char * ifname, const char * member_ifname, + ether_addr_t * member_mac, + u_int num_addrs, bool mac_nat) +{ + int buf_size; + errno_t err = EINVAL; + int fd = -1; + int opt; + switch_port_t p; + + if (port_list->count >= port_list->size) { + T_LOG("Internal error: port_list count %u >= size %u\n", + port_list->count, port_list->size); + goto failed; + } + fd = bpf_new(); + if (fd < 0) { + err = errno; + T_LOG("bpf_new"); + goto failed; + } + opt = 1; + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ioctl(fd, FIONBIO, &opt), NULL); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(bpf_set_immediate(fd, 1), NULL); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(bpf_setif(fd, ifname), "bpf set if %s", + ifname); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(bpf_set_see_sent(fd, 0), NULL); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(bpf_set_header_complete(fd, 1), NULL); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(bpf_get_blen(fd, &buf_size), NULL); + if (S_debug) { + T_LOG("%s [unit %d] [member %s] bpf fd %d bufsize %d\n", + ifname, unit, + member_ifname, fd, buf_size); + } + p = port_list->list + port_list->count++; + p->fd = fd; + p->unit = unit; + strlcpy(p->ifname, ifname, sizeof(p->ifname)); + strlcpy(p->member_ifname, member_ifname, sizeof(p->member_ifname)); + p->num_addrs = num_addrs; + p->rx_buf_size = buf_size; + p->rx_buf = malloc((unsigned)buf_size); + p->mac_nat = mac_nat; + p->member_mac = *member_mac; + return 0; + +failed: + if (fd >= 0) { + close(fd); + } + return err; +} + +static switch_port_t +switch_port_list_find_fd(switch_port_list_t ports, int fd) +{ + u_int i; + switch_port_t port; + + for (i = 0, port = ports->list; i < ports->count; i++, port++) { + if (port->fd == fd) { + return port; + } + } + return NULL; +} + +static void +switch_port_list_log(switch_port_list_t port_list) +{ + u_int i; + switch_port_t port; + + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + switch_port_log(port); + } + return; +} + +static switch_port_t +switch_port_list_find_member(switch_port_list_t ports, const char * member_ifname) +{ + u_int i; + switch_port_t port; + + for (i = 0, port = ports->list; i < ports->count; i++, port++) { + if (strcmp(port->member_ifname, member_ifname) == 0) { + return port; + } + } + return NULL; +} + +static void +switch_port_list_check_receive(switch_port_list_t ports, uint8_t af, + const void * payload, u_int payload_length, + packet_validator_t validator, + void * context) +{ + int i; + int n_events; + struct kevent kev[ports->count]; + int kq; + switch_port_t port; + struct timespec ts = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000}; + u_int u; + + kq = kqueue(); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kq, "kqueue check_receive"); + for (u = 0, port = ports->list; u < ports->count; u++, port++) { + port->test_count = 0; + EV_SET(kev + u, port->fd, + 
EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL); + } + + do { + n_events = kevent(kq, kev, (int)ports->count, kev, + (int)ports->count, &ts); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(n_events, "kevent receive %d", n_events); + for (i = 0; i < n_events; i++) { + T_QUIET; + T_ASSERT_EQ((int)kev[i].filter, EVFILT_READ, NULL); + T_QUIET; + T_ASSERT_NULL(kev[i].udata, NULL); + port = switch_port_list_find_fd(ports, + (int)kev[i].ident); + T_QUIET; + T_ASSERT_NE(port, NULL, + "port %p fd %d", (void *)port, + (int)kev[i].ident); + switch_port_receive(port, af, payload, payload_length, + validator, context); + } + } while (n_events != 0); + close(kq); +} + +static bool +switch_port_list_verify_rt_table(switch_port_list_t port_list, bool log) +{ + bool all_present = true; + u_int i; + u_int count; + struct ifbareq *ifba; + struct ifbareq *rt_table; + switch_port_t port; + + /* clear out current notion of how many addresses are present */ + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + port->test_address_count = 0; + port->test_address_present = 0; + } + rt_table = bridge_rt_table_copy(&count); + if (rt_table == NULL) { + return false; + } + if (log) { + bridge_rt_table_log(rt_table, count); + } + for (i = 0, ifba = rt_table; i < count; i++, ifba++) { + uint64_t addr_bit; + u_int addr_index; + u_int unit_index; + u_char * ea; + ether_addr_t * eaddr; + + eaddr = (ether_addr_t *)&ifba->ifba_dst; + ea = eaddr->octet; + addr_index = ea[EA_ADDR_INDEX]; + unit_index = ea[EA_UNIT_INDEX]; + port = switch_port_list_find_member(port_list, + ifba->ifba_ifsname); + T_QUIET; + T_ASSERT_NOTNULL(port, "switch_port_list_find_member %s", + ifba->ifba_ifsname); + if (!S_cleaning_up) { + T_QUIET; + T_ASSERT_EQ(unit_index, port->unit, NULL); + addr_bit = 1 << addr_index; + T_QUIET; + T_ASSERT_BITS_NOTSET(port->test_address_present, + addr_bit, "%s address %u", + ifba->ifba_ifsname, addr_index); + port->test_address_present |= addr_bit; + port->test_address_count++; + } + } + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + if (S_debug) { + T_LOG("%s unit %d [member %s] %u expect %u", + port->ifname, port->unit, port->member_ifname, + port->test_address_count, port->num_addrs); + } + if (port->test_address_count != port->num_addrs) { + all_present = false; + } + } + + free(rt_table); + return all_present; +} + +static bool +switch_port_list_verify_mac_nat(switch_port_list_t port_list, bool log) +{ + bool all_present = true; + u_int i; + u_int count; + static struct ifbrmne * entries; + switch_port_t port; + struct ifbrmne * scan; + + + /* clear out current notion of how many addresses are present */ + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + port->test_address_count = 0; + port->test_address_present = 0; + } + entries = bridge_mac_nat_entries_copy(&count); + if (entries == NULL) { + return false; + } + if (log) { + bridge_mac_nat_entries_log(entries, count); + } + for (i = 0, scan = entries; i < count; i++, scan++) { + uint8_t af; + uint64_t addr_bit; + u_int addr_index; + char buf_ip1[INET6_ADDRSTRLEN]; + char buf_ip2[INET6_ADDRSTRLEN]; + u_char * ea; + ether_addr_t * eaddr; + union ifbrip ip; + u_int unit_index; + + eaddr = (ether_addr_t *)&scan->ifbmne_mac; + ea = eaddr->octet; + addr_index = ea[EA_ADDR_INDEX]; + unit_index = ea[EA_UNIT_INDEX]; + port = switch_port_list_find_member(port_list, + scan->ifbmne_ifname); + T_QUIET; + T_ASSERT_NOTNULL(port, + "switch_port_list_find_member %s", + scan->ifbmne_ifname); + T_QUIET; + 
T_ASSERT_EQ(unit_index, port->unit, NULL); + af = scan->ifbmne_af; + get_ip_address(af, port->unit, addr_index, &ip); + addr_bit = 1 << addr_index; + T_QUIET; + T_ASSERT_TRUE(ip_addresses_are_equal(af, &ip, &scan->ifbmne_ip), + "mac nat entry IP address %s expected %s", + inet_ntop(af, &scan->ifbmne_ip_addr, + buf_ip1, sizeof(buf_ip1)), + inet_ntop(af, &ip, + buf_ip2, sizeof(buf_ip2))); + T_QUIET; + T_ASSERT_BITS_NOTSET(port->test_address_present, + addr_bit, "%s address %u", + scan->ifbmne_ifname, addr_index); + port->test_address_present |= addr_bit; + port->test_address_count++; + } + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + if (port->mac_nat) { + /* MAC-NAT interface should have no entries */ + T_QUIET; + T_ASSERT_EQ(port->test_address_count, 0, + "mac nat interface %s has %u entries", + port->member_ifname, + port->test_address_count); + } else { + if (S_debug) { + T_LOG("%s unit %d [member %s] %u expect %u", + port->ifname, port->unit, + port->member_ifname, + port->test_address_count, port->num_addrs); + } + if (port->test_address_count != port->num_addrs) { + all_present = false; + } + } + } + + free(entries); + + return all_present; +} + +/** +** Basic Bridge Tests +**/ +static void +send_generation(switch_port_t port, uint8_t af, u_int addr_index, + const ether_addr_t * dst_eaddr, union ifbrip * dst_ip, + uint32_t generation) +{ + uint32_t payload; + + payload = htonl(generation); + switch_port_send_udp_addr_index(port, af, addr_index, dst_eaddr, dst_ip, + &payload, sizeof(payload)); +} + +static void +check_receive_generation(switch_port_list_t ports, uint8_t af, + uint32_t generation, packet_validator_t validator, + __unused void * context) +{ + uint32_t payload; + + payload = htonl(generation); + switch_port_list_check_receive(ports, af, &payload, sizeof(payload), + validator, context); +} + +static void +validate_source_ether_mismatch(switch_port_t port, const ether_header_t * eh_p) +{ + /* source shouldn't be our own MAC addresses */ + T_QUIET; + T_ASSERT_NE(eh_p->ether_shost[EA_UNIT_INDEX], port->unit, + "ether source matches unit %d", port->unit); +} + +static void +validate_not_present_dhost(switch_port_t port, const ether_header_t * eh_p, + __unused u_int pkt_len, + __unused void * context) +{ + validate_source_ether_mismatch(port, eh_p); + T_QUIET; + T_ASSERT_EQ(bcmp(eh_p->ether_dhost, ðer_external, + sizeof(eh_p->ether_dhost)), 0, + "%s", __func__); + port->test_count++; +} + +static void +validate_broadcast_dhost(switch_port_t port, const ether_header_t * eh_p, + __unused u_int pkt_len, + __unused void * context) +{ + validate_source_ether_mismatch(port, eh_p); + T_QUIET; + T_ASSERT_NE((eh_p->ether_dhost[0] & 0x01), 0, + "%s", __func__); + port->test_count++; +} + +static void +validate_port_dhost(switch_port_t port, const ether_header_t * eh_p, + __unused u_int pkt_len, + __unused void * context) +{ + validate_source_ether_mismatch(port, eh_p); + T_QUIET; + T_ASSERT_EQ(eh_p->ether_dhost[EA_UNIT_INDEX], port->unit, + "wrong dhost unit %d != %d", + eh_p->ether_dhost[EA_UNIT_INDEX], port->unit); + port->test_count++; +} + + +static void +check_received_count(switch_port_list_t port_list, + switch_port_t port, uint32_t expected_packets) +{ + u_int i; + switch_port_t scan; + + for (i = 0, scan = port_list->list; i < port_list->count; i++, scan++) { + if (scan == port) { + T_QUIET; + T_ASSERT_EQ(port->test_count, 0, + "unexpected receive on port %d", + port->unit); + } else if (expected_packets == ALL_ADDRS) { + T_QUIET; + 
T_ASSERT_EQ(scan->test_count, scan->num_addrs, + "didn't receive on all addrs"); + } else { + T_QUIET; + T_ASSERT_EQ(scan->test_count, expected_packets, + "wrong receive count on port %s", scan->member_ifname); + } + } +} + +static void +unicast_send_all(switch_port_list_t port_list, uint8_t af, switch_port_t port) +{ + u_int i; + switch_port_t scan; + + for (i = 0, scan = port_list->list; i < port_list->count; i++, scan++) { + if (S_debug) { + T_LOG("Unicast send on %s", port->ifname); + } + for (u_int j = 0; j < scan->num_addrs; j++) { + ether_addr_t eaddr; + union ifbrip ip; + + set_ethernet_address(&eaddr, scan->unit, j); + get_ip_address(af, scan->unit, j, &ip); + switch_port_send_udp_addr_index(port, af, 0, &eaddr, &ip, + NULL, 0); + } + } +} + + +static void +bridge_learning_test_once(switch_port_list_t port_list, + uint8_t af, + packet_validator_t validator, + void * context, + const ether_addr_t * dst_eaddr, + bool retry) +{ + u_int i; + union ifbrip dst_ip; + switch_port_t port; + + get_broadcast_ip_address(af, &dst_ip); + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + if (port->test_address_count == port->num_addrs) { + /* already populated */ + continue; + } + if (S_debug) { + T_LOG("Sending on %s", port->ifname); + } + for (u_int j = 0; j < port->num_addrs; j++) { + uint32_t generation; + + if (retry) { + uint64_t addr_bit; + + addr_bit = 1 << j; + if ((port->test_address_present & addr_bit) + != 0) { + /* already present */ + continue; + } + T_LOG("Retry port %s unit %u address %u", + port->ifname, port->unit, j); + } + generation = next_generation(); + send_generation(port, + af, + j, + dst_eaddr, + &dst_ip, + generation); + + /* receive across all ports */ + check_receive_generation(port_list, + af, + generation, + validator, + context); + + /* ensure that every port saw the packet */ + check_received_count(port_list, port, 1); + } + } + return; +} + +static inline const char * +af_get_str(uint8_t af) +{ + return (af == AF_INET) ? "IPv4" : "IPv6"; +} + +static void +bridge_learning_test(switch_port_list_t port_list, + uint8_t af, + packet_validator_t validator, + void * context, + const ether_addr_t * dst_eaddr) +{ + char ntoabuf[ETHER_NTOA_BUFSIZE]; + u_int i; + switch_port_t port; + bool verified = false; + + ether_ntoa_buf(dst_eaddr, ntoabuf, sizeof(ntoabuf)); + + /* + * Send a broadcast frame from every port in the list so that the bridge + * learns our MAC address. + */ +#define BROADCAST_MAX_TRIES 20 + for (int try = 1; try < BROADCAST_MAX_TRIES; try++) { + bool retry = (try > 1); + + if (!retry) { + T_LOG("%s: %s #ports %u #addrs %u dest %s", + __func__, + af_get_str(af), + port_list->count, port_list->list->num_addrs, + ntoabuf); + } else { + T_LOG("%s: %s #ports %u #addrs %u dest %s (TRY=%d)", + __func__, + af_get_str(af), + port_list->count, port_list->list->num_addrs, + ntoabuf, try); + } + bridge_learning_test_once(port_list, af, validator, context, + dst_eaddr, retry); + /* + * In the event of a memory allocation failure, it's possible + * that the address was not learned. Figure out whether + * all addresses are present, and if not, we'll retry on + * those that are not present. + */ + verified = switch_port_list_verify_rt_table(port_list, false); + if (verified) { + break; + } + /* wait a short time to allow the system to recover */ + usleep(100 * 1000); + } + T_QUIET; + T_ASSERT_TRUE(verified, "All addresses present"); + + /* + * Since we just broadcast on every port in the switch, the bridge knows + * the port's MAC addresses. 
The bridge should not need to broadcast the + * packet to learn, which means the unicast traffic should only arrive + * on the intended port. + */ + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + /* send unicast packets to every other port's MAC addresses */ + unicast_send_all(port_list, af, port); + + /* receive all of that generated traffic */ + switch_port_list_check_receive(port_list, af, NULL, 0, + validate_port_dhost, NULL); + /* check that we saw all of the unicast packets */ + check_received_count(port_list, port, ALL_ADDRS); + } + T_PASS("%s", __func__); +} + +/** +** MAC-NAT tests +**/ +static void +mac_nat_check_received_count(switch_port_list_t port_list, switch_port_t port) +{ + u_int i; + switch_port_t scan; + + for (i = 0, scan = port_list->list; i < port_list->count; i++, scan++) { + u_int expected = 0; + + if (scan == port) { + expected = scan->num_addrs; + } + T_QUIET; + T_ASSERT_EQ(scan->test_count, expected, + "%s [member %s]%s expected %u actual %u", + scan->ifname, scan->member_ifname, + scan->mac_nat ? " [mac-nat]" : "", + expected, scan->test_count); + } +} + +static void +validate_mac_nat(switch_port_t port, const ether_header_t * eh_p, + __unused u_int pkt_len, + __unused void * context) +{ + if (port->mac_nat) { + bool equal; + + /* source must match MAC-NAT interface */ + equal = (bcmp(eh_p->ether_shost, &port->member_mac, + sizeof(port->member_mac)) == 0); + if (!equal) { + ethernet_frame_validate(eh_p, pkt_len, true); + } + T_QUIET; + T_ASSERT_TRUE(equal, "source address match"); + port->test_count++; + } else { + validate_not_present_dhost(port, eh_p, pkt_len, NULL); + } +} + +static void +validate_mac_nat_in(switch_port_t port, const ether_header_t * eh_p, + u_int pkt_len, __unused void * context) +{ + if (S_debug) { + T_LOG("%s received %u bytes", port->member_ifname, pkt_len); + ethernet_frame_validate(eh_p, pkt_len, true); + } + T_QUIET; + T_ASSERT_EQ(eh_p->ether_dhost[EA_UNIT_INDEX], port->unit, + "dhost unit %u expected %u", + eh_p->ether_dhost[EA_UNIT_INDEX], port->unit); + port->test_count++; +} + +static void +validate_mac_nat_arp_out(switch_port_t port, const ether_header_t * eh_p, + u_int pkt_len, void * context) +{ + const struct ether_arp * earp; + switch_port_t send_port = (switch_port_t)context; + + if (S_debug) { + T_LOG("%s received %u bytes", port->member_ifname, pkt_len); + ethernet_frame_validate(eh_p, pkt_len, true); + } + T_QUIET; + T_ASSERT_EQ((int)ntohs(eh_p->ether_type), (int)ETHERTYPE_ARP, NULL); + earp = (const struct ether_arp *)(const void *)(eh_p + 1); + T_QUIET; + T_ASSERT_GE(pkt_len, (u_int)(sizeof(*eh_p) + sizeof(*earp)), NULL); + if (port->mac_nat) { + bool equal; + + /* source ethernet must match MAC-NAT interface */ + equal = (bcmp(eh_p->ether_shost, &port->member_mac, + sizeof(port->member_mac)) == 0); + if (!equal) { + ethernet_frame_validate(eh_p, pkt_len, true); + } + T_QUIET; + T_ASSERT_TRUE(equal, "%s -> %s source address translated", + send_port->member_ifname, + port->member_ifname); + /* sender hw must match MAC-NAT interface */ + equal = (bcmp(earp->arp_sha, &port->member_mac, + sizeof(port->member_mac)) == 0); + if (!equal) { + ethernet_frame_validate(eh_p, pkt_len, true); + } + T_QUIET; + T_ASSERT_TRUE(equal, "%s -> %s sender hardware translated", + send_port->member_ifname, + port->member_ifname); + } else { + /* source ethernet must match the sender */ + T_QUIET; + T_ASSERT_EQ(eh_p->ether_shost[EA_UNIT_INDEX], send_port->unit, + "%s -> %s unit %u expected %u", + 
send_port->member_ifname,
+		    port->member_ifname,
+		    eh_p->ether_shost[EA_UNIT_INDEX], send_port->unit);
+		/* source hw must match the sender */
+		T_QUIET;
+		T_ASSERT_EQ(earp->arp_sha[EA_UNIT_INDEX], send_port->unit,
+		    "%s -> %s unit %u expected %u",
+		    send_port->member_ifname,
+		    port->member_ifname,
+		    earp->arp_sha[EA_UNIT_INDEX], send_port->unit);
+	}
+	port->test_count++;
+}
+
+static void
+validate_mac_nat_arp_in(switch_port_t port, const ether_header_t * eh_p,
+    u_int pkt_len, void * context)
+{
+	const struct ether_arp * earp;
+	switch_port_t send_port = (switch_port_t)context;
+
+	if (S_debug) {
+		T_LOG("%s received %u bytes", port->member_ifname, pkt_len);
+		ethernet_frame_validate(eh_p, pkt_len, true);
+	}
+	earp = (const struct ether_arp *)(const void *)(eh_p + 1);
+	T_QUIET;
+	T_ASSERT_EQ((int)ntohs(eh_p->ether_type), (int)ETHERTYPE_ARP, NULL);
+	T_QUIET;
+	T_ASSERT_GE(pkt_len, (u_int)(sizeof(*eh_p) + sizeof(*earp)), NULL);
+	T_QUIET;
+	T_ASSERT_FALSE(port->mac_nat, NULL);
+
+	/* destination ethernet must match the unit */
+	T_QUIET;
+	T_ASSERT_EQ(eh_p->ether_dhost[EA_UNIT_INDEX], port->unit,
+	    "%s -> %s unit %u expected %u",
+	    send_port->member_ifname,
+	    port->member_ifname,
+	    eh_p->ether_dhost[EA_UNIT_INDEX], port->unit);
+	/* target hw must match the unit */
+	T_QUIET;
+	T_ASSERT_EQ(earp->arp_tha[EA_UNIT_INDEX], port->unit,
+	    "%s -> %s unit %u expected %u",
+	    send_port->member_ifname,
+	    port->member_ifname,
+	    earp->arp_tha[EA_UNIT_INDEX], port->unit);
+	port->test_count++;
+}
+
+static void
+mac_nat_test_arp_out(switch_port_list_t port_list)
+{
+	u_int i;
+	struct in_addr ip_dst;
+	switch_port_t port;
+
+	ip_dst = get_external_ipv4_address();
+	for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+		if (port->mac_nat) {
+			continue;
+		}
+		for (u_int j = 0; j < port->num_addrs; j++) {
+			ether_addr_t eaddr;
+			struct in_addr ip_src;
+
+			set_ethernet_address(&eaddr, port->unit, j);
+			get_ipv4_address(port->unit, j, &ip_src);
+			switch_port_send_arp(port,
+			    ARPOP_REQUEST,
+			    &eaddr,
+			    ip_src,
+			    NULL,
+			    ip_dst);
+			switch_port_list_check_receive(port_list, AF_INET,
+			    NULL, 0,
+			    validate_mac_nat_arp_out,
+			    port);
+			check_received_count(port_list, port, 1);
+		}
+	}
+	T_PASS("%s", __func__);
+}
+
+static void
+mac_nat_send_arp_response(switch_port_t ext_port, switch_port_t port)
+{
+	struct in_addr ip_src;
+
+	T_QUIET;
+	T_ASSERT_TRUE(ext_port->mac_nat, "%s is MAC-NAT interface",
+	    ext_port->member_ifname);
+	ip_src = get_external_ipv4_address();
+	for (u_int j = 0; j < port->num_addrs; j++) {
+		struct in_addr ip_dst;
+
+		get_ipv4_address(port->unit, j, &ip_dst);
+		if (S_debug) {
+			T_LOG("Generating ARP destined to %s %s",
+			    port->ifname, inet_ntoa(ip_dst));
+		}
+		switch_port_send_arp(ext_port,
+		    ARPOP_REPLY,
+		    &ether_external,
+		    ip_src,
+		    &ext_port->member_mac,
+		    ip_dst);
+	}
+}
+
+static void
+mac_nat_test_arp_in(switch_port_list_t port_list)
+{
+	u_int i;
+	struct in_addr ip_src;
+	switch_port_t port;
+
+	ip_src = get_external_ipv4_address();
+	for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+		if (port->mac_nat) {
+			continue;
+		}
+		mac_nat_send_arp_response(port_list->list, port);
+
+		/* receive the generated traffic */
+		switch_port_list_check_receive(port_list, AF_INET, NULL, 0,
+		    validate_mac_nat_arp_in,
+		    port_list->list);
+
+		/* verify that only the single port got the packet */
+		mac_nat_check_received_count(port_list, port);
+	}
+	T_PASS("%s", __func__);
+}
+
+static void
+validate_mac_nat_dhcp(switch_port_t port, const ether_header_t * eh_p,
+    u_int pkt_len, void * context)
+{
+	u_int dp_flags;
+	const struct bootp_packet * pkt;
+	switch_port_t send_port = (switch_port_t)context;
+
+
+	T_QUIET;
+	T_ASSERT_GE(pkt_len, (u_int)sizeof(*pkt), NULL);
+	T_QUIET;
+	T_ASSERT_EQ((int)ntohs(eh_p->ether_type), (int)ETHERTYPE_IP, NULL);
+	pkt = (const struct bootp_packet *)(const void *)(eh_p + 1);
+
+	dp_flags = ntohs(pkt->bp_bootp.bp_unused);
+	if (port->mac_nat) {
+		bool equal;
+
+		/* Broadcast bit must be set */
+		T_QUIET;
+		T_ASSERT_BITS_SET(dp_flags, (u_int)DHCP_FLAGS_BROADCAST,
+		    "%s -> %s: flags 0x%x must have 0x%x",
+		    send_port->member_ifname,
+		    port->member_ifname,
+		    dp_flags, DHCP_FLAGS_BROADCAST);
+
+		/* source must match MAC-NAT interface */
+		equal = (bcmp(eh_p->ether_shost, &port->member_mac,
+		    sizeof(port->member_mac)) == 0);
+		if (!equal) {
+			ethernet_frame_validate(eh_p, pkt_len, true);
+		}
+		T_QUIET;
+		T_ASSERT_TRUE(equal, "%s -> %s source address translated",
+		    send_port->member_ifname,
+		    port->member_ifname);
+	} else {
+		/* Broadcast bit must not be set */
+		T_QUIET;
+		T_ASSERT_BITS_NOTSET(dp_flags, DHCP_FLAGS_BROADCAST,
+		    "%s -> %s flags 0x%x must not have 0x%x",
+		    send_port->member_ifname,
+		    port->member_ifname,
+		    dp_flags, DHCP_FLAGS_BROADCAST);
+		T_QUIET;
+		T_ASSERT_EQ(eh_p->ether_shost[EA_UNIT_INDEX], send_port->unit,
+		    "%s -> %s unit %u expected %u",
+		    send_port->member_ifname,
+		    port->member_ifname,
+		    eh_p->ether_shost[EA_UNIT_INDEX], send_port->unit);
+	}
+	port->test_count++;
+}
+
+static u_int
+make_dhcp_payload(dhcp_min_payload_t payload, ether_addr_t *eaddr)
+{
+	struct bootp * dhcp;
+	u_int payload_length;
+
+	/* create a minimal BOOTP packet */
+	payload_length = sizeof(*payload);
+	dhcp = (struct bootp *)payload;
+	bzero(dhcp, payload_length);
+	dhcp->bp_op = BOOTREQUEST;
+	dhcp->bp_htype = ARPHRD_ETHER;
+	dhcp->bp_hlen = sizeof(*eaddr);
+	bcopy(eaddr->octet, dhcp->bp_chaddr, sizeof(eaddr->octet));
+	return payload_length;
+}
+
+static void
+mac_nat_test_dhcp(switch_port_list_t port_list)
+{
+	u_int i;
+	struct in_addr ip_dst = { INADDR_BROADCAST };
+	struct in_addr ip_src = { INADDR_ANY };
+	switch_port_t port;
+
+	for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+		ether_addr_t eaddr;
+		dhcp_min_payload payload;
+		u_int payload_len;
+
+		if (port->mac_nat) {
+			continue;
+		}
+		set_ethernet_address(&eaddr, port->unit, 0);
+		payload_len = make_dhcp_payload(&payload, &eaddr);
+		if (S_debug) {
+			T_LOG("%s: transmit DHCP packet (member %s)",
+			    port->ifname, port->member_ifname);
+		}
+		switch_port_send_udp(port,
+		    AF_INET,
+		    &eaddr,
+		    (union ifbrip *)&ip_src,
+		    BOOTP_CLIENT_PORT,
+		    &ether_broadcast,
+		    (union ifbrip *)&ip_dst,
+		    BOOTP_SERVER_PORT,
+		    &payload,
+		    payload_len);
+
+		switch_port_list_check_receive(port_list, AF_INET, NULL, 0,
+		    validate_mac_nat_dhcp,
+		    port);
+
+		check_received_count(port_list, port, 1);
+	}
+	T_PASS("%s", __func__);
+}
+
+
+static void
+validate_mac_nat_nd6(switch_port_t port,
+    const struct icmp6_hdr * icmp6,
+    u_int icmp6_len,
+    uint8_t opt_type,
+    u_int nd_hdr_size,
+    switch_port_t send_port)
+{
+	const uint8_t * linkaddr;
+	const uint8_t * ptr;
+	const struct nd_opt_hdr * nd_opt;
+	u_int nd_size;
+
+	ptr = (const uint8_t *)icmp6;
+	nd_size = nd_hdr_size + LINKADDR_OPT_LEN;
+	if (icmp6_len < nd_size) {
+		/* no LINKADDR option */
+		return;
+	}
+	nd_opt = (const struct nd_opt_hdr *)(const void *)(ptr + nd_hdr_size);
+	T_QUIET;
+	T_ASSERT_EQ(nd_opt->nd_opt_type, opt_type, NULL);
+	T_QUIET;
+	T_ASSERT_EQ(GET_ND_OPT_LEN(nd_opt->nd_opt_len),
LINKADDR_OPT_LEN, NULL); + linkaddr = (const uint8_t *)(nd_opt + 1); + if (port->mac_nat) { + bool equal; + + equal = (bcmp(linkaddr, &port->member_mac, + sizeof(port->member_mac)) == 0); + T_QUIET; + T_ASSERT_TRUE(equal, "%s -> %s sender hardware translated", + send_port->member_ifname, + port->member_ifname); + } else { + /* source hw must match the sender */ + T_QUIET; + T_ASSERT_EQ(linkaddr[EA_UNIT_INDEX], send_port->unit, + "%s -> %s unit %u expected %u", + send_port->member_ifname, + port->member_ifname, + linkaddr[EA_UNIT_INDEX], send_port->unit); + } +} + +static void +validate_mac_nat_icmp6_out(switch_port_t port, const struct icmp6_hdr * icmp6, + u_int icmp6_len, switch_port_t send_port) +{ + switch (icmp6->icmp6_type) { + case ND_NEIGHBOR_ADVERT: + validate_mac_nat_nd6(port, icmp6, icmp6_len, + ND_OPT_TARGET_LINKADDR, + sizeof(struct nd_neighbor_advert), + send_port); + break; + case ND_NEIGHBOR_SOLICIT: + validate_mac_nat_nd6(port, icmp6, icmp6_len, + ND_OPT_SOURCE_LINKADDR, + sizeof(struct nd_neighbor_solicit), + send_port); + break; + case ND_ROUTER_SOLICIT: + validate_mac_nat_nd6(port, icmp6, icmp6_len, + ND_OPT_SOURCE_LINKADDR, + sizeof(struct nd_router_solicit), + send_port); + break; + default: + T_FAIL("Unsupported icmp6 type %d", icmp6->icmp6_type); + break; + } +} + +static void +validate_mac_nat_nd6_out(switch_port_t port, const ether_header_t * eh_p, + u_int pkt_len, void * context) +{ + const struct icmp6_hdr * icmp6; + const struct ip6_hdr * ip6; + switch_port_t send_port = (switch_port_t)context; + + if (S_debug) { + T_LOG("%s received %u bytes", port->member_ifname, pkt_len); + ethernet_frame_validate(eh_p, pkt_len, true); + } + T_QUIET; + T_ASSERT_EQ(ntohs(eh_p->ether_type), (u_short)ETHERTYPE_IPV6, NULL); + ip6 = (const struct ip6_hdr *)(const void *)(eh_p + 1); + icmp6 = (const struct icmp6_hdr *)(const void *)(ip6 + 1); + T_QUIET; + T_ASSERT_GE(pkt_len, (u_int)MIN_ICMP6_LEN, NULL); + T_QUIET; + T_ASSERT_EQ(ip6->ip6_nxt, IPPROTO_ICMPV6, NULL); + + /* validate the ethernet header */ + if (port->mac_nat) { + bool equal; + + /* source ethernet must match MAC-NAT interface */ + equal = (bcmp(eh_p->ether_shost, &port->member_mac, + sizeof(port->member_mac)) == 0); + if (!equal) { + ethernet_frame_validate(eh_p, pkt_len, true); + } + T_QUIET; + T_ASSERT_TRUE(equal, "%s -> %s source address translated", + send_port->member_ifname, + port->member_ifname); + } else { + /* source ethernet must match the sender */ + T_QUIET; + T_ASSERT_EQ(eh_p->ether_shost[EA_UNIT_INDEX], send_port->unit, + "%s -> %s unit %u expected %u", + send_port->member_ifname, + port->member_ifname, + eh_p->ether_shost[EA_UNIT_INDEX], send_port->unit); + } + /* validate the icmp6 payload */ + validate_mac_nat_icmp6_out(port, icmp6, + pkt_len - ETHER_IPV6_LEN, + send_port); + port->test_count++; +} + +static void +mac_nat_test_nd6_out(switch_port_list_t port_list) +{ + ether_addr_t * ext_mac; + switch_port_t ext_port; + u_int i; + union ifbrip ip_dst; + switch_port_t port; + + get_external_ip_address(AF_INET6, &ip_dst); + ext_port = port_list->list; + T_QUIET; + T_ASSERT_TRUE(ext_port->mac_nat, NULL); + ext_mac = &ext_port->member_mac; + for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { + if (port->mac_nat) { + continue; + } + /* neighbor solicit */ + for (u_int j = 0; j < port->num_addrs; j++) { + ether_addr_t eaddr; + union ifbrip ip_src; + + set_ethernet_address(&eaddr, port->unit, j); + get_ip_address(AF_INET6, port->unit, j, &ip_src); + switch_port_send_nd6(port, + 
ND_NEIGHBOR_SOLICIT,
+			    &eaddr,
+			    &ip_src.ifbrip_addr6,
+			    NULL,
+			    NULL,
+			    &ip_dst.ifbrip_addr6);
+			switch_port_list_check_receive(port_list, AF_INET,
+			    NULL, 0,
+			    validate_mac_nat_nd6_out,
+			    port);
+			check_received_count(port_list, port, 1);
+		}
+		/* neighbor advert */
+		for (u_int j = 0; j < port->num_addrs; j++) {
+			ether_addr_t eaddr;
+			union ifbrip ip_src;
+
+			set_ethernet_address(&eaddr, port->unit, j);
+			get_ip_address(AF_INET6, port->unit, j, &ip_src);
+			switch_port_send_nd6(port,
+			    ND_NEIGHBOR_ADVERT,
+			    &eaddr,
+			    &ip_src.ifbrip_addr6,
+			    NULL,
+			    &eaddr,
+			    &ip_src.ifbrip_addr6);
+			switch_port_list_check_receive(port_list, AF_INET,
+			    NULL, 0,
+			    validate_mac_nat_nd6_out,
+			    port);
+			check_received_count(port_list, port, 1);
+		}
+		/* router solicit */
+		for (u_int j = 0; j < port->num_addrs; j++) {
+			ether_addr_t eaddr;
+			union ifbrip ip_src;
+
+			set_ethernet_address(&eaddr, port->unit, j);
+			get_ip_address(AF_INET6, port->unit, j, &ip_src);
+			//get_ipv6ll_address(port->unit, j, &ip_src.ifbrip_addr6);
+			switch_port_send_nd6(port,
+			    ND_ROUTER_SOLICIT,
+			    &eaddr,
+			    &ip_src.ifbrip_addr6,
+			    NULL,
+			    NULL,
+			    NULL);
+			switch_port_list_check_receive(port_list, AF_INET,
+			    NULL, 0,
+			    validate_mac_nat_nd6_out,
+			    port);
+			check_received_count(port_list, port, 1);
+		}
+	}
+	T_PASS("%s", __func__);
+}
+
+static void
+mac_nat_send_response(switch_port_t ext_port, uint8_t af, switch_port_t port)
+{
+	union ifbrip src_ip;
+
+	T_QUIET;
+	T_ASSERT_TRUE(ext_port->mac_nat, "%s is MAC-NAT interface",
+	    ext_port->member_ifname);
+	if (S_debug) {
+		T_LOG("Generating UDP traffic destined to %s", port->ifname);
+	}
+	get_external_ip_address(af, &src_ip);
+	for (u_int j = 0; j < port->num_addrs; j++) {
+		union ifbrip ip;
+
+		get_ip_address(af, port->unit, j, &ip);
+		switch_port_send_udp(ext_port,
+		    af,
+		    &ether_external,
+		    &src_ip,
+		    TEST_DEST_PORT,
+		    &ext_port->member_mac,
+		    &ip,
+		    TEST_SOURCE_PORT,
+		    NULL, 0);
+	}
+}
+
+
+static void
+mac_nat_test_ip_once(switch_port_list_t port_list, uint8_t af, bool retry)
+{
+	union ifbrip dst_ip;
+	u_int i;
+	switch_port_t port;
+
+	get_external_ip_address(af, &dst_ip);
+	for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+		if (port->test_address_count == port->num_addrs) {
+			/* already populated */
+			continue;
+		}
+		if (S_debug) {
+			T_LOG("Sending on %s", port->ifname);
+		}
+		for (u_int j = 0; j < port->num_addrs; j++) {
+			uint32_t generation;
+
+			if (retry) {
+				uint64_t addr_bit;
+
+				addr_bit = 1 << j;
+				if ((port->test_address_present & addr_bit)
+				    != 0) {
+					/* already present */
+					continue;
+				}
+				T_LOG("Retry port %s unit %u address %u",
+				    port->ifname, port->unit, j);
+			}
+
+			generation = next_generation();
+			send_generation(port,
+			    af,
+			    j,
+			    &ether_external,
+			    &dst_ip,
+			    generation);
+
+			/* receive across all ports */
+			check_receive_generation(port_list,
+			    af,
+			    generation,
+			    validate_mac_nat,
+			    NULL);
+
+			/* ensure that every port saw the packet */
+			check_received_count(port_list, port, 1);
+		}
+	}
+	return;
+}
+
+static void
+mac_nat_test_ip(switch_port_list_t port_list, uint8_t af)
+{
+	u_int i;
+	switch_port_t port;
+	bool verified = false;
+
+	/*
+	 * Send a packet from every port in the list so that the bridge
+	 * learns the MAC addresses and IP addresses.
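+	 * (A hedged illustration: once learning completes, the bindings
+	 * can be dumped with the helpers defined later in this file, e.g.
+	 *
+	 *	u_int count;
+	 *	struct ifbrmne *entries = bridge_mac_nat_entries_copy(&count);
+	 *	if (entries != NULL) {
+	 *		bridge_mac_nat_entries_log(entries, count);
+	 *		free(entries);
+	 *	}
+	 *
+	 * each entry pairing an internal IP address with its MAC address
+	 * and member interface.)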
+	 */
+#define MAC_NAT_MAX_TRIES 20
+	for (int try = 1; try < MAC_NAT_MAX_TRIES; try++) {
+		bool retry = (try > 1);
+
+		if (!retry) {
+			T_LOG("%s: #ports %u #addrs %u",
+			    __func__,
+			    port_list->count, port_list->list->num_addrs);
+		} else {
+			T_LOG("%s: #ports %u #addrs %u (TRY=%d)",
+			    __func__,
+			    port_list->count, port_list->list->num_addrs,
+			    try);
+		}
+		mac_nat_test_ip_once(port_list, af, retry);
+		/*
+		 * In the event of a memory allocation failure, it's possible
+		 * that the address was not learned. Figure out whether
+		 * all addresses are present, and if not, we'll retry on
+		 * those that are not present.
+		 */
+		verified = switch_port_list_verify_mac_nat(port_list, false);
+		if (verified) {
+			break;
+		}
+		/* wait a short time to allow the system to recover */
+		usleep(100 * 1000);
+	}
+	T_QUIET;
+	T_ASSERT_TRUE(verified, "All addresses present");
+
+	/*
+	 * The bridge now has an IP address <-> MAC address binding for every
+	 * address on each internal interface.
+	 *
+	 * Generate an inbound packet on the MAC-NAT interface targeting
+	 * each interface address. Verify that the packet appears on
+	 * the appropriate internal address with appropriate translation.
+	 */
+	for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+		if (port->mac_nat) {
+			continue;
+		}
+		mac_nat_send_response(port_list->list, af, port);
+
+		/* receive the generated traffic */
+		switch_port_list_check_receive(port_list, AF_INET, NULL, 0,
+		    validate_mac_nat_in,
+		    NULL);
+
+		/* verify that only the single port got the packet */
+		mac_nat_check_received_count(port_list, port);
+	}
+	T_PASS("%s", __func__);
+}
+
+/**
+** interface management
+**/
+
+static int
+ifnet_get_lladdr(int s, const char * ifname, ether_addr_t * eaddr)
+{
+	int err;
+	struct ifreq ifr;
+
+	bzero(&ifr, sizeof(ifr));
+	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+	ifr.ifr_addr.sa_family = AF_LINK;
+	ifr.ifr_addr.sa_len = ETHER_ADDR_LEN;
+	err = ioctl(s, SIOCGIFLLADDR, &ifr);
+	T_QUIET;
+	T_ASSERT_POSIX_SUCCESS(err, "SIOCGIFLLADDR %s", ifname);
+	bcopy(ifr.ifr_addr.sa_data, eaddr->octet, ETHER_ADDR_LEN);
+	return err;
+}
+
+
+static int
+ifnet_attach_ip(int s, char * name)
+{
+	int err;
+	struct ifreq ifr;
+
+	bzero(&ifr, sizeof(ifr));
+	strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
+	err = ioctl(s, SIOCPROTOATTACH, &ifr);
+	T_QUIET;
+	T_ASSERT_POSIX_SUCCESS(err, "SIOCPROTOATTACH %s", ifr.ifr_name);
+	return err;
+}
+
+#if 0
+static int
+ifnet_detach_ip(int s, char * name)
+{
+	int err;
+	struct ifreq ifr;
+
+	bzero(&ifr, sizeof(ifr));
+	strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
+	err = ioctl(s, SIOCPROTODETACH, &ifr);
+	T_QUIET;
+	T_ASSERT_POSIX_SUCCESS(err, "SIOCPROTODETACH %s", ifr.ifr_name);
+	return err;
+}
+#endif
+
+static int
+ifnet_destroy(int s, const char * ifname, bool fail_on_error)
+{
+	int err;
+	struct ifreq ifr;
+
+	bzero(&ifr, sizeof(ifr));
+	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+	err = ioctl(s, SIOCIFDESTROY, &ifr);
+	if (fail_on_error) {
+		T_QUIET;
+		T_ASSERT_POSIX_SUCCESS(err, "SIOCIFDESTROY %s", ifr.ifr_name);
+	}
+	if (err < 0) {
+		T_LOG("SIOCIFDESTROY %s", ifr.ifr_name);
+	}
+	return err;
+}
+
+static int
+ifnet_set_flags(int s, const char * ifname,
+    uint16_t flags_set, uint16_t flags_clear)
+{
+	uint16_t flags_after;
+	uint16_t flags_before;
+	struct ifreq ifr;
+	int ret;
+
+	bzero(&ifr, sizeof(ifr));
+	strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+	ret = ioctl(s, SIOCGIFFLAGS, (caddr_t)&ifr);
+	if (ret != 0) {
+		T_LOG("SIOCGIFFLAGS %s",
+		    ifr.ifr_name);
+		return ret;
+	}
+	flags_before = (uint16_t)ifr.ifr_flags;
+	ifr.ifr_flags |= flags_set;
+	ifr.ifr_flags &= ~(flags_clear);
+	flags_after = (uint16_t)ifr.ifr_flags;
+	if (flags_before == flags_after) {
+		/* nothing to do */
+		ret = 0;
+	} else {
+		/* issue the ioctl */
+		T_QUIET;
+		T_ASSERT_POSIX_SUCCESS(ioctl(s, SIOCSIFFLAGS, &ifr),
+		    "SIOCSIFFLAGS %s 0x%x",
+		    ifr.ifr_name, (uint16_t)ifr.ifr_flags);
+		if (S_debug) {
+			T_LOG("setflags(%s set 0x%x clear 0x%x) 0x%x => 0x%x",
+			    ifr.ifr_name, flags_set, flags_clear,
+			    flags_before, flags_after);
+		}
+	}
+	return ret;
+}
+
+#define BRIDGE_NAME "bridge"
+#define BRIDGE200 BRIDGE_NAME "200"
+
+#define FETH_NAME "feth"
+
+/* On some platforms with DEBUG kernel, we need to wait a while */
+#define SIFCREATE_RETRY 600
+
+static int
+ifnet_create(int s, const char * ifname)
+{
+	int error = 0;
+	struct ifreq ifr;
+
+	bzero(&ifr, sizeof(ifr));
+	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+
+	for (int i = 0; i < SIFCREATE_RETRY; i++) {
+		if (ioctl(s, SIOCIFCREATE, &ifr) < 0) {
+			error = errno;
+			T_LOG("SIOCIFCREATE %s: %s", ifname,
+			    strerror(error));
+			if (error == EBUSY) {
+				/* interface is tearing down, try again */
+				usleep(10000);
+			} else if (error == EEXIST) {
+				/* interface exists, try destroying it */
+				(void)ifnet_destroy(s, ifname, false);
+			} else {
+				/* unexpected failure */
+				break;
+			}
+		} else {
+			error = 0;
+			break;
+		}
+	}
+	if (error == 0) {
+		error = ifnet_set_flags(s, ifname, IFF_UP, 0);
+	}
+	return error;
+}
+
+static int
+siocdrvspec(int s, const char * ifname,
+    u_long op, void *arg, size_t argsize, bool set)
+{
+	struct ifdrv ifd;
+
+	memset(&ifd, 0, sizeof(ifd));
+	strlcpy(ifd.ifd_name, ifname, sizeof(ifd.ifd_name));
+	ifd.ifd_cmd = op;
+	ifd.ifd_len = argsize;
+	ifd.ifd_data = arg;
+	return ioctl(s, set ? SIOCSDRVSPEC : SIOCGDRVSPEC, &ifd);
+}
+
+
+static int
+fake_set_peer(int s, const char * feth, const char * feth_peer)
+{
+	struct if_fake_request iffr;
+	int ret;
+
+	bzero((char *)&iffr, sizeof(iffr));
+	if (feth_peer != NULL) {
+		strlcpy(iffr.iffr_peer_name, feth_peer,
+		    sizeof(iffr.iffr_peer_name));
+	}
+	ret = siocdrvspec(s, feth, IF_FAKE_S_CMD_SET_PEER,
+	    &iffr, sizeof(iffr), true);
+	T_QUIET;
+	T_ASSERT_POSIX_SUCCESS(ret,
+	    "SIOCDRVSPEC(%s, IF_FAKE_S_CMD_SET_PEER, %s)",
+	    feth, (feth_peer != NULL) ?
feth_peer : ""); + return ret; +} + +static int +bridge_add_member(int s, const char * bridge, const char * member) +{ + struct ifbreq req; + int ret; + + memset(&req, 0, sizeof(req)); + strlcpy(req.ifbr_ifsname, member, sizeof(req.ifbr_ifsname)); + ret = siocdrvspec(s, bridge, BRDGADD, &req, sizeof(req), true); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "%s %s %s", __func__, bridge, member); + return ret; +} + + +static int +bridge_set_mac_nat(int s, const char * bridge, const char * member, bool enable) +{ + uint32_t flags; + bool need_set = false; + struct ifbreq req; + int ret; + + memset(&req, 0, sizeof(req)); + strlcpy(req.ifbr_ifsname, member, sizeof(req.ifbr_ifsname)); + ret = siocdrvspec(s, bridge, BRDGGIFFLGS, &req, sizeof(req), false); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "BRDGGIFFLGS %s %s", bridge, member); + flags = req.ifbr_ifsflags; + if (enable) { + if ((flags & IFBIF_MAC_NAT) == 0) { + need_set = true; + req.ifbr_ifsflags |= IFBIF_MAC_NAT; + } + /* need to set it */ + } else if ((flags & IFBIF_MAC_NAT) != 0) { + /* need to clear it */ + need_set = true; + req.ifbr_ifsflags &= ~(uint32_t)IFBIF_MAC_NAT; + } + if (need_set) { + ret = siocdrvspec(s, bridge, BRDGSIFFLGS, + &req, sizeof(req), true); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "BRDGSIFFLGS %s %s 0x%x => 0x%x", + bridge, member, + flags, req.ifbr_ifsflags); + } + return ret; +} + +static struct ifbareq * +bridge_rt_table_copy_common(const char * bridge, u_int * ret_count) +{ + struct ifbaconf ifbac; + u_int len = 8 * 1024; + char * inbuf = NULL; + char * ninbuf; + int ret; + struct ifbareq * rt_table = NULL; + int s; + + s = inet_dgram_socket(); + + /* + * BRDGRTS should work like other ioctl's where passing in NULL + * for the buffer says "tell me how many there are". Unfortunately, + * it doesn't so we have to pass in a buffer, then check that it + * was too big. 
+ */ + for (;;) { + ninbuf = realloc(inbuf, len); + T_QUIET; + T_ASSERT_NOTNULL((void *)ninbuf, "realloc %u", len); + ifbac.ifbac_len = len; + ifbac.ifbac_buf = inbuf = ninbuf; + ret = siocdrvspec(s, bridge, BRDGRTS, + &ifbac, sizeof(ifbac), false); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "%s %s", __func__, bridge); + if ((ifbac.ifbac_len + sizeof(*rt_table)) < len) { + /* we passed a buffer larger than what was required */ + break; + } + len *= 2; + } + if (ifbac.ifbac_len == 0) { + free(ninbuf); + T_LOG("No bridge routing entries"); + goto done; + } + *ret_count = ifbac.ifbac_len / sizeof(*rt_table); + rt_table = (struct ifbareq *)(void *)ninbuf; +done: + if (rt_table == NULL) { + *ret_count = 0; + } + if (s >= 0) { + close(s); + } + return rt_table; +} + +static struct ifbareq * +bridge_rt_table_copy(u_int * ret_count) +{ + return bridge_rt_table_copy_common(BRIDGE200, ret_count); +} + +static void +bridge_rt_table_log(struct ifbareq *rt_table, u_int count) +{ + u_int i; + char ntoabuf[ETHER_NTOA_BUFSIZE]; + struct ifbareq * ifba; + + for (i = 0, ifba = rt_table; i < count; i++, ifba++) { + ether_ntoa_buf((const ether_addr_t *)&ifba->ifba_dst, + ntoabuf, sizeof(ntoabuf)); + T_LOG("%s %s %lu", ifba->ifba_ifsname, ntoabuf, + ifba->ifba_expire); + } + return; +} + +static struct ifbrmne * +bridge_mac_nat_entries_copy_common(const char * bridge, u_int * ret_count) +{ + char * buf = NULL; + u_int count = 0; + int err; + u_int i; + struct ifbrmnelist mnl; + struct ifbrmne * ret_list = NULL; + int s; + char * scan; + + + s = inet_dgram_socket(); + + /* find out how many there are */ + bzero(&mnl, sizeof(mnl)); + err = siocdrvspec(s, bridge, BRDGGMACNATLIST, &mnl, sizeof(mnl), false); + if (err != 0 && S_cleaning_up) { + T_LOG("BRDGGMACNATLIST %s failed %d", bridge, errno); + goto done; + } + T_QUIET; + T_ASSERT_POSIX_SUCCESS(err, "BRDGGMACNATLIST %s", bridge); + T_QUIET; + T_ASSERT_GE(mnl.ifbml_elsize, (uint16_t)sizeof(struct ifbrmne), + "mac nat entry size %u minsize %u", + mnl.ifbml_elsize, (u_int)sizeof(struct ifbrmne)); + if (mnl.ifbml_len == 0) { + goto done; + } + + /* call again with a buffer large enough to hold them */ + buf = malloc(mnl.ifbml_len); + T_QUIET; + T_ASSERT_NOTNULL(buf, "mac nat entries buffer"); + mnl.ifbml_buf = buf; + err = siocdrvspec(s, bridge, BRDGGMACNATLIST, &mnl, sizeof(mnl), false); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(err, "BRDGGMACNATLIST %s", bridge); + count = mnl.ifbml_len / mnl.ifbml_elsize; + if (count == 0) { + goto done; + } + if (mnl.ifbml_elsize == sizeof(struct ifbrmne)) { + /* element size is expected size, no need to "right-size" it */ + ret_list = (struct ifbrmne *)(void *)buf; + buf = NULL; + goto done; + } + /* element size is larger than we expect, create a "right-sized" array */ + ret_list = malloc(count * sizeof(*ret_list)); + T_QUIET; + T_ASSERT_NOTNULL(ret_list, "mac nat entries list"); + for (i = 0, scan = buf; i < count; i++, scan += mnl.ifbml_elsize) { + struct ifbrmne * ifbmne; + + ifbmne = (struct ifbrmne *)(void *)scan; + ret_list[i] = *ifbmne; + } +done: + if (s >= 0) { + close(s); + } + if (buf != NULL) { + free(buf); + } + *ret_count = count; + return ret_list; +} + +static struct ifbrmne * +bridge_mac_nat_entries_copy(u_int * ret_count) +{ + return bridge_mac_nat_entries_copy_common(BRIDGE200, ret_count); +} + +static void +bridge_mac_nat_entries_log(struct ifbrmne * entries, u_int count) +{ + u_int i; + char ntoabuf[ETHER_NTOA_BUFSIZE]; + char ntopbuf[INET6_ADDRSTRLEN]; + struct ifbrmne * scan; + + for (i = 0, scan = entries; i 
< count; i++, scan++) { + ether_ntoa_buf((const ether_addr_t *)&scan->ifbmne_mac, + ntoabuf, sizeof(ntoabuf)); + inet_ntop(scan->ifbmne_af, &scan->ifbmne_ip, + ntopbuf, sizeof(ntopbuf)); + printf("%s %s %s %lu\n", + scan->ifbmne_ifname, ntopbuf, ntoabuf, + (unsigned long)scan->ifbmne_expire); + } + return; +} + +/** +** Test Main +**/ +static u_int S_n_ports; +static switch_port_list_t S_port_list; + +static void +bridge_cleanup(const char * bridge, u_int n_ports, bool fail_on_error); + +static void +cleanup_common(bool dump_table) +{ + if (S_n_ports == 0) { + return; + } + S_cleaning_up = true; + if ((S_port_list != NULL && S_port_list->mac_nat) + || (dump_table && S_port_list != NULL)) { + switch_port_list_log(S_port_list); + if (S_port_list->mac_nat) { + switch_port_list_verify_mac_nat(S_port_list, true); + } + (void)switch_port_list_verify_rt_table(S_port_list, true); + } + if (S_debug) { + T_LOG("sleeping for 5 seconds\n"); + sleep(5); + } + bridge_cleanup(BRIDGE200, S_n_ports, false); + return; +} + +static void +cleanup(void) +{ + cleanup_common(true); + return; +} + +static void +sigint_handler(__unused int sig) +{ + cleanup_common(false); + signal(SIGINT, SIG_DFL); +} + +static switch_port_list_t +bridge_setup(char * bridge, u_int n_ports, u_int num_addrs, bool mac_nat) +{ + errno_t err; + switch_port_list_t list = NULL; + int s; + + S_n_ports = n_ports; + T_ATEND(cleanup); + T_SETUPBEGIN; + s = inet_dgram_socket(); + err = ifnet_create(s, bridge); + if (err != 0) { + goto done; + } + list = switch_port_list_alloc(n_ports, mac_nat); + for (u_int i = 0; i < n_ports; i++) { + bool do_mac_nat; + char ifname[IFNAMSIZ]; + char member_ifname[IFNAMSIZ]; + ether_addr_t member_mac; + + snprintf(ifname, sizeof(ifname), "%s%d", + FETH_NAME, i); + snprintf(member_ifname, sizeof(member_ifname), "%s%d", + FETH_NAME, i + n_ports); + err = ifnet_create(s, ifname); + if (err != 0) { + goto done; + } + ifnet_attach_ip(s, ifname); + err = ifnet_create(s, member_ifname); + if (err != 0) { + goto done; + } + err = ifnet_get_lladdr(s, member_ifname, &member_mac); + if (err != 0) { + goto done; + } + err = fake_set_peer(s, ifname, member_ifname); + if (err != 0) { + goto done; + } + /* add the interface's peer to the bridge */ + err = bridge_add_member(s, bridge, member_ifname); + if (err != 0) { + goto done; + } + + do_mac_nat = (i == 0 && mac_nat); + if (do_mac_nat) { + /* enable MAC NAT on unit 0 */ + err = bridge_set_mac_nat(s, bridge, member_ifname, + true); + if (err != 0) { + goto done; + } + } + /* we'll send/receive on the interface */ + err = switch_port_list_add_port(list, i, ifname, member_ifname, + &member_mac, num_addrs, + do_mac_nat); + if (err != 0) { + goto done; + } + } +done: + if (s >= 0) { + close(s); + } + if (err != 0 && list != NULL) { + switch_port_list_dealloc(list); + list = NULL; + } + T_SETUPEND; + return list; +} + +static void +bridge_cleanup(const char * bridge, u_int n_ports, bool fail_on_error) +{ + int s; + + s = inet_dgram_socket(); + ifnet_destroy(s, bridge, fail_on_error); + for (u_int i = 0; i < n_ports; i++) { + char ifname[IFNAMSIZ]; + char member_ifname[IFNAMSIZ]; + + snprintf(ifname, sizeof(ifname), "%s%d", + FETH_NAME, i); + snprintf(member_ifname, sizeof(member_ifname), "%s%d", + FETH_NAME, i + n_ports); + ifnet_destroy(s, ifname, fail_on_error); + ifnet_destroy(s, member_ifname, fail_on_error); + } + if (s >= 0) { + close(s); + } + S_n_ports = 0; + return; +} + +/* + * Basic Bridge Tests + * + * Broadcast + * - two cases: actual broadcast, unknown ethernet + 
* - send broadcast packets + * - verify all received + * - check bridge rt list contains all expected MAC addresses + * - send unicast ARP packets + * - verify packets received only on expected port + * + * MAC-NAT + * - verify ARP translation + * - verify IPv4 translation + * - verify DHCP broadcast bit conversion + * - verify IPv6 translation + * - verify ND6 translation (Neighbor, Router) + */ + +static void +bridge_test(packet_validator_t validator, + void * context, + const ether_addr_t * dst_eaddr, + uint8_t af, u_int n_ports, u_int num_addrs) +{ +#if TARGET_OS_BRIDGE + T_SKIP("Test uses too much memory"); +#else /* TARGET_OS_BRIDGE */ + switch_port_list_t port_list; + + signal(SIGINT, sigint_handler); + port_list = bridge_setup(BRIDGE200, n_ports, num_addrs, false); + if (port_list == NULL) { + T_FAIL("bridge_setup"); + return; + } + S_port_list = port_list; + bridge_learning_test(port_list, af, validator, context, dst_eaddr); + + //T_LOG("Sleeping for 5 seconds"); + //sleep(5); + bridge_cleanup(BRIDGE200, n_ports, true); + switch_port_list_dealloc(port_list); + return; +#endif /* TARGET_OS_BRIDGE */ +} + +static void +bridge_test_mac_nat_ipv4(u_int n_ports, u_int num_addrs) +{ +#if TARGET_OS_BRIDGE + T_SKIP("Test uses too much memory"); +#else /* TARGET_OS_BRIDGE */ + switch_port_list_t port_list; + + signal(SIGINT, sigint_handler); + port_list = bridge_setup(BRIDGE200, n_ports, num_addrs, true); + if (port_list == NULL) { + T_FAIL("bridge_setup"); + return; + } + S_port_list = port_list; + + /* verify that IPv4 packets get translated when necessary */ + mac_nat_test_ip(port_list, AF_INET); + + /* verify the DHCP broadcast bit gets set appropriately */ + mac_nat_test_dhcp(port_list); + + /* verify that ARP packet gets translated when necessary */ + mac_nat_test_arp_out(port_list); + mac_nat_test_arp_in(port_list); + + if (S_debug) { + T_LOG("Sleeping for 5 seconds"); + sleep(5); + } + bridge_cleanup(BRIDGE200, n_ports, true); + switch_port_list_dealloc(port_list); + return; +#endif /* TARGET_OS_BRIDGE */ +} + +static void +bridge_test_mac_nat_ipv6(u_int n_ports, u_int num_addrs) +{ +#if TARGET_OS_BRIDGE + T_SKIP("Test uses too much memory"); +#else /* TARGET_OS_BRIDGE */ + switch_port_list_t port_list; + + signal(SIGINT, sigint_handler); + port_list = bridge_setup(BRIDGE200, n_ports, num_addrs, true); + if (port_list == NULL) { + T_FAIL("bridge_setup"); + return; + } + S_port_list = port_list; + + /* verify that IPv6 packets get translated when necessary */ + mac_nat_test_ip(port_list, AF_INET6); + + /* verify that ND6 packet gets translated when necessary */ + mac_nat_test_nd6_out(port_list); + if (S_debug) { + T_LOG("Sleeping for 5 seconds"); + sleep(5); + } + bridge_cleanup(BRIDGE200, n_ports, true); + switch_port_list_dealloc(port_list); + return; +#endif /* TARGET_OS_BRIDGE */ +} + +static void +system_cmd(const char *cmd, bool fail_on_error) +{ + pid_t pid = -1; + int exit_status = 0; + const char *argv[] = { + "/usr/local/bin/bash", + "-c", + cmd, + NULL + }; + + int rc = dt_launch_tool(&pid, (char **)(void *)argv, false, NULL, NULL); + T_QUIET; + T_ASSERT_EQ(rc, 0, "dt_launch_tool(%s) failed", cmd); + + if (dt_waitpid(pid, &exit_status, NULL, 30)) { + T_QUIET; + T_ASSERT_MACH_SUCCESS(exit_status, "command(%s)", cmd); + } else { + if (fail_on_error) { + T_FAIL("dt_waitpid(%s) failed", cmd); + } + } +} + +static void +cleanup_pf(void) +{ + struct ifbrparam param; + int s = inet_dgram_socket(); + + system_cmd("pfctl -d", false); + system_cmd("pfctl -F all", false); + + 
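+	/*
+	 * Clearing ifbrp_filter below removes the
+	 * IFBF_FILT_MEMBER | IFBF_FILT_ONLYIP filtering that
+	 * block_all_traffic() installs via BRDGSFILT, returning the
+	 * bridge to its default, unfiltered behavior.
+	 */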
+	param.ifbrp_filter = 0;
+	siocdrvspec(s, BRIDGE200, BRDGSFILT,
+	    &param, sizeof(param), true);
+	return;
+}
+
+static void
+block_all_traffic(bool input, const char* infname1, const char* infname2)
+{
+	int s = inet_dgram_socket();
+	int ret;
+	struct ifbrparam param;
+	char command[512];
+	const char *dir = input ? "in" : "out";
+
+	snprintf(command, sizeof(command), "echo \"block %s on %s all\nblock %s on %s all\n\" | pfctl -vvv -f -",
+	    dir, infname1, dir, infname2);
+	/* enable block all filter */
+	param.ifbrp_filter = IFBF_FILT_MEMBER | IFBF_FILT_ONLYIP;
+	ret = siocdrvspec(s, BRIDGE200, BRDGSFILT,
+	    &param, sizeof(param), true);
+	T_ASSERT_POSIX_SUCCESS(ret,
+	    "SIOCDRVSPEC(BRDGSFILT %s, 0x%x)",
+	    BRIDGE200, param.ifbrp_filter);
+	// ignore errors so that a missing pf.os file doesn't cause failures
+	system_cmd(command, false);
+	system_cmd("pfctl -e", true);
+	system_cmd("pfctl -s all", true);
+}
+
+/*
+ * Basic bridge filter test
+ *
+ * For both broadcast and unicast transfers, ensure that data can
+ * be blocked using pf on the bridge.
+ */
+
+static void
+filter_test(uint8_t af)
+{
+#if TARGET_OS_BRIDGE
+	T_SKIP("pfctl isn't valid on this platform");
+#else /* TARGET_OS_BRIDGE */
+	switch_port_list_t port_list;
+	switch_port_t port;
+	const u_int n_ports = 2;
+	u_int num_addrs = 1;
+	u_int i;
+	char ntoabuf[ETHER_NTOA_BUFSIZE];
+	union ifbrip dst_ip;
+	bool blocked = true;
+	bool input = true;
+	const char* ifnames[2];
+
+	signal(SIGINT, sigint_handler);
+
+	T_ATEND(cleanup);
+	T_ATEND(cleanup_pf);
+
+	port_list = bridge_setup(BRIDGE200, n_ports, num_addrs, false);
+	if (port_list == NULL) {
+		T_FAIL("bridge_setup");
+		return;
+	}
+
+	ether_ntoa_buf(&ether_broadcast, ntoabuf, sizeof(ntoabuf));
+
+	S_port_list = port_list;
+	for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+		ifnames[i] = port->member_ifname;
+	}
+
+	get_broadcast_ip_address(af, &dst_ip);
+	do {
+		do {
+			if (blocked) {
+				block_all_traffic(input, ifnames[0], ifnames[1]);
+			}
+			for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+				if (S_debug) {
+					T_LOG("Sending on %s", port->ifname);
+				}
+				for (u_int j = 0; j < port->num_addrs; j++) {
+					uint32_t generation;
+
+					generation = next_generation();
+					send_generation(port,
+					    af,
+					    j,
+					    &ether_broadcast,
+					    &dst_ip,
+					    generation);
+
+					/* receive across all ports */
+					check_receive_generation(port_list,
+					    af,
+					    generation,
+					    validate_broadcast_dhost,
+					    NULL);
+
+					/* ensure that every port saw the right number of packets */
+					if (blocked) {
+						check_received_count(port_list, port, 0);
+					} else {
+						check_received_count(port_list, port, 1);
+					}
+				}
+			}
+			T_PASS("%s broadcast %s %s", __func__, blocked ? "blocked" : "not blocked", input ? "input" : "output");
+			input = !input;
+			cleanup_pf();
+		} while (input == false && blocked);
+		blocked = !blocked;
+	} while (blocked == false);
+
+	do {
+		do {
+			if (blocked) {
+				block_all_traffic(input, ifnames[0], ifnames[1]);
+			}
+			for (i = 0, port = port_list->list; i < port_list->count; i++, port++) {
+				/* send unicast packets to every other port's MAC addresses */
+				unicast_send_all(port_list, af, port);
+
+				/* receive all of that generated traffic */
+				switch_port_list_check_receive(port_list, af, NULL, 0,
+				    validate_port_dhost, NULL);
+
+				/* ensure that every port saw the right number of packets */
+				if (blocked) {
+					check_received_count(port_list, port, 0);
+				} else {
+					check_received_count(port_list, port, 1);
+				}
+			}
+			T_PASS("%s unicast %s %s", __func__, blocked ? "blocked" : "not blocked", input ? "input" : "output");
"input" : "output"); + input = !input; + cleanup_pf(); + } while (input == false && blocked); + blocked = !blocked; + } while (blocked == false); + + bridge_cleanup(BRIDGE200, n_ports, true); + switch_port_list_dealloc(port_list); + return; +#endif /* TARGET_OS_BRIDGE */ +} + +T_DECL(if_bridge_bcast, + "bridge broadcast IPv4", + T_META_ASROOT(true)) +{ + bridge_test(validate_broadcast_dhost, NULL, ðer_broadcast, + AF_INET, 5, 1); +} + +T_DECL(if_bridge_bcast_many, + "bridge broadcast many IPv4", + T_META_ASROOT(true)) +{ + bridge_test(validate_broadcast_dhost, NULL, ðer_broadcast, + AF_INET, 5, 20); +} + +T_DECL(if_bridge_unknown, + "bridge unknown host IPv4", + T_META_ASROOT(true)) +{ + bridge_test(validate_not_present_dhost, NULL, ðer_external, + AF_INET, 5, 1); +} + +T_DECL(if_bridge_bcast_v6, + "bridge broadcast IPv6", + T_META_ASROOT(true)) +{ + bridge_test(validate_broadcast_dhost, NULL, ðer_broadcast, + AF_INET6, 5, 1); +} + +T_DECL(if_bridge_bcast_many_v6, + "bridge broadcast many IPv6", + T_META_ASROOT(true)) +{ + bridge_test(validate_broadcast_dhost, NULL, ðer_broadcast, + AF_INET6, 5, 20); +} + +T_DECL(if_bridge_unknown_v6, + "bridge unknown host IPv6", + T_META_ASROOT(true)) +{ + bridge_test(validate_not_present_dhost, NULL, ðer_external, + AF_INET6, 5, 1); +} + +T_DECL(if_bridge_mac_nat_ipv4, + "bridge mac nat ipv4", + T_META_ASROOT(true)) +{ + bridge_test_mac_nat_ipv4(5, 10); +} + +T_DECL(if_bridge_mac_nat_ipv6, + "bridge mac nat ipv6", + T_META_ASROOT(true)) +{ + bridge_test_mac_nat_ipv6(5, 10); +} + +T_DECL(if_bridge_filter_ipv4, + "bridge filter ipv4", + T_META_ASROOT(true)) +{ + filter_test(AF_INET); +} + +T_DECL(if_bridge_filter_ipv6, + "bridge filter ipv6", + T_META_ASROOT(true)) +{ + filter_test(AF_INET6); +} diff --git a/tests/netagent_race_infodisc_56244905.c b/tests/netagent_race_infodisc_56244905.c new file mode 100644 index 000000000..cc451d8fd --- /dev/null +++ b/tests/netagent_race_infodisc_56244905.c @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int finished = 0; + +#ifndef KEV_NETAGENT_SUBCLASS +#define KEV_NETAGENT_SUBCLASS 9 +#endif + +#ifndef NETAGENT_MESSAGE_TYPE_REGISTER +#define NETAGENT_MESSAGE_TYPE_REGISTER 1 +#endif + +#ifndef NETAGENT_MESSAGE_TYPE_UNREGISTER +#define NETAGENT_MESSAGE_TYPE_UNREGISTER 2 +#endif + +struct netagent_message_header { + uint8_t message_type; + uint8_t message_flags; + uint32_t message_id; + uint32_t message_error; + uint32_t message_payload_length; +}; + +struct kev_msg { + uint32_t total_size; + uint32_t vendor_code; + uint32_t kev_class; + uint32_t kev_subclass; + uint32_t id; + uint32_t event_code; +}; + +struct kev_netagent_data { + uuid_t netagent_uuid; +}; + +struct netagent { + uuid_t netagent_uuid; + char netagent_domain[32]; + char netagent_type[32]; + char netagent_desc[128]; + uint32_t netagent_flags; + uint32_t netagent_data_size; + uint8_t netagent_data[0]; +}; + +static void * +register_sockopt_racer(void *data) +{ + int s = *(int *)data; + struct { + struct netagent_message_header header; + struct netagent netagent; + } msg; + + bzero(&msg, sizeof(msg)); + msg.header.message_type = NETAGENT_MESSAGE_TYPE_REGISTER; + msg.header.message_payload_length = sizeof(struct netagent); + + while (!finished) { + send(s, &msg, sizeof(msg), 0); + } + + return NULL; +} + +static void * +register_message_racer(void *data) +{ + int s = *(int *)data; + struct netagent netagent; + + bzero(&netagent, 
sizeof(netagent)); + while (!finished) { + setsockopt(s, SYSPROTO_CONTROL, NETAGENT_MESSAGE_TYPE_REGISTER, &netagent, sizeof(netagent)); + } + + return NULL; +} + +#define SIZEOF_STRUCT_NETAGENT_WRAPPER 280 + +static void * +unregister_racer(void *data) +{ + int s = *(int *)data; + uint8_t spraybuf[SIZEOF_STRUCT_NETAGENT_WRAPPER]; + + memset(spraybuf, 0x41, sizeof(spraybuf)); + + while (!finished) { + setsockopt(s, SYSPROTO_CONTROL, NETAGENT_MESSAGE_TYPE_UNREGISTER, NULL, 0); + ioctl(-1, _IOW('x', 0, spraybuf), spraybuf); + } + + return NULL; +} + +#define NITERS 200000 + +T_DECL(netagent_race_infodisc_56244905, "Netagent race between register and post event.") +{ + int s; + int evsock; + pthread_t reg_th; + pthread_t unreg_th; + struct kev_request kev_req = { + .vendor_code = KEV_VENDOR_APPLE, + .kev_class = KEV_NETWORK_CLASS, + .kev_subclass = KEV_NETAGENT_SUBCLASS + }; + struct ctl_info ci; + struct sockaddr_ctl sc; + struct { + struct kev_msg msg; + struct kev_netagent_data nd; + } ev; + int n; + + T_SETUPBEGIN; + /* set up the event socket so we can receive notifications: */ + T_ASSERT_POSIX_SUCCESS(evsock = socket(AF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT), NULL); + T_ASSERT_POSIX_SUCCESS(ioctl(evsock, SIOCSKEVFILT, &kev_req), NULL); + + /* this is the socket we'll race on: */ + T_ASSERT_POSIX_SUCCESS(s = socket(AF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL); + + /* connect to netagent: */ + bzero(&ci, sizeof(ci)); + strcpy(ci.ctl_name, "com.apple.net.netagent"); + T_ASSERT_POSIX_SUCCESS(ioctl(s, CTLIOCGINFO, &ci), NULL); + + bzero(&sc, sizeof(sc)); + sc.sc_id = ci.ctl_id; + + T_ASSERT_POSIX_SUCCESS(connect(s, (const struct sockaddr *)&sc, sizeof(sc)), NULL); + T_SETUPEND; + + /* variant 1: */ + /* spin off the racer threads: */ + T_ASSERT_POSIX_ZERO(pthread_create(®_th, NULL, register_message_racer, &s), NULL); + T_ASSERT_POSIX_ZERO(pthread_create(&unreg_th, NULL, unregister_racer, &s), NULL); + + /* keep going until we're done: */ + for (n = 0; n < NITERS; ++n) { + bzero(&ev, sizeof(ev)); + T_ASSERT_POSIX_SUCCESS(recv(evsock, &ev, sizeof(ev), 0), NULL); + + if (ev.nd.netagent_uuid[0] != 0) { + finished = 1; + T_ASSERT_FAIL("netagent register event leaked data: 0x%08lx", *(unsigned long *)ev.nd.netagent_uuid); + } + } + + finished = 1; + + T_ASSERT_POSIX_ZERO(pthread_join(reg_th, NULL), NULL); + T_ASSERT_POSIX_ZERO(pthread_join(unreg_th, NULL), NULL); + + finished = 0; + + /* variant 2: */ + /* spin off the racer threads: */ + T_ASSERT_POSIX_ZERO(pthread_create(®_th, NULL, register_sockopt_racer, &s), NULL); + T_ASSERT_POSIX_ZERO(pthread_create(&unreg_th, NULL, unregister_racer, &s), NULL); + + /* keep going until we're done: */ + for (n = 0; n < NITERS; ++n) { + bzero(&ev, sizeof(ev)); + T_ASSERT_POSIX_SUCCESS(recv(evsock, &ev, sizeof(ev), 0), NULL); + + if (ev.nd.netagent_uuid[0] != 0) { + finished = 1; + T_ASSERT_FAIL("netagent register event leaked data: 0x%08lx", *(unsigned long *)ev.nd.netagent_uuid); + } + } + + finished = 1; + + T_ASSERT_POSIX_ZERO(pthread_join(reg_th, NULL), NULL); + T_ASSERT_POSIX_ZERO(pthread_join(unreg_th, NULL), NULL); +} diff --git a/tests/socket_0byte_udp_poll_58140856.c b/tests/socket_0byte_udp_poll_58140856.c new file mode 100644 index 000000000..e87db4878 --- /dev/null +++ b/tests/socket_0byte_udp_poll_58140856.c @@ -0,0 +1,108 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); + +#define TEST_ADDR "127.0.0.1" +#define TEST_PORT 4242 + +static struct { + 
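+	/* loopback UDP "server" state shared across the test phases:
+	 * the bound datagram socket and the address clients send to */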
int fd; + struct sockaddr_in addr; +} server; + +static void +server_listen(void) +{ + int r; + + server.fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + T_ASSERT_POSIX_SUCCESS(server.fd, "socket"); + + memset(&server.addr, 0, sizeof(server.addr)); + server.addr.sin_family = AF_INET; + server.addr.sin_port = htons(TEST_PORT); + + inet_pton(AF_INET, TEST_ADDR, &server.addr.sin_addr); + + r = bind(server.fd, (struct sockaddr*) &server.addr, sizeof(server.addr)); + T_ASSERT_POSIX_SUCCESS(r, "bind"); +} + +static void +send_message(void) +{ + int fd; + struct msghdr msg; + struct iovec iov; + + fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + T_ASSERT_POSIX_SUCCESS(fd, "socket"); + + memset(&msg, 0, sizeof(msg)); + + msg.msg_name = &server.addr; + msg.msg_namelen = sizeof(server.addr); + + iov.iov_base = ""; + iov.iov_len = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ssize_t r = sendmsg(fd, &msg, 0); + T_ASSERT_EQ(r, (ssize_t)iov.iov_len, "sendmsg"); + + close(fd); +} + +static void +server_poll(void) +{ + int kq; + struct kevent event = { + .flags = EV_ADD, + .filter = EVFILT_READ, + .ident = (unsigned long)server.fd, + }; + int r; + + kq = kqueue(); + T_ASSERT_POSIX_SUCCESS(kq, "kqueue"); + + /* Add and poll */ + r = kevent(kq, &event, 1, &event, 1, NULL); + T_EXPECT_EQ(r, 1, "should return an event"); + + close(kq); +} + +T_DECL(socket_0byte_udp_poll_58140856, + "Tests that 0-sized UDP packets wake up kevent") +{ + T_LOG("Starting...\n"); + + /* Listen on UDP port */ + server_listen(); + + T_LOG("Server bound to [%s]:%d\n", TEST_ADDR, TEST_PORT); + + /* Send 0-UDP packet to that port */ + send_message(); + + T_LOG("Sent message to server\n"); + + /* Poll kqueue events */ + server_poll(); + + T_LOG("Got kqueue event\n"); + + close(server.fd); +} diff --git a/tests/stackshot_accuracy.m b/tests/stackshot_accuracy.m index a183a87dd..9ff129091 100644 --- a/tests/stackshot_accuracy.m +++ b/tests/stackshot_accuracy.m @@ -275,9 +275,7 @@ child_init(void) #if !TARGET_OS_OSX /* allow us to be frozen */ freeze_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, pid, 0, NULL, 0); - if (freeze_state == -1) { - T_SKIP("This device doesn't have CONFIG_FREEZE enabled."); - } else if (freeze_state == 0) { + if (freeze_state == 0) { T_LOG("CHILD was found to be UNFREEZABLE, enabling freezing."); memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, pid, 1, NULL, 0); freeze_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, pid, 0, NULL, 0); @@ -341,6 +339,14 @@ T_DECL(basic, "test that no-fault stackshot works correctly") T_LOG("parent pid: %d\n", getpid()); T_QUIET; T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath"); + /* check if we can run the child successfully */ +#if !TARGET_OS_OSX + int freeze_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0); + if (freeze_state == -1) { + T_SKIP("This device doesn't have CONFIG_FREEZE enabled."); + } +#endif + /* setup signal handling */ signal(SIGUSR1, SIG_IGN); child_sig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dq); @@ -354,7 +360,7 @@ T_DECL(basic, "test that no-fault stackshot works correctly") T_ATEND(kill_children); /* wait until the child has recursed enough */ - dispatch_semaphore_wait(child_done_sema, DISPATCH_TIME_FOREVER); + dispatch_semaphore_wait(child_done_sema, dispatch_time(DISPATCH_TIME_NOW, 10 /*seconds*/ * 1000000000ULL)); T_LOG("child finished, parent executing"); diff --git 
a/tests/stackshot_tests.m b/tests/stackshot_tests.m index 29fa817e1..1777335c3 100644 --- a/tests/stackshot_tests.m +++ b/tests/stackshot_tests.m @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,9 @@ static void initialize_thread(void); #define PARSE_STACKSHOT_SHAREDCACHE_LAYOUT 0x04 #define PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL 0x08 #define PARSE_STACKSHOT_TURNSTILEINFO 0x10 +#define PARSE_STACKSHOT_WAITINFO_CSEG 0x40 + +static uint64_t cseg_expected_threadid = 0; #define TEST_STACKSHOT_QUEUE_LABEL "houston.we.had.a.problem" #define TEST_STACKSHOT_QUEUE_LABEL_LENGTH sizeof(TEST_STACKSHOT_QUEUE_LABEL) @@ -891,6 +895,34 @@ T_DECL(proc_uuid_info, "tests that the main binary UUID for a proc is always pop }); } +T_DECL(cseg_waitinfo, "test that threads stuck in the compressor report correct waitinfo") +{ + int val = 1; + struct scenario scenario = { + .name = "cseg_waitinfo", + .quiet = false, + .flags = (STACKSHOT_THREAD_WAITINFO | STACKSHOT_KCDATA_FORMAT), + }; + + dispatch_queue_t dq = dispatch_queue_create("com.apple.stackshot.cseg_waitinfo", NULL); + dispatch_semaphore_t child_ok = dispatch_semaphore_create(0); + + dispatch_async(dq, ^{ + pthread_threadid_np(NULL, &cseg_expected_threadid); + dispatch_semaphore_signal(child_ok); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.cseg_wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread"); + }); + + dispatch_semaphore_wait(child_ok, DISPATCH_TIME_FOREVER); + sleep(1); + + T_LOG("taking stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.cseg_unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child thread"); + parse_stackshot(PARSE_STACKSHOT_WAITINFO_CSEG, ssbuf, sslen, -1); + }); +} + #pragma mark performance tests #define SHOULD_REUSE_SIZE_HINT 0x01 @@ -1086,12 +1118,14 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int { bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA); bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE); + bool expect_cseg_waitinfo = (stackshot_parsing_flags & PARSE_STACKSHOT_WAITINFO_CSEG); bool expect_shared_cache_layout = false; bool expect_shared_cache_uuid = !delta; bool expect_dispatch_queue_label = (stackshot_parsing_flags & PARSE_STACKSHOT_DISPATCH_QUEUE_LABEL); bool expect_turnstile_lock = (stackshot_parsing_flags & PARSE_STACKSHOT_TURNSTILEINFO); bool found_zombie_child = false, found_shared_cache_layout = false, found_shared_cache_uuid = false; bool found_dispatch_queue_label = false, found_turnstile_lock = false; + bool found_cseg_waitinfo = false; if (expect_shared_cache_uuid) { uuid_t shared_cache_uuid; @@ -1179,6 +1213,17 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int } } + if (expect_cseg_waitinfo) { + NSArray *winfos = container[@"task_snapshots"][@"thread_waitinfo"]; + + for (id i in winfos) { + if ([i[@"wait_type"] intValue] == kThreadWaitCompressor && [i[@"owner"] intValue] == cseg_expected_threadid) { + found_cseg_waitinfo = true; + break; + } + } + } + int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue]; if (expect_zombie_child && (pid == child_pid)) { found_zombie_child = true; @@ -1276,6 +1321,10 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int T_QUIET; T_ASSERT_TRUE(found_turnstile_lock, "found expected deadlock"); } + if (expect_cseg_waitinfo) { + T_QUIET; T_ASSERT_TRUE(found_cseg_waitinfo, "found c_seg 
waitinfo"); + } + T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata"); } diff --git a/tests/task_create_suid_cred.c b/tests/task_create_suid_cred.c new file mode 100644 index 000000000..9787918c3 --- /dev/null +++ b/tests/task_create_suid_cred.c @@ -0,0 +1,326 @@ +#include + +#include +#include +#include +#include +#include + +#if defined(UNENTITLED) + +/* + * Creating an suid credential should fail without an entitlement. + */ +T_DECL(task_create_suid_cred_unentitled, "task_create_suid_cred (no entitlment)", T_META_ASROOT(true)) +{ + kern_return_t ret = KERN_FAILURE; + suid_cred_t sc = SUID_CRED_NULL; + + ret = task_create_suid_cred(mach_task_self(), "/usr/bin/id", 0, &sc); + T_ASSERT_MACH_ERROR(ret, KERN_NO_ACCESS, "create a new suid cred for id (no entitlement)"); +} + +#else /* ENTITLED */ + +extern char **environ; +static const char *server_name = "com.apple.xnu.test.task_create_suid_cred"; + +/* + * This is a positive test case which spawns /usr/bin/id with a properly created + * suid credential and verifies that it correctly produces "euid=0" + * Not running as root. + */ +static void +test_id_cred(suid_cred_t sc_id) +{ + posix_spawnattr_t attr; + posix_spawn_file_actions_t file_actions; + pid_t pid = -1; + int status = -1; + char template[] = "/tmp/suid_cred.XXXXXX"; + char *path = NULL; + FILE *file = NULL; + char *line = NULL; + size_t linecap = 0; + ssize_t linelen = 0; + char *id[] = {"/usr/bin/id", NULL}; + kern_return_t ret = KERN_FAILURE; + + /* Send stdout to a temporary file. */ + path = mktemp(template); + T_QUIET; T_ASSERT_NOTNULL(path, NULL); + + ret = posix_spawn_file_actions_init(&file_actions); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); + + ret = posix_spawn_file_actions_addopen(&file_actions, 1, path, + O_WRONLY | O_CREAT | O_TRUNC, 0666); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); + + ret = posix_spawnattr_init(&attr); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); + T_QUIET; T_ASSERT_NOTNULL(attr, NULL); + + // Attach the suid cred port + ret = posix_spawnattr_setsuidcredport_np(&attr, sc_id); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); + + ret = posix_spawnp(&pid, id[0], &file_actions, &attr, id, environ); + T_ASSERT_POSIX_ZERO(ret, "spawn with suid cred"); + + ret = posix_spawnattr_destroy(&attr); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); + + ret = posix_spawn_file_actions_destroy(&file_actions); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); + + // Wait for id to finish executing and exit. + do { + ret = waitpid(pid, &status, 0); + } while (ret < 0 && errno == EINTR); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, NULL); + + // Read from the temp file and verify that euid is 0. + file = fopen(path, "re"); + T_QUIET; T_ASSERT_NOTNULL(file, NULL); + + linelen = getline(&line, &linecap, file); + T_QUIET; T_ASSERT_GT_LONG(linelen, 0L, NULL); + + T_ASSERT_NOTNULL(strstr(line, "euid=0"), "verify that euid is zero"); + + free(line); + ret = fclose(file); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); + + ret = unlink(path); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL); +} + +/* + * This is a negative test case which tries to spawn /usr/bin/id with a + * previously used credential. It is expected that posix_spawn() fails. + * sc_id should have already been used to successfully spawn /usr/bin/id. 
+ */
+static void
+test_id_cred_reuse(suid_cred_t sc_id)
+{
+	posix_spawnattr_t attr;
+	char *id[] = {"/usr/bin/id", NULL};
+	kern_return_t ret = KERN_FAILURE;
+
+	ret = posix_spawnattr_init(&attr);
+	T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL);
+	T_QUIET; T_ASSERT_NOTNULL(attr, NULL);
+
+	// Attach the suid cred port
+	ret = posix_spawnattr_setsuidcredport_np(&attr, sc_id);
+	T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL);
+
+	ret = posix_spawnp(NULL, id[0], NULL, &attr, id, environ);
+	T_ASSERT_NE(ret, 0, "spawn with used suid cred");
+
+	ret = posix_spawnattr_destroy(&attr);
+	T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL);
+}
+
+/*
+ * This is a negative test case which tries to spawn /usr/bin/id with a
+ * credential for /bin/ls. It is expected that posix_spawn() fails.
+ */
+static void
+test_ls_cred(suid_cred_t sc_ls)
+{
+	posix_spawnattr_t attr;
+	char *id[] = {"/usr/bin/id", NULL};
+	kern_return_t ret = KERN_FAILURE;
+
+	ret = posix_spawnattr_init(&attr);
+	T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL);
+	T_QUIET; T_ASSERT_NOTNULL(attr, NULL);
+
+	// Attach the suid cred port
+	ret = posix_spawnattr_setsuidcredport_np(&attr, sc_ls);
+	T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL);
+
+	ret = posix_spawnp(NULL, id[0], NULL, &attr, id, environ);
+	T_ASSERT_NE(ret, 0, "spawn with bad suid cred");
+
+	ret = posix_spawnattr_destroy(&attr);
+	T_QUIET; T_ASSERT_POSIX_ZERO(ret, NULL);
+}
+
+/*
+ * The privileged/entitled "server" which creates suid credentials to pass to a
+ * client. Two creds are created, one for /usr/bin/id and the other for /bin/ls.
+ * It waits for the client to make contact and replies with the two credential ports.
+ */
+T_HELPER_DECL(suid_cred_server_helper, "suid cred server")
+{
+	mach_port_t server_port = MACH_PORT_NULL;
+	kern_return_t ret = KERN_FAILURE;
+	suid_cred_t sc_id = SUID_CRED_NULL;
+	suid_cred_t sc_ls = SUID_CRED_NULL;
+	mach_msg_empty_rcv_t rmsg = {};
+	struct {
+		mach_msg_header_t header;
+		mach_msg_body_t body;
+		mach_msg_port_descriptor_t id_port;
+		mach_msg_port_descriptor_t ls_port;
+	} smsg = {};
+
+	T_SETUPBEGIN;
+
+	ret = bootstrap_check_in(bootstrap_port, server_name, &server_port);
+	T_ASSERT_MACH_SUCCESS(ret, NULL);
+
+	T_SETUPEND;
+
+	// Wait for a message to reply to.
+	rmsg.header.msgh_size = sizeof(rmsg);
+	rmsg.header.msgh_local_port = server_port;
+
+	ret = mach_msg_receive(&rmsg.header);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(ret, NULL);
+
+	// Set up the reply.
+	smsg.header.msgh_remote_port = rmsg.header.msgh_remote_port;
+	smsg.header.msgh_local_port = MACH_PORT_NULL;
+	smsg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0) | MACH_MSGH_BITS_COMPLEX;
+	smsg.header.msgh_size = sizeof(smsg);
+
+	smsg.body.msgh_descriptor_count = 2;
+
+	// Create an suid cred for 'id'
+	ret = task_create_suid_cred(mach_task_self(), "/usr/bin/id", 0, &sc_id);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "create a new suid cred for id");
+	T_QUIET; T_ASSERT_NE(sc_id, SUID_CRED_NULL, NULL);
+
+	smsg.id_port.name = sc_id;
+	smsg.id_port.disposition = MACH_MSG_TYPE_COPY_SEND;
+	smsg.id_port.type = MACH_MSG_PORT_DESCRIPTOR;
+
+	// Create an suid cred for 'ls'
+	ret = task_create_suid_cred(mach_task_self(), "/bin/ls", 0, &sc_ls);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "create a new suid cred for ls");
+	T_QUIET; T_ASSERT_NE(sc_ls, SUID_CRED_NULL, NULL);
+
+	smsg.ls_port.name = sc_ls;
+	smsg.ls_port.disposition = MACH_MSG_TYPE_COPY_SEND;
+	smsg.ls_port.type = MACH_MSG_PORT_DESCRIPTOR;
+
+	ret = mach_msg_send(&smsg.header);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(ret, NULL);
+}
+
+/*
+ * The unprivileged "client" which requests suid credentials from the "server",
+ * and runs some test cases with those credentials:
+ *  - A positive test case to spawn something with euid 0
+ *  - A negative test case to check that a cred can't be used twice
+ *  - A negative test case to check that only the approved binary can be used
+ *    with the credential.
+ */
+T_HELPER_DECL(suid_cred_client_helper, "suid cred client")
+{
+	mach_port_t server_port = MACH_PORT_NULL;
+	mach_port_t client_port = MACH_PORT_NULL;
+	kern_return_t ret = KERN_FAILURE;
+	suid_cred_t sc_id = SUID_CRED_NULL;
+	suid_cred_t sc_ls = SUID_CRED_NULL;
+	mach_msg_empty_send_t smsg = {};
+	struct {
+		mach_msg_header_t header;
+		mach_msg_body_t body;
+		mach_msg_port_descriptor_t id_port;
+		mach_msg_port_descriptor_t ls_port;
+		mach_msg_trailer_t trailer;
+	} rmsg = {};
+
+	uid_t euid = geteuid();
+
+	T_SETUPBEGIN;
+
+	// Make sure the effective UID is non-root.
+	if (euid == 0) {
+		ret = setuid(501);
+		T_ASSERT_POSIX_ZERO(ret, "setuid");
+	}
+
+	/*
+	 * As this can race with the "server" starting, give it time to
+	 * start up.
+	 */
+	for (int i = 0; i < 30; i++) {
+		ret = bootstrap_look_up(bootstrap_port, server_name, &server_port);
+		if (ret != BOOTSTRAP_UNKNOWN_SERVICE) {
+			break;
+		}
+		sleep(1);
+	}
+
+	T_QUIET; T_ASSERT_NE(server_port, MACH_PORT_NULL, NULL);
+
+	// Create a port to receive the reply on.
+	ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &client_port);
+	T_ASSERT_MACH_SUCCESS(ret, NULL);
+
+	T_SETUPEND;
+
+	// Request the SUID cred ports
+	smsg.header.msgh_remote_port = server_port;
+	smsg.header.msgh_local_port = client_port;
+	smsg.header.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND, MACH_MSG_TYPE_MAKE_SEND_ONCE, 0, 0);
+	smsg.header.msgh_size = sizeof(smsg);
+
+	ret = mach_msg_send(&smsg.header);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(ret, NULL);
+
+	// Wait for the reply.
+	rmsg.header.msgh_size = sizeof(rmsg);
+	rmsg.header.msgh_local_port = client_port;
+
+	ret = mach_msg_receive(&rmsg.header);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(ret, NULL);
+
+	sc_id = rmsg.id_port.name;
+	T_QUIET; T_ASSERT_NE(sc_id, SUID_CRED_NULL, NULL);
+	test_id_cred(sc_id);
+	test_id_cred_reuse(sc_id);
+
+	sc_ls = rmsg.ls_port.name;
+	T_QUIET; T_ASSERT_NE(sc_ls, SUID_CRED_NULL, NULL);
+	test_ls_cred(sc_ls);
+}
+
+T_DECL(task_create_suid_cred, "task_create_suid_cred", T_META_ASROOT(true))
+{
+	dt_helper_t helpers[] = {
+		dt_launchd_helper_domain("com.apple.xnu.test.task_create_suid_cred.plist",
+		    "suid_cred_server_helper", NULL, LAUNCH_SYSTEM_DOMAIN),
+		dt_fork_helper("suid_cred_client_helper"),
+	};
+
+	dt_run_helpers(helpers, sizeof(helpers) / sizeof(helpers[0]), 60);
+}
+
+/*
+ * Creating an suid credential should fail for non-root (even if entitled).
+ */
+T_DECL(task_create_suid_cred_no_root, "task_create_suid_cred (no root)", T_META_ASROOT(true))
+{
+	kern_return_t ret = KERN_FAILURE;
+	suid_cred_t sc = SUID_CRED_NULL;
+	uid_t euid = geteuid();
+
+	// Make sure the effective UID is non-root.
+	if (euid == 0) {
+		ret = setuid(501);
+		T_QUIET; T_ASSERT_POSIX_ZERO(ret, "setuid");
+	}
+
+	ret = task_create_suid_cred(mach_task_self(), "/usr/bin/id", 0, &sc);
+	T_ASSERT_MACH_ERROR(ret, KERN_NO_ACCESS, "create a new suid cred for id (non-root)");
+}
+
+#endif /* ENTITLED */
diff --git a/tests/task_create_suid_cred_entitlement.plist b/tests/task_create_suid_cred_entitlement.plist
new file mode 100644
index 000000000..03a832615
--- /dev/null
+++ b/tests/task_create_suid_cred_entitlement.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>com.apple.private.suid_cred</key>
+	<true/>
+</dict>
+</plist>
diff --git a/tools/lldbmacros/core/kernelcore.py b/tools/lldbmacros/core/kernelcore.py
index ff2376e2e..01067a75d 100755
--- a/tools/lldbmacros/core/kernelcore.py
+++ b/tools/lldbmacros/core/kernelcore.py
@@ -110,6 +110,30 @@ def IterateLinkageChain(queue_head, element_type, field_name, field_ofst=0):
             yield obj
         link = link.next
 
+def IterateCircleQueue(queue_head, element_ptr_type, element_field_name):
+    """ Iterate over a circle queue in the kernel of type circle_queue_head_t. Refer to osfmk/kern/circle_queue.h.
+        params:
+            queue_head - lldb.SBValue : Value object for the queue head.
+            element_ptr_type - lldb.SBType : a pointer type of the element the queue links together. Typically structs like thread, task etc.
+            element_field_name - str : name of the chaining field in the target struct.
+        returns:
+            A generator (does not return); it is used for iterating.
+            value : an object of type (element_ptr_type). Always a pointer object.
+    """
+    head = queue_head.head.GetSBValue()
+    queue_head_addr = 0x0
+    if head.TypeIsPointerType():
+        queue_head_addr = head.GetValueAsUnsigned()
+    else:
+        queue_head_addr = head.GetAddress().GetLoadAddress(osplugin_target_obj)
+    cur_elt = head
+    while True:
+        if not cur_elt.IsValid() or cur_elt.GetValueAsUnsigned() == 0:
+            break
+        yield containerof(value(cur_elt), element_ptr_type, element_field_name)
+        cur_elt = cur_elt.GetChildMemberWithName('next')
+        if cur_elt.GetValueAsUnsigned() == queue_head_addr:
+            break
+
 def IterateQueue(queue_head, element_ptr_type, element_field_name, backwards=False, unpack_ptr_fn=None):
     """ Iterate over an Element Chain queue in kernel of type queue_head_t. (osfmk/kern/queue.h method 2)
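Not part of the patch: a minimal sketch of how the new IterateCircleQueue generator is meant to be consumed, mirroring its use in scheduler.py further down. It assumes a live lldbmacros session in which core.kernelcore is importable and a struct sched_clutch_root_bucket value is at hand.

    # Sketch: collect the clutch buckets hanging off one root bucket by walking
    # every priority level of its run queue (same pattern as scheduler.py below).
    from core.kernelcore import IterateCircleQueue

    def list_clutch_buckets(root_bucket):
        buckets = []
        runq = root_bucket.scrb_clutch_buckets      # struct sched_clutch_bucket_runq
        for pri in range(0, 128):
            circleq = runq.scbrq_queues[pri]        # circle_queue_head_t per priority
            for cb in IterateCircleQueue(circleq, 'struct sched_clutch_bucket', 'scb_runqlink'):
                buckets.append(cb)
        return buckets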
diff --git a/tools/lldbmacros/core/operating_system.py b/tools/lldbmacros/core/operating_system.py
index c1fc18cc3..2e7e21847 100755
--- a/tools/lldbmacros/core/operating_system.py
+++ b/tools/lldbmacros/core/operating_system.py
@@ -649,32 +649,6 @@ def IterateQueue(queue_head, element_ptr_type, element_field_name):
         yield elt
         cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next')
 
-def IterateCircleQueue(queue_head, element_ptr_type, element_field_name):
-    """ iterate over a circle queue in kernel of type circle_queue_head_t. refer to osfmk/kern/circle_queue.h
-        params:
-            queue_head - lldb.SBValue : Value object for queue_head.
-            element_type - lldb.SBType : a pointer type of the element 'next' points to. Typically its structs like thread, task etc..
-            element_field_name - str : name of the field in target struct.
-        returns:
-            A generator does not return. It is used for iterating.
-            SBValue : an object thats of type (element_type) queue_head->next. Always a pointer object
-    """
-    head = queue_head.head
-    queue_head_addr = 0x0
-    if head.TypeIsPointerType():
-        queue_head_addr = head.GetValueAsUnsigned()
-    else:
-        queue_head_addr = head.GetAddress().GetLoadAddress(osplugin_target_obj)
-    cur_elt = head
-    while True:
-        if not cur_elt.IsValid() or cur_elt.GetValueAsUnsigned() == 0:
-            break
-        elt = cur_elt.Cast(element_ptr_type)
-        yield elt
-        cur_elt = elt.GetChildMemberWithName(element_field_name).GetChildMemberWithName('next')
-        if cur_elt.GetValueAsUnsigned() == queue_head_addr:
-            break
-
 def GetUniqueSessionID(process_obj):
     """ Create a unique session identifier.
         params:
diff --git a/tools/lldbmacros/ipc.py b/tools/lldbmacros/ipc.py
index 88ea13b5b..a1b02fced 100755
--- a/tools/lldbmacros/ipc.py
+++ b/tools/lldbmacros/ipc.py
@@ -500,13 +500,17 @@ def GetKObjectFromPort(portval):
         params: portval - core.value representation of 'ipc_port *' object
         returns: str - string of kobject information
     """
-    kobject_str = "{0: <#020x}".format(portval.kdata.kobject)
     io_bits = unsigned(portval.ip_object.io_bits)
-    objtype_index = io_bits & 0x7ff
+    if io_bits & 0x400 :
+        kobject_val = portval.kdata.kolabel.ikol_kobject
+    else:
+        kobject_val = portval.kdata.kobject
+    kobject_str = "{0: <#020x}".format(kobject_val)
+    objtype_index = io_bits & 0x3ff
     if objtype_index < len(xnudefines.kobject_types) :
         objtype_str = xnudefines.kobject_types[objtype_index]
         if objtype_str == 'IOKIT_OBJ':
-            iokit_classnm = GetObjectTypeStr(portval.kdata.kobject)
+            iokit_classnm = GetObjectTypeStr(kobject_val)
             if not iokit_classnm:
                 iokit_classnm = ""
             else:
@@ -515,7 +519,7 @@ def GetKObjectFromPort(portval):
         else:
             desc_str = "kobject({0:s})".format(objtype_str)
             if xnudefines.kobject_types[objtype_index] in ('TASK_RESUME', 'TASK'):
-                desc_str += " " + GetProcNameForTask(Cast(portval.kdata.kobject, 'task *'))
+                desc_str += " " + GetProcNameForTask(Cast(kobject_val, 'task *'))
         else:
             desc_str = "kobject(UNKNOWN) {:d}".format(objtype_index)
     return kobject_str + " " + desc_str
diff --git a/tools/lldbmacros/kcdata.py b/tools/lldbmacros/kcdata.py
index 5db5554e5..dff373630 100755
--- a/tools/lldbmacros/kcdata.py
+++ b/tools/lldbmacros/kcdata.py
@@ -1288,6 +1288,7 @@ kThreadWaitPThreadCondVar = 0x0e
 kThreadWaitParkedWorkQueue = 0x0f
 kThreadWaitWorkloopSyncWait = 0x10
 kThreadWaitOnProcess = 0x11
+kThreadWaitCompressor = 0x14
 
 UINT64_MAX = 0xffffffffffffffff
@@ -1397,6 +1398,8 @@ def formatWaitInfo(info):
             s += "waitpid, for process group %d" % abs(owner - 2**64)
         else:
             s += "waitpid, for pid %d" % owner
+    elif type == kThreadWaitCompressor:
+        s += "in compressor segment %x, busy for thread %d" % (context, owner)
     else:
         s += "unknown type %d (owner %d, context %x)" % (type, owner, context)
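Not part of the patch: the new kThreadWaitCompressor branch in formatWaitInfo prints the c_seg address (context) and the id of the thread holding it busy (owner). A standalone sketch of that formatting with made-up values:

    # Illustrative values only; the format string is the one added above.
    context, owner = 0xffffff80212fe000, 12345
    s = "in compressor segment %x, busy for thread %d" % (context, owner)
    print(s)  # -> in compressor segment ffffff80212fe000, busy for thread 12345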
pid %d" % owner + elif type == kThreadWaitCompressor: + s += "in compressor segment %x, busy for thread %d" % (context, owner) else: s += "unknown type %d (owner %d, context %x)" % (type, owner, context) diff --git a/tools/lldbmacros/scheduler.py b/tools/lldbmacros/scheduler.py index 0708c7658..983a027e1 100755 --- a/tools/lldbmacros/scheduler.py +++ b/tools/lldbmacros/scheduler.py @@ -162,7 +162,7 @@ def ShowCurremtAbsTime(cmd_args=None): print "Last dispatch time known: %d MATUs" % cur_abstime -bucketStr = ["", "FIXPRI (>UI)", "TIMESHARE_FG", "TIMESHARE_IN", "TIMESHARE_DF", "TIMESHARE_UT", "TIMESHARE_BG"] +bucketStr = ["FIXPRI (>UI)", "TIMESHARE_FG", "TIMESHARE_IN", "TIMESHARE_DF", "TIMESHARE_UT", "TIMESHARE_BG"] @header(" {:>18s} | {:>20s} | {:>20s} | {:>10s} | {:>10s}".format('Thread Group', 'Interactivity Score', 'Last Timeshare Tick', 'pri_shift', 'highq')) def GetSchedClutchBucketSummary(clutch_bucket): @@ -176,13 +176,15 @@ def ShowSchedClutchForPset(pset): print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10d} | {:>10d} | {:>30s} | {:>30s} | {:>15s} | ".format("Root", "*", "*", addressof(root_clutch), root_clutch.scr_priority, root_clutch.scr_thr_count, "*", "*", "*") print "-" * 300 - for i in range(1, 7): + for i in range(0, 6): root_bucket = root_clutch.scr_buckets[i] print "{:>10s} | {:>20s} | {:>30s} | 0x{:16x} | {:>10s} | {:>10s} | {:>30s} | {:>30s} | {:>15d} | ".format("*", bucketStr[i], "*", addressof(root_bucket), "*", "*", "*", "*", root_bucket.scrb_deadline) - prioq = root_bucket.scrb_clutch_buckets + clutch_bucket_runq = root_bucket.scrb_clutch_buckets clutch_bucket_list = [] - for clutch_bucket in IteratePriorityQueue(prioq, 'struct sched_clutch_bucket', 'scb_pqlink'): - clutch_bucket_list.append(clutch_bucket) + for pri in range(0,128): + clutch_bucket_circleq = clutch_bucket_runq.scbrq_queues[pri] + for clutch_bucket in IterateCircleQueue(clutch_bucket_circleq, 'struct sched_clutch_bucket', 'scb_runqlink'): + clutch_bucket_list.append(clutch_bucket) if len(clutch_bucket_list) > 0: clutch_bucket_list.sort(key=lambda x: x.scb_priority, reverse=True) for clutch_bucket in clutch_bucket_list: @@ -236,10 +238,12 @@ def ShowSchedClutchRootBucket(cmd_args=[]): print "{:<30s} : {:d}".format("Deadline", root_bucket.scrb_deadline) print "{:<30s} : {:d}".format("Current Timestamp", GetRecentTimestamp()) print "\n" - prioq = root_bucket.scrb_clutch_buckets + clutch_bucket_runq = root_bucket.scrb_clutch_buckets clutch_bucket_list = [] - for clutch_bucket in IteratePriorityQueue(prioq, 'struct sched_clutch_bucket', 'scb_pqlink'): - clutch_bucket_list.append(clutch_bucket) + for pri in range(0,128): + clutch_bucket_circleq = clutch_bucket_runq.scbrq_queues[pri] + for clutch_bucket in IterateCircleQueue(clutch_bucket_circleq, 'struct sched_clutch_bucket', 'scb_runqlink'): + clutch_bucket_list.append(clutch_bucket) if len(clutch_bucket_list) > 0: print "=" * 240 print "{:>30s} | {:>18s} | {:>20s} | {:>20s} | ".format("Name", "Clutch Bucket", "Priority", "Count") + GetSchedClutchBucketSummary.header -- 2.45.2