From bca245acd4c03fd752d1a45f011ad495e60fe53d Mon Sep 17 00:00:00 2001 From: Apple Date: Wed, 3 Jun 2020 04:23:43 +0000 Subject: [PATCH] xnu-6153.121.1.tar.gz --- bsd/kern/bsd_init.c | 3 + bsd/kern/kern_control.c | 10 +- bsd/kern/kern_descrip.c | 25 ++ bsd/kern/kern_fork.c | 1 + bsd/kern/kern_proc.c | 2 +- bsd/kern/kern_sysctl.c | 13 +- bsd/kern/trace_codes | 2 + bsd/kern/uipc_socket.c | 36 +++ bsd/kern/uipc_socket2.c | 219 ++++++++------ bsd/kern/uipc_syscalls.c | 36 +-- bsd/net/content_filter.c | 423 +++++++++++++++++++++------- bsd/net/content_filter.h | 12 +- bsd/net/if_bridge.c | 4 +- bsd/net/necp.c | 6 +- bsd/net/pf_ioctl.c | 4 +- bsd/netinet/flow_divert.c | 22 +- bsd/netinet/in_pcb.c | 9 +- bsd/netinet/ip_icmp.c | 13 +- bsd/netinet/mptcp_subr.c | 5 + bsd/netinet/mptcp_usrreq.c | 13 +- bsd/netinet/mptcp_var.h | 1 + bsd/netinet/raw_ip.c | 111 ++++++-- bsd/netinet/udp_usrreq.c | 25 +- bsd/netinet6/icmp6.c | 32 +-- bsd/netinet6/raw_ip6.c | 121 ++++++-- bsd/netinet6/udp6_output.c | 28 +- bsd/netinet6/udp6_usrreq.c | 2 +- bsd/sys/namei.h | 3 + bsd/sys/proc_internal.h | 6 + bsd/sys/socketvar.h | 4 +- bsd/vfs/kpi_vfs.c | 50 ---- bsd/vfs/vfs_lookup.c | 171 ++--------- bsd/vfs/vfs_subr.c | 2 + bsd/vfs/vfs_syscalls.c | 27 +- config/MasterVersion | 2 +- iokit/IOKit/IOMemoryDescriptor.h | 4 + iokit/Kernel/IOMemoryDescriptor.cpp | 7 + osfmk/arm/pmap.c | 2 +- osfmk/arm64/locore.s | 2 +- osfmk/arm64/machine_routines_asm.h | 48 ++-- osfmk/arm64/machine_routines_asm.s | 113 ++++---- osfmk/arm64/pcb.c | 2 + osfmk/arm64/status.c | 18 +- osfmk/kern/task.c | 24 +- osfmk/mach/arm/thread_status.h | 64 +++-- osfmk/vm/vm_compressor.c | 205 +++++++++++--- osfmk/vm/vm_fault.c | 15 +- osfmk/vm/vm_fault.h | 6 + osfmk/vm/vm_pageout.h | 3 + tests/memorystatus_freeze_test.c | 13 +- tools/lldbmacros/kdp.py | 25 ++ tools/lldbmacros/pmap.py | 183 ++++++++---- 52 files changed, 1414 insertions(+), 763 deletions(-) diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index bc5b709e0..887cb454b 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -415,6 +415,7 @@ lck_grp_t * proc_kqhashlock_grp; lck_grp_t * proc_knhashlock_grp; lck_grp_t * proc_ucred_mlock_grp; lck_grp_t * proc_mlock_grp; +lck_grp_t * proc_dirslock_grp; lck_grp_attr_t * proc_lck_grp_attr; lck_attr_t * proc_lck_attr; lck_mtx_t * proc_list_mlock; @@ -533,6 +534,7 @@ bsd_init(void) proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr); proc_kqhashlock_grp = lck_grp_alloc_init("proc-kqhashlock", proc_lck_grp_attr); proc_knhashlock_grp = lck_grp_alloc_init("proc-knhashlock", proc_lck_grp_attr); + proc_dirslock_grp = lck_grp_alloc_init("proc-dirslock", proc_lck_grp_attr); #if CONFIG_XNUPOST sysctl_debug_test_stackshot_owner_grp = lck_grp_alloc_init("test-stackshot-owner-grp", LCK_GRP_ATTR_NULL); sysctl_debug_test_stackshot_owner_init_mtx = lck_mtx_alloc_init( @@ -548,6 +550,7 @@ bsd_init(void) lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr); lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr); + lck_rw_init(&kernproc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr); assert(bsd_simul_execs != 0); execargs_cache_lock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index 5430ff820..0151fac5e 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2017 Apple Inc. All rights reserved. 
+ * Copyright (c) 1999-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1036,7 +1036,7 @@ ctl_enqueuembuf(kern_ctl_ref kctlref, u_int32_t unit, struct mbuf *m, } so_recv_data_stat(so, m, 0); - if (sbappend(&so->so_rcv, m) != 0) { + if (sbappend_nodrop(&so->so_rcv, m) != 0) { if ((flags & CTL_DATA_NOWAKEUP) == 0) { sorwakeup(so); } @@ -1133,7 +1133,7 @@ ctl_enqueuembuf_list(void *kctlref, u_int32_t unit, struct mbuf *m_list, */ m->m_nextpkt = NULL; so_recv_data_stat(so, m, 0); - if (sbappendrecord(&so->so_rcv, m) != 0) { + if (sbappendrecord_nodrop(&so->so_rcv, m) != 0) { needwakeup = 1; } else { /* @@ -1239,6 +1239,10 @@ ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, m->m_flags |= M_EOR; } so_recv_data_stat(so, m, 0); + /* + * No need to call the "nodrop" variant of sbappend + * because the mbuf is local to the scope of the function + */ if (sbappend(&so->so_rcv, m) != 0) { if ((flags & CTL_DATA_NOWAKEUP) == 0) { sorwakeup(so); diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 8e7a7db74..c17f84143 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -282,6 +282,30 @@ file_lock_init(void) } +void +proc_dirs_lock_shared(proc_t p) +{ + lck_rw_lock_shared(&p->p_dirs_lock); +} + +void +proc_dirs_unlock_shared(proc_t p) +{ + lck_rw_unlock_shared(&p->p_dirs_lock); +} + +void +proc_dirs_lock_exclusive(proc_t p) +{ + lck_rw_lock_exclusive(&p->p_dirs_lock); +} + +void +proc_dirs_unlock_exclusive(proc_t p) +{ + lck_rw_unlock_exclusive(&p->p_dirs_lock); +} + /* * proc_fdlock, proc_fdlock_spin * @@ -5061,6 +5085,7 @@ fdcopy(proc_t p, vnode_t uth_cdir) } /* Coming from a chroot environment and unable to get a reference... */ if (newfdp->fd_rdir == NULL && fdp->fd_rdir) { + proc_fdunlock(p); /* * We couldn't get a new reference on * the chroot directory being diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index c25c85ad2..e8de4d1c2 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -1339,6 +1339,7 @@ retry: * * XXX may fail to copy descriptors to child */ + lck_rw_init(&child_proc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr); child_proc->p_fd = fdcopy(parent_proc, parent_uthread->uu_cdir); #if SYSV_SHM diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index c5ea090ce..1a11358be 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -1301,7 +1301,7 @@ proc_gettty(proc_t p, vnode_t *vp) if (ttyvp) { if (vnode_getwithvid(ttyvp, ttyvid) == 0) { - *vp = procsp->s_ttyvp; + *vp = ttyvp; err = 0; } } else { diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 0e55c4445..edf8d8d22 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -1089,7 +1089,6 @@ fill_user32_externproc(proc_t p, struct user32_extern_proc *__restrict exp) exp->p_pid = p->p_pid; exp->p_oppid = p->p_oppid; /* Mach related */ - exp->user_stack = p->user_stack; exp->p_debugger = p->p_debugger; exp->sigwait = p->sigwait; /* scheduling */ @@ -1142,7 +1141,6 @@ fill_user64_externproc(proc_t p, struct user64_extern_proc *__restrict exp) exp->p_pid = p->p_pid; exp->p_oppid = p->p_oppid; /* Mach related */ - exp->user_stack = p->user_stack; exp->p_debugger = p->p_debugger; exp->sigwait = p->sigwait; /* scheduling */ @@ -3657,6 +3655,9 @@ SYSCTL_INT(_vm, OID_AUTO, compressor_is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm SYSCTL_INT(_vm, OID_AUTO, compressor_swapout_target_age, CTLFLAG_RD | CTLFLAG_LOCKED, &swapout_target_age, 0, ""); SYSCTL_INT(_vm, OID_AUTO, compressor_available, 
CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_available, 0, ""); +extern int min_csegs_per_major_compaction; +SYSCTL_INT(_vm, OID_AUTO, compressor_min_csegs_per_major_compaction, CTLFLAG_RW | CTLFLAG_LOCKED, &min_csegs_per_major_compaction, 0, ""); + SYSCTL_INT(_vm, OID_AUTO, vm_ripe_target_age_in_secs, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_ripe_target_age, 0, ""); SYSCTL_INT(_vm, OID_AUTO, compressor_eval_period_in_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_eval_period_in_msecs, 0, ""); @@ -4959,15 +4960,15 @@ sysctl_get_owned_vmobjects SYSCTL_HANDLER_ARGS int error; mach_port_name_t task_port_name; task_t task; - int buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0; + size_t buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0; vmobject_list_output_t buffer; size_t output_size; if (buffer_size) { - const int min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t); + const size_t min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t); - if (buffer_size < min_size) { - buffer_size = min_size; + if (buffer_size < min_size || buffer_size > INT_MAX) { + return EINVAL; } buffer = kalloc(buffer_size); diff --git a/bsd/kern/trace_codes b/bsd/kern/trace_codes index 842ff323b..929b0b88e 100644 --- a/bsd/kern/trace_codes +++ b/bsd/kern/trace_codes @@ -246,6 +246,8 @@ 0x1300494 MACH_vm_page_expedite_no_memory 0x1300498 MACH_vm_page_grab 0x130049c MACH_vm_page_release +0x13004a0 MACH_vm_compressor_compact_and_swap +0x13004a4 MACH_vm_compressor_do_delayed_compactions 0x13004c0 MACH_vm_pressure_event 0x13004c4 MACH_vm_execve 0x13004c8 MACH_vm_wakeup_compactor_swapper diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index b94476d05..2a754f050 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1839,6 +1839,39 @@ soconnectxlocked(struct socket *so, struct sockaddr *src, (error = sodisconnectlocked(so)) != 0)) { error = EISCONN; } else { + if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) && + (flags & CONNECT_DATA_IDEMPOTENT)) { + so->so_flags1 |= SOF1_DATA_IDEMPOTENT; + + if (flags & CONNECT_DATA_AUTHENTICATED) { + so->so_flags1 |= SOF1_DATA_AUTHENTICATED; + } + } + + /* + * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data. + * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error) + * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data + * Case 3 allows user to combine write with connect even if they have + * no use for TFO (such as regular TCP, and UDP). + * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case) + */ + if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) && + ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) { + so->so_flags1 |= SOF1_PRECONNECT_DATA; + } + + /* + * If a user sets data idempotent and does not pass an uio, or + * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset + * SOF1_DATA_IDEMPOTENT. + */ + if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) && + (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) { + /* We should return EINVAL instead perhaps. 
*/ + so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT; + } + /* * Run connect filter before calling protocol: * - non-blocking connect returns before completion; @@ -1856,6 +1889,9 @@ soconnectxlocked(struct socket *so, struct sockaddr *src, flags, arg, arglen, auio, bytes_written); if (error != 0) { so->so_state &= ~SS_ISCONNECTING; + if (error != EINPROGRESS) { + so->so_flags1 &= ~SOF1_PRECONNECT_DATA; + } } } } diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index cc3c37a52..cbac73e06 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2019 Apple Inc. All rights reserved. + * Copyright (c) 1998-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +119,9 @@ static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *, struct mbuf *); static void soevent_ifdenied(struct socket *); +static int sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop); +static int sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop); + /* * Primitive routines for operating on sockets and socket buffers */ @@ -872,13 +875,13 @@ sbrelease(struct sockbuf *sb) * the mbuf chain is recorded in sb. Empty mbufs are * discarded and mbufs are compacted where possible. */ -int -sbappend(struct sockbuf *sb, struct mbuf *m) +static int +sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop) { struct socket *so = sb->sb_so; if (m == NULL || (sb->sb_flags & SB_DROP)) { - if (m != NULL) { + if (m != NULL && !nodrop) { m_freem(m); } return 0; @@ -887,27 +890,30 @@ sbappend(struct sockbuf *sb, struct mbuf *m) SBLASTRECORDCHK(sb, "sbappend 1"); if (sb->sb_lastrecord != NULL && (sb->sb_mbtail->m_flags & M_EOR)) { - return sbappendrecord(sb, m); + return sbappendrecord_common(sb, m, nodrop); } - if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) { - int error = sflt_data_in(so, NULL, &m, NULL, 0); - SBLASTRECORDCHK(sb, "sbappend 2"); + if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) { + ASSERT(nodrop == FALSE); + if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) { + int error = sflt_data_in(so, NULL, &m, NULL, 0); + SBLASTRECORDCHK(sb, "sbappend 2"); #if CONTENT_FILTER - if (error == 0) { - error = cfil_sock_data_in(so, NULL, m, NULL, 0); - } + if (error == 0) { + error = cfil_sock_data_in(so, NULL, m, NULL, 0); + } #endif /* CONTENT_FILTER */ - if (error != 0) { - if (error != EJUSTRETURN) { - m_freem(m); + if (error != 0) { + if (error != EJUSTRETURN) { + m_freem(m); + } + return 0; } - return 0; + } else if (m) { + m->m_flags &= ~M_SKIPCFIL; } - } else if (m) { - m->m_flags &= ~M_SKIPCFIL; } /* If this is the first record, it's also the last record */ @@ -920,6 +926,18 @@ sbappend(struct sockbuf *sb, struct mbuf *m) return 1; } +int +sbappend(struct sockbuf *sb, struct mbuf *m) +{ + return sbappend_common(sb, m, FALSE); +} + +int +sbappend_nodrop(struct sockbuf *sb, struct mbuf *m) +{ + return sbappend_common(sb, m, TRUE); +} + /* * Similar to sbappend, except that this is optimized for stream sockets. 
*/ @@ -943,24 +961,26 @@ sbappendstream(struct sockbuf *sb, struct mbuf *m) SBLASTMBUFCHK(sb, __func__); - if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) { - int error = sflt_data_in(so, NULL, &m, NULL, 0); - SBLASTRECORDCHK(sb, "sbappendstream 1"); + if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) { + if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) { + int error = sflt_data_in(so, NULL, &m, NULL, 0); + SBLASTRECORDCHK(sb, "sbappendstream 1"); #if CONTENT_FILTER - if (error == 0) { - error = cfil_sock_data_in(so, NULL, m, NULL, 0); - } + if (error == 0) { + error = cfil_sock_data_in(so, NULL, m, NULL, 0); + } #endif /* CONTENT_FILTER */ - if (error != 0) { - if (error != EJUSTRETURN) { - m_freem(m); + if (error != 0) { + if (error != EJUSTRETURN) { + m_freem(m); + } + return 0; } - return 0; + } else if (m) { + m->m_flags &= ~M_SKIPCFIL; } - } else if (m) { - m->m_flags &= ~M_SKIPCFIL; } sbcompress(sb, m, sb->sb_mbtail); @@ -1066,14 +1086,14 @@ sblastmbufchk(struct sockbuf *sb, const char *where) /* * Similar to sbappend, except the mbuf chain begins a new record. */ -int -sbappendrecord(struct sockbuf *sb, struct mbuf *m0) +static int +sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop) { struct mbuf *m; int space = 0; if (m0 == NULL || (sb->sb_flags & SB_DROP)) { - if (m0 != NULL) { + if (m0 != NULL && nodrop == FALSE) { m_freem(m0); } return 0; @@ -1084,29 +1104,34 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0) } if (space > sbspace(sb) && !(sb->sb_flags & SB_UNIX)) { - m_freem(m0); + if (nodrop == FALSE) { + m_freem(m0); + } return 0; } - if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) { - int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, - sock_data_filt_flag_record); + if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) { + ASSERT(nodrop == FALSE); + if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) { + int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, + sock_data_filt_flag_record); #if CONTENT_FILTER - if (error == 0) { - error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0); - } + if (error == 0) { + error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0); + } #endif /* CONTENT_FILTER */ - if (error != 0) { - SBLASTRECORDCHK(sb, "sbappendrecord 1"); - if (error != EJUSTRETURN) { - m_freem(m0); + if (error != 0) { + SBLASTRECORDCHK(sb, "sbappendrecord 1"); + if (error != EJUSTRETURN) { + m_freem(m0); + } + return 0; } - return 0; + } else if (m0) { + m0->m_flags &= ~M_SKIPCFIL; } - } else if (m0) { - m0->m_flags &= ~M_SKIPCFIL; } /* @@ -1133,6 +1158,18 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0) return 1; } +int +sbappendrecord(struct sockbuf *sb, struct mbuf *m0) +{ + return sbappendrecord_common(sb, m0, FALSE); +} + +int +sbappendrecord_nodrop(struct sockbuf *sb, struct mbuf *m0) +{ + return sbappendrecord_common(sb, m0, TRUE); +} + /* * Concatenate address (optional), control (optional) and data into one * single mbuf chain. 
If sockbuf *sb is passed in, space check will be @@ -1276,35 +1313,37 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, return 0; } - /* Call socket data in filters */ - if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) { - int error; - error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0); - SBLASTRECORDCHK(sb, __func__); + if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) { + /* Call socket data in filters */ + if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) { + int error; + error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0); + SBLASTRECORDCHK(sb, __func__); #if CONTENT_FILTER - if (error == 0) { - error = cfil_sock_data_in(sb->sb_so, asa, m0, control, - 0); - } + if (error == 0) { + error = cfil_sock_data_in(sb->sb_so, asa, m0, control, + 0); + } #endif /* CONTENT_FILTER */ - if (error) { - if (error != EJUSTRETURN) { - if (m0) { - m_freem(m0); - } - if (control != NULL && !sb_unix) { - m_freem(control); - } - if (error_out) { - *error_out = error; + if (error) { + if (error != EJUSTRETURN) { + if (m0) { + m_freem(m0); + } + if (control != NULL && !sb_unix) { + m_freem(control); + } + if (error_out) { + *error_out = error; + } } + return 0; } - return 0; + } else if (m0) { + m0->m_flags &= ~M_SKIPCFIL; } - } else if (m0) { - m0->m_flags &= ~M_SKIPCFIL; } mbuf_chain = sbconcat_mbufs(sb, asa, m0, control); @@ -1420,35 +1459,37 @@ sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, return 0; } - if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) { - int error; + if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) { + if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) { + int error; - error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0); - SBLASTRECORDCHK(sb, __func__); + error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0); + SBLASTRECORDCHK(sb, __func__); #if CONTENT_FILTER - if (error == 0) { - error = cfil_sock_data_in(sb->sb_so, NULL, m0, control, - 0); - } + if (error == 0) { + error = cfil_sock_data_in(sb->sb_so, NULL, m0, control, + 0); + } #endif /* CONTENT_FILTER */ - if (error) { - if (error != EJUSTRETURN) { - if (m0) { - m_freem(m0); - } - if (control != NULL && !sb_unix) { - m_freem(control); - } - if (error_out) { - *error_out = error; + if (error) { + if (error != EJUSTRETURN) { + if (m0) { + m_freem(m0); + } + if (control != NULL && !sb_unix) { + m_freem(control); + } + if (error_out) { + *error_out = error; + } } + return 0; } - return 0; + } else if (m0) { + m0->m_flags &= ~M_SKIPCFIL; } - } else if (m0) { - m0->m_flags &= ~M_SKIPCFIL; } result = sbappendcontrol_internal(sb, m0, control); diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 5c929755a..e2455a1f3 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -1004,7 +1004,6 @@ connectitx(struct socket *so, struct sockaddr *src, user_ssize_t *bytes_written) { int error; -#pragma unused (flags) VERIFY(dst != NULL); @@ -1027,41 +1026,8 @@ connectitx(struct socket *so, struct sockaddr *src, goto out; } - if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) && - (flags & CONNECT_DATA_IDEMPOTENT)) { - so->so_flags1 |= SOF1_DATA_IDEMPOTENT; - - if (flags & CONNECT_DATA_AUTHENTICATED) { - so->so_flags1 |= SOF1_DATA_AUTHENTICATED; - } - } - - /* - * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data. 
- * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error) - * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data - * Case 3 allows user to combine write with connect even if they have - * no use for TFO (such as regular TCP, and UDP). - * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case) - */ - if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) && - ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) { - so->so_flags1 |= SOF1_PRECONNECT_DATA; - } - - /* - * If a user sets data idempotent and does not pass an uio, or - * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset - * SOF1_DATA_IDEMPOTENT. - */ - if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) && - (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) { - /* We should return EINVAL instead perhaps. */ - so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT; - } - error = soconnectxlocked(so, src, dst, p, ifscope, - aid, pcid, 0, NULL, 0, auio, bytes_written); + aid, pcid, flags, NULL, 0, auio, bytes_written); if (error != 0) { goto out; } diff --git a/bsd/net/content_filter.c b/bsd/net/content_filter.c index 626c2b2bf..11b248d0c 100644 --- a/bsd/net/content_filter.c +++ b/bsd/net/content_filter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2019 Apple Inc. All rights reserved. + * Copyright (c) 2013-2020 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -26,7 +26,7 @@ * * The socket content filter subsystem provides a way for user space agents to * make filtering decisions based on the content of the data being sent and - * received by TCP/IP sockets. + * received by INET/INET6 sockets. * * A content filter user space agents gets a copy of the data and the data is * also kept in kernel buffer until the user space agents makes a pass or drop @@ -42,23 +42,24 @@ * filter agent until an ultimate pass or drop decision is made by the * user space filter agent. * - * It should be noted that messages about many TCP/IP sockets can be multiplexed + * It should be noted that messages about many INET/INET6 sockets can be multiplexed * over a single kernel control socket. * * Notes: - * - The current implementation is limited to TCP sockets. + * - The current implementation supports all INET/INET6 sockets (i.e. TCP, + * UDP, ICMP, etc). * - The current implementation supports up to two simultaneous content filters - * for the sake of simplicity of the implementation. + * for iOS devices and eight simultaneous content filters for OSX. * * * NECP FILTER CONTROL UNIT * * A user space filter agent uses the Network Extension Control Policy (NECP) - * database to specify which TCP/IP sockets need to be filtered. The NECP + * database to specify which INET/INET6 sockets need to be filtered. The NECP * criteria may be based on a variety of properties like user ID or proc UUID. * * The NECP "filter control unit" is used by the socket content filter subsystem - * to deliver the relevant TCP/IP content information to the appropriate + * to deliver the relevant INET/INET6 content information to the appropriate * user space filter agent via its kernel control socket instance. * This works as follows: * @@ -69,15 +70,15 @@ * content filter kernel control socket via the socket option * CFIL_OPT_NECP_CONTROL_UNIT. 
*
- * 3) The NECP database is consulted to find out if a given TCP/IP socket
+ * 3) The NECP database is consulted to find out if a given INET/INET6 socket
 * needs to be subjected to content filtering and returns the corresponding
 * NECP filter control unit -- the NECP filter control unit is actually
- * stored in the TCP/IP socket structure so the NECP lookup is really simple.
+ * stored in the INET/INET6 socket structure so the NECP lookup is really simple.
 *
 * 4) The NECP filter control unit is then used to find the corresponding
 * kernel control socket instance.
 *
- * Note: NECP currently supports a single filter control unit per TCP/IP socket
+ * Note: NECP currently supports a single filter control unit per INET/INET6 socket
 * but this restriction may be soon lifted.
 *
 *
@@ -87,23 +88,26 @@
 * communicate over the kernel control socket via an asynchronous
 * messaging protocol (this is not a request-response protocol).
 * The socket content filter subsystem sends event messages to the user
- * space filter agent about the TCP/IP sockets it is interested to filter.
+ * space filter agent about the INET/INET6 sockets it is interested in filtering.
 * The user space filter agent sends action messages to either allow
 * data to pass or to disallow the data flow (and drop the connection).
 *
 * All messages over a content filter kernel control socket share the same
 * common header of type "struct cfil_msg_hdr". The message type tells if
 * it's a event message "CFM_TYPE_EVENT" or a action message "CFM_TYPE_ACTION".
- * The message header field "cfm_sock_id" identifies a given TCP/IP socket.
+ * The message header field "cfm_sock_id" identifies a given INET/INET6 flow.
+ * For TCP, flows are per-socket. For UDP and other datagram protocols, there
+ * could be multiple flows per socket.
+ *
 * Note the message header length field may be padded for alignment and can
 * be larger than the actual content of the message.
 * The field "cfm_op" describe the kind of event or action.
 *
 * Here are the kinds of content filter events:
- * - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
- * - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
- * - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
- * - CFM_OP_DATA_IN: A span of data is being or received on a TCP/IP socket
+ * - CFM_OP_SOCKET_ATTACHED: a new INET/INET6 socket is being filtered
+ * - CFM_OP_SOCKET_CLOSED: An INET/INET6 socket is closed
+ * - CFM_OP_DATA_OUT: A span of data is being sent on an INET/INET6 socket
+ * - CFM_OP_DATA_IN: A span of data is being received on an INET/INET6 socket
 *
 *
 * EVENT MESSAGES
 *
@@ -135,10 +139,10 @@
 *
 * The CFM_OP_DATA_UPDATE action messages let the user space filter
 * agent allow data to flow up to the specified pass offset -- there
- * is a pass offset for outgoing data and a pass offset for incoming data.
- * When a new TCP/IP socket is attached to the content filter, each pass offset
- * is initially set to 0 so not data is allowed to pass by default.
- * When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
+ * is a pass offset for outgoing data and a pass offset for incoming data.
+ * When a new INET/INET6 socket is attached to the content filter and a flow is
+ * created, each pass offset is initially set to 0 so no data is allowed to pass by
+ * default. When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
 * then the data flow becomes unrestricted.
 *
 * Note that pass offsets can only be incremented.
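 *
 * A minimal sketch of such an update: a user space filter agent can
 * unrestrict both directions of a flow by sending a CFM_OP_DATA_UPDATE
 * with all pass/peek offsets set to CFM_MAX_OFFSET. Field names follow
 * "struct cfil_msg_hdr" and "struct cfil_msg_action" from content_filter.h;
 * "kctl_fd" (the agent's kernel control socket descriptor) and "sock_id"
 * (taken from a prior CFM_OP_SOCKET_ATTACHED event) are assumed here:
 *
 *	struct cfil_msg_action action = { 0 };
 *	action.cfa_msghdr.cfm_len = sizeof(action);
 *	action.cfa_msghdr.cfm_version = CFM_VERSION_CURRENT;
 *	action.cfa_msghdr.cfm_type = CFM_TYPE_ACTION;
 *	action.cfa_msghdr.cfm_op = CFM_OP_DATA_UPDATE;
 *	action.cfa_msghdr.cfm_sock_id = sock_id;
 *	action.cfa_out_pass_offset = CFM_MAX_OFFSET;
 *	action.cfa_out_peek_offset = CFM_MAX_OFFSET;
 *	action.cfa_in_pass_offset = CFM_MAX_OFFSET;
 *	action.cfa_in_peek_offset = CFM_MAX_OFFSET;
 *	(void) send(kctl_fd, &action, sizeof(action), 0);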
A CFM_OP_DATA_UPDATE message
 * with a smaller pass offset is silently ignored.
 *
 * A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
 * to tell the kernel how much data it wants to see by using the peek offsets.
 * Just like pass offsets, there is a peek offset for each direction.
- * When a new TCP/IP socket is attached to the content filter, each peek offset
- * is initially set to 0 so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event
- * messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
- * with a greater than 0 peek offset is sent by the user space filter agent.
- * When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
- * then the flow of update data events becomes unrestricted.
+ * When a new INET/INET6 flow is created, each peek offset is initially set to 0
+ * so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages are dispatched by default
+ * until a CFM_OP_DATA_UPDATE action message with a greater than 0 peek offset is sent
+ * by the user space filter agent. When the peek offset is set to CFM_MAX_OFFSET via
+ * a CFM_OP_DATA_UPDATE then the flow of update data events becomes unrestricted.
 *
 * Note that peek offsets cannot be smaller than the corresponding pass offset.
 * Also a peek offsets cannot be smaller than the corresponding end offset
@@ -161,11 +164,14 @@
 * to set a too small peek value is silently ignored.
 *
 *
- * PER SOCKET "struct cfil_info"
+ * PER FLOW "struct cfil_info"
 *
- * As soon as a TCP/IP socket gets attached to a content filter, a
+ * As soon as an INET/INET6 socket gets attached to a content filter, a
 * "struct cfil_info" is created to hold the content filtering state for this
- * socket.
+ * socket. For UDP and other datagram protocols, as soon as traffic is seen for
+ * each new flow identified by its 4-tuple of source address/port and destination
+ * address/port, a "struct cfil_info" is created. Each datagram socket may
+ * have multiple flows maintained in a hash table of "struct cfil_info" entries.
 *
 * The content filtering state is made of the following information
 * for each direction:
@@ -198,9 +204,9 @@
 *
 * CONTENT FILTER QUEUES
 *
- * Data that is being filtered is steered away from the TCP/IP socket buffer
+ * Data that is being filtered is steered away from the INET/INET6 socket buffer
 * and instead will sit in one of three content filter queues until the data
- * can be re-injected into the TCP/IP socket buffer.
+ * can be re-injected into the INET/INET6 socket buffer.
 *
 * A content filter queue is represented by "struct cfil_queue" that contains
 * a list of mbufs and the start and end offset of the data span of
 * c) The "cfi_inject_q" of "struct cfil_info"
 *
 * Note: The sequence (a),(b) may be repeated several times if there is more
- * than one content filter attached to the TCP/IP socket.
+ * than one content filter attached to the INET/INET6 socket.
 *
 * The "cfe_ctl_q" queue holds data than cannot be delivered to the
 * kernel conntrol socket for two reasons:
@@ -228,13 +234,13 @@
 *
 * The "cfi_inject_q" queue holds data that has been fully allowed to pass
 * by the user space filter agent and that needs to be re-injected into the
- * TCP/IP socket.
+ * INET/INET6 socket.
 *
 *
 * IMPACT ON FLOW CONTROL
 *
 * An essential aspect of the content filer subsystem is to minimize the
- * impact on flow control of the TCP/IP sockets being filtered.
+ * impact on flow control of the INET/INET6 sockets being filtered.
*
 * The processing overhead of the content filtering may have an effect on
 * flow control by adding noticeable delays and cannot be eliminated --
 *
@@ -244,7 +250,7 @@
 * The amount of data being filtered is kept in buffers while waiting for
 * a decision by the user space filter agent. This amount of data pending
 * needs to be subtracted from the amount of data available in the
- * corresponding TCP/IP socket buffer. This is done by modifying
+ * corresponding INET/INET6 socket buffer. This is done by modifying
 * sbspace() and tcp_sbspace() to account for amount of data pending
 * in the content filter.
 *
@@ -256,14 +262,14 @@
 * cfil read-write lock held as shared so it can be re-entered from multiple
 * threads.
 *
- * The per TCP/IP socket content filterstate -- "struct cfil_info" -- is
+ * The per INET/INET6 socket content filter state -- "struct cfil_info" -- is
 * protected by the socket lock.
 *
- * A TCP/IP socket lock cannot be taken while the cfil read-write lock
+ * An INET/INET6 socket lock cannot be taken while the cfil read-write lock
 * is held. That's why we have some sequences where we drop the cfil read-write
- * lock before taking the TCP/IP lock.
+ * lock before taking the INET/INET6 lock.
 *
- * It is also important to lock the TCP/IP socket buffer while the content
+ * It is also important to lock the INET/INET6 socket buffer while the content
 * filter is modifying the amount of pending data. Otherwise the calculations
 * in sbspace() and tcp_sbspace() could be wrong.
 *
@@ -277,10 +283,34 @@
 * To read the other fields of "struct content_filter" we have to take
 * "cfil_lck_rw" in shared mode.
 *
+ * DATAGRAM SPECIFICS:
+ *
+ * The socket content filter supports all INET/INET6 protocols. However
+ * the treatments for TCP sockets and for datagram (UDP, ICMP, etc.) sockets
+ * are slightly different.
+ *
+ * Each datagram socket may have multiple flows. Each flow is identified
+ * by the flow's source address/port and destination address/port tuple
+ * and is represented as a "struct cfil_info" entry. For each socket,
+ * a hash table is used to maintain the collection of flows under that socket.
+ *
+ * Each datagram flow is uniquely identified by its "struct cfil_info" cfi_sock_id.
+ * The highest 32 bits of the cfi_sock_id contain the socket's so_gencnt. This portion
+ * of the cfi_sock_id is used to locate the socket during socket lookup. The lowest 32 bits
+ * of the cfi_sock_id contain a hash of the flow's 4-tuple. This portion of the cfi_sock_id
+ * is used as the hash value for the flow hash table lookup within the parent socket.
+ *
+ * Since datagram sockets may not be connected, flow states may not be maintained in the
+ * socket structures and thus have to be saved for each packet. These saved states will be
+ * used for both outgoing and incoming reinjections. For outgoing packets, destination
+ * address/port as well as the current socket states will be saved. During reinjection,
+ * these saved states will be used instead. For incoming packets, control and address
+ * mbufs will be chained to the data. During reinjection, the whole chain will be queued
+ * onto the incoming socket buffer.
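 *
 * A minimal sketch of the cfi_sock_id encoding described above, using the
 * CFIL_HASH() 4-tuple hash defined later in this file (variable names here
 * are illustrative, not the exact ones used by the flow setup code):
 *
 *	u_int32_t flowhash = CFIL_HASH(laddr, faddr, lport, fport);
 *	cfil_sock_id_t sock_id =
 *	    ((cfil_sock_id_t)(so->so_gencnt & 0xffffffff) << 32) | flowhash;
 *
 * and the lookup path reverses the split -- the high 32 bits select the
 * socket (compared against its so_gencnt), the low 32 bits select the
 * flow within that socket's hash table:
 *
 *	u_int32_t gencnt = (u_int32_t)(sock_id >> 32);
 *	u_int32_t hash = (u_int32_t)(sock_id & 0xffffffff);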
*
 * LIMITATIONS
 *
- * - For TCP sockets only
+ * - Supports all INET/INET6 sockets, such as TCP, UDP, ICMP, etc.
 *
 * - Does not support TCP unordered messages
 */
@@ -288,13 +318,8 @@
/*
 * TO DO LIST
 *
- * SOONER:
- *
 * Deal with OOB
 *
- * LATER:
- *
- * If support datagram, enqueue control and address mbufs as well
 */
#include
#include
@@ -314,6 +339,8 @@
#include
#include
+#define _IP_VHL
+#include <netinet/ip.h>
#include
#include
#include
@@ -332,6 +359,7 @@
#define MAX_CONTENT_FILTER 8
#endif
+extern struct inpcbinfo ripcbinfo;
struct cfil_entry;
/*
@@ -477,6 +505,7 @@ struct cfil_info {
uint64_t cfi_byte_outbound_count;
boolean_t cfi_isSignatureLatest; /* Indicates if signature covers latest flow attributes */
+ u_int32_t cfi_debug;
struct cfi_buf {
/*
 * cfi_pending_first and cfi_pending_last describe the total
@@ -535,7 +564,24 @@ TAILQ_HEAD(cfil_sock_head_stats, cfil_info) cfil_sock_head_stats;
LIST_HEAD(cfilhashhead, cfil_hash_entry);
#define CFILHASHSIZE 16
#define CFIL_HASH(laddr, faddr, lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport))
+
+#define IS_INET(so) (so && so->so_proto && so->so_proto->pr_domain && (so->so_proto->pr_domain->dom_family == AF_INET || so->so_proto->pr_domain->dom_family == AF_INET6))
+#define IS_TCP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP)
#define IS_UDP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP)
+#define IS_ICMP(so) (so && so->so_proto && (so->so_proto->pr_type == SOCK_RAW || so->so_proto->pr_type == SOCK_DGRAM) && \
+ (so->so_proto->pr_protocol == IPPROTO_ICMP || so->so_proto->pr_protocol == IPPROTO_ICMPV6))
+#define IS_RAW(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_RAW && so->so_proto->pr_protocol == IPPROTO_RAW)
+
+#if !TARGET_OS_OSX && !defined(XNU_TARGET_OS_OSX)
+#define IS_IP_DGRAM(so) (IS_INET(so) && IS_UDP(so))
+#else
+#define IS_IP_DGRAM(so) (IS_INET(so) && !IS_TCP(so))
+#endif
+
+#define OPTIONAL_IP_HEADER(so) (!IS_TCP(so) && !IS_UDP(so))
+#define GET_SO_PROTO(so) ((so && so->so_proto) ?
so->so_proto->pr_protocol : IPPROTO_MAX) +#define IS_INP_V6(inp) (inp && (inp->inp_vflag & INP_IPV6)) + #define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \ ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)))) #define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \ @@ -632,6 +678,7 @@ struct cfil_tag { union sockaddr_in_4_6 cfil_faddr; uint32_t cfil_so_state_change_cnt; short cfil_so_options; + int cfil_inp_flags; }; #define CFIL_HASH_ENTRY_ZONE_NAME "cfil_entry_hash" @@ -745,11 +792,12 @@ static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *); static unsigned int cfil_data_length(struct mbuf *, int *, int *); static errno_t cfil_db_init(struct socket *); static void cfil_db_free(struct socket *so); -struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *); +struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *, boolean_t); struct cfil_hash_entry *cfil_db_lookup_entry_with_sockid(struct cfil_db *, u_int64_t); struct cfil_hash_entry *cfil_db_add_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *); +void cfil_db_update_entry_local(struct cfil_db *, struct cfil_hash_entry *, struct sockaddr *); void cfil_db_delete_entry(struct cfil_db *, struct cfil_hash_entry *); -struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *); +struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *, int); struct cfil_info *cfil_db_get_cfil_info(struct cfil_db *, cfil_sock_id_t); static errno_t cfil_sock_udp_handle_data(bool, struct socket *, struct sockaddr *, struct sockaddr *, struct mbuf *, struct mbuf *, uint32_t); @@ -772,7 +820,8 @@ void cfil_info_show(void); bool cfil_info_idle_timed_out(struct cfil_info *, int, u_int32_t); bool cfil_info_action_timed_out(struct cfil_info *, int); bool cfil_info_buffer_threshold_exceeded(struct cfil_info *); -struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *); +struct m_tag *cfil_dgram_save_socket_state(struct cfil_info *, struct mbuf *); +boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags); static void cfil_udp_gc_thread_func(void *, wait_result_t); static void cfil_info_udp_expire(void *, wait_result_t); static bool fill_cfil_hash_entry_from_address(struct cfil_hash_entry *, bool, struct sockaddr *); @@ -1600,6 +1649,21 @@ find_udp: } lck_rw_done(pcbinfo->ipi_lock); + pcbinfo = &ripcbinfo; + lck_rw_lock_shared(pcbinfo->ipi_lock); + LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + if (inp->inp_state != INPCB_STATE_DEAD && + inp->inp_socket != NULL && + inp->inp_socket->so_cfil_db != NULL && + (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) { + if (cfil_socket_safe_lock(inp)) { + so = inp->inp_socket; + } + break; + } + } + lck_rw_done(pcbinfo->ipi_lock); + done: if (so == NULL) { OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found); @@ -1872,6 +1936,11 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, error = EINVAL; goto unlock; } + + if (cfil_info->cfi_debug) { + cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED MSG FROM FILTER"); + } + entry = &cfil_info->cfi_entries[kcunit - 1]; if (entry->cfe_filter == NULL) { CFIL_LOG(LOG_NOTICE, "so %llx no filter", @@ -1897,6 +1966,16 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void 
*unitinfo, mbuf_t m, switch (msghdr->cfm_op) { case CFM_OP_DATA_UPDATE: + + if (cfil_info->cfi_debug) { + cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DATA_UPDATE"); + CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: ", + (uint64_t)VM_KERNEL_ADDRPERM(so), + cfil_info->cfi_sock_id, + action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset, + action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset); + } + #if VERDICT_DEBUG CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: ", (uint64_t)VM_KERNEL_ADDRPERM(so), @@ -1941,6 +2020,15 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, break; case CFM_OP_DROP: + if (cfil_info->cfi_debug) { + cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DROP"); + CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: ", + (uint64_t)VM_KERNEL_ADDRPERM(so), + cfil_info->cfi_sock_id, + action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset, + action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset); + } + #if VERDICT_DEBUG CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: ", (uint64_t)VM_KERNEL_ADDRPERM(so), @@ -2830,7 +2918,7 @@ done: errno_t cfil_sock_detach(struct socket *so) { - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { cfil_db_free(so); return 0; } @@ -3025,7 +3113,7 @@ cfil_dispatch_closed_event_sign(cfil_crypto_state_t crypto_state, boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT); union sockaddr_in_4_6 *src = outgoing ? &data.local : &data.remote; union sockaddr_in_4_6 *dst = outgoing ? &data.remote : &data.local; - cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, inp->inp_vflag & INP_IPV4, outgoing); + cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, !IS_INP_V6(inp), outgoing); } data.byte_count_in = cfil_info->cfi_byte_inbound_count; @@ -3135,7 +3223,7 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, if (hash_entry_ptr != NULL) { cfil_fill_event_msg_addresses(hash_entry_ptr, inp, &msg_attached.cfs_src, &msg_attached.cfs_dst, - inp->inp_vflag & INP_IPV4, conn_dir == CFS_CONNECTION_DIR_OUT); + !IS_INP_V6(inp), conn_dir == CFS_CONNECTION_DIR_OUT); } msg_attached.cfs_conn_dir = conn_dir; @@ -3157,6 +3245,10 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, } } + if (cfil_info->cfi_debug) { + cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING ATTACH UP"); + } + cfil_dispatch_attach_event_sign(entry->cfe_filter->cf_crypto_state, cfil_info, &msg_attached); #if LIFECYCLE_DEBUG @@ -3258,6 +3350,10 @@ cfil_dispatch_disconnect_event(struct socket *so, struct cfil_info *cfil_info, u goto done; } + if (cfil_info->cfi_debug) { + cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DISCONNECT UP"); + } + #if LIFECYCLE_DEBUG cfil_info_log(LOG_ERR, cfil_info, outgoing ? 
"CFIL: LIFECYCLE: OUT - SENDING DISCONNECT UP": @@ -3371,6 +3467,10 @@ cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int k cfil_dispatch_closed_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, &msg_closed); + if (cfil_info->cfi_debug) { + cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING CLOSED UP"); + } + #if LIFECYCLE_DEBUG CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec); #endif @@ -3507,6 +3607,7 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_ struct cfe_buf *entrybuf; struct content_filter *cfc; struct timeval tv; + int inp_flags = 0; cfil_rw_lock_shared(&cfil_lck_rw); @@ -3575,6 +3676,24 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_ data_req->cfd_start_offset = entrybuf->cfe_peeked; data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen; + data_req->cfd_flags = 0; + if (OPTIONAL_IP_HEADER(so)) { + /* + * For non-UDP/TCP traffic, indicate to filters if optional + * IP header is present: + * outgoing - indicate according to INP_HDRINCL flag + * incoming - For IPv4 only, stripping of IP header is + * optional. But for CFIL, we delay stripping + * at rip_input. So CFIL always expects IP + * frames. IP header will be stripped according + * to INP_STRIPHDR flag later at reinjection. + */ + if ((!outgoing && !IS_INP_V6(inp)) || + (outgoing && cfil_dgram_peek_socket_state(data, &inp_flags) && (inp_flags & INP_HDRINCL))) { + data_req->cfd_flags |= CFD_DATA_FLAG_IP_HEADER; + } + } + /* * Copy address/port into event msg. * For non connected sockets need to copy addresses from passed @@ -3582,7 +3701,11 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_ */ cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp, &data_req->cfc_src, &data_req->cfc_dst, - inp->inp_vflag & INP_IPV4, outgoing); + !IS_INP_V6(inp), outgoing); + + if (cfil_info->cfi_debug) { + cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DATA UP"); + } if (cfil_info->cfi_isSignatureLatest == false) { cfil_dispatch_data_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, data_req); @@ -3608,6 +3731,12 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_ (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen); #endif + if (cfil_info->cfi_debug) { + CFIL_LOG(LOG_ERR, "CFIL: VERDICT ACTION: so %llx sockID %llu outgoing %d: mbuf %llx copyoffset %u copylen %u (%s)", + (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen, + data_req->cfd_flags & CFD_DATA_FLAG_IP_HEADER ? "IP HDR" : "NO IP HDR"); + } + done: if (error == ENOBUFS) { entry->cfe_flags |= CFEF_FLOW_CONTROLLED; @@ -3955,6 +4084,9 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou struct cfil_queue *inject_q; int need_rwakeup = 0; int count = 0; + struct inpcb *inp = NULL; + struct ip *ip = NULL; + unsigned int hlen; if (cfil_info == NULL) { return 0; @@ -3984,10 +4116,13 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou datalen = cfil_data_length(data, &mbcnt, &mbnum); #if DATA_DEBUG - CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE INJECT-Q: <%s>: data %llx datalen %u (mbcnt %u)", - remote_addr_ptr ? 
"UNCONNECTED" : "CONNECTED", + CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: data %llx datalen %u (mbcnt %u)", (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt); #endif + if (cfil_info->cfi_debug) { + CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: data %llx datalen %u (mbcnt %u)", + (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt); + } /* Remove data from queue and adjust stats */ cfil_queue_remove(inject_q, data, datalen); @@ -4011,12 +4146,26 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou data->m_flags |= M_SKIPCFIL; /* - * NOTE: We currently only support TCP and UDP. - * For RAWIP, MPTCP and message TCP we'll + * NOTE: We currently only support TCP, UDP, ICMP, + * ICMPv6 and RAWIP. For MPTCP and message TCP we'll * need to call the appropriate sbappendxxx() * of fix sock_inject_data_in() */ - if (IS_UDP(so) == TRUE) { + if (IS_IP_DGRAM(so)) { + if (OPTIONAL_IP_HEADER(so)) { + inp = sotoinpcb(so); + if (inp && (inp->inp_flags & INP_STRIPHDR)) { + mbuf_t data_start = cfil_data_start(data); + if (data_start != NULL && (data_start->m_flags & M_PKTHDR)) { + ip = mtod(data_start, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + data_start->m_len -= hlen; + data_start->m_pkthdr.len -= hlen; + data_start->m_data += hlen; + } + } + } + if (sbappendchain(&so->so_rcv, data, 0)) { need_rwakeup = 1; } @@ -4042,6 +4191,10 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: injected %d", (uint64_t)VM_KERNEL_ADDRPERM(so), count); #endif + if (cfil_info->cfi_debug) { + CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: injected %d", + (uint64_t)VM_KERNEL_ADDRPERM(so), count); + } /* A single wakeup is for several packets is more efficient */ if (need_rwakeup) { @@ -4662,7 +4815,7 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s cfi_buf->cfi_pending_mbcnt += mbcnt; cfi_buf->cfi_pending_mbnum += mbnum; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { if (cfi_buf->cfi_pending_mbnum > cfil_udp_gc_mbuf_num_max || cfi_buf->cfi_pending_mbcnt > cfil_udp_gc_mbuf_cnt_max) { cfi_buf->cfi_tail_drop_cnt++; @@ -4697,12 +4850,12 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s // Is cfil attached to this filter? kcunit = CFI_ENTRY_KCUNIT(cfil_info, iter_entry); if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) { - if (IS_UDP(so) && chain == NULL) { - /* UDP only: + if (IS_IP_DGRAM(so) && chain == NULL) { + /* Datagrams only: * Chain addr (incoming only TDB), control (optional) and data into one chain. * This full chain will be reinjected into socket after recieving verdict. */ - (void) cfil_udp_save_socket_state(cfil_info, data); + (void) cfil_dgram_save_socket_state(cfil_info, data); chain = sbconcat_mbufs(NULL, outgoing ? 
NULL : to, data, control); if (chain == NULL) { return ENOBUFS; @@ -4741,7 +4894,7 @@ cfil_sock_data_out(struct socket *so, struct sockaddr *to, { int error = 0; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { return cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags); } @@ -4796,7 +4949,7 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from, { int error = 0; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { return cfil_sock_udp_handle_data(FALSE, so, NULL, from, data, control, flags); } @@ -4846,7 +4999,7 @@ cfil_sock_shutdown(struct socket *so, int *how) { int error = 0; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { return cfil_sock_udp_shutdown(so, how); } @@ -4932,7 +5085,7 @@ cfil_sock_is_closed(struct socket *so) errno_t error = 0; int kcunit; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { cfil_sock_udp_is_closed(so); return; } @@ -4977,7 +5130,7 @@ cfil_sock_notify_shutdown(struct socket *so, int how) errno_t error = 0; int kcunit; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { cfil_sock_udp_notify_shutdown(so, how, 0, 0); return; } @@ -5010,7 +5163,7 @@ cfil_filters_attached(struct socket *so) uint32_t kcunit; int attached = 0; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { return cfil_filters_udp_attached(so, FALSE); } @@ -5051,7 +5204,7 @@ cfil_sock_close_wait(struct socket *so) struct timespec ts; int error; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { cfil_sock_udp_close_wait(so); return; } @@ -5118,7 +5271,7 @@ cfil_sock_data_pending(struct sockbuf *sb) struct socket *so = sb->sb_so; uint64_t pending = 0; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { return cfil_sock_udp_data_pending(sb, FALSE); } @@ -5160,7 +5313,7 @@ cfil_sock_data_space(struct sockbuf *sb) struct socket *so = sb->sb_so; uint64_t pending = 0; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { return cfil_sock_udp_data_pending(sb, TRUE); } @@ -5205,7 +5358,7 @@ cfil_sock_buf_update(struct sockbuf *sb) int error; struct socket *so = sb->sb_so; - if (IS_UDP(so)) { + if (IS_IP_DGRAM(so)) { cfil_sock_udp_buf_update(sb); return; } @@ -5485,8 +5638,9 @@ cfil_hash_entry_log(int level, struct socket *so, struct cfil_hash_entry *entry, return; } - CFIL_LOG(level, "<%s>: lport %d fport %d laddr %s faddr %s", + CFIL_LOG(level, "<%s>: <%s(%d) so %llx, entry %p, sockID %llu> lport %d fport %d laddr %s faddr %s", msg, + IS_UDP(so) ? 
"UDP" : "proto", GET_SO_PROTO(so), (uint64_t)VM_KERNEL_ADDRPERM(so), entry, sockId, ntohs(entry->cfentry_lport), ntohs(entry->cfentry_fport), local, remote); } @@ -5686,25 +5840,25 @@ fill_cfil_hash_entry_from_inp(struct cfil_hash_entry *entry, bool isLocal, struc return FALSE; } - if (inp->inp_vflag & INP_IPV4) { + if (inp->inp_vflag & INP_IPV6) { if (isLocal == TRUE) { entry->cfentry_lport = inp->inp_lport; - entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr; + entry->cfentry_laddr.addr6 = inp->in6p_laddr; } else { entry->cfentry_fport = inp->inp_fport; - entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr; + entry->cfentry_faddr.addr6 = inp->in6p_faddr; } - entry->cfentry_family = AF_INET; + entry->cfentry_family = AF_INET6; return TRUE; - } else if (inp->inp_vflag & INP_IPV6) { + } else if (inp->inp_vflag & INP_IPV4) { if (isLocal == TRUE) { entry->cfentry_lport = inp->inp_lport; - entry->cfentry_laddr.addr6 = inp->in6p_laddr; + entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr; } else { entry->cfentry_fport = inp->inp_fport; - entry->cfentry_faddr.addr6 = inp->in6p_faddr; + entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr; } - entry->cfentry_family = AF_INET6; + entry->cfentry_family = AF_INET; return TRUE; } return FALSE; @@ -5775,12 +5929,13 @@ cfil_db_lookup_entry_with_sockid(struct cfil_db *db, u_int64_t sock_id) } struct cfil_hash_entry * -cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote) +cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote, boolean_t remoteOnly) { - struct cfil_hash_entry matchentry; + struct cfil_hash_entry matchentry = { }; struct cfil_hash_entry *nextentry = NULL; struct inpcb *inp = sotoinpcb(db->cfdb_so); u_int32_t hashkey_faddr = 0, hashkey_laddr = 0; + u_int16_t hashkey_fport = 0, hashkey_lport = 0; int inp_hash_element = 0; struct cfilhashhead *cfilhash = NULL; @@ -5790,10 +5945,12 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr goto done; } - if (local != NULL) { - fill_cfil_hash_entry_from_address(&matchentry, TRUE, local); - } else { - fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp); + if (remoteOnly == false) { + if (local != NULL) { + fill_cfil_hash_entry_from_address(&matchentry, TRUE, local); + } else { + fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp); + } } if (remote != NULL) { fill_cfil_hash_entry_from_address(&matchentry, FALSE, remote); @@ -5804,16 +5961,18 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr #if INET6 if (inp->inp_vflag & INP_IPV6) { hashkey_faddr = matchentry.cfentry_faddr.addr6.s6_addr32[3]; - hashkey_laddr = matchentry.cfentry_laddr.addr6.s6_addr32[3]; + hashkey_laddr = (remoteOnly == false) ? matchentry.cfentry_laddr.addr6.s6_addr32[3] : 0; } else #endif /* INET6 */ { hashkey_faddr = matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr; - hashkey_laddr = matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr; + hashkey_laddr = (remoteOnly == false) ? matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr : 0; } - inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr, - matchentry.cfentry_lport, matchentry.cfentry_fport); + hashkey_fport = matchentry.cfentry_fport; + hashkey_lport = (remoteOnly == false) ? 
matchentry.cfentry_lport : 0; + + inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr, hashkey_lport, hashkey_fport); inp_hash_element &= db->cfdb_hashmask; cfilhash = &db->cfdb_hashbase[inp_hash_element]; @@ -5821,9 +5980,9 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr LIST_FOREACH(nextentry, cfilhash, cfentry_link) { #if INET6 if ((inp->inp_vflag & INP_IPV6) && - nextentry->cfentry_lport == matchentry.cfentry_lport && + (remoteOnly || nextentry->cfentry_lport == matchentry.cfentry_lport) && nextentry->cfentry_fport == matchentry.cfentry_fport && - IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6) && + (remoteOnly || IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6)) && IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_faddr.addr6, &matchentry.cfentry_faddr.addr6)) { #if DATA_DEBUG cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V6 found entry"); @@ -5831,9 +5990,9 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr return nextentry; } else #endif /* INET6 */ - if (nextentry->cfentry_lport == matchentry.cfentry_lport && + if ((remoteOnly || nextentry->cfentry_lport == matchentry.cfentry_lport) && nextentry->cfentry_fport == matchentry.cfentry_fport && - nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr && + (remoteOnly || nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr) && nextentry->cfentry_faddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr) { #if DATA_DEBUG cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V4 found entry"); @@ -5925,6 +6084,27 @@ done: return entry; } +void +cfil_db_update_entry_local(struct cfil_db *db, struct cfil_hash_entry *entry, struct sockaddr *local) +{ + struct inpcb *inp = sotoinpcb(db->cfdb_so); + + CFIL_LOG(LOG_INFO, ""); + + if (inp == NULL || entry == NULL) { + return; + } + + if (local != NULL) { + fill_cfil_hash_entry_from_address(entry, TRUE, local); + } else { + fill_cfil_hash_entry_from_inp(entry, TRUE, inp); + } + cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, entry, 0, "CFIL: cfil_db_add_entry: local updated"); + + return; +} + struct cfil_info * cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id) { @@ -5952,7 +6132,7 @@ cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id) } struct cfil_hash_entry * -cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote) +cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote, int debug) { struct cfil_hash_entry *hash_entry = NULL; @@ -5967,7 +6147,16 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out } // See if flow already exists. - hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote); + hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote, false); + if (hash_entry == NULL) { + // No match with both local and remote, try match with remote only + hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote, true); + if (hash_entry != NULL) { + // Simply update the local address into the original flow, keeping + // its sockId and flow_hash unchanged. 
+ cfil_db_update_entry_local(so->so_cfil_db, hash_entry, local); + } + } if (hash_entry != NULL) { return hash_entry; } @@ -5987,6 +6176,7 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out return NULL; } hash_entry->cfentry_cfil->cfi_dir = outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN; + hash_entry->cfentry_cfil->cfi_debug = debug; #if LIFECYCLE_DEBUG cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED"); @@ -6010,6 +6200,10 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out /* Hold a reference on the socket for each flow */ so->so_usecount++; + if (debug) { + cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED"); + } + error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, 0, outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN); /* We can recover from flow control or out of memory errors */ @@ -6031,6 +6225,7 @@ cfil_sock_udp_handle_data(bool outgoing, struct socket *so, uint32_t filter_control_unit; struct cfil_hash_entry *hash_entry = NULL; struct cfil_info *cfil_info = NULL; + int debug = 0; socket_lock_assert_owned(so); @@ -6061,7 +6256,7 @@ cfil_sock_udp_handle_data(bool outgoing, struct socket *so, return error; } - hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote); + hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote, debug); if (hash_entry == NULL || hash_entry->cfentry_cfil == NULL) { CFIL_LOG(LOG_ERR, "CFIL: Failed to create UDP flow"); return EPIPE; } @@ -6739,7 +6934,7 @@ cfil_info_udp_expire(void *v, wait_result_t w) break; } - if (IS_UDP(cfil_info->cfi_so)) { + if (IS_IP_DGRAM(cfil_info->cfi_so)) { if (cfil_info_idle_timed_out(cfil_info, UDP_FLOW_GC_IDLE_TO, current_time) || cfil_info_action_timed_out(cfil_info, UDP_FLOW_GC_ACTION_TO) || cfil_info_buffer_threshold_exceeded(cfil_info)) { @@ -6808,17 +7003,20 @@ go_sleep: } struct m_tag * -cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m) +cfil_dgram_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m) { struct m_tag *tag = NULL; struct cfil_tag *ctag = NULL; struct cfil_hash_entry *hash_entry = NULL; + struct inpcb *inp = NULL; if (cfil_info == NULL || cfil_info->cfi_so == NULL || cfil_info->cfi_hash_entry == NULL || m == NULL || !(m->m_flags & M_PKTHDR)) { return NULL; } + inp = sotoinpcb(cfil_info->cfi_so); + /* Allocate a tag */ tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, sizeof(struct cfil_tag), M_DONTWAIT, m); @@ -6827,6 +7025,7 @@ cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m) ctag = (struct cfil_tag*)(tag + 1); ctag->cfil_so_state_change_cnt = cfil_info->cfi_so->so_state_change_cnt; ctag->cfil_so_options = cfil_info->cfi_so->so_options; + ctag->cfil_inp_flags = inp ?
inp->inp_flags : 0; hash_entry = cfil_info->cfi_hash_entry; if (hash_entry->cfentry_family == AF_INET6) { @@ -6845,8 +7044,8 @@ cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m) } struct m_tag * -cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options, - struct sockaddr **faddr) +cfil_dgram_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options, + struct sockaddr **faddr, int *inp_flags) { struct m_tag *tag = NULL; struct cfil_tag *ctag = NULL; @@ -6863,6 +7062,9 @@ cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *opt if (faddr) { *faddr = (struct sockaddr *) &ctag->cfil_faddr; } + if (inp_flags) { + *inp_flags = ctag->cfil_inp_flags; + } /* * Unlink tag and hand it over to caller. @@ -6874,6 +7076,23 @@ cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *opt return NULL; } +boolean_t +cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags) +{ + struct m_tag *tag = NULL; + struct cfil_tag *ctag = NULL; + + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, NULL); + if (tag) { + ctag = (struct cfil_tag *)(tag + 1); + if (inp_flags) { + *inp_flags = ctag->cfil_inp_flags; + } + return true; + } + return false; +} + static int cfil_dispatch_stats_event_locked(int kcunit, struct cfil_stats_report_buffer *buffer, uint32_t stats_count) { @@ -7057,7 +7276,7 @@ cfil_stats_collect_flow_stats_for_filter(int kcunit, union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL; union sockaddr_in_4_6 *dst = outgoing ? NULL : &cfil_info->cfi_so_attach_laddr; cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp, - src, dst, inp->inp_vflag & INP_IPV4, outgoing); + src, dst, !IS_INP_V6(inp), outgoing); } } diff --git a/bsd/net/content_filter.h b/bsd/net/content_filter.h index e3829bf02..2944eba1e 100644 --- a/bsd/net/content_filter.h +++ b/bsd/net/content_filter.h @@ -212,6 +212,11 @@ struct cfil_msg_sock_attached { uint32_t cfs_signature_length; }; +/* + * CFIL data flags + */ +#define CFD_DATA_FLAG_IP_HEADER 0x00000001 /* Data includes IP header */ + /* * struct cfil_msg_data_event * @@ -235,6 +240,7 @@ struct cfil_msg_data_event { uint64_t cfd_end_offset; cfil_crypto_signature cfd_signature; uint32_t cfd_signature_length; + uint32_t cfd_flags; /* Actual content data immediatly follows */ }; @@ -525,8 +531,10 @@ extern void cfil_sock_buf_update(struct sockbuf *sb); extern cfil_sock_id_t cfil_sock_id_from_socket(struct socket *so); -extern struct m_tag *cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, - short *options, struct sockaddr **faddr); +extern struct m_tag *cfil_dgram_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, + short *options, struct sockaddr **faddr, int *inp_flags); +extern boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags); + #endif /* BSD_KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c index 80bc27d78..e72c0d57c 100644 --- a/bsd/net/if_bridge.c +++ b/bsd/net/if_bridge.c @@ -140,6 +140,7 @@ #include #include #include +#include #include /* for struct arpcom */ #include @@ -2475,7 +2476,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) switch (ifs->if_type) { case IFT_ETHER: if (strcmp(ifs->if_name, "en") == 0 && - ifs->if_subfamily == IFNET_SUBFAMILY_WIFI) { + ifs->if_subfamily == IFNET_SUBFAMILY_WIFI && + (ifs->if_eflags & IFEF_IPV4_ROUTER) == 0) { /* XXX is there a better way to identify Wi-Fi STA? 
*/ mac_nat = TRUE; } diff --git a/bsd/net/necp.c b/bsd/net/necp.c index d410484ca..c73dadf89 100644 --- a/bsd/net/necp.c +++ b/bsd/net/necp.c @@ -7276,10 +7276,10 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc necp_get_parent_cred_result(NULL, info); } } + } - if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) { - info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false; - } + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) { + info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false; } if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && inp->inp_necp_attributes.inp_account != NULL) { diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 560dee4ab..9ba6cc70c 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -1127,6 +1127,7 @@ pf_rule_copyin(struct pf_rule *src, struct pf_rule *dst, struct proc *p, dst->tagname[sizeof(dst->tagname) - 1] = '\0'; dst->match_tagname[sizeof(dst->match_tagname) - 1] = '\0'; dst->overload_tblname[sizeof(dst->overload_tblname) - 1] = '\0'; + dst->owner[sizeof(dst->owner) - 1] = '\0'; dst->cuid = kauth_cred_getuid(p->p_ucred); dst->cpid = p->p_pid; @@ -1158,7 +1159,8 @@ pf_rule_copyout(struct pf_rule *src, struct pf_rule *dst) dst->kif = NULL; dst->overload_tbl = NULL; - TAILQ_INIT(&dst->rpool.list); + dst->rpool.list.tqh_first = NULL; + dst->rpool.list.tqh_last = NULL; dst->rpool.cur = NULL; dst->entries.tqe_prev = NULL; diff --git a/bsd/netinet/flow_divert.c b/bsd/netinet/flow_divert.c index 1a405129f..83d34f1a7 100644 --- a/bsd/netinet/flow_divert.c +++ b/bsd/netinet/flow_divert.c @@ -1689,6 +1689,7 @@ flow_divert_send_app_data(struct flow_divert_pcb *fd_cb, mbuf_t data, struct soc "sbappendaddr failed. 
send buffer size = %u, send_window = %u, error = %d\n", fd_cb->so->so_snd.sb_cc, fd_cb->send_window, error); } + error = 0; } else { if (!sbappendrecord(&fd_cb->so->so_snd, data)) { FDLOG(LOG_ERR, fd_cb, @@ -2104,6 +2105,9 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off FDLOG0(LOG_INFO, fd_cb, "No remote address provided"); error = 0; } else { + if (remote_address.ss_len > sizeof(remote_address)) { + remote_address.ss_len = sizeof(remote_address); + } /* validate the address */ if (flow_divert_is_sockaddr_valid((struct sockaddr *)&remote_address)) { got_remote_sa = TRUE; @@ -3247,6 +3251,9 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr struct flow_divert_pcb *fd_cb = so->so_fd_pcb; int error = 0; struct inpcb *inp; +#if CONTENT_FILTER + struct m_tag *cfil_tag = NULL; +#endif VERIFY((so->so_flags & SOF_FLOW_DIVERT) && so->so_fd_pcb != NULL); @@ -3284,7 +3291,7 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr */ if (to == NULL && so->so_cfil_db) { struct sockaddr *cfil_faddr = NULL; - struct m_tag *cfil_tag = cfil_udp_get_socket_state(data, NULL, NULL, &cfil_faddr); + cfil_tag = cfil_dgram_get_socket_state(data, NULL, NULL, &cfil_faddr, NULL); if (cfil_tag) { to = (struct sockaddr *)(void *)cfil_faddr; } @@ -3323,6 +3330,12 @@ done: if (control) { mbuf_free(control); } +#if CONTENT_FILTER + if (cfil_tag) { + m_tag_free(cfil_tag); + } +#endif + return error; } @@ -3444,7 +3457,12 @@ flow_divert_attach(struct socket *so, uint32_t flow_id, uint32_t ctl_unit) sorwakeup(so); } } - flow_divert_set_protosw(so); + if (SOCK_TYPE(so) == SOCK_STREAM) { + flow_divert_set_protosw(so); + } else if (SOCK_TYPE(so) == SOCK_DGRAM) { + flow_divert_set_udp_protosw(so); + } + socket_unlock(so, 0); fd_cb->so = so; diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index d097b293f..4f3be1b1d 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -3520,15 +3520,16 @@ inp_update_policy(struct inpcb *inp) #if defined(XNU_TARGET_OS_OSX) if (so->so_rpid > 0) { lookup_uuid = so->so_ruuid; + ogencnt = so->so_policy_gencnt; + err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt); } #endif - if (lookup_uuid == NULL) { + if (lookup_uuid == NULL || err == ENOENT) { lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid); + ogencnt = so->so_policy_gencnt; + err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt); } - ogencnt = so->so_policy_gencnt; - err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt); - /* * Discard cached generation count if the entry is gone (ENOENT), * so that we go thru the checks below. diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 44804c8a2..65adc858a 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -80,6 +80,7 @@ #include #include +#include #define _IP_VHL #include @@ -1289,6 +1290,7 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, struct in_ifaddr *ia = NULL; int icmplen; int error = EINVAL; + int inp_flags = inp ? 
inp->inp_flags : 0; if (inp == NULL #if NECP @@ -1301,7 +1303,16 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, goto bad; } - if ((inp->inp_flags & INP_HDRINCL) != 0) { +#if CONTENT_FILTER + /* + * If socket is subject to Content Filter, get inp_flags from saved state + */ + if (so->so_cfil_db && nam == NULL) { + cfil_dgram_peek_socket_state(m, &inp_flags); + } +#endif + + if ((inp_flags & INP_HDRINCL) != 0) { /* Expect 32-bit aligned data ptr on strict-align platforms */ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); /* diff --git a/bsd/netinet/mptcp_subr.c b/bsd/netinet/mptcp_subr.c index 9e8637a92..356298cf7 100644 --- a/bsd/netinet/mptcp_subr.c +++ b/bsd/netinet/mptcp_subr.c @@ -2394,6 +2394,11 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src, goto out_err; } + if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) { + error = EOVERFLOW; + goto out_err; + } + mpts = mptcp_subflow_alloc(); if (mpts == NULL) { os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n", diff --git a/bsd/netinet/mptcp_usrreq.c b/bsd/netinet/mptcp_usrreq.c index 0012e4497..db728ca45 100644 --- a/bsd/netinet/mptcp_usrreq.c +++ b/bsd/netinet/mptcp_usrreq.c @@ -222,7 +222,7 @@ out: } static int -mptcp_entitlement_check(struct socket *mp_so) +mptcp_entitlement_check(struct socket *mp_so, uint8_t svctype) { struct mptses *mpte = mpsotompte(mp_so); @@ -254,7 +254,7 @@ mptcp_entitlement_check(struct socket *mp_so) } #endif - if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) { + if (svctype == MPTCP_SVCTYPE_AGGREGATE) { if (mptcp_developer_mode) { return 0; } @@ -274,7 +274,7 @@ mptcp_entitlement_check(struct socket *mp_so) deny: os_log_error(mptcp_log_handle, "%s - %lx: MPTCP prohibited on svc %u\n", - __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_svctype); + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), svctype); return -1; } @@ -354,7 +354,7 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src, } if (!(mpte->mpte_flags & MPTE_SVCTYPE_CHECKED)) { - if (mptcp_entitlement_check(mp_so) < 0) { + if (mptcp_entitlement_check(mp_so, mpte->mpte_svctype) < 0) { error = EPERM; goto out; } @@ -1713,13 +1713,12 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) goto err_out; } - mpte->mpte_svctype = optval; - - if (mptcp_entitlement_check(mp_so) < 0) { + if (mptcp_entitlement_check(mp_so, optval) < 0) { error = EACCES; goto err_out; } + mpte->mpte_svctype = optval; mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED; goto out; diff --git a/bsd/netinet/mptcp_var.h b/bsd/netinet/mptcp_var.h index f13bfb950..c1063229d 100644 --- a/bsd/netinet/mptcp_var.h +++ b/bsd/netinet/mptcp_var.h @@ -62,6 +62,7 @@ struct mptses { struct mptcb *mpte_mptcb; /* ptr to MPTCP PCB */ TAILQ_HEAD(, mptopt) mpte_sopts; /* list of socket options */ TAILQ_HEAD(, mptsub) mpte_subflows; /* list of subflows */ +#define MPTCP_MAX_NUM_SUBFLOWS 256 uint16_t mpte_numflows; /* # of subflows in list */ uint16_t mpte_nummpcapflows; /* # of MP_CAP subflows */ sae_associd_t mpte_associd; /* MPTCP association ID */ diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index dc552d9e3..b3838d3e7 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -86,6 +86,7 @@ #include #include #include +#include #define _IP_VHL #include @@ -277,7 +278,14 @@ rip_input(struct mbuf *m, int iphlen) continue; } } - if (last->inp_flags & INP_STRIPHDR) { + if (last->inp_flags & INP_STRIPHDR +#if CONTENT_FILTER + /* + * If socket is subject to Content Filter, delay stripping until reinject + */ + && 
(last->inp_socket->so_cfil_db == NULL) +#endif + ) { n->m_len -= iphlen; n->m_pkthdr.len -= iphlen; n->m_data += iphlen; @@ -330,7 +338,14 @@ rip_input(struct mbuf *m, int iphlen) goto unlock; } } - if (last->inp_flags & INP_STRIPHDR) { + if (last->inp_flags & INP_STRIPHDR +#if CONTENT_FILTER + /* + * If socket is subject to Content Filter, delay stripping until reinject + */ + && (last->inp_socket->so_cfil_db == NULL) +#endif + ) { m->m_len -= iphlen; m->m_pkthdr.len -= iphlen; m->m_data += iphlen; @@ -370,10 +385,74 @@ rip_output( struct ip *ip; struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; + int inp_flags = inp ? inp->inp_flags : 0; struct ip_out_args ipoa; struct ip_moptions *imo; int tos = IPTOS_UNSPEC; int error = 0; +#if CONTENT_FILTER + struct m_tag *cfil_tag = NULL; + bool cfil_faddr_use = false; + uint32_t cfil_so_state_change_cnt = 0; + short cfil_so_options = 0; + int cfil_inp_flags = 0; + struct sockaddr *cfil_faddr = NULL; + struct sockaddr_in *cfil_sin; +#endif + +#if CONTENT_FILTER + /* + * If socket is subject to Content Filter and no addr is passed in, + * retrieve CFIL saved state from mbuf and use it if necessary. + */ + if (so->so_cfil_db && dst == INADDR_ANY) { + cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, &cfil_inp_flags); + if (cfil_tag) { + cfil_sin = SIN(cfil_faddr); + flags = (cfil_so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; + inp_flags = cfil_inp_flags; + if (inp && inp->inp_faddr.s_addr == INADDR_ANY) { + /* + * Socket is unconnected, simply use the saved faddr as 'addr' to go through + * the connect/disconnect logic. + */ + dst = cfil_sin->sin_addr.s_addr; + } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) && + (inp->inp_fport != cfil_sin->sin_port || + inp->inp_faddr.s_addr != cfil_sin->sin_addr.s_addr)) { + /* + * Socket is connected but socket state and dest addr/port changed. + * We need to use the saved faddr and socket options. + */ + cfil_faddr_use = true; + } + m_tag_free(cfil_tag); + } + } +#endif + + if (so->so_state & SS_ISCONNECTED) { + if (dst != INADDR_ANY) { + if (m != NULL) { + m_freem(m); + } + if (control != NULL) { + m_freem(control); + } + return EISCONN; + } + dst = cfil_faddr_use ? cfil_sin->sin_addr.s_addr : inp->inp_faddr.s_addr; + } else { + if (dst == INADDR_ANY) { + if (m != NULL) { + m_freem(m); + } + if (control != NULL) { + m_freem(control); + } + return ENOTCONN; + } + } bzero(&ipoa, sizeof(ipoa)); ipoa.ipoa_boundif = IFSCOPE_NONE; @@ -436,7 +515,7 @@ rip_output( * If the user handed us a complete IP packet, use it. * Otherwise, allocate an mbuf for a header and fill it in. 
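 *
 * With content filtering in the picture, that decision is made against
 * the local inp_flags copy, which the hunk above may have replaced with
 * the flags saved in the packet's cfil tag. A condensed sketch of the
 * resulting control flow, with details elided (cfil_inp_flags is the
 * value returned through cfil_dgram_get_socket_state() earlier in this
 * function):
 *
 *	int inp_flags = inp ? inp->inp_flags : 0;
 *	if (cfil_tag) {
 *		inp_flags = cfil_inp_flags;	// flags captured at filter time
 *	}
 *	if ((inp_flags & INP_HDRINCL) == 0) {
 *		// the caller did not supply an IP header; build one here
 *	}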
*/ - if ((inp->inp_flags & INP_HDRINCL) == 0) { + if ((inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return EMSGSIZE; @@ -493,8 +572,12 @@ rip_output( /* * We need a route to perform NECP route rule checks */ - if (net_qos_policy_restricted != 0 && - ROUTE_UNUSABLE(&inp->inp_route)) { + if ((net_qos_policy_restricted != 0 && + ROUTE_UNUSABLE(&inp->inp_route)) +#if CONTENT_FILTER + || cfil_faddr_use +#endif + ) { struct sockaddr_in to; struct sockaddr_in from; struct in_addr laddr = ip->ip_src; @@ -600,6 +683,10 @@ rip_output( if ((rt->rt_flags & (RTF_MULTICAST | RTF_BROADCAST)) || inp->inp_socket == NULL || +#if CONTENT_FILTER + /* Discard temporary route for cfil case */ + cfil_faddr_use || +#endif !(inp->inp_socket->so_state & SS_ISCONNECTED)) { rt = NULL; /* unusable */ } @@ -1067,7 +1154,7 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, { #pragma unused(flags, p) struct inpcb *inp = sotoinpcb(so); - u_int32_t dst; + u_int32_t dst = INADDR_ANY; int error = 0; if (inp == NULL @@ -1083,17 +1170,7 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, goto bad; } - if (so->so_state & SS_ISCONNECTED) { - if (nam != NULL) { - error = EISCONN; - goto bad; - } - dst = inp->inp_faddr.s_addr; - } else { - if (nam == NULL) { - error = ENOTCONN; - goto bad; - } + if (nam != NULL) { dst = ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr; } return rip_output(m, so, dst, control); diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index ed16674e5..818f05cce 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -1479,6 +1479,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, #if CONTENT_FILTER struct m_tag *cfil_tag = NULL; bool cfil_faddr_use = false; + bool sndinprog_cnt_used = false; uint32_t cfil_so_state_change_cnt = 0; short cfil_so_options = 0; struct sockaddr *cfil_faddr = NULL; @@ -1510,7 +1511,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * retrieve CFIL saved state from mbuf and use it if necessary. 
*/ if (so->so_cfil_db && !addr) { - cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr); + cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, NULL); if (cfil_tag) { sin = (struct sockaddr_in *)(void *)cfil_faddr; if (inp && inp->inp_faddr.s_addr == INADDR_ANY) { @@ -1673,6 +1674,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, fport = ((struct sockaddr_in *)(void *)cfil_faddr)->sin_port; } #endif + inp->inp_sndinprog_cnt++; + sndinprog_cnt_used = true; if (addr) { sin = (struct sockaddr_in *)(void *)addr; @@ -1936,8 +1939,6 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR; } - inp->inp_sndinprog_cnt++; - socket_unlock(so, 0); error = ip_output(m, inpopts, &ro, soopts, mopts, &ipoa); m = NULL; @@ -1971,14 +1972,6 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, inp_set_fc_state(inp, adv->code); } - VERIFY(inp->inp_sndinprog_cnt > 0); - if (--inp->inp_sndinprog_cnt == 0) { - inp->inp_flags &= ~(INP_FC_FEEDBACK); - if (inp->inp_sndingprog_waiters > 0) { - wakeup(&inp->inp_sndinprog_cnt); - } - } - /* Synchronize PCB cached route */ inp_route_copyin(inp, &ro); @@ -2057,6 +2050,16 @@ release: m_tag_free(cfil_tag); } #endif + if (sndinprog_cnt_used) { + VERIFY(inp->inp_sndinprog_cnt > 0); + if (--inp->inp_sndinprog_cnt == 0) { + inp->inp_flags &= ~(INP_FC_FEEDBACK); + if (inp->inp_sndingprog_waiters > 0) { + wakeup(&inp->inp_sndinprog_cnt); + } + } + sndinprog_cnt_used = false; + } return error; } diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index a7376b686..042244b83 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -3157,8 +3157,6 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, #pragma unused(flags, p) int error = 0; struct inpcb *inp = sotoinpcb(so); - struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam; struct icmp6_hdr *icmp6; if (inp == NULL @@ -3174,28 +3172,6 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, return rip6_output(m, so, SIN6(nam), control, 0); } - /* always copy sockaddr to avoid overwrites */ - if (so->so_state & SS_ISCONNECTED) { - if (nam != NULL) { - error = EISCONN; - goto bad; - } - /* XXX */ - bzero(&tmp, sizeof(tmp)); - tmp.sin6_family = AF_INET6; - tmp.sin6_len = sizeof(struct sockaddr_in6); - bcopy(&inp->in6p_faddr, &tmp.sin6_addr, - sizeof(struct in6_addr)); - dst = &tmp; - } else { - if (nam == NULL) { - error = ENOTCONN; - goto bad; - } - tmp = *(struct sockaddr_in6 *)(void *)nam; - dst = &tmp; - } - /* * For an ICMPv6 packet, we should know its type and code */ @@ -3224,13 +3200,7 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, } } -#if ENABLE_DEFAULT_SCOPE - if (dst->sin6_scope_id == 0) { /* not change if specified */ - dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr); - } -#endif - - return rip6_output(m, so, dst, control, 0); + return rip6_output(m, so, SIN6(nam), control, 0); bad: VERIFY(error != 0); diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index 5b2b17517..15b48d475 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include @@ -363,8 +364,80 @@ rip6_output( int netsvctype = _NET_SERVICE_TYPE_UNSPEC; struct ip6_out_args ip6oa; int flags = IPV6_OUTARGS; + struct sockaddr_in6 tmp; +#if CONTENT_FILTER + struct m_tag *cfil_tag = NULL; + 
bool cfil_faddr_use = false; + uint32_t cfil_so_state_change_cnt = 0; + short cfil_so_options = 0; + struct sockaddr *cfil_faddr = NULL; + struct sockaddr_in6 *cfil_sin6 = NULL; +#endif in6p = sotoin6pcb(so); + if (in6p == NULL) { + error = EINVAL; + goto bad; + } + +#if CONTENT_FILTER + /* + * If socket is subject to Content Filter and no addr is passed in, + * retrieve CFIL saved state from mbuf and use it if necessary. + */ + if (so->so_cfil_db && !dstsock) { + cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, NULL); + if (cfil_tag) { + cfil_sin6 = SIN6(cfil_faddr); + if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { + /* + * Socket is unconnected, simply use the saved faddr as 'addr' to go through + * the connect/disconnect logic. + */ + dstsock = cfil_sin6; + } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) && + (in6p->in6p_fport != cfil_sin6->sin6_port || + !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &cfil_sin6->sin6_addr))) { + /* + * Socket is connected but socket state and dest addr/port changed. + * We need to use the saved faddr and socket options. + */ + cfil_faddr_use = true; + } + } + } +#endif + + /* always copy sockaddr to avoid overwrites */ + if (so->so_state & SS_ISCONNECTED) { + if (dstsock != NULL) { + error = EISCONN; + goto bad; + } + /* XXX */ + bzero(&tmp, sizeof(tmp)); + tmp.sin6_family = AF_INET6; + tmp.sin6_len = sizeof(struct sockaddr_in6); + bcopy( +#if CONTENT_FILTER + cfil_faddr_use ? &cfil_sin6->sin6_addr : +#endif + &in6p->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); + dstsock = &tmp; + } else { + if (dstsock == NULL) { + error = ENOTCONN; + goto bad; + } + tmp = *dstsock; + dstsock = &tmp; + } + +#if ENABLE_DEFAULT_SCOPE + if (dstsock->sin6_scope_id == 0) { /* not change if specified */ + dstsock->sin6_scope_id = scope6_addr2default(&dstsock->sin6_addr); + } +#endif bzero(&ip6oa, sizeof(ip6oa)); ip6oa.ip6oa_boundif = IFSCOPE_NONE; @@ -604,8 +677,12 @@ rip6_output( /* * We need a route to perform NECP route rule checks */ - if (net_qos_policy_restricted != 0 && - ROUTE_UNUSABLE(&in6p->in6p_route)) { + if ((net_qos_policy_restricted != 0 && + ROUTE_UNUSABLE(&in6p->in6p_route)) +#if CONTENT_FILTER + || cfil_faddr_use +#endif + ) { struct sockaddr_in6 to; struct sockaddr_in6 from; @@ -697,6 +774,10 @@ rip6_output( if ((rt->rt_flags & RTF_MULTICAST) || in6p->in6p_socket == NULL || +#if CONTENT_FILTER + /* Discard temporary route for cfil case */ + cfil_faddr_use || +#endif !(in6p->in6p_socket->so_state & SS_ISCONNECTED)) { rt = NULL; /* unusable */ } @@ -772,6 +853,12 @@ freectl: if (oifp != NULL) { ifnet_release(oifp); } +#if CONTENT_FILTER + if (cfil_tag) { + m_tag_free(cfil_tag); + } +#endif + return error; } @@ -1053,8 +1140,6 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, { #pragma unused(flags, p) struct inpcb *inp = sotoinpcb(so); - struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam; int error = 0; if (inp == NULL @@ -1070,33 +1155,7 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, goto bad; } - /* always copy sockaddr to avoid overwrites */ - if (so->so_state & SS_ISCONNECTED) { - if (nam != NULL) { - error = EISCONN; - goto bad; - } - /* XXX */ - bzero(&tmp, sizeof(tmp)); - tmp.sin6_family = AF_INET6; - tmp.sin6_len = sizeof(struct sockaddr_in6); - bcopy(&inp->in6p_faddr, &tmp.sin6_addr, - sizeof(struct in6_addr)); - dst = &tmp; - } else { - if (nam == NULL) { - error = ENOTCONN; 
- goto bad; - } - tmp = *(struct sockaddr_in6 *)(void *)nam; - dst = &tmp; - } -#if ENABLE_DEFAULT_SCOPE - if (dst->sin6_scope_id == 0) { /* not change if specified */ - dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr); - } -#endif - return rip6_output(m, so, dst, control, 1); + return rip6_output(m, so, SIN6(nam), control, 1); bad: VERIFY(error != 0); diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index 66025ca43..eba8e037e 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -173,6 +173,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, #if CONTENT_FILTER struct m_tag *cfil_tag = NULL; bool cfil_faddr_use = false; + bool sndinprog_cnt_used = false; uint32_t cfil_so_state_change_cnt = 0; struct sockaddr *cfil_faddr = NULL; struct sockaddr_in6 *cfil_sin6 = NULL; @@ -216,7 +217,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, * retrieve CFIL saved state from mbuf and use it if necessary. */ if (so->so_cfil_db && !addr6) { - cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr); + cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr, NULL); if (cfil_tag) { cfil_sin6 = (struct sockaddr_in6 *)(void *)cfil_faddr; if ((so->so_state_change_cnt != cfil_so_state_change_cnt) && @@ -250,6 +251,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, ip6oa.ip6oa_sotc = sotc; ip6oa.ip6oa_netsvctype = netsvctype; + in6p->inp_sndinprog_cnt++; + sndinprog_cnt_used = true; + if (addr6) { /* * IPv4 version of udp_output calls in_pcbconnect in this case, @@ -529,8 +533,6 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, IM6O_UNLOCK(im6o); } - in6p->inp_sndinprog_cnt++; - socket_unlock(so, 0); error = ip6_output(m, optp, &ro, flags, im6o, NULL, &ip6oa); m = NULL; @@ -568,14 +570,6 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, inp_set_fc_state(in6p, adv->code); } - VERIFY(in6p->inp_sndinprog_cnt > 0); - if (--in6p->inp_sndinprog_cnt == 0) { - in6p->inp_flags &= ~(INP_FC_FEEDBACK); - if (in6p->inp_sndingprog_waiters > 0) { - wakeup(&in6p->inp_sndinprog_cnt); - } - } - if (ro.ro_rt != NULL) { struct ifnet *outif = ro.ro_rt->rt_ifp; @@ -661,6 +655,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, goto releaseopt; release: + if (m != NULL) { m_freem(m); } @@ -677,5 +672,16 @@ releaseopt: m_tag_free(cfil_tag); } #endif + if (sndinprog_cnt_used) { + VERIFY(in6p->inp_sndinprog_cnt > 0); + if (--in6p->inp_sndinprog_cnt == 0) { + in6p->inp_flags &= ~(INP_FC_FEEDBACK); + if (in6p->inp_sndingprog_waiters > 0) { + wakeup(&in6p->inp_sndinprog_cnt); + } + } + sndinprog_cnt_used = false; + } + return error; } diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index 2917f5c7e..356d7b99d 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -1012,7 +1012,7 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, #if CONTENT_FILTER //If socket is subject to UDP Content Filter and unconnected, get addr from tag. 
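// The tag consumed on this path was attached by cfil_dgram_save_socket_state()
// when the packet was first diverted to the filter. A condensed sketch of the
// save/restore pairing, with allocation and error checks elided (field names
// are from content_filter.c; state_cnt, options and cfil_inp_flags are
// illustrative local variables, not part of the patch):
//
//	/* save side, at filter time */
//	ctag->cfil_so_state_change_cnt = so->so_state_change_cnt;
//	ctag->cfil_so_options = so->so_options;
//	ctag->cfil_inp_flags = inp ? inp->inp_flags : 0;
//	/* cfil_faddr is filled from the flow's hash entry */
//
//	/* restore side, at reinjection time; unlinks the tag from m */
//	cfil_tag = cfil_dgram_get_socket_state(m, &state_cnt, &options,
//	    &cfil_faddr, &cfil_inp_flags);
//	/* the caller owns cfil_tag and must m_tag_free() it when done */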
if (so->so_cfil_db && !addr && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { - cfil_tag = cfil_udp_get_socket_state(m, NULL, NULL, &cfil_faddr); + cfil_tag = cfil_dgram_get_socket_state(m, NULL, NULL, &cfil_faddr, NULL); if (cfil_tag) { addr = (struct sockaddr *)cfil_faddr; } diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 816e849c4..aa7aa4b4b 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -131,6 +131,7 @@ struct nameidata { #define NAMEI_CONTLOOKUP 0x002 /* Continue processing a lookup which was partially processed in a compound VNOP */ #define NAMEI_TRAILINGSLASH 0x004 /* There was at least one trailing slash after last component */ #define NAMEI_UNFINISHED 0x008 /* We broke off a lookup to do a compound op */ + /* * XXX Hack: we need to encode the intended VNOP in order to * be able to include information about which operations a filesystem @@ -143,6 +144,8 @@ struct nameidata { #define NAMEI_COMPOUNDRENAME 0x100 #define NAMEI_COMPOUND_OP_MASK (NAMEI_COMPOUNDOPEN | NAMEI_COMPOUNDREMOVE | NAMEI_COMPOUNDMKDIR | NAMEI_COMPOUNDRMDIR | NAMEI_COMPOUNDRENAME) +#define NAMEI_NOPROCLOCK 0x1000 /* do not take process lock (set by vnode_lookup) */ + #ifdef KERNEL /* * namei operational modifier flags, stored in ni_cnd.flags diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 763515e8f..57e827b40 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -419,6 +419,7 @@ struct proc { #if !CONFIG_EMBEDDED uint64_t p_user_data; /* general-purpose storage for userland-provided data */ #endif /* !CONFIG_EMBEDDED */ + lck_rw_t p_dirs_lock; /* keeps fd_cdir and fd_rdir stable across a lookup */ }; #define PGRPID_DEAD 0xdeaddead @@ -681,6 +682,7 @@ extern lck_grp_t * proc_knhashlock_grp; extern lck_grp_t * proc_mlock_grp; extern lck_grp_t * proc_ucred_mlock_grp; extern lck_grp_t * proc_slock_grp; +extern lck_grp_t * proc_dirslock_grp; extern lck_grp_attr_t * proc_lck_grp_attr; extern lck_attr_t * proc_lck_attr; @@ -702,6 +704,10 @@ extern void proc_fdlock(struct proc *); extern void proc_fdlock_spin(struct proc *); extern void proc_fdunlock(struct proc *); extern void proc_fdlock_assert(proc_t p, int assertflags); +extern void proc_dirs_lock_shared(struct proc *); +extern void proc_dirs_unlock_shared(struct proc *); +extern void proc_dirs_lock_exclusive(struct proc *); +extern void proc_dirs_unlock_exclusive(struct proc *); extern void proc_ucred_lock(struct proc *); extern void proc_ucred_unlock(struct proc *); __private_extern__ int proc_core_name(const char *name, uid_t uid, pid_t pid, diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 3611e2c6a..cbb856761 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2019 Apple Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -768,6 +768,7 @@ extern int sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct mbuf *control, int *error_out); extern int sbappendchain(struct sockbuf *sb, struct mbuf *m, int space); extern int sbappendrecord(struct sockbuf *sb, struct mbuf *m0); +extern int sbappendrecord_nodrop(struct sockbuf *sb, struct mbuf *m0); extern void sbflush(struct sockbuf *sb); extern int sbspace(struct sockbuf *sb); extern int soabort(struct socket *so); @@ -829,6 +830,7 @@ extern void so_acquire_accept_list(struct socket *, struct socket *); extern void so_release_accept_list(struct socket *); extern int sbappend(struct sockbuf *sb, struct mbuf *m); +extern int sbappend_nodrop(struct sockbuf *sb, struct mbuf *m); extern int sbappendstream(struct sockbuf *sb, struct mbuf *m); extern int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int *error_out); diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index c85c8485f..dff5f82d3 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -1330,56 +1330,6 @@ vfs_context_cwd(vfs_context_t ctx) return cwd; } -/* - * vfs_context_get_cwd - * - * Description: Returns a vnode for the current working directory for the - * supplied context. The returned vnode has an iocount on it - * which must be released with a vnode_put(). - * - * Parameters: vfs_context_t The context to use - * - * Returns: vnode_t The current working directory - * for this context - * - * Notes: The function first attempts to obtain the current directory - * from the thread, and if it is not present there, falls back - * to obtaining it from the process instead. If it can't be - * obtained from either place, we return NULLVP. - */ -vnode_t -vfs_context_get_cwd(vfs_context_t ctx) -{ - vnode_t cwd = NULLVP; - - if (ctx != NULL && ctx->vc_thread != NULL) { - uthread_t uth = get_bsdthread_info(ctx->vc_thread); - proc_t proc; - - /* - * Get the cwd from the thread; if there isn't one, get it - * from the process, instead. 
- */ - cwd = uth->uu_cdir; - - if (cwd) { - if ((vnode_get(cwd) != 0)) { - cwd = NULLVP; - } - } else if ((proc = (proc_t)get_bsdthreadtask_info(ctx->vc_thread)) != NULL && - proc->p_fd != NULL) { - proc_fdlock(proc); - cwd = proc->p_fd->fd_cdir; - if (cwd && (vnode_get(cwd) != 0)) { - cwd = NULLVP; - } - proc_fdunlock(proc); - } - } - - return cwd; -} - /* * vfs_context_create * diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 77c525baa..9aad31d57 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -113,7 +113,7 @@ static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, v #endif static int lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, int vbusyflags, vfs_context_t ctx); -static int handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx); +static int lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx); static int lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx); static void lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation); static int lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly, @@ -167,8 +167,6 @@ namei(struct nameidata *ndp) { struct filedesc *fdp; /* pointer to file descriptor state */ struct vnode *dp; /* the directory we are searching */ - struct vnode *rootdir_with_usecount = NULLVP; - struct vnode *startdir_with_usecount = NULLVP; struct vnode *usedvp = ndp->ni_dvp; /* store pointer to vp in case we must loop due to * heavy vnode pressure */ u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */ @@ -185,6 +183,8 @@ namei(struct nameidata *ndp) int volfs_restarts = 0; #endif size_t bytes_copied = 0; + bool take_proc_lock = !(ndp->ni_flag & NAMEI_NOPROCLOCK); + bool proc_lock_taken = false; fdp = p->p_fd; @@ -351,29 +351,15 @@ retry_copy: /* * determine the starting point for the translation. * - * We may need to upto 2 usecounts on vnodes before starting the translation - * We need to have a usecount on the root directory for the process - * for the entire duration of the lookup. This is because symlink - * translation can restart translation at / if a symlink is encountered. - * - * For the duration of this lookup at rootdir for this lookup is the one - * we fetch now under the proc_fdlock even the if the proc rootdir changes - * once we let go of the proc_fdlock. - * - * In the future we may consider holding off a chroot till we complete - * in progress lookups. - * - * If the starting directory is not the process rootdir then we need - * a usecount on the starting directory as well for the duration of the - * lookup. - * - * Getting an addtional usecount involves first getting an iocount under - * the lock that ensures that a usecount is on the directory. Once we - * get an iocount we can release the lock and we will be free to get a - * usecount without the vnode getting recycled. Once we get the usecount - * we can release the icoount which we used to get our usecount. + * We hold the proc_dirs lock across the lookup so that the + * process rootdir and cwd are stable (i.e. 
the usecounts + * on them are maintained for the duration of the lookup) */ - proc_fdlock(p); + if (take_proc_lock) { + assert(proc_lock_taken == false); + proc_dirs_lock_shared(p); + proc_lock_taken = true; + } if (!(fdp->fd_flags & FD_CHROOT)) { ndp->ni_rootdir = rootvnode; } else { @@ -382,10 +368,8 @@ retry_copy: if (!ndp->ni_rootdir) { if (!(fdp->fd_flags & FD_CHROOT)) { - proc_fdunlock(p); printf("rootvnode is not set\n"); } else { - proc_fdunlock(p); /* This should be a panic */ printf("fdp->fd_rdir is not set\n"); } @@ -393,43 +377,10 @@ retry_copy: goto error_out; } - /* - * We have the proc_fdlock here so we still have a usecount - * on ndp->ni_rootdir. - * - * However we need to get our own usecount on it in order to - * ensure that the vnode isn't recycled to something else. - * - * Note : It's fine if the vnode is force reclaimed but with - * a usecount it won't be reused until we release the reference. - * - * In order to get that usecount however, we need to first - * get non blocking iocount since we'll be doing this under - * the proc_fdlock. - */ - if (vnode_get(ndp->ni_rootdir) != 0) { - proc_fdunlock(p); - error = ENOENT; - goto error_out; - } - - proc_fdunlock(p); - - /* Now we can safely get our own ref on ni_rootdir */ - error = vnode_ref_ext(ndp->ni_rootdir, O_EVTONLY, 0); - vnode_put(ndp->ni_rootdir); - if (error) { - ndp->ni_rootdir = NULLVP; - goto error_out; - } - - rootdir_with_usecount = ndp->ni_rootdir; - cnp->cn_nameptr = cnp->cn_pnbuf; ndp->ni_usedvp = NULLVP; - bool dp_needs_put = false; if (*(cnp->cn_nameptr) == '/') { while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; @@ -440,40 +391,15 @@ retry_copy: dp = ndp->ni_dvp; ndp->ni_usedvp = dp; } else { - dp = vfs_context_get_cwd(ctx); - if (dp) { - dp_needs_put = true; - } + dp = vfs_context_cwd(ctx); } if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) { - if (dp_needs_put) { - vnode_put(dp); - dp_needs_put = false; - } dp = NULLVP; error = ENOENT; goto error_out; } - if (dp != rootdir_with_usecount) { - error = vnode_ref_ext(dp, O_EVTONLY, 0); - if (error) { - if (dp_needs_put) { - vnode_put(dp); - dp_needs_put = false; - } - dp = NULLVP; - goto error_out; - } - startdir_with_usecount = dp; - } - - if (dp_needs_put) { - vnode_put(dp); - dp_needs_put = false; - } - ndp->ni_dvp = NULLVP; ndp->ni_vp = NULLVP; @@ -492,7 +418,6 @@ retry_copy: goto error_out; } #endif - ndp->ni_startdir = dp; dp = NULLVP; @@ -504,46 +429,19 @@ retry_copy: * Check for symbolic link */ if ((cnp->cn_flags & ISSYMLINK) == 0) { - if (startdir_with_usecount) { - vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0); - startdir_with_usecount = NULLVP; - } - if (rootdir_with_usecount) { - vnode_rele_ext(rootdir_with_usecount, O_EVTONLY, 0); - rootdir_with_usecount = NULLVP; + if (proc_lock_taken) { + proc_dirs_unlock_shared(p); + proc_lock_taken = false; } return 0; } continue_symlink: - /* - * Gives us a new path to process, and a starting dir (with an iocount). - * The iocount is needed to take a usecount on the vnode returned - * (if it is not a vnode we already have a usecount on.
- */ - error = handle_symlink_for_namei(ndp, &dp, ctx); + /* Gives us a new path to process, and a starting dir */ + error = lookup_handle_symlink(ndp, &dp, ctx); if (error != 0) { break; } - - if (dp == ndp->ni_rootdir && startdir_with_usecount) { - vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0); - startdir_with_usecount = NULLVP; - } else if (dp != startdir_with_usecount) { - if (startdir_with_usecount) { - vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0); - startdir_with_usecount = NULLVP; - } - error = vnode_ref_ext(dp, O_EVTONLY, 0); - if (error) { - vnode_put(dp); - dp = NULLVP; - goto error_out; - } - startdir_with_usecount = dp; - } - /* iocount not required on dp anymore */ - vnode_put(dp); } /* * only come here if we fail to handle a SYMLINK... @@ -559,6 +457,10 @@ out_drop: vnode_put(ndp->ni_vp); } error_out: + if (proc_lock_taken) { + proc_dirs_unlock_shared(p); + proc_lock_taken = false; + } if ((cnp->cn_flags & HASBUF)) { cnp->cn_flags &= ~HASBUF; FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); @@ -567,15 +469,6 @@ error_out: ndp->ni_vp = NULLVP; ndp->ni_dvp = NULLVP; - if (startdir_with_usecount) { - vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0); - startdir_with_usecount = NULLVP; - } - if (rootdir_with_usecount) { - vnode_rele_ext(rootdir_with_usecount, O_EVTONLY, 0); - rootdir_with_usecount = NULLVP; - } - #if CONFIG_VOLFS /* * Deal with volfs fallout. @@ -1672,10 +1565,10 @@ out: /* * Takes ni_vp and ni_dvp non-NULL. Returns with *new_dp set to the location - * at which to start a lookup with a resolved path and with an iocount. + * at which to start a lookup with a resolved path, and all other iocounts dropped. */ static int -handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) +lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) { int error; char *cp; /* pointer into pathname argument */ @@ -1766,18 +1659,17 @@ handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t c /* * starting point for 'relative' * symbolic link path - * - * If the starting point is not the root we have to return an iocounted - * dp to namei so we don't release the icoount here. 
- */ dp = ndp->ni_dvp; - ndp->ni_dvp = NULLVP; /* * get rid of references returned via 'lookup' */ vnode_put(ndp->ni_vp); + vnode_put(ndp->ni_dvp); /* ALWAYS have a dvp for a symlink */ + ndp->ni_vp = NULLVP; + ndp->ni_dvp = NULLVP; /* * Check if symbolic link restarts us at the root */ @@ -1787,20 +1679,9 @@ handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t c cnp->cn_nameptr++; ndp->ni_pathlen--; } - vnode_put(dp); if ((dp = ndp->ni_rootdir) == NULLVP) { return ENOENT; } - if (vnode_get(dp) != 0) { - return ENOENT; - } - } - - if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) { - if (dp) { - vnode_put(dp); - } - return ENOENT; } *new_dp = dp; diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 866780991..0d44e828b 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -5948,6 +5948,8 @@ vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx, if (start_dvp && (path[0] != '/')) { nd.ni_dvp = start_dvp; nd.ni_cnd.cn_flags |= USEDVP; + /* Don't take the proc lock in vnode_lookupat when a startdir is specified */ + nd.ni_flag |= NAMEI_NOPROCLOCK; } if ((error = namei(&nd))) { diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index a0a04deb8..f27adda4d 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -211,6 +211,9 @@ struct fd_vn_data * fg_vn_data_alloc(void); */ #define MAX_AUTHORIZE_ENOENT_RETRIES 1024 +/* Max retry limit for rename due to vnode recycling. */ +#define MAX_RENAME_ERECYCLE_RETRIES 1024 + static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg, int unlink_flags); @@ -2005,6 +2008,7 @@ checkdirs_callback(proc_t p, void * arg) return PROC_RETURNED; } + proc_dirs_lock_exclusive(p); /* * Now do the work. Note: we dropped the proc_fdlock, so we * have to do all of the checks again. @@ -2024,6 +2028,7 @@ checkdirs_callback(proc_t p, void * arg) } } proc_fdunlock(p); + proc_dirs_unlock_exclusive(p); /* * Dispose of any references that are no longer needed. @@ -3586,10 +3591,12 @@ common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread) return ENOENT; } } else { + proc_dirs_lock_exclusive(p); proc_fdlock(p); tvp = fdp->fd_cdir; fdp->fd_cdir = vp; proc_fdunlock(p); + proc_dirs_unlock_exclusive(p); } if (tvp) { @@ -3659,10 +3666,12 @@ chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_threa return ENOENT; } } else { + proc_dirs_lock_exclusive(p); proc_fdlock(p); tvp = fdp->fd_cdir; fdp->fd_cdir = ndp->ni_vp; proc_fdunlock(p); + proc_dirs_unlock_exclusive(p); } if (tvp) { @@ -3781,11 +3790,21 @@ chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval) } vnode_put(nd.ni_vp); + /* + * This lock provides the guarantee that as long as you hold the lock, + * fdp->fd_rdir has a usecount on it. This is used to take an iocount + * on a referenced vnode in namei when determining the rootvnode for + * a process. + */ + /* needed for synchronization with lookup */ + proc_dirs_lock_exclusive(p); + /* needed for setting the flag and other activities on the fd itself */ proc_fdlock(p); tvp = fdp->fd_rdir; fdp->fd_rdir = nd.ni_vp; fdp->fd_flags |= FD_CHROOT; proc_fdunlock(p); + proc_dirs_unlock_exclusive(p); if (tvp != NULL) { vnode_rele(tvp); @@ -8478,7 +8497,13 @@ skipped_lookup: * but other filesystems susceptible to this race could return it, too.
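 *
 * The retry is now bounded by MAX_RENAME_ERECYCLE_RETRIES instead of being
 * unconditional. A condensed sketch of the loop shape around this hunk,
 * where rename_attempt() is an illustrative stand-in for the body of the
 * rename path rather than a real function:
 *
 *	int retry_count = 0, do_retry;
 *	do {
 *		do_retry = 0;
 *		error = rename_attempt();
 *		if (error == ERECYCLE) {
 *			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
 *				do_retry = 1;
 *				retry_count += 1;
 *			} else {
 *				error = ENOENT;
 *			}
 *		}
 *	} while (do_retry);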
*/ if (error == ERECYCLE) { - do_retry = 1; + if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) { + do_retry = 1; + retry_count += 1; + } else { + printf("rename retry limit due to ERECYCLE reached\n"); + error = ENOENT; + } } /* diff --git a/config/MasterVersion b/config/MasterVersion index 441ebcc08..9259dc79e 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -19.4.0 +19.5.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index 0c19f4964..7b6beff04 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -67,6 +67,10 @@ enum IODirection kIODirectionPrepareReserved1 = 0x00000010, #define IODIRECTIONPREPARENONCOHERENTDEFINED 1 kIODirectionPrepareNonCoherent = 0x00000020, +#if KERNEL_PRIVATE +#define IODIRECTIONPREPAREAVOIDTHROTTLING 1 + kIODirectionPrepareAvoidThrottling = 0x00000100, +#endif // these flags are valid for the complete() method only #define IODIRECTIONCOMPLETEWITHERRORDEFINED 1 diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index d73a4343b..a4e7d0536 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -3440,6 +3440,7 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) upl_abort(iopl.fIOPL, 0); upl_deallocate(iopl.fIOPL); } + error = kIOReturnNoMemory; goto abortExit; } dataP = NULL; @@ -3740,6 +3741,10 @@ IOGeneralMemoryDescriptor::prepare(IODirection forDirection) } if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) { + if ((forDirection & kIODirectionPrepareAvoidThrottling) && NEED_TO_HARD_THROTTLE_THIS_TASK()) { + error = kIOReturnNotReady; + goto finish; + } error = wireVirtual(forDirection); } @@ -3751,6 +3756,8 @@ IOGeneralMemoryDescriptor::prepare(IODirection forDirection) } } +finish: + if (_prepareLock) { IOLockUnlock(_prepareLock); } diff --git a/osfmk/arm/pmap.c b/osfmk/arm/pmap.c index 2f01b4681..cd07ccaba 100644 --- a/osfmk/arm/pmap.c +++ b/osfmk/arm/pmap.c @@ -4671,7 +4671,7 @@ pmap_static_allocations_done(void) * * Note that this workaround does not pose a security risk, because the RO * page tables still remain read-only, due to KTRR/CTRR, and further protecting - * them at the APRR level would be unnecessary. + * them would be unnecessary. 
*/ monitor_start_pa = kvtophys((vm_offset_t)&ropagetable_begin); monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin); diff --git a/osfmk/arm64/locore.s b/osfmk/arm64/locore.s index 875ddb7a5..1efc217f9 100644 --- a/osfmk/arm64/locore.s +++ b/osfmk/arm64/locore.s @@ -1148,7 +1148,7 @@ Lskip_el0_eret_mapping: Lexception_return_restore_registers: mov x0, sp // x0 = &pcb // Loads authed $x0->ss_64.pc into x1 and $x0->ss_64.cpsr into w2 - AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24 + AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24, el0_state_allowed=1 /* Restore special register state */ ldr w3, [sp, NS64_FPSR] diff --git a/osfmk/arm64/machine_routines_asm.h b/osfmk/arm64/machine_routines_asm.h index 7f5f8ed29..e1896caa6 100644 --- a/osfmk/arm64/machine_routines_asm.h +++ b/osfmk/arm64/machine_routines_asm.h @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include #include "assym.s" @@ -44,23 +45,29 @@ * * On CPUs with PAC support, this macro will auth the above values with ml_check_signed_state(). * - * arg0 - scratch register 1 - * arg1 - scratch register 2 - * arg2 - scratch register 3 - * arg3 - scratch register 4 - * arg4 - scratch register 5 + * tmp1 - scratch register 1 + * tmp2 - scratch register 2 + * tmp3 - scratch register 3 + * tmp4 - scratch register 4 + * tmp5 - scratch register 5 */ /* BEGIN IGNORE CODESTYLE */ -.macro AUTH_THREAD_STATE_IN_X0 - ldr x1, [x0, SS64_PC] +.macro AUTH_THREAD_STATE_IN_X0 tmp1, tmp2, tmp3, tmp4, tmp5, el0_state_allowed=0 ldr w2, [x0, SS64_CPSR] +.if \el0_state_allowed==0 +#if __has_feature(ptrauth_calls) + // If testing for a canary CPSR value, ensure that we do not observe writes to other fields without it + dmb ld +#endif +.endif + ldr x1, [x0, SS64_PC] ldp x16, x17, [x0, SS64_X16] #if defined(HAS_APPLE_PAC) // Save x3-x5 to preserve across call - mov $2, x3 - mov $3, x4 - mov $4, x5 + mov \tmp3, x3 + mov \tmp4, x4 + mov \tmp5, x5 /* * Arg0: The ARM context pointer (already in x0) @@ -71,20 +78,27 @@ * Stash saved state PC and CPSR in other registers to avoid reloading potentially unauthed * values from memory. (ml_check_signed_state will clobber x1 and x2.) */ - mov $0, x1 - mov $1, x2 + mov \tmp1, x1 + mov \tmp2, x2 ldr x3, [x0, SS64_LR] mov x4, x16 mov x5, x17 bl EXT(ml_check_signed_state) - mov x1, $0 - mov x2, $1 + mov x1, \tmp1 + mov x2, \tmp2 + +.if \el0_state_allowed==0 + and \tmp2, \tmp2, #PSR64_MODE_MASK + cbnz \tmp2, 1f + bl EXT(ml_auth_thread_state_invalid_cpsr) +1: +.endif // LR was already loaded/authed earlier, if we reload it we might be loading a potentially unauthed value mov lr, x3 - mov x3, $2 - mov x4, $3 - mov x5, $4 + mov x3, \tmp3 + mov x4, \tmp4 + mov x5, \tmp5 #else ldr lr, [x0, SS64_LR] #endif /* defined(HAS_APPLE_PAC) */ diff --git a/osfmk/arm64/machine_routines_asm.s b/osfmk/arm64/machine_routines_asm.s index 9d41431fe..191997c13 100644 --- a/osfmk/arm64/machine_routines_asm.s +++ b/osfmk/arm64/machine_routines_asm.s @@ -415,25 +415,32 @@ L_mmu_kvtop_wpreflight_invalid: /* * SET_RECOVERY_HANDLER * - * Sets up a page fault recovery handler + * Sets up a page fault recovery handler. This macro clobbers x16 and x17. 
* - * arg0 - persisted thread pointer - * arg1 - persisted recovery handler - * arg2 - scratch reg - * arg3 - recovery label + * label - recovery label + * tpidr - persisted thread pointer + * old_handler - persisted recovery handler + * label_in_adr_range - whether \label is within 1 MB of PC */ -.macro SET_RECOVERY_HANDLER - mrs $0, TPIDR_EL1 // Load thread pointer - adrp $2, $3@page // Load the recovery handler address - add $2, $2, $3@pageoff +.macro SET_RECOVERY_HANDLER label, tpidr=x16, old_handler=x10, label_in_adr_range=0 + // Note: x16 and x17 are designated for use as temporaries in + // interruptible PAC routines. DO NOT CHANGE THESE REGISTER ASSIGNMENTS. +.if \label_in_adr_range==1 // Load the recovery handler address + adr x17, \label +.else + adrp x17, \label@page + add x17, x17, \label@pageoff +.endif #if defined(HAS_APPLE_PAC) - add $1, $0, TH_RECOVER - movk $1, #PAC_DISCRIMINATOR_RECOVER, lsl 48 - pacia $2, $1 // Sign with IAKey + blended discriminator + mrs x16, TPIDR_EL1 + add x16, x16, TH_RECOVER + movk x16, #PAC_DISCRIMINATOR_RECOVER, lsl 48 + pacia x17, x16 // Sign with IAKey + blended discriminator #endif - ldr $1, [$0, TH_RECOVER] // Save previous recovery handler - str $2, [$0, TH_RECOVER] // Set new signed recovery handler + mrs \tpidr, TPIDR_EL1 // Load thread pointer + ldr \old_handler, [\tpidr, TH_RECOVER] // Save previous recovery handler + str x17, [\tpidr, TH_RECOVER] // Set new signed recovery handler .endmacro /* @@ -441,18 +448,18 @@ L_mmu_kvtop_wpreflight_invalid: * * Clears page fault handler set by SET_RECOVERY_HANDLER * - * arg0 - thread pointer saved by SET_RECOVERY_HANDLER - * arg1 - old recovery handler saved by SET_RECOVERY_HANDLER + * tpidr - thread pointer saved by SET_RECOVERY_HANDLER + * old_handler - old recovery handler saved by SET_RECOVERY_HANDLER */ -.macro CLEAR_RECOVERY_HANDLER - str $1, [$0, TH_RECOVER] // Restore the previous recovery handler +.macro CLEAR_RECOVERY_HANDLER tpidr=x16, old_handler=x10 + str \old_handler, [\tpidr, TH_RECOVER] // Restore the previous recovery handler .endmacro .text .align 2 copyio_error: - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER mov x0, #EFAULT // Return an EFAULT error POP_FRAME ARM64_STACK_EPILOG @@ -466,7 +473,7 @@ copyio_error: LEXT(_bcopyin) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error /* If len is less than 16 bytes, just do a bytewise copy */ cmp x2, #16 b.lt 2f @@ -486,7 +493,7 @@ LEXT(_bcopyin) strb w3, [x1], #1 b.hi 2b 3: - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER mov x0, #0 POP_FRAME ARM64_STACK_EPILOG @@ -500,11 +507,11 @@ LEXT(_bcopyin) LEXT(_copyin_atomic32) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error ldr w8, [x0] str w8, [x1] mov x0, #0 - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER POP_FRAME ARM64_STACK_EPILOG @@ -517,7 +524,7 @@ LEXT(_copyin_atomic32) LEXT(_copyin_atomic32_wait_if_equals) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error ldxr w8, [x0] cmp w8, w1 mov x0, ESTALE @@ -526,7 +533,7 @@ LEXT(_copyin_atomic32_wait_if_equals) wfe 1: clrex - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER POP_FRAME ARM64_STACK_EPILOG @@ -539,11 +546,11 @@ LEXT(_copyin_atomic32_wait_if_equals) LEXT(_copyin_atomic64) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error ldr x8, 
[x0] str x8, [x1] mov x0, #0 - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER POP_FRAME ARM64_STACK_EPILOG @@ -557,10 +564,10 @@ LEXT(_copyin_atomic64) LEXT(_copyout_atomic32) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error str w0, [x1] mov x0, #0 - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER POP_FRAME ARM64_STACK_EPILOG @@ -573,10 +580,10 @@ LEXT(_copyout_atomic32) LEXT(_copyout_atomic64) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error str x0, [x1] mov x0, #0 - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER POP_FRAME ARM64_STACK_EPILOG @@ -590,7 +597,7 @@ LEXT(_copyout_atomic64) LEXT(_bcopyout) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error /* If len is less than 16 bytes, just do a bytewise copy */ cmp x2, #16 b.lt 2f @@ -610,7 +617,7 @@ LEXT(_bcopyout) strb w3, [x1], #1 b.hi 2b 3: - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER mov x0, #0 POP_FRAME ARM64_STACK_EPILOG @@ -628,17 +635,7 @@ LEXT(_bcopyout) LEXT(_bcopyinstr) ARM64_STACK_PROLOG PUSH_FRAME - adr x4, Lcopyinstr_error // Get address for recover - mrs x10, TPIDR_EL1 // Get thread pointer - ldr x11, [x10, TH_RECOVER] // Save previous recover - -#if defined(HAS_APPLE_PAC) - add x5, x10, TH_RECOVER // Sign new pointer with IAKey + blended discriminator - movk x5, #PAC_DISCRIMINATOR_RECOVER, lsl 48 - pacia x4, x5 -#endif - str x4, [x10, TH_RECOVER] // Store new recover - + SET_RECOVERY_HANDLER Lcopyinstr_error, label_in_adr_range=1 mov x4, #0 // x4 - total bytes copied Lcopyinstr_loop: ldrb w5, [x0], #1 // Load a byte from the user source @@ -656,7 +653,7 @@ Lcopyinstr_done: Lcopyinstr_error: mov x0, #EFAULT // Return EFAULT on error Lcopyinstr_exit: - str x11, [x10, TH_RECOVER] // Restore old recover + CLEAR_RECOVERY_HANDLER POP_FRAME ARM64_STACK_EPILOG @@ -672,9 +669,9 @@ Lcopyinstr_exit: * x3 : temp * x5 : temp (kernel virtual base) * x9 : temp - * x10 : thread pointer (set by SET_RECOVERY_HANDLER) - * x11 : old recovery function (set by SET_RECOVERY_HANDLER) + * x10 : old recovery function (set by SET_RECOVERY_HANDLER) * x12, x13 : backtrace data + * x16 : thread pointer (set by SET_RECOVERY_HANDLER) * */ .text @@ -683,7 +680,7 @@ Lcopyinstr_exit: LEXT(copyinframe) ARM64_STACK_PROLOG PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, copyio_error + SET_RECOVERY_HANDLER copyio_error cbnz w2, Lcopyinframe64 // Check frame size adrp x5, EXT(gVirtBase)@page // For 32-bit frame, make sure we're not trying to copy from kernel add x5, x5, EXT(gVirtBase)@pageoff @@ -714,7 +711,7 @@ Lcopyinframe_valid: mov w0, #0 // Success Lcopyinframe_done: - CLEAR_RECOVERY_HANDLER x10, x11 + CLEAR_RECOVERY_HANDLER POP_FRAME ARM64_STACK_EPILOG @@ -1124,6 +1121,24 @@ Lcheck_hash_panic: CALL_EXTERN panic_with_thread_kernel_state Lcheck_hash_str: .asciz "JOP Hash Mismatch Detected (PC, CPSR, or LR corruption)" + +/** + * void ml_auth_thread_state_invalid_cpsr(arm_saved_state_t *ss) + * + * Panics due to an invalid CPSR value in ss. 
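 *
 * This is reached from AUTH_THREAD_STATE_IN_X0 when el0_state_allowed=0
 * and the saved CPSR claims an EL0 mode. The guard is roughly the
 * following in C (a sketch of the assembly sequence, not additional
 * patch content; a zero value under PSR64_MODE_MASK denotes EL0):
 *
 *	if ((cpsr & PSR64_MODE_MASK) == 0) {
 *		ml_auth_thread_state_invalid_cpsr(ss);	// does not return
 *	}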
+ */ + .text + .align 2 + .globl EXT(ml_auth_thread_state_invalid_cpsr) +LEXT(ml_auth_thread_state_invalid_cpsr) + ARM64_STACK_PROLOG + PUSH_FRAME + mov x1, x0 + adr x0, Linvalid_cpsr_str + CALL_EXTERN panic_with_thread_kernel_state + +Linvalid_cpsr_str: + .asciz "Thread state corruption detected (PE mode == 0)" #endif /* HAS_APPLE_PAC */ .text diff --git a/osfmk/arm64/pcb.c b/osfmk/arm64/pcb.c index ff4efbfdd..29f2b7185 100644 --- a/osfmk/arm64/pcb.c +++ b/osfmk/arm64/pcb.c @@ -343,6 +343,7 @@ machine_stack_attach(thread_t thread, #if defined(HAS_APPLE_PAC) /* Sign the initial kernel stack saved state */ const uint32_t default_cpsr = PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK; + boolean_t intr = ml_set_interrupts_enabled(FALSE); asm volatile ( "mov x0, %[ss]" "\n" @@ -376,6 +377,7 @@ machine_stack_attach(thread_t thread, [SS64_LR] "i"(offsetof(struct arm_saved_state, ss_64.lr)) : "x0", "x1", "x2", "x3", "x4", "x5", "x6" ); + ml_set_interrupts_enabled(intr); #else savestate->lr = (uintptr_t)thread_continue; savestate->cpsr = (PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK) | current_el; diff --git a/osfmk/arm64/status.c b/osfmk/arm64/status.c index 28f87b0a1..a9f1eec26 100644 --- a/osfmk/arm64/status.c +++ b/osfmk/arm64/status.c @@ -105,17 +105,31 @@ thread_state64_to_saved_state(const arm_thread_state64_t * ts64, arm_saved_state_t * saved_state) { uint32_t i; +#if __has_feature(ptrauth_calls) + boolean_t intr = ml_set_interrupts_enabled(FALSE); +#endif /* __has_feature(ptrauth_calls) */ assert(is_saved_state64(saved_state)); + set_saved_state_cpsr(saved_state, (ts64->cpsr & ~PSR64_MODE_MASK) | PSR64_MODE_RW_64); +#if __has_feature(ptrauth_calls) + /* + * Make writes to ts64->cpsr visible first, since it's useful as a + * canary to detect thread-state corruption. + */ + __builtin_arm_dmb(DMB_ST); +#endif set_saved_state_fp(saved_state, ts64->fp); set_saved_state_lr(saved_state, ts64->lr); set_saved_state_sp(saved_state, ts64->sp); set_saved_state_pc(saved_state, ts64->pc); - set_saved_state_cpsr(saved_state, (ts64->cpsr & ~PSR64_MODE_MASK) | PSR64_MODE_RW_64); for (i = 0; i < 29; i++) { set_saved_state_reg(saved_state, i, ts64->x[i]); } + +#if __has_feature(ptrauth_calls) + ml_set_interrupts_enabled(intr); +#endif /* __has_feature(ptrauth_calls) */ } #endif /* __arm64__ */ @@ -1316,7 +1330,9 @@ machine_thread_state_initialize(thread_t thread) #if defined(HAS_APPLE_PAC) /* Sign the initial user-space thread state */ if (thread->machine.upcb != NULL) { + boolean_t intr = ml_set_interrupts_enabled(FALSE); ml_sign_thread_state(thread->machine.upcb, 0, 0, 0, 0, 0); + ml_set_interrupts_enabled(intr); } #endif /* defined(HAS_APPLE_PAC) */ diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index 98d7250c0..df259cf29 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -7447,14 +7447,21 @@ void task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num) { vm_object_t find_vmo; - int64_t size = 0; + unsigned int i = 0; + unsigned int vmobj_limit = len / sizeof(vm_object_query_data_t); task_objq_lock(task); if (query != NULL) { queue_iterate(&task->task_objq, find_vmo, vm_object_t, task_objq) { - int byte_size; - vm_object_query_t p = &query[size++]; + vm_object_query_t p = &query[i]; + + /* + * Clear the entire vm_object_query_t struct as we are using + * only the first 6 bits in the uint64_t bitfield for this + * anonymous struct member. + */ + bzero(p, sizeof(*p)); p->object_id = (vm_object_id_t) VM_KERNEL_ADDRPERM(find_vmo); p->virtual_size = find_vmo->internal ? 
find_vmo->vo_size : 0;
@@ -7471,16 +7478,17 @@ task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num)
 				p->compressed_size = 0;
 			}
 
-			/* make sure to not overrun */
-			byte_size = (int) size * sizeof(vm_object_query_data_t);
-			if ((int)(byte_size + sizeof(vm_object_query_data_t)) > len) {
+			i++;
+
+			/* Make sure to not overrun */
+			if (i == vmobj_limit) {
 				break;
 			}
 		}
 	} else {
-		size = task->task_owned_objects;
+		i = task->task_owned_objects;
 	}
 	task_objq_unlock(task);
 
-	*num = size;
+	*num = i;
 }
diff --git a/osfmk/mach/arm/thread_status.h b/osfmk/mach/arm/thread_status.h
index 548c48e1c..30f2f097d 100644
--- a/osfmk/mach/arm/thread_status.h
+++ b/osfmk/mach/arm/thread_status.h
@@ -504,6 +504,18 @@ typedef struct arm_saved_state arm_saved_state_t;
 
 #if defined(XNU_KERNEL_PRIVATE)
 #if defined(HAS_APPLE_PAC)
+
+#include <sys/cdefs.h>
+
+/*
+ * Used by MANIPULATE_SIGNED_THREAD_STATE(), potentially from C++ (IOKit) code.
+ * Open-coded to prevent a circular dependency between mach/arm/thread_status.h
+ * and osfmk/arm/machine_routines.h.
+ */
+__BEGIN_DECLS
+extern boolean_t ml_set_interrupts_enabled(boolean_t);
+__END_DECLS
+
 /*
  * Methods used to sign and check thread state to detect corruptions of saved
  * thread state across exceptions and context switches.
@@ -531,30 +543,34 @@ extern void ml_check_signed_state(const arm_saved_state_t *, uint64_t, uint32_t,
  * x6: scratch register
  * x7: scratch register
  */
-#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...)                \
-	asm volatile (                                                    \
-		"mov	x8, lr"				"\n"              \
-		"mov	x0, %[iss]"			"\n"              \
-		"ldp	x4, x5, [x0, %[SS64_X16]]"	"\n"              \
-		"ldr	x6, [x0, %[SS64_PC]]"		"\n"              \
-		"ldr	w7, [x0, %[SS64_CPSR]]"		"\n"              \
-		"ldr	x3, [x0, %[SS64_LR]]"		"\n"              \
-		"mov	x1, x6"				"\n"              \
-		"mov	w2, w7"				"\n"              \
-		"bl	_ml_check_signed_state"		"\n"              \
-		"mov	x1, x6"				"\n"              \
-		"mov	w2, w7"				"\n"              \
-		_instr					"\n"              \
-		"bl	_ml_sign_thread_state"		"\n"              \
-		"mov	lr, x8"				"\n"              \
-		:                                                         \
-		: [iss] "r"(_iss),                                        \
-		  [SS64_X16] "i"(ss64_offsetof(x[16])),                   \
-		  [SS64_PC] "i"(ss64_offsetof(pc)),                       \
-		  [SS64_CPSR] "i"(ss64_offsetof(cpsr)),                   \
-		  [SS64_LR] "i"(ss64_offsetof(lr)),##__VA_ARGS__          \
-		: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"    \
-	)
+#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...)
\ + do { \ + boolean_t _intr = ml_set_interrupts_enabled(FALSE); \ + asm volatile ( \ + "mov x8, lr" "\n" \ + "mov x0, %[iss]" "\n" \ + "ldp x4, x5, [x0, %[SS64_X16]]" "\n" \ + "ldr x6, [x0, %[SS64_PC]]" "\n" \ + "ldr w7, [x0, %[SS64_CPSR]]" "\n" \ + "ldr x3, [x0, %[SS64_LR]]" "\n" \ + "mov x1, x6" "\n" \ + "mov w2, w7" "\n" \ + "bl _ml_check_signed_state" "\n" \ + "mov x1, x6" "\n" \ + "mov w2, w7" "\n" \ + _instr "\n" \ + "bl _ml_sign_thread_state" "\n" \ + "mov lr, x8" "\n" \ + : \ + : [iss] "r"(_iss), \ + [SS64_X16] "i"(ss64_offsetof(x[16])), \ + [SS64_PC] "i"(ss64_offsetof(pc)), \ + [SS64_CPSR] "i"(ss64_offsetof(cpsr)), \ + [SS64_LR] "i"(ss64_offsetof(lr)),##__VA_ARGS__ \ + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8" \ + ); \ + ml_set_interrupts_enabled(_intr); \ + } while (0) static inline void check_and_sign_copied_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src) diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index 071a66d4e..37a6f2844 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -1653,6 +1653,7 @@ c_seg_alloc_nextslot(c_segment_t c_seg) } +#define C_SEG_MAJOR_COMPACT_STATS_MAX (30) struct { uint64_t asked_permission; @@ -1662,7 +1663,11 @@ struct { uint64_t wasted_space_in_swapouts; uint64_t count_of_swapouts; uint64_t count_of_freed_segs; -} c_seg_major_compact_stats; + uint64_t bailed_compactions; + uint64_t bytes_freed_rate_us; +} c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX]; + +int c_seg_major_compact_stats_now = 0; #define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((C_SEG_BUFSIZE * 90) / 100) @@ -1673,7 +1678,7 @@ c_seg_major_compact_ok( c_segment_t c_seg_dst, c_segment_t c_seg_src) { - c_seg_major_compact_stats.asked_permission++; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++; if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE && c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) { @@ -1720,7 +1725,7 @@ c_seg_major_compact( c_seg_dst->c_was_major_compacted++; c_seg_src->c_was_major_donor++; #endif - c_seg_major_compact_stats.compactions++; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++; dst_slot = c_seg_dst->c_nextslot; @@ -1766,8 +1771,8 @@ c_seg_major_compact( c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK; - c_seg_major_compact_stats.moved_slots++; - c_seg_major_compact_stats.moved_bytes += c_size; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += c_size; cslot_copy(c_dst, c_src); c_dst->c_offset = c_seg_dst->c_nextoffset; @@ -2319,6 +2324,8 @@ vm_compressor_do_delayed_compactions(boolean_t flush_all) boolean_t needs_to_swap = FALSE; + VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0); + #if !CONFIG_EMBEDDED LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED); #endif /* !CONFIG_EMBEDDED */ @@ -2348,6 +2355,8 @@ vm_compressor_do_delayed_compactions(boolean_t flush_all) } lck_mtx_lock_spin_always(c_list_lock); } + + VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0); } @@ -2689,15 +2698,20 @@ do_fastwake_warmup(queue_head_t *c_queue, boolean_t consider_all_cseg) } } +int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS; void 
vm_compressor_compact_and_swap(boolean_t flush_all) { c_segment_t c_seg, c_seg_next; - boolean_t keep_compacting; + boolean_t keep_compacting, switch_state; clock_sec_t now; clock_nsec_t nsec; + mach_timespec_t start_ts, end_ts; + unsigned int number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields; + uint64_t bytes_to_free, bytes_freed, delta_usec; + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count); if (fastwake_warmup == TRUE) { uint64_t starting_warmup_count; @@ -2731,6 +2745,16 @@ vm_compressor_compact_and_swap(boolean_t flush_all) */ clock_get_system_nanotime(&now, &nsec); + start_ts.tv_sec = (int) now; + start_ts.tv_nsec = nsec; + delta_usec = 0; + number_considered = 0; + wanted_cseg_found = 0; + number_yields = 0; + bytes_to_free = 0; + bytes_freed = 0; + yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS); + while (!queue_empty(&c_age_list_head) && compaction_swapper_abort == 0) { if (hibernate_flushing == TRUE) { clock_sec_t sec; @@ -2764,6 +2788,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all) lck_mtx_unlock_always(c_list_lock); + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0); + thread_block(THREAD_CONTINUE_NULL); lck_mtx_lock_spin_always(c_list_lock); @@ -2783,6 +2809,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all) * to do minor compactions to make * more memory available */ + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0); + continue; } @@ -2804,11 +2832,14 @@ vm_compressor_compact_and_swap(boolean_t flush_all) lck_mtx_lock_spin_always(c_list_lock); + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0); + if (needs_to_swap == FALSE) { break; } } if (queue_empty(&c_age_list_head)) { + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0); break; } c_seg = (c_segment_t) queue_first(&c_age_list_head); @@ -2816,12 +2847,15 @@ vm_compressor_compact_and_swap(boolean_t flush_all) assert(c_seg->c_state == C_ON_AGE_Q); if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) { + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0); break; } lck_mtx_lock_spin_always(&c_seg->c_lock); if (c_seg->c_busy) { + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0); + lck_mtx_unlock_always(c_list_lock); c_seg_wait_on_busy(c_seg); lck_mtx_lock_spin_always(c_list_lock); @@ -2835,13 +2869,15 @@ vm_compressor_compact_and_swap(boolean_t flush_all) * found an empty c_segment and freed it * so go grab the next guy in the queue */ - c_seg_major_compact_stats.count_of_freed_segs++; + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0); + c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++; continue; } /* * Major compaction */ keep_compacting = TRUE; + switch_state = TRUE; while (keep_compacting == TRUE) { assert(c_seg->c_busy); @@ -2856,6 +2892,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all) 
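
Throughout vm_compressor_compact_and_swap(), the major-compaction counters now index a 30-entry rolling window, c_seg_major_compact_stats[c_seg_major_compact_stats_now], advanced once per pass, so the last C_SEG_MAJOR_COMPACT_STATS_MAX passes can be inspected individually instead of one lifetime accumulator. A minimal standalone sketch of that ring pattern, with simplified/hypothetical names (not the kernel code itself):

    #include <stdint.h>

    #define STATS_MAX 30 /* mirrors C_SEG_MAJOR_COMPACT_STATS_MAX */

    struct compact_stats {
        uint64_t compactions;
        uint64_t bytes_freed_rate_us;
    };

    static struct compact_stats stats_ring[STATS_MAX];
    static int stats_now;

    /*
     * Record one compact-and-swap pass into the current slot, then
     * advance the cursor with wraparound so the ring always holds the
     * most recent STATS_MAX passes.
     */
    static void
    stats_pass_done(uint64_t bytes_freed, uint64_t delta_usec)
    {
        if (delta_usec == 0) {
            delta_usec = 1; /* a pass can finish in under 1 usec */
        }
        stats_ring[stats_now].bytes_freed_rate_us = bytes_freed / delta_usec;

        if (stats_now + 1 == STATS_MAX) {
            stats_now = 0;
        } else {
            stats_now++;
        }
    }

The wrap test mirrors the one at the end of the pass below, where the kernel computes bytes_freed_rate_us from the measured delta_usec before bumping c_seg_major_compact_stats_now.
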
assert(c_seg_next->c_state == C_ON_AGE_Q); + number_considered++; + if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) { break; } @@ -2863,7 +2901,24 @@ vm_compressor_compact_and_swap(boolean_t flush_all) lck_mtx_lock_spin_always(&c_seg_next->c_lock); if (c_seg_next->c_busy) { + /* + * We are going to block for our neighbor. + * If our c_seg is wanted, we should unbusy + * it because we don't know how long we might + * have to block here. + */ + if (c_seg->c_wanted) { + lck_mtx_unlock_always(&c_seg_next->c_lock); + switch_state = FALSE; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++; + wanted_cseg_found++; + break; + } + lck_mtx_unlock_always(c_list_lock); + + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0); + c_seg_wait_on_busy(c_seg_next); lck_mtx_lock_spin_always(c_list_lock); @@ -2872,12 +2927,14 @@ vm_compressor_compact_and_swap(boolean_t flush_all) /* grab that segment */ C_SEG_BUSY(c_seg_next); + bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset); if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) { /* * found an empty c_segment and freed it * so we can't continue to use c_seg_next */ - c_seg_major_compact_stats.count_of_freed_segs++; + bytes_freed += bytes_to_free; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++; continue; } @@ -2888,6 +2945,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all) keep_compacting = c_seg_major_compact(c_seg, c_seg_next); + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0); + PAGE_REPLACEMENT_DISALLOWED(TRUE); lck_mtx_lock_spin_always(&c_seg_next->c_lock); @@ -2901,54 +2960,78 @@ vm_compressor_compact_and_swap(boolean_t flush_all) * by passing TRUE, we ask for c_busy to be cleared * and c_wanted to be taken care of */ + bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset); if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) { - c_seg_major_compact_stats.count_of_freed_segs++; + bytes_freed += bytes_to_free; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++; + } else { + bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset); + bytes_freed += bytes_to_free; } PAGE_REPLACEMENT_DISALLOWED(FALSE); /* relock the list */ lck_mtx_lock_spin_always(c_list_lock); + + if (c_seg->c_wanted) { + /* + * Our c_seg is in demand. Let's + * unbusy it and wakeup the waiters + * instead of continuing the compaction + * because we could be in this loop + * for a while. 
+ */ + switch_state = FALSE; + wanted_cseg_found++; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++; + break; + } } /* major compaction */ + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, wanted_cseg_found, 0); + lck_mtx_lock_spin_always(&c_seg->c_lock); assert(c_seg->c_busy); assert(!c_seg->c_on_minorcompact_q); - if (VM_CONFIG_SWAP_IS_ACTIVE) { - /* - * This mode of putting a generic c_seg on the swapout list is - * only supported when we have general swapping enabled - */ - c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); - } else { - if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) { - assert(VM_CONFIG_SWAP_IS_PRESENT); + if (switch_state) { + if (VM_CONFIG_SWAP_IS_ACTIVE) { /* - * we are running compressor sweeps with swap-behind - * make sure the c_seg has aged enough before swapping it - * out... + * This mode of putting a generic c_seg on the swapout list is + * only supported when we have general swapping enabled */ - if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) { - c_seg->c_overage_swap = TRUE; - c_overage_swapped_count++; - c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); + c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); + } else { + if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) { + assert(VM_CONFIG_SWAP_IS_PRESENT); + /* + * we are running compressor sweeps with swap-behind + * make sure the c_seg has aged enough before swapping it + * out... + */ + if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) { + c_seg->c_overage_swap = TRUE; + c_overage_swapped_count++; + c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE); + } } } + if (c_seg->c_state == C_ON_AGE_Q) { + /* + * this c_seg didn't get moved to the swapout queue + * so we need to move it out of the way... + * we just did a major compaction on it so put it + * on that queue + */ + c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE); + } else { + c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += C_SEG_BUFSIZE - c_seg->c_bytes_used; + c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++; + } } - if (c_seg->c_state == C_ON_AGE_Q) { - /* - * this c_seg didn't get moved to the swapout queue - * so we need to move it out of the way... - * we just did a major compaction on it so put it - * on that queue - */ - c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE); - } else { - c_seg_major_compact_stats.wasted_space_in_swapouts += C_SEG_BUFSIZE - c_seg->c_bytes_used; - c_seg_major_compact_stats.count_of_swapouts++; - } + C_SEG_WAKEUP_DONE(c_seg); lck_mtx_unlock_always(&c_seg->c_lock); @@ -2960,7 +3043,55 @@ vm_compressor_compact_and_swap(boolean_t flush_all) lck_mtx_lock_spin_always(c_list_lock); } + + if (number_considered >= yield_after_considered_per_pass) { + if (wanted_cseg_found) { + /* + * We stopped major compactions on a c_seg + * that is wanted. We don't know the priority + * of the waiter unfortunately but we are at + * a very high priority and so, just in case + * the waiter is a critical system daemon or + * UI thread, let's give up the CPU in case + * the system is running a few CPU intensive + * tasks. 
+ */ + lck_mtx_unlock_always(c_list_lock); + + mutex_pause(2); /* 100us yield */ + + number_yields++; + + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0); + + lck_mtx_lock_spin_always(c_list_lock); + } + + number_considered = 0; + wanted_cseg_found = 0; + } } + clock_get_system_nanotime(&now, &nsec); + end_ts.tv_sec = (int) now; + end_ts.tv_nsec = nsec; + + SUB_MACH_TIMESPEC(&end_ts, &start_ts); + + delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100); + + delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */ + + c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec); + + if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) { + c_seg_major_compact_stats_now = 0; + } else { + c_seg_major_compact_stats_now++; + } + + assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX); + + VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count); } diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 1622e5547..0cfa66169 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -138,12 +138,15 @@ extern struct vnode *vnode_pager_lookup_vnode(memory_object_t); uint64_t vm_hard_throttle_threshold; - -#define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \ - ((vm_page_free_count < vm_page_throttle_limit || \ - HARD_THROTTLE_LIMIT_REACHED()) && \ - proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED)) - +OS_ALWAYS_INLINE +boolean_t +NEED_TO_HARD_THROTTLE_THIS_TASK(void) +{ + return vm_wants_task_throttled(current_task()) || + ((vm_page_free_count < vm_page_throttle_limit || + HARD_THROTTLE_LIMIT_REACHED()) && + proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED); +} #define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ #define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */ diff --git a/osfmk/vm/vm_fault.h b/osfmk/vm/vm_fault.h index 8fe4c76d8..185764384 100644 --- a/osfmk/vm/vm_fault.h +++ b/osfmk/vm/vm_fault.h @@ -198,6 +198,12 @@ extern void vm_rtfault_record_init(void); #endif /* MACH_KERNEL_PRIVATE */ +#if XNU_KERNEL_PRIVATE + +boolean_t NEED_TO_HARD_THROTTLE_THIS_TASK(void); + +#endif + #endif /* KERNEL_PRIVATE */ #endif /* _VM_VM_FAULT_H_ */ diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index d9e16eab3..d4e947c16 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -155,6 +155,9 @@ extern int vm_debug_events; #define VM_PAGE_GRAB 0x126 #define VM_PAGE_RELEASE 0x127 +#define VM_COMPRESSOR_COMPACT_AND_SWAP 0x128 +#define VM_COMPRESSOR_DO_DELAYED_COMPACTIONS 0x129 + #define VM_PRESSURE_EVENT 0x130 #define VM_EXECVE 0x131 diff --git a/tests/memorystatus_freeze_test.c b/tests/memorystatus_freeze_test.c index 471312f80..1ed21b3d9 100644 --- a/tests/memorystatus_freeze_test.c +++ b/tests/memorystatus_freeze_test.c @@ -671,22 +671,29 @@ T_DECL(budget_replenishment, "budget replenishes properly") { length = sizeof(kTestIntervalSecs); new_budget_ln = sizeof(new_budget); ret = sysctlbyname("vm.memorystatus_freeze_calculate_new_budget", &new_budget, &new_budget_ln, &kTestIntervalSecs, length); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "vm.memorystatus_freeze_calculate_new_budget"); + 
T_ASSERT_POSIX_SUCCESS(ret, "vm.memorystatus_freeze_calculate_new_budget");
 
 	// Grab the daily budget.
 	length = sizeof(memorystatus_freeze_daily_mb_max);
 	ret = sysctlbyname("kern.memorystatus_freeze_daily_mb_max", &memorystatus_freeze_daily_mb_max, &length, NULL, 0);
-	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.memorystatus_freeze_daily_mb_max");
+	T_ASSERT_POSIX_SUCCESS(ret, "kern.memorystatus_freeze_daily_mb_max");
 
-	memorystatus_freeze_daily_pages_max = memorystatus_freeze_daily_mb_max * 1024 * 1024 / page_size;
+	memorystatus_freeze_daily_pages_max = memorystatus_freeze_daily_mb_max * 1024UL * 1024UL / page_size;
+	T_LOG("memorystatus_freeze_daily_mb_max %u", memorystatus_freeze_daily_mb_max);
+	T_LOG("memorystatus_freeze_daily_pages_max %u", memorystatus_freeze_daily_pages_max);
+	T_LOG("page_size %u", page_size);
 
 	/*
 	 * We're kTestIntervalSecs past a new interval. Which means we are owed kNumSecondsInDay
 	 * seconds of budget.
 	 */
 	expected_new_budget_pages = memorystatus_freeze_daily_pages_max;
+	T_LOG("expected_new_budget_pages before %u", expected_new_budget_pages);
+	T_ASSERT_EQ(kTestIntervalSecs, 60 * 60 * 32, "kTestIntervalSecs did not change");
 	expected_new_budget_pages += ((kTestIntervalSecs * kFixedPointFactor) / (kNumSecondsInDay) * memorystatus_freeze_daily_pages_max) / kFixedPointFactor;
+	T_LOG("expected_new_budget_pages after %u", expected_new_budget_pages);
+	T_LOG("memorystatus_freeze_daily_pages_max after %u", memorystatus_freeze_daily_pages_max);
 
 	T_QUIET; T_ASSERT_EQ(new_budget, expected_new_budget_pages, "Calculate new budget behaves correctly.");
 }
diff --git a/tools/lldbmacros/kdp.py b/tools/lldbmacros/kdp.py
index e8bc324b1..b6d3f4f69 100755
--- a/tools/lldbmacros/kdp.py
+++ b/tools/lldbmacros/kdp.py
@@ -2,6 +2,8 @@ from xnu import *
 from utils import *
 import sys
 
+current_KDP_mode = "swhosted"
+
 def GetKDPPacketHeaderInt(request=0, is_reply=False, seq=0, length=0, key=0):
     """ create a 64 bit number that could be saved as pkt_hdr_t
         params:
@@ -283,3 +285,26 @@ def KDPSetDumpInfo(cmd_args=None):
         print "Failed to save the dumpinfo."
     return retval
 
+@lldb_command('kdpmode')
+def KDPMode(cmd_args=None):
+    """
+    Change KDP mode between software hosted and hardware probe.
+    When lldb is connected to a KDP server backed by a hardware debug tool
+    setting this to 'hwprobe' enables physical memory access.
+
+    swhosted: LLDB is connected to the target using a serial or socket connection.
+    hwprobe: LLDB is connected to the target using a hardware probe.
+
+    usage: kdpmode <mode>
+    mode: 'swhosted' or 'hwprobe'
+    """
+    global current_KDP_mode
+
+    if cmd_args == None or len(cmd_args) == 0:
+        return current_KDP_mode
+    if len(cmd_args) > 1 or cmd_args[0] not in {'swhosted', 'hwprobe'}:
+        print "Invalid Arguments", KDPMode.__doc__
+    else:
+        current_KDP_mode = cmd_args[0]
+    return
+
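The kdpmode macro above stores its setting in the module-level current_KDP_mode, returns the active mode when invoked with no argument, and rejects anything other than the two known transports by printing its usage string. A hypothetical session from the lldb prompt:

    (lldb) kdpmode
    (lldb) kdpmode hwprobe

After the second command, KDPReadPhysMEM() in pmap.py (below) takes the hardware-probe packet path instead of the manual_pkt transport.
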
diff --git a/tools/lldbmacros/pmap.py b/tools/lldbmacros/pmap.py
index 8bff2689c..8bb7134fc 100755
--- a/tools/lldbmacros/pmap.py
+++ b/tools/lldbmacros/pmap.py
@@ -2,6 +2,7 @@ from xnu import *
 import xnudefines
 from kdp import *
 from utils import *
+import struct
 
 def ReadPhysInt(phys_addr, bitsize = 64, cpuval = None):
     """ Read a physical memory data based on address.
@@ -65,38 +66,69 @@ def KDPReadPhysMEM(address, bits):
         print "Target is not connected over kdp. Nothing to do here."
         return retval
 
-    input_address = unsigned(addressof(kern.globals.manual_pkt.input))
-    len_address = unsigned(addressof(kern.globals.manual_pkt.len))
-    data_address = unsigned(addressof(kern.globals.manual_pkt.data))
-    if not WriteInt32ToMemoryAddress(0, input_address):
-        return retval
-
-    kdp_pkt_size = GetType('kdp_readphysmem64_req_t').GetByteSize()
-    if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
-        return retval
+    if "hwprobe" == KDPMode():
+        # Send the proper KDP command and payload to the bare metal debug tool via a KDP server
+        addr_for_kdp = struct.unpack("<Q", struct.pack(">Q", address))[0]
+        byte_count = struct.unpack("<I", struct.pack(">I", bits/8))[0]
+        packet = "{0:016x}{1:08x}{2:04x}".format(addr_for_kdp, byte_count, 0x0)
 
-    data_addr = int(addressof(kern.globals.manual_pkt))
-    pkt = kern.GetValueFromAddress(data_addr, 'kdp_readphysmem64_req_t *')
+        ret_obj = lldb.SBCommandReturnObject()
+        ci = lldb.debugger.GetCommandInterpreter()
+        ci.HandleCommand('process plugin packet send -c 25 -p {0}'.format(packet), ret_obj)
 
-    header_value =GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_READPHYSMEM64'), length=kdp_pkt_size)
+        if ret_obj.Succeeded():
+            value = ret_obj.GetOutput()
 
-    if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
-         WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
-         WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
-         WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
-         ):
-
-        if WriteInt32ToMemoryAddress(1, input_address):
-            # now read data from the kdp packet
-            data_address = unsigned(addressof(kern.GetValueFromAddress(int(addressof(kern.globals.manual_pkt.data)), 'kdp_readphysmem64_reply_t *').data))
             if bits == 64 :
-                retval = kern.GetValueFromAddress(data_address, 'uint64_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                pack_fmt = "<Q"
+                unpack_fmt = ">Q"
+    addr_for_kdp = struct.unpack("<Q", struct.pack(">Q", address))[0]
+    byte_count = struct.unpack("<I", struct.pack(">I", bits/8))[0]
+
+    if bits == 64 :
+        pack_fmt = ">Q"
+        unpack_fmt = "
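
The struct.unpack("<Q", struct.pack(">Q", address))[0] idiom used in the hwprobe path is simply a 64-bit byte swap: the value is serialized big-endian and reread little-endian before being rendered as the 16 hex digits at the front of the probe packet. The same conversion expressed in C, as a standalone illustration using the GCC/Clang __builtin_bswap64 intrinsic (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t address = 0x0000000123456789ULL;

        /*
         * struct.pack(">Q", x) serializes x big-endian, and
         * struct.unpack("<Q", ...) rereads those bytes little-endian,
         * so the composite operation is a plain 64-bit byte swap.
         */
        uint64_t addr_for_kdp = __builtin_bswap64(address);

        /* Formatted like the "{0:016x}" field of the probe packet. */
        printf("%016llx\n", (unsigned long long)addr_for_kdp);
        return 0;
    }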