lck_grp_t * proc_knhashlock_grp;
lck_grp_t * proc_ucred_mlock_grp;
lck_grp_t * proc_mlock_grp;
+lck_grp_t * proc_dirslock_grp;
lck_grp_attr_t * proc_lck_grp_attr;
lck_attr_t * proc_lck_attr;
lck_mtx_t * proc_list_mlock;
proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr);
proc_kqhashlock_grp = lck_grp_alloc_init("proc-kqhashlock", proc_lck_grp_attr);
proc_knhashlock_grp = lck_grp_alloc_init("proc-knhashlock", proc_lck_grp_attr);
+ proc_dirslock_grp = lck_grp_alloc_init("proc-dirslock", proc_lck_grp_attr);
#if CONFIG_XNUPOST
sysctl_debug_test_stackshot_owner_grp = lck_grp_alloc_init("test-stackshot-owner-grp", LCK_GRP_ATTR_NULL);
sysctl_debug_test_stackshot_owner_init_mtx = lck_mtx_alloc_init(
lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr);
lck_mtx_init(&kernproc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr);
lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr);
+ lck_rw_init(&kernproc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr);
assert(bsd_simul_execs != 0);
execargs_cache_lock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr);
/*
- * Copyright (c) 1999-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
}
so_recv_data_stat(so, m, 0);
- if (sbappend(&so->so_rcv, m) != 0) {
+ if (sbappend_nodrop(&so->so_rcv, m) != 0) {
if ((flags & CTL_DATA_NOWAKEUP) == 0) {
sorwakeup(so);
}
*/
m->m_nextpkt = NULL;
so_recv_data_stat(so, m, 0);
- if (sbappendrecord(&so->so_rcv, m) != 0) {
+ if (sbappendrecord_nodrop(&so->so_rcv, m) != 0) {
needwakeup = 1;
} else {
/*
m->m_flags |= M_EOR;
}
so_recv_data_stat(so, m, 0);
+ /*
+ * No need to call the "nodrop" variant of sbappend
+ * because the mbuf is local to the scope of the function
+ */
if (sbappend(&so->so_rcv, m) != 0) {
if ((flags & CTL_DATA_NOWAKEUP) == 0) {
sorwakeup(so);
}
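+/*
+ * proc_dirs_lock_shared / proc_dirs_lock_exclusive
+ *
+ * Shared/exclusive accessors for the per-process p_dirs_lock read-write
+ * lock, initialized from the "proc-dirslock" lock group above.
+ */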
+void
+proc_dirs_lock_shared(proc_t p)
+{
+ lck_rw_lock_shared(&p->p_dirs_lock);
+}
+
+void
+proc_dirs_unlock_shared(proc_t p)
+{
+ lck_rw_unlock_shared(&p->p_dirs_lock);
+}
+
+void
+proc_dirs_lock_exclusive(proc_t p)
+{
+ lck_rw_lock_exclusive(&p->p_dirs_lock);
+}
+
+void
+proc_dirs_unlock_exclusive(proc_t p)
+{
+ lck_rw_unlock_exclusive(&p->p_dirs_lock);
+}
+
/*
* proc_fdlock, proc_fdlock_spin
*
}
/* Coming from a chroot environment and unable to get a reference... */
if (newfdp->fd_rdir == NULL && fdp->fd_rdir) {
+ proc_fdunlock(p);
/*
* We couldn't get a new reference on
* the chroot directory being
*
* XXX may fail to copy descriptors to child
*/
+ lck_rw_init(&child_proc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr);
child_proc->p_fd = fdcopy(parent_proc, parent_uthread->uu_cdir);
#if SYSV_SHM
if (ttyvp) {
if (vnode_getwithvid(ttyvp, ttyvid) == 0) {
- *vp = procsp->s_ttyvp;
+ *vp = ttyvp;
err = 0;
}
} else {
exp->p_pid = p->p_pid;
exp->p_oppid = p->p_oppid;
/* Mach related */
- exp->user_stack = p->user_stack;
exp->p_debugger = p->p_debugger;
exp->sigwait = p->sigwait;
/* scheduling */
exp->p_pid = p->p_pid;
exp->p_oppid = p->p_oppid;
/* Mach related */
- exp->user_stack = p->user_stack;
exp->p_debugger = p->p_debugger;
exp->sigwait = p->sigwait;
/* scheduling */
SYSCTL_INT(_vm, OID_AUTO, compressor_swapout_target_age, CTLFLAG_RD | CTLFLAG_LOCKED, &swapout_target_age, 0, "");
SYSCTL_INT(_vm, OID_AUTO, compressor_available, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_available, 0, "");
+extern int min_csegs_per_major_compaction;
+SYSCTL_INT(_vm, OID_AUTO, compressor_min_csegs_per_major_compaction, CTLFLAG_RW | CTLFLAG_LOCKED, &min_csegs_per_major_compaction, 0, "");
+
SYSCTL_INT(_vm, OID_AUTO, vm_ripe_target_age_in_secs, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_ripe_target_age, 0, "");
SYSCTL_INT(_vm, OID_AUTO, compressor_eval_period_in_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_eval_period_in_msecs, 0, "");
int error;
mach_port_name_t task_port_name;
task_t task;
- int buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0;
+ size_t buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0;
vmobject_list_output_t buffer;
size_t output_size;
if (buffer_size) {
- const int min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t);
+ const size_t min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t);
- if (buffer_size < min_size) {
- buffer_size = min_size;
+ if (buffer_size < min_size || buffer_size > INT_MAX) {
+ return EINVAL;
}
buffer = kalloc(buffer_size);
0x1300494 MACH_vm_page_expedite_no_memory
0x1300498 MACH_vm_page_grab
0x130049c MACH_vm_page_release
+0x13004a0 MACH_vm_compressor_compact_and_swap
+0x13004a4 MACH_vm_compressor_do_delayed_compactions
0x13004c0 MACH_vm_pressure_event
0x13004c4 MACH_vm_execve
0x13004c8 MACH_vm_wakeup_compactor_swapper
(error = sodisconnectlocked(so)) != 0)) {
error = EISCONN;
} else {
+ if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
+ (flags & CONNECT_DATA_IDEMPOTENT)) {
+ so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
+
+ if (flags & CONNECT_DATA_AUTHENTICATED) {
+ so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
+ }
+ }
+
+ /*
+ * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
+ * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
+ * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
+ * Case 3 allows user to combine write with connect even if they have
+ * no use for TFO (such as regular TCP, and UDP).
+ * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
+ */
+ if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
+ ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
+ so->so_flags1 |= SOF1_PRECONNECT_DATA;
+ }
+
+ /*
+ * If a user sets data idempotent and does not pass an uio, or
+ * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
+ * SOF1_DATA_IDEMPOTENT.
+ */
+ if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
+ (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
+ /* We should return EINVAL instead perhaps. */
+ so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
+ }
+
/*
* Run connect filter before calling protocol:
* - non-blocking connect returns before completion;
flags, arg, arglen, auio, bytes_written);
if (error != 0) {
so->so_state &= ~SS_ISCONNECTING;
+ if (error != EINPROGRESS) {
+ so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
+ }
}
}
}
/*
- * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
struct mbuf *);
static void soevent_ifdenied(struct socket *);
+static int sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop);
+static int sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop);
+
/*
* Primitive routines for operating on sockets and socket buffers
*/
* the mbuf chain is recorded in sb. Empty mbufs are
* discarded and mbufs are compacted where possible.
*/
-int
-sbappend(struct sockbuf *sb, struct mbuf *m)
+static int
+sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop)
{
struct socket *so = sb->sb_so;
if (m == NULL || (sb->sb_flags & SB_DROP)) {
- if (m != NULL) {
+ if (m != NULL && !nodrop) {
m_freem(m);
}
return 0;
SBLASTRECORDCHK(sb, "sbappend 1");
if (sb->sb_lastrecord != NULL && (sb->sb_mbtail->m_flags & M_EOR)) {
- return sbappendrecord(sb, m);
+ return sbappendrecord_common(sb, m, nodrop);
}
- if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
- int error = sflt_data_in(so, NULL, &m, NULL, 0);
- SBLASTRECORDCHK(sb, "sbappend 2");
+ if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+ ASSERT(nodrop == FALSE);
+ if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
+ int error = sflt_data_in(so, NULL, &m, NULL, 0);
+ SBLASTRECORDCHK(sb, "sbappend 2");
#if CONTENT_FILTER
- if (error == 0) {
- error = cfil_sock_data_in(so, NULL, m, NULL, 0);
- }
+ if (error == 0) {
+ error = cfil_sock_data_in(so, NULL, m, NULL, 0);
+ }
#endif /* CONTENT_FILTER */
- if (error != 0) {
- if (error != EJUSTRETURN) {
- m_freem(m);
+ if (error != 0) {
+ if (error != EJUSTRETURN) {
+ m_freem(m);
+ }
+ return 0;
}
- return 0;
+ } else if (m) {
+ m->m_flags &= ~M_SKIPCFIL;
}
- } else if (m) {
- m->m_flags &= ~M_SKIPCFIL;
}
/* If this is the first record, it's also the last record */
return 1;
}
+int
+sbappend(struct sockbuf *sb, struct mbuf *m)
+{
+ return sbappend_common(sb, m, FALSE);
+}
+
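+/*
+ * Like sbappend(), but the caller retains ownership of the mbuf chain
+ * when it is not appended (e.g. when the sockbuf is marked SB_DROP),
+ * so the chain is not freed here.
+ */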
+int
+sbappend_nodrop(struct sockbuf *sb, struct mbuf *m)
+{
+ return sbappend_common(sb, m, TRUE);
+}
+
/*
* Similar to sbappend, except that this is optimized for stream sockets.
*/
SBLASTMBUFCHK(sb, __func__);
- if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
- int error = sflt_data_in(so, NULL, &m, NULL, 0);
- SBLASTRECORDCHK(sb, "sbappendstream 1");
+ if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+ if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
+ int error = sflt_data_in(so, NULL, &m, NULL, 0);
+ SBLASTRECORDCHK(sb, "sbappendstream 1");
#if CONTENT_FILTER
- if (error == 0) {
- error = cfil_sock_data_in(so, NULL, m, NULL, 0);
- }
+ if (error == 0) {
+ error = cfil_sock_data_in(so, NULL, m, NULL, 0);
+ }
#endif /* CONTENT_FILTER */
- if (error != 0) {
- if (error != EJUSTRETURN) {
- m_freem(m);
+ if (error != 0) {
+ if (error != EJUSTRETURN) {
+ m_freem(m);
+ }
+ return 0;
}
- return 0;
+ } else if (m) {
+ m->m_flags &= ~M_SKIPCFIL;
}
- } else if (m) {
- m->m_flags &= ~M_SKIPCFIL;
}
sbcompress(sb, m, sb->sb_mbtail);
/*
* Similar to sbappend, except the mbuf chain begins a new record.
*/
-int
-sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+static int
+sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop)
{
struct mbuf *m;
int space = 0;
if (m0 == NULL || (sb->sb_flags & SB_DROP)) {
- if (m0 != NULL) {
+ if (m0 != NULL && nodrop == FALSE) {
m_freem(m0);
}
return 0;
}
if (space > sbspace(sb) && !(sb->sb_flags & SB_UNIX)) {
- m_freem(m0);
+ if (nodrop == FALSE) {
+ m_freem(m0);
+ }
return 0;
}
- if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
- int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
- sock_data_filt_flag_record);
+ if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+ ASSERT(nodrop == FALSE);
+ if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
+ int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
+ sock_data_filt_flag_record);
#if CONTENT_FILTER
- if (error == 0) {
- error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
- }
+ if (error == 0) {
+ error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
+ }
#endif /* CONTENT_FILTER */
- if (error != 0) {
- SBLASTRECORDCHK(sb, "sbappendrecord 1");
- if (error != EJUSTRETURN) {
- m_freem(m0);
+ if (error != 0) {
+ SBLASTRECORDCHK(sb, "sbappendrecord 1");
+ if (error != EJUSTRETURN) {
+ m_freem(m0);
+ }
+ return 0;
}
- return 0;
+ } else if (m0) {
+ m0->m_flags &= ~M_SKIPCFIL;
}
- } else if (m0) {
- m0->m_flags &= ~M_SKIPCFIL;
}
/*
return 1;
}
+int
+sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+{
+ return sbappendrecord_common(sb, m0, FALSE);
+}
+
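+/*
+ * Like sbappendrecord(), but does not free the caller's mbuf chain when
+ * the record cannot be appended (SB_DROP set or insufficient space).
+ */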
+int
+sbappendrecord_nodrop(struct sockbuf *sb, struct mbuf *m0)
+{
+ return sbappendrecord_common(sb, m0, TRUE);
+}
+
/*
* Concatenate address (optional), control (optional) and data into one
* single mbuf chain. If sockbuf *sb is passed in, space check will be
return 0;
}
- /* Call socket data in filters */
- if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
- int error;
- error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0);
- SBLASTRECORDCHK(sb, __func__);
+ if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+ /* Call socket data in filters */
+ if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
+ int error;
+ error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0);
+ SBLASTRECORDCHK(sb, __func__);
#if CONTENT_FILTER
- if (error == 0) {
- error = cfil_sock_data_in(sb->sb_so, asa, m0, control,
- 0);
- }
+ if (error == 0) {
+ error = cfil_sock_data_in(sb->sb_so, asa, m0, control,
+ 0);
+ }
#endif /* CONTENT_FILTER */
- if (error) {
- if (error != EJUSTRETURN) {
- if (m0) {
- m_freem(m0);
- }
- if (control != NULL && !sb_unix) {
- m_freem(control);
- }
- if (error_out) {
- *error_out = error;
+ if (error) {
+ if (error != EJUSTRETURN) {
+ if (m0) {
+ m_freem(m0);
+ }
+ if (control != NULL && !sb_unix) {
+ m_freem(control);
+ }
+ if (error_out) {
+ *error_out = error;
+ }
}
+ return 0;
}
- return 0;
+ } else if (m0) {
+ m0->m_flags &= ~M_SKIPCFIL;
}
- } else if (m0) {
- m0->m_flags &= ~M_SKIPCFIL;
}
mbuf_chain = sbconcat_mbufs(sb, asa, m0, control);
return 0;
}
- if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
- int error;
+ if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+ if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
+ int error;
- error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0);
- SBLASTRECORDCHK(sb, __func__);
+ error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0);
+ SBLASTRECORDCHK(sb, __func__);
#if CONTENT_FILTER
- if (error == 0) {
- error = cfil_sock_data_in(sb->sb_so, NULL, m0, control,
- 0);
- }
+ if (error == 0) {
+ error = cfil_sock_data_in(sb->sb_so, NULL, m0, control,
+ 0);
+ }
#endif /* CONTENT_FILTER */
- if (error) {
- if (error != EJUSTRETURN) {
- if (m0) {
- m_freem(m0);
- }
- if (control != NULL && !sb_unix) {
- m_freem(control);
- }
- if (error_out) {
- *error_out = error;
+ if (error) {
+ if (error != EJUSTRETURN) {
+ if (m0) {
+ m_freem(m0);
+ }
+ if (control != NULL && !sb_unix) {
+ m_freem(control);
+ }
+ if (error_out) {
+ *error_out = error;
+ }
}
+ return 0;
}
- return 0;
+ } else if (m0) {
+ m0->m_flags &= ~M_SKIPCFIL;
}
- } else if (m0) {
- m0->m_flags &= ~M_SKIPCFIL;
}
result = sbappendcontrol_internal(sb, m0, control);
user_ssize_t *bytes_written)
{
int error;
-#pragma unused (flags)
VERIFY(dst != NULL);
goto out;
}
- if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
- (flags & CONNECT_DATA_IDEMPOTENT)) {
- so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
-
- if (flags & CONNECT_DATA_AUTHENTICATED) {
- so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
- }
- }
-
- /*
- * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
- * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
- * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
- * Case 3 allows user to combine write with connect even if they have
- * no use for TFO (such as regular TCP, and UDP).
- * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
- */
- if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
- ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
- so->so_flags1 |= SOF1_PRECONNECT_DATA;
- }
-
- /*
- * If a user sets data idempotent and does not pass an uio, or
- * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
- * SOF1_DATA_IDEMPOTENT.
- */
- if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
- (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
- /* We should return EINVAL instead perhaps. */
- so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
- }
-
error = soconnectxlocked(so, src, dst, p, ifscope,
- aid, pcid, 0, NULL, 0, auio, bytes_written);
+ aid, pcid, flags, NULL, 0, auio, bytes_written);
if (error != 0) {
goto out;
}
/*
- * Copyright (c) 2013-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
*
* The socket content filter subsystem provides a way for user space agents to
* make filtering decisions based on the content of the data being sent and
- * received by TCP/IP sockets.
+ * received by INET/INET6 sockets.
*
* A content filter user space agent gets a copy of the data and the data is
* also kept in a kernel buffer until the user space agent makes a pass or drop
* filter agent until an ultimate pass or drop decision is made by the
* user space filter agent.
*
- * It should be noted that messages about many TCP/IP sockets can be multiplexed
+ * It should be noted that messages about many INET/INET6 sockets can be multiplexed
* over a single kernel control socket.
*
* Notes:
- * - The current implementation is limited to TCP sockets.
+ * - The current implementation supports all INET/INET6 sockets (e.g. TCP,
+ * UDP, ICMP, etc).
* - The current implementation supports up to two simultaneous content filters
- * for the sake of simplicity of the implementation.
+ * for iOS devices and eight simultaneous content filters for OSX.
*
*
* NECP FILTER CONTROL UNIT
*
* A user space filter agent uses the Network Extension Control Policy (NECP)
- * database to specify which TCP/IP sockets need to be filtered. The NECP
+ * database to specify which INET/INET6 sockets need to be filtered. The NECP
* criteria may be based on a variety of properties like user ID or proc UUID.
*
* The NECP "filter control unit" is used by the socket content filter subsystem
- * to deliver the relevant TCP/IP content information to the appropriate
+ * to deliver the relevant INET/INET6 content information to the appropriate
* user space filter agent via its kernel control socket instance.
* This works as follows:
*
* content filter kernel control socket via the socket option
* CFIL_OPT_NECP_CONTROL_UNIT.
*
- * 3) The NECP database is consulted to find out if a given TCP/IP socket
+ * 3) The NECP database is consulted to find out if a given INET/INET6 socket
* needs to be subjected to content filtering and returns the corresponding
* NECP filter control unit -- the NECP filter control unit is actually
- * stored in the TCP/IP socket structure so the NECP lookup is really simple.
+ * stored in the INET/INET6 socket structure so the NECP lookup is really simple.
*
* 4) The NECP filter control unit is then used to find the corresponding
* kernel control socket instance.
*
- * Note: NECP currently supports a single filter control unit per TCP/IP socket
+ * Note: NECP currently supports a single filter control unit per INET/INET6 socket
* but this restriction may be soon lifted.
*
*
* communicate over the kernel control socket via an asynchronous
* messaging protocol (this is not a request-response protocol).
* The socket content filter subsystem sends event messages to the user
- * space filter agent about the TCP/IP sockets it is interested to filter.
+ * space filter agent about the INET/INET6 sockets it is interested in filtering.
* The user space filter agent sends action messages to either allow
* data to pass or to disallow the data flow (and drop the connection).
*
* All messages over a content filter kernel control socket share the same
* common header of type "struct cfil_msg_hdr". The message type tells if
* it's an event message "CFM_TYPE_EVENT" or an action message "CFM_TYPE_ACTION".
- * The message header field "cfm_sock_id" identifies a given TCP/IP socket.
+ * The message header field "cfm_sock_id" identifies a given INET/INET6 flow.
+ * For TCP, flows are per-socket. For UDP and other datagram protocols, there
+ * could be multiple flows per socket.
+ *
* Note the message header length field may be padded for alignment and can
* be larger than the actual content of the message.
* The field "cfm_op" describe the kind of event or action.
*
* Here are the kinds of content filter events:
- * - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
- * - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
- * - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
- * - CFM_OP_DATA_IN: A span of data is being or received on a TCP/IP socket
+ * - CFM_OP_SOCKET_ATTACHED: a new INET/INET6 socket is being filtered
+ * - CFM_OP_SOCKET_CLOSED: An INET/INET6 socket is closed
+ * - CFM_OP_DATA_OUT: A span of data is being sent on an INET/INET6 socket
+ * - CFM_OP_DATA_IN: A span of data is being received on an INET/INET6 socket
*
*
* EVENT MESSAGES
*
* The CFM_OP_DATA_UPDATE action messages let the user space filter
* agent allow data to flow up to the specified pass offset -- there
- * is a pass offset for outgoing data and a pass offset for incoming data.
- * When a new TCP/IP socket is attached to the content filter, each pass offset
- * is initially set to 0 so not data is allowed to pass by default.
- * When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
+ * is a pass offset for outgoing data and a pass offset for incoming data.
+ * When a new INET/INET6 socket is attached to the content filter and a flow is
+ * created, each pass offset is initially set to 0 so no data is allowed to pass by
+ * default. When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
* then the data flow becomes unrestricted.
*
* Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
* A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
* to tell the kernel how much data it wants to see by using the peek offsets.
* Just like pass offsets, there is a peek offset for each direction.
- * When a new TCP/IP socket is attached to the content filter, each peek offset
- * is initially set to 0 so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event
- * messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
- * with a greater than 0 peek offset is sent by the user space filter agent.
- * When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
- * then the flow of update data events becomes unrestricted.
+ * When a new INET/INET6 flow is created, each peek offset is initially set to 0
+ * so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages are dispatched by default
+ * until a CFM_OP_DATA_UPDATE action message with a greater than 0 peek offset is sent
+ * by the user space filter agent. When the peek offset is set to CFM_MAX_OFFSET via
+ * a CFM_OP_DATA_UPDATE then the flow of update data events becomes unrestricted.
*
* Note that peek offsets cannot be smaller than the corresponding pass offset.
* Also a peek offset cannot be smaller than the corresponding end offset
* to set a too small peek value is silently ignored.
*
*
- * PER SOCKET "struct cfil_info"
+ * PER FLOW "struct cfil_info"
*
- * As soon as a TCP/IP socket gets attached to a content filter, a
+ * As soon as an INET/INET6 socket gets attached to a content filter, a
* "struct cfil_info" is created to hold the content filtering state for this
- * socket.
+ * socket. For UDP and other datagram protocols, as soon as traffic is seen for
+ * each new flow identified by its 4-tuple of source address/port and destination
+ * address/port, a "struct cfil_info" is created. Each datagram socket may
+ * have multiple flows maintained in a hash table of "struct cfil_info" entries.
*
* The content filtering state is made of the following information
* for each direction:
*
* CONTENT FILTER QUEUES
*
- * Data that is being filtered is steered away from the TCP/IP socket buffer
+ * Data that is being filtered is steered away from the INET/INET6 socket buffer
* and instead will sit in one of three content filter queues until the data
- * can be re-injected into the TCP/IP socket buffer.
+ * can be re-injected into the INET/INET6 socket buffer.
*
* A content filter queue is represented by "struct cfil_queue" that contains
* a list of mbufs and the start and end offset of the data span of
* c) The "cfi_inject_q" of "struct cfil_info"
*
* Note: The sequence (a),(b) may be repeated several times if there is more
- * than one content filter attached to the TCP/IP socket.
+ * than one content filter attached to the INET/INET6 socket.
*
* The "cfe_ctl_q" queue holds data than cannot be delivered to the
* kernel conntrol socket for two reasons:
*
* The "cfi_inject_q" queue holds data that has been fully allowed to pass
* by the user space filter agent and that needs to be re-injected into the
- * TCP/IP socket.
+ * INET/INET6 socket.
*
*
* IMPACT ON FLOW CONTROL
*
* An essential aspect of the content filter subsystem is to minimize the
- * impact on flow control of the TCP/IP sockets being filtered.
+ * impact on flow control of the INET/INET6 sockets being filtered.
*
* The processing overhead of the content filtering may have an effect on
* flow control by adding noticeable delays and cannot be eliminated --
* The amount of data being filtered is kept in buffers while waiting for
* a decision by the user space filter agent. This amount of data pending
* needs to be subtracted from the amount of data available in the
- * corresponding TCP/IP socket buffer. This is done by modifying
+ * corresponding INET/INET6 socket buffer. This is done by modifying
* sbspace() and tcp_sbspace() to account for amount of data pending
* in the content filter.
*
* cfil read-write lock held as shared so it can be re-entered from multiple
* threads.
*
- * The per TCP/IP socket content filterstate -- "struct cfil_info" -- is
+ * The per INET/INET6 socket content filter state -- "struct cfil_info" -- is
* protected by the socket lock.
*
- * A TCP/IP socket lock cannot be taken while the cfil read-write lock
+ * An INET/INET6 socket lock cannot be taken while the cfil read-write lock
* is held. That's why we have some sequences where we drop the cfil read-write
- * lock before taking the TCP/IP lock.
+ * lock before taking the INET/INET6 lock.
*
- * It is also important to lock the TCP/IP socket buffer while the content
+ * It is also important to lock the INET/INET6 socket buffer while the content
* filter is modifying the amount of pending data. Otherwise the calculations
* in sbspace() and tcp_sbspace() could be wrong.
*
* To read the other fields of "struct content_filter" we have to take
* "cfil_lck_rw" in shared mode.
*
+ * DATAGRAM SPECIFICS:
+ *
+ * The socket content filter supports all INET/INET6 protocols. However
+ * the treatments for TCP sockets and for datagram (UDP, ICMP, etc) sockets
+ * are slightly different.
+ *
+ * Each datagram socket may have multiple flows. Each flow is identified
+ * by the flow's source address/port and destination address/port tuple
+ * and is represented as a "struct cfil_info" entry. For each socket,
+ * a hash table is used to maintain the collection of flows under that socket.
+ *
+ * Each datagram flow is uniquely identified by its "struct cfil_info" cfi_sock_id.
+ * The highest 32 bits of the cfi_sock_id contain the socket's so_gencnt. This portion
+ * of the cfi_sock_id is used to locate the socket during socket lookup. The lowest 32 bits
+ * of the cfi_sock_id contain a hash of the flow's 4-tuple. This portion of the cfi_sock_id
+ * is used as the hash value for the flow hash table lookup within the parent socket
+ * (a short sketch of this composition follows this block comment).
+ *
+ * Since datagram sockets may not be connected, flow states may not be maintained in the
+ * socket structures and thus have to be saved for each packet. These saved states will be
+ * used for both outgoing and incoming reinjections. For outgoing packets, destination
+ * address/port as well as the current socket states will be saved. During reinjection,
+ * these saved states will be used instead. For incoming packets, control and address
+ * mbufs will be chained to the data. During reinjection, the whole chain will be queued
+ * onto the incoming socket buffer.
*
* LIMITATIONS
*
- * - For TCP sockets only
+ * - Supports all INET/INET6 sockets, such as TCP, UDP, ICMP, etc.
*
* - Does not support TCP unordered messages
*/
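+/*
+ * For illustration only -- a rough sketch of the cfi_sock_id composition
+ * described above, given a "struct socket *so" and a 32-bit hash of the
+ * flow's 4-tuple ("flowhash" and "id" are placeholder names, not fields
+ * defined in this file):
+ *
+ *	cfil_sock_id_t id = ((cfil_sock_id_t)so->so_gencnt << 32) |
+ *	    (flowhash & 0x0ffffffff);
+ *
+ * and, during lookup, split back into its two halves:
+ *
+ *	u_int32_t gencnt = (u_int32_t)(id >> 32);
+ *	u_int32_t hash = (u_int32_t)(id & 0x0ffffffff);
+ */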
/*
* TO DO LIST
*
- * SOONER:
- *
* Deal with OOB
*
- * LATER:
- *
- * If support datagram, enqueue control and address mbufs as well
*/
#include <sys/types.h>
#include <net/content_filter.h>
#include <net/content_filter_crypto.h>
+#define _IP_VHL
+#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#define MAX_CONTENT_FILTER 8
#endif
+extern struct inpcbinfo ripcbinfo;
struct cfil_entry;
/*
uint64_t cfi_byte_outbound_count;
boolean_t cfi_isSignatureLatest; /* Indicates if signature covers latest flow attributes */
+ u_int32_t cfi_debug;
struct cfi_buf {
/*
* cfi_pending_first and cfi_pending_last describe the total
LIST_HEAD(cfilhashhead, cfil_hash_entry);
#define CFILHASHSIZE 16
#define CFIL_HASH(laddr, faddr, lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport))
+
+#define IS_INET(so) (so && so->so_proto && so->so_proto->pr_domain && (so->so_proto->pr_domain->dom_family == AF_INET || so->so_proto->pr_domain->dom_family == AF_INET6))
+#define IS_TCP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP)
#define IS_UDP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP)
+#define IS_ICMP(so) (so && so->so_proto && (so->so_proto->pr_type == SOCK_RAW || so->so_proto->pr_type == SOCK_DGRAM) && \
+ (so->so_proto->pr_protocol == IPPROTO_ICMP || so->so_proto->pr_protocol == IPPROTO_ICMPV6))
+#define IS_RAW(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_RAW && so->so_proto->pr_protocol == IPPROTO_RAW)
+
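+/*
+ * IS_IP_DGRAM: on non-macOS platforms only UDP sockets are treated as
+ * content-filter datagram flows; on macOS every non-TCP INET/INET6
+ * socket is.
+ */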
+#if !TARGET_OS_OSX && !defined(XNU_TARGET_OS_OSX)
+#define IS_IP_DGRAM(so) (IS_INET(so) && IS_UDP(so))
+#else
+#define IS_IP_DGRAM(so) (IS_INET(so) && !IS_TCP(so))
+#endif
+
+#define OPTIONAL_IP_HEADER(so) (!IS_TCP(so) && !IS_UDP(so))
+#define GET_SO_PROTO(so) ((so && so->so_proto) ? so->so_proto->pr_protocol : IPPROTO_MAX)
+#define IS_INP_V6(inp) (inp && (inp->inp_vflag & INP_IPV6))
+
#define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \
((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))))
#define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \
union sockaddr_in_4_6 cfil_faddr;
uint32_t cfil_so_state_change_cnt;
short cfil_so_options;
+ int cfil_inp_flags;
};
#define CFIL_HASH_ENTRY_ZONE_NAME "cfil_entry_hash"
static unsigned int cfil_data_length(struct mbuf *, int *, int *);
static errno_t cfil_db_init(struct socket *);
static void cfil_db_free(struct socket *so);
-struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
+struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *, boolean_t);
struct cfil_hash_entry *cfil_db_lookup_entry_with_sockid(struct cfil_db *, u_int64_t);
struct cfil_hash_entry *cfil_db_add_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
+void cfil_db_update_entry_local(struct cfil_db *, struct cfil_hash_entry *, struct sockaddr *);
void cfil_db_delete_entry(struct cfil_db *, struct cfil_hash_entry *);
-struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *);
+struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *, int);
struct cfil_info *cfil_db_get_cfil_info(struct cfil_db *, cfil_sock_id_t);
static errno_t cfil_sock_udp_handle_data(bool, struct socket *, struct sockaddr *, struct sockaddr *,
struct mbuf *, struct mbuf *, uint32_t);
bool cfil_info_idle_timed_out(struct cfil_info *, int, u_int32_t);
bool cfil_info_action_timed_out(struct cfil_info *, int);
bool cfil_info_buffer_threshold_exceeded(struct cfil_info *);
-struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *);
+struct m_tag *cfil_dgram_save_socket_state(struct cfil_info *, struct mbuf *);
+boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags);
static void cfil_udp_gc_thread_func(void *, wait_result_t);
static void cfil_info_udp_expire(void *, wait_result_t);
static bool fill_cfil_hash_entry_from_address(struct cfil_hash_entry *, bool, struct sockaddr *);
}
lck_rw_done(pcbinfo->ipi_lock);
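+ /*
+  * Also search the raw IP PCB list (ripcbinfo) for a live socket that
+  * has a content filter db and whose generation count matches the
+  * sock id.
+  */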
+ pcbinfo = &ripcbinfo;
+ lck_rw_lock_shared(pcbinfo->ipi_lock);
+ LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
+ if (inp->inp_state != INPCB_STATE_DEAD &&
+ inp->inp_socket != NULL &&
+ inp->inp_socket->so_cfil_db != NULL &&
+ (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
+ if (cfil_socket_safe_lock(inp)) {
+ so = inp->inp_socket;
+ }
+ break;
+ }
+ }
+ lck_rw_done(pcbinfo->ipi_lock);
+
done:
if (so == NULL) {
OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
error = EINVAL;
goto unlock;
}
+
+ if (cfil_info->cfi_debug) {
+ cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED MSG FROM FILTER");
+ }
+
entry = &cfil_info->cfi_entries[kcunit - 1];
if (entry->cfe_filter == NULL) {
CFIL_LOG(LOG_NOTICE, "so %llx no filter",
switch (msghdr->cfm_op) {
case CFM_OP_DATA_UPDATE:
+
+ if (cfil_info->cfi_debug) {
+ cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DATA_UPDATE");
+ CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
+ (uint64_t)VM_KERNEL_ADDRPERM(so),
+ cfil_info->cfi_sock_id,
+ action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
+ action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
+ }
+
#if VERDICT_DEBUG
CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
(uint64_t)VM_KERNEL_ADDRPERM(so),
break;
case CFM_OP_DROP:
+ if (cfil_info->cfi_debug) {
+ cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DROP");
+ CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
+ (uint64_t)VM_KERNEL_ADDRPERM(so),
+ cfil_info->cfi_sock_id,
+ action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
+ action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
+ }
+
#if VERDICT_DEBUG
CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
(uint64_t)VM_KERNEL_ADDRPERM(so),
errno_t
cfil_sock_detach(struct socket *so)
{
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
cfil_db_free(so);
return 0;
}
boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT);
union sockaddr_in_4_6 *src = outgoing ? &data.local : &data.remote;
union sockaddr_in_4_6 *dst = outgoing ? &data.remote : &data.local;
- cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, inp->inp_vflag & INP_IPV4, outgoing);
+ cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, !IS_INP_V6(inp), outgoing);
}
data.byte_count_in = cfil_info->cfi_byte_inbound_count;
if (hash_entry_ptr != NULL) {
cfil_fill_event_msg_addresses(hash_entry_ptr, inp,
&msg_attached.cfs_src, &msg_attached.cfs_dst,
- inp->inp_vflag & INP_IPV4, conn_dir == CFS_CONNECTION_DIR_OUT);
+ !IS_INP_V6(inp), conn_dir == CFS_CONNECTION_DIR_OUT);
}
msg_attached.cfs_conn_dir = conn_dir;
}
}
+ if (cfil_info->cfi_debug) {
+ cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING ATTACH UP");
+ }
+
cfil_dispatch_attach_event_sign(entry->cfe_filter->cf_crypto_state, cfil_info, &msg_attached);
#if LIFECYCLE_DEBUG
goto done;
}
+ if (cfil_info->cfi_debug) {
+ cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DISCONNECT UP");
+ }
+
#if LIFECYCLE_DEBUG
cfil_info_log(LOG_ERR, cfil_info, outgoing ?
"CFIL: LIFECYCLE: OUT - SENDING DISCONNECT UP":
cfil_dispatch_closed_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, &msg_closed);
+ if (cfil_info->cfi_debug) {
+ cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING CLOSED UP");
+ }
+
#if LIFECYCLE_DEBUG
CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: <sock id %llu> op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec);
#endif
struct cfe_buf *entrybuf;
struct content_filter *cfc;
struct timeval tv;
+ int inp_flags = 0;
cfil_rw_lock_shared(&cfil_lck_rw);
data_req->cfd_start_offset = entrybuf->cfe_peeked;
data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
+ data_req->cfd_flags = 0;
+ if (OPTIONAL_IP_HEADER(so)) {
+ /*
+ * For non-UDP/TCP traffic, indicate to filters if optional
+ * IP header is present:
+ * outgoing - indicate according to INP_HDRINCL flag
+ * incoming - For IPv4 only, stripping of IP header is
+ * optional. But for CFIL, we delay stripping
+ * at rip_input. So CFIL always expects IP
+ * frames. IP header will be stripped according
+ * to INP_STRIPHDR flag later at reinjection.
+ */
+ if ((!outgoing && !IS_INP_V6(inp)) ||
+ (outgoing && cfil_dgram_peek_socket_state(data, &inp_flags) && (inp_flags & INP_HDRINCL))) {
+ data_req->cfd_flags |= CFD_DATA_FLAG_IP_HEADER;
+ }
+ }
+
/*
* Copy address/port into event msg.
* For non connected sockets need to copy addresses from passed
*/
cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
&data_req->cfc_src, &data_req->cfc_dst,
- inp->inp_vflag & INP_IPV4, outgoing);
+ !IS_INP_V6(inp), outgoing);
+
+ if (cfil_info->cfi_debug) {
+ cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DATA UP");
+ }
if (cfil_info->cfi_isSignatureLatest == false) {
cfil_dispatch_data_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, data_req);
(uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen);
#endif
+ if (cfil_info->cfi_debug) {
+ CFIL_LOG(LOG_ERR, "CFIL: VERDICT ACTION: so %llx sockID %llu outgoing %d: mbuf %llx copyoffset %u copylen %u (%s)",
+ (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen,
+ data_req->cfd_flags & CFD_DATA_FLAG_IP_HEADER ? "IP HDR" : "NO IP HDR");
+ }
+
done:
if (error == ENOBUFS) {
entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
struct cfil_queue *inject_q;
int need_rwakeup = 0;
int count = 0;
+ struct inpcb *inp = NULL;
+ struct ip *ip = NULL;
+ unsigned int hlen;
if (cfil_info == NULL) {
return 0;
datalen = cfil_data_length(data, &mbcnt, &mbnum);
#if DATA_DEBUG
- CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE INJECT-Q: <%s>: <so %llx> data %llx datalen %u (mbcnt %u)",
- remote_addr_ptr ? "UNCONNECTED" : "CONNECTED",
+ CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> data %llx datalen %u (mbcnt %u)",
(uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt);
#endif
+ if (cfil_info->cfi_debug) {
+ CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> data %llx datalen %u (mbcnt %u)",
+ (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt);
+ }
/* Remove data from queue and adjust stats */
cfil_queue_remove(inject_q, data, datalen);
data->m_flags |= M_SKIPCFIL;
/*
- * NOTE: We currently only support TCP and UDP.
- * For RAWIP, MPTCP and message TCP we'll
+ * NOTE: We currently only support TCP, UDP, ICMP,
+ * ICMPv6 and RAWIP. For MPTCP and message TCP we'll
* need to call the appropriate sbappendxxx()
* or fix sock_inject_data_in()
*/
- if (IS_UDP(so) == TRUE) {
+ if (IS_IP_DGRAM(so)) {
+ if (OPTIONAL_IP_HEADER(so)) {
+ inp = sotoinpcb(so);
+ if (inp && (inp->inp_flags & INP_STRIPHDR)) {
+ mbuf_t data_start = cfil_data_start(data);
+ if (data_start != NULL && (data_start->m_flags & M_PKTHDR)) {
+ ip = mtod(data_start, struct ip *);
+ hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+ data_start->m_len -= hlen;
+ data_start->m_pkthdr.len -= hlen;
+ data_start->m_data += hlen;
+ }
+ }
+ }
+
if (sbappendchain(&so->so_rcv, data, 0)) {
need_rwakeup = 1;
}
CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> injected %d",
(uint64_t)VM_KERNEL_ADDRPERM(so), count);
#endif
+ if (cfil_info->cfi_debug) {
+ CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> injected %d",
+ (uint64_t)VM_KERNEL_ADDRPERM(so), count);
+ }
/* A single wakeup is for several packets is more efficient */
if (need_rwakeup) {
cfi_buf->cfi_pending_mbcnt += mbcnt;
cfi_buf->cfi_pending_mbnum += mbnum;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
if (cfi_buf->cfi_pending_mbnum > cfil_udp_gc_mbuf_num_max ||
cfi_buf->cfi_pending_mbcnt > cfil_udp_gc_mbuf_cnt_max) {
cfi_buf->cfi_tail_drop_cnt++;
// Is cfil attached to this filter?
kcunit = CFI_ENTRY_KCUNIT(cfil_info, iter_entry);
if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) {
- if (IS_UDP(so) && chain == NULL) {
- /* UDP only:
+ if (IS_IP_DGRAM(so) && chain == NULL) {
+ /* Datagrams only:
* Chain addr (incoming only TDB), control (optional) and data into one chain.
* This full chain will be reinjected into the socket after receiving the verdict.
*/
- (void) cfil_udp_save_socket_state(cfil_info, data);
+ (void) cfil_dgram_save_socket_state(cfil_info, data);
chain = sbconcat_mbufs(NULL, outgoing ? NULL : to, data, control);
if (chain == NULL) {
return ENOBUFS;
{
int error = 0;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
return cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags);
}
{
int error = 0;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
return cfil_sock_udp_handle_data(FALSE, so, NULL, from, data, control, flags);
}
{
int error = 0;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
return cfil_sock_udp_shutdown(so, how);
}
errno_t error = 0;
int kcunit;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
cfil_sock_udp_is_closed(so);
return;
}
errno_t error = 0;
int kcunit;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
cfil_sock_udp_notify_shutdown(so, how, 0, 0);
return;
}
uint32_t kcunit;
int attached = 0;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
return cfil_filters_udp_attached(so, FALSE);
}
struct timespec ts;
int error;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
cfil_sock_udp_close_wait(so);
return;
}
struct socket *so = sb->sb_so;
uint64_t pending = 0;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
return cfil_sock_udp_data_pending(sb, FALSE);
}
struct socket *so = sb->sb_so;
uint64_t pending = 0;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
return cfil_sock_udp_data_pending(sb, TRUE);
}
int error;
struct socket *so = sb->sb_so;
- if (IS_UDP(so)) {
+ if (IS_IP_DGRAM(so)) {
cfil_sock_udp_buf_update(sb);
return;
}
return;
}
- CFIL_LOG(level, "<%s>: <UDP so %llx, entry %p, sockID %llu> lport %d fport %d laddr %s faddr %s",
+ CFIL_LOG(level, "<%s>: <%s(%d) so %llx, entry %p, sockID %llu> lport %d fport %d laddr %s faddr %s",
msg,
+ IS_UDP(so) ? "UDP" : "proto", GET_SO_PROTO(so),
(uint64_t)VM_KERNEL_ADDRPERM(so), entry, sockId,
ntohs(entry->cfentry_lport), ntohs(entry->cfentry_fport), local, remote);
}
return FALSE;
}
- if (inp->inp_vflag & INP_IPV4) {
+ if (inp->inp_vflag & INP_IPV6) {
if (isLocal == TRUE) {
entry->cfentry_lport = inp->inp_lport;
- entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr;
+ entry->cfentry_laddr.addr6 = inp->in6p_laddr;
} else {
entry->cfentry_fport = inp->inp_fport;
- entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr;
+ entry->cfentry_faddr.addr6 = inp->in6p_faddr;
}
- entry->cfentry_family = AF_INET;
+ entry->cfentry_family = AF_INET6;
return TRUE;
- } else if (inp->inp_vflag & INP_IPV6) {
+ } else if (inp->inp_vflag & INP_IPV4) {
if (isLocal == TRUE) {
entry->cfentry_lport = inp->inp_lport;
- entry->cfentry_laddr.addr6 = inp->in6p_laddr;
+ entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr;
} else {
entry->cfentry_fport = inp->inp_fport;
- entry->cfentry_faddr.addr6 = inp->in6p_faddr;
+ entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr;
}
- entry->cfentry_family = AF_INET6;
+ entry->cfentry_family = AF_INET;
return TRUE;
}
return FALSE;
}
struct cfil_hash_entry *
-cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote)
+cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote, boolean_t remoteOnly)
{
- struct cfil_hash_entry matchentry;
+ struct cfil_hash_entry matchentry = { };
struct cfil_hash_entry *nextentry = NULL;
struct inpcb *inp = sotoinpcb(db->cfdb_so);
u_int32_t hashkey_faddr = 0, hashkey_laddr = 0;
+ u_int16_t hashkey_fport = 0, hashkey_lport = 0;
int inp_hash_element = 0;
struct cfilhashhead *cfilhash = NULL;
goto done;
}
- if (local != NULL) {
- fill_cfil_hash_entry_from_address(&matchentry, TRUE, local);
- } else {
- fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp);
+ if (remoteOnly == false) {
+ if (local != NULL) {
+ fill_cfil_hash_entry_from_address(&matchentry, TRUE, local);
+ } else {
+ fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp);
+ }
}
if (remote != NULL) {
fill_cfil_hash_entry_from_address(&matchentry, FALSE, remote);
#if INET6
if (inp->inp_vflag & INP_IPV6) {
hashkey_faddr = matchentry.cfentry_faddr.addr6.s6_addr32[3];
- hashkey_laddr = matchentry.cfentry_laddr.addr6.s6_addr32[3];
+ hashkey_laddr = (remoteOnly == false) ? matchentry.cfentry_laddr.addr6.s6_addr32[3] : 0;
} else
#endif /* INET6 */
{
hashkey_faddr = matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr;
- hashkey_laddr = matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr;
+ hashkey_laddr = (remoteOnly == false) ? matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr : 0;
}
- inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr,
- matchentry.cfentry_lport, matchentry.cfentry_fport);
+ hashkey_fport = matchentry.cfentry_fport;
+ hashkey_lport = (remoteOnly == false) ? matchentry.cfentry_lport : 0;
+
+ inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr, hashkey_lport, hashkey_fport);
inp_hash_element &= db->cfdb_hashmask;
cfilhash = &db->cfdb_hashbase[inp_hash_element];
LIST_FOREACH(nextentry, cfilhash, cfentry_link) {
#if INET6
if ((inp->inp_vflag & INP_IPV6) &&
- nextentry->cfentry_lport == matchentry.cfentry_lport &&
+ (remoteOnly || nextentry->cfentry_lport == matchentry.cfentry_lport) &&
nextentry->cfentry_fport == matchentry.cfentry_fport &&
- IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6) &&
+ (remoteOnly || IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6)) &&
IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_faddr.addr6, &matchentry.cfentry_faddr.addr6)) {
#if DATA_DEBUG
cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V6 found entry");
return nextentry;
} else
#endif /* INET6 */
- if (nextentry->cfentry_lport == matchentry.cfentry_lport &&
+ if ((remoteOnly || nextentry->cfentry_lport == matchentry.cfentry_lport) &&
nextentry->cfentry_fport == matchentry.cfentry_fport &&
- nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr &&
+ (remoteOnly || nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr) &&
nextentry->cfentry_faddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr) {
#if DATA_DEBUG
cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V4 found entry");
return entry;
}
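+/*
+ * Update the local address/port of an existing flow entry, using the
+ * supplied local sockaddr or, if none is given, the addresses in the inpcb.
+ */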
+void
+cfil_db_update_entry_local(struct cfil_db *db, struct cfil_hash_entry *entry, struct sockaddr *local)
+{
+ struct inpcb *inp = sotoinpcb(db->cfdb_so);
+
+ CFIL_LOG(LOG_INFO, "");
+
+ if (inp == NULL || entry == NULL) {
+ return;
+ }
+
+ if (local != NULL) {
+ fill_cfil_hash_entry_from_address(entry, TRUE, local);
+ } else {
+ fill_cfil_hash_entry_from_inp(entry, TRUE, inp);
+ }
+ cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, entry, 0, "CFIL: cfil_db_update_entry_local: local updated");
+
+ return;
+}
+
struct cfil_info *
cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id)
{
}
struct cfil_hash_entry *
-cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote)
+cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote, int debug)
{
struct cfil_hash_entry *hash_entry = NULL;
}
// See if flow already exists.
- hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote);
+ hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote, false);
+ if (hash_entry == NULL) {
+ // No match with both local and remote, try match with remote only
+ hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote, true);
+ if (hash_entry != NULL) {
+ // Simply update the local address into the original flow, keeping
+ // its sockId and flow_hash unchanged.
+ cfil_db_update_entry_local(so->so_cfil_db, hash_entry, local);
+ }
+ }
if (hash_entry != NULL) {
return hash_entry;
}
return NULL;
}
hash_entry->cfentry_cfil->cfi_dir = outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN;
+ hash_entry->cfentry_cfil->cfi_debug = debug;
#if LIFECYCLE_DEBUG
cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED");
/* Hold a reference on the socket for each flow */
so->so_usecount++;
+ if (debug) {
+ cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED");
+ }
+
error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, 0,
outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN);
/* We can recover from flow control or out of memory errors */
uint32_t filter_control_unit;
struct cfil_hash_entry *hash_entry = NULL;
struct cfil_info *cfil_info = NULL;
+ int debug = 0;
socket_lock_assert_owned(so);
return error;
}
- hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote);
+ hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote, debug);
if (hash_entry == NULL || hash_entry->cfentry_cfil == NULL) {
CFIL_LOG(LOG_ERR, "CFIL: Falied to create UDP flow");
return EPIPE;
break;
}
- if (IS_UDP(cfil_info->cfi_so)) {
+ if (IS_IP_DGRAM(cfil_info->cfi_so)) {
if (cfil_info_idle_timed_out(cfil_info, UDP_FLOW_GC_IDLE_TO, current_time) ||
cfil_info_action_timed_out(cfil_info, UDP_FLOW_GC_ACTION_TO) ||
cfil_info_buffer_threshold_exceeded(cfil_info)) {
}
struct m_tag *
-cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
+cfil_dgram_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
{
struct m_tag *tag = NULL;
struct cfil_tag *ctag = NULL;
struct cfil_hash_entry *hash_entry = NULL;
+ struct inpcb *inp = NULL;
if (cfil_info == NULL || cfil_info->cfi_so == NULL ||
cfil_info->cfi_hash_entry == NULL || m == NULL || !(m->m_flags & M_PKTHDR)) {
return NULL;
}
+ inp = sotoinpcb(cfil_info->cfi_so);
+
/* Allocate a tag */
tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP,
sizeof(struct cfil_tag), M_DONTWAIT, m);
ctag = (struct cfil_tag*)(tag + 1);
ctag->cfil_so_state_change_cnt = cfil_info->cfi_so->so_state_change_cnt;
ctag->cfil_so_options = cfil_info->cfi_so->so_options;
+ ctag->cfil_inp_flags = inp ? inp->inp_flags : 0;
hash_entry = cfil_info->cfi_hash_entry;
if (hash_entry->cfentry_family == AF_INET6) {
}
struct m_tag *
-cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options,
- struct sockaddr **faddr)
+cfil_dgram_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options,
+ struct sockaddr **faddr, int *inp_flags)
{
struct m_tag *tag = NULL;
struct cfil_tag *ctag = NULL;
if (faddr) {
*faddr = (struct sockaddr *) &ctag->cfil_faddr;
}
+ if (inp_flags) {
+ *inp_flags = ctag->cfil_inp_flags;
+ }
/*
* Unlink tag and hand it over to caller.
return NULL;
}
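+/*
+ * Peek at the inp_flags saved in the CFIL mbuf tag, if any, without
+ * unlinking the tag from the mbuf. Returns true if a tag was found.
+ */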
+boolean_t
+cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags)
+{
+ struct m_tag *tag = NULL;
+ struct cfil_tag *ctag = NULL;
+
+ tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, NULL);
+ if (tag) {
+ ctag = (struct cfil_tag *)(tag + 1);
+ if (inp_flags) {
+ *inp_flags = ctag->cfil_inp_flags;
+ }
+ return true;
+ }
+ return false;
+}
+
static int
cfil_dispatch_stats_event_locked(int kcunit, struct cfil_stats_report_buffer *buffer, uint32_t stats_count)
{
union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL;
union sockaddr_in_4_6 *dst = outgoing ? NULL : &cfil_info->cfi_so_attach_laddr;
cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
- src, dst, inp->inp_vflag & INP_IPV4, outgoing);
+ src, dst, !IS_INP_V6(inp), outgoing);
}
}
uint32_t cfs_signature_length;
};
+/*
+ * CFIL data flags
+ */
+#define CFD_DATA_FLAG_IP_HEADER 0x00000001 /* Data includes IP header */
+
/*
* struct cfil_msg_data_event
*
uint64_t cfd_end_offset;
cfil_crypto_signature cfd_signature;
uint32_t cfd_signature_length;
+ uint32_t cfd_flags;
/* Actual content data immediately follows */
};
extern cfil_sock_id_t cfil_sock_id_from_socket(struct socket *so);
-extern struct m_tag *cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt,
- short *options, struct sockaddr **faddr);
+extern struct m_tag *cfil_dgram_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt,
+ short *options, struct sockaddr **faddr, int *inp_flags);
+extern boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags);
+
#endif /* BSD_KERNEL_PRIVATE */
__END_DECLS
#include <net/if_var.h>
#include <net/if_media.h>
#include <net/net_api_stats.h>
+#include <net/pfvar.h>
#include <netinet/in.h> /* for struct arpcom */
#include <netinet/in_systm.h>
switch (ifs->if_type) {
case IFT_ETHER:
if (strcmp(ifs->if_name, "en") == 0 &&
- ifs->if_subfamily == IFNET_SUBFAMILY_WIFI) {
+ ifs->if_subfamily == IFNET_SUBFAMILY_WIFI &&
+ (ifs->if_eflags & IFEF_IPV4_ROUTER) == 0) {
/* XXX is there a better way to identify Wi-Fi STA? */
mac_nat = TRUE;
}
necp_get_parent_cred_result(NULL, info);
}
}
+ }
- if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
- info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false;
- }
+ if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
+ info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false;
}
if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && inp->inp_necp_attributes.inp_account != NULL) {
dst->tagname[sizeof(dst->tagname) - 1] = '\0';
dst->match_tagname[sizeof(dst->match_tagname) - 1] = '\0';
dst->overload_tblname[sizeof(dst->overload_tblname) - 1] = '\0';
+ dst->owner[sizeof(dst->owner) - 1] = '\0';
dst->cuid = kauth_cred_getuid(p->p_ucred);
dst->cpid = p->p_pid;
dst->kif = NULL;
dst->overload_tbl = NULL;
- TAILQ_INIT(&dst->rpool.list);
+ dst->rpool.list.tqh_first = NULL;
+ dst->rpool.list.tqh_last = NULL;
dst->rpool.cur = NULL;
dst->entries.tqe_prev = NULL;
"sbappendaddr failed. send buffer size = %u, send_window = %u, error = %d\n",
fd_cb->so->so_snd.sb_cc, fd_cb->send_window, error);
}
+ error = 0;
} else {
if (!sbappendrecord(&fd_cb->so->so_snd, data)) {
FDLOG(LOG_ERR, fd_cb,
FDLOG0(LOG_INFO, fd_cb, "No remote address provided");
error = 0;
} else {
+ if (remote_address.ss_len > sizeof(remote_address)) {
+ remote_address.ss_len = sizeof(remote_address);
+ }
/* validate the address */
if (flow_divert_is_sockaddr_valid((struct sockaddr *)&remote_address)) {
got_remote_sa = TRUE;
struct flow_divert_pcb *fd_cb = so->so_fd_pcb;
int error = 0;
struct inpcb *inp;
+#if CONTENT_FILTER
+ struct m_tag *cfil_tag = NULL;
+#endif
VERIFY((so->so_flags & SOF_FLOW_DIVERT) && so->so_fd_pcb != NULL);
*/
if (to == NULL && so->so_cfil_db) {
struct sockaddr *cfil_faddr = NULL;
- struct m_tag *cfil_tag = cfil_udp_get_socket_state(data, NULL, NULL, &cfil_faddr);
+ cfil_tag = cfil_dgram_get_socket_state(data, NULL, NULL, &cfil_faddr, NULL);
if (cfil_tag) {
to = (struct sockaddr *)(void *)cfil_faddr;
}
if (control) {
mbuf_free(control);
}
+#if CONTENT_FILTER
+ if (cfil_tag) {
+ m_tag_free(cfil_tag);
+ }
+#endif
+
return error;
}
sorwakeup(so);
}
}
- flow_divert_set_protosw(so);
+ if (SOCK_TYPE(so) == SOCK_STREAM) {
+ flow_divert_set_protosw(so);
+ } else if (SOCK_TYPE(so) == SOCK_DGRAM) {
+ flow_divert_set_udp_protosw(so);
+ }
+
socket_unlock(so, 0);
fd_cb->so = so;
#if defined(XNU_TARGET_OS_OSX)
if (so->so_rpid > 0) {
lookup_uuid = so->so_ruuid;
+ ogencnt = so->so_policy_gencnt;
+ err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
}
#endif
- if (lookup_uuid == NULL) {
+ if (lookup_uuid == NULL || err == ENOENT) {
lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid);
+ ogencnt = so->so_policy_gencnt;
+ err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
}
- ogencnt = so->so_policy_gencnt;
- err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
-
/*
* Discard cached generation count if the entry is gone (ENOENT),
* so that we go thru the checks below.
#include <net/if.h>
#include <net/route.h>
+#include <net/content_filter.h>
#define _IP_VHL
#include <netinet/in.h>
struct in_ifaddr *ia = NULL;
int icmplen;
int error = EINVAL;
+ int inp_flags = inp ? inp->inp_flags : 0;
if (inp == NULL
#if NECP
goto bad;
}
- if ((inp->inp_flags & INP_HDRINCL) != 0) {
+#if CONTENT_FILTER
+ /*
+ * If socket is subject to Content Filter, get inp_flags from saved state
+ */
+ if (so->so_cfil_db && nam == NULL) {
+ cfil_dgram_peek_socket_state(m, &inp_flags);
+ }
+#endif
+
+ if ((inp_flags & INP_HDRINCL) != 0) {
/* Expect 32-bit aligned data ptr on strict-align platforms */
MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
/*
goto out_err;
}
+ if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
+ error = EOVERFLOW;
+ goto out_err;
+ }
+
mpts = mptcp_subflow_alloc();
if (mpts == NULL) {
os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
}
static int
-mptcp_entitlement_check(struct socket *mp_so)
+mptcp_entitlement_check(struct socket *mp_so, uint8_t svctype)
{
struct mptses *mpte = mpsotompte(mp_so);
}
#endif
- if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
+ if (svctype == MPTCP_SVCTYPE_AGGREGATE) {
if (mptcp_developer_mode) {
return 0;
}
deny:
os_log_error(mptcp_log_handle, "%s - %lx: MPTCP prohibited on svc %u\n",
- __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_svctype);
+ __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), svctype);
return -1;
}
}
if (!(mpte->mpte_flags & MPTE_SVCTYPE_CHECKED)) {
- if (mptcp_entitlement_check(mp_so) < 0) {
+ if (mptcp_entitlement_check(mp_so, mpte->mpte_svctype) < 0) {
error = EPERM;
goto out;
}
goto err_out;
}
- mpte->mpte_svctype = optval;
-
- if (mptcp_entitlement_check(mp_so) < 0) {
+ if (mptcp_entitlement_check(mp_so, optval) < 0) {
error = EACCES;
goto err_out;
}
+ mpte->mpte_svctype = optval;
mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED;
goto out;
struct mptcb *mpte_mptcb; /* ptr to MPTCP PCB */
TAILQ_HEAD(, mptopt) mpte_sopts; /* list of socket options */
TAILQ_HEAD(, mptsub) mpte_subflows; /* list of subflows */
+#define MPTCP_MAX_NUM_SUBFLOWS 256
uint16_t mpte_numflows; /* # of subflows in list */
uint16_t mpte_nummpcapflows; /* # of MP_CAP subflows */
sae_associd_t mpte_associd; /* MPTCP association ID */
#include <net/if.h>
#include <net/net_api_stats.h>
#include <net/route.h>
+#include <net/content_filter.h>
#define _IP_VHL
#include <netinet/in.h>
continue;
}
}
- if (last->inp_flags & INP_STRIPHDR) {
+ if (last->inp_flags & INP_STRIPHDR
+#if CONTENT_FILTER
+ /*
+ * If socket is subject to Content Filter, delay stripping until reinject
+ */
+ && (last->inp_socket->so_cfil_db == NULL)
+#endif
+ ) {
n->m_len -= iphlen;
n->m_pkthdr.len -= iphlen;
n->m_data += iphlen;
goto unlock;
}
}
- if (last->inp_flags & INP_STRIPHDR) {
+ if (last->inp_flags & INP_STRIPHDR
+#if CONTENT_FILTER
+ /*
+ * If socket is subject to Content Filter, delay stripping until reinject
+ */
+ && (last->inp_socket->so_cfil_db == NULL)
+#endif
+ ) {
m->m_len -= iphlen;
m->m_pkthdr.len -= iphlen;
m->m_data += iphlen;
struct ip *ip;
struct inpcb *inp = sotoinpcb(so);
int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+ int inp_flags = inp ? inp->inp_flags : 0;
struct ip_out_args ipoa;
struct ip_moptions *imo;
int tos = IPTOS_UNSPEC;
int error = 0;
+#if CONTENT_FILTER
+ struct m_tag *cfil_tag = NULL;
+ bool cfil_faddr_use = false;
+ uint32_t cfil_so_state_change_cnt = 0;
+ short cfil_so_options = 0;
+ int cfil_inp_flags = 0;
+ struct sockaddr *cfil_faddr = NULL;
+ struct sockaddr_in *cfil_sin;
+#endif
+
+#if CONTENT_FILTER
+ /*
+ * If socket is subject to Content Filter and no addr is passed in,
+ * retrieve CFIL saved state from mbuf and use it if necessary.
+ */
+ if (so->so_cfil_db && dst == INADDR_ANY) {
+ cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, &cfil_inp_flags);
+ if (cfil_tag) {
+ cfil_sin = SIN(cfil_faddr);
+ flags = (cfil_so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+ inp_flags = cfil_inp_flags;
+ if (inp && inp->inp_faddr.s_addr == INADDR_ANY) {
+ /*
+ * Socket is unconnected, simply use the saved faddr as 'addr' to go through
+ * the connect/disconnect logic.
+ */
+ dst = cfil_sin->sin_addr.s_addr;
+ } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
+ (inp->inp_fport != cfil_sin->sin_port ||
+ inp->inp_faddr.s_addr != cfil_sin->sin_addr.s_addr)) {
+ /*
+ * Socket is connected but socket state and dest addr/port changed.
+ * We need to use the saved faddr and socket options.
+ */
+ cfil_faddr_use = true;
+ }
+ m_tag_free(cfil_tag);
+ }
+ }
+#endif
+
+ if (so->so_state & SS_ISCONNECTED) {
+ if (dst != INADDR_ANY) {
+ if (m != NULL) {
+ m_freem(m);
+ }
+ if (control != NULL) {
+ m_freem(control);
+ }
+ return EISCONN;
+ }
+ dst = cfil_faddr_use ? cfil_sin->sin_addr.s_addr : inp->inp_faddr.s_addr;
+ } else {
+ if (dst == INADDR_ANY) {
+ if (m != NULL) {
+ m_freem(m);
+ }
+ if (control != NULL) {
+ m_freem(control);
+ }
+ return ENOTCONN;
+ }
+ }
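/*
 * Hedged reading of this hunk: the EISCONN/ENOTCONN checks move from
 * rip_send() into rip_output() (rip_send, changed below, now just forwards
 * the sockaddr and defaults dst to INADDR_ANY), so that a content-filter
 * reinjected packet can fall back to the faddr saved in the mbuf tag even
 * after the socket's own connection state has changed.
 */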
bzero(&ipoa, sizeof(ipoa));
ipoa.ipoa_boundif = IFSCOPE_NONE;
* If the user handed us a complete IP packet, use it.
* Otherwise, allocate an mbuf for a header and fill it in.
*/
- if ((inp->inp_flags & INP_HDRINCL) == 0) {
+ if ((inp_flags & INP_HDRINCL) == 0) {
if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
m_freem(m);
return EMSGSIZE;
/*
* We need a route to perform NECP route rule checks
*/
- if (net_qos_policy_restricted != 0 &&
- ROUTE_UNUSABLE(&inp->inp_route)) {
+ if ((net_qos_policy_restricted != 0 &&
+ ROUTE_UNUSABLE(&inp->inp_route))
+#if CONTENT_FILTER
+ || cfil_faddr_use
+#endif
+ ) {
struct sockaddr_in to;
struct sockaddr_in from;
struct in_addr laddr = ip->ip_src;
if ((rt->rt_flags & (RTF_MULTICAST | RTF_BROADCAST)) ||
inp->inp_socket == NULL ||
+#if CONTENT_FILTER
+ /* Discard temporary route for cfil case */
+ cfil_faddr_use ||
+#endif
!(inp->inp_socket->so_state & SS_ISCONNECTED)) {
rt = NULL; /* unusable */
}
{
#pragma unused(flags, p)
struct inpcb *inp = sotoinpcb(so);
- u_int32_t dst;
+ u_int32_t dst = INADDR_ANY;
int error = 0;
if (inp == NULL
goto bad;
}
- if (so->so_state & SS_ISCONNECTED) {
- if (nam != NULL) {
- error = EISCONN;
- goto bad;
- }
- dst = inp->inp_faddr.s_addr;
- } else {
- if (nam == NULL) {
- error = ENOTCONN;
- goto bad;
- }
+ if (nam != NULL) {
dst = ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr;
}
return rip_output(m, so, dst, control);
#if CONTENT_FILTER
struct m_tag *cfil_tag = NULL;
bool cfil_faddr_use = false;
+ bool sndinprog_cnt_used = false;
uint32_t cfil_so_state_change_cnt = 0;
short cfil_so_options = 0;
struct sockaddr *cfil_faddr = NULL;
* retrieve CFIL saved state from mbuf and use it if necessary.
*/
if (so->so_cfil_db && !addr) {
- cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr);
+ cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, NULL);
if (cfil_tag) {
sin = (struct sockaddr_in *)(void *)cfil_faddr;
if (inp && inp->inp_faddr.s_addr == INADDR_ANY) {
fport = ((struct sockaddr_in *)(void *)cfil_faddr)->sin_port;
}
#endif
+ inp->inp_sndinprog_cnt++;
+ sndinprog_cnt_used = true;
if (addr) {
sin = (struct sockaddr_in *)(void *)addr;
ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR;
}
- inp->inp_sndinprog_cnt++;
-
socket_unlock(so, 0);
error = ip_output(m, inpopts, &ro, soopts, mopts, &ipoa);
m = NULL;
inp_set_fc_state(inp, adv->code);
}
- VERIFY(inp->inp_sndinprog_cnt > 0);
- if (--inp->inp_sndinprog_cnt == 0) {
- inp->inp_flags &= ~(INP_FC_FEEDBACK);
- if (inp->inp_sndingprog_waiters > 0) {
- wakeup(&inp->inp_sndinprog_cnt);
- }
- }
-
/* Synchronize PCB cached route */
inp_route_copyin(inp, &ro);
m_tag_free(cfil_tag);
}
#endif
+ if (sndinprog_cnt_used) {
+ VERIFY(inp->inp_sndinprog_cnt > 0);
+ if (--inp->inp_sndinprog_cnt == 0) {
+ inp->inp_flags &= ~(INP_FC_FEEDBACK);
+ if (inp->inp_sndingprog_waiters > 0) {
+ wakeup(&inp->inp_sndinprog_cnt);
+ }
+ }
+ sndinprog_cnt_used = false;
+ }
return error;
}
#pragma unused(flags, p)
int error = 0;
struct inpcb *inp = sotoinpcb(so);
- struct sockaddr_in6 tmp;
- struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam;
struct icmp6_hdr *icmp6;
if (inp == NULL
return rip6_output(m, so, SIN6(nam), control, 0);
}
- /* always copy sockaddr to avoid overwrites */
- if (so->so_state & SS_ISCONNECTED) {
- if (nam != NULL) {
- error = EISCONN;
- goto bad;
- }
- /* XXX */
- bzero(&tmp, sizeof(tmp));
- tmp.sin6_family = AF_INET6;
- tmp.sin6_len = sizeof(struct sockaddr_in6);
- bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
- sizeof(struct in6_addr));
- dst = &tmp;
- } else {
- if (nam == NULL) {
- error = ENOTCONN;
- goto bad;
- }
- tmp = *(struct sockaddr_in6 *)(void *)nam;
- dst = &tmp;
- }
-
/*
* For an ICMPv6 packet, we should know its type and code
*/
}
}
-#if ENABLE_DEFAULT_SCOPE
- if (dst->sin6_scope_id == 0) { /* not change if specified */
- dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr);
- }
-#endif
-
- return rip6_output(m, so, dst, control, 0);
+ return rip6_output(m, so, SIN6(nam), control, 0);
bad:
VERIFY(error != 0);
#include <net/net_api_stats.h>
#include <net/route.h>
#include <net/if_types.h>
+#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
int netsvctype = _NET_SERVICE_TYPE_UNSPEC;
struct ip6_out_args ip6oa;
int flags = IPV6_OUTARGS;
+ struct sockaddr_in6 tmp;
+#if CONTENT_FILTER
+ struct m_tag *cfil_tag = NULL;
+ bool cfil_faddr_use = false;
+ uint32_t cfil_so_state_change_cnt = 0;
+ short cfil_so_options = 0;
+ struct sockaddr *cfil_faddr = NULL;
+ struct sockaddr_in6 *cfil_sin6 = NULL;
+#endif
in6p = sotoin6pcb(so);
+ if (in6p == NULL) {
+ error = EINVAL;
+ goto bad;
+ }
+
+#if CONTENT_FILTER
+ /*
+ * If socket is subject to Content Filter and no addr is passed in,
+ * retrieve CFIL saved state from mbuf and use it if necessary.
+ */
+ if (so->so_cfil_db && !dstsock) {
+ cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, NULL);
+ if (cfil_tag) {
+ cfil_sin6 = SIN6(cfil_faddr);
+ if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
+ /*
+ * Socket is unconnected, simply use the saved faddr as 'addr' to go through
+ * the connect/disconnect logic.
+ */
+ dstsock = cfil_sin6;
+ } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
+ (in6p->in6p_fport != cfil_sin6->sin6_port ||
+ !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &cfil_sin6->sin6_addr))) {
+ /*
+ * Socket is connected but socket state and dest addr/port changed.
+ * We need to use the saved faddr and socket options.
+ */
+ cfil_faddr_use = true;
+ }
+ }
+ }
+#endif
+
+ /* always copy sockaddr to avoid overwrites */
+ if (so->so_state & SS_ISCONNECTED) {
+ if (dstsock != NULL) {
+ error = EISCONN;
+ goto bad;
+ }
+ /* XXX */
+ bzero(&tmp, sizeof(tmp));
+ tmp.sin6_family = AF_INET6;
+ tmp.sin6_len = sizeof(struct sockaddr_in6);
+ bcopy(
+#if CONTENT_FILTER
+ cfil_faddr_use ? &cfil_sin6->sin6_addr :
+#endif
+ &in6p->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr));
+ dstsock = &tmp;
+ } else {
+ if (dstsock == NULL) {
+ error = ENOTCONN;
+ goto bad;
+ }
+ tmp = *dstsock;
+ dstsock = &tmp;
+ }
+
+#if ENABLE_DEFAULT_SCOPE
+ if (dstsock->sin6_scope_id == 0) { /* not change if specified */
+ dstsock->sin6_scope_id = scope6_addr2default(&dstsock->sin6_addr);
+ }
+#endif
bzero(&ip6oa, sizeof(ip6oa));
ip6oa.ip6oa_boundif = IFSCOPE_NONE;
/*
* We need a route to perform NECP route rule checks
*/
- if (net_qos_policy_restricted != 0 &&
- ROUTE_UNUSABLE(&in6p->in6p_route)) {
+ if ((net_qos_policy_restricted != 0 &&
+ ROUTE_UNUSABLE(&in6p->in6p_route))
+#if CONTENT_FILTER
+ || cfil_faddr_use
+#endif
+ ) {
struct sockaddr_in6 to;
struct sockaddr_in6 from;
if ((rt->rt_flags & RTF_MULTICAST) ||
in6p->in6p_socket == NULL ||
+#if CONTENT_FILTER
+ /* Discard temporary route for cfil case */
+ cfil_faddr_use ||
+#endif
!(in6p->in6p_socket->so_state & SS_ISCONNECTED)) {
rt = NULL; /* unusable */
}
if (oifp != NULL) {
ifnet_release(oifp);
}
+#if CONTENT_FILTER
+ if (cfil_tag) {
+ m_tag_free(cfil_tag);
+ }
+#endif
+
return error;
}
{
#pragma unused(flags, p)
struct inpcb *inp = sotoinpcb(so);
- struct sockaddr_in6 tmp;
- struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam;
int error = 0;
if (inp == NULL
goto bad;
}
- /* always copy sockaddr to avoid overwrites */
- if (so->so_state & SS_ISCONNECTED) {
- if (nam != NULL) {
- error = EISCONN;
- goto bad;
- }
- /* XXX */
- bzero(&tmp, sizeof(tmp));
- tmp.sin6_family = AF_INET6;
- tmp.sin6_len = sizeof(struct sockaddr_in6);
- bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
- sizeof(struct in6_addr));
- dst = &tmp;
- } else {
- if (nam == NULL) {
- error = ENOTCONN;
- goto bad;
- }
- tmp = *(struct sockaddr_in6 *)(void *)nam;
- dst = &tmp;
- }
-#if ENABLE_DEFAULT_SCOPE
- if (dst->sin6_scope_id == 0) { /* not change if specified */
- dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr);
- }
-#endif
- return rip6_output(m, so, dst, control, 1);
+ return rip6_output(m, so, SIN6(nam), control, 1);
bad:
VERIFY(error != 0);
#if CONTENT_FILTER
struct m_tag *cfil_tag = NULL;
bool cfil_faddr_use = false;
+ bool sndinprog_cnt_used = false;
uint32_t cfil_so_state_change_cnt = 0;
struct sockaddr *cfil_faddr = NULL;
struct sockaddr_in6 *cfil_sin6 = NULL;
* retrieve CFIL saved state from mbuf and use it if necessary.
*/
if (so->so_cfil_db && !addr6) {
- cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr);
+ cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr, NULL);
if (cfil_tag) {
cfil_sin6 = (struct sockaddr_in6 *)(void *)cfil_faddr;
if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
ip6oa.ip6oa_sotc = sotc;
ip6oa.ip6oa_netsvctype = netsvctype;
+ in6p->inp_sndinprog_cnt++;
+ sndinprog_cnt_used = true;
+
if (addr6) {
/*
* IPv4 version of udp_output calls in_pcbconnect in this case,
IM6O_UNLOCK(im6o);
}
- in6p->inp_sndinprog_cnt++;
-
socket_unlock(so, 0);
error = ip6_output(m, optp, &ro, flags, im6o, NULL, &ip6oa);
m = NULL;
inp_set_fc_state(in6p, adv->code);
}
- VERIFY(in6p->inp_sndinprog_cnt > 0);
- if (--in6p->inp_sndinprog_cnt == 0) {
- in6p->inp_flags &= ~(INP_FC_FEEDBACK);
- if (in6p->inp_sndingprog_waiters > 0) {
- wakeup(&in6p->inp_sndinprog_cnt);
- }
- }
-
if (ro.ro_rt != NULL) {
struct ifnet *outif = ro.ro_rt->rt_ifp;
goto releaseopt;
release:
+
if (m != NULL) {
m_freem(m);
}
m_tag_free(cfil_tag);
}
#endif
+ if (sndinprog_cnt_used) {
+ VERIFY(in6p->inp_sndinprog_cnt > 0);
+ if (--in6p->inp_sndinprog_cnt == 0) {
+ in6p->inp_flags &= ~(INP_FC_FEEDBACK);
+ if (in6p->inp_sndingprog_waiters > 0) {
+ wakeup(&in6p->inp_sndinprog_cnt);
+ }
+ }
+ sndinprog_cnt_used = false;
+ }
+
return error;
}
#if CONTENT_FILTER
//If socket is subject to UDP Content Filter and unconnected, get addr from tag.
if (so->so_cfil_db && !addr && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
- cfil_tag = cfil_udp_get_socket_state(m, NULL, NULL, &cfil_faddr);
+ cfil_tag = cfil_dgram_get_socket_state(m, NULL, NULL, &cfil_faddr, NULL);
if (cfil_tag) {
addr = (struct sockaddr *)cfil_faddr;
}
#define NAMEI_CONTLOOKUP 0x002 /* Continue processing a lookup which was partially processed in a compound VNOP */
#define NAMEI_TRAILINGSLASH 0x004 /* There was at least one trailing slash after last component */
#define NAMEI_UNFINISHED 0x008 /* We broke off a lookup to do a compound op */
+
/*
* XXX Hack: we need to encode the intended VNOP in order to
* be able to include information about which operations a filesystem
#define NAMEI_COMPOUNDRENAME 0x100
#define NAMEI_COMPOUND_OP_MASK (NAMEI_COMPOUNDOPEN | NAMEI_COMPOUNDREMOVE | NAMEI_COMPOUNDMKDIR | NAMEI_COMPOUNDRMDIR | NAMEI_COMPOUNDRENAME)
+#define NAMEI_NOPROCLOCK 0x1000 /* do not take process lock (set by vnode_lookup) */
+
#ifdef KERNEL
/*
* namei operational modifier flags, stored in ni_cnd.flags
#if !CONFIG_EMBEDDED
uint64_t p_user_data; /* general-purpose storage for userland-provided data */
#endif /* !CONFIG_EMBEDDED */
+ lck_rw_t p_dirs_lock; /* keeps fd_cdir and fd_rdir stable across a lookup */
};
#define PGRPID_DEAD 0xdeaddead
extern lck_grp_t * proc_mlock_grp;
extern lck_grp_t * proc_ucred_mlock_grp;
extern lck_grp_t * proc_slock_grp;
+extern lck_grp_t * proc_dirslock_grp;
extern lck_grp_attr_t * proc_lck_grp_attr;
extern lck_attr_t * proc_lck_attr;
extern void proc_fdlock_spin(struct proc *);
extern void proc_fdunlock(struct proc *);
extern void proc_fdlock_assert(proc_t p, int assertflags);
+extern void proc_dirs_lock_shared(struct proc *);
+extern void proc_dirs_unlock_shared(struct proc *);
+extern void proc_dirs_lock_exclusive(struct proc *);
+extern void proc_dirs_unlock_exclusive(struct proc *);
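/*
 * Minimal usage sketch (an assumption, mirroring how namei and chdir use
 * these routines later in this diff): readers hold the lock shared for the
 * duration of a lookup so fd_cdir/fd_rdir keep their usecounts, while
 * writers such as chdir/chroot take it exclusive around swapping the
 * vnode pointers.
 *
 *	proc_dirs_lock_shared(p);
 *	vnode_t rdir = p->p_fd->fd_rdir;	// stable while the lock is held
 *	// ... perform the path lookup ...
 *	proc_dirs_unlock_shared(p);
 */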
extern void proc_ucred_lock(struct proc *);
extern void proc_ucred_unlock(struct proc *);
__private_extern__ int proc_core_name(const char *name, uid_t uid, pid_t pid,
/*
- * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
struct mbuf *m0, struct mbuf *control, int *error_out);
extern int sbappendchain(struct sockbuf *sb, struct mbuf *m, int space);
extern int sbappendrecord(struct sockbuf *sb, struct mbuf *m0);
+extern int sbappendrecord_nodrop(struct sockbuf *sb, struct mbuf *m0);
extern void sbflush(struct sockbuf *sb);
extern int sbspace(struct sockbuf *sb);
extern int soabort(struct socket *so);
extern void so_release_accept_list(struct socket *);
extern int sbappend(struct sockbuf *sb, struct mbuf *m);
+extern int sbappend_nodrop(struct sockbuf *sb, struct mbuf *m);
extern int sbappendstream(struct sockbuf *sb, struct mbuf *m);
extern int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
struct mbuf *control, int *error_out);
return cwd;
}
-/*
- * vfs_context_get_cwd
- *
- * Description: Returns a vnode for the current working directory for the
- * supplied context. The returned vnode has an iocount on it
- * which must be released with a vnode_put().
- *
- * Parameters: vfs_context_t The context to use
- *
- * Returns: vnode_t The current working directory
- * for this context
- *
- * Notes: The function first attempts to obtain the current directory
- * from the thread, and if it is not present there, falls back
- * to obtaining it from the process instead. If it can't be
- * obtained from either place, we return NULLVP.
- */
-vnode_t
-vfs_context_get_cwd(vfs_context_t ctx)
-{
- vnode_t cwd = NULLVP;
-
- if (ctx != NULL && ctx->vc_thread != NULL) {
- uthread_t uth = get_bsdthread_info(ctx->vc_thread);
- proc_t proc;
-
- /*
- * Get the cwd from the thread; if there isn't one, get it
- * from the process, instead.
- */
- cwd = uth->uu_cdir;
-
- if (cwd) {
- if ((vnode_get(cwd) != 0)) {
- cwd = NULLVP;
- }
- } else if ((proc = (proc_t)get_bsdthreadtask_info(ctx->vc_thread)) != NULL &&
- proc->p_fd != NULL) {
- proc_fdlock(proc);
- cwd = proc->p_fd->fd_cdir;
- if (cwd && (vnode_get(cwd) != 0)) {
- cwd = NULLVP;
- }
- proc_fdunlock(proc);
- }
- }
-
- return cwd;
-}
-
/*
* vfs_context_create
*
#endif
static int lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, int vbusyflags, vfs_context_t ctx);
-static int handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx);
+static int lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx);
static int lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx);
static void lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation);
static int lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly,
{
struct filedesc *fdp; /* pointer to file descriptor state */
struct vnode *dp; /* the directory we are searching */
- struct vnode *rootdir_with_usecount = NULLVP;
- struct vnode *startdir_with_usecount = NULLVP;
struct vnode *usedvp = ndp->ni_dvp; /* store pointer to vp in case we must loop due to
* heavy vnode pressure */
u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */
int volfs_restarts = 0;
#endif
size_t bytes_copied = 0;
+ bool take_proc_lock = !(ndp->ni_flag & NAMEI_NOPROCLOCK);
+ bool proc_lock_taken = false;
fdp = p->p_fd;
/*
* determine the starting point for the translation.
*
- * We may need to upto 2 usecounts on vnodes before starting the translation
- * We need to have a usecount on the root directory for the process
- * for the entire duration of the lookup. This is because symlink
- * translation can restart translation at / if a symlink is encountered.
- *
- * For the duration of this lookup at rootdir for this lookup is the one
- * we fetch now under the proc_fdlock even the if the proc rootdir changes
- * once we let go of the proc_fdlock.
- *
- * In the future we may consider holding off a chroot till we complete
- * in progress lookups.
- *
- * If the starting directory is not the process rootdir then we need
- * a usecount on the starting directory as well for the duration of the
- * lookup.
- *
- * Getting an addtional usecount involves first getting an iocount under
- * the lock that ensures that a usecount is on the directory. Once we
- * get an iocount we can release the lock and we will be free to get a
- * usecount without the vnode getting recycled. Once we get the usecount
- * we can release the icoount which we used to get our usecount.
+ * We hold the proc_dirs lock across the lookup so that the
+ * process rootdir and cwd are stable (i.e. the usecounts
+ * on them are maintained for the duration of the lookup)
*/
- proc_fdlock(p);
+ if (take_proc_lock) {
+ assert(proc_lock_taken == false);
+ proc_dirs_lock_shared(p);
+ proc_lock_taken = true;
+ }
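/*
 * Hedged note: holding the dirs lock shared here replaces the older scheme
 * of taking extra usecounts on the root and starting directories; because
 * chdir/chroot (changed later in this diff) take the same lock exclusive
 * before swapping fd_cdir or fd_rdir, those vnodes cannot be released out
 * from under an in-flight lookup.
 */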
if (!(fdp->fd_flags & FD_CHROOT)) {
ndp->ni_rootdir = rootvnode;
} else {
if (!ndp->ni_rootdir) {
if (!(fdp->fd_flags & FD_CHROOT)) {
- proc_fdunlock(p);
printf("rootvnode is not set\n");
} else {
- proc_fdunlock(p);
/* This should be a panic */
printf("fdp->fd_rdir is not set\n");
}
goto error_out;
}
- /*
- * We have the proc_fdlock here so we still have a usecount
- * on ndp->ni_rootdir.
- *
- * However we need to get our own usecount on it in order to
- * ensure that the vnode isn't recycled to something else.
- *
- * Note : It's fine if the vnode is force reclaimed but with
- * a usecount it won't be reused until we release the reference.
- *
- * In order to get that usecount however, we need to first
- * get non blocking iocount since we'll be doing this under
- * the proc_fdlock.
- */
- if (vnode_get(ndp->ni_rootdir) != 0) {
- proc_fdunlock(p);
- error = ENOENT;
- goto error_out;
- }
-
- proc_fdunlock(p);
-
- /* Now we can safely get our own ref on ni_rootdir */
- error = vnode_ref_ext(ndp->ni_rootdir, O_EVTONLY, 0);
- vnode_put(ndp->ni_rootdir);
- if (error) {
- ndp->ni_rootdir = NULLVP;
- goto error_out;
- }
-
- rootdir_with_usecount = ndp->ni_rootdir;
-
cnp->cn_nameptr = cnp->cn_pnbuf;
ndp->ni_usedvp = NULLVP;
- bool dp_needs_put = false;
if (*(cnp->cn_nameptr) == '/') {
while (*(cnp->cn_nameptr) == '/') {
cnp->cn_nameptr++;
dp = ndp->ni_dvp;
ndp->ni_usedvp = dp;
} else {
- dp = vfs_context_get_cwd(ctx);
- if (dp) {
- dp_needs_put = true;
- }
+ dp = vfs_context_cwd(ctx);
}
if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
- if (dp_needs_put) {
- vnode_put(dp);
- dp_needs_put = false;
- }
dp = NULLVP;
error = ENOENT;
goto error_out;
}
- if (dp != rootdir_with_usecount) {
- error = vnode_ref_ext(dp, O_EVTONLY, 0);
- if (error) {
- if (dp_needs_put) {
- vnode_put(dp);
- dp_needs_put = false;
- }
- dp = NULLVP;
- goto error_out;
- }
- startdir_with_usecount = dp;
- }
-
- if (dp_needs_put) {
- vnode_put(dp);
- dp_needs_put = false;
- }
-
ndp->ni_dvp = NULLVP;
ndp->ni_vp = NULLVP;
goto error_out;
}
#endif
-
ndp->ni_startdir = dp;
dp = NULLVP;
* Check for symbolic link
*/
if ((cnp->cn_flags & ISSYMLINK) == 0) {
- if (startdir_with_usecount) {
- vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
- startdir_with_usecount = NULLVP;
- }
- if (rootdir_with_usecount) {
- vnode_rele_ext(rootdir_with_usecount, O_EVTONLY, 0);
- rootdir_with_usecount = NULLVP;
+ if (proc_lock_taken) {
+ proc_dirs_unlock_shared(p);
+ proc_lock_taken = false;
}
return 0;
}
continue_symlink:
- /*
- * Gives us a new path to process, and a starting dir (with an iocount).
- * The iocount is needed to take a usecount on the vnode returned
- * (if it is not a vnode we already have a usecount on).
- */
- error = handle_symlink_for_namei(ndp, &dp, ctx);
+ /* Gives us a new path to process, and a starting dir */
+ error = lookup_handle_symlink(ndp, &dp, ctx);
if (error != 0) {
break;
}
-
- if (dp == ndp->ni_rootdir && startdir_with_usecount) {
- vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
- startdir_with_usecount = NULLVP;
- } else if (dp != startdir_with_usecount) {
- if (startdir_with_usecount) {
- vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
- startdir_with_usecount = NULLVP;
- }
- error = vnode_ref_ext(dp, O_EVTONLY, 0);
- if (error) {
- vnode_put(dp);
- dp = NULLVP;
- goto error_out;
- }
- startdir_with_usecount = dp;
- }
- /* iocount not required on dp anymore */
- vnode_put(dp);
}
/*
* only come here if we fail to handle a SYMLINK...
vnode_put(ndp->ni_vp);
}
error_out:
+ if (proc_lock_taken) {
+ proc_dirs_unlock_shared(p);
+ proc_lock_taken = false;
+ }
if ((cnp->cn_flags & HASBUF)) {
cnp->cn_flags &= ~HASBUF;
FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
ndp->ni_vp = NULLVP;
ndp->ni_dvp = NULLVP;
- if (startdir_with_usecount) {
- vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
- startdir_with_usecount = NULLVP;
- }
- if (rootdir_with_usecount) {
- vnode_rele_ext(rootdir_with_usecount, O_EVTONLY, 0);
- rootdir_with_usecount = NULLVP;
- }
-
#if CONFIG_VOLFS
/*
* Deal with volfs fallout.
/*
* Takes ni_vp and ni_dvp non-NULL. Returns with *new_dp set to the location
- * at which to start a lookup with a resolved path and with an iocount.
+ * at which to start a lookup with a resolved path, and all other iocounts dropped.
*/
static int
-handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx)
+lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx)
{
int error;
char *cp; /* pointer into pathname argument */
/*
* starting point for 'relative'
* symbolic link path
- *
- * If the starting point is not the root we have to return an iocounted
- * dp to namei so we don't release the icoount here.
*/
dp = ndp->ni_dvp;
- ndp->ni_dvp = NULLVP;
/*
* get rid of references returned via 'lookup'
*/
vnode_put(ndp->ni_vp);
+ vnode_put(ndp->ni_dvp); /* ALWAYS have a dvp for a symlink */
+
ndp->ni_vp = NULLVP;
+ ndp->ni_dvp = NULLVP;
/*
* Check if symbolic link restarts us at the root
cnp->cn_nameptr++;
ndp->ni_pathlen--;
}
- vnode_put(dp);
if ((dp = ndp->ni_rootdir) == NULLVP) {
return ENOENT;
}
- if (vnode_get(dp) != 0) {
- return ENOENT;
- }
- }
-
- if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
- if (dp) {
- vnode_put(dp);
- }
- return ENOENT;
}
*new_dp = dp;
if (start_dvp && (path[0] != '/')) {
nd.ni_dvp = start_dvp;
nd.ni_cnd.cn_flags |= USEDVP;
+ /* Don't take the proc lock in vnode_lookupat when a startdir is specified */
+ nd.ni_flag |= NAMEI_NOPROCLOCK;
}
if ((error = namei(&nd))) {
*/
#define MAX_AUTHORIZE_ENOENT_RETRIES 1024
+/* Max retry limit for rename due to vnode recycling. */
+#define MAX_RENAME_ERECYCLE_RETRIES 1024
+
static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
int unlink_flags);
return PROC_RETURNED;
}
+ proc_dirs_lock_exclusive(p);
/*
* Now do the work. Note: we dropped the proc_fdlock, so we
* have to do all of the checks again.
}
}
proc_fdunlock(p);
+ proc_dirs_unlock_exclusive(p);
/*
* Dispose of any references that are no longer needed.
return ENOENT;
}
} else {
+ proc_dirs_lock_exclusive(p);
proc_fdlock(p);
tvp = fdp->fd_cdir;
fdp->fd_cdir = vp;
proc_fdunlock(p);
+ proc_dirs_unlock_exclusive(p);
}
if (tvp) {
return ENOENT;
}
} else {
+ proc_dirs_lock_exclusive(p);
proc_fdlock(p);
tvp = fdp->fd_cdir;
fdp->fd_cdir = ndp->ni_vp;
proc_fdunlock(p);
+ proc_dirs_unlock_exclusive(p);
}
if (tvp) {
}
vnode_put(nd.ni_vp);
+ /*
+ * This lock provides the guarantee that as long as you hold the lock
+ * fdp->fd_rdir has a usecount on it. This is used to take an iocount
+ * on a referenced vnode in namei when determining the rootvnode for
+ * a process.
+ */
+ /* needed for synchronization with lookup */
+ proc_dirs_lock_exclusive(p);
+ /* needed for setting the flag and other activities on the fd itself */
proc_fdlock(p);
tvp = fdp->fd_rdir;
fdp->fd_rdir = nd.ni_vp;
fdp->fd_flags |= FD_CHROOT;
proc_fdunlock(p);
+ proc_dirs_unlock_exclusive(p);
if (tvp != NULL) {
vnode_rele(tvp);
* but other filesystems susceptible to this race could return it, too.
*/
if (error == ERECYCLE) {
- do_retry = 1;
+ if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
+ do_retry = 1;
+ retry_count += 1;
+ } else {
+ printf("rename retry limit due to ERECYCLE reached\n");
+ error = ENOENT;
+ }
}
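/*
 * Hedged note: MAX_RENAME_ERECYCLE_RETRIES bounds this retry loop so a
 * filesystem that keeps returning ERECYCLE degrades to ENOENT instead of
 * retrying forever; 1024 matches the existing MAX_AUTHORIZE_ENOENT_RETRIES
 * limit defined above.
 */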
/*
-19.4.0
+19.5.0
# The first line of this file contains the master version number for the kernel.
# All other instances of the kernel version in xnu are derived from this file.
kIODirectionPrepareReserved1 = 0x00000010,
#define IODIRECTIONPREPARENONCOHERENTDEFINED 1
kIODirectionPrepareNonCoherent = 0x00000020,
+#if KERNEL_PRIVATE
+#define IODIRECTIONPREPAREAVOIDTHROTTLING 1
+ kIODirectionPrepareAvoidThrottling = 0x00000100,
+#endif
// these flags are valid for the complete() method only
#define IODIRECTIONCOMPLETEWITHERRORDEFINED 1
upl_abort(iopl.fIOPL, 0);
upl_deallocate(iopl.fIOPL);
}
+ error = kIOReturnNoMemory;
goto abortExit;
}
dataP = NULL;
}
if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) {
+ if ((forDirection & kIODirectionPrepareAvoidThrottling) && NEED_TO_HARD_THROTTLE_THIS_TASK()) {
+ error = kIOReturnNotReady;
+ goto finish;
+ }
error = wireVirtual(forDirection);
}
}
}
+finish:
+
if (_prepareLock) {
IOLockUnlock(_prepareLock);
}
*
* Note that this workaround does not pose a security risk, because the RO
* page tables still remain read-only, due to KTRR/CTRR, and further protecting
- * them at the APRR level would be unnecessary.
+ * them would be unnecessary.
*/
monitor_start_pa = kvtophys((vm_offset_t)&ropagetable_begin);
monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
Lexception_return_restore_registers:
mov x0, sp // x0 = &pcb
// Loads authed $x0->ss_64.pc into x1 and $x0->ss_64.cpsr into w2
- AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24
+ AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24, el0_state_allowed=1
/* Restore special register state */
ldr w3, [sp, NS64_FPSR]
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
+#include <arm64/proc_reg.h>
#include <pexpert/arm64/board_config.h>
#include "assym.s"
*
* On CPUs with PAC support, this macro will auth the above values with ml_check_signed_state().
*
- * arg0 - scratch register 1
- * arg1 - scratch register 2
- * arg2 - scratch register 3
- * arg3 - scratch register 4
- * arg4 - scratch register 5
+ * tmp1 - scratch register 1
+ * tmp2 - scratch register 2
+ * tmp3 - scratch register 3
+ * tmp4 - scratch register 4
+ * tmp5 - scratch register 5
*/
/* BEGIN IGNORE CODESTYLE */
-.macro AUTH_THREAD_STATE_IN_X0
- ldr x1, [x0, SS64_PC]
+.macro AUTH_THREAD_STATE_IN_X0 tmp1, tmp2, tmp3, tmp4, tmp5, el0_state_allowed=0
ldr w2, [x0, SS64_CPSR]
+.if \el0_state_allowed==0
+#if __has_feature(ptrauth_calls)
+ // If testing for a canary CPSR value, ensure that we do not observe writes to other fields without it
+ dmb ld
+#endif
+.endif
+ ldr x1, [x0, SS64_PC]
ldp x16, x17, [x0, SS64_X16]
#if defined(HAS_APPLE_PAC)
// Save x3-x5 to preserve across call
- mov $2, x3
- mov $3, x4
- mov $4, x5
+ mov \tmp3, x3
+ mov \tmp4, x4
+ mov \tmp5, x5
/*
* Arg0: The ARM context pointer (already in x0)
* Stash saved state PC and CPSR in other registers to avoid reloading potentially unauthed
* values from memory. (ml_check_signed_state will clobber x1 and x2.)
*/
- mov $0, x1
- mov $1, x2
+ mov \tmp1, x1
+ mov \tmp2, x2
ldr x3, [x0, SS64_LR]
mov x4, x16
mov x5, x17
bl EXT(ml_check_signed_state)
- mov x1, $0
- mov x2, $1
+ mov x1, \tmp1
+ mov x2, \tmp2
+
+.if \el0_state_allowed==0
+ and \tmp2, \tmp2, #PSR64_MODE_MASK
+ cbnz \tmp2, 1f
+ bl EXT(ml_auth_thread_state_invalid_cpsr)
+1:
+.endif
// LR was already loaded/authed earlier, if we reload it we might be loading a potentially unauthed value
mov lr, x3
- mov x3, $2
- mov x4, $3
- mov x5, $4
+ mov x3, \tmp3
+ mov x4, \tmp4
+ mov x5, \tmp5
#else
ldr lr, [x0, SS64_LR]
#endif /* defined(HAS_APPLE_PAC) */
/*
* SET_RECOVERY_HANDLER
*
- * Sets up a page fault recovery handler
+ * Sets up a page fault recovery handler. This macro clobbers x16 and x17.
*
- * arg0 - persisted thread pointer
- * arg1 - persisted recovery handler
- * arg2 - scratch reg
- * arg3 - recovery label
+ * label - recovery label
+ * tpidr - persisted thread pointer
+ * old_handler - persisted recovery handler
+ * label_in_adr_range - whether \label is within 1 MB of PC
*/
-.macro SET_RECOVERY_HANDLER
- mrs $0, TPIDR_EL1 // Load thread pointer
- adrp $2, $3@page // Load the recovery handler address
- add $2, $2, $3@pageoff
+.macro SET_RECOVERY_HANDLER label, tpidr=x16, old_handler=x10, label_in_adr_range=0
+ // Note: x16 and x17 are designated for use as temporaries in
+ // interruptible PAC routines. DO NOT CHANGE THESE REGISTER ASSIGNMENTS.
+.if \label_in_adr_range==1 // Load the recovery handler address
+ adr x17, \label
+.else
+ adrp x17, \label@page
+ add x17, x17, \label@pageoff
+.endif
#if defined(HAS_APPLE_PAC)
- add $1, $0, TH_RECOVER
- movk $1, #PAC_DISCRIMINATOR_RECOVER, lsl 48
- pacia $2, $1 // Sign with IAKey + blended discriminator
+ mrs x16, TPIDR_EL1
+ add x16, x16, TH_RECOVER
+ movk x16, #PAC_DISCRIMINATOR_RECOVER, lsl 48
+ pacia x17, x16 // Sign with IAKey + blended discriminator
#endif
- ldr $1, [$0, TH_RECOVER] // Save previous recovery handler
- str $2, [$0, TH_RECOVER] // Set new signed recovery handler
+ mrs \tpidr, TPIDR_EL1 // Load thread pointer
+ ldr \old_handler, [\tpidr, TH_RECOVER] // Save previous recovery handler
+ str x17, [\tpidr, TH_RECOVER] // Set new signed recovery handler
.endmacro
/*
*
* Clears page fault handler set by SET_RECOVERY_HANDLER
*
- * arg0 - thread pointer saved by SET_RECOVERY_HANDLER
- * arg1 - old recovery handler saved by SET_RECOVERY_HANDLER
+ * tpidr - thread pointer saved by SET_RECOVERY_HANDLER
+ * old_handler - old recovery handler saved by SET_RECOVERY_HANDLER
*/
-.macro CLEAR_RECOVERY_HANDLER
- str $1, [$0, TH_RECOVER] // Restore the previous recovery handler
+.macro CLEAR_RECOVERY_HANDLER tpidr=x16, old_handler=x10
+ str \old_handler, [\tpidr, TH_RECOVER] // Restore the previous recovery handler
.endmacro
.text
.align 2
copyio_error:
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
mov x0, #EFAULT // Return an EFAULT error
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_bcopyin)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
/* If len is less than 16 bytes, just do a bytewise copy */
cmp x2, #16
b.lt 2f
strb w3, [x1], #1
b.hi 2b
3:
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
mov x0, #0
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_copyin_atomic32)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
ldr w8, [x0]
str w8, [x1]
mov x0, #0
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_copyin_atomic32_wait_if_equals)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
ldxr w8, [x0]
cmp w8, w1
mov x0, ESTALE
wfe
1:
clrex
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_copyin_atomic64)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
ldr x8, [x0]
str x8, [x1]
mov x0, #0
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_copyout_atomic32)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
str w0, [x1]
mov x0, #0
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_copyout_atomic64)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
str x0, [x1]
mov x0, #0
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_bcopyout)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
/* If len is less than 16 bytes, just do a bytewise copy */
cmp x2, #16
b.lt 2f
strb w3, [x1], #1
b.hi 2b
3:
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
mov x0, #0
POP_FRAME
ARM64_STACK_EPILOG
LEXT(_bcopyinstr)
ARM64_STACK_PROLOG
PUSH_FRAME
- adr x4, Lcopyinstr_error // Get address for recover
- mrs x10, TPIDR_EL1 // Get thread pointer
- ldr x11, [x10, TH_RECOVER] // Save previous recover
-
-#if defined(HAS_APPLE_PAC)
- add x5, x10, TH_RECOVER // Sign new pointer with IAKey + blended discriminator
- movk x5, #PAC_DISCRIMINATOR_RECOVER, lsl 48
- pacia x4, x5
-#endif
- str x4, [x10, TH_RECOVER] // Store new recover
-
+ SET_RECOVERY_HANDLER Lcopyinstr_error, label_in_adr_range=1
mov x4, #0 // x4 - total bytes copied
Lcopyinstr_loop:
ldrb w5, [x0], #1 // Load a byte from the user source
Lcopyinstr_error:
mov x0, #EFAULT // Return EFAULT on error
Lcopyinstr_exit:
- str x11, [x10, TH_RECOVER] // Restore old recover
+ CLEAR_RECOVERY_HANDLER
POP_FRAME
ARM64_STACK_EPILOG
* x3 : temp
* x5 : temp (kernel virtual base)
* x9 : temp
- * x10 : thread pointer (set by SET_RECOVERY_HANDLER)
- * x11 : old recovery function (set by SET_RECOVERY_HANDLER)
+ * x10 : old recovery function (set by SET_RECOVERY_HANDLER)
* x12, x13 : backtrace data
+ * x16 : thread pointer (set by SET_RECOVERY_HANDLER)
*
*/
.text
LEXT(copyinframe)
ARM64_STACK_PROLOG
PUSH_FRAME
- SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+ SET_RECOVERY_HANDLER copyio_error
cbnz w2, Lcopyinframe64 // Check frame size
adrp x5, EXT(gVirtBase)@page // For 32-bit frame, make sure we're not trying to copy from kernel
add x5, x5, EXT(gVirtBase)@pageoff
mov w0, #0 // Success
Lcopyinframe_done:
- CLEAR_RECOVERY_HANDLER x10, x11
+ CLEAR_RECOVERY_HANDLER
POP_FRAME
ARM64_STACK_EPILOG
CALL_EXTERN panic_with_thread_kernel_state
Lcheck_hash_str:
.asciz "JOP Hash Mismatch Detected (PC, CPSR, or LR corruption)"
+
+/**
+ * void ml_auth_thread_state_invalid_cpsr(arm_saved_state_t *ss)
+ *
+ * Panics due to an invalid CPSR value in ss.
+ */
+ .text
+ .align 2
+ .globl EXT(ml_auth_thread_state_invalid_cpsr)
+LEXT(ml_auth_thread_state_invalid_cpsr)
+ ARM64_STACK_PROLOG
+ PUSH_FRAME
+ mov x1, x0
+ adr x0, Linvalid_cpsr_str
+ CALL_EXTERN panic_with_thread_kernel_state
+
+Linvalid_cpsr_str:
+ .asciz "Thread state corruption detected (PE mode == 0)"
#endif /* HAS_APPLE_PAC */
.text
#if defined(HAS_APPLE_PAC)
/* Sign the initial kernel stack saved state */
const uint32_t default_cpsr = PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK;
+ boolean_t intr = ml_set_interrupts_enabled(FALSE);
asm volatile (
"mov x0, %[ss]" "\n"
[SS64_LR] "i"(offsetof(struct arm_saved_state, ss_64.lr))
: "x0", "x1", "x2", "x3", "x4", "x5", "x6"
);
+ ml_set_interrupts_enabled(intr);
#else
savestate->lr = (uintptr_t)thread_continue;
savestate->cpsr = (PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK) | current_el;
arm_saved_state_t * saved_state)
{
uint32_t i;
+#if __has_feature(ptrauth_calls)
+ boolean_t intr = ml_set_interrupts_enabled(FALSE);
+#endif /* __has_feature(ptrauth_calls) */
assert(is_saved_state64(saved_state));
+ set_saved_state_cpsr(saved_state, (ts64->cpsr & ~PSR64_MODE_MASK) | PSR64_MODE_RW_64);
+#if __has_feature(ptrauth_calls)
+ /*
+ * Make writes to ts64->cpsr visible first, since it's useful as a
+ * canary to detect thread-state corruption.
+ */
+ __builtin_arm_dmb(DMB_ST);
+#endif
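/*
 * Hedged note: this store barrier pairs with the "dmb ld" added to
 * AUTH_THREAD_STATE_IN_X0 earlier in this diff, implementing the canary
 * scheme the comments describe: the CPSR update is published (and read)
 * first so corruption of the remaining saved-state fields can be caught
 * against it.
 */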
set_saved_state_fp(saved_state, ts64->fp);
set_saved_state_lr(saved_state, ts64->lr);
set_saved_state_sp(saved_state, ts64->sp);
set_saved_state_pc(saved_state, ts64->pc);
- set_saved_state_cpsr(saved_state, (ts64->cpsr & ~PSR64_MODE_MASK) | PSR64_MODE_RW_64);
for (i = 0; i < 29; i++) {
set_saved_state_reg(saved_state, i, ts64->x[i]);
}
+
+#if __has_feature(ptrauth_calls)
+ ml_set_interrupts_enabled(intr);
+#endif /* __has_feature(ptrauth_calls) */
}
#endif /* __arm64__ */
#if defined(HAS_APPLE_PAC)
/* Sign the initial user-space thread state */
if (thread->machine.upcb != NULL) {
+ boolean_t intr = ml_set_interrupts_enabled(FALSE);
ml_sign_thread_state(thread->machine.upcb, 0, 0, 0, 0, 0);
+ ml_set_interrupts_enabled(intr);
}
#endif /* defined(HAS_APPLE_PAC) */
task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num)
{
vm_object_t find_vmo;
- int64_t size = 0;
+ unsigned int i = 0;
+ unsigned int vmobj_limit = len / sizeof(vm_object_query_data_t);
task_objq_lock(task);
if (query != NULL) {
queue_iterate(&task->task_objq, find_vmo, vm_object_t, task_objq)
{
- int byte_size;
- vm_object_query_t p = &query[size++];
+ vm_object_query_t p = &query[i];
+
+ /*
+ * Clear the entire vm_object_query_t struct as we are using
+ * only the first 6 bits in the uint64_t bitfield for this
+ * anonymous struct member.
+ */
+ bzero(p, sizeof(*p));
p->object_id = (vm_object_id_t) VM_KERNEL_ADDRPERM(find_vmo);
p->virtual_size = find_vmo->internal ? find_vmo->vo_size : 0;
p->compressed_size = 0;
}
- /* make sure to not overrun */
- byte_size = (int) size * sizeof(vm_object_query_data_t);
- if ((int)(byte_size + sizeof(vm_object_query_data_t)) > len) {
+ i++;
+
+ /* Make sure to not overrun */
+ if (i == vmobj_limit) {
break;
}
}
} else {
- size = task->task_owned_objects;
+ i = task->task_owned_objects;
}
task_objq_unlock(task);
- *num = size;
+ *num = i;
}
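/*
 * Hedged note: vmobj_limit = len / sizeof(vm_object_query_data_t) bounds
 * the loop by the caller-supplied buffer size up front, replacing the old
 * per-iteration byte_size check, and the bzero() presumably keeps stale
 * kernel memory in the unused bitfield bits from being copied out.
 */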
#if defined(XNU_KERNEL_PRIVATE)
#if defined(HAS_APPLE_PAC)
+
+#include <sys/cdefs.h>
+
+/*
+ * Used by MANIPULATE_SIGNED_THREAD_STATE(), potentially from C++ (IOKit) code.
+ * Open-coded to prevent a circular dependency between mach/arm/thread_status.h
+ * and osfmk/arm/machine_routines.h.
+ */
+__BEGIN_DECLS
+extern boolean_t ml_set_interrupts_enabled(boolean_t);
+__END_DECLS
+
/*
* Methods used to sign and check thread state to detect corruptions of saved
* thread state across exceptions and context switches.
* x6: scratch register
* x7: scratch register
*/
-#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...) \
- asm volatile ( \
- "mov x8, lr" "\n" \
- "mov x0, %[iss]" "\n" \
- "ldp x4, x5, [x0, %[SS64_X16]]" "\n" \
- "ldr x6, [x0, %[SS64_PC]]" "\n" \
- "ldr w7, [x0, %[SS64_CPSR]]" "\n" \
- "ldr x3, [x0, %[SS64_LR]]" "\n" \
- "mov x1, x6" "\n" \
- "mov w2, w7" "\n" \
- "bl _ml_check_signed_state" "\n" \
- "mov x1, x6" "\n" \
- "mov w2, w7" "\n" \
- _instr "\n" \
- "bl _ml_sign_thread_state" "\n" \
- "mov lr, x8" "\n" \
- : \
- : [iss] "r"(_iss), \
- [SS64_X16] "i"(ss64_offsetof(x[16])), \
- [SS64_PC] "i"(ss64_offsetof(pc)), \
- [SS64_CPSR] "i"(ss64_offsetof(cpsr)), \
- [SS64_LR] "i"(ss64_offsetof(lr)),##__VA_ARGS__ \
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8" \
- )
+#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...) \
+ do { \
+ boolean_t _intr = ml_set_interrupts_enabled(FALSE); \
+ asm volatile ( \
+ "mov x8, lr" "\n" \
+ "mov x0, %[iss]" "\n" \
+ "ldp x4, x5, [x0, %[SS64_X16]]" "\n" \
+ "ldr x6, [x0, %[SS64_PC]]" "\n" \
+ "ldr w7, [x0, %[SS64_CPSR]]" "\n" \
+ "ldr x3, [x0, %[SS64_LR]]" "\n" \
+ "mov x1, x6" "\n" \
+ "mov w2, w7" "\n" \
+ "bl _ml_check_signed_state" "\n" \
+ "mov x1, x6" "\n" \
+ "mov w2, w7" "\n" \
+ _instr "\n" \
+ "bl _ml_sign_thread_state" "\n" \
+ "mov lr, x8" "\n" \
+ : \
+ : [iss] "r"(_iss), \
+ [SS64_X16] "i"(ss64_offsetof(x[16])), \
+ [SS64_PC] "i"(ss64_offsetof(pc)), \
+ [SS64_CPSR] "i"(ss64_offsetof(cpsr)), \
+ [SS64_LR] "i"(ss64_offsetof(lr)),##__VA_ARGS__ \
+ : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8" \
+ ); \
+ ml_set_interrupts_enabled(_intr); \
+ } while (0)
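/*
 * Hedged note: wrapping the check/modify/re-sign sequence in an
 * interrupts-disabled region presumably closes the window in which an
 * exception taken mid-macro could observe, or cause re-signing of, a
 * half-updated saved state; the same pattern appears around
 * ml_sign_thread_state in the thread-setup paths elsewhere in this diff.
 */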
static inline void
check_and_sign_copied_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src)
}
+#define C_SEG_MAJOR_COMPACT_STATS_MAX (30)
struct {
uint64_t asked_permission;
uint64_t wasted_space_in_swapouts;
uint64_t count_of_swapouts;
uint64_t count_of_freed_segs;
-} c_seg_major_compact_stats;
+ uint64_t bailed_compactions;
+ uint64_t bytes_freed_rate_us;
+} c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX];
+
+int c_seg_major_compact_stats_now = 0;
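/*
 * Hedged note: the single global stats struct becomes a 30-entry ring
 * indexed by c_seg_major_compact_stats_now; each pass of
 * vm_compressor_compact_and_swap() fills one slot (including the new
 * bytes_freed_rate_us figure) and then advances the index, presumably so
 * recent passes can be compared rather than only a lifetime aggregate.
 */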
#define C_MAJOR_COMPACTION_SIZE_APPROPRIATE ((C_SEG_BUFSIZE * 90) / 100)
c_segment_t c_seg_dst,
c_segment_t c_seg_src)
{
- c_seg_major_compact_stats.asked_permission++;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++;
if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE &&
c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) {
c_seg_dst->c_was_major_compacted++;
c_seg_src->c_was_major_donor++;
#endif
- c_seg_major_compact_stats.compactions++;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++;
dst_slot = c_seg_dst->c_nextslot;
c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
- c_seg_major_compact_stats.moved_slots++;
- c_seg_major_compact_stats.moved_bytes += c_size;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += c_size;
cslot_copy(c_dst, c_src);
c_dst->c_offset = c_seg_dst->c_nextoffset;
boolean_t needs_to_swap = FALSE;
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);
+
#if !CONFIG_EMBEDDED
LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
#endif /* !CONFIG_EMBEDDED */
}
lck_mtx_lock_spin_always(c_list_lock);
}
+
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0);
}
}
}
+int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS;
void
vm_compressor_compact_and_swap(boolean_t flush_all)
{
c_segment_t c_seg, c_seg_next;
- boolean_t keep_compacting;
+ boolean_t keep_compacting, switch_state;
clock_sec_t now;
clock_nsec_t nsec;
+ mach_timespec_t start_ts, end_ts;
+ unsigned int number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields;
+ uint64_t bytes_to_free, bytes_freed, delta_usec;
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
if (fastwake_warmup == TRUE) {
uint64_t starting_warmup_count;
*/
clock_get_system_nanotime(&now, &nsec);
+ start_ts.tv_sec = (int) now;
+ start_ts.tv_nsec = nsec;
+ delta_usec = 0;
+ number_considered = 0;
+ wanted_cseg_found = 0;
+ number_yields = 0;
+ bytes_to_free = 0;
+ bytes_freed = 0;
+ yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
+
while (!queue_empty(&c_age_list_head) && compaction_swapper_abort == 0) {
if (hibernate_flushing == TRUE) {
clock_sec_t sec;
lck_mtx_unlock_always(c_list_lock);
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0);
+
thread_block(THREAD_CONTINUE_NULL);
lck_mtx_lock_spin_always(c_list_lock);
* to do minor compactions to make
* more memory available
*/
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0);
+
continue;
}
lck_mtx_lock_spin_always(c_list_lock);
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0);
+
if (needs_to_swap == FALSE) {
break;
}
}
if (queue_empty(&c_age_list_head)) {
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0);
break;
}
c_seg = (c_segment_t) queue_first(&c_age_list_head);
assert(c_seg->c_state == C_ON_AGE_Q);
if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) {
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0);
break;
}
lck_mtx_lock_spin_always(&c_seg->c_lock);
if (c_seg->c_busy) {
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0);
+
lck_mtx_unlock_always(c_list_lock);
c_seg_wait_on_busy(c_seg);
lck_mtx_lock_spin_always(c_list_lock);
* found an empty c_segment and freed it
* so go grab the next guy in the queue
*/
- c_seg_major_compact_stats.count_of_freed_segs++;
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0);
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
continue;
}
/*
* Major compaction
*/
keep_compacting = TRUE;
+ switch_state = TRUE;
while (keep_compacting == TRUE) {
assert(c_seg->c_busy);
assert(c_seg_next->c_state == C_ON_AGE_Q);
+ number_considered++;
+
if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) {
break;
}
lck_mtx_lock_spin_always(&c_seg_next->c_lock);
if (c_seg_next->c_busy) {
+ /*
+ * We are going to block for our neighbor.
+ * If our c_seg is wanted, we should unbusy
+ * it because we don't know how long we might
+ * have to block here.
+ */
+ if (c_seg->c_wanted) {
+ lck_mtx_unlock_always(&c_seg_next->c_lock);
+ switch_state = FALSE;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
+ wanted_cseg_found++;
+ break;
+ }
+
lck_mtx_unlock_always(c_list_lock);
+
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0);
+
c_seg_wait_on_busy(c_seg_next);
lck_mtx_lock_spin_always(c_list_lock);
/* grab that segment */
C_SEG_BUSY(c_seg_next);
+ bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) {
/*
* found an empty c_segment and freed it
* so we can't continue to use c_seg_next
*/
- c_seg_major_compact_stats.count_of_freed_segs++;
+ bytes_freed += bytes_to_free;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
continue;
}
keep_compacting = c_seg_major_compact(c_seg, c_seg_next);
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0);
+
PAGE_REPLACEMENT_DISALLOWED(TRUE);
lck_mtx_lock_spin_always(&c_seg_next->c_lock);
* by passing TRUE, we ask for c_busy to be cleared
* and c_wanted to be taken care of
*/
+ bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) {
- c_seg_major_compact_stats.count_of_freed_segs++;
+ bytes_freed += bytes_to_free;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
+ } else {
+ bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
+ bytes_freed += bytes_to_free;
}
PAGE_REPLACEMENT_DISALLOWED(FALSE);
/* relock the list */
lck_mtx_lock_spin_always(c_list_lock);
+
+ if (c_seg->c_wanted) {
+ /*
+ * Our c_seg is in demand. Let's
+ * unbusy it and wakeup the waiters
+ * instead of continuing the compaction
+ * because we could be in this loop
+ * for a while.
+ */
+ switch_state = FALSE;
+ wanted_cseg_found++;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
+ break;
+ }
} /* major compaction */
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, wanted_cseg_found, 0);
+
lck_mtx_lock_spin_always(&c_seg->c_lock);
assert(c_seg->c_busy);
assert(!c_seg->c_on_minorcompact_q);
- if (VM_CONFIG_SWAP_IS_ACTIVE) {
- /*
- * This mode of putting a generic c_seg on the swapout list is
- * only supported when we have general swapping enabled
- */
- c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
- } else {
- if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
- assert(VM_CONFIG_SWAP_IS_PRESENT);
+ if (switch_state) {
+ if (VM_CONFIG_SWAP_IS_ACTIVE) {
/*
- * we are running compressor sweeps with swap-behind
- * make sure the c_seg has aged enough before swapping it
- * out...
+ * This mode of putting a generic c_seg on the swapout list is
+ * only supported when we have general swapping enabled
*/
- if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
- c_seg->c_overage_swap = TRUE;
- c_overage_swapped_count++;
- c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
+ c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
+ } else {
+ if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
+ assert(VM_CONFIG_SWAP_IS_PRESENT);
+ /*
+ * we are running compressor sweeps with swap-behind
+ * make sure the c_seg has aged enough before swapping it
+ * out...
+ */
+ if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
+ c_seg->c_overage_swap = TRUE;
+ c_overage_swapped_count++;
+ c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
+ }
}
}
+ if (c_seg->c_state == C_ON_AGE_Q) {
+ /*
+ * this c_seg didn't get moved to the swapout queue
+ * so we need to move it out of the way...
+ * we just did a major compaction on it so put it
+ * on that queue
+ */
+ c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
+ } else {
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += C_SEG_BUFSIZE - c_seg->c_bytes_used;
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++;
+ }
}
- if (c_seg->c_state == C_ON_AGE_Q) {
- /*
- * this c_seg didn't get moved to the swapout queue
- * so we need to move it out of the way...
- * we just did a major compaction on it so put it
- * on that queue
- */
- c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
- } else {
- c_seg_major_compact_stats.wasted_space_in_swapouts += C_SEG_BUFSIZE - c_seg->c_bytes_used;
- c_seg_major_compact_stats.count_of_swapouts++;
- }
+
C_SEG_WAKEUP_DONE(c_seg);
lck_mtx_unlock_always(&c_seg->c_lock);
lck_mtx_lock_spin_always(c_list_lock);
}
+
+ if (number_considered >= yield_after_considered_per_pass) {
+ if (wanted_cseg_found) {
+ /*
+ * We stopped major compactions on a c_seg
+ * that is wanted. We don't know the priority
+ * of the waiter unfortunately but we are at
+ * a very high priority and so, just in case
+ * the waiter is a critical system daemon or
+ * UI thread, let's give up the CPU in case
+ * the system is running a few CPU intensive
+ * tasks.
+ */
+ lck_mtx_unlock_always(c_list_lock);
+
+ mutex_pause(2); /* 100us yield */
+
+ number_yields++;
+
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0);
+
+ lck_mtx_lock_spin_always(c_list_lock);
+ }
+
+ number_considered = 0;
+ wanted_cseg_found = 0;
+ }
}
+ clock_get_system_nanotime(&now, &nsec);
+ end_ts.tv_sec = (int) now;
+ end_ts.tv_nsec = nsec;
+
+ SUB_MACH_TIMESPEC(&end_ts, &start_ts);
+
+ delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100);
+
+ delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */
+
+ c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec);
+
+ if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) {
+ c_seg_major_compact_stats_now = 0;
+ } else {
+ c_seg_major_compact_stats_now++;
+ }
+
+ assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX);
+
+ VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
}
uint64_t vm_hard_throttle_threshold;
-
-#define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \
- ((vm_page_free_count < vm_page_throttle_limit || \
- HARD_THROTTLE_LIMIT_REACHED()) && \
- proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
-
+OS_ALWAYS_INLINE
+boolean_t
+NEED_TO_HARD_THROTTLE_THIS_TASK(void)
+{
+ return vm_wants_task_throttled(current_task()) ||
+ ((vm_page_free_count < vm_page_throttle_limit ||
+ HARD_THROTTLE_LIMIT_REACHED()) &&
+ proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
+}
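/*
 * Hedged note: turning the macro into a function with an exported
 * XNU_KERNEL_PRIVATE prototype (added to vm_fault.h below) lets callers
 * outside vm_fault.c reach it, e.g. the kIODirectionPrepareAvoidThrottling
 * check added to the IOMemoryDescriptor prepare path earlier in this diff.
 */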
#define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */
#define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */
#endif /* MACH_KERNEL_PRIVATE */
+#if XNU_KERNEL_PRIVATE
+
+boolean_t NEED_TO_HARD_THROTTLE_THIS_TASK(void);
+
+#endif
+
#endif /* KERNEL_PRIVATE */
#endif /* _VM_VM_FAULT_H_ */
#define VM_PAGE_GRAB 0x126
#define VM_PAGE_RELEASE 0x127
+#define VM_COMPRESSOR_COMPACT_AND_SWAP 0x128
+#define VM_COMPRESSOR_DO_DELAYED_COMPACTIONS 0x129
+
#define VM_PRESSURE_EVENT 0x130
#define VM_EXECVE 0x131
length = sizeof(kTestIntervalSecs);
new_budget_ln = sizeof(new_budget);
ret = sysctlbyname("vm.memorystatus_freeze_calculate_new_budget", &new_budget, &new_budget_ln, &kTestIntervalSecs, length);
- T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "vm.memorystatus_freeze_calculate_new_budget");
+ T_ASSERT_POSIX_SUCCESS(ret, "vm.memorystatus_freeze_calculate_new_budget");
// Grab the daily budget.
length = sizeof(memorystatus_freeze_daily_mb_max);
ret = sysctlbyname("kern.memorystatus_freeze_daily_mb_max", &memorystatus_freeze_daily_mb_max, &length, NULL, 0);
- T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.memorystatus_freeze_daily_mb_max");
+ T_ASSERT_POSIX_SUCCESS(ret, "kern.memorystatus_freeze_daily_mb_max");
- memorystatus_freeze_daily_pages_max = memorystatus_freeze_daily_mb_max * 1024 * 1024 / page_size;
+ memorystatus_freeze_daily_pages_max = memorystatus_freeze_daily_mb_max * 1024UL * 1024UL / page_size;
+ T_LOG("memorystatus_freeze_daily_mb_max %u", memorystatus_freeze_daily_mb_max);
+ T_LOG("memorystatus_freeze_daily_pages_max %u", memorystatus_freeze_daily_pages_max);
+ T_LOG("page_size %u", page_size);
/*
* We're kTestIntervalSecs past a new interval. Which means we are owed kNumSecondsInDay
* seconds of budget.
*/
expected_new_budget_pages = memorystatus_freeze_daily_pages_max;
+ T_LOG("expected_new_budget_pages before %u", expected_new_budget_pages);
+ T_ASSERT_EQ(kTestIntervalSecs, 60 * 60 * 32, "kTestIntervalSecs did not change");
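+	/*
+	 * Pro-rate the extra budget for the elapsed kTestIntervalSecs: scale by
+	 * kFixedPointFactor before dividing by kNumSecondsInDay so the
+	 * fractional number of days isn't truncated.
+	 */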
expected_new_budget_pages += ((kTestIntervalSecs * kFixedPointFactor) / (kNumSecondsInDay)
* memorystatus_freeze_daily_pages_max) / kFixedPointFactor;
+ T_LOG("expected_new_budget_pages after %u", expected_new_budget_pages);
+ T_LOG("memorystatus_freeze_daily_pages_max after %u", memorystatus_freeze_daily_pages_max);
T_QUIET; T_ASSERT_EQ(new_budget, expected_new_budget_pages, "Calculate new budget behaves correctly.");
}
from utils import *
import sys
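+# Current KDP transport mode; defaults to software-hosted (serial/socket).
+# Use the 'kdpmode' command to switch to 'hwprobe' for a hardware debug probe.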
+current_KDP_mode = "swhosted"
+
def GetKDPPacketHeaderInt(request=0, is_reply=False, seq=0, length=0, key=0):
""" create a 64 bit number that could be saved as pkt_hdr_t
params:
print "Failed to save the dumpinfo."
return retval
+@lldb_command('kdpmode')
+def KDPMode(cmd_args=None):
+ """
+    Change the KDP mode between software-hosted and hardware-probe.
+    When lldb is connected to a KDP server backed by a hardware debug tool,
+    setting this to 'hwprobe' enables physical memory access.
+
+ swhosted: LLDB is connected to the target using a serial or socket connection.
+ hwprobe: LLDB is connected to the target using a hardware probe.
+
+ usage: kdpmode <mode>
+ mode: 'swhosted' or 'hwprobe'
+ """
+ global current_KDP_mode
+
+    if cmd_args is None or len(cmd_args) == 0:
+ return current_KDP_mode
+ if len(cmd_args) > 1 or cmd_args[0] not in {'swhosted', 'hwprobe'}:
+ print "Invalid Arguments", KDPMode.__doc__
+ else:
+ current_KDP_mode = cmd_args[0]
+ return
+
import xnudefines
from kdp import *
from utils import *
+import struct
def ReadPhysInt(phys_addr, bitsize = 64, cpuval = None):
""" Read a physical memory data based on address.
print "Target is not connected over kdp. Nothing to do here."
return retval
- input_address = unsigned(addressof(kern.globals.manual_pkt.input))
- len_address = unsigned(addressof(kern.globals.manual_pkt.len))
- data_address = unsigned(addressof(kern.globals.manual_pkt.data))
- if not WriteInt32ToMemoryAddress(0, input_address):
- return retval
-
- kdp_pkt_size = GetType('kdp_readphysmem64_req_t').GetByteSize()
- if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
- return retval
+ if "hwprobe" == KDPMode():
+ # Send the proper KDP command and payload to the bare metal debug tool via a KDP server
+ addr_for_kdp = struct.unpack("<Q", struct.pack(">Q", address))[0]
+ byte_count = struct.unpack("<I", struct.pack(">I", bits/8))[0]
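+        # Request payload for custom packet code 25 (KDP_READPHYSMEM64):
+        # byte-swapped 64-bit address, byte-swapped 32-bit length, and a
+        # zeroed 16-bit field, rendered as hex.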
+ packet = "{0:016x}{1:08x}{2:04x}".format(addr_for_kdp, byte_count, 0x0)
- data_addr = int(addressof(kern.globals.manual_pkt))
- pkt = kern.GetValueFromAddress(data_addr, 'kdp_readphysmem64_req_t *')
+ ret_obj = lldb.SBCommandReturnObject()
+ ci = lldb.debugger.GetCommandInterpreter()
+ ci.HandleCommand('process plugin packet send -c 25 -p {0}'.format(packet), ret_obj)
- header_value =GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_READPHYSMEM64'), length=kdp_pkt_size)
+ if ret_obj.Succeeded():
+ value = ret_obj.GetOutput()
- if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
- WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
- WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
- WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
- ):
-
- if WriteInt32ToMemoryAddress(1, input_address):
- # now read data from the kdp packet
- data_address = unsigned(addressof(kern.GetValueFromAddress(int(addressof(kern.globals.manual_pkt.data)), 'kdp_readphysmem64_reply_t *').data))
if bits == 64 :
- retval = kern.GetValueFromAddress(data_address, 'uint64_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+ pack_fmt = "<Q"
+ unpack_fmt = ">Q"
if bits == 32 :
- retval = kern.GetValueFromAddress(data_address, 'uint32_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+ pack_fmt = "<I"
+ unpack_fmt = ">I"
if bits == 16 :
- retval = kern.GetValueFromAddress(data_address, 'uint16_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+ pack_fmt = "<H"
+ unpack_fmt = ">H"
if bits == 8 :
- retval = kern.GetValueFromAddress(data_address, 'uint8_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+ pack_fmt = "<B"
+ unpack_fmt = ">B"
+
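+            # The reply arrives as ASCII hex; parse the trailing digits and
+            # byte-swap the value back into the expected byte order.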
+ retval = struct.unpack(unpack_fmt, struct.pack(pack_fmt, int(value[-((bits/4)+1):], 16)))[0]
+
+ else:
+ input_address = unsigned(addressof(kern.globals.manual_pkt.input))
+ len_address = unsigned(addressof(kern.globals.manual_pkt.len))
+ data_address = unsigned(addressof(kern.globals.manual_pkt.data))
+
+ if not WriteInt32ToMemoryAddress(0, input_address):
+ return retval
+
+ kdp_pkt_size = GetType('kdp_readphysmem64_req_t').GetByteSize()
+ if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
+ return retval
+
+ data_addr = int(addressof(kern.globals.manual_pkt))
+ pkt = kern.GetValueFromAddress(data_addr, 'kdp_readphysmem64_req_t *')
+
+        header_value = GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_READPHYSMEM64'), length=kdp_pkt_size)
+
+ if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
+ WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
+ WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
+ WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
+ ):
+
+ if WriteInt32ToMemoryAddress(1, input_address):
+ # now read data from the kdp packet
+ data_address = unsigned(addressof(kern.GetValueFromAddress(int(addressof(kern.globals.manual_pkt.data)), 'kdp_readphysmem64_reply_t *').data))
+ if bits == 64 :
+ retval = kern.GetValueFromAddress(data_address, 'uint64_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+ if bits == 32 :
+ retval = kern.GetValueFromAddress(data_address, 'uint32_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+ if bits == 16 :
+ retval = kern.GetValueFromAddress(data_address, 'uint16_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+ if bits == 8 :
+ retval = kern.GetValueFromAddress(data_address, 'uint8_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+
return retval
if "kdp" != GetConnectionProtocol():
print "Target is not connected over kdp. Nothing to do here."
return False
- input_address = unsigned(addressof(kern.globals.manual_pkt.input))
- len_address = unsigned(addressof(kern.globals.manual_pkt.len))
- data_address = unsigned(addressof(kern.globals.manual_pkt.data))
- if not WriteInt32ToMemoryAddress(0, input_address):
- return False
+
+ if "hwprobe" == KDPMode():
+ # Send the proper KDP command and payload to the bare metal debug tool via a KDP server
+ addr_for_kdp = struct.unpack("<Q", struct.pack(">Q", address))[0]
+ byte_count = struct.unpack("<I", struct.pack(">I", bits/8))[0]
+
+ if bits == 64 :
+ pack_fmt = ">Q"
+ unpack_fmt = "<Q"
+ if bits == 32 :
+ pack_fmt = ">I"
+ unpack_fmt = "<I"
+ if bits == 16 :
+ pack_fmt = ">H"
+ unpack_fmt = "<H"
+ if bits == 8 :
+ pack_fmt = ">B"
+ unpack_fmt = "<B"
+
+ data_val = struct.unpack(unpack_fmt, struct.pack(pack_fmt, intval))[0]
+
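+        # Request payload for custom packet code 26 (KDP_WRITEPHYSMEM64):
+        # byte-swapped address, byte-swapped length, a zeroed 16-bit field,
+        # and the byte-swapped data value, rendered as hex.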
+ packet = "{0:016x}{1:08x}{2:04x}{3:016x}".format(addr_for_kdp, byte_count, 0x0, data_val)
+
+ ret_obj = lldb.SBCommandReturnObject()
+ ci = lldb.debugger.GetCommandInterpreter()
+ ci.HandleCommand('process plugin packet send -c 26 -p {0}'.format(packet), ret_obj)
+
+        return ret_obj.Succeeded()
- kdp_pkt_size = GetType('kdp_writephysmem64_req_t').GetByteSize() + (bits / 8)
- if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
- return False
+ else:
+ input_address = unsigned(addressof(kern.globals.manual_pkt.input))
+ len_address = unsigned(addressof(kern.globals.manual_pkt.len))
+ data_address = unsigned(addressof(kern.globals.manual_pkt.data))
+ if not WriteInt32ToMemoryAddress(0, input_address):
+ return False
- data_addr = int(addressof(kern.globals.manual_pkt))
- pkt = kern.GetValueFromAddress(data_addr, 'kdp_writephysmem64_req_t *')
-
- header_value =GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_WRITEPHYSMEM64'), length=kdp_pkt_size)
-
- if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
- WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
- WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
- WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
- ):
-
- if bits == 8:
- if not WriteInt8ToMemoryAddress(intval, int(addressof(pkt.data))):
- return False
- if bits == 16:
- if not WriteInt16ToMemoryAddress(intval, int(addressof(pkt.data))):
- return False
- if bits == 32:
- if not WriteInt32ToMemoryAddress(intval, int(addressof(pkt.data))):
- return False
- if bits == 64:
- if not WriteInt64ToMemoryAddress(intval, int(addressof(pkt.data))):
- return False
- if WriteInt32ToMemoryAddress(1, input_address):
- return True
- return False
+ kdp_pkt_size = GetType('kdp_writephysmem64_req_t').GetByteSize() + (bits / 8)
+ if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
+ return False
+
+ data_addr = int(addressof(kern.globals.manual_pkt))
+ pkt = kern.GetValueFromAddress(data_addr, 'kdp_writephysmem64_req_t *')
+
+        header_value = GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_WRITEPHYSMEM64'), length=kdp_pkt_size)
+
+ if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
+ WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
+ WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
+ WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
+ ):
+
+ if bits == 8:
+ if not WriteInt8ToMemoryAddress(intval, int(addressof(pkt.data))):
+ return False
+ if bits == 16:
+ if not WriteInt16ToMemoryAddress(intval, int(addressof(pkt.data))):
+ return False
+ if bits == 32:
+ if not WriteInt32ToMemoryAddress(intval, int(addressof(pkt.data))):
+ return False
+ if bits == 64:
+ if not WriteInt64ToMemoryAddress(intval, int(addressof(pkt.data))):
+ return False
+ if WriteInt32ToMemoryAddress(1, input_address):
+ return True
+ return False
def WritePhysInt(phys_addr, int_val, bitsize = 64):