]> git.saurik.com Git - apple/xnu.git/commitdiff
xnu-6153.121.1.tar.gz macos-10155 v6153.121.1
authorApple <opensource@apple.com>
Wed, 3 Jun 2020 04:23:43 +0000 (04:23 +0000)
committerApple <opensource@apple.com>
Wed, 3 Jun 2020 04:23:43 +0000 (04:23 +0000)
52 files changed:
bsd/kern/bsd_init.c
bsd/kern/kern_control.c
bsd/kern/kern_descrip.c
bsd/kern/kern_fork.c
bsd/kern/kern_proc.c
bsd/kern/kern_sysctl.c
bsd/kern/trace_codes
bsd/kern/uipc_socket.c
bsd/kern/uipc_socket2.c
bsd/kern/uipc_syscalls.c
bsd/net/content_filter.c
bsd/net/content_filter.h
bsd/net/if_bridge.c
bsd/net/necp.c
bsd/net/pf_ioctl.c
bsd/netinet/flow_divert.c
bsd/netinet/in_pcb.c
bsd/netinet/ip_icmp.c
bsd/netinet/mptcp_subr.c
bsd/netinet/mptcp_usrreq.c
bsd/netinet/mptcp_var.h
bsd/netinet/raw_ip.c
bsd/netinet/udp_usrreq.c
bsd/netinet6/icmp6.c
bsd/netinet6/raw_ip6.c
bsd/netinet6/udp6_output.c
bsd/netinet6/udp6_usrreq.c
bsd/sys/namei.h
bsd/sys/proc_internal.h
bsd/sys/socketvar.h
bsd/vfs/kpi_vfs.c
bsd/vfs/vfs_lookup.c
bsd/vfs/vfs_subr.c
bsd/vfs/vfs_syscalls.c
config/MasterVersion
iokit/IOKit/IOMemoryDescriptor.h
iokit/Kernel/IOMemoryDescriptor.cpp
osfmk/arm/pmap.c
osfmk/arm64/locore.s
osfmk/arm64/machine_routines_asm.h
osfmk/arm64/machine_routines_asm.s
osfmk/arm64/pcb.c
osfmk/arm64/status.c
osfmk/kern/task.c
osfmk/mach/arm/thread_status.h
osfmk/vm/vm_compressor.c
osfmk/vm/vm_fault.c
osfmk/vm/vm_fault.h
osfmk/vm/vm_pageout.h
tests/memorystatus_freeze_test.c
tools/lldbmacros/kdp.py
tools/lldbmacros/pmap.py

index bc5b709e0f185cdf5e3038557d0b6e61c90bb260..887cb454b7bfe5b23b1e83d1b942956f30257196 100644 (file)
@@ -415,6 +415,7 @@ lck_grp_t * proc_kqhashlock_grp;
 lck_grp_t * proc_knhashlock_grp;
 lck_grp_t * proc_ucred_mlock_grp;
 lck_grp_t * proc_mlock_grp;
+lck_grp_t * proc_dirslock_grp;
 lck_grp_attr_t * proc_lck_grp_attr;
 lck_attr_t * proc_lck_attr;
 lck_mtx_t * proc_list_mlock;
@@ -533,6 +534,7 @@ bsd_init(void)
        proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr);
        proc_kqhashlock_grp = lck_grp_alloc_init("proc-kqhashlock", proc_lck_grp_attr);
        proc_knhashlock_grp = lck_grp_alloc_init("proc-knhashlock", proc_lck_grp_attr);
+       proc_dirslock_grp = lck_grp_alloc_init("proc-dirslock", proc_lck_grp_attr);
 #if CONFIG_XNUPOST
        sysctl_debug_test_stackshot_owner_grp = lck_grp_alloc_init("test-stackshot-owner-grp", LCK_GRP_ATTR_NULL);
        sysctl_debug_test_stackshot_owner_init_mtx = lck_mtx_alloc_init(
@@ -548,6 +550,7 @@ bsd_init(void)
        lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr);
        lck_mtx_init(&kernproc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr);
        lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr);
+       lck_rw_init(&kernproc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr);
 
        assert(bsd_simul_execs != 0);
        execargs_cache_lock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr);
index 5430ff820668713051fcbbdfc64c29e9d230065a..0151fac5e65b10c42da6b74804541c44103ac5b1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -1036,7 +1036,7 @@ ctl_enqueuembuf(kern_ctl_ref kctlref, u_int32_t unit, struct mbuf *m,
        }
 
        so_recv_data_stat(so, m, 0);
-       if (sbappend(&so->so_rcv, m) != 0) {
+       if (sbappend_nodrop(&so->so_rcv, m) != 0) {
                if ((flags & CTL_DATA_NOWAKEUP) == 0) {
                        sorwakeup(so);
                }
@@ -1133,7 +1133,7 @@ ctl_enqueuembuf_list(void *kctlref, u_int32_t unit, struct mbuf *m_list,
                         */
                        m->m_nextpkt = NULL;
                        so_recv_data_stat(so, m, 0);
-                       if (sbappendrecord(&so->so_rcv, m) != 0) {
+                       if (sbappendrecord_nodrop(&so->so_rcv, m) != 0) {
                                needwakeup = 1;
                        } else {
                                /*
@@ -1239,6 +1239,10 @@ ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len,
                m->m_flags |= M_EOR;
        }
        so_recv_data_stat(so, m, 0);
+       /*
+        * No need to call the "nodrop" variant of sbappend
+        * because the mbuf is local to the scope of the function
+        */
        if (sbappend(&so->so_rcv, m) != 0) {
                if ((flags & CTL_DATA_NOWAKEUP) == 0) {
                        sorwakeup(so);
index 8e7a7db7483b27f95a2cd573b0c687c37f0de295..c17f8414362cdb57d4d9227e7d12b50fd777aa21 100644 (file)
@@ -282,6 +282,30 @@ file_lock_init(void)
 }
 
 
+void
+proc_dirs_lock_shared(proc_t p)
+{
+       lck_rw_lock_shared(&p->p_dirs_lock);
+}
+
+void
+proc_dirs_unlock_shared(proc_t p)
+{
+       lck_rw_unlock_shared(&p->p_dirs_lock);
+}
+
+void
+proc_dirs_lock_exclusive(proc_t p)
+{
+       lck_rw_lock_exclusive(&p->p_dirs_lock);
+}
+
+void
+proc_dirs_unlock_exclusive(proc_t p)
+{
+       lck_rw_unlock_exclusive(&p->p_dirs_lock);
+}
+
 /*
  * proc_fdlock, proc_fdlock_spin
  *
@@ -5061,6 +5085,7 @@ fdcopy(proc_t p, vnode_t uth_cdir)
        }
        /* Coming from a chroot environment and unable to get a reference... */
        if (newfdp->fd_rdir == NULL && fdp->fd_rdir) {
+               proc_fdunlock(p);
                /*
                 * We couldn't get a new reference on
                 * the chroot directory being
index c25c85ad265bfa4ad5a60137d3b2d2d09c6b8f89..e8de4d1c2433558496ffb0fd3235c8fa690e8e58 100644 (file)
@@ -1339,6 +1339,7 @@ retry:
         *
         * XXX may fail to copy descriptors to child
         */
+       lck_rw_init(&child_proc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr);
        child_proc->p_fd = fdcopy(parent_proc, parent_uthread->uu_cdir);
 
 #if SYSV_SHM
index c5ea090ce7b1914c26a94ec0b08870610ff28bf9..1a11358bea2c631aade0feff61353651f177a703 100644 (file)
@@ -1301,7 +1301,7 @@ proc_gettty(proc_t p, vnode_t *vp)
 
                if (ttyvp) {
                        if (vnode_getwithvid(ttyvp, ttyvid) == 0) {
-                               *vp = procsp->s_ttyvp;
+                               *vp = ttyvp;
                                err = 0;
                        }
                } else {
index 0e55c44451203aec65e8c3c1e614f2f69aa0f899..edf8d8d22e72eabe235faf4422ab8e56c9590340 100644 (file)
@@ -1089,7 +1089,6 @@ fill_user32_externproc(proc_t p, struct user32_extern_proc *__restrict exp)
        exp->p_pid = p->p_pid;
        exp->p_oppid = p->p_oppid;
        /* Mach related  */
-       exp->user_stack = p->user_stack;
        exp->p_debugger = p->p_debugger;
        exp->sigwait = p->sigwait;
        /* scheduling */
@@ -1142,7 +1141,6 @@ fill_user64_externproc(proc_t p, struct user64_extern_proc *__restrict exp)
        exp->p_pid = p->p_pid;
        exp->p_oppid = p->p_oppid;
        /* Mach related  */
-       exp->user_stack = p->user_stack;
        exp->p_debugger = p->p_debugger;
        exp->sigwait = p->sigwait;
        /* scheduling */
@@ -3657,6 +3655,9 @@ SYSCTL_INT(_vm, OID_AUTO, compressor_is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm
 SYSCTL_INT(_vm, OID_AUTO, compressor_swapout_target_age, CTLFLAG_RD | CTLFLAG_LOCKED, &swapout_target_age, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, compressor_available, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_available, 0, "");
 
+extern int min_csegs_per_major_compaction;
+SYSCTL_INT(_vm, OID_AUTO, compressor_min_csegs_per_major_compaction, CTLFLAG_RW | CTLFLAG_LOCKED, &min_csegs_per_major_compaction, 0, "");
+
 SYSCTL_INT(_vm, OID_AUTO, vm_ripe_target_age_in_secs, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_ripe_target_age, 0, "");
 
 SYSCTL_INT(_vm, OID_AUTO, compressor_eval_period_in_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_eval_period_in_msecs, 0, "");
@@ -4959,15 +4960,15 @@ sysctl_get_owned_vmobjects SYSCTL_HANDLER_ARGS
        int error;
        mach_port_name_t task_port_name;
        task_t task;
-       int buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0;
+       size_t buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0;
        vmobject_list_output_t buffer;
        size_t output_size;
 
        if (buffer_size) {
-               const int min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t);
+               const size_t min_size = sizeof(vm_object_query_data_t) + sizeof(int64_t);
 
-               if (buffer_size < min_size) {
-                       buffer_size = min_size;
+               if (buffer_size < min_size || buffer_size > INT_MAX) {
+                       return EINVAL;
                }
 
                buffer = kalloc(buffer_size);
index 842ff323badd1f1e80d024bf3e052bfc3a75b34b..929b0b88eef6655cebc86c0885e75cbc7127e8b5 100644 (file)
 0x1300494      MACH_vm_page_expedite_no_memory
 0x1300498      MACH_vm_page_grab
 0x130049c      MACH_vm_page_release
+0x13004a0      MACH_vm_compressor_compact_and_swap
+0x13004a4      MACH_vm_compressor_do_delayed_compactions
 0x13004c0      MACH_vm_pressure_event
 0x13004c4      MACH_vm_execve
 0x13004c8      MACH_vm_wakeup_compactor_swapper
index b94476d05ea29c77beaee69b10de9cdb7766b19f..2a754f0500deeb178ab8fc63ca3e0239d09c8052 100644 (file)
@@ -1839,6 +1839,39 @@ soconnectxlocked(struct socket *so, struct sockaddr *src,
            (error = sodisconnectlocked(so)) != 0)) {
                error = EISCONN;
        } else {
+               if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
+                   (flags & CONNECT_DATA_IDEMPOTENT)) {
+                       so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
+
+                       if (flags & CONNECT_DATA_AUTHENTICATED) {
+                               so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
+                       }
+               }
+
+               /*
+                * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
+                * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
+                * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
+                * Case 3 allows user to combine write with connect even if they have
+                * no use for TFO (such as regular TCP, and UDP).
+                * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
+                */
+               if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
+                   ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
+                       so->so_flags1 |= SOF1_PRECONNECT_DATA;
+               }
+
+               /*
+                * If a user sets data idempotent and does not pass an uio, or
+                * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
+                * SOF1_DATA_IDEMPOTENT.
+                */
+               if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
+                   (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
+                       /* We should return EINVAL instead perhaps. */
+                       so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
+               }
+
                /*
                 * Run connect filter before calling protocol:
                 *  - non-blocking connect returns before completion;
@@ -1856,6 +1889,9 @@ soconnectxlocked(struct socket *so, struct sockaddr *src,
                            flags, arg, arglen, auio, bytes_written);
                        if (error != 0) {
                                so->so_state &= ~SS_ISCONNECTING;
+                               if (error != EINPROGRESS) {
+                                       so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
+                               }
                        }
                }
        }
index cc3c37a5226dd5678ca7457636ba5bf48624c0b4..cbac73e06f9246eedd6a1936abdd320655013832 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -119,6 +119,9 @@ static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *,
     struct mbuf *);
 static void soevent_ifdenied(struct socket *);
 
+static int sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop);
+static int sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop);
+
 /*
  * Primitive routines for operating on sockets and socket buffers
  */
@@ -872,13 +875,13 @@ sbrelease(struct sockbuf *sb)
  * the mbuf chain is recorded in sb.  Empty mbufs are
  * discarded and mbufs are compacted where possible.
  */
-int
-sbappend(struct sockbuf *sb, struct mbuf *m)
+static int
+sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop)
 {
        struct socket *so = sb->sb_so;
 
        if (m == NULL || (sb->sb_flags & SB_DROP)) {
-               if (m != NULL) {
+               if (m != NULL && !nodrop) {
                        m_freem(m);
                }
                return 0;
@@ -887,27 +890,30 @@ sbappend(struct sockbuf *sb, struct mbuf *m)
        SBLASTRECORDCHK(sb, "sbappend 1");
 
        if (sb->sb_lastrecord != NULL && (sb->sb_mbtail->m_flags & M_EOR)) {
-               return sbappendrecord(sb, m);
+               return sbappendrecord_common(sb, m, nodrop);
        }
 
-       if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
-               int error = sflt_data_in(so, NULL, &m, NULL, 0);
-               SBLASTRECORDCHK(sb, "sbappend 2");
+       if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+               ASSERT(nodrop == FALSE);
+               if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
+                       int error = sflt_data_in(so, NULL, &m, NULL, 0);
+                       SBLASTRECORDCHK(sb, "sbappend 2");
 
 #if CONTENT_FILTER
-               if (error == 0) {
-                       error = cfil_sock_data_in(so, NULL, m, NULL, 0);
-               }
+                       if (error == 0) {
+                               error = cfil_sock_data_in(so, NULL, m, NULL, 0);
+                       }
 #endif /* CONTENT_FILTER */
 
-               if (error != 0) {
-                       if (error != EJUSTRETURN) {
-                               m_freem(m);
+                       if (error != 0) {
+                               if (error != EJUSTRETURN) {
+                                       m_freem(m);
+                               }
+                               return 0;
                        }
-                       return 0;
+               } else if (m) {
+                       m->m_flags &= ~M_SKIPCFIL;
                }
-       } else if (m) {
-               m->m_flags &= ~M_SKIPCFIL;
        }
 
        /* If this is the first record, it's also the last record */
@@ -920,6 +926,18 @@ sbappend(struct sockbuf *sb, struct mbuf *m)
        return 1;
 }
 
+int
+sbappend(struct sockbuf *sb, struct mbuf *m)
+{
+       return sbappend_common(sb, m, FALSE);
+}
+
+int
+sbappend_nodrop(struct sockbuf *sb, struct mbuf *m)
+{
+       return sbappend_common(sb, m, TRUE);
+}
+
 /*
  * Similar to sbappend, except that this is optimized for stream sockets.
  */
@@ -943,24 +961,26 @@ sbappendstream(struct sockbuf *sb, struct mbuf *m)
 
        SBLASTMBUFCHK(sb, __func__);
 
-       if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
-               int error = sflt_data_in(so, NULL, &m, NULL, 0);
-               SBLASTRECORDCHK(sb, "sbappendstream 1");
+       if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+               if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
+                       int error = sflt_data_in(so, NULL, &m, NULL, 0);
+                       SBLASTRECORDCHK(sb, "sbappendstream 1");
 
 #if CONTENT_FILTER
-               if (error == 0) {
-                       error = cfil_sock_data_in(so, NULL, m, NULL, 0);
-               }
+                       if (error == 0) {
+                               error = cfil_sock_data_in(so, NULL, m, NULL, 0);
+                       }
 #endif /* CONTENT_FILTER */
 
-               if (error != 0) {
-                       if (error != EJUSTRETURN) {
-                               m_freem(m);
+                       if (error != 0) {
+                               if (error != EJUSTRETURN) {
+                                       m_freem(m);
+                               }
+                               return 0;
                        }
-                       return 0;
+               } else if (m) {
+                       m->m_flags &= ~M_SKIPCFIL;
                }
-       } else if (m) {
-               m->m_flags &= ~M_SKIPCFIL;
        }
 
        sbcompress(sb, m, sb->sb_mbtail);
@@ -1066,14 +1086,14 @@ sblastmbufchk(struct sockbuf *sb, const char *where)
 /*
  * Similar to sbappend, except the mbuf chain begins a new record.
  */
-int
-sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+static int
+sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop)
 {
        struct mbuf *m;
        int space = 0;
 
        if (m0 == NULL || (sb->sb_flags & SB_DROP)) {
-               if (m0 != NULL) {
+               if (m0 != NULL && nodrop == FALSE) {
                        m_freem(m0);
                }
                return 0;
@@ -1084,29 +1104,34 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
        }
 
        if (space > sbspace(sb) && !(sb->sb_flags & SB_UNIX)) {
-               m_freem(m0);
+               if (nodrop == FALSE) {
+                       m_freem(m0);
+               }
                return 0;
        }
 
-       if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
-               int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
-                   sock_data_filt_flag_record);
+       if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+               ASSERT(nodrop == FALSE);
+               if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
+                       int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
+                           sock_data_filt_flag_record);
 
 #if CONTENT_FILTER
-               if (error == 0) {
-                       error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
-               }
+                       if (error == 0) {
+                               error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
+                       }
 #endif /* CONTENT_FILTER */
 
-               if (error != 0) {
-                       SBLASTRECORDCHK(sb, "sbappendrecord 1");
-                       if (error != EJUSTRETURN) {
-                               m_freem(m0);
+                       if (error != 0) {
+                               SBLASTRECORDCHK(sb, "sbappendrecord 1");
+                               if (error != EJUSTRETURN) {
+                                       m_freem(m0);
+                               }
+                               return 0;
                        }
-                       return 0;
+               } else if (m0) {
+                       m0->m_flags &= ~M_SKIPCFIL;
                }
-       } else if (m0) {
-               m0->m_flags &= ~M_SKIPCFIL;
        }
 
        /*
@@ -1133,6 +1158,18 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
        return 1;
 }
 
+int
+sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+{
+       return sbappendrecord_common(sb, m0, FALSE);
+}
+
+int
+sbappendrecord_nodrop(struct sockbuf *sb, struct mbuf *m0)
+{
+       return sbappendrecord_common(sb, m0, TRUE);
+}
+
 /*
  * Concatenate address (optional), control (optional) and data into one
  * single mbuf chain.  If sockbuf *sb is passed in, space check will be
@@ -1276,35 +1313,37 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0,
                return 0;
        }
 
-       /* Call socket data in filters */
-       if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
-               int error;
-               error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0);
-               SBLASTRECORDCHK(sb, __func__);
+       if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+               /* Call socket data in filters */
+               if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
+                       int error;
+                       error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0);
+                       SBLASTRECORDCHK(sb, __func__);
 
 #if CONTENT_FILTER
-               if (error == 0) {
-                       error = cfil_sock_data_in(sb->sb_so, asa, m0, control,
-                           0);
-               }
+                       if (error == 0) {
+                               error = cfil_sock_data_in(sb->sb_so, asa, m0, control,
+                                   0);
+                       }
 #endif /* CONTENT_FILTER */
 
-               if (error) {
-                       if (error != EJUSTRETURN) {
-                               if (m0) {
-                                       m_freem(m0);
-                               }
-                               if (control != NULL && !sb_unix) {
-                                       m_freem(control);
-                               }
-                               if (error_out) {
-                                       *error_out = error;
+                       if (error) {
+                               if (error != EJUSTRETURN) {
+                                       if (m0) {
+                                               m_freem(m0);
+                                       }
+                                       if (control != NULL && !sb_unix) {
+                                               m_freem(control);
+                                       }
+                                       if (error_out) {
+                                               *error_out = error;
+                                       }
                                }
+                               return 0;
                        }
-                       return 0;
+               } else if (m0) {
+                       m0->m_flags &= ~M_SKIPCFIL;
                }
-       } else if (m0) {
-               m0->m_flags &= ~M_SKIPCFIL;
        }
 
        mbuf_chain = sbconcat_mbufs(sb, asa, m0, control);
@@ -1420,35 +1459,37 @@ sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control,
                return 0;
        }
 
-       if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
-               int error;
+       if (SOCK_DOM(sb->sb_so) == PF_INET || SOCK_DOM(sb->sb_so) == PF_INET6) {
+               if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
+                       int error;
 
-               error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0);
-               SBLASTRECORDCHK(sb, __func__);
+                       error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0);
+                       SBLASTRECORDCHK(sb, __func__);
 
 #if CONTENT_FILTER
-               if (error == 0) {
-                       error = cfil_sock_data_in(sb->sb_so, NULL, m0, control,
-                           0);
-               }
+                       if (error == 0) {
+                               error = cfil_sock_data_in(sb->sb_so, NULL, m0, control,
+                                   0);
+                       }
 #endif /* CONTENT_FILTER */
 
-               if (error) {
-                       if (error != EJUSTRETURN) {
-                               if (m0) {
-                                       m_freem(m0);
-                               }
-                               if (control != NULL && !sb_unix) {
-                                       m_freem(control);
-                               }
-                               if (error_out) {
-                                       *error_out = error;
+                       if (error) {
+                               if (error != EJUSTRETURN) {
+                                       if (m0) {
+                                               m_freem(m0);
+                                       }
+                                       if (control != NULL && !sb_unix) {
+                                               m_freem(control);
+                                       }
+                                       if (error_out) {
+                                               *error_out = error;
+                                       }
                                }
+                               return 0;
                        }
-                       return 0;
+               } else if (m0) {
+                       m0->m_flags &= ~M_SKIPCFIL;
                }
-       } else if (m0) {
-               m0->m_flags &= ~M_SKIPCFIL;
        }
 
        result = sbappendcontrol_internal(sb, m0, control);
index 5c929755a2e91c72814828323fda1cea9a0a862d..e2455a1f311f24cb8593ac5370f006fb5c097712 100644 (file)
@@ -1004,7 +1004,6 @@ connectitx(struct socket *so, struct sockaddr *src,
     user_ssize_t *bytes_written)
 {
        int error;
-#pragma unused (flags)
 
        VERIFY(dst != NULL);
 
@@ -1027,41 +1026,8 @@ connectitx(struct socket *so, struct sockaddr *src,
                goto out;
        }
 
-       if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
-           (flags & CONNECT_DATA_IDEMPOTENT)) {
-               so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
-
-               if (flags & CONNECT_DATA_AUTHENTICATED) {
-                       so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
-               }
-       }
-
-       /*
-        * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
-        * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
-        * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
-        * Case 3 allows user to combine write with connect even if they have
-        * no use for TFO (such as regular TCP, and UDP).
-        * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
-        */
-       if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
-           ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
-               so->so_flags1 |= SOF1_PRECONNECT_DATA;
-       }
-
-       /*
-        * If a user sets data idempotent and does not pass an uio, or
-        * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
-        * SOF1_DATA_IDEMPOTENT.
-        */
-       if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
-           (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
-               /* We should return EINVAL instead perhaps. */
-               so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
-       }
-
        error = soconnectxlocked(so, src, dst, p, ifscope,
-           aid, pcid, 0, NULL, 0, auio, bytes_written);
+           aid, pcid, flags, NULL, 0, auio, bytes_written);
        if (error != 0) {
                goto out;
        }
index 626c2b2bf6ddbb965f793e99966ee1e16c6ead88..11b248d0cffd52d0edcca8f3fb13b00933502af6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -26,7 +26,7 @@
  *
  * The socket content filter subsystem provides a way for user space agents to
  * make filtering decisions based on the content of the data being sent and
- * received by TCP/IP sockets.
+ * received by INET/INET6 sockets.
  *
  * A content filter user space agents gets a copy of the data and the data is
  * also kept in kernel buffer until the user space agents makes a pass or drop
  * filter agent until an ultimate pass or drop decision is made by the
  * user space filter agent.
  *
- * It should be noted that messages about many TCP/IP sockets can be multiplexed
+ * It should be noted that messages about many INET/INET6 sockets can be multiplexed
  * over a single kernel control socket.
  *
  * Notes:
- * - The current implementation is limited to TCP sockets.
+ * - The current implementation supports all INET/INET6 sockets (e.g. TCP,
+ *   UDP, ICMP, etc).
  * - The current implementation supports up to two simultaneous content filters
- *   for the sake of simplicity of the implementation.
+ *   for iOS devices and eight simultaneous content filters for OSX.
  *
  *
  * NECP FILTER CONTROL UNIT
  *
  * A user space filter agent uses the Network Extension Control Policy (NECP)
- * database to specify which TCP/IP sockets need to be filtered. The NECP
+ * database to specify which INET/INET6 sockets need to be filtered. The NECP
  * criteria may be based on a variety of properties like user ID or proc UUID.
  *
  * The NECP "filter control unit" is used by the socket content filter subsystem
- * to deliver the relevant TCP/IP content information to the appropriate
+ * to deliver the relevant INET/INET6 content information to the appropriate
  * user space filter agent via its kernel control socket instance.
  * This works as follows:
  *
  *    content filter kernel control socket via the socket option
  *    CFIL_OPT_NECP_CONTROL_UNIT.
  *
- * 3) The NECP database is consulted to find out if a given TCP/IP socket
+ * 3) The NECP database is consulted to find out if a given INET/INET6 socket
  *    needs to be subjected to content filtering and returns the corresponding
  *    NECP filter control unit  -- the NECP filter control unit is actually
- *    stored in the TCP/IP socket structure so the NECP lookup is really simple.
+ *    stored in the INET/INET6 socket structure so the NECP lookup is really simple.
  *
  * 4) The NECP filter control unit is then used to find the corresponding
  *    kernel control socket instance.
  *
- * Note: NECP currently supports a single filter control unit per TCP/IP socket
+ * Note: NECP currently supports a single filter control unit per INET/INET6 socket
  *       but this restriction may be soon lifted.
  *
  *
  * communicate over the kernel control socket via an asynchronous
  * messaging protocol (this is not a request-response protocol).
  * The socket content filter subsystem sends event messages to the user
- * space filter agent about the TCP/IP sockets it is interested to filter.
+ * space filter agent about the INET/INET6 sockets it is interested to filter.
  * The user space filter agent sends action messages to either allow
  * data to pass or to disallow the data flow (and drop the connection).
  *
  * All messages over a content filter kernel control socket share the same
  * common header of type "struct cfil_msg_hdr". The message type tells if
  * it's a event message "CFM_TYPE_EVENT" or a action message "CFM_TYPE_ACTION".
- * The message header field "cfm_sock_id" identifies a given TCP/IP socket.
+ * The message header field "cfm_sock_id" identifies a given INET/INET6 flow.
+ * For TCP, flows are per-socket.  For UDP and other datagram protocols, there
+ * could be multiple flows per socket.
+ *
  * Note the message header length field may be padded for alignment and can
  * be larger than the actual content of the message.
  * The field "cfm_op" describe the kind of event or action.
  *
  * Here are the kinds of content filter events:
- * - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
- * - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
- * - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
- * - CFM_OP_DATA_IN: A span of data is being or received on a TCP/IP socket
+ * - CFM_OP_SOCKET_ATTACHED: a new INET/INET6 socket is being filtered
+ * - CFM_OP_SOCKET_CLOSED: An INET/INET6 socket is closed
+ * - CFM_OP_DATA_OUT: A span of data is being sent on an INET/INET6 socket
+ * - CFM_OP_DATA_IN: A span of data is being received on an INET/INET6 socket
  *
  *
  * EVENT MESSAGES
  *
  * The CFM_OP_DATA_UPDATE action messages let the user space filter
  * agent allow data to flow up to the specified pass offset -- there
- * is a pass offset for outgoing data and  a pass offset for incoming data.
- * When a new TCP/IP socket is attached to the content filter, each pass offset
- * is initially set to 0 so not data is allowed to pass by default.
- * When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
+ * is a pass offset for outgoing data and a pass offset for incoming data.
+ * When a new INET/INET6 socket is attached to the content filter and a flow is
+ * created, each pass offset is initially set to 0 so no data is allowed to pass by
+ * default.  When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
  * then the data flow becomes unrestricted.
  *
  * Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
  * A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
  * to tell the kernel how much data it wants to see by using the peek offsets.
  * Just like pass offsets, there is a peek offset for each direction.
- * When a new TCP/IP socket is attached to the content filter, each peek offset
- * is initially set to 0 so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event
- * messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
- * with a greater than 0 peek offset is sent by the user space filter agent.
- * When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
- * then the flow of update data events becomes unrestricted.
+ * When a new INET/INET6 flow is created, each peek offset is initially set to 0
+ * so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages are dispatched by default
+ * until a CFM_OP_DATA_UPDATE action message with a greater than 0 peek offset is sent
+ * by the user space filter agent.  When the peek offset is set to CFM_MAX_OFFSET via
+ * a CFM_OP_DATA_UPDATE then the flow of update data events becomes unrestricted.
  *
  * Note that peek offsets cannot be smaller than the corresponding pass offset.
  * Also a peek offsets cannot be smaller than the corresponding end offset
  * to set a too small peek value is silently ignored.
  *
  *
- * PER SOCKET "struct cfil_info"
+ * PER FLOW "struct cfil_info"
  *
- * As soon as a TCP/IP socket gets attached to a content filter, a
+ * As soon as an INET/INET6 socket gets attached to a content filter, a
  * "struct cfil_info" is created to hold the content filtering state for this
- * socket.
+ * socket.  For UDP and other datagram protocols, as soon as traffic is seen for
+ * each new flow identified by its 4-tuple of source address/port and destination
+ * address/port, a "struct cfil_info" is created.  Each datagram socket may
+ * have multiple flows maintained in a hash table of "struct cfil_info" entries.
  *
  * The content filtering state is made of the following information
  * for each direction:
  *
  * CONTENT FILTER QUEUES
  *
- * Data that is being filtered is steered away from the TCP/IP socket buffer
+ * Data that is being filtered is steered away from the INET/INET6 socket buffer
  * and instead will sit in one of three content filter queues until the data
- * can be re-injected into the TCP/IP socket buffer.
+ * can be re-injected into the INET/INET6 socket buffer.
  *
  * A content filter queue is represented by "struct cfil_queue" that contains
  * a list of mbufs and the start and end offset of the data span of
  * c) The "cfi_inject_q" of "struct cfil_info"
  *
  * Note: The sequence (a),(b) may be repeated several times if there is more
- * than one content filter attached to the TCP/IP socket.
+ * than one content filter attached to the INET/INET6 socket.
  *
  * The "cfe_ctl_q" queue holds data than cannot be delivered to the
  * kernel conntrol socket for two reasons:
  *
  * The "cfi_inject_q" queue holds data that has been fully allowed to pass
  * by the user space filter agent and that needs to be re-injected into the
- * TCP/IP socket.
+ * INET/INET6 socket.
  *
  *
  * IMPACT ON FLOW CONTROL
  *
  * An essential aspect of the content filer subsystem is to minimize the
- * impact on flow control of the TCP/IP sockets being filtered.
+ * impact on flow control of the INET/INET6 sockets being filtered.
  *
  * The processing overhead of the content filtering may have an effect on
  * flow control by adding noticeable delays and cannot be eliminated --
  * The amount of data being filtered is kept in buffers while waiting for
  * a decision by the user space filter agent. This amount of data pending
  * needs to be subtracted from the amount of data available in the
- * corresponding TCP/IP socket buffer. This is done by modifying
+ * corresponding INET/INET6 socket buffer. This is done by modifying
  * sbspace() and tcp_sbspace() to account for amount of data pending
  * in the content filter.
  *
  * cfil read-write lock held as shared so it can be re-entered from multiple
  * threads.
  *
- * The per TCP/IP socket content filterstate -- "struct cfil_info" -- is
+ * The per INET/INET6 socket content filter state -- "struct cfil_info" -- is
  * protected by the socket lock.
  *
- * A TCP/IP socket lock cannot be taken while the cfil read-write lock
+ * An INET/INET6 socket lock cannot be taken while the cfil read-write lock
  * is held. That's why we have some sequences where we drop the cfil read-write
- * lock before taking the TCP/IP lock.
+ * lock before taking the INET/INET6 lock.
  *
- * It is also important to lock the TCP/IP socket buffer while the content
+ * It is also important to lock the INET/INET6 socket buffer while the content
  * filter is modifying the amount of pending data. Otherwise the calculations
  * in sbspace() and tcp_sbspace()  could be wrong.
  *
  * To read the other fields of "struct content_filter" we have to take
  * "cfil_lck_rw" in shared mode.
  *
+ * DATAGRAM SPECIFICS:
+ *
+ * The socket content filter supports all INET/INET6 protocols.  However
+ * the treatments for TCP sockets and for datagram (UDP, ICMP, etc) sockets
+ * are slightly different.
+ *
+ * Each datagram socket may have multiple flows.  Each flow is identified
+ * by the flow's source address/port and destination address/port tuple
+ * and is represented as a "struct cfil_info" entry.  For each socket,
+ * a hash table is used to maintain the collection of flows under that socket.
+ *
+ * Each datagram flow is uniquely identified by its "struct cfil_info" cfi_sock_id.
+ * The highest 32-bits of the cfi_sock_id contains the socket's so_gencnt.  This portion
+ * of the cfi_sock_id is used to locate the socket during socket lookup.  The lowest 32-bits
+ * of the cfi_sock_id contains a hash of the flow's 4-tuple.  This portion of the cfi_sock_id
+ * is used as the hash value for the flow hash table lookup within the parent socket.
+ *
+ * Since datagram sockets may not be connected, flow states may not be maintained in the
+ * socket structures and thus have to be saved for each packet.  These saved states will be
+ * used for both outgoing and incoming reinjections.  For outgoing packets, destination
+ * address/port as well as the current socket states will be saved.  During reinjection,
+ * these saved states will be used instead.  For incoming packets, control and address
+ * mbufs will be chained to the data.  During reinjection, the whole chain will be queued
+ * onto the incoming socket buffer.
  *
  * LIMITATIONS
  *
- * - For TCP sockets only
+ * - Support all INET/INET6 sockets, such as TCP, UDP, ICMP, etc
  *
  * - Does not support TCP unordered messages
  */
 /*
  *     TO DO LIST
  *
- *     SOONER:
- *
  *     Deal with OOB
  *
- *     LATER:
- *
- *     If support datagram, enqueue control and address mbufs as well
  */
 
 #include <sys/types.h>
 #include <net/content_filter.h>
 #include <net/content_filter_crypto.h>
 
+#define _IP_VHL
+#include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #define MAX_CONTENT_FILTER 8
 #endif
 
+extern struct inpcbinfo ripcbinfo;
 struct cfil_entry;
 
 /*
@@ -477,6 +505,7 @@ struct cfil_info {
        uint64_t                cfi_byte_outbound_count;
 
        boolean_t               cfi_isSignatureLatest;                  /* Indicates if signature covers latest flow attributes */
+       u_int32_t               cfi_debug;
        struct cfi_buf {
                /*
                 * cfi_pending_first and cfi_pending_last describe the total
@@ -535,7 +564,24 @@ TAILQ_HEAD(cfil_sock_head_stats, cfil_info) cfil_sock_head_stats;
 LIST_HEAD(cfilhashhead, cfil_hash_entry);
 #define CFILHASHSIZE 16
 #define CFIL_HASH(laddr, faddr, lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport))
+
+#define IS_INET(so) (so && so->so_proto && so->so_proto->pr_domain && (so->so_proto->pr_domain->dom_family == AF_INET || so->so_proto->pr_domain->dom_family == AF_INET6))
+#define IS_TCP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_STREAM && so->so_proto->pr_protocol == IPPROTO_TCP)
 #define IS_UDP(so) (so && so->so_proto && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP)
+#define IS_ICMP(so) (so && so->so_proto && (so->so_proto->pr_type == SOCK_RAW || so->so_proto->pr_type == SOCK_DGRAM) && \
+                                          (so->so_proto->pr_protocol == IPPROTO_ICMP || so->so_proto->pr_protocol == IPPROTO_ICMPV6))
+#define IS_RAW(so)  (so && so->so_proto && so->so_proto->pr_type == SOCK_RAW  && so->so_proto->pr_protocol == IPPROTO_RAW)
+
+#if !TARGET_OS_OSX && !defined(XNU_TARGET_OS_OSX)
+#define IS_IP_DGRAM(so) (IS_INET(so) && IS_UDP(so))
+#else
+#define IS_IP_DGRAM(so) (IS_INET(so) && !IS_TCP(so))
+#endif
+
+#define OPTIONAL_IP_HEADER(so) (!IS_TCP(so) && !IS_UDP(so))
+#define GET_SO_PROTO(so) ((so && so->so_proto) ? so->so_proto->pr_protocol : IPPROTO_MAX)
+#define IS_INP_V6(inp) (inp && (inp->inp_vflag & INP_IPV6))
+
 #define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \
                                                                  ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))))
 #define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \
@@ -632,6 +678,7 @@ struct cfil_tag {
        union sockaddr_in_4_6 cfil_faddr;
        uint32_t cfil_so_state_change_cnt;
        short cfil_so_options;
+       int cfil_inp_flags;
 };
 
 #define    CFIL_HASH_ENTRY_ZONE_NAME    "cfil_entry_hash"
@@ -745,11 +792,12 @@ static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
 static unsigned int cfil_data_length(struct mbuf *, int *, int *);
 static errno_t cfil_db_init(struct socket *);
 static void cfil_db_free(struct socket *so);
-struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
+struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *, boolean_t);
 struct cfil_hash_entry *cfil_db_lookup_entry_with_sockid(struct cfil_db *, u_int64_t);
 struct cfil_hash_entry *cfil_db_add_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
+void cfil_db_update_entry_local(struct cfil_db *, struct cfil_hash_entry *, struct sockaddr *);
 void cfil_db_delete_entry(struct cfil_db *, struct cfil_hash_entry *);
-struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *);
+struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *, int);
 struct cfil_info *cfil_db_get_cfil_info(struct cfil_db *, cfil_sock_id_t);
 static errno_t cfil_sock_udp_handle_data(bool, struct socket *, struct sockaddr *, struct sockaddr *,
     struct mbuf *, struct mbuf *, uint32_t);
@@ -772,7 +820,8 @@ void cfil_info_show(void);
 bool cfil_info_idle_timed_out(struct cfil_info *, int, u_int32_t);
 bool cfil_info_action_timed_out(struct cfil_info *, int);
 bool cfil_info_buffer_threshold_exceeded(struct cfil_info *);
-struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *);
+struct m_tag *cfil_dgram_save_socket_state(struct cfil_info *, struct mbuf *);
+boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags);
 static void cfil_udp_gc_thread_func(void *, wait_result_t);
 static void cfil_info_udp_expire(void *, wait_result_t);
 static bool fill_cfil_hash_entry_from_address(struct cfil_hash_entry *, bool, struct sockaddr *);
@@ -1600,6 +1649,21 @@ find_udp:
        }
        lck_rw_done(pcbinfo->ipi_lock);
 
+       pcbinfo = &ripcbinfo;
+       lck_rw_lock_shared(pcbinfo->ipi_lock);
+       LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
+               if (inp->inp_state != INPCB_STATE_DEAD &&
+                   inp->inp_socket != NULL &&
+                   inp->inp_socket->so_cfil_db != NULL &&
+                   (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
+                       if (cfil_socket_safe_lock(inp)) {
+                               so = inp->inp_socket;
+                       }
+                       break;
+               }
+       }
+       lck_rw_done(pcbinfo->ipi_lock);
+
 done:
        if (so == NULL) {
                OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
@@ -1872,6 +1936,11 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                error = EINVAL;
                goto unlock;
        }
+
+       if (cfil_info->cfi_debug) {
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED MSG FROM FILTER");
+       }
+
        entry = &cfil_info->cfi_entries[kcunit - 1];
        if (entry->cfe_filter == NULL) {
                CFIL_LOG(LOG_NOTICE, "so %llx no filter",
@@ -1897,6 +1966,16 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
 
        switch (msghdr->cfm_op) {
        case CFM_OP_DATA_UPDATE:
+
+               if (cfil_info->cfi_debug) {
+                       cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DATA_UPDATE");
+                       CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
+                           (uint64_t)VM_KERNEL_ADDRPERM(so),
+                           cfil_info->cfi_sock_id,
+                           action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
+                           action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
+               }
+
 #if VERDICT_DEBUG
                CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
                    (uint64_t)VM_KERNEL_ADDRPERM(so),
@@ -1941,6 +2020,15 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                break;
 
        case CFM_OP_DROP:
+               if (cfil_info->cfi_debug) {
+                       cfil_info_log(LOG_ERR, cfil_info, "CFIL: RECEIVED CFM_OP_DROP");
+                       CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
+                           (uint64_t)VM_KERNEL_ADDRPERM(so),
+                           cfil_info->cfi_sock_id,
+                           action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
+                           action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
+               }
+
 #if VERDICT_DEBUG
                CFIL_LOG(LOG_ERR, "CFIL: VERDICT DROP RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
                    (uint64_t)VM_KERNEL_ADDRPERM(so),
@@ -2830,7 +2918,7 @@ done:
 errno_t
 cfil_sock_detach(struct socket *so)
 {
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                cfil_db_free(so);
                return 0;
        }
@@ -3025,7 +3113,7 @@ cfil_dispatch_closed_event_sign(cfil_crypto_state_t crypto_state,
                boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT);
                union sockaddr_in_4_6 *src = outgoing ? &data.local : &data.remote;
                union sockaddr_in_4_6 *dst = outgoing ? &data.remote : &data.local;
-               cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, inp->inp_vflag & INP_IPV4, outgoing);
+               cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, !IS_INP_V6(inp), outgoing);
        }
 
        data.byte_count_in = cfil_info->cfi_byte_inbound_count;
@@ -3135,7 +3223,7 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info,
        if (hash_entry_ptr != NULL) {
                cfil_fill_event_msg_addresses(hash_entry_ptr, inp,
                    &msg_attached.cfs_src, &msg_attached.cfs_dst,
-                   inp->inp_vflag & INP_IPV4, conn_dir == CFS_CONNECTION_DIR_OUT);
+                   !IS_INP_V6(inp), conn_dir == CFS_CONNECTION_DIR_OUT);
        }
        msg_attached.cfs_conn_dir = conn_dir;
 
@@ -3157,6 +3245,10 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info,
                }
        }
 
+       if (cfil_info->cfi_debug) {
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING ATTACH UP");
+       }
+
        cfil_dispatch_attach_event_sign(entry->cfe_filter->cf_crypto_state, cfil_info, &msg_attached);
 
 #if LIFECYCLE_DEBUG
@@ -3258,6 +3350,10 @@ cfil_dispatch_disconnect_event(struct socket *so, struct cfil_info *cfil_info, u
                goto done;
        }
 
+       if (cfil_info->cfi_debug) {
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DISCONNECT UP");
+       }
+
 #if LIFECYCLE_DEBUG
        cfil_info_log(LOG_ERR, cfil_info, outgoing ?
            "CFIL: LIFECYCLE: OUT - SENDING DISCONNECT UP":
@@ -3371,6 +3467,10 @@ cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int k
 
        cfil_dispatch_closed_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, &msg_closed);
 
+       if (cfil_info->cfi_debug) {
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING CLOSED UP");
+       }
+
 #if LIFECYCLE_DEBUG
        CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: <sock id %llu> op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec);
 #endif
@@ -3507,6 +3607,7 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_
        struct cfe_buf *entrybuf;
        struct content_filter *cfc;
        struct timeval tv;
+       int inp_flags = 0;
 
        cfil_rw_lock_shared(&cfil_lck_rw);
 
@@ -3575,6 +3676,24 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_
        data_req->cfd_start_offset = entrybuf->cfe_peeked;
        data_req->cfd_end_offset = entrybuf->cfe_peeked + copylen;
 
+       data_req->cfd_flags = 0;
+       if (OPTIONAL_IP_HEADER(so)) {
+               /*
+                * For non-UDP/TCP traffic, indicate to filters if optional
+                * IP header is present:
+                *      outgoing - indicate according to INP_HDRINCL flag
+                *      incoming - For IPv4 only, stripping of IP header is
+                *                 optional.  But for CFIL, we delay stripping
+                *                 at rip_input.  So CFIL always expects IP
+                *                 frames. IP header will be stripped according
+                *                 to INP_STRIPHDR flag later at reinjection.
+                */
+               if ((!outgoing && !IS_INP_V6(inp)) ||
+                   (outgoing && cfil_dgram_peek_socket_state(data, &inp_flags) && (inp_flags & INP_HDRINCL))) {
+                       data_req->cfd_flags |= CFD_DATA_FLAG_IP_HEADER;
+               }
+       }
+
        /*
         * Copy address/port into event msg.
         * For non connected sockets need to copy addresses from passed
@@ -3582,7 +3701,11 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_
         */
        cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
            &data_req->cfc_src, &data_req->cfc_dst,
-           inp->inp_vflag & INP_IPV4, outgoing);
+           !IS_INP_V6(inp), outgoing);
+
+       if (cfil_info->cfi_debug) {
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DATA UP");
+       }
 
        if (cfil_info->cfi_isSignatureLatest == false) {
                cfil_dispatch_data_event_sign(entry->cfe_filter->cf_crypto_state, so, cfil_info, data_req);
@@ -3608,6 +3731,12 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_
            (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen);
 #endif
 
+       if (cfil_info->cfi_debug) {
+               CFIL_LOG(LOG_ERR, "CFIL: VERDICT ACTION: so %llx sockID %llu outgoing %d: mbuf %llx copyoffset %u copylen %u (%s)",
+                   (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen,
+                   data_req->cfd_flags & CFD_DATA_FLAG_IP_HEADER ? "IP HDR" : "NO IP HDR");
+       }
+
 done:
        if (error == ENOBUFS) {
                entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
@@ -3955,6 +4084,9 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou
        struct cfil_queue *inject_q;
        int need_rwakeup = 0;
        int count = 0;
+       struct inpcb *inp = NULL;
+       struct ip *ip = NULL;
+       unsigned int hlen;
 
        if (cfil_info == NULL) {
                return 0;
@@ -3984,10 +4116,13 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou
                datalen = cfil_data_length(data, &mbcnt, &mbnum);
 
 #if DATA_DEBUG
-               CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE INJECT-Q: <%s>: <so %llx> data %llx datalen %u (mbcnt %u)",
-                   remote_addr_ptr ? "UNCONNECTED" : "CONNECTED",
+               CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> data %llx datalen %u (mbcnt %u)",
                    (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt);
 #endif
+               if (cfil_info->cfi_debug) {
+                       CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> data %llx datalen %u (mbcnt %u)",
+                           (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt);
+               }
 
                /* Remove data from queue and adjust stats */
                cfil_queue_remove(inject_q, data, datalen);
@@ -4011,12 +4146,26 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou
                        data->m_flags |= M_SKIPCFIL;
 
                        /*
-                        * NOTE: We currently only support TCP and UDP.
-                        * For RAWIP, MPTCP and message TCP we'll
+                        * NOTE: We currently only support TCP, UDP, ICMP,
+                        * ICMPv6 and RAWIP.  For MPTCP and message TCP we'll
                         * need to call the appropriate sbappendxxx()
                         * of fix sock_inject_data_in()
                         */
-                       if (IS_UDP(so) == TRUE) {
+                       if (IS_IP_DGRAM(so)) {
+                               if (OPTIONAL_IP_HEADER(so)) {
+                                       inp = sotoinpcb(so);
+                                       if (inp && (inp->inp_flags & INP_STRIPHDR)) {
+                                               mbuf_t data_start = cfil_data_start(data);
+                                               if (data_start != NULL && (data_start->m_flags & M_PKTHDR)) {
+                                                       ip = mtod(data_start, struct ip *);
+                                                       hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+                                                       data_start->m_len -= hlen;
+                                                       data_start->m_pkthdr.len -= hlen;
+                                                       data_start->m_data += hlen;
+                                               }
+                                       }
+                               }
+
                                if (sbappendchain(&so->so_rcv, data, 0)) {
                                        need_rwakeup = 1;
                                }
@@ -4042,6 +4191,10 @@ cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int ou
        CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> injected %d",
            (uint64_t)VM_KERNEL_ADDRPERM(so), count);
 #endif
+       if (cfil_info->cfi_debug) {
+               CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> injected %d",
+                   (uint64_t)VM_KERNEL_ADDRPERM(so), count);
+       }
 
        /* A single wakeup is for several packets is more efficient */
        if (need_rwakeup) {
@@ -4662,7 +4815,7 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s
        cfi_buf->cfi_pending_mbcnt += mbcnt;
        cfi_buf->cfi_pending_mbnum += mbnum;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                if (cfi_buf->cfi_pending_mbnum > cfil_udp_gc_mbuf_num_max ||
                    cfi_buf->cfi_pending_mbcnt > cfil_udp_gc_mbuf_cnt_max) {
                        cfi_buf->cfi_tail_drop_cnt++;
@@ -4697,12 +4850,12 @@ cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, s
                        // Is cfil attached to this filter?
                        kcunit = CFI_ENTRY_KCUNIT(cfil_info, iter_entry);
                        if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) {
-                               if (IS_UDP(so) && chain == NULL) {
-                                       /* UDP only:
+                               if (IS_IP_DGRAM(so) && chain == NULL) {
+                                       /* Datagrams only:
                                         * Chain addr (incoming only TDB), control (optional) and data into one chain.
                                         * This full chain will be reinjected into socket after recieving verdict.
                                         */
-                                       (void) cfil_udp_save_socket_state(cfil_info, data);
+                                       (void) cfil_dgram_save_socket_state(cfil_info, data);
                                        chain = sbconcat_mbufs(NULL, outgoing ? NULL : to, data, control);
                                        if (chain == NULL) {
                                                return ENOBUFS;
@@ -4741,7 +4894,7 @@ cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
 {
        int error = 0;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                return cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags);
        }
 
@@ -4796,7 +4949,7 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from,
 {
        int error = 0;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                return cfil_sock_udp_handle_data(FALSE, so, NULL, from, data, control, flags);
        }
 
@@ -4846,7 +4999,7 @@ cfil_sock_shutdown(struct socket *so, int *how)
 {
        int error = 0;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                return cfil_sock_udp_shutdown(so, how);
        }
 
@@ -4932,7 +5085,7 @@ cfil_sock_is_closed(struct socket *so)
        errno_t error = 0;
        int kcunit;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                cfil_sock_udp_is_closed(so);
                return;
        }
@@ -4977,7 +5130,7 @@ cfil_sock_notify_shutdown(struct socket *so, int how)
        errno_t error = 0;
        int kcunit;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                cfil_sock_udp_notify_shutdown(so, how, 0, 0);
                return;
        }
@@ -5010,7 +5163,7 @@ cfil_filters_attached(struct socket *so)
        uint32_t kcunit;
        int attached = 0;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                return cfil_filters_udp_attached(so, FALSE);
        }
 
@@ -5051,7 +5204,7 @@ cfil_sock_close_wait(struct socket *so)
        struct timespec ts;
        int error;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                cfil_sock_udp_close_wait(so);
                return;
        }
@@ -5118,7 +5271,7 @@ cfil_sock_data_pending(struct sockbuf *sb)
        struct socket *so = sb->sb_so;
        uint64_t pending = 0;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                return cfil_sock_udp_data_pending(sb, FALSE);
        }
 
@@ -5160,7 +5313,7 @@ cfil_sock_data_space(struct sockbuf *sb)
        struct socket *so = sb->sb_so;
        uint64_t pending = 0;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                return cfil_sock_udp_data_pending(sb, TRUE);
        }
 
@@ -5205,7 +5358,7 @@ cfil_sock_buf_update(struct sockbuf *sb)
        int error;
        struct socket *so = sb->sb_so;
 
-       if (IS_UDP(so)) {
+       if (IS_IP_DGRAM(so)) {
                cfil_sock_udp_buf_update(sb);
                return;
        }
@@ -5485,8 +5638,9 @@ cfil_hash_entry_log(int level, struct socket *so, struct cfil_hash_entry *entry,
                return;
        }
 
-       CFIL_LOG(level, "<%s>: <UDP so %llx, entry %p, sockID %llu> lport %d fport %d laddr %s faddr %s",
+       CFIL_LOG(level, "<%s>: <%s(%d) so %llx, entry %p, sockID %llu> lport %d fport %d laddr %s faddr %s",
            msg,
+           IS_UDP(so) ? "UDP" : "proto", GET_SO_PROTO(so),
            (uint64_t)VM_KERNEL_ADDRPERM(so), entry, sockId,
            ntohs(entry->cfentry_lport), ntohs(entry->cfentry_fport), local, remote);
 }
@@ -5686,25 +5840,25 @@ fill_cfil_hash_entry_from_inp(struct cfil_hash_entry *entry, bool isLocal, struc
                return FALSE;
        }
 
-       if (inp->inp_vflag & INP_IPV4) {
+       if (inp->inp_vflag & INP_IPV6) {
                if (isLocal == TRUE) {
                        entry->cfentry_lport = inp->inp_lport;
-                       entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr;
+                       entry->cfentry_laddr.addr6 = inp->in6p_laddr;
                } else {
                        entry->cfentry_fport = inp->inp_fport;
-                       entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr;
+                       entry->cfentry_faddr.addr6 = inp->in6p_faddr;
                }
-               entry->cfentry_family = AF_INET;
+               entry->cfentry_family = AF_INET6;
                return TRUE;
-       } else if (inp->inp_vflag & INP_IPV6) {
+       } else if (inp->inp_vflag & INP_IPV4) {
                if (isLocal == TRUE) {
                        entry->cfentry_lport = inp->inp_lport;
-                       entry->cfentry_laddr.addr6 = inp->in6p_laddr;
+                       entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr;
                } else {
                        entry->cfentry_fport = inp->inp_fport;
-                       entry->cfentry_faddr.addr6 = inp->in6p_faddr;
+                       entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr;
                }
-               entry->cfentry_family = AF_INET6;
+               entry->cfentry_family = AF_INET;
                return TRUE;
        }
        return FALSE;
@@ -5775,12 +5929,13 @@ cfil_db_lookup_entry_with_sockid(struct cfil_db *db, u_int64_t sock_id)
 }
 
 struct cfil_hash_entry *
-cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote)
+cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote, boolean_t remoteOnly)
 {
-       struct cfil_hash_entry matchentry;
+       struct cfil_hash_entry matchentry = { };
        struct cfil_hash_entry *nextentry = NULL;
        struct inpcb *inp = sotoinpcb(db->cfdb_so);
        u_int32_t hashkey_faddr = 0, hashkey_laddr = 0;
+       u_int16_t hashkey_fport = 0, hashkey_lport = 0;
        int inp_hash_element = 0;
        struct cfilhashhead *cfilhash = NULL;
 
@@ -5790,10 +5945,12 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr
                goto done;
        }
 
-       if (local != NULL) {
-               fill_cfil_hash_entry_from_address(&matchentry, TRUE, local);
-       } else {
-               fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp);
+       if (remoteOnly == false) {
+               if (local != NULL) {
+                       fill_cfil_hash_entry_from_address(&matchentry, TRUE, local);
+               } else {
+                       fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp);
+               }
        }
        if (remote != NULL) {
                fill_cfil_hash_entry_from_address(&matchentry, FALSE, remote);
@@ -5804,16 +5961,18 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr
 #if INET6
        if (inp->inp_vflag & INP_IPV6) {
                hashkey_faddr = matchentry.cfentry_faddr.addr6.s6_addr32[3];
-               hashkey_laddr = matchentry.cfentry_laddr.addr6.s6_addr32[3];
+               hashkey_laddr = (remoteOnly == false) ? matchentry.cfentry_laddr.addr6.s6_addr32[3] : 0;
        } else
 #endif /* INET6 */
        {
                hashkey_faddr = matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr;
-               hashkey_laddr = matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr;
+               hashkey_laddr = (remoteOnly == false) ? matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr : 0;
        }
 
-       inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr,
-           matchentry.cfentry_lport, matchentry.cfentry_fport);
+       hashkey_fport = matchentry.cfentry_fport;
+       hashkey_lport = (remoteOnly == false) ? matchentry.cfentry_lport : 0;
+
+       inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr, hashkey_lport, hashkey_fport);
        inp_hash_element &= db->cfdb_hashmask;
 
        cfilhash = &db->cfdb_hashbase[inp_hash_element];
@@ -5821,9 +5980,9 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr
        LIST_FOREACH(nextentry, cfilhash, cfentry_link) {
 #if INET6
                if ((inp->inp_vflag & INP_IPV6) &&
-                   nextentry->cfentry_lport == matchentry.cfentry_lport &&
+                   (remoteOnly || nextentry->cfentry_lport == matchentry.cfentry_lport) &&
                    nextentry->cfentry_fport == matchentry.cfentry_fport &&
-                   IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6) &&
+                   (remoteOnly || IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6)) &&
                    IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_faddr.addr6, &matchentry.cfentry_faddr.addr6)) {
 #if DATA_DEBUG
                        cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V6 found entry");
@@ -5831,9 +5990,9 @@ cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr
                        return nextentry;
                } else
 #endif /* INET6 */
-               if (nextentry->cfentry_lport == matchentry.cfentry_lport &&
+               if ((remoteOnly || nextentry->cfentry_lport == matchentry.cfentry_lport) &&
                    nextentry->cfentry_fport == matchentry.cfentry_fport &&
-                   nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr &&
+                   (remoteOnly || nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr) &&
                    nextentry->cfentry_faddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr) {
 #if DATA_DEBUG
                        cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V4 found entry");
@@ -5925,6 +6084,27 @@ done:
        return entry;
 }
 
+void
+cfil_db_update_entry_local(struct cfil_db *db, struct cfil_hash_entry *entry, struct sockaddr *local)
+{
+       struct inpcb *inp = sotoinpcb(db->cfdb_so);
+
+       CFIL_LOG(LOG_INFO, "");
+
+       if (inp == NULL || entry == NULL) {
+               return;
+       }
+
+       if (local != NULL) {
+               fill_cfil_hash_entry_from_address(entry, TRUE, local);
+       } else {
+               fill_cfil_hash_entry_from_inp(entry, TRUE, inp);
+       }
+       cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, entry, 0, "CFIL: cfil_db_add_entry: local updated");
+
+       return;
+}
+
 struct cfil_info *
 cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id)
 {
@@ -5952,7 +6132,7 @@ cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id)
 }
 
 struct cfil_hash_entry *
-cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote)
+cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote, int debug)
 {
        struct cfil_hash_entry *hash_entry = NULL;
 
@@ -5967,7 +6147,16 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out
        }
 
        // See if flow already exists.
-       hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote);
+       hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote, false);
+       if (hash_entry == NULL) {
+               // No match with both local and remote, try match with remote only
+               hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote, true);
+               if (hash_entry != NULL) {
+                       // Simply update the local address into the original flow, keeping
+                       // its sockId and flow_hash unchanged.
+                       cfil_db_update_entry_local(so->so_cfil_db, hash_entry, local);
+               }
+       }
        if (hash_entry != NULL) {
                return hash_entry;
        }
@@ -5987,6 +6176,7 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out
                return NULL;
        }
        hash_entry->cfentry_cfil->cfi_dir = outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN;
+       hash_entry->cfentry_cfil->cfi_debug = debug;
 
 #if LIFECYCLE_DEBUG
        cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED");
@@ -6010,6 +6200,10 @@ cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool out
        /* Hold a reference on the socket for each flow */
        so->so_usecount++;
 
+       if (debug) {
+               cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED");
+       }
+
        error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, 0,
            outgoing ? CFS_CONNECTION_DIR_OUT : CFS_CONNECTION_DIR_IN);
        /* We can recover from flow control or out of memory errors */
@@ -6031,6 +6225,7 @@ cfil_sock_udp_handle_data(bool outgoing, struct socket *so,
        uint32_t filter_control_unit;
        struct cfil_hash_entry *hash_entry = NULL;
        struct cfil_info *cfil_info = NULL;
+       int debug = 0;
 
        socket_lock_assert_owned(so);
 
@@ -6061,7 +6256,7 @@ cfil_sock_udp_handle_data(bool outgoing, struct socket *so,
                return error;
        }
 
-       hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote);
+       hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote, debug);
        if (hash_entry == NULL || hash_entry->cfentry_cfil == NULL) {
                CFIL_LOG(LOG_ERR, "CFIL: Falied to create UDP flow");
                return EPIPE;
@@ -6739,7 +6934,7 @@ cfil_info_udp_expire(void *v, wait_result_t w)
                        break;
                }
 
-               if (IS_UDP(cfil_info->cfi_so)) {
+               if (IS_IP_DGRAM(cfil_info->cfi_so)) {
                        if (cfil_info_idle_timed_out(cfil_info, UDP_FLOW_GC_IDLE_TO, current_time) ||
                            cfil_info_action_timed_out(cfil_info, UDP_FLOW_GC_ACTION_TO) ||
                            cfil_info_buffer_threshold_exceeded(cfil_info)) {
@@ -6808,17 +7003,20 @@ go_sleep:
 }
 
 struct m_tag *
-cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
+cfil_dgram_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
 {
        struct m_tag *tag = NULL;
        struct cfil_tag *ctag = NULL;
        struct cfil_hash_entry *hash_entry = NULL;
+       struct inpcb *inp = NULL;
 
        if (cfil_info == NULL || cfil_info->cfi_so == NULL ||
            cfil_info->cfi_hash_entry == NULL || m == NULL || !(m->m_flags & M_PKTHDR)) {
                return NULL;
        }
 
+       inp = sotoinpcb(cfil_info->cfi_so);
+
        /* Allocate a tag */
        tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP,
            sizeof(struct cfil_tag), M_DONTWAIT, m);
@@ -6827,6 +7025,7 @@ cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
                ctag = (struct cfil_tag*)(tag + 1);
                ctag->cfil_so_state_change_cnt = cfil_info->cfi_so->so_state_change_cnt;
                ctag->cfil_so_options = cfil_info->cfi_so->so_options;
+               ctag->cfil_inp_flags = inp ? inp->inp_flags : 0;
 
                hash_entry = cfil_info->cfi_hash_entry;
                if (hash_entry->cfentry_family == AF_INET6) {
@@ -6845,8 +7044,8 @@ cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
 }
 
 struct m_tag *
-cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options,
-    struct sockaddr **faddr)
+cfil_dgram_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options,
+    struct sockaddr **faddr, int *inp_flags)
 {
        struct m_tag *tag = NULL;
        struct cfil_tag *ctag = NULL;
@@ -6863,6 +7062,9 @@ cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *opt
                if (faddr) {
                        *faddr = (struct sockaddr *) &ctag->cfil_faddr;
                }
+               if (inp_flags) {
+                       *inp_flags = ctag->cfil_inp_flags;
+               }
 
                /*
                 * Unlink tag and hand it over to caller.
@@ -6874,6 +7076,23 @@ cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *opt
        return NULL;
 }
 
+boolean_t
+cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags)
+{
+       struct m_tag *tag = NULL;
+       struct cfil_tag *ctag = NULL;
+
+       tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, NULL);
+       if (tag) {
+               ctag = (struct cfil_tag *)(tag + 1);
+               if (inp_flags) {
+                       *inp_flags = ctag->cfil_inp_flags;
+               }
+               return true;
+       }
+       return false;
+}
+
 static int
 cfil_dispatch_stats_event_locked(int kcunit, struct cfil_stats_report_buffer *buffer, uint32_t stats_count)
 {
@@ -7057,7 +7276,7 @@ cfil_stats_collect_flow_stats_for_filter(int kcunit,
                                                union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL;
                                                union sockaddr_in_4_6 *dst = outgoing ? NULL : &cfil_info->cfi_so_attach_laddr;
                                                cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp,
-                                                   src, dst, inp->inp_vflag & INP_IPV4, outgoing);
+                                                   src, dst, !IS_INP_V6(inp), outgoing);
                                        }
                                }
 
index e3829bf0223bc09cf1d31246330aa06c36a64408..2944eba1e10e0e3c4a7447868fe123e971fcc381 100644 (file)
@@ -212,6 +212,11 @@ struct cfil_msg_sock_attached {
        uint32_t                cfs_signature_length;
 };
 
+/*
+ * CFIL data flags
+ */
+#define CFD_DATA_FLAG_IP_HEADER         0x00000001          /* Data includes IP header */
+
 /*
  * struct cfil_msg_data_event
  *
@@ -235,6 +240,7 @@ struct cfil_msg_data_event {
        uint64_t                cfd_end_offset;
        cfil_crypto_signature   cfd_signature;
        uint32_t                cfd_signature_length;
+       uint32_t                cfd_flags;
        /* Actual content data immediatly follows */
 };
 
@@ -525,8 +531,10 @@ extern void cfil_sock_buf_update(struct sockbuf *sb);
 
 extern cfil_sock_id_t cfil_sock_id_from_socket(struct socket *so);
 
-extern struct m_tag *cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt,
-    short *options, struct sockaddr **faddr);
+extern struct m_tag *cfil_dgram_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt,
+    short *options, struct sockaddr **faddr, int *inp_flags);
+extern boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags);
+
 #endif /* BSD_KERNEL_PRIVATE */
 
 __END_DECLS
index 80bc27d78cf753afc7a0e944c6e9dcbd974be84b..e72c0d57c1316b33ef83a770e9199a08142e70e0 100644 (file)
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/net_api_stats.h>
+#include <net/pfvar.h>
 
 #include <netinet/in.h> /* for struct arpcom */
 #include <netinet/in_systm.h>
@@ -2475,7 +2476,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg)
        switch (ifs->if_type) {
        case IFT_ETHER:
                if (strcmp(ifs->if_name, "en") == 0 &&
-                   ifs->if_subfamily == IFNET_SUBFAMILY_WIFI) {
+                   ifs->if_subfamily == IFNET_SUBFAMILY_WIFI &&
+                   (ifs->if_eflags & IFEF_IPV4_ROUTER) == 0) {
                        /* XXX is there a better way to identify Wi-Fi STA? */
                        mac_nat = TRUE;
                }
index d410484cac3196610fc9dfed59718cf23b5e9c6e..c73dadf89b2d019969640902efcae3a1af3f00c8 100644 (file)
@@ -7276,10 +7276,10 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
                                necp_get_parent_cred_result(NULL, info);
                        }
                }
+       }
 
-               if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
-                       info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false;
-               }
+       if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) {
+               info->is_platform_binary = csproc_get_platform_binary(current_proc()) ? true : false;
        }
 
        if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && inp->inp_necp_attributes.inp_account != NULL) {
index 560dee4ab465117f3792fb93975f4537bcb22779..9ba6cc70c3604e98bdb3e0cd66f9d1d2eab48ed0 100644 (file)
@@ -1127,6 +1127,7 @@ pf_rule_copyin(struct pf_rule *src, struct pf_rule *dst, struct proc *p,
        dst->tagname[sizeof(dst->tagname) - 1] = '\0';
        dst->match_tagname[sizeof(dst->match_tagname) - 1] = '\0';
        dst->overload_tblname[sizeof(dst->overload_tblname) - 1] = '\0';
+       dst->owner[sizeof(dst->owner) - 1] = '\0';
 
        dst->cuid = kauth_cred_getuid(p->p_ucred);
        dst->cpid = p->p_pid;
@@ -1158,7 +1159,8 @@ pf_rule_copyout(struct pf_rule *src, struct pf_rule *dst)
        dst->kif = NULL;
        dst->overload_tbl = NULL;
 
-       TAILQ_INIT(&dst->rpool.list);
+       dst->rpool.list.tqh_first = NULL;
+       dst->rpool.list.tqh_last = NULL;
        dst->rpool.cur = NULL;
 
        dst->entries.tqe_prev = NULL;
index 1a405129f0bcc0cb701feb11c81583857e84813f..83d34f1a70c317a769157d02232dabac773af4bd 100644 (file)
@@ -1689,6 +1689,7 @@ flow_divert_send_app_data(struct flow_divert_pcb *fd_cb, mbuf_t data, struct soc
                                                    "sbappendaddr failed. send buffer size = %u, send_window = %u, error = %d\n",
                                                    fd_cb->so->so_snd.sb_cc, fd_cb->send_window, error);
                                        }
+                                       error = 0;
                                } else {
                                        if (!sbappendrecord(&fd_cb->so->so_snd, data)) {
                                                FDLOG(LOG_ERR, fd_cb,
@@ -2104,6 +2105,9 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off
                                FDLOG0(LOG_INFO, fd_cb, "No remote address provided");
                                error = 0;
                        } else {
+                               if (remote_address.ss_len > sizeof(remote_address)) {
+                                       remote_address.ss_len = sizeof(remote_address);
+                               }
                                /* validate the address */
                                if (flow_divert_is_sockaddr_valid((struct sockaddr *)&remote_address)) {
                                        got_remote_sa = TRUE;
@@ -3247,6 +3251,9 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr
        struct flow_divert_pcb  *fd_cb  = so->so_fd_pcb;
        int                                             error   = 0;
        struct inpcb *inp;
+#if CONTENT_FILTER
+       struct m_tag *cfil_tag = NULL;
+#endif
 
        VERIFY((so->so_flags & SOF_FLOW_DIVERT) && so->so_fd_pcb != NULL);
 
@@ -3284,7 +3291,7 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr
                 */
                if (to == NULL && so->so_cfil_db) {
                        struct sockaddr *cfil_faddr = NULL;
-                       struct m_tag *cfil_tag = cfil_udp_get_socket_state(data, NULL, NULL, &cfil_faddr);
+                       cfil_tag = cfil_dgram_get_socket_state(data, NULL, NULL, &cfil_faddr, NULL);
                        if (cfil_tag) {
                                to = (struct sockaddr *)(void *)cfil_faddr;
                        }
@@ -3323,6 +3330,12 @@ done:
        if (control) {
                mbuf_free(control);
        }
+#if CONTENT_FILTER
+       if (cfil_tag) {
+               m_tag_free(cfil_tag);
+       }
+#endif
+
        return error;
 }
 
@@ -3444,7 +3457,12 @@ flow_divert_attach(struct socket *so, uint32_t flow_id, uint32_t ctl_unit)
                        sorwakeup(so);
                }
        }
-       flow_divert_set_protosw(so);
+       if (SOCK_TYPE(so) == SOCK_STREAM) {
+               flow_divert_set_protosw(so);
+       } else if (SOCK_TYPE(so) == SOCK_DGRAM) {
+               flow_divert_set_udp_protosw(so);
+       }
+
        socket_unlock(so, 0);
 
        fd_cb->so = so;
index d097b293fc9f883343c355277dc6ad48a3dcaa21..4f3be1b1d66e1a803d46361d4c533f69189fd675 100644 (file)
@@ -3520,15 +3520,16 @@ inp_update_policy(struct inpcb *inp)
 #if defined(XNU_TARGET_OS_OSX)
        if (so->so_rpid > 0) {
                lookup_uuid = so->so_ruuid;
+               ogencnt = so->so_policy_gencnt;
+               err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
        }
 #endif
-       if (lookup_uuid == NULL) {
+       if (lookup_uuid == NULL || err == ENOENT) {
                lookup_uuid = ((so->so_flags & SOF_DELEGATED) ? so->e_uuid : so->last_uuid);
+               ogencnt = so->so_policy_gencnt;
+               err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
        }
 
-       ogencnt = so->so_policy_gencnt;
-       err = proc_uuid_policy_lookup(lookup_uuid, &pflags, &so->so_policy_gencnt);
-
        /*
         * Discard cached generation count if the entry is gone (ENOENT),
         * so that we go thru the checks below.
index 44804c8a2e08d76dccad0cdecbf1d62dea00c9c9..65adc858acc106b480ac86cc9ef2bf3b907c80c5 100644 (file)
@@ -80,6 +80,7 @@
 
 #include <net/if.h>
 #include <net/route.h>
+#include <net/content_filter.h>
 
 #define _IP_VHL
 #include <netinet/in.h>
@@ -1289,6 +1290,7 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m,
        struct in_ifaddr *ia = NULL;
        int icmplen;
        int error = EINVAL;
+       int inp_flags = inp ? inp->inp_flags : 0;
 
        if (inp == NULL
 #if NECP
@@ -1301,7 +1303,16 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m,
                goto bad;
        }
 
-       if ((inp->inp_flags & INP_HDRINCL) != 0) {
+#if CONTENT_FILTER
+       /*
+        * If socket is subject to Content Filter, get inp_flags from saved state
+        */
+       if (so->so_cfil_db && nam == NULL) {
+               cfil_dgram_peek_socket_state(m, &inp_flags);
+       }
+#endif
+
+       if ((inp_flags & INP_HDRINCL) != 0) {
                /* Expect 32-bit aligned data ptr on strict-align platforms */
                MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
                /*
index 9e8637a928dffa7e5f7a0255ec6639d40457db4d..356298cf7b78949f336209b2a146c268c717d69c 100644 (file)
@@ -2394,6 +2394,11 @@ mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
                goto out_err;
        }
 
+       if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
+               error = EOVERFLOW;
+               goto out_err;
+       }
+
        mpts = mptcp_subflow_alloc();
        if (mpts == NULL) {
                os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
index 0012e449760500252500d5aefc133147d738debd..db728ca45425d8ed9b0f22362aee910dbd9ae869 100644 (file)
@@ -222,7 +222,7 @@ out:
 }
 
 static int
-mptcp_entitlement_check(struct socket *mp_so)
+mptcp_entitlement_check(struct socket *mp_so, uint8_t svctype)
 {
        struct mptses *mpte = mpsotompte(mp_so);
 
@@ -254,7 +254,7 @@ mptcp_entitlement_check(struct socket *mp_so)
        }
 #endif
 
-       if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
+       if (svctype == MPTCP_SVCTYPE_AGGREGATE) {
                if (mptcp_developer_mode) {
                        return 0;
                }
@@ -274,7 +274,7 @@ mptcp_entitlement_check(struct socket *mp_so)
 
 deny:
        os_log_error(mptcp_log_handle, "%s - %lx: MPTCP prohibited on svc %u\n",
-           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_svctype);
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), svctype);
 
        return -1;
 }
@@ -354,7 +354,7 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
        }
 
        if (!(mpte->mpte_flags & MPTE_SVCTYPE_CHECKED)) {
-               if (mptcp_entitlement_check(mp_so) < 0) {
+               if (mptcp_entitlement_check(mp_so, mpte->mpte_svctype) < 0) {
                        error = EPERM;
                        goto out;
                }
@@ -1713,13 +1713,12 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
                                goto err_out;
                        }
 
-                       mpte->mpte_svctype = optval;
-
-                       if (mptcp_entitlement_check(mp_so) < 0) {
+                       if (mptcp_entitlement_check(mp_so, optval) < 0) {
                                error = EACCES;
                                goto err_out;
                        }
 
+                       mpte->mpte_svctype = optval;
                        mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED;
 
                        goto out;
index f13bfb95023568e16ed3d10e5c769fb24546a472..c1063229da3b1ef83443af14603bc112706b3811 100644 (file)
@@ -62,6 +62,7 @@ struct mptses {
        struct mptcb    *mpte_mptcb;            /* ptr to MPTCP PCB */
        TAILQ_HEAD(, mptopt) mpte_sopts;        /* list of socket options */
        TAILQ_HEAD(, mptsub) mpte_subflows;     /* list of subflows */
+#define MPTCP_MAX_NUM_SUBFLOWS 256
        uint16_t        mpte_numflows;          /* # of subflows in list */
        uint16_t        mpte_nummpcapflows;     /* # of MP_CAP subflows */
        sae_associd_t   mpte_associd;           /* MPTCP association ID */
index dc552d9e30ac30ffdf703082edbb0bafc1648b9c..b3838d3e72639392a1ea476f02b893f875730d67 100644 (file)
@@ -86,6 +86,7 @@
 #include <net/if.h>
 #include <net/net_api_stats.h>
 #include <net/route.h>
+#include <net/content_filter.h>
 
 #define _IP_VHL
 #include <netinet/in.h>
@@ -277,7 +278,14 @@ rip_input(struct mbuf *m, int iphlen)
                                                continue;
                                        }
                                }
-                               if (last->inp_flags & INP_STRIPHDR) {
+                               if (last->inp_flags & INP_STRIPHDR
+#if CONTENT_FILTER
+                                   /*
+                                    * If socket is subject to Content Filter, delay stripping until reinject
+                                    */
+                                   && (last->inp_socket->so_cfil_db == NULL)
+#endif
+                                   ) {
                                        n->m_len -= iphlen;
                                        n->m_pkthdr.len -= iphlen;
                                        n->m_data += iphlen;
@@ -330,7 +338,14 @@ rip_input(struct mbuf *m, int iphlen)
                                        goto unlock;
                                }
                        }
-                       if (last->inp_flags & INP_STRIPHDR) {
+                       if (last->inp_flags & INP_STRIPHDR
+#if CONTENT_FILTER
+                           /*
+                            * If socket is subject to Content Filter, delay stripping until reinject
+                            */
+                           && (last->inp_socket->so_cfil_db == NULL)
+#endif
+                           ) {
                                m->m_len -= iphlen;
                                m->m_pkthdr.len -= iphlen;
                                m->m_data += iphlen;
@@ -370,10 +385,74 @@ rip_output(
        struct ip *ip;
        struct inpcb *inp = sotoinpcb(so);
        int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+       int inp_flags = inp ? inp->inp_flags : 0;
        struct ip_out_args ipoa;
        struct ip_moptions *imo;
        int tos = IPTOS_UNSPEC;
        int error = 0;
+#if CONTENT_FILTER
+       struct m_tag *cfil_tag = NULL;
+       bool cfil_faddr_use = false;
+       uint32_t cfil_so_state_change_cnt = 0;
+       short cfil_so_options = 0;
+       int cfil_inp_flags = 0;
+       struct sockaddr *cfil_faddr = NULL;
+       struct sockaddr_in *cfil_sin;
+#endif
+
+#if CONTENT_FILTER
+       /*
+        * If socket is subject to Content Filter and no addr is passed in,
+        * retrieve CFIL saved state from mbuf and use it if necessary.
+        */
+       if (so->so_cfil_db && dst == INADDR_ANY) {
+               cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, &cfil_inp_flags);
+               if (cfil_tag) {
+                       cfil_sin = SIN(cfil_faddr);
+                       flags = (cfil_so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+                       inp_flags = cfil_inp_flags;
+                       if (inp && inp->inp_faddr.s_addr == INADDR_ANY) {
+                               /*
+                                * Socket is unconnected, simply use the saved faddr as 'addr' to go through
+                                * the connect/disconnect logic.
+                                */
+                               dst = cfil_sin->sin_addr.s_addr;
+                       } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
+                           (inp->inp_fport != cfil_sin->sin_port ||
+                           inp->inp_faddr.s_addr != cfil_sin->sin_addr.s_addr)) {
+                               /*
+                                * Socket is connected but socket state and dest addr/port changed.
+                                * We need to use the saved faddr and socket options.
+                                */
+                               cfil_faddr_use = true;
+                       }
+                       m_tag_free(cfil_tag);
+               }
+       }
+#endif
+
+       if (so->so_state & SS_ISCONNECTED) {
+               if (dst != INADDR_ANY) {
+                       if (m != NULL) {
+                               m_freem(m);
+                       }
+                       if (control != NULL) {
+                               m_freem(control);
+                       }
+                       return EISCONN;
+               }
+               dst = cfil_faddr_use ? cfil_sin->sin_addr.s_addr : inp->inp_faddr.s_addr;
+       } else {
+               if (dst == INADDR_ANY) {
+                       if (m != NULL) {
+                               m_freem(m);
+                       }
+                       if (control != NULL) {
+                               m_freem(control);
+                       }
+                       return ENOTCONN;
+               }
+       }
 
        bzero(&ipoa, sizeof(ipoa));
        ipoa.ipoa_boundif = IFSCOPE_NONE;
@@ -436,7 +515,7 @@ rip_output(
         * If the user handed us a complete IP packet, use it.
         * Otherwise, allocate an mbuf for a header and fill it in.
         */
-       if ((inp->inp_flags & INP_HDRINCL) == 0) {
+       if ((inp_flags & INP_HDRINCL) == 0) {
                if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
                        m_freem(m);
                        return EMSGSIZE;
@@ -493,8 +572,12 @@ rip_output(
                /*
                 * We need a route to perform NECP route rule checks
                 */
-               if (net_qos_policy_restricted != 0 &&
-                   ROUTE_UNUSABLE(&inp->inp_route)) {
+               if ((net_qos_policy_restricted != 0 &&
+                   ROUTE_UNUSABLE(&inp->inp_route))
+#if CONTENT_FILTER
+                   || cfil_faddr_use
+#endif
+                   ) {
                        struct sockaddr_in to;
                        struct sockaddr_in from;
                        struct in_addr laddr = ip->ip_src;
@@ -600,6 +683,10 @@ rip_output(
 
                if ((rt->rt_flags & (RTF_MULTICAST | RTF_BROADCAST)) ||
                    inp->inp_socket == NULL ||
+#if CONTENT_FILTER
+                   /* Discard temporary route for cfil case */
+                   cfil_faddr_use ||
+#endif
                    !(inp->inp_socket->so_state & SS_ISCONNECTED)) {
                        rt = NULL;      /* unusable */
                }
@@ -1067,7 +1154,7 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 {
 #pragma unused(flags, p)
        struct inpcb *inp = sotoinpcb(so);
-       u_int32_t dst;
+       u_int32_t dst = INADDR_ANY;
        int error = 0;
 
        if (inp == NULL
@@ -1083,17 +1170,7 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
                goto bad;
        }
 
-       if (so->so_state & SS_ISCONNECTED) {
-               if (nam != NULL) {
-                       error = EISCONN;
-                       goto bad;
-               }
-               dst = inp->inp_faddr.s_addr;
-       } else {
-               if (nam == NULL) {
-                       error = ENOTCONN;
-                       goto bad;
-               }
+       if (nam != NULL) {
                dst = ((struct sockaddr_in *)(void *)nam)->sin_addr.s_addr;
        }
        return rip_output(m, so, dst, control);
index ed16674e5d4b1797a9fc72341e585489bb8c26f1..818f05cce172de7cb6df58529062384807e2b67e 100644 (file)
@@ -1479,6 +1479,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
 #if CONTENT_FILTER
        struct m_tag *cfil_tag = NULL;
        bool cfil_faddr_use = false;
+       bool sndinprog_cnt_used = false;
        uint32_t cfil_so_state_change_cnt = 0;
        short cfil_so_options = 0;
        struct sockaddr *cfil_faddr = NULL;
@@ -1510,7 +1511,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
         * retrieve CFIL saved state from mbuf and use it if necessary.
         */
        if (so->so_cfil_db && !addr) {
-               cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr);
+               cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, NULL);
                if (cfil_tag) {
                        sin = (struct sockaddr_in *)(void *)cfil_faddr;
                        if (inp && inp->inp_faddr.s_addr == INADDR_ANY) {
@@ -1673,6 +1674,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
                fport = ((struct sockaddr_in *)(void *)cfil_faddr)->sin_port;
        }
 #endif
+       inp->inp_sndinprog_cnt++;
+       sndinprog_cnt_used = true;
 
        if (addr) {
                sin = (struct sockaddr_in *)(void *)addr;
@@ -1936,8 +1939,6 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
                ipoa.ipoa_flags |= IPOAF_BOUND_SRCADDR;
        }
 
-       inp->inp_sndinprog_cnt++;
-
        socket_unlock(so, 0);
        error = ip_output(m, inpopts, &ro, soopts, mopts, &ipoa);
        m = NULL;
@@ -1971,14 +1972,6 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
                inp_set_fc_state(inp, adv->code);
        }
 
-       VERIFY(inp->inp_sndinprog_cnt > 0);
-       if (--inp->inp_sndinprog_cnt == 0) {
-               inp->inp_flags &= ~(INP_FC_FEEDBACK);
-               if (inp->inp_sndingprog_waiters > 0) {
-                       wakeup(&inp->inp_sndinprog_cnt);
-               }
-       }
-
        /* Synchronize PCB cached route */
        inp_route_copyin(inp, &ro);
 
@@ -2057,6 +2050,16 @@ release:
                m_tag_free(cfil_tag);
        }
 #endif
+       if (sndinprog_cnt_used) {
+               VERIFY(inp->inp_sndinprog_cnt > 0);
+               if (--inp->inp_sndinprog_cnt == 0) {
+                       inp->inp_flags &= ~(INP_FC_FEEDBACK);
+                       if (inp->inp_sndingprog_waiters > 0) {
+                               wakeup(&inp->inp_sndinprog_cnt);
+                       }
+               }
+               sndinprog_cnt_used = false;
+       }
 
        return error;
 }
index a7376b6861c486ece41cddd955a173e3ec73fdfd..042244b8364664991f7248efdbd7c420ac8893c9 100644 (file)
@@ -3157,8 +3157,6 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m,
 #pragma unused(flags, p)
        int error = 0;
        struct inpcb *inp = sotoinpcb(so);
-       struct sockaddr_in6 tmp;
-       struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam;
        struct icmp6_hdr *icmp6;
 
        if (inp == NULL
@@ -3174,28 +3172,6 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m,
                return rip6_output(m, so, SIN6(nam), control, 0);
        }
 
-       /* always copy sockaddr to avoid overwrites */
-       if (so->so_state & SS_ISCONNECTED) {
-               if (nam != NULL) {
-                       error = EISCONN;
-                       goto bad;
-               }
-               /* XXX */
-               bzero(&tmp, sizeof(tmp));
-               tmp.sin6_family = AF_INET6;
-               tmp.sin6_len = sizeof(struct sockaddr_in6);
-               bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
-                   sizeof(struct in6_addr));
-               dst = &tmp;
-       } else {
-               if (nam == NULL) {
-                       error = ENOTCONN;
-                       goto bad;
-               }
-               tmp = *(struct sockaddr_in6 *)(void *)nam;
-               dst = &tmp;
-       }
-
        /*
         * For an ICMPv6 packet, we should know its type and code
         */
@@ -3224,13 +3200,7 @@ icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m,
                }
        }
 
-#if ENABLE_DEFAULT_SCOPE
-       if (dst->sin6_scope_id == 0) {  /* not change if specified  */
-               dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr);
-       }
-#endif
-
-       return rip6_output(m, so, dst, control, 0);
+       return rip6_output(m, so, SIN6(nam), control, 0);
 bad:
        VERIFY(error != 0);
 
index 5b2b17517f61c6c8dd5d46b5585cb8f54af612c9..15b48d475521170bcc5e32d9c5a4dd0ef143cb49 100644 (file)
 #include <net/net_api_stats.h>
 #include <net/route.h>
 #include <net/if_types.h>
+#include <net/content_filter.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
@@ -363,8 +364,80 @@ rip6_output(
        int netsvctype = _NET_SERVICE_TYPE_UNSPEC;
        struct ip6_out_args ip6oa;
        int flags = IPV6_OUTARGS;
+       struct sockaddr_in6 tmp;
+#if CONTENT_FILTER
+       struct m_tag *cfil_tag = NULL;
+       bool cfil_faddr_use = false;
+       uint32_t cfil_so_state_change_cnt = 0;
+       short cfil_so_options = 0;
+       struct sockaddr *cfil_faddr = NULL;
+       struct sockaddr_in6 *cfil_sin6 = NULL;
+#endif
 
        in6p = sotoin6pcb(so);
+       if (in6p == NULL) {
+               error = EINVAL;
+               goto bad;
+       }
+
+#if CONTENT_FILTER
+       /*
+        * If socket is subject to Content Filter and no addr is passed in,
+        * retrieve CFIL saved state from mbuf and use it if necessary.
+        */
+       if (so->so_cfil_db && !dstsock) {
+               cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr, NULL);
+               if (cfil_tag) {
+                       cfil_sin6 = SIN6(cfil_faddr);
+                       if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
+                               /*
+                                * Socket is unconnected, simply use the saved faddr as 'addr' to go through
+                                * the connect/disconnect logic.
+                                */
+                               dstsock = cfil_sin6;
+                       } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
+                           (in6p->in6p_fport != cfil_sin6->sin6_port ||
+                           !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &cfil_sin6->sin6_addr))) {
+                               /*
+                                * Socket is connected but socket state and dest addr/port changed.
+                                * We need to use the saved faddr and socket options.
+                                */
+                               cfil_faddr_use = true;
+                       }
+               }
+       }
+#endif
+
+       /* always copy sockaddr to avoid overwrites */
+       if (so->so_state & SS_ISCONNECTED) {
+               if (dstsock != NULL) {
+                       error = EISCONN;
+                       goto bad;
+               }
+               /* XXX */
+               bzero(&tmp, sizeof(tmp));
+               tmp.sin6_family = AF_INET6;
+               tmp.sin6_len = sizeof(struct sockaddr_in6);
+               bcopy(
+#if CONTENT_FILTER
+                       cfil_faddr_use ? &cfil_sin6->sin6_addr :
+#endif
+                       &in6p->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr));
+               dstsock = &tmp;
+       } else {
+               if (dstsock == NULL) {
+                       error = ENOTCONN;
+                       goto bad;
+               }
+               tmp = *dstsock;
+               dstsock = &tmp;
+       }
+
+#if ENABLE_DEFAULT_SCOPE
+       if (dstsock->sin6_scope_id == 0) { /* not change if specified  */
+               dstsock->sin6_scope_id = scope6_addr2default(&dstsock->sin6_addr);
+       }
+#endif
 
        bzero(&ip6oa, sizeof(ip6oa));
        ip6oa.ip6oa_boundif = IFSCOPE_NONE;
@@ -604,8 +677,12 @@ rip6_output(
                /*
                 * We need a route to perform NECP route rule checks
                 */
-               if (net_qos_policy_restricted != 0 &&
-                   ROUTE_UNUSABLE(&in6p->in6p_route)) {
+               if ((net_qos_policy_restricted != 0 &&
+                   ROUTE_UNUSABLE(&in6p->in6p_route))
+#if CONTENT_FILTER
+                   || cfil_faddr_use
+#endif
+                   ) {
                        struct sockaddr_in6 to;
                        struct sockaddr_in6 from;
 
@@ -697,6 +774,10 @@ rip6_output(
 
                if ((rt->rt_flags & RTF_MULTICAST) ||
                    in6p->in6p_socket == NULL ||
+#if CONTENT_FILTER
+                   /* Discard temporary route for cfil case */
+                   cfil_faddr_use ||
+#endif
                    !(in6p->in6p_socket->so_state & SS_ISCONNECTED)) {
                        rt = NULL;      /* unusable */
                }
@@ -772,6 +853,12 @@ freectl:
        if (oifp != NULL) {
                ifnet_release(oifp);
        }
+#if CONTENT_FILTER
+       if (cfil_tag) {
+               m_tag_free(cfil_tag);
+       }
+#endif
+
        return error;
 }
 
@@ -1053,8 +1140,6 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 {
 #pragma unused(flags, p)
        struct inpcb *inp = sotoinpcb(so);
-       struct sockaddr_in6 tmp;
-       struct sockaddr_in6 *dst = (struct sockaddr_in6 *)(void *)nam;
        int error = 0;
 
        if (inp == NULL
@@ -1070,33 +1155,7 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
                goto bad;
        }
 
-       /* always copy sockaddr to avoid overwrites */
-       if (so->so_state & SS_ISCONNECTED) {
-               if (nam != NULL) {
-                       error = EISCONN;
-                       goto bad;
-               }
-               /* XXX */
-               bzero(&tmp, sizeof(tmp));
-               tmp.sin6_family = AF_INET6;
-               tmp.sin6_len = sizeof(struct sockaddr_in6);
-               bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
-                   sizeof(struct in6_addr));
-               dst = &tmp;
-       } else {
-               if (nam == NULL) {
-                       error = ENOTCONN;
-                       goto bad;
-               }
-               tmp = *(struct sockaddr_in6 *)(void *)nam;
-               dst = &tmp;
-       }
-#if ENABLE_DEFAULT_SCOPE
-       if (dst->sin6_scope_id == 0) {  /* not change if specified  */
-               dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr);
-       }
-#endif
-       return rip6_output(m, so, dst, control, 1);
+       return rip6_output(m, so, SIN6(nam), control, 1);
 
 bad:
        VERIFY(error != 0);
index 66025ca433e5c673ef665c5a7f1ae34f29e34764..eba8e037e3f917ee16ebeb4806600d77feb94ca8 100644 (file)
@@ -173,6 +173,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
 #if CONTENT_FILTER
        struct m_tag *cfil_tag = NULL;
        bool cfil_faddr_use = false;
+       bool sndinprog_cnt_used = false;
        uint32_t cfil_so_state_change_cnt = 0;
        struct sockaddr *cfil_faddr = NULL;
        struct sockaddr_in6 *cfil_sin6 = NULL;
@@ -216,7 +217,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
         * retrieve CFIL saved state from mbuf and use it if necessary.
         */
        if (so->so_cfil_db && !addr6) {
-               cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr);
+               cfil_tag = cfil_dgram_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr, NULL);
                if (cfil_tag) {
                        cfil_sin6 = (struct sockaddr_in6 *)(void *)cfil_faddr;
                        if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
@@ -250,6 +251,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
        ip6oa.ip6oa_sotc = sotc;
        ip6oa.ip6oa_netsvctype = netsvctype;
 
+       in6p->inp_sndinprog_cnt++;
+       sndinprog_cnt_used = true;
+
        if (addr6) {
                /*
                 * IPv4 version of udp_output calls in_pcbconnect in this case,
@@ -529,8 +533,6 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                        IM6O_UNLOCK(im6o);
                }
 
-               in6p->inp_sndinprog_cnt++;
-
                socket_unlock(so, 0);
                error = ip6_output(m, optp, &ro, flags, im6o, NULL, &ip6oa);
                m = NULL;
@@ -568,14 +570,6 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                        inp_set_fc_state(in6p, adv->code);
                }
 
-               VERIFY(in6p->inp_sndinprog_cnt > 0);
-               if (--in6p->inp_sndinprog_cnt == 0) {
-                       in6p->inp_flags &= ~(INP_FC_FEEDBACK);
-                       if (in6p->inp_sndingprog_waiters > 0) {
-                               wakeup(&in6p->inp_sndinprog_cnt);
-                       }
-               }
-
                if (ro.ro_rt != NULL) {
                        struct ifnet *outif = ro.ro_rt->rt_ifp;
 
@@ -661,6 +655,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
        goto releaseopt;
 
 release:
+
        if (m != NULL) {
                m_freem(m);
        }
@@ -677,5 +672,16 @@ releaseopt:
                m_tag_free(cfil_tag);
        }
 #endif
+       if (sndinprog_cnt_used) {
+               VERIFY(in6p->inp_sndinprog_cnt > 0);
+               if (--in6p->inp_sndinprog_cnt == 0) {
+                       in6p->inp_flags &= ~(INP_FC_FEEDBACK);
+                       if (in6p->inp_sndingprog_waiters > 0) {
+                               wakeup(&in6p->inp_sndinprog_cnt);
+                       }
+               }
+               sndinprog_cnt_used = false;
+       }
+
        return error;
 }
index 2917f5c7ec6d6f632126aeff553108c8e81c1744..356d7b99d87ce3f9b0cedd5441ac1f96e7a05738 100644 (file)
@@ -1012,7 +1012,7 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 #if CONTENT_FILTER
        //If socket is subject to UDP Content Filter and unconnected, get addr from tag.
        if (so->so_cfil_db && !addr && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
-               cfil_tag = cfil_udp_get_socket_state(m, NULL, NULL, &cfil_faddr);
+               cfil_tag = cfil_dgram_get_socket_state(m, NULL, NULL, &cfil_faddr, NULL);
                if (cfil_tag) {
                        addr = (struct sockaddr *)cfil_faddr;
                }
index 816e849c44e74d94ec7632c25d6b4c769197f731..aa7aa4b4bb4e674d4155f070c6f87df5ae617d4e 100644 (file)
@@ -131,6 +131,7 @@ struct nameidata {
 #define NAMEI_CONTLOOKUP        0x002    /* Continue processing a lookup which was partially processed in a compound VNOP */
 #define NAMEI_TRAILINGSLASH     0x004    /* There was at least one trailing slash after last component */
 #define NAMEI_UNFINISHED        0x008    /* We broke off a lookup to do a compound op */
+
 /*
  * XXX Hack: we need to encode the intended VNOP in order to
  * be able to include information about which operations a filesystem
@@ -143,6 +144,8 @@ struct nameidata {
 #define NAMEI_COMPOUNDRENAME    0x100
 #define NAMEI_COMPOUND_OP_MASK (NAMEI_COMPOUNDOPEN | NAMEI_COMPOUNDREMOVE | NAMEI_COMPOUNDMKDIR | NAMEI_COMPOUNDRMDIR | NAMEI_COMPOUNDRENAME)
 
+#define NAMEI_NOPROCLOCK        0x1000  /* do not take process lock (set by vnode_lookup) */
+
 #ifdef KERNEL
 /*
  * namei operational modifier flags, stored in ni_cnd.flags
index 763515e8f2a65883cb25d6e883e6463ac4dcb7c4..57e827b40930ac39f92be26c3ebf00ee0af652b1 100644 (file)
@@ -419,6 +419,7 @@ struct  proc {
 #if !CONFIG_EMBEDDED
        uint64_t        p_user_data;                    /* general-purpose storage for userland-provided data */
 #endif /* !CONFIG_EMBEDDED */
+       lck_rw_t        p_dirs_lock;                    /* keeps fd_cdir and fd_rdir stable across a lookup */
 };
 
 #define PGRPID_DEAD 0xdeaddead
@@ -681,6 +682,7 @@ extern lck_grp_t * proc_knhashlock_grp;
 extern lck_grp_t * proc_mlock_grp;
 extern lck_grp_t * proc_ucred_mlock_grp;
 extern lck_grp_t * proc_slock_grp;
+extern lck_grp_t * proc_dirslock_grp;
 extern lck_grp_attr_t * proc_lck_grp_attr;
 extern lck_attr_t * proc_lck_attr;
 
@@ -702,6 +704,10 @@ extern void proc_fdlock(struct proc *);
 extern void proc_fdlock_spin(struct proc *);
 extern void proc_fdunlock(struct proc *);
 extern void proc_fdlock_assert(proc_t p, int assertflags);
+extern void proc_dirs_lock_shared(struct proc *);
+extern void proc_dirs_unlock_shared(struct proc *);
+extern void proc_dirs_lock_exclusive(struct proc *);
+extern void proc_dirs_unlock_exclusive(struct proc *);
 extern void proc_ucred_lock(struct proc *);
 extern void proc_ucred_unlock(struct proc *);
 __private_extern__ int proc_core_name(const char *name, uid_t uid, pid_t pid,
index 3611e2c6a7236764a2f18c543f4fa102857d8964..cbb8567614e461d7e5a8247f42cd066d5512eed3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -768,6 +768,7 @@ extern int sbappendaddr(struct sockbuf *sb, struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control, int *error_out);
 extern int sbappendchain(struct sockbuf *sb, struct mbuf *m, int space);
 extern int sbappendrecord(struct sockbuf *sb, struct mbuf *m0);
+extern int sbappendrecord_nodrop(struct sockbuf *sb, struct mbuf *m0);
 extern void sbflush(struct sockbuf *sb);
 extern int sbspace(struct sockbuf *sb);
 extern int soabort(struct socket *so);
@@ -829,6 +830,7 @@ extern void so_acquire_accept_list(struct socket *, struct socket *);
 extern void so_release_accept_list(struct socket *);
 
 extern int sbappend(struct sockbuf *sb, struct mbuf *m);
+extern int sbappend_nodrop(struct sockbuf *sb, struct mbuf *m);
 extern int sbappendstream(struct sockbuf *sb, struct mbuf *m);
 extern int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
     struct mbuf *control, int *error_out);
index c85c8485f19b7c3bd9b6e14497fd79d84aa4090b..dff5f82d3ec9ff902137bed408c796b1b64ca83f 100644 (file)
@@ -1330,56 +1330,6 @@ vfs_context_cwd(vfs_context_t ctx)
        return cwd;
 }
 
-/*
- * vfs_context_get_cwd
- *
- * Description:        Returns a vnode for the current working directory for the
- *              supplied context. The returned vnode has an iocount on it
- *              which must be released with a vnode_put().
- *
- * Parameters: vfs_context_t                   The context to use
- *
- * Returns:    vnode_t                         The current working directory
- *                                             for this context
- *
- * Notes:      The function first attempts to obtain the current directory
- *             from the thread, and if it is not present there, falls back
- *             to obtaining it from the process instead.  If it can't be
- *             obtained from either place, we return NULLVP.
- */
-vnode_t
-vfs_context_get_cwd(vfs_context_t ctx)
-{
-       vnode_t cwd = NULLVP;
-
-       if (ctx != NULL && ctx->vc_thread != NULL) {
-               uthread_t uth = get_bsdthread_info(ctx->vc_thread);
-               proc_t proc;
-
-               /*
-                * Get the cwd from the thread; if there isn't one, get it
-                * from the process, instead.
-                */
-               cwd = uth->uu_cdir;
-
-               if (cwd) {
-                       if ((vnode_get(cwd) != 0)) {
-                               cwd = NULLVP;
-                       }
-               } else if ((proc = (proc_t)get_bsdthreadtask_info(ctx->vc_thread)) != NULL &&
-                   proc->p_fd != NULL) {
-                       proc_fdlock(proc);
-                       cwd = proc->p_fd->fd_cdir;
-                       if (cwd && (vnode_get(cwd) != 0)) {
-                               cwd = NULLVP;
-                       }
-                       proc_fdunlock(proc);
-               }
-       }
-
-       return cwd;
-}
-
 /*
  * vfs_context_create
  *
index 77c525baac8014c906286e7d6489e74aa51f3943..9aad31d57bc851436c038190d84ca638c906de37 100644 (file)
@@ -113,7 +113,7 @@ static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, v
 #endif
 
 static int              lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, int vbusyflags, vfs_context_t ctx);
-static int              handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx);
+static int              lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx);
 static int              lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx);
 static void             lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation);
 static int              lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly,
@@ -167,8 +167,6 @@ namei(struct nameidata *ndp)
 {
        struct filedesc *fdp;   /* pointer to file descriptor state */
        struct vnode *dp;       /* the directory we are searching */
-       struct vnode *rootdir_with_usecount = NULLVP;
-       struct vnode *startdir_with_usecount = NULLVP;
        struct vnode *usedvp = ndp->ni_dvp;  /* store pointer to vp in case we must loop due to
                                              *                                          heavy vnode pressure */
        u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */
@@ -185,6 +183,8 @@ namei(struct nameidata *ndp)
        int volfs_restarts = 0;
 #endif
        size_t bytes_copied = 0;
+       bool take_proc_lock = !(ndp->ni_flag & NAMEI_NOPROCLOCK);
+       bool proc_lock_taken = false;
 
        fdp = p->p_fd;
 
@@ -351,29 +351,15 @@ retry_copy:
        /*
         * determine the starting point for the translation.
         *
-        * We may need to upto 2 usecounts on vnodes before starting the translation
-        * We need to have a usecount on the root directory for the process
-        * for the entire duration of the lookup. This is because symlink
-        * translation can restart translation at / if a symlink is encountered.
-        *
-        * For the duration of this lookup at rootdir for this lookup is the one
-        * we fetch now under the proc_fdlock even the if the proc rootdir changes
-        * once we let go of the proc_fdlock.
-        *
-        * In the future we may consider holding off a chroot till we complete
-        * in progress lookups.
-        *
-        * If the starting directory is not the process rootdir then we need
-        * a usecount on the starting directory as well for the duration of the
-        * lookup.
-        *
-        * Getting an addtional usecount involves first getting an iocount under
-        * the lock that ensures that a usecount is on the directory. Once we
-        * get an iocount we can release the lock and we will be free to get a
-        * usecount without the vnode getting recycled. Once we get the usecount
-        * we can release the icoount which we used to get our usecount.
+        * We hold the proc_dirs lock across the lookup so that the
+        * process rootdir and cwd are stable (i.e. the usecounts
+        * on them are mainatained for the duration of the lookup)
         */
-       proc_fdlock(p);
+       if (take_proc_lock) {
+               assert(proc_lock_taken == false);
+               proc_dirs_lock_shared(p);
+               proc_lock_taken = true;
+       }
        if (!(fdp->fd_flags & FD_CHROOT)) {
                ndp->ni_rootdir = rootvnode;
        } else {
@@ -382,10 +368,8 @@ retry_copy:
 
        if (!ndp->ni_rootdir) {
                if (!(fdp->fd_flags & FD_CHROOT)) {
-                       proc_fdunlock(p);
                        printf("rootvnode is not set\n");
                } else {
-                       proc_fdunlock(p);
                        /* This should be a panic */
                        printf("fdp->fd_rdir is not set\n");
                }
@@ -393,43 +377,10 @@ retry_copy:
                goto error_out;
        }
 
-       /*
-        * We have the proc_fdlock here so we still have a usecount
-        * on ndp->ni_rootdir.
-        *
-        * However we need to get our own usecount on it in order to
-        * ensure that the vnode isn't recycled to something else.
-        *
-        * Note : It's fine if the vnode is force reclaimed but with
-        * a usecount it won't be reused until we release the reference.
-        *
-        * In order to get that usecount however, we need to first
-        * get non blocking iocount since we'll be doing this under
-        * the proc_fdlock.
-        */
-       if (vnode_get(ndp->ni_rootdir) != 0) {
-               proc_fdunlock(p);
-               error = ENOENT;
-               goto error_out;
-       }
-
-       proc_fdunlock(p);
-
-       /* Now we can safely get our own ref on ni_rootdir */
-       error = vnode_ref_ext(ndp->ni_rootdir, O_EVTONLY, 0);
-       vnode_put(ndp->ni_rootdir);
-       if (error) {
-               ndp->ni_rootdir = NULLVP;
-               goto error_out;
-       }
-
-       rootdir_with_usecount = ndp->ni_rootdir;
-
        cnp->cn_nameptr = cnp->cn_pnbuf;
 
        ndp->ni_usedvp = NULLVP;
 
-       bool dp_needs_put = false;
        if (*(cnp->cn_nameptr) == '/') {
                while (*(cnp->cn_nameptr) == '/') {
                        cnp->cn_nameptr++;
@@ -440,40 +391,15 @@ retry_copy:
                dp = ndp->ni_dvp;
                ndp->ni_usedvp = dp;
        } else {
-               dp = vfs_context_get_cwd(ctx);
-               if (dp) {
-                       dp_needs_put = true;
-               }
+               dp = vfs_context_cwd(ctx);
        }
 
        if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
-               if (dp_needs_put) {
-                       vnode_put(dp);
-                       dp_needs_put = false;
-               }
                dp = NULLVP;
                error = ENOENT;
                goto error_out;
        }
 
-       if (dp != rootdir_with_usecount) {
-               error = vnode_ref_ext(dp, O_EVTONLY, 0);
-               if (error) {
-                       if (dp_needs_put) {
-                               vnode_put(dp);
-                               dp_needs_put = false;
-                       }
-                       dp = NULLVP;
-                       goto error_out;
-               }
-               startdir_with_usecount = dp;
-       }
-
-       if (dp_needs_put) {
-               vnode_put(dp);
-               dp_needs_put = false;
-       }
-
        ndp->ni_dvp = NULLVP;
        ndp->ni_vp  = NULLVP;
 
@@ -492,7 +418,6 @@ retry_copy:
                        goto error_out;
                }
 #endif
-
                ndp->ni_startdir = dp;
                dp = NULLVP;
 
@@ -504,46 +429,19 @@ retry_copy:
                 * Check for symbolic link
                 */
                if ((cnp->cn_flags & ISSYMLINK) == 0) {
-                       if (startdir_with_usecount) {
-                               vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
-                               startdir_with_usecount = NULLVP;
-                       }
-                       if (rootdir_with_usecount) {
-                               vnode_rele_ext(rootdir_with_usecount, O_EVTONLY, 0);
-                               rootdir_with_usecount = NULLVP;
+                       if (proc_lock_taken) {
+                               proc_dirs_unlock_shared(p);
+                               proc_lock_taken = false;
                        }
                        return 0;
                }
 
 continue_symlink:
-               /*
-                * Gives us a new path to process, and a starting dir (with an iocount).
-                * The iocount is needed to take a usecount on the vnode returned
-                * (if it is not a vnode we already have a usecount on).
-                */
-               error = handle_symlink_for_namei(ndp, &dp, ctx);
+               /* Gives us a new path to process, and a starting dir */
+               error = lookup_handle_symlink(ndp, &dp, ctx);
                if (error != 0) {
                        break;
                }
-
-               if (dp == ndp->ni_rootdir && startdir_with_usecount) {
-                       vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
-                       startdir_with_usecount = NULLVP;
-               } else if (dp != startdir_with_usecount) {
-                       if (startdir_with_usecount) {
-                               vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
-                               startdir_with_usecount = NULLVP;
-                       }
-                       error = vnode_ref_ext(dp, O_EVTONLY, 0);
-                       if (error) {
-                               vnode_put(dp);
-                               dp = NULLVP;
-                               goto error_out;
-                       }
-                       startdir_with_usecount = dp;
-               }
-               /* iocount not required on dp anymore */
-               vnode_put(dp);
        }
        /*
         * only come here if we fail to handle a SYMLINK...
@@ -559,6 +457,10 @@ out_drop:
                vnode_put(ndp->ni_vp);
        }
 error_out:
+       if (proc_lock_taken) {
+               proc_dirs_unlock_shared(p);
+               proc_lock_taken = false;
+       }
        if ((cnp->cn_flags & HASBUF)) {
                cnp->cn_flags &= ~HASBUF;
                FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
@@ -567,15 +469,6 @@ error_out:
        ndp->ni_vp = NULLVP;
        ndp->ni_dvp = NULLVP;
 
-       if (startdir_with_usecount) {
-               vnode_rele_ext(startdir_with_usecount, O_EVTONLY, 0);
-               startdir_with_usecount = NULLVP;
-       }
-       if (rootdir_with_usecount) {
-               vnode_rele_ext(rootdir_with_usecount, O_EVTONLY, 0);
-               rootdir_with_usecount = NULLVP;
-       }
-
 #if CONFIG_VOLFS
        /*
         * Deal with volfs fallout.
@@ -1672,10 +1565,10 @@ out:
 
 /*
  * Takes ni_vp and ni_dvp non-NULL.  Returns with *new_dp set to the location
- * at which to start a lookup with a resolved path and with an iocount.
+ * at which to start a lookup with a resolved path, and all other iocounts dropped.
  */
 static int
-handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx)
+lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx)
 {
        int error;
        char *cp;               /* pointer into pathname argument */
@@ -1766,18 +1659,17 @@ handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t c
        /*
         * starting point for 'relative'
         * symbolic link path
-        *
-        * If the starting point is not the root we have to return an iocounted
-        * dp to namei so we don't release the icoount here.
         */
        dp = ndp->ni_dvp;
-       ndp->ni_dvp = NULLVP;
 
        /*
         * get rid of references returned via 'lookup'
         */
        vnode_put(ndp->ni_vp);
+       vnode_put(ndp->ni_dvp); /* ALWAYS have a dvp for a symlink */
+
        ndp->ni_vp = NULLVP;
+       ndp->ni_dvp = NULLVP;
 
        /*
         * Check if symbolic link restarts us at the root
@@ -1787,20 +1679,9 @@ handle_symlink_for_namei(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t c
                        cnp->cn_nameptr++;
                        ndp->ni_pathlen--;
                }
-               vnode_put(dp);
                if ((dp = ndp->ni_rootdir) == NULLVP) {
                        return ENOENT;
                }
-               if (vnode_get(dp) != 0) {
-                       return ENOENT;
-               }
-       }
-
-       if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) {
-               if (dp) {
-                       vnode_put(dp);
-               }
-               return ENOENT;
        }
 
        *new_dp = dp;
index 866780991fa543f6afd6e316620223b0b7f34bbe..0d44e828be508ec33dcb3fa9aaf35afe6d1abaf0 100644 (file)
@@ -5948,6 +5948,8 @@ vnode_lookupat(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx,
        if (start_dvp && (path[0] != '/')) {
                nd.ni_dvp = start_dvp;
                nd.ni_cnd.cn_flags |= USEDVP;
+               /* Don't take proc lock vnode_lookupat with a startdir specified */
+               nd.ni_flag |=  NAMEI_NOPROCLOCK;
        }
 
        if ((error = namei(&nd))) {
index a0a04deb8b43910efb399a0595a1ed2051891b57..f27adda4db00456416f164d332f5b04c120104bc 100644 (file)
@@ -211,6 +211,9 @@ struct fd_vn_data * fg_vn_data_alloc(void);
  */
 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
 
+/* Max retry limit for rename due to vnode recycling. */
+#define MAX_RENAME_ERECYCLE_RETRIES 1024
+
 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
     int unlink_flags);
 
@@ -2005,6 +2008,7 @@ checkdirs_callback(proc_t p, void * arg)
                return PROC_RETURNED;
        }
 
+       proc_dirs_lock_exclusive(p);
        /*
         * Now do the work.  Note: we dropped the proc_fdlock, so we
         * have to do all of the checks again.
@@ -2024,6 +2028,7 @@ checkdirs_callback(proc_t p, void * arg)
                }
        }
        proc_fdunlock(p);
+       proc_dirs_unlock_exclusive(p);
 
        /*
         * Dispose of any references that are no longer needed.
@@ -3586,10 +3591,12 @@ common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
                        return ENOENT;
                }
        } else {
+               proc_dirs_lock_exclusive(p);
                proc_fdlock(p);
                tvp = fdp->fd_cdir;
                fdp->fd_cdir = vp;
                proc_fdunlock(p);
+               proc_dirs_unlock_exclusive(p);
        }
 
        if (tvp) {
@@ -3659,10 +3666,12 @@ chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_threa
                        return ENOENT;
                }
        } else {
+               proc_dirs_lock_exclusive(p);
                proc_fdlock(p);
                tvp = fdp->fd_cdir;
                fdp->fd_cdir = ndp->ni_vp;
                proc_fdunlock(p);
+               proc_dirs_unlock_exclusive(p);
        }
 
        if (tvp) {
@@ -3781,11 +3790,21 @@ chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
        }
        vnode_put(nd.ni_vp);
 
+       /*
+        * This lock provides the guarantee that as long as you hold the lock
+        * fdp->fd_rdir has a usecount on it. This is used to take an iocount
+        * on a referenced vnode in namei when determining the rootvnode for
+        * a process.
+        */
+       /* needed for synchronization with lookup */
+       proc_dirs_lock_exclusive(p);
+       /* needed for setting the flag and other activities on the fd itself */
        proc_fdlock(p);
        tvp = fdp->fd_rdir;
        fdp->fd_rdir = nd.ni_vp;
        fdp->fd_flags |= FD_CHROOT;
        proc_fdunlock(p);
+       proc_dirs_unlock_exclusive(p);
 
        if (tvp != NULL) {
                vnode_rele(tvp);
@@ -8478,7 +8497,13 @@ skipped_lookup:
                 * but other filesystems susceptible to this race could return it, too.
                 */
                if (error == ERECYCLE) {
-                       do_retry = 1;
+                       if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
+                               do_retry = 1;
+                               retry_count += 1;
+                       } else {
+                               printf("rename retry limit due to ERECYCLE reached\n");
+                               error = ENOENT;
+                       }
                }
 
                /*
index 441ebcc083e9b3832ebb7b4ae7d60122a0ad8f15..9259dc79e6684b65a6cad1a5ead9cc81fbd5aa33 100644 (file)
@@ -1,4 +1,4 @@
-19.4.0
+19.5.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index 0c19f496484f0343fc823b4411ca5d9056752d76..7b6beff04672878700dad1166b48471dd013a2f8 100644 (file)
@@ -67,6 +67,10 @@ enum IODirection
        kIODirectionPrepareReserved1  = 0x00000010,
 #define IODIRECTIONPREPARENONCOHERENTDEFINED    1
        kIODirectionPrepareNonCoherent = 0x00000020,
+#if KERNEL_PRIVATE
+#define IODIRECTIONPREPAREAVOIDTHROTTLING       1
+       kIODirectionPrepareAvoidThrottling = 0x00000100,
+#endif
 
        // these flags are valid for the complete() method only
 #define IODIRECTIONCOMPLETEWITHERRORDEFINED             1
index d73a4343b5961696c3ef3dfda5bd4d0f53bd570a..a4e7d0536eb23a0a6c5a44619579f2b0fbe2dfc9 100644 (file)
@@ -3440,6 +3440,7 @@ IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection)
                                                upl_abort(iopl.fIOPL, 0);
                                                upl_deallocate(iopl.fIOPL);
                                        }
+                                       error = kIOReturnNoMemory;
                                        goto abortExit;
                                }
                                dataP = NULL;
@@ -3740,6 +3741,10 @@ IOGeneralMemoryDescriptor::prepare(IODirection forDirection)
        }
 
        if (kIOMemoryTypeVirtual == type || kIOMemoryTypeVirtual64 == type || kIOMemoryTypeUIO == type) {
+               if ((forDirection & kIODirectionPrepareAvoidThrottling) && NEED_TO_HARD_THROTTLE_THIS_TASK()) {
+                       error = kIOReturnNotReady;
+                       goto finish;
+               }
                error = wireVirtual(forDirection);
        }
 
@@ -3751,6 +3756,8 @@ IOGeneralMemoryDescriptor::prepare(IODirection forDirection)
                }
        }
 
+finish:
+
        if (_prepareLock) {
                IOLockUnlock(_prepareLock);
        }
index 2f01b468107c17153a4c1c7fcd5436190a1e77e2..cd07ccabac89356de4bbdd744b28336ca96fee1c 100644 (file)
@@ -4671,7 +4671,7 @@ pmap_static_allocations_done(void)
         *
         * Note that this workaround does not pose a security risk, because the RO
         * page tables still remain read-only, due to KTRR/CTRR, and further protecting
-        * them at the APRR level would be unnecessary.
+        * them would be unnecessary.
         */
        monitor_start_pa = kvtophys((vm_offset_t)&ropagetable_begin);
        monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
index 875ddb7a54677a7791f17bc4219f4dae2715ac8d..1efc217f9af371c8d714cef165268d237c004ed7 100644 (file)
@@ -1148,7 +1148,7 @@ Lskip_el0_eret_mapping:
 Lexception_return_restore_registers:
        mov     x0, sp                                                          // x0 = &pcb
        // Loads authed $x0->ss_64.pc into x1 and $x0->ss_64.cpsr into w2
-       AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24
+       AUTH_THREAD_STATE_IN_X0 x20, x21, x22, x23, x24, el0_state_allowed=1
 
 /* Restore special register state */
        ldr             w3, [sp, NS64_FPSR]
index 7f5f8ed29504d90802395bc1ce2f04ce9fb7aa95..e1896caa6ff098ab41d91b81a7c330c13dfdeef7 100644 (file)
@@ -26,6 +26,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+#include <arm64/proc_reg.h>
 #include <pexpert/arm64/board_config.h>
 #include "assym.s"
 
  *
  * On CPUs with PAC support, this macro will auth the above values with ml_check_signed_state().
  *
- * arg0 - scratch register 1
- * arg1 - scratch register 2
- * arg2 - scratch register 3
- * arg3 - scratch register 4
- * arg4 - scratch register 5
+ * tmp1 - scratch register 1
+ * tmp2 - scratch register 2
+ * tmp3 - scratch register 3
+ * tmp4 - scratch register 4
+ * tmp5 - scratch register 5
  */
 /* BEGIN IGNORE CODESTYLE */
-.macro  AUTH_THREAD_STATE_IN_X0
-       ldr             x1, [x0, SS64_PC]
+.macro  AUTH_THREAD_STATE_IN_X0        tmp1, tmp2, tmp3, tmp4, tmp5, el0_state_allowed=0
        ldr             w2, [x0, SS64_CPSR]
+.if \el0_state_allowed==0
+#if __has_feature(ptrauth_calls)
+       // If testing for a canary CPSR value, ensure that we do not observe writes to other fields without it
+       dmb             ld
+#endif
+.endif
+       ldr             x1, [x0, SS64_PC]
        ldp             x16, x17, [x0, SS64_X16]
 
 #if defined(HAS_APPLE_PAC)
        // Save x3-x5 to preserve across call
-       mov             $2, x3
-       mov             $3, x4
-       mov             $4, x5
+       mov             \tmp3, x3
+       mov             \tmp4, x4
+       mov             \tmp5, x5
 
        /*
        * Arg0: The ARM context pointer (already in x0)
        * Stash saved state PC and CPSR in other registers to avoid reloading potentially unauthed
        * values from memory.  (ml_check_signed_state will clobber x1 and x2.)
        */
-       mov             $0, x1
-       mov             $1, x2
+       mov             \tmp1, x1
+       mov             \tmp2, x2
        ldr             x3, [x0, SS64_LR]
        mov             x4, x16
        mov             x5, x17
        bl              EXT(ml_check_signed_state)
-       mov             x1, $0
-       mov             x2, $1
+       mov             x1, \tmp1
+       mov             x2, \tmp2
+
+.if \el0_state_allowed==0
+       and             \tmp2, \tmp2, #PSR64_MODE_MASK
+       cbnz            \tmp2, 1f
+       bl              EXT(ml_auth_thread_state_invalid_cpsr)
+1:
+.endif
 
        // LR was already loaded/authed earlier, if we reload it we might be loading a potentially unauthed value
        mov             lr, x3
-       mov             x3, $2
-       mov             x4, $3
-       mov             x5, $4
+       mov             x3, \tmp3
+       mov             x4, \tmp4
+       mov             x5, \tmp5
 #else
        ldr             lr, [x0, SS64_LR]
 #endif /* defined(HAS_APPLE_PAC) */
index 9d41431fe6b7da750a185a7b44ac88fbe027cf9a..191997c1353bbcb0a276eb04425d998910388c10 100644 (file)
@@ -415,25 +415,32 @@ L_mmu_kvtop_wpreflight_invalid:
 /*
  * SET_RECOVERY_HANDLER
  *
- *     Sets up a page fault recovery handler
+ *     Sets up a page fault recovery handler.  This macro clobbers x16 and x17.
  *
- *     arg0 - persisted thread pointer
- *     arg1 - persisted recovery handler
- *     arg2 - scratch reg
- *     arg3 - recovery label
+ *     label - recovery label
+ *     tpidr - persisted thread pointer
+ *     old_handler - persisted recovery handler
+ *     label_in_adr_range - whether \label is within 1 MB of PC
  */
-.macro SET_RECOVERY_HANDLER
-       mrs             $0, TPIDR_EL1                                   // Load thread pointer
-       adrp    $2, $3@page                                             // Load the recovery handler address
-       add             $2, $2, $3@pageoff
+.macro SET_RECOVERY_HANDLER    label, tpidr=x16, old_handler=x10, label_in_adr_range=0
+       // Note: x16 and x17 are designated for use as temporaries in
+       // interruptible PAC routines.  DO NOT CHANGE THESE REGISTER ASSIGNMENTS.
+.if \label_in_adr_range==1                                             // Load the recovery handler address
+       adr             x17, \label
+.else
+       adrp    x17, \label@page
+       add             x17, x17, \label@pageoff
+.endif
 #if defined(HAS_APPLE_PAC)
-       add             $1, $0, TH_RECOVER
-       movk    $1, #PAC_DISCRIMINATOR_RECOVER, lsl 48
-       pacia   $2, $1                                                  // Sign with IAKey + blended discriminator
+       mrs             x16, TPIDR_EL1
+       add             x16, x16, TH_RECOVER
+       movk    x16, #PAC_DISCRIMINATOR_RECOVER, lsl 48
+       pacia   x17, x16                                                        // Sign with IAKey + blended discriminator
 #endif
 
-       ldr             $1, [$0, TH_RECOVER]                    // Save previous recovery handler
-       str             $2, [$0, TH_RECOVER]                    // Set new signed recovery handler
+       mrs             \tpidr, TPIDR_EL1                                       // Load thread pointer
+       ldr             \old_handler, [\tpidr, TH_RECOVER]      // Save previous recovery handler
+       str             x17, [\tpidr, TH_RECOVER]                       // Set new signed recovery handler
 .endmacro
 
 /*
@@ -441,18 +448,18 @@ L_mmu_kvtop_wpreflight_invalid:
  *
  *     Clears page fault handler set by SET_RECOVERY_HANDLER
  *
- *     arg0 - thread pointer saved by SET_RECOVERY_HANDLER
- *     arg1 - old recovery handler saved by SET_RECOVERY_HANDLER
+ *     tpidr - thread pointer saved by SET_RECOVERY_HANDLER
+ *     old_handler - old recovery handler saved by SET_RECOVERY_HANDLER
  */
-.macro CLEAR_RECOVERY_HANDLER
-       str             $1, [$0, TH_RECOVER]            // Restore the previous recovery handler
+.macro CLEAR_RECOVERY_HANDLER  tpidr=x16, old_handler=x10
+       str             \old_handler, [\tpidr, TH_RECOVER]      // Restore the previous recovery handler
 .endmacro
 
 
        .text
        .align 2
 copyio_error:
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        mov             x0, #EFAULT                                     // Return an EFAULT error
        POP_FRAME
        ARM64_STACK_EPILOG
@@ -466,7 +473,7 @@ copyio_error:
 LEXT(_bcopyin)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        /* If len is less than 16 bytes, just do a bytewise copy */
        cmp             x2, #16
        b.lt    2f
@@ -486,7 +493,7 @@ LEXT(_bcopyin)
        strb    w3, [x1], #1
        b.hi    2b
 3:
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        mov             x0, #0
        POP_FRAME
        ARM64_STACK_EPILOG
@@ -500,11 +507,11 @@ LEXT(_bcopyin)
 LEXT(_copyin_atomic32)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        ldr             w8, [x0]
        str             w8, [x1]
        mov             x0, #0
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        POP_FRAME
        ARM64_STACK_EPILOG
 
@@ -517,7 +524,7 @@ LEXT(_copyin_atomic32)
 LEXT(_copyin_atomic32_wait_if_equals)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        ldxr            w8, [x0]
        cmp             w8, w1
        mov             x0, ESTALE
@@ -526,7 +533,7 @@ LEXT(_copyin_atomic32_wait_if_equals)
        wfe
 1:
        clrex
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        POP_FRAME
        ARM64_STACK_EPILOG
 
@@ -539,11 +546,11 @@ LEXT(_copyin_atomic32_wait_if_equals)
 LEXT(_copyin_atomic64)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        ldr             x8, [x0]
        str             x8, [x1]
        mov             x0, #0
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        POP_FRAME
        ARM64_STACK_EPILOG
 
@@ -557,10 +564,10 @@ LEXT(_copyin_atomic64)
 LEXT(_copyout_atomic32)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        str             w0, [x1]
        mov             x0, #0
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        POP_FRAME
        ARM64_STACK_EPILOG
 
@@ -573,10 +580,10 @@ LEXT(_copyout_atomic32)
 LEXT(_copyout_atomic64)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        str             x0, [x1]
        mov             x0, #0
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        POP_FRAME
        ARM64_STACK_EPILOG
 
@@ -590,7 +597,7 @@ LEXT(_copyout_atomic64)
 LEXT(_bcopyout)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        /* If len is less than 16 bytes, just do a bytewise copy */
        cmp             x2, #16
        b.lt    2f
@@ -610,7 +617,7 @@ LEXT(_bcopyout)
        strb    w3, [x1], #1
        b.hi    2b
 3:
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        mov             x0, #0
        POP_FRAME
        ARM64_STACK_EPILOG
@@ -628,17 +635,7 @@ LEXT(_bcopyout)
 LEXT(_bcopyinstr)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       adr             x4, Lcopyinstr_error            // Get address for recover
-       mrs             x10, TPIDR_EL1                          // Get thread pointer
-       ldr             x11, [x10, TH_RECOVER]          // Save previous recover
-
-#if defined(HAS_APPLE_PAC)
-       add             x5, x10, TH_RECOVER             // Sign new pointer with IAKey + blended discriminator
-       movk    x5, #PAC_DISCRIMINATOR_RECOVER, lsl 48
-       pacia   x4, x5
-#endif
-       str             x4, [x10, TH_RECOVER]           // Store new recover
-
+       SET_RECOVERY_HANDLER Lcopyinstr_error, label_in_adr_range=1
        mov             x4, #0                                          // x4 - total bytes copied
 Lcopyinstr_loop:
        ldrb    w5, [x0], #1                                    // Load a byte from the user source
@@ -656,7 +653,7 @@ Lcopyinstr_done:
 Lcopyinstr_error:
        mov             x0, #EFAULT                                     // Return EFAULT on error
 Lcopyinstr_exit:
-       str             x11, [x10, TH_RECOVER]          // Restore old recover
+       CLEAR_RECOVERY_HANDLER
        POP_FRAME
        ARM64_STACK_EPILOG
 
@@ -672,9 +669,9 @@ Lcopyinstr_exit:
  *     x3 : temp
  *     x5 : temp (kernel virtual base)
  *     x9 : temp
- *     x10 : thread pointer (set by SET_RECOVERY_HANDLER)
- *     x11 : old recovery function (set by SET_RECOVERY_HANDLER)
+ *     x10 : old recovery function (set by SET_RECOVERY_HANDLER)
  *     x12, x13 : backtrace data
+ *     x16 : thread pointer (set by SET_RECOVERY_HANDLER)
  *
  */
        .text
@@ -683,7 +680,7 @@ Lcopyinstr_exit:
 LEXT(copyinframe)
        ARM64_STACK_PROLOG
        PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
+       SET_RECOVERY_HANDLER copyio_error
        cbnz    w2, Lcopyinframe64              // Check frame size
        adrp    x5, EXT(gVirtBase)@page // For 32-bit frame, make sure we're not trying to copy from kernel
        add             x5, x5, EXT(gVirtBase)@pageoff
@@ -714,7 +711,7 @@ Lcopyinframe_valid:
        mov     w0, #0                                  // Success
 
 Lcopyinframe_done:
-       CLEAR_RECOVERY_HANDLER x10, x11
+       CLEAR_RECOVERY_HANDLER
        POP_FRAME
        ARM64_STACK_EPILOG
 
@@ -1124,6 +1121,24 @@ Lcheck_hash_panic:
        CALL_EXTERN panic_with_thread_kernel_state
 Lcheck_hash_str:
        .asciz "JOP Hash Mismatch Detected (PC, CPSR, or LR corruption)"
+
+/**
+ * void ml_auth_thread_state_invalid_cpsr(arm_saved_state_t *ss)
+ *
+ * Panics due to an invalid CPSR value in ss.
+ */
+       .text
+       .align 2
+       .globl EXT(ml_auth_thread_state_invalid_cpsr)
+LEXT(ml_auth_thread_state_invalid_cpsr)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       mov             x1, x0
+       adr             x0, Linvalid_cpsr_str
+       CALL_EXTERN panic_with_thread_kernel_state
+
+Linvalid_cpsr_str:
+       .asciz "Thread state corruption detected (PE mode == 0)"
 #endif /* HAS_APPLE_PAC */
 
        .text
index ff4efbfdddd123c1e15edb22f63ab43983539e57..29f2b71854aeb80fe55f911370551ceebe880836 100644 (file)
@@ -343,6 +343,7 @@ machine_stack_attach(thread_t thread,
 #if defined(HAS_APPLE_PAC)
        /* Sign the initial kernel stack saved state */
        const uint32_t default_cpsr = PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK;
+       boolean_t intr = ml_set_interrupts_enabled(FALSE);
        asm volatile (
                "mov    x0, %[ss]"                              "\n"
 
@@ -376,6 +377,7 @@ machine_stack_attach(thread_t thread,
                  [SS64_LR]             "i"(offsetof(struct arm_saved_state, ss_64.lr))
                : "x0", "x1", "x2", "x3", "x4", "x5", "x6"
        );
+       ml_set_interrupts_enabled(intr);
 #else
        savestate->lr = (uintptr_t)thread_continue;
        savestate->cpsr = (PSR64_KERNEL_DEFAULT & ~PSR64_MODE_EL_MASK) | current_el;
index 28f87b0a1e034f89d73719a998fb49e43e1d1f83..a9f1eec26e307eef8e38a02a564091e2f28bfa78 100644 (file)
@@ -105,17 +105,31 @@ thread_state64_to_saved_state(const arm_thread_state64_t * ts64,
     arm_saved_state_t *          saved_state)
 {
        uint32_t i;
+#if __has_feature(ptrauth_calls)
+       boolean_t intr = ml_set_interrupts_enabled(FALSE);
+#endif /* __has_feature(ptrauth_calls) */
 
        assert(is_saved_state64(saved_state));
 
+       set_saved_state_cpsr(saved_state, (ts64->cpsr & ~PSR64_MODE_MASK) | PSR64_MODE_RW_64);
+#if __has_feature(ptrauth_calls)
+       /*
+        * Make writes to ts64->cpsr visible first, since it's useful as a
+        * canary to detect thread-state corruption.
+        */
+       __builtin_arm_dmb(DMB_ST);
+#endif
        set_saved_state_fp(saved_state, ts64->fp);
        set_saved_state_lr(saved_state, ts64->lr);
        set_saved_state_sp(saved_state, ts64->sp);
        set_saved_state_pc(saved_state, ts64->pc);
-       set_saved_state_cpsr(saved_state, (ts64->cpsr & ~PSR64_MODE_MASK) | PSR64_MODE_RW_64);
        for (i = 0; i < 29; i++) {
                set_saved_state_reg(saved_state, i, ts64->x[i]);
        }
+
+#if __has_feature(ptrauth_calls)
+       ml_set_interrupts_enabled(intr);
+#endif /* __has_feature(ptrauth_calls) */
 }
 
 #endif /* __arm64__ */
@@ -1316,7 +1330,9 @@ machine_thread_state_initialize(thread_t thread)
 #if defined(HAS_APPLE_PAC)
        /* Sign the initial user-space thread state */
        if (thread->machine.upcb != NULL) {
+               boolean_t intr = ml_set_interrupts_enabled(FALSE);
                ml_sign_thread_state(thread->machine.upcb, 0, 0, 0, 0, 0);
+               ml_set_interrupts_enabled(intr);
        }
 #endif /* defined(HAS_APPLE_PAC) */
 
index 98d7250c059e5006633dbba875d5e7a90daf816c..df259cf29923ad330f93567a37fcf1a2aef24627 100644 (file)
@@ -7447,14 +7447,21 @@ void
 task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num)
 {
        vm_object_t find_vmo;
-       int64_t size = 0;
+       unsigned int i = 0;
+       unsigned int vmobj_limit = len / sizeof(vm_object_query_data_t);
 
        task_objq_lock(task);
        if (query != NULL) {
                queue_iterate(&task->task_objq, find_vmo, vm_object_t, task_objq)
                {
-                       int byte_size;
-                       vm_object_query_t p = &query[size++];
+                       vm_object_query_t p = &query[i];
+
+                       /*
+                        * Clear the entire vm_object_query_t struct as we are using
+                        * only the first 6 bits in the uint64_t bitfield for this
+                        * anonymous struct member.
+                        */
+                       bzero(p, sizeof(*p));
 
                        p->object_id = (vm_object_id_t) VM_KERNEL_ADDRPERM(find_vmo);
                        p->virtual_size = find_vmo->internal ? find_vmo->vo_size : 0;
@@ -7471,16 +7478,17 @@ task_copy_vmobjects(task_t task, vm_object_query_t query, int len, int64_t* num)
                                p->compressed_size = 0;
                        }
 
-                       /* make sure to not overrun */
-                       byte_size = (int) size * sizeof(vm_object_query_data_t);
-                       if ((int)(byte_size + sizeof(vm_object_query_data_t)) > len) {
+                       i++;
+
+                       /* Make sure to not overrun */
+                       if (i == vmobj_limit) {
                                break;
                        }
                }
        } else {
-               size = task->task_owned_objects;
+               i = task->task_owned_objects;
        }
        task_objq_unlock(task);
 
-       *num = size;
+       *num = i;
 }
index 548c48e1c59b77ad97011faea7bf791a719e0358..30f2f097d3984faf39686c29c74dc00c63396c2f 100644 (file)
@@ -504,6 +504,18 @@ typedef struct arm_saved_state arm_saved_state_t;
 
 #if defined(XNU_KERNEL_PRIVATE)
 #if defined(HAS_APPLE_PAC)
+
+#include <sys/cdefs.h>
+
+/*
+ * Used by MANIPULATE_SIGNED_THREAD_STATE(), potentially from C++ (IOKit) code.
+ * Open-coded to prevent a circular dependency between mach/arm/thread_status.h
+ * and osfmk/arm/machine_routines.h.
+ */
+__BEGIN_DECLS
+extern boolean_t ml_set_interrupts_enabled(boolean_t);
+__END_DECLS
+
 /*
  * Methods used to sign and check thread state to detect corruptions of saved
  * thread state across exceptions and context switches.
@@ -531,30 +543,34 @@ extern void ml_check_signed_state(const arm_saved_state_t *, uint64_t, uint32_t,
  * x6: scratch register
  * x7: scratch register
  */
-#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...)               \
-       asm volatile (                                                  \
-               "mov    x8, lr"                         "\n"            \
-               "mov    x0, %[iss]"                     "\n"            \
-               "ldp    x4, x5, [x0, %[SS64_X16]]"      "\n"            \
-               "ldr    x6, [x0, %[SS64_PC]]"           "\n"            \
-               "ldr    w7, [x0, %[SS64_CPSR]]"         "\n"            \
-               "ldr    x3, [x0, %[SS64_LR]]"           "\n"            \
-               "mov    x1, x6"                         "\n"            \
-               "mov    w2, w7"                         "\n"            \
-               "bl     _ml_check_signed_state"         "\n"            \
-               "mov    x1, x6"                         "\n"            \
-               "mov    w2, w7"                         "\n"            \
-               _instr                                  "\n"            \
-               "bl     _ml_sign_thread_state"          "\n"            \
-               "mov    lr, x8"                         "\n"            \
-               :                                                       \
-               : [iss]         "r"(_iss),                              \
-                 [SS64_X16]    "i"(ss64_offsetof(x[16])),              \
-                 [SS64_PC]     "i"(ss64_offsetof(pc)),                 \
-                 [SS64_CPSR]   "i"(ss64_offsetof(cpsr)),               \
-                 [SS64_LR]     "i"(ss64_offsetof(lr)),##__VA_ARGS__    \
-               : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"  \
-       )
+#define MANIPULATE_SIGNED_THREAD_STATE(_iss, _instr, ...)                       \
+       do {                                                                    \
+               boolean_t _intr = ml_set_interrupts_enabled(FALSE);             \
+               asm volatile (                                                  \
+                       "mov    x8, lr"                         "\n"            \
+                       "mov    x0, %[iss]"                     "\n"            \
+                       "ldp    x4, x5, [x0, %[SS64_X16]]"      "\n"            \
+                       "ldr    x6, [x0, %[SS64_PC]]"           "\n"            \
+                       "ldr    w7, [x0, %[SS64_CPSR]]"         "\n"            \
+                       "ldr    x3, [x0, %[SS64_LR]]"           "\n"            \
+                       "mov    x1, x6"                         "\n"            \
+                       "mov    w2, w7"                         "\n"            \
+                       "bl     _ml_check_signed_state"         "\n"            \
+                       "mov    x1, x6"                         "\n"            \
+                       "mov    w2, w7"                         "\n"            \
+                       _instr                                  "\n"            \
+                       "bl     _ml_sign_thread_state"          "\n"            \
+                       "mov    lr, x8"                         "\n"            \
+                       :                                                       \
+                       : [iss]         "r"(_iss),                              \
+                         [SS64_X16]    "i"(ss64_offsetof(x[16])),              \
+                         [SS64_PC]     "i"(ss64_offsetof(pc)),                 \
+                         [SS64_CPSR]   "i"(ss64_offsetof(cpsr)),               \
+                         [SS64_LR]     "i"(ss64_offsetof(lr)),##__VA_ARGS__    \
+                       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"  \
+               );                                                              \
+               ml_set_interrupts_enabled(_intr);                               \
+       } while (0)
 
 static inline void
 check_and_sign_copied_thread_state(arm_saved_state_t *dst, const arm_saved_state_t *src)
index 071a66d4e3adc3729a1091b5ca3e2af71da49a07..37a6f28442268ce4e77dac024400f01a246e0b61 100644 (file)
@@ -1653,6 +1653,7 @@ c_seg_alloc_nextslot(c_segment_t c_seg)
 }
 
 
+#define C_SEG_MAJOR_COMPACT_STATS_MAX   (30)
 
 struct {
        uint64_t asked_permission;
@@ -1662,7 +1663,11 @@ struct {
        uint64_t wasted_space_in_swapouts;
        uint64_t count_of_swapouts;
        uint64_t count_of_freed_segs;
-} c_seg_major_compact_stats;
+       uint64_t bailed_compactions;
+       uint64_t bytes_freed_rate_us;
+} c_seg_major_compact_stats[C_SEG_MAJOR_COMPACT_STATS_MAX];
+
+int c_seg_major_compact_stats_now = 0;
 
 
 #define C_MAJOR_COMPACTION_SIZE_APPROPRIATE     ((C_SEG_BUFSIZE * 90) / 100)
@@ -1673,7 +1678,7 @@ c_seg_major_compact_ok(
        c_segment_t c_seg_dst,
        c_segment_t c_seg_src)
 {
-       c_seg_major_compact_stats.asked_permission++;
+       c_seg_major_compact_stats[c_seg_major_compact_stats_now].asked_permission++;
 
        if (c_seg_src->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE &&
            c_seg_dst->c_bytes_used >= C_MAJOR_COMPACTION_SIZE_APPROPRIATE) {
@@ -1720,7 +1725,7 @@ c_seg_major_compact(
        c_seg_dst->c_was_major_compacted++;
        c_seg_src->c_was_major_donor++;
 #endif
-       c_seg_major_compact_stats.compactions++;
+       c_seg_major_compact_stats[c_seg_major_compact_stats_now].compactions++;
 
        dst_slot = c_seg_dst->c_nextslot;
 
@@ -1766,8 +1771,8 @@ c_seg_major_compact(
 
                c_rounded_size = (c_size + C_SEG_OFFSET_ALIGNMENT_MASK) & ~C_SEG_OFFSET_ALIGNMENT_MASK;
 
-               c_seg_major_compact_stats.moved_slots++;
-               c_seg_major_compact_stats.moved_bytes += c_size;
+               c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++;
+               c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += c_size;
 
                cslot_copy(c_dst, c_src);
                c_dst->c_offset = c_seg_dst->c_nextoffset;
@@ -2319,6 +2324,8 @@ vm_compressor_do_delayed_compactions(boolean_t flush_all)
        boolean_t       needs_to_swap = FALSE;
 
 
+       VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0);
+
 #if !CONFIG_EMBEDDED
        LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED);
 #endif /* !CONFIG_EMBEDDED */
@@ -2348,6 +2355,8 @@ vm_compressor_do_delayed_compactions(boolean_t flush_all)
                }
                lck_mtx_lock_spin_always(c_list_lock);
        }
+
+       VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_END, c_minor_count, number_compacted, needs_to_swap, 0);
 }
 
 
@@ -2689,15 +2698,20 @@ do_fastwake_warmup(queue_head_t *c_queue, boolean_t consider_all_cseg)
        }
 }
 
+int min_csegs_per_major_compaction = DELAYED_COMPACTIONS_PER_PASS;
 
 void
 vm_compressor_compact_and_swap(boolean_t flush_all)
 {
        c_segment_t     c_seg, c_seg_next;
-       boolean_t       keep_compacting;
+       boolean_t       keep_compacting, switch_state;
        clock_sec_t     now;
        clock_nsec_t    nsec;
+       mach_timespec_t start_ts, end_ts;
+       unsigned int    number_considered, wanted_cseg_found, yield_after_considered_per_pass, number_yields;
+       uint64_t        bytes_to_free, bytes_freed, delta_usec;
 
+       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_START, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
 
        if (fastwake_warmup == TRUE) {
                uint64_t        starting_warmup_count;
@@ -2731,6 +2745,16 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
         */
        clock_get_system_nanotime(&now, &nsec);
 
+       start_ts.tv_sec = (int) now;
+       start_ts.tv_nsec = nsec;
+       delta_usec = 0;
+       number_considered = 0;
+       wanted_cseg_found = 0;
+       number_yields = 0;
+       bytes_to_free = 0;
+       bytes_freed = 0;
+       yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS);
+
        while (!queue_empty(&c_age_list_head) && compaction_swapper_abort == 0) {
                if (hibernate_flushing == TRUE) {
                        clock_sec_t     sec;
@@ -2764,6 +2788,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
 
                        lck_mtx_unlock_always(c_list_lock);
 
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 1, c_swapout_count, 0, 0);
+
                        thread_block(THREAD_CONTINUE_NULL);
 
                        lck_mtx_lock_spin_always(c_list_lock);
@@ -2783,6 +2809,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
                         * to do minor compactions to make
                         * more memory available
                         */
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 2, c_swapout_count, 0, 0);
+
                        continue;
                }
 
@@ -2804,11 +2832,14 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
 
                        lck_mtx_lock_spin_always(c_list_lock);
 
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 3, needs_to_swap, 0, 0);
+
                        if (needs_to_swap == FALSE) {
                                break;
                        }
                }
                if (queue_empty(&c_age_list_head)) {
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 4, c_age_count, 0, 0);
                        break;
                }
                c_seg = (c_segment_t) queue_first(&c_age_list_head);
@@ -2816,12 +2847,15 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
                assert(c_seg->c_state == C_ON_AGE_Q);
 
                if (flush_all == TRUE && c_seg->c_generation_id > c_generation_id_flush_barrier) {
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 5, 0, 0, 0);
                        break;
                }
 
                lck_mtx_lock_spin_always(&c_seg->c_lock);
 
                if (c_seg->c_busy) {
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 6, (void*) VM_KERNEL_ADDRPERM(c_seg), 0, 0);
+
                        lck_mtx_unlock_always(c_list_lock);
                        c_seg_wait_on_busy(c_seg);
                        lck_mtx_lock_spin_always(c_list_lock);
@@ -2835,13 +2869,15 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
                         * found an empty c_segment and freed it
                         * so go grab the next guy in the queue
                         */
-                       c_seg_major_compact_stats.count_of_freed_segs++;
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 7, 0, 0, 0);
+                       c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
                        continue;
                }
                /*
                 * Major compaction
                 */
                keep_compacting = TRUE;
+               switch_state = TRUE;
 
                while (keep_compacting == TRUE) {
                        assert(c_seg->c_busy);
@@ -2856,6 +2892,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
 
                        assert(c_seg_next->c_state == C_ON_AGE_Q);
 
+                       number_considered++;
+
                        if (c_seg_major_compact_ok(c_seg, c_seg_next) == FALSE) {
                                break;
                        }
@@ -2863,7 +2901,24 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
                        lck_mtx_lock_spin_always(&c_seg_next->c_lock);
 
                        if (c_seg_next->c_busy) {
+                               /*
+                                * We are going to block for our neighbor.
+                                * If our c_seg is wanted, we should unbusy
+                                * it because we don't know how long we might
+                                * have to block here.
+                                */
+                               if (c_seg->c_wanted) {
+                                       lck_mtx_unlock_always(&c_seg_next->c_lock);
+                                       switch_state = FALSE;
+                                       c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
+                                       wanted_cseg_found++;
+                                       break;
+                               }
+
                                lck_mtx_unlock_always(c_list_lock);
+
+                               VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 8, (void*) VM_KERNEL_ADDRPERM(c_seg_next), 0, 0);
+
                                c_seg_wait_on_busy(c_seg_next);
                                lck_mtx_lock_spin_always(c_list_lock);
 
@@ -2872,12 +2927,14 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
                        /* grab that segment */
                        C_SEG_BUSY(c_seg_next);
 
+                       bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
                        if (c_seg_do_minor_compaction_and_unlock(c_seg_next, FALSE, TRUE, TRUE)) {
                                /*
                                 * found an empty c_segment and freed it
                                 * so we can't continue to use c_seg_next
                                 */
-                               c_seg_major_compact_stats.count_of_freed_segs++;
+                               bytes_freed += bytes_to_free;
+                               c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
                                continue;
                        }
 
@@ -2888,6 +2945,8 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
 
                        keep_compacting = c_seg_major_compact(c_seg, c_seg_next);
 
+                       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 9, keep_compacting, 0, 0);
+
                        PAGE_REPLACEMENT_DISALLOWED(TRUE);
 
                        lck_mtx_lock_spin_always(&c_seg_next->c_lock);
@@ -2901,54 +2960,78 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
                         * by passing TRUE, we ask for c_busy to be cleared
                         * and c_wanted to be taken care of
                         */
+                       bytes_to_free = C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
                        if (c_seg_minor_compaction_and_unlock(c_seg_next, TRUE)) {
-                               c_seg_major_compact_stats.count_of_freed_segs++;
+                               bytes_freed += bytes_to_free;
+                               c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_freed_segs++;
+                       } else {
+                               bytes_to_free -= C_SEG_OFFSET_TO_BYTES(c_seg_next->c_populated_offset);
+                               bytes_freed += bytes_to_free;
                        }
 
                        PAGE_REPLACEMENT_DISALLOWED(FALSE);
 
                        /* relock the list */
                        lck_mtx_lock_spin_always(c_list_lock);
+
+                       if (c_seg->c_wanted) {
+                               /*
+                                * Our c_seg is in demand. Let's
+                                * unbusy it and wakeup the waiters
+                                * instead of continuing the compaction
+                                * because we could be in this loop
+                                * for a while.
+                                */
+                               switch_state = FALSE;
+                               wanted_cseg_found++;
+                               c_seg_major_compact_stats[c_seg_major_compact_stats_now].bailed_compactions++;
+                               break;
+                       }
                } /* major compaction */
 
+               VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 10, number_considered, wanted_cseg_found, 0);
+
                lck_mtx_lock_spin_always(&c_seg->c_lock);
 
                assert(c_seg->c_busy);
                assert(!c_seg->c_on_minorcompact_q);
 
-               if (VM_CONFIG_SWAP_IS_ACTIVE) {
-                       /*
-                        * This mode of putting a generic c_seg on the swapout list is
-                        * only supported when we have general swapping enabled
-                        */
-                       c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
-               } else {
-                       if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
-                               assert(VM_CONFIG_SWAP_IS_PRESENT);
+               if (switch_state) {
+                       if (VM_CONFIG_SWAP_IS_ACTIVE) {
                                /*
-                                * we are running compressor sweeps with swap-behind
-                                * make sure the c_seg has aged enough before swapping it
-                                * out...
+                                * This mode of putting a generic c_seg on the swapout list is
+                                * only supported when we have general swapping enabled
                                 */
-                               if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
-                                       c_seg->c_overage_swap = TRUE;
-                                       c_overage_swapped_count++;
-                                       c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
+                               c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
+                       } else {
+                               if ((vm_swapout_ripe_segments == TRUE && c_overage_swapped_count < c_overage_swapped_limit)) {
+                                       assert(VM_CONFIG_SWAP_IS_PRESENT);
+                                       /*
+                                        * we are running compressor sweeps with swap-behind
+                                        * make sure the c_seg has aged enough before swapping it
+                                        * out...
+                                        */
+                                       if ((now - c_seg->c_creation_ts) >= vm_ripe_target_age) {
+                                               c_seg->c_overage_swap = TRUE;
+                                               c_overage_swapped_count++;
+                                               c_seg_switch_state(c_seg, C_ON_SWAPOUT_Q, FALSE);
+                                       }
                                }
                        }
+                       if (c_seg->c_state == C_ON_AGE_Q) {
+                               /*
+                                * this c_seg didn't get moved to the swapout queue
+                                * so we need to move it out of the way...
+                                * we just did a major compaction on it so put it
+                                * on that queue
+                                */
+                               c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
+                       } else {
+                               c_seg_major_compact_stats[c_seg_major_compact_stats_now].wasted_space_in_swapouts += C_SEG_BUFSIZE - c_seg->c_bytes_used;
+                               c_seg_major_compact_stats[c_seg_major_compact_stats_now].count_of_swapouts++;
+                       }
                }
-               if (c_seg->c_state == C_ON_AGE_Q) {
-                       /*
-                        * this c_seg didn't get moved to the swapout queue
-                        * so we need to move it out of the way...
-                        * we just did a major compaction on it so put it
-                        * on that queue
-                        */
-                       c_seg_switch_state(c_seg, C_ON_MAJORCOMPACT_Q, FALSE);
-               } else {
-                       c_seg_major_compact_stats.wasted_space_in_swapouts += C_SEG_BUFSIZE - c_seg->c_bytes_used;
-                       c_seg_major_compact_stats.count_of_swapouts++;
-               }
+
                C_SEG_WAKEUP_DONE(c_seg);
 
                lck_mtx_unlock_always(&c_seg->c_lock);
@@ -2960,7 +3043,55 @@ vm_compressor_compact_and_swap(boolean_t flush_all)
 
                        lck_mtx_lock_spin_always(c_list_lock);
                }
+
+               if (number_considered >= yield_after_considered_per_pass) {
+                       if (wanted_cseg_found) {
+                               /*
+                                * We stopped major compactions on a c_seg
+                                * that is wanted. We don't know the priority
+                                * of the waiter unfortunately but we are at
+                                * a very high priority and so, just in case
+                                * the waiter is a critical system daemon or
+                                * UI thread, let's give up the CPU in case
+                                * the system is running a few CPU intensive
+                                * tasks.
+                                */
+                               lck_mtx_unlock_always(c_list_lock);
+
+                               mutex_pause(2); /* 100us yield */
+
+                               number_yields++;
+
+                               VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_NONE, 11, number_considered, number_yields, 0);
+
+                               lck_mtx_lock_spin_always(c_list_lock);
+                       }
+
+                       number_considered = 0;
+                       wanted_cseg_found = 0;
+               }
        }
+       clock_get_system_nanotime(&now, &nsec);
+       end_ts.tv_sec = (int) now;
+       end_ts.tv_nsec = nsec;
+
+       SUB_MACH_TIMESPEC(&end_ts, &start_ts);
+
+       delta_usec = (end_ts.tv_sec * USEC_PER_SEC) + (end_ts.tv_nsec / NSEC_PER_USEC) - (number_yields * 100);
+
+       delta_usec = MAX(1, delta_usec); /* we could have 0 usec run if conditions weren't right */
+
+       c_seg_major_compact_stats[c_seg_major_compact_stats_now].bytes_freed_rate_us = (bytes_freed / delta_usec);
+
+       if ((c_seg_major_compact_stats_now + 1) == C_SEG_MAJOR_COMPACT_STATS_MAX) {
+               c_seg_major_compact_stats_now = 0;
+       } else {
+               c_seg_major_compact_stats_now++;
+       }
+
+       assert(c_seg_major_compact_stats_now < C_SEG_MAJOR_COMPACT_STATS_MAX);
+
+       VM_DEBUG_CONSTANT_EVENT(vm_compressor_compact_and_swap, VM_COMPRESSOR_COMPACT_AND_SWAP, DBG_FUNC_END, c_age_count, c_minor_count, c_major_count, vm_page_free_count);
 }
 
 
index 1622e5547705f21f9f76d0e86c36ad058751e2ae..0cfa661698a9cf90f409897399c50fd06a2edc19 100644 (file)
@@ -138,12 +138,15 @@ extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
 uint64_t vm_hard_throttle_threshold;
 
 
-
-#define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
-                                                ((vm_page_free_count < vm_page_throttle_limit || \
-                                                  HARD_THROTTLE_LIMIT_REACHED()) && \
-                                                 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
-
+OS_ALWAYS_INLINE
+boolean_t
+NEED_TO_HARD_THROTTLE_THIS_TASK(void)
+{
+       return vm_wants_task_throttled(current_task()) ||
+              ((vm_page_free_count < vm_page_throttle_limit ||
+              HARD_THROTTLE_LIMIT_REACHED()) &&
+              proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
+}
 
 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
index 8fe4c76d860324a1dcd2b0911516071e08799389..185764384df40b47f1a753f2452ec5b33c13d60f 100644 (file)
@@ -198,6 +198,12 @@ extern void vm_rtfault_record_init(void);
 
 #endif  /* MACH_KERNEL_PRIVATE */
 
+#if XNU_KERNEL_PRIVATE
+
+boolean_t NEED_TO_HARD_THROTTLE_THIS_TASK(void);
+
+#endif
+
 #endif  /* KERNEL_PRIVATE */
 
 #endif  /* _VM_VM_FAULT_H_ */
index d9e16eab31e529d02ab1e5b818457230f883f16d..d4e947c16b6f47498347a92a41522f03a8343bfd 100644 (file)
@@ -155,6 +155,9 @@ extern int      vm_debug_events;
 
 #define VM_PAGE_GRAB                    0x126
 #define VM_PAGE_RELEASE                 0x127
+#define VM_COMPRESSOR_COMPACT_AND_SWAP  0x128
+#define VM_COMPRESSOR_DO_DELAYED_COMPACTIONS 0x129
+
 
 #define VM_PRESSURE_EVENT               0x130
 #define VM_EXECVE                       0x131
index 471312f805abb443a6ccb51be05e63d7a97258fa..1ed21b3d953ce2ae4c8963f0634958f90d1764c0 100644 (file)
@@ -671,22 +671,29 @@ T_DECL(budget_replenishment, "budget replenishes properly") {
        length = sizeof(kTestIntervalSecs);
        new_budget_ln = sizeof(new_budget);
        ret = sysctlbyname("vm.memorystatus_freeze_calculate_new_budget", &new_budget, &new_budget_ln, &kTestIntervalSecs, length);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "vm.memorystatus_freeze_calculate_new_budget");
+       T_ASSERT_POSIX_SUCCESS(ret, "vm.memorystatus_freeze_calculate_new_budget");
 
        // Grab the daily budget.
        length = sizeof(memorystatus_freeze_daily_mb_max);
        ret = sysctlbyname("kern.memorystatus_freeze_daily_mb_max", &memorystatus_freeze_daily_mb_max, &length, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.memorystatus_freeze_daily_mb_max");
+       T_ASSERT_POSIX_SUCCESS(ret, "kern.memorystatus_freeze_daily_mb_max");
 
-       memorystatus_freeze_daily_pages_max = memorystatus_freeze_daily_mb_max * 1024 * 1024 / page_size;
+       memorystatus_freeze_daily_pages_max = memorystatus_freeze_daily_mb_max * 1024UL * 1024UL / page_size;
+       T_LOG("memorystatus_freeze_daily_mb_max %u", memorystatus_freeze_daily_mb_max);
+       T_LOG("memorystatus_freeze_daily_pages_max %u", memorystatus_freeze_daily_pages_max);
+       T_LOG("page_size %u", page_size);
 
        /*
         * We're kTestIntervalSecs past a new interval. Which means we are owed kNumSecondsInDay
         * seconds of budget.
         */
        expected_new_budget_pages = memorystatus_freeze_daily_pages_max;
+       T_LOG("expected_new_budget_pages before %u", expected_new_budget_pages);
+       T_ASSERT_EQ(kTestIntervalSecs, 60 * 60 * 32, "kTestIntervalSecs did not change");
        expected_new_budget_pages += ((kTestIntervalSecs * kFixedPointFactor) / (kNumSecondsInDay)
            * memorystatus_freeze_daily_pages_max) / kFixedPointFactor;
+       T_LOG("expected_new_budget_pages after %u", expected_new_budget_pages);
+       T_LOG("memorystatus_freeze_daily_pages_max after %u", memorystatus_freeze_daily_pages_max);
 
        T_QUIET; T_ASSERT_EQ(new_budget, expected_new_budget_pages, "Calculate new budget behaves correctly.");
 }
index e8bc324b1df5a11928f69575a5269b5fa43ac3fd..b6d3f4f69c81794722097120f2e746b43aa1b0dd 100755 (executable)
@@ -2,6 +2,8 @@ from xnu import *
 from utils import *
 import sys
 
+current_KDP_mode = "swhosted"
+
 def GetKDPPacketHeaderInt(request=0, is_reply=False, seq=0, length=0, key=0):
     """ create a 64 bit number that could be saved as pkt_hdr_t
         params:
@@ -283,3 +285,26 @@ def KDPSetDumpInfo(cmd_args=None):
         print "Failed to save the dumpinfo."
     return retval
 
+@lldb_command('kdpmode')
+def KDPMode(cmd_args=None):
+    """
+    Change KDP mode between software hosted and hardware probe.
+    When lldb is connected to a KDP server backed by a hardware debug tool
+    setting this to 'hwprobe' enables physical memory access.
+    
+    swhosted: LLDB is connected to the target using a serial or socket connection.
+    hwprobe: LLDB is connected to the target using a hardware probe.
+
+    usage: kdpmode <mode>
+    mode: 'swhosted' or 'hwprobe'
+    """
+    global current_KDP_mode
+
+    if cmd_args == None or len(cmd_args) == 0:
+        return current_KDP_mode
+    if len(cmd_args) > 1 or cmd_args[0] not in {'swhosted', 'hwprobe'}:
+        print "Invalid Arguments", KDPMode.__doc__
+    else:
+        current_KDP_mode = cmd_args[0]
+    return
+
index 8bff2689cb0f9e418e561b154340a90759f11146..8bb7134fc39e548dd89d9c9f12a2c33c7263d633 100755 (executable)
@@ -2,6 +2,7 @@ from xnu import *
 import xnudefines
 from kdp import *
 from utils import *
+import struct
 
 def ReadPhysInt(phys_addr, bitsize = 64, cpuval = None):
     """ Read a physical memory data based on address.
@@ -65,38 +66,69 @@ def KDPReadPhysMEM(address, bits):
         print "Target is not connected over kdp. Nothing to do here."
         return retval
 
-    input_address = unsigned(addressof(kern.globals.manual_pkt.input))
-    len_address = unsigned(addressof(kern.globals.manual_pkt.len))
-    data_address = unsigned(addressof(kern.globals.manual_pkt.data))
-    if not WriteInt32ToMemoryAddress(0, input_address):
-        return retval
-
-    kdp_pkt_size = GetType('kdp_readphysmem64_req_t').GetByteSize()
-    if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
-        return retval
+    if "hwprobe" == KDPMode():
+        # Send the proper KDP command and payload to the bare metal debug tool via a KDP server
+        addr_for_kdp = struct.unpack("<Q", struct.pack(">Q", address))[0]
+        byte_count = struct.unpack("<I", struct.pack(">I", bits/8))[0]
+        packet = "{0:016x}{1:08x}{2:04x}".format(addr_for_kdp, byte_count, 0x0)
 
-    data_addr = int(addressof(kern.globals.manual_pkt))
-    pkt = kern.GetValueFromAddress(data_addr, 'kdp_readphysmem64_req_t *')
+        ret_obj = lldb.SBCommandReturnObject()
+        ci = lldb.debugger.GetCommandInterpreter()
+        ci.HandleCommand('process plugin packet send -c 25 -p {0}'.format(packet), ret_obj)
 
-    header_value =GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_READPHYSMEM64'), length=kdp_pkt_size)
+        if ret_obj.Succeeded():
+            value = ret_obj.GetOutput()
 
-    if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
-         WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
-         WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
-         WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
-         ):
-
-        if WriteInt32ToMemoryAddress(1, input_address):
-            # now read data from the kdp packet
-            data_address = unsigned(addressof(kern.GetValueFromAddress(int(addressof(kern.globals.manual_pkt.data)), 'kdp_readphysmem64_reply_t *').data))
             if bits == 64 :
-                retval =  kern.GetValueFromAddress(data_address, 'uint64_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                pack_fmt = "<Q"
+                unpack_fmt = ">Q"
             if bits == 32 :
-                retval =  kern.GetValueFromAddress(data_address, 'uint32_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                pack_fmt = "<I"
+                unpack_fmt = ">I"
             if bits == 16 :
-                retval =  kern.GetValueFromAddress(data_address, 'uint16_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                pack_fmt = "<H"
+                unpack_fmt = ">H"
             if bits == 8 :
-                retval =  kern.GetValueFromAddress(data_address, 'uint8_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                pack_fmt = "<B"
+                unpack_fmt = ">B"
+
+            retval = struct.unpack(unpack_fmt, struct.pack(pack_fmt, int(value[-((bits/4)+1):], 16)))[0]
+
+    else:
+        input_address = unsigned(addressof(kern.globals.manual_pkt.input))
+        len_address = unsigned(addressof(kern.globals.manual_pkt.len))
+        data_address = unsigned(addressof(kern.globals.manual_pkt.data))
+
+        if not WriteInt32ToMemoryAddress(0, input_address):
+            return retval
+
+        kdp_pkt_size = GetType('kdp_readphysmem64_req_t').GetByteSize()
+        if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
+            return retval
+
+        data_addr = int(addressof(kern.globals.manual_pkt))
+        pkt = kern.GetValueFromAddress(data_addr, 'kdp_readphysmem64_req_t *')
+
+        header_value =GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_READPHYSMEM64'), length=kdp_pkt_size)
+
+        if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
+             WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
+             WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
+             WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
+             ):
+
+            if WriteInt32ToMemoryAddress(1, input_address):
+                # now read data from the kdp packet
+                data_address = unsigned(addressof(kern.GetValueFromAddress(int(addressof(kern.globals.manual_pkt.data)), 'kdp_readphysmem64_reply_t *').data))
+                if bits == 64 :
+                    retval =  kern.GetValueFromAddress(data_address, 'uint64_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                if bits == 32 :
+                    retval =  kern.GetValueFromAddress(data_address, 'uint32_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                if bits == 16 :
+                    retval =  kern.GetValueFromAddress(data_address, 'uint16_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+                if bits == 8 :
+                    retval =  kern.GetValueFromAddress(data_address, 'uint8_t *').GetSBValue().Dereference().GetValueAsUnsigned()
+
     return retval
 
 
@@ -112,42 +144,75 @@ def KDPWritePhysMEM(address, intval, bits):
     if "kdp" != GetConnectionProtocol():
         print "Target is not connected over kdp. Nothing to do here."
         return False
-    input_address = unsigned(addressof(kern.globals.manual_pkt.input))
-    len_address = unsigned(addressof(kern.globals.manual_pkt.len))
-    data_address = unsigned(addressof(kern.globals.manual_pkt.data))
-    if not WriteInt32ToMemoryAddress(0, input_address):
-        return False
+    
+    if "hwprobe" == KDPMode():
+        # Send the proper KDP command and payload to the bare metal debug tool via a KDP server
+        addr_for_kdp = struct.unpack("<Q", struct.pack(">Q", address))[0]
+        byte_count = struct.unpack("<I", struct.pack(">I", bits/8))[0]
+
+        if bits == 64 :
+            pack_fmt = ">Q"
+            unpack_fmt = "<Q"
+        if bits == 32 :
+            pack_fmt = ">I"
+            unpack_fmt = "<I"
+        if bits == 16 :
+            pack_fmt = ">H"
+            unpack_fmt = "<H"
+        if bits == 8 :
+            pack_fmt = ">B"
+            unpack_fmt = "<B"
+
+        data_val = struct.unpack(unpack_fmt, struct.pack(pack_fmt, intval))[0]
+
+        packet = "{0:016x}{1:08x}{2:04x}{3:016x}".format(addr_for_kdp, byte_count, 0x0, data_val)
+
+        ret_obj = lldb.SBCommandReturnObject()
+        ci = lldb.debugger.GetCommandInterpreter()
+        ci.HandleCommand('process plugin packet send -c 26 -p {0}'.format(packet), ret_obj)
+
+        if ret_obj.Succeeded():
+            return True
+        else:
+            return False
 
-    kdp_pkt_size = GetType('kdp_writephysmem64_req_t').GetByteSize() + (bits / 8)
-    if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
-        return False
+    else:
+        input_address = unsigned(addressof(kern.globals.manual_pkt.input))
+        len_address = unsigned(addressof(kern.globals.manual_pkt.len))
+        data_address = unsigned(addressof(kern.globals.manual_pkt.data))
+        if not WriteInt32ToMemoryAddress(0, input_address):
+            return False
 
-    data_addr = int(addressof(kern.globals.manual_pkt))
-    pkt = kern.GetValueFromAddress(data_addr, 'kdp_writephysmem64_req_t *')
-
-    header_value =GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_WRITEPHYSMEM64'), length=kdp_pkt_size)
-
-    if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
-         WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
-         WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
-         WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
-         ):
-
-        if bits == 8:
-            if not WriteInt8ToMemoryAddress(intval, int(addressof(pkt.data))):
-                return False
-        if bits == 16:
-            if not WriteInt16ToMemoryAddress(intval, int(addressof(pkt.data))):
-                return False
-        if bits == 32:
-            if not WriteInt32ToMemoryAddress(intval, int(addressof(pkt.data))):
-                return False
-        if bits == 64:
-            if not WriteInt64ToMemoryAddress(intval, int(addressof(pkt.data))):
-                return False
-        if WriteInt32ToMemoryAddress(1, input_address):
-            return True
-    return False
+        kdp_pkt_size = GetType('kdp_writephysmem64_req_t').GetByteSize() + (bits / 8)
+        if not WriteInt32ToMemoryAddress(kdp_pkt_size, len_address):
+            return False
+
+        data_addr = int(addressof(kern.globals.manual_pkt))
+        pkt = kern.GetValueFromAddress(data_addr, 'kdp_writephysmem64_req_t *')
+
+        header_value =GetKDPPacketHeaderInt(request=GetEnumValue('kdp_req_t::KDP_WRITEPHYSMEM64'), length=kdp_pkt_size)
+
+        if ( WriteInt64ToMemoryAddress((header_value), int(addressof(pkt.hdr))) and
+             WriteInt64ToMemoryAddress(address, int(addressof(pkt.address))) and
+             WriteInt32ToMemoryAddress((bits/8), int(addressof(pkt.nbytes))) and
+             WriteInt16ToMemoryAddress(xnudefines.lcpu_self, int(addressof(pkt.lcpu)))
+             ):
+
+            if bits == 8:
+                if not WriteInt8ToMemoryAddress(intval, int(addressof(pkt.data))):
+                    return False
+            if bits == 16:
+                if not WriteInt16ToMemoryAddress(intval, int(addressof(pkt.data))):
+                    return False
+            if bits == 32:
+                if not WriteInt32ToMemoryAddress(intval, int(addressof(pkt.data))):
+                    return False
+            if bits == 64:
+                if not WriteInt64ToMemoryAddress(intval, int(addressof(pkt.data))):
+                    return False
+            if WriteInt32ToMemoryAddress(1, input_address):
+                return True
+        return False
 
 
 def WritePhysInt(phys_addr, int_val, bitsize = 64):