diff --git a/bsd/netinet/mptcp.c b/bsd/netinet/mptcp.c
index 1945ecfcfeb9e6660277a4ef25e7dd752a02c619..85a8cebc17245d25404a55f3c89ef91adf5d12e7 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+/*
+ * A note on the MPTCP/NECP-interactions:
+ *
+ * MPTCP uses NECP-callbacks to get notified of interface/policy events.
+ * At the MPTCP-layer, it registers for interface-events through a call to
+ * necp_client_register_multipath_cb.
+ * To get per-flow events (aka per TCP-subflow), we register with
+ * necp_client_register_socket_flow. Both registrations happen by using the
+ * necp-client-uuid that comes from the app.
+ *
+ * The locking is rather tricky. In general, we expect the lock-ordering to
+ * happen from necp-fd -> necp-client -> mpp_lock.
+ *
+ * There are however some subtleties.
+ *
+ * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
+ * safe, because it is the very first time this MPTCP-connection goes into NECP.
+ * As we go into NECP we take the NECP-locks and thus are guaranteed that no
+ * NECP-event will deadlock us, because these NECP-events also first take the
+ * NECP-locks. Either they win the race and thus won't find our
+ * MPTCP-connection, or MPTCP wins the race and safely installs the callbacks
+ * while holding the NECP lock.
+ *
+ * 2. When registering the subflow-callbacks, we must unlock the mpp_lock,
+ * because we have already registered callbacks and might race against an
+ * NECP-event that will match on our socket. So, we have to unlock to be safe.
+ *
+ * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), at which
+ * point the so_usecount has reached 0. We must be careful not to remove the
+ * mpp_socket pointers before we have unregistered the callback, because,
+ * again, we might be racing against an NECP-event. Unregistering must happen
+ * with an unlocked mpp_lock, because of the lock-ordering constraint. An
+ * NECP-event may thus trigger before we had a chance to unregister. That's why
+ * we need to check the so_usecount in mptcp_session_necp_cb: if we get
+ * there while the socket is being garbage-collected, the use-count will have
+ * gone down to 0 and we exit. Removal of the multipath_cb again happens by
+ * taking the NECP-locks, so any running NECP-events will finish first and exit
+ * cleanly.
+ *
+ * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
+ * the socket-lock must be unlocked for lock-ordering constraints. This gets a
+ * bit tricky, as in tcp_garbage_collect we hold both the mp_so- and the
+ * subflow-lock. So, we drop the mp_so-lock as soon as the subflow is unlinked
+ * with mptcp_subflow_del. Then, in in_pcbdispose, we drop the subflow-lock.
+ * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, it will,
+ * once it gets the lock, realize that the subflow became non-MPTCP and retry
+ * (see tcp_lock). Then it waits again on the subflow-lock. When we drop this
+ * lock in in_pcbdispose and enter necp_inpcb_dispose, the latter will have to
+ * wait for the NECP-lock (held by the other thread that is taking care of the
+ * NECP-event). So, the event now finally gets the subflow-lock, hits an
+ * so_usecount of 0, and exits. Eventually, we can remove the subflow from the
+ * NECP callback.
+ */
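+/*
+ * A rough sketch of the ordering described above (illustrative only; the
+ * registration-calls below are hypothetical simplifications of the real
+ * NECP API):
+ *
+ *     socket_lock(mp_so);                      // mpp_lock held
+ *     necp_client_register_multipath_cb(...);  // safe: nobody knows us yet
+ *
+ *     socket_unlock(mp_so);                    // drop before per-flow reg.
+ *     necp_client_register_socket_flow(...);   // may race with NECP-events
+ *     socket_lock(mp_so);
+ *
+ *     // Teardown: unregister with mpp_lock dropped, and only then clear
+ *     // the mpp_socket pointers (see points 3. and 4. above).
+ */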
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <kern/zalloc.h>
 #include <kern/locks.h>
 
-#include <mach/thread_act.h>
 #include <mach/sdt.h>
 
-#include <dev/random/randomdev.h>
-
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 
 int mptcp_enable = 1;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_enable, 0, "Enable Multipath TCP Support");
-
-int mptcp_dbg = 0;
-SYSCTL_INT(_net_inet_mptcp, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_dbg, 0, "Enable Multipath TCP Debugging");
+    &mptcp_enable, 0, "Enable Multipath TCP Support");
 
-/* Number of times to try negotiating MPTCP on SYN retransmissions */
-int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
+/*
+ * Number of times to try negotiating MPTCP on SYN retransmissions.
+ * We haven't seen any reports of a middlebox that is dropping all SYN-segments
+ * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
+ */
+int mptcp_mpcap_retries = 4;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
-       CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
 
 /*
  * By default, DSS checksum is turned off, revisit if we ever do
@@ -77,7 +127,7 @@ SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
  */
 int mptcp_dss_csum = 0;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_dss_csum, 0, "Enable DSS checksum");
+    &mptcp_dss_csum, 0, "Enable DSS checksum");
 
 /*
  * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
@@ -85,50 +135,213 @@ SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
  */
 int mptcp_fail_thresh = 1;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_fail_thresh, 0, "Failover threshold");
-
+    &mptcp_fail_thresh, 0, "Failover threshold");
 
 /*
  * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
  * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
  * Some carrier networks have a timeout of 10 or 15 minutes.
  */
-int mptcp_subflow_keeptime = 60*14;
+int mptcp_subflow_keeptime = 60 * 14;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
+    &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
 
-/*
- * MP_PRIO option.
- */
-int mptcp_mpprio_enable = 1;
-SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mpprio, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_mpprio_enable, 0, "Enable MP_PRIO option");
+int mptcp_rtthist_rtthresh = 600;
+SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
 
-/*
- * REMOVE_ADDR option.
- */
-int mptcp_remaddr_enable = 1;
-SYSCTL_INT(_net_inet_mptcp, OID_AUTO, remaddr, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_remaddr_enable, 0, "Enable REMOVE_ADDR option");
+int mptcp_rtothresh = 1500;
+SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_rtothresh, 0, "RTO threshold");
 
 /*
- * FastJoin Option
+ * Probe the preferred path, when it is not in use
  */
-int mptcp_fastjoin = 1;
-SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fastjoin, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_fastjoin, 0, "Enable FastJoin Option");
+uint32_t mptcp_probeto = 1000;
+SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_probeto, 0, "Disable probing by setting to 0");
 
-int mptcp_zerortt_fastjoin = 0;
-SYSCTL_INT(_net_inet_mptcp, OID_AUTO, zerortt_fastjoin, CTLFLAG_RW |
-       CTLFLAG_LOCKED, &mptcp_zerortt_fastjoin, 0,
-       "Enable Zero RTT Fast Join");
+uint32_t mptcp_probecnt = 5;
+SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_probecnt, 0, "Number of probe writes");
 
-/*
- * R/W Notification on resume
- */
-int mptcp_rwnotify = 0;
-SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rwnotify, CTLFLAG_RW | CTLFLAG_LOCKED,
-       &mptcp_rwnotify, 0, "Enable RW notify on resume");
+static int
+mptcp_reass_present(struct socket *mp_so)
+{
+       struct mptses *mpte = mpsotompte(mp_so);
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
+       struct tseg_qent *q;
+       int dowakeup = 0;
+       int flags = 0;
+
+       /*
+        * Present data to user, advancing rcv_nxt through
+        * completed sequence space.
+        */
+       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
+               return flags;
+       }
+       q = LIST_FIRST(&mp_tp->mpt_segq);
+       if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
+               return flags;
+       }
+
+       /*
+        * If there is already another thread doing reassembly for this
+        * connection, it is better to let it finish the job --
+        * (radar 16316196)
+        */
+       if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
+               return flags;
+       }
+
+       mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
+
+       do {
+               mp_tp->mpt_rcvnxt += q->tqe_len;
+               LIST_REMOVE(q, tqe_q);
+               if (mp_so->so_state & SS_CANTRCVMORE) {
+                       m_freem(q->tqe_m);
+               } else {
+                       flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
+                       if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
+                               dowakeup = 1;
+                       }
+               }
+               zfree(tcp_reass_zone, q);
+               mp_tp->mpt_reassqlen--;
+               q = LIST_FIRST(&mp_tp->mpt_segq);
+       } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
+       mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
+
+       if (dowakeup) {
+               sorwakeup(mp_so); /* done with socket lock held */
+       }
+       return flags;
+}
+
+static int
+mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
+{
+       struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
+       u_int64_t mb_dsn = phdr->mp_dsn;
+       struct tseg_qent *q;
+       struct tseg_qent *p = NULL;
+       struct tseg_qent *nq;
+       struct tseg_qent *te = NULL;
+       uint32_t qlimit;
+
+       /*
+        * Limit the number of segments in the reassembly queue to prevent
+        * holding on to too many segments (and thus running out of mbufs).
+        * Make sure to let through the missing segment that caused this
+        * queue.  Always keep one global queue entry spare to be able to
+        * process the missing segment.
+        */
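+       /*
+        * Worked example (illustrative numbers): with sb_hiwat = 512 KB the
+        * first operand is MAX(100, 512) = 512 entries, so qlimit becomes
+        * MIN(512, tcp_autorcvbuf_max >> 10). Note that an in-order segment
+        * (mb_dsn == mpt_rcvnxt) is admitted even when the queue is full.
+        */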
+       qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
+           (tcp_autorcvbuf_max >> 10));
+       if (mb_dsn != mp_tp->mpt_rcvnxt &&
+           (mp_tp->mpt_reassqlen + 1) >= qlimit) {
+               tcpstat.tcps_mptcp_rcvmemdrop++;
+               m_freem(m);
+               *tlenp = 0;
+               return 0;
+       }
+
+       /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
+       te = (struct tseg_qent *) zalloc(tcp_reass_zone);
+       if (te == NULL) {
+               tcpstat.tcps_mptcp_rcvmemdrop++;
+               m_freem(m);
+               return 0;
+       }
+
+       mp_tp->mpt_reassqlen++;
+
+       /*
+        * Find a segment which begins after this one does.
+        */
+       LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
+               if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
+                       break;
+               }
+               p = q;
+       }
+
+       /*
+        * If there is a preceding segment, it may provide some of
+        * our data already.  If so, drop the data from the incoming
+        * segment.  If it provides all of our data, drop us.
+        */
+       if (p != NULL) {
+               int64_t i;
+               /* conversion to int (in i) handles seq wraparound */
+               i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
+               if (i > 0) {
+                       if (i >= *tlenp) {
+                               tcpstat.tcps_mptcp_rcvduppack++;
+                               m_freem(m);
+                               zfree(tcp_reass_zone, te);
+                               te = NULL;
+                               mp_tp->mpt_reassqlen--;
+                               /*
+                                * Try to present any queued data
+                                * at the left window edge to the user.
+                                * This is needed after the 3-WHS
+                                * completes.
+                                */
+                               goto out;
+                       }
+                       VERIFY(i <= INT_MAX);
+                       m_adj(m, (int)i);
+                       *tlenp -= i;
+                       phdr->mp_dsn += i;
+               }
+       }
+
+       tcpstat.tcps_mp_oodata++;
+
+       /*
+        * While we overlap succeeding segments trim them or,
+        * if they are completely covered, dequeue them.
+        */
+       while (q) {
+               int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
+               if (i <= 0) {
+                       break;
+               }
+
+               if (i < q->tqe_len) {
+                       q->tqe_m->m_pkthdr.mp_dsn += i;
+                       q->tqe_len -= i;
+
+                       VERIFY(i <= INT_MAX);
+                       m_adj(q->tqe_m, (int)i);
+                       break;
+               }
+
+               nq = LIST_NEXT(q, tqe_q);
+               LIST_REMOVE(q, tqe_q);
+               m_freem(q->tqe_m);
+               zfree(tcp_reass_zone, q);
+               mp_tp->mpt_reassqlen--;
+               q = nq;
+       }
+
+       /* Insert the new segment queue entry into place. */
+       te->tqe_m = m;
+       te->tqe_th = NULL;
+       te->tqe_len = *tlenp;
+
+       if (p == NULL) {
+               LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
+       } else {
+               LIST_INSERT_AFTER(p, te, tqe_q);
+       }
+
+out:
+       return mptcp_reass_present(mp_so);
+}
 
 /*
  * MPTCP input, called when data has been read from a subflow socket.
@@ -138,20 +351,21 @@ mptcp_input(struct mptses *mpte, struct mbuf *m)
 {
        struct socket *mp_so;
        struct mptcb *mp_tp = NULL;
-       u_int64_t mb_dsn;
-       u_int32_t mb_datalen;
-       int count = 0;
+       int count = 0, wakeup = 0;
        struct mbuf *save = NULL, *prev = NULL;
        struct mbuf *freelist = NULL, *tail = NULL;
-       boolean_t in_fallback = FALSE;
 
        VERIFY(m->m_flags & M_PKTHDR);
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mp_so = mptetoso(mpte);
+       mp_tp = mpte->mpte_mptcb;
+
+       socket_lock_assert_owned(mp_so);
 
        DTRACE_MPTCP(input);
 
+       mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
+
        /*
         * Each mbuf contains MPTCP Data Sequence Map
         * Process the data for reassembly, delivery to MPTCP socket
@@ -160,39 +374,76 @@ mptcp_input(struct mptses *mpte, struct mbuf *m)
         */
        count = mp_so->so_rcv.sb_cc;
 
-       VERIFY(m != NULL);
-       mp_tp = mpte->mpte_mptcb;
-       VERIFY(mp_tp != NULL);
-
-       /* Ok to check for this flag without lock as its set in this thread */
-       in_fallback = (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
-
        /*
         * In the degraded fallback case, data is accepted without DSS map
         */
-       if (in_fallback) {
-fallback: 
-               /* 
-                * assume degraded flow as this may be the first packet 
-                * without DSS, and the subflow state is not updated yet. 
+       if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
+               struct mbuf *iter;
+               int mb_dfin = 0;
+fallback:
+               mptcp_sbrcv_grow(mp_tp);
+
+               iter = m;
+               while (iter) {
+                       if ((iter->m_flags & M_PKTHDR) &&
+                           (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
+                               mb_dfin = 1;
+                       }
+
+                       if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
+                               /* Don't add zero-length packets, so skip it! */
+                               if (prev == NULL) {
+                                       m = iter->m_next;
+                                       m_free(iter);
+                                       iter = m;
+                               } else {
+                                       prev->m_next = iter->m_next;
+                                       m_free(iter);
+                                       iter = prev->m_next;
+                               }
+
+                               /* It was a zero-length packet so next one must be a pkthdr */
+                               VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
+                       } else {
+                               prev = iter;
+                               iter = iter->m_next;
+                       }
+               }
+
+               /*
+                * assume degraded flow as this may be the first packet
+                * without DSS, and the subflow state is not updated yet.
                 */
-               if (sbappendstream(&mp_so->so_rcv, m))
+               if (sbappendstream_rcvdemux(mp_so, m)) {
                        sorwakeup(mp_so);
+               }
+
                DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
                    struct socket *, mp_so,
                    struct sockbuf *, &mp_so->so_rcv,
                    struct sockbuf *, &mp_so->so_snd,
                    struct mptses *, mpte);
                count = mp_so->so_rcv.sb_cc - count;
-               mptcplog3((LOG_DEBUG, "%s: fread %d bytes\n", __func__, count));
+
+               mp_tp->mpt_rcvnxt += count;
+
+               if (mb_dfin) {
+                       mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
+                       socantrcvmore(mp_so);
+               }
                return;
        }
 
-       MPT_LOCK(mp_tp);
        do {
+               u_int64_t mb_dsn;
+               int32_t mb_datalen;
+               int64_t todrop;
+               int mb_dfin = 0;
+
+               VERIFY(m->m_flags & M_PKTHDR);
+
                /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
                if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
-                       MPT_UNLOCK(mp_tp);
                        goto fallback;
                }
 
@@ -211,83 +462,165 @@ fallback:
                        prev = save;
                        save = save->m_next;
                }
-               if (prev)
+               if (prev) {
                        prev->m_next = NULL;
-               else
+               } else {
                        m->m_next = NULL;
+               }
 
                mb_dsn = m->m_pkthdr.mp_dsn;
                mb_datalen = m->m_pkthdr.mp_rlen;
 
-               if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvatmark)) {
-                       tcpstat.tcps_mp_oodata++;
-                       MPT_UNLOCK(mp_tp);
-                       m_freem(m);
-                       return;
+               todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
+               if (todrop > 0) {
+                       tcpstat.tcps_mptcp_rcvpackafterwin++;
+
+                       os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                           (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
+                           mp_tp->mpt_rcvwnd, todrop);
+
+                       if (todrop >= mb_datalen) {
+                               if (freelist == NULL) {
+                                       freelist = m;
+                               } else {
+                                       tail->m_next = m;
+                               }
+
+                               if (prev != NULL) {
+                                       tail = prev;
+                               } else {
+                                       tail = m;
+                               }
+
+                               m = save;
+                               prev = save = NULL;
+                               continue;
+                       } else {
+                               VERIFY(todrop <= INT_MAX);
+                               m_adj(m, (int)-todrop);
+                               mb_datalen -= todrop;
+                               m->m_pkthdr.mp_rlen -= todrop;
+                       }
+
                        /*
-                        * Reassembly queue support here in future. Per spec,
-                        * senders must implement retransmission timer to
-                        * retransmit unacked data. Dropping out of order
-                        * gives a slight hit on performance but allows us to
-                        * deploy MPTCP and protects us against in-window DoS
-                        * attacks that attempt to use up memory by sending
-                        * out of order data. When doing load sharing across
-                        * subflows, out of order support is a must.
+                        * We drop from the right edge of the mbuf, thus the
+                        * DATA_FIN is dropped as well
                         */
+                       m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
                }
 
-               if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvatmark)) {
+               if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
                        if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
-                           mp_tp->mpt_rcvatmark)) {
-                               if (freelist == NULL)
+                           mp_tp->mpt_rcvnxt)) {
+                               if (freelist == NULL) {
                                        freelist = m;
-                               else
+                               } else {
                                        tail->m_next = m;
+                               }
 
-                               if (prev != NULL)
+                               if (prev != NULL) {
                                        tail = prev;
-                               else
+                               } else {
                                        tail = m;
+                               }
 
                                m = save;
                                prev = save = NULL;
                                continue;
                        } else {
-                               m_adj(m, (mp_tp->mpt_rcvatmark - mb_dsn));
+                               VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
+                               m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
+                               mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
+                               mb_dsn = mp_tp->mpt_rcvnxt;
+                               VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
+                               m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
+                               m->m_pkthdr.mp_dsn = mb_dsn;
                        }
-                       mptcplog((LOG_INFO, "%s: %llu %d 2 \n", __func__,
-                           mp_tp->mpt_rcvatmark, m->m_pkthdr.len));
                }
 
-               MPT_UNLOCK(mp_tp);
-               if (sbappendstream(&mp_so->so_rcv, m)) {
-                       sorwakeup(mp_so);
+               if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
+                   !LIST_EMPTY(&mp_tp->mpt_segq)) {
+                       mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
+
+                       goto next;
+               }
+               mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
+
+               mptcp_sbrcv_grow(mp_tp);
+
+               if (sbappendstream_rcvdemux(mp_so, m)) {
+                       wakeup = 1;
                }
+
                DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
                    struct sockbuf *, &mp_so->so_rcv,
                    struct sockbuf *, &mp_so->so_snd,
                    struct mptses *, mpte,
                    struct mptcb *, mp_tp);
-               MPT_LOCK(mp_tp);
                count = mp_so->so_rcv.sb_cc - count;
                tcpstat.tcps_mp_rcvtotal++;
                tcpstat.tcps_mp_rcvbytes += count;
-               mptcplog3((LOG_DEBUG, "%s: read %d bytes\n", __func__, count));
-               /*
-                * The data received at the MPTCP layer will never exceed the
-                * receive window because anything to the right of the
-                * receive window will be trimmed at the subflow level.
-                */
-               mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
-               mp_tp->mpt_rcvatmark += count;
+
+               mp_tp->mpt_rcvnxt += count;
+
+next:
+               if (mb_dfin) {
+                       mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
+                       socantrcvmore(mp_so);
+               }
                m = save;
                prev = save = NULL;
                count = mp_so->so_rcv.sb_cc;
        } while (m);
-       MPT_UNLOCK(mp_tp);
 
-       if (freelist)
+       if (freelist) {
                m_freem(freelist);
+       }
+
+       if (wakeup) {
+               sorwakeup(mp_so);
+       }
+}
+
+boolean_t
+mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
+{
+       struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
+
+       /*
+        * Always send if there is data in the reinject-queue.
+        */
+       if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
+               return TRUE;
+       }
+
+       /*
+        * Don't send, if:
+        *
+        * 1. snd_nxt >= snd_max: basically, everything has been sent,
+        *    except when using TFO, where we might be doing a 0-byte write.
+        * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
+        * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
+        */
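+       /*
+        * Worked example for condition 2: with snduna = 100, sndwnd = 10 and
+        * sndnxt = 110, we have 100 + 10 <= 110, so we must wait for the
+        * peer's window to open before sending anything new.
+        */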
+
+       if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
+               return FALSE;
+       }
+
+       if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
+               return FALSE;
+       }
+
+       if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
+               return FALSE;
+       }
+
+       if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
+               return FALSE;
+       }
+
+       return TRUE;
 }
 
 /*
@@ -296,299 +629,509 @@ fallback:
 int
 mptcp_output(struct mptses *mpte)
 {
+       struct mptcb *mp_tp;
        struct mptsub *mpts;
        struct mptsub *mpts_tried = NULL;
        struct socket *mp_so;
+       struct mptsub *preferred_mpts = NULL;
+       uint64_t old_snd_nxt;
        int error = 0;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       if (mp_so->so_state & SS_CANTSENDMORE) {
-               return (EPIPE);
-       }
+       mp_so = mptetoso(mpte);
+       mp_tp = mpte->mpte_mptcb;
 
-try_again:
-       /* get the "best" subflow to be used for transmission */
-       mpts = mptcp_get_subflow(mpte, NULL);
-       if (mpts == NULL) {
-               mptcplog((LOG_ERR, "%s: mp_so 0x%llx has no usable subflow\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
-               goto out;
+       socket_lock_assert_owned(mp_so);
+
+       if (mp_so->so_flags & SOF_DEFUNCT) {
+               return 0;
        }
 
-       mptcplog3((LOG_INFO, "%s: mp_so 0x%llx cid %d \n", __func__,
-           (uint64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
-
-       /* In case there's just one flow, we reattempt later */
-       MPTS_LOCK(mpts);
-       if ((mpts_tried != NULL) && ((mpts == mpts_tried) ||
-           (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
-               MPTS_UNLOCK(mpts);
-               MPTS_LOCK(mpts_tried);
-               mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
-               mpts_tried->mpts_flags |= MPTSF_ACTIVE;
-               MPTS_UNLOCK(mpts_tried);
-               MPT_LOCK(mpte->mpte_mptcb);
-               mptcp_start_timer(mpte->mpte_mptcb, MPTT_REXMT);
-               MPT_UNLOCK(mpte->mpte_mptcb);
-               mptcplog((LOG_INFO, "%s: mp_so 0x%llx retry later\n",
-                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
-               goto out;
+       VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
+       mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
+
+       old_snd_nxt = mp_tp->mpt_sndnxt;
+       while (mptcp_can_send_more(mp_tp, FALSE)) {
+               /* get the "best" subflow to be used for transmission */
+               mpts = mptcp_get_subflow(mpte, &preferred_mpts);
+               if (mpts == NULL) {
+                       mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
+                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+                       break;
+               }
+
+               /* In case there's just one flow, we reattempt later */
+               if (mpts_tried != NULL &&
+                   (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
+                       mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
+                       mpts_tried->mpts_flags |= MPTSF_ACTIVE;
+                       mptcp_start_timer(mpte, MPTT_REXMT);
+                       break;
+               }
+
+               /*
+                * Automatic sizing of send socket buffer. Increase the send
+                * socket buffer size if all of the following criteria are met
+                *      1. the receiver has enough buffer space for this data
+                *      2. send buffer is filled to 7/8th with data (so we actually
+                *         have data to make use of it);
+                *         have data to make use of it);
+                */
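+               /*
+                * Example (illustrative numbers): with sb_hiwat = 64 KB, the
+                * buffer may only grow if mpt_sndwnd covers at least 4/5 of
+                * it (sndwnd / 4 * 5 >= 64 KB, i.e. sndwnd >= ~51 KB) and at
+                * least 56 KB (7/8th) of it is already queued in so_snd.
+                */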
+               if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
+                   tcp_cansbgrow(&mp_so->so_snd)) {
+                       if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
+                           mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
+                               if (sbreserve(&mp_so->so_snd,
+                                   min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
+                                   tcp_autosndbuf_max)) == 1) {
+                                       mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
+                               }
+                       }
+               }
+
+               DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
+                   struct socket *, mp_so);
+               error = mptcp_subflow_output(mpte, mpts, 0);
+               if (error) {
+                       /* can be a temporary loss of source address or other error */
+                       mpts->mpts_flags |= MPTSF_FAILINGOVER;
+                       mpts->mpts_flags &= ~MPTSF_ACTIVE;
+                       mpts_tried = mpts;
+                       if (error != ECANCELED) {
+                               os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+                                   error, mpts->mpts_flags);
+                       }
+                       break;
+               }
+               /* The model is to have only one active flow at a time */
+               mpts->mpts_flags |= MPTSF_ACTIVE;
+               mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
+
+               /* Allows us to update the smoothed rtt */
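+               /*
+                * Probing sketch, assuming the defaults mptcp_probeto = 1000
+                * (in tcp_now ticks) and mptcp_probecnt = 5: an unused
+                * preferred subflow gets a probe write once the timeout has
+                * expired, and its probe-state is reset after five probes.
+                */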
+               if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
+                       if (preferred_mpts->mpts_probesoon) {
+                               if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
+                                       mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
+                                       if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
+                                               preferred_mpts->mpts_probesoon = 0;
+                                               preferred_mpts->mpts_probecnt = 0;
+                                       }
+                               }
+                       } else {
+                               preferred_mpts->mpts_probesoon = tcp_now;
+                               preferred_mpts->mpts_probecnt = 0;
+                       }
+               }
+
+               if (mpte->mpte_active_sub == NULL) {
+                       mpte->mpte_active_sub = mpts;
+               } else if (mpte->mpte_active_sub != mpts) {
+                       mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
+                       mpte->mpte_active_sub = mpts;
+
+                       mptcpstats_inc_switch(mpte, mpts);
+               }
        }
 
-       DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
-           struct socket *, mp_so);
-       error = mptcp_subflow_output(mpte, mpts);
-       if (error) {
-               /* can be a temporary loss of source address or other error */
-               mpts->mpts_flags |= MPTSF_FAILINGOVER;
-               mpts->mpts_flags &= ~MPTSF_ACTIVE;
-               mpts_tried = mpts;
-               MPTS_UNLOCK(mpts);
-               mptcplog((LOG_INFO, "%s: error = %d \n", __func__, error));
-               goto try_again;
-       }
-       /* The model is to have only one active flow at a time */
-       mpts->mpts_flags |= MPTSF_ACTIVE;
-       MPTS_UNLOCK(mpts);
-       if (mpte->mpte_active_sub == NULL) {
-               mpte->mpte_active_sub = mpts;
-       } else if (mpte->mpte_active_sub != mpts) {
-               MPTS_LOCK(mpte->mpte_active_sub);
-               mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
-               MPTS_UNLOCK(mpte->mpte_active_sub);
-               mpte->mpte_active_sub = mpts;
+       if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
+               if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
+                   mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
+                       mptcp_finish_usrclosed(mpte);
+               }
        }
-out:
+
+       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
+
        /* subflow errors should not be percolated back up */
-       return (0);
+       return 0;
+}
+
+
+static struct mptsub *
+mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
+{
+       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+
+       /*
+        * Lower RTT? Take it, if it's our first one, or
+        * it doesn't have any loss, or the current one has
+        * loss as well.
+        */
+       if (tp->t_srtt && *currtt > tp->t_srtt &&
+           (curbest == NULL || tp->t_rxtshift == 0 ||
+           sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
+               *currtt = tp->t_srtt;
+               return mpts;
+       }
+
+       /*
+        * If we find a subflow without loss, take it always!
+        */
+       if (curbest &&
+           sototcpcb(curbest->mpts_socket)->t_rxtshift &&
+           tp->t_rxtshift == 0) {
+               *currtt = tp->t_srtt;
+               return mpts;
+       }
+
+       return curbest != NULL ? curbest : mpts;
+}
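+/*
+ * Example: if curbest has a (scaled) srtt of 120 with t_rxtshift = 2 and the
+ * candidate has an srtt of 80 with t_rxtshift = 0, the candidate wins both by
+ * the lower-RTT rule and by the no-loss rule above.
+ */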
+
+static struct mptsub *
+mptcp_return_subflow(struct mptsub *mpts)
+{
+       if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
+               return NULL;
+       }
+
+       return mpts;
+}
+
+static boolean_t
+mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
+{
+       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+       int fail_thresh = mptcp_fail_thresh;
+
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
+               fail_thresh *= 2;
+       }
+
+       return tp->t_rxtshift >= fail_thresh &&
+              (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
 }
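+/*
+ * Example: with the default mptcp_fail_thresh of 1, a handover-type session
+ * doubles the threshold to 2, so a subflow counts as slow once it has
+ * retransmitted twice while data is still queued or waiting for reinjection.
+ */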
 
 /*
  * Return the most eligible subflow to be used for sending data.
- * This function also serves to check if any alternate subflow is available
- * or not.
  */
 struct mptsub *
-mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore)
+mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
 {
+       struct tcpcb *besttp, *secondtp;
+       struct inpcb *bestinp, *secondinp;
        struct mptsub *mpts;
-       struct mptsub *fallback = NULL;
-       struct socket *so = NULL;
+       struct mptsub *best = NULL;
+       struct mptsub *second_best = NULL;
+       int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
+       /*
+        * First Step:
+        * Choose the best subflow for cellular and non-cellular interfaces.
+        */
 
        TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
-               MPTS_LOCK(mpts);
+               struct socket *so = mpts->mpts_socket;
+               struct tcpcb *tp = sototcpcb(so);
+               struct inpcb *inp = sotoinpcb(so);
+
+               mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
+                   __func__, mpts->mpts_connid, mpts->mpts_flags,
+                   INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
+                   inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
+                   tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
+                   mptcp_subflow_cwnd_space(so)),
+                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+
+               /*
+                * First, the hard conditions to reject subflows
+                * (e.g., not connected,...)
+                */
+               if (inp->inp_last_outifp == NULL) {
+                       continue;
+               }
 
-               if ((ignore) && (mpts == ignore)) {
-                       MPTS_UNLOCK(mpts);
+               if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
                        continue;
                }
 
                /* There can only be one subflow in degraded state */
                if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
-                       MPTS_UNLOCK(mpts);
+                       best = mpts;
                        break;
                }
 
                /*
-                * Subflows with Fastjoin allow data to be written before
-                * the subflow is mp capable.
+                * If this subflow is waiting to finally send, do it!
                 */
-               if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
-                   !(mpts->mpts_flags & MPTSF_FASTJ_REQD)) {
-                       MPTS_UNLOCK(mpts);
-                       continue;
+               if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
+                       return mptcp_return_subflow(mpts);
                }
 
-               if (mpts->mpts_flags & MPTSF_SUSPENDED) {
-                       MPTS_UNLOCK(mpts);
+               /*
+                * Only send if the subflow is MP_CAPABLE. The exceptions to
+                * this rule (degraded or TFO) have been taken care of above.
+                */
+               if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
                        continue;
                }
 
-               if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
-                   (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
-                       MPTS_UNLOCK(mpts);
+               if ((so->so_state & SS_ISDISCONNECTED) ||
+                   !(so->so_state & SS_ISCONNECTED) ||
+                   !TCPS_HAVEESTABLISHED(tp->t_state) ||
+                   tp->t_state > TCPS_CLOSE_WAIT) {
                        continue;
                }
 
-               if (mpts->mpts_flags & MPTSF_FAILINGOVER) {
-                       so = mpts->mpts_socket;
-                       if ((so) && (!(so->so_flags & SOF_PCBCLEARING))) {
-                               socket_lock(so, 1);
-                               if ((so->so_snd.sb_cc == 0) &&
-                                   (mptcp_no_rto_spike(so))) {
-                                       mpts->mpts_flags &= ~MPTSF_FAILINGOVER;
-                                       so->so_flags &= ~SOF_MP_TRYFAILOVER;
-                                       fallback = mpts;
-                                       socket_unlock(so, 1);
-                               } else {
-                                       fallback = mpts;
-                                       socket_unlock(so, 1);
-                                       MPTS_UNLOCK(mpts);
-                                       continue;
-                               }
-                       } else {
-                               MPTS_UNLOCK(mpts);
-                               continue;
-                       }
-               }
-
-               if (mpts->mpts_flags & MPTSF_PREFERRED) {
-                       MPTS_UNLOCK(mpts);
-                       break;
+               /*
+                * Second, the soft conditions to find the subflow with best
+                * conditions for each set (aka cellular vs non-cellular)
+                */
+               if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
+                       second_best = mptcp_choose_subflow(mpts, second_best,
+                           &exp_rtt);
+               } else {
+                       best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
                }
-
-               /* When there are no preferred flows, use first one in list */
-               fallback = mpts;
-
-               MPTS_UNLOCK(mpts);
        }
+
        /*
         * If there is no preferred or backup subflow, and there is no active
         * subflow use the last usable subflow.
         */
-       if (mpts == NULL) {
-               return (fallback);
+       if (best == NULL) {
+               return mptcp_return_subflow(second_best);
        }
 
-       return (mpts);
-}
+       if (second_best == NULL) {
+               return mptcp_return_subflow(best);
+       }
 
-struct mptsub *
-mptcp_get_pending_subflow(struct mptses *mpte, struct mptsub *ignore)
-{
-       struct mptsub *mpts = NULL;
-       
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
+       besttp = sototcpcb(best->mpts_socket);
+       bestinp = sotoinpcb(best->mpts_socket);
+       secondtp = sototcpcb(second_best->mpts_socket);
+       secondinp = sotoinpcb(second_best->mpts_socket);
 
-       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
-               MPTS_LOCK(mpts);
+       if (preferred != NULL) {
+               *preferred = mptcp_return_subflow(best);
+       }
 
-               if ((ignore) && (mpts == ignore)) {
-                       MPTS_UNLOCK(mpts);
-                       continue;
+       /*
+        * Second Step: among best and second_best, choose the one that is
+        * most appropriate for this particular service-type.
+        */
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
+               return mptcp_return_subflow(best);
+       } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+               /*
+                * Only handover if Symptoms tells us to do so.
+                */
+               if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
+                   mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
+                       return mptcp_return_subflow(second_best);
                }
 
-               if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
-                       MPTS_UNLOCK(mpts);
-                       break;
+               return mptcp_return_subflow(best);
+       } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
+               int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
+               int rto_thresh = mptcp_rtothresh;
+
+               /* Adjust with symptoms information */
+               if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
+                   mptcp_is_wifi_unusable_for_session(mpte) != 0) {
+                       rtt_thresh /= 2;
+                       rto_thresh /= 2;
+               }
+
+               if (besttp->t_srtt && secondtp->t_srtt &&
+                   besttp->t_srtt >= rtt_thresh &&
+                   secondtp->t_srtt < rtt_thresh) {
+                       tcpstat.tcps_mp_sel_rtt++;
+                       mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d,  second cid %d at rtt %d\n", __func__,
+                           best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
+                           second_best->mpts_connid,
+                           secondtp->t_srtt >> TCP_RTT_SHIFT),
+                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+                       return mptcp_return_subflow(second_best);
+               }
+
+               if (mptcp_subflow_is_slow(mpte, best) &&
+                   secondtp->t_rxtshift == 0) {
+                       return mptcp_return_subflow(second_best);
+               }
+
+               /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
+               if (besttp->t_rxtcur && secondtp->t_rxtcur &&
+                   besttp->t_rxtcur >= rto_thresh &&
+                   secondtp->t_rxtcur < rto_thresh) {
+                       tcpstat.tcps_mp_sel_rto++;
+                       mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
+                           best->mpts_connid, besttp->t_rxtcur,
+                           second_best->mpts_connid, secondtp->t_rxtcur),
+                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+
+                       return mptcp_return_subflow(second_best);
+               }
+
+               /*
+                * None of the above conditions for sending on the secondary
+                * were true. So, let's schedule on the best one, if it still
+                * has some space in the congestion-window.
+                */
+               return mptcp_return_subflow(best);
+       } else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
+               struct mptsub *tmp;
+
+               /*
+                * We only care about RTT when aggregating
+                */
+               if (besttp->t_srtt > secondtp->t_srtt) {
+                       tmp = best;
+                       best = second_best;
+                       besttp = secondtp;
+                       bestinp = secondinp;
+
+                       second_best = tmp;
+                       secondtp = sototcpcb(second_best->mpts_socket);
+                       secondinp = sotoinpcb(second_best->mpts_socket);
+               }
+
+               /* Is there still space in the congestion window? */
+               if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
+                       return mptcp_return_subflow(second_best);
                }
 
-               MPTS_UNLOCK(mpts);
+               return mptcp_return_subflow(best);
+       } else {
+               panic("Unknown service-type configured for MPTCP");
+       }
+
+       return NULL;
+}
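+/*
+ * Informal summary of the policy above: PURE_HANDOVER always returns the
+ * best (non-cellular) subflow; HANDOVER switches to second_best only when
+ * Symptoms marks Wi-Fi unusable and best is slow; INTERACTIVE prefers
+ * second_best when best crosses the RTT/RTO thresholds or is retransmitting
+ * while second_best is clean; AGGREGATE picks the lowest-RTT subflow that
+ * still has congestion-window space.
+ */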
+
+static const char *
+mptcp_event_to_str(uint32_t event)
+{
+       const char *c = "UNDEFINED";
+       switch (event) {
+       case MPCE_CLOSE:
+               c = "MPCE_CLOSE";
+               break;
+       case MPCE_RECV_DATA_ACK:
+               c = "MPCE_RECV_DATA_ACK";
+               break;
+       case MPCE_RECV_DATA_FIN:
+               c = "MPCE_RECV_DATA_FIN";
+               break;
        }
-       return (mpts);
+       return c;
+}
+
+static const char *
+mptcp_state_to_str(mptcp_state_t state)
+{
+       const char *c = "UNDEFINED";
+       switch (state) {
+       case MPTCPS_CLOSED:
+               c = "MPTCPS_CLOSED";
+               break;
+       case MPTCPS_LISTEN:
+               c = "MPTCPS_LISTEN";
+               break;
+       case MPTCPS_ESTABLISHED:
+               c = "MPTCPS_ESTABLISHED";
+               break;
+       case MPTCPS_CLOSE_WAIT:
+               c = "MPTCPS_CLOSE_WAIT";
+               break;
+       case MPTCPS_FIN_WAIT_1:
+               c = "MPTCPS_FIN_WAIT_1";
+               break;
+       case MPTCPS_CLOSING:
+               c = "MPTCPS_CLOSING";
+               break;
+       case MPTCPS_LAST_ACK:
+               c = "MPTCPS_LAST_ACK";
+               break;
+       case MPTCPS_FIN_WAIT_2:
+               c = "MPTCPS_FIN_WAIT_2";
+               break;
+       case MPTCPS_TIME_WAIT:
+               c = "MPTCPS_TIME_WAIT";
+               break;
+       case MPTCPS_TERMINATE:
+               c = "MPTCPS_TERMINATE";
+               break;
+       }
+       return c;
 }
 
 void
 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
 {
-       MPT_LOCK_ASSERT_HELD(mp_tp);
+       struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 
-       DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, 
+       socket_lock_assert_owned(mp_so);
+
+       mptcp_state_t old_state = mp_tp->mpt_state;
+
+       DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
            uint32_t, event);
 
        switch (mp_tp->mpt_state) {
        case MPTCPS_CLOSED:
        case MPTCPS_LISTEN:
-               mp_tp->mpt_state = MPTCPS_CLOSED;
+               mp_tp->mpt_state = MPTCPS_TERMINATE;
                break;
 
        case MPTCPS_ESTABLISHED:
                if (event == MPCE_CLOSE) {
                        mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
                        mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
-               }       
-               else if (event == MPCE_RECV_DATA_FIN) {
+               } else if (event == MPCE_RECV_DATA_FIN) {
                        mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
                        mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
-               }       
+               }
                break;
 
        case MPTCPS_CLOSE_WAIT:
                if (event == MPCE_CLOSE) {
                        mp_tp->mpt_state = MPTCPS_LAST_ACK;
                        mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
-               }       
+               }
                break;
 
        case MPTCPS_FIN_WAIT_1:
-               if (event == MPCE_RECV_DATA_ACK)
+               if (event == MPCE_RECV_DATA_ACK) {
                        mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
-               else if (event == MPCE_RECV_DATA_FIN) {
+               } else if (event == MPCE_RECV_DATA_FIN) {
                        mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
                        mp_tp->mpt_state = MPTCPS_CLOSING;
-               }       
+               }
                break;
 
        case MPTCPS_CLOSING:
-               if (event == MPCE_RECV_DATA_ACK)
+               if (event == MPCE_RECV_DATA_ACK) {
                        mp_tp->mpt_state = MPTCPS_TIME_WAIT;
+               }
                break;
 
        case MPTCPS_LAST_ACK:
-               if (event == MPCE_RECV_DATA_ACK)
-                       mp_tp->mpt_state = MPTCPS_TERMINATE;
+               if (event == MPCE_RECV_DATA_ACK) {
+                       mptcp_close(mp_tp->mpt_mpte, mp_tp);
+               }
                break;
 
        case MPTCPS_FIN_WAIT_2:
                if (event == MPCE_RECV_DATA_FIN) {
                        mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
                        mp_tp->mpt_state = MPTCPS_TIME_WAIT;
-               }       
+               }
                break;
 
        case MPTCPS_TIME_WAIT:
-               break;
-
-       case MPTCPS_FASTCLOSE_WAIT:
-               if (event == MPCE_CLOSE) {
-                       /* no need to adjust for data FIN */
-                       mp_tp->mpt_state = MPTCPS_TERMINATE;
-               }
-               break;
        case MPTCPS_TERMINATE:
                break;
+
        default:
                VERIFY(0);
                /* NOTREACHED */
        }
-       DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp, 
+       DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
            uint32_t, event);
-       mptcplog((LOG_INFO, "%s: state = %d\n",
-           __func__, mp_tp->mpt_state));
-}
-
-/*
- * Update the mptcb send state variables, but the actual sbdrop occurs
- * in MPTCP layer
- */
-void
-mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack)
-{
-       u_int64_t acked = 0;
-
-       acked = full_dack - mp_tp->mpt_snduna;
-
-       if (acked) {
-               mp_tp->mpt_snduna += acked;
-               /* In degraded mode, we may get some Data ACKs */
-               if ((tp->t_mpflags & TMPF_TCP_FALLBACK) &&
-                       !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
-                       MPTCP_SEQ_GT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
-                       /* bring back sndnxt to retransmit MPTCP data */
-                       mp_tp->mpt_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
-                       mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
-                       tp->t_inpcb->inp_socket->so_flags1 |= 
-                           SOF1_POST_FALLBACK_SYNC;
-               }
-       }
-       if ((full_dack == mp_tp->mpt_sndmax) &&
-           (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1)) {
-               mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_ACK);
-               tp->t_mpflags &= ~TMPF_SEND_DFIN;
-       }
+       mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
+           mptcp_state_to_str(old_state),
+           mptcp_state_to_str(mp_tp->mpt_state),
+           mptcp_event_to_str(event)),
+           MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
 }
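+/*
+ * Example walk-through of an active close: MPCE_CLOSE takes ESTABLISHED to
+ * FIN_WAIT_1 (mpt_sndmax += 1 for the DATA_FIN); the peer's DATA_ACK
+ * (MPCE_RECV_DATA_ACK) moves us to FIN_WAIT_2; its DATA_FIN
+ * (MPCE_RECV_DATA_FIN, mpt_rcvnxt += 1) lands us in TIME_WAIT.
+ */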
 
 /* If you change this function, match up mptcp_update_rcv_state_f */
@@ -604,13 +1147,10 @@ mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
        NTOHS(dss_info->mdss_data_len);
 
        /* XXX for autosndbuf grow sb here */
-       MPT_LOCK(mp_tp);
        MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
-       MPT_UNLOCK(mp_tp);
        mptcp_update_rcv_state_meat(mp_tp, tp,
            full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
            csum);
-
 }
 
 void
@@ -619,30 +1159,19 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
     uint16_t csum)
 {
        if (mdss_data_len == 0) {
-               mptcplog((LOG_INFO, "%s: Received infinite mapping.",
-                   __func__));
+               os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
+
                if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
-                       mptcplog((LOG_ERR, "%s: Bad checksum value %x \n",
-                           __func__, csum));
+                       os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
+                           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
                }
                mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
                return;
        }
-       MPT_LOCK(mp_tp);
-       if (mptcp_dbg >= MP_VERBOSE_DEBUG_1)
-               printf("%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n",
-                   __func__, seqn, mdss_data_len, full_dsn,
-                   mp_tp->mpt_rcvnxt);
-
-       /* Process a Data FIN packet , handled in mptcp_do_fin_opt */
-       if ((seqn == 0) && (mdss_data_len == 1)) {
-               mptcplog((LOG_INFO, "%s: Data FIN DSS opt state = %d \n",
-                   __func__, mp_tp->mpt_state));
-               MPT_UNLOCK(mp_tp);
-               return;
-       }
-       MPT_UNLOCK(mp_tp);
+
        mptcp_notify_mpready(tp->t_inpcb->inp_socket);
+
        tp->t_rcv_map.mpt_dsn = full_dsn;
        tp->t_rcv_map.mpt_sseq = seqn;
        tp->t_rcv_map.mpt_len = mdss_data_len;
@@ -651,40 +1180,84 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
 }
 
 
-void
-mptcp_update_rcv_state_f(struct mptcp_dss_ack_opt *dss_info, struct tcpcb *tp,
-    uint16_t csum)
+static int
+mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
+    int hdrlen)
 {
-       u_int64_t full_dsn = 0;
-       struct mptcb *mp_tp = tptomptp(tp);
+       u_int32_t datalen;
 
-       NTOHL(dss_info->mdss_dsn);
-       NTOHL(dss_info->mdss_subflow_seqn);
-       NTOHS(dss_info->mdss_data_len);
-       MPT_LOCK(mp_tp);
-       MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
-       MPT_UNLOCK(mp_tp);
-       mptcp_update_rcv_state_meat(mp_tp, tp,
-           full_dsn,
-           dss_info->mdss_subflow_seqn,
-           dss_info->mdss_data_len,
-           csum);
+       if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
+               return 0;
+       }
+
+       datalen = m->m_pkthdr.mp_rlen;
+
+       /* unacceptable DSS option; fall back to TCP */
+       if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
+               os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
+                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
+       } else {
+               return 0;
+       }
+       tp->t_mpflags |= TMPF_SND_MPFAIL;
+       mptcp_notify_mpfail(so);
+       m_freem(m);
+       return -1;
 }
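
The acceptability test above reduces to one comparison: the TCP payload may
not extend past what the DSS mapping claims to cover. As plain arithmetic
(helper name and numbers hypothetical):

    /* Does a DSS mapping of datalen bytes cover a packet of pkt_len
     * bytes, of which hdrlen are protocol headers? */
    static int
    dss_map_covers(int pkt_len, int hdrlen, uint32_t datalen)
    {
        return pkt_len <= (int)datalen + hdrlen;
    }

    /* dss_map_covers(52 + 1448, 52, 1400) == 0: the segment carries
     * 1448 payload bytes but the mapping only claims 1400, so the
     * subflow signals MP_FAIL and falls back to plain TCP. */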
 
-void
-mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt *dss_info,
-    struct tcpcb *tp, uint16_t csum)
+int
+mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+    int drop_hdrlen)
+{
+       mptcp_insert_rmap(tp, m, th);
+       if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
+           drop_hdrlen) != 0) {
+               return -1;
+       }
+       return 0;
+}
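
A hypothetical call site, modeled on the tcp_input() path: the segment is
dropped before reassembly when the mapping check fails, and the mbuf has
already been freed inside mptcp_validate_dss_map():

    if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
        if (mptcp_input_preproc(tp, m, th, drop_hdrlen) != 0) {
            /* mbuf freed, MP_FAIL scheduled; nothing more to do */
            return;
        }
    }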
+
+static uint16_t
+mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
+    uint16_t dlen, uint16_t csum, int dfin)
 {
-       u_int64_t dsn = mptcp_ntoh64(dss_info->mdss_dsn);
        struct mptcb *mp_tp = tptomptp(tp);
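+       /* A DATA_FIN occupies one data-level sequence number but carries
+        * no payload byte, so it must not be counted in the checksummed
+        * length (the pseudo-header below still uses the full dlen). */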
+       int real_len = dlen - dfin;
+       uint32_t sum = 0;
 
-       NTOHL(dss_info->mdss_subflow_seqn);
-       NTOHS(dss_info->mdss_data_len);
-       mptcp_update_rcv_state_meat(mp_tp, tp,
-           dsn,
-           dss_info->mdss_subflow_seqn,
-           dss_info->mdss_data_len,
-           csum);
+       VERIFY(real_len >= 0);
+
+       if (mp_tp == NULL) {
+               return 0;
+       }
+
+       if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
+               return 0;
+       }
+
+       if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
+               return 0;
+       }
+
+       /*
+        * The remote side may send a packet with fewer bytes than the
+        * claimed DSS checksum length.
+        */
+       if ((int)m_length2(m, NULL) < real_len) {
+               return 0xffff;
+       }
+
+       if (real_len != 0) {
+               sum = m_sum16(m, 0, real_len);
+       }
+
+       sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
+       ADDCARRY(sum);
+
+       DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
+           uint32_t, sum);
+
+       return ~sum & 0xffff;
 }
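
For reference, the same fold without the kernel helpers m_sum16() and
in_pseudo64(): a self-contained userland sketch of the RFC 6824 DSS
checksum, all names hypothetical. Here len is the number of payload bytes
actually present (dlen minus a DATA_FIN), while dlen feeds the
pseudo-header:

    #include <stdint.h>
    #include <stddef.h>

    static uint16_t
    ones_fold(uint32_t sum)
    {
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
    }

    /* One's-complement sum over the payload plus the DSS pseudo-header:
     * 64-bit DSN, 32-bit subflow sequence number, 16-bit data-level
     * length, and the 16-bit checksum field itself (zero on send). */
    static uint16_t
    dss_csum(const uint8_t *buf, uint16_t len, uint64_t dsn,
        uint32_t sseq, uint16_t dlen, uint16_t csum_field)
    {
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
            sum += (uint32_t)((buf[i] << 8) | buf[i + 1]);
        if (len & 1)
            sum += (uint32_t)buf[len - 1] << 8;

        sum += (uint32_t)(dsn >> 48) & 0xffff;
        sum += (uint32_t)(dsn >> 32) & 0xffff;
        sum += (uint32_t)(dsn >> 16) & 0xffff;
        sum += (uint32_t)dsn & 0xffff;
        sum += sseq >> 16;
        sum += sseq & 0xffff;
        sum += dlen;
        sum += csum_field;

        return (uint16_t)(~ones_fold(sum) & 0xffff);
    }

On receive, folding the peer's checksum field back in makes a valid segment
sum to all-ones, so dss_csum(..., received_csum) returns 0: the same
"~sum & 0xffff" test mptcp_input_csum() performs above.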
 
 /*
@@ -695,84 +1268,335 @@ mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt *dss_info,
  * DSS option.
  */
 
+int
+mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
+    uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
+{
+       uint16_t mptcp_csum;
+
+       mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
+       if (mptcp_csum) {
+               tp->t_mpflags |= TMPF_SND_MPFAIL;
+               mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
+               m_freem(m);
+               tcpstat.tcps_mp_badcsum++;
+               return -1;
+       }
+       return 0;
+}
+
 uint16_t
-mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, int off)
+mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
 {
-       struct mptcb *mp_tp = tptomptp(tp);
        uint32_t sum = 0;
-       uint64_t dsn;
-       uint32_t sseq;
-       uint16_t len;
-       uint16_t csum;
 
-       if (mp_tp == NULL)
-               return (0);
+       if (dlen) {
+               sum = m_sum16(m, 0, dlen);
+       }
 
-       if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
-               return (0);
+       dss_val = mptcp_hton64(dss_val);
+       sseq = htonl(sseq);
+       dlen = htons(dlen);
+       sum += in_pseudo64(dss_val, sseq, dlen);
 
-       if (!(tp->t_mpflags & TMPF_EMBED_DSN))
-               return (0);
+       ADDCARRY(sum);
+       sum = ~sum & 0xffff;
+       DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
+       mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
+           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 
-       if (tp->t_mpflags & TMPF_TCP_FALLBACK)
-               return (0);
+       return (uint16_t)sum;
+}
 
-       /* 
-        * The remote side may send a packet with fewer bytes than the
-        * claimed DSS checksum length.
-        */
-       if ((int)m_length2(m, NULL) < (off + tp->t_rcv_map.mpt_len))
-               return (0xffff);
+/*
+ * When the WiFi signal starts fading, loss and RTT spikes become more
+ * frequent. Check whether there has been a large spike by comparing the
+ * current RTO against a tolerable spike threshold.
+ */
+boolean_t
+mptcp_no_rto_spike(struct socket *so)
+{
+       struct tcpcb *tp = intotcpcb(sotoinpcb(so));
+       int32_t spike = 0;
 
-       if (tp->t_rcv_map.mpt_len != 0)
-               sum = m_sum16(m, off, tp->t_rcv_map.mpt_len);
+       if (tp->t_rxtcur > mptcp_rtothresh) {
+               spike = tp->t_rxtcur - mptcp_rtothresh;
 
-       dsn = mptcp_hton64(tp->t_rcv_map.mpt_dsn);
-       sseq = htonl(tp->t_rcv_map.mpt_sseq);
-       len = htons(tp->t_rcv_map.mpt_len);
-       csum = tp->t_rcv_map.mpt_csum;
-       sum += in_pseudo64(dsn, sseq, (len + csum));
-       ADDCARRY(sum);
-       DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
-           uint32_t, sum);
-       mptcplog((LOG_INFO, "%s: sum = %x \n", __func__, sum));
-       return (~sum & 0xffff);
+               mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
+                   __func__, spike,
+                   tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
+                   tp->t_rttcur),
+                   (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
+       }
+
+       if (spike > 0) {
+               return FALSE;
+       } else {
+               return TRUE;
+       }
 }
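
A hypothetical caller, sketching how scheduling code might use this
(mptcp_rtothresh is the sysctl the comparison above runs against):

    /* Before handing more data to this subflow, make sure its
     * retransmission timer hasn't blown past the threshold. */
    if (!mptcp_no_rto_spike(so)) {
        /* t_rxtcur exceeded mptcp_rtothresh: treat the path as
         * degraded and let the scheduler prefer another subflow. */
    }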
 
 void
-mptcp_output_csum(struct tcpcb *tp, struct mbuf *m, int32_t len,
-    unsigned hdrlen, u_int64_t dss_val, u_int32_t *sseqp)
+mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
 {
-       struct mptcb *mp_tp = tptomptp(tp);
-       u_int32_t sum = 0;
-       uint32_t sseq;
-       uint16_t dss_len;
-       uint16_t csum = 0;
-       uint16_t *csump = NULL;
+       VERIFY(mpp->mpp_flags & flag);
+       mpp->mpp_flags &= ~flag;
 
-       if (mp_tp == NULL)
+       if (mptcp_should_defer_upcall(mpp)) {
                return;
+       }
 
-       if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
-               return;
+       if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
+               mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
+
+               mptcp_subflow_workloop(mpp->mpp_pcbe);
+       }
 
-       if (sseqp == NULL)
+       if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
+               mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
+
+               sorwakeup(mpp->mpp_socket);
+       }
+
+       if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
+               mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
+
+               sowwakeup(mpp->mpp_socket);
+       }
+}
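
The flush above is one half of a defer/replay pattern. The producer half,
sketched here with the same flags (modeled on the subflow read/write
upcalls):

    /* Instead of waking the application from a context that must not
     * upcall, record the intent; the last thread out replays it in
     * mptcp_handle_deferred_upcalls(). */
    if (mptcp_should_defer_upcall(mpp))
        mpp->mpp_flags |= MPP_SHOULD_WWAKEUP;
    else
        sowwakeup(mpp->mpp_socket);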
+
+static void
+mptcp_reset_itfinfo(struct mpt_itf_info *info)
+{
+       memset(info, 0, sizeof(*info));
+}
+
+void
+mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
+    uint32_t necp_flags, __unused bool *viable)
+{
+       boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
+       boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
+       boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
+       boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
+       struct mppcb *mp = (struct mppcb *)handle;
+       struct mptses *mpte = mptompte(mp);
+       struct socket *mp_so;
+       struct mptcb *mp_tp;
+       uint32_t i, ifindex;
+       struct ifnet *ifp;
+       int locked = 0;
+
+       ifindex = interface_index;
+       VERIFY(ifindex != IFSCOPE_NONE);
+
+       /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
+       if (mp->mpp_socket->so_usecount == 0) {
                return;
+       }
 
-       if (len)
-               sum = m_sum16(m, hdrlen, len);
+       mp_so = mptetoso(mpte);
 
-       dss_val = mptcp_hton64(dss_val);
-       sseq = *sseqp;
-       dss_len = *(uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t));
-       sum += in_pseudo64(dss_val, sseq, (dss_len + csum));
+       if (action != NECP_CLIENT_CBACTION_INITIAL) {
+               socket_lock(mp_so, 1);
+               locked = 1;
 
-       ADDCARRY(sum);
-       sum = ~sum & 0xffff;
-       csump = (uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t) +
-           sizeof (uint16_t));
-       DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
-           uint32_t, sum);
-       *csump = sum;
-       mptcplog3((LOG_INFO, "%s: sum = %x \n", __func__, sum));
+               /* Check again, because it might have changed while waiting */
+               if (mp->mpp_socket->so_usecount == 0) {
+                       goto out;
+               }
+       }
+
+       socket_lock_assert_owned(mp_so);
+
+       mp_tp = mpte->mpte_mptcb;
+
+       ifnet_head_lock_shared();
+       ifp = ifindex2ifnet[ifindex];
+       ifnet_head_done();
+
+       os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
+           __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
+           ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
+           mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
+           has_v4, has_v6, has_nat64, low_power);
+
+       /* No need for this on sockets that have fallen back to TCP */
+       if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
+               goto out;
+       }
+
+       /*
+        * When the interface goes into low-power mode, we don't want to
+        * establish new subflows on it, so mark it internally as non-viable.
+        */
+       if (low_power) {
+               action = NECP_CLIENT_CBACTION_NONVIABLE;
+       }
+
+       if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
+               for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+                       if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
+                               continue;
+                       }
+
+                       if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
+                               mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
+                       }
+               }
+
+               mptcp_sched_create_subflows(mpte);
+       } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
+           action == NECP_CLIENT_CBACTION_INITIAL) {
+               int found_slot = 0, slot_index = -1;
+               struct sockaddr *dst;
+
+               if (ifp == NULL) {
+                       goto out;
+               }
+
+               if (IFNET_IS_COMPANION_LINK(ifp)) {
+                       goto out;
+               }
+
+               if (IFNET_IS_EXPENSIVE(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
+                       goto out;
+               }
+
+               if (IFNET_IS_CONSTRAINED(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
+                       goto out;
+               }
+
+               if (IFNET_IS_CELLULAR(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
+                       goto out;
+               }
+
+               if (IS_INTF_CLAT46(ifp)) {
+                       has_v4 = FALSE;
+               }
+
+               /* Look for the slot in which to store or update the interface info. */
+               for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+                       /* Found a potential empty slot where we can put it */
+                       if (mpte->mpte_itfinfo[i].ifindex == 0) {
+                               found_slot = 1;
+                               slot_index = i;
+                       }
+
+                       /*
+                        * The interface is already in our array. Check if we
+                        * need to update it.
+                        */
+                       if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
+                           (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
+                           mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
+                           mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
+                               found_slot = 1;
+                               slot_index = i;
+                               break;
+                       }
+
+                       if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
+                               /*
+                                * Ok, it's already there and we don't need
+                                * to update it
+                                */
+                               goto out;
+                       }
+               }
+
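+               /*
+                * An IPv4-only destination is unreachable over a path that
+                * currently offers just IPv6 without NAT64: remember the
+                * interface if we found a slot, but don't schedule subflows
+                * on it yet.
+                */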
+               dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
+               if (dst && dst->sa_family == AF_INET &&
+                   has_v6 && !has_nat64 && !has_v4) {
+                       if (found_slot) {
+                               mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
+                               mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
+                               mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
+                               mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
+                       }
+                       goto out;
+               }
+
+               if (found_slot == 0) {
+                       int new_size = mpte->mpte_itfinfo_size * 2;
+                       struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
+
+                       if (info == NULL) {
+                               os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
+                                   __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
+                               goto out;
+                       }
+
+                       memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
+
+                       if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
+                               _FREE(mpte->mpte_itfinfo, M_TEMP);
+                       }
+
+                       /* We allocated a larger array, so the first slot past the old entries is free */
+                       slot_index = mpte->mpte_itfinfo_size;
+
+                       mpte->mpte_itfinfo = info;
+                       mpte->mpte_itfinfo_size = new_size;
+               }
+
+               VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
+               mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
+               mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
+               mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
+               mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
+
+               mptcp_sched_create_subflows(mpte);
+       }
+
+out:
+       if (locked) {
+               socket_unlock(mp_so, 1);
+       }
+}
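
How this callback gets wired up, as a hedged sketch modeled on the
SO_NECP_CLIENTUUID handling in mptcp_usrreq.c (the exact call site and
field names may differ):

    /* Register for per-session multipath NECP events using the
     * app-supplied NECP client UUID; mptcp_session_necp_cb() then
     * fires with NECP_CLIENT_CBACTION_{INITIAL,VIABLE,NONVIABLE}. */
    error = necp_client_register_multipath_cb(mp_so->last_pid,
        mpsotomppcb(mp_so)->necp_client_uuid, mpsotomppcb(mp_so));
    if (error)
        goto out;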
+
+void
+mptcp_set_restrictions(struct socket *mp_so)
+{
+       struct mptses *mpte = mpsotompte(mp_so);
+       uint32_t i;
+
+       socket_lock_assert_owned(mp_so);
+
+       ifnet_head_lock_shared();
+
+       for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+               struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
+               uint32_t ifindex = info->ifindex;
+               struct ifnet *ifp;
+
+               if (ifindex == IFSCOPE_NONE) {
+                       continue;
+               }
+
+               ifp = ifindex2ifnet[ifindex];
+               if (ifp == NULL) {
+                       continue;
+               }
+
+               if (IFNET_IS_EXPENSIVE(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
+                       info->ifindex = IFSCOPE_NONE;
+               }
+
+               if (IFNET_IS_CONSTRAINED(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
+                       info->ifindex = IFSCOPE_NONE;
+               }
+
+               if (IFNET_IS_CELLULAR(ifp) &&
+                   (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
+                       info->ifindex = IFSCOPE_NONE;
+               }
+       }
+
+       ifnet_head_done();
 }
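
A hypothetical sequence showing where this fits: once a deny-restriction
lands on the MPTCP socket, the cached interface list is re-filtered so the
scheduler stops considering those links.

    mp_so->so_restrictions |= SO_RESTRICT_DENY_CELLULAR;
    mptcp_set_restrictions(mp_so);
    /* re-evaluate subflow creation on the remaining interfaces */
    mptcp_sched_create_subflows(mpte);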