]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/netinet/mptcp_subr.c
xnu-4903.241.1.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
index a2ecbf4c0045e531a4872c22ee461f15a22dbf05..c7b154796cb0db5084d5177850b9ca9c5b62b93e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#include <sys/param.h>
-#include <sys/proc.h>
-#include <sys/systm.h>
+#include <kern/locks.h>
+#include <kern/policy_internal.h>
+#include <kern/zalloc.h>
+
+#include <mach/sdt.h>
+
+#include <sys/domain.h>
+#include <sys/kdebug.h>
+#include <sys/kern_control.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/mcache.h>
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
 #include <sys/resourcevar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
-#include <sys/syslog.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
 #include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
 
-#include <kern/zalloc.h>
-#include <kern/locks.h>
-
-#include <mach/thread_act.h>
-#include <mach/sdt.h>
-
+#include <net/content_filter.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <netinet/in.h>
@@ -57,6 +60,7 @@
 #include <netinet/tcp_var.h>
 #include <netinet/mptcp_var.h>
 #include <netinet/mptcp.h>
+#include <netinet/mptcp_opt.h>
 #include <netinet/mptcp_seq.h>
 #include <netinet/mptcp_timer.h>
 #include <libkern/crypto/sha1.h>
@@ -66,8 +70,6 @@
 #endif /* INET6 */
 #include <dev/random/randomdev.h>
 
-extern char *proc_best_name(proc_t);
-
 /*
  * Notes on MPTCP implementation.
  *
@@ -86,67 +88,45 @@ extern char *proc_best_name(proc_t);
  * PCB (mppcb) as well as the MPTCP Session (mptses).
  *
  * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
- * in particular, the list of subflows as well as the MPTCP thread.
  *
  * A functioning MPTCP Session consists of one or more subflow sockets.  Each
  * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
  * represented by the mptsub structure.  Because each subflow requires access
  * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
- * subflow.  This gets decremented prior to the subflow's destruction.  The
- * subflow lock (mpts_lock) is used to protect accesses to the subflow.
- *
- * To handle events (read, write, control) from the subflows, an MPTCP thread
- * is created; currently, there is one thread per MPTCP Session.  In order to
- * prevent the MPTCP socket from being destroyed while being accessed by the
- * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
- * which will be decremented prior to the thread's termination.  The thread
- * lock (mpte_thread_lock) is used to synchronize its signalling.
+ * subflow.  This gets decremented prior to the subflow's destruction.
  *
- * Lock ordering is defined as follows:
+ * To handle events (read, write, control) from the subflows, we do direct
+ * upcalls into the specific function.
  *
- *     mtcbinfo (mppi_lock)
- *             mp_so (mpp_lock)
- *                     mpts (mpts_lock)
- *                             so (inpcb_mtx)
- *                                     mptcb (mpt_lock)
- *
- * It is not a requirement that all of the above locks need to be acquired
- * in succession, but the correct lock ordering must be followed when there
- * are more than one locks that need to be held.  The MPTCP thread lock is
- * is not constrained by this arrangement, because none of the other locks
- * is ever acquired while holding mpte_thread_lock; therefore it may be called
- * at any moment to signal the thread.
+ * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
+ * lock. Incoming data on a subflow also ends up taking this single lock. To
+ * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
+ * of the MPTCP-socket.
  *
  * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
  * work is done by the MPTCP garbage collector which is invoked on demand by
  * the PF_MULTIPATH garbage collector.  This process will take place once all
- * of the subflows have been destroyed, and the MPTCP thread be instructed to
- * self-terminate.
+ * of the subflows have been destroyed.
  */
 
-static void mptcp_sesdestroy(struct mptses *);
-static void mptcp_thread_signal_locked(struct mptses *);
-static void mptcp_thread_terminate_signal(struct mptses *);
-static void mptcp_thread_dowork(struct mptses *);
-static void mptcp_thread_func(void *, wait_result_t);
-static void mptcp_thread_destroy(struct mptses *);
-static void mptcp_key_pool_init(void);
 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
 
 static uint32_t mptcp_gc(struct mppcbinfo *);
-static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
-static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
     struct uio *, struct mbuf **, struct mbuf **, int *);
+static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
+    struct uio *, struct mbuf *, struct mbuf *, int);
 static void mptcp_subflow_rupcall(struct socket *, void *, int);
 static void mptcp_subflow_input(struct mptses *, struct mptsub *);
 static void mptcp_subflow_wupcall(struct socket *, void *, int);
-static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
-static void mptcp_update_last_owner(struct mptsub *, struct socket *);
-static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
-static void mptcp_get_rtt_measurement(struct mptsub *, struct mptses *);
-static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *, int *);
+static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
+static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
+static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
+
+static void mptcp_subflow_abort(struct mptsub *, int);
+
+static void mptcp_send_dfin(struct socket *so);
 
 /*
  * Possible return values for subflow event handlers.  Note that success
@@ -163,28 +143,22 @@ typedef enum {
 } ev_ret_t;
 
 static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *, uint64_t *);
-static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
+static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
+static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
 
 static const char *mptcp_evret2str(ev_ret_t);
 
-static mptcp_key_t *mptcp_reserve_key(void);
-static int mptcp_do_sha1(mptcp_key_t *, char *, int);
-static void mptcp_init_local_parms(struct mptcb *);
+static void mptcp_do_sha1(mptcp_key_t *, char *);
+static void mptcp_init_local_parms(struct mptses *);
 
 static unsigned int mptsub_zone_size;          /* size of mptsub */
 static struct zone *mptsub_zone;               /* zone for mptsub */
@@ -197,8 +171,6 @@ static struct zone *mpt_subauth_zone;               /* zone of subf auth entry */
 
 struct mppcbinfo mtcbinfo;
 
-static struct mptcp_keys_pool_head mptcp_keys_pool;
-
 #define        MPTCP_SUBFLOW_WRITELEN  (8 * 1024)      /* bytes to write each time */
 #define        MPTCP_SUBFLOW_READLEN   (8 * 1024)      /* bytes to read each time */
 
@@ -206,39 +178,21 @@ SYSCTL_DECL(_net_inet);
 
 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
 
-uint32_t mptcp_dbg_area = 0;           /* more noise if greater than 1 */
+uint32_t mptcp_dbg_area = 31;          /* more noise if greater than 1 */
 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
        &mptcp_dbg_area, 0, "MPTCP debug area");
 
-uint32_t mptcp_dbg_level = 0;
+uint32_t mptcp_dbg_level = 1;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
        &mptcp_dbg_level, 0, "MPTCP debug level");
 
-
 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
        &mtcbinfo.mppi_count, 0, "Number of active PCBs");
 
-/*
- * Since there is one kernel thread per mptcp socket, imposing an artificial
- * limit on number of allowed mptcp sockets.
- */
-uint32_t mptcp_socket_limit = MPPCB_LIMIT;
-SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
-       &mptcp_socket_limit, 0, "MPTCP socket limit");
-
-/*
- * SYSCTL to turn on delayed cellular subflow start.
- */
-uint32_t mptcp_delayed_subf_start = 0;
-SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
-       &mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
 
-/*
- * sysctl to use network status hints from symptomsd
- */
-uint32_t mptcp_use_symptomsd = 1;
-SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, usesymptoms, CTLFLAG_RW|CTLFLAG_LOCKED,
-       &mptcp_use_symptomsd, 0, "MPTCP Use SymptomsD");
+static int mptcp_alternate_port = 0;
+SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
+          &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
 
 static struct protosw mptcp_subflow_protosw;
 static struct pr_usrreqs mptcp_subflow_usrreqs;
@@ -247,26 +201,26 @@ static struct ip6protosw mptcp_subflow_protosw6;
 static struct pr_usrreqs mptcp_subflow_usrreqs6;
 #endif /* INET6 */
 
+static uint8_t mptcp_create_subflows_scheduled;
+
 typedef struct mptcp_subflow_event_entry {
        uint64_t        sofilt_hint_mask;
        ev_ret_t        (*sofilt_hint_ev_hdlr)(
                            struct mptses *mpte,
                            struct mptsub *mpts,
-                           uint64_t *p_mpsofilt_hint);
+                           uint64_t *p_mpsofilt_hint,
+                           uint64_t event);
 } mptsub_ev_entry_t;
 
+static uint8_t mptcp_cellicon_is_set;
+static uint32_t mptcp_last_cellicon_set;
+#define        MPTCP_CELLICON_TOGGLE_RATE      (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
+
 /*
  * XXX The order of the event handlers below is really
- * really important.
- * SO_FILT_HINT_DELETEOK event has to be handled first,
- * else we may end up missing on this event.
- * Please read radar://24043716 for more details.
+ * really important. Think twice before changing it.
  */
 static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
-       {
-               .sofilt_hint_mask = SO_FILT_HINT_DELETEOK,
-               .sofilt_hint_ev_hdlr = mptcp_deleteok_ev,
-       },
        {
                .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
                .sofilt_hint_ev_hdlr =  mptcp_subflow_mpcantrcvmore_ev,
@@ -277,7 +231,7 @@ static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
        },
        {
                .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
-               .sofilt_hint_ev_hdlr = mptcp_subflow_connreset_ev,
+               .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
        },
        {
                .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
@@ -285,14 +239,11 @@ static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
        },
        {
                .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
-               .sofilt_hint_ev_hdlr = mptcp_subflow_cantrcvmore_ev,
-       },
-       {       .sofilt_hint_mask = SO_FILT_HINT_CANTSENDMORE,
-               .sofilt_hint_ev_hdlr = mptcp_subflow_cantsendmore_ev,
+               .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
        },
        {
                .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
-               .sofilt_hint_ev_hdlr = mptcp_subflow_timeout_ev,
+               .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
        },
        {
                .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
@@ -302,14 +253,6 @@ static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
                .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
                .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
        },
-       {
-               .sofilt_hint_mask = SO_FILT_HINT_SUSPEND,
-               .sofilt_hint_ev_hdlr = mptcp_subflow_suspend_ev,
-       },
-       {
-               .sofilt_hint_mask = SO_FILT_HINT_RESUME,
-               .sofilt_hint_ev_hdlr = mptcp_subflow_resume_ev,
-       },
        {
                .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
                .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
@@ -323,11 +266,17 @@ static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
                .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
        },
        {
-               .sofilt_hint_mask = SO_FILT_HINT_MPFASTJ,
-               .sofilt_hint_ev_hdlr = mptcp_fastjoin_ev,
-       }
+               .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
+               .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
+       },
+       {
+               .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
+               .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
+       },
 };
 
+os_log_t mptcp_log_handle;
+
 /*
  * Protocol pr_init callback.
  */
@@ -361,6 +310,7 @@ mptcp_init(struct protosw *pp, struct domain *dp)
        mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
        mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
        mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
+       mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
        mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
        /*
         * Socket filters shouldn't attach/detach to/from this protosw
@@ -383,6 +333,7 @@ mptcp_init(struct protosw *pp, struct domain *dp)
        mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
        mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
        mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
+       mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
        mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
        /*
         * Socket filters shouldn't attach/detach to/from this protosw
@@ -415,7 +366,6 @@ mptcp_init(struct protosw *pp, struct domain *dp)
 
        mtcbinfo.mppi_gc = mptcp_gc;
        mtcbinfo.mppi_timer = mptcp_timer;
-       mtcbinfo.mppi_pcbe_create = mptcp_sescreate;
 
        /* attach to MP domain for garbage collection to take place */
        mp_pcbinfo_attach(&mtcbinfo);
@@ -448,20 +398,84 @@ mptcp_init(struct protosw *pp, struct domain *dp)
        zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
        zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
 
-       /* Set up a list of unique keys */
-       mptcp_key_pool_init();
+       mptcp_last_cellicon_set = tcp_now;
+
+       mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
+}
+
+int    /* slot in stats[] for the subflow's last output interface, or -1 */
+mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
+{
+       const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+       /* index remembers the first IFSCOPE_NONE slot seen while scanning */
+       int i, index = -1;
+
+       if (ifp == NULL) {
+               mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               return (-1);
+       }
+       /* An entry already carrying this interface's index wins over any free slot. */
+       for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
+               if (stats[i].ifindex == IFSCOPE_NONE) {
+                       if (index < 0)
+                               index = i;
+                       continue;
+               }
+
+               if (stats[i].ifindex == ifp->if_index) {
+                       index = i;
+                       return (index);
+               }
+       }
+       /* No existing entry: claim the remembered IFSCOPE_NONE slot, if there was one. */
+       if (index != -1) {
+               stats[index].ifindex = ifp->if_index;
+               if (stats[index].is_expensive == 0)     /* never clears an already-set flag */
+                       stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
+       }
+
+       return (index);
+}
+
+void   /* count one subflow switch: globally, per session and per interface */
+mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
+{
+       int index;
+
+       tcpstat.tcps_mp_switches++;
+       mpte->mpte_subflow_switches++;
+
+       index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
+       /* -1 means no slot could be resolved; the per-interface counter is then skipped */
+       if (index != -1)
+               mpte->mpte_itfstats[index].switches++;
+}
+
+/*
+ * Flushes all recorded socket options from an MP socket.
+ */
+static void
+mptcp_flush_sopts(struct mptses *mpte)
+{
+       struct mptopt *mpo, *tmpo;
+       /* _SAFE iteration: each entry is unlinked and freed inside the loop body */
+       TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
+               mptcp_sopt_remove(mpte, mpo);
+               mptcp_sopt_free(mpo);
+       }
+       VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts)); /* the list must be empty afterwards */
 }
 
 /*
  * Create an MPTCP session, called as a result of opening a MPTCP socket.
  */
-void *
-mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
+int
+mptcp_sescreate(struct mppcb *mpp)
 {
        struct mppcbinfo *mppi;
        struct mptses *mpte;
        struct mptcb *mp_tp;
-       int error = 0;
 
        VERIFY(mpp != NULL);
        mppi = mpp->mpp_pcbinfo;
@@ -482,178 +496,710 @@ mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
        mpte->mpte_associd = SAE_ASSOCID_ANY;
        mpte->mpte_connid_last = SAE_CONNID_ANY;
 
-       lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
-           mppi->mppi_lock_attr);
+       mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
+       mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
 
-       /*
-        * XXX: adi@apple.com
-        *
-        * This can be rather expensive if we have lots of MPTCP sockets,
-        * but we need a kernel thread for this model to work.  Perhaps we
-        * could amortize the costs by having one worker thread per a group
-        * of MPTCP sockets.
-        */
-       if (kernel_thread_start(mptcp_thread_func, mpte,
-           &mpte->mpte_thread) != KERN_SUCCESS) {
-               error = ENOBUFS;
-               goto out;
-       }
-       mp_so->so_usecount++;           /* for thread */
+       if (mptcp_alternate_port)
+               mpte->mpte_alternate_port = htons(mptcp_alternate_port);
 
        /* MPTCP Protocol Control Block */
        bzero(mp_tp, sizeof (*mp_tp));
-       lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
-           mppi->mppi_lock_attr);
        mp_tp->mpt_mpte = mpte;
        mp_tp->mpt_state = MPTCPS_CLOSED;
-out:
-       if (error != 0)
-               lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
-       DTRACE_MPTCP5(session__create, struct socket *, mp_so,
-           struct sockbuf *, &mp_so->so_rcv,
-           struct sockbuf *, &mp_so->so_snd,
-           struct mppcb *, mpp, int, error);
 
-       return ((error != 0) ? NULL : mpte);
+       DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
+
+       return (0);
+}
+
+static void    /* sums the per-interface byte counters into *cellbytes and *allbytes */
+mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
+                    uint64_t *cellbytes, uint64_t *allbytes)
+{
+       int64_t mycellbytes = 0;        /* signed: the subtraction below may go negative */
+       uint64_t myallbytes = 0;
+       int i;
+       /* "cell" bytes are tx+rx of every slot flagged is_expensive; "all" covers every slot */
+       for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
+               if (mpte->mpte_itfstats[i].is_expensive) {
+                       mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
+                       mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
+               }
+
+               myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
+               myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
+       }
+       /* initial_cell: discount both initial counters (tx AND rx; tx used to be subtracted twice) */
+       if (initial_cell) {
+               mycellbytes -= mpte->mpte_init_txbytes;
+               mycellbytes -= mpte->mpte_init_rxbytes;
+       }
+       /* A negative total is inconsistent: log it and report zero for both outputs. */
+       if (mycellbytes < 0) {
+               mptcplog((LOG_ERR, "%s cellbytes is %lld\n", __func__,
+                        (long long)mycellbytes), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               *cellbytes = 0;
+               *allbytes = 0;
+       } else {
+               *cellbytes = mycellbytes;
+               *allbytes = myallbytes;
+       }
+}
+
+static void    /* folds one session's outcome into the global tcpstat counters */
+mptcpstats_session_wrapup(struct mptses *mpte)
+{
+       boolean_t cell = mpte->mpte_initial_cell;
+       /* One attempt/success/bytes counter family per service type; fp_ = MPTE_FIRSTPARTY */
+       switch (mpte->mpte_svctype) {
+       case MPTCP_SVCTYPE_HANDOVER:
+               if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
+                       tcpstat.tcps_mptcp_fp_handover_attempt++;
+
+                       if (cell && mpte->mpte_handshake_success) {
+                               tcpstat.tcps_mptcp_fp_handover_success_cell++;
+                               /* same counter as the non-first-party branch below */
+                               if (mpte->mpte_used_wifi)
+                                       tcpstat.tcps_mptcp_handover_wifi_from_cell++;
+                       } else if (mpte->mpte_handshake_success) {
+                               tcpstat.tcps_mptcp_fp_handover_success_wifi++;
+
+                               if (mpte->mpte_used_cell)
+                                       tcpstat.tcps_mptcp_handover_cell_from_wifi++;
+                       }
+               } else {
+                       tcpstat.tcps_mptcp_handover_attempt++;
+
+                       if (cell && mpte->mpte_handshake_success) {
+                               tcpstat.tcps_mptcp_handover_success_cell++;
+
+                               if (mpte->mpte_used_wifi)
+                                       tcpstat.tcps_mptcp_handover_wifi_from_cell++;
+                       } else if (mpte->mpte_handshake_success) {
+                               tcpstat.tcps_mptcp_handover_success_wifi++;
+
+                               if (mpte->mpte_used_cell)
+                                       tcpstat.tcps_mptcp_handover_cell_from_wifi++;
+                       }
+               }
+               /* Byte totals are only accumulated for sessions whose handshake succeeded. */
+               if (mpte->mpte_handshake_success) {
+                       uint64_t cellbytes;
+                       uint64_t allbytes;
+
+                       mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
+
+                       tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
+                       tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
+               }
+               break;
+       case MPTCP_SVCTYPE_INTERACTIVE:
+               if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
+                       tcpstat.tcps_mptcp_fp_interactive_attempt++;
+
+                       if (mpte->mpte_handshake_success) {
+                               tcpstat.tcps_mptcp_fp_interactive_success++;
+
+                               if (!cell && mpte->mpte_used_cell)
+                                       tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
+                       }
+               } else {
+                       tcpstat.tcps_mptcp_interactive_attempt++;
+
+                       if (mpte->mpte_handshake_success) {
+                               tcpstat.tcps_mptcp_interactive_success++;
+
+                               if (!cell && mpte->mpte_used_cell)
+                                       tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
+                       }
+               }
+
+               if (mpte->mpte_handshake_success) {
+                       uint64_t cellbytes;
+                       uint64_t allbytes;
+
+                       mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
+
+                       tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
+                       tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
+               }
+               break;
+       case MPTCP_SVCTYPE_AGGREGATE:
+               if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
+                       tcpstat.tcps_mptcp_fp_aggregate_attempt++;
+
+                       if (mpte->mpte_handshake_success)
+                               tcpstat.tcps_mptcp_fp_aggregate_success++;
+               } else {
+                       tcpstat.tcps_mptcp_aggregate_attempt++;
+
+                       if (mpte->mpte_handshake_success) {
+                               tcpstat.tcps_mptcp_aggregate_success++;
+                       }
+               }
+
+               if (mpte->mpte_handshake_success) {
+                       uint64_t cellbytes;
+                       uint64_t allbytes;
+
+                       mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
+
+                       tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
+                       tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
+               }
+               break;
+       }
+       /* The two counters below are independent of the service type (no default case above). */
+       if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
+               tcpstat.tcps_mptcp_back_to_wifi++;
+
+       if (mpte->mpte_triggered_cell)
+               tcpstat.tcps_mptcp_triggered_cell++;
 }
 
 /*
  * Destroy an MPTCP session.
  */
 static void
-mptcp_sesdestroy(struct mptses *mpte)
+mptcp_session_destroy(struct mptses *mpte)
 {
        struct mptcb *mp_tp;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
 
        mp_tp = mpte->mpte_mptcb;
        VERIFY(mp_tp != NULL);
 
+       mptcpstats_session_wrapup(mpte);        /* fold this session's counters into tcpstat */
+
+       mptcp_unset_cellicon();
+
        /*
         * MPTCP Multipath PCB Extension section
         */
        mptcp_flush_sopts(mpte);
        VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
 
-       lck_mtx_destroy(&mpte->mpte_thread_lock,
-           mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
+       if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)        /* presumably grown past the embedded array — TODO confirm */
+               _FREE(mpte->mpte_itfinfo, M_TEMP);
+
+       mpte->mpte_itfinfo = NULL;
+
+       m_freem_list(mpte->mpte_reinjectq);     /* release whatever is still on the reinject queue */
 
        /*
         * MPTCP Protocol Control Block section
         */
-       lck_mtx_destroy(&mp_tp->mpt_lock,
-           mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
-
        DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
            struct mptcb *, mp_tp);
 }
 
-/*
- * Allocate an MPTCP socket option structure.
- */
-struct mptopt *
-mptcp_sopt_alloc(int how)
+static boolean_t       /* true iff state is in [ESTABLISHED, FIN_WAIT_1) and MPTCPF_FALLBACK_TO_TCP is clear */
+mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
 {
-       struct mptopt *mpo;
+       return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
+               mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
+               !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
+}
 
-       mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
-           zalloc_noblock(mptopt_zone);
-       if (mpo != NULL) {
-               bzero(mpo, mptopt_zone_size);
+static int     /* embeds *addrv4 into the NAT64 prefix held in *addr (in place); 0 on success, -1 if rejected */
+mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
+{
+       static const struct in6_addr well_known_prefix = {
+               .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
+                                        0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                        0x00, 0x00, 0x00, 0x00},
+       };
+       char buf[MAX_IPv6_STR_LEN];
+       char *ptrv4 = (char *)addrv4;
+       char *ptr = (char *)addr;
+       /* Refuse IPv4 addresses that fall into any of the special-purpose ranges below. */
+       if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
+           IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
+           IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
+           IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
+           IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
+           IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
+           INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
+               return (-1);
        }
 
-       return (mpo);
-}
+       /* Check for the well-known prefix */
+       if (len == NAT64_PREFIX_LEN_96 &&
+           IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
+               if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
+                   IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) // 100.64.0.0/10 Shared Address Space
+                       return (-1);
+       }
 
-/*
- * Free an MPTCP socket option structure.
- */
-void
-mptcp_sopt_free(struct mptopt *mpo)
-{
-       VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
+       switch (len) {  /* copy the 4 IPv4 bytes at the offset for this prefix length; byte 8 is never written (cf. RFC 6052) */
+               case NAT64_PREFIX_LEN_96:
+                       memcpy(ptr + 12, ptrv4, 4);
+                       break;
+               case NAT64_PREFIX_LEN_64:
+                       memcpy(ptr + 9, ptrv4, 4);
+                       break;
+               case NAT64_PREFIX_LEN_56:
+                       memcpy(ptr + 7, ptrv4, 1);
+                       memcpy(ptr + 9, ptrv4 + 1, 3);
+                       break;
+               case NAT64_PREFIX_LEN_48:
+                       memcpy(ptr + 6, ptrv4, 2);
+                       memcpy(ptr + 9, ptrv4 + 2, 2);
+                       break;
+               case NAT64_PREFIX_LEN_40:
+                       memcpy(ptr + 5, ptrv4, 3);
+                       memcpy(ptr + 9, ptrv4 + 3, 1);
+                       break;
+               case NAT64_PREFIX_LEN_32:
+                       memcpy(ptr + 4, ptrv4, 4);
+                       break;
+               default:        /* any other prefix length panics rather than returning an error */
+                       panic("NAT64-prefix len is wrong: %u\n", len);
+       }
 
-       zfree(mptopt_zone, mpo);
-}
+       os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
+                   __func__, len,
+                   inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));
 
-/*
- * Add a socket option to the MPTCP socket option list.
- */
-void
-mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
-{
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
-       mpo->mpo_flags |= MPOF_ATTACHED;
-       TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
+       return (0);
 }
 
-/*
- * Remove a socket option from the MPTCP socket option list.
- */
-void
-mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
+static void
+mptcp_trigger_cell_bringup(struct mptses *mpte)
 {
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
-       mpo->mpo_flags &= ~MPOF_ATTACHED;
-       TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
-}
+       struct socket *mp_so = mptetoso(mpte);
 
-/*
- * Search for an existing <sopt_level,sopt_name> socket option.
- */
-struct mptopt *
-mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
-{
-       struct mptopt *mpo;
+       if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
+               uuid_string_t uuidstr;
+               int err;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
+               mpte_unlock(mpte);
+               err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
+                                                         TRUE);
+               mpte_lock(mpte);
 
-       TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
-               if (mpo->mpo_level == sopt->sopt_level &&
-                   mpo->mpo_name == sopt->sopt_name)
-                       break;
-       }
-       VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
+               if (err == 0)
+                       mpte->mpte_triggered_cell = 1;
 
-       return (mpo);
+               uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
+               os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n",
+                           __func__, uuidstr, err);
+       } else {
+               os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__);
+       }
 }
 
-/*
- * Flushes all recorded socket options from an MP socket.
- */
+
 void
-mptcp_flush_sopts(struct mptses *mpte)
+mptcp_check_subflows_and_add(struct mptses *mpte)
 {
-       struct mptopt *mpo, *tmpo;
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
+       boolean_t cellular_viable = FALSE;
+       boolean_t want_cellular = TRUE;
+       uint32_t i;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
+       if (!mptcp_ok_to_create_subflows(mp_tp))
+               return;
 
-       TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
-               mptcp_sopt_remove(mpte, mpo);
-               mptcp_sopt_free(mpo);
-       }
-       VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
-}
+       for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+               struct mpt_itf_info *info;
+               struct mptsub *mpts;
+               struct ifnet *ifp;
+               uint32_t ifindex;
+               int found = 0;
 
-/*
- * Allocate a MPTCP subflow structure.
- */
-struct mptsub *
-mptcp_subflow_alloc(int how)
-{
-       struct mptsub *mpts;
+               info = &mpte->mpte_itfinfo[i];
 
-       mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
-           zalloc_noblock(mptsub_zone);
-       if (mpts != NULL) {
-               bzero(mpts, mptsub_zone_size);
-               lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
-                   mtcbinfo.mppi_lock_attr);
-       }
+               if (info->no_mptcp_support)
+                       continue;
 
+               ifindex = info->ifindex;
+               if (ifindex == IFSCOPE_NONE)
+                       continue;
+
+               ifnet_head_lock_shared();
+               ifp = ifindex2ifnet[ifindex];
+               ifnet_head_done();
+
+               if (ifp == NULL)
+                       continue;
+
+               if (IFNET_IS_CELLULAR(ifp))
+                       cellular_viable = TRUE;
+
+               TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+                       const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+
+                       if (subifp == NULL)
+                               continue;
+
+                       /*
+                        * In Handover mode, only create cell subflow if
+                        * 1. Wi-Fi Assist is active
+                        * 2. Symptoms marked WiFi as weak
+                        * 3. We are experiencing RTOs or we are not sending data.
+                        *
+                        * This covers the scenario, where:
+                        * 1. We send and get retransmission timeouts (thus,
+                        *    we confirmed that WiFi is indeed bad).
+                        * 2. We are not sending and the server tries to send.
+                        *    Establishing a cell-subflow gives the server a
+                        *    chance to send us some data over cell if WiFi
+                        *    is dead. We establish the subflow with the
+                        *    backup-bit set, so the server is not allowed to
+                        *    send on this subflow as long as WiFi is providing
+                        *    good performance.
+                        */
+                       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
+                           !IFNET_IS_CELLULAR(subifp) &&
+                           !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
+                           (mptcp_is_wifi_unusable(mpte) == 0 ||
+                            (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 &&
+                             ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) {
+                               os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n",
+                                            __func__, mptcp_is_wifi_unusable(mpte),
+                                            sototcpcb(mpts->mpts_socket)->t_rxtshift,
+                                            !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
+                                            mptetoso(mpte)->so_snd.sb_cc,
+                                            ifindex, subifp->if_index);
+                               found = 1;
+
+                               /* We found a proper subflow on WiFi - no need for cell */
+                               want_cellular = FALSE;
+                               break;
+                       } else {
+                               os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n",
+                                            __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
+                                            mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift,
+                                            !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc);
+
+                       }
+
+                       if (subifp->if_index == ifindex &&
+                           !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
+                           sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
+                               /*
+                                * We found a subflow on this interface.
+                                * No need to create a new one.
+                                */
+                               found = 1;
+                               break;
+                       }
+               }
+
+               if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
+                   !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
+                   mptcp_developer_mode == 0) {
+                       mptcp_ask_symptoms(mpte);
+                       return;
+               }
+
+               if (!found) {
+                       struct sockaddr *dst = &mpte->mpte_dst;
+                       struct sockaddr_in6 nat64pre;
+
+                       if (mpte->mpte_dst.sa_family == AF_INET &&
+                           !info->has_v4_conn && info->has_nat64_conn) {
+                               struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
+                               int error, j;
+
+                               bzero(&nat64pre, sizeof(struct sockaddr_in6));
+
+                               error = ifnet_get_nat64prefix(ifp, nat64prefixes);
+                               if (error) {
+                                       os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n",
+                                                    __func__, ifp->if_name, error);
+                                       continue;
+                               }
+
+                               for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
+                                       if (nat64prefixes[j].prefix_len != 0)
+                                               break;
+                               }
+
+                               VERIFY(j < NAT64_MAX_NUM_PREFIXES);
+
+                               error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
+                                                              nat64prefixes[j].prefix_len,
+                                                              &mpte->__mpte_dst_v4.sin_addr);
+                               if (error != 0) {
+                                       os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n",
+                                                   __func__);
+                                       continue;
+                               }
+
+                               memcpy(&nat64pre.sin6_addr,
+                                      &nat64prefixes[j].ipv6_prefix,
+                                      sizeof(nat64pre.sin6_addr));
+                               nat64pre.sin6_len = sizeof(struct sockaddr_in6);
+                               nat64pre.sin6_family = AF_INET6;
+                               nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
+                               nat64pre.sin6_flowinfo = 0;
+                               nat64pre.sin6_scope_id = 0;
+
+                               dst = (struct sockaddr *)&nat64pre;
+                       }
+
+                       /* Initial subflow started on a NAT64'd address? */
+                       if (mpte->mpte_dst.sa_family == AF_INET6 &&
+                           mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
+                               dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
+                       }
+
+                       if (dst->sa_family == AF_INET && !info->has_v4_conn)
+                               continue;
+                       if (dst->sa_family == AF_INET6 && !info->has_v6_conn)
+                               continue;
+
+                       mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
+               }
+       }
+
+       if (!cellular_viable && want_cellular) {
+               /* Trigger Cell Bringup */
+               mptcp_trigger_cell_bringup(mpte);
+       }
+}
+
+/*
+ * Handover-mode cleanup. If some connected, established subflow on a
+ * non-cellular interface is considered working, post a must-reset event
+ * on every subflow whose last output interface is cellular. Sessions with
+ * any other service type are left untouched.
+ */
+static void
+mptcp_check_subflows_and_remove(struct mptses *mpte)
+{
+       int wifi_unusable = mptcp_is_wifi_unusable(mpte);
+       int have_good_noncell = 0, reset_any = 0;
+       struct mptsub *cur, *next;
+
+       if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
+               return;
+
+       /*
+        * Pass 1: look for a non-cellular subflow that is connected and
+        * established, and that either shows no retransmission backoff or
+        * runs while WiFi is not flagged as unusable.
+        */
+       TAILQ_FOREACH(cur, &mpte->mpte_subflows, mpts_entry) {
+               struct socket *so = cur->mpts_socket;
+               const struct ifnet *ifp = sotoinpcb(so)->inp_last_outifp;
+               struct tcpcb *tp;
+
+               if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
+                       continue;
+
+               tp = sototcpcb(so);
+
+               if (!(cur->mpts_flags & MPTSF_CONNECTED) ||
+                   tp->t_state != TCPS_ESTABLISHED)
+                       continue;
+
+               if (tp->t_rxtshift == 0 || !wifi_unusable)
+                       have_good_noncell = 1;
+       }
+
+       /* Nothing usable outside of cellular: keep the cellular subflows. */
+       if (!have_good_noncell)
+               return;
+
+       /* Pass 2: only subflows on a cellular interface are touched. */
+       TAILQ_FOREACH_SAFE(cur, &mpte->mpte_subflows, mpts_entry, next) {
+               const struct ifnet *ifp = sotoinpcb(cur->mpts_socket)->inp_last_outifp;
+
+               if (ifp != NULL && IFNET_IS_CELLULAR(ifp)) {
+                       soevent(cur->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
+                       reset_any = 1;
+               }
+       }
+
+       if (reset_any)
+               mptcp_unset_cellicon();
+}
+
+/*
+ * Walk the subflow list and, for each subflow flagged MPTSF_CLOSE_REQD,
+ * clear the flag and post a SO_FILT_HINT_NOSRCADDR event on its socket.
+ */
+static void
+mptcp_remove_subflows(struct mptses *mpte)
+{
+       struct mptsub *cur, *next;
+
+       TAILQ_FOREACH_SAFE(cur, &mpte->mpte_subflows, mpts_entry, next) {
+               if (!(cur->mpts_flags & MPTSF_CLOSE_REQD))
+                       continue;
+
+               cur->mpts_flags &= ~MPTSF_CLOSE_REQD;
+               soevent(cur->mpts_socket,
+                       SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
+       }
+}
+
+/*
+ * Deferred worker armed by the timeout() call in
+ * mptcp_sched_create_subflows(). Visits every MPTCP PCB flagged
+ * MPP_CREATE_SUBFLOWS, adds and removes subflows for it, and drops the
+ * socket use count that was taken when the PCB was flagged.
+ *
+ * arg is unused.
+ */
+static void
+mptcp_create_subflows(__unused void *arg)
+{
+       struct mppcb *mpp;
+
+       /*
+        * Start with clearing, because we might be processing connections
+        * while a new event comes in.
+        */
+       if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
+               mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+
+       /* Iterate over all MPTCP connections */
+
+       /* The global PCB list lock is held across the whole walk. */
+       lck_mtx_lock(&mtcbinfo.mppi_lock);
+
+       TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
+               struct mptses *mpte;
+               struct socket *mp_so;
+
+               /* Flag is tested before taking the per-PCB lock. */
+               if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
+                       continue;
+
+               mpp_lock(mpp);
+
+               mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
+
+               mpte = mpp->mpp_pcbe;
+               mp_so = mpp->mpp_socket;
+
+               /* The scheduler's extra use count must still be there. */
+               VERIFY(mp_so->so_usecount > 0);
+
+               mptcp_check_subflows_and_add(mpte);
+               mptcp_remove_subflows(mpte);
+
+               mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
+               mpp_unlock(mpp);
+       }
+
+       lck_mtx_unlock(&mtcbinfo.mppi_lock);
+}
+
+/*
+ * Subflow creation is deferred rather than done inline because the caller
+ * is an NECP event, which is posted with NECP locks held. Creating a
+ * subflow leads back into NECP (e.g., to add the necp_cb, and also from
+ * tcp_connect), so doing it here would deadlock on the NECP lock that is
+ * already held.
+ *
+ * Deferring also gives NECP the chance to make progress without waiting
+ * for MPTCP to finish its subflow creation.
+ */
+void
+mptcp_sched_create_subflows(struct mptses *mpte)
+{
+       struct mptcb *tcb = mpte->mpte_mptcb;
+       struct mppcb *pcb = mpte->mpte_mppcb;
+       struct socket *sock = pcb->mpp_socket;
+
+       if (!mptcp_ok_to_create_subflows(tcb)) {
+               mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
+                         __func__, tcb->mpt_state, tcb->mpt_flags),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               return;
+       }
+
+       /*
+        * Flag the PCB only once. The extra use count prevents the socket
+        * from being free'd in-between; mptcp_create_subflows drops it.
+        */
+       if ((pcb->mpp_flags & MPP_CREATE_SUBFLOWS) == 0) {
+               pcb->mpp_flags |= MPP_CREATE_SUBFLOWS;
+               sock->so_usecount++;
+       }
+
+       /*
+        * Arm the timer only when OSTestAndSet returns false (presumably:
+        * the scheduled bit was not set yet, so no run is pending).
+        */
+       if (!OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
+               /* Do the call in 100ms to allow NECP to schedule it on all sockets */
+               timeout(mptcp_create_subflows, NULL, hz/10);
+       }
+}
+
+/*
+ * Allocate a zero-filled MPTCP socket option structure from mptopt_zone.
+ *
+ * A `how` of M_WAITOK uses zalloc(); any other value uses zalloc_noblock().
+ * If the allocator returns NULL, NULL is handed back to the caller.
+ */
+struct mptopt *
+mptcp_sopt_alloc(int how)
+{
+       struct mptopt *mpo;
+
+       if (how == M_WAITOK)
+               mpo = zalloc(mptopt_zone);
+       else
+               mpo = zalloc_noblock(mptopt_zone);
+
+       if (mpo == NULL)
+               return (NULL);
+
+       bzero(mpo, mptopt_zone_size);
+       return (mpo);
+}
+
+/*
+ * Free an MPTCP socket option structure.
+ *
+ * The option must not be on a session's option list: MPOF_ATTACHED has to
+ * be clear (see mptcp_sopt_remove), otherwise the VERIFY fires.
+ */
+void
+mptcp_sopt_free(struct mptopt *mpo)
+{
+       VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
+
+       zfree(mptopt_zone, mpo);
+}
+
+/*
+ * Add a socket option to the MPTCP socket option list.
+ *
+ * Marks the option MPOF_ATTACHED and appends it to the tail of
+ * mpte->mpte_sopts. The MPTCP session lock must be held.
+ */
+void
+mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
+{
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       mpo->mpo_flags |= MPOF_ATTACHED;
+       TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
+}
+
+/*
+ * Remove a socket option from the MPTCP socket option list.
+ *
+ * The option must currently be attached (MPOF_ATTACHED set); the flag is
+ * cleared and the entry is unlinked from mpte->mpte_sopts. The option
+ * itself is not freed here. The MPTCP session lock must be held.
+ */
+void
+mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
+{
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
+       mpo->mpo_flags &= ~MPOF_ATTACHED;
+       TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
+}
+
+/*
+ * Look up a recorded socket option by <sopt_level,sopt_name>.
+ *
+ * Returns the first entry on the session's option list whose level and
+ * name both match `sopt`, or NULL if there is none.
+ */
+struct mptopt *
+mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
+{
+       struct mptopt *cur;
+
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+
+       TAILQ_FOREACH(cur, &mpte->mpte_sopts, mpo_entry) {
+               if (cur->mpo_level != sopt->sopt_level)
+                       continue;
+               if (cur->mpo_name == sopt->sopt_name)
+                       return (cur);
+       }
+
+       return (NULL);
+}
+
+/*
+ * Allocate a zero-filled MPTCP subflow structure from mptsub_zone.
+ * Returns NULL if zalloc() hands back NULL.
+ */
+static struct mptsub *
+mptcp_subflow_alloc(void)
+{
+       struct mptsub *mpts;
+
+       mpts = zalloc(mptsub_zone);
+       if (mpts != NULL)
+               bzero(mpts, mptsub_zone_size);
+
        return (mpts);
 }
 
@@ -661,44 +1207,145 @@ mptcp_subflow_alloc(int how)
  * Deallocate a subflow structure, called when all of the references held
  * on it have been released.  This implies that the subflow has been deleted.
  */
-void
+static void
 mptcp_subflow_free(struct mptsub *mpts)
 {
-       MPTS_LOCK_ASSERT_HELD(mpts);
-
        VERIFY(mpts->mpts_refcnt == 0);
        VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
        VERIFY(mpts->mpts_mpte == NULL);
        VERIFY(mpts->mpts_socket == NULL);
 
-       if (mpts->mpts_src_sl != NULL) {
-               sockaddrlist_free(mpts->mpts_src_sl);
-               mpts->mpts_src_sl = NULL;
-       }
-       if (mpts->mpts_dst_sl != NULL) {
-               sockaddrlist_free(mpts->mpts_dst_sl);
-               mpts->mpts_dst_sl = NULL;
+       if (mpts->mpts_src != NULL) {
+               FREE(mpts->mpts_src, M_SONAME);
+               mpts->mpts_src = NULL;
        }
-       MPTS_UNLOCK(mpts);
-       lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
 
        zfree(mptsub_zone, mpts);
 }
 
+/*
+ * Take a reference on a subflow. Panics if the counter wraps around
+ * to zero.
+ */
+static void
+mptcp_subflow_addref(struct mptsub *mpts)
+{
+       mpts->mpts_refcnt++;
+       if (mpts->mpts_refcnt == 0) {
+               panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
+               /* NOTREACHED */
+       }
+}
+
+/*
+ * Drop a reference on a subflow. Panics if the counter is already zero;
+ * once no references remain the structure is released through
+ * mptcp_subflow_free().
+ */
+static void
+mptcp_subflow_remref(struct mptsub *mpts)
+{
+       if (mpts->mpts_refcnt == 0) {
+               panic("%s: mpts %p negative refcnt\n", __func__, mpts);
+               /* NOTREACHED */
+       }
+       if (--mpts->mpts_refcnt > 0)
+               return;
+
+       /* last reference dropped: release the subflow structure */
+       mptcp_subflow_free(mpts);
+}
+
+/*
+ * Link a freshly created subflow socket `so` and its subflow structure
+ * `mpts` to the MPTCP session `mpte`.
+ *
+ * Takes one use count on the MPTCP socket and two references on `mpts`
+ * (one for list membership, one for the subflow socket).
+ */
+static void
+mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
+{
+       struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
+       struct tcpcb *tp = sototcpcb(so);
+
+       /*
+        * From this moment on, the subflow is linked to the MPTCP-connection.
+        * Locking,... happens now at the MPTCP-layer
+        */
+       tp->t_mptcb = mpte->mpte_mptcb;
+       so->so_flags |= SOF_MP_SUBFLOW;
+       mp_so->so_usecount++;
+
+       /*
+        * Insert the subflow into the list, and associate the MPTCP PCB
+        * as well as the subflow socket.  From this point on, removing
+        * the subflow needs to be done via mptcp_subflow_del().
+        */
+       TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
+       mpte->mpte_numflows++;
+
+       atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
+       mpts->mpts_mpte = mpte;
+       mpts->mpts_socket = so;
+       tp->t_mpsub = mpts;
+       mptcp_subflow_addref(mpts);     /* for being in MPTCP subflow list */
+       mptcp_subflow_addref(mpts);     /* for subflow socket */
+}
+
+/*
+ * NECP callback installed on subflow inpcbs (mptcp_subflow_socreate
+ * assigns it to necp_cb).
+ *
+ * Only NECP_CLIENT_CBACTION_NONVIABLE is acted upon; a set
+ * NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER flag is treated the same
+ * way. The subflow is then flagged MPTSF_CLOSE_REQD and subflow
+ * (re)creation is scheduled for the owning session.
+ *
+ * handle       the subflow's struct inpcb
+ * action       NECP callback action
+ * necp_flags   NECP result flags
+ * viable       optional out-parameter; set to 1 in handover mode
+ *
+ * NOTE(review): `action` is declared __unused although it is both read
+ * and overwritten below - confirm whether the attribute should go.
+ */
+static void
+mptcp_subflow_necp_cb(void *handle, __unused int action,
+                     __unused uint32_t interface_index,
+                     uint32_t necp_flags, bool *viable)
+{
+       boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
+       struct inpcb *inp = (struct inpcb *)handle;
+       struct socket *so = inp->inp_socket;
+       struct mptsub *mpts;
+       struct mptses *mpte;
+
+       /* A low-power interface is handled like a non-viable one. */
+       if (low_power)
+               action = NECP_CLIENT_CBACTION_NONVIABLE;
+
+       if (action != NECP_CLIENT_CBACTION_NONVIABLE)
+               return;
+
+       /*
+        * The socket is being garbage-collected. There is nothing to be done
+        * here.
+        */
+       if (so->so_usecount == 0)
+               return;
+
+       socket_lock(so, 1);
+
+       /* Check again after we acquired the lock. */
+       if (so->so_usecount == 0)
+               goto out;
+
+       mpte = tptomptp(sototcpcb(so))->mpt_mpte;
+       mpts = sototcpcb(so)->t_mpsub;
+
+       os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u",
+                    __func__, mpts->mpts_ifscope, low_power);
+
+       /* mptcp_remove_subflows() acts on this flag in the deferred run. */
+       mpts->mpts_flags |= MPTSF_CLOSE_REQD;
+
+       mptcp_sched_create_subflows(mpte);
+
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL)
+               *viable = 1;
+
+out:
+       socket_unlock(so, 1);
+}
+
 /*
  * Create an MPTCP subflow socket.
  */
 static int
 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
-    struct proc *p, struct socket **so)
+    struct socket **so)
 {
+       lck_mtx_t *subflow_mtx;
        struct mptopt smpo, *mpo, *tmpo;
+       struct proc *p;
        struct socket *mp_so;
        int error;
 
        *so = NULL;
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       mp_so = mptetoso(mpte);
+
+       p = proc_find(mp_so->last_pid);
+       if (p == PROC_NULL) {
+               mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+
+               return (ESRCH);
+       }
 
        /*
         * Create the subflow socket (multipath subflow, non-blocking.)
@@ -708,19 +1355,49 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
         * It also indicates to the underlying TCP to handle MPTCP options.
         * A multipath subflow socket implies SS_NOFDREF state.
         */
-       if ((error = socreate_internal(dom, so, SOCK_STREAM,
-           IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
-               mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate mp_so 0x%llx"
-                   " unable to create subflow socket error %d\n",
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+
+       /*
+        * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
+        * the ipi-lock. We cannot hold the socket-lock at that point.
+        */
+       mpte_unlock(mpte);
+       error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
+                                 SOCF_ASYNC, PROC_NULL);
+       mpte_lock(mpte);
+       if (error) {
+               mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
+                         __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+
+               proc_rele(p);
+
+               mptcp_subflow_free(mpts);
                return (error);
        }
 
-       socket_lock(*so, 0);
-       VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
-       VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
-           (SS_NBIO|SS_NOFDREF));
+       /*
+        * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
+        * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
+        * Which is why we also need to get the lock with pr_getlock, as after
+        * setting the flag, socket_unlock will work on the MPTCP-level lock.
+        */
+       subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
+       lck_mtx_lock(subflow_mtx);
+
+       /*
+        * Must be the first thing we do, to make sure all pointers for this
+        * subflow are set.
+        */
+       mptcp_subflow_attach(mpte, mpts, *so);
+
+       /*
+        * A multipath subflow socket is used internally in the kernel,
+        * therefore it does not have a file descriptor associated by
+        * default.
+        */
+       (*so)->so_state |= SS_NOFDREF;
+
+       lck_mtx_unlock(subflow_mtx);
 
        /* prevent the socket buffers from being compressed */
        (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
@@ -729,10 +1406,42 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
        /* Inherit preconnect and TFO data flags */
        if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
                (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
-
        if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
                (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
 
+       /* Inherit uuid and create the related flow. */
+       if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
+               struct mptcb *mp_tp = mpte->mpte_mptcb;
+
+               sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
+
+               /*
+                * A note on the unlock: with MPTCP, we call
+                * necp_client_register_socket_flow multiple times. This is
+                * problematic, because the lock-ordering guarantee (first
+                * necp-locks, then socket-locks) is then no longer respected.
+                * So, we need to unlock here.
+                */
+               mpte_unlock(mpte);
+               error = necp_client_register_socket_flow(mp_so->last_pid,
+                   mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
+               mpte_lock(mpte);
+
+               if (error)
+                       goto out_err;
+
+               /* Possible state-change during the unlock above */
+               if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
+                   (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
+                       goto out_err;
+
+               uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
+       } else {
+               mptcplog((LOG_NOTICE, "%s: uuid is not set!\n"),
+                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+       }
+
+       /* inherit the other socket options */
        bzero(&smpo, sizeof (smpo));
        smpo.mpo_flags |= MPOF_SUBFLOW_OK;
        smpo.mpo_level = SOL_SOCKET;
@@ -740,42 +1449,36 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
 
        /* disable SIGPIPE */
        smpo.mpo_name = SO_NOSIGPIPE;
-       if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
-               goto out;
+       if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
+               goto out_err;
 
        /* find out if the subflow's source address goes away */
        smpo.mpo_name = SO_NOADDRERR;
-       if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
-               goto out;
+       if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
+               goto out_err;
 
        /* enable keepalive */
        smpo.mpo_name = SO_KEEPALIVE;
-       if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
-               goto out;
-
-       /*
-        * Limit the receive socket buffer size to 64k.
-        *
-        * We need to take into consideration the window scale option
-        * which could be negotiated in one subflow but disabled in
-        * another subflow.
-        * XXX This can be improved in the future.
-        */
-       smpo.mpo_name = SO_RCVBUF;
-       smpo.mpo_intval = MPTCP_RWIN_MAX;
-       if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
-               goto out;
-
-       /* N.B.: set by sosetopt */
-       VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
-       /* Prevent automatic socket buffer sizing. */
-       (*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
+       if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
+               goto out_err;
 
        smpo.mpo_level = IPPROTO_TCP;
        smpo.mpo_intval = mptcp_subflow_keeptime;
        smpo.mpo_name = TCP_KEEPALIVE;
-       if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
-               goto out;
+       if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
+               goto out_err;
+
+       if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
+               /*
+                * On secondary subflows we might need to set the cell-fallback
+                * flag (see conditions in mptcp_subflow_sosetopt).
+                */
+               smpo.mpo_level = SOL_SOCKET;
+               smpo.mpo_name = SO_MARK_CELLFALLBACK;
+               smpo.mpo_intval = 1;
+               if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
+                       goto out_err;
+       }
 
        /* replay setsockopt(2) on the subflow sockets for eligible options */
        TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
@@ -796,14 +1499,12 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
                        continue;
 
                interim = (mpo->mpo_flags & MPOF_INTERIM);
-               if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
-                       char buf[32];
-                       mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate"
-                           " mp_so 0x%llx"
-                           " sopt %s val %d interim record removed\n",
+               if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
+                       mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
+                           " sopt %s val %d interim record removed\n", __func__,
                            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                           mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
-                           buf, sizeof (buf)), mpo->mpo_intval),
+                           mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
+                           mpo->mpo_intval),
                            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
                        mptcp_sopt_remove(mpte, mpo);
                        mptcp_sopt_free(mpo);
@@ -816,7 +1517,6 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
         * so use a customized socket receive function.  We will undo
         * this when the socket is peeled off or closed.
         */
-       mpts->mpts_oprotosw = (*so)->so_proto;
        switch (dom) {
        case PF_INET:
                (*so)->so_proto = &mptcp_subflow_protosw;
@@ -831,11 +1531,20 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
                /* NOTREACHED */
        }
 
-out:
-       socket_unlock(*so, 0);
+       proc_rele(p);
+
+       DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
+           int, dom, int, error);
+
+       return (0);
+
+out_err:
+       mptcp_subflow_abort(mpts, error);
+
+       proc_rele(p);
 
-       DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
-           struct mptsub *, mpts, int, dom, int, error);
+       mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
+                 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
 
        return (error);
 }
@@ -846,96 +1555,116 @@ out:
  * Note that this may be called on an embryonic subflow, and the only
  * thing that is guaranteed valid is the protocol-user request.
  */
-static int
-mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
+static void
+mptcp_subflow_soclose(struct mptsub *mpts)
 {
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       struct socket *so = mpts->mpts_socket;
 
-       socket_lock(so, 0);
+       if (mpts->mpts_flags & MPTSF_CLOSED)
+               return;
+
+       VERIFY(so != NULL);
        VERIFY(so->so_flags & SOF_MP_SUBFLOW);
        VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
 
-       /* restore protocol-user requests */
-       VERIFY(mpts->mpts_oprotosw != NULL);
-       so->so_proto = mpts->mpts_oprotosw;
-       socket_unlock(so, 0);
-
-       mpts->mpts_socket = NULL;       /* may already be NULL */
-
        DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
            struct socket *, so,
            struct sockbuf *, &so->so_rcv,
            struct sockbuf *, &so->so_snd,
            struct mptses *, mpts->mpts_mpte);
 
-       return (soclose(so));
+       mpts->mpts_flags |= MPTSF_CLOSED;
+
+       if (so->so_retaincnt == 0) {
+               soclose_locked(so);
+
+               return;
+       } else {
+               VERIFY(so->so_usecount > 0);
+               so->so_usecount--;
+       }
+
+       return;
 }
 
 /*
  * Connect an MPTCP subflow socket.
  *
- * This may be called inline as part of adding a subflow, or asynchronously
- * by the thread (upon progressing to MPTCPF_JOIN_READY).  Note that in the
- * pending connect case, the subflow socket may have been bound to an interface
- * and/or a source IP address which may no longer be around by the time this
- * routine is called; in that case the connect attempt will most likely fail.
+ * Note that in the pending connect case, the subflow socket may have been
+ * bound to an interface and/or a source IP address which may no longer be
+ * around by the time this routine is called; in that case the connect attempt
+ * will most likely fail.
  */
 static int
 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
 {
-       struct socket *so;
-       int af, error;
+       char dbuf[MAX_IPv6_STR_LEN];
+       struct socket *mp_so, *so;
+       struct mptcb *mp_tp;
+       struct sockaddr *dst;
+       struct proc *p;
+       int af, error, dport;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mp_so = mptetoso(mpte);
+       mp_tp = mpte->mpte_mptcb;
+       so = mpts->mpts_socket;
+       af = mpts->mpts_dst.sa_family;
+       dst = &mpts->mpts_dst;
 
-       VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
-           MPTSF_CONNECTING);
+       VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
        VERIFY(mpts->mpts_socket != NULL);
-       so = mpts->mpts_socket;
-       af = mpts->mpts_family;
+       VERIFY(af == AF_INET || af == AF_INET6);
 
-       if (af == AF_INET || af == AF_INET6) {
-               struct sockaddr_entry *dst_se;
-               char dbuf[MAX_IPv6_STR_LEN];
+       if (af == AF_INET) {
+               inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof (dbuf));
+               dport = ntohs(SIN(dst)->sin_port);
+       } else {
+               inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof (dbuf));
+               dport = ntohs(SIN6(dst)->sin6_port);
+       }
 
-               dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
-               VERIFY(dst_se != NULL);
+       os_log_info(mptcp_log_handle,
+                   "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
+                   dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
 
-               mptcplog((LOG_DEBUG, "MPTCP Socket: connectx mp_so 0x%llx "
-                   "dst %s[%d] cid %d [pended %s]\n",
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
-                   inet_ntop(af, ((af == AF_INET) ?
-                   (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
-                   (void *)&SIN6(dst_se->se_addr)->sin6_addr),
-                   dbuf, sizeof (dbuf)), ((af == AF_INET) ?
-                   ntohs(SIN(dst_se->se_addr)->sin_port) :
-                   ntohs(SIN6(dst_se->se_addr)->sin6_port)),
-                   mpts->mpts_connid,
-                   ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
-                   "YES" : "NO")),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+       p = proc_find(mp_so->last_pid);
+       if (p == PROC_NULL) {
+               mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+
+               return (ESRCH);
        }
 
        mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
 
-       socket_lock(so, 0);
        mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
 
        /* connect the subflow socket */
-       error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
-           mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
-           mpte->mpte_associd, NULL, CONNREQF_MPTCP,
-           &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr), NULL, NULL);
-       socket_unlock(so, 0);
+       error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
+           p, mpts->mpts_ifscope,
+           mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
+
+       mpts->mpts_iss = sototcpcb(so)->iss;
+
+       /* See tcp_connect_complete */
+       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
+           (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
+               mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
+       }
 
        /* Allocate a unique address id per subflow */
        mpte->mpte_addrid_last++;
        if (mpte->mpte_addrid_last == 0)
                mpte->mpte_addrid_last++;
 
+       proc_rele(p);
+
        DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
            struct mptsub *, mpts, int, error);
+       if (error)
+               mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
+                         __func__, error, mpts->mpts_ifscope),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
 
        return (error);
 }
@@ -948,12 +1677,13 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 #pragma unused(uio)
+       struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
        int flags, error = 0;
        struct proc *p = current_proc();
        struct mbuf *m, **mp = mp0;
-       struct mbuf *nextrecord;
+       boolean_t proc_held = FALSE;
 
-       socket_lock(so, 1);
+       mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
        VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
 
 #ifdef MORE_LOCKING_DEBUG
@@ -967,10 +1697,9 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
         * to the MPTCP layer, so we require that the caller passes in the
         * expected parameters.
         */
-       if (mp == NULL || controlp != NULL) {
-               socket_unlock(so, 1);
+       if (mp == NULL || controlp != NULL)
                return (EINVAL);
-       }
+
        *mp = NULL;
        if (psa != NULL)
                *psa = NULL;
@@ -979,10 +1708,9 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
        else
                flags = 0;
 
-       if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
-               socket_unlock(so, 1);
+       if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
                return (EOPNOTSUPP);
-       }
+
        flags |= (MSG_DONTWAIT|MSG_NBIO);
 
        /*
@@ -994,10 +1722,6 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
                struct sockbuf *sb = &so->so_rcv;
 
                error = ENOTCONN;
-               SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
-                   __func__, proc_pid(p), proc_best_name(p),
-                   (uint64_t)VM_KERNEL_ADDRPERM(so),
-                   SOCK_DOM(so), SOCK_TYPE(so), error);
                /*
                 * This socket should have been disconnected and flushed
                 * prior to being returned from sodefunct(); there should
@@ -1005,7 +1729,6 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
                 */
                if (so->so_state & SS_DEFUNCT)
                        sb_empty_assert(sb, __func__);
-               socket_unlock(so, 1);
                return (error);
        }
 
@@ -1026,20 +1749,16 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
         * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
         */
        if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
-           (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
-               socket_unlock(so, 1);
+           (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
                return (0);
-       }
 
        /*
         * For consistency with soreceive() semantics, we need to obey
         * SB_LOCK in case some other code path has locked the buffer.
         */
        error = sblock(&so->so_rcv, 0);
-       if (error != 0) {
-               socket_unlock(so, 1);
+       if (error != 0)
                return (error);
-       }
 
        m = so->so_rcv.sb_mb;
        if (m == NULL) {
@@ -1075,141 +1794,222 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
                goto release;
        }
 
+       mptcp_update_last_owner(so, mp_so);
+
+       if (mp_so->last_pid != proc_pid(p)) {
+               p = proc_find(mp_so->last_pid);
+               if (p == PROC_NULL) {
+                       p = current_proc();
+               } else {
+                       proc_held = TRUE;
+               }
+       }
+
        OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
        SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
        SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
 
        while (m != NULL) {
-               nextrecord = m->m_nextpkt;
-               sbfree(&so->so_rcv, m);
-
-               if (mp != NULL) {
-                       *mp = m;
-                       mp = &m->m_next;
-                       so->so_rcv.sb_mb = m = m->m_next;
-                       *mp = NULL;
+               int dlen = 0, dfin = 0, error_out = 0;
+               struct mbuf *start = m;
+               uint64_t dsn;
+               uint32_t sseq;
+               uint16_t orig_dlen;
+               uint16_t csum;
+
+               VERIFY(m->m_nextpkt == NULL);
+
+               if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
+                       orig_dlen = dlen = m->m_pkthdr.mp_rlen;
+                       dsn = m->m_pkthdr.mp_dsn;
+                       sseq = m->m_pkthdr.mp_rseq;
+                       csum = m->m_pkthdr.mp_csum;
+               } else {
+                       /* We did fallback */
+                       mptcp_adj_rmap(so, m, 0, 0, 0, 0);
+
+                       sbfree(&so->so_rcv, m);
+
+                       if (mp != NULL) {
+                               *mp = m;
+                               mp = &m->m_next;
+                               so->so_rcv.sb_mb = m = m->m_next;
+                               *mp = NULL;
+
+                       }
+
+                       if (m != NULL) {
+                               so->so_rcv.sb_lastrecord = m;
+                       } else {
+                               SB_EMPTY_FIXUP(&so->so_rcv);
+                       }
+
+                       continue;
                }
 
+               if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)
+                       dfin = 1;
+
+               /*
+                * Check if the full mapping is now present
+                */
+               if ((int)so->so_rcv.sb_cc < dlen - dfin) {
+                       mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
+                                 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
+                                MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
+
+                       if (*mp0 == NULL)
+                               error = EWOULDBLOCK;
+                       goto release;
+               }
+
+               /* Now, get the full mapping */
+               while (dlen > 0) {
+                       if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
+                               error_out = 1;
+                               error = EIO;
+                               dlen = 0;
+                               soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
+                               break;
+                       }
+
+                       dlen -= m->m_len;
+                       sbfree(&so->so_rcv, m);
+
+                       if (mp != NULL) {
+                               *mp = m;
+                               mp = &m->m_next;
+                               so->so_rcv.sb_mb = m = m->m_next;
+                               *mp = NULL;
+                       }
+
+                       if (dlen - dfin == 0)
+                               dlen = 0;
+
+                       VERIFY(dlen <= 0 || m);
+               }
+
+               VERIFY(dlen == 0);
+
                if (m != NULL) {
-                       m->m_nextpkt = nextrecord;
-                       if (nextrecord == NULL)
-                               so->so_rcv.sb_lastrecord = m;
+                       so->so_rcv.sb_lastrecord = m;
                } else {
-                       m = so->so_rcv.sb_mb = nextrecord;
                        SB_EMPTY_FIXUP(&so->so_rcv);
                }
+
+               if (error_out)
+                       goto release;
+
+
+               if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
+                       error = EIO;
+                       *mp0 = NULL;
+                       goto release;
+               }
+
                SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
                SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
        }
 
        DTRACE_MPTCP3(subflow__receive, struct socket *, so,
            struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
-       /* notify protocol that we drained all the data */
-       if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
-               (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
 
        if (flagsp != NULL)
                *flagsp |= flags;
 
 release:
-       sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
+       sbunlock(&so->so_rcv, TRUE);
+
+       if (proc_held)
+               proc_rele(p);
+
        return (error);
 
 }
 
-
 /*
- * Prepare an MPTCP subflow socket for peeloff(2); basically undo
- * the work done earlier when the subflow socket was created.
+ * MPTCP subflow socket send routine, derived from sosend().
  */
-void
-mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
-    struct socket *so)
+static int
+mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *control, int flags)
 {
-       struct mptopt smpo;
-       struct socket *mp_so;
-       int p, c;
-
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
+       struct proc *p = current_proc();
+       boolean_t en_tracing = FALSE, proc_held = FALSE;
+       int en_tracing_val;
+       int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
+       int error;
 
-       socket_lock(so, 0);
-       VERIFY(so->so_flags & SOF_MP_SUBFLOW);
-       VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
+       VERIFY(control == NULL);
+       VERIFY(addr == NULL);
+       VERIFY(uio == NULL);
+       VERIFY(flags == 0);
+       VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
 
-       /* inherit MPTCP socket states */
-       if (!(mp_so->so_state & SS_NBIO))
-               so->so_state &= ~SS_NBIO;
+       VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
+       VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
 
        /*
-        * At this point, the socket is not yet closed, as there is at least
-        * one outstanding usecount previously held by mpts_socket from
-        * socreate().  Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
+        * trace only when tracing, for network (vs. unix) sockets, and
+        * when the last output interface is not loopback
         */
-       so->so_flags &= ~SOF_MP_SUBFLOW;
-       so->so_state &= ~SS_NOFDREF;
-       so->so_flags &= ~SOF_MPTCP_TRUE;
+       if (ENTR_SHOULDTRACE &&
+           (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
+               struct inpcb *inp = sotoinpcb(so);
+               if (inp->inp_last_outifp != NULL &&
+                   !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
+                       en_tracing = TRUE;
+                       en_tracing_val = top->m_pkthdr.len;
+                       KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
+                           VM_KERNEL_ADDRPERM(so),
+                           ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
+                           (int64_t)en_tracing_val);
+               }
+       }
 
-       /* allow socket buffers to be compressed */
-       so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
-       so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
+       mptcp_update_last_owner(so, mp_so);
 
-       /*
-        * Allow socket buffer auto sizing.
-        *
-        * This will increase the current 64k buffer size to whatever is best.
-        */
-       if (!(so->so_rcv.sb_flags & SB_USRSIZE))
-               so->so_rcv.sb_flags |= SB_AUTOSIZE;
-       if (!(so->so_snd.sb_flags & SB_USRSIZE))
-               so->so_snd.sb_flags |= SB_AUTOSIZE;
+       if (mp_so->last_pid != proc_pid(p)) {
+               p = proc_find(mp_so->last_pid);
+               if (p == PROC_NULL) {
+                       p = current_proc();
+               } else {
+                       proc_held = TRUE;
+               }
+       }
 
-       /* restore protocol-user requests */
-       VERIFY(mpts->mpts_oprotosw != NULL);
-       so->so_proto = mpts->mpts_oprotosw;
+#if NECP
+       inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
+#endif /* NECP */
 
-       bzero(&smpo, sizeof (smpo));
-       smpo.mpo_flags |= MPOF_SUBFLOW_OK;
-       smpo.mpo_level = SOL_SOCKET;
+       OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
 
-       /* inherit SOF_NOSIGPIPE from parent MP socket */
-       p = (mp_so->so_flags & SOF_NOSIGPIPE);
-       c = (so->so_flags & SOF_NOSIGPIPE);
-       smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
-       smpo.mpo_name = SO_NOSIGPIPE;
-       if ((p - c) != 0)
-               (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
+       error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
+       if (error)
+               goto out;
 
-       /* inherit SOF_NOADDRAVAIL from parent MP socket */
-       p = (mp_so->so_flags & SOF_NOADDRAVAIL);
-       c = (so->so_flags & SOF_NOADDRAVAIL);
-       smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
-       smpo.mpo_name = SO_NOADDRERR;
-       if ((p - c) != 0)
-               (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
+       error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
+       top = NULL;
 
-       /* inherit SO_KEEPALIVE from parent MP socket */
-       p = (mp_so->so_options & SO_KEEPALIVE);
-       c = (so->so_options & SO_KEEPALIVE);
-       smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
-       smpo.mpo_name = SO_KEEPALIVE;
-       if ((p - c) != 0)
-               (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
+out:
+       if (top != NULL)
+               m_freem(top);
 
-       /* unset TCP level default keepalive option */
-       p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
-       c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
-       smpo.mpo_level = IPPROTO_TCP;
-       smpo.mpo_intval = 0;
-       smpo.mpo_name = TCP_KEEPALIVE;
-       if ((p - c) != 0)
-               (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
-       socket_unlock(so, 0);
+       if (proc_held)
+               proc_rele(p);
+
+       soclearfastopen(so);
+
+       if (en_tracing) {
+               KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
+                   VM_KERNEL_ADDRPERM(so),
+                   ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
+                   (int64_t)en_tracing_val);
+       }
+
+       return (error);
 
-       DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
-           struct mptsub *, mpts, struct socket *, so,
-           struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
 }
 
 /*
@@ -1217,70 +2017,70 @@ mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
  * connected), or add a subflow to an existing MPTCP connection.
  */
 int
-mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
-    struct proc *p, uint32_t ifscope)
+mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
+    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
 {
-       struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
        struct socket *mp_so, *so = NULL;
-       struct mptsub_connreq mpcr;
        struct mptcb *mp_tp;
+       struct mptsub *mpts = NULL;
        int af, error = 0;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
 
-       MPT_LOCK(mp_tp);
        if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
                /* If the remote end sends Data FIN, refuse subflow adds */
+               mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
                error = ENOTCONN;
-               MPT_UNLOCK(mp_tp);
-               return (error);
+               goto out_err;
        }
-       MPT_UNLOCK(mp_tp);
-
-       MPTS_LOCK(mpts);
-       VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
-       VERIFY(mpts->mpts_mpte == NULL);
-       VERIFY(mpts->mpts_socket == NULL);
-       VERIFY(mpts->mpts_dst_sl != NULL);
-       VERIFY(mpts->mpts_connid == SAE_CONNID_ANY);
 
-       /* select source (if specified) and destination addresses */
-       if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
-           &mpts->mpts_dst_sl, &dst_se)) != 0)
-               goto out;
+       mpts = mptcp_subflow_alloc();
+       if (mpts == NULL) {
+               mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               error = ENOMEM;
+               goto out_err;
+       }
 
-       VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
-       VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
-       af = mpts->mpts_family = dst_se->se_addr->sa_family;
-       VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
-       VERIFY(af == AF_INET || af == AF_INET6);
+       if (src != NULL) {
+               int len = src->sa_len;
 
-       /*
-        * If the source address is not specified, allocate a storage for
-        * it, so that later on we can fill it in with the actual source
-        * IP address chosen by the underlying layer for the subflow after
-        * it is connected.
-        */
-       if (mpts->mpts_src_sl == NULL) {
-               mpts->mpts_src_sl =
-                   sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
-               if (mpts->mpts_src_sl == NULL) {
-                       error = ENOBUFS;
-                       goto out;
+               MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
+                   M_WAITOK | M_ZERO);
+               if (mpts->mpts_src == NULL) {
+                       mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
+                                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                       error = ENOMEM;
+                       goto out_err;
                }
-               se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
-               VERIFY(se != NULL && se->se_addr != NULL &&
-                   se->se_addr->sa_len == dst_se->se_addr->sa_len);
-               bzero(se->se_addr, se->se_addr->sa_len);
-               se->se_addr->sa_len = dst_se->se_addr->sa_len;
-               se->se_addr->sa_family = dst_se->se_addr->sa_family;
+               bcopy(src, mpts->mpts_src, len);
        }
 
+       memcpy(&mpts->mpts_dst, dst, dst->sa_len);
+
+       af = mpts->mpts_dst.sa_family;
+
+       mpts->mpts_ifscope = ifscope;
+
        /* create the subflow socket */
-       if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
-               goto out;
+       if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
+               /*
+                * Return the error without cleaning up here, because up to
+                * this point all we have done is create mpts.
+                *
+                * The contract is that calling mptcp_subflow_socreate
+                * transfers ownership of mpts to mptcp_subflow_socreate.
+                */
+               return (error);
+
+       /*
+        * We may be called from within the kernel, but the subflow still needs
+        * to be accounted to the real app.
+        */
+       mptcp_update_last_owner(mpts->mpts_socket, mp_so);
 
        /*
         * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
@@ -1292,8 +2092,6 @@ mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
                mpte->mpte_connid_last++;
 
        mpts->mpts_connid = mpte->mpte_connid_last;
-       VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
-           mpts->mpts_connid != SAE_CONNID_ALL);
 
        mpts->mpts_rel_seq = 1;
 
@@ -1302,169 +2100,45 @@ mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
        if (mpte->mpte_addrid_last == 0)
                mpte->mpte_addrid_last++;
 
-       /* bind subflow socket to the specified interface */
-       if (ifscope != IFSCOPE_NONE) {
-               socket_lock(so, 0);
-               error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
-               if (error != 0) {
-                       socket_unlock(so, 0);
-                       (void) mptcp_subflow_soclose(mpts, so);
-                       goto out;
-               }
-               VERIFY(mpts->mpts_outif != NULL);
-               mpts->mpts_flags |= MPTSF_BOUND_IF;
-
-               if (IFNET_IS_EXPENSIVE(mpts->mpts_outif)) {
-                       sototcpcb(so)->t_mpflags |= TMPF_BACKUP_PATH;
-               } else {
-                       mpts->mpts_flags |= MPTSF_PREFERRED;
-               }
-
-               mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add mp_so 0x%llx "
-                   "bindif %s[%d] cid %d expensive %d\n",
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mpts->mpts_outif->if_xname,
-                   ifscope, mpts->mpts_connid,
-                   IFNET_IS_EXPENSIVE(mpts->mpts_outif)),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-               socket_unlock(so, 0);
-       }
-
-       /* if source address and/or port is specified, bind to it */
-       if (src_se != NULL) {
-               struct sockaddr *sa = src_se->se_addr;
-               uint32_t mpts_flags = 0;
-               in_port_t lport;
-
-               switch (af) {
-               case AF_INET:
-                       if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
-                               mpts_flags |= MPTSF_BOUND_IP;
-                       if ((lport = SIN(sa)->sin_port) != 0)
-                               mpts_flags |= MPTSF_BOUND_PORT;
-                       break;
-#if INET6
-               case AF_INET6:
-                       VERIFY(af == AF_INET6);
-                       if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
-                               mpts_flags |= MPTSF_BOUND_IP;
-                       if ((lport = SIN6(sa)->sin6_port) != 0)
-                               mpts_flags |= MPTSF_BOUND_PORT;
-                       break;
-#endif /* INET6 */
-               }
-
-               error = sobindlock(so, sa, 1);  /* will lock/unlock socket */
-               if (error != 0) {
-                       (void) mptcp_subflow_soclose(mpts, so);
-                       goto out;
-               }
-               mpts->mpts_flags |= mpts_flags;
-
-               if (af == AF_INET || af == AF_INET6) {
-                       char sbuf[MAX_IPv6_STR_LEN];
-
-                       mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add "
-                           "mp_so 0x%llx bindip %s[%d] cid %d\n",
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                           inet_ntop(af, ((af == AF_INET) ?
-                           (void *)&SIN(sa)->sin_addr.s_addr :
-                           (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
-                           ntohs(lport), mpts->mpts_connid),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-               }
-       }
-
-       /*
-        * Insert the subflow into the list, and associate the MPTCP PCB
-        * as well as the the subflow socket.  From this point on, removing
-        * the subflow needs to be done via mptcp_subflow_del().
-        */
-       TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
-       mpte->mpte_numflows++;
-
-       atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
-       mpts->mpts_mpte = mpte;
-       mpts->mpts_socket = so;
-       MPTS_ADDREF_LOCKED(mpts);       /* for being in MPTCP subflow list */
-       MPTS_ADDREF_LOCKED(mpts);       /* for subflow socket */
-       mp_so->so_usecount++;           /* for subflow socket */
-
        /* register for subflow socket read/write events */
-       (void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
-           mptcp_subflow_wupcall, mpts);
+       sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);
 
-       /*
-        * Register for subflow socket control events; ignore
-        * SO_FILT_HINT_CONNINFO_UPDATED from below since we
-        * will generate it here.
-        */
-       (void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
+       /* Register for subflow socket control events */
+       sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
            SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
-           SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
-           SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
-           SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
-           SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
-           SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
-           SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
-           SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);
+           SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
+           SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
+           SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
+           SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
+           SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
+           SO_FILT_HINT_ADAPTIVE_WTIMO);
 
        /* sanity check */
        VERIFY(!(mpts->mpts_flags &
            (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
 
-       bzero(&mpcr, sizeof (mpcr));
-       mpcr.mpcr_proc = p;
-       mpcr.mpcr_ifscope = ifscope;
        /*
         * Indicate to the TCP subflow whether or not it should establish
         * the initial MPTCP connection, or join an existing one.  Fill
         * in the connection request structure with additional info needed
         * by the underlying TCP (to be used in the TCP options, etc.)
         */
-       MPT_LOCK(mp_tp);
        if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
+               mpts->mpts_flags |= MPTSF_INITIAL_SUB;
+
                if (mp_tp->mpt_state == MPTCPS_CLOSED) {
-                       mptcp_init_local_parms(mp_tp);
+                       mptcp_init_local_parms(mpte);
                }
-               MPT_UNLOCK(mp_tp);
                soisconnecting(mp_so);
-               mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
+
+               /* If fastopen is requested, set state in mpts */
+               if (so->so_flags1 & SOF1_PRECONNECT_DATA)
+                       mpts->mpts_flags |= MPTSF_TFO_REQD;
        } else {
                if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
                        mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
-
-               /* avoid starting up cellular subflow unless required */
-               if ((mptcp_delayed_subf_start) &&
-                   (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
-                       mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
-               }
-               MPT_UNLOCK(mp_tp);
-               mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
-       }
-
-       /* If fastjoin or fastopen is requested, set state in mpts */
-       if (mpte->mpte_nummpcapflows == 0) {
-               if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
-                       MPT_LOCK(mp_tp);
-                       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
-                               mpts->mpts_flags |= MPTSF_TFO_REQD;
-                               mpts->mpts_sndnxt = mp_tp->mpt_snduna;
-                       }
-                       MPT_UNLOCK(mp_tp);
-               }
-
-               if (so->so_flags & SOF_MPTCP_FASTJOIN) {
-                       MPT_LOCK(mp_tp);
-                       if (mp_tp->mpt_state == MPTCPS_ESTABLISHED) {
-                               mpts->mpts_flags |= MPTSF_FASTJ_REQD;
-                               mpts->mpts_sndnxt = mp_tp->mpt_snduna;
-                       }
-                       MPT_UNLOCK(mp_tp);
-               }
        }
 
-       mpts->mpts_mpcr = mpcr;
        mpts->mpts_flags |= MPTSF_CONNECTING;
 
        if (af == AF_INET || af == AF_INET6) {
@@ -1475,76 +2149,84 @@ mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
                    "[pending %s]\n", __func__,
                    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
                    inet_ntop(af, ((af == AF_INET) ?
-                   (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
-                   (void *)&SIN6(dst_se->se_addr)->sin6_addr),
+                   (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
+                   (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
                    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
-                   ntohs(SIN(dst_se->se_addr)->sin_port) :
-                   ntohs(SIN6(dst_se->se_addr)->sin6_port)),
+                   ntohs(SIN(&mpts->mpts_dst)->sin_port) :
+                   ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
                    mpts->mpts_connid,
                    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
                    "YES" : "NO")),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
        }
 
        /* connect right away if first attempt, or if join can be done now */
        if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
                error = mptcp_subflow_soconnectx(mpte, mpts);
 
-out:
-       MPTS_UNLOCK(mpts);
-       if (error == 0) {
-               soevent(mp_so, SO_FILT_HINT_LOCKED |
-                   SO_FILT_HINT_CONNINFO_UPDATED);
-       }
+       if (error)
+               goto out_err_close;
+
+       if (pcid)
+               *pcid = mpts->mpts_connid;
+
+       return (0);
+
+out_err_close:
+       mptcp_subflow_abort(mpts, error);
+
        return (error);
+
+out_err:
+       if (mpts)
+               mptcp_subflow_free(mpts);
+
+       return (error);
+}
+
+void
+mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
+{
+       int index = mptcp_get_statsindex(stats, mpts);
+
+       if (index != -1) {
+               struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
+
+               stats[index].mpis_txbytes += inp->inp_stat->txbytes;
+               stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
+       }
 }
 
 /*
  * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
  * will no longer be accessible after a subflow is deleted, thus this
  * should occur only after the subflow socket has been disconnected.
- * If peeloff(2) is called, leave the socket open.
  */
 void
-mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
+mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
 {
-       struct socket *mp_so, *so;
-
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-
-       MPTS_LOCK(mpts);
-       so = mpts->mpts_socket;
-       VERIFY(so != NULL);
+       struct socket *mp_so = mptetoso(mpte);
+       struct socket *so = mpts->mpts_socket;
+       struct tcpcb *tp = sototcpcb(so);
 
-       if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
-           (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
-               MPTS_UNLOCK(mpts);
-               mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del returning"
-                   " mp_so 0x%llx flags %x\n",
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_flags),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-               return;
-       }
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       VERIFY(mpts->mpts_mpte == mpte);
+       VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
+       VERIFY(mpte->mpte_numflows != 0);
+       VERIFY(mp_so->so_usecount > 0);
 
-       mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del mp_so 0x%llx "
-           "[u=%d,r=%d] cid %d [close %s] %d %x error %d\n",
-           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-           mp_so->so_usecount,
-           mp_so->so_retaincnt, mpts->mpts_connid,
-           (close ? "YES" : "NO"), mpts->mpts_soerror,
-           mpts->mpts_flags,
-           mp_so->so_error),
-           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+       mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
+                 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
+                 mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
+                 mpts->mpts_flags, mp_so->so_error),
+                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
-       VERIFY(mpts->mpts_mpte == mpte);
-       VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
-           mpts->mpts_connid != SAE_CONNID_ALL);
+       mptcpstats_update(mpte->mpte_itfstats, mpts);
+       mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
+       mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
 
-       VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
        atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
        TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
-       VERIFY(mpte->mpte_numflows != 0);
        mpte->mpte_numflows--;
        if (mpte->mpte_active_sub == mpts)
                mpte->mpte_active_sub = NULL;
@@ -1553,73 +2235,94 @@ mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
         * Drop references held by this subflow socket; there
         * will be no further upcalls made from this point.
         */
-       (void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
-       (void) sock_catchevents(so, NULL, NULL, 0);
+       sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
+       sock_catchevents_locked(so, NULL, NULL, 0);
 
        mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
 
-       if (close)
-               (void) mptcp_subflow_soclose(mpts, so);
-
-       VERIFY(mp_so->so_usecount != 0);
        mp_so->so_usecount--;           /* for subflow socket */
        mpts->mpts_mpte = NULL;
        mpts->mpts_socket = NULL;
-       MPTS_UNLOCK(mpts);
 
-       MPTS_REMREF(mpts);              /* for MPTCP subflow list */
-       MPTS_REMREF(mpts);              /* for subflow socket */
+       mptcp_subflow_remref(mpts);             /* for MPTCP subflow list */
+       mptcp_subflow_remref(mpts);             /* for subflow socket */
+
+       so->so_flags &= ~SOF_MP_SUBFLOW;
+       tp->t_mptcb = NULL;
+       tp->t_mpsub = NULL;
+}
+
+void
+mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
+{
+       struct socket *so = mpts->mpts_socket;
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
+       int send_dfin = 0;
+
+       if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
+               send_dfin = 1;
+
+       if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
+           (so->so_state & SS_ISCONNECTED)) {
+               mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
+                   __func__, mpts->mpts_connid, send_dfin),
+                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+
+               if (send_dfin)
+                       mptcp_send_dfin(so);
+               soshutdownlock(so, SHUT_WR);
+       }
+
+}
+
+static void
+mptcp_subflow_abort(struct mptsub *mpts, int error)
+{
+       struct socket *so = mpts->mpts_socket;
+       struct tcpcb *tp = sototcpcb(so);
+
+       if (mpts->mpts_flags & MPTSF_DISCONNECTED)
+               return;
+
+       mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
+                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
-       soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
+       if (tp->t_state != TCPS_CLOSED)
+               tcp_drop(tp, error);
+
+       mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
 }
 
 /*
  * Disconnect a subflow socket.
  */
 void
-mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
-    boolean_t deleteok)
+mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
 {
        struct socket *so;
        struct mptcb *mp_tp;
        int send_dfin = 0;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
 
        VERIFY(mpts->mpts_mpte == mpte);
        VERIFY(mpts->mpts_socket != NULL);
-       VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
-           mpts->mpts_connid != SAE_CONNID_ALL);
 
        if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
                return;
 
        mpts->mpts_flags |= MPTSF_DISCONNECTING;
 
-       /*
-        * If this is coming from disconnectx(2) or issued as part of
-        * closing the MPTCP socket, the subflow shouldn't stick around.
-        * Otherwise let it linger around in case the upper layers need
-        * to retrieve its conninfo.
-        */
-       if (deleteok)
-               mpts->mpts_flags |= MPTSF_DELETEOK;
-
        so = mpts->mpts_socket;
        mp_tp = mpte->mpte_mptcb;
-       MPT_LOCK(mp_tp);
-       if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
+       if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
                send_dfin = 1;
-       MPT_UNLOCK(mp_tp);
 
-       socket_lock(so, 0);
        if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
            (so->so_state & SS_ISCONNECTED)) {
-               mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d "
-                   "[linger %s]\n", __func__, mpts->mpts_connid, send_dfin,
-                   (deleteok ? "NO" : "YES")),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+               mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
+                   __func__, mpts->mpts_connid, send_dfin),
+                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
                if (send_dfin)
                        mptcp_send_dfin(so);
@@ -1627,154 +2330,134 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
                (void) soshutdownlock(so, SHUT_WR);
                (void) sodisconnectlocked(so);
        }
-       socket_unlock(so, 0);
        /*
         * Generate a disconnect event for this subflow socket, in case
         * the lower layer doesn't do it; this is needed because the
-        * subflow socket deletion relies on it.  This will also end up
-        * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
-        * we cannot do that here because subflow lock is currently held.
+        * subflow socket deletion relies on it.
         */
-       mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
+       mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
 }
 
 /*
- * Subflow socket read upcall.
- *
- * Called when the associated subflow socket posted a read event.  The subflow
- * socket lock has been released prior to invoking the callback.  Note that the
- * upcall may occur synchronously as a result of MPTCP performing an action on
- * it, or asynchronously as a result of an event happening at the subflow layer.
- * Therefore, to maintain lock ordering, the only lock that can be acquired
- * here is the thread lock, for signalling purposes.
+ * Called when the associated subflow socket posted a read event.
  */
 static void
 mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
 {
 #pragma unused(so, waitf)
-       struct mptsub *mpts = arg;
+       struct mptsub *mpts = arg, *tmpts;
        struct mptses *mpte = mpts->mpts_mpte;
 
-       /*
-        * mpte should never be NULL, except in a race with
-        * mptcp_subflow_del
-        */
-       if (mpte == NULL)
+       VERIFY(mpte != NULL);
+
+       if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
+               if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
+                       mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
                return;
+       }
+
+       mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
+       TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
+               if (mpts->mpts_socket->so_usecount == 0) {
+                       /* Will be removed soon by tcp_garbage_collect */
+                       continue;
+               }
+
+               mptcp_subflow_addref(mpts);
+               mpts->mpts_socket->so_usecount++;
 
-       lck_mtx_lock(&mpte->mpte_thread_lock);
-       mptcp_thread_signal_locked(mpte);
-       lck_mtx_unlock(&mpte->mpte_thread_lock);
+               mptcp_subflow_input(mpte, mpts);
+
+               mptcp_subflow_remref(mpts);             /* ours */
+
+               VERIFY(mpts->mpts_socket->so_usecount != 0);
+               mpts->mpts_socket->so_usecount--;
+       }
+
+       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
 }
 
 /*
  * Subflow socket input.
- *
- * Called in the context of the MPTCP thread, for reading data from the
- * underlying subflow socket and delivering it to MPTCP.
  */
 static void
 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
 {
+       struct socket *mp_so = mptetoso(mpte);
        struct mbuf *m = NULL;
        struct socket *so;
-       int error;
-       struct mptsub *mpts_alt = NULL;
+       int error, wakeup = 0;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
+       mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
 
        DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
            struct mptsub *, mpts);
 
        if (!(mpts->mpts_flags & MPTSF_CONNECTED))
-               return;
+               goto out;
 
        so = mpts->mpts_socket;
 
        error = sock_receive_internal(so, NULL, &m, 0, NULL);
        if (error != 0 && error != EWOULDBLOCK) {
-               mptcplog((LOG_ERR, "MPTCP Receiver: %s cid %d error %d\n",
+               mptcplog((LOG_ERR, "%s: cid %d error %d\n",
                    __func__, mpts->mpts_connid, error),
                    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
-               MPTS_UNLOCK(mpts);
-               mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
-               if (mpts_alt == NULL) {
-                       if (mptcp_delayed_subf_start) {
-                               mpts_alt = mptcp_get_pending_subflow(mpte,
-                                   mpts);
-                               if (mpts_alt) {
-                                       mptcplog((LOG_DEBUG,"MPTCP Receiver:"
-                                       " %s: pending %d\n",
-                                       __func__, mpts_alt->mpts_connid),
-                                       MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
-                               } else {
-                                       mptcplog((LOG_ERR, "MPTCP Receiver:"
-                                           " %s: no pending flow for cid %d",
-                                           __func__, mpts->mpts_connid),
-                                           MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
-                               }
-                       } else {
-                               mptcplog((LOG_ERR, "MPTCP Receiver: %s: no alt"
-                                   " path for cid %d\n", __func__,
-                                   mpts->mpts_connid),
-                                   MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
-                       }
-                       if (error == ENODATA) {
-                               /*
-                                * Don't ignore ENODATA so as to discover
-                                * nasty middleboxes.
-                                */
-                               struct socket *mp_so =
-                                   mpte->mpte_mppcb->mpp_socket;
-                               mp_so->so_error = ENODATA;
-                               sorwakeup(mp_so);
-                       }
+               if (error == ENODATA) {
+                       /*
+                        * Don't ignore ENODATA so as to discover
+                        * nasty middleboxes.
+                        */
+                       mp_so->so_error = ENODATA;
+
+                       wakeup = 1;
+                       goto out;
                }
-               MPTS_LOCK(mpts);
        } else if (error == 0) {
-               mptcplog((LOG_DEBUG, "MPTCP Receiver: %s: cid %d \n",
-                   __func__, mpts->mpts_connid),
+               mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
                    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
        }
 
        /* In fallback, make sure to accept data on all but one subflow */
-       if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
-           (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
+       if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
+           !(mpts->mpts_flags & MPTSF_ACTIVE)) {
+               mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
+                   __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
                m_freem(m);
-               return;
+               goto out;
        }
 
        if (m != NULL) {
+               if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
+                       mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
 
-               /* Did we receive data on the backup subflow? */
-               if (!(mpts->mpts_flags & MPTSF_ACTIVE))
-                       mpts->mpts_peerswitch++;
-               else
-                       mpts->mpts_peerswitch = 0;
+                       mpte->mpte_used_cell = 1;
+               } else {
+                       mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
+
+                       mpte->mpte_used_wifi = 1;
+               }
 
-               /*
-                * Release subflow lock since this may trigger MPTCP to send,
-                * possibly on a different subflow.  An extra reference has
-                * been held on the subflow by the MPTCP thread before coming
-                * here, so we can be sure that it won't go away, in the event
-                * the MP socket lock gets released.
-                */
-               MPTS_UNLOCK(mpts);
                mptcp_input(mpte, m);
-               MPTS_LOCK(mpts);
        }
+
+       /* notify protocol that we drained all the data */
+       if (error == 0 && m != NULL &&
+           (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
+               (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
+
+out:
+       if (wakeup)
+               mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
+
+       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
 }
 
 /*
  * Subflow socket write upcall.
  *
- * Called when the associated subflow socket posted a read event.  The subflow
- * socket lock has been released prior to invoking the callback.  Note that the
- * upcall may occur synchronously as a result of MPTCP performing an action on
- * it, or asynchronously as a result of an event happening at the subflow layer.
- * Therefore, to maintain lock ordering, the only lock that can be acquired
- * here is the thread lock, for signalling purposes.
+ * Called when the associated subflow socket posted a write event.
  */
 static void
 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
@@ -1783,18 +2466,36 @@ mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
        struct mptsub *mpts = arg;
        struct mptses *mpte = mpts->mpts_mpte;
 
-       /*
-        * mpte should never be NULL except in a race with
-        * mptcp_subflow_del which doesn't hold socket lock across critical
-        * section. This upcall is made after releasing the socket lock.
-        * Interleaving of socket operations becomes possible therefore.
-        */
-       if (mpte == NULL)
+       VERIFY(mpte != NULL);
+
+       if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
+               if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
+                       mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
                return;
+       }
 
-       lck_mtx_lock(&mpte->mpte_thread_lock);
-       mptcp_thread_signal_locked(mpte);
-       lck_mtx_unlock(&mpte->mpte_thread_lock);
+       mptcp_output(mpte);
+}
+
+static boolean_t
+mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
+{
+       struct mbuf *so_m = so->so_snd.sb_mb;
+       uint64_t dsn = m->m_pkthdr.mp_dsn;
+
+       while (so_m) {
+               VERIFY(so_m->m_flags & M_PKTHDR);
+               VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
+
+               /* Part of the segment is covered, don't reinject here */
+               if (so_m->m_pkthdr.mp_dsn <= dsn &&
+                   so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn)
+                       return TRUE;
+
+               so_m = so_m->m_next;
+       }
+
+       return FALSE;
 }
 
 /*
@@ -1803,61 +2504,47 @@ mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
  * Called for sending data from MPTCP to the underlying subflow socket.
  */
 int
-mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
+mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
 {
-       struct socket *mp_so, *so;
-       size_t sb_cc = 0, tot_sent = 0;
-       struct mbuf *sb_mb;
-       int error = 0, wakeup = 0;
-       u_int64_t mpt_dsn = 0;
        struct mptcb *mp_tp = mpte->mpte_mptcb;
-       struct mbuf *mpt_mbuf = NULL;
-       u_int64_t off = 0;
-       struct mbuf *head, *tail;
-       int tcp_zero_len_write = 0;
-
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
+       struct socket *mp_so, *so;
+       struct tcpcb *tp;
+       uint64_t mpt_dsn = 0, off = 0;
+       int sb_cc = 0, error = 0, wakeup = 0;
+       uint32_t dss_csum;
+       uint16_t tot_sent = 0;
+       boolean_t reinjected = FALSE;
+
+       mpte_lock_assert_held(mpte);
+
+       mp_so = mptetoso(mpte);
        so = mpts->mpts_socket;
+       tp = sototcpcb(so);
+
+       VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
+       mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
+
+       VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
+       VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
+              (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
+              (mpts->mpts_flags & MPTSF_TFO_REQD));
+       VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
 
+       mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
+                 __func__, mpts->mpts_flags, mpte->mpte_flags,
+                 mptcp_subflow_cwnd_space(so)),
+                MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
        DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
            struct mptsub *, mpts);
 
-       /* subflow socket is suspended? */
-       if (mpts->mpts_flags & MPTSF_SUSPENDED) {
-               mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d is "
-                   "flow controlled\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
-               goto out;
-       }
-
-       /* subflow socket is not MPTCP capable? */
-       if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
-           !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
-           !(mpts->mpts_flags & MPTSF_FASTJ_SEND) &&
-           !(mpts->mpts_flags & MPTSF_TFO_REQD)) {
-               mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d not "
-                   "MPTCP capable\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
-               goto out;
-       }
-
        /* Remove Addr Option is not sent reliably as per I-D */
        if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
-               struct tcpcb *tp = intotcpcb(sotoinpcb(so));
                tp->t_rem_aid = mpte->mpte_lost_aid;
-               if (mptcp_remaddr_enable)
-                       tp->t_mpflags |= TMPF_SND_REM_ADDR;
+               tp->t_mpflags |= TMPF_SND_REM_ADDR;
                mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
        }
 
-       if (mpts->mpts_flags & MPTSF_TFO_REQD) {
-               mptcp_drop_tfo_data(mpte, mpts, &wakeup);
-       }
-
        /*
         * The mbuf chains containing the metadata (as well as pointing to
         * the user data sitting at the MPTCP output queue) would then be
@@ -1872,154 +2559,213 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
         *      pkt_flags marked with the PKTF_MPTCP flag.
         */
 
-       /* First, drop acknowledged data */
-       sb_mb = mp_so->so_snd.sb_mb;
+       if (mpte->mpte_reinjectq)
+               sb_mb = mpte->mpte_reinjectq;
+       else
+               sb_mb = mp_so->so_snd.sb_mb;
+
        if (sb_mb == NULL) {
+               mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
+                         __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
+                         (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
+                        MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+
+               /* Fix it to prevent looping */
+               if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
+                       mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
                goto out;
        }
 
        VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
 
-       mpt_mbuf = sb_mb;
-       while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
-               if (((so->so_state & SS_ISCONNECTED) == 0) &&
-                   (mpt_mbuf->m_next == NULL) &&
-                   (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
-                       /*
-                        * If TFO, allow connection establishment with zero
-                        * length write.
-                        */
-                       tcp_zero_len_write = 1;
-                       goto zero_len_write;
-               }
-               mpt_mbuf = mpt_mbuf->m_next;
-       }
-       if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
-               mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
-       } else {
-               goto out;
+       if (sb_mb->m_pkthdr.mp_rlen == 0 &&
+           !(so->so_state & SS_ISCONNECTED) &&
+           (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
+               tp->t_mpflags |= TMPF_TFO_REQUEST;
+               goto zero_len_write;
        }
 
-       MPT_LOCK(mp_tp);
+       mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
+
+       /* First, drop acknowledged data */
        if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
-               u_int64_t len = 0;
-               len = mp_tp->mpt_snduna - mpt_dsn;
-               MPT_UNLOCK(mp_tp);
-               sbdrop(&mp_so->so_snd, (int)len);
-               wakeup = 1;
-               MPT_LOCK(mp_tp);
+               mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
+                                  "dsn %u suna %u reinject? %u\n",
+                         __func__, (uint32_t)mpt_dsn,
+                         (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
+                        MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+               if (mpte->mpte_reinjectq) {
+                       mptcp_clean_reinjectq(mpte);
+               } else {
+                       uint64_t len = 0;
+                       len = mp_tp->mpt_snduna - mpt_dsn;
+                       sbdrop(&mp_so->so_snd, (int)len);
+                       wakeup = 1;
+               }
+       }
+
+       /* Check again because of above sbdrop */
+       if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
+               mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
+                        MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+               goto out;
        }
 
        /*
         * In degraded mode, we don't receive data acks, so force free
         * mbufs less than snd_nxt
         */
-       if (mp_so->so_snd.sb_mb == NULL) {
-               MPT_UNLOCK(mp_tp);
-               goto out;
-       }
-
-       mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
        if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
            (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
-           MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
-               u_int64_t len = 0;
-               len = mp_tp->mpt_sndnxt - mpt_dsn;
-               sbdrop(&mp_so->so_snd, (int)len);
-               wakeup = 1;
-               mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
+           mp_so->so_snd.sb_mb) {
+               mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
+               if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
+                       uint64_t len = 0;
+                       len = mp_tp->mpt_snduna - mpt_dsn;
+                       sbdrop(&mp_so->so_snd, (int)len);
+                       wakeup = 1;
+
+                       mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
+                                 __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
+                                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+               }
        }
 
        if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
            !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
                mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
                so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
-               if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
-                       mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
-       }
-
-       /*
-        * Adjust the subflow's notion of next byte to send based on
-        * the last unacknowledged byte
-        */
-       if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
-               mpts->mpts_sndnxt = mp_tp->mpt_snduna;
        }
 
        /*
         * Adjust the top level notion of next byte used for retransmissions
         * and sending FINs.
         */
-       if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
+       if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
                mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
-       }
-
 
        /* Now determine the offset from which to start transmitting data */
-       sb_mb = mp_so->so_snd.sb_mb;
-       sb_cc = mp_so->so_snd.sb_cc;
+       if (mpte->mpte_reinjectq)
+               sb_mb = mpte->mpte_reinjectq;
+       else
+dont_reinject:
+               sb_mb = mp_so->so_snd.sb_mb;
        if (sb_mb == NULL) {
-               MPT_UNLOCK(mp_tp);
+               mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
+                        MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
                goto out;
        }
-       if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
-               off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
-               sb_cc -= (size_t)off;
+
+       if (sb_mb == mpte->mpte_reinjectq) {
+               sb_cc = sb_mb->m_pkthdr.mp_rlen;
+               off = 0;
+
+               if (mptcp_search_seq_in_sub(sb_mb, so)) {
+                       if (mptcp_can_send_more(mp_tp, TRUE)) {
+                               goto dont_reinject;
+                       }
+
+                       error = ECANCELED;
+                       goto out;
+               }
+
+               reinjected = TRUE;
+       } else if (flags & MPTCP_SUBOUT_PROBING) {
+               sb_cc = sb_mb->m_pkthdr.mp_rlen;
+               off = 0;
        } else {
-               MPT_UNLOCK(mp_tp);
-               goto out;
+               sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
+
+               /*
+                * With TFO, there might be no data at all, thus still go into this
+                * code-path here.
+                */
+               if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
+                   MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
+                       off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
+                       sb_cc -= off;
+               } else {
+                       mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
+                                 __func__, (uint32_t)mp_tp->mpt_sndnxt,
+                                 (uint32_t)mp_tp->mpt_sndmax),
+                                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+
+                       goto out;
+               }
        }
-       MPT_UNLOCK(mp_tp);
 
-       mpt_mbuf = sb_mb;
+       sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
+       if (sb_cc <= 0) {
+               mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
+                         __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
+                         (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
+                         mptcp_subflow_cwnd_space(so)),
+                         MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+       }
+
+       sb_cc = min(sb_cc, UINT16_MAX);
+
+       /*
+        * Create a DSN mapping for the data we are about to send. It all
+        * has the same mapping.
+        */
+       if (reinjected)
+               mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
+       else
+               mpt_dsn = mp_tp->mpt_snduna + off;
 
-       while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
-           (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
+       mpt_mbuf = sb_mb;
+       while (mpt_mbuf && reinjected == FALSE &&
+              (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
+               mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
                off -= mpt_mbuf->m_pkthdr.mp_rlen;
                mpt_mbuf = mpt_mbuf->m_next;
        }
        if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
-               mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid = %d "
-                   "snduna = %llu sndnxt = %llu probe %d\n",
-                   __func__, mpts->mpts_connid,
-                   mp_tp->mpt_snduna, mpts->mpts_sndnxt,
+               mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
+                   __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
                    mpts->mpts_probecnt),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 
        VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
 
        head = tail = NULL;
 
        while (tot_sent < sb_cc) {
-               struct mbuf *m;
-               size_t mlen;
+               ssize_t mlen;
 
-               mlen = mpt_mbuf->m_pkthdr.mp_rlen;
+               mlen = mpt_mbuf->m_len;
                mlen -= off;
-               if (mlen == 0)
-                       goto out;
+               mlen = min(mlen, sb_cc - tot_sent);
 
-               if (mlen > sb_cc) {
-                       panic("%s: unexpected %lu %lu \n", __func__,
-                           mlen, sb_cc);
+               if (mlen < 0) {
+                       mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
+                                 __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
+                                 (uint32_t)off, sb_cc, tot_sent),
+                                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+                       goto out;
                }
 
+               if (mlen == 0)
+                       goto next;
+
                m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
                    M_COPYM_MUST_COPY_HDR);
                if (m == NULL) {
+                       mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
+                                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
                        error = ENOBUFS;
                        break;
                }
 
                /* Create a DSN mapping for the data (m_copym does it) */
-               mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
                VERIFY(m->m_flags & M_PKTHDR);
+               VERIFY(m->m_next == NULL);
+
                m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
                m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
-               m->m_pkthdr.mp_dsn = mpt_dsn + off;
+               m->m_pkthdr.mp_dsn = mpt_dsn;
                m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
-               m->m_pkthdr.mp_rlen = mlen;
-               mpts->mpts_rel_seq += mlen;
                m->m_pkthdr.len = mlen;
 
                if (head == NULL) {
@@ -2031,352 +2777,498 @@ mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
 
                tot_sent += mlen;
                off = 0;
+next:
                mpt_mbuf = mpt_mbuf->m_next;
        }
 
-       if (head != NULL) {
-               struct tcpcb *tp = intotcpcb(sotoinpcb(so));
+       if (reinjected) {
+               if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
+                       struct mbuf *n = sb_mb;
+
+                       while (n) {
+                               n->m_pkthdr.mp_dsn += sb_cc;
+                               n->m_pkthdr.mp_rlen -= sb_cc;
+                               n = n->m_next;
+                       }
+                       m_adj(sb_mb, sb_cc);
+               } else {
+                       mpte->mpte_reinjectq = sb_mb->m_nextpkt;
+                       m_freem(sb_mb);
+               }
+       }
+
+       mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
+                 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
+                 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
+
+       if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
+               dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
+                                            tot_sent);
+       }
+
+       /* Now, let's update rel-seq and the data-level length */
+       mpts->mpts_rel_seq += tot_sent;
+       m = head;
+       while (m) {
+               if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
+                       m->m_pkthdr.mp_csum = dss_csum;
+               m->m_pkthdr.mp_rlen = tot_sent;
+               m = m->m_next;
+       }
 
+       if (head != NULL) {
                if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
-                   (tp->t_tfo_stats == 0)) {
+                   (tp->t_tfo_stats == 0))
                        tp->t_mpflags |= TMPF_TFO_REQUEST;
-               } else if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
-                       tp->t_mpflags |= TMPF_FASTJOIN_SEND;
-               }
 
                error = sock_sendmbuf(so, NULL, head, 0, NULL);
 
-               DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
+               DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
                    struct sockbuf *, &so->so_rcv,
                    struct sockbuf *, &so->so_snd,
                    struct mptses *, mpte, struct mptsub *, mpts,
                    size_t, tot_sent);
-       } else if (tcp_zero_len_write == 1) {
-zero_len_write:
-               socket_lock(so, 1);
-               /* Opting to call pru_send as no mbuf at subflow level */
-               error = (*so->so_proto->pr_usrreqs->pru_send)
-                   (so, 0, NULL, NULL, NULL, current_proc());
-               socket_unlock(so, 1);
        }
 
-       if ((error == 0) || (error == EWOULDBLOCK)) {
-               mpts->mpts_sndnxt += tot_sent;
+done_sending:
+       if (error == 0 ||
+           (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
+               uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
 
                if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
                        tcpstat.tcps_mp_num_probes++;
-                       if (tot_sent < mpts->mpts_maxseg)
+                       if ((uint32_t)tot_sent < mpts->mpts_maxseg)
                                mpts->mpts_probecnt += 1;
                        else
                                mpts->mpts_probecnt +=
                                    tot_sent/mpts->mpts_maxseg;
                }
 
-               MPT_LOCK(mp_tp);
-
-               if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
-                       if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
+               if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
+                       if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
                            MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
                                mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
-                       mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
+                       mp_tp->mpt_sndnxt = new_sndnxt;
                }
-               mptcp_cancel_timer(mp_tp, MPTT_REXMT);
-               MPT_UNLOCK(mp_tp);
 
-               if (so->so_flags1 & SOF1_PRECONNECT_DATA)
-                       so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
+               mptcp_cancel_timer(mp_tp, MPTT_REXMT);
 
-               /* Send once in SYN_SENT state to avoid sending SYN spam */
-               if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
-                       so->so_flags &= ~SOF_MPTCP_FASTJOIN;
-                       mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
-               }
+               /* Must be here as mptcp_can_send_more() checks for this */
+               soclearfastopen(mp_so);
 
                if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
                    (mpts->mpts_probesoon != 0))
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid %d "
-                           "wrote %d %d probe %d probedelta %d\n",
-                           __func__, mpts->mpts_connid, (int)tot_sent,
-                           (int) sb_cc, mpts->mpts_probecnt,
+                       mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
+                           __func__, mpts->mpts_connid,
+                           !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
+                           tot_sent, (int) sb_cc, mpts->mpts_probecnt,
                            (tcp_now - mpts->mpts_probesoon)),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
+
+               if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
+                       mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
+
+                       mpte->mpte_used_cell = 1;
+               } else {
+                       mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
+
+                       mpte->mpte_used_wifi = 1;
+               }
+
+               /*
+                * Don't propagate EWOULDBLOCK - it's already taken care of
+                * in mptcp_usr_send for TFO.
+                */
+               error = 0;
        } else {
-               mptcplog((LOG_ERR, "MPTCP Sender: %s cid %d error %d len %zd\n",
-                   __func__, mpts->mpts_connid, error, tot_sent),
+               mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
+                   __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
                    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
        }
 out:
+
        if (wakeup)
-               sowwakeup(mp_so);
+               mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
 
+       mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
        return (error);
+
+zero_len_write:
+       /* Opting to call pru_send as no mbuf at subflow level */
+       error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
+                                                     NULL, current_proc());
+
+       goto done_sending;
 }
 
-/*
- * Subflow socket control event upcall.
- *
- * Called when the associated subflow socket posted one or more control events.
- * The subflow socket lock has been released prior to invoking the callback.
- * Note that the upcall may occur synchronously as a result of MPTCP performing
- * an action on it, or asynchronously as a result of an event happening at the
- * subflow layer.  Therefore, to maintain lock ordering, the only lock that can
- * be acquired here is the thread lock, for signalling purposes.
- */
+/*
+ * Insert the mbuf chain m into the session's reinject queue
+ * (mpte->mpte_reinjectq), which is linked through m_nextpkt and walked in
+ * increasing data-sequence-number (mp_dsn) order.
+ *
+ * m is consumed: it is either linked into the queue or freed when an
+ * existing entry already covers it.  At most one queued entry (the first one
+ * whose DSN is >= m's DSN) is dropped when m covers it entirely.
+ */
 static void
-mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
+mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
 {
-#pragma unused(so)
-       struct mptsub *mpts = arg;
-       struct mptses *mpte = mpts->mpts_mpte;
+       struct mbuf *n, *prev = NULL;
 
-       VERIFY(mpte != NULL);
+       mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
+                 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
+                 m->m_pkthdr.mp_rseq),
+                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+
+       n = mpte->mpte_reinjectq;
+
+       /* First, look for an mbuf n, whose data-sequence-number is bigger or
+        * equal than m's sequence number.  prev trails one entry behind n.
+        */
+       while (n) {
+               if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
+                       break;
+
+               prev = n;
+
+               n = n->m_nextpkt;
+       }
+
+       if (n) {
+               /* m is already fully covered by the next mbuf in the queue */
+               if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
+                   n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
+                       mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
+                                 __func__, n->m_pkthdr.mp_rlen),
+                                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                       goto dont_queue;
+               }
+
+               /* m is covering the next mbuf entirely, thus we remove this guy */
+               if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
+                       struct mbuf *tmp = n->m_nextpkt;
+
+                       /* mp_dsn is 64-bit; truncate for %u like the log above */
+                       mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
+                                 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
+                                 (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
+                                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+
+                       m->m_nextpkt = NULL;
+                       if (prev == NULL)
+                               mpte->mpte_reinjectq = tmp;
+                       else
+                               prev->m_nextpkt = tmp;
+
+                       m_freem(n);
+                       n = tmp;
+               }
+
+       }
+
+       if (prev) {
+               /* m is already fully covered by the previous mbuf in the queue */
+               /* NOTE(review): this test uses m_pkthdr.len where the others use mp_rlen - confirm intended */
+               if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
+                       mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
+                                 __func__, (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
+                                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                       goto dont_queue;
+               }
+       }
+
+       /* Link m between prev (or the queue head) and n */
+       if (prev == NULL)
+               mpte->mpte_reinjectq = m;
+       else
+               prev->m_nextpkt = m;
 
-       lck_mtx_lock(&mpte->mpte_thread_lock);
-       atomic_bitset_32(&mpts->mpts_evctl, events);
-       mptcp_thread_signal_locked(mpte);
-       lck_mtx_unlock(&mpte->mpte_thread_lock);
+       m->m_nextpkt = n;
+
+       return;
+
+dont_queue:
+       m_freem(m);
+       return;
 }
 
-/*
- * Subflow socket control events.
- *
- * Called for handling events related to the underlying subflow socket.
- */
-static ev_ret_t
-mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+/*
+ * Walk the MPTCP socket's send buffer (so_snd.sb_mb, following m_next) and
+ * return the mbuf whose DSN mapping [mp_dsn, mp_dsn + mp_rlen) contains dsn.
+ *
+ * Returns NULL when the end of the buffer is reached, or as soon as an mbuf
+ * with a DSN greater than dsn is seen.
+ */
+static struct mbuf *
+mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
 {
-       uint32_t events, save_events;
-       ev_ret_t ret = MPTS_EVRET_OK;
-       int i = 0;
-       int mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl)/
-               sizeof(mpsub_ev_entry_tbl[0]);
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       struct socket *mp_so = mptetoso(mpte);
+       struct mbuf *m;
 
-       /* bail if there's nothing to process */
-       if ((events = mpts->mpts_evctl) == 0)
-               return (ret);
+       m = mp_so->so_snd.sb_mb;
 
-       if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
-           SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
-           SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
-           SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
-           SO_FILT_HINT_DISCONNECTED)) {
-               events |= SO_FILT_HINT_MPFAILOVER;
+       while (m) {
+               /* If this segment covers what we are looking for, return it. */
+               if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
+                   MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
+                       break;
+
+
+               /* We are already past dsn: the segment is no longer in the queue */
+               if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
+                       return NULL;
+
+               m = m->m_next;
        }
 
-       save_events = events;
+       return m;
+}
 
-       DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
-           struct mptsub *, mpts, uint32_t, events);
+/*
+ * Copy len bytes worth of mbufs, starting at m, into a new m_next-linked
+ * chain.  Each mbuf is duplicated individually with its packet header
+ * (M_COPYM_MUST_COPY_HDR), so every copy carries the DSN mapping; the copies
+ * are flagged PKTF_MPSO | PKTF_MPTCP.
+ *
+ * Every copied mbuf is asserted to carry the same mp_dsn/mp_rlen/mp_rseq as
+ * the first one.
+ *
+ * Returns the head of the new chain, or NULL if an mbuf copy fails (any
+ * partial chain is freed).
+ */
+static struct mbuf *
+mptcp_copy_mbuf_list(struct mbuf *m, int len)
+{
+       struct mbuf *top = NULL, *tail = NULL;
+       uint64_t dsn;
+       uint32_t dlen, rseq;
 
-       DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
-           struct mptsub *, mpts, uint32_t, events);
+       /* Remember the mapping of the first mbuf; all copies must match it */
+       dsn = m->m_pkthdr.mp_dsn;
+       dlen = m->m_pkthdr.mp_rlen;
+       rseq = m->m_pkthdr.mp_rseq;
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: %s cid %d events=%b\n", __func__,
-           mpts->mpts_connid, events, SO_FILT_HINT_BITS),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
+       /* NOTE(review): assumes the chain holds at least len bytes; m is dereferenced without a NULL check */
 
-       /*
-        * Process all the socket filter hints and reset the hint
-        * once it is handled
-        */
-       for (i = 0; (i < mpsub_ev_entry_count) && events; i++) {
-               /*
-                * Always execute the DISCONNECTED event, because it will wakeup
-                * the app.
-                */
-               if ((events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
-                   (ret >= MPTS_EVRET_OK ||
-                    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
-                       ev_ret_t error =
-                               mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint);
-                       events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
-                       ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
+       while (len > 0) {
+               struct mbuf *n;
+
+               VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
+
+               n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
+               if (n == NULL) {
+                       mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
+                                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                       goto err;
                }
-       }
 
-       /*
-        * We should be getting only events specified via sock_catchevents(),
-        * so loudly complain if we have any unprocessed one(s).
-        */
-       if (events != 0 || ret < MPTS_EVRET_OK) {
-               mptcplog((LOG_ERR, "MPTCP Events %s%s: cid %d evret %s (%d)"
-                   " unhandled events=%b\n",
-                   (events != 0) && (ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
-                   __func__, mpts->mpts_connid,
-                   mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
+               VERIFY(n->m_flags & M_PKTHDR);
+               VERIFY(n->m_next == NULL);
+               VERIFY(n->m_pkthdr.mp_dsn == dsn);
+               VERIFY(n->m_pkthdr.mp_rlen == dlen);
+               VERIFY(n->m_pkthdr.mp_rseq == rseq);
+               VERIFY(n->m_len == m->m_len);
+
+               n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
+
+               /* Append the copy to the new chain */
+               if (top == NULL)
+                       top = n;
+
+               if (tail != NULL)
+                       tail->m_next = n;
+
+               tail = n;
+
+               len -= m->m_len;
+               m = m->m_next;
        }
 
-       /* clear the ones we've processed */
-       atomic_bitclear_32(&mpts->mpts_evctl, save_events);
-       return (ret);
+       return top;
+
+err:
+       if (top)
+               m_freem(top);
+
+       return NULL;
 }
 
-/*
- * Handle SO_FILT_HINT_CONNRESET subflow socket event.
- */
-static ev_ret_t
-mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+/*
+ * Queue the data still sitting in subflow socket so's send buffer for
+ * reinjection at the MPTCP level.
+ *
+ * The subflow send buffer is walked mapping by mapping.  A mapping is
+ * skipped when it is already flagged PKTF_MPTCP_REINJ or when it has been
+ * fully acknowledged at the data level.  Otherwise a copy (with DSN headers)
+ * is handed to mptcp_add_reinjectq() and the original is flagged
+ * PKTF_MPTCP_REINJ.  The walk stops early if copying fails.
+ */
+static void
+mptcp_reinject_mbufs(struct socket *so)
 {
-       struct socket *mp_so, *so;
-       struct mptcb *mp_tp;
-       boolean_t linger;
+       struct tcpcb *tp = sototcpcb(so);
+       struct mptsub *mpts = tp->t_mpsub;
+       struct mptcb *mp_tp = tptomptp(tp);
+       struct mptses *mpte = mp_tp->mpt_mpte;
+       struct sockbuf *sb = &so->so_snd;
+       struct mbuf *m;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
-       VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       mp_tp = mpte->mpte_mptcb;
-       so = mpts->mpts_socket;
+       m = sb->sb_mb;
+       while (m) {
+               /* orig stays on the subflow mbuf even if m is redirected below */
+               struct mbuf *n = m->m_next, *orig = m;
 
-       linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
-           !(mp_so->so_flags & SOF_PCBCLEARING));
+               mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
+                         __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
+                         m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
+                        MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d [linger %s]\n", __func__,
-           mpts->mpts_connid, (linger ? "YES" : "NO")),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+               VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
 
-       /*
-        * We got a TCP RST for this subflow connection.
-        *
-        * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
-        * client if the MPTCP connection has not been established or
-        * if the connection has only one subflow and is a connection being
-        * resumed. Otherwise we close the socket.
-        */
-       mptcp_subflow_disconnect(mpte, mpts, !linger);
+               /* Already queued for reinjection earlier */
+               if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
+                       goto next;
 
-       MPT_LOCK(mp_tp);
-       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
-               mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
-       } else if (mpte->mpte_nummpcapflows < 1 ||
-                  ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) &&
-                   (mpts->mpts_flags & MPTSF_ACTIVE))) {
-               mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
-               *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET;
+               /* Has it all already been acknowledged at the data-level? */
+               if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
+                       goto next;
+
+               /* Part of this has already been acknowledged - lookup in the
+                * MPTCP-socket for the segment.
+                */
+               if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
+                       m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
+                       if (m == NULL)
+                               goto next;
+               }
+
+               /* Copy the mbuf with headers (aka, DSN-numbers) */
+               m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
+               if (m == NULL)
+                       break;
+
+               VERIFY(m->m_nextpkt == NULL);
+
+               /* Now, add to the reinject-queue, eliminating overlapping
+                * segments
+                */
+               mptcp_add_reinjectq(mpte, m);
+
+               orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
+
+next:
+               /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
+               while (n) {
+                       VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
+
+                       if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
+                               break;
+
+                       n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
+                       n = n->m_next;
+               }
+
+               m = n;
        }
-       MPT_UNLOCK(mp_tp);
+}
 
-       /*
-        * Keep the subflow socket around, unless the MPTCP socket has
-        * been detached or the subflow has been disconnected explicitly,
-        * in which case it should be deleted right away.
-        */
-       return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
+/*
+ * Drop entries from the head of the reinject queue that no longer need to be
+ * reinjected.  Starting at the head, entries are unlinked and freed until
+ * the first one that starts at or after mpt_snduna, or whose mapping ends
+ * beyond mpt_snduna.
+ *
+ * Asserts that the mpte lock is held.
+ */
+void
+mptcp_clean_reinjectq(struct mptses *mpte)
+{
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
+
+       mpte_lock_assert_held(mpte);
+
+       while (mpte->mpte_reinjectq) {
+               struct mbuf *m = mpte->mpte_reinjectq;
+
+               /* Stop at the first entry that is not entirely below mpt_snduna */
+               if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
+                   MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
+                       break;
+
+               /* Unlink from the queue before freeing the chain */
+               mpte->mpte_reinjectq = m->m_nextpkt;
+               m->m_nextpkt = NULL;
+               m_freem(m);
+       }
 }
 
 /*
- * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
+ * Subflow socket control event upcall.
+ *
+ * Records the posted events in the subflow's pending-event mask
+ * (mpts_evctl).  If upcalls must currently be deferred, only
+ * MPP_SHOULD_WORKLOOP is set on the MPTCP PCB; otherwise the subflow
+ * workloop is run right away.  arg is the struct mptsub of the subflow;
+ * so is unused.
  */
-static ev_ret_t
-mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+static void
+mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
 {
-       struct mptcb *mp_tp;
-       struct socket *so;
+#pragma unused(so)
+       struct mptsub *mpts = arg;
+       struct mptses *mpte = mpts->mpts_mpte;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       VERIFY(mpte != NULL);
+       mpte_lock_assert_held(mpte);
 
-       mp_tp = mpte->mpte_mptcb;
-       so = mpts->mpts_socket;
+       /* All of these events are already pending - nothing new to do */
+       if ((mpts->mpts_evctl & events) == events)
+               return;
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d\n", __func__, mpts->mpts_connid),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       mpts->mpts_evctl |= events;
 
-       /*
-       * A FIN on a fallen back MPTCP-connection should be treated like a
-       * DATA_FIN.
-       */
-       MPT_LOCK(mp_tp);
-       if ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) &&
-           (mpts->mpts_flags & MPTSF_ACTIVE)) {
-               mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
-               if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
-                       *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
-               }
+       /* Deferred case: only flag that the workloop has to run later */
+       if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
+               mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
+               return;
        }
-       MPT_UNLOCK(mp_tp);
 
-       return (MPTS_EVRET_OK); /* keep the subflow socket around */
+       mptcp_subflow_workloop(mpte);
 }
 
 /*
- * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
+ * Subflow socket control events.
+ *
+ * Called for handling events related to the underlying subflow socket.
  */
 static ev_ret_t
-mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts,
+mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
        uint64_t *p_mpsofilt_hint)
 {
-#pragma unused(p_mpsofilt_hint)
-       struct socket *so;
+       ev_ret_t ret = MPTS_EVRET_OK;
+       int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
+                                     sizeof(mpsub_ev_entry_tbl[0]);
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
 
-       so = mpts->mpts_socket;
+       /* bail if there's nothing to process */
+       if (!mpts->mpts_evctl)
+               return (ret);
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d\n", __func__, mpts->mpts_connid),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
+           SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
+           SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
+           SO_FILT_HINT_DISCONNECTED)) {
+               mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
+       }
 
-       return (MPTS_EVRET_OK); /* keep the subflow socket around */
+       DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
+           struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
+
+       mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
+                 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
+                MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
+
+       /*
+        * Process all the socket filter hints and reset the hint
+        * once it is handled
+        */
+       for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
+               /*
+                * Always execute the DISCONNECTED event, because it will wakeup
+                * the app.
+                */
+               if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
+                   (ret >= MPTS_EVRET_OK ||
+                    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
+                       mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
+                       ev_ret_t error =
+                               mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
+                       ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
+               }
+       }
+
+       /*
+        * We should be getting only events specified via sock_catchevents(),
+        * so loudly complain if we have any unprocessed one(s).
+        */
+       if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
+               mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
+                   (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
+                   mpts->mpts_connid,
+                   mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
+                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       else
+               mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
+                         mpts->mpts_evctl, SO_FILT_HINT_BITS),
+                        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
+
+       return (ret);
 }
 
-/*
- * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
- */
+/*
+ * Generic handler for subflow events that may need to be passed up to the
+ * MPTCP socket.
+ *
+ * When the MPTCP connection is not yet established, or when it has fallen
+ * back to TCP and this subflow is the active one, the subflow's so_error is
+ * copied to the MPTCP socket and event is OR-ed into *p_mpsofilt_hint.
+ *
+ * Always returns MPTS_EVRET_OK.
+ */
 static ev_ret_t
-mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
+                          uint64_t *p_mpsofilt_hint, uint64_t event)
 {
-#pragma unused(p_mpsofilt_hint)
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
-       boolean_t linger;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
 
-       linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
-           !(mp_so->so_flags & SOF_PCBCLEARING));
-
-       mptcplog((LOG_NOTICE, "MPTCP Events: "
-           "%s: cid %d [linger %s]\n", __func__,
-           mpts->mpts_connid, (linger ? "YES" : "NO")),
+       /* event is 64-bit, so it needs a matching conversion specifier */
+       mptcplog((LOG_DEBUG, "%s: cid %d event %llu\n", __func__,
+           mpts->mpts_connid, (unsigned long long)event),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
-       if (mpts->mpts_soerror == 0)
-               mpts->mpts_soerror = ETIMEDOUT;
-
        /*
-        * The subflow connection has timed out.
-        *
-        * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
-        * client if the MPTCP connection has not been established. Otherwise
-        * drop it.
+        * We got an event for this subflow that might need to be propagated,
+        * based on the state of the MPTCP connection.
         */
-       mptcp_subflow_disconnect(mpte, mpts, !linger);
-
-       MPT_LOCK(mp_tp);
-       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
-               mp_so->so_error = ETIMEDOUT;
+       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
+           ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
+               mp_so->so_error = so->so_error;
+               *p_mpsofilt_hint |= event;
        }
-       MPT_UNLOCK(mp_tp);
 
-       /*
-        * Keep the subflow socket around, unless the MPTCP socket has
-        * been detached or the subflow has been disconnected explicitly,
-        * in which case it should be deleted right away.
-        */
-       return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
+       return (MPTS_EVRET_OK);
 }
 
 /*
@@ -2384,24 +3276,18 @@ mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts,
  */
 static ev_ret_t
 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+       uint64_t *p_mpsofilt_hint, uint64_t event)
 {
-#pragma unused(p_mpsofilt_hint)
-       struct socket *mp_so, *so;
-       struct mptcb *mp_tp;
-       boolean_t linger;
-       struct tcpcb *tp = NULL;
+#pragma unused(p_mpsofilt_hint, event)
+       struct socket *mp_so;
+       struct tcpcb *tp;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
 
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       mp_tp = mpte->mpte_mptcb;
-       so = mpts->mpts_socket;
+       mp_so = mptetoso(mpte);
+       tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
 
-       /* Not grabbing socket lock as t_local_aid is write once only */
-       tp = intotcpcb(sotoinpcb(so));
        /*
         * This overwrites any previous mpte_lost_aid to avoid storing
         * too much state when the typical case has only two subflows.
@@ -2409,42 +3295,18 @@ mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
        mpte->mpte_flags |= MPTE_SND_REM_ADDR;
        mpte->mpte_lost_aid = tp->t_local_aid;
 
-       linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
-           !(mp_so->so_flags & SOF_PCBCLEARING));
-
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s cid %d [linger %s]\n", __func__,
-           mpts->mpts_connid, (linger ? "YES" : "NO")),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
-
-       if (mpts->mpts_soerror == 0)
-               mpts->mpts_soerror = EADDRNOTAVAIL;
+       mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
+                  MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
        /*
         * The subflow connection has lost its source address.
-        *
-        * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
-        * client if the MPTCP connection has not been established.  If it
-        * has been established with one subflow , we keep the MPTCP
-        * connection valid without any subflows till closed by application.
-        * This lets tcp connection manager decide whether to close this or
-        * not as it reacts to reachability changes too.
         */
-       mptcp_subflow_disconnect(mpte, mpts, !linger);
+       mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
 
-       MPT_LOCK(mp_tp);
-       if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
-           (mp_so->so_flags & SOF_NOADDRAVAIL)) {
-               mp_so->so_error = EADDRNOTAVAIL;
-       }
-       MPT_UNLOCK(mp_tp);
+       if (mp_so->so_flags & SOF_NOADDRAVAIL)
+               mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
 
-       /*
-        * Keep the subflow socket around, unless the MPTCP socket has
-        * been detached or the subflow has been disconnected explicitly,
-        * in which case it should be deleted right away.
-        */
-       return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
+       return (MPTS_EVRET_DELETE);
 }
 
 /*
@@ -2453,19 +3315,15 @@ mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
  */
 static ev_ret_t
 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+       uint64_t *p_mpsofilt_hint, uint64_t event)      /* event is ignored; *p_mpsofilt_hint is only OR-ed into */
 {
-       struct socket *so, *mp_so;
+#pragma unused(event)
        struct mptcb *mp_tp;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       so = mpts->mpts_socket;
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        mp_tp = mpte->mpte_mptcb;
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d\n", __func__, mpts->mpts_connid),
+       mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
        /*
@@ -2474,11 +3332,9 @@ mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
        * mptcp socket and the user is notified so that it may close
        * the socket if needed.
        */
-       MPT_LOCK(mp_tp);
        if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
-               *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
+               *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;   /* raised only while the MPTCP connection is in CLOSE_WAIT */
 
-       MPT_UNLOCK(mp_tp);
        return (MPTS_EVRET_OK); /* keep the subflow socket around */
 }
 
@@ -2487,106 +3343,68 @@ mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
  */
 static ev_ret_t
 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+       uint64_t *p_mpsofilt_hint, uint64_t event)      /* neither the hint pointer nor the event is used here */
 {
+#pragma unused(event, p_mpsofilt_hint)
        struct mptsub *mpts_alt = NULL;
-       struct socket *so = NULL;
+       struct socket *alt_so = NULL;   /* socket of the alternate subflow, once one is found */
        struct socket *mp_so;
        int altpath_exists = 0;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       mptcplog((LOG_NOTICE, "MPTCP Events: "
-           "%s: mp_so 0x%llx\n", __func__,
-           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       mpte_lock_assert_held(mpte);
+       mp_so = mptetoso(mpte); /* used for logging only */
+       mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
+                 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
+                MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
-       MPTS_UNLOCK(mpts);
-       mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
+       mptcp_reinject_mbufs(mpts->mpts_socket);        /* NOTE(review): presumably queues this subflow's unacknowledged data for reinjection - confirm */
 
+       mpts_alt = mptcp_get_subflow(mpte, mpts, NULL); /* candidate alternate subflow, or NULL */
        /*
         * If there is no alternate eligible subflow, ignore the
         * failover hint.
         */
        if (mpts_alt == NULL) {
-               mptcplog((LOG_WARNING, "MPTCP Events: "
-                   "%s: no alternate path\n", __func__),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
-
-               if (mptcp_delayed_subf_start) {
-                       mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
-                       if (mpts_alt != NULL) {
-                               MPTS_LOCK(mpts_alt);
-                               (void) mptcp_subflow_soconnectx(mpte,
-                                   mpts_alt);
-                               MPTS_UNLOCK(mpts_alt);
-                       }
-               }
-               MPTS_LOCK(mpts);
+               mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
+                        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+
                goto done;
        }
-       MPTS_LOCK(mpts_alt);
+
        altpath_exists = 1;
-       so = mpts_alt->mpts_socket;
+       alt_so = mpts_alt->mpts_socket;
        if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
-               socket_lock(so, 1);
                /* All data acknowledged and no RTT spike */
-               if ((so->so_snd.sb_cc == 0) &&
-                   (mptcp_no_rto_spike(so))) {
-                       so->so_flags &= ~SOF_MP_TRYFAILOVER;
+               if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
                        mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
                } else {
                        /* no alternate path available */
                        altpath_exists = 0;
                }
-               socket_unlock(so, 1);
-       }
-       if (altpath_exists) {
-               mptcplog((LOG_INFO, "MPTCP Events: "
-                   "%s: cid = %d\n",
-                   __func__, mpts_alt->mpts_connid),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
-               mpts_alt->mpts_flags |= MPTSF_ACTIVE;
-               mpts_alt->mpts_peerswitch = 0;
-               struct mptcb *mp_tp = mpte->mpte_mptcb;
-               /* Bring the subflow's notion of snd_nxt into the send window */
-               MPT_LOCK(mp_tp);
-               mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
-               MPT_UNLOCK(mp_tp);
-               mpte->mpte_active_sub = mpts_alt;
-               socket_lock(so, 1);
-               sowwakeup(so);
-               socket_unlock(so, 1);
        }
-       MPTS_UNLOCK(mpts_alt);
 
        if (altpath_exists) {
-               *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
-               mptcplog((LOG_NOTICE, "MPTCP Events: "
-                   "%s: mp_so 0x%llx switched from "
-                   "%d to %d\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mpts->mpts_connid, mpts_alt->mpts_connid),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
-               tcpstat.tcps_mp_switches++;
-       }
+               mpts_alt->mpts_flags |= MPTSF_ACTIVE;   /* the alternate subflow takes over as the active one */
 
-       MPTS_LOCK(mpts);
-       if (altpath_exists) {
+               mpte->mpte_active_sub = mpts_alt;
                mpts->mpts_flags |= MPTSF_FAILINGOVER;
                mpts->mpts_flags &= ~MPTSF_ACTIVE;
+
+               mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
+                         __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
+                        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+
+               mptcpstats_inc_switch(mpte, mpts);      /* NOTE(review): presumably bumps a per-session switch counter - confirm */
+
+               sowwakeup(alt_so);      /* called on the alternate subflow socket after the switch */
        } else {
-               mptcplog((LOG_DEBUG, "MPTCP Events %s: no alt cid = %d\n",
-                   __func__, mpts->mpts_connid),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+               mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
+                         mpts->mpts_connid),
+                        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 done:
-               so = mpts->mpts_socket;
-               socket_lock(so, 1);
-               so->so_flags &= ~SOF_MP_TRYFAILOVER;
-               socket_unlock(so, 1);
+               mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;     /* no switch happened: clear the failover request on this subflow */
        }
-       MPTS_LOCK_ASSERT_HELD(mpts);
+
        return (MPTS_EVRET_OK);
 }
 
@@ -2595,102 +3413,106 @@ done:
  */
 static ev_ret_t
 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+       uint64_t *p_mpsofilt_hint, uint64_t event)      /* both are passed through to mptcp_subflow_propagate_ev() */
 {
-       struct socket *mp_so, *so;
-       struct mptcb *mp_tp;
-       boolean_t linger;
-
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       mp_tp = mpte->mpte_mptcb;
-       so = mpts->mpts_socket;
-
-       linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
-           !(mp_so->so_flags & SOF_PCBCLEARING));
-
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d [linger %s]\n", __func__,
-           mpts->mpts_connid, (linger ? "YES" : "NO")),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
-       if (mpts->mpts_soerror == 0)
-               mpts->mpts_soerror = EHOSTUNREACH;
+       mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
+           mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
        /*
-        * The subflow connection cannot use the outgoing interface.
-        *
-        * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
-        * client if the MPTCP connection has not been established.  If it
-        * has been established, let the upper layer call disconnectx.
+        * The subflow connection is not allowed to use the outgoing
+        * interface, so abort this subflow and have the caller delete it.
         */
-       mptcp_subflow_disconnect(mpte, mpts, !linger);
-       *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED;
+       mptcp_subflow_abort(mpts, EPERM);       /* NOTE(review): presumably tears the subflow down with EPERM as its error - confirm */
 
-       MPT_LOCK(mp_tp);
-       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
-               mp_so->so_error = EHOSTUNREACH;
-       }
-       MPT_UNLOCK(mp_tp);
+       mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event); /* unlike the nosrcaddr handler, called unconditionally */
 
-       /*
-        * Keep the subflow socket around, unless the MPTCP socket has
-        * been detached or the subflow has been disconnected explicitly,
-        * in which case it should be deleted right away.
-        */
-       return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
+       return (MPTS_EVRET_DELETE);     /* unconditional: this handler never returns MPTS_EVRET_OK */
 }
 
 /*
- * Handle SO_FILT_HINT_SUSPEND subflow socket event.
+ * IPv4-embedded IPv6 address layout: https://tools.ietf.org/html/rfc6052#section-2
+ * DNS64 synthesis of such addresses: https://tools.ietf.org/html/rfc6147#section-5.2
  */
-static ev_ret_t
-mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+static boolean_t       /* true when addr starts with prefix and its IPv4 bytes were copied out */
+mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
+                            const struct ipv6_prefix *prefix,
+                            struct in_addr *addrv4)    /* out: written only on the true path */
 {
-#pragma unused(p_mpsofilt_hint)
-       struct socket *so;
+       char buf[MAX_IPv4_STR_LEN];     /* scratch for the log line below */
+       char *ptrv4 = (char *)addrv4;   /* byte view of the output address */
+       const char *ptr = (const char *)addr;   /* byte view of the IPv6 address */
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
-
-       so = mpts->mpts_socket;
+       if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0)        /* assumes prefix_len counts bytes (offset 12 for LEN_96 suggests so) - TODO confirm */
+               return false;   /* no match: *addrv4 is left untouched */
 
-       /* the subflow connection is being flow controlled */
-       mpts->mpts_flags |= MPTSF_SUSPENDED;
+       switch (prefix->prefix_len) {   /* byte 8 is never copied; RFC 6052 reserves that octet */
+               case NAT64_PREFIX_LEN_96:
+                       memcpy(ptrv4, ptr + 12, 4);     /* IPv4 = bytes 12..15 */
+                       break;
+               case NAT64_PREFIX_LEN_64:
+                       memcpy(ptrv4, ptr + 9, 4);      /* IPv4 = bytes 9..12 */
+                       break;
+               case NAT64_PREFIX_LEN_56:
+                       memcpy(ptrv4, ptr + 7, 1);      /* IPv4 = byte 7 ... */
+                       memcpy(ptrv4 + 1, ptr + 9, 3);  /* ... then bytes 9..11 */
+                       break;
+               case NAT64_PREFIX_LEN_48:
+                       memcpy(ptrv4, ptr + 6, 2);      /* IPv4 = bytes 6..7 ... */
+                       memcpy(ptrv4 + 2, ptr + 9, 2);  /* ... then bytes 9..10 */
+                       break;
+               case NAT64_PREFIX_LEN_40:
+                       memcpy(ptrv4, ptr + 5, 3);      /* IPv4 = bytes 5..7 ... */
+                       memcpy(ptrv4 + 3, ptr + 9, 1);  /* ... then byte 9 */
+                       break;
+               case NAT64_PREFIX_LEN_32:
+                       memcpy(ptrv4, ptr + 4, 4);      /* IPv4 = bytes 4..7 */
+                       break;
+               default:
+                       panic("NAT64-prefix len is wrong: %u\n",        /* any other length is treated as fatal */
+                             prefix->prefix_len);
+       }
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d\n", __func__,
-           mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
+                   inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
 
-       return (MPTS_EVRET_OK); /* keep the subflow socket around */
+       return true;
 }
 
-/*
- * Handle SO_FILT_HINT_RESUME subflow socket event.
- */
-static ev_ret_t
-mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+static void    /* fills mpte->mpte_dst_v4_nat64 when the IPv6 destination matches a NAT64 prefix of the subflow interface */
+mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
 {
-#pragma unused(p_mpsofilt_hint)
-       struct socket *so;
+       struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES] = {};  /* zeroed: a lookup failure other than ENOENT must not expose stack garbage to the loop */
+       struct socket *so = mpts->mpts_socket;
+       struct ifnet *ifp;
+       int j;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       ifp = sotoinpcb(so)->inp_last_outifp;   /* last outbound interface recorded in the subflow inpcb */
 
-       so = mpts->mpts_socket;
+       if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {      /* only ENOENT is handled explicitly */
+               mptcp_ask_for_nat64(ifp);       /* NOTE(review): presumably requests NAT64 prefix discovery on ifp - confirm */
+               return;
+       }
 
-       /* the subflow connection is no longer flow controlled */
-       mpts->mpts_flags &= ~MPTSF_SUSPENDED;
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d\n", __func__, mpts->mpts_connid),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
+               int success;
 
-       return (MPTS_EVRET_OK); /* keep the subflow socket around */
+               if (nat64prefixes[j].prefix_len == 0)   /* empty slot (all slots are empty if the lookup wrote nothing) */
+                       continue;
+
+               success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
+                                                      &nat64prefixes[j],
+                                                      &mpte->mpte_dst_v4_nat64.sin_addr);
+               if (success) {
+                       mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
+                       mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
+                       mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;       /* same port as the IPv6 destination */
+                       break;  /* first matching prefix wins */
+               }
+       }
 }
 
 /*
@@ -2698,46 +3520,39 @@ mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts,
  */
 static ev_ret_t
 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+       uint64_t *p_mpsofilt_hint, uint64_t event)
 {
-       char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
-       struct sockaddr_entry *src_se, *dst_se;
-       struct sockaddr_storage src;
+#pragma unused(event, p_mpsofilt_hint)
        struct socket *mp_so, *so;
+       struct inpcb *inp;
+       struct tcpcb *tp;
        struct mptcb *mp_tp;
-       struct ifnet *outifp;
-       int af, error = 0;
+       int af;
        boolean_t mpok = FALSE;
-       boolean_t cell = FALSE;
-       boolean_t wifi = FALSE;
-       boolean_t wired = FALSE;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       mp_tp = mpte->mpte_mptcb;
 
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mp_so = mptetoso(mpte);
+       mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
-       af = mpts->mpts_family;
+       tp = sototcpcb(so);
+       af = mpts->mpts_dst.sa_family;
 
        if (mpts->mpts_flags & MPTSF_CONNECTED)
                return (MPTS_EVRET_OK);
 
        if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
            (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
-               socket_lock(so, 0);
                if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
                    (so->so_state & SS_ISCONNECTED)) {
-                   mptcplog((LOG_DEBUG, "MPTCP Events: "
-                       "%s: cid %d disconnect before tcp connect\n",
+                   mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
                        __func__, mpts->mpts_connid),
                        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
                        (void) soshutdownlock(so, SHUT_RD);
                        (void) soshutdownlock(so, SHUT_WR);
                        (void) sodisconnectlocked(so);
                }
-               socket_unlock(so, 0);
                return (MPTS_EVRET_OK);
        }
 
@@ -2747,19 +3562,15 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
         *
         *   a. If MPTCP connection is not yet established, then this must be
         *      the first subflow connection.  If MPTCP failed to negotiate,
-        *      indicate to the MPTCP socket client via EPROTO, that the
-        *      underlying TCP connection may be peeled off via peeloff(2).
-        *      Otherwise, mark the MPTCP socket as connected.
+        *      fallback to regular TCP by degrading this subflow.
         *
         *   b. If MPTCP connection has been established, then this must be
         *      one of the subsequent subflow connections. If MPTCP failed
-        *      to negotiate, disconnect the connection since peeloff(2)
-        *      is no longer possible.
+        *      to negotiate, disconnect the connection.
         *
         * Right now, we simply unblock any waiters at the MPTCP socket layer
         * if the MPTCP connection has not been established.
         */
-       socket_lock(so, 0);
 
        if (so->so_state & SS_ISDISCONNECTED) {
                /*
@@ -2769,208 +3580,83 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
                 * where the subflow could get disconnected before the
                 * connected event is processed.
                 */
-               socket_unlock(so, 0);
                return (MPTS_EVRET_OK);
        }
 
-       mpts->mpts_soerror = 0;
-       mpts->mpts_flags &= ~MPTSF_CONNECTING;
-       mpts->mpts_flags |= MPTSF_CONNECTED;
+       if (mpts->mpts_flags & MPTSF_TFO_REQD)
+               mptcp_drop_tfo_data(mpte, mpts);
 
-       if (!(so->so_flags1 & SOF1_DATA_IDEMPOTENT))
-               mpts->mpts_flags &= ~MPTSF_TFO_REQD;
+       mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
+       mpts->mpts_flags |= MPTSF_CONNECTED;
 
-       struct tcpcb *tp = sototcpcb(so);
        if (tp->t_mpflags & TMPF_MPTCP_TRUE)
                mpts->mpts_flags |= MPTSF_MP_CAPABLE;
 
        tp->t_mpflags &= ~TMPF_TFO_REQUEST;
 
-       VERIFY(mpts->mpts_dst_sl != NULL);
-       dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
-       VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
-           dst_se->se_addr->sa_family == af);
-
-       VERIFY(mpts->mpts_src_sl != NULL);
-       src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
-       VERIFY(src_se != NULL && src_se->se_addr != NULL &&
-           src_se->se_addr->sa_family == af);
-
-       /* get/check source IP address */
-       switch (af) {
-       case AF_INET: {
-               error = in_getsockaddr_s(so, &src);
-               if (error == 0) {
-                       struct sockaddr_in *ms = SIN(src_se->se_addr);
-                       struct sockaddr_in *s = SIN(&src);
-
-                       VERIFY(s->sin_len == ms->sin_len);
-                       VERIFY(ms->sin_family == AF_INET);
-
-                       if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
-                           bcmp(&ms->sin_addr, &s->sin_addr,
-                           sizeof (ms->sin_addr)) != 0) {
-                               mptcplog((LOG_ERR, "MPTCP Events: "
-                                   "%s: cid %d local "
-                                   "address %s (expected %s)\n", __func__,
-                                   mpts->mpts_connid, inet_ntop(AF_INET,
-                                   (void *)&s->sin_addr.s_addr, buf0,
-                                   sizeof (buf0)), inet_ntop(AF_INET,
-                                   (void *)&ms->sin_addr.s_addr, buf1,
-                                   sizeof (buf1))),
-                                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
-                       }
-                       bcopy(s, ms, sizeof (*s));
-               }
-               break;
-       }
-#if INET6
-       case AF_INET6: {
-               error = in6_getsockaddr_s(so, &src);
-               if (error == 0) {
-                       struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
-                       struct sockaddr_in6 *s = SIN6(&src);
-
-                       VERIFY(s->sin6_len == ms->sin6_len);
-                       VERIFY(ms->sin6_family == AF_INET6);
-
-                       if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
-                           bcmp(&ms->sin6_addr, &s->sin6_addr,
-                           sizeof (ms->sin6_addr)) != 0) {
-                               mptcplog((LOG_ERR, "MPTCP Events: "
-                                   "%s: cid %d local "
-                                   "address %s (expected %s)\n", __func__,
-                                   mpts->mpts_connid, inet_ntop(AF_INET6,
-                                   (void *)&s->sin6_addr, buf0,
-                                   sizeof (buf0)), inet_ntop(AF_INET6,
-                                   (void *)&ms->sin6_addr, buf1,
-                                   sizeof (buf1))),
-                                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
-                       }
-                       bcopy(s, ms, sizeof (*s));
-               }
-               break;
-       }
-#endif /* INET6 */
-       default:
-               VERIFY(0);
-               /* NOTREACHED */
-       }
-
-       if (error != 0) {
-               mptcplog((LOG_ERR, "MPTCP Events "
-                   "%s: cid %d getsockaddr failed (%d)\n",
-                   __func__, mpts->mpts_connid, error),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
-       }
-
        /* get/verify the outbound interface */
-       outifp = sotoinpcb(so)->inp_last_outifp;        /* could be NULL */
-       if (mpts->mpts_flags & MPTSF_BOUND_IF) {
-               VERIFY(mpts->mpts_outif != NULL);
-               if (mpts->mpts_outif != outifp) {
-                       mptcplog((LOG_ERR, "MPTCP Events: %s: cid %d outif %s "
-                           "(expected %s)\n", __func__, mpts->mpts_connid,
-                           ((outifp != NULL) ? outifp->if_xname : "NULL"),
-                           mpts->mpts_outif->if_xname),
-                           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
-
-                       if (outifp == NULL)
-                               outifp = mpts->mpts_outif;
-               }
-       } else {
-               mpts->mpts_outif = outifp;
-       }
-
-       mpts->mpts_srtt = (intotcpcb(sotoinpcb(so)))->t_srtt;
-       mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(so)))->t_rxtcur;
-       mpts->mpts_maxseg = (intotcpcb(sotoinpcb(so)))->t_maxseg;
-
-       cell = IFNET_IS_CELLULAR(mpts->mpts_outif);
-       wifi = (!cell && IFNET_IS_WIFI(mpts->mpts_outif));
-       wired = (!wifi && IFNET_IS_WIRED(mpts->mpts_outif));
-
-       if (cell)
-               mpts->mpts_linktype |= MPTSL_CELL;
-       else if (wifi)
-               mpts->mpts_linktype |= MPTSL_WIFI;
-       else if (wired)
-               mpts->mpts_linktype |= MPTSL_WIRED;
-
-       socket_unlock(so, 0);
-
-       mptcplog((LOG_DEBUG, "MPTCP Sender: %s: cid %d "
-           "establishment srtt %d \n", __func__,
-           mpts->mpts_connid, (mpts->mpts_srtt >> 5)),
-           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+       inp = sotoinpcb(so);
 
+       mpts->mpts_maxseg = tp->t_maxseg;
 
-       mptcplog((LOG_DEBUG, "MPTCP Socket: "
-           "%s: cid %d outif %s %s[%d] -> %s[%d] "
-           "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
-           outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
-           (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
-           (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
-           ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
-           ntohs(SIN6(src_se->se_addr)->sin6_port)),
-           inet_ntop(af, ((af == AF_INET) ?
-           (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
-           (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
-           ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
-           ntohs(SIN6(dst_se->se_addr)->sin6_port)),
-           ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
-           "MPTCP capable" : "a regular TCP")),
+       mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
+           ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
+           ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
            (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
 
        mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
-       MPTS_UNLOCK(mpts);
-
-       *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
 
-       MPT_LOCK(mp_tp);
        if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
+               mp_tp->mpt_state = MPTCPS_ESTABLISHED;
+               mpte->mpte_associd = mpts->mpts_connid;
+               DTRACE_MPTCP2(state__change,
+                   struct mptcb *, mp_tp,
+                   uint32_t, 0 /* event */);
+
+               if (SOCK_DOM(so) == AF_INET) {
+                       in_getsockaddr_s(so, &mpte->__mpte_src_v4);
+               } else {
+                       in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
+               }
+
+               mpts->mpts_flags |= MPTSF_ACTIVE;
+
                /* case (a) above */
                if (!mpok) {
-                       mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
-                       (void) mptcp_drop(mpte, mp_tp, EPROTO);
-                       MPT_UNLOCK(mp_tp);
+                       tcpstat.tcps_mpcap_fallback++;
+
+                       tp->t_mpflags |= TMPF_INFIN_SENT;
+                       mptcp_notify_mpfail(so);
                } else {
-                       MPT_UNLOCK(mp_tp);
-                       mptcplog((LOG_DEBUG, "MPTCP State: "
-                           "MPTCPS_ESTABLISHED for mp_so 0x%llx \n",
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
-                           MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
-                       mp_tp->mpt_state = MPTCPS_ESTABLISHED;
-                       mpte->mpte_associd = mpts->mpts_connid;
-                       DTRACE_MPTCP2(state__change,
-                           struct mptcb *, mp_tp,
-                           uint32_t, 0 /* event */);
-
-                       if (mpts->mpts_outif &&
-                           IFNET_IS_EXPENSIVE(mpts->mpts_outif)) {
-                               sototcpcb(so)->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
+                       if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
+                           mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
+                               tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
                        } else {
                                mpts->mpts_flags |= MPTSF_PREFERRED;
                        }
-                       soisconnected(mp_so);
-               }
-               MPTS_LOCK(mpts);
-               if (mpok) {
                        mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
                        mpte->mpte_nummpcapflows++;
-                       MPT_LOCK_SPIN(mp_tp);
-                       /* With TFO, sndnxt may be initialized earlier */
-                       if (mpts->mpts_sndnxt == 0)
-                               mpts->mpts_sndnxt = mp_tp->mpt_snduna;
-                       MPT_UNLOCK(mp_tp);
+
+                       if (SOCK_DOM(so) == AF_INET6)
+                               mptcp_handle_ipv6_connection(mpte, mpts);
+
+                       mptcp_check_subflows_and_add(mpte);
+
+                       if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
+                               mpte->mpte_initial_cell = 1;
+
+                       mpte->mpte_handshake_success = 1;
                }
+
+               mp_tp->mpt_sndwnd = tp->snd_wnd;
+               mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
+               mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
+               soisconnected(mp_so);
+
+               mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
+                   __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
+                   MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
        } else if (mpok) {
-               MPT_UNLOCK(mp_tp);
-               if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
-                       /* Experimental code, disabled by default. */
-                       sorwakeup(mp_so);
-                       sowwakeup(mp_so);
-               }
                /*
                 * case (b) above
                 * In case of additional flows, the MPTCP socket is not
@@ -2978,24 +3664,62 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
                 * for 3-way handshake.  TCP would have guaranteed that this
                 * is an MPTCP subflow.
                 */
-               MPTS_LOCK(mpts);
+               if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
+                   !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
+                   mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
+                       tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
+                       mpts->mpts_flags &= ~MPTSF_PREFERRED;
+               } else {
+                       mpts->mpts_flags |= MPTSF_PREFERRED;
+               }
+
                mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
-               mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
                mpte->mpte_nummpcapflows++;
-               MPT_LOCK_SPIN(mp_tp);
-               /* With Fastjoin, sndnxt is updated before connected_ev */
-               if (mpts->mpts_sndnxt == 0) {
-                       mpts->mpts_sndnxt = mp_tp->mpt_snduna;
-                       mpts->mpts_rel_seq = 1;
-               }
-               MPT_UNLOCK(mp_tp);
-               mptcp_output_needed(mpte, mpts);
+
+               mpts->mpts_rel_seq = 1;
+
+               mptcp_check_subflows_and_remove(mpte);
        } else {
-               MPT_UNLOCK(mp_tp);
-               MPTS_LOCK(mpts);
+               unsigned int i;
+
+               /* Should we try the alternate port? */
+               if (mpte->mpte_alternate_port &&
+                   inp->inp_fport != mpte->mpte_alternate_port) {
+                       union sockaddr_in_4_6 dst;
+                       struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
+
+                       memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
+
+                       dst_in->sin_port = mpte->mpte_alternate_port;
+
+                       mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
+                                         mpts->mpts_ifscope , NULL);
+               } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
+                       for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+                               struct mpt_itf_info *info =  &mpte->mpte_itfinfo[i];
+
+                               if (inp->inp_last_outifp->if_index == info->ifindex) {
+                                       info->no_mptcp_support = 1;
+                                       break;
+                               }
+                       }
+               }
+
+               tcpstat.tcps_join_fallback++;
+               if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
+                       tcpstat.tcps_mptcp_cell_proxy++;
+               else
+                       tcpstat.tcps_mptcp_wifi_proxy++;
+
+               soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
+
+               return (MPTS_EVRET_OK);
        }
 
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       /* This call is made only to "book" an entry in the stats-table for this ifindex */
+       mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
+
+       mptcp_output(mpte);
 
        return (MPTS_EVRET_OK); /* keep the subflow socket around */
 }
@@ -3005,77 +3729,56 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
  */
 static ev_ret_t
 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+       uint64_t *p_mpsofilt_hint, uint64_t event)
 {
+#pragma unused(event, p_mpsofilt_hint)
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
-       boolean_t linger;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
 
-       linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
-           !(mp_so->so_flags & SOF_PCBCLEARING));
-
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: cid %d [linger %s]\n", __func__,
-           mpts->mpts_connid, (linger ? "YES" : "NO")),
+       mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
+           __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
+           !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
+           !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
        if (mpts->mpts_flags & MPTSF_DISCONNECTED)
-               return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
+               return (MPTS_EVRET_DELETE);
 
-       /*
-        * Clear flags that are used by getconninfo to return state.
-        * Retain like MPTSF_DELETEOK for internal purposes.
-        */
-       mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
-           MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
-           MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
-           MPTSF_SUSPENDED|MPTSF_ACTIVE);
        mpts->mpts_flags |= MPTSF_DISCONNECTED;
 
-       /*
-        * The subflow connection has been disconnected.
-        *
-        * Right now, we simply unblock any waiters at the MPTCP socket layer
-        * if the MPTCP connection has not been established.
-        */
-       *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
+       /* The subflow connection has been disconnected. */
 
        if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
                mpte->mpte_nummpcapflows--;
                if (mpte->mpte_active_sub == mpts) {
                        mpte->mpte_active_sub = NULL;
-                       mptcplog((LOG_DEBUG, "MPTCP Events: "
-                           "%s: resetting active subflow \n",
+                       mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
                            __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
                }
                mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
        }
 
-       MPT_LOCK(mp_tp);
-       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
-               MPT_UNLOCK(mp_tp);
-               MPTS_UNLOCK(mpts);
-               soisdisconnected(mp_so);
-               MPTS_LOCK(mpts);
-       } else {
-               MPT_UNLOCK(mp_tp);
+       if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
+           ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
+           (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
+               mptcp_drop(mpte, mp_tp, so->so_error);
        }
 
        /*
-        * The underlying subflow socket has been disconnected;
-        * it is no longer useful to us.  Keep the subflow socket
-        * around, unless the MPTCP socket has been detached or
-        * the subflow has been disconnected explicitly, in which
-        * case it should be deleted right away.
+        * Clear flags that are used by getconninfo to return state.
+        * Retain flags like MPTSF_DELETEOK for internal purposes.
         */
-       return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
+       mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
+           MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
+           MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);
+
+       return (MPTS_EVRET_DELETE);
 }
 
 /*
@@ -3083,23 +3786,19 @@ mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
  */
 static ev_ret_t
 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
-               uint64_t *p_mpsofilt_hint)
+               uint64_t *p_mpsofilt_hint, uint64_t event)
 {
+#pragma unused(event, p_mpsofilt_hint)
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
        ev_ret_t ret = MPTS_EVRET_OK;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
-
-       MPTS_LOCK_ASSERT_HELD(mpts);
        so = mpts->mpts_socket;
 
-       socket_lock(so, 0);
-       MPT_LOCK(mp_tp);
-
        if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
                mpts->mpts_flags |= MPTSF_MP_CAPABLE;
        else
@@ -3109,9 +3808,9 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
                if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
                        goto done;
                mpts->mpts_flags |= MPTSF_MP_DEGRADED;
-       }
-       else
+       } else {
                mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
+       }
 
        if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
                mpts->mpts_flags |= MPTSF_MP_READY;
@@ -3126,27 +3825,21 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
        if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
                VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
                ret = MPTS_EVRET_DISCONNECT_FALLBACK;
-               *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
-                       SO_FILT_HINT_CONNINFO_UPDATED;
+
+               m_freem_list(mpte->mpte_reinjectq);
+               mpte->mpte_reinjectq = NULL;
        } else if (mpts->mpts_flags & MPTSF_MP_READY) {
                mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
                ret = MPTS_EVRET_CONNECT_PENDING;
-       } else {
-               *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
-                       SO_FILT_HINT_CONNINFO_UPDATED;
        }
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s: mp_so 0x%llx mpt_flags=%b cid %d "
-           "mptsf=%b\n", __func__,
-           (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
-           mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
-           mpts->mpts_flags, MPTSF_BITS),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
+                 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
+                 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
+                 mpts->mpts_flags, MPTSF_BITS),
+                MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
 
 done:
-       MPT_UNLOCK(mp_tp);
-       socket_unlock(so, 0);
        return (ret);
 }
 
@@ -3155,28 +3848,20 @@ done:
  */
 static ev_ret_t
 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+                        uint64_t *p_mpsofilt_hint, uint64_t event)
 {
+#pragma unused(event)
        struct socket *mp_so, *so;
        struct mptcb *mp_tp;
-       boolean_t linger, is_fastclose;
+       boolean_t is_fastclose;
 
-
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mp_so = mptetoso(mpte);
        mp_tp = mpte->mpte_mptcb;
        so = mpts->mpts_socket;
 
-       linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
-           !(mp_so->so_flags & SOF_PCBCLEARING));
-
-       if (mpts->mpts_soerror == 0)
-               mpts->mpts_soerror = ECONNABORTED;
-
        /* We got an invalid option or a fast close */
-       socket_lock(so, 0);
        struct tcptemp *t_template;
        struct inpcb *inp = sotoinpcb(so);
        struct tcpcb *tp = NULL;
@@ -3207,12 +3892,7 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
                    so, mpts->mpts_connid),
                    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
        }
-       socket_unlock(so, 0);
-       mptcp_subflow_disconnect(mpte, mpts, !linger);
-
-       *p_mpsofilt_hint |=  (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
-
-       MPT_LOCK(mp_tp);
+       mptcp_subflow_abort(mpts, ECONNABORTED);
 
        if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
                *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
@@ -3230,65 +3910,64 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
 
        if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
                mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
-       MPT_UNLOCK(mp_tp);
 
-       /*
-        * Keep the subflow socket around unless the subflow has been
-        * disconnected explicitly.
-        */
-       return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
+       return (MPTS_EVRET_DELETE);
 }
 
 static ev_ret_t
-mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
+                               uint64_t *p_mpsofilt_hint, uint64_t event)
 {
-#pragma unused(p_mpsofilt_hint)
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPTS_LOCK_ASSERT_HELD(mpts);
-       VERIFY(mpte->mpte_mppcb != NULL);
+#pragma unused(event)
+       bool found_active = false;
+
+       mpts->mpts_flags |= MPTSF_READ_STALL;
 
-       if (mpte->mpte_nummpcapflows == 0) {
-               struct mptcb *mp_tp = mpte->mpte_mptcb;
-               mptcplog((LOG_DEBUG,"MPTCP Events: %s: %llx %llx \n",
-                   __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt),
-                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 
-               mpte->mpte_active_sub = mpts;
-               mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
-               MPT_LOCK(mp_tp);
-               /*
-                * If mptcp_subflow_output is called before fastjoin_ev
-                * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
-                * and further mpts->mpts_sndnxt is incremented by len copied.
-                */
-               if (mpts->mpts_sndnxt == 0) {
-                       mpts->mpts_sndnxt = mp_tp->mpt_snduna;
+               if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
+                   TCPS_HAVERCVDFIN2(tp->t_state))
+                       continue;
+
+               if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
+                       found_active = true;
+                       break;
                }
-               MPT_UNLOCK(mp_tp);
        }
 
+       if (!found_active)
+               *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
+
        return (MPTS_EVRET_OK);
 }
 
 static ev_ret_t
-mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts,
-       uint64_t *p_mpsofilt_hint)
+mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
+                               uint64_t *p_mpsofilt_hint, uint64_t event)
 {
-#pragma unused(p_mpsofilt_hint)
-       MPTE_LOCK_ASSERT_HELD(mpte);
-       MPTS_LOCK_ASSERT_HELD(mpts);
-       VERIFY(mpte->mpte_mppcb != NULL);
+#pragma unused(event)
+       bool found_active = false;
 
-       mptcplog((LOG_DEBUG, "MPTCP Events: "
-           "%s cid %d\n", __func__, mpts->mpts_connid),
-           MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+       mpts->mpts_flags |= MPTSF_WRITE_STALL;
 
-       mpts->mpts_flags |= MPTSF_DELETEOK;
-       if (mpts->mpts_flags & MPTSF_DISCONNECTED)
-               return (MPTS_EVRET_DELETE);
-       else
-               return (MPTS_EVRET_OK);
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+
+               if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
+                   tp->t_state > TCPS_CLOSE_WAIT)
+                       continue;
+
+               if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
+                       found_active = true;
+                       break;
+               }
+       }
+
+       if (!found_active)
+               *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
+
+       return (MPTS_EVRET_OK);
 }
 
 static const char *
@@ -3315,63 +3994,60 @@ mptcp_evret2str(ev_ret_t ret)
        return (c);
 }
 
-/*
- * Add a reference to a subflow structure; used by MPTS_ADDREF().
- */
-void
-mptcp_subflow_addref(struct mptsub *mpts, int locked)
-{
-       if (!locked)
-               MPTS_LOCK(mpts);
-       else
-               MPTS_LOCK_ASSERT_HELD(mpts);
-
-       if (++mpts->mpts_refcnt == 0) {
-               panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
-               /* NOTREACHED */
-       }
-       if (!locked)
-               MPTS_UNLOCK(mpts);
-}
-
-/*
- * Remove a reference held on a subflow structure; used by MPTS_REMREF();
- */
-void
-mptcp_subflow_remref(struct mptsub *mpts)
-{
-       MPTS_LOCK(mpts);
-       if (mpts->mpts_refcnt == 0) {
-               panic("%s: mpts %p negative refcnt\n", __func__, mpts);
-               /* NOTREACHED */
-       }
-       if (--mpts->mpts_refcnt > 0) {
-               MPTS_UNLOCK(mpts);
-               return;
-       }
-       /* callee will unlock and destroy lock */
-       mptcp_subflow_free(mpts);
-}
-
 /*
  * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
  * caller must ensure that the option can be issued on subflow sockets, via
  * MPOF_SUBFLOW_OK flag.
  */
 int
-mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
-    struct mptopt *mpo)
+mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
 {
-       struct socket *mp_so;
+       struct socket *mp_so, *so;
        struct sockopt sopt;
-       char buf[32];
        int error;
 
        VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
-       mpo->mpo_flags &= ~MPOF_INTERIM;
+       mpte_lock_assert_held(mpte);
+
+       mp_so = mptetoso(mpte);
+       so = mpts->mpts_socket;
+
+       if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
+           mpo->mpo_level == SOL_SOCKET &&
+           mpo->mpo_name == SO_MARK_CELLFALLBACK) {
+               struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
+
+               mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
+                         __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte),
+                         sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
+                         mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+
+               /*
+                * When we open a new subflow, mark it as cell fallback, if
+                * this subflow goes over cell.
+                *
+                * (except for first-party apps)
+                */
+
+               if (mpte->mpte_flags & MPTE_FIRSTPARTY)
+                       return (0);
+
+               if (sotoinpcb(so)->inp_last_outifp &&
+                   !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
+                       return (0);
+
+               /*
+                * This here is an OR, because if the app is not binding to the
+                * interface, then it definitely is not a cell-fallback
+                * connection.
+                */
+               if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
+                   !IFNET_IS_CELLULAR(ifp))
+                       return (0);
+       }
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mpo->mpo_flags &= ~MPOF_INTERIM;
 
        bzero(&sopt, sizeof (sopt));
        sopt.sopt_dir = SOPT_SET;
@@ -3381,23 +4057,21 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
        sopt.sopt_valsize = sizeof (int);
        sopt.sopt_p = kernproc;
 
-       error = sosetoptlock(so, &sopt, 0);     /* already locked */
+       error = sosetoptlock(so, &sopt, 0);
        if (error == 0) {
-               mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                   "%s: mp_so 0x%llx sopt %s "
+               mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
                    "val %d set successful\n", __func__,
                    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
-                   buf, sizeof (buf)), mpo->mpo_intval),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
+                   mpo->mpo_intval),
+                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
        } else {
-               mptcplog((LOG_ERR, "MPTCP Socket: "
-                   "%s: mp_so 0x%llx sopt %s "
+               mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
                    "val %d set error %d\n", __func__,
                    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
-                   buf, sizeof (buf)), mpo->mpo_intval, error),
-                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
+                   mpo->mpo_intval, error),
+                   MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
        }
        return (error);
 }
@@ -3413,12 +4087,11 @@ mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
 {
        struct socket *mp_so;
        struct sockopt sopt;
-       char buf[32];
        int error;
 
        VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
+       mp_so = mptetoso(mpte);
 
        bzero(&sopt, sizeof (sopt));
        sopt.sopt_dir = SOPT_GET;
@@ -3434,15 +4107,14 @@ mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
                    "%s: mp_so 0x%llx sopt %s "
                    "val %d get successful\n", __func__,
                    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
-                   buf, sizeof (buf)), mpo->mpo_intval),
+                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
+                   mpo->mpo_intval),
                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
        } else {
                mptcplog((LOG_ERR, "MPTCP Socket: "
                    "%s: mp_so 0x%llx sopt %s get error %d\n",
                    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                   mptcp_sopt2str(mpo->mpo_level,
-                   mpo->mpo_name, buf, sizeof (buf)), error),
+                   mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
        }
        return (error);
@@ -3462,7 +4134,7 @@ mptcp_gc(struct mppcbinfo *mppi)
        struct mppcb *mpp, *tmpp;
        uint32_t active = 0;
 
-       lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
+       LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
 
        TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
                struct socket *mp_so;
@@ -3484,9 +4156,9 @@ mptcp_gc(struct mppcbinfo *mppi)
                    mp_so->so_retaincnt, mpp->mpp_state),
                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
-               if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
+               if (!mpte_try_lock(mpte)) {
                        mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                           "%s: mp_so 0x%llx skipped "
+                           "%s: mp_so 0x%llx skipped lock "
                            "(u=%d,r=%d)\n", __func__,
                            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
                            mp_so->so_usecount, mp_so->so_retaincnt),
@@ -3496,12 +4168,12 @@ mptcp_gc(struct mppcbinfo *mppi)
                }
 
                /* check again under the lock */
-               if (mp_so->so_usecount > 1) {
+               if (mp_so->so_usecount > 0) {
                        boolean_t wakeup = FALSE;
                        struct mptsub *mpts, *tmpts;
 
                        mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                           "%s: mp_so 0x%llx skipped "
+                           "%s: mp_so 0x%llx skipped usecount "
                            "[u=%d,r=%d] %d %d\n", __func__,
                            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
                            mp_so->so_usecount, mp_so->so_retaincnt,
@@ -3509,70 +4181,37 @@ mptcp_gc(struct mppcbinfo *mppi)
                            mp_tp->mpt_state),
                            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
-                       MPT_LOCK(mp_tp);
                        if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
                                if (mp_tp->mpt_gc_ticks > 0)
                                        mp_tp->mpt_gc_ticks--;
                                if (mp_tp->mpt_gc_ticks == 0) {
                                        wakeup = TRUE;
-                                       if (mp_tp->mpt_localkey != NULL) {
-                                               mptcp_free_key(
-                                                   mp_tp->mpt_localkey);
-                                               mp_tp->mpt_localkey = NULL;
-                                       }
                                }
                        }
-                       MPT_UNLOCK(mp_tp);
                        if (wakeup) {
                                TAILQ_FOREACH_SAFE(mpts,
                                    &mpte->mpte_subflows, mpts_entry, tmpts) {
-                                       MPTS_LOCK(mpts);
-                                       mpts->mpts_flags |= MPTSF_DELETEOK;
-                                       if (mpts->mpts_soerror == 0)
-                                               mpts->mpts_soerror = ETIMEDOUT;
-                                       mptcp_subflow_eupcall(mpts->mpts_socket,
+                                       mptcp_subflow_eupcall1(mpts->mpts_socket,
                                            mpts, SO_FILT_HINT_DISCONNECTED);
-                                       MPTS_UNLOCK(mpts);
                                }
                        }
-                       lck_mtx_unlock(&mpp->mpp_lock);
+                       mpte_unlock(mpte);
                        active++;
                        continue;
                }
 
                if (mpp->mpp_state != MPPCB_STATE_DEAD) {
-                       mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                           "%s: mp_so 0x%llx skipped "
-                           "[u=%d,r=%d,s=%d]\n", __func__,
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                           mp_so->so_usecount, mp_so->so_retaincnt,
-                           mpp->mpp_state),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-                       lck_mtx_unlock(&mpp->mpp_lock);
-                       active++;
-                       continue;
+                       panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
+                             "[u=%d,r=%d,s=%d]\n", __func__,
+                             (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
+                             mp_so->so_usecount, mp_so->so_retaincnt,
+                             mpp->mpp_state);
                }
 
-               /*
-                * The PCB has been detached, and there is exactly 1 refnct
-                * held by the MPTCP thread.  Signal that thread to terminate,
-                * after which the last refcnt will be released.  That will
-                * allow it to be destroyed below during the next round.
-                */
-               if (mp_so->so_usecount == 1) {
-                       mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                           "%s: mp_so 0x%llx scheduled for "
-                           "termination [u=%d,r=%d]\n", __func__,
-                           (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
-                           mp_so->so_usecount, mp_so->so_retaincnt),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
+                       mptcp_close(mpte, mp_tp);
 
-                       /* signal MPTCP thread to terminate */
-                       mptcp_thread_terminate_signal(mpte);
-                       lck_mtx_unlock(&mpp->mpp_lock);
-                       active++;
-                       continue;
-               }
+               mptcp_session_destroy(mpte);
 
                mptcplog((LOG_DEBUG, "MPTCP Socket: "
                    "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
@@ -3600,12 +4239,10 @@ mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
 {
        struct socket *mp_so;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPT_LOCK_ASSERT_HELD(mp_tp);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mptcb == mp_tp);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mp_so = mptetoso(mpte);
 
-       mp_tp->mpt_state = MPTCPS_TERMINATE;
        DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
            uint32_t, 0 /* event */);
 
@@ -3625,33 +4262,20 @@ mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
        struct socket *mp_so = NULL;
        struct mptsub *mpts = NULL, *tmpts = NULL;
 
-       MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
-       MPT_LOCK_ASSERT_HELD(mp_tp);
+       mpte_lock_assert_held(mpte);    /* same as MP socket lock */
        VERIFY(mpte->mpte_mptcb == mp_tp);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       if (mp_tp->mpt_localkey != NULL) {
-               mptcp_free_key(mp_tp->mpt_localkey);
-               mp_tp->mpt_localkey = NULL;
-       }
+       mp_so = mptetoso(mpte);
 
-       MPT_UNLOCK(mp_tp);
-       soisdisconnected(mp_so);
+       mp_tp->mpt_state = MPTCPS_TERMINATE;
 
-       MPT_LOCK(mp_tp);
-       if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
-               return (NULL);
-       }
-       MPT_UNLOCK(mp_tp);
+       mptcp_freeq(mp_tp);
+
+       soisdisconnected(mp_so);
 
        /* Clean up all subflows */
        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
-               MPTS_LOCK(mpts);
-               mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
-               mptcp_subflow_disconnect(mpte, mpts, TRUE);
-               MPTS_UNLOCK(mpts);
-               mptcp_subflow_del(mpte, mpts, TRUE);
+               mptcp_subflow_disconnect(mpte, mpts);
        }
-       MPT_LOCK(mp_tp);
 
        return (NULL);
 }
@@ -3663,84 +4287,34 @@ mptcp_notify_close(struct socket *so)
 }
 
 /*
- * Signal MPTCP thread to wake up.
+ * MPTCP workloop.
  */
 void
-mptcp_thread_signal(struct mptses *mpte)
-{
-       lck_mtx_lock(&mpte->mpte_thread_lock);
-       mptcp_thread_signal_locked(mpte);
-       lck_mtx_unlock(&mpte->mpte_thread_lock);
-}
-
-/*
- * Signal MPTCP thread to wake up (locked version)
- */
-static void
-mptcp_thread_signal_locked(struct mptses *mpte)
-{
-       lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
-
-       mpte->mpte_thread_reqs++;
-       if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
-               wakeup_one((caddr_t)&mpte->mpte_thread);
-}
-
-/*
- * Signal MPTCP thread to terminate.
- */
-static void
-mptcp_thread_terminate_signal(struct mptses *mpte)
-{
-       lck_mtx_lock(&mpte->mpte_thread_lock);
-       if (mpte->mpte_thread != THREAD_NULL) {
-               mpte->mpte_thread = THREAD_NULL;
-               mpte->mpte_thread_reqs++;
-               if (!mpte->mpte_thread_active)
-                       wakeup_one((caddr_t)&mpte->mpte_thread);
-       }
-       lck_mtx_unlock(&mpte->mpte_thread_lock);
-}
-
-/*
- * MPTCP thread workloop.
- */
-static void
-mptcp_thread_dowork(struct mptses *mpte)
+mptcp_subflow_workloop(struct mptses *mpte)
 {
        struct socket *mp_so;
        struct mptsub *mpts, *tmpts;
        boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
-       uint64_t mpsofilt_hint_mask = 0;
+       uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
 
-       MPTE_LOCK(mpte);                /* same as MP socket lock */
+       mpte_lock_assert_held(mpte);
        VERIFY(mpte->mpte_mppcb != NULL);
-       mp_so = mpte->mpte_mppcb->mpp_socket;
+       mp_so = mptetoso(mpte);
        VERIFY(mp_so != NULL);
 
        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
                ev_ret_t ret;
 
-               MPTS_LOCK(mpts);
-               MPTS_ADDREF_LOCKED(mpts);       /* for us */
-
-               /* Update process ownership based on parent mptcp socket */
-               mptcp_update_last_owner(mpts, mp_so);
-
-               mptcp_subflow_input(mpte, mpts);
+               if (mpts->mpts_socket->so_usecount == 0) {
+                       /* Will be removed soon by tcp_garbage_collect */
+                       continue;
+               }
 
-               mptcp_get_rtt_measurement(mpts, mpte);
+               mptcp_subflow_addref(mpts);
+               mpts->mpts_socket->so_usecount++;
 
                ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
 
-               if (mpts->mpts_flags & MPTSF_ACTIVE) {
-                       mptcplog((LOG_DEBUG, "MPTCP Socket: "
-                           "%s: cid %d \n", __func__,
-                           mpts->mpts_connid),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-                       (void) mptcp_subflow_output(mpte, mpts);
-               }
-
                /*
                 * If MPTCP socket is closed, disconnect all subflows.
                 * This will generate a disconnect event which will
@@ -3748,15 +4322,14 @@ mptcp_thread_dowork(struct mptses *mpte)
                 * non-zero error to be returned above.
                 */
                if (mp_so->so_flags & SOF_PCBCLEARING)
-                       mptcp_subflow_disconnect(mpte, mpts, FALSE);
-               MPTS_UNLOCK(mpts);
+                       mptcp_subflow_disconnect(mpte, mpts);
 
                switch (ret) {
                case MPTS_EVRET_OK:
                        /* nothing to do */
                        break;
                case MPTS_EVRET_DELETE:
-                       mptcp_subflow_del(mpte, mpts, TRUE);
+                       mptcp_subflow_soclose(mpts);
                        break;
                case MPTS_EVRET_CONNECT_PENDING:
                        connect_pending = TRUE;
@@ -3772,53 +4345,35 @@ mptcp_thread_dowork(struct mptses *mpte)
                            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
                        break;
                }
-               MPTS_REMREF(mpts);              /* ours */
-       }
-
-       if (mpsofilt_hint_mask) {
-               if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
-                       socantrcvmore(mp_so);
-                       mpsofilt_hint_mask &= ~SO_FILT_HINT_CANTRCVMORE;
-               }
+               mptcp_subflow_remref(mpts);             /* ours */
 
-               if (mpsofilt_hint_mask & SO_FILT_HINT_CONNRESET) {
-                       struct mptcb *mp_tp = mpte->mpte_mptcb;
+               VERIFY(mpts->mpts_socket->so_usecount != 0);
+               mpts->mpts_socket->so_usecount--;
+       }
 
-                       MPT_LOCK(mp_tp);
-                       mptcp_drop(mpte, mp_tp, ECONNRESET);
-                       MPT_UNLOCK(mp_tp);
-               }
+       if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
+               VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
 
                soevent(mp_so, mpsofilt_hint_mask);
        }
 
-       if (!connect_pending && !disconnect_fallback) {
-               MPTE_UNLOCK(mpte);
+       if (!connect_pending && !disconnect_fallback)
                return;
-       }
 
        TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
-               MPTS_LOCK(mpts);
                if (disconnect_fallback) {
                        struct socket *so = NULL;
                        struct inpcb *inp = NULL;
                        struct tcpcb *tp = NULL;
 
-                       if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
-                               MPTS_UNLOCK(mpts);
+                       if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
                                continue;
-                       }
 
                        mpts->mpts_flags |= MPTSF_MP_DEGRADED;
 
                        if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
-                           MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING)) {
-                               MPTS_UNLOCK(mpts);
+                           MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
                                continue;
-                       }
-
-                       if (mpts->mpts_flags & MPTSF_TFO_REQD)
-                               mptcp_drop_tfo_data(mpte, mpts, NULL);
 
                        so = mpts->mpts_socket;
 
@@ -3831,7 +4386,6 @@ mptcp_thread_dowork(struct mptses *mpte)
                         * ACTIVE one.
                         */
 
-                       socket_lock(so, 1);
                        inp = sotoinpcb(so);
                        tp = intotcpcb(inp);
                        tp->t_mpflags &=
@@ -3839,26 +4393,11 @@ mptcp_thread_dowork(struct mptses *mpte)
                        tp->t_mpflags |= TMPF_TCP_FALLBACK;
 
                        if (mpts->mpts_flags & MPTSF_ACTIVE) {
-                               socket_unlock(so, 1);
-                               MPTS_UNLOCK(mpts);
                                continue;
                        }
                        tp->t_mpflags |= TMPF_RESET;
-                       soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
-                       socket_unlock(so, 1);
-
+                       soevent(so, SO_FILT_HINT_MUSTRST);
                } else if (connect_pending) {
-                       /*
-                        * If delayed subflow start is set and cellular,
-                        * delay the connect till a retransmission timeout
-                        */
-
-                       if ((mptcp_delayed_subf_start) &&
-                           (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
-                               MPTS_UNLOCK(mpts);
-                               continue;
-                       }
-
                        /*
                         * The MPTCP connection has progressed to a state
                         * where it supports full multipath semantics; allow
@@ -3866,102 +4405,22 @@ mptcp_thread_dowork(struct mptses *mpte)
                         * that are in the PENDING state.
                         */
                        if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
-                               (void) mptcp_subflow_soconnectx(mpte, mpts);
-                       }
-               }
-               MPTS_UNLOCK(mpts);
-       }
-
-       MPTE_UNLOCK(mpte);
-}
-
-/*
- * MPTCP thread.
- */
-static void
-mptcp_thread_func(void *v, wait_result_t w)
-{
-#pragma unused(w)
-       struct mptses *mpte = v;
-       struct timespec *ts = NULL;
+                               int error = mptcp_subflow_soconnectx(mpte, mpts);
 
-       VERIFY(mpte != NULL);
-
-       lck_mtx_lock_spin(&mpte->mpte_thread_lock);
-
-       for (;;) {
-               lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
-
-               if (mpte->mpte_thread != THREAD_NULL) {
-                       (void) msleep(&mpte->mpte_thread,
-                           &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
-                           __func__, ts);
-               }
-
-               /* MPTCP socket is closed? */
-               if (mpte->mpte_thread == THREAD_NULL) {
-                       lck_mtx_unlock(&mpte->mpte_thread_lock);
-                       /* callee will destroy thread lock */
-                       mptcp_thread_destroy(mpte);
-                       /* NOTREACHED */
-                       return;
-               }
-
-               mpte->mpte_thread_active = 1;
-               for (;;) {
-                       uint32_t reqs = mpte->mpte_thread_reqs;
-
-                       lck_mtx_unlock(&mpte->mpte_thread_lock);
-                       mptcp_thread_dowork(mpte);
-                       lck_mtx_lock_spin(&mpte->mpte_thread_lock);
-
-                       /* if there's no pending request, we're done */
-                       if (reqs == mpte->mpte_thread_reqs ||
-                           mpte->mpte_thread == THREAD_NULL)
-                               break;
+                               if (error)
+                                       mptcp_subflow_abort(mpts, error);
+                       }
                }
-               mpte->mpte_thread_reqs = 0;
-               mpte->mpte_thread_active = 0;
        }
 }
 
-/*
- * Destroy a MTCP thread, to be called in the MPTCP thread context
- * upon receiving an indication to self-terminate.  This routine
- * will not return, as the current thread is terminated at the end.
- */
-static void
-mptcp_thread_destroy(struct mptses *mpte)
-{
-       struct socket *mp_so;
-
-       MPTE_LOCK(mpte);                /* same as MP socket lock */
-       VERIFY(mpte->mpte_thread == THREAD_NULL);
-       VERIFY(mpte->mpte_mppcb != NULL);
-
-       mptcp_sesdestroy(mpte);
-
-       mp_so = mpte->mpte_mppcb->mpp_socket;
-       VERIFY(mp_so != NULL);
-       VERIFY(mp_so->so_usecount != 0);
-       mp_so->so_usecount--;           /* for thread */
-       mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
-       MPTE_UNLOCK(mpte);
-
-       /* for the extra refcnt from kernel_thread_start() */
-       thread_deallocate(current_thread());
-       /* this is the end */
-       thread_terminate(current_thread());
-       /* NOTREACHED */
-}
-
 /*
  * Protocol pr_lock callback.
  */
 int
 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
 {
-       struct mppcb *mpp = sotomppcb(mp_so);
+       struct mppcb *mpp = mpsotomppcb(mp_so);
        void *lr_saved;
 
        if (lr == NULL)
@@ -3974,7 +4433,7 @@ mptcp_lock(struct socket *mp_so, int refcount, void *lr)
                    mp_so, lr_saved, solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
-       lck_mtx_lock(&mpp->mpp_lock);
+       mpp_lock(mpp);
 
        if (mp_so->so_usecount < 0) {
                panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
@@ -3996,205 +4455,56 @@ mptcp_lock(struct socket *mp_so, int refcount, void *lr)
 int
 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
 {
-       struct mppcb *mpp = sotomppcb(mp_so);
-       void *lr_saved;
-
-       if (lr == NULL)
-               lr_saved = __builtin_return_address(0);
-       else
-               lr_saved = lr;
-
-       if (mpp == NULL) {
-               panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
-                   mp_so, mp_so->so_usecount, lr_saved,
-                   solockhistory_nr(mp_so));
-               /* NOTREACHED */
-       }
-       lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
-
-       if (refcount != 0)
-               mp_so->so_usecount--;
-
-       if (mp_so->so_usecount < 0) {
-               panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
-                   mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
-               /* NOTREACHED */
-       }
-       mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
-       mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
-       lck_mtx_unlock(&mpp->mpp_lock);
-
-       return (0);
-}
-
-/*
- * Protocol pr_getlock callback.
- */
-lck_mtx_t *
-mptcp_getlock(struct socket *mp_so, int locktype)
-{
-#pragma unused(locktype)
-       struct mppcb *mpp = sotomppcb(mp_so);
-
-       if (mpp == NULL) {
-               panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
-                   solockhistory_nr(mp_so));
-               /* NOTREACHED */
-       }
-       if (mp_so->so_usecount < 0) {
-               panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
-                   mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
-               /* NOTREACHED */
-       }
-       return (&mpp->mpp_lock);
-}
-
-/*
- * Key generation functions
- */
-static void
-mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
-{
-       struct mptcp_key_entry *key_elm;
-try_again:
-       read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
-       if (key_entry->mkey_value == 0)
-               goto try_again;
-       mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
-           sizeof (key_entry->mkey_digest));
-
-       LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
-               if (key_elm->mkey_value == key_entry->mkey_value) {
-                       goto try_again;
-               }
-               if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
-                   0) {
-                       goto try_again;
-               }
-       }
-}
-
-static mptcp_key_t *
-mptcp_reserve_key(void)
-{
-       struct mptcp_key_entry *key_elm;
-       struct mptcp_key_entry *found_elm = NULL;
-
-       lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
-       LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
-               if (key_elm->mkey_flags == MKEYF_FREE) {
-                       key_elm->mkey_flags = MKEYF_INUSE;
-                       found_elm = key_elm;
-                       break;
-               }
-       }
-       lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
-
-       if (found_elm) {
-               return (&found_elm->mkey_value);
-       }
-
-       key_elm = (struct mptcp_key_entry *)
-           zalloc(mptcp_keys_pool.mkph_key_entry_zone);
-       key_elm->mkey_flags = MKEYF_INUSE;
-
-       lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
-       mptcp_generate_unique_key(key_elm);
-       LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
-       mptcp_keys_pool.mkph_count += 1;
-       lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
-       return (&key_elm->mkey_value);
-}
-
-static caddr_t
-mptcp_get_stored_digest(mptcp_key_t *key)
-{
-       struct mptcp_key_entry *key_holder;
-       caddr_t digest = NULL;
-
-       lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
-       key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
-           offsetof(struct mptcp_key_entry, mkey_value));
-       if (key_holder->mkey_flags != MKEYF_INUSE)
-               panic_plain("%s", __func__);
-       digest = &key_holder->mkey_digest[0];
-       lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
-       return (digest);
-}
+       struct mppcb *mpp = mpsotomppcb(mp_so);
+       void *lr_saved;
 
-void
-mptcp_free_key(mptcp_key_t *key)
-{
-       struct mptcp_key_entry *key_holder;
-       struct mptcp_key_entry *key_elm;
-       int pt = RandomULong();
+       if (lr == NULL)
+               lr_saved = __builtin_return_address(0);
+       else
+               lr_saved = lr;
 
-       lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
-       key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
-           offsetof(struct mptcp_key_entry, mkey_value));
-       key_holder->mkey_flags = MKEYF_FREE;
+       if (mpp == NULL) {
+               panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
+                   mp_so, mp_so->so_usecount, lr_saved,
+                   solockhistory_nr(mp_so));
+               /* NOTREACHED */
+       }
+       mpp_lock_assert_held(mpp);
 
-       LIST_REMOVE(key_holder, mkey_next);
-       mptcp_keys_pool.mkph_count -= 1;
+       if (refcount != 0)
+               mp_so->so_usecount--;
 
-       /* Free half the time */
-       if (pt & 0x01) {
-               zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
-       } else {
-               /* Insert it at random point to avoid early reuse */
-               int i = 0;
-               if (mptcp_keys_pool.mkph_count > 1) {
-                       pt = pt % (mptcp_keys_pool.mkph_count - 1);
-                       LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
-                               if (++i >= pt) {
-                                       LIST_INSERT_AFTER(key_elm, key_holder,
-                                           mkey_next);
-                                       break;
-                               }
-                       }
-                       if (i < pt)
-                               panic("missed insertion");
-               } else {
-                       LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
-                           mkey_next);
-               }
-               mptcp_keys_pool.mkph_count += 1;
+       if (mp_so->so_usecount < 0) {
+               panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
+                   mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
+               /* NOTREACHED */
        }
-       lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
+       mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
+       mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
+       mpp_unlock(mpp);
+
+       return (0);
 }
 
-static void
-mptcp_key_pool_init(void)
+/*
+ * Protocol pr_getlock callback.
+ */
+lck_mtx_t *
+mptcp_getlock(struct socket *mp_so, int flags)
 {
-       int i;
-       struct mptcp_key_entry *key_entry;
-
-       LIST_INIT(&mptcp_keys_pool);
-       mptcp_keys_pool.mkph_count = 0;
-
-       mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
-           (sizeof (struct mptcp_key_entry));
-       mptcp_keys_pool.mkph_key_entry_zone = zinit(
-           mptcp_keys_pool.mkph_key_elm_sz,
-           MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
-           MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
-       if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
-               panic("%s: unable to allocate MPTCP keys zone \n", __func__);
+       struct mppcb *mpp = mpsotomppcb(mp_so);
+
+       if (mpp == NULL) {
+               panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
+                   solockhistory_nr(mp_so));
                /* NOTREACHED */
        }
-       zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
-       zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
-
-       for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
-               key_entry = (struct mptcp_key_entry *)
-                   zalloc(mptcp_keys_pool.mkph_key_entry_zone);
-               key_entry->mkey_flags = MKEYF_FREE;
-               mptcp_generate_unique_key(key_entry);
-               LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
-               mptcp_keys_pool.mkph_count += 1;
+       if (mp_so->so_usecount < 0) {
+               panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
+                   mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
+               /* NOTREACHED */
        }
-       lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
-           mtcbinfo.mppi_lock_attr);
+       return (mpp_getlock(mpp, flags));
 }
 
 /*
@@ -4207,10 +4517,8 @@ mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
 {
        struct tcpcb *tp = sototcpcb(so);
        struct mptcp_subf_auth_entry *sauth_entry;
-       MPT_LOCK_ASSERT_NOTHELD(mp_tp);
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
-       MPT_LOCK_SPIN(mp_tp);
-       tp->t_mptcb = mp_tp;
        /*
         * The address ID of the first flow is implicitly 0.
         */
@@ -4221,7 +4529,6 @@ mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
                tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
                so->so_flags |= SOF_MP_SEC_SUBFLOW;
        }
-       MPT_UNLOCK(mp_tp);
        sauth_entry = zalloc(mpt_subauth_zone);
        sauth_entry->msae_laddr_id = tp->t_local_aid;
        sauth_entry->msae_raddr_id = 0;
@@ -4230,9 +4537,7 @@ try_again:
        sauth_entry->msae_laddr_rand = RandomULong();
        if (sauth_entry->msae_laddr_rand == 0)
                goto try_again;
-       MPT_LOCK_SPIN(mp_tp);
        LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
-       MPT_UNLOCK(mp_tp);
 }
 
 static void
@@ -4242,14 +4547,10 @@ mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
        struct tcpcb *tp = NULL;
        int found = 0;
 
-       socket_lock(so, 0);
        tp = sototcpcb(so);
-       if (tp == NULL) {
-               socket_unlock(so, 0);
+       if (tp == NULL)
                return;
-       }
 
-       MPT_LOCK(mp_tp);
        LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
                if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
                        found = 1;
@@ -4259,13 +4560,9 @@ mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
        if (found) {
                LIST_REMOVE(sauth_entry, msae_next);
        }
-       MPT_UNLOCK(mp_tp);
 
        if (found)
                zfree(mpt_subauth_zone, sauth_entry);
-
-       tp->t_mptcb = NULL;
-       socket_unlock(so, 0);
 }
 
 void
@@ -4273,9 +4570,8 @@ mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
     u_int32_t *rrand)
 {
        struct mptcp_subf_auth_entry *sauth_entry;
-       MPT_LOCK_ASSERT_NOTHELD(mp_tp);
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
-       MPT_LOCK(mp_tp);
        LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
                if (sauth_entry->msae_laddr_id == addr_id) {
                        if (lrand)
@@ -4285,7 +4581,6 @@ mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
                        break;
                }
        }
-       MPT_UNLOCK(mp_tp);
 }
 
 void
@@ -4293,9 +4588,8 @@ mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
     mptcp_addr_id raddr_id, u_int32_t raddr_rand)
 {
        struct mptcp_subf_auth_entry *sauth_entry;
-       MPT_LOCK_ASSERT_NOTHELD(mp_tp);
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
-       MPT_LOCK(mp_tp);
        LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
                if (sauth_entry->msae_laddr_id == laddr_id) {
                        if ((sauth_entry->msae_raddr_id != 0) &&
@@ -4304,7 +4598,6 @@ mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
                                    " address ids %d %d \n", __func__, raddr_id,
                                    sauth_entry->msae_raddr_id),
                                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-                               MPT_UNLOCK(mp_tp);
                                return;
                        }
                        sauth_entry->msae_raddr_id = raddr_id;
@@ -4315,42 +4608,34 @@ mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
                                    __func__, raddr_rand,
                                    sauth_entry->msae_raddr_rand),
                                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-                               MPT_UNLOCK(mp_tp);
                                return;
                        }
                        sauth_entry->msae_raddr_rand = raddr_rand;
-                       MPT_UNLOCK(mp_tp);
                        return;
                }
        }
-       MPT_UNLOCK(mp_tp);
 }
 
 /*
  * SHA1 support for MPTCP
  */
-static int
-mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
+static void
+mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
 {
        SHA1_CTX sha1ctxt;
        const unsigned char *sha1_base;
        int sha1_size;
 
-       if (digest_len != SHA1_RESULTLEN) {
-               return (FALSE);
-       }
-
        sha1_base = (const unsigned char *) key;
        sha1_size = sizeof (mptcp_key_t);
        SHA1Init(&sha1ctxt);
        SHA1Update(&sha1ctxt, sha1_base, sha1_size);
        SHA1Final(sha_digest, &sha1ctxt);
-       return (TRUE);
 }
 
 void
 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
-       u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
+       u_int32_t rand1, u_int32_t rand2, u_char *digest)
 {
        SHA1_CTX  sha1ctxt;
        mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
@@ -4358,7 +4643,7 @@ mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
        u_int32_t data[2];
        int i;
 
-       bzero(digest, digest_len);
+       bzero(digest, SHA1_RESULTLEN);
 
        /* Set up the Key for HMAC */
        key_ipad[0] = key1;
@@ -4398,41 +4683,22 @@ mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
  * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
  */
 void
-mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
-    int digest_len)
+mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
 {
        uint32_t lrand, rrand;
-       mptcp_key_t localkey, remotekey;
-       MPT_LOCK_ASSERT_NOTHELD(mp_tp);
 
-       if (digest_len != SHA1_RESULTLEN)
-               return;
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
        lrand = rrand = 0;
        mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
-       MPT_LOCK_SPIN(mp_tp);
-       localkey = *mp_tp->mpt_localkey;
-       remotekey = mp_tp->mpt_remotekey;
-       MPT_UNLOCK(mp_tp);
-       mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
-           digest_len);
-}
-
-u_int64_t
-mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
-{
-       u_char digest[SHA1_RESULTLEN];
-       u_int64_t trunced_digest;
-
-       mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
-       bcopy(digest, &trunced_digest, 8);
-       return (trunced_digest);
+       mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
+           digest);
 }
 
 /*
  * Authentication data generation
  */
-void
+static void
 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
     int token_len)
 {
@@ -4444,7 +4710,7 @@ mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
        return;
 }
 
-void
+static void
 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
     int idsn_len)
 {
@@ -4484,15 +4750,17 @@ mptcp_conn_properties(struct mptcb *mp_tp)
 }
 
 static void
-mptcp_init_local_parms(struct mptcb *mp_tp)
+mptcp_init_local_parms(struct mptses *mpte)
 {
-       caddr_t local_digest = NULL;
+       struct mptcb *mp_tp = mpte->mpte_mptcb;
+       char key_digest[SHA1_RESULTLEN];
+
+       read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
+       mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
 
-       mp_tp->mpt_localkey = mptcp_reserve_key();
-       local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
-       mptcp_generate_token(local_digest, SHA1_RESULTLEN,
+       mptcp_generate_token(key_digest, SHA1_RESULTLEN,
            (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
-       mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
+       mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
            (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
 
        /* The subflow SYN is also first MPTCP byte */
@@ -4505,65 +4773,25 @@ mptcp_init_local_parms(struct mptcb *mp_tp)
 int
 mptcp_init_remote_parms(struct mptcb *mp_tp)
 {
-       char remote_digest[MPTCP_SHA1_RESULTLEN];
-       MPT_LOCK_ASSERT_HELD(mp_tp);
+       char remote_digest[SHA1_RESULTLEN];
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
        /* Only Version 0 is supported for auth purposes */
        if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
                return (-1);
 
        /* Setup local and remote tokens and Initial DSNs */
-
-       if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
-           SHA1_RESULTLEN)) {
-               mptcplog((LOG_ERR, "MPTCP Socket: %s: unexpected failure",
-                   __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
-               return (-1);
-       }
+       mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
        mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
            (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
        mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
            (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
-       mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
+       mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
 
        return (0);
 }
 
-/*
- * Helper Functions
- */
-mptcp_token_t
-mptcp_get_localtoken(void* mptcb_arg)
-{
-       struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
-       return (mp_tp->mpt_localtoken);
-}
-
-mptcp_token_t
-mptcp_get_remotetoken(void* mptcb_arg)
-{
-       struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
-       return (mp_tp->mpt_remotetoken);
-}
-
-u_int64_t
-mptcp_get_localkey(void* mptcb_arg)
-{
-       struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
-       if (mp_tp->mpt_localkey != NULL)
-               return (*mp_tp->mpt_localkey);
-       else
-               return (0);
-}
-
-u_int64_t
-mptcp_get_remotekey(void* mptcb_arg)
-{
-       struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
-       return (mp_tp->mpt_remotekey);
-}
-
-void
+static void
 mptcp_send_dfin(struct socket *so)
 {
        struct tcpcb *tp = NULL;
@@ -4593,7 +4821,8 @@ mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
                return;
 
        __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
-       MPT_LOCK(mp_tp);
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
+
        while (m) {
                VERIFY(m->m_flags & M_PKTHDR);
                m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
@@ -4602,56 +4831,97 @@ mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
                mp_tp->mpt_sndmax += m_pktlen(m);
                m = m->m_next;
        }
-       MPT_UNLOCK(mp_tp);
+}
+
+void
+mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
+{
+       struct mptcb *mp_tp = tptomptp(sototcpcb(so));
+       uint64_t data_ack;
+       uint64_t dsn;
+
+       if (!m || len == 0)
+               return;
+
+       while (m && len > 0) {
+               VERIFY(m->m_flags & M_PKTHDR);
+               VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
+
+               data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
+               dsn = m->m_pkthdr.mp_dsn;
+
+               len -= m->m_len;
+               m = m->m_next;
+       }
+
+       if (m && len == 0) {
+               /*
+                * If there is one more mbuf in the chain, it automatically means
+                * that up to m->mp_dsn has been ack'ed.
+                *
+                * This means, we actually correct data_ack back down (compared
+                * to what we set inside the loop - dsn + data_len). Because in
+                * the loop we are "optimistic" and assume that the full mapping
+                * will be acked. If that's not the case and we get out of the
+                * loop with m != NULL, it means only up to m->mp_dsn has been
+                * really acked.
+                */
+               data_ack = m->m_pkthdr.mp_dsn;
+       }
+
+       if (len < 0) {
+               /*
+                * If len is negative, meaning we acked in the middle of an mbuf,
+                * only up to this mbuf's data-sequence number has been acked
+                * at the MPTCP-level.
+                */
+               data_ack = dsn;
+       }
+
+       mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
+                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+       mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
 }
 
 void
 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
 {
-       u_int32_t sub_len = 0;
        int rewinding = 0;
 
-       if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
-               /* TFO makes things complicated. */
-               if (so->so_flags1 & SOF1_TFO_REWIND) {
-                       rewinding = 1;
-                       so->so_flags1 &= ~SOF1_TFO_REWIND;
-               }
+       /* TFO makes things complicated. */
+       if (so->so_flags1 & SOF1_TFO_REWIND) {
+               rewinding = 1;
+               so->so_flags1 &= ~SOF1_TFO_REWIND;
        }
 
-       while (m) {
+       while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
+               u_int32_t sub_len;
                VERIFY(m->m_flags & M_PKTHDR);
+               VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
 
-               if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
-                       sub_len = m->m_pkthdr.mp_rlen;
+               sub_len = m->m_pkthdr.mp_rlen;
 
-                       if (sub_len < len) {
-                               m->m_pkthdr.mp_dsn += sub_len;
-                               if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
-                                       m->m_pkthdr.mp_rseq += sub_len;
-                               }
-                               m->m_pkthdr.mp_rlen = 0;
-                               len -= sub_len;
-                       } else {
-                               /* sub_len >= len */
-                               if (rewinding == 0)
-                                       m->m_pkthdr.mp_dsn += len;
-                               if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
-                                       if (rewinding == 0)
-                                               m->m_pkthdr.mp_rseq += len;
-                               }
-                               mptcplog((LOG_DEBUG, "MPTCP Sender: "
-                                   "%s: dsn 0x%llx ssn %u len %d %d\n",
-                                   __func__,
-                                   m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
-                                   m->m_pkthdr.mp_rlen, len),
-                                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-                               m->m_pkthdr.mp_rlen -= len;
-                               break;
+               if (sub_len < len) {
+                       m->m_pkthdr.mp_dsn += sub_len;
+                       if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
+                               m->m_pkthdr.mp_rseq += sub_len;
                        }
+                       m->m_pkthdr.mp_rlen = 0;
+                       len -= sub_len;
                } else {
-                       panic("%s: MPTCP tag not set", __func__);
-                       /* NOTREACHED */
+                       /* sub_len >= len */
+                       if (rewinding == 0)
+                               m->m_pkthdr.mp_dsn += len;
+                       if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
+                               if (rewinding == 0)
+                                       m->m_pkthdr.mp_rseq += len;
+                       }
+                       mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
+                           __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
+                           m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
+                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
+                       m->m_pkthdr.mp_rlen -= len;
+                       break;
                }
                m = m->m_next;
        }
@@ -4663,37 +4933,32 @@ mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
                 * Received an ack without receiving a DATA_ACK.
                 * Need to fallback to regular TCP (or destroy this subflow).
                 */
+               sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
                mptcp_notify_mpfail(so);
        }
 }
 
 /* Obtain the DSN mapping stored in the mbuf */
 void
-mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
-    u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
+mptcp_output_getm_dsnmap32(struct socket *so, int off,
+    uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
 {
        u_int64_t dsn64;
 
-       mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
+       mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
        *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
-       *dsn64p = dsn64;
 }
 
 void
-mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
-    u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
+mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
+                          uint32_t *relseq, uint16_t *data_len,
+                          uint16_t *dss_csum)
 {
        struct mbuf *m = so->so_snd.sb_mb;
-       struct mbuf *mnext = NULL;
-       uint32_t runlen = 0;
-       u_int64_t dsn64;
-       uint32_t contig_len = 0;
+       int off_orig = off;
 
-       if (m == NULL)
-               return;
+       VERIFY(off >= 0);
 
-       if (off < 0)
-               return;
        /*
         * In the subflow socket, the DSN sequencing can be discontiguous,
         * but the subflow sequence mapping is contiguous. Use the subflow
@@ -4702,97 +4967,29 @@ mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
         */
 
        while (m) {
-               VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
                VERIFY(m->m_flags & M_PKTHDR);
+               VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
 
-               if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
-                       off -= m->m_pkthdr.mp_rlen;
+               if (off >= m->m_len) {
+                       off -= m->m_len;
                        m = m->m_next;
                } else {
                        break;
                }
        }
 
-       if (m == NULL) {
-               panic("%s: bad offset", __func__);
-               /* NOTREACHED */
-       }
-
-       dsn64 = m->m_pkthdr.mp_dsn + off;
-       *dsn = dsn64;
-       *relseq = m->m_pkthdr.mp_rseq + off;
-
-       /*
-        * Now find the last contiguous byte and its length from
-        * start.
-        */
-       runlen = m->m_pkthdr.mp_rlen - off;
-       contig_len = runlen;
-
-       /* If datalen does not span multiple mbufs, return */
-       if (datalen <= runlen) {
-               *data_len = min(datalen, UINT16_MAX);
-               return;
-       }
-
-       mnext = m->m_next;
-       while (datalen > runlen) {
-               if (mnext == NULL) {
-                       panic("%s: bad datalen = %d, %d %d", __func__, datalen,
-                           runlen, off);
-                       /* NOTREACHED */
-               }
-               VERIFY(mnext->m_flags & M_PKTHDR);
-               VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
-
-               /*
-                * case A. contiguous DSN stream
-                * case B. discontiguous DSN stream
-                */
-               if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
-                       /* case A */
-                       runlen += mnext->m_pkthdr.mp_rlen;
-                       contig_len += mnext->m_pkthdr.mp_rlen;
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: %s: contig \n",
-                           __func__), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-               } else {
-                       /* case B */
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: "
-                           "%s: discontig datalen %d contig_len %d cc %d \n",
-                           __func__, datalen, contig_len, so->so_snd.sb_cc),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-                       break;
-               }
-               mnext = mnext->m_next;
-       }
-       datalen = min(datalen, UINT16_MAX);
-       *data_len = min(datalen, contig_len);
-       mptcplog((LOG_DEBUG, "MPTCP Sender: "
-           "%s: %llu %u %d %d \n", __func__,
-           *dsn, *relseq, *data_len, off),
-           MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-}
+       VERIFY(m);
+       VERIFY(off >= 0);
+       VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
 
-/*
- * MPTCP's notion of the next insequence Data Sequence number is adjusted
- * here. It must be called from mptcp_adj_rmap() which is called only after
- * reassembly of out of order data. The rcvnxt variable must
- * be updated only when atleast some insequence new data is received.
- */
-static void
-mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
-{
-       struct mptcb *mp_tp = tptomptp(tp);
+       *dsn = m->m_pkthdr.mp_dsn;
+       *relseq = m->m_pkthdr.mp_rseq;
+       *data_len = m->m_pkthdr.mp_rlen;
+       *dss_csum = m->m_pkthdr.mp_csum;
 
-       if (mp_tp == NULL)
-               return;
-       MPT_LOCK(mp_tp);
-       if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
-           (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
-           m->m_pkthdr.mp_rlen)))) {
-               mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
-       }
-       MPT_UNLOCK(mp_tp);
+       mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
+                 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
+                MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 }
 
 /*
@@ -4807,70 +5004,64 @@ mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
  * with mptcp_adj_rmap()
  */
 void
-mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
+mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
 {
+       VERIFY(m->m_flags & M_PKTHDR);
        VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
 
        if (tp->t_mpflags & TMPF_EMBED_DSN) {
-               VERIFY(m->m_flags & M_PKTHDR);
                m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
                m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
                m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
+               m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
+               if (tp->t_rcv_map.mpt_dfin)
+                       m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
+
                m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
+
                tp->t_mpflags &= ~TMPF_EMBED_DSN;
                tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
+       } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
+               if (th->th_flags & TH_FIN)
+                       m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
        }
 }
 
 int
-mptcp_adj_rmap(struct socket *so, struct mbuf *m)
+mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
+              uint32_t rseq, uint16_t dlen)
 {
-       u_int64_t dsn;
-       u_int32_t sseq, datalen;
-       struct tcpcb *tp = intotcpcb(sotoinpcb(so));
-       u_int32_t old_rcvnxt = 0;
+       struct mptsub *mpts = sototcpcb(so)->t_mpsub;
 
        if (m_pktlen(m) == 0)
-               return 0;
-
-       if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
-               VERIFY(m->m_flags & M_PKTHDR);
-
-               dsn = m->m_pkthdr.mp_dsn;
-               sseq = m->m_pkthdr.mp_rseq + tp->irs;
-               datalen = m->m_pkthdr.mp_rlen;
-       } else {
-               /* data arrived without an DSS option mapping */
-
-               /* initial subflow can fallback right after SYN handshake */
-               mptcp_notify_mpfail(so);
-               return 0;
-       }
-
-       /* In the common case, data is in window and in sequence */
-       if (m->m_pkthdr.len == (int)datalen) {
-               mptcp_adj_rcvnxt(tp, m);
-               return 0;
-       }
+               return (0);
 
-       old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
-       if (SEQ_GT(old_rcvnxt, sseq)) {
-               /* data trimmed from the left */
-               int off = old_rcvnxt - sseq;
+       if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
+               if (off && (dsn != m->m_pkthdr.mp_dsn ||
+                           rseq != m->m_pkthdr.mp_rseq ||
+                           dlen != m->m_pkthdr.mp_rlen)) {
+                       mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
+                                 __func__, dsn, m->m_pkthdr.mp_dsn,
+                                 rseq, m->m_pkthdr.mp_rseq,
+                                 dlen, m->m_pkthdr.mp_rlen),
+                                MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
+                       return (-1);
+               }
                m->m_pkthdr.mp_dsn += off;
                m->m_pkthdr.mp_rseq += off;
                m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
-       } else if (old_rcvnxt == sseq) {
-               /*
-                * data was trimmed from the right
-                */
-               m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
        } else {
-               mptcp_notify_mpfail(so);
-               return (-1);
+               if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
+                       /* data arrived without a DSS option mapping */
+
+                       /* initial subflow can fallback right after SYN handshake */
+                       mptcp_notify_mpfail(so);
+               }
        }
-       mptcp_adj_rcvnxt(tp, m);
-       return 0;
+
+       mpts->mpts_flags |= MPTSF_CONFIRMED;
+
+       return (0);
 }
 
 /*
@@ -4890,9 +5081,8 @@ mptcp_act_on_txfail(struct socket *so)
        if (tp == NULL)
                return;
 
-       if (so->so_flags & SOF_MP_TRYFAILOVER) {
+       if (so->so_flags & SOF_MP_TRYFAILOVER)
                return;
-       }
 
        so->so_flags |= SOF_MP_TRYFAILOVER;
        soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
@@ -4921,9 +5111,8 @@ mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
                    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
                        off = dsn_fail - dsn;
                        *tcp_seq = m->m_pkthdr.mp_rseq + off;
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: %s: %llu %llu \n",
-                           __func__, dsn, dsn_fail),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+                       mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
+                                 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
                        return (0);
                }
 
@@ -4945,62 +5134,224 @@ mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
  * Support for sending contiguous MPTCP bytes in subflow
  * Also for preventing sending data with ACK in 3-way handshake
  */
-int32_t
-mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
+int32_t
+mptcp_adj_sendlen(struct socket *so, int32_t off)
+{
+       struct tcpcb *tp = sototcpcb(so);
+       struct mptsub *mpts = tp->t_mpsub;
+       uint64_t mdss_dsn;
+       uint32_t mdss_subflow_seq;
+       int mdss_subflow_off;
+       uint16_t mdss_data_len;
+       uint16_t dss_csum;
+
+       mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
+                                  &mdss_data_len, &dss_csum);
+
+       /*
+        * We need to compute how much of the mapping still remains.
+        * So, we compute the offset in the send-buffer of the dss-sub-seq.
+        */
+       mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
+
+       /*
+        * When TFO is used, we are sending the mpts->mpts_iss although the relative
+        * seq has been set to 1 (while it should be 0).
+        */
+       if (tp->t_mpflags & TMPF_TFO_REQUEST)
+               mdss_subflow_off--;
+
+       if (off < mdss_subflow_off)
+               printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
+               off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
+       VERIFY(off >= mdss_subflow_off);
+
+       mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
+                 __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
+                 mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
+       return (mdss_data_len - (off - mdss_subflow_off));
+}
+
+static uint32_t
+mptcp_get_maxseg(struct mptses *mpte)
+{
+       struct mptsub *mpts;
+       uint32_t maxseg = 0;
+
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+
+               if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
+                   TCPS_HAVERCVDFIN2(tp->t_state))
+                       continue;
+
+               if (tp->t_maxseg > maxseg)
+                       maxseg = tp->t_maxseg;
+       }
+
+       return (maxseg);
+}
+
+static uint8_t
+mptcp_get_rcvscale(struct mptses *mpte)
+{
+       struct mptsub *mpts;
+       uint8_t rcvscale = UINT8_MAX;
+
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+
+               if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
+                   TCPS_HAVERCVDFIN2(tp->t_state))
+                       continue;
+
+               if (tp->rcv_scale < rcvscale)
+                       rcvscale = tp->rcv_scale;
+       }
+
+       return (rcvscale);
+}
+
+/* Similar to tcp_sbrcv_reserve */
+static void
+mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
+       u_int32_t newsize, u_int32_t idealsize)
+{
+       uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
+
+       /* newsize should not exceed max */
+       newsize = min(newsize, tcp_autorcvbuf_max);
+
+       /* The receive window scale negotiated at the
+        * beginning of the connection will also set a
+        * limit on the socket buffer size
+        */
+       newsize = min(newsize, TCP_MAXWIN << rcvscale);
+
+       /* Set new socket buffer size */
+       if (newsize > sbrcv->sb_hiwat &&
+               (sbreserve(sbrcv, newsize) == 1)) {
+               sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
+                   (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
+
+               /* Again check the limit set by the advertised
+                * window scale
+                */
+               sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
+                       TCP_MAXWIN << rcvscale);
+       }
+}
+
+void
+mptcp_sbrcv_grow(struct mptcb *mp_tp)
+{
+       struct mptses *mpte = mp_tp->mpt_mpte;
+       struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
+       struct sockbuf *sbrcv = &mp_so->so_rcv;
+       uint32_t hiwat_sum = 0;
+       uint32_t ideal_sum = 0;
+       struct mptsub *mpts;
+
+       /*
+        * Do not grow the receive socket buffer if
+        * - auto resizing is disabled, globally or on this socket
+        * - the high water mark already reached the maximum
+        * - the stream is in background and receive side is being
+        * throttled
+        * - if there are segments in reassembly queue indicating loss,
+        * do not need to increase recv window during recovery as more
+        * data is not going to be sent. A duplicate ack sent during
+        * recovery should not change the receive window
+        */
+       if (tcp_do_autorcvbuf == 0 ||
+           (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
+           tcp_cansbgrow(sbrcv) == 0 ||
+           sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
+           (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
+           !LIST_EMPTY(&mp_tp->mpt_segq)) {
+               /* Can not resize the socket buffer, just return */
+               return;
+       }
+
+       /*
+        * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
+        *
+        * But, for this we first need accurate receiver-RTT estimations, which
+        * we currently don't have.
+        *
+        * Let's use a dummy algorithm for now, just taking the sum of all
+        * subflows' receive buffers. It's too low, but that's all we can get
+        * for now.
+        */
+
+       TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
+               hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
+               ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
+       }
+
+       mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
+}
+
+/*
+ * Determine if we can grow the receive socket buffer to avoid sending
+ * a zero window update to the peer. We allow even socket buffers that
+ * have fixed size (set by the application) to grow if the resource
+ * constraints are met. They will also be trimmed after the application
+ * reads data.
+ *
+ * Similar to tcp_sbrcv_grow_rwin
+ */
+static void
+mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
 {
-       u_int64_t       mdss_dsn = 0;
-       u_int32_t       mdss_subflow_seq = 0;
-       u_int16_t       mdss_data_len = 0;
+       struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
+       u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
+       u_int32_t rcvbuf = sb->sb_hiwat;
 
-       if (len == 0)
-               return (len);
-
-       mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
-           &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
+       if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
+               return;
 
-       /*
-        * Special case handling for Fast Join. We want to send data right
-        * after ACK of the 3-way handshake, but not piggyback the data
-        * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
-        * mdss_data_len control this.
-        */
-       struct tcpcb *tp = NULL;
-       tp = intotcpcb(sotoinpcb(so));
-       if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
-            (tp->t_mpflags & TMPF_PREESTABLISHED) &&
-           (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
-           (tp->t_mpflags & TMPF_SENT_JOIN) &&
-           (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
-           (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
-               mdss_data_len = 0;
-               tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
-       }
-
-       if ((tp->t_state > TCPS_SYN_SENT) &&
-           (tp->t_mpflags & TMPF_TFO_REQUEST)) {
-               mdss_data_len = 0;
-               tp->t_mpflags &= ~TMPF_TFO_REQUEST;
+       if (tcp_do_autorcvbuf == 1 &&
+           tcp_cansbgrow(sb) &&
+           /* Diff to tcp_sbrcv_grow_rwin */
+           (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
+           (rcvbuf - sb->sb_cc) < rcvbufinc &&
+           rcvbuf < tcp_autorcvbuf_max &&
+           (sb->sb_idealsize > 0 &&
+           sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
+               sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
        }
-       return (mdss_data_len);
 }
 
+/* Similar to tcp_sbspace */
 int32_t
-mptcp_sbspace(struct mptcb *mpt)
+mptcp_sbspace(struct mptcb *mp_tp)
 {
-       struct sockbuf *sb;
+       struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
        uint32_t rcvbuf;
        int32_t space;
+       int32_t pending = 0;
+
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
 
-       MPT_LOCK_ASSERT_HELD(mpt);
-       MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
+       mptcp_sbrcv_grow_rwin(mp_tp, sb);
 
-       sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
+       /* hiwat might have changed */
        rcvbuf = sb->sb_hiwat;
-       space = ((int32_t)imin((rcvbuf - sb->sb_cc),
-           (sb->sb_mbmax - sb->sb_mbcnt)));
+
+       space =  ((int32_t) imin((rcvbuf - sb->sb_cc),
+               (sb->sb_mbmax - sb->sb_mbcnt)));
        if (space < 0)
                space = 0;
-       /* XXX check if it's too small? */
+
+#if CONTENT_FILTER
+       /* Compensate for data being processed by content filters */
+       pending = cfil_sock_data_space(sb);
+#endif /* CONTENT_FILTER */
+       if (pending > space)
+               space = 0;
+       else
+               space -= pending;
 
        return (space);
 }
@@ -5070,12 +5421,11 @@ boolean_t
 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
 {
        boolean_t ret = 1;
-       VERIFY(mp_tp != NULL);
-       MPT_LOCK(mp_tp);
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
+
        if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
                ret = 0;
        }
-       MPT_UNLOCK(mp_tp);
        return (ret);
 }
 
@@ -5090,34 +5440,36 @@ mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
 
 #define        MPTCP_COMPUTE_LEN {                             \
        mss_lower = sizeof (struct mptcp_dss_ack_opt);  \
-       MPT_LOCK(mp_tp);                                \
        if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)         \
                mss_lower += 2;                         \
        else                                            \
                /* adjust to 32-bit boundary + EOL */   \
                mss_lower += 2;                         \
-       MPT_UNLOCK(mp_tp);                              \
 }
        if (mp_tp == NULL)
                return (0);
 
+       mpte_lock_assert_held(mp_tp->mpt_mpte);
+
        /*
         * For the first subflow and subsequent subflows, adjust mss for
         * most common MPTCP option size, for case where tcp_mss is called
         * during option processing and MTU discovery.
         */
-       if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
-           (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
-               MPTCP_COMPUTE_LEN;
-       }
-
-       if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
-           (tp->t_mpflags & TMPF_SENT_JOIN)) {
-               MPTCP_COMPUTE_LEN;
-       }
+       if (!mtudisc) {
+               if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
+                   !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
+                       MPTCP_COMPUTE_LEN;
+               }
 
-       if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
-               MPTCP_COMPUTE_LEN;
+               if (tp->t_mpflags & TMPF_PREESTABLISHED &&
+                   tp->t_mpflags & TMPF_SENT_JOIN) {
+                       MPTCP_COMPUTE_LEN;
+               }
+       } else {
+               if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
+                       MPTCP_COMPUTE_LEN;
+               }
        }
 
        return (mss_lower);
@@ -5127,21 +5479,15 @@ mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
  * Update the pid, upid, uuid of the subflow so, based on parent so
  */
 void
-mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
+mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
 {
-       struct socket *subflow_so = mpts->mpts_socket;
-
-       MPTS_LOCK_ASSERT_HELD(mpts);
-
-       socket_lock(subflow_so, 0);
-       if ((subflow_so->last_pid != parent_mpso->last_pid) ||
-               (subflow_so->last_upid != parent_mpso->last_upid)) {
-               subflow_so->last_upid = parent_mpso->last_upid;
-               subflow_so->last_pid = parent_mpso->last_pid;
-               uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
+       if (so->last_pid != mp_so->last_pid ||
+           so->last_upid != mp_so->last_upid) {
+               so->last_upid = mp_so->last_upid;
+               so->last_pid = mp_so->last_pid;
+               uuid_copy(so->last_uuid, mp_so->last_uuid);
        }
-       so_update_policy(subflow_so);
-       socket_unlock(subflow_so, 0);
+       so_update_policy(so);
 }
 
 static void
@@ -5177,11 +5523,9 @@ fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
        flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
        flow->flow_flags = mpts->mpts_flags;
        flow->flow_cid = mpts->mpts_connid;
-       flow->flow_sndnxt = mpts->mpts_sndnxt;
        flow->flow_relseq = mpts->mpts_rel_seq;
-       flow->flow_soerror = mpts->mpts_soerror;
+       flow->flow_soerror = mpts->mpts_socket->so_error;
        flow->flow_probecnt = mpts->mpts_probecnt;
-       flow->flow_peerswitch = mpts->mpts_peerswitch;
 }
 
 static int
@@ -5189,7 +5533,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
 {
 #pragma unused(oidp, arg1, arg2)
        int error = 0, f;
-       size_t n, len;
+       size_t len;
        struct mppcb *mpp;
        struct mptses *mpte;
        struct mptcb *mp_tp;
@@ -5202,8 +5546,8 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
                return (EPERM);
 
        lck_mtx_lock(&mtcbinfo.mppi_lock);
-       n = mtcbinfo.mppi_count;
        if (req->oldptr == USER_ADDR_NULL) {
+               size_t n = mtcbinfo.mppi_count;
                lck_mtx_unlock(&mtcbinfo.mppi_lock);
                req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
                    4 * (n + n/8)  * sizeof(mptcp_flow_t);
@@ -5211,19 +5555,15 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
        }
        TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
                flows = NULL;
-               lck_mtx_lock(&mpp->mpp_lock);
+               mpp_lock(mpp);
                VERIFY(mpp->mpp_flags & MPP_ATTACHED);
-               if (mpp->mpp_flags & MPP_DEFUNCT) {
-                       lck_mtx_unlock(&mpp->mpp_lock);
-                       continue;
-               }
                mpte = mptompte(mpp);
                VERIFY(mpte != NULL);
+               mpte_lock_assert_held(mpte);
                mp_tp = mpte->mpte_mptcb;
                VERIFY(mp_tp != NULL);
 
                bzero(&mptcpci, sizeof(mptcpci));
-               MPT_LOCK(mp_tp);
                mptcpci.mptcpci_state = mp_tp->mpt_state;
                mptcpci.mptcpci_flags = mp_tp->mpt_flags;
                mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
@@ -5235,10 +5575,9 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
                mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
                mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
                mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
-               mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvatmark;
+               mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
                mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
                mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
-               MPT_UNLOCK(mp_tp);
 
                mptcpci.mptcpci_nflows = mpte->mpte_numflows;
                mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
@@ -5250,7 +5589,7 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
                if (mpte->mpte_numflows != 0) {
                        flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
                        if (flows == NULL) {
-                               lck_mtx_unlock(&mpp->mpp_lock);
+                               mpp_unlock(mpp);
                                break;
                        }
                        mptcpci.mptcpci_len = sizeof(mptcpci) +
@@ -5262,21 +5601,17 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS
                        error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
                }
                if (error) {
-                       lck_mtx_unlock(&mpp->mpp_lock);
+                       mpp_unlock(mpp);
                        FREE(flows, M_TEMP);
                        break;
                }
                f = 0;
                TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
-                       MPTS_LOCK(mpts);
                        so = mpts->mpts_socket;
-                       socket_lock(so, 0);
                        fill_mptcp_subflow(so, &flows[f], mpts);
-                       socket_unlock(so, 0);
-                       MPTS_UNLOCK(mpts);
                        f++;
                }
-               lck_mtx_unlock(&mpp->mpp_lock);
+               mpp_unlock(mpp);
                if (flows) {
                        error = SYSCTL_OUT(req, flows, len);
                        FREE(flows, M_TEMP);
@@ -5293,42 +5628,6 @@ SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
     0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
     "List of active MPTCP connections");
 
-/*
- * Check the health of the other subflows and do an mptcp_output if
- * there is no other active or functional subflow at the time of
- * call of this function.
- */
-static void
-mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
-{
-       struct mptsub *from_mpts = NULL;
-
-       MPTE_LOCK_ASSERT_HELD(mpte);
-
-       MPTS_UNLOCK(to_mpts);
-
-       from_mpts = mpte->mpte_active_sub;
-
-       if (from_mpts == NULL)
-               goto output_needed;
-
-       MPTS_LOCK(from_mpts);
-
-       if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
-           (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
-               MPTS_UNLOCK(from_mpts);
-               goto output_needed;
-       }
-
-       MPTS_UNLOCK(from_mpts);
-       MPTS_LOCK(to_mpts);
-       return;
-
-output_needed:
-       mptcp_output(mpte);
-       MPTS_LOCK(to_mpts);
-}
-
 /*
  * Set notsent lowat mark on the MPTCB
  */
@@ -5346,7 +5645,7 @@ mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
        else
                error = EINVAL;
 
-       return error;
+       return (error);
 }
 
 u_int32_t
@@ -5358,13 +5657,14 @@ mptcp_get_notsent_lowat(struct mptses *mpte)
                mp_tp = mpte->mpte_mptcb;
 
        if (mp_tp)
-               return mp_tp->mpt_notsent_lowat;
+               return (mp_tp->mpt_notsent_lowat);
        else
-               return 0;
+               return (0);
 }
 
 int
-mptcp_notsent_lowat_check(struct socket *so) {
+mptcp_notsent_lowat_check(struct socket *so)
+{
        struct mptses *mpte;
        struct mppcb *mpp;
        struct mptcb *mp_tp;
@@ -5372,15 +5672,15 @@ mptcp_notsent_lowat_check(struct socket *so) {
 
        int notsent = 0;
 
-       mpp = sotomppcb(so);
+       mpp = mpsotomppcb(so);
        if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
                return (0);
        }
 
        mpte = mptompte(mpp);
+       mpte_lock_assert_held(mpte);
        mp_tp = mpte->mpte_mptcb;
 
-       MPT_LOCK(mp_tp);
        notsent = so->so_snd.sb_cc;
 
        if ((notsent == 0) ||
@@ -5391,10 +5691,8 @@ mptcp_notsent_lowat_check(struct socket *so) {
                    mp_tp->mpt_notsent_lowat, notsent,
                    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
                    MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
-               MPT_UNLOCK(mp_tp);
                return (1);
        }
-       MPT_UNLOCK(mp_tp);
 
        /* When Nagle's algorithm is not disabled, it is better
         * to wakeup the client even before there is atleast one
@@ -5402,10 +5700,8 @@ mptcp_notsent_lowat_check(struct socket *so) {
         */
        TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
                int retval = 0;
-               MPTS_LOCK(mpts);
                if (mpts->mpts_flags & MPTSF_ACTIVE) {
                        struct socket *subf_so = mpts->mpts_socket;
-                       socket_lock(subf_so, 0);
                        struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
 
                        notsent = so->so_snd.sb_cc -
@@ -5419,46 +5715,153 @@ mptcp_notsent_lowat_check(struct socket *so) {
                            " nodelay false \n",
                            mp_tp->mpt_notsent_lowat, notsent),
                            MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
-                       socket_unlock(subf_so, 0);
-                       MPTS_UNLOCK(mpts);
                        return (retval);
                }
-               MPTS_UNLOCK(mpts);
        }
        return (0);
 }
 
-static void
-mptcp_get_rtt_measurement(struct mptsub *mpts, struct mptses *mpte)
-{
-       MPTE_LOCK_ASSERT_HELD(mpte);
-       MPTS_LOCK_ASSERT_HELD(mpts);
-
-       struct socket *subflow_so = mpts->mpts_socket;
-       socket_lock(subflow_so, 0);
-       mpts->mpts_srtt = (intotcpcb(sotoinpcb(subflow_so)))->t_srtt;
-       mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(subflow_so)))->t_rxtcur;
-       socket_unlock(subflow_so, 0);
-}
-
 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
 static uint32_t mptcp_kern_skt_inuse = 0;
+static uint32_t mptcp_kern_skt_unit;
 symptoms_advisory_t mptcp_advisory;
 
+/*
+ * Kernel-control connect callback for the Symptoms control socket.
+ *
+ * Counts the opener in mptcp_kern_skt_inuse (logging an error when the
+ * control socket was already open) and records the connecting unit in
+ * mptcp_kern_skt_unit.  Always returns 0.
+ */
 static errno_t
 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
-       void **unitinfo)
+                          void **unitinfo)
 {
-#pragma unused(kctlref, sac, unitinfo)
-       /*
-        * We don't need to do anything here. But we can atleast ensure
-        * only one user opens the MPTCP_KERN_CTL_NAME control socket.
-        */
-       if (OSCompareAndSwap(0, 1, &mptcp_kern_skt_inuse))
-               return (0);
+#pragma unused(kctlref, unitinfo)
+
+       if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
+               os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__);
+
+       /* A second opener is not refused: the most recent unit wins */
+       mptcp_kern_skt_unit = sac->sc_unit;
+
+       return (0);
+}
+
+/*
+ * For every MPTCP connection owned by 'uuid', re-evaluate its subflows
+ * while MPTE_ACCESS_GRANTED is temporarily set on the session.
+ *
+ * Delegated sockets are matched on e_uuid, all others on last_uuid.
+ */
+static void
+mptcp_allow_uuid(uuid_t uuid)
+{
+       struct mppcb *mpp;
+
+       lck_mtx_lock(&mtcbinfo.mppi_lock);
+
+       TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
+               struct mptses *mpte;
+               struct socket *mp_so;
+               unsigned char *owner;
+
+               mpp_lock(mpp);
+
+               mpte = mpp->mpp_pcbe;
+               mp_so = mpp->mpp_socket;
+
+               if (mp_so->so_flags & SOF_DELEGATED)
+                       owner = mp_so->e_uuid;
+               else
+                       owner = mp_so->last_uuid;
+
+               /* uuid_compare() yields 0 on a match */
+               if (uuid_compare(uuid, owner) == 0) {
+                       mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
+
+                       mptcp_check_subflows_and_add(mpte);
+                       mptcp_remove_subflows(mpte);
+
+                       mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;
+               }
+
+               mpp_unlock(mpp);
+       }
+
+       lck_mtx_unlock(&mtcbinfo.mppi_lock);
+}
+
+/*
+ * Re-evaluate the subflows of every handover-mode MPTCP connection.
+ * Invoked from mptcp_symptoms_ctl_send() once the advised Wi-Fi status
+ * has changed.
+ */
+static void
+mptcp_wifi_status_changed(void)
+{
+       struct mppcb *mpp;
+
+       /* Iterate over all MPTCP connections */
+
+       lck_mtx_lock(&mtcbinfo.mppi_lock);
+
+       TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
+               struct mptses *mpte;
+
+               mpp_lock(mpp);
+
+               mpte = mpp->mpp_pcbe;
+
+               /* Only handover-mode is purely driven by Symptom's Wi-Fi status */
+               if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+                       mptcp_check_subflows_and_add(mpte);
+                       mptcp_check_subflows_and_remove(mpte);
+               }
+
+               mpp_unlock(mpp);
+       }
+
+       lck_mtx_unlock(&mtcbinfo.mppi_lock);
+}
+
+/*
+ * Ask Symptoms, over the kernel-control socket, about the process that
+ * owns this MPTCP connection.
+ *
+ * The request carries the owner's UUID (e_uuid for delegated sockets,
+ * last_uuid otherwise) and a priority derived from the task's role.
+ * Nothing is sent while no control-socket unit has been recorded or when
+ * the owning process cannot be found.
+ */
+void
+mptcp_ask_symptoms(struct mptses *mpte)
+{
+       struct mptcp_symptoms_ask_uuid ask;
+       struct socket *mp_so;
+       struct proc *p;
+       int pid, prio, err;
+
+       if (mptcp_kern_skt_unit == 0) {
+               os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__);
+               return;
+       }
+
+       mp_so = mptetoso(mpte);
+
+       if (mp_so->so_flags & SOF_DELEGATED)
+               pid = mp_so->e_pid;
+       else
+               pid = mp_so->last_pid;
+
+       p = proc_find(pid);
+       if (p == PROC_NULL) {
+               os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid);
+               return;
+       }
+
+       /*
+        * The request is handed to ctl_enqueuedata() as raw bytes; clear it
+        * first so that no uninitialized stack content is sent along, should
+        * the structure contain padding or fields that are not set below.
+        */
+       bzero(&ask, sizeof(ask));
+       ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
+
+       if (mp_so->so_flags & SOF_DELEGATED)
+               uuid_copy(ask.uuid, mp_so->e_uuid);
        else
-               return (EALREADY);
+               uuid_copy(ask.uuid, mp_so->last_uuid);
+
+       prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
+
+       if (prio == TASK_BACKGROUND_APPLICATION)
+               ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
+       else if (prio == TASK_FOREGROUND_APPLICATION)
+               ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
+       else
+               ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
+
+       /* A failure to enqueue is only reported through the debug log below */
+       err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
+                             &ask, sizeof(ask), CTL_DATA_EOR);
+
+       os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n",
+                    __func__, pid, ask.priority, err);
+
+       /* Drop the reference taken by proc_find() */
+       proc_rele(p);
 }
 
 static errno_t
@@ -5466,68 +5869,66 @@ mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
        void *unitinfo)
 {
 #pragma unused(kctlref, kcunit, unitinfo)
-       if (OSCompareAndSwap(1, 0, &mptcp_kern_skt_inuse)) {
-               /* TBD needs to be locked if the size grows more than an int */
-               bzero(&mptcp_advisory, sizeof(mptcp_advisory));
-               return (0);
-       }
-       else {
-               return (EINVAL);
-       }
+
+       OSDecrementAtomic(&mptcp_kern_skt_inuse);
+
+       return (0);
 }
 
+/*
+ * Kernel-control send callback: handles an advisory written by Symptoms.
+ *
+ * A Wi-Fi status update is stored in mptcp_advisory and, when it changed,
+ * propagated through mptcp_wifi_status_changed().  A USEAPP reply carries a
+ * UUID right behind the advisory, which is passed to mptcp_allow_uuid().
+ *
+ * The mbuf is freed on every path.  Returns EINVAL when the message is too
+ * short for what it has to carry, 0 otherwise.
+ */
 static errno_t
 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
        mbuf_t m, int flags)
 {
-#pragma unused(kctlref, kcunit, unitinfo, flags)
+#pragma unused(kctlref, unitinfo, flags)
        symptoms_advisory_t     *sa = NULL;
 
+       /* A message from an unexpected unit is only logged, not rejected */
+       if (kcunit != mptcp_kern_skt_unit)
+               os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n",
+                            __func__, kcunit, mptcp_kern_skt_unit);
+
        if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
                mbuf_freem(m);
                return (EINVAL);
        }
 
-       if (mbuf_len(m) >= sizeof(*sa))
-               sa = mbuf_data(m);
-       else
+       if (mbuf_len(m) < sizeof(*sa)) {
+               mbuf_freem(m);
                return (EINVAL);
-
-       if (mptcp_advisory.sa_nwk_status_int != sa->sa_nwk_status_int) {
-               /*
-                * we could use this notification to notify all mptcp pcbs
-                * of the change in network status. But its difficult to
-                * define if sending REMOVE_ADDR or MP_PRIO is appropriate
-                * given that these are only soft indicators of the network
-                * state. Leaving this as TBD for now.
-                */
        }
 
-       if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT) {
-               mptcplog((LOG_DEBUG, "MPTCP Events: %s wifi %d,%d cell %d,%d\n",
-                   __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
-                   sa->sa_cell_status, mptcp_advisory.sa_cell_status),
-                   MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG,
-                   MPTCP_LOGLVL_LOG);
+       sa = mbuf_data(m);
+
+       if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
+           sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
+               uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
+
+               mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
+                   __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
+                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
 
+               /* A status with both the BAD and the OK bit set is not stored */
                if ((sa->sa_wifi_status &
                    (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
-                   (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
+                   (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
                        mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
-               }
 
-               if ((sa->sa_cell_status &
-                   (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) !=
-                   (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) {
-                       mptcp_advisory.sa_cell_status = sa->sa_cell_status;
-               }
-       } else {
-               mptcplog((LOG_DEBUG, "MPTCP Events: %s NOCOMMENT "
-                   "wifi %d cell %d\n", __func__,
-                   mptcp_advisory.sa_wifi_status,
-                   mptcp_advisory.sa_cell_status),
-                   MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
+               if (old_wifi_status != mptcp_advisory.sa_wifi_status)
+                       mptcp_wifi_status_changed();
+       } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
+               mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
+                   mptcp_advisory.sa_wifi_status),
+                   MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
+       } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
+               uuid_t uuid;
+
+               /*
+                * The UUID trails the advisory in the same mbuf, and only the
+                * advisory itself has been length-checked so far.  Reject a
+                * reply that is too short instead of reading past its data.
+                */
+               if (mbuf_len(m) < sizeof(*sa) + sizeof(uuid)) {
+                       mbuf_freem(m);
+                       return (EINVAL);
+               }
+
+               mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
+                         MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+
+               uuid_copy(uuid, (unsigned char *)(sa + 1));
+
+               mptcp_allow_uuid(uuid);
        }
+
+       mbuf_freem(m);
        return (0);
 }
 
@@ -5548,148 +5949,208 @@ mptcp_control_register(void)
        (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
 }
 
+/*
+ * Three return-values:
+ * 1  : WiFi is bad
+ * 0  : WiFi is good (for non-first-party sessions also when nothing is known)
+ * -1 : WiFi-state is unknown (first-party sessions only), use subflow-only
+ *      heuristics
+ */
 int
-mptcp_is_wifi_unusable(void)
+mptcp_is_wifi_unusable(struct mptses *mpte)
 {
-       /* a false return val indicates there is no info or wifi is ok */
-       return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
-}
+       if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
+               if (mptcp_advisory.sa_wifi_status)
+                       return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
 
-int
-mptcp_is_cell_unusable(void)
-{
-       /* a false return val indicates there is no info or cell is ok */
-       return (mptcp_advisory.sa_cell_status & SYMPTOMS_ADVISORY_CELL_BAD);
+               /*
+                * If it's a first-party app and we don't have any info
+                * about the Wi-Fi state, let's be pessimistic.
+                */
+               return (-1);
+       }
+
+       return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
 }
 
-struct mptsub*
-mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best)
+boolean_t
+mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts)
 {
-       struct mptsub *cellsub = NULL;
-       struct mptsub *wifisub = NULL;
-       struct mptsub *wiredsub = NULL;
+       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+       int fail_thresh = mptcp_fail_thresh;
 
-       VERIFY ((best != NULL) && (second_best != NULL));
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
+               fail_thresh *= 2;
 
-       if (!mptcp_use_symptomsd)
-               return (NULL);
-
-       if (!mptcp_kern_skt_inuse)
-               return (NULL);
-
-       /*
-        * There could be devices with more than one wifi interface or
-        * more than one wired or cell interfaces.
-        * TBD: SymptomsD is unavailable on such platforms as of now.
-        * Try to prefer best when possible in general.
-        * Also, SymptomsD sends notifications about wifi only when it
-        * is primary.
-        */
-       if (best->mpts_linktype & MPTSL_WIFI)
-               wifisub = best;
-       else if (best->mpts_linktype & MPTSL_CELL)
-               cellsub = best;
-       else if (best->mpts_linktype & MPTSL_WIRED)
-               wiredsub = best;
-
-       /*
-        * On platforms with wired paths, don't use hints about wifi or cell.
-        * Currently, SymptomsD is not available on platforms with wired paths.
-        */
-       if (wiredsub)
-               return (NULL);
-
-       if ((wifisub == NULL) && (second_best->mpts_linktype & MPTSL_WIFI))
-               wifisub = second_best;
-
-       if ((cellsub == NULL) && (second_best->mpts_linktype & MPTSL_CELL))
-               cellsub = second_best;
-
-       if ((wiredsub == NULL) && (second_best->mpts_linktype & MPTSL_WIRED))
-               wiredsub = second_best;
-
-       if ((wifisub == best) && mptcp_is_wifi_unusable()) {
-               tcpstat.tcps_mp_sel_symtomsd++;
-               if (mptcp_is_cell_unusable()) {
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
-                           " suggests both Wifi and Cell are bad. Wired %s.",
-                           (wiredsub == NULL) ? "none" : "present"),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
-                       return (wiredsub);
-               } else {
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
-                           " suggests Wifi bad, Cell good. Wired %s.",
-                           (wiredsub == NULL) ? "none" : "present"),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
-                       return ((wiredsub != NULL) ? wiredsub : cellsub);
-               }
-       }
-
-       if ((cellsub == best) && (mptcp_is_cell_unusable())) {
-               tcpstat.tcps_mp_sel_symtomsd++;
-               if (mptcp_is_wifi_unusable()) {
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
-                           " suggests both Cell and Wifi are bad. Wired %s.",
-                           (wiredsub == NULL) ? "none" : "present"),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
-                       return (wiredsub);
-               } else {
-                       mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
-                           " suggests Cell bad, Wifi good. Wired %s.",
-                           (wiredsub == NULL) ? "none" : "present"),
-                           MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
-                       return ((wiredsub != NULL) ? wiredsub : wifisub);
-               }
-       }
-
-       /* little is known about the state of the network or wifi is good */
-       return (NULL);
+       return (tp->t_rxtshift >= fail_thresh &&
+               (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq));
 }
 
 /* If TFO data is succesfully acked, it must be dropped from the mptcp so */
 static void
-mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts, int *wakeup)
+mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
 {
-       struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
+       struct socket *mp_so = mptetoso(mpte);
        struct socket *so = mpts->mpts_socket;
        struct tcpcb *tp = intotcpcb(sotoinpcb(so));
        struct mptcb *mp_tp = mpte->mpte_mptcb;
 
        /* If data was sent with SYN, rewind state */
        if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
-               mpts->mpts_flags &= ~MPTSF_TFO_REQD;
-               tp->t_mpflags &= ~TMPF_TFO_REQUEST;
-               MPT_LOCK(mp_tp);
-               u_int64_t mp_droplen = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
+               u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
                unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
+
                VERIFY(mp_droplen <= (UINT_MAX));
                VERIFY(mp_droplen >= tcp_droplen);
 
+               mpts->mpts_flags &= ~MPTSF_TFO_REQD;
+               mpts->mpts_iss += tcp_droplen;
+               tp->t_mpflags &= ~TMPF_TFO_REQUEST;
+
                if (mp_droplen > tcp_droplen) {
                        /* handle partial TCP ack */
                        mp_so->so_flags1 |= SOF1_TFO_REWIND;
                        mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
-                       mpts->mpts_sndnxt = mp_tp->mpt_sndnxt;
                        mp_droplen = tcp_droplen;
                } else {
                        /* all data on SYN was acked */
                        mpts->mpts_rel_seq = 1;
                        mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
-                       mpts->mpts_sndnxt = mp_tp->mpt_snduna;
                }
                mp_tp->mpt_sndmax -= tcp_droplen;
 
-               MPT_UNLOCK(mp_tp);
                if (mp_droplen != 0) {
                        VERIFY(mp_so->so_snd.sb_mb != NULL);
                        sbdrop(&mp_so->so_snd, (int)mp_droplen);
-                       if (wakeup)
-                               *wakeup = 1;
                }
-               mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d "
-                   "TFO tcp len %d mptcp len %d\n", __func__,
-                   (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid,
-                   tcp_droplen, mp_droplen),
-                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
+               mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
+                         __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
+                         mpts->mpts_connid, tcp_droplen, mp_droplen),
+                        MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
+       }
+}
+
+/*
+ * Release every segment queued on mp_tp->mpt_segq and reset
+ * mpt_reassqlen to 0.
+ *
+ * Returns 1 when at least one segment was freed, 0 when the queue was
+ * already empty.
+ */
+int
+mptcp_freeq(struct mptcb *mp_tp)
+{
+       struct tseg_qent *seg;
+       int freed = 0;
+
+       for (seg = LIST_FIRST(&mp_tp->mpt_segq); seg != NULL;
+            seg = LIST_FIRST(&mp_tp->mpt_segq)) {
+               LIST_REMOVE(seg, tqe_q);
+               m_freem(seg->tqe_m);
+               zfree(tcp_reass_zone, seg);
+               freed = 1;
+       }
+
+       mp_tp->mpt_reassqlen = 0;
+       return (freed);
+}
+
+/*
+ * Post a KEV_MPTCP_SUBCLASS kernel event with the given event code,
+ * carrying 'value' as its payload.  Returns what kev_post_msg() returns.
+ */
+static int
+mptcp_post_event(u_int32_t event_code, int value)
+{
+       struct kev_mptcp_data payload;
+       struct kev_msg msg;
+
+       payload.value = value;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.vendor_code = KEV_VENDOR_APPLE;
+       msg.kev_class = KEV_NETWORK_CLASS;
+       msg.kev_subclass = KEV_MPTCP_SUBCLASS;
+       msg.event_code = event_code;
+
+       /* Single data vector: the payload built above */
+       msg.dv[0].data_ptr = &payload;
+       msg.dv[0].data_length = sizeof(payload);
+
+       return kev_post_msg(&msg);
+}
+
+/*
+ * Turn the cell icon on by posting KEV_MPTCP_CELLUSE with value 1.
+ *
+ * First-party sessions never touch the icon.  For all others the time of
+ * the request is recorded in mptcp_last_cellicon_set (consulted by
+ * mptcp_unset_cellicon()), and the event is only posted when the icon was
+ * not already flagged as set.
+ */
+void
+mptcp_set_cellicon(struct mptses *mpte)
+{
+       int error;
+
+       /* First-party apps (Siri) don't flip the cellicon */
+       if (mpte->mpte_flags & MPTE_FIRSTPARTY)
+               return;
+
+       /* Timestamp every request, even when the icon is already on */
+       mptcp_last_cellicon_set = tcp_now;
+
+       /* Nothing more to do when the icon is already set */
+       if (OSTestAndSet(7, &mptcp_cellicon_is_set))
+               return;
+
+       error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
+       if (error == 0) {
+               mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               return;
+       }
+
+       mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
+                 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+}
+
+/*
+ * Turn the cell icon off by posting KEV_MPTCP_CELLUSE with value 0.
+ *
+ * Does nothing when the icon is already unset.  When the icon was set
+ * within the last MPTCP_CELLICON_TOGGLE_RATE (see mptcp_set_cellicon()),
+ * the flag is put back and the icon stays on.
+ */
+void
+mptcp_unset_cellicon(void)
+{
+       int error;
+
+       /* Nothing to do when the icon is already unset */
+       if (OSTestAndClear(7, &mptcp_cellicon_is_set))
+               return;
+
+       /* Set too recently: restore the flag cleared above and keep the icon */
+       if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
+                    tcp_now)) {
+               OSTestAndSet(7, &mptcp_cellicon_is_set);
+               return;
        }
+
+       error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
+       if (error == 0) {
+               mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
+                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+               return;
+       }
+
+       mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
+                 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+}
+
+/*
+ * Clear the write-stall / try-failover indications of the subflow that
+ * 'tp' belongs to.
+ *
+ * Does nothing when 'tp' has no inpcb or socket, or when the socket is
+ * not flagged SOF_MP_SUBFLOW.
+ */
+void
+mptcp_reset_rexmit_state(struct tcpcb *tp)
+{
+       struct inpcb *pcb = tp->t_inpcb;
+       struct socket *sock;
+
+       sock = (pcb != NULL) ? pcb->inp_socket : NULL;
+       if (sock == NULL)
+               return;
+
+       if ((sock->so_flags & SOF_MP_SUBFLOW) == 0)
+               return;
+
+       tp->t_mpsub->mpts_flags &= ~MPTSF_WRITE_STALL;
+       sock->so_flags &= ~SOF_MP_TRYFAILOVER;
+}
+
+/*
+ * Clear MPTSF_READ_STALL on the subflow attached to 'tp' (tp->t_mpsub).
+ */
+void
+mptcp_reset_keepalive(struct tcpcb *tp)
+{
+       tp->t_mpsub->mpts_flags &= ~MPTSF_READ_STALL;
 }
+