bsd/netinet/mptcp.c

   1 /*
   2  * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  * A note on the MPTCP/NECP-interactions:
  31  *
  32  * MPTCP uses NECP-callbacks to get notified of interface/policy events.
  33  * MPTCP registers to these events at the MPTCP-layer for interface-events
  34  * through a call to necp_client_register_multipath_cb.
  35  * To get per-flow events (aka per TCP-subflow), we register to it with
  36  * necp_client_register_socket_flow. Both registrations happen by using the
  37  * necp-client-uuid that comes from the app.
  38  *
  39  * The locking is rather tricky. In general, we expect the lock-ordering to
  40  * be necp-fd -> necp-client -> mpp_lock.
  41  *
  42  * There are however some subtleties.
  43  *
  44  * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
  45  * safe, because it is the very first time this MPTCP-connection goes into NECP.
  46  * As we go into NECP we take the NECP-locks and thus are guaranteed that no
  47  * NECP-locks will deadlock us. Because these NECP-events will also first take
  48  * the NECP-locks. Either they win the race and thus won't find our
  49  * MPTCP-connection. Or, MPTCP wins the race and thus it will safely install
  50  * the callbacks while holding the NECP lock.
  51  *
  52  * 2. When registering the subflow-callbacks we must unlock the mpp_lock,
  53  * because we have already registered callbacks and we might race against an
  54  * NECP-event that will match on our socket. So, we have to unlock to be safe.
  55  *
  56  * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). The
  57  * so_usecount has reached 0. We must be careful to not remove the mpp_socket
  58  * pointers before we unregistered the callback. Because, again we might be
  59  * racing against an NECP-event. Unregistering must happen with an unlocked
  60  * mpp_lock, because of the lock-ordering constraint. It could be that
  61  * before we had a chance to unregister an NECP-event triggers. That's why
  62  * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
  63  * there while the socket is being garbage-collected, the use-count will go
  64  * down to 0 and we exit. Removal of the multipath_cb again happens by taking
  65  * the NECP-locks so any running NECP-events will finish first and exit cleanly.
  66  *
  67  * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
  68  * the socket-lock must be unlocked for lock-ordering constraints. This gets a
  69  * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
  70  * So, we drop the mp_so-lock as soon as the subflow is unlinked with
  71  * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
  72  * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
  73  * gets it, it will realize that the subflow became non-MPTCP and retry (see
  74  * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
  75  * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
  76  * for the NECP-lock (held by the other thread that is taking care of the NECP-
  77  * event). So, the event now finally gets the subflow-lock and then hits an
  78  * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
  79  * the NECP callback.
  80  */
  81
  82 #include <sys/param.h>
  83 #include <sys/systm.h>
  84 #include <sys/kernel.h>
  85 #include <sys/mbuf.h>
  86 #include <sys/mcache.h>
  87 #include <sys/socket.h>
  88 #include <sys/socketvar.h>
  89 #include <sys/syslog.h>
  90 #include <sys/protosw.h>
  91
  92 #include <kern/zalloc.h>
  93 #include <kern/locks.h>
  94
  95 #include <mach/sdt.h>
  96
  97 #include <net/if.h>
  98 #include <netinet/in.h>
  99 #include <netinet/in_var.h>
 100 #include <netinet/tcp.h>
 101 #include <netinet/tcp_fsm.h>
 102 #include <netinet/tcp_seq.h>
 103 #include <netinet/tcp_var.h>
 104 #include <netinet/mptcp_var.h>
 105 #include <netinet/mptcp.h>
 106 #include <netinet/mptcp_seq.h>
 107 #include <netinet/mptcp_opt.h>
 108 #include <netinet/mptcp_timer.h>
 109
 110 int mptcp_enable = 1;
 111 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
 112         &mptcp_enable, 0, "Enable Multipath TCP Support");
 113
 114 /* Number of times to try negotiating MPTCP on SYN retransmissions */
 115 int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
 116 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
 117         CTLFLAG_RW | CTLFLAG_LOCKED,
 118         &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
 119
 120 /*
 121  * By default, DSS checksum is turned off, revisit if we ever do
 122  * MPTCP for non SSL Traffic.
 123  */
 124 int mptcp_dss_csum = 0;
 125 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
 126         &mptcp_dss_csum, 0, "Enable DSS checksum");
 127
 128 /*
 129  * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 130  * is attempted on a different path.
 131  */
 132 int mptcp_fail_thresh = 1;
 133 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
 134         &mptcp_fail_thresh, 0, "Failover threshold");
 135
 136
 137 /*
 138  * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 139  * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 140  * Some carrier networks have a timeout of 10 or 15 minutes.
 141  */
 142 int mptcp_subflow_keeptime = 60*14;
 143 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
 144         &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
 145
 146 int mptcp_rtthist_rtthresh = 600;
 147 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
 148         &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
 149
 150 /*
 151  * Use RTO history for sending new data
 152  */
 153 int mptcp_use_rto = 1;
 154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
 155         &mptcp_use_rto, 0, "Disable RTO for subflow selection");
 156
 157 int mptcp_rtothresh = 1500;
 158 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
 159         &mptcp_rtothresh, 0, "RTO threshold");
 160
 161 /*
 162  * Probe the preferred path, when it is not in use
 163  */
 164 uint32_t mptcp_probeto = 1000;
 165 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
 166         &mptcp_probeto, 0, "Disable probing by setting to 0");
 167
 168 uint32_t mptcp_probecnt = 5;
 169 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
 170         &mptcp_probecnt, 0, "Number of probe writes");
 171
 172 /*
 173  * Static declarations
 174  */
 175 static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
 176                                  uint32_t, uint16_t, uint16_t, uint16_t);
 177
 178 static int
 179 mptcp_reass_present(struct socket *mp_so)
 180 {
 181         struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
 182         struct tseg_qent *q;
 183         int dowakeup = 0;
 184         int flags = 0;
 185
 186         /*
 187          * Present data to user, advancing rcv_nxt through
 188          * completed sequence space.
 189          */
 190         if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
 191                 return (flags);
 192         q = LIST_FIRST(&mp_tp->mpt_segq);
 193         if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt)
 194                 return (flags);
 195
 196         /*
 197          * If there is already another thread doing reassembly for this
 198          * connection, it is better to let it finish the job --
 199          * (radar 16316196)
 200          */
 201         if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG)
 202                 return (flags);
 203
 204         mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
 205
 206         do {
 207                 mp_tp->mpt_rcvnxt += q->tqe_len;
 208                 LIST_REMOVE(q, tqe_q);
 209                 if (mp_so->so_state & SS_CANTRCVMORE) {
 210                         m_freem(q->tqe_m);
 211                 } else {
 212                         flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
 213                         if (sbappendstream_rcvdemux(mp_so, q->tqe_m, 0, 0))
 214                                 dowakeup = 1;
 215                 }
 216                 zfree(tcp_reass_zone, q);
 217                 mp_tp->mpt_reassqlen--;
 218                 q = LIST_FIRST(&mp_tp->mpt_segq);
 219         } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
 220         mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
 221
 222         if (dowakeup)
 223                 sorwakeup(mp_so); /* done with socket lock held */
 224         return (flags);
 225
 226 }
 227
 228 static int
 229 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
 230 {
 231         struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
 232         u_int64_t mb_dsn = phdr->mp_dsn;
 233         struct tseg_qent *q;
 234         struct tseg_qent *p = NULL;
 235         struct tseg_qent *nq;
 236         struct tseg_qent *te = NULL;
 237         u_int16_t qlimit;
 238
 239         /*
 240          * Limit the number of segments in the reassembly queue to prevent
 241          * holding on to too many segments (and thus running out of mbufs).
 242          * Make sure to let the missing segment through which caused this
 243          * queue.  Always keep one global queue entry spare to be able to
 244          * process the missing segment.
 245          */
 246         qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
 247             (tcp_autorcvbuf_max >> 10));
 248         if (mb_dsn != mp_tp->mpt_rcvnxt &&
 249             (mp_tp->mpt_reassqlen + 1) >= qlimit) {
 250                 tcpstat.tcps_mptcp_rcvmemdrop++;
 251                 m_freem(m);
 252                 *tlenp = 0;
 253                 return (0);
 254         }
 255
 256         /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
 257         te = (struct tseg_qent *) zalloc(tcp_reass_zone);
 258         if (te == NULL) {
 259                 tcpstat.tcps_mptcp_rcvmemdrop++;
 260                 m_freem(m);
 261                 return (0);
 262         }
 263
 264         mp_tp->mpt_reassqlen++;
 265
 266         /*
 267          * Find a segment which begins after this one does.
 268          */
 269         LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
 270                 if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn))
 271                         break;
 272                 p = q;
 273         }
 274
 275         /*
 276          * If there is a preceding segment, it may provide some of
 277          * our data already.  If so, drop the data from the incoming
 278          * segment.  If it provides all of our data, drop us.
 279          */
 280         if (p != NULL) {
 281                 int64_t i;
 282                 /* conversion to int (in i) handles seq wraparound */
 283                 i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
 284                 if (i > 0) {
 285                         if (i >= *tlenp) {
 286                                 tcpstat.tcps_mptcp_rcvduppack++;
 287                                 m_freem(m);
 288                                 zfree(tcp_reass_zone, te);
 289                                 te = NULL;
 290                                 mp_tp->mpt_reassqlen--;
 291                                 /*
 292                                  * Try to present any queued data
 293                                  * at the left window edge to the user.
 294                                  * This is needed after the 3-WHS
 295                                  * completes.
 296                                  */
 297                                 goto out;
 298                         }
 299                         m_adj(m, i);
 300                         *tlenp -= i;
 301                         phdr->mp_dsn += i;
 302                 }
 303         }
 304
 305         tcpstat.tcps_mp_oodata++;
 306
 307         /*
 308          * While we overlap succeeding segments trim them or,
 309          * if they are completely covered, dequeue them.
 310          */
 311         while (q) {
 312                 int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
 313                 if (i <= 0)
 314                         break;
 315
 316                 if (i < q->tqe_len) {
 317                         q->tqe_m->m_pkthdr.mp_dsn += i;
 318                         q->tqe_len -= i;
 319                         m_adj(q->tqe_m, i);
 320                         break;
 321                 }
 322
 323                 nq = LIST_NEXT(q, tqe_q);
 324                 LIST_REMOVE(q, tqe_q);
 325                 m_freem(q->tqe_m);
 326                 zfree(tcp_reass_zone, q);
 327                 mp_tp->mpt_reassqlen--;
 328                 q = nq;
 329         }
 330
 331         /* Insert the new segment queue entry into place. */
 332         te->tqe_m = m;
 333         te->tqe_th = NULL;
 334         te->tqe_len = *tlenp;
 335
 336         if (p == NULL) {
 337                 LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
 338         } else {
 339                 LIST_INSERT_AFTER(p, te, tqe_q);
 340         }
 341
 342 out:
 343         return (mptcp_reass_present(mp_so));
 344 }
 345
 346 /*
 347  * MPTCP input, called when data has been read from a subflow socket.
 348  */
 349 void
 350 mptcp_input(struct mptses *mpte, struct mbuf *m)
 351 {
 352         struct socket *mp_so;
 353         struct mptcb *mp_tp = NULL;
 354         int count = 0, wakeup = 0;
 355         struct mbuf *save = NULL, *prev = NULL;
 356         struct mbuf *freelist = NULL, *tail = NULL;
 357
 358         VERIFY(m->m_flags & M_PKTHDR);
 359
 360         mpte_lock_assert_held(mpte);    /* same as MP socket lock */
 361
 362         mp_so = mptetoso(mpte);
 363         mp_tp = mpte->mpte_mptcb;
 364
 365         DTRACE_MPTCP(input);
 366
 367         mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
 368
 369         /*
 370          * Each mbuf contains MPTCP Data Sequence Map
 371          * Process the data for reassembly, delivery to MPTCP socket
 372          * client, etc.
 373          *
 374          */
 375         count = mp_so->so_rcv.sb_cc;
 376
 377         /*
 378          * In the degraded fallback case, data is accepted without DSS map
 379          */
 380         if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
 381                 struct mbuf *iter;
 382                 int mb_dfin = 0;
 383 fallback:
 384                 mptcp_sbrcv_grow(mp_tp);
 385
 386                 for (iter = m; iter; iter = iter->m_next) {
 387                         if ((iter->m_flags & M_PKTHDR) &&
 388                             (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
 389                                 mb_dfin = 1;
 390                                 break;
 391                         }
 392                 }
 393
 394                 /*
 395                  * assume degraded flow as this may be the first packet
 396                  * without DSS, and the subflow state is not updated yet.
 397                  */
 398                 if (sbappendstream_rcvdemux(mp_so, m, 0, 0))
 399                         sorwakeup(mp_so);
 400
 401                 DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
 402                     struct socket *, mp_so,
 403                     struct sockbuf *, &mp_so->so_rcv,
 404                     struct sockbuf *, &mp_so->so_snd,
 405                     struct mptses *, mpte);
 406                 count = mp_so->so_rcv.sb_cc - count;
 407
 408                 mp_tp->mpt_rcvnxt += count;
 409
 410                 if (mb_dfin) {
 411                         mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
 412                         socantrcvmore(mp_so);
 413                 }
 414
 415                 mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
 416                     count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 417                 return;
 418         }
 419
 420         do {
 421                 u_int64_t mb_dsn;
 422                 int32_t mb_datalen;
 423                 int64_t todrop;
 424                 int mb_dfin = 0;
 425
 426                 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
 427                 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
 428                         goto fallback;
 429
 430                 save = m->m_next;
 431                 /*
 432                  * A single TCP packet formed of multiple mbufs
 433                  * holds DSS mapping in the first mbuf of the chain.
 434                  * Other mbufs in the chain may have M_PKTHDR set
 435                  * even though they belong to the same TCP packet
 436                  * and therefore use the DSS mapping stored in the
 437                  * first mbuf of the mbuf chain. mptcp_input() can
 438                  * get an mbuf chain with multiple TCP packets.
 439                  */
 440                 while (save && (!(save->m_flags & M_PKTHDR) ||
 441                     !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
 442                         prev = save;
 443                         save = save->m_next;
 444                 }
 445                 if (prev)
 446                         prev->m_next = NULL;
 447                 else
 448                         m->m_next = NULL;
 449
 450                 mb_dsn = m->m_pkthdr.mp_dsn;
 451                 mb_datalen = m->m_pkthdr.mp_rlen;
 452
 453                 todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
 454                 if (todrop > 0) {
 455                         tcpstat.tcps_mptcp_rcvpackafterwin++;
 456
 457                         if (todrop >= mb_datalen) {
 458                                 if (freelist == NULL)
 459                                         freelist = m;
 460                                 else
 461                                         tail->m_next = m;
 462
 463                                 if (prev != NULL)
 464                                         tail = prev;
 465                                 else
 466                                         tail = m;
 467
 468                                 m = save;
 469                                 prev = save = NULL;
 470                                 continue;
 471                         } else {
 472                                 m_adj(m, -todrop);
 473                                 mb_datalen -= todrop;
 474                         }
 475
 476                         /*
 477                          * We drop from the right edge of the mbuf, thus the
 478                          * DATA_FIN is dropped as well
 479                          */
 480                         m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
 481                 }
 482
 483                 if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
 484                     !LIST_EMPTY(&mp_tp->mpt_segq)) {
 485                         mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
 486
 487                         goto next;
 488                 }
 489                 mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
 490
 491                 if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
 492                         if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
 493                             mp_tp->mpt_rcvnxt)) {
 494                                 if (freelist == NULL)
 495                                         freelist = m;
 496                                 else
 497                                         tail->m_next = m;
 498
 499                                 if (prev != NULL)
 500                                         tail = prev;
 501                                 else
 502                                         tail = m;
 503
 504                                 m = save;
 505                                 prev = save = NULL;
 506                                 continue;
 507                         } else {
 508                                 m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
 509                         }
 510                         mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
 511                             mp_tp->mpt_rcvnxt),
 512                             MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 513                 }
 514
 515                 mptcp_sbrcv_grow(mp_tp);
 516
 517                 if (sbappendstream_rcvdemux(mp_so, m, 0, 0))
 518                         wakeup = 1;
 519
 520                 DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
 521                     struct sockbuf *, &mp_so->so_rcv,
 522                     struct sockbuf *, &mp_so->so_snd,
 523                     struct mptses *, mpte,
 524                     struct mptcb *, mp_tp);
 525                 count = mp_so->so_rcv.sb_cc - count;
 526                 tcpstat.tcps_mp_rcvtotal++;
 527                 tcpstat.tcps_mp_rcvbytes += count;
 528                 mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
 529                     MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 530
 531                 mp_tp->mpt_rcvnxt += count;
 532
 533 next:
 534                 if (mb_dfin) {
 535                         mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
 536                         socantrcvmore(mp_so);
 537                 }
 538                 m = save;
 539                 prev = save = NULL;
 540                 count = mp_so->so_rcv.sb_cc;
 541         } while (m);
 542
 543         if (freelist)
 544                 m_freem(freelist);
 545
 546         if (wakeup)
 547                 sorwakeup(mp_so);
 548 }
 549
 550 static boolean_t
 551 mptcp_can_send_more(struct mptcb *mp_tp)
 552 {
 553         struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 554
 555         /*
 556          * Always send if there is data in the reinject-queue.
 557          */
 558         if (mp_tp->mpt_mpte->mpte_reinjectq)
 559                 return (TRUE);
 560
 561         /*
 562          * Don't send, if:
 563          *
 564          * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
 565          *    Except when using TFO, we might be doing a 0-byte write.
 566          * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
 567          * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
 568          */
 569
 570         if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax))
 571                 return (FALSE);
 572
 573         if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt))
 574                 return (FALSE);
 575
 576         if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
 577                 return (FALSE);
 578
 579         if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2)
 580                 return (FALSE);
 581
 582         return (TRUE);
 583 }
 584
 585 /*
 586  * MPTCP output.
 587  */
 588 int
 589 mptcp_output(struct mptses *mpte)
 590 {
 591         struct mptcb *mp_tp;
 592         struct mptsub *mpts;
 593         struct mptsub *mpts_tried = NULL;
 594         struct socket *mp_so;
 595         struct mptsub *preferred_mpts = NULL;
 596         uint64_t old_snd_nxt;
 597         int error = 0;
 598
 599         mpte_lock_assert_held(mpte);
 600         mp_so = mptetoso(mpte);
 601         mp_tp = mpte->mpte_mptcb;
 602
 603         VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
 604         mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
 605
 606         mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
 607                   __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
 608                   (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
 609                   mpte->mpte_reinjectq ? 1 : 0,
 610                   mp_tp->mpt_state),
 611                  MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 612
 613         old_snd_nxt = mp_tp->mpt_sndnxt;
 614         while (mptcp_can_send_more(mp_tp)) {
 615                 /* get the "best" subflow to be used for transmission */
 616                 mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
 617                 if (mpts == NULL) {
 618                         mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
 619                             MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
 620                         break;
 621                 }
 622
 623                 mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
 624                     MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 625
 626                 /* In case there's just one flow, we reattempt later */
 627                 if (mpts_tried != NULL &&
 628                     (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
 629                         mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
 630                         mpts_tried->mpts_flags |= MPTSF_ACTIVE;
 631                         mptcp_start_timer(mpte, MPTT_REXMT);
 632                         mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
 633                             MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 634                         break;
 635                 }
 636
 637                 /*
 638                  * Automatic sizing of send socket buffer. Increase the send
 639                  * socket buffer size if all of the following criteria are met
 640                  *      1. the receiver has enough buffer space for this data
 641                  *      2. send buffer is filled to 7/8th with data (so we actually
 642                  *         have data to make use of it);
 643                  */
 644                 if (tcp_do_autosendbuf == 1 &&
 645                     (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
 646                     tcp_cansbgrow(&mp_so->so_snd)) {
 647                         if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
 648                             mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
 649                                 if (sbreserve(&mp_so->so_snd,
 650                                     min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
 651                                     tcp_autosndbuf_max)) == 1) {
 652                                         mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
 653
 654                                         mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
 655                                                   __func__, mp_so->so_snd.sb_hiwat,
 656                                                   mp_so->so_snd.sb_lowat),
 657                                                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 658                                 }
 659                         }
 660                 }
 661
 662                 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
 663                     struct socket *, mp_so);
 664                 error = mptcp_subflow_output(mpte, mpts, 0);
 665                 if (error) {
 666                         /* can be a temporary loss of source address or other error */
 667                         mpts->mpts_flags |= MPTSF_FAILINGOVER;
 668                         mpts->mpts_flags &= ~MPTSF_ACTIVE;
 669                         mpts_tried = mpts;
 670                         mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
 671                                   error, mpts->mpts_flags),
 672                                  MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
 673                         break;
 674                 }
 675                 /* The model is to have only one active flow at a time */
 676                 mpts->mpts_flags |= MPTSF_ACTIVE;
 677                 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
 678
 679                 /* Allows us to update the smoothed rtt */
 680                 if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
 681                         if (preferred_mpts->mpts_probesoon) {
 682                                 if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
 683                                         mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
 684                                         if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
 685                                                 preferred_mpts->mpts_probesoon = 0;
 686                                                 preferred_mpts->mpts_probecnt = 0;
 687                                         }
 688                                 }
 689                         } else {
 690                                 preferred_mpts->mpts_probesoon = tcp_now;
 691                                 preferred_mpts->mpts_probecnt = 0;
 692                         }
 693                 }
 694
 695                 if (mpte->mpte_active_sub == NULL) {
 696                         mpte->mpte_active_sub = mpts;
 697                 } else if (mpte->mpte_active_sub != mpts) {
 698                         struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 699                         struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);
 700
 701                         mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
 702                             mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
 703                             mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
 704                             (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);
 705
 706                         mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
 707                         mpte->mpte_active_sub = mpts;
 708
 709                         mptcpstats_inc_switch(mpte, mpts);
 710                 }
 711         }
 712
 713         mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
 714
 715         /* subflow errors should not be percolated back up */
 716         return (0);
 717 }
 718
 719
 720 static struct mptsub *
 721 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
 722 {
 723         struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 724
 725         /*
 726          * Lower RTT? Take it, if it's our first one, or
 727          * it doesn't has any loss, or the current one has
 728          * loss as well.
 729          */
 730         if (tp->t_srtt && *currtt > tp->t_srtt &&
 731             (curbest == NULL || tp->t_rxtshift == 0 ||
 732              sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
 733                 *currtt = tp->t_srtt;
 734                 return (mpts);
 735         }
 736
 737         /*
 738          * If we find a subflow without loss, take it always!
 739          */
 740         if (curbest &&
 741             sototcpcb(curbest->mpts_socket)->t_rxtshift &&
 742             tp->t_rxtshift == 0) {
 743                 *currtt = tp->t_srtt;
 744                 return (mpts);
 745         }
 746
 747         return (curbest != NULL ? curbest : mpts);
 748 }
 749
 750 static struct mptsub *
 751 mptcp_return_subflow(struct mptsub *mpts)
 752 {
 753         if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0)
 754                 return (NULL);
 755
 756         return (mpts);
 757 }
 758
/*
 * Return the most eligible subflow to be used for sending data.
 *
 * mpte      - the MPTCP session whose subflow list is scanned.
 * ignore    - a subflow to skip during the scan (may be NULL).
 * preferred - optional out-parameter. It is written only when both a
 *             non-cellular and a cellular candidate were found; it then
 *             receives the non-cellular candidate filtered through
 *             mptcp_return_subflow(). On all earlier return paths it is
 *             left untouched.
 *
 * Every subflow returned from here goes through mptcp_return_subflow(), so
 * the result is NULL when the selected subflow has no congestion-window
 * space left (or when no candidate was found at all).
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
{
        struct tcpcb *besttp, *secondtp;
        struct inpcb *bestinp, *secondinp;
        struct mptsub *mpts;
        /* best: non-cellular candidate; second_best: cellular candidate */
        struct mptsub *best = NULL;
        struct mptsub *second_best = NULL;
        /* RTTs tracked by mptcp_choose_subflow() for each of the two sets */
        int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

        /*
         * First Step:
         * Choose the best subflow for cellular and non-cellular interfaces.
         */

        TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
                struct socket *so = mpts->mpts_socket;
                struct tcpcb *tp = sototcpcb(so);
                struct inpcb *inp = sotoinpcb(so);

                mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
                          __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
                          INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
                          inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
                          tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
                          mptcp_subflow_cwnd_space(so)),
                          MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

                /*
                 * First, the hard conditions to reject subflows
                 * (e.g., not connected,...)
                 */
                if (mpts == ignore || inp->inp_last_outifp == NULL)
                        continue;

                if (INP_WAIT_FOR_IF_FEEDBACK(inp))
                        continue;

                /*
                 * There can only be one subflow in degraded state. It
                 * becomes `best' and the scan stops right here.
                 */
                if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
                        best = mpts;
                        break;
                }

                /*
                 * If this subflow is waiting to finally send, do it!
                 */
                if (so->so_flags1 & SOF1_PRECONNECT_DATA)
                        return (mptcp_return_subflow(mpts));

                /*
                 * Only send if the subflow is MP_CAPABLE. The exceptions to
                 * this rule (degraded or TFO) have been taken care of above.
                 */
                if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE))
                        continue;

                /* Skip subflows that are not in a connected TCP state */
                if ((so->so_state & SS_ISDISCONNECTED) ||
                    !(so->so_state & SS_ISCONNECTED) ||
                    !TCPS_HAVEESTABLISHED(tp->t_state) ||
                    tp->t_state > TCPS_CLOSE_WAIT)
                        continue;

                /*
                 * Second, the soft conditions to find the subflow with best
                 * conditions for each set (aka cellular vs non-cellular)
                 */
                if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
                        second_best = mptcp_choose_subflow(mpts, second_best,
                                                           &exp_rtt);
                else
                        best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
        }

        /*
         * If only one of the two sets produced a candidate (or none did),
         * there is nothing to arbitrate: return what we have.
         */
        if (best == NULL)
                return (mptcp_return_subflow(second_best));

        if (second_best == NULL)
                return (mptcp_return_subflow(best));

        besttp = sototcpcb(best->mpts_socket);
        bestinp = sotoinpcb(best->mpts_socket);
        secondtp = sototcpcb(second_best->mpts_socket);
        secondinp = sotoinpcb(second_best->mpts_socket);

        /* Report the non-cellular candidate to the caller, if requested */
        if (preferred != NULL)
                *preferred = mptcp_return_subflow(best);

        /*
         * Second Step: Among best and second_best. Choose the one that is
         * most appropriate for this particular service-type.
         */
        if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
                /*
                 * Only handover if Symptoms tells us to do so.
                 */
                if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
                    mptcp_is_wifi_unusable() &&
                    besttp->t_rxtshift >= mptcp_fail_thresh)
                        return (mptcp_return_subflow(second_best));

                return (mptcp_return_subflow(best));
        } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
                int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
                int rto_thresh = mptcp_rtothresh;

                /* Adjust with symptoms information */
                if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
                    mptcp_is_wifi_unusable()) {
                        rtt_thresh /= 2;
                        rto_thresh /= 2;
                }

                /* Compare RTTs, select second_best if only best exceeds the threshold */
                if (besttp->t_srtt && secondtp->t_srtt &&
                    besttp->t_srtt >= rtt_thresh &&
                    secondtp->t_srtt < rtt_thresh) {
                        tcpstat.tcps_mp_sel_rtt++;
                        mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d,  second cid %d at rtt %d\n", __func__,
                            best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
                            second_best->mpts_connid,
                            secondtp->t_srtt >> TCP_RTT_SHIFT),
                            MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
                        return (mptcp_return_subflow(second_best));
                }

                /* best is backing off heavily while second_best is not backing off at all */
                if (besttp->t_rxtshift >= mptcp_fail_thresh &&
                    secondtp->t_rxtshift == 0) {
                        return (mptcp_return_subflow(second_best));
                }

                /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
                if (besttp->t_rxtcur && secondtp->t_rxtcur &&
                    besttp->t_rxtcur >= rto_thresh &&
                    secondtp->t_rxtcur < rto_thresh) {
                        tcpstat.tcps_mp_sel_rto++;
                        mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
                            best->mpts_connid, besttp->t_rxtcur,
                            second_best->mpts_connid, secondtp->t_rxtcur),
                            MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);

                        return (mptcp_return_subflow(second_best));
                }

                /*
                 * None of the above conditions for sending on the secondary
                 * were true. So, let's schedule on the best one, if it still
                 * has some space in the congestion-window.
                 */
                return (mptcp_return_subflow(best));
        } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
                struct mptsub *tmp;

                /*
                 * We only care about RTT when aggregating: make `best' the
                 * one with the lower smoothed RTT, swapping if necessary.
                 */
                if (besttp->t_srtt > secondtp->t_srtt) {
                        tmp = best;
                        best = second_best;
                        besttp = secondtp;
                        bestinp = secondinp;

                        second_best = tmp;
                        secondtp = sototcpcb(second_best->mpts_socket);
                        secondinp = sotoinpcb(second_best->mpts_socket);
                }

                /* Is there still space in the congestion window? */
                if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0)
                        return (mptcp_return_subflow(second_best));

                return (mptcp_return_subflow(best));
        } else {
                panic("Unknown service-type configured for MPTCP");
        }

        /* Only reached if panic() returns */
        return (NULL);
}
 943
 944 static const char *
 945 mptcp_event_to_str(uint32_t event)
 946 {
 947         const char *c = "UNDEFINED";
 948         switch (event) {
 949         case MPCE_CLOSE:
 950                 c = "MPCE_CLOSE";
 951                 break;
 952         case MPCE_RECV_DATA_ACK:
 953                 c = "MPCE_RECV_DATA_ACK";
 954                 break;
 955         case MPCE_RECV_DATA_FIN:
 956                 c = "MPCE_RECV_DATA_FIN";
 957                 break;
 958         }
 959         return (c);
 960 }
 961
 962 static const char *
 963 mptcp_state_to_str(mptcp_state_t state)
 964 {
 965         const char *c = "UNDEFINED";
 966         switch (state) {
 967         case MPTCPS_CLOSED:
 968                 c = "MPTCPS_CLOSED";
 969                 break;
 970         case MPTCPS_LISTEN:
 971                 c = "MPTCPS_LISTEN";
 972                 break;
 973         case MPTCPS_ESTABLISHED:
 974                 c = "MPTCPS_ESTABLISHED";
 975                 break;
 976         case MPTCPS_CLOSE_WAIT:
 977                 c = "MPTCPS_CLOSE_WAIT";
 978                 break;
 979         case MPTCPS_FIN_WAIT_1:
 980                 c = "MPTCPS_FIN_WAIT_1";
 981                 break;
 982         case MPTCPS_CLOSING:
 983                 c = "MPTCPS_CLOSING";
 984                 break;
 985         case MPTCPS_LAST_ACK:
 986                 c = "MPTCPS_LAST_ACK";
 987                 break;
 988         case MPTCPS_FIN_WAIT_2:
 989                 c = "MPTCPS_FIN_WAIT_2";
 990                 break;
 991         case MPTCPS_TIME_WAIT:
 992                 c = "MPTCPS_TIME_WAIT";
 993                 break;
 994         case MPTCPS_TERMINATE:
 995                 c = "MPTCPS_TERMINATE";
 996                 break;
 997         }
 998         return (c);
 999 }
1000
1001 void
1002 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1003 {
1004         mpte_lock_assert_held(mp_tp->mpt_mpte);
1005         mptcp_state_t old_state = mp_tp->mpt_state;
1006
1007         DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1008             uint32_t, event);
1009
1010         switch (mp_tp->mpt_state) {
1011         case MPTCPS_CLOSED:
1012         case MPTCPS_LISTEN:
1013                 mp_tp->mpt_state = MPTCPS_CLOSED;
1014                 break;
1015
1016         case MPTCPS_ESTABLISHED:
1017                 if (event == MPCE_CLOSE) {
1018                         mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1019                         mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1020                 } else if (event == MPCE_RECV_DATA_FIN) {
1021                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1022                         mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1023                 }
1024                 break;
1025
1026         case MPTCPS_CLOSE_WAIT:
1027                 if (event == MPCE_CLOSE) {
1028                         mp_tp->mpt_state = MPTCPS_LAST_ACK;
1029                         mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1030                 }
1031                 break;
1032
1033         case MPTCPS_FIN_WAIT_1:
1034                 if (event == MPCE_RECV_DATA_ACK) {
1035                         mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1036                 } else if (event == MPCE_RECV_DATA_FIN) {
1037                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1038                         mp_tp->mpt_state = MPTCPS_CLOSING;
1039                 }
1040                 break;
1041
1042         case MPTCPS_CLOSING:
1043                 if (event == MPCE_RECV_DATA_ACK)
1044                         mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1045                 break;
1046
1047         case MPTCPS_LAST_ACK:
1048                 if (event == MPCE_RECV_DATA_ACK)
1049                         mptcp_close(mp_tp->mpt_mpte, mp_tp);
1050                 break;
1051
1052         case MPTCPS_FIN_WAIT_2:
1053                 if (event == MPCE_RECV_DATA_FIN) {
1054                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1055                         mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1056                 }
1057                 break;
1058
1059         case MPTCPS_TIME_WAIT:
1060         case MPTCPS_TERMINATE:
1061                 break;
1062
1063         default:
1064                 VERIFY(0);
1065                 /* NOTREACHED */
1066         }
1067         DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1068             uint32_t, event);
1069         mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1070             mptcp_state_to_str(old_state),
1071             mptcp_state_to_str(mp_tp->mpt_state),
1072             mptcp_event_to_str(event)),
1073             MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1074 }
1075
1076 /* If you change this function, match up mptcp_update_rcv_state_f */
1077 void
1078 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1079     uint16_t csum)
1080 {
1081         struct mptcb *mp_tp = tptomptp(tp);
1082         u_int64_t full_dsn = 0;
1083
1084         NTOHL(dss_info->mdss_dsn);
1085         NTOHL(dss_info->mdss_subflow_seqn);
1086         NTOHS(dss_info->mdss_data_len);
1087
1088         /* XXX for autosndbuf grow sb here */
1089         MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1090         mptcp_update_rcv_state_meat(mp_tp, tp,
1091             full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1092             csum);
1093
1094 }
1095
1096 void
1097 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1098     u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1099     uint16_t csum)
1100 {
1101         if (mdss_data_len == 0) {
1102                 mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
1103                     MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1104
1105                 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1106                         mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
1107                             csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1108                 }
1109                 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1110                 return;
1111         }
1112         mptcplog((LOG_DEBUG,
1113             "%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n", __func__,
1114             seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt),
1115             MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1116
1117         mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1118
1119         tp->t_rcv_map.mpt_dsn = full_dsn;
1120         tp->t_rcv_map.mpt_sseq = seqn;
1121         tp->t_rcv_map.mpt_len = mdss_data_len;
1122         tp->t_rcv_map.mpt_csum = csum;
1123         tp->t_mpflags |= TMPF_EMBED_DSN;
1124 }
1125
1126
1127 static int
1128 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1129     int hdrlen)
1130 {
1131         u_int32_t datalen;
1132
1133         if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
1134                 return 0;
1135
1136         datalen = m->m_pkthdr.mp_rlen;
1137
1138         /* unacceptable DSS option, fallback to TCP */
1139         if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1140                 mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
1141                     __func__, m->m_pkthdr.len, datalen),
1142                     MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1143         } else {
1144                 return 0;
1145         }
1146         tp->t_mpflags |= TMPF_SND_MPFAIL;
1147         mptcp_notify_mpfail(so);
1148         m_freem(m);
1149         return -1;
1150 }
1151
1152 int
1153 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1154                     int drop_hdrlen)
1155 {
1156         mptcp_insert_rmap(tp, m, th);
1157         if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1158             drop_hdrlen) != 0)
1159                 return -1;
1160         return 0;
1161 }
1162
1163 /*
1164  * MPTCP Checksum support
1165  * The checksum is calculated whenever the MPTCP DSS option is included
1166  * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
1167  * header and the actual data indicated by the length specified in the
1168  * DSS option.
1169  */
1170
1171 int
1172 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1173                     uint32_t sseq, uint16_t dlen, uint16_t csum, uint16_t dfin)
1174 {
1175         uint16_t mptcp_csum;
1176
1177         mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1178         if (mptcp_csum) {
1179                 tp->t_mpflags |= TMPF_SND_MPFAIL;
1180                 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1181                 m_freem(m);
1182                 tcpstat.tcps_mp_badcsum++;
1183                 return (-1);
1184         }
1185         return (0);
1186 }
1187
1188 static uint16_t
1189 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1190                  uint16_t dlen, uint16_t csum, uint16_t dfin)
1191 {
1192         struct mptcb *mp_tp = tptomptp(tp);
1193         uint16_t real_len = dlen - dfin;
1194         uint32_t sum = 0;
1195
1196         if (mp_tp == NULL)
1197                 return (0);
1198
1199         if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
1200                 return (0);
1201
1202         if (tp->t_mpflags & TMPF_TCP_FALLBACK)
1203                 return (0);
1204
1205         /*
1206          * The remote side may send a packet with fewer bytes than the
1207          * claimed DSS checksum length.
1208          */
1209         if ((int)m_length2(m, NULL) < real_len) {
1210                 return (0xffff);
1211         }
1212
1213         if (real_len != 0)
1214                 sum = m_sum16(m, 0, real_len);
1215
1216         sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1217         ADDCARRY(sum);
1218         DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1219             uint32_t, sum);
1220
1221         mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1222             MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1223         return (~sum & 0xffff);
1224 }
1225
1226 uint32_t
1227 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1228 {
1229         u_int32_t sum = 0;
1230
1231         if (dlen)
1232                 sum = m_sum16(m, 0, dlen);
1233
1234         dss_val = mptcp_hton64(dss_val);
1235         sseq = htonl(sseq);
1236         dlen = htons(dlen);
1237         sum += in_pseudo64(dss_val, sseq, dlen);
1238
1239         ADDCARRY(sum);
1240         sum = ~sum & 0xffff;
1241         DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1242         mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1243                   MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1244
1245         return sum;
1246 }
1247
1248 /*
1249  * When WiFi signal starts fading, there's more loss and RTT spikes.
1250  * Check if there has been a large spike by comparing against
1251  * a tolerable RTT spike threshold.
1252  */
1253 boolean_t
1254 mptcp_no_rto_spike(struct socket *so)
1255 {
1256         struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1257         int32_t spike = 0;
1258
1259         if (tp->t_rxtcur > mptcp_rtothresh) {
1260                 spike = tp->t_rxtcur - mptcp_rtothresh;
1261
1262                 mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1263                     __func__, spike,
1264                     tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1265                     tp->t_rttcur),
1266                     (MPTCP_SOCKET_DBG|MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1267
1268         }
1269
1270         if (spike > 0 ) {
1271                 return (FALSE);
1272         } else {
1273                 return (TRUE);
1274         }
1275 }
1276
/*
 * Leave the section identified by `flag' and run whatever work was
 * postponed meanwhile.
 *
 * `flag' must currently be set in mpp->mpp_flags (this is VERIFY'd); it is
 * cleared first. If mptcp_should_defer_upcall() still asks for deferral,
 * nothing else happens. Otherwise each pending MPP_SHOULD_* / MPP_*_CELLICON
 * request is cleared and acted upon, in the order below.
 */
void
mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
{
        VERIFY(mpp->mpp_flags & flag);
        mpp->mpp_flags &= ~flag;

        /* Still inside another deferring section: leave the work pending */
        if (mptcp_should_defer_upcall(mpp))
                return;

        /*
         * Each request bit is cleared before its handler is invoked, and
         * mpp_flags is re-read for every check.
         */
        if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
                mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;

                mptcp_subflow_workloop(mpp->mpp_pcbe);
        }

        if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
                mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;

                sorwakeup(mpp->mpp_socket);
        }

        if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
                mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;

                sowwakeup(mpp->mpp_socket);
        }

        if (mpp->mpp_flags & MPP_SET_CELLICON) {
                mpp->mpp_flags &= ~MPP_SET_CELLICON;

                mptcp_set_cellicon(mpp->mpp_pcbe);
        }

        if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
                mpp->mpp_flags &= ~MPP_UNSET_CELLICON;

                mptcp_unset_cellicon();
        }
}
1316
/*
 * Post a KEV_INET6_REQUEST_NAT64_PREFIX message for interface `ifp' and
 * log that the request was made.
 */
static void
mptcp_ask_for_nat64(struct ifnet *ifp)
{
        in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);

        mptcplog((LOG_DEBUG, "%s: asked for NAT64-prefix on %s\n",
                 __func__, ifp->if_name), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}
1325
1326 static void
1327 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1328 {
1329         info->ifindex = 0;
1330         info->has_v4_conn = 0;
1331         info->has_v6_conn = 0;
1332 }
1333
1334 void
1335 mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
1336 {
1337         struct mppcb *mp = (struct mppcb *)handle;
1338         struct mptses *mpte = mptompte(mp);
1339         struct socket *mp_so;
1340         struct mptcb *mp_tp;
1341         int locked = 0;
1342         uint32_t i, ifindex;
1343
1344         ifindex = flow->interface_index;
1345         VERIFY(ifindex != IFSCOPE_NONE);
1346
1347         /* ToDo - remove after rdar://problem/32007628 */
1348         if (!IF_INDEX_IN_RANGE(ifindex))
1349                 printf("%s 1 ifindex %u not in range of flow %p action %d\n",
1350                        __func__, ifindex, flow, action);
1351
1352         /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1353         if (mp->mpp_socket->so_usecount == 0)
1354                 return;
1355
1356         if (action != NECP_CLIENT_CBACTION_INITIAL) {
1357                 mpte_lock(mpte);
1358                 locked = 1;
1359
1360                 /* Check again, because it might have changed while waiting */
1361                 if (mp->mpp_socket->so_usecount == 0)
1362                         goto out;
1363         }
1364
1365         mp_tp = mpte->mpte_mptcb;
1366         mp_so = mptetoso(mpte);
1367
1368         mptcplog((LOG_DEBUG, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u\n",
1369                  __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state),
1370                  MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1371
1372         /* No need on fallen back sockets */
1373         if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
1374                 goto out;
1375
1376         if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
1377                 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1378                         if (mpte->mpte_itfinfo[i].ifindex == ifindex)
1379                                 mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1380                 }
1381
1382                 mptcp_sched_create_subflows(mpte);
1383         } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1384                    action == NECP_CLIENT_CBACTION_INITIAL) {
1385                 int found_empty = 0, empty_index = -1;
1386                 struct ifnet *ifp;
1387
1388                 /* ToDo - remove after rdar://problem/32007628 */
1389                 if (!IF_INDEX_IN_RANGE(ifindex))
1390                         printf("%s 2 ifindex %u not in range of flow %p action %d\n",
1391                                __func__, ifindex, flow, action);
1392
1393                 ifnet_head_lock_shared();
1394                 ifp = ifindex2ifnet[ifindex];
1395                 ifnet_head_done();
1396
1397                 /* ToDo - remove after rdar://problem/32007628 */
1398                 if (!IF_INDEX_IN_RANGE(ifindex))
1399                         printf("%s 3 ifindex %u not in range of flow %p action %d\n",
1400                                __func__, ifindex, flow, action);
1401
1402                 if (ifp == NULL)
1403                         goto out;
1404
1405                 if (IFNET_IS_EXPENSIVE(ifp) &&
1406                     (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
1407                         goto out;
1408
1409                 if (IFNET_IS_CELLULAR(ifp) &&
1410                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
1411                         goto out;
1412
1413                 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1414                         if (mpte->mpte_itfinfo[i].ifindex == 0) {
1415                                 found_empty = 1;
1416                                 empty_index = i;
1417                         }
1418
1419                         if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1420                                 /* Ok, it's already there */
1421                                 goto out;
1422                         }
1423                 }
1424
1425                 if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
1426                     !(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4) &&
1427                     ifnet_get_nat64prefix(ifp, NULL) == ENOENT) {
1428                         mptcp_ask_for_nat64(ifp);
1429                         goto out;
1430                 }
1431
1432                 if (found_empty == 0) {
1433                         int new_size = mpte->mpte_itfinfo_size * 2;
1434                         struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
1435
1436                         if (info == NULL) {
1437                                 mptcplog((LOG_ERR, "%s malloc failed for %u\n", __func__, new_size),
1438                                          MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1439                                 goto out;
1440                         }
1441
1442                         memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1443
1444                         if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
1445                                 _FREE(mpte->mpte_itfinfo, M_TEMP);
1446
1447                         /* We allocated a new one, thus the first must be empty */
1448                         empty_index = mpte->mpte_itfinfo_size;
1449
1450                         mpte->mpte_itfinfo = info;
1451                         mpte->mpte_itfinfo_size = new_size;
1452
1453                         mptcplog((LOG_DEBUG, "%s Needed to realloc to %u\n", __func__, new_size),
1454                             MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1455                 }
1456
1457                 VERIFY(empty_index >= 0 && empty_index < (int)mpte->mpte_itfinfo_size);
1458                 mpte->mpte_itfinfo[empty_index].ifindex = ifindex;
1459                 mpte->mpte_itfinfo[empty_index].has_v4_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1460                 mpte->mpte_itfinfo[empty_index].has_v6_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1461
1462                 mptcp_sched_create_subflows(mpte);
1463         }
1464
1465 out:
1466         if (locked)
1467                 mpte_unlock(mpte);
1468 }
1469
1470 void
1471 mptcp_set_restrictions(struct socket *mp_so)
1472 {
1473         struct mptses *mpte = mpsotompte(mp_so);
1474         uint32_t i;
1475
1476         mpte_lock_assert_held(mpte);
1477
1478         ifnet_head_lock_shared();
1479
1480         for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1481                 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1482                 uint32_t ifindex = info->ifindex;
1483                 struct ifnet *ifp;
1484
1485                 if (ifindex == IFSCOPE_NONE)
1486                         continue;
1487
1488                 ifp = ifindex2ifnet[ifindex];
1489
1490                 if (IFNET_IS_EXPENSIVE(ifp) &&
1491                     (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
1492                         info->ifindex = IFSCOPE_NONE;
1493
1494                 if (IFNET_IS_CELLULAR(ifp) &&
1495                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
1496                         info->ifindex = IFSCOPE_NONE;
1497         }
1498
1499         ifnet_head_done();
1500 }
1501