1 /*
2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * A note on the MPTCP/NECP-interactions:
31 *
32 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
33 * MPTCP registers to these events at the MPTCP-layer for interface-events
34 * through a call to necp_client_register_multipath_cb.
35 * To get per-flow events (aka per TCP-subflow), we register to it with
36 * necp_client_register_socket_flow. Both registrations happen by using the
37 * necp-client-uuid that comes from the app.
38 *
39 * The locking is rather tricky. In general, we expect the lock-ordering to
40 * happen from necp-fd -> necp-client -> mpp_lock.
41 *
42 * There are however some subtleties.
43 *
44 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
45 * safe, because it is the very first time this MPTCP-connection goes into NECP.
46 * As we go into NECP we take the NECP-locks and are thus guaranteed that no
47 * NECP-event will deadlock us, because these NECP-events will also first take
48 * the NECP-locks. Either they win the race and thus won't find our
49 * MPTCP-connection, or MPTCP wins the race and safely installs the callbacks
50 * while holding the NECP lock.
51 *
52 * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This
53 * is because we have already registered callbacks and might race against an
54 * NECP-event that will match on our socket, so we have to unlock to be safe.
55 *
56 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), once the
57 * so_usecount has reached 0. We must be careful not to remove the mpp_socket
58 * pointers before we have unregistered the callback, because again we might be
59 * racing against an NECP-event. Unregistering must happen with an unlocked
60 * mpp_lock, because of the lock-ordering constraint. It could be that an
61 * NECP-event triggers before we had a chance to unregister. That's why
62 * we need to check the so_usecount in mptcp_session_necp_cb. If we get
63 * there while the socket is being garbage-collected, the use-count will go
64 * down to 0 and we exit. Removal of the multipath_cb again happens by taking
65 * the NECP-locks, so any running NECP-events will finish first and exit cleanly.
66 *
67 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
68 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
69 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
70 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
71 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
72 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
73 * gets it, it will realize that the subflow became non-MPTCP and retry (see
74 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
75 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
76 * for the NECP-lock (held by the other thread that is taking care of the NECP-
77 * event). So, the event now finally gets the subflow-lock and then hits an
78 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
79 * the NECP callback.
80 */
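
/*
 * Illustrative sketch only (not part of the original sources): the shape a
 * subflow-callback registration has to take in order to respect the
 * lock-ordering described in point 2 above. example_register_subflow_cb() is
 * a hypothetical name and the arguments to necp_client_register_socket_flow()
 * are elided; the point is only the drop-then-retake pattern around mpp_lock,
 * so that the necp-fd -> necp-client -> mpp_lock order is never inverted.
 *
 *	static int
 *	example_register_subflow_cb(struct mptses *mpte, struct socket *so)
 *	{
 *		int error;
 *
 *		mpte_unlock(mpte);	// give up mpp_lock before entering NECP
 *		error = necp_client_register_socket_flow( ...elided... );
 *		mpte_lock(mpte);	// retake it once NECP has returned
 *
 *		return error;
 *	}
 */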
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/mbuf.h>
86 #include <sys/mcache.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/syslog.h>
90 #include <sys/protosw.h>
91
92 #include <kern/zalloc.h>
93 #include <kern/locks.h>
94
95 #include <mach/sdt.h>
96
97 #include <net/if.h>
98 #include <netinet/in.h>
99 #include <netinet/in_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103 #include <netinet/tcp_var.h>
104 #include <netinet/mptcp_var.h>
105 #include <netinet/mptcp.h>
106 #include <netinet/mptcp_seq.h>
107 #include <netinet/mptcp_opt.h>
108 #include <netinet/mptcp_timer.h>
109
110 int mptcp_enable = 1;
111 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
112 &mptcp_enable, 0, "Enable Multipath TCP Support");
113
114 /* Number of times to try negotiating MPTCP on SYN retransmissions */
115 int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
116 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
117 CTLFLAG_RW | CTLFLAG_LOCKED,
118 &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
119
120 /*
121 * By default, DSS checksum is turned off; revisit if we ever do
122 * MPTCP for non-SSL traffic.
123 */
124 int mptcp_dss_csum = 0;
125 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
126 &mptcp_dss_csum, 0, "Enable DSS checksum");
127
128 /*
129 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
130 * is attempted on a different path.
131 */
132 int mptcp_fail_thresh = 1;
133 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
134 &mptcp_fail_thresh, 0, "Failover threshold");
135
136
137 /*
138 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
139 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
140 * Some carrier networks have a timeout of 10 or 15 minutes.
141 */
142 int mptcp_subflow_keeptime = 60 * 14;
143 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
144 &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
145
146 int mptcp_rtthist_rtthresh = 600;
147 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
148 &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
149
150 /*
151 * Use RTO history for sending new data
152 */
153 int mptcp_use_rto = 1;
154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
155 &mptcp_use_rto, 0, "Disable RTO for subflow selection");
156
157 int mptcp_rtothresh = 1500;
158 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
159 &mptcp_rtothresh, 0, "RTO threshold");
160
161 /*
162 * Probe the preferred path, when it is not in use
163 */
164 uint32_t mptcp_probeto = 1000;
165 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
166 &mptcp_probeto, 0, "Disable probing by setting to 0");
167
168 uint32_t mptcp_probecnt = 5;
169 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
170 &mptcp_probecnt, 0, "Number of probe writes");
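
/*
 * The tunables above are all attached to the net.inet.mptcp sysctl node:
 * enable, mptcp_cap_retr, dss_csum, fail, keepalive, rtthist_thresh, userto,
 * rto_thresh, probeto and probecnt. A hedged user-space example (standard
 * sysctl(8) usage; the value shown is simply the default from this file):
 *
 *	$ sysctl net.inet.mptcp.enable
 *	net.inet.mptcp.enable: 1
 *	$ sudo sysctl net.inet.mptcp.probeto=2000
 */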
171
172 /*
173 * Static declarations
174 */
175 static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
176 uint32_t, uint16_t, uint16_t, uint16_t);
177
178 static int
179 mptcp_reass_present(struct socket *mp_so)
180 {
181 struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
182 struct tseg_qent *q;
183 int dowakeup = 0;
184 int flags = 0;
185
186 /*
187 * Present data to user, advancing rcv_nxt through
188 * completed sequence space.
189 */
190 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
191 return flags;
192 }
193 q = LIST_FIRST(&mp_tp->mpt_segq);
194 if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
195 return flags;
196 }
197
198 /*
199 * If there is already another thread doing reassembly for this
200 * connection, it is better to let it finish the job --
201 * (radar 16316196)
202 */
203 if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
204 return flags;
205 }
206
207 mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
208
209 do {
210 mp_tp->mpt_rcvnxt += q->tqe_len;
211 LIST_REMOVE(q, tqe_q);
212 if (mp_so->so_state & SS_CANTRCVMORE) {
213 m_freem(q->tqe_m);
214 } else {
215 flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
216 if (sbappendstream_rcvdemux(mp_so, q->tqe_m, 0, 0)) {
217 dowakeup = 1;
218 }
219 }
220 zfree(tcp_reass_zone, q);
221 mp_tp->mpt_reassqlen--;
222 q = LIST_FIRST(&mp_tp->mpt_segq);
223 } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
224 mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
225
226 if (dowakeup) {
227 sorwakeup(mp_so); /* done with socket lock held */
228 }
229 return flags;
230 }
231
232 static int
233 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
234 {
235 struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
236 u_int64_t mb_dsn = phdr->mp_dsn;
237 struct tseg_qent *q;
238 struct tseg_qent *p = NULL;
239 struct tseg_qent *nq;
240 struct tseg_qent *te = NULL;
241 u_int16_t qlimit;
242
243 /*
244 * Limit the number of segments in the reassembly queue to prevent
245 * holding on to too many segments (and thus running out of mbufs).
246 * Make sure to let through the missing segment that caused this
247 * queue to build up. Always keep one global queue entry spare to be
248 * able to process the missing segment.
249 */
250 qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
251 (tcp_autorcvbuf_max >> 10));
252 if (mb_dsn != mp_tp->mpt_rcvnxt &&
253 (mp_tp->mpt_reassqlen + 1) >= qlimit) {
254 tcpstat.tcps_mptcp_rcvmemdrop++;
255 m_freem(m);
256 *tlenp = 0;
257 return 0;
258 }
259
260 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
261 te = (struct tseg_qent *) zalloc(tcp_reass_zone);
262 if (te == NULL) {
263 tcpstat.tcps_mptcp_rcvmemdrop++;
264 m_freem(m);
265 return 0;
266 }
267
268 mp_tp->mpt_reassqlen++;
269
270 /*
271 * Find a segment which begins after this one does.
272 */
273 LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
274 if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
275 break;
276 }
277 p = q;
278 }
279
280 /*
281 * If there is a preceding segment, it may provide some of
282 * our data already. If so, drop the data from the incoming
283 * segment. If it provides all of our data, drop us.
284 */
285 if (p != NULL) {
286 int64_t i;
287 /* conversion to int64_t (in i) handles seq wraparound */
288 i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
289 if (i > 0) {
290 if (i >= *tlenp) {
291 tcpstat.tcps_mptcp_rcvduppack++;
292 m_freem(m);
293 zfree(tcp_reass_zone, te);
294 te = NULL;
295 mp_tp->mpt_reassqlen--;
296 /*
297 * Try to present any queued data
298 * at the left window edge to the user.
299 * This is needed after the 3-WHS
300 * completes.
301 */
302 goto out;
303 }
304 m_adj(m, i);
305 *tlenp -= i;
306 phdr->mp_dsn += i;
307 }
308 }
309
310 tcpstat.tcps_mp_oodata++;
311
312 /*
313 * While we overlap succeeding segments trim them or,
314 * if they are completely covered, dequeue them.
315 */
316 while (q) {
317 int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
318 if (i <= 0) {
319 break;
320 }
321
322 if (i < q->tqe_len) {
323 q->tqe_m->m_pkthdr.mp_dsn += i;
324 q->tqe_len -= i;
325 m_adj(q->tqe_m, i);
326 break;
327 }
328
329 nq = LIST_NEXT(q, tqe_q);
330 LIST_REMOVE(q, tqe_q);
331 m_freem(q->tqe_m);
332 zfree(tcp_reass_zone, q);
333 mp_tp->mpt_reassqlen--;
334 q = nq;
335 }
336
337 /* Insert the new segment queue entry into place. */
338 te->tqe_m = m;
339 te->tqe_th = NULL;
340 te->tqe_len = *tlenp;
341
342 if (p == NULL) {
343 LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
344 } else {
345 LIST_INSERT_AFTER(p, te, tqe_q);
346 }
347
348 out:
349 return mptcp_reass_present(mp_so);
350 }
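
/*
 * Worked example for the qlimit clamp in mptcp_reass() above. The values are
 * assumptions for illustration, not defaults taken from this file:
 *
 *	sb_hiwat           = 131072  ->  sb_hiwat >> 10           = 128
 *	tcp_autorcvbuf_max = 2097152 ->  tcp_autorcvbuf_max >> 10 = 2048
 *	qlimit = min(max(100, 128), 2048) = 128
 *
 * So at most 128 out-of-order segments get queued; an arriving segment that
 * does not fill the hole (mb_dsn != mpt_rcvnxt) is dropped once
 * mpt_reassqlen + 1 >= qlimit, which keeps a spare entry for the missing
 * segment itself.
 */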
351
352 /*
353 * MPTCP input, called when data has been read from a subflow socket.
354 */
355 void
356 mptcp_input(struct mptses *mpte, struct mbuf *m)
357 {
358 struct socket *mp_so;
359 struct mptcb *mp_tp = NULL;
360 int count = 0, wakeup = 0;
361 struct mbuf *save = NULL, *prev = NULL;
362 struct mbuf *freelist = NULL, *tail = NULL;
363
364 VERIFY(m->m_flags & M_PKTHDR);
365
366 mpte_lock_assert_held(mpte); /* same as MP socket lock */
367
368 mp_so = mptetoso(mpte);
369 mp_tp = mpte->mpte_mptcb;
370
371 DTRACE_MPTCP(input);
372
373 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
374
375 /*
376 * Each mbuf contains MPTCP Data Sequence Map
377 * Process the data for reassembly, delivery to MPTCP socket
378 * client, etc.
379 *
380 */
381 count = mp_so->so_rcv.sb_cc;
382
383 /*
384 * In the degraded fallback case, data is accepted without DSS map
385 */
386 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
387 struct mbuf *iter;
388 int mb_dfin = 0;
389 fallback:
390 mptcp_sbrcv_grow(mp_tp);
391
392 iter = m;
393 while (iter) {
394 if ((iter->m_flags & M_PKTHDR) &&
395 (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
396 mb_dfin = 1;
397 }
398
399 if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
400 /* Don't add zero-length packets, so skip this one. */
401 if (prev == NULL) {
402 m = iter->m_next;
403 m_free(iter);
404 iter = m;
405 } else {
406 prev->m_next = iter->m_next;
407 m_free(iter);
408 iter = prev->m_next;
409 }
410
411 /* It was a zero-length packet so next one must be a pkthdr */
412 VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
413 } else {
414 prev = iter;
415 iter = iter->m_next;
416 }
417 }
418
419 /*
420 * assume degraded flow as this may be the first packet
421 * without DSS, and the subflow state is not updated yet.
422 */
423 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
424 sorwakeup(mp_so);
425 }
426
427 DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
428 struct socket *, mp_so,
429 struct sockbuf *, &mp_so->so_rcv,
430 struct sockbuf *, &mp_so->so_snd,
431 struct mptses *, mpte);
432 count = mp_so->so_rcv.sb_cc - count;
433
434 mp_tp->mpt_rcvnxt += count;
435
436 if (mb_dfin) {
437 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
438 socantrcvmore(mp_so);
439 }
440
441 mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
442 count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
443 return;
444 }
445
446 do {
447 u_int64_t mb_dsn;
448 int32_t mb_datalen;
449 int64_t todrop;
450 int mb_dfin = 0;
451
452 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
453 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
454 goto fallback;
455 }
456
457 save = m->m_next;
458 /*
459 * A single TCP packet formed of multiple mbufs
460 * holds DSS mapping in the first mbuf of the chain.
461 * Other mbufs in the chain may have M_PKTHDR set
462 * even though they belong to the same TCP packet
463 * and therefore use the DSS mapping stored in the
464 * first mbuf of the mbuf chain. mptcp_input() can
465 * get an mbuf chain with multiple TCP packets.
466 */
467 while (save && (!(save->m_flags & M_PKTHDR) ||
468 !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
469 prev = save;
470 save = save->m_next;
471 }
472 if (prev) {
473 prev->m_next = NULL;
474 } else {
475 m->m_next = NULL;
476 }
477
478 mb_dsn = m->m_pkthdr.mp_dsn;
479 mb_datalen = m->m_pkthdr.mp_rlen;
480
481 todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
482 if (todrop > 0) {
483 tcpstat.tcps_mptcp_rcvpackafterwin++;
484
485 if (todrop >= mb_datalen) {
486 if (freelist == NULL) {
487 freelist = m;
488 } else {
489 tail->m_next = m;
490 }
491
492 if (prev != NULL) {
493 tail = prev;
494 } else {
495 tail = m;
496 }
497
498 m = save;
499 prev = save = NULL;
500 continue;
501 } else {
502 m_adj(m, -todrop);
503 mb_datalen -= todrop;
504 }
505
506 /*
507 * We drop from the right edge of the mbuf, thus the
508 * DATA_FIN is dropped as well
509 */
510 m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
511 }
512
513
514 if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
515 if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
516 mp_tp->mpt_rcvnxt)) {
517 if (freelist == NULL) {
518 freelist = m;
519 } else {
520 tail->m_next = m;
521 }
522
523 if (prev != NULL) {
524 tail = prev;
525 } else {
526 tail = m;
527 }
528
529 m = save;
530 prev = save = NULL;
531 continue;
532 } else {
533 m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
534 }
535 mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
536 mp_tp->mpt_rcvnxt),
537 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
538 }
539
540 if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
541 !LIST_EMPTY(&mp_tp->mpt_segq)) {
542 mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
543
544 goto next;
545 }
546 mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
547
548 mptcp_sbrcv_grow(mp_tp);
549
550 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
551 wakeup = 1;
552 }
553
554 DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
555 struct sockbuf *, &mp_so->so_rcv,
556 struct sockbuf *, &mp_so->so_snd,
557 struct mptses *, mpte,
558 struct mptcb *, mp_tp);
559 count = mp_so->so_rcv.sb_cc - count;
560 tcpstat.tcps_mp_rcvtotal++;
561 tcpstat.tcps_mp_rcvbytes += count;
562 mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
563 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
564
565 mp_tp->mpt_rcvnxt += count;
566
567 next:
568 if (mb_dfin) {
569 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
570 socantrcvmore(mp_so);
571 }
572 m = save;
573 prev = save = NULL;
574 count = mp_so->so_rcv.sb_cc;
575 } while (m);
576
577 if (freelist) {
578 m_freem(freelist);
579 }
580
581 if (wakeup) {
582 sorwakeup(mp_so);
583 }
584 }
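
/*
 * Illustrative picture (not from the original sources) of how mptcp_input()
 * above walks its mbuf chain: the DSS map of a TCP segment lives in the first
 * mbuf that carries both M_PKTHDR and PKTF_MPTCP, and the inner while-loop
 * cuts the chain right before the next such mbuf, so each pass of the outer
 * do-loop handles exactly one mapping.
 *
 *	m -> [PKTHDR|MPTCP, mp_dsn=100,  mp_rlen=3000] -> [cont] -> [cont]
 *	  -> [PKTHDR|MPTCP, mp_dsn=3100, mp_rlen=1448] -> [cont]
 *	     ^ 'save' points here, handled on the next pass
 */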
585
586 boolean_t
587 mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
588 {
589 struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
590
591 /*
592 * Always send if there is data in the reinject-queue.
593 */
594 if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
595 return TRUE;
596 }
597
598 /*
599 * Don't send, if:
600 *
601 * 1. snd_nxt >= snd_max: basically, everything has been sent. Except
602 * when using TFO, where we might be doing a 0-byte write.
603 * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
604 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
605 */
606
607 if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
608 return FALSE;
609 }
610
611 if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
612 return FALSE;
613 }
614
615 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
616 return FALSE;
617 }
618
619 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
620 return FALSE;
621 }
622
623 return TRUE;
624 }
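
/*
 * Worked example (hypothetical numbers) for the checks in
 * mptcp_can_send_more() above, ignoring the reinject-queue and TFO cases:
 *
 *	mpt_snduna = 1000, mpt_sndnxt = 1400, mpt_sndmax = 2000, mpt_sndwnd = 300
 *
 *	check 1: sndnxt (1400) < sndmax (2000)        -> data is still pending
 *	check 2: snduna + sndwnd = 1300 <= sndnxt     -> receiver window is full,
 *	                                                 return FALSE
 *
 * With mpt_sndwnd = 1000 instead, snduna + sndwnd = 2000 > sndnxt, so the
 * function goes on to the DATA_FIN and state checks and returns TRUE.
 */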
625
626 /*
627 * MPTCP output.
628 */
629 int
630 mptcp_output(struct mptses *mpte)
631 {
632 struct mptcb *mp_tp;
633 struct mptsub *mpts;
634 struct mptsub *mpts_tried = NULL;
635 struct socket *mp_so;
636 struct mptsub *preferred_mpts = NULL;
637 uint64_t old_snd_nxt;
638 int error = 0;
639
640 mpte_lock_assert_held(mpte);
641 mp_so = mptetoso(mpte);
642 mp_tp = mpte->mpte_mptcb;
643
644 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
645 mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
646
647 mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
648 __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
649 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
650 mpte->mpte_reinjectq ? 1 : 0,
651 mp_tp->mpt_state),
652 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
653
654 old_snd_nxt = mp_tp->mpt_sndnxt;
655 while (mptcp_can_send_more(mp_tp, FALSE)) {
656 /* get the "best" subflow to be used for transmission */
657 mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
658 if (mpts == NULL) {
659 mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
660 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
661 break;
662 }
663
664 mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
665 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
666
667 /* In case there's just one flow, we reattempt later */
668 if (mpts_tried != NULL &&
669 (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
670 mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
671 mpts_tried->mpts_flags |= MPTSF_ACTIVE;
672 mptcp_start_timer(mpte, MPTT_REXMT);
673 mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
674 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
675 break;
676 }
677
678 /*
679 * Automatic sizing of send socket buffer. Increase the send
680 * socket buffer size if all of the following criteria are met:
681 * 1. the receiver has enough buffer space for this data, and
682 * 2. the send buffer is filled to 7/8th with data (so we actually
683 * have data to make use of it).
684 */
685 if (tcp_do_autosendbuf == 1 &&
686 (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
687 tcp_cansbgrow(&mp_so->so_snd)) {
688 if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
689 mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
690 if (sbreserve(&mp_so->so_snd,
691 min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
692 tcp_autosndbuf_max)) == 1) {
693 mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
694
695 mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
696 __func__, mp_so->so_snd.sb_hiwat,
697 mp_so->so_snd.sb_lowat),
698 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
699 }
700 }
701 }
702
703 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
704 struct socket *, mp_so);
705 error = mptcp_subflow_output(mpte, mpts, 0);
706 if (error) {
707 /* can be a temporary loss of source address or other error */
708 mpts->mpts_flags |= MPTSF_FAILINGOVER;
709 mpts->mpts_flags &= ~MPTSF_ACTIVE;
710 mpts_tried = mpts;
711 if (error != ECANCELED) {
712 mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
713 error, mpts->mpts_flags),
714 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
715 }
716 break;
717 }
718 /* The model is to have only one active flow at a time */
719 mpts->mpts_flags |= MPTSF_ACTIVE;
720 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
721
722 /* Allows us to update the smoothed rtt */
723 if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
724 if (preferred_mpts->mpts_probesoon) {
725 if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
726 mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
727 if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
728 preferred_mpts->mpts_probesoon = 0;
729 preferred_mpts->mpts_probecnt = 0;
730 }
731 }
732 } else {
733 preferred_mpts->mpts_probesoon = tcp_now;
734 preferred_mpts->mpts_probecnt = 0;
735 }
736 }
737
738 if (mpte->mpte_active_sub == NULL) {
739 mpte->mpte_active_sub = mpts;
740 } else if (mpte->mpte_active_sub != mpts) {
741 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
742 struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);
743
744 mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
745 mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
746 mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
747 (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);
748
749 mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
750 mpte->mpte_active_sub = mpts;
751
752 mptcpstats_inc_switch(mpte, mpts);
753 }
754 }
755
756 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
757 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
758 mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
759 mptcp_finish_usrclosed(mpte);
760 }
761 }
762
763 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
764
765 /* subflow errors should not be percolated back up */
766 return 0;
767 }
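
/*
 * Worked example (hypothetical numbers) for the send-buffer autosizing
 * condition inside mptcp_output() above:
 *
 *	sb_hiwat = 65536, mpt_sndwnd = 60000, sb_cc = 60000
 *
 *	mpt_sndwnd / 4 * 5 = 75000 >= sb_hiwat (65536)  -> receiver can take more
 *	sb_cc (60000) >= sb_hiwat / 8 * 7 (57344)       -> buffer is 7/8th full
 *
 * Both criteria hold, so sbreserve() grows the buffer by tcp_autosndbuf_inc
 * (capped at tcp_autosndbuf_max) and sb_idealsize follows the new sb_hiwat.
 */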
768
769
770 static struct mptsub *
771 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
772 {
773 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
774
775 /*
776 * Lower RTT? Take it, if it's our first one, or
777 * it doesn't have any loss, or the current one has
778 * loss as well.
779 */
780 if (tp->t_srtt && *currtt > tp->t_srtt &&
781 (curbest == NULL || tp->t_rxtshift == 0 ||
782 sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
783 *currtt = tp->t_srtt;
784 return mpts;
785 }
786
787 /*
788 * If we find a subflow without loss, take it always!
789 */
790 if (curbest &&
791 sototcpcb(curbest->mpts_socket)->t_rxtshift &&
792 tp->t_rxtshift == 0) {
793 *currtt = tp->t_srtt;
794 return mpts;
795 }
796
797 return curbest != NULL ? curbest : mpts;
798 }
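
/*
 * Illustrative example (hypothetical values) of the preference rule in
 * mptcp_choose_subflow() above:
 *
 *	curbest: t_srtt ~ 200 ms, t_rxtshift = 0  (no recent loss)
 *	mpts:    t_srtt ~ 100 ms, t_rxtshift = 2  (recent loss)
 *
 * Although mpts has the lower smoothed RTT, it is not chosen: the first test
 * requires the candidate to be loss-free or curbest to be lossy, and the
 * second test only promotes a loss-free candidate over a lossy curbest.
 * curbest is kept, so stability wins over raw RTT.
 */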
799
800 static struct mptsub *
801 mptcp_return_subflow(struct mptsub *mpts)
802 {
803 if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
804 return NULL;
805 }
806
807 return mpts;
808 }
809
810 /*
811 * Return the most eligible subflow to be used for sending data.
812 */
813 struct mptsub *
814 mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
815 {
816 struct tcpcb *besttp, *secondtp;
817 struct inpcb *bestinp, *secondinp;
818 struct mptsub *mpts;
819 struct mptsub *best = NULL;
820 struct mptsub *second_best = NULL;
821 int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;
822
823 /*
824 * First Step:
825 * Choose the best subflow for cellular and non-cellular interfaces.
826 */
827
828 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
829 struct socket *so = mpts->mpts_socket;
830 struct tcpcb *tp = sototcpcb(so);
831 struct inpcb *inp = sotoinpcb(so);
832
833 mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
834 __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
835 INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
836 inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
837 tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
838 mptcp_subflow_cwnd_space(so)),
839 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
840
841 /*
842 * First, the hard conditions to reject subflows
843 * (e.g., not connected,...)
844 */
845 if (mpts == ignore || inp->inp_last_outifp == NULL) {
846 continue;
847 }
848
849 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
850 continue;
851 }
852
853 /* There can only be one subflow in degraded state */
854 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
855 best = mpts;
856 break;
857 }
858
859 /*
860 * If this subflow is waiting to finally send, do it!
861 */
862 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
863 return mptcp_return_subflow(mpts);
864 }
865
866 /*
867 * Only send if the subflow is MP_CAPABLE. The exceptions to
868 * this rule (degraded or TFO) have been taken care of above.
869 */
870 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
871 continue;
872 }
873
874 if ((so->so_state & SS_ISDISCONNECTED) ||
875 !(so->so_state & SS_ISCONNECTED) ||
876 !TCPS_HAVEESTABLISHED(tp->t_state) ||
877 tp->t_state > TCPS_CLOSE_WAIT) {
878 continue;
879 }
880
881 /*
882 * Second, the soft conditions to find the subflow with best
883 * conditions for each set (aka cellular vs non-cellular)
884 */
885 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
886 second_best = mptcp_choose_subflow(mpts, second_best,
887 &exp_rtt);
888 } else {
889 best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
890 }
891 }
892
893 /*
894 * If there is no preferred or backup subflow, and there is no active
895 * subflow, use the last usable subflow.
896 */
897 if (best == NULL) {
898 return mptcp_return_subflow(second_best);
899 }
900
901 if (second_best == NULL) {
902 return mptcp_return_subflow(best);
903 }
904
905 besttp = sototcpcb(best->mpts_socket);
906 bestinp = sotoinpcb(best->mpts_socket);
907 secondtp = sototcpcb(second_best->mpts_socket);
908 secondinp = sotoinpcb(second_best->mpts_socket);
909
910 if (preferred != NULL) {
911 *preferred = mptcp_return_subflow(best);
912 }
913
914 /*
915 * Second Step: Among best and second_best. Choose the one that is
916 * most appropriate for this particular service-type.
917 */
918 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
919 /*
920 * Only handover if Symptoms tells us to do so.
921 */
922 if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
923 mptcp_is_wifi_unusable(mpte) != 0 && mptcp_subflow_is_bad(mpte, best)) {
924 return mptcp_return_subflow(second_best);
925 }
926
927 return mptcp_return_subflow(best);
928 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
929 int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
930 int rto_thresh = mptcp_rtothresh;
931
932 /* Adjust with symptoms information */
933 if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
934 mptcp_is_wifi_unusable(mpte) != 0) {
935 rtt_thresh /= 2;
936 rto_thresh /= 2;
937 }
938
939 if (besttp->t_srtt && secondtp->t_srtt &&
940 besttp->t_srtt >= rtt_thresh &&
941 secondtp->t_srtt < rtt_thresh) {
942 tcpstat.tcps_mp_sel_rtt++;
943 mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d, second cid %d at rtt %d\n", __func__,
944 best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
945 second_best->mpts_connid,
946 secondtp->t_srtt >> TCP_RTT_SHIFT),
947 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
948 return mptcp_return_subflow(second_best);
949 }
950
951 if (mptcp_subflow_is_bad(mpte, best) &&
952 secondtp->t_rxtshift == 0) {
953 return mptcp_return_subflow(second_best);
954 }
955
956 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
957 if (besttp->t_rxtcur && secondtp->t_rxtcur &&
958 besttp->t_rxtcur >= rto_thresh &&
959 secondtp->t_rxtcur < rto_thresh) {
960 tcpstat.tcps_mp_sel_rto++;
961 mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
962 best->mpts_connid, besttp->t_rxtcur,
963 second_best->mpts_connid, secondtp->t_rxtcur),
964 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
965
966 return mptcp_return_subflow(second_best);
967 }
968
969 /*
970 * None of the above conditions for sending on the secondary
971 * were true. So, let's schedule on the best one, if it still
972 * has some space in the congestion-window.
973 */
974 return mptcp_return_subflow(best);
975 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
976 struct mptsub *tmp;
977
978 /*
979 * We only care about RTT when aggregating
980 */
981 if (besttp->t_srtt > secondtp->t_srtt) {
982 tmp = best;
983 best = second_best;
984 besttp = secondtp;
985 bestinp = secondinp;
986
987 second_best = tmp;
988 secondtp = sototcpcb(second_best->mpts_socket);
989 secondinp = sotoinpcb(second_best->mpts_socket);
990 }
991
992 /* Is there still space in the congestion window? */
993 if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
994 return mptcp_return_subflow(second_best);
995 }
996
997 return mptcp_return_subflow(best);
998 } else {
999 panic("Unknown service-type configured for MPTCP");
1000 }
1001
1002 return NULL;
1003 }
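
/*
 * Informal summary of the second step in mptcp_get_subflow() above, derived
 * from the code rather than from any external specification:
 *
 *	HANDOVER:    stick with 'best' (the non-cellular pick); only fall back
 *	             to 'second_best' when Symptoms marks WiFi unusable and the
 *	             best subflow itself looks bad.
 *	INTERACTIVE: move to 'second_best' when best's srtt or t_rxtcur crosses
 *	             the rtthist/rto thresholds while second_best stays below
 *	             them, or when best looks bad while second_best is loss-free.
 *	AGGREGATE:   order the pair by srtt and fill the lower-RTT subflow's
 *	             congestion window first, overflowing into the other one.
 *
 * In every branch mptcp_return_subflow() still vetoes a pick whose congestion
 * window has no room left.
 */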
1004
1005 static const char *
1006 mptcp_event_to_str(uint32_t event)
1007 {
1008 const char *c = "UNDEFINED";
1009 switch (event) {
1010 case MPCE_CLOSE:
1011 c = "MPCE_CLOSE";
1012 break;
1013 case MPCE_RECV_DATA_ACK:
1014 c = "MPCE_RECV_DATA_ACK";
1015 break;
1016 case MPCE_RECV_DATA_FIN:
1017 c = "MPCE_RECV_DATA_FIN";
1018 break;
1019 }
1020 return c;
1021 }
1022
1023 static const char *
1024 mptcp_state_to_str(mptcp_state_t state)
1025 {
1026 const char *c = "UNDEFINED";
1027 switch (state) {
1028 case MPTCPS_CLOSED:
1029 c = "MPTCPS_CLOSED";
1030 break;
1031 case MPTCPS_LISTEN:
1032 c = "MPTCPS_LISTEN";
1033 break;
1034 case MPTCPS_ESTABLISHED:
1035 c = "MPTCPS_ESTABLISHED";
1036 break;
1037 case MPTCPS_CLOSE_WAIT:
1038 c = "MPTCPS_CLOSE_WAIT";
1039 break;
1040 case MPTCPS_FIN_WAIT_1:
1041 c = "MPTCPS_FIN_WAIT_1";
1042 break;
1043 case MPTCPS_CLOSING:
1044 c = "MPTCPS_CLOSING";
1045 break;
1046 case MPTCPS_LAST_ACK:
1047 c = "MPTCPS_LAST_ACK";
1048 break;
1049 case MPTCPS_FIN_WAIT_2:
1050 c = "MPTCPS_FIN_WAIT_2";
1051 break;
1052 case MPTCPS_TIME_WAIT:
1053 c = "MPTCPS_TIME_WAIT";
1054 break;
1055 case MPTCPS_TERMINATE:
1056 c = "MPTCPS_TERMINATE";
1057 break;
1058 }
1059 return c;
1060 }
1061
1062 void
1063 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1064 {
1065 mpte_lock_assert_held(mp_tp->mpt_mpte);
1066 mptcp_state_t old_state = mp_tp->mpt_state;
1067
1068 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1069 uint32_t, event);
1070
1071 switch (mp_tp->mpt_state) {
1072 case MPTCPS_CLOSED:
1073 case MPTCPS_LISTEN:
1074 mp_tp->mpt_state = MPTCPS_TERMINATE;
1075 break;
1076
1077 case MPTCPS_ESTABLISHED:
1078 if (event == MPCE_CLOSE) {
1079 mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1080 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1081 } else if (event == MPCE_RECV_DATA_FIN) {
1082 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1083 mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1084 }
1085 break;
1086
1087 case MPTCPS_CLOSE_WAIT:
1088 if (event == MPCE_CLOSE) {
1089 mp_tp->mpt_state = MPTCPS_LAST_ACK;
1090 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1091 }
1092 break;
1093
1094 case MPTCPS_FIN_WAIT_1:
1095 if (event == MPCE_RECV_DATA_ACK) {
1096 mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1097 } else if (event == MPCE_RECV_DATA_FIN) {
1098 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1099 mp_tp->mpt_state = MPTCPS_CLOSING;
1100 }
1101 break;
1102
1103 case MPTCPS_CLOSING:
1104 if (event == MPCE_RECV_DATA_ACK) {
1105 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1106 }
1107 break;
1108
1109 case MPTCPS_LAST_ACK:
1110 if (event == MPCE_RECV_DATA_ACK) {
1111 mptcp_close(mp_tp->mpt_mpte, mp_tp);
1112 }
1113 break;
1114
1115 case MPTCPS_FIN_WAIT_2:
1116 if (event == MPCE_RECV_DATA_FIN) {
1117 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1118 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1119 }
1120 break;
1121
1122 case MPTCPS_TIME_WAIT:
1123 case MPTCPS_TERMINATE:
1124 break;
1125
1126 default:
1127 VERIFY(0);
1128 /* NOTREACHED */
1129 }
1130 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1131 uint32_t, event);
1132 mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1133 mptcp_state_to_str(old_state),
1134 mptcp_state_to_str(mp_tp->mpt_state),
1135 mptcp_event_to_str(event)),
1136 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1137 }
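
/*
 * Example walks through the state machine above (event in parentheses):
 *
 *	active close:   ESTABLISHED (MPCE_CLOSE)         -> FIN_WAIT_1
 *	                FIN_WAIT_1  (MPCE_RECV_DATA_ACK) -> FIN_WAIT_2
 *	                FIN_WAIT_2  (MPCE_RECV_DATA_FIN) -> TIME_WAIT
 *
 *	passive close:  ESTABLISHED (MPCE_RECV_DATA_FIN) -> CLOSE_WAIT
 *	                CLOSE_WAIT  (MPCE_CLOSE)         -> LAST_ACK
 *	                LAST_ACK    (MPCE_RECV_DATA_ACK) -> mptcp_close()
 *
 * Sending a DATA_FIN (MPCE_CLOSE) bumps mpt_sndmax by one and receiving one
 * bumps mpt_rcvnxt by one, mirroring how a plain TCP FIN occupies one unit of
 * sequence space.
 */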
1138
1139 /* If you change this function, match up mptcp_update_rcv_state_f */
1140 void
1141 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1142 uint16_t csum)
1143 {
1144 struct mptcb *mp_tp = tptomptp(tp);
1145 u_int64_t full_dsn = 0;
1146
1147 NTOHL(dss_info->mdss_dsn);
1148 NTOHL(dss_info->mdss_subflow_seqn);
1149 NTOHS(dss_info->mdss_data_len);
1150
1151 /* XXX for autosndbuf grow sb here */
1152 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1153 mptcp_update_rcv_state_meat(mp_tp, tp,
1154 full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1155 csum);
1156 }
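
/*
 * Rough illustration (an assumption about MPTCP_EXTEND_DSN(), not a statement
 * of its exact implementation): the DSS option above carries only the low 32
 * bits of the data sequence number, and the macro reconstructs the full
 * 64-bit value by picking the candidate closest to the current mpt_rcvnxt,
 * e.g.:
 *
 *	mpt_rcvnxt = 0x00000001_00001000, mdss_dsn = 0x00001800
 *	-> full_dsn = 0x00000001_00001800
 */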
1157
1158 void
1159 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1160 u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1161 uint16_t csum)
1162 {
1163 if (mdss_data_len == 0) {
1164 mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
1165 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1166
1167 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1168 mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
1169 csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1170 }
1171 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1172 return;
1173 }
1174 mptcplog((LOG_DEBUG,
1175 "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__,
1176 seqn, mdss_data_len, (uint32_t)full_dsn, (uint32_t)mp_tp->mpt_rcvnxt),
1177 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1178
1179 mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1180
1181 tp->t_rcv_map.mpt_dsn = full_dsn;
1182 tp->t_rcv_map.mpt_sseq = seqn;
1183 tp->t_rcv_map.mpt_len = mdss_data_len;
1184 tp->t_rcv_map.mpt_csum = csum;
1185 tp->t_mpflags |= TMPF_EMBED_DSN;
1186 }
1187
1188
1189 static int
1190 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1191 int hdrlen)
1192 {
1193 u_int32_t datalen;
1194
1195 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1196 return 0;
1197 }
1198
1199 datalen = m->m_pkthdr.mp_rlen;
1200
1201 /* unacceptable DSS option, fallback to TCP */
1202 if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1203 mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
1204 __func__, m->m_pkthdr.len, datalen),
1205 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1206 } else {
1207 return 0;
1208 }
1209 tp->t_mpflags |= TMPF_SND_MPFAIL;
1210 mptcp_notify_mpfail(so);
1211 m_freem(m);
1212 return -1;
1213 }
1214
1215 int
1216 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1217 int drop_hdrlen)
1218 {
1219 mptcp_insert_rmap(tp, m, th);
1220 if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1221 drop_hdrlen) != 0) {
1222 return -1;
1223 }
1224 return 0;
1225 }
1226
1227 /*
1228 * MPTCP Checksum support
1229 * The checksum is calculated whenever the MPTCP DSS option is included
1230 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
1231 * header and the actual data indicated by the length specified in the
1232 * DSS option.
1233 */
1234
1235 int
1236 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1237 uint32_t sseq, uint16_t dlen, uint16_t csum, uint16_t dfin)
1238 {
1239 uint16_t mptcp_csum;
1240
1241 mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1242 if (mptcp_csum) {
1243 tp->t_mpflags |= TMPF_SND_MPFAIL;
1244 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1245 m_freem(m);
1246 tcpstat.tcps_mp_badcsum++;
1247 return -1;
1248 }
1249 return 0;
1250 }
1251
1252 static uint16_t
1253 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1254 uint16_t dlen, uint16_t csum, uint16_t dfin)
1255 {
1256 struct mptcb *mp_tp = tptomptp(tp);
1257 uint16_t real_len = dlen - dfin;
1258 uint32_t sum = 0;
1259
1260 if (mp_tp == NULL) {
1261 return 0;
1262 }
1263
1264 if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
1265 return 0;
1266 }
1267
1268 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
1269 return 0;
1270 }
1271
1272 /*
1273 * The remote side may send a packet with fewer bytes than the
1274 * claimed DSS checksum length.
1275 */
1276 if ((int)m_length2(m, NULL) < real_len) {
1277 return 0xffff;
1278 }
1279
1280 if (real_len != 0) {
1281 sum = m_sum16(m, 0, real_len);
1282 }
1283
1284 sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1285 ADDCARRY(sum);
1286 DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1287 uint32_t, sum);
1288
1289 mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1290 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1291 return ~sum & 0xffff;
1292 }
1293
1294 uint32_t
1295 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1296 {
1297 uint32_t sum = 0;
1298
1299 if (dlen) {
1300 sum = m_sum16(m, 0, dlen);
1301 }
1302
1303 dss_val = mptcp_hton64(dss_val);
1304 sseq = htonl(sseq);
1305 dlen = htons(dlen);
1306 sum += in_pseudo64(dss_val, sseq, dlen);
1307
1308 ADDCARRY(sum);
1309 sum = ~sum & 0xffff;
1310 DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1311 mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1312 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1313
1314 return sum;
1315 }
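
/*
 * Conceptual layout of the DSS pseudo-header that both checksum routines
 * above feed into in_pseudo64(), shown only as an illustration:
 *
 *	+------------------------------------+
 *	| data sequence number     (8 bytes) |
 *	| subflow sequence number  (4 bytes) |
 *	| data-level length        (2 bytes) |
 *	| DSS checksum field       (2 bytes) |  <- treated as 0 on output,
 *	+------------------------------------+     peer's value folded in on input
 *
 * On input, a non-zero folded result means the mapping failed verification
 * and mptcp_validate_csum() starts MP_FAIL handling; on output, the one's
 * complement of the sum is returned for insertion into the DSS option.
 */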
1316
1317 /*
1318 * When WiFi signal starts fading, there's more loss and RTT spikes.
1319 * Check if there has been a large spike by comparing against
1320 * a tolerable RTT spike threshold.
1321 */
1322 boolean_t
1323 mptcp_no_rto_spike(struct socket *so)
1324 {
1325 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1326 int32_t spike = 0;
1327
1328 if (tp->t_rxtcur > mptcp_rtothresh) {
1329 spike = tp->t_rxtcur - mptcp_rtothresh;
1330
1331 mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1332 __func__, spike,
1333 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1334 tp->t_rttcur),
1335 (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1336 }
1337
1338 if (spike > 0) {
1339 return FALSE;
1340 } else {
1341 return TRUE;
1342 }
1343 }
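
/*
 * Quick numeric illustration for mptcp_no_rto_spike() above, using the
 * default mptcp_rtothresh of 1500 from this file (the t_rxtcur values are
 * made up):
 *
 *	t_rxtcur = 2100  ->  spike = 600  ->  returns FALSE (RTO spiked)
 *	t_rxtcur = 1200  ->  spike = 0    ->  returns TRUE  (no spike)
 */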
1344
1345 void
1346 mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1347 {
1348 VERIFY(mpp->mpp_flags & flag);
1349 mpp->mpp_flags &= ~flag;
1350
1351 if (mptcp_should_defer_upcall(mpp)) {
1352 return;
1353 }
1354
1355 if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1356 mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1357
1358 mptcp_subflow_workloop(mpp->mpp_pcbe);
1359 }
1360
1361 if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1362 mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1363
1364 sorwakeup(mpp->mpp_socket);
1365 }
1366
1367 if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1368 mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1369
1370 sowwakeup(mpp->mpp_socket);
1371 }
1372
1373 if (mpp->mpp_flags & MPP_SET_CELLICON) {
1374 mpp->mpp_flags &= ~MPP_SET_CELLICON;
1375
1376 mptcp_set_cellicon(mpp->mpp_pcbe);
1377 }
1378
1379 if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
1380 mpp->mpp_flags &= ~MPP_UNSET_CELLICON;
1381
1382 mptcp_unset_cellicon();
1383 }
1384 }
1385
1386 void
1387 mptcp_ask_for_nat64(struct ifnet *ifp)
1388 {
1389 in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);
1390
1391 os_log_info(mptcp_log_handle,
1392 "%s: asked for NAT64-prefix on %s\n", __func__,
1393 ifp->if_name);
1394 }
1395
1396 static void
1397 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1398 {
1399 info->ifindex = 0;
1400 info->has_v4_conn = 0;
1401 info->has_v6_conn = 0;
1402 info->has_nat64_conn = 0;
1403 }
1404
1405 void
1406 mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
1407 uint32_t necp_flags, __unused bool *viable)
1408 {
1409 boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1410 boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1411 boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
1412 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1413 struct mppcb *mp = (struct mppcb *)handle;
1414 struct mptses *mpte = mptompte(mp);
1415 struct socket *mp_so;
1416 struct mptcb *mp_tp;
1417 int locked = 0;
1418 uint32_t i, ifindex;
1419
1420 ifindex = interface_index;
1421 VERIFY(ifindex != IFSCOPE_NONE);
1422
1423 /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1424 if (mp->mpp_socket->so_usecount == 0) {
1425 return;
1426 }
1427
1428 if (action != NECP_CLIENT_CBACTION_INITIAL) {
1429 mpte_lock(mpte);
1430 locked = 1;
1431
1432 /* Check again, because it might have changed while waiting */
1433 if (mp->mpp_socket->so_usecount == 0) {
1434 goto out;
1435 }
1436 }
1437
1438 mpte_lock_assert_held(mpte);
1439
1440 mp_tp = mpte->mpte_mptcb;
1441 mp_so = mptetoso(mpte);
1442
1443 os_log_info(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
1444 __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
1445 has_v4, has_v6, has_nat64, low_power);
1446
1447 /* Nothing to do on sockets that have fallen back to regular TCP */
1448 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
1449 goto out;
1450 }
1451
1452 /*
1453 * When the interface goes into low-power mode we don't want to establish
1454 * new subflows on it. Thus, mark it internally as non-viable.
1455 */
1456 if (low_power) {
1457 action = NECP_CLIENT_CBACTION_NONVIABLE;
1458 }
1459
1460 if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
1461 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1462 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1463 continue;
1464 }
1465
1466 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1467 mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1468 }
1469 }
1470
1471 mptcp_sched_create_subflows(mpte);
1472 } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1473 action == NECP_CLIENT_CBACTION_INITIAL) {
1474 int found_slot = 0, slot_index = -1;
1475 struct ifnet *ifp;
1476
1477 ifnet_head_lock_shared();
1478 ifp = ifindex2ifnet[ifindex];
1479 ifnet_head_done();
1480
1481 if (ifp == NULL) {
1482 goto out;
1483 }
1484
1485 if (IFNET_IS_EXPENSIVE(ifp) &&
1486 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1487 goto out;
1488 }
1489
1490 if (IFNET_IS_CELLULAR(ifp) &&
1491 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1492 goto out;
1493 }
1494
1495 if (IS_INTF_CLAT46(ifp)) {
1496 has_v4 = FALSE;
1497 }
1498
1499 /* Look for the slot in which to store/update the interface-info. */
1500 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1501 /* Found a potential empty slot where we can put it */
1502 if (mpte->mpte_itfinfo[i].ifindex == 0) {
1503 found_slot = 1;
1504 slot_index = i;
1505 }
1506
1507 /*
1508 * The interface is already in our array. Check if we
1509 * need to update it.
1510 */
1511 if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
1512 (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
1513 mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
1514 mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
1515 found_slot = 1;
1516 slot_index = i;
1517 break;
1518 }
1519
1520 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1521 /*
1522 * Ok, it's already there and we don't need
1523 * to update it
1524 */
1525 goto out;
1526 }
1527 }
1528
1529 if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
1530 !has_nat64 && !has_v4) {
1531 if (found_slot) {
1532 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1533 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1534 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1535 }
1536 mptcp_ask_for_nat64(ifp);
1537 goto out;
1538 }
1539
1540 if (found_slot == 0) {
1541 int new_size = mpte->mpte_itfinfo_size * 2;
1542 struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
1543
1544 if (info == NULL) {
1545 os_log_error(mptcp_log_handle, "%s malloc failed for %u\n",
1546 __func__, new_size);
1547 goto out;
1548 }
1549
1550 memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1551
1552 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
1553 _FREE(mpte->mpte_itfinfo, M_TEMP);
1554 }
1555
1556 /* We allocated a bigger array, so the first slot past the old entries is free */
1557 slot_index = mpte->mpte_itfinfo_size;
1558
1559 mpte->mpte_itfinfo = info;
1560 mpte->mpte_itfinfo_size = new_size;
1561 }
1562
1563 VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
1564 mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1565 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1566 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1567 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1568
1569 mptcp_sched_create_subflows(mpte);
1570 }
1571
1572 out:
1573 if (locked) {
1574 mpte_unlock(mpte);
1575 }
1576 }
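
/*
 * Note on the slot handling above: mpte_itfinfo starts out with
 * MPTE_ITFINFO_SIZE entries and is doubled whenever a new interface appears
 * and neither a free nor a matching slot exists:
 *
 *	MPTE_ITFINFO_SIZE -> 2 * MPTE_ITFINFO_SIZE -> 4 * MPTE_ITFINFO_SIZE -> ...
 *
 * The old entries are copied into the new array, the old buffer is freed only
 * when it was itself a grown allocation (size > MPTE_ITFINFO_SIZE, i.e.
 * presumably not the initial array embedded in the PCB), and the first slot
 * past the copied entries takes the new interface.
 */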
1577
1578 void
1579 mptcp_set_restrictions(struct socket *mp_so)
1580 {
1581 struct mptses *mpte = mpsotompte(mp_so);
1582 uint32_t i;
1583
1584 mpte_lock_assert_held(mpte);
1585
1586 ifnet_head_lock_shared();
1587
1588 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1589 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1590 uint32_t ifindex = info->ifindex;
1591 struct ifnet *ifp;
1592
1593 if (ifindex == IFSCOPE_NONE) {
1594 continue;
1595 }
1596
1597 ifp = ifindex2ifnet[ifindex];
1598 if (ifp == NULL) {
1599 continue;
1600 }
1601
1602 if (IFNET_IS_EXPENSIVE(ifp) &&
1603 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1604 info->ifindex = IFSCOPE_NONE;
1605 }
1606
1607 if (IFNET_IS_CELLULAR(ifp) &&
1608 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1609 info->ifindex = IFSCOPE_NONE;
1610 }
1611 }
1612
1613 ifnet_head_done();
1614 }