/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * A note on the MPTCP/NECP-interactions:
 *
 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
 * MPTCP registers for interface-events at the MPTCP-layer through a call
 * to necp_client_register_multipath_cb.
 * To get per-flow events (aka per TCP-subflow), we register with
 * necp_client_register_socket_flow. Both registrations happen by using the
 * necp-client-uuid that comes from the app.
 *
 * The locking is rather tricky. In general, we expect the lock-ordering to
 * happen from necp-fd -> necp-client -> mpp_lock.
 *
 * There are however some subtleties:
 *
 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
 *    safe, because it is the very first time this MPTCP-connection goes into
 *    NECP. As we go into NECP we take the NECP-locks and thus are guaranteed
 *    that no NECP-event will deadlock us, because any such event will also
 *    first take the NECP-locks. Either the event wins the race and thus won't
 *    find our MPTCP-connection, or MPTCP wins the race and safely installs
 *    the callbacks while holding the NECP lock.
 *
 * 2. When registering the subflow-callbacks we must unlock the mpp_lock,
 *    because we have already registered callbacks and might race against an
 *    NECP-event that matches on our socket. So, we have to unlock to be safe.
 *
 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), at which
 *    point the so_usecount has reached 0. We must be careful not to remove
 *    the mpp_socket pointers before we have unregistered the callback,
 *    because, again, we might be racing against an NECP-event. Unregistering
 *    must happen with an unlocked mpp_lock, because of the lock-ordering
 *    constraint. An NECP-event may thus trigger before we had a chance to
 *    unregister. That's why we need to check the so_usecount in
 *    mptcp_session_necp_cb: if we get there while the socket is being
 *    garbage-collected, the use-count has gone down to 0 and we exit.
 *    Removal of the multipath_cb again happens by taking the NECP-locks, so
 *    any running NECP-event will finish first and exit cleanly.
 *
 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
 *    the socket-lock must be unlocked for lock-ordering constraints. This
 *    gets a bit tricky here, as in tcp_garbage_collect we hold both the
 *    mp_so- and the so-lock. So, we drop the mp_so-lock as soon as the
 *    subflow is unlinked with mptcp_subflow_del. Then, in in_pcbdispose we
 *    drop the subflow-lock. If an NECP-event was waiting on the lock in
 *    mptcp_subflow_necp_cb, when it gets the lock it will realize that the
 *    subflow became non-MPTCP and retry (see tcp_lock). Then it waits again
 *    on the subflow-lock. When we drop this lock in in_pcbdispose and enter
 *    necp_inpcb_dispose, the latter has to wait for the NECP-lock (held by
 *    the other thread that is taking care of the NECP-event). So, the event
 *    now finally gets the subflow-lock, hits an so_usecount that is 0, and
 *    exits. Eventually, we can remove the subflow from the NECP flow.
 */
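/*
 * Illustrative sketch of the lock-ordering rule from the note above. The
 * helper names below are placeholders for whatever primitives guard the
 * NECP-fd, the NECP-client and the MPTCP-PCB; they are not real kernel
 * functions:
 *
 *	lock(necp_fd);		// 1st: the NECP file-descriptor lock
 *	lock(necp_client);	// 2nd: the NECP client lock
 *	lock(mpp);		// 3rd: the MPTCP-PCB lock (mpp_lock)
 *	...
 *	unlock(mpp);
 *	unlock(necp_client);
 *	unlock(necp_fd);
 *
 * Holding mpp_lock and then calling into NECP inverts this order, which is
 * exactly why points 2.-4. above drop mpp_lock before calling into NECP.
 */
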
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/protosw.h>

#include <kern/zalloc.h>
#include <kern/locks.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_timer.h>
int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_enable, 0, "Enable Multipath TCP Support");

/* Number of times to try negotiating MPTCP on SYN retransmissions */
int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off, revisit if we ever do
 * MPTCP for non-SSL traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 * is attempted on a different path.
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_fail_thresh, 0, "Failover threshold");

/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 * as carrier networks mostly have a 30 minute to 60 minute NAT timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60 * 14;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_subflow_keeptime, 0, "Keepalive in seconds");

int mptcp_rtthist_rtthresh = 600;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rtthist_rtthresh, 0, "Rtt threshold");

/*
 * Use RTO history for sending new data
 */
int mptcp_use_rto = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_use_rto, 0, "Disable RTO for subflow selection");

int mptcp_rtothresh = 1500;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rtothresh, 0, "RTO threshold");

/*
 * Probe the preferred path, when it is not in use
 */
uint32_t mptcp_probeto = 1000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_probeto, 0, "Disable probing by setting to 0");

uint32_t mptcp_probecnt = 5;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_probecnt, 0, "Number of probe writes");

/*
 * Static declarations
 */
static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
    uint32_t, uint16_t, uint16_t, uint16_t);

static int
mptcp_reass_present(struct socket *mp_so)
{
	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
	struct tseg_qent *q;
	int dowakeup = 0;
	int flags = 0;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
		return (flags);
	q = LIST_FIRST(&mp_tp->mpt_segq);
	if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt)
		return (flags);

	/*
	 * If there is already another thread doing reassembly for this
	 * connection, it is better to let it finish the job.
	 */
	if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG)
		return (flags);

	mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;

	do {
		mp_tp->mpt_rcvnxt += q->tqe_len;
		LIST_REMOVE(q, tqe_q);
		if (mp_so->so_state & SS_CANTRCVMORE) {
			m_freem(q->tqe_m);
		} else {
			flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
			if (sbappendstream_rcvdemux(mp_so, q->tqe_m, 0, 0))
				dowakeup = 1;
		}
		zfree(tcp_reass_zone, q);
		mp_tp->mpt_reassqlen--;
		q = LIST_FIRST(&mp_tp->mpt_segq);
	} while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
	mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;

	if (dowakeup)
		sorwakeup(mp_so); /* done with socket lock held */

	return (flags);
}

static int
mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
{
	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
	u_int64_t mb_dsn = phdr->mp_dsn;
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	u_int16_t qlimit;

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let the missing segment through which caused this
	 * queue. Always keep one global queue entry spare to be able to
	 * process the missing segment.
	 */
	qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
	    (tcp_autorcvbuf_max >> 10));
	if (mb_dsn != mp_tp->mpt_rcvnxt &&
	    (mp_tp->mpt_reassqlen + 1) >= qlimit) {
		tcpstat.tcps_mptcp_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return (0);
	}

	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	te = (struct tseg_qent *) zalloc(tcp_reass_zone);
	if (te == NULL) {
		tcpstat.tcps_mptcp_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	mp_tp->mpt_reassqlen++;

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
		if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already. If so, drop the data from the incoming
	 * segment. If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		int64_t i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
		if (i > 0) {
			if (i >= *tlenp) {
				tcpstat.tcps_mptcp_rcvduppack++;
				m_freem(m);
				zfree(tcp_reass_zone, te);
				te = NULL;
				mp_tp->mpt_reassqlen--;
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto out;
			}
			m_adj(m, (int)i);
			*tlenp -= i;
			phdr->mp_dsn += i;
		}
	}

	tcpstat.tcps_mp_oodata++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
		if (i <= 0)
			break;

		if (i < q->tqe_len) {
			q->tqe_m->m_pkthdr.mp_dsn += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, (int)i);
			break;
		}

		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		zfree(tcp_reass_zone, q);
		mp_tp->mpt_reassqlen--;
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = NULL;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

out:
	return (mptcp_reass_present(mp_so));
}

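/*
 * Worked example for the overlap-trimming in mptcp_reass() above, with
 * made-up numbers: let a queued segment p cover DSNs [90, 110) (i.e.
 * mp_dsn = 90, tqe_len = 20) and let a new segment arrive with
 * mb_dsn = 100 and *tlenp = 30.  Then i = 90 + 20 - 100 = 10, so the
 * first 10 bytes of the new segment duplicate p: m_adj(m, 10) drops
 * them, *tlenp becomes 20 and phdr->mp_dsn advances to 110 before the
 * entry is inserted after p.
 */
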
/*
 * MPTCP input, called when data has been read from a subflow socket.
 */
void
mptcp_input(struct mptses *mpte, struct mbuf *m)
{
	struct socket *mp_so;
	struct mptcb *mp_tp = NULL;
	int count = 0, wakeup = 0;
	struct mbuf *save = NULL, *prev = NULL;
	struct mbuf *freelist = NULL, *tail = NULL;

	VERIFY(m->m_flags & M_PKTHDR);

	mpte_lock_assert_held(mpte); /* same as MP socket lock */

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	DTRACE_MPTCP(input);

	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/*
	 * Each mbuf contains MPTCP Data Sequence Map
	 * Process the data for reassembly, delivery to MPTCP socket
	 * client, etc.
	 */
	count = mp_so->so_rcv.sb_cc;

	/*
	 * In the degraded fallback case, data is accepted without DSS map
	 */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		struct mbuf *iter;
		int mb_dfin = 0;
fallback:
		mptcp_sbrcv_grow(mp_tp);

		iter = m;
		while (iter) {
			if ((iter->m_flags & M_PKTHDR) &&
			    (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
				mb_dfin = 1;
			}

			if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
				/* Don't add zero-length packets, so jump it! */
				if (prev == NULL) {
					m = iter->m_next;
					m_free(iter);
					iter = m;
				} else {
					prev->m_next = iter->m_next;
					m_free(iter);
					iter = prev->m_next;
				}

				/* It was a zero-length packet so next one must be a pkthdr */
				VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
			} else {
				prev = iter;
				iter = iter->m_next;
			}
		}

		/*
		 * assume degraded flow as this may be the first packet
		 * without DSS, and the subflow state is not updated yet.
		 */
		if (sbappendstream_rcvdemux(mp_so, m, 0, 0))
			sorwakeup(mp_so);

		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
		    struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte);
		count = mp_so->so_rcv.sb_cc - count;

		mp_tp->mpt_rcvnxt += count;

		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}

		mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
		    count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		return;
	}

	do {
		u_int64_t mb_dsn;
		int32_t mb_datalen;
		int64_t todrop;
		int mb_dfin = 0;

		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
			goto fallback;

		save = m->m_next;
		/*
		 * A single TCP packet formed of multiple mbufs
		 * holds DSS mapping in the first mbuf of the chain.
		 * Other mbufs in the chain may have M_PKTHDR set
		 * even though they belong to the same TCP packet
		 * and therefore use the DSS mapping stored in the
		 * first mbuf of the mbuf chain. mptcp_input() can
		 * get an mbuf chain with multiple TCP packets.
		 */
		while (save && (!(save->m_flags & M_PKTHDR) ||
		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
			prev = save;
			save = save->m_next;
		}
		if (prev)
			prev->m_next = NULL;
		else
			m->m_next = NULL;

		mb_dsn = m->m_pkthdr.mp_dsn;
		mb_datalen = m->m_pkthdr.mp_rlen;

		todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
		if (todrop > 0) {
			tcpstat.tcps_mptcp_rcvpackafterwin++;

			if (todrop >= mb_datalen) {
				if (freelist == NULL)
					freelist = m;
				else
					tail->m_next = m;

				if (prev != NULL)
					tail = prev;
				else
					tail = m;

				m = save;
				prev = save = NULL;
				continue;
			} else {
				m_adj(m, (int)-todrop);
				mb_datalen -= todrop;
			}

			/*
			 * We drop from the right edge of the mbuf, thus the
			 * DATA_FIN is dropped as well
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		}

		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
			    mp_tp->mpt_rcvnxt)) {
				if (freelist == NULL)
					freelist = m;
				else
					tail->m_next = m;

				if (prev != NULL)
					tail = prev;
				else
					tail = m;

				m = save;
				prev = save = NULL;
				continue;
			} else {
				m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
			}
			mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
			    mp_tp->mpt_rcvnxt),
			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		}

		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
			mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);

			goto next;
		}
		mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);

		mptcp_sbrcv_grow(mp_tp);

		if (sbappendstream_rcvdemux(mp_so, m, 0, 0))
			wakeup = 1;

		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte,
		    struct mptcb *, mp_tp);
		count = mp_so->so_rcv.sb_cc - count;
		tcpstat.tcps_mp_rcvtotal++;
		tcpstat.tcps_mp_rcvbytes += count;
		mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);

		mp_tp->mpt_rcvnxt += count;

next:
		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		m = save;
		prev = save = NULL;
		count = mp_so->so_rcv.sb_cc;
	} while (m);

	if (freelist)
		m_freem(freelist);

	if (wakeup)
		sorwakeup(mp_so);
}

boolean_t
mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
{
	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

	/*
	 * Always send if there is data in the reinject-queue.
	 */
	if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq)
		return (TRUE);

	/*
	 * Don't send, if:
	 *
	 * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
	 *    Except when using TFO, we might be doing a 0-byte write.
	 * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
	 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
	 */

	if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax))
		return (FALSE);

	if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt))
		return (FALSE);

	if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
		return (FALSE);

	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2)
		return (FALSE);

	return (TRUE);
}

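/*
 * Numeric sanity-check of the three "don't send" rules above, with
 * hypothetical sequence numbers: take snduna = 1000, sndwnd = 500 and
 * sndmax = 1400.  Rule 1 allows sending while sndnxt < 1400, rule 2 while
 * sndnxt < snduna + sndwnd = 1500.  Once sndnxt + 1 == sndmax (1399 here)
 * and the connection is past CLOSE_WAIT, only the DATA_FIN remains and
 * rule 3 blocks further regular sends.
 */
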
/*
 * MPTCP output.
 */
int
mptcp_output(struct mptses *mpte)
{
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct mptsub *mpts_tried = NULL;
	struct socket *mp_so;
	struct mptsub *preferred_mpts = NULL;
	uint64_t old_snd_nxt;
	int error = 0;

	mpte_lock_assert_held(mpte);
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
	mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;

	mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
	    __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
	    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
	    mpte->mpte_reinjectq ? 1 : 0,
	    mp_tp->mpt_state),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	old_snd_nxt = mp_tp->mpt_sndnxt;
	while (mptcp_can_send_more(mp_tp, FALSE)) {
		/* get the "best" subflow to be used for transmission */
		mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
		if (mpts == NULL) {
			mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
			break;
		}

		mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

		/* In case there's just one flow, we reattempt later */
		if (mpts_tried != NULL &&
		    (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
			mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
			mpts_tried->mpts_flags |= MPTSF_ACTIVE;
			mptcp_start_timer(mpte, MPTT_REXMT);
			mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			break;
		}

		/*
		 * Automatic sizing of send socket buffer. Increase the send
		 * socket buffer size if all of the following criteria are met
		 *	1. the receiver has enough buffer space for this data
		 *	2. send buffer is filled to 7/8th with data (so we actually
		 *	   have data to make use of it);
		 */
		if (tcp_do_autosendbuf == 1 &&
		    (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
		    tcp_cansbgrow(&mp_so->so_snd)) {
			if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
			    mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
				if (sbreserve(&mp_so->so_snd,
				    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
				    tcp_autosndbuf_max)) == 1) {
					mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;

					mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
					    __func__, mp_so->so_snd.sb_hiwat,
					    mp_so->so_snd.sb_lowat),
					    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
				}
			}
		}

		DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
		    struct socket *, mp_so);
		error = mptcp_subflow_output(mpte, mpts, 0);
		if (error) {
			/* can be a temporary loss of source address or other error */
			mpts->mpts_flags |= MPTSF_FAILINGOVER;
			mpts->mpts_flags &= ~MPTSF_ACTIVE;
			mpts_tried = mpts;
			if (error != ECANCELED)
				mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
				    error, mpts->mpts_flags),
				    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
			break;
		}
		/* The model is to have only one active flow at a time */
		mpts->mpts_flags |= MPTSF_ACTIVE;
		mpts->mpts_probesoon = mpts->mpts_probecnt = 0;

		/* Allows us to update the smoothed rtt */
		if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
			if (preferred_mpts->mpts_probesoon) {
				if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
					mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
					if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
						preferred_mpts->mpts_probesoon = 0;
						preferred_mpts->mpts_probecnt = 0;
					}
				}
			} else {
				preferred_mpts->mpts_probesoon = tcp_now;
				preferred_mpts->mpts_probecnt = 0;
			}
		}

		if (mpte->mpte_active_sub == NULL) {
			mpte->mpte_active_sub = mpts;
		} else if (mpte->mpte_active_sub != mpts) {
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
			struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);

			mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
			    mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
			    mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
			    (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);

			mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
			mpte->mpte_active_sub = mpts;

			mptcpstats_inc_switch(mpte, mpts);
		}
	}

	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
		    mp_tp->mpt_snduna == mp_tp->mpt_sndnxt)
			mptcp_finish_usrclosed(mpte);
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);

	/* subflow errors should not be percolated back up */
	return (0);
}

static struct mptsub *
mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

	/*
	 * Lower RTT? Take it, if it's our first one, or
	 * it doesn't have any loss, or the current one has
	 * loss as well.
	 */
	if (tp->t_srtt && *currtt > tp->t_srtt &&
	    (curbest == NULL || tp->t_rxtshift == 0 ||
	    sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
		*currtt = tp->t_srtt;
		return (mpts);
	}

	/*
	 * If we find a subflow without loss, take it always!
	 */
	if (curbest &&
	    sototcpcb(curbest->mpts_socket)->t_rxtshift &&
	    tp->t_rxtshift == 0) {
		*currtt = tp->t_srtt;
		return (mpts);
	}

	return (curbest != NULL ? curbest : mpts);
}

static struct mptsub *
mptcp_return_subflow(struct mptsub *mpts)
{
	if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0)
		return (NULL);

	return (mpts);
}

/*
 * Return the most eligible subflow to be used for sending data.
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
{
	struct tcpcb *besttp, *secondtp;
	struct inpcb *bestinp, *secondinp;
	struct mptsub *mpts;
	struct mptsub *best = NULL;
	struct mptsub *second_best = NULL;
	int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

	/*
	 * First Step:
	 * Choose the best subflow for cellular and non-cellular interfaces.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so = mpts->mpts_socket;
		struct tcpcb *tp = sototcpcb(so);
		struct inpcb *inp = sotoinpcb(so);

		mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
		    __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
		    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
		    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
		    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
		    mptcp_subflow_cwnd_space(so)),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * First, the hard conditions to reject subflows
		 * (e.g., not connected,...)
		 */
		if (mpts == ignore || inp->inp_last_outifp == NULL)
			continue;

		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
			continue;

		/* There can only be one subflow in degraded state */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			best = mpts;
			break;
		}

		/*
		 * If this subflow is waiting to finally send, do it!
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA)
			return (mptcp_return_subflow(mpts));

		/*
		 * Only send if the subflow is MP_CAPABLE. The exceptions to
		 * this rule (degraded or TFO) have been taken care of above.
		 */
		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE))
			continue;

		if ((so->so_state & SS_ISDISCONNECTED) ||
		    !(so->so_state & SS_ISCONNECTED) ||
		    !TCPS_HAVEESTABLISHED(tp->t_state) ||
		    tp->t_state > TCPS_CLOSE_WAIT)
			continue;

		/*
		 * Second, the soft conditions to find the subflow with best
		 * conditions for each set (aka cellular vs non-cellular)
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
			second_best = mptcp_choose_subflow(mpts, second_best,
			    &exp_rtt);
		else
			best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
	}

	/*
	 * If there is no preferred or backup subflow, and there is no active
	 * subflow use the last usable subflow.
	 */
	if (best == NULL)
		return (mptcp_return_subflow(second_best));

	if (second_best == NULL)
		return (mptcp_return_subflow(best));

	besttp = sototcpcb(best->mpts_socket);
	bestinp = sotoinpcb(best->mpts_socket);
	secondtp = sototcpcb(second_best->mpts_socket);
	secondinp = sotoinpcb(second_best->mpts_socket);

	if (preferred != NULL)
		*preferred = mptcp_return_subflow(best);

	/*
	 * Second Step: Among best and second_best. Choose the one that is
	 * most appropriate for this particular service-type.
	 */
	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
		/*
		 * Only handover if Symptoms tells us to do so.
		 */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_is_wifi_unusable(mpte) != 0 && mptcp_subflow_is_bad(mpte, best))
			return (mptcp_return_subflow(second_best));

		return (mptcp_return_subflow(best));
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
		int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
		int rto_thresh = mptcp_rtothresh;

		/* Adjust with symptoms information */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_is_wifi_unusable(mpte) != 0) {
			rtt_thresh /= 2;
			rto_thresh /= 2;
		}

		if (besttp->t_srtt && secondtp->t_srtt &&
		    besttp->t_srtt >= rtt_thresh &&
		    secondtp->t_srtt < rtt_thresh) {
			tcpstat.tcps_mp_sel_rtt++;
			mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d, second cid %d at rtt %d\n", __func__,
			    best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
			    second_best->mpts_connid,
			    secondtp->t_srtt >> TCP_RTT_SHIFT),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
			return (mptcp_return_subflow(second_best));
		}

		if (mptcp_subflow_is_bad(mpte, best) &&
		    secondtp->t_rxtshift == 0) {
			return (mptcp_return_subflow(second_best));
		}

		/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
		if (besttp->t_rxtcur && secondtp->t_rxtcur &&
		    besttp->t_rxtcur >= rto_thresh &&
		    secondtp->t_rxtcur < rto_thresh) {
			tcpstat.tcps_mp_sel_rto++;
			mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
			    best->mpts_connid, besttp->t_rxtcur,
			    second_best->mpts_connid, secondtp->t_rxtcur),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);

			return (mptcp_return_subflow(second_best));
		}

		/*
		 * None of the above conditions for sending on the secondary
		 * were true. So, let's schedule on the best one, if it still
		 * has some space in the congestion-window.
		 */
		return (mptcp_return_subflow(best));
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
		struct mptsub *tmp;

		/*
		 * We only care about RTT when aggregating
		 */
		if (besttp->t_srtt > secondtp->t_srtt) {
			tmp = best;
			best = second_best;
			besttp = secondtp;
			bestinp = secondinp;

			second_best = tmp;
			secondtp = sototcpcb(second_best->mpts_socket);
			secondinp = sotoinpcb(second_best->mpts_socket);
		}

		/* Is there still space in the congestion window? */
		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0)
			return (mptcp_return_subflow(second_best));

		return (mptcp_return_subflow(best));
	} else {
		panic("Unknown service-type configured for MPTCP");
	}

	return (NULL);
}

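/*
 * Summary of the per-service-type choice implemented above (informal,
 * see the in-line conditions for the authoritative rules):
 *
 *	HANDOVER:	stay on the non-cellular "best" subflow unless
 *			Symptoms reports Wi-Fi as unusable and the subflow
 *			is bad; only then hand over to second_best.
 *	INTERACTIVE:	prefer "best", but switch to second_best when best's
 *			srtt or rto crosses its (possibly halved) threshold
 *			while second_best stays below it, or when best looks
 *			lossy and second_best does not.
 *	AGGREGATE:	send on whichever of the two has the lower srtt and
 *			still has congestion-window space, else the other.
 */
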
static const char *
mptcp_event_to_str(uint32_t event)
{
	const char *c = "UNDEFINED";

	switch (event) {
	case MPCE_CLOSE:
		c = "MPCE_CLOSE";
		break;
	case MPCE_RECV_DATA_ACK:
		c = "MPCE_RECV_DATA_ACK";
		break;
	case MPCE_RECV_DATA_FIN:
		c = "MPCE_RECV_DATA_FIN";
		break;
	}
	return (c);
}

static const char *
mptcp_state_to_str(mptcp_state_t state)
{
	const char *c = "UNDEFINED";

	switch (state) {
	case MPTCPS_CLOSED:
		c = "MPTCPS_CLOSED";
		break;
	case MPTCPS_LISTEN:
		c = "MPTCPS_LISTEN";
		break;
	case MPTCPS_ESTABLISHED:
		c = "MPTCPS_ESTABLISHED";
		break;
	case MPTCPS_CLOSE_WAIT:
		c = "MPTCPS_CLOSE_WAIT";
		break;
	case MPTCPS_FIN_WAIT_1:
		c = "MPTCPS_FIN_WAIT_1";
		break;
	case MPTCPS_CLOSING:
		c = "MPTCPS_CLOSING";
		break;
	case MPTCPS_LAST_ACK:
		c = "MPTCPS_LAST_ACK";
		break;
	case MPTCPS_FIN_WAIT_2:
		c = "MPTCPS_FIN_WAIT_2";
		break;
	case MPTCPS_TIME_WAIT:
		c = "MPTCPS_TIME_WAIT";
		break;
	case MPTCPS_TERMINATE:
		c = "MPTCPS_TERMINATE";
		break;
	}
	return (c);
}

void
mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
{
	mpte_lock_assert_held(mp_tp->mpt_mpte);
	mptcp_state_t old_state = mp_tp->mpt_state;

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);

	switch (mp_tp->mpt_state) {
	case MPTCPS_CLOSED:
	case MPTCPS_LISTEN:
		mp_tp->mpt_state = MPTCPS_TERMINATE;
		break;

	case MPTCPS_ESTABLISHED:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
		}
		break;

	case MPTCPS_CLOSE_WAIT:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_LAST_ACK;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		}
		break;

	case MPTCPS_FIN_WAIT_1:
		if (event == MPCE_RECV_DATA_ACK) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSING;
		}
		break;

	case MPTCPS_CLOSING:
		if (event == MPCE_RECV_DATA_ACK)
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		break;

	case MPTCPS_LAST_ACK:
		if (event == MPCE_RECV_DATA_ACK)
			mptcp_close(mp_tp->mpt_mpte, mp_tp);
		break;

	case MPTCPS_FIN_WAIT_2:
		if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		}
		break;

	case MPTCPS_TIME_WAIT:
	case MPTCPS_TERMINATE:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);
	mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
	    mptcp_state_to_str(old_state),
	    mptcp_state_to_str(mp_tp->mpt_state),
	    mptcp_event_to_str(event)),
	    MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
}

/* If you change this function, match up mptcp_update_rcv_state_f */
void
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
    uint16_t csum)
{
	struct mptcb *mp_tp = tptomptp(tp);
	u_int64_t full_dsn = 0;

	NTOHL(dss_info->mdss_dsn);
	NTOHL(dss_info->mdss_subflow_seqn);
	NTOHS(dss_info->mdss_data_len);

	/* XXX for autosndbuf grow sb here */
	MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
	mptcp_update_rcv_state_meat(mp_tp, tp,
	    full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
	    csum);
}

void
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
    uint16_t csum)
{
	if (mdss_data_len == 0) {
		mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);

		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
			mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
			    csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
		}
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		return;
	}
	mptcplog((LOG_DEBUG,
	    "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__,
	    seqn, mdss_data_len, (uint32_t)full_dsn, (uint32_t)mp_tp->mpt_rcvnxt),
	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);

	mptcp_notify_mpready(tp->t_inpcb->inp_socket);

	tp->t_rcv_map.mpt_dsn = full_dsn;
	tp->t_rcv_map.mpt_sseq = seqn;
	tp->t_rcv_map.mpt_len = mdss_data_len;
	tp->t_rcv_map.mpt_csum = csum;
	tp->t_mpflags |= TMPF_EMBED_DSN;
}

static int
mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
    int hdrlen)
{
	u_int32_t datalen;

	if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
		return (0);

	datalen = m->m_pkthdr.mp_rlen;

	/* unacceptable DSS option, fallback to TCP */
	if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
		mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
		    __func__, m->m_pkthdr.len, datalen),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
	} else {
		return (0);
	}
	tp->t_mpflags |= TMPF_SND_MPFAIL;
	mptcp_notify_mpfail(so);
	m_freem(m);
	return (-1);
}

int
mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int drop_hdrlen)
{
	mptcp_insert_rmap(tp, m, th);
	if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
	    drop_hdrlen) != 0)
		return (0);
	return (1);
}

/*
 * MPTCP Checksum support
 * The checksum is calculated whenever the MPTCP DSS option is included
 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
 * header and the actual data indicated by the length specified in the
 * DSS option.
 */

int
mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
    uint32_t sseq, uint16_t dlen, uint16_t csum, uint16_t dfin)
{
	uint16_t mptcp_csum;

	mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
	if (mptcp_csum) {
		tp->t_mpflags |= TMPF_SND_MPFAIL;
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		m_freem(m);
		tcpstat.tcps_mp_badcsum++;
		return (-1);
	}
	return (0);
}

static uint16_t
mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
    uint16_t dlen, uint16_t csum, uint16_t dfin)
{
	struct mptcb *mp_tp = tptomptp(tp);
	uint16_t real_len = dlen - dfin;
	uint32_t sum = 0;

	if (mp_tp == NULL)
		return (0);

	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
		return (0);

	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
		return (0);

	/*
	 * The remote side may send a packet with fewer bytes than the
	 * claimed DSS checksum length.
	 */
	if ((int)m_length2(m, NULL) < real_len)
		return (0xffff);

	if (real_len != 0)
		sum = m_sum16(m, 0, real_len);

	sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
	ADDCARRY(sum);

	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
	    uint32_t, sum);

	mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
	return (~sum & 0xffff);
}

uint32_t
mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
{
	uint32_t sum = 0;

	if (dlen)
		sum = m_sum16(m, 0, dlen);

	dss_val = mptcp_hton64(dss_val);
	sseq = htonl(sseq);
	dlen = htons(dlen);
	sum += in_pseudo64(dss_val, sseq, dlen);

	ADDCARRY(sum);
	sum = ~sum & 0xffff;
	DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
	mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	return (sum);
}

/*
 * When WiFi signal starts fading, there's more loss and RTT spikes.
 * Check if there has been a large spike by comparing against
 * a tolerable RTT spike threshold.
 */
boolean_t
mptcp_no_rto_spike(struct socket *so)
{
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	int32_t spike = 0;

	if (tp->t_rxtcur > mptcp_rtothresh) {
		spike = tp->t_rxtcur - mptcp_rtothresh;

		mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
		    __func__, spike,
		    tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
		    tp->t_rttcur),
		    (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
	}

	if (spike > 0)
		return (FALSE);
	else
		return (TRUE);
}

void
mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
{
	VERIFY(mpp->mpp_flags & flag);
	mpp->mpp_flags &= ~flag;

	if (mptcp_should_defer_upcall(mpp))
		return;

	if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;

		mptcp_subflow_workloop(mpp->mpp_pcbe);
	}

	if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;

		sorwakeup(mpp->mpp_socket);
	}

	if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;

		sowwakeup(mpp->mpp_socket);
	}

	if (mpp->mpp_flags & MPP_SET_CELLICON) {
		mpp->mpp_flags &= ~MPP_SET_CELLICON;

		mptcp_set_cellicon(mpp->mpp_pcbe);
	}

	if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
		mpp->mpp_flags &= ~MPP_UNSET_CELLICON;

		mptcp_unset_cellicon();
	}
}

static void
mptcp_ask_for_nat64(struct ifnet *ifp)
{
	in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);

	os_log_info(mptcp_log_handle,
	    "%s: asked for NAT64-prefix on %s\n", __func__,
	    ifp->if_name);
}

static void
mptcp_reset_itfinfo(struct mpt_itf_info *info)
{
	info->ifindex = IFSCOPE_NONE;
	info->has_v4_conn = 0;
	info->has_v6_conn = 0;
	info->has_nat64_conn = 0;
}

void
mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
    uint32_t necp_flags, __unused bool *viable)
{
	boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
	boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
	boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct mppcb *mp = (struct mppcb *)handle;
	struct mptses *mpte = mptompte(mp);
	struct socket *mp_so;
	struct mptcb *mp_tp;
	int locked = 0;
	uint32_t i, ifindex;

	ifindex = interface_index;
	VERIFY(ifindex != IFSCOPE_NONE);

	/* About to be garbage-collected (see note about MPTCP/NECP interactions) */
	if (mp->mpp_socket->so_usecount == 0)
		return;

	if (action != NECP_CLIENT_CBACTION_INITIAL) {
		mpte_lock(mpte);
		locked = 1;

		/* Check again, because it might have changed while waiting */
		if (mp->mpp_socket->so_usecount == 0)
			goto out;
	}

	mpte_lock_assert_held(mpte);

	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	os_log_info(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
	    __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
	    has_v4, has_v6, has_nat64, low_power);

	/* No need on fallen back sockets */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
		goto out;

	/*
	 * When the interface goes in low-power mode we don't want to establish
	 * new subflows on it. Thus, mark it internally as non-viable.
	 */
	if (low_power)
		action = NECP_CLIENT_CBACTION_NONVIABLE;

	if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE)
				continue;

			if (mpte->mpte_itfinfo[i].ifindex == ifindex)
				mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
		}

		mptcp_sched_create_subflows(mpte);
	} else if (action == NECP_CLIENT_CBACTION_VIABLE ||
	    action == NECP_CLIENT_CBACTION_INITIAL) {
		int found_slot = 0, slot_index = -1;
		struct ifnet *ifp;

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL)
			goto out;

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
			goto out;

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
			goto out;

		if (IS_INTF_CLAT46(ifp))
			has_v4 = FALSE;

		/* Look for the slot on where to store/update the interface-info. */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			/* Found a potential empty slot where we can put it */
			if (mpte->mpte_itfinfo[i].ifindex == 0) {
				found_slot = 1;
				slot_index = i;
			}

			/*
			 * The interface is already in our array. Check if we
			 * need to update it.
			 */
			if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
			    (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
			    mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
			    mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
				found_slot = 1;
				slot_index = i;
				break;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				/*
				 * Ok, it's already there and we don't need
				 * to update it
				 */
				goto out;
			}
		}

		if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
		    !has_nat64 && !has_v4) {
			if (found_slot) {
				mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
				mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
				mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
			}
			mptcp_ask_for_nat64(ifp);
			goto out;
		}

		if (found_slot == 0) {
			int new_size = mpte->mpte_itfinfo_size * 2;
			struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);

			if (info == NULL) {
				os_log_error(mptcp_log_handle, "%s malloc failed for %u\n",
				    __func__, new_size);
				goto out;
			}

			memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));

			if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
				_FREE(mpte->mpte_itfinfo, M_TEMP);

			/* We allocated a new one, thus the first must be empty */
			slot_index = mpte->mpte_itfinfo_size;

			mpte->mpte_itfinfo = info;
			mpte->mpte_itfinfo_size = new_size;
		}

		VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
		mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
		mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
		mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
		mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;

		mptcp_sched_create_subflows(mpte);
	}

out:
	if (locked)
		mpte_unlock(mpte);
}

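/*
 * Note on the slot-array growth in mptcp_session_necp_cb above:
 * mpte_itfinfo initially points at the MPTE_ITFINFO_SIZE-entry array
 * embedded in the mptses, so the first doubling allocates a heap copy
 * and only later doublings _FREE() the previous heap copy.  Assuming,
 * purely for illustration, MPTE_ITFINFO_SIZE were 4, the capacity would
 * grow 4 -> 8 -> 16 as NECP reports more viable interfaces.
 */
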
void
mptcp_set_restrictions(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);
	uint32_t i;

	mpte_lock_assert_held(mpte);

	ifnet_head_lock_shared();

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
		uint32_t ifindex = info->ifindex;
		struct ifnet *ifp;

		if (ifindex == IFSCOPE_NONE)
			continue;

		ifp = ifindex2ifnet[ifindex];
		if (ifp == NULL)
			continue;

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
			info->ifindex = IFSCOPE_NONE;

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
			info->ifindex = IFSCOPE_NONE;
	}

	ifnet_head_done();
}