bsd/netinet/mptcp.c

   1 /*
   2  * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  * A note on the MPTCP/NECP-interactions:
  31  *
  32  * MPTCP uses NECP-callbacks to get notified of interface/policy events.
  33  * MPTCP registers to these events at the MPTCP-layer for interface-events
  34  * through a call to necp_client_register_multipath_cb.
  35  * To get per-flow events (aka per TCP-subflow), we register to it with
  36  * necp_client_register_socket_flow. Both registrations happen by using the
  37  * necp-client-uuid that comes from the app.
  38  *
  39  * The locking is rather tricky. In general, we expect the lock-ordering to
  40  * happen from necp-fd -> necp-client -> mpp_lock.
  41  *
  42  * There are however some subtleties.
  43  *
  44  * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
  45  * safe, because it is the very first time this MPTCP-connection goes into NECP.
  46  * As we go into NECP we take the NECP-locks and thus are guaranteed that no
  47  * NECP-locks will deadlock us. Because these NECP-events will also first take
  48  * the NECP-locks. Either they win the race and thus won't find our
  49  * MPTCP-connection. Or, MPTCP wins the race and thus it will safely install
  50  * the callbacks while holding the NECP lock.
  51  *
  52  * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This,
  53  * because we have already registered callbacks and we might race against an
  54  * NECP-event that will match on our socket. So, we have to unlock to be safe.
  55  *
  56  * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). The
  57  * so_usecount has reached 0. We must be careful to not remove the mpp_socket
  58  * pointers before we unregistered the callback. Because, again we might be
  59  * racing against an NECP-event. Unregistering must happen with an unlocked
  60  * mpp_lock, because of the lock-ordering constraint. It could be that
  61  * before we had a chance to unregister an NECP-event triggers. That's why
  62  * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
  63  * there while the socket is being garbage-collected, the use-count will go
  64  * down to 0 and we exit. Removal of the multipath_cb again happens by taking
  65  * the NECP-locks so any running NECP-events will finish first and exit cleanly.
  66  *
  67  * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
  68  * the socket-lock must be unlocked for lock-ordering constraints. This gets a
  69  * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
  70  * So, we drop the mp_so-lock as soon as the subflow is unlinked with
  71  * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
  72  * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
  73  * gets it, it will realize that the subflow became non-MPTCP and retry (see
  74  * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
  75  * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
  76  * for the NECP-lock (held by the other thread that is taking care of the NECP-
  77  * event). So, the event now finally gets the subflow-lock and then hits an
  78  * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
  79  * the NECP callback.
  80  */
  81
  82 #include <sys/param.h>
  83 #include <sys/systm.h>
  84 #include <sys/kernel.h>
  85 #include <sys/mbuf.h>
  86 #include <sys/mcache.h>
  87 #include <sys/socket.h>
  88 #include <sys/socketvar.h>
  89 #include <sys/syslog.h>
  90 #include <sys/protosw.h>
  91
  92 #include <kern/zalloc.h>
  93 #include <kern/locks.h>
  94
  95 #include <mach/sdt.h>
  96
  97 #include <net/if.h>
  98 #include <netinet/in.h>
  99 #include <netinet/in_var.h>
 100 #include <netinet/tcp.h>
 101 #include <netinet/tcp_fsm.h>
 102 #include <netinet/tcp_seq.h>
 103 #include <netinet/tcp_var.h>
 104 #include <netinet/mptcp_var.h>
 105 #include <netinet/mptcp.h>
 106 #include <netinet/mptcp_seq.h>
 107 #include <netinet/mptcp_opt.h>
 108 #include <netinet/mptcp_timer.h>
 109
 110 int mptcp_enable = 1;
 111 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
 112     &mptcp_enable, 0, "Enable Multipath TCP Support");
 113
 114 /*
 115  * Number of times to try negotiating MPTCP on SYN retransmissions.
 116  * We haven't seen any reports of a middlebox that is dropping all SYN-segments
 117  * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
 118  */
 119 int mptcp_mpcap_retries = 4;
 120 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
 121     CTLFLAG_RW | CTLFLAG_LOCKED,
 122     &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
 123
 124 /*
 125  * By default, DSS checksum is turned off, revisit if we ever do
 126  * MPTCP for non SSL Traffic.
 127  */
 128 int mptcp_dss_csum = 0;
 129 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
 130     &mptcp_dss_csum, 0, "Enable DSS checksum");
 131
 132 /*
 133  * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 134  * is attempted on a different path.
 135  */
 136 int mptcp_fail_thresh = 1;
 137 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
 138     &mptcp_fail_thresh, 0, "Failover threshold");
 139
 140
 141 /*
 142  * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 143  * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 144  * Some carrier networks have a timeout of 10 or 15 minutes.
 145  */
 146 int mptcp_subflow_keeptime = 60 * 14;
 147 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
 148     &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
 149
 150 int mptcp_rtthist_rtthresh = 600;
 151 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
 152     &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
 153
 154 /*
 155  * Use RTO history for sending new data
 156  */
 157 int mptcp_use_rto = 1;
 158 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
 159     &mptcp_use_rto, 0, "Disable RTO for subflow selection");
 160
 161 int mptcp_rtothresh = 1500;
 162 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
 163     &mptcp_rtothresh, 0, "RTO threshold");
 164
 165 /*
 166  * Probe the preferred path, when it is not in use
 167  */
 168 uint32_t mptcp_probeto = 1000;
 169 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
 170     &mptcp_probeto, 0, "Disable probing by setting to 0");
 171
 172 uint32_t mptcp_probecnt = 5;
 173 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
 174     &mptcp_probecnt, 0, "Number of probe writes");
 175
 176 /*
 177  * Static declarations
 178  */
 179 static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
 180     uint32_t, uint16_t, uint16_t, uint16_t);
 181
 182 static int
 183 mptcp_reass_present(struct socket *mp_so)
 184 {
 185         struct mptses *mpte = mpsotompte(mp_so);
 186         struct mptcb *mp_tp = mpte->mpte_mptcb;
 187         struct tseg_qent *q;
 188         int dowakeup = 0;
 189         int flags = 0;
 190
 191         /*
 192          * Present data to user, advancing rcv_nxt through
 193          * completed sequence space.
 194          */
 195         if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
 196                 return flags;
 197         }
 198         q = LIST_FIRST(&mp_tp->mpt_segq);
 199         if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
 200                 return flags;
 201         }
 202
 203         /*
 204          * If there is already another thread doing reassembly for this
 205          * connection, it is better to let it finish the job --
 206          * (radar 16316196)
 207          */
 208         if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
 209                 return flags;
 210         }
 211
 212         mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
 213
 214         do {
 215                 mp_tp->mpt_rcvnxt += q->tqe_len;
 216                 LIST_REMOVE(q, tqe_q);
 217                 if (mp_so->so_state & SS_CANTRCVMORE) {
 218                         m_freem(q->tqe_m);
 219                 } else {
 220                         flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
 221                         if (sbappendstream_rcvdemux(mp_so, q->tqe_m, 0, 0)) {
 222                                 dowakeup = 1;
 223                         }
 224                 }
 225                 zfree(tcp_reass_zone, q);
 226                 mp_tp->mpt_reassqlen--;
 227                 q = LIST_FIRST(&mp_tp->mpt_segq);
 228         } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
 229         mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
 230
 231         if (dowakeup) {
 232                 sorwakeup(mp_so); /* done with socket lock held */
 233         }
 234         return flags;
 235 }
 236
 237 static int
 238 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
 239 {
 240         struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
 241         u_int64_t mb_dsn = phdr->mp_dsn;
 242         struct tseg_qent *q;
 243         struct tseg_qent *p = NULL;
 244         struct tseg_qent *nq;
 245         struct tseg_qent *te = NULL;
 246         u_int16_t qlimit;
 247
 248         /*
 249          * Limit the number of segments in the reassembly queue to prevent
 250          * holding on to too many segments (and thus running out of mbufs).
 251          * Make sure to let the missing segment through which caused this
 252          * queue.  Always keep one global queue entry spare to be able to
 253          * process the missing segment.
 254          */
 255         qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
 256             (tcp_autorcvbuf_max >> 10));
 257         if (mb_dsn != mp_tp->mpt_rcvnxt &&
 258             (mp_tp->mpt_reassqlen + 1) >= qlimit) {
 259                 tcpstat.tcps_mptcp_rcvmemdrop++;
 260                 m_freem(m);
 261                 *tlenp = 0;
 262                 return 0;
 263         }
 264
 265         /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
 266         te = (struct tseg_qent *) zalloc(tcp_reass_zone);
 267         if (te == NULL) {
 268                 tcpstat.tcps_mptcp_rcvmemdrop++;
 269                 m_freem(m);
 270                 return 0;
 271         }
 272
 273         mp_tp->mpt_reassqlen++;
 274
 275         /*
 276          * Find a segment which begins after this one does.
 277          */
 278         LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
 279                 if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
 280                         break;
 281                 }
 282                 p = q;
 283         }
 284
 285         /*
 286          * If there is a preceding segment, it may provide some of
 287          * our data already.  If so, drop the data from the incoming
 288          * segment.  If it provides all of our data, drop us.
 289          */
 290         if (p != NULL) {
 291                 int64_t i;
 292                 /* conversion to int (in i) handles seq wraparound */
 293                 i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
 294                 if (i > 0) {
 295                         if (i >= *tlenp) {
 296                                 tcpstat.tcps_mptcp_rcvduppack++;
 297                                 m_freem(m);
 298                                 zfree(tcp_reass_zone, te);
 299                                 te = NULL;
 300                                 mp_tp->mpt_reassqlen--;
 301                                 /*
 302                                  * Try to present any queued data
 303                                  * at the left window edge to the user.
 304                                  * This is needed after the 3-WHS
 305                                  * completes.
 306                                  */
 307                                 goto out;
 308                         }
 309                         m_adj(m, i);
 310                         *tlenp -= i;
 311                         phdr->mp_dsn += i;
 312                 }
 313         }
 314
 315         tcpstat.tcps_mp_oodata++;
 316
 317         /*
 318          * While we overlap succeeding segments trim them or,
 319          * if they are completely covered, dequeue them.
 320          */
 321         while (q) {
 322                 int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
 323                 if (i <= 0) {
 324                         break;
 325                 }
 326
 327                 if (i < q->tqe_len) {
 328                         q->tqe_m->m_pkthdr.mp_dsn += i;
 329                         q->tqe_len -= i;
 330                         m_adj(q->tqe_m, i);
 331                         break;
 332                 }
 333
 334                 nq = LIST_NEXT(q, tqe_q);
 335                 LIST_REMOVE(q, tqe_q);
 336                 m_freem(q->tqe_m);
 337                 zfree(tcp_reass_zone, q);
 338                 mp_tp->mpt_reassqlen--;
 339                 q = nq;
 340         }
 341
 342         /* Insert the new segment queue entry into place. */
 343         te->tqe_m = m;
 344         te->tqe_th = NULL;
 345         te->tqe_len = *tlenp;
 346
 347         if (p == NULL) {
 348                 LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
 349         } else {
 350                 LIST_INSERT_AFTER(p, te, tqe_q);
 351         }
 352
 353 out:
 354         return mptcp_reass_present(mp_so);
 355 }
 356
 357 /*
 358  * MPTCP input, called when data has been read from a subflow socket.
 359  */
 360 void
 361 mptcp_input(struct mptses *mpte, struct mbuf *m)
 362 {
 363         struct socket *mp_so;
 364         struct mptcb *mp_tp = NULL;
 365         int count = 0, wakeup = 0;
 366         struct mbuf *save = NULL, *prev = NULL;
 367         struct mbuf *freelist = NULL, *tail = NULL;
 368
 369         VERIFY(m->m_flags & M_PKTHDR);
 370
 371         mp_so = mptetoso(mpte);
 372         mp_tp = mpte->mpte_mptcb;
 373
 374         socket_lock_assert_owned(mp_so);
 375
 376         DTRACE_MPTCP(input);
 377
 378         mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
 379
 380         /*
 381          * Each mbuf contains MPTCP Data Sequence Map
 382          * Process the data for reassembly, delivery to MPTCP socket
 383          * client, etc.
 384          *
 385          */
 386         count = mp_so->so_rcv.sb_cc;
 387
 388         /*
 389          * In the degraded fallback case, data is accepted without DSS map
 390          */
 391         if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
 392                 struct mbuf *iter;
 393                 int mb_dfin = 0;
 394 fallback:
 395                 mptcp_sbrcv_grow(mp_tp);
 396
 397                 iter = m;
 398                 while (iter) {
 399                         if ((iter->m_flags & M_PKTHDR) &&
 400                             (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
 401                                 mb_dfin = 1;
 402                         }
 403
 404                         if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
 405                                 /* Don't add zero-length packets, so jump it! */
 406                                 if (prev == NULL) {
 407                                         m = iter->m_next;
 408                                         m_free(iter);
 409                                         iter = m;
 410                                 } else {
 411                                         prev->m_next = iter->m_next;
 412                                         m_free(iter);
 413                                         iter = prev->m_next;
 414                                 }
 415
 416                                 /* It was a zero-length packet so next one must be a pkthdr */
 417                                 VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
 418                         } else {
 419                                 prev = iter;
 420                                 iter = iter->m_next;
 421                         }
 422                 }
 423
 424                 /*
 425                  * assume degraded flow as this may be the first packet
 426                  * without DSS, and the subflow state is not updated yet.
 427                  */
 428                 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
 429                         sorwakeup(mp_so);
 430                 }
 431
 432                 DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
 433                     struct socket *, mp_so,
 434                     struct sockbuf *, &mp_so->so_rcv,
 435                     struct sockbuf *, &mp_so->so_snd,
 436                     struct mptses *, mpte);
 437                 count = mp_so->so_rcv.sb_cc - count;
 438
 439                 mp_tp->mpt_rcvnxt += count;
 440
 441                 if (mb_dfin) {
 442                         mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
 443                         socantrcvmore(mp_so);
 444                 }
 445                 return;
 446         }
 447
 448         do {
 449                 u_int64_t mb_dsn;
 450                 int32_t mb_datalen;
 451                 int64_t todrop;
 452                 int mb_dfin = 0;
 453
 454                 VERIFY(m->m_flags & M_PKTHDR);
 455
 456                 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
 457                 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
 458                         goto fallback;
 459                 }
 460
 461                 save = m->m_next;
 462                 /*
 463                  * A single TCP packet formed of multiple mbufs
 464                  * holds DSS mapping in the first mbuf of the chain.
 465                  * Other mbufs in the chain may have M_PKTHDR set
 466                  * even though they belong to the same TCP packet
 467                  * and therefore use the DSS mapping stored in the
 468                  * first mbuf of the mbuf chain. mptcp_input() can
 469                  * get an mbuf chain with multiple TCP packets.
 470                  */
 471                 while (save && (!(save->m_flags & M_PKTHDR) ||
 472                     !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
 473                         prev = save;
 474                         save = save->m_next;
 475                 }
 476                 if (prev) {
 477                         prev->m_next = NULL;
 478                 } else {
 479                         m->m_next = NULL;
 480                 }
 481
 482                 mb_dsn = m->m_pkthdr.mp_dsn;
 483                 mb_datalen = m->m_pkthdr.mp_rlen;
 484
 485                 todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
 486                 if (todrop > 0) {
 487                         tcpstat.tcps_mptcp_rcvpackafterwin++;
 488
 489                         os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
 490                             __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
 491                             (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
 492                             mp_tp->mpt_rcvwnd, todrop);
 493
 494                         if (todrop >= mb_datalen) {
 495                                 if (freelist == NULL) {
 496                                         freelist = m;
 497                                 } else {
 498                                         tail->m_next = m;
 499                                 }
 500
 501                                 if (prev != NULL) {
 502                                         tail = prev;
 503                                 } else {
 504                                         tail = m;
 505                                 }
 506
 507                                 m = save;
 508                                 prev = save = NULL;
 509                                 continue;
 510                         } else {
 511                                 m_adj(m, -todrop);
 512                                 mb_datalen -= todrop;
 513                                 m->m_pkthdr.mp_rlen -= todrop;
 514                         }
 515
 516                         /*
 517                          * We drop from the right edge of the mbuf, thus the
 518                          * DATA_FIN is dropped as well
 519                          */
 520                         m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
 521                 }
 522
 523                 if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
 524                         if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
 525                             mp_tp->mpt_rcvnxt)) {
 526                                 if (freelist == NULL) {
 527                                         freelist = m;
 528                                 } else {
 529                                         tail->m_next = m;
 530                                 }
 531
 532                                 if (prev != NULL) {
 533                                         tail = prev;
 534                                 } else {
 535                                         tail = m;
 536                                 }
 537
 538                                 m = save;
 539                                 prev = save = NULL;
 540                                 continue;
 541                         } else {
 542                                 m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
 543                                 mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
 544                                 mb_dsn = mp_tp->mpt_rcvnxt;
 545                                 m->m_pkthdr.mp_rlen = mb_datalen;
 546                                 m->m_pkthdr.mp_dsn = mb_dsn;
 547                         }
 548                 }
 549
 550                 if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
 551                     !LIST_EMPTY(&mp_tp->mpt_segq)) {
 552                         mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
 553
 554                         goto next;
 555                 }
 556                 mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
 557
 558                 mptcp_sbrcv_grow(mp_tp);
 559
 560                 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
 561                         wakeup = 1;
 562                 }
 563
 564                 DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
 565                     struct sockbuf *, &mp_so->so_rcv,
 566                     struct sockbuf *, &mp_so->so_snd,
 567                     struct mptses *, mpte,
 568                     struct mptcb *, mp_tp);
 569                 count = mp_so->so_rcv.sb_cc - count;
 570                 tcpstat.tcps_mp_rcvtotal++;
 571                 tcpstat.tcps_mp_rcvbytes += count;
 572
 573                 mp_tp->mpt_rcvnxt += count;
 574
 575 next:
 576                 if (mb_dfin) {
 577                         mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
 578                         socantrcvmore(mp_so);
 579                 }
 580                 m = save;
 581                 prev = save = NULL;
 582                 count = mp_so->so_rcv.sb_cc;
 583         } while (m);
 584
 585         if (freelist) {
 586                 m_freem(freelist);
 587         }
 588
 589         if (wakeup) {
 590                 sorwakeup(mp_so);
 591         }
 592 }
 593
 594 boolean_t
 595 mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
 596 {
 597         struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 598
 599         /*
 600          * Always send if there is data in the reinject-queue.
 601          */
 602         if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
 603                 return TRUE;
 604         }
 605
 606         /*
 607          * Don't send, if:
 608          *
 609          * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
 610          *    Except when using TFO, we might be doing a 0-byte write.
 611          * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
 612          * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
 613          */
 614
 615         if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
 616                 return FALSE;
 617         }
 618
 619         if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
 620                 return FALSE;
 621         }
 622
 623         if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
 624                 return FALSE;
 625         }
 626
 627         if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
 628                 return FALSE;
 629         }
 630
 631         return TRUE;
 632 }
 633
 634 /*
 635  * MPTCP output.
 636  */
 637 int
 638 mptcp_output(struct mptses *mpte)
 639 {
 640         struct mptcb *mp_tp;
 641         struct mptsub *mpts;
 642         struct mptsub *mpts_tried = NULL;
 643         struct socket *mp_so;
 644         struct mptsub *preferred_mpts = NULL;
 645         uint64_t old_snd_nxt;
 646         int error = 0;
 647
 648         mp_so = mptetoso(mpte);
 649         mp_tp = mpte->mpte_mptcb;
 650
 651         socket_lock_assert_owned(mp_so);
 652
 653         if (mp_so->so_flags & SOF_DEFUNCT) {
 654                 return 0;
 655         }
 656
 657         VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
 658         mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
 659
 660         old_snd_nxt = mp_tp->mpt_sndnxt;
 661         while (mptcp_can_send_more(mp_tp, FALSE)) {
 662                 /* get the "best" subflow to be used for transmission */
 663                 mpts = mptcp_get_subflow(mpte, &preferred_mpts);
 664                 if (mpts == NULL) {
 665                         mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
 666                             MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
 667                         break;
 668                 }
 669
 670                 /* In case there's just one flow, we reattempt later */
 671                 if (mpts_tried != NULL &&
 672                     (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
 673                         mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
 674                         mpts_tried->mpts_flags |= MPTSF_ACTIVE;
 675                         mptcp_start_timer(mpte, MPTT_REXMT);
 676                         break;
 677                 }
 678
 679                 /*
 680                  * Automatic sizing of send socket buffer. Increase the send
 681                  * socket buffer size if all of the following criteria are met
 682                  *      1. the receiver has enough buffer space for this data
 683                  *      2. send buffer is filled to 7/8th with data (so we actually
 684                  *         have data to make use of it);
 685                  */
 686                 if (tcp_do_autosendbuf == 1 &&
 687                     (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
 688                     tcp_cansbgrow(&mp_so->so_snd)) {
 689                         if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
 690                             mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
 691                                 if (sbreserve(&mp_so->so_snd,
 692                                     min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
 693                                     tcp_autosndbuf_max)) == 1) {
 694                                         mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
 695                                 }
 696                         }
 697                 }
 698
 699                 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
 700                     struct socket *, mp_so);
 701                 error = mptcp_subflow_output(mpte, mpts, 0);
 702                 if (error) {
 703                         /* can be a temporary loss of source address or other error */
 704                         mpts->mpts_flags |= MPTSF_FAILINGOVER;
 705                         mpts->mpts_flags &= ~MPTSF_ACTIVE;
 706                         mpts_tried = mpts;
 707                         if (error != ECANCELED) {
 708                                 os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
 709                                     __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
 710                                     error, mpts->mpts_flags);
 711                         }
 712                         break;
 713                 }
 714                 /* The model is to have only one active flow at a time */
 715                 mpts->mpts_flags |= MPTSF_ACTIVE;
 716                 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
 717
 718                 /* Allows us to update the smoothed rtt */
 719                 if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
 720                         if (preferred_mpts->mpts_probesoon) {
 721                                 if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
 722                                         mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
 723                                         if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
 724                                                 preferred_mpts->mpts_probesoon = 0;
 725                                                 preferred_mpts->mpts_probecnt = 0;
 726                                         }
 727                                 }
 728                         } else {
 729                                 preferred_mpts->mpts_probesoon = tcp_now;
 730                                 preferred_mpts->mpts_probecnt = 0;
 731                         }
 732                 }
 733
 734                 if (mpte->mpte_active_sub == NULL) {
 735                         mpte->mpte_active_sub = mpts;
 736                 } else if (mpte->mpte_active_sub != mpts) {
 737                         mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
 738                         mpte->mpte_active_sub = mpts;
 739
 740                         mptcpstats_inc_switch(mpte, mpts);
 741                 }
 742         }
 743
 744         if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
 745                 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
 746                     mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
 747                         mptcp_finish_usrclosed(mpte);
 748                 }
 749         }
 750
 751         mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
 752
 753         /* subflow errors should not be percolated back up */
 754         return 0;
 755 }
 756
 757
 758 static struct mptsub *
 759 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
 760 {
 761         struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 762
 763         /*
 764          * Lower RTT? Take it, if it's our first one, or
 765          * it doesn't has any loss, or the current one has
 766          * loss as well.
 767          */
 768         if (tp->t_srtt && *currtt > tp->t_srtt &&
 769             (curbest == NULL || tp->t_rxtshift == 0 ||
 770             sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
 771                 *currtt = tp->t_srtt;
 772                 return mpts;
 773         }
 774
 775         /*
 776          * If we find a subflow without loss, take it always!
 777          */
 778         if (curbest &&
 779             sototcpcb(curbest->mpts_socket)->t_rxtshift &&
 780             tp->t_rxtshift == 0) {
 781                 *currtt = tp->t_srtt;
 782                 return mpts;
 783         }
 784
 785         return curbest != NULL ? curbest : mpts;
 786 }
 787
 788 static struct mptsub *
 789 mptcp_return_subflow(struct mptsub *mpts)
 790 {
 791         if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
 792                 return NULL;
 793         }
 794
 795         return mpts;
 796 }
 797
 798 static boolean_t
 799 mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
 800 {
 801         struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 802         int fail_thresh = mptcp_fail_thresh;
 803
 804         if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
 805                 fail_thresh *= 2;
 806         }
 807
 808         return tp->t_rxtshift >= fail_thresh &&
 809                (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
 810 }
 811
/*
 * Return the most eligible subflow to be used for sending data.
 *
 * mpte:      the MPTCP session whose subflow list is scanned.
 * preferred: optional out-parameter (may be NULL). It is only written when
 *            both a non-cellular and a cellular candidate were found; in
 *            that case it receives the non-cellular candidate, or NULL if
 *            that one has no congestion-window space. On every earlier
 *            return path it is left untouched.
 *
 * Returns the selected subflow, or NULL when there is no usable subflow or
 * the selected one has no congestion-window space left (every return goes
 * through mptcp_return_subflow()). Panics on an unknown service type.
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
{
        struct tcpcb *besttp, *secondtp;
        struct inpcb *bestinp, *secondinp;
        struct mptsub *mpts;
        /* best: non-cellular candidate; second_best: cellular candidate */
        struct mptsub *best = NULL;
        struct mptsub *second_best = NULL;
        /* RTT of the current cellular (exp) and non-cellular (cheap) candidate */
        int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

        /*
         * First Step:
         * Choose the best subflow for cellular and non-cellular interfaces.
         */

        TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
                struct socket *so = mpts->mpts_socket;
                struct tcpcb *tp = sototcpcb(so);
                struct inpcb *inp = sotoinpcb(so);

                mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
                    __func__, mpts->mpts_connid, mpts->mpts_flags,
                    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
                    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
                    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
                    mptcp_subflow_cwnd_space(so)),
                    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

                /*
                 * First, the hard conditions to reject subflows
                 * (e.g., not connected,...)
                 */
                if (inp->inp_last_outifp == NULL) {
                        continue;
                }

                if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
                        continue;
                }

                /*
                 * There can only be one subflow in degraded state, so stop
                 * scanning and use it as `best`. A cellular candidate seen
                 * earlier in the list stays in second_best.
                 */
                if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
                        best = mpts;
                        break;
                }

                /*
                 * If this subflow is waiting to finally send, do it!
                 * This returns right away, bypassing the per-service-type
                 * selection below.
                 */
                if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
                        return mptcp_return_subflow(mpts);
                }

                /*
                 * Only send if the subflow is MP_CAPABLE. The exceptions to
                 * this rule (degraded or TFO) have been taken care of above.
                 */
                if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
                        continue;
                }

                /* Skip subflows that are not in a state where data can be sent */
                if ((so->so_state & SS_ISDISCONNECTED) ||
                    !(so->so_state & SS_ISCONNECTED) ||
                    !TCPS_HAVEESTABLISHED(tp->t_state) ||
                    tp->t_state > TCPS_CLOSE_WAIT) {
                        continue;
                }

                /*
                 * Second, the soft conditions to find the subflow with best
                 * conditions for each set (aka cellular vs non-cellular)
                 */
                if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
                        second_best = mptcp_choose_subflow(mpts, second_best,
                            &exp_rtt);
                } else {
                        best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
                }
        }

        /*
         * If only one of the two sets produced a candidate (or none did),
         * there is nothing to arbitrate: use whatever we have. Note that
         * *preferred is not written on these two paths.
         */
        if (best == NULL) {
                return mptcp_return_subflow(second_best);
        }

        if (second_best == NULL) {
                return mptcp_return_subflow(best);
        }

        besttp = sototcpcb(best->mpts_socket);
        bestinp = sotoinpcb(best->mpts_socket);
        secondtp = sototcpcb(second_best->mpts_socket);
        secondinp = sotoinpcb(second_best->mpts_socket);

        /* Report the non-cellular candidate (NULL if it has no cwnd space) */
        if (preferred != NULL) {
                *preferred = mptcp_return_subflow(best);
        }

        /*
         * Second Step: Among best and second_best. Choose the one that is
         * most appropriate for this particular service-type.
         */
        if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
                /*
                 * Only handover if Symptoms tells us to do so.
                 * In addition, `best` must itself be slow (see
                 * mptcp_subflow_is_slow()).
                 */
                if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
                    mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
                        return mptcp_return_subflow(second_best);
                }

                return mptcp_return_subflow(best);
        } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
                /* The threshold is shifted to match the scaling of t_srtt */
                int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
                int rto_thresh = mptcp_rtothresh;

                /* Adjust with symptoms information */
                if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
                    mptcp_is_wifi_unusable_for_session(mpte) != 0) {
                        rtt_thresh /= 2;
                        rto_thresh /= 2;
                }

                /*
                 * Compare RTTs (only when both are known): select second_best
                 * if best is at or above the threshold and second_best is below.
                 */
                if (besttp->t_srtt && secondtp->t_srtt &&
                    besttp->t_srtt >= rtt_thresh &&
                    secondtp->t_srtt < rtt_thresh) {
                        tcpstat.tcps_mp_sel_rtt++;
                        mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d,  second cid %d at rtt %d\n", __func__,
                            best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
                            second_best->mpts_connid,
                            secondtp->t_srtt >> TCP_RTT_SHIFT),
                            MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
                        return mptcp_return_subflow(second_best);
                }

                /* best is retransmitting heavily while second_best is not */
                if (mptcp_subflow_is_slow(mpte, best) &&
                    secondtp->t_rxtshift == 0) {
                        return mptcp_return_subflow(second_best);
                }

                /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
                if (besttp->t_rxtcur && secondtp->t_rxtcur &&
                    besttp->t_rxtcur >= rto_thresh &&
                    secondtp->t_rxtcur < rto_thresh) {
                        tcpstat.tcps_mp_sel_rto++;
                        mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
                            best->mpts_connid, besttp->t_rxtcur,
                            second_best->mpts_connid, secondtp->t_rxtcur),
                            MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);

                        return mptcp_return_subflow(second_best);
                }

                /*
                 * None of the above conditions for sending on the secondary
                 * were true. So, let's schedule on the best one, if it still
                 * has some space in the congestion-window.
                 */
                return mptcp_return_subflow(best);
        } else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
                struct mptsub *tmp;

                /*
                 * We only care about RTT when aggregating: make `best` the
                 * one with the lower smoothed RTT, swapping if needed.
                 * (secondtp/secondinp are refreshed by the swap but not read
                 * again in this function.)
                 */
                if (besttp->t_srtt > secondtp->t_srtt) {
                        tmp = best;
                        best = second_best;
                        besttp = secondtp;
                        bestinp = secondinp;

                        second_best = tmp;
                        secondtp = sototcpcb(second_best->mpts_socket);
                        secondinp = sotoinpcb(second_best->mpts_socket);
                }

                /* Is there still space in the congestion window? */
                if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
                        return mptcp_return_subflow(second_best);
                }

                return mptcp_return_subflow(best);
        } else {
                panic("Unknown service-type configured for MPTCP");
        }

        /* Only reached if panic() returns */
        return NULL;
}
1006
1007 static const char *
1008 mptcp_event_to_str(uint32_t event)
1009 {
1010         const char *c = "UNDEFINED";
1011         switch (event) {
1012         case MPCE_CLOSE:
1013                 c = "MPCE_CLOSE";
1014                 break;
1015         case MPCE_RECV_DATA_ACK:
1016                 c = "MPCE_RECV_DATA_ACK";
1017                 break;
1018         case MPCE_RECV_DATA_FIN:
1019                 c = "MPCE_RECV_DATA_FIN";
1020                 break;
1021         }
1022         return c;
1023 }
1024
1025 static const char *
1026 mptcp_state_to_str(mptcp_state_t state)
1027 {
1028         const char *c = "UNDEFINED";
1029         switch (state) {
1030         case MPTCPS_CLOSED:
1031                 c = "MPTCPS_CLOSED";
1032                 break;
1033         case MPTCPS_LISTEN:
1034                 c = "MPTCPS_LISTEN";
1035                 break;
1036         case MPTCPS_ESTABLISHED:
1037                 c = "MPTCPS_ESTABLISHED";
1038                 break;
1039         case MPTCPS_CLOSE_WAIT:
1040                 c = "MPTCPS_CLOSE_WAIT";
1041                 break;
1042         case MPTCPS_FIN_WAIT_1:
1043                 c = "MPTCPS_FIN_WAIT_1";
1044                 break;
1045         case MPTCPS_CLOSING:
1046                 c = "MPTCPS_CLOSING";
1047                 break;
1048         case MPTCPS_LAST_ACK:
1049                 c = "MPTCPS_LAST_ACK";
1050                 break;
1051         case MPTCPS_FIN_WAIT_2:
1052                 c = "MPTCPS_FIN_WAIT_2";
1053                 break;
1054         case MPTCPS_TIME_WAIT:
1055                 c = "MPTCPS_TIME_WAIT";
1056                 break;
1057         case MPTCPS_TERMINATE:
1058                 c = "MPTCPS_TERMINATE";
1059                 break;
1060         }
1061         return c;
1062 }
1063
1064 void
1065 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1066 {
1067         struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
1068
1069         socket_lock_assert_owned(mp_so);
1070
1071         mptcp_state_t old_state = mp_tp->mpt_state;
1072
1073         DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1074             uint32_t, event);
1075
1076         switch (mp_tp->mpt_state) {
1077         case MPTCPS_CLOSED:
1078         case MPTCPS_LISTEN:
1079                 mp_tp->mpt_state = MPTCPS_TERMINATE;
1080                 break;
1081
1082         case MPTCPS_ESTABLISHED:
1083                 if (event == MPCE_CLOSE) {
1084                         mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1085                         mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1086                 } else if (event == MPCE_RECV_DATA_FIN) {
1087                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1088                         mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1089                 }
1090                 break;
1091
1092         case MPTCPS_CLOSE_WAIT:
1093                 if (event == MPCE_CLOSE) {
1094                         mp_tp->mpt_state = MPTCPS_LAST_ACK;
1095                         mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1096                 }
1097                 break;
1098
1099         case MPTCPS_FIN_WAIT_1:
1100                 if (event == MPCE_RECV_DATA_ACK) {
1101                         mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1102                 } else if (event == MPCE_RECV_DATA_FIN) {
1103                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1104                         mp_tp->mpt_state = MPTCPS_CLOSING;
1105                 }
1106                 break;
1107
1108         case MPTCPS_CLOSING:
1109                 if (event == MPCE_RECV_DATA_ACK) {
1110                         mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1111                 }
1112                 break;
1113
1114         case MPTCPS_LAST_ACK:
1115                 if (event == MPCE_RECV_DATA_ACK) {
1116                         mptcp_close(mp_tp->mpt_mpte, mp_tp);
1117                 }
1118                 break;
1119
1120         case MPTCPS_FIN_WAIT_2:
1121                 if (event == MPCE_RECV_DATA_FIN) {
1122                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1123                         mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1124                 }
1125                 break;
1126
1127         case MPTCPS_TIME_WAIT:
1128         case MPTCPS_TERMINATE:
1129                 break;
1130
1131         default:
1132                 VERIFY(0);
1133                 /* NOTREACHED */
1134         }
1135         DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1136             uint32_t, event);
1137         mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1138             mptcp_state_to_str(old_state),
1139             mptcp_state_to_str(mp_tp->mpt_state),
1140             mptcp_event_to_str(event)),
1141             MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1142 }
1143
1144 /* If you change this function, match up mptcp_update_rcv_state_f */
1145 void
1146 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1147     uint16_t csum)
1148 {
1149         struct mptcb *mp_tp = tptomptp(tp);
1150         u_int64_t full_dsn = 0;
1151
1152         NTOHL(dss_info->mdss_dsn);
1153         NTOHL(dss_info->mdss_subflow_seqn);
1154         NTOHS(dss_info->mdss_data_len);
1155
1156         /* XXX for autosndbuf grow sb here */
1157         MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1158         mptcp_update_rcv_state_meat(mp_tp, tp,
1159             full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1160             csum);
1161 }
1162
1163 void
1164 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1165     u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1166     uint16_t csum)
1167 {
1168         if (mdss_data_len == 0) {
1169                 os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
1170                     __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
1171
1172                 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1173                         os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
1174                             __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
1175                 }
1176                 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1177                 return;
1178         }
1179
1180         mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1181
1182         tp->t_rcv_map.mpt_dsn = full_dsn;
1183         tp->t_rcv_map.mpt_sseq = seqn;
1184         tp->t_rcv_map.mpt_len = mdss_data_len;
1185         tp->t_rcv_map.mpt_csum = csum;
1186         tp->t_mpflags |= TMPF_EMBED_DSN;
1187 }
1188
1189
1190 static int
1191 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1192     int hdrlen)
1193 {
1194         u_int32_t datalen;
1195
1196         if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1197                 return 0;
1198         }
1199
1200         datalen = m->m_pkthdr.mp_rlen;
1201
1202         /* unacceptable DSS option, fallback to TCP */
1203         if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1204                 os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
1205                     __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
1206         } else {
1207                 return 0;
1208         }
1209         tp->t_mpflags |= TMPF_SND_MPFAIL;
1210         mptcp_notify_mpfail(so);
1211         m_freem(m);
1212         return -1;
1213 }
1214
1215 int
1216 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1217     int drop_hdrlen)
1218 {
1219         mptcp_insert_rmap(tp, m, th);
1220         if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1221             drop_hdrlen) != 0) {
1222                 return -1;
1223         }
1224         return 0;
1225 }
1226
1227 /*
1228  * MPTCP Checksum support
1229  * The checksum is calculated whenever the MPTCP DSS option is included
 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
1231  * header and the actual data indicated by the length specified in the
1232  * DSS option.
1233  */
1234
1235 int
1236 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1237     uint32_t sseq, uint16_t dlen, uint16_t csum, uint16_t dfin)
1238 {
1239         uint16_t mptcp_csum;
1240
1241         mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1242         if (mptcp_csum) {
1243                 tp->t_mpflags |= TMPF_SND_MPFAIL;
1244                 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1245                 m_freem(m);
1246                 tcpstat.tcps_mp_badcsum++;
1247                 return -1;
1248         }
1249         return 0;
1250 }
1251
1252 static uint16_t
1253 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1254     uint16_t dlen, uint16_t csum, uint16_t dfin)
1255 {
1256         struct mptcb *mp_tp = tptomptp(tp);
1257         uint16_t real_len = dlen - dfin;
1258         uint32_t sum = 0;
1259
1260         if (mp_tp == NULL) {
1261                 return 0;
1262         }
1263
1264         if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
1265                 return 0;
1266         }
1267
1268         if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
1269                 return 0;
1270         }
1271
1272         /*
1273          * The remote side may send a packet with fewer bytes than the
1274          * claimed DSS checksum length.
1275          */
1276         if ((int)m_length2(m, NULL) < real_len) {
1277                 return 0xffff;
1278         }
1279
1280         if (real_len != 0) {
1281                 sum = m_sum16(m, 0, real_len);
1282         }
1283
1284         sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1285         ADDCARRY(sum);
1286         DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1287             uint32_t, sum);
1288
1289         mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1290             MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1291         return ~sum & 0xffff;
1292 }
1293
1294 uint32_t
1295 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1296 {
1297         uint32_t sum = 0;
1298
1299         if (dlen) {
1300                 sum = m_sum16(m, 0, dlen);
1301         }
1302
1303         dss_val = mptcp_hton64(dss_val);
1304         sseq = htonl(sseq);
1305         dlen = htons(dlen);
1306         sum += in_pseudo64(dss_val, sseq, dlen);
1307
1308         ADDCARRY(sum);
1309         sum = ~sum & 0xffff;
1310         DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1311         mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1312             MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1313
1314         return sum;
1315 }
1316
1317 /*
1318  * When WiFi signal starts fading, there's more loss and RTT spikes.
1319  * Check if there has been a large spike by comparing against
1320  * a tolerable RTT spike threshold.
1321  */
1322 boolean_t
1323 mptcp_no_rto_spike(struct socket *so)
1324 {
1325         struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1326         int32_t spike = 0;
1327
1328         if (tp->t_rxtcur > mptcp_rtothresh) {
1329                 spike = tp->t_rxtcur - mptcp_rtothresh;
1330
1331                 mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1332                     __func__, spike,
1333                     tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1334                     tp->t_rttcur),
1335                     (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1336         }
1337
1338         if (spike > 0) {
1339                 return FALSE;
1340         } else {
1341                 return TRUE;
1342         }
1343 }
1344
1345 void
1346 mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1347 {
1348         VERIFY(mpp->mpp_flags & flag);
1349         mpp->mpp_flags &= ~flag;
1350
1351         if (mptcp_should_defer_upcall(mpp)) {
1352                 return;
1353         }
1354
1355         if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1356                 mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1357
1358                 mptcp_subflow_workloop(mpp->mpp_pcbe);
1359         }
1360
1361         if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1362                 mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1363
1364                 sorwakeup(mpp->mpp_socket);
1365         }
1366
1367         if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1368                 mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1369
1370                 sowwakeup(mpp->mpp_socket);
1371         }
1372 }
1373
1374 void
1375 mptcp_ask_for_nat64(struct ifnet *ifp)
1376 {
1377         in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);
1378
1379         os_log_info(mptcp_log_handle,
1380             "%s: asked for NAT64-prefix on %s\n", __func__,
1381             ifp->if_name);
1382 }
1383
1384 static void
1385 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1386 {
1387         memset(info, 0, sizeof(*info));
1388 }
1389
/*
 * NECP callback for interface events on an MPTCP session.
 *
 * handle:          the session's struct mppcb.
 * action:          NECP_CLIENT_CBACTION_{INITIAL,VIABLE,NONVIABLE}.
 * interface_index: interface the event is about; must not be IFSCOPE_NONE.
 * necp_flags:      NECP_CLIENT_RESULT_FLAG_* bits describing the interface
 *                  (IPv4 / IPv6 / NAT64 availability, low-power mode).
 * viable:          unused.
 *
 * Keeps mpte_itfinfo[] in sync with the set of usable interfaces and
 * schedules subflow creation when that set changes.
 *
 * Locking: for every action other than INITIAL the MPTCP socket lock is
 * taken here and released at `out`. For INITIAL the lock is not taken but
 * is asserted to be owned, so the caller is expected to hold it already.
 */
void
mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
    uint32_t necp_flags, __unused bool *viable)
{
        boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
        boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
        boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
        boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
        struct mppcb *mp = (struct mppcb *)handle;
        struct mptses *mpte = mptompte(mp);
        struct socket *mp_so;
        struct mptcb *mp_tp;
        /* Set once we took the socket lock ourselves, so `out` knows to drop it */
        int locked = 0;
        uint32_t i, ifindex;

        ifindex = interface_index;
        VERIFY(ifindex != IFSCOPE_NONE);

        /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
        if (mp->mpp_socket->so_usecount == 0) {
                return;
        }

        mp_so = mptetoso(mpte);

        if (action != NECP_CLIENT_CBACTION_INITIAL) {
                socket_lock(mp_so, 1);
                locked = 1;

                /* Check again, because it might have changed while waiting */
                if (mp->mpp_socket->so_usecount == 0) {
                        goto out;
                }
        }

        socket_lock_assert_owned(mp_so);

        mp_tp = mpte->mpte_mptcb;

        os_log_info(mptcp_log_handle, "%s - %lx: action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
            mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
            has_v4, has_v6, has_nat64, low_power);

        /* No need on fallen back sockets */
        if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
                goto out;
        }

        /*
         * When the interface goes in low-power mode we don't want to establish
         * new subflows on it. Thus, mark it internally as non-viable.
         */
        if (low_power) {
                action = NECP_CLIENT_CBACTION_NONVIABLE;
        }

        if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
                /* Forget every slot that refers to this interface */
                for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
                        if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
                                continue;
                        }

                        if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
                                mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
                        }
                }

                mptcp_sched_create_subflows(mpte);
        } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
            action == NECP_CLIENT_CBACTION_INITIAL) {
                int found_slot = 0, slot_index = -1;
                struct sockaddr *dst;
                struct ifnet *ifp;

                /*
                 * The ifnet head lock is only held for the table lookup;
                 * ifp is used after it has been dropped.
                 */
                ifnet_head_lock_shared();
                ifp = ifindex2ifnet[ifindex];
                ifnet_head_done();

                if (ifp == NULL) {
                        goto out;
                }

                /* Never use companion-link interfaces */
                if (IFNET_IS_COMPANION_LINK(ifp)) {
                        goto out;
                }

                /* Honor the socket's restrictions on the kind of interface */
                if (IFNET_IS_EXPENSIVE(ifp) &&
                    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
                        goto out;
                }

                if (IFNET_IS_CONSTRAINED(ifp) &&
                    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
                        goto out;
                }

                if (IFNET_IS_CELLULAR(ifp) &&
                    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
                        goto out;
                }

                /* On a CLAT46 interface, ignore the reported IPv4 connectivity */
                if (IS_INTF_CLAT46(ifp)) {
                        has_v4 = FALSE;
                }

                /* Look for the slot on where to store/update the interface-info. */
                for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
                        /*
                         * Found a potential empty slot where we can put it.
                         * Keep scanning: the interface may still show up
                         * later in the array, which takes precedence.
                         */
                        if (mpte->mpte_itfinfo[i].ifindex == 0) {
                                found_slot = 1;
                                slot_index = i;
                        }

                        /*
                         * The interface is already in our array. Check if we
                         * need to update it.
                         */
                        if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
                            (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
                            mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
                            mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
                                found_slot = 1;
                                slot_index = i;
                                break;
                        }

                        if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
                                /*
                                 * Ok, it's already there and we don't need
                                 * to update it
                                 */
                                goto out;
                        }
                }

                /*
                 * The session destination is IPv4 (or unspecified) but the
                 * interface only offers IPv6 without NAT64: record the
                 * connectivity flags if we have a slot (the slot's ifindex
                 * is not written here), request a NAT64 prefix and stop.
                 */
                dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
                if (dst && (dst->sa_family == AF_INET || dst->sa_family == 0) &&
                    has_v6 && !has_nat64 && !has_v4) {
                        if (found_slot) {
                                mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
                                mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
                                mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
                        }
                        mptcp_ask_for_nat64(ifp);
                        goto out;
                }

                /* No free or matching slot: double the array */
                if (found_slot == 0) {
                        int new_size = mpte->mpte_itfinfo_size * 2;
                        struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);

                        if (info == NULL) {
                                os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
                                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
                                goto out;
                        }

                        memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));

                        /*
                         * Only free the old array if it has been grown before.
                         * NOTE(review): presumably the initial array of
                         * MPTE_ITFINFO_SIZE entries is not heap-allocated -
                         * confirm against the session setup code.
                         */
                        if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
                                _FREE(mpte->mpte_itfinfo, M_TEMP);
                        }

                        /* We allocated a new one, thus the first must be empty */
                        slot_index = mpte->mpte_itfinfo_size;

                        mpte->mpte_itfinfo = info;
                        mpte->mpte_itfinfo_size = new_size;
                }

                VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
                mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
                mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
                mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
                mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;

                mptcp_sched_create_subflows(mpte);
        }

out:
        /* Only unlock if the lock was taken in this function */
        if (locked) {
                socket_unlock(mp_so, 1);
        }
}
1575
1576 void
1577 mptcp_set_restrictions(struct socket *mp_so)
1578 {
1579         struct mptses *mpte = mpsotompte(mp_so);
1580         uint32_t i;
1581
1582         socket_lock_assert_owned(mp_so);
1583
1584         ifnet_head_lock_shared();
1585
1586         for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1587                 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1588                 uint32_t ifindex = info->ifindex;
1589                 struct ifnet *ifp;
1590
1591                 if (ifindex == IFSCOPE_NONE) {
1592                         continue;
1593                 }
1594
1595                 ifp = ifindex2ifnet[ifindex];
1596                 if (ifp == NULL) {
1597                         continue;
1598                 }
1599
1600                 if (IFNET_IS_EXPENSIVE(ifp) &&
1601                     (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1602                         info->ifindex = IFSCOPE_NONE;
1603                 }
1604
1605                 if (IFNET_IS_CONSTRAINED(ifp) &&
1606                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1607                         info->ifindex = IFSCOPE_NONE;
1608                 }
1609
1610                 if (IFNET_IS_CELLULAR(ifp) &&
1611                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1612                         info->ifindex = IFSCOPE_NONE;
1613                 }
1614         }
1615
1616         ifnet_head_done();
1617 }