bsd/netinet/mptcp.c

   1 /*
   2  * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  * A note on the MPTCP/NECP-interactions:
  31  *
  32  * MPTCP uses NECP-callbacks to get notified of interface/policy events.
  33  * MPTCP registers to these events at the MPTCP-layer for interface-events
  34  * through a call to necp_client_register_multipath_cb.
  35  * To get per-flow events (aka per TCP-subflow), we register to it with
  36  * necp_client_register_socket_flow. Both registrations happen by using the
  37  * necp-client-uuid that comes from the app.
  38  *
  39  * The locking is rather tricky. In general, we expect the lock-ordering to
  40  * happen from necp-fd -> necp-client -> mpp_lock.
  41  *
  42  * There are however some subtleties.
  43  *
  44  * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
  45  * safe, because it is the very first time this MPTCP-connection goes into NECP.
  46  * As we go into NECP we take the NECP-locks and thus are guaranteed that no
  47  * NECP-locks will deadlock us. Because these NECP-events will also first take
  48  * the NECP-locks. Either they win the race and thus won't find our
  49  * MPTCP-connection. Or, MPTCP wins the race and thus it will safely install
  50  * the callbacks while holding the NECP lock.
  51  *
  52  * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This,
  53  * because we have already registered callbacks and we might race against an
  54  * NECP-event that will match on our socket. So, we have to unlock to be safe.
  55  *
  56  * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). The
  57  * so_usecount has reached 0. We must be careful to not remove the mpp_socket
  58  * pointers before we unregistered the callback. Because, again we might be
  59  * racing against an NECP-event. Unregistering must happen with an unlocked
  60  * mpp_lock, because of the lock-ordering constraint. It could be that
  61  * before we had a chance to unregister an NECP-event triggers. That's why
  62  * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
  63  * there while the socket is being garbage-collected, the use-count will go
  64  * down to 0 and we exit. Removal of the multipath_cb again happens by taking
  65  * the NECP-locks so any running NECP-events will finish first and exit cleanly.
  66  *
  67  * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
  68  * the socket-lock must be unlocked for lock-ordering constraints. This gets a
  69  * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
  70  * So, we drop the mp_so-lock as soon as the subflow is unlinked with
  71  * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
  72  * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
  73  * gets it, it will realize that the subflow became non-MPTCP and retry (see
  74  * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
  75  * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
  76  * for the NECP-lock (held by the other thread that is taking care of the NECP-
  77  * event). So, the event now finally gets the subflow-lock and then hits an
  78  * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
  79  * the NECP callback.
  80  */
  81
  82 #include <sys/param.h>
  83 #include <sys/systm.h>
  84 #include <sys/kernel.h>
  85 #include <sys/mbuf.h>
  86 #include <sys/mcache.h>
  87 #include <sys/socket.h>
  88 #include <sys/socketvar.h>
  89 #include <sys/syslog.h>
  90 #include <sys/protosw.h>
  91
  92 #include <kern/zalloc.h>
  93 #include <kern/locks.h>
  94
  95 #include <mach/sdt.h>
  96
  97 #include <net/if.h>
  98 #include <netinet/in.h>
  99 #include <netinet/in_var.h>
 100 #include <netinet/tcp.h>
 101 #include <netinet/tcp_fsm.h>
 102 #include <netinet/tcp_seq.h>
 103 #include <netinet/tcp_var.h>
 104 #include <netinet/mptcp_var.h>
 105 #include <netinet/mptcp.h>
 106 #include <netinet/mptcp_seq.h>
 107 #include <netinet/mptcp_opt.h>
 108 #include <netinet/mptcp_timer.h>
 109
     /*
      * Global MPTCP enable flag (default 1), exported read-write as the
      * "enable" sysctl under _net_inet_mptcp.
      */
 110 int mptcp_enable = 1;
 111 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
 112     &mptcp_enable, 0, "Enable Multipath TCP Support");
 113
 114 /*
 115  * Number of times to try negotiating MPTCP on SYN retransmissions.
 116  * We haven't seen any reports of a middlebox that is dropping all SYN-segments
 117  * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
 118  */
 119 int mptcp_mpcap_retries = 4;
 120 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
 121     CTLFLAG_RW | CTLFLAG_LOCKED,
 122     &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
 123
 124 /*
 125  * By default, DSS checksum is turned off, revisit if we ever do
 126  * MPTCP for non SSL Traffic.
 127  */
 128 int mptcp_dss_csum = 0;
 129 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
 130     &mptcp_dss_csum, 0, "Enable DSS checksum");
 131
 132 /*
 133  * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 134  * is attempted on a different path.
 135  */
 136 int mptcp_fail_thresh = 1;
 137 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
 138     &mptcp_fail_thresh, 0, "Failover threshold");
 139
 140
 141 /*
 142  * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 143  * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 144  * Some carrier networks have a timeout of 10 or 15 minutes.
 145  */
 146 int mptcp_subflow_keeptime = 60 * 14;
 147 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
 148     &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
 149
     /*
      * RTT threshold exported as the "rtthist_thresh" sysctl (default 600).
      * NOTE(review): the unit and the code consuming this value are not
      * visible next to the definition -- confirm before changing it.
      */
 150 int mptcp_rtthist_rtthresh = 600;
 151 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
 152     &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
 153
 154 /*
 155  * Use RTO history for sending new data
      *
      * NOTE(review): the sysctl description string below reads "Disable RTO
      * for subflow selection" while the flag defaults to 1; presumably it
      * means "set to 0 to disable" -- confirm.
 156  */
 157 int mptcp_use_rto = 1;
 158 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
 159     &mptcp_use_rto, 0, "Disable RTO for subflow selection");
 160
     /*
      * RTO threshold exported as the "rto_thresh" sysctl (default 1500).
      * NOTE(review): unit not visible next to the definition -- confirm.
      */
 161 int mptcp_rtothresh = 1500;
 162 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
 163     &mptcp_rtothresh, 0, "RTO threshold");
 164
 165 /*
 166  * Probe the preferred path, when it is not in use
      *
      * mptcp_output() compares (tcp_now - mpts_probesoon) of the preferred
      * subflow against this value and skips probing altogether when it is 0.
 167  */
 168 uint32_t mptcp_probeto = 1000;
 169 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
 170     &mptcp_probeto, 0, "Disable probing by setting to 0");
 171
     /*
      * Probe budget: mptcp_output() resets the preferred subflow's probing
      * state once its mpts_probecnt has reached this value.
      */
 172 uint32_t mptcp_probecnt = 5;
 173 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
 174     &mptcp_probecnt, 0, "Number of probe writes");
 175
 176 /*
 177  * Static declarations
 178  */
     /* Forward declaration; being static, the definition lives in this translation unit. */
 179 static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
 180     uint32_t, uint16_t, uint16_t, uint16_t);
 181
     /*
      * Hand in-sequence data from the MPTCP reassembly queue to the socket.
      *
      * Starting at the head of mp_tp->mpt_segq, every entry whose data
      * sequence number equals mpt_rcvnxt is unlinked, mpt_rcvnxt is advanced
      * by its length, and its mbuf is appended to mp_so's receive buffer (or
      * freed when the socket has SS_CANTRCVMORE set). The queue entry goes
      * back to tcp_reass_zone and mpt_reassqlen is decremented.
      *
      * Nothing is done while the connection is below MPTCPS_ESTABLISHED, when
      * the head of the queue is not the next expected byte, or while
      * MPTCPF_REASS_INPROG is already set.
      *
      * Returns non-zero if the last segment appended to the socket carried
      * PKTF_MPTCP_DFIN, 0 otherwise (including when nothing was appended).
      */
 182 static int
 183 mptcp_reass_present(struct socket *mp_so)
 184 {
 185         struct mptses *mpte = mpsotompte(mp_so);
 186         struct mptcb *mp_tp = mpte->mpte_mptcb;
 187         struct tseg_qent *q;
 188         int dowakeup = 0;
 189         int flags = 0;
 190
 191         /*
 192          * Present data to user, advancing rcv_nxt through
 193          * completed sequence space.
 194          */
 195         if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
 196                 return flags;
 197         }
 198         q = LIST_FIRST(&mp_tp->mpt_segq);
 199         if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
 200                 return flags;
 201         }
 202
 203         /*
 204          * If there is already another thread doing reassembly for this
 205          * connection, it is better to let it finish the job --
 206          * (radar 16316196)
 207          */
 208         if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
 209                 return flags;
 210         }
 211
 212         mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
 213
 214         do {
 215                 mp_tp->mpt_rcvnxt += q->tqe_len;
 216                 LIST_REMOVE(q, tqe_q);
 217                 if (mp_so->so_state & SS_CANTRCVMORE) {
 218                         m_freem(q->tqe_m);
 219                 } else {
                             /*
                              * Overwritten for every appended segment, so only
                              * the last one decides the return value. Read
                              * before the mbuf is passed on to the socket buffer.
                              */
 220                         flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
 221                         if (sbappendstream_rcvdemux(mp_so, q->tqe_m, 0, 0)) {
 222                                 dowakeup = 1;
 223                         }
 224                 }
 225                 zfree(tcp_reass_zone, q);
 226                 mp_tp->mpt_reassqlen--;
 227                 q = LIST_FIRST(&mp_tp->mpt_segq);
 228         } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
 229         mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
 230
             /* a single wakeup covers all segments appended above */
 231         if (dowakeup) {
 232                 sorwakeup(mp_so); /* done with socket lock held */
 233         }
 234         return flags;
 235 }
 236
     /*
      * Insert one segment into the MPTCP reassembly queue and deliver whatever
      * became contiguous.
      *
      * mp_so  - MPTCP socket owning the queue.
      * phdr   - packet header of m; phdr->mp_dsn is advanced when the head of
      *          the segment is trimmed against the preceding queue entry.
      * tlenp  - in: payload length of the segment. out: reduced by whatever was
      *          trimmed from the head; set to 0 when the segment is dropped
      *          because the queue is full or no queue entry could be allocated.
      * m      - mbuf chain holding the payload. It is consumed on every path:
      *          either freed or linked into the queue.
      *
      * Returns the result of mptcp_reass_present() (non-zero when a DATA_FIN
      * was delivered), or 0 when the segment was dropped before being queued.
      */
 237 static int
 238 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
 239 {
 240         struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
 241         u_int64_t mb_dsn = phdr->mp_dsn;
 242         struct tseg_qent *q;
 243         struct tseg_qent *p = NULL;
 244         struct tseg_qent *nq;
 245         struct tseg_qent *te = NULL;
 246         u_int16_t qlimit;
 247
 248         /*
 249          * Limit the number of segments in the reassembly queue to prevent
 250          * holding on to too many segments (and thus running out of mbufs).
 251          * Make sure to let the missing segment through which caused this
 252          * queue.  Always keep one global queue entry spare to be able to
 253          * process the missing segment.
 254          */
 255         qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
 256             (tcp_autorcvbuf_max >> 10));
 257         if (mb_dsn != mp_tp->mpt_rcvnxt &&
 258             (mp_tp->mpt_reassqlen + 1) >= qlimit) {
 259                 tcpstat.tcps_mptcp_rcvmemdrop++;
 260                 m_freem(m);
 261                 *tlenp = 0;
 262                 return 0;
 263         }
 264
 265         /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
 266         te = (struct tseg_qent *) zalloc(tcp_reass_zone);
 267         if (te == NULL) {
 268                 tcpstat.tcps_mptcp_rcvmemdrop++;
 269                 m_freem(m);
                     /* dropped: report zero accepted bytes, like the queue-limit path */
                     *tlenp = 0;
 270                 return 0;
 271         }
 272
 273         mp_tp->mpt_reassqlen++;
 274
 275         /*
 276          * Find a segment which begins after this one does.
 277          */
 278         LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
 279                 if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
 280                         break;
 281                 }
 282                 p = q;
 283         }
 284
 285         /*
 286          * If there is a preceding segment, it may provide some of
 287          * our data already.  If so, drop the data from the incoming
 288          * segment.  If it provides all of our data, drop us.
 289          */
 290         if (p != NULL) {
 291                 int64_t i;
 292                 /* conversion to int (in i) handles seq wraparound */
 293                 i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
 294                 if (i > 0) {
 295                         if (i >= *tlenp) {
 296                                 tcpstat.tcps_mptcp_rcvduppack++;
 297                                 m_freem(m);
 298                                 zfree(tcp_reass_zone, te);
 299                                 te = NULL;
 300                                 mp_tp->mpt_reassqlen--;
 301                                 /*
 302                                  * Try to present any queued data
 303                                  * at the left window edge to the user.
 304                                  * This is needed after the 3-WHS
 305                                  * completes.
 306                                  */
 307                                 goto out;
 308                         }
 309                         m_adj(m, i);
 310                         *tlenp -= i;
 311                         phdr->mp_dsn += i;
                             /*
                              * Keep the local copy in step with the header.
                              * The overlap check against the following
                              * segments computes our end as mb_dsn + *tlenp;
                              * with a stale start the end would come out i
                              * bytes short and overlapping successors would
                              * stay untrimmed in the queue.
                              */
                             mb_dsn += i;
 312                 }
 313         }
 314
 315         tcpstat.tcps_mp_oodata++;
 316
 317         /*
 318          * While we overlap succeeding segments trim them or,
 319          * if they are completely covered, dequeue them.
 320          */
 321         while (q) {
 322                 int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
 323                 if (i <= 0) {
 324                         break;
 325                 }
 326
                     /* partial overlap: cut the head off the queued segment and stop */
 327                 if (i < q->tqe_len) {
 328                         q->tqe_m->m_pkthdr.mp_dsn += i;
 329                         q->tqe_len -= i;
 330                         m_adj(q->tqe_m, i);
 331                         break;
 332                 }
 333
                     /* fully covered by the new segment: drop the queued one */
 334                 nq = LIST_NEXT(q, tqe_q);
 335                 LIST_REMOVE(q, tqe_q);
 336                 m_freem(q->tqe_m);
 337                 zfree(tcp_reass_zone, q);
 338                 mp_tp->mpt_reassqlen--;
 339                 q = nq;
 340         }
 341
 342         /* Insert the new segment queue entry into place. */
 343         te->tqe_m = m;
 344         te->tqe_th = NULL;
 345         te->tqe_len = *tlenp;
 346
 347         if (p == NULL) {
 348                 LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
 349         } else {
 350                 LIST_INSERT_AFTER(p, te, tqe_q);
 351         }
 352
 353 out:
 354         return mptcp_reass_present(mp_so);
 355 }
 356
 357 /*
 358  * MPTCP input, called when data has been read from a subflow socket.
      *
      * The caller must hold the MPTCP socket lock and m must start with a
      * packet-header mbuf (both are asserted below).
      *
      * Fallback path (MPTCPF_FALLBACK_TO_TCP set, or the mbuf being processed
      * lacks PKTF_MPTCP): zero-length packet-header mbufs are unlinked and
      * freed, the rest of the chain is appended to the receive buffer as-is
      * and mpt_rcvnxt advances by the number of bytes the buffer grew.
      *
      * Regular path: the chain is cut into pieces, each starting at an mbuf
      * that carries a DSS mapping (M_PKTHDR and PKTF_MPTCP). A piece is clipped
      * to the receive window, stripped of data below mpt_rcvnxt, and then
      * either handed to mptcp_reass() or appended directly. Pieces that are
      * dropped entirely are chained on a local freelist and freed at the end.
      *
      * When a DATA_FIN is reported for delivered data, mptcp_close_fsm() is
      * driven with MPCE_RECV_DATA_FIN and socantrcvmore() is called.
 359  */
 360 void
 361 mptcp_input(struct mptses *mpte, struct mbuf *m)
 362 {
 363         struct socket *mp_so;
 364         struct mptcb *mp_tp = NULL;
 365         int count = 0, wakeup = 0;
 366         struct mbuf *save = NULL, *prev = NULL;
 367         struct mbuf *freelist = NULL, *tail = NULL;
 368
 369         VERIFY(m->m_flags & M_PKTHDR);
 370
 371         mp_so = mptetoso(mpte);
 372         mp_tp = mpte->mpte_mptcb;
 373
 374         socket_lock_assert_owned(mp_so);
 375
 376         DTRACE_MPTCP(input);
 377
 378         mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
 379
 380         /*
 381          * Each mbuf contains MPTCP Data Sequence Map
 382          * Process the data for reassembly, delivery to MPTCP socket
 383          * client, etc.
 384          *
 385          */
 386         count = mp_so->so_rcv.sb_cc;
 387
 388         /*
 389          * In the degraded fallback case, data is accepted without DSS map
 390          */
 391         if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
 392                 struct mbuf *iter;
 393                 int mb_dfin = 0;
 394 fallback:
 395                 mptcp_sbrcv_grow(mp_tp);
 396
 397                 iter = m;
 398                 while (iter) {
 399                         if ((iter->m_flags & M_PKTHDR) &&
 400                             (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
 401                                 mb_dfin = 1;
 402                         }
 403
 404                         if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
 405                                 /* Don't add zero-length packets, so jump it! */
 406                                 if (prev == NULL) {
 407                                         m = iter->m_next;
 408                                         m_free(iter);
 409                                         iter = m;
 410                                 } else {
 411                                         prev->m_next = iter->m_next;
 412                                         m_free(iter);
 413                                         iter = prev->m_next;
 414                                 }
 415
 416                                 /* It was a zero-length packet so next one must be a pkthdr */
 417                                 VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
 418                         } else {
 419                                 prev = iter;
 420                                 iter = iter->m_next;
 421                         }
 422                 }
 423
 424                 /*
 425                  * assume degraded flow as this may be the first packet
 426                  * without DSS, and the subflow state is not updated yet.
 427                  */
 428                 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
 429                         sorwakeup(mp_so);
 430                 }
 431
 432                 DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
 433                     struct socket *, mp_so,
 434                     struct sockbuf *, &mp_so->so_rcv,
 435                     struct sockbuf *, &mp_so->so_snd,
 436                     struct mptses *, mpte);
                     /* bytes this append added to the receive buffer */
 437                 count = mp_so->so_rcv.sb_cc - count;
 438
 439                 mp_tp->mpt_rcvnxt += count;
 440
 441                 if (mb_dfin) {
 442                         mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
 443                         socantrcvmore(mp_so);
 444                 }
 445                 return;
 446         }
 447
 448         do {
 449                 u_int64_t mb_dsn;
 450                 int32_t mb_datalen;
 451                 int64_t todrop;
 452                 int mb_dfin = 0;
 453
 454                 VERIFY(m->m_flags & M_PKTHDR);
 455
 456                 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
 457                 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
 458                         goto fallback;
 459                 }
 460
 461                 save = m->m_next;
 462                 /*
 463                  * A single TCP packet formed of multiple mbufs
 464                  * holds DSS mapping in the first mbuf of the chain.
 465                  * Other mbufs in the chain may have M_PKTHDR set
 466                  * even though they belong to the same TCP packet
 467                  * and therefore use the DSS mapping stored in the
 468                  * first mbuf of the mbuf chain. mptcp_input() can
 469                  * get an mbuf chain with multiple TCP packets.
 470                  */
 471                 while (save && (!(save->m_flags & M_PKTHDR) ||
 472                     !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
 473                         prev = save;
 474                         save = save->m_next;
 475                 }
                     /* detach this piece from the rest of the chain (kept in save) */
 476                 if (prev) {
 477                         prev->m_next = NULL;
 478                 } else {
 479                         m->m_next = NULL;
 480                 }
 481
 482                 mb_dsn = m->m_pkthdr.mp_dsn;
 483                 mb_datalen = m->m_pkthdr.mp_rlen;
 484
                     /* bytes of this mapping that lie past the right edge of the receive window */
 485                 todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
 486                 if (todrop > 0) {
 487                         tcpstat.tcps_mptcp_rcvpackafterwin++;
 488
 489                         os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
 490                             __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
 491                             (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
 492                             mp_tp->mpt_rcvwnd, todrop);
 493
 494                         if (todrop >= mb_datalen) {
                                     /* nothing fits in the window: park the whole piece on the freelist */
 495                                 if (freelist == NULL) {
 496                                         freelist = m;
 497                                 } else {
 498                                         tail->m_next = m;
 499                                 }
 500
 501                                 if (prev != NULL) {
 502                                         tail = prev;
 503                                 } else {
 504                                         tail = m;
 505                                 }
 506
 507                                 m = save;
 508                                 prev = save = NULL;
 509                                 continue;
 510                         } else {
 511                                 m_adj(m, -todrop);
 512                                 mb_datalen -= todrop;
 513                                 m->m_pkthdr.mp_rlen -= todrop;
 514                         }
 515
 516                         /*
 517                          * We drop from the right edge of the mbuf, thus the
 518                          * DATA_FIN is dropped as well
 519                          */
 520                         m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
 521                 }
 522
                     /* piece starts below mpt_rcvnxt: drop it if fully received, else trim its head */
 523                 if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
 524                         if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
 525                             mp_tp->mpt_rcvnxt)) {
 526                                 if (freelist == NULL) {
 527                                         freelist = m;
 528                                 } else {
 529                                         tail->m_next = m;
 530                                 }
 531
 532                                 if (prev != NULL) {
 533                                         tail = prev;
 534                                 } else {
 535                                         tail = m;
 536                                 }
 537
 538                                 m = save;
 539                                 prev = save = NULL;
 540                                 continue;
 541                         } else {
 542                                 m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
 543                                 mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
 544                                 mb_dsn = mp_tp->mpt_rcvnxt;
 545                                 m->m_pkthdr.mp_rlen = mb_datalen;
 546                                 m->m_pkthdr.mp_dsn = mb_dsn;
 547                         }
 548                 }
 549
                     /* out of order, or earlier segments are still queued: go through reassembly */
 550                 if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
 551                     !LIST_EMPTY(&mp_tp->mpt_segq)) {
 552                         mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
 553
 554                         goto next;
 555                 }
 556                 mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
 557
 558                 mptcp_sbrcv_grow(mp_tp);
 559
 560                 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
 561                         wakeup = 1;
 562                 }
 563
 564                 DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
 565                     struct sockbuf *, &mp_so->so_rcv,
 566                     struct sockbuf *, &mp_so->so_snd,
 567                     struct mptses *, mpte,
 568                     struct mptcb *, mp_tp);
                     /* bytes this append added to the receive buffer */
 569                 count = mp_so->so_rcv.sb_cc - count;
 570                 tcpstat.tcps_mp_rcvtotal++;
 571                 tcpstat.tcps_mp_rcvbytes += count;
 572
 573                 mp_tp->mpt_rcvnxt += count;
 574
 575 next:
 576                 if (mb_dfin) {
 577                         mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
 578                         socantrcvmore(mp_so);
 579                 }
 580                 m = save;
 581                 prev = save = NULL;
                     /* re-baseline for the next piece's byte accounting */
 582                 count = mp_so->so_rcv.sb_cc;
 583         } while (m);
 584
 585         if (freelist) {
 586                 m_freem(freelist);
 587         }
 588
 589         if (wakeup) {
 590                 sorwakeup(mp_so);
 591         }
 592 }
 593
     /*
      * Decide whether the MPTCP-level sender currently has anything it may
      * transmit.
      *
      * mp_tp            - MPTCP control block to inspect.
      * ignore_reinject  - when FALSE, a non-empty reinject queue alone makes
      *                    the function return TRUE.
      *
      * Returns TRUE when sending may proceed, FALSE when one of the
      * conditions checked in the body rules it out.
      */
 594 boolean_t
 595 mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
 596 {
 597         struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 598
 599         /*
 600          * Always send if there is data in the reinject-queue.
 601          */
 602         if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
 603                 return TRUE;
 604         }
 605
 606         /*
 607          * Don't send, if:
 608          *
 609          * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
 610          *    Except when using TFO, we might be doing a 0-byte write.
 611          * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
 612          * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
 613          */
 614
 615         if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
 616                 return FALSE;
 617         }
 618
 619         if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
 620                 return FALSE;
 621         }
 622
 623         if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
 624                 return FALSE;
 625         }
 626
             /*
              * Not part of the list above: nothing more is sent once the
              * connection has reached MPTCPS_FIN_WAIT_2 or a later state.
              */
 627         if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
 628                 return FALSE;
 629         }
 630
 631         return TRUE;
 632 }
 633
 634 /*
 635  * MPTCP output.
      *
      * Pushes MPTCP-level data onto subflows for as long as
      * mptcp_can_send_more() allows. The caller must hold the MPTCP socket
      * lock. MPP_WUPCALL is set on the PCB for the duration of the call and
      * handed to mptcp_handle_deferred_upcalls() before returning.
      *
      * Each pass picks a subflow with mptcp_get_subflow() and transmits
      * through mptcp_subflow_output(). A failing subflow is flagged
      * MPTSF_FAILINGOVER, loses MPTSF_ACTIVE, and ends the loop.
      *
      * Always returns 0: subflow errors are not propagated to the caller.
 636  */
 637 int
 638 mptcp_output(struct mptses *mpte)
 639 {
 640         struct mptcb *mp_tp;
 641         struct mptsub *mpts;
 642         struct mptsub *mpts_tried = NULL;
 643         struct socket *mp_so;
 644         struct mptsub *preferred_mpts = NULL;
 645         uint64_t old_snd_nxt;
 646         int error = 0;
 647
 648         mp_so = mptetoso(mpte);
 649         socket_lock_assert_owned(mp_so);
 650         mp_tp = mpte->mpte_mptcb;
 651
 652         VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
 653         mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
 654
             /* NOTE(review): old_snd_nxt is stored here but never read again in this function */
 655         old_snd_nxt = mp_tp->mpt_sndnxt;
 656         while (mptcp_can_send_more(mp_tp, FALSE)) {
 657                 /* get the "best" subflow to be used for transmission */
 658                 mpts = mptcp_get_subflow(mpte, &preferred_mpts);
 659                 if (mpts == NULL) {
 660                         mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
 661                             MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
 662                         break;
 663                 }
 664
 665                 /* In case there's just one flow, we reattempt later */
                     /*
                      * NOTE(review): mpts_tried is only assigned in the error
                      * branch further down, which leaves the loop right away,
                      * so as written this condition cannot become true here.
                      */
 666                 if (mpts_tried != NULL &&
 667                     (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
 668                         mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
 669                         mpts_tried->mpts_flags |= MPTSF_ACTIVE;
 670                         mptcp_start_timer(mpte, MPTT_REXMT);
 671                         break;
 672                 }
 673
 674                 /*
 675                  * Automatic sizing of send socket buffer. Increase the send
 676                  * socket buffer size if all of the following criteria are met
 677                  *      1. the receiver has enough buffer space for this data
 678                  *      2. send buffer is filled to 7/8th with data (so we actually
 679                  *         have data to make use of it);
 680                  */
 681                 if (tcp_do_autosendbuf == 1 &&
 682                     (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
 683                     tcp_cansbgrow(&mp_so->so_snd)) {
 684                         if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
 685                             mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
 686                                 if (sbreserve(&mp_so->so_snd,
 687                                     min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
 688                                     tcp_autosndbuf_max)) == 1) {
 689                                         mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
 690                                 }
 691                         }
 692                 }
 693
 694                 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
 695                     struct socket *, mp_so);
 696                 error = mptcp_subflow_output(mpte, mpts, 0);
 697                 if (error) {
 698                         /* can be a temporary loss of source address or other error */
 699                         mpts->mpts_flags |= MPTSF_FAILINGOVER;
 700                         mpts->mpts_flags &= ~MPTSF_ACTIVE;
 701                         mpts_tried = mpts;
 702                         if (error != ECANCELED) {
 703                                 os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
 704                                     __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
 705                                     error, mpts->mpts_flags);
 706                         }
 707                         break;
 708                 }
 709                 /* The model is to have only one active flow at a time */
 710                 mpts->mpts_flags |= MPTSF_ACTIVE;
 711                 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
 712
 713                 /* Allows us to update the smoothed rtt */
                     /*
                      * Only when probing is enabled and data just went out on a
                      * subflow other than the preferred one: the first pass
                      * stamps mpts_probesoon, later passes send a probe once
                      * more than mptcp_probeto has elapsed since that stamp.
                      */
 714                 if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
 715                         if (preferred_mpts->mpts_probesoon) {
 716                                 if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
 717                                         mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
 718                                         if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
 719                                                 preferred_mpts->mpts_probesoon = 0;
 720                                                 preferred_mpts->mpts_probecnt = 0;
 721                                         }
 722                                 }
 723                         } else {
 724                                 preferred_mpts->mpts_probesoon = tcp_now;
 725                                 preferred_mpts->mpts_probecnt = 0;
 726                         }
 727                 }
 728
                     /* remember the subflow just used; on a change, deactivate the old one and count the switch */
 729                 if (mpte->mpte_active_sub == NULL) {
 730                         mpte->mpte_active_sub = mpts;
 731                 } else if (mpte->mpte_active_sub != mpts) {
 732                         mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
 733                         mpte->mpte_active_sub = mpts;
 734
 735                         mptcpstats_inc_switch(mpte, mpts);
 736                 }
 737         }
 738
             /*
              * Past MPTCPS_CLOSE_WAIT with only one sequence number left
              * between mpt_sndnxt and mpt_sndmax and everything before it
              * acknowledged: finish the user-initiated close.
              */
 739         if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
 740                 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
 741                     mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
 742                         mptcp_finish_usrclosed(mpte);
 743                 }
 744         }
 745
 746         mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
 747
 748         /* subflow errors should not be percolated back up */
 749         return 0;
 750 }
 751
 752
 753 static struct mptsub *
 754 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
 755 {
 756         struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 757
 758         /*
 759          * Lower RTT? Take it, if it's our first one, or
 760          * it doesn't has any loss, or the current one has
 761          * loss as well.
 762          */
 763         if (tp->t_srtt && *currtt > tp->t_srtt &&
 764             (curbest == NULL || tp->t_rxtshift == 0 ||
 765             sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
 766                 *currtt = tp->t_srtt;
 767                 return mpts;
 768         }
 769
 770         /*
 771          * If we find a subflow without loss, take it always!
 772          */
 773         if (curbest &&
 774             sototcpcb(curbest->mpts_socket)->t_rxtshift &&
 775             tp->t_rxtshift == 0) {
 776                 *currtt = tp->t_srtt;
 777                 return mpts;
 778         }
 779
 780         return curbest != NULL ? curbest : mpts;
 781 }
 782
 783 static struct mptsub *
 784 mptcp_return_subflow(struct mptsub *mpts)
 785 {
 786         if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
 787                 return NULL;
 788         }
 789
 790         return mpts;
 791 }
 792
 793 static boolean_t
 794 mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
 795 {
 796         struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
 797         int fail_thresh = mptcp_fail_thresh;
 798
 799         if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
 800                 fail_thresh *= 2;
 801         }
 802
 803         return tp->t_rxtshift >= fail_thresh &&
 804                (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
 805 }
 806
 807 /*
 808  * Return the most eligible subflow to be used for sending data.
      *
      * The subflow list of `mpte' is scanned once. Subflows that pass the
      * hard checks are split by interface type: the non-cellular winner is
      * kept in `best', the cellular winner in `second_best' (each picked by
      * mptcp_choose_subflow()). Two cases short-circuit the scan:
      *  - a subflow flagged MPTSF_MP_DEGRADED becomes `best' and ends the loop;
      *  - a subflow with SOF1_PRECONNECT_DATA set is returned right away.
      *
      * When both a non-cellular and a cellular candidate exist, the session's
      * service type (mpte_svctype) decides between them. Every return goes
      * through mptcp_return_subflow(), so the result is NULL when the chosen
      * subflow has no congestion-window space, or when no candidate was found.
      *
      * `preferred' is optional. It is written only when both candidates
      * exist, and then receives mptcp_return_subflow(best) as computed before
      * the service-type specific selection.
 809  */
 810 struct mptsub *
 811 mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
 812 {
 813         struct tcpcb *besttp, *secondtp;
 814         struct inpcb *bestinp, *secondinp;
 815         struct mptsub *mpts;
 816         struct mptsub *best = NULL;
 817         struct mptsub *second_best = NULL;
             /* Running RTT of the cellular (exp) and non-cellular (cheap) winner */
 818         int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;
 819
 820         /*
 821          * First Step:
 822          * Choose the best subflow for cellular and non-cellular interfaces.
 823          */
 824
 825         TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
 826                 struct socket *so = mpts->mpts_socket;
 827                 struct tcpcb *tp = sototcpcb(so);
 828                 struct inpcb *inp = sotoinpcb(so);
 829
 830                 mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
 831                     __func__, mpts->mpts_connid, mpts->mpts_flags,
 832                     INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
 833                     inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
 834                     tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
 835                     mptcp_subflow_cwnd_space(so)),
 836                     MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 837
 838                 /*
 839                  * First, the hard conditions to reject subflows
 840                  * (e.g., not connected,...)
 841                  */
 842                 if (inp->inp_last_outifp == NULL) {
 843                         continue;
 844                 }
 845
 846                 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
 847                         continue;
 848                 }
 849
 850                 /* There can only be one subflow in degraded state */
 851                 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
 852                         best = mpts;
 853                         break;
 854                 }
 855
 856                 /*
 857                  * If this subflow is waiting to finally send, do it!
 858                  */
 859                 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
 860                         return mptcp_return_subflow(mpts);
 861                 }
 862
 863                 /*
 864                  * Only send if the subflow is MP_CAPABLE. The exceptions to
 865                  * this rule (degraded or TFO) have been taken care of above.
 866                  */
 867                 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
 868                         continue;
 869                 }
 870
                     /* Socket and TCP state must both say "connected, not closing" */
 871                 if ((so->so_state & SS_ISDISCONNECTED) ||
 872                     !(so->so_state & SS_ISCONNECTED) ||
 873                     !TCPS_HAVEESTABLISHED(tp->t_state) ||
 874                     tp->t_state > TCPS_CLOSE_WAIT) {
 875                         continue;
 876                 }
 877
 878                 /*
 879                  * Second, the soft conditions to find the subflow with best
 880                  * conditions for each set (aka cellular vs non-cellular)
 881                  */
 882                 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
 883                         second_best = mptcp_choose_subflow(mpts, second_best,
 884                             &exp_rtt);
 885                 } else {
 886                         best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
 887                 }
 888         }
 889
 890         /*
 891          * With a candidate in only one of the two sets there is nothing to
 892          * arbitrate: hand back that one (or NULL if neither set has one).
 893          */
 894         if (best == NULL) {
 895                 return mptcp_return_subflow(second_best);
 896         }
 897
 898         if (second_best == NULL) {
 899                 return mptcp_return_subflow(best);
 900         }
 901
 902         besttp = sototcpcb(best->mpts_socket);
 903         bestinp = sotoinpcb(best->mpts_socket);
 904         secondtp = sototcpcb(second_best->mpts_socket);
 905         secondinp = sotoinpcb(second_best->mpts_socket);
 906
             /* Report `best' (NULL if it has no cwnd space) before arbitration */
 907         if (preferred != NULL) {
 908                 *preferred = mptcp_return_subflow(best);
 909         }
 910
 911         /*
 912          * Second Step: Among best and second_best. Choose the one that is
 913          * most appropriate for this particular service-type.
 914          */
 915         if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
 916                 /*
 917                  * Only handover if Symptoms tells us to do so.
 918                  */
 919                 if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
 920                     mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
 921                         return mptcp_return_subflow(second_best);
 922                 }
 923
 924                 return mptcp_return_subflow(best);
 925         } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
 926                 int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
 927                 int rto_thresh = mptcp_rtothresh;
 928
 929                 /* Adjust with symptoms information */
 930                 if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
 931                     mptcp_is_wifi_unusable_for_session(mpte) != 0) {
 932                         rtt_thresh /= 2;
 933                         rto_thresh /= 2;
 934                 }
 935
                     /* Switch if best's srtt is over the threshold and second's is under */
 936                 if (besttp->t_srtt && secondtp->t_srtt &&
 937                     besttp->t_srtt >= rtt_thresh &&
 938                     secondtp->t_srtt < rtt_thresh) {
 939                         tcpstat.tcps_mp_sel_rtt++;
 940                         mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d,  second cid %d at rtt %d\n", __func__,
 941                             best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
 942                             second_best->mpts_connid,
 943                             secondtp->t_srtt >> TCP_RTT_SHIFT),
 944                             MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
 945                         return mptcp_return_subflow(second_best);
 946                 }
 947
                     /* Switch if best is slow and second is not retransmitting */
 948                 if (mptcp_subflow_is_slow(mpte, best) &&
 949                     secondtp->t_rxtshift == 0) {
 950                         return mptcp_return_subflow(second_best);
 951                 }
 952
 953                 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
 954                 if (besttp->t_rxtcur && secondtp->t_rxtcur &&
 955                     besttp->t_rxtcur >= rto_thresh &&
 956                     secondtp->t_rxtcur < rto_thresh) {
 957                         tcpstat.tcps_mp_sel_rto++;
 958                         mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
 959                             best->mpts_connid, besttp->t_rxtcur,
 960                             second_best->mpts_connid, secondtp->t_rxtcur),
 961                             MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
 962
 963                         return mptcp_return_subflow(second_best);
 964                 }
 965
 966                 /*
 967                  * None of the above conditions for sending on the secondary
 968                  * were true. So, let's schedule on the best one, if he still
 969                  * has some space in the congestion-window.
 970                  */
 971                 return mptcp_return_subflow(best);
 972         } else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
 973                 struct mptsub *tmp;
 974
 975                 /*
 976                  * We only care about RTT when aggregating
 977                  */
                     /* Swap so that `best' is the lower-srtt subflow of the two */
 978                 if (besttp->t_srtt > secondtp->t_srtt) {
 979                         tmp = best;
 980                         best = second_best;
 981                         besttp = secondtp;
 982                         bestinp = secondinp;
 983
 984                         second_best = tmp;
 985                         secondtp = sototcpcb(second_best->mpts_socket);
 986                         secondinp = sotoinpcb(second_best->mpts_socket);
 987                 }
 988
 989                 /* Is there still space in the congestion window? */
 990                 if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
 991                         return mptcp_return_subflow(second_best);
 992                 }
 993
 994                 return mptcp_return_subflow(best);
 995         } else {
 996                 panic("Unknown service-type configured for MPTCP");
 997         }
 998
 999         return NULL;
1000 }
1001
1002 static const char *
1003 mptcp_event_to_str(uint32_t event)
1004 {
1005         const char *c = "UNDEFINED";
1006         switch (event) {
1007         case MPCE_CLOSE:
1008                 c = "MPCE_CLOSE";
1009                 break;
1010         case MPCE_RECV_DATA_ACK:
1011                 c = "MPCE_RECV_DATA_ACK";
1012                 break;
1013         case MPCE_RECV_DATA_FIN:
1014                 c = "MPCE_RECV_DATA_FIN";
1015                 break;
1016         }
1017         return c;
1018 }
1019
1020 static const char *
1021 mptcp_state_to_str(mptcp_state_t state)
1022 {
1023         const char *c = "UNDEFINED";
1024         switch (state) {
1025         case MPTCPS_CLOSED:
1026                 c = "MPTCPS_CLOSED";
1027                 break;
1028         case MPTCPS_LISTEN:
1029                 c = "MPTCPS_LISTEN";
1030                 break;
1031         case MPTCPS_ESTABLISHED:
1032                 c = "MPTCPS_ESTABLISHED";
1033                 break;
1034         case MPTCPS_CLOSE_WAIT:
1035                 c = "MPTCPS_CLOSE_WAIT";
1036                 break;
1037         case MPTCPS_FIN_WAIT_1:
1038                 c = "MPTCPS_FIN_WAIT_1";
1039                 break;
1040         case MPTCPS_CLOSING:
1041                 c = "MPTCPS_CLOSING";
1042                 break;
1043         case MPTCPS_LAST_ACK:
1044                 c = "MPTCPS_LAST_ACK";
1045                 break;
1046         case MPTCPS_FIN_WAIT_2:
1047                 c = "MPTCPS_FIN_WAIT_2";
1048                 break;
1049         case MPTCPS_TIME_WAIT:
1050                 c = "MPTCPS_TIME_WAIT";
1051                 break;
1052         case MPTCPS_TERMINATE:
1053                 c = "MPTCPS_TERMINATE";
1054                 break;
1055         }
1056         return c;
1057 }
1058
     /*
      * Advance the MPTCP-level close state machine of `mp_tp' for `event'
      * (MPCE_CLOSE, MPCE_RECV_DATA_ACK or MPCE_RECV_DATA_FIN).
      *
      * Transitions implemented below:
      *   CLOSED, LISTEN  --any event-->      TERMINATE
      *   ESTABLISHED     --CLOSE-->          FIN_WAIT_1  (mpt_sndmax += 1)
      *   ESTABLISHED     --RECV_DATA_FIN-->  CLOSE_WAIT  (mpt_rcvnxt += 1)
      *   CLOSE_WAIT      --CLOSE-->          LAST_ACK    (mpt_sndmax += 1)
      *   FIN_WAIT_1      --RECV_DATA_ACK-->  FIN_WAIT_2
      *   FIN_WAIT_1      --RECV_DATA_FIN-->  CLOSING     (mpt_rcvnxt += 1)
      *   CLOSING         --RECV_DATA_ACK-->  TIME_WAIT
      *   LAST_ACK        --RECV_DATA_ACK-->  mptcp_close() is invoked
      *   FIN_WAIT_2      --RECV_DATA_FIN-->  TIME_WAIT   (mpt_rcvnxt += 1)
      *   TIME_WAIT, TERMINATE                no transition
      * Any state/event pair not listed leaves the state untouched. A state
      * value outside this set trips VERIFY(0).
      *
      * The MPTCP socket lock must be held by the caller (asserted below).
      * The state__change DTrace probe fires both before and after the
      * transition, and the old/new state pair is logged at the end.
      */
1059 void
1060 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1061 {
1062         struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
1063
1064         socket_lock_assert_owned(mp_so);
1065
             /* Remembered only for the log line at the bottom */
1066         mptcp_state_t old_state = mp_tp->mpt_state;
1067
1068         DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1069             uint32_t, event);
1070
1071         switch (mp_tp->mpt_state) {
1072         case MPTCPS_CLOSED:
1073         case MPTCPS_LISTEN:
                     /* Unconditional: the event value is not examined here */
1074                 mp_tp->mpt_state = MPTCPS_TERMINATE;
1075                 break;
1076
1077         case MPTCPS_ESTABLISHED:
1078                 if (event == MPCE_CLOSE) {
1079                         mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1080                         mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1081                 } else if (event == MPCE_RECV_DATA_FIN) {
1082                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1083                         mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1084                 }
1085                 break;
1086
1087         case MPTCPS_CLOSE_WAIT:
1088                 if (event == MPCE_CLOSE) {
1089                         mp_tp->mpt_state = MPTCPS_LAST_ACK;
1090                         mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1091                 }
1092                 break;
1093
1094         case MPTCPS_FIN_WAIT_1:
1095                 if (event == MPCE_RECV_DATA_ACK) {
1096                         mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1097                 } else if (event == MPCE_RECV_DATA_FIN) {
1098                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1099                         mp_tp->mpt_state = MPTCPS_CLOSING;
1100                 }
1101                 break;
1102
1103         case MPTCPS_CLOSING:
1104                 if (event == MPCE_RECV_DATA_ACK) {
1105                         mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1106                 }
1107                 break;
1108
1109         case MPTCPS_LAST_ACK:
                     /*
                      * No state is assigned in this arm; whatever state change
                      * happens is left to mptcp_close().
                      */
1110                 if (event == MPCE_RECV_DATA_ACK) {
1111                         mptcp_close(mp_tp->mpt_mpte, mp_tp);
1112                 }
1113                 break;
1114
1115         case MPTCPS_FIN_WAIT_2:
1116                 if (event == MPCE_RECV_DATA_FIN) {
1117                         mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1118                         mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1119                 }
1120                 break;
1121
1122         case MPTCPS_TIME_WAIT:
1123         case MPTCPS_TERMINATE:
1124                 break;
1125
1126         default:
1127                 VERIFY(0);
1128                 /* NOTREACHED */
1129         }
1130         DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1131             uint32_t, event);
1132         mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1133             mptcp_state_to_str(old_state),
1134             mptcp_state_to_str(mp_tp->mpt_state),
1135             mptcp_event_to_str(event)),
1136             MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1137 }
1138
1139 /* If you change this function, match up mptcp_update_rcv_state_f */
1140 void
1141 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1142     uint16_t csum)
1143 {
1144         struct mptcb *mp_tp = tptomptp(tp);
1145         u_int64_t full_dsn = 0;
1146
1147         NTOHL(dss_info->mdss_dsn);
1148         NTOHL(dss_info->mdss_subflow_seqn);
1149         NTOHS(dss_info->mdss_data_len);
1150
1151         /* XXX for autosndbuf grow sb here */
1152         MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1153         mptcp_update_rcv_state_meat(mp_tp, tp,
1154             full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1155             csum);
1156 }
1157
1158 void
1159 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1160     u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1161     uint16_t csum)
1162 {
1163         if (mdss_data_len == 0) {
1164                 os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
1165                     __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
1166
1167                 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1168                         os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
1169                             __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
1170                 }
1171                 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1172                 return;
1173         }
1174
1175         mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1176
1177         tp->t_rcv_map.mpt_dsn = full_dsn;
1178         tp->t_rcv_map.mpt_sseq = seqn;
1179         tp->t_rcv_map.mpt_len = mdss_data_len;
1180         tp->t_rcv_map.mpt_csum = csum;
1181         tp->t_mpflags |= TMPF_EMBED_DSN;
1182 }
1183
1184
1185 static int
1186 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1187     int hdrlen)
1188 {
1189         u_int32_t datalen;
1190
1191         if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1192                 return 0;
1193         }
1194
1195         datalen = m->m_pkthdr.mp_rlen;
1196
1197         /* unacceptable DSS option, fallback to TCP */
1198         if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1199                 os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
1200                     __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
1201         } else {
1202                 return 0;
1203         }
1204         tp->t_mpflags |= TMPF_SND_MPFAIL;
1205         mptcp_notify_mpfail(so);
1206         m_freem(m);
1207         return -1;
1208 }
1209
1210 int
1211 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1212     int drop_hdrlen)
1213 {
1214         mptcp_insert_rmap(tp, m, th);
1215         if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1216             drop_hdrlen) != 0) {
1217                 return -1;
1218         }
1219         return 0;
1220 }
1221
1222 /*
1223  * MPTCP Checksum support
1224  * The checksum is calculated whenever the MPTCP DSS option is included
1225  * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
1226  * header and the actual data indicated by the length specified in the
1227  * DSS option.
1228  */
1229
1230 int
1231 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1232     uint32_t sseq, uint16_t dlen, uint16_t csum, uint16_t dfin)
1233 {
1234         uint16_t mptcp_csum;
1235
1236         mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1237         if (mptcp_csum) {
1238                 tp->t_mpflags |= TMPF_SND_MPFAIL;
1239                 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1240                 m_freem(m);
1241                 tcpstat.tcps_mp_badcsum++;
1242                 return -1;
1243         }
1244         return 0;
1245 }
1246
1247 static uint16_t
1248 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1249     uint16_t dlen, uint16_t csum, uint16_t dfin)
1250 {
1251         struct mptcb *mp_tp = tptomptp(tp);
1252         uint16_t real_len = dlen - dfin;
1253         uint32_t sum = 0;
1254
1255         if (mp_tp == NULL) {
1256                 return 0;
1257         }
1258
1259         if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
1260                 return 0;
1261         }
1262
1263         if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
1264                 return 0;
1265         }
1266
1267         /*
1268          * The remote side may send a packet with fewer bytes than the
1269          * claimed DSS checksum length.
1270          */
1271         if ((int)m_length2(m, NULL) < real_len) {
1272                 return 0xffff;
1273         }
1274
1275         if (real_len != 0) {
1276                 sum = m_sum16(m, 0, real_len);
1277         }
1278
1279         sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1280         ADDCARRY(sum);
1281         DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1282             uint32_t, sum);
1283
1284         mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1285             MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1286         return ~sum & 0xffff;
1287 }
1288
1289 uint32_t
1290 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1291 {
1292         uint32_t sum = 0;
1293
1294         if (dlen) {
1295                 sum = m_sum16(m, 0, dlen);
1296         }
1297
1298         dss_val = mptcp_hton64(dss_val);
1299         sseq = htonl(sseq);
1300         dlen = htons(dlen);
1301         sum += in_pseudo64(dss_val, sseq, dlen);
1302
1303         ADDCARRY(sum);
1304         sum = ~sum & 0xffff;
1305         DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1306         mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1307             MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1308
1309         return sum;
1310 }
1311
1312 /*
1313  * When WiFi signal starts fading, there's more loss and RTT spikes.
1314  * Check if there has been a large spike by comparing against
1315  * a tolerable RTT spike threshold.
1316  */
1317 boolean_t
1318 mptcp_no_rto_spike(struct socket *so)
1319 {
1320         struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1321         int32_t spike = 0;
1322
1323         if (tp->t_rxtcur > mptcp_rtothresh) {
1324                 spike = tp->t_rxtcur - mptcp_rtothresh;
1325
1326                 mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1327                     __func__, spike,
1328                     tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1329                     tp->t_rttcur),
1330                     (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1331         }
1332
1333         if (spike > 0) {
1334                 return FALSE;
1335         } else {
1336                 return TRUE;
1337         }
1338 }
1339
     /*
      * Drop the deferral marker `flag' from `mpp' and, unless
      * mptcp_should_defer_upcall() says upcalls must still wait, run whatever
      * work was postponed in the meantime.
      *
      * `flag' must currently be set in mpp->mpp_flags (enforced by VERIFY).
      * Postponed work is recorded in mpp_flags and replayed in this order:
      *   MPP_SHOULD_WORKLOOP -> mptcp_subflow_workloop()
      *   MPP_SHOULD_RWAKEUP  -> sorwakeup()
      *   MPP_SHOULD_WWAKEUP  -> sowwakeup()
      * Each MPP_SHOULD_* bit is cleared before its handler is invoked.
      */
1340 void
1341 mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1342 {
1343         VERIFY(mpp->mpp_flags & flag);
1344         mpp->mpp_flags &= ~flag;
1345
             /* Still a reason to defer: leave the MPP_SHOULD_* bits pending */
1346         if (mptcp_should_defer_upcall(mpp)) {
1347                 return;
1348         }
1349
1350         if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1351                 mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1352
1353                 mptcp_subflow_workloop(mpp->mpp_pcbe);
1354         }
1355
1356         if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1357                 mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1358
1359                 sorwakeup(mpp->mpp_socket);
1360         }
1361
1362         if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1363                 mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1364
1365                 sowwakeup(mpp->mpp_socket);
1366         }
1367 }
1368
1369 void
1370 mptcp_ask_for_nat64(struct ifnet *ifp)
1371 {
1372         in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);
1373
1374         os_log_info(mptcp_log_handle,
1375             "%s: asked for NAT64-prefix on %s\n", __func__,
1376             ifp->if_name);
1377 }
1378
1379 static void
1380 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1381 {
1382         memset(info, 0, sizeof(*info));
1383 }
1384
     /*
      * NECP callback for interface events on an MPTCP session (see the note
      * about MPTCP/NECP interactions at the top of this file).
      *
      *   handle          - the session's struct mppcb
      *   action          - NECP_CLIENT_CBACTION_{INITIAL,VIABLE,NONVIABLE}
      *   interface_index - interface the event is about; must not be
      *                     IFSCOPE_NONE (enforced by VERIFY)
      *   necp_flags      - NECP_CLIENT_RESULT_FLAG_* bits, decoded below into
      *                     v4 / v6 / NAT64 availability and low-power state
      *   viable          - unused
      *
      * NONVIABLE (also forced for any low-power interface): every
      * mpte_itfinfo slot that refers to the interface is wiped.
      * VIABLE / INITIAL: unless socket restrictions forbid the interface, it
      * is recorded in (or its capabilities refreshed in) mpte_itfinfo, with
      * the array doubled in size when no slot is available.
      * Both paths finish with mptcp_sched_create_subflows() unless they bail
      * out early.
      *
      * Locking: for every action other than INITIAL the MPTCP socket lock is
      * taken here and dropped at `out'. For INITIAL the lock is not taken and
      * the assertion below requires that it is already held.
      */
1385 void
1386 mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
1387     uint32_t necp_flags, __unused bool *viable)
1388 {
1389         boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1390         boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1391         boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
1392         boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1393         struct mppcb *mp = (struct mppcb *)handle;
1394         struct mptses *mpte = mptompte(mp);
1395         struct socket *mp_so;
1396         struct mptcb *mp_tp;
             /* Set once this function itself has taken the socket lock */
1397         int locked = 0;
1398         uint32_t i, ifindex;
1399
1400         ifindex = interface_index;
1401         VERIFY(ifindex != IFSCOPE_NONE);
1402
1403         /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1404         if (mp->mpp_socket->so_usecount == 0) {
1405                 return;
1406         }
1407
1408         mp_so = mptetoso(mpte);
1409
1410         if (action != NECP_CLIENT_CBACTION_INITIAL) {
1411                 socket_lock(mp_so, 1);
1412                 locked = 1;
1413
1414                 /* Check again, because it might have changed while waiting */
1415                 if (mp->mpp_socket->so_usecount == 0) {
1416                         goto out;
1417                 }
1418         }
1419
1420         socket_lock_assert_owned(mp_so);
1421
1422         mp_tp = mpte->mpte_mptcb;
1423
1424         os_log_info(mptcp_log_handle, "%s - %lx: action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
1425             __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
1426             mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
1427             has_v4, has_v6, has_nat64, low_power);
1428
1429         /* No need on fallen back sockets */
1430         if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
1431                 goto out;
1432         }
1433
1434         /*
1435          * When the interface goes in low-power mode we don't want to establish
1436          * new subflows on it. Thus, mark it internally as non-viable.
1437          */
1438         if (low_power) {
1439                 action = NECP_CLIENT_CBACTION_NONVIABLE;
1440         }
1441
1442         if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
                     /* Forget the interface: wipe every slot that refers to it */
1443                 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1444                         if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1445                                 continue;
1446                         }
1447
1448                         if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1449                                 mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1450                         }
1451                 }
1452
1453                 mptcp_sched_create_subflows(mpte);
1454         } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1455             action == NECP_CLIENT_CBACTION_INITIAL) {
1456                 int found_slot = 0, slot_index = -1;
1457                 struct sockaddr *dst;
1458                 struct ifnet *ifp;
1459
                     /* The ifnet head lock covers only the table lookup itself */
1460                 ifnet_head_lock_shared();
1461                 ifp = ifindex2ifnet[ifindex];
1462                 ifnet_head_done();
1463
1464                 if (ifp == NULL) {
1465                         goto out;
1466                 }
1467
                     /* Honour the socket's expensive/constrained/cellular bans */
1468                 if (IFNET_IS_EXPENSIVE(ifp) &&
1469                     (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1470                         goto out;
1471                 }
1472
1473                 if (IFNET_IS_CONSTRAINED(ifp) &&
1474                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1475                         goto out;
1476                 }
1477
1478                 if (IFNET_IS_CELLULAR(ifp) &&
1479                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1480                         goto out;
1481                 }
1482
                     /* On a CLAT46 interface the v4 flag is ignored */
1483                 if (IS_INTF_CLAT46(ifp)) {
1484                         has_v4 = FALSE;
1485                 }
1486
1487                 /* Look for the slot on where to store/update the interface-info. */
1488                 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1489                         /* Found a potential empty slot where we can put it */
                             /* (no break: a later slot may already hold ifindex) */
1490                         if (mpte->mpte_itfinfo[i].ifindex == 0) {
1491                                 found_slot = 1;
1492                                 slot_index = i;
1493                         }
1494
1495                         /*
1496                          * The interface is already in our array. Check if we
1497                          * need to update it.
1498                          */
1499                         if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
1500                             (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
1501                             mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
1502                             mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
1503                                 found_slot = 1;
1504                                 slot_index = i;
1505                                 break;
1506                         }
1507
1508                         if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1509                                 /*
1510                                  * Ok, it's already there and we don't need
1511                                  * to update it
1512                                  */
1513                                 goto out;
1514                         }
1515                 }
1516
                     /*
                      * Destination is AF_INET (or family 0) but the interface
                      * only offers v6 without NAT64: store the capability flags
                      * if a slot was found (ifindex is not written on this
                      * path), request a NAT64 prefix and stop here.
                      */
1517                 dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
1518                 if (dst && (dst->sa_family == AF_INET || dst->sa_family == 0) &&
1519                     has_v6 && !has_nat64 && !has_v4) {
1520                         if (found_slot) {
1521                                 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1522                                 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1523                                 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1524                         }
1525                         mptcp_ask_for_nat64(ifp);
1526                         goto out;
1527                 }
1528
                     /* No usable slot: double the array and copy the old entries */
1529                 if (found_slot == 0) {
1530                         int new_size = mpte->mpte_itfinfo_size * 2;
1531                         struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
1532
1533                         if (info == NULL) {
1534                                 os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
1535                                     __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
1536                                 goto out;
1537                         }
1538
1539                         memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1540
                             /*
                              * The old array is freed only once it has grown
                              * beyond MPTE_ITFINFO_SIZE entries.
                              * NOTE(review): presumably the initial array of
                              * MPTE_ITFINFO_SIZE entries is not heap-allocated;
                              * confirm against the mptses definition.
                              */
1541                         if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
1542                                 _FREE(mpte->mpte_itfinfo, M_TEMP);
1543                         }
1544
1545                         /* We allocated a new one, thus the first must be empty */
                             /* (i.e. the first slot past the copied entries) */
1546                         slot_index = mpte->mpte_itfinfo_size;
1547
1548                         mpte->mpte_itfinfo = info;
1549                         mpte->mpte_itfinfo_size = new_size;
1550                 }
1551
1552                 VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
1553                 mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1554                 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1555                 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1556                 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1557
1558                 mptcp_sched_create_subflows(mpte);
1559         }
1560
1561 out:
             /* Only drop the lock if it was taken in this function */
1562         if (locked) {
1563                 socket_unlock(mp_so, 1);
1564         }
1565 }
1566
1567 void
1568 mptcp_set_restrictions(struct socket *mp_so)
1569 {
1570         struct mptses *mpte = mpsotompte(mp_so);
1571         uint32_t i;
1572
1573         socket_lock_assert_owned(mp_so);
1574
1575         ifnet_head_lock_shared();
1576
1577         for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1578                 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1579                 uint32_t ifindex = info->ifindex;
1580                 struct ifnet *ifp;
1581
1582                 if (ifindex == IFSCOPE_NONE) {
1583                         continue;
1584                 }
1585
1586                 ifp = ifindex2ifnet[ifindex];
1587                 if (ifp == NULL) {
1588                         continue;
1589                 }
1590
1591                 if (IFNET_IS_EXPENSIVE(ifp) &&
1592                     (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1593                         info->ifindex = IFSCOPE_NONE;
1594                 }
1595
1596                 if (IFNET_IS_CONSTRAINED(ifp) &&
1597                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1598                         info->ifindex = IFSCOPE_NONE;
1599                 }
1600
1601                 if (IFNET_IS_CELLULAR(ifp) &&
1602                     (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1603                         info->ifindex = IFSCOPE_NONE;
1604                 }
1605         }
1606
1607         ifnet_head_done();
1608 }