X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/5ba3f43ea354af8ad55bea84372a2bc834d8757c..refs/heads/master:/bsd/netinet/mptcp.c

diff --git a/bsd/netinet/mptcp.c b/bsd/netinet/mptcp.c
index 5d901a9da..85a8cebc1 100644
--- a/bsd/netinet/mptcp.c
+++ b/bsd/netinet/mptcp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -109,13 +109,17 @@ int mptcp_enable = 1;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_enable, 0, "Enable Multipath TCP Support");
+    &mptcp_enable, 0, "Enable Multipath TCP Support");
 
-/* Number of times to try negotiating MPTCP on SYN retransmissions */
-int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
+/*
+ * Number of times to try negotiating MPTCP on SYN retransmissions.
+ * We haven't seen any reports of a middlebox that is dropping all SYN-segments
+ * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
+ */
+int mptcp_mpcap_retries = 4;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
-	CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
 
 /*
  * By default, DSS checksum is turned off, revisit if we ever do
@@ -123,7 +127,7 @@ SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
  */
 int mptcp_dss_csum = 0;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_dss_csum, 0, "Enable DSS checksum");
+    &mptcp_dss_csum, 0, "Enable DSS checksum");
 
 /*
  * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
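All of the tunables in the hunks above and below are exported as plain sysctl integers under net.inet.mptcp, so they can be read (and, with sufficient privilege, written) from userspace. A minimal userspace sketch — not part of the kernel sources — using the standard sysctlbyname(3) interface; the OID names mirror the SYSCTL_INT declarations in this file:

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int enable, retries;
	size_t len = sizeof(int);

	/* net.inet.mptcp.enable backs &mptcp_enable above */
	if (sysctlbyname("net.inet.mptcp.enable", &enable, &len, NULL, 0) == 0)
		printf("MPTCP enabled: %d\n", enable);

	len = sizeof(int);
	/* net.inet.mptcp.mptcp_cap_retr backs &mptcp_mpcap_retries above */
	if (sysctlbyname("net.inet.mptcp.mptcp_cap_retr", &retries, &len, NULL, 0) == 0)
		printf("MP_CAPABLE SYN retries: %d\n", retries);

	return 0;
}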
@@ -131,74 +135,65 @@ SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
  */
 int mptcp_fail_thresh = 1;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_fail_thresh, 0, "Failover threshold");
-
+    &mptcp_fail_thresh, 0, "Failover threshold");
 
 /*
  * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
  * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
  * Some carrier networks have a timeout of 10 or 15 minutes.
  */
-int mptcp_subflow_keeptime = 60*14;
+int mptcp_subflow_keeptime = 60 * 14;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_subflow_keeptime, 0, "Keepalive in seconds");
+    &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
 
 int mptcp_rtthist_rtthresh = 600;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_rtthist_rtthresh, 0, "Rtt threshold");
-
-/*
- * Use RTO history for sending new data
- */
-int mptcp_use_rto = 1;
-SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_use_rto, 0, "Disable RTO for subflow selection");
+    &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
 
 int mptcp_rtothresh = 1500;
 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_rtothresh, 0, "RTO threshold");
+    &mptcp_rtothresh, 0, "RTO threshold");
 
 /*
  * Probe the preferred path, when it is not in use
  */
 uint32_t mptcp_probeto = 1000;
 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_probeto, 0, "Disable probing by setting to 0");
+    &mptcp_probeto, 0, "Disable probing by setting to 0");
 
 uint32_t mptcp_probecnt = 5;
 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
-	&mptcp_probecnt, 0, "Number of probe writes");
-
-/*
- * Static declarations
- */
-static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
-    uint32_t, uint16_t, uint16_t);
+    &mptcp_probecnt, 0, "Number of probe writes");
 
 static int
 mptcp_reass_present(struct socket *mp_so)
 {
-	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
+	struct mptses *mpte = mpsotompte(mp_so);
+	struct mptcb *mp_tp = mpte->mpte_mptcb;
 	struct tseg_qent *q;
 	int dowakeup = 0;
+	int flags = 0;
 
 	/*
 	 * Present data to user, advancing rcv_nxt through
 	 * completed sequence space.
 	 */
-	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
-		return (0);
+	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
+		return flags;
+	}
 	q = LIST_FIRST(&mp_tp->mpt_segq);
-	if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt)
-		return (0);
+	if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
+		return flags;
+	}
 
 	/*
 	 * If there is already another thread doing reassembly for this
 	 * connection, it is better to let it finish the job --
 	 * (radar 16316196)
 	 */
-	if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG)
-		return (0);
+	if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
+		return flags;
+	}
 
 	mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
 
@@ -208,8 +203,10 @@ mptcp_reass_present(struct socket *mp_so)
 		if (mp_so->so_state & SS_CANTRCVMORE) {
 			m_freem(q->tqe_m);
 		} else {
-			if (sbappendstream(&mp_so->so_rcv, q->tqe_m))
+			flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
+			if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
 				dowakeup = 1;
+			}
 		}
 		zfree(tcp_reass_zone, q);
 		mp_tp->mpt_reassqlen--;
@@ -217,10 +214,10 @@ mptcp_reass_present(struct socket *mp_so)
 	} while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
 	mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
 
-	if (dowakeup)
+	if (dowakeup) {
 		sorwakeup(mp_so); /* done with socket lock held */
-	return (0);
-
+	}
+	return flags;
 }
 
 static int
@@ -232,7 +229,7 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *
 	struct tseg_qent *p = NULL;
 	struct tseg_qent *nq;
 	struct tseg_qent *te = NULL;
-	u_int16_t qlimit;
+	uint32_t qlimit;
 
 	/*
 	 * Limit the number of segments in the reassembly queue to prevent
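The next hunk rewrites the queue bound with the MIN()/MAX() macros and widens qlimit to 32 bits; the bound itself is unchanged — roughly one queue entry per KiB of receive buffer, floored at 100 and capped by the autotuning ceiling. A standalone sketch of that arithmetic with illustrative (made-up) values:

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
	uint32_t sb_hiwat = 131072;            /* hypothetical rcv buffer high-water mark */
	uint32_t tcp_autorcvbuf_max = 2097152; /* hypothetical autotuning ceiling */

	/* One entry per KiB of buffer, floor 100, capped by the ceiling */
	uint32_t qlimit = MIN(MAX(100, sb_hiwat >> 10), tcp_autorcvbuf_max >> 10);

	printf("qlimit = %u entries\n", qlimit); /* 128 with these numbers */
	return 0;
}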
@@ -241,14 +238,14 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *
 	 * queue. Always keep one global queue entry spare to be able to
 	 * process the missing segment.
 	 */
-	qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
+	qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
 	    (tcp_autorcvbuf_max >> 10));
 	if (mb_dsn != mp_tp->mpt_rcvnxt &&
 	    (mp_tp->mpt_reassqlen + 1) >= qlimit) {
 		tcpstat.tcps_mptcp_rcvmemdrop++;
 		m_freem(m);
 		*tlenp = 0;
-		return (0);
+		return 0;
 	}
 
 	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
@@ -256,7 +253,7 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *
 	if (te == NULL) {
 		tcpstat.tcps_mptcp_rcvmemdrop++;
 		m_freem(m);
-		return (0);
+		return 0;
 	}
 
 	mp_tp->mpt_reassqlen++;
@@ -265,8 +262,9 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *
 	 * Find a segment which begins after this one does.
 	 */
 	LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
-		if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn))
+		if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
 			break;
+		}
 		p = q;
 	}
 
@@ -294,7 +292,8 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *
 			 */
 			goto out;
 		}
-		m_adj(m, i);
+		VERIFY(i <= INT_MAX);
+		m_adj(m, (int)i);
 		*tlenp -= i;
 		phdr->mp_dsn += i;
 	}
@@ -308,13 +307,16 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *
 	 */
 	while (q) {
 		int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
-		if (i <= 0)
+		if (i <= 0) {
 			break;
+		}
 
 		if (i < q->tqe_len) {
 			q->tqe_m->m_pkthdr.mp_dsn += i;
 			q->tqe_len -= i;
-			m_adj(q->tqe_m, i);
+
+			VERIFY(i <= INT_MAX);
+			m_adj(q->tqe_m, (int)i);
 			break;
 		}
 
@@ -338,7 +340,7 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *
 	}
 
 out:
-	return (mptcp_reass_present(mp_so));
+	return mptcp_reass_present(mp_so);
 }
 
 /*
@@ -355,11 +357,11 @@ mptcp_input(struct mptses *mpte, struct mbuf *m)
 
 	VERIFY(m->m_flags & M_PKTHDR);
 
-	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
-
 	mp_so = mptetoso(mpte);
 	mp_tp = mpte->mpte_mptcb;
 
+	socket_lock_assert_owned(mp_so);
+
 	DTRACE_MPTCP(input);
 
 	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
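mptcp_input() below orders segments by 64-bit data sequence numbers (DSNs) through the MPTCP_SEQ_* macros. These are conventionally defined as signed wrap-around comparisons, in the same spirit as TCP's SEQ_LT(); a sketch of that idiom (not the verbatim xnu definitions, which live in mptcp_seq.h):

#include <assert.h>
#include <stdint.h>

/* Wrap-around comparison: the difference is interpreted as signed 64-bit. */
#define MPTCP_SEQ_LT(a, b)   ((int64_t)((a) - (b)) < 0)
#define MPTCP_SEQ_LEQ(a, b)  ((int64_t)((a) - (b)) <= 0)
#define MPTCP_SEQ_GT(a, b)   ((int64_t)((a) - (b)) > 0)
#define MPTCP_SEQ_GEQ(a, b)  ((int64_t)((a) - (b)) >= 0)

int
main(void)
{
	uint64_t rcvnxt = UINT64_MAX - 5;	/* just before the wrap */
	uint64_t mb_dsn = 10;			/* just after the wrap */

	/* Despite the raw values, mb_dsn is "greater" in sequence space. */
	assert(MPTCP_SEQ_GT(mb_dsn, rcvnxt));
	assert(MPTCP_SEQ_LT(rcvnxt, mb_dsn));
	return 0;
}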
@@ -376,23 +378,59 @@ mptcp_input(struct mptses *mpte, struct mbuf *m)
 	 * In the degraded fallback case, data is accepted without DSS map
 	 */
 	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
+		struct mbuf *iter;
+		int mb_dfin = 0;
 fallback:
 		mptcp_sbrcv_grow(mp_tp);
 
+		iter = m;
+		while (iter) {
+			if ((iter->m_flags & M_PKTHDR) &&
+			    (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
+				mb_dfin = 1;
+			}
+
+			if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
+				/* Don't add zero-length packets, so jump it! */
+				if (prev == NULL) {
+					m = iter->m_next;
+					m_free(iter);
+					iter = m;
+				} else {
+					prev->m_next = iter->m_next;
+					m_free(iter);
+					iter = prev->m_next;
+				}
+
+				/* It was a zero-length packet so next one must be a pkthdr */
+				VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
+			} else {
+				prev = iter;
+				iter = iter->m_next;
+			}
+		}
+
 		/*
 		 * assume degraded flow as this may be the first packet
 		 * without DSS, and the subflow state is not updated yet.
 		 */
-		if (sbappendstream(&mp_so->so_rcv, m))
+		if (sbappendstream_rcvdemux(mp_so, m)) {
 			sorwakeup(mp_so);
+		}
+
 		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
 		    struct socket *, mp_so,
 		    struct sockbuf *, &mp_so->so_rcv,
 		    struct sockbuf *, &mp_so->so_snd,
 		    struct mptses *, mpte);
 		count = mp_so->so_rcv.sb_cc - count;
-		mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
-		    count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
+
+		mp_tp->mpt_rcvnxt += count;
+
+		if (mb_dfin) {
+			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
+			socantrcvmore(mp_so);
+		}
 		return;
 	}
 
@@ -400,10 +438,14 @@ fallback:
 		u_int64_t mb_dsn;
 		int32_t mb_datalen;
 		int64_t todrop;
+		int mb_dfin = 0;
+
+		VERIFY(m->m_flags & M_PKTHDR);
 
 		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
-		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
+		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
 			goto fallback;
+		}
 
 		save = m->m_next;
 		/*
@@ -420,10 +462,11 @@ fallback:
 			prev = save;
 			save = save->m_next;
 		}
-		if (prev)
+		if (prev) {
 			prev->m_next = NULL;
-		else
+		} else {
 			m->m_next = NULL;
+		}
 
 		mb_dsn = m->m_pkthdr.mp_dsn;
 		mb_datalen = m->m_pkthdr.mp_rlen;
@@ -432,61 +475,83 @@ fallback:
 
 		if (todrop > 0) {
 			tcpstat.tcps_mptcp_rcvpackafterwin++;
 
+			os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
+			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+			    (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
+			    mp_tp->mpt_rcvwnd, todrop);
+
 			if (todrop >= mb_datalen) {
-				if (freelist == NULL)
+				if (freelist == NULL) {
 					freelist = m;
-				else
+				} else {
 					tail->m_next = m;
+				}
 
-				if (prev != NULL)
+				if (prev != NULL) {
 					tail = prev;
-				else
+				} else {
 					tail = m;
+				}
 
 				m = save;
 				prev = save = NULL;
 				continue;
 			} else {
-				m_adj(m, -todrop);
+				VERIFY(todrop <= INT_MAX);
+				m_adj(m, (int)-todrop);
 				mb_datalen -= todrop;
+				m->m_pkthdr.mp_rlen -= todrop;
 			}
-		}
-
-		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
-		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
-			mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
-
-			goto next;
+
+			/*
+			 * We drop from the right edge of the mbuf, thus the
+			 * DATA_FIN is dropped as well
+			 */
+			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
 		}
 
 		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
 			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
 			    mp_tp->mpt_rcvnxt)) {
-				if (freelist == NULL)
+				if (freelist == NULL) {
 					freelist = m;
-				else
+				} else {
 					tail->m_next = m;
+				}
 
-				if (prev != NULL)
+				if (prev != NULL) {
 					tail = prev;
-				else
+				} else {
 					tail = m;
+				}
 
 				m = save;
 				prev = save = NULL;
 				continue;
 			} else {
-				m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
+				VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
+				m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
+				mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
+				mb_dsn = mp_tp->mpt_rcvnxt;
+				VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
+				m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
+				m->m_pkthdr.mp_dsn = mb_dsn;
 			}
-			mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
-			    mp_tp->mpt_rcvnxt),
-			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 		}
 
+		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
+		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
+			mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
+
+			goto next;
+		}
+		mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
+
 		mptcp_sbrcv_grow(mp_tp);
 
-		if (sbappendstream(&mp_so->so_rcv, m))
+		if (sbappendstream_rcvdemux(mp_so, m)) {
 			wakeup = 1;
+		}
 
 		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
 		    struct sockbuf *, &mp_so->so_rcv,
@@ -496,34 +561,39 @@ fallback:
 		count = mp_so->so_rcv.sb_cc - count;
 		tcpstat.tcps_mp_rcvtotal++;
 		tcpstat.tcps_mp_rcvbytes += count;
-		mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
-		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 
 		mp_tp->mpt_rcvnxt += count;
 
 next:
+		if (mb_dfin) {
+			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
+			socantrcvmore(mp_so);
+		}
 		m = save;
 		prev = save = NULL;
 		count = mp_so->so_rcv.sb_cc;
 	} while (m);
 
-	if (freelist)
+	if (freelist) {
 		m_freem(freelist);
+	}
 
-	if (wakeup)
+	if (wakeup) {
 		sorwakeup(mp_so);
+	}
 }
 
-static boolean_t
-mptcp_can_send_more(struct mptcb *mp_tp)
+boolean_t
+mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
 {
 	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
 
 	/*
 	 * Always send if there is data in the reinject-queue.
 	 */
-	if (mp_tp->mpt_mpte->mpte_reinjectq)
-		return (TRUE);
+	if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
+		return TRUE;
+	}
 
 	/*
 	 * Don't send, if:
@@ -534,19 +604,23 @@ mptcp_can_send_more(struct mptcb *mp_tp)
 	 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
 	 */
 
-	if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax))
-		return (FALSE);
+	if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
+		return FALSE;
+	}
 
-	if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt))
-		return (FALSE);
+	if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
+		return FALSE;
+	}
 
-	if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
-		return (FALSE);
+	if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
+		return FALSE;
+	}
 
-	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2)
-		return (FALSE);
+	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
+		return FALSE;
+	}
 
-	return (TRUE);
+	return TRUE;
 }
 
 /*
@@ -563,41 +637,34 @@ mptcp_output(struct mptses *mpte)
 	uint64_t old_snd_nxt;
 	int error = 0;
 
-	mpte_lock_assert_held(mpte);
 	mp_so = mptetoso(mpte);
 	mp_tp = mpte->mpte_mptcb;
 
+	socket_lock_assert_owned(mp_so);
+
+	if (mp_so->so_flags & SOF_DEFUNCT) {
+		return 0;
+	}
+
 	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
 	mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
 
-	mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
-	    __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
-	    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
-	    mpte->mpte_reinjectq ? 1 : 0,
-	    mp_tp->mpt_state),
-	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-
 	old_snd_nxt = mp_tp->mpt_sndnxt;
-	while (mptcp_can_send_more(mp_tp)) {
+	while (mptcp_can_send_more(mp_tp, FALSE)) {
 		/* get the "best" subflow to be used for transmission */
-		mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
+		mpts = mptcp_get_subflow(mpte, &preferred_mpts);
 		if (mpts == NULL) {
 			mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
 			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
 			break;
 		}
 
-		mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
-		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
-
 		/* In case there's just one flow, we reattempt later */
 		if (mpts_tried != NULL &&
 		    (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
 			mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
 			mpts_tried->mpts_flags |= MPTSF_ACTIVE;
 			mptcp_start_timer(mpte, MPTT_REXMT);
-			mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
-			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 			break;
 		}
@@ -608,8 +675,7 @@ mptcp_output(struct mptses *mpte)
 		 * 2. send buffer is filled to 7/8th with data (so we actually
 		 *    have data to make use of it);
 		 */
-		if (tcp_do_autosendbuf == 1 &&
-		    (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
+		if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
 		    tcp_cansbgrow(&mp_so->so_snd)) {
 			if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
 			    mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
@@ -617,11 +683,6 @@ mptcp_output(struct mptses *mpte)
 				    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
 				    tcp_autosndbuf_max)) == 1) {
 					mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
-
-					mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
-					    __func__, mp_so->so_snd.sb_hiwat,
-					    mp_so->so_snd.sb_lowat),
-					    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
 				}
 			}
 		}
@@ -634,9 +695,11 @@ mptcp_output(struct mptses *mpte)
 			mpts->mpts_flags |= MPTSF_FAILINGOVER;
 			mpts->mpts_flags &= ~MPTSF_ACTIVE;
 			mpts_tried = mpts;
-			mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
-			    error, mpts->mpts_flags),
-			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
+			if (error != ECANCELED) {
+				os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
+				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
+				    error, mpts->mpts_flags);
+			}
 			break;
 		}
 		/* The model is to have only one active flow at a time */
@@ -662,14 +725,6 @@ mptcp_output(struct mptses *mpte)
 		if (mpte->mpte_active_sub == NULL) {
 			mpte->mpte_active_sub = mpts;
 		} else if (mpte->mpte_active_sub != mpts) {
-			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
-			struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);
-
-			mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
-			    mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
-			    mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
-			    (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);
-
 			mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
 			mpte->mpte_active_sub = mpts;
 
@@ -677,10 +732,17 @@ mptcp_output(struct mptses *mpte)
 		}
 	}
 
+	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
+		if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
+		    mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
+			mptcp_finish_usrclosed(mpte);
+		}
+	}
+
 	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
 
 	/* subflow errors should not be percolated back up */
-	return (0);
+	return 0;
 }
 
 
@@ -696,9 +758,9 @@ mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
 	 */
 	if (tp->t_srtt && *currtt > tp->t_srtt &&
 	    (curbest == NULL || tp->t_rxtshift == 0 ||
-	     sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
+	    sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
 		*currtt = tp->t_srtt;
-		return (mpts);
+		return mpts;
 	}
 
 	/*
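mptcp_choose_subflow() above (and its continuation in the next hunk) boils down to two rules: prefer the lowest smoothed RTT, but never let a subflow that is stuck in RTO retransmission (t_rxtshift != 0) beat one that is clean. A distilled sketch of that policy with simplified, hypothetical types — not the kernel code:

#include <stddef.h>

struct subflow {
	int srtt;      /* smoothed RTT, 0 if unmeasured */
	int rxtshift;  /* consecutive RTO retransmissions */
};

/* Pick the better of `cur` and the running best, tracking the best RTT seen. */
static struct subflow *
choose(struct subflow *cur, struct subflow *best, int *bestrtt)
{
	/* Lower RTT wins, unless cur is retransmitting while best is clean. */
	if (cur->srtt && *bestrtt > cur->srtt &&
	    (best == NULL || cur->rxtshift == 0 || best->rxtshift)) {
		*bestrtt = cur->srtt;
		return cur;
	}

	/* A clean subflow beats a retransmitting one even at a worse RTT. */
	if (best != NULL && best->rxtshift && cur->rxtshift == 0) {
		*bestrtt = cur->srtt;
		return cur;
	}

	return best != NULL ? best : cur;
}

int
main(void)
{
	struct subflow wifi = { .srtt = 40, .rxtshift = 2 };
	struct subflow cell = { .srtt = 90, .rxtshift = 0 };
	int bestrtt = 1 << 30;
	struct subflow *best = NULL;

	best = choose(&wifi, best, &bestrtt);
	best = choose(&cell, best, &bestrtt);
	/* cell wins: wifi has the lower RTT but is stuck in retransmission */
	return best == &cell ? 0 : 1;
}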
@@ -708,26 +770,41 @@ mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
 	    sototcpcb(curbest->mpts_socket)->t_rxtshift &&
 	    tp->t_rxtshift == 0) {
 		*currtt = tp->t_srtt;
-		return (mpts);
+		return mpts;
 	}
 
-	return (curbest != NULL ? curbest : mpts);
+	return curbest != NULL ? curbest : mpts;
 }
 
 static struct mptsub *
 mptcp_return_subflow(struct mptsub *mpts)
 {
-	if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0)
-		return (NULL);
+	if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
+		return NULL;
+	}
+
+	return mpts;
+}
+
+static boolean_t
+mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
+{
+	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+	int fail_thresh = mptcp_fail_thresh;
+
+	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
+		fail_thresh *= 2;
+	}
 
-	return (mpts);
+	return tp->t_rxtshift >= fail_thresh &&
+	       (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
 }
 
 /*
  * Return the most eligible subflow to be used for sending data.
  */
 struct mptsub *
-mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
+mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
 {
 	struct tcpcb *besttp, *secondtp;
 	struct inpcb *bestinp, *secondinp;
@@ -746,23 +823,25 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
 		struct tcpcb *tp = sototcpcb(so);
 		struct inpcb *inp = sotoinpcb(so);
 
-		mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
-		    __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
-		    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
-		    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
-		    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
-		    mptcp_subflow_cwnd_space(so)),
-		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+		mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
+		    __func__, mpts->mpts_connid, mpts->mpts_flags,
+		    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
+		    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
+		    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
+		    mptcp_subflow_cwnd_space(so)),
+		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
 		/*
 		 * First, the hard conditions to reject subflows
 		 * (e.g., not connected,...)
 		 */
-		if (mpts == ignore || inp->inp_last_outifp == NULL)
+		if (inp->inp_last_outifp == NULL) {
 			continue;
+		}
 
-		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
+		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
 			continue;
+		}
 
 		/* There can only be one subflow in degraded state */
 		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
@@ -773,72 +852,81 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
 		/*
 		 * If this subflow is waiting to finally send, do it!
 		 */
-		if (so->so_flags1 & SOF1_PRECONNECT_DATA)
-			return (mptcp_return_subflow(mpts));
+		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
+			return mptcp_return_subflow(mpts);
+		}
 
 		/*
 		 * Only send if the subflow is MP_CAPABLE. The exceptions to
 		 * this rule (degraded or TFO) have been taken care of above.
 		 */
-		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE))
+		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
 			continue;
+		}
 
 		if ((so->so_state & SS_ISDISCONNECTED) ||
 		    !(so->so_state & SS_ISCONNECTED) ||
 		    !TCPS_HAVEESTABLISHED(tp->t_state) ||
-		    tp->t_state > TCPS_CLOSE_WAIT)
+		    tp->t_state > TCPS_CLOSE_WAIT) {
 			continue;
+		}
 
 		/*
 		 * Second, the soft conditions to find the subflow with best
 		 * conditions for each set (aka cellular vs non-cellular)
 		 */
-		if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
+		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
 			second_best = mptcp_choose_subflow(mpts, second_best,
-							   &exp_rtt);
-		else
+			    &exp_rtt);
+		} else {
 			best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
+		}
 	}
 
 	/*
 	 * If there is no preferred or backup subflow, and there is no active
 	 * subflow use the last usable subflow.
 	 */
-	if (best == NULL)
-		return (mptcp_return_subflow(second_best));
+	if (best == NULL) {
+		return mptcp_return_subflow(second_best);
+	}
 
-	if (second_best == NULL)
-		return (mptcp_return_subflow(best));
+	if (second_best == NULL) {
+		return mptcp_return_subflow(best);
+	}
 
 	besttp = sototcpcb(best->mpts_socket);
 	bestinp = sotoinpcb(best->mpts_socket);
 	secondtp = sototcpcb(second_best->mpts_socket);
 	secondinp = sotoinpcb(second_best->mpts_socket);
 
-	if (preferred != NULL)
+	if (preferred != NULL) {
 		*preferred = mptcp_return_subflow(best);
+	}
 
 	/*
 	 * Second Step: Among best and second_best. Choose the one that is
 	 * most appropriate for this particular service-type.
 	 */
-	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
+	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
+		return mptcp_return_subflow(best);
+	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
 		/*
 		 * Only handover if Symptoms tells us to do so.
 		 */
-		if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
-		    mptcp_is_wifi_unusable() &&
-		    besttp->t_rxtshift >= mptcp_fail_thresh)
-			return (mptcp_return_subflow(second_best));
+		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
+		    mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
+			return mptcp_return_subflow(second_best);
+		}
 
-		return (mptcp_return_subflow(best));
+		return mptcp_return_subflow(best);
 	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
 		int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
 		int rto_thresh = mptcp_rtothresh;
 
 		/* Adjust with symptoms information */
-		if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
-		    mptcp_is_wifi_unusable()) {
+		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
+		    mptcp_is_wifi_unusable_for_session(mpte) != 0) {
 			rtt_thresh /= 2;
 			rto_thresh /= 2;
 		}
@@ -852,12 +940,12 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
 			    second_best->mpts_connid,
 			    secondtp->t_srtt >> TCP_RTT_SHIFT),
 			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
-			return (mptcp_return_subflow(second_best));
+			return mptcp_return_subflow(second_best);
 		}
 
-		if (besttp->t_rxtshift >= mptcp_fail_thresh &&
+		if (mptcp_subflow_is_slow(mpte, best) &&
 		    secondtp->t_rxtshift == 0) {
-			return (mptcp_return_subflow(second_best));
+			return mptcp_return_subflow(second_best);
 		}
 
 		/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
@@ -870,7 +958,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
 			    second_best->mpts_connid,
 			    secondtp->t_rxtcur),
 			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
-			return (mptcp_return_subflow(second_best));
+			return mptcp_return_subflow(second_best);
 		}
 
 		/*
@@ -878,8 +966,8 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
 		 * were true. So, let's schedule on the best one, if he still
 		 * has some space in the congestion-window.
 		 */
-		return (mptcp_return_subflow(best));
-	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
+		return mptcp_return_subflow(best);
+	} else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
 		struct mptsub *tmp;
 
 		/*
@@ -897,15 +985,16 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
 		}
 
 		/* Is there still space in the congestion window? */
-		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0)
-			return (mptcp_return_subflow(second_best));
+		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
+			return mptcp_return_subflow(second_best);
+		}
 
-		return (mptcp_return_subflow(best));
+		return mptcp_return_subflow(best);
 	} else {
 		panic("Unknown service-type configured for MPTCP");
 	}
 
-	return (NULL);
+	return NULL;
 }
 
 static const char *
@@ -923,13 +1012,13 @@ mptcp_event_to_str(uint32_t event)
 		c = "MPCE_RECV_DATA_FIN";
 		break;
 	}
-	return (c);
+	return c;
 }
 
 static const char *
 mptcp_state_to_str(mptcp_state_t state)
 {
-	const char *c = "UNDEFINED";
+	const char *c = "UNDEFINED";
 	switch (state) {
 	case MPTCPS_CLOSED:
 		c = "MPTCPS_CLOSED";
		break;
@@ -962,13 +1051,16 @@ mptcp_state_to_str(mptcp_state_t state)
 		c = "MPTCPS_TERMINATE";
 		break;
 	}
-	return (c);
+	return c;
 }
 
 void
 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
 {
-	mpte_lock_assert_held(mp_tp->mpt_mpte);
+	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
+
+	socket_lock_assert_owned(mp_so);
+
 	mptcp_state_t old_state = mp_tp->mpt_state;
 
 	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
@@ -977,7 +1069,7 @@ mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
 	switch (mp_tp->mpt_state) {
 	case MPTCPS_CLOSED:
 	case MPTCPS_LISTEN:
-		mp_tp->mpt_state = MPTCPS_CLOSED;
+		mp_tp->mpt_state = MPTCPS_TERMINATE;
 		break;
 
 	case MPTCPS_ESTABLISHED:
@@ -1007,13 +1099,15 @@ mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
 		break;
 
 	case MPTCPS_CLOSING:
-		if (event == MPCE_RECV_DATA_ACK)
+		if (event == MPCE_RECV_DATA_ACK) {
 			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
+		}
 		break;
 
 	case MPTCPS_LAST_ACK:
-		if (event == MPCE_RECV_DATA_ACK)
+		if (event == MPCE_RECV_DATA_ACK) {
 			mptcp_close(mp_tp->mpt_mpte, mp_tp);
+		}
 		break;
 
 	case MPTCPS_FIN_WAIT_2:
@@ -1057,7 +1151,6 @@ mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
 	mptcp_update_rcv_state_meat(mp_tp, tp,
 	    full_dsn, dss_info->mdss_subflow_seqn,
 	    dss_info->mdss_data_len, csum);
-
 }
 
 void
@@ -1066,29 +1159,19 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
     uint16_t csum)
 {
 	if (mdss_data_len == 0) {
-		mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
-		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
+		os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
+		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
 
 		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
-			mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
-			    csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
+			os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
+			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
 		}
 		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
 		return;
 	}
 
-	mptcplog((LOG_DEBUG,
-	    "%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n", __func__,
-	    seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt),
-	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
-
-	/* Process a Data FIN packet , handled in mptcp_do_fin_opt */
-	if ((seqn == 0) && (mdss_data_len == 1)) {
-		mptcplog((LOG_INFO, "%s: Data FIN in %s state \n", __func__,
-		    mptcp_state_to_str(mp_tp->mpt_state)),
-		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
-		return;
-	}
+	mptcp_notify_mpready(tp->t_inpcb->inp_socket);
+
 	tp->t_rcv_map.mpt_dsn = full_dsn;
 	tp->t_rcv_map.mpt_sseq = seqn;
 	tp->t_rcv_map.mpt_len = mdss_data_len;
@@ -1103,16 +1186,16 @@ mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
 {
 	u_int32_t datalen;
 
-	if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
+	if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
 		return 0;
+	}
 
 	datalen = m->m_pkthdr.mp_rlen;
 
 	/* unacceptable DSS option, fallback to TCP */
 	if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
-		mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
-		    __func__, m->m_pkthdr.len, datalen),
-		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
+		os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
+		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
 	} else {
 		return 0;
 	}
@@ -1123,83 +1206,93 @@ mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
 }
 
 int
-mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, int drop_hdrlen)
+mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+    int drop_hdrlen)
 {
-	mptcp_insert_rmap(tp, m);
+	mptcp_insert_rmap(tp, m, th);
 	if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
-	    drop_hdrlen) != 0)
+	    drop_hdrlen) != 0) {
 		return -1;
-	return 0;
-}
-
-/*
- * MPTCP Checksum support
- * The checksum is calculated whenever the MPTCP DSS option is included
- * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
- * header and the actual data indicated by the length specified in the
- * DSS option.
- */
-
-int
-mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
-    uint32_t sseq, uint16_t dlen, uint16_t csum)
-{
-	uint16_t mptcp_csum;
-
-	mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum);
-	if (mptcp_csum) {
-		tp->t_mpflags |= TMPF_SND_MPFAIL;
-		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
-		m_freem(m);
-		tcpstat.tcps_mp_badcsum++;
-		return (-1);
 	}
-	return (0);
+	return 0;
 }
 
 static uint16_t
 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
-    uint16_t dlen, uint16_t csum)
+    uint16_t dlen, uint16_t csum, int dfin)
 {
 	struct mptcb *mp_tp = tptomptp(tp);
+	int real_len = dlen - dfin;
 	uint32_t sum = 0;
 
-	if (mp_tp == NULL)
-		return (0);
+	VERIFY(real_len >= 0);
 
-	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
-		return (0);
+	if (mp_tp == NULL) {
+		return 0;
+	}
 
-	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
-		return (0);
+	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
+		return 0;
+	}
+
+	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
+		return 0;
+	}
 
 	/*
 	 * The remote side may send a packet with fewer bytes than the
 	 * claimed DSS checksum length.
 	 */
-	if ((int)m_length2(m, NULL) < dlen)
-		return (0xffff);
+	if ((int)m_length2(m, NULL) < real_len) {
+		return 0xffff;
+	}
 
-	if (dlen != 0)
-		sum = m_sum16(m, 0, dlen);
+	if (real_len != 0) {
+		sum = m_sum16(m, 0, real_len);
+	}
 
 	sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
 	ADDCARRY(sum);
+
 	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
 	    uint32_t, sum);
 
-	mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
-	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
-	return (~sum & 0xffff);
+	return ~sum & 0xffff;
 }
 
-uint32_t
+/*
+ * MPTCP Checksum support
+ * The checksum is calculated whenever the MPTCP DSS option is included
+ * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
+ * header and the actual data indicated by the length specified in the
+ * DSS option.
+ */
+
+int
+mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
+    uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
+{
+	uint16_t mptcp_csum;
+
+	mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
+	if (mptcp_csum) {
+		tp->t_mpflags |= TMPF_SND_MPFAIL;
+		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
+		m_freem(m);
+		tcpstat.tcps_mp_badcsum++;
+		return -1;
+	}
+	return 0;
+}
+
+uint16_t
 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
 {
-	u_int32_t sum = 0;
+	uint32_t sum = 0;
 
-	if (dlen)
+	if (dlen) {
 		sum = m_sum16(m, 0, dlen);
+	}
 
 	dss_val = mptcp_hton64(dss_val);
 	sseq = htonl(sseq);
The checksum includes the sum of the MPTCP psuedo + * header and the actual data indicated by the length specified in the + * DSS option. + */ + +int +mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, + uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin) +{ + uint16_t mptcp_csum; + + mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin); + if (mptcp_csum) { + tp->t_mpflags |= TMPF_SND_MPFAIL; + mptcp_notify_mpfail(tp->t_inpcb->inp_socket); + m_freem(m); + tcpstat.tcps_mp_badcsum++; + return -1; + } + return 0; +} + +uint16_t mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen) { - u_int32_t sum = 0; + uint32_t sum = 0; - if (dlen) + if (dlen) { sum = m_sum16(m, 0, dlen); + } dss_val = mptcp_hton64(dss_val); sseq = htonl(sseq); @@ -1210,9 +1303,9 @@ mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen sum = ~sum & 0xffff; DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum); mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum), - MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); + MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE); - return sum; + return (uint16_t)sum; } /* @@ -1233,14 +1326,13 @@ mptcp_no_rto_spike(struct socket *so) __func__, spike, tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT, tp->t_rttcur), - (MPTCP_SOCKET_DBG|MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG); - + (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG); } - if (spike > 0 ) { - return (FALSE); + if (spike > 0) { + return FALSE; } else { - return (TRUE); + return TRUE; } } @@ -1250,8 +1342,9 @@ mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag) VERIFY(mpp->mpp_flags & flag); mpp->mpp_flags &= ~flag; - if (mptcp_should_defer_upcall(mpp)) + if (mptcp_should_defer_upcall(mpp)) { return; + } if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) { mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP; @@ -1270,171 +1363,199 @@ mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag) sowwakeup(mpp->mpp_socket); } - - if (mpp->mpp_flags & MPP_SET_CELLICON) { - mpp->mpp_flags &= ~MPP_SET_CELLICON; - - mptcp_set_cellicon(mpp->mpp_pcbe); - } - - if (mpp->mpp_flags & MPP_UNSET_CELLICON) { - mpp->mpp_flags &= ~MPP_UNSET_CELLICON; - - mptcp_unset_cellicon(); - } -} - -static void -mptcp_ask_for_nat64(struct ifnet *ifp) -{ - in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL); - - mptcplog((LOG_DEBUG, "%s: asked for NAT64-prefix on %s\n", - __func__, ifp->if_name), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); } static void mptcp_reset_itfinfo(struct mpt_itf_info *info) { - info->ifindex = 0; - info->has_v4_conn = 0; - info->has_v6_conn = 0; + memset(info, 0, sizeof(*info)); } void -mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) +mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, + uint32_t necp_flags, __unused bool *viable) { + boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4); + boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6); + boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64); + boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER); struct mppcb *mp = (struct mppcb *)handle; struct mptses *mpte = mptompte(mp); struct socket *mp_so; struct mptcb *mp_tp; - int locked = 0; uint32_t i, ifindex; + struct ifnet *ifp; + int locked = 0; - ifindex = flow->interface_index; + ifindex = interface_index; VERIFY(ifindex != IFSCOPE_NONE); - /* ToDo - remove after rdar://problem/32007628 */ - if 
@@ -1270,171 +1363,199 @@ mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
 
 		sowwakeup(mpp->mpp_socket);
 	}
-
-	if (mpp->mpp_flags & MPP_SET_CELLICON) {
-		mpp->mpp_flags &= ~MPP_SET_CELLICON;
-
-		mptcp_set_cellicon(mpp->mpp_pcbe);
-	}
-
-	if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
-		mpp->mpp_flags &= ~MPP_UNSET_CELLICON;
-
-		mptcp_unset_cellicon();
-	}
-}
-
-static void
-mptcp_ask_for_nat64(struct ifnet *ifp)
-{
-	in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);
-
-	mptcplog((LOG_DEBUG, "%s: asked for NAT64-prefix on %s\n",
-	    __func__, ifp->if_name), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 }
 
 static void
 mptcp_reset_itfinfo(struct mpt_itf_info *info)
 {
-	info->ifindex = 0;
-	info->has_v4_conn = 0;
-	info->has_v6_conn = 0;
+	memset(info, 0, sizeof(*info));
 }
 
 void
-mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
+mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
+    uint32_t necp_flags, __unused bool *viable)
 {
+	boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
+	boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
+	boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
+	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
 	struct mppcb *mp = (struct mppcb *)handle;
 	struct mptses *mpte = mptompte(mp);
 	struct socket *mp_so;
 	struct mptcb *mp_tp;
-	int locked = 0;
 	uint32_t i, ifindex;
+	struct ifnet *ifp;
+	int locked = 0;
 
-	ifindex = flow->interface_index;
+	ifindex = interface_index;
 	VERIFY(ifindex != IFSCOPE_NONE);
 
-	/* ToDo - remove after rdar://problem/32007628 */
-	if (!IF_INDEX_IN_RANGE(ifindex))
-		printf("%s 1 ifindex %u not in range of flow %p action %d\n",
-		    __func__, ifindex, flow, action);
-
 	/* About to be garbage-collected (see note about MPTCP/NECP interactions) */
-	if (mp->mpp_socket->so_usecount == 0)
+	if (mp->mpp_socket->so_usecount == 0) {
 		return;
+	}
+
+	mp_so = mptetoso(mpte);
 
 	if (action != NECP_CLIENT_CBACTION_INITIAL) {
-		mpte_lock(mpte);
+		socket_lock(mp_so, 1);
 		locked = 1;
 
 		/* Check again, because it might have changed while waiting */
-		if (mp->mpp_socket->so_usecount == 0)
+		if (mp->mpp_socket->so_usecount == 0) {
 			goto out;
+		}
 	}
 
+	socket_lock_assert_owned(mp_so);
+
 	mp_tp = mpte->mpte_mptcb;
-	mp_so = mptetoso(mpte);
 
-	mptcplog((LOG_DEBUG, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u\n",
-	    __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state),
-	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+	ifnet_head_lock_shared();
+	ifp = ifindex2ifnet[ifindex];
+	ifnet_head_done();
+
+	os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
+	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
+	    ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
+	    mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
+	    has_v4, has_v6, has_nat64, low_power);
 
 	/* No need on fallen back sockets */
-	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
+	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
 		goto out;
+	}
+
+	/*
+	 * When the interface goes in low-power mode we don't want to establish
+	 * new subflows on it. Thus, mark it internally as non-viable.
+	 */
+	if (low_power) {
+		action = NECP_CLIENT_CBACTION_NONVIABLE;
+	}
 
 	if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
 		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
-			if (mpte->mpte_itfinfo[i].ifindex == ifindex)
+			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
+				continue;
+			}
+
+			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
 				mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
+			}
 		}
 
 		mptcp_sched_create_subflows(mpte);
 	} else if (action == NECP_CLIENT_CBACTION_VIABLE ||
-	    action == NECP_CLIENT_CBACTION_INITIAL) {
-		int found_empty = 0, empty_index = -1;
-		struct ifnet *ifp;
-
-		/* ToDo - remove after rdar://problem/32007628 */
-		if (!IF_INDEX_IN_RANGE(ifindex))
-			printf("%s 2 ifindex %u not in range of flow %p action %d\n",
-			    __func__, ifindex, flow, action);
-
-		ifnet_head_lock_shared();
-		ifp = ifindex2ifnet[ifindex];
-		ifnet_head_done();
-
-		/* ToDo - remove after rdar://problem/32007628 */
-		if (!IF_INDEX_IN_RANGE(ifindex))
-			printf("%s 3 ifindex %u not in range of flow %p action %d\n",
-			    __func__, ifindex, flow, action);
+	    action == NECP_CLIENT_CBACTION_INITIAL) {
+		int found_slot = 0, slot_index = -1;
+		struct sockaddr *dst;
 
-		if (ifp == NULL)
+		if (ifp == NULL) {
 			goto out;
+		}
+
+		if (IFNET_IS_COMPANION_LINK(ifp)) {
+			goto out;
+		}
 
 		if (IFNET_IS_EXPENSIVE(ifp) &&
-		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
+		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
 			goto out;
+		}
+
+		if (IFNET_IS_CONSTRAINED(ifp) &&
+		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
+			goto out;
+		}
 
 		if (IFNET_IS_CELLULAR(ifp) &&
-		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
+		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
 			goto out;
+		}
+
+		if (IS_INTF_CLAT46(ifp)) {
+			has_v4 = FALSE;
+		}
 
+		/* Look for the slot on where to store/update the interface-info. */
 		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+			/* Found a potential empty slot where we can put it */
 			if (mpte->mpte_itfinfo[i].ifindex == 0) {
-				found_empty = 1;
-				empty_index = i;
+				found_slot = 1;
+				slot_index = i;
+			}
+
+			/*
+			 * The interface is already in our array. Check if we
+			 * need to update it.
+			 */
+			if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
+			    (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
+			    mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
+			    mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
+				found_slot = 1;
+				slot_index = i;
+				break;
 			}
 
 			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
-				/* Ok, it's already there */
+				/*
+				 * Ok, it's already there and we don't need
+				 * to update it
+				 */
 				goto out;
 			}
 		}
 
-		if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
-		    !(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4) &&
-		    ifnet_get_nat64prefix(ifp, NULL) == ENOENT) {
-			mptcp_ask_for_nat64(ifp);
+		dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
+		if (dst && dst->sa_family == AF_INET &&
+		    has_v6 && !has_nat64 && !has_v4) {
+			if (found_slot) {
+				mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
+				mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
+				mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
+				mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
+			}
 			goto out;
 		}
 
-		if (found_empty == 0) {
+		if (found_slot == 0) {
 			int new_size = mpte->mpte_itfinfo_size * 2;
 			struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size,
 			    M_TEMP, M_ZERO);
 
 			if (info == NULL) {
-				mptcplog((LOG_ERR, "%s malloc failed for %u\n", __func__, new_size),
-				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+				os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
+				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
 				goto out;
 			}
 
 			memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
 
-			if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
+			if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
 				_FREE(mpte->mpte_itfinfo, M_TEMP);
+			}
 
 			/* We allocated a new one, thus the first must be empty */
-			empty_index = mpte->mpte_itfinfo_size;
+			slot_index = mpte->mpte_itfinfo_size;
 
 			mpte->mpte_itfinfo = info;
 			mpte->mpte_itfinfo_size = new_size;
-
-			mptcplog((LOG_DEBUG, "%s Needed to realloc to %u\n", __func__, new_size),
-			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 		}
 
-		VERIFY(empty_index >= 0 && empty_index < (int)mpte->mpte_itfinfo_size);
-		mpte->mpte_itfinfo[empty_index].ifindex = ifindex;
-		mpte->mpte_itfinfo[empty_index].has_v4_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
-		mpte->mpte_itfinfo[empty_index].has_v6_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
+		VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
+		mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
+		mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
+		mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
+		mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
 
 		mptcp_sched_create_subflows(mpte);
 	}
 
 out:
-	if (locked)
-		mpte_unlock(mpte);
+	if (locked) {
		socket_unlock(mp_so, 1);
+	}
 }
 
 void
@@ -1443,7 +1564,7 @@ mptcp_set_restrictions(struct socket *mp_so)
 	struct mptses *mpte = mpsotompte(mp_so);
 	uint32_t i;
 
-	mpte_lock_assert_held(mpte);
+	socket_lock_assert_owned(mp_so);
 
 	ifnet_head_lock_shared();
 
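The NECP callback above keeps the per-session interface table in a flat array that starts at MPTE_ITFINFO_SIZE entries and doubles whenever a new interface does not fit. The growth pattern, reduced to its essentials in a hypothetical userspace helper (not kernel code; the kernel uses _MALLOC/_FREE and keeps its initial array inline):

#include <stdlib.h>
#include <string.h>

struct itf_info {
	unsigned int ifindex;
	unsigned int has_v4_conn : 1;
	unsigned int has_v6_conn : 1;
	unsigned int has_nat64_conn : 1;
};

/* Double the array; returns the index of the first free slot, or -1 on failure. */
static int
itfinfo_grow(struct itf_info **arr, unsigned int *size)
{
	unsigned int new_size = *size * 2;
	struct itf_info *info = calloc(new_size, sizeof(*info));

	if (info == NULL)
		return -1;

	memcpy(info, *arr, *size * sizeof(**arr));
	free(*arr);

	/* The old length is the first empty slot of the doubled array. */
	*arr = info;
	*size = new_size;
	return (int)(new_size / 2);
}

int
main(void)
{
	unsigned int size = 4;
	struct itf_info *arr = calloc(size, sizeof(*arr));
	int slot = arr ? itfinfo_grow(&arr, &size) : -1;

	free(arr);
	return slot == 4 ? 0 : 1;
}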
@@ -1452,20 +1573,30 @@ mptcp_set_restrictions(struct socket *mp_so)
 		uint32_t ifindex = info->ifindex;
 		struct ifnet *ifp;
 
-		if (ifindex == IFSCOPE_NONE)
+		if (ifindex == IFSCOPE_NONE) {
 			continue;
+		}
 
 		ifp = ifindex2ifnet[ifindex];
+		if (ifp == NULL) {
+			continue;
+		}
 
 		if (IFNET_IS_EXPENSIVE(ifp) &&
-		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
+		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
 			info->ifindex = IFSCOPE_NONE;
+		}
+
+		if (IFNET_IS_CONSTRAINED(ifp) &&
+		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
+			info->ifindex = IFSCOPE_NONE;
+		}
 
 		if (IFNET_IS_CELLULAR(ifp) &&
-		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
+		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
 			info->ifindex = IFSCOPE_NONE;
+		}
 	}
 
 	ifnet_head_done();
 }
-