2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 * A note on the MPTCP/NECP-interactions:
32 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
33 * MPTCP registers to these events at the MPTCP-layer for interface-events
34 * through a call to necp_client_register_multipath_cb.
35 * To get per-flow events (aka per TCP-subflow), we register to it with
36 * necp_client_register_socket_flow. Both registrations happen by using the
37 * necp-client-uuid that comes from the app.
39 * The locking is rather tricky. In general, we expect the lock-ordering to
40 * happen from necp-fd -> necp-client -> mpp_lock.
42 * There are however some subtleties.
44 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
45 * safe, because it is the very first time this MPTCP-connection goes into NECP.
46 * As we go into NECP we take the NECP-locks and thus are guaranteed that no
47 * NECP-locks will deadlock us. Because these NECP-events will also first take
48 * the NECP-locks. Either they win the race and thus won't find our
49 * MPTCP-connection. Or, MPTCP wins the race and thus it will safely install
50 * the callbacks while holding the NECP lock.
52 * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This,
53 * because we have already registered callbacks and we might race against an
54 * NECP-event that will match on our socket. So, we have to unlock to be safe.
56 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). The
57 * so_usecount has reached 0. We must be careful to not remove the mpp_socket
58 * pointers before we unregistered the callback. Because, again we might be
59 * racing against an NECP-event. Unregistering must happen with an unlocked
60 * mpp_lock, because of the lock-ordering constraint. It could be that
61 * before we had a chance to unregister an NECP-event triggers. That's why
62 * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
63 * there while the socket is being garbage-collected, the use-count will go
64 * down to 0 and we exit. Removal of the multipath_cb again happens by taking
65 * the NECP-locks so any running NECP-events will finish first and exit cleanly.
67 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
68 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
69 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
70 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
71 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
72 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
73 * gets it, it will realize that the subflow became non-MPTCP and retry (see
74 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
75 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
76 * for the NECP-lock (held by the other thread that is taking care of the NECP-
77 * event). So, the event now finally gets the subflow-lock and then hits an
78 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
86 #include <sys/mcache.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/syslog.h>
90 #include <sys/protosw.h>
92 #include <kern/zalloc.h>
93 #include <kern/locks.h>
98 #include <netinet/in.h>
99 #include <netinet/in_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103 #include <netinet/tcp_var.h>
104 #include <netinet/mptcp_var.h>
105 #include <netinet/mptcp.h>
106 #include <netinet/mptcp_seq.h>
107 #include <netinet/mptcp_opt.h>
108 #include <netinet/mptcp_timer.h>
110 int mptcp_enable
= 1;
111 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, enable
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
112 &mptcp_enable
, 0, "Enable Multipath TCP Support");
114 /* Number of times to try negotiating MPTCP on SYN retransmissions */
115 int mptcp_mpcap_retries
= MPTCP_CAPABLE_RETRIES
;
116 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, mptcp_cap_retr
,
117 CTLFLAG_RW
| CTLFLAG_LOCKED
,
118 &mptcp_mpcap_retries
, 0, "Number of MP Capable SYN Retries");
121 * By default, DSS checksum is turned off, revisit if we ever do
122 * MPTCP for non SSL Traffic.
124 int mptcp_dss_csum
= 0;
125 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, dss_csum
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
126 &mptcp_dss_csum
, 0, "Enable DSS checksum");
129 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
130 * is attempted on a different path.
132 int mptcp_fail_thresh
= 1;
133 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, fail
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
134 &mptcp_fail_thresh
, 0, "Failover threshold");
138 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
139 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
140 * Some carrier networks have a timeout of 10 or 15 minutes.
142 int mptcp_subflow_keeptime
= 60 * 14;
143 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, keepalive
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
144 &mptcp_subflow_keeptime
, 0, "Keepalive in seconds");
146 int mptcp_rtthist_rtthresh
= 600;
147 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rtthist_thresh
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
148 &mptcp_rtthist_rtthresh
, 0, "Rtt threshold");
151 * Use RTO history for sending new data
153 int mptcp_use_rto
= 1;
154 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, userto
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
155 &mptcp_use_rto
, 0, "Disable RTO for subflow selection");
157 int mptcp_rtothresh
= 1500;
158 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rto_thresh
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
159 &mptcp_rtothresh
, 0, "RTO threshold");
162 * Probe the preferred path, when it is not in use
164 uint32_t mptcp_probeto
= 1000;
165 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, probeto
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
166 &mptcp_probeto
, 0, "Disable probing by setting to 0");
168 uint32_t mptcp_probecnt
= 5;
169 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, probecnt
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
170 &mptcp_probecnt
, 0, "Number of probe writes");
173 * Static declarations
/*
 * Forward declaration of the receive-side DSS checksum helper; the
 * parameter names mirror those used by its definition later in the file.
 */
static uint16_t mptcp_input_csum(struct tcpcb *tp, struct mbuf *m,
    uint64_t dsn, uint32_t sseq, uint16_t dlen, uint16_t csum,
    uint16_t dfin);
/*
 * mptcp_reass_present: hand in-order data from the MPTCP reassembly
 * queue (mpt_segq) to the MPTCP socket.
 *
 * Visible behaviour: starting at the head of mpt_segq, every entry whose
 * data sequence number equals mpt_rcvnxt is unlinked, mpt_rcvnxt is
 * advanced by the entry's tqe_len, the entry's mbuf is passed to
 * sbappendstream_rcvdemux() and the queue entry goes back to
 * tcp_reass_zone.  MPTCPF_REASS_INPROG is set while the loop runs and
 * cleared afterwards; sorwakeup() is called on the MPTCP socket.
 *
 * NOTE(review): the return type, local declarations, the early-exit
 * bodies and the body of the SS_CANTRCVMORE branch are not visible in
 * this view -- confirm them against the full source.  mptcp_reass()
 * returns this function's result, and "flags" is derived from
 * PKTF_MPTCP_DFIN, so the result presumably reports a DATA_FIN.
 */
179 mptcp_reass_present(struct socket
*mp_so
)
181 struct mptcb
*mp_tp
= mpsotomppcb(mp_so
)->mpp_pcbe
->mpte_mptcb
;
187 * Present data to user, advancing rcv_nxt through
188 * completed sequence space.
190 if (mp_tp
->mpt_state
< MPTCPS_ESTABLISHED
) {
193 q
= LIST_FIRST(&mp_tp
->mpt_segq
);
/* Nothing deliverable unless the head segment starts exactly at rcvnxt. */
194 if (!q
|| q
->tqe_m
->m_pkthdr
.mp_dsn
!= mp_tp
->mpt_rcvnxt
) {
199 * If there is already another thread doing reassembly for this
200 * connection, it is better to let it finish the job --
203 if (mp_tp
->mpt_flags
& MPTCPF_REASS_INPROG
) {
207 mp_tp
->mpt_flags
|= MPTCPF_REASS_INPROG
;
210 mp_tp
->mpt_rcvnxt
+= q
->tqe_len
;
211 LIST_REMOVE(q
, tqe_q
);
212 if (mp_so
->so_state
& SS_CANTRCVMORE
) {
/* Record whether this segment carried a DATA_FIN. */
215 flags
= !!(q
->tqe_m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
);
216 if (sbappendstream_rcvdemux(mp_so
, q
->tqe_m
, 0, 0)) {
220 zfree(tcp_reass_zone
, q
);
221 mp_tp
->mpt_reassqlen
--;
222 q
= LIST_FIRST(&mp_tp
->mpt_segq
);
223 } while (q
&& q
->tqe_m
->m_pkthdr
.mp_dsn
== mp_tp
->mpt_rcvnxt
);
224 mp_tp
->mpt_flags
&= ~MPTCPF_REASS_INPROG
;
227 sorwakeup(mp_so
); /* done with socket lock held */
/*
 * mptcp_reass: insert a segment into the MPTCP-level reassembly queue
 * (mpt_segq) and then try to deliver whatever has become in-order.
 *
 *   mp_so  MPTCP socket that owns the queue.
 *   phdr   packet header of the segment; its mp_dsn is the segment's
 *          data sequence number.
 *   tlenp  pointer to the segment length; read when computing overlap
 *          and stored into the new entry's tqe_len.
 *   m      mbuf holding the segment data.
 *
 * Visible steps:
 *   1. Compute a queue limit from so_rcv.sb_hiwat and
 *      tcp_autorcvbuf_max; a segment that is not the next expected one
 *      is counted in tcps_mptcp_rcvmemdrop when the queue is at the
 *      limit.
 *   2. Allocate a queue entry from tcp_reass_zone.
 *   3. Walk the queue to find the first segment beginning after mb_dsn.
 *   4. Compute the overlap with the preceding segment (p); duplicates
 *      are counted in tcps_mptcp_rcvduppack and the entry is freed.
 *   5. Trim or dequeue following segments that the new one overlaps.
 *   6. Link the entry at the head of the queue or after p.
 *   7. Return the result of mptcp_reass_present().
 *
 * NOTE(review): the return type, several local declarations and most
 * branch bodies (drops, m_adj/m_freem calls, goto targets) are not
 * visible in this view -- confirm them against the full source.
 */
233 mptcp_reass(struct socket
*mp_so
, struct pkthdr
*phdr
, int *tlenp
, struct mbuf
*m
)
235 struct mptcb
*mp_tp
= mpsotomppcb(mp_so
)->mpp_pcbe
->mpte_mptcb
;
236 u_int64_t mb_dsn
= phdr
->mp_dsn
;
238 struct tseg_qent
*p
= NULL
;
239 struct tseg_qent
*nq
;
240 struct tseg_qent
*te
= NULL
;
244 * Limit the number of segments in the reassembly queue to prevent
245 * holding on to too many segments (and thus running out of mbufs).
246 * Make sure to let the missing segment through which caused this
247 * queue. Always keep one global queue entry spare to be able to
248 * process the missing segment.
250 qlimit
= min(max(100, mp_so
->so_rcv
.sb_hiwat
>> 10),
251 (tcp_autorcvbuf_max
>> 10));
252 if (mb_dsn
!= mp_tp
->mpt_rcvnxt
&&
253 (mp_tp
->mpt_reassqlen
+ 1) >= qlimit
) {
254 tcpstat
.tcps_mptcp_rcvmemdrop
++;
260 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
261 te
= (struct tseg_qent
*) zalloc(tcp_reass_zone
);
263 tcpstat
.tcps_mptcp_rcvmemdrop
++;
268 mp_tp
->mpt_reassqlen
++;
271 * Find a segment which begins after this one does.
273 LIST_FOREACH(q
, &mp_tp
->mpt_segq
, tqe_q
) {
274 if (MPTCP_SEQ_GT(q
->tqe_m
->m_pkthdr
.mp_dsn
, mb_dsn
)) {
281 * If there is a preceding segment, it may provide some of
282 * our data already. If so, drop the data from the incoming
283 * segment. If it provides all of our data, drop us.
287 /* conversion to int (in i) handles seq wraparound */
288 i
= p
->tqe_m
->m_pkthdr
.mp_dsn
+ p
->tqe_len
- mb_dsn
;
291 tcpstat
.tcps_mptcp_rcvduppack
++;
293 zfree(tcp_reass_zone
, te
);
295 mp_tp
->mpt_reassqlen
--;
297 * Try to present any queued data
298 * at the left window edge to the user.
299 * This is needed after the 3-WHS
310 tcpstat
.tcps_mp_oodata
++;
313 * While we overlap succeeding segments trim them or,
314 * if they are completely covered, dequeue them.
317 int64_t i
= (mb_dsn
+ *tlenp
) - q
->tqe_m
->m_pkthdr
.mp_dsn
;
/* Partial overlap: the queued segment's DSN is moved forward by i. */
322 if (i
< q
->tqe_len
) {
323 q
->tqe_m
->m_pkthdr
.mp_dsn
+= i
;
329 nq
= LIST_NEXT(q
, tqe_q
);
330 LIST_REMOVE(q
, tqe_q
);
332 zfree(tcp_reass_zone
, q
);
333 mp_tp
->mpt_reassqlen
--;
337 /* Insert the new segment queue entry into place. */
340 te
->tqe_len
= *tlenp
;
343 LIST_INSERT_HEAD(&mp_tp
->mpt_segq
, te
, tqe_q
);
345 LIST_INSERT_AFTER(p
, te
, tqe_q
);
349 return mptcp_reass_present(mp_so
);
353 * MPTCP input, called when data has been read from a subflow socket.
/*
 * mptcp_input: process an mbuf chain read from a subflow and deliver it
 * to the MPTCP socket.
 *
 *   mpte  MPTCP session; its lock must be held (asserted below).
 *   m     mbuf chain; the first mbuf must carry M_PKTHDR (VERIFY'd).
 *
 * Visible behaviour:
 *   - mpt_rcvwnd is refreshed from mptcp_sbspace().
 *   - Fallback path (MPTCPF_FALLBACK_TO_TCP): the chain is scanned for
 *     PKTF_MPTCP_DFIN and zero-length packet headers, then appended
 *     with sbappendstream_rcvdemux(); mpt_rcvnxt advances by the growth
 *     of so_rcv.sb_cc, and a DATA_FIN drives mptcp_close_fsm() plus
 *     socantrcvmore().
 *   - Regular path: for each DSS mapping (mp_dsn / mp_rlen of the first
 *     mbuf) data beyond the receive window is dropped from the right
 *     edge (clearing PKTF_MPTCP_DFIN), data below mpt_rcvnxt is trimmed
 *     with m_adj() or discarded, and the remainder is either queued via
 *     mptcp_reass() (out of order, or queue non-empty) or appended
 *     directly; counters tcps_mp_rcvtotal / tcps_mp_rcvbytes are
 *     updated and DATA_FIN is handled as in the fallback path.
 *
 * NOTE(review): the return type, several locals (iter, mb_dsn,
 * mb_datalen, todrop, mb_dfin), loop headers and most branch bodies
 * are not visible in this view -- confirm against the full source.
 */
356 mptcp_input(struct mptses
*mpte
, struct mbuf
*m
)
358 struct socket
*mp_so
;
359 struct mptcb
*mp_tp
= NULL
;
360 int count
= 0, wakeup
= 0;
361 struct mbuf
*save
= NULL
, *prev
= NULL
;
362 struct mbuf
*freelist
= NULL
, *tail
= NULL
;
364 VERIFY(m
->m_flags
& M_PKTHDR
);
366 mpte_lock_assert_held(mpte
); /* same as MP socket lock */
368 mp_so
= mptetoso(mpte
);
369 mp_tp
= mpte
->mpte_mptcb
;
373 mp_tp
->mpt_rcvwnd
= mptcp_sbspace(mp_tp
);
376 * Each mbuf contains MPTCP Data Sequence Map
377 * Process the data for reassembly, delivery to MPTCP socket
/* Snapshot of the receive-buffer byte count, used to measure growth. */
381 count
= mp_so
->so_rcv
.sb_cc
;
384 * In the degraded fallback case, data is accepted without DSS map
386 if (mp_tp
->mpt_flags
& MPTCPF_FALLBACK_TO_TCP
) {
390 mptcp_sbrcv_grow(mp_tp
);
394 if ((iter
->m_flags
& M_PKTHDR
) &&
395 (iter
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
)) {
399 if ((iter
->m_flags
& M_PKTHDR
) && m_pktlen(iter
) == 0) {
400 /* Don't add zero-length packets, so jump it! */
406 prev
->m_next
= iter
->m_next
;
411 /* It was a zero-length packet so next one must be a pkthdr */
412 VERIFY(iter
== NULL
|| iter
->m_flags
& M_PKTHDR
);
420 * assume degraded flow as this may be the first packet
421 * without DSS, and the subflow state is not updated yet.
423 if (sbappendstream_rcvdemux(mp_so
, m
, 0, 0)) {
427 DTRACE_MPTCP5(receive__degraded
, struct mbuf
*, m
,
428 struct socket
*, mp_so
,
429 struct sockbuf
*, &mp_so
->so_rcv
,
430 struct sockbuf
*, &mp_so
->so_snd
,
431 struct mptses
*, mpte
);
/* Bytes actually added to the receive buffer by the append above. */
432 count
= mp_so
->so_rcv
.sb_cc
- count
;
434 mp_tp
->mpt_rcvnxt
+= count
;
437 mptcp_close_fsm(mp_tp
, MPCE_RECV_DATA_FIN
);
438 socantrcvmore(mp_so
);
441 mptcplog((LOG_DEBUG
, "%s: Fallback read %d bytes\n", __func__
,
442 count
), MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
452 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
453 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
)) {
459 * A single TCP packet formed of multiple mbufs
460 * holds DSS mapping in the first mbuf of the chain.
461 * Other mbufs in the chain may have M_PKTHDR set
462 * even though they belong to the same TCP packet
463 * and therefore use the DSS mapping stored in the
464 * first mbuf of the mbuf chain. mptcp_input() can
465 * get an mbuf chain with multiple TCP packets.
467 while (save
&& (!(save
->m_flags
& M_PKTHDR
) ||
468 !(save
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
))) {
478 mb_dsn
= m
->m_pkthdr
.mp_dsn
;
479 mb_datalen
= m
->m_pkthdr
.mp_rlen
;
/* Number of bytes of this mapping that lie beyond rcvnxt + rcvwnd. */
481 todrop
= (mb_dsn
+ mb_datalen
) - (mp_tp
->mpt_rcvnxt
+ mp_tp
->mpt_rcvwnd
);
483 tcpstat
.tcps_mptcp_rcvpackafterwin
++;
485 if (todrop
>= mb_datalen
) {
486 if (freelist
== NULL
) {
503 mb_datalen
-= todrop
;
507 * We drop from the right edge of the mbuf, thus the
508 * DATA_FIN is dropped as well
510 m
->m_pkthdr
.pkt_flags
&= ~PKTF_MPTCP_DFIN
;
514 if (MPTCP_SEQ_LT(mb_dsn
, mp_tp
->mpt_rcvnxt
)) {
515 if (MPTCP_SEQ_LEQ((mb_dsn
+ mb_datalen
),
516 mp_tp
->mpt_rcvnxt
)) {
517 if (freelist
== NULL
) {
533 m_adj(m
, (mp_tp
->mpt_rcvnxt
- mb_dsn
));
535 mptcplog((LOG_INFO
, "%s: Left Edge %llu\n", __func__
,
537 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
/* Out-of-order data, or a non-empty queue, goes through reassembly. */
540 if (MPTCP_SEQ_GT(mb_dsn
, mp_tp
->mpt_rcvnxt
) ||
541 !LIST_EMPTY(&mp_tp
->mpt_segq
)) {
542 mb_dfin
= mptcp_reass(mp_so
, &m
->m_pkthdr
, &mb_datalen
, m
);
546 mb_dfin
= !!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
);
548 mptcp_sbrcv_grow(mp_tp
);
550 if (sbappendstream_rcvdemux(mp_so
, m
, 0, 0)) {
554 DTRACE_MPTCP6(receive
, struct mbuf
*, m
, struct socket
*, mp_so
,
555 struct sockbuf
*, &mp_so
->so_rcv
,
556 struct sockbuf
*, &mp_so
->so_snd
,
557 struct mptses
*, mpte
,
558 struct mptcb
*, mp_tp
);
559 count
= mp_so
->so_rcv
.sb_cc
- count
;
560 tcpstat
.tcps_mp_rcvtotal
++;
561 tcpstat
.tcps_mp_rcvbytes
+= count
;
562 mptcplog((LOG_DEBUG
, "%s: Read %d bytes\n", __func__
, count
),
563 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
565 mp_tp
->mpt_rcvnxt
+= count
;
569 mptcp_close_fsm(mp_tp
, MPCE_RECV_DATA_FIN
);
570 socantrcvmore(mp_so
);
/* Re-arm the byte-count snapshot for the next mapping. */
574 count
= mp_so
->so_rcv
.sb_cc
;
/*
 * mptcp_can_send_more: test whether the MPTCP connection has anything
 * it may transmit right now.  mptcp_output() uses the result as its
 * loop condition.
 *
 *   mp_tp            MPTCP control block to inspect.
 *   ignore_reinject  when false, a non-empty reinject queue
 *                    (mpte_reinjectq) is checked first.
 *
 * The visible tests, in order:
 *   - reinject queue non-empty (skipped when ignore_reinject is set);
 *   - sndnxt >= sndmax while SOF1_PRECONNECT_DATA is not set;
 *   - snduna + sndwnd <= sndnxt (no room in the peer's window);
 *   - sndnxt + 1 == sndmax with state > MPTCPS_CLOSE_WAIT;
 *   - state >= MPTCPS_FIN_WAIT_2.
 *
 * NOTE(review): the return type and every return statement are missing
 * from this view.  Going by the existing comments, the reinject case
 * presumably yields TRUE and the remaining cases FALSE -- confirm.
 */
587 mptcp_can_send_more(struct mptcb
*mp_tp
, boolean_t ignore_reinject
)
589 struct socket
*mp_so
= mptetoso(mp_tp
->mpt_mpte
);
592 * Always send if there is data in the reinject-queue.
594 if (!ignore_reinject
&& mp_tp
->mpt_mpte
->mpte_reinjectq
) {
601 * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
602 * Except when using TFO, we might be doing a 0-byte write.
603 * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
604 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
607 if (!(mp_so
->so_flags1
& SOF1_PRECONNECT_DATA
) && MPTCP_SEQ_GEQ(mp_tp
->mpt_sndnxt
, mp_tp
->mpt_sndmax
)) {
611 if (MPTCP_SEQ_LEQ(mp_tp
->mpt_snduna
+ mp_tp
->mpt_sndwnd
, mp_tp
->mpt_sndnxt
)) {
615 if (mp_tp
->mpt_sndnxt
+ 1 == mp_tp
->mpt_sndmax
&& mp_tp
->mpt_state
> MPTCPS_CLOSE_WAIT
) {
619 if (mp_tp
->mpt_state
>= MPTCPS_FIN_WAIT_2
) {
/*
 * mptcp_output: push pending MPTCP-level data onto subflows.
 *
 *   mpte  MPTCP session; its lock must be held (asserted below).
 *
 * Visible behaviour:
 *   - MPP_WUPCALL is set on the PCB for the duration of the call (it
 *     must not already be set) and released at the end through
 *     mptcp_handle_deferred_upcalls().
 *   - While mptcp_can_send_more() holds, a subflow is picked with
 *     mptcp_get_subflow().  If the pick equals the subflow already
 *     tried, or is failing over, the tried subflow is re-activated and
 *     the MPTT_REXMT timer is started ("retry later").
 *   - The send buffer may be auto-grown via sbreserve() when
 *     tcp_do_autosendbuf is on, SB_AUTOSIZE is set without SB_TRIM,
 *     the peer's window is large enough and the buffer is 7/8 full.
 *   - Data is sent with mptcp_subflow_output(); a failing subflow is
 *     flagged MPTSF_FAILINGOVER and loses MPTSF_ACTIVE, a working one
 *     gains MPTSF_ACTIVE and has its probe state reset.
 *   - When mptcp_probeto is non-zero and a different preferred subflow
 *     exists, that subflow is probed with MPTCP_SUBOUT_PROBING.
 *   - mpte_active_sub tracks the subflow in use; a switch is logged
 *     and counted through mptcpstats_inc_switch().
 *   - After the loop, a closing connection whose DATA_FIN is the only
 *     thing outstanding is finished with mptcp_finish_usrclosed().
 *
 * NOTE(review): the return type, the locals mpts/mp_tp/error, the
 * error-path conditions and loop exits are not visible in this view --
 * confirm against the full source.
 */
630 mptcp_output(struct mptses
*mpte
)
634 struct mptsub
*mpts_tried
= NULL
;
635 struct socket
*mp_so
;
636 struct mptsub
*preferred_mpts
= NULL
;
637 uint64_t old_snd_nxt
;
640 mpte_lock_assert_held(mpte
);
641 mp_so
= mptetoso(mpte
);
642 mp_tp
= mpte
->mpte_mptcb
;
/* Mark that a write upcall is in progress; must not be re-entered. */
644 VERIFY(!(mpte
->mpte_mppcb
->mpp_flags
& MPP_WUPCALL
));
645 mpte
->mpte_mppcb
->mpp_flags
|= MPP_WUPCALL
;
647 mptcplog((LOG_DEBUG
, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
648 __func__
, (uint32_t)mp_tp
->mpt_sndnxt
, (uint32_t)mp_tp
->mpt_sndmax
,
649 (uint32_t)mp_tp
->mpt_snduna
, mp_tp
->mpt_sndwnd
,
650 mpte
->mpte_reinjectq
? 1 : 0,
652 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
654 old_snd_nxt
= mp_tp
->mpt_sndnxt
;
655 while (mptcp_can_send_more(mp_tp
, FALSE
)) {
656 /* get the "best" subflow to be used for transmission */
657 mpts
= mptcp_get_subflow(mpte
, NULL
, &preferred_mpts
);
659 mptcplog((LOG_INFO
, "%s: no subflow\n", __func__
),
660 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
664 mptcplog((LOG_DEBUG
, "%s: using id %u\n", __func__
, mpts
->mpts_connid
),
665 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
667 /* In case there's just one flow, we reattempt later */
668 if (mpts_tried
!= NULL
&&
669 (mpts
== mpts_tried
|| (mpts
->mpts_flags
& MPTSF_FAILINGOVER
))) {
670 mpts_tried
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
671 mpts_tried
->mpts_flags
|= MPTSF_ACTIVE
;
672 mptcp_start_timer(mpte
, MPTT_REXMT
);
673 mptcplog((LOG_DEBUG
, "%s: retry later\n", __func__
),
674 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
679 * Automatic sizing of send socket buffer. Increase the send
680 * socket buffer size if all of the following criteria are met
681 * 1. the receiver has enough buffer space for this data
682 * 2. send buffer is filled to 7/8th with data (so we actually
683 * have data to make use of it);
685 if (tcp_do_autosendbuf
== 1 &&
686 (mp_so
->so_snd
.sb_flags
& (SB_AUTOSIZE
| SB_TRIM
)) == SB_AUTOSIZE
&&
687 tcp_cansbgrow(&mp_so
->so_snd
)) {
688 if ((mp_tp
->mpt_sndwnd
/ 4 * 5) >= mp_so
->so_snd
.sb_hiwat
&&
689 mp_so
->so_snd
.sb_cc
>= (mp_so
->so_snd
.sb_hiwat
/ 8 * 7)) {
690 if (sbreserve(&mp_so
->so_snd
,
691 min(mp_so
->so_snd
.sb_hiwat
+ tcp_autosndbuf_inc
,
692 tcp_autosndbuf_max
)) == 1) {
693 mp_so
->so_snd
.sb_idealsize
= mp_so
->so_snd
.sb_hiwat
;
695 mptcplog((LOG_DEBUG
, "%s: increased snd hiwat to %u lowat %u\n",
696 __func__
, mp_so
->so_snd
.sb_hiwat
,
697 mp_so
->so_snd
.sb_lowat
),
698 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
703 DTRACE_MPTCP3(output
, struct mptses
*, mpte
, struct mptsub
*, mpts
,
704 struct socket
*, mp_so
);
705 error
= mptcp_subflow_output(mpte
, mpts
, 0);
707 /* can be a temporary loss of source address or other error */
708 mpts
->mpts_flags
|= MPTSF_FAILINGOVER
;
709 mpts
->mpts_flags
&= ~MPTSF_ACTIVE
;
711 if (error
!= ECANCELED
) {
712 mptcplog((LOG_ERR
, "%s: Error = %d mpts_flags %#x\n", __func__
,
713 error
, mpts
->mpts_flags
),
714 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_ERR
);
718 /* The model is to have only one active flow at a time */
719 mpts
->mpts_flags
|= MPTSF_ACTIVE
;
720 mpts
->mpts_probesoon
= mpts
->mpts_probecnt
= 0;
722 /* Allows us to update the smoothed rtt */
723 if (mptcp_probeto
&& mpts
!= preferred_mpts
&& preferred_mpts
!= NULL
) {
724 if (preferred_mpts
->mpts_probesoon
) {
725 if ((tcp_now
- preferred_mpts
->mpts_probesoon
) > mptcp_probeto
) {
726 mptcp_subflow_output(mpte
, preferred_mpts
, MPTCP_SUBOUT_PROBING
);
727 if (preferred_mpts
->mpts_probecnt
>= mptcp_probecnt
) {
728 preferred_mpts
->mpts_probesoon
= 0;
729 preferred_mpts
->mpts_probecnt
= 0;
733 preferred_mpts
->mpts_probesoon
= tcp_now
;
734 preferred_mpts
->mpts_probecnt
= 0;
738 if (mpte
->mpte_active_sub
== NULL
) {
739 mpte
->mpte_active_sub
= mpts
;
740 } else if (mpte
->mpte_active_sub
!= mpts
) {
741 struct tcpcb
*tp
= sototcpcb(mpts
->mpts_socket
);
742 struct tcpcb
*acttp
= sototcpcb(mpte
->mpte_active_sub
->mpts_socket
);
744 mptcplog((LOG_DEBUG
, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__
,
745 mpte
->mpte_active_sub
->mpts_connid
, acttp
->t_srtt
>> TCP_RTT_SHIFT
,
746 mpts
->mpts_connid
, tp
->t_srtt
>> TCP_RTT_SHIFT
),
747 (MPTCP_SENDER_DBG
| MPTCP_SOCKET_DBG
), MPTCP_LOGLVL_LOG
);
749 mpte
->mpte_active_sub
->mpts_flags
&= ~MPTSF_ACTIVE
;
750 mpte
->mpte_active_sub
= mpts
;
752 mptcpstats_inc_switch(mpte
, mpts
);
/* Closing: only the DATA_FIN is outstanding and everything else is acked. */
756 if (mp_tp
->mpt_state
> MPTCPS_CLOSE_WAIT
) {
757 if (mp_tp
->mpt_sndnxt
+ 1 == mp_tp
->mpt_sndmax
&&
758 mp_tp
->mpt_snduna
== mp_tp
->mpt_sndnxt
) {
759 mptcp_finish_usrclosed(mpte
);
763 mptcp_handle_deferred_upcalls(mpte
->mpte_mppcb
, MPP_WUPCALL
);
765 /* subflow errors should not be percolated back up */
/*
 * mptcp_choose_subflow: compare a candidate subflow against the best
 * one found so far.
 *
 *   mpts     candidate subflow.
 *   curbest  current best subflow, or NULL if none yet.
 *   currtt   in/out: smoothed RTT of the current best; overwritten with
 *            the candidate's t_srtt when the candidate wins.
 *
 * Visible rules:
 *   - The candidate wins when it has a non-zero t_srtt lower than
 *     *currtt and either there is no current best, the candidate has
 *     t_rxtshift == 0, or the current best has a non-zero t_rxtshift.
 *   - The candidate also wins when the current best has a non-zero
 *     t_rxtshift and the candidate has none.
 *   - Otherwise curbest is returned, or mpts when curbest is NULL.
 *
 * NOTE(review): the return statements of the two "candidate wins"
 * branches are not visible in this view (presumably "return mpts") --
 * confirm against the full source.
 */
770 static struct mptsub
*
771 mptcp_choose_subflow(struct mptsub
*mpts
, struct mptsub
*curbest
, int *currtt
)
773 struct tcpcb
*tp
= sototcpcb(mpts
->mpts_socket
);
776 * Lower RTT? Take it, if it's our first one, or
777 * it doesn't has any loss, or the current one has
780 if (tp
->t_srtt
&& *currtt
> tp
->t_srtt
&&
781 (curbest
== NULL
|| tp
->t_rxtshift
== 0 ||
782 sototcpcb(curbest
->mpts_socket
)->t_rxtshift
)) {
783 *currtt
= tp
->t_srtt
;
788 * If we find a subflow without loss, take it always!
791 sototcpcb(curbest
->mpts_socket
)->t_rxtshift
&&
792 tp
->t_rxtshift
== 0) {
793 *currtt
= tp
->t_srtt
;
797 return curbest
!= NULL
? curbest
: mpts
;
/*
 * mptcp_return_subflow: final filter applied to a subflow chosen by the
 * scheduler before it is handed back to the caller.
 *
 * The only visible test singles out a non-NULL subflow whose
 * mptcp_subflow_cwnd_space() is <= 0, i.e. one with no room left in its
 * congestion window.
 *
 * NOTE(review): the branch body and the return statements are not
 * visible in this view; presumably NULL is returned for a subflow
 * without cwnd space and mpts otherwise -- confirm against the full
 * source.
 */
800 static struct mptsub
*
801 mptcp_return_subflow(struct mptsub
*mpts
)
803 if (mpts
&& mptcp_subflow_cwnd_space(mpts
->mpts_socket
) <= 0) {
811 * Return the most eligible subflow to be used for sending data.
/*
 * mptcp_get_subflow: pick the subflow on which to send next.
 *
 *   mpte       MPTCP session whose mpte_subflows list is scanned.
 *   ignore     subflow to exclude from consideration (may be NULL).
 *   preferred  optional out-parameter; when non-NULL and both a
 *              non-cellular and a cellular candidate exist, it receives
 *              mptcp_return_subflow(best).
 *
 * Step 1 walks every subflow.  The visible rejection tests are: equal
 * to "ignore" or no last output interface, waiting for interface
 * feedback, not MPTSF_MP_CAPABLE, or not in a connected/established
 * TCP state.  Degraded (MPTSF_MP_DEGRADED) and SOF1_PRECONNECT_DATA
 * subflows are special-cased before those checks.  Survivors are fed to
 * mptcp_choose_subflow(), with cellular subflows competing for
 * "second_best" and all others for "best".
 *
 * Step 2 chooses between best and second_best by service type:
 *   HANDOVER     second_best only when best is non-cellular, Wi-Fi is
 *                reported unusable and best is bad.
 *   INTERACTIVE  second_best when best's srtt or rxtcur crosses its
 *                threshold while second_best's does not, or when best
 *                is bad and second_best has no retransmissions.
 *   AGGREGATE    compares srtt, then falls over to second_best when
 *                best has no congestion-window space.
 * Any other service type panics.
 *
 * NOTE(review): the return type, the "mpts" iterator declaration, the
 * bodies of most rejection branches and the threshold adjustments under
 * "Adjust with symptoms information" are not visible in this view --
 * confirm against the full source.
 */
814 mptcp_get_subflow(struct mptses
*mpte
, struct mptsub
*ignore
, struct mptsub
**preferred
)
816 struct tcpcb
*besttp
, *secondtp
;
817 struct inpcb
*bestinp
, *secondinp
;
819 struct mptsub
*best
= NULL
;
820 struct mptsub
*second_best
= NULL
;
821 int exp_rtt
= INT_MAX
, cheap_rtt
= INT_MAX
;
825 * Choose the best subflow for cellular and non-cellular interfaces.
828 TAILQ_FOREACH(mpts
, &mpte
->mpte_subflows
, mpts_entry
) {
829 struct socket
*so
= mpts
->mpts_socket
;
830 struct tcpcb
*tp
= sototcpcb(so
);
831 struct inpcb
*inp
= sotoinpcb(so
);
833 mptcplog((LOG_DEBUG
, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
834 __func__
, mpts
->mpts_connid
, ignore
? ignore
->mpts_connid
: -1, mpts
->mpts_flags
,
835 INP_WAIT_FOR_IF_FEEDBACK(inp
), so
->so_state
, tp
->t_state
,
836 inp
->inp_last_outifp
? IFNET_IS_CELLULAR(inp
->inp_last_outifp
) : -1,
837 tp
->t_srtt
, tp
->t_rxtshift
, cheap_rtt
, exp_rtt
,
838 mptcp_subflow_cwnd_space(so
)),
839 MPTCP_SOCKET_DBG
, MPTCP_LOGLVL_VERBOSE
);
842 * First, the hard conditions to reject subflows
843 * (e.g., not connected,...)
845 if (mpts
== ignore
|| inp
->inp_last_outifp
== NULL
) {
849 if (INP_WAIT_FOR_IF_FEEDBACK(inp
)) {
853 /* There can only be one subflow in degraded state */
854 if (mpts
->mpts_flags
& MPTSF_MP_DEGRADED
) {
860 * If this subflow is waiting to finally send, do it!
862 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
863 return mptcp_return_subflow(mpts
);
867 * Only send if the subflow is MP_CAPABLE. The exceptions to
868 * this rule (degraded or TFO) have been taken care of above.
870 if (!(mpts
->mpts_flags
& MPTSF_MP_CAPABLE
)) {
874 if ((so
->so_state
& SS_ISDISCONNECTED
) ||
875 !(so
->so_state
& SS_ISCONNECTED
) ||
876 !TCPS_HAVEESTABLISHED(tp
->t_state
) ||
877 tp
->t_state
> TCPS_CLOSE_WAIT
) {
882 * Second, the soft conditions to find the subflow with best
883 * conditions for each set (aka cellular vs non-cellular)
885 if (IFNET_IS_CELLULAR(inp
->inp_last_outifp
)) {
886 second_best
= mptcp_choose_subflow(mpts
, second_best
,
889 best
= mptcp_choose_subflow(mpts
, best
, &cheap_rtt
);
894 * If there is no preferred or backup subflow, and there is no active
895 * subflow use the last usable subflow.
898 return mptcp_return_subflow(second_best
);
901 if (second_best
== NULL
) {
902 return mptcp_return_subflow(best
);
905 besttp
= sototcpcb(best
->mpts_socket
);
906 bestinp
= sotoinpcb(best
->mpts_socket
);
907 secondtp
= sototcpcb(second_best
->mpts_socket
);
908 secondinp
= sotoinpcb(second_best
->mpts_socket
);
910 if (preferred
!= NULL
) {
911 *preferred
= mptcp_return_subflow(best
);
915 * Second Step: Among best and second_best. Choose the one that is
916 * most appropriate for this particular service-type.
918 if (mpte
->mpte_svctype
== MPTCP_SVCTYPE_HANDOVER
) {
920 * Only handover if Symptoms tells us to do so.
922 if (!IFNET_IS_CELLULAR(bestinp
->inp_last_outifp
) &&
923 mptcp_is_wifi_unusable(mpte
) != 0 && mptcp_subflow_is_bad(mpte
, best
)) {
924 return mptcp_return_subflow(second_best
);
927 return mptcp_return_subflow(best
);
928 } else if (mpte
->mpte_svctype
== MPTCP_SVCTYPE_INTERACTIVE
) {
929 int rtt_thresh
= mptcp_rtthist_rtthresh
<< TCP_RTT_SHIFT
;
930 int rto_thresh
= mptcp_rtothresh
;
932 /* Adjust with symptoms information */
933 if (!IFNET_IS_CELLULAR(bestinp
->inp_last_outifp
) &&
934 mptcp_is_wifi_unusable(mpte
) != 0) {
939 if (besttp
->t_srtt
&& secondtp
->t_srtt
&&
940 besttp
->t_srtt
>= rtt_thresh
&&
941 secondtp
->t_srtt
< rtt_thresh
) {
942 tcpstat
.tcps_mp_sel_rtt
++;
943 mptcplog((LOG_DEBUG
, "%s: best cid %d at rtt %d, second cid %d at rtt %d\n", __func__
,
944 best
->mpts_connid
, besttp
->t_srtt
>> TCP_RTT_SHIFT
,
945 second_best
->mpts_connid
,
946 secondtp
->t_srtt
>> TCP_RTT_SHIFT
),
947 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
948 return mptcp_return_subflow(second_best
);
951 if (mptcp_subflow_is_bad(mpte
, best
) &&
952 secondtp
->t_rxtshift
== 0) {
953 return mptcp_return_subflow(second_best
);
956 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
957 if (besttp
->t_rxtcur
&& secondtp
->t_rxtcur
&&
958 besttp
->t_rxtcur
>= rto_thresh
&&
959 secondtp
->t_rxtcur
< rto_thresh
) {
960 tcpstat
.tcps_mp_sel_rto
++;
961 mptcplog((LOG_DEBUG
, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__
,
962 best
->mpts_connid
, besttp
->t_rxtcur
,
963 second_best
->mpts_connid
, secondtp
->t_rxtcur
),
964 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
966 return mptcp_return_subflow(second_best
);
970 * None of the above conditions for sending on the secondary
971 * were true. So, let's schedule on the best one, if he still
972 * has some space in the congestion-window.
974 return mptcp_return_subflow(best
);
975 } else if (mpte
->mpte_svctype
== MPTCP_SVCTYPE_AGGREGATE
) {
979 * We only care about RTT when aggregating
981 if (besttp
->t_srtt
> secondtp
->t_srtt
) {
988 secondtp
= sototcpcb(second_best
->mpts_socket
);
989 secondinp
= sotoinpcb(second_best
->mpts_socket
);
992 /* Is there still space in the congestion window? */
993 if (mptcp_subflow_cwnd_space(bestinp
->inp_socket
) <= 0) {
994 return mptcp_return_subflow(second_best
);
997 return mptcp_return_subflow(best
);
999 panic("Unknown service-type configured for MPTCP");
/*
 * mptcp_event_to_str: map an MPTCP close-FSM event code to its name for
 * logging (mptcp_close_fsm() passes the result to a "%s" format).
 *
 * The string starts out as "UNDEFINED"; the visible cases cover
 * MPCE_RECV_DATA_ACK and MPCE_RECV_DATA_FIN.
 *
 * NOTE(review): the return type, the switch header, any further cases
 * and the return statement are not visible in this view.
 */
1006 mptcp_event_to_str(uint32_t event
)
1008 const char *c
= "UNDEFINED";
1013 case MPCE_RECV_DATA_ACK
:
1014 c
= "MPCE_RECV_DATA_ACK";
1016 case MPCE_RECV_DATA_FIN
:
1017 c
= "MPCE_RECV_DATA_FIN";
/*
 * mptcp_state_to_str: map an MPTCP connection state to its name for
 * logging (mptcp_close_fsm() passes the result to a "%s" format).
 *
 * The string starts out as "UNDEFINED" and is replaced by the literal
 * name of the matching MPTCPS_* state.
 *
 * NOTE(review): the return type, the switch header, the case labels for
 * the first two assignments and the return statement are not visible in
 * this view.
 */
1024 mptcp_state_to_str(mptcp_state_t state
)
1026 const char *c
= "UNDEFINED";
1029 c
= "MPTCPS_CLOSED";
1032 c
= "MPTCPS_LISTEN";
1034 case MPTCPS_ESTABLISHED
:
1035 c
= "MPTCPS_ESTABLISHED";
1037 case MPTCPS_CLOSE_WAIT
:
1038 c
= "MPTCPS_CLOSE_WAIT";
1040 case MPTCPS_FIN_WAIT_1
:
1041 c
= "MPTCPS_FIN_WAIT_1";
1043 case MPTCPS_CLOSING
:
1044 c
= "MPTCPS_CLOSING";
1046 case MPTCPS_LAST_ACK
:
1047 c
= "MPTCPS_LAST_ACK";
1049 case MPTCPS_FIN_WAIT_2
:
1050 c
= "MPTCPS_FIN_WAIT_2";
1052 case MPTCPS_TIME_WAIT
:
1053 c
= "MPTCPS_TIME_WAIT";
1055 case MPTCPS_TERMINATE
:
1056 c
= "MPTCPS_TERMINATE";
/*
 * mptcp_close_fsm: advance the MPTCP-level connection-close state
 * machine.
 *
 *   mp_tp  MPTCP control block; the session lock must be held
 *          (asserted below).
 *   event  one of the MPCE_* events.
 *
 * Visible transitions (state, event -> effect):
 *   ESTABLISHED, MPCE_CLOSE          -> FIN_WAIT_1, mpt_sndmax += 1
 *   ESTABLISHED, MPCE_RECV_DATA_FIN  -> CLOSE_WAIT, mpt_rcvnxt += 1
 *   CLOSE_WAIT,  MPCE_CLOSE          -> LAST_ACK,   mpt_sndmax += 1
 *   FIN_WAIT_1,  MPCE_RECV_DATA_ACK  -> FIN_WAIT_2
 *   FIN_WAIT_1,  MPCE_RECV_DATA_FIN  -> CLOSING,    mpt_rcvnxt += 1
 *   CLOSING,     MPCE_RECV_DATA_ACK  -> TIME_WAIT
 *   LAST_ACK,    MPCE_RECV_DATA_ACK  -> mptcp_close()
 *   FIN_WAIT_2,  MPCE_RECV_DATA_FIN  -> TIME_WAIT,  mpt_rcvnxt += 1
 * The +1 adjustments account for the sequence space consumed by a
 * DATA_FIN.  The old and new state are reported through DTrace probes
 * and an INFO log line.
 *
 * NOTE(review): the return type, the case label(s) leading to the
 * initial MPTCPS_TERMINATE assignment, the break statements and the
 * bodies for TIME_WAIT/TERMINATE are not visible in this view.
 */
1063 mptcp_close_fsm(struct mptcb
*mp_tp
, uint32_t event
)
1065 mpte_lock_assert_held(mp_tp
->mpt_mpte
);
/* Remember the starting state for the log line at the end. */
1066 mptcp_state_t old_state
= mp_tp
->mpt_state
;
1068 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
1071 switch (mp_tp
->mpt_state
) {
1074 mp_tp
->mpt_state
= MPTCPS_TERMINATE
;
1077 case MPTCPS_ESTABLISHED
:
1078 if (event
== MPCE_CLOSE
) {
1079 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_1
;
1080 mp_tp
->mpt_sndmax
+= 1; /* adjust for Data FIN */
1081 } else if (event
== MPCE_RECV_DATA_FIN
) {
1082 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
1083 mp_tp
->mpt_state
= MPTCPS_CLOSE_WAIT
;
1087 case MPTCPS_CLOSE_WAIT
:
1088 if (event
== MPCE_CLOSE
) {
1089 mp_tp
->mpt_state
= MPTCPS_LAST_ACK
;
1090 mp_tp
->mpt_sndmax
+= 1; /* adjust for Data FIN */
1094 case MPTCPS_FIN_WAIT_1
:
1095 if (event
== MPCE_RECV_DATA_ACK
) {
1096 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_2
;
1097 } else if (event
== MPCE_RECV_DATA_FIN
) {
1098 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
1099 mp_tp
->mpt_state
= MPTCPS_CLOSING
;
1103 case MPTCPS_CLOSING
:
1104 if (event
== MPCE_RECV_DATA_ACK
) {
1105 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
1109 case MPTCPS_LAST_ACK
:
1110 if (event
== MPCE_RECV_DATA_ACK
) {
1111 mptcp_close(mp_tp
->mpt_mpte
, mp_tp
);
1115 case MPTCPS_FIN_WAIT_2
:
1116 if (event
== MPCE_RECV_DATA_FIN
) {
1117 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
1118 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
1122 case MPTCPS_TIME_WAIT
:
1123 case MPTCPS_TERMINATE
:
1130 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
1132 mptcplog((LOG_INFO
, "%s: %s to %s on event %s\n", __func__
,
1133 mptcp_state_to_str(old_state
),
1134 mptcp_state_to_str(mp_tp
->mpt_state
),
1135 mptcp_event_to_str(event
)),
1136 MPTCP_STATE_DBG
, MPTCP_LOGLVL_LOG
);
1139 /* If you change this function, match up mptcp_update_rcv_state_f */
/*
 * mptcp_update_dss_rcv_state: record the mapping carried by a received
 * DSS option that has a 32-bit data sequence number.
 *
 *   dss_info  DSS option as received; its mdss_dsn, mdss_subflow_seqn
 *             and mdss_data_len fields are byte-swapped IN PLACE to
 *             host order by the NTOHL/NTOHS calls below.
 *   tp        subflow TCP control block.
 *
 * The 32-bit DSN is widened to 64 bits relative to mpt_rcvnxt with
 * MPTCP_EXTEND_DSN and the result is handed to
 * mptcp_update_rcv_state_meat().
 *
 * NOTE(review): the return type, the remaining parameter(s) on the
 * second signature line and the final argument of the call are not
 * visible in this view.
 */
1141 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt
*dss_info
, struct tcpcb
*tp
,
1144 struct mptcb
*mp_tp
= tptomptp(tp
);
1145 u_int64_t full_dsn
= 0;
1147 NTOHL(dss_info
->mdss_dsn
);
1148 NTOHL(dss_info
->mdss_subflow_seqn
);
1149 NTOHS(dss_info
->mdss_data_len
);
1151 /* XXX for autosndbuf grow sb here */
1152 MPTCP_EXTEND_DSN(mp_tp
->mpt_rcvnxt
, dss_info
->mdss_dsn
, full_dsn
);
1153 mptcp_update_rcv_state_meat(mp_tp
, tp
,
1154 full_dsn
, dss_info
->mdss_subflow_seqn
, dss_info
->mdss_data_len
,
/*
 * mptcp_update_rcv_state_meat: store a received DSS mapping in the
 * subflow's TCP control block.
 *
 *   mp_tp          MPTCP control block.
 *   tp             subflow TCP control block.
 *   full_dsn       64-bit data sequence number of the mapping.
 *   seqn           subflow sequence number of the mapping.
 *   mdss_data_len  data-level length; 0 is logged as an infinite
 *                  mapping.
 *
 * Visible behaviour: a zero length logs "Infinite Mapping" and, when
 * checksums are enabled (MPTCPF_CHECKSUM) with a non-zero csum, logs a
 * bad checksum; mptcp_notify_mpfail() is called on that path.
 * Otherwise the mapping is logged, mptcp_notify_mpready() is called,
 * the values are copied into tp->t_rcv_map and TMPF_EMBED_DSN is set
 * in tp->t_mpflags.
 *
 * NOTE(review): the return type, the trailing "csum" parameter and the
 * exact nesting/returns of the infinite-mapping path are not visible in
 * this view.
 */
1159 mptcp_update_rcv_state_meat(struct mptcb
*mp_tp
, struct tcpcb
*tp
,
1160 u_int64_t full_dsn
, u_int32_t seqn
, u_int16_t mdss_data_len
,
1163 if (mdss_data_len
== 0) {
1164 mptcplog((LOG_INFO
, "%s: Infinite Mapping.\n", __func__
),
1165 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
1167 if ((mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
) && (csum
!= 0)) {
1168 mptcplog((LOG_ERR
, "%s: Bad checksum %x \n", __func__
,
1169 csum
), MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_ERR
);
1171 mptcp_notify_mpfail(tp
->t_inpcb
->inp_socket
);
1174 mptcplog((LOG_DEBUG
,
1175 "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__
,
1176 seqn
, mdss_data_len
, (uint32_t)full_dsn
, (uint32_t)mp_tp
->mpt_rcvnxt
),
1177 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
1179 mptcp_notify_mpready(tp
->t_inpcb
->inp_socket
);
/* Remember the mapping so it can be attached to the incoming data. */
1181 tp
->t_rcv_map
.mpt_dsn
= full_dsn
;
1182 tp
->t_rcv_map
.mpt_sseq
= seqn
;
1183 tp
->t_rcv_map
.mpt_len
= mdss_data_len
;
1184 tp
->t_rcv_map
.mpt_csum
= csum
;
1185 tp
->t_mpflags
|= TMPF_EMBED_DSN
;
/*
 * mptcp_validate_dss_map: sanity-check the DSS mapping attached to a
 * received mbuf.
 *
 *   so  subflow socket, passed to mptcp_notify_mpfail() on failure.
 *   tp  subflow TCP control block.
 *   m   received packet.
 *
 * Visible behaviour: packets without PKTF_MPTCP are special-cased
 * first.  Otherwise the mapping length (mp_rlen) is compared against
 * the packet: when m_pkthdr.len exceeds datalen + hdrlen the mapping
 * is rejected -- an error is logged, TMPF_SND_MPFAIL is set and
 * mptcp_notify_mpfail() is called.
 *
 * NOTE(review): the return type, the "hdrlen" parameter, the
 * declaration of "datalen" and all return statements are not visible
 * in this view; mptcp_input_preproc() treats a non-zero result as
 * failure.
 */
1190 mptcp_validate_dss_map(struct socket
*so
, struct tcpcb
*tp
, struct mbuf
*m
,
1195 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
)) {
1199 datalen
= m
->m_pkthdr
.mp_rlen
;
1201 /* unacceptable DSS option, fallback to TCP */
1202 if (m
->m_pkthdr
.len
> ((int) datalen
+ hdrlen
)) {
1203 mptcplog((LOG_ERR
, "%s: mbuf len %d, MPTCP expected %d",
1204 __func__
, m
->m_pkthdr
.len
, datalen
),
1205 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
1209 tp
->t_mpflags
|= TMPF_SND_MPFAIL
;
1210 mptcp_notify_mpfail(so
);
/*
 * mptcp_input_preproc: per-packet MPTCP preprocessing on the subflow
 * receive path.
 *
 *   tp  subflow TCP control block.
 *   m   received packet.
 *   th  TCP header of the packet.
 *
 * Attaches the receive mapping to the mbuf with mptcp_insert_rmap() and
 * then validates it with mptcp_validate_dss_map(), passing drop_hdrlen
 * as the header length; a non-zero validation result takes the failure
 * branch.
 *
 * NOTE(review): the return type, the "drop_hdrlen" parameter and the
 * return statements are not visible in this view.
 */
1216 mptcp_input_preproc(struct tcpcb
*tp
, struct mbuf
*m
, struct tcphdr
*th
,
1219 mptcp_insert_rmap(tp
, m
, th
);
1220 if (mptcp_validate_dss_map(tp
->t_inpcb
->inp_socket
, tp
, m
,
1221 drop_hdrlen
) != 0) {
1228 * MPTCP Checksum support
1229 * The checksum is calculated whenever the MPTCP DSS option is included
1230 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
1231 * header and the actual data indicated by the length specified in the
/*
 * mptcp_validate_csum: verify the DSS checksum of a received mapping.
 *
 *   tp    subflow TCP control block.
 *   m     mbuf holding the mapped data.
 *   dsn   64-bit data sequence number of the mapping.
 *   sseq  subflow sequence number of the mapping.
 *   dlen  data-level length of the mapping.
 *   csum  checksum carried in the DSS option.
 *   dfin  DATA_FIN indicator, forwarded to mptcp_input_csum().
 *
 * The checksum is computed by mptcp_input_csum().  On the failure path
 * TMPF_SND_MPFAIL is set, mptcp_notify_mpfail() is called on the
 * subflow socket and tcps_mp_badcsum is incremented.
 *
 * NOTE(review): the return type, the condition guarding the failure
 * path (presumably a non-zero computed checksum) and the return
 * statements are not visible in this view.
 */
1236 mptcp_validate_csum(struct tcpcb
*tp
, struct mbuf
*m
, uint64_t dsn
,
1237 uint32_t sseq
, uint16_t dlen
, uint16_t csum
, uint16_t dfin
)
1239 uint16_t mptcp_csum
;
1241 mptcp_csum
= mptcp_input_csum(tp
, m
, dsn
, sseq
, dlen
, csum
, dfin
);
1243 tp
->t_mpflags
|= TMPF_SND_MPFAIL
;
1244 mptcp_notify_mpfail(tp
->t_inpcb
->inp_socket
);
1246 tcpstat
.tcps_mp_badcsum
++;
/*
 * mptcp_input_csum: compute the receive-side DSS checksum.
 *
 *   tp    subflow TCP control block.
 *   m     mbuf holding the mapped data.
 *   dsn   64-bit data sequence number (host order; converted with
 *         htonll() below).
 *   sseq  subflow sequence number (host order; converted with htonl()).
 *   dlen  data-level length of the mapping.
 *   csum  checksum received in the DSS option; folded into the sum.
 *   dfin  subtracted from dlen to obtain the number of payload bytes
 *         (real_len) that are actually summed.
 *
 * Visible behaviour: early-outs exist for a NULL MPTCP control block,
 * for connections without MPTCPF_CHECKSUM, for subflows in
 * TMPF_TCP_FALLBACK and for an mbuf chain shorter than real_len.
 * Otherwise the payload is summed with m_sum16(), the pseudo-header is
 * added with in_pseudo64() and the one's complement of the low 16 bits
 * is returned.
 *
 * NOTE(review): the return type line, the declaration of "sum" and the
 * values returned by the early-out branches are not visible in this
 * view.
 */
1253 mptcp_input_csum(struct tcpcb
*tp
, struct mbuf
*m
, uint64_t dsn
, uint32_t sseq
,
1254 uint16_t dlen
, uint16_t csum
, uint16_t dfin
)
1256 struct mptcb
*mp_tp
= tptomptp(tp
);
1257 uint16_t real_len
= dlen
- dfin
;
1260 if (mp_tp
== NULL
) {
1264 if (!(mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
)) {
1268 if (tp
->t_mpflags
& TMPF_TCP_FALLBACK
) {
1273 * The remote side may send a packet with fewer bytes than the
1274 * claimed DSS checksum length.
1276 if ((int)m_length2(m
, NULL
) < real_len
) {
1280 if (real_len
!= 0) {
1281 sum
= m_sum16(m
, 0, real_len
);
1284 sum
+= in_pseudo64(htonll(dsn
), htonl(sseq
), htons(dlen
) + csum
);
1286 DTRACE_MPTCP3(checksum__result
, struct tcpcb
*, tp
, struct mbuf
*, m
,
1289 mptcplog((LOG_DEBUG
, "%s: sum = %x \n", __func__
, sum
),
1290 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
1291 return ~sum
& 0xffff;
/*
 * mptcp_output_csum: compute the DSS checksum for outgoing data.
 *
 *   m        mbuf holding the data to be sent.
 *   dss_val  64-bit data sequence number; converted with
 *            mptcp_hton64() before being summed.
 *   sseq     subflow sequence number, passed to in_pseudo64() as given.
 *   dlen     data-level length, passed to in_pseudo64() as given and
 *            used as the number of payload bytes summed by m_sum16().
 *
 * The payload sum and the pseudo-header sum are added, and the one's
 * complement of the low 16 bits is reported through a DTrace probe and
 * a debug log line.
 *
 * NOTE(review): the return type, the declaration of "sum", any
 * condition guarding the m_sum16() call and the return statement are
 * not visible in this view.  Unlike mptcp_input_csum(), no byte-order
 * conversion of sseq/dlen is visible here -- presumably the caller
 * passes them already in network order; confirm.
 */
1295 mptcp_output_csum(struct mbuf
*m
, uint64_t dss_val
, uint32_t sseq
, uint16_t dlen
)
1300 sum
= m_sum16(m
, 0, dlen
);
1303 dss_val
= mptcp_hton64(dss_val
);
1306 sum
+= in_pseudo64(dss_val
, sseq
, dlen
);
1309 sum
= ~sum
& 0xffff;
1310 DTRACE_MPTCP2(checksum__result
, struct mbuf
*, m
, uint32_t, sum
);
1311 mptcplog((LOG_DEBUG
, "%s: sum = %x \n", __func__
, sum
),
1312 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
1318 * When WiFi signal starts fading, there's more loss and RTT spikes.
1319 * Check if there has been a large spike by comparing against
1320 * a tolerable RTT spike threshold.
/*
 * mptcp_no_rto_spike: check a subflow's retransmission timeout against
 * the mptcp_rtothresh tunable.
 *
 *   so  subflow socket whose TCP control block is inspected.
 *
 * When t_rxtcur exceeds mptcp_rtothresh the excess is stored in
 * "spike" and a debug line with the spike, the current RTO and RTT
 * figures is logged.
 *
 * NOTE(review): the return type, the declaration of "spike" and the
 * return statements are not visible in this view; the name suggests
 * the result is true when no spike was seen -- confirm against the
 * full source.
 */
1323 mptcp_no_rto_spike(struct socket
*so
)
1325 struct tcpcb
*tp
= intotcpcb(sotoinpcb(so
));
1328 if (tp
->t_rxtcur
> mptcp_rtothresh
) {
1329 spike
= tp
->t_rxtcur
- mptcp_rtothresh
;
1331 mptcplog((LOG_DEBUG
, "%s: spike = %d rto = %d best = %d cur = %d\n",
1333 tp
->t_rxtcur
, tp
->t_rttbest
>> TCP_RTT_SHIFT
,
1335 (MPTCP_SOCKET_DBG
| MPTCP_SENDER_DBG
), MPTCP_LOGLVL_LOG
);
/*
 * mptcp_handle_deferred_upcalls: leave an upcall section and run any
 * work that was postponed while it was active.
 *
 *   mpp   MPTCP protocol control block.
 *   flag  the in-progress marker being released (mptcp_output() passes
 *         MPP_WUPCALL); it must currently be set in mpp_flags.
 *
 * The marker is cleared first.  If mptcp_should_defer_upcall() still
 * holds, the early-out branch is taken.  Otherwise each pending
 * MPP_SHOULD_WORKLOOP / MPP_SHOULD_RWAKEUP / MPP_SHOULD_WWAKEUP /
 * MPP_SET_CELLICON / MPP_UNSET_CELLICON bit is cleared and its handler
 * (mptcp_subflow_workloop, sorwakeup, sowwakeup, mptcp_set_cellicon,
 * mptcp_unset_cellicon) is invoked, in that order.
 *
 * NOTE(review): the return type and the body of the early-out branch
 * (presumably a plain return) are not visible in this view.
 */
1346 mptcp_handle_deferred_upcalls(struct mppcb
*mpp
, uint32_t flag
)
1348 VERIFY(mpp
->mpp_flags
& flag
);
1349 mpp
->mpp_flags
&= ~flag
;
1351 if (mptcp_should_defer_upcall(mpp
)) {
1355 if (mpp
->mpp_flags
& MPP_SHOULD_WORKLOOP
) {
1356 mpp
->mpp_flags
&= ~MPP_SHOULD_WORKLOOP
;
1358 mptcp_subflow_workloop(mpp
->mpp_pcbe
);
1361 if (mpp
->mpp_flags
& MPP_SHOULD_RWAKEUP
) {
1362 mpp
->mpp_flags
&= ~MPP_SHOULD_RWAKEUP
;
1364 sorwakeup(mpp
->mpp_socket
);
1367 if (mpp
->mpp_flags
& MPP_SHOULD_WWAKEUP
) {
1368 mpp
->mpp_flags
&= ~MPP_SHOULD_WWAKEUP
;
1370 sowwakeup(mpp
->mpp_socket
);
1373 if (mpp
->mpp_flags
& MPP_SET_CELLICON
) {
1374 mpp
->mpp_flags
&= ~MPP_SET_CELLICON
;
1376 mptcp_set_cellicon(mpp
->mpp_pcbe
);
1379 if (mpp
->mpp_flags
& MPP_UNSET_CELLICON
) {
1380 mpp
->mpp_flags
&= ~MPP_UNSET_CELLICON
;
1382 mptcp_unset_cellicon();
/*
 * mptcp_ask_for_nat64: request a NAT64 prefix for an interface.
 *
 *   ifp  interface for which the prefix is requested.
 *
 * Posts KEV_INET6_REQUEST_NAT64_PREFIX via in6_post_msg() and logs the
 * request.
 *
 * NOTE(review): the return type and the final argument of the
 * os_log_info() call (the value for the second "%s") are not visible
 * in this view.
 */
1387 mptcp_ask_for_nat64(struct ifnet
*ifp
)
1389 in6_post_msg(ifp
, KEV_INET6_REQUEST_NAT64_PREFIX
, NULL
, NULL
);
1391 os_log_info(mptcp_log_handle
,
1392 "%s: asked for NAT64-prefix on %s\n", __func__
,
/*
 * mptcp_reset_itfinfo: clear one slot of the per-session interface
 * table.
 *
 *   info  slot to reset.
 *
 * The visible assignments zero the has_v4_conn, has_v6_conn and
 * has_nat64_conn indicators.
 *
 * NOTE(review): the return type and at least one statement before the
 * first visible assignment are missing from this view (presumably the
 * slot's ifindex is reset as well) -- confirm against the full source.
 */
1397 mptcp_reset_itfinfo(struct mpt_itf_info
*info
)
1400 info
->has_v4_conn
= 0;
1401 info
->has_v6_conn
= 0;
1402 info
->has_nat64_conn
= 0;
/*
 * mptcp_session_necp_cb
 *
 * Interface-event callback for one MPTCP session. It keeps the per-session
 * interface table (mpte->mpte_itfinfo, mpte->mpte_itfinfo_size entries) in
 * sync with the event and then schedules subflow creation.
 * NOTE(review): presumably this is the callback registered through
 * necp_client_register_multipath_cb - confirm at the registration site.
 *
 * Parameters:
 *   handle          - opaque pointer, cast to the struct mppcb of the session.
 *   action          - NECP_CLIENT_CBACTION_* value describing the event.
 *   interface_index - interface the event refers to; asserted to differ
 *                     from IFSCOPE_NONE.
 *   necp_flags      - NECP_CLIENT_RESULT_FLAG_* bits, decoded into the
 *                     has_v4, has_v6, has_nat64 and low_power booleans.
 *   viable          - unused.
 *
 * Flow, as far as the visible statements show:
 *   1. Two checks of so_usecount == 0 on the MPTCP socket, the second one
 *      inside the branch taken for every action except INITIAL.
 *   2. Nothing more is needed when the connection fell back to plain TCP
 *      (MPTCPF_FALLBACK_TO_TCP).
 *   3. The action may be overridden to NONVIABLE for a low-power interface
 *      (the guarding condition is not visible here).
 *   4. NONVIABLE: every table entry whose ifindex matches is reset with
 *      mptcp_reset_itfinfo(), then mptcp_sched_create_subflows() is called.
 *   5. VIABLE or INITIAL: the ifnet is looked up in ifindex2ifnet after
 *      taking ifnet_head_lock_shared(); expensive or cellular interfaces
 *      are tested against the socket restrictions and CLAT46 interfaces are
 *      tested as well; the table is searched for an empty slot or an
 *      existing entry; a NAT64 prefix may be requested; when no slot was
 *      found the table is doubled in size; finally the entry is written and
 *      mptcp_sched_create_subflows() is called.
 *
 * NOTE(review): many branch bodies, the declaration of ifp, lock and unlock
 * calls and the exit paths are not visible in this excerpt. The notes here
 * only describe statements that are visible.
 */
1406 mptcp_session_necp_cb(void *handle
, int action
, uint32_t interface_index
,
1407 uint32_t necp_flags
, __unused
bool *viable
)
/* Decode the result flags; the double negation normalizes to 0 or 1. */
1409 boolean_t has_v4
= !!(necp_flags
& NECP_CLIENT_RESULT_FLAG_HAS_IPV4
);
1410 boolean_t has_v6
= !!(necp_flags
& NECP_CLIENT_RESULT_FLAG_HAS_IPV6
);
1411 boolean_t has_nat64
= !!(necp_flags
& NECP_CLIENT_RESULT_FLAG_HAS_NAT64
);
1412 boolean_t low_power
= !!(necp_flags
& NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER
);
/* The opaque handle is the mppcb; the session is derived from it. */
1413 struct mppcb
*mp
= (struct mppcb
*)handle
;
1414 struct mptses
*mpte
= mptompte(mp
);
1415 struct socket
*mp_so
;
1416 struct mptcb
*mp_tp
;
1418 uint32_t i
, ifindex
;
/* An event without a concrete interface index is treated as fatal. */
1420 ifindex
= interface_index
;
1421 VERIFY(ifindex
!= IFSCOPE_NONE
);
1423 /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1424 if (mp
->mpp_socket
->so_usecount
== 0) {
/*
 * NOTE(review): the statement between this test and the re-check below is
 * not visible; the re-check comment implies something may wait here,
 * presumably acquiring the session lock - confirm.
 */
1428 if (action
!= NECP_CLIENT_CBACTION_INITIAL
) {
1432 /* Check again, because it might have changed while waiting */
1433 if (mp
->mpp_socket
->so_usecount
== 0) {
/* From here on the session lock is asserted to be held. */
1438 mpte_lock_assert_held(mpte
);
1440 mp_tp
= mpte
->mpte_mptcb
;
1441 mp_so
= mptetoso(mpte
);
/* Log the event together with the decoded flags and connection state. */
1443 os_log_info(mptcp_log_handle
, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
1444 __func__
, action
, ifindex
, mp
->mpp_socket
->so_usecount
, mp_tp
->mpt_flags
, mp_tp
->mpt_state
,
1445 has_v4
, has_v6
, has_nat64
, low_power
);
1447 /* No need on fallen back sockets */
1448 if (mp_tp
->mpt_flags
& MPTCPF_FALLBACK_TO_TCP
) {
1453 * When the interface goes in low-power mode we don't want to establish
1454 * new subflows on it. Thus, mark it internally as non-viable.
1457 action
= NECP_CLIENT_CBACTION_NONVIABLE
;
1460 if (action
== NECP_CLIENT_CBACTION_NONVIABLE
) {
1461 for (i
= 0; i
< mpte
->mpte_itfinfo_size
; i
++) {
1462 if (mpte
->mpte_itfinfo
[i
].ifindex
== IFSCOPE_NONE
) {
1466 if (mpte
->mpte_itfinfo
[i
].ifindex
== ifindex
) {
1467 mptcp_reset_itfinfo(&mpte
->mpte_itfinfo
[i
]);
1471 mptcp_sched_create_subflows(mpte
);
1472 } else if (action
== NECP_CLIENT_CBACTION_VIABLE
||
1473 action
== NECP_CLIENT_CBACTION_INITIAL
) {
1474 int found_slot
= 0, slot_index
= -1;
1477 ifnet_head_lock_shared();
1478 ifp
= ifindex2ifnet
[ifindex
];
1485 if (IFNET_IS_EXPENSIVE(ifp
) &&
1486 (mp_so
->so_restrictions
& SO_RESTRICT_DENY_EXPENSIVE
)) {
1490 if (IFNET_IS_CELLULAR(ifp
) &&
1491 (mp_so
->so_restrictions
& SO_RESTRICT_DENY_CELLULAR
)) {
1495 if (IS_INTF_CLAT46(ifp
)) {
1499 /* Look for the slot on where to store/update the interface-info. */
1500 for (i
= 0; i
< mpte
->mpte_itfinfo_size
; i
++) {
1501 /* Found a potential empty slot where we can put it */
1502 if (mpte
->mpte_itfinfo
[i
].ifindex
== 0) {
1508 * The interface is already in our array. Check if we
1509 * need to update it.
1511 if (mpte
->mpte_itfinfo
[i
].ifindex
== ifindex
&&
1512 (mpte
->mpte_itfinfo
[i
].has_v4_conn
!= has_v4
||
1513 mpte
->mpte_itfinfo
[i
].has_v6_conn
!= has_v6
||
1514 mpte
->mpte_itfinfo
[i
].has_nat64_conn
!= has_nat64
)) {
1520 if (mpte
->mpte_itfinfo
[i
].ifindex
== ifindex
) {
1522 * Ok, it's already there and we don't need
/*
 * Destination is IPv4 (or its family is still 0) while the interface offers
 * neither IPv4 nor NAT64: the flags are written to the slot and a NAT64
 * prefix is requested for the interface.
 * NOTE(review): the condition guarding the three stores below and the exit
 * taken after the request are not visible in this excerpt.
 */
1529 if ((mpte
->mpte_dst
.sa_family
== AF_INET
|| mpte
->mpte_dst
.sa_family
== 0) &&
1530 !has_nat64
&& !has_v4
) {
1532 mpte
->mpte_itfinfo
[slot_index
].has_v4_conn
= has_v4
;
1533 mpte
->mpte_itfinfo
[slot_index
].has_v6_conn
= has_v6
;
1534 mpte
->mpte_itfinfo
[slot_index
].has_nat64_conn
= has_nat64
;
1536 mptcp_ask_for_nat64(ifp
);
/* No usable slot: allocate a zero-filled table of twice the current size. */
1540 if (found_slot
== 0) {
1541 int new_size
= mpte
->mpte_itfinfo_size
* 2;
1542 struct mpt_itf_info
*info
= _MALLOC(sizeof(*info
) * new_size
, M_TEMP
, M_ZERO
);
/* NOTE(review): the failure test guarding this error log is not visible. */
1545 os_log_error(mptcp_log_handle
, "%s malloc failed for %u\n",
1546 __func__
, new_size
);
/* Carry the existing entries over into the front of the new table. */
1550 memcpy(info
, mpte
->mpte_itfinfo
, mpte
->mpte_itfinfo_size
* sizeof(*info
));
/*
 * The old table is only freed when it is larger than MPTE_ITFINFO_SIZE.
 * NOTE(review): presumably a table of exactly MPTE_ITFINFO_SIZE entries is
 * the initial one and was not obtained from _MALLOC - confirm.
 */
1552 if (mpte
->mpte_itfinfo_size
> MPTE_ITFINFO_SIZE
) {
1553 _FREE(mpte
->mpte_itfinfo
, M_TEMP
);
1556 /* We allocated a new one, thus the first must be empty */
1557 slot_index
= mpte
->mpte_itfinfo_size
;
/* Publish the new table and its size on the session. */
1559 mpte
->mpte_itfinfo
= info
;
1560 mpte
->mpte_itfinfo_size
= new_size
;
/* The chosen slot must lie inside the (possibly grown) table. */
1563 VERIFY(slot_index
>= 0 && slot_index
< (int)mpte
->mpte_itfinfo_size
);
1564 mpte
->mpte_itfinfo
[slot_index
].ifindex
= ifindex
;
1565 mpte
->mpte_itfinfo
[slot_index
].has_v4_conn
= has_v4
;
1566 mpte
->mpte_itfinfo
[slot_index
].has_v6_conn
= has_v6
;
1567 mpte
->mpte_itfinfo
[slot_index
].has_nat64_conn
= has_nat64
;
/* The table changed, so subflow creation is scheduled for the session. */
1569 mptcp_sched_create_subflows(mpte
);
1579 mptcp_set_restrictions(struct socket
*mp_so
)
1581 struct mptses
*mpte
= mpsotompte(mp_so
);
1584 mpte_lock_assert_held(mpte
);
1586 ifnet_head_lock_shared();
1588 for (i
= 0; i
< mpte
->mpte_itfinfo_size
; i
++) {
1589 struct mpt_itf_info
*info
= &mpte
->mpte_itfinfo
[i
];
1590 uint32_t ifindex
= info
->ifindex
;
1593 if (ifindex
== IFSCOPE_NONE
) {
1597 ifp
= ifindex2ifnet
[ifindex
];
1602 if (IFNET_IS_EXPENSIVE(ifp
) &&
1603 (mp_so
->so_restrictions
& SO_RESTRICT_DENY_EXPENSIVE
)) {
1604 info
->ifindex
= IFSCOPE_NONE
;
1607 if (IFNET_IS_CELLULAR(ifp
) &&
1608 (mp_so
->so_restrictions
& SO_RESTRICT_DENY_CELLULAR
)) {
1609 info
->ifindex
= IFSCOPE_NONE
;