2 * Copyright (c) 2012-2015 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
33 #include <sys/mcache.h>
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 #include <sys/syslog.h>
37 #include <sys/protosw.h>
39 #include <kern/zalloc.h>
40 #include <kern/locks.h>
42 #include <mach/thread_act.h>
45 #include <dev/random/randomdev.h>
48 #include <netinet/in.h>
49 #include <netinet/in_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_seq.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/mptcp_var.h>
55 #include <netinet/mptcp.h>
56 #include <netinet/mptcp_seq.h>
57 #include <netinet/mptcp_opt.h>
58 #include <netinet/mptcp_timer.h>
/*
 * NOTE(review): this chunk is a lossy extraction of bsd/netinet/mptcp.c.
 * The embedded original line numbers jump (e.g. 62 -> 64, 68 -> 71), so
 * some definitions (e.g. the "int mptcp_enable = ..." backing the first
 * sysctl) are missing from view.  Verify against the upstream xnu file
 * before editing.  Comments below describe only what is visible.
 */
/* net.inet.mptcp.enable -- master on/off switch for MPTCP support. */
61 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, enable
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
62 &mptcp_enable
, 0, "Enable Multipath TCP Support");
64 /* Number of times to try negotiating MPTCP on SYN retransmissions */
65 int mptcp_mpcap_retries
= MPTCP_CAPABLE_RETRIES
;
66 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, mptcp_cap_retr
,
67 CTLFLAG_RW
| CTLFLAG_LOCKED
,
68 &mptcp_mpcap_retries
, 0, "Number of MP Capable SYN Retries");
71 * By default, DSS checksum is turned off, revisit if we ever do
72 * MPTCP for non SSL Traffic.
74 int mptcp_dss_csum
= 0;
75 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, dss_csum
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
76 &mptcp_dss_csum
, 0, "Enable DSS checksum");
79 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
80 * is attempted on a different path.
82 int mptcp_fail_thresh
= 1;
83 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, fail
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
84 &mptcp_fail_thresh
, 0, "Failover threshold");
88 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
89 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
90 * Some carrier networks have a timeout of 10 or 15 minutes.
92 int mptcp_subflow_keeptime
= 60*14;
93 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, keepalive
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
94 &mptcp_subflow_keeptime
, 0, "Keepalive in seconds");
/* net.inet.mptcp.mpprio -- enables sending of the MP_PRIO option. */
99 int mptcp_mpprio_enable
= 1;
100 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, mpprio
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
101 &mptcp_mpprio_enable
, 0, "Enable MP_PRIO option");
104 * REMOVE_ADDR option.
106 int mptcp_remaddr_enable
= 1;
107 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, remaddr
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
108 &mptcp_remaddr_enable
, 0, "Enable REMOVE_ADDR option");
/* FastJoin: allows data to be written on a joining subflow before it is fully MP-capable. */
113 int mptcp_fastjoin
= 1;
114 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, fastjoin
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
115 &mptcp_fastjoin
, 0, "Enable FastJoin Option");
117 int mptcp_zerortt_fastjoin
= 0;
118 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, zerortt_fastjoin
, CTLFLAG_RW
|
119 CTLFLAG_LOCKED
, &mptcp_zerortt_fastjoin
, 0,
120 "Enable Zero RTT Fast Join");
123 * R/W Notification on resume
125 int mptcp_rwnotify
= 0;
126 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rwnotify
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
127 &mptcp_rwnotify
, 0, "Enable RW notify on resume");
130 * Using RTT history for sending new data
132 int mptcp_use_rtthist
= 1;
133 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rtthist
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
134 &mptcp_use_rtthist
, 0, "Disable RTT History");
/* Floor for the RTT-history threshold; compared (<<5) against mpts_srtt in mptcp_get_subflow(). */
136 #define MPTCP_RTTHIST_MINTHRESH 500
137 int mptcp_rtthist_rtthresh
= 600;
138 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rtthist_thresh
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
139 &mptcp_rtthist_rtthresh
, 0, "Rtt threshold");
142 * Use RTO history for sending new data
144 int mptcp_use_rto
= 1;
145 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, userto
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
146 &mptcp_use_rto
, 0, "Disable RTO for subflow selection");
/* Floor for the RTO threshold; also used by mptcp_no_rto_spike(). */
148 #define MPTCP_RTO_MINTHRESH 1000
149 int mptcp_rtothresh
= 1500;
150 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rto_thresh
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
151 &mptcp_rtothresh
, 0, "RTO threshold");
154 * Use server's chosen path for sending new data
156 int mptcp_peerswitch
= 1;
157 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, use_peer
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
158 &mptcp_peerswitch
, 0, "Use peer");
/* Minimum peer-switch segment count; MAX()ed with the tunable in mptcp_get_subflow(). */
160 #define MPTCP_PEERSWITCH_CNTMIN 3
161 uint32_t mptcp_peerswitch_cnt
= 3;
162 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, peerswitchno
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
163 &mptcp_peerswitch_cnt
, 0, "Set threshold based on peer's data arrival");
166 * Probe the preferred path, when it is not in use
168 #define MPTCP_PROBETO_MIN 500
169 uint32_t mptcp_probeto
= 1000;
170 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, probeto
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
171 &mptcp_probeto
, 0, "Disable probing by setting to 0");
/* Cap on consecutive probe writes on the preferred subflow (see mptcp_output()). */
173 #define MPTCP_PROBE_MX 15
174 uint32_t mptcp_probecnt
= 5;
175 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, probecnt
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
176 &mptcp_probecnt
, 0, "Number of probe writes");
179 * Static declarations
181 static int mptcp_validate_csum(struct tcpcb
*, struct mbuf
*, int);
182 static uint16_t mptcp_input_csum(struct tcpcb
*, struct mbuf
*, int);
185 * MPTCP input, called when data has been read from a subflow socket.
/*
 * mptcp_input: deliver data read from a subflow socket up to the MPTCP
 * socket's receive buffer, validating DSS sequence numbers on the way.
 *
 * NOTE(review): extraction dropped lines throughout this function (the
 * embedded original numbering jumps, e.g. 202 -> 207, 294 -> 308), so
 * declarations (e.g. `count`, `mb_dsn`), braces and several statements
 * are missing from view.  Verify against upstream xnu before editing.
 */
188 mptcp_input(struct mptses
*mpte
, struct mbuf
*m
)
190 struct socket
*mp_so
;
191 struct mptcb
*mp_tp
= NULL
;
193 u_int32_t mb_datalen
;
195 struct mbuf
*save
= NULL
, *prev
= NULL
;
196 struct mbuf
*freelist
= NULL
, *tail
= NULL
;
197 boolean_t in_fallback
= FALSE
;
199 VERIFY(m
->m_flags
& M_PKTHDR
);
201 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
202 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
207 * Each mbuf contains MPTCP Data Sequence Map
208 * Process the data for reassembly, delivery to MPTCP socket
/* Snapshot of so_rcv.sb_cc, used below to compute how many bytes were appended. */
212 count
= mp_so
->so_rcv
.sb_cc
;
215 mp_tp
= mpte
->mpte_mptcb
;
216 VERIFY(mp_tp
!= NULL
);
218 /* Ok to check for this flag without lock as its set in this thread */
219 in_fallback
= (mp_tp
->mpt_flags
& MPTCPF_FALLBACK_TO_TCP
);
222 * In the degraded fallback case, data is accepted without DSS map
227 * assume degraded flow as this may be the first packet
228 * without DSS, and the subflow state is not updated yet.
230 if (sbappendstream(&mp_so
->so_rcv
, m
))
232 DTRACE_MPTCP5(receive__degraded
, struct mbuf
*, m
,
233 struct socket
*, mp_so
,
234 struct sockbuf
*, &mp_so
->so_rcv
,
235 struct sockbuf
*, &mp_so
->so_snd
,
236 struct mptses
*, mpte
);
237 count
= mp_so
->so_rcv
.sb_cc
- count
;
238 mptcplog((LOG_DEBUG
, "MPTCP Receiver: Fallback read %d bytes\n",
239 count
), MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
245 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
246 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
)) {
253 * A single TCP packet formed of multiple mbufs
254 * holds DSS mapping in the first mbuf of the chain.
255 * Other mbufs in the chain may have M_PKTHDR set
256 * even though they belong to the same TCP packet
257 * and therefore use the DSS mapping stored in the
258 * first mbuf of the mbuf chain. mptcp_input() can
259 * get an mbuf chain with multiple TCP packets.
261 while (save
&& (!(save
->m_flags
& M_PKTHDR
) ||
262 !(save
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
))) {
/* Pull the DSS mapping (data sequence number + length) from the pkthdr. */
271 mb_dsn
= m
->m_pkthdr
.mp_dsn
;
272 mb_datalen
= m
->m_pkthdr
.mp_rlen
;
/* DSN beyond the receive at-mark: out-of-order data, counted and (per the
 * comment below) dropped rather than reassembled. */
274 if (MPTCP_SEQ_GT(mb_dsn
, mp_tp
->mpt_rcvatmark
)) {
275 tcpstat
.tcps_mp_oodata
++;
280 * Reassembly queue support here in future. Per spec,
281 * senders must implement retransmission timer to
282 * retransmit unacked data. Dropping out of order
283 * gives a slight hit on performance but allows us to
284 * deploy MPTCP and protects us against in-window DoS
285 * attacks that attempt to use up memory by sending
286 * out of order data. When doing load sharing across
287 * subflows, out of order support is a must.
/* DSN (partially) below the at-mark: duplicate data; fully-duplicate
 * mbufs go to freelist, partial overlap is trimmed with m_adj(). */
291 if (MPTCP_SEQ_LT(mb_dsn
, mp_tp
->mpt_rcvatmark
)) {
292 if (MPTCP_SEQ_LEQ((mb_dsn
+ mb_datalen
),
293 mp_tp
->mpt_rcvatmark
)) {
294 if (freelist
== NULL
)
308 m_adj(m
, (mp_tp
->mpt_rcvatmark
- mb_dsn
));
310 mptcplog((LOG_INFO
, "MPTCP Receiver: Left Edge %llu\n",
311 mp_tp
->mpt_rcvatmark
),
312 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
316 if (sbappendstream(&mp_so
->so_rcv
, m
)) {
319 DTRACE_MPTCP6(receive
, struct mbuf
*, m
, struct socket
*, mp_so
,
320 struct sockbuf
*, &mp_so
->so_rcv
,
321 struct sockbuf
*, &mp_so
->so_snd
,
322 struct mptses
*, mpte
,
323 struct mptcb
*, mp_tp
);
325 count
= mp_so
->so_rcv
.sb_cc
- count
;
326 tcpstat
.tcps_mp_rcvtotal
++;
327 tcpstat
.tcps_mp_rcvbytes
+= count
;
328 mptcplog((LOG_DEBUG
, "MPTCP Receiver: Read %d bytes\n", count
),
329 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
332 * The data received at the MPTCP layer will never exceed the
333 * receive window because anything to the right of the
334 * receive window will be trimmed at the subflow level.
336 mp_tp
->mpt_rcvwnd
= mptcp_sbspace(mp_tp
);
337 mp_tp
->mpt_rcvatmark
+= count
;
340 count
= mp_so
->so_rcv
.sb_cc
;
/*
 * mptcp_output: pick the best subflow and push queued MPTCP data on it;
 * on error marks the subflow as failing over; occasionally probes the
 * preferred subflow so its srtt estimate stays fresh.
 *
 * NOTE(review): extraction dropped lines here (orig. numbering jumps,
 * e.g. 364 -> 369, 422 -> 424), including declarations of `mpts` and
 * `error`, return paths and braces.  Verify against upstream xnu.
 */
352 mptcp_output(struct mptses
*mpte
)
355 struct mptsub
*mpts_tried
= NULL
;
356 struct socket
*mp_so
;
357 struct mptsub
*preferred_mpts
= NULL
;
360 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
361 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
362 if (mp_so
->so_state
& SS_CANTSENDMORE
) {
363 mptcplog((LOG_DEBUG
, "MPTCP Sender: cantsendmore\n"),
364 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
369 /* get the "best" subflow to be used for transmission */
370 mpts
= mptcp_get_subflow(mpte
, NULL
, &preferred_mpts
);
372 mptcplog((LOG_ERR
, "MPTCP Sender: mp_so 0x%llx no subflow\n",
373 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
)),
374 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
378 mptcplog((LOG_DEBUG
, "MPTCP Sender: mp_so 0x%llx using cid %d \n",
379 (uint64_t)VM_KERNEL_ADDRPERM(mp_so
), mpts
->mpts_connid
),
380 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
382 /* In case there's just one flow, we reattempt later */
/* Selector returned the same (or a failing-over) subflow we already
 * tried: clear FAILINGOVER, re-mark it active, and retry via the
 * retransmit timer instead of looping here. */
384 if ((mpts_tried
!= NULL
) && ((mpts
== mpts_tried
) ||
385 (mpts
->mpts_flags
& MPTSF_FAILINGOVER
))) {
387 MPTS_LOCK(mpts_tried
);
388 mpts_tried
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
389 mpts_tried
->mpts_flags
|= MPTSF_ACTIVE
;
390 MPTS_UNLOCK(mpts_tried
);
391 mptcp_start_timer(mpte
, MPTT_REXMT
);
392 mptcplog((LOG_DEBUG
, "MPTCP Sender: mp_so 0x%llx retry later\n",
393 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
)),
394 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
398 DTRACE_MPTCP3(output
, struct mptses
*, mpte
, struct mptsub
*, mpts
,
399 struct socket
*, mp_so
);
400 error
= mptcp_subflow_output(mpte
, mpts
);
401 if (error
&& error
!= EWOULDBLOCK
) {
402 /* can be a temporary loss of source address or other error */
403 mpts
->mpts_flags
|= MPTSF_FAILINGOVER
;
404 mpts
->mpts_flags
&= ~MPTSF_ACTIVE
;
407 mptcplog((LOG_INFO
, "MPTCP Sender: %s Error = %d \n",
409 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
412 /* The model is to have only one active flow at a time */
413 mpts
->mpts_flags
|= MPTSF_ACTIVE
;
414 mpts
->mpts_probesoon
= mpts
->mpts_probecnt
= 0;
417 /* Allows us to update the smoothed rtt */
/* Probe the preferred subflow (bounded by mptcp_probecnt/MPTCP_PROBE_MX)
 * when transmission is currently happening on a non-preferred one. */
418 if ((mptcp_probeto
) && (mptcp_probeto
>= MPTCP_PROBETO_MIN
) &&
419 (mpts
!= preferred_mpts
) && (preferred_mpts
!= NULL
)) {
420 MPTS_LOCK(preferred_mpts
);
421 if (preferred_mpts
->mpts_probesoon
) {
422 if ((tcp_now
- preferred_mpts
->mpts_probesoon
) >
424 (void) mptcp_subflow_output(mpte
, preferred_mpts
);
425 if (preferred_mpts
->mpts_probecnt
>=
426 MIN(mptcp_probecnt
, MPTCP_PROBE_MX
)) {
427 preferred_mpts
->mpts_probesoon
= 0;
428 preferred_mpts
->mpts_probecnt
= 0;
432 preferred_mpts
->mpts_probesoon
= tcp_now
;
433 preferred_mpts
->mpts_probecnt
= 0;
435 MPTS_UNLOCK(preferred_mpts
);
/* Track the single active subflow; log and count a switch when it changes. */
438 if (mpte
->mpte_active_sub
== NULL
) {
439 mpte
->mpte_active_sub
= mpts
;
440 } else if (mpte
->mpte_active_sub
!= mpts
) {
441 mptcplog((LOG_DEBUG
, "MPTCP Sender: switch [cid %d, srtt %d]"
442 "to [cid %d, srtt %d]\n",
443 mpte
->mpte_active_sub
->mpts_connid
,
444 mpte
->mpte_active_sub
->mpts_srtt
>> 5,
446 mpts
->mpts_srtt
>> 5),
447 MPTCP_SENDER_DBG
| MPTCP_SOCKET_DBG
, MPTCP_LOGLVL_LOG
);
449 MPTS_LOCK(mpte
->mpte_active_sub
);
450 mpte
->mpte_active_sub
->mpts_flags
&= ~MPTSF_ACTIVE
;
451 mpts
->mpts_peerswitch
= 0;
452 MPTS_UNLOCK(mpte
->mpte_active_sub
);
453 mpte
->mpte_active_sub
= mpts
;
454 tcpstat
.tcps_mp_switches
++;
457 /* subflow errors should not be percolated back up */
462 * Return the most eligible subflow to be used for sending data.
463 * This function also serves to check if any alternate subflow is available
464 * or not. best and second_best flows are chosen by their priority. third_best
465 * could be best or second_best but is under loss at the time of evaluation.
/*
 * mptcp_get_subflow: walk mpte_subflows and return the most eligible
 * subflow for transmission (see the block comment above this function:
 * best/second_best by priority, third_best = usable-but-lossy).  Also
 * reports the preferred subflow through *preferred.
 *
 * NOTE(review): extraction dropped lines here (orig. numbering jumps,
 * e.g. 524 -> 527, 543 -> 546), including `continue`/`break`s, the
 * assignments to best/second_best/third_best, and early returns.
 * Verify against upstream xnu before editing.
 */
468 mptcp_get_subflow(struct mptses
*mpte
, struct mptsub
*ignore
, struct mptsub
**preferred
)
471 struct mptsub
*best
= NULL
;
472 struct mptsub
*second_best
= NULL
;
473 struct mptsub
*third_best
= NULL
;
474 struct mptsub
*symptoms_best
= NULL
;
475 struct socket
*so
= NULL
;
477 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
479 TAILQ_FOREACH(mpts
, &mpte
->mpte_subflows
, mpts_entry
) {
/* Caller may ask us to skip one specific subflow (e.g. the one that just failed). */
482 if ((ignore
) && (mpts
== ignore
)) {
487 /* There can only be one subflow in degraded state */
488 if (mpts
->mpts_flags
& MPTSF_MP_DEGRADED
) {
495 * Subflows with TFO or Fastjoin allow data to be written before
496 * the subflow is mp capable.
498 if (!(mpts
->mpts_flags
& MPTSF_MP_CAPABLE
) &&
499 !(mpts
->mpts_flags
& MPTSF_FASTJ_REQD
) &&
500 !(mpts
->mpts_flags
& MPTSF_TFO_REQD
)) {
505 if (mpts
->mpts_flags
& MPTSF_SUSPENDED
) {
510 if ((mpts
->mpts_flags
& MPTSF_DISCONNECTED
) ||
511 (mpts
->mpts_flags
& MPTSF_DISCONNECTING
)) {
/* A failing-over subflow is rehabilitated once its send buffer has
 * drained and its RTO shows no spike; otherwise it stays in failover. */
516 if (mpts
->mpts_flags
& MPTSF_FAILINGOVER
) {
517 so
= mpts
->mpts_socket
;
518 if ((so
) && (!(so
->so_flags
& SOF_PCBCLEARING
))) {
520 if ((so
->so_snd
.sb_cc
== 0) &&
521 (mptcp_no_rto_spike(so
))) {
522 mpts
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
523 so
->so_flags
&= ~SOF_MP_TRYFAILOVER
;
524 socket_unlock(so
, 1);
527 mptcplog((LOG_DEBUG
, "MPTCP Sender: "
528 "%s cid %d in failover\n",
529 __func__
, third_best
->mpts_connid
),
531 MPTCP_LOGLVL_VERBOSE
);
532 socket_unlock(so
, 1);
542 /* When there are no preferred flows, use first one in list */
543 if ((!second_best
) && !(mpts
->mpts_flags
& MPTSF_PREFERRED
))
546 if (mpts
->mpts_flags
& MPTSF_PREFERRED
) {
554 * If there is no preferred or backup subflow, and there is no active
555 * subflow use the last usable subflow.
558 return (second_best
? second_best
: third_best
);
561 if (second_best
== NULL
) {
562 return (best
? best
: third_best
);
565 if (preferred
!= NULL
)
568 /* Use a hint from symptomsd if it exists */
569 symptoms_best
= mptcp_use_symptoms_hints(best
, second_best
);
570 if (symptoms_best
!= NULL
)
571 return (symptoms_best
);
573 /* Compare RTTs, select second_best if best's rtt exceeds rttthresh */
/* srtt values are compared shifted by 5 (TCP srtt fixed-point scaling). */
574 if ((mptcp_use_rtthist
) &&
575 (best
->mpts_srtt
) && (second_best
->mpts_srtt
) &&
576 (best
->mpts_srtt
> second_best
->mpts_srtt
) &&
577 (best
->mpts_srtt
>= MAX((MPTCP_RTTHIST_MINTHRESH
<< 5),
578 (mptcp_rtthist_rtthresh
<< 5)))) {
579 tcpstat
.tcps_mp_sel_rtt
++;
580 mptcplog((LOG_DEBUG
, "MPTCP Sender: %s best cid %d"
581 " at rtt %d, second cid %d at rtt %d\n", __func__
,
582 best
->mpts_connid
, best
->mpts_srtt
>> 5,
583 second_best
->mpts_connid
,
584 second_best
->mpts_srtt
>> 5),
585 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
586 return (second_best
);
589 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
590 if ((mptcp_use_rto
) &&
591 (best
->mpts_rxtcur
) && (second_best
->mpts_rxtcur
) &&
592 (best
->mpts_rxtcur
> second_best
->mpts_rxtcur
) &&
593 (best
->mpts_rxtcur
>=
594 MAX(MPTCP_RTO_MINTHRESH
, mptcp_rtothresh
))) {
595 tcpstat
.tcps_mp_sel_rto
++;
596 mptcplog((LOG_DEBUG
, "MPTCP Sender: %s best cid %d"
597 " at rto %d, second cid %d at rto %d\n", __func__
,
598 best
->mpts_connid
, best
->mpts_rxtcur
,
599 second_best
->mpts_connid
, second_best
->mpts_rxtcur
),
600 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
602 return (second_best
);
605 /* If second_best received data, use second_best */
606 if (mptcp_peerswitch
&&
607 (second_best
->mpts_peerswitch
>
608 MAX(MPTCP_PEERSWITCH_CNTMIN
, mptcp_peerswitch_cnt
))) {
609 tcpstat
.tcps_mp_sel_peer
++;
610 mptcplog((LOG_DEBUG
, "MPTCP Sender: %s: best cid %d"
611 " but using cid %d after receiving %d segments\n",
612 __func__
, best
->mpts_connid
, second_best
->mpts_connid
,
613 second_best
->mpts_peerswitch
), MPTCP_SENDER_DBG
,
615 return (second_best
);
/*
 * mptcp_get_pending_subflow: scan the subflow list (skipping `ignore`)
 * for a subflow still flagged MPTSF_CONNECT_PENDING.
 *
 * NOTE(review): extraction dropped the loop body's action and the
 * function's return statement (orig. numbering jumps 635 -> 646).
 * Verify against upstream xnu before editing.
 */
621 mptcp_get_pending_subflow(struct mptses
*mpte
, struct mptsub
*ignore
)
623 struct mptsub
*mpts
= NULL
;
625 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
627 TAILQ_FOREACH(mpts
, &mpte
->mpte_subflows
, mpts_entry
) {
630 if ((ignore
) && (mpts
== ignore
)) {
635 if (mpts
->mpts_flags
& MPTSF_CONNECT_PENDING
) {
/*
 * mptcp_event_to_str: map an MPCE_* close-FSM event to its printable
 * name; defaults to "UNDEFINED" for unknown events.
 *
 * NOTE(review): extraction dropped the switch statement scaffolding,
 * `break`s and return (orig. numbering jumps 648 -> 653).  Verify
 * against upstream xnu before editing.
 */
646 mptcp_event_to_str(uint32_t event
)
648 const char *c
= "UNDEFINED";
653 case MPCE_RECV_DATA_ACK
:
654 c
= "MPCE_RECV_DATA_ACK";
656 case MPCE_RECV_DATA_FIN
:
657 c
= "MPCE_RECV_DATA_FIN";
/*
 * mptcp_state_to_str: map an MPTCPS_* connection state to its printable
 * name; defaults to "UNDEFINED" for unknown states.
 *
 * NOTE(review): extraction dropped the switch scaffolding, `break`s and
 * return, plus at least the MPTCPS_CLOSED/MPTCPS_CLOSING case labels
 * (orig. numbering jumps 666 -> 674, 681 -> 684).  Verify against
 * upstream xnu before editing.
 */
664 mptcp_state_to_str(mptcp_state_t state
)
666 const char *c
= "UNDEFINED";
674 case MPTCPS_ESTABLISHED
:
675 c
= "MPTCPS_ESTABLISHED";
677 case MPTCPS_CLOSE_WAIT
:
678 c
= "MPTCPS_CLOSE_WAIT";
680 case MPTCPS_FIN_WAIT_1
:
681 c
= "MPTCPS_FIN_WAIT_1";
684 c
= "MPTCPS_CLOSING";
686 case MPTCPS_LAST_ACK
:
687 c
= "MPTCPS_LAST_ACK";
689 case MPTCPS_FIN_WAIT_2
:
690 c
= "MPTCPS_FIN_WAIT_2";
692 case MPTCPS_TIME_WAIT
:
693 c
= "MPTCPS_TIME_WAIT";
695 case MPTCPS_FASTCLOSE_WAIT
:
696 c
= "MPTCPS_FASTCLOSE_WAIT";
698 case MPTCPS_TERMINATE
:
699 c
= "MPTCPS_TERMINATE";
/*
 * mptcp_close_fsm: drive the MPTCP connection-teardown state machine.
 * Transitions mpt_state on MPCE_CLOSE / MPCE_RECV_DATA_ACK /
 * MPCE_RECV_DATA_FIN; mpt_sndmax and mpt_rcvnxt are bumped by one where
 * a Data FIN occupies sequence space.  Fires a DTrace state-change
 * probe and logs old state -> new state.
 *
 * NOTE(review): extraction dropped lines here (orig. numbering jumps,
 * e.g. 714 -> 717, 743 -> 748), including some case labels and
 * `break`s.  Verify against upstream xnu before editing.
 */
706 mptcp_close_fsm(struct mptcb
*mp_tp
, uint32_t event
)
708 MPT_LOCK_ASSERT_HELD(mp_tp
);
709 mptcp_state_t old_state
= mp_tp
->mpt_state
;
711 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
714 switch (mp_tp
->mpt_state
) {
717 mp_tp
->mpt_state
= MPTCPS_CLOSED
;
720 case MPTCPS_ESTABLISHED
:
721 if (event
== MPCE_CLOSE
) {
722 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_1
;
723 mp_tp
->mpt_sndmax
+= 1; /* adjust for Data FIN */
725 else if (event
== MPCE_RECV_DATA_FIN
) {
726 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
727 mp_tp
->mpt_state
= MPTCPS_CLOSE_WAIT
;
731 case MPTCPS_CLOSE_WAIT
:
732 if (event
== MPCE_CLOSE
) {
733 mp_tp
->mpt_state
= MPTCPS_LAST_ACK
;
734 mp_tp
->mpt_sndmax
+= 1; /* adjust for Data FIN */
738 case MPTCPS_FIN_WAIT_1
:
739 if (event
== MPCE_RECV_DATA_ACK
)
740 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_2
;
741 else if (event
== MPCE_RECV_DATA_FIN
) {
742 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
743 mp_tp
->mpt_state
= MPTCPS_CLOSING
;
/* NOTE(review): the case label for this arm (presumably MPTCPS_CLOSING)
 * was dropped by the extraction (numbering jumps 743 -> 748). */
748 if (event
== MPCE_RECV_DATA_ACK
)
749 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
752 case MPTCPS_LAST_ACK
:
753 if (event
== MPCE_RECV_DATA_ACK
)
754 mp_tp
->mpt_state
= MPTCPS_TERMINATE
;
757 case MPTCPS_FIN_WAIT_2
:
758 if (event
== MPCE_RECV_DATA_FIN
) {
759 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
760 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
764 case MPTCPS_TIME_WAIT
:
767 case MPTCPS_FASTCLOSE_WAIT
:
768 if (event
== MPCE_CLOSE
) {
769 /* no need to adjust for data FIN */
770 mp_tp
->mpt_state
= MPTCPS_TERMINATE
;
773 case MPTCPS_TERMINATE
:
779 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
781 mptcplog((LOG_INFO
, "MPTCP State: %s to %s on event %s\n",
782 mptcp_state_to_str(old_state
),
783 mptcp_state_to_str(mp_tp
->mpt_state
),
784 mptcp_event_to_str(event
)),
785 MPTCP_STATE_DBG
, MPTCP_LOGLVL_LOG
);
789 * Update the mptcb send state variables, but the actual sbdrop occurs
/*
 * mptcp_data_ack_rcvd: advance MPTCP send-side state on a Data ACK.
 * Moves mpt_snduna forward by the newly-acked byte count; in fallback
 * mode rewinds mpt_sndnxt for post-fallback resync; feeds the close FSM
 * when the Data ACK covers mpt_sndmax during teardown.
 *
 * NOTE(review): extraction dropped lines here (orig. numbering jumps
 * 797 -> 800, 809 -> 812), e.g. any guard around `acked`.  Verify
 * against upstream xnu before editing.
 */
793 mptcp_data_ack_rcvd(struct mptcb
*mp_tp
, struct tcpcb
*tp
, u_int64_t full_dack
)
797 acked
= full_dack
- mp_tp
->mpt_snduna
;
800 mp_tp
->mpt_snduna
+= acked
;
801 /* In degraded mode, we may get some Data ACKs */
802 if ((tp
->t_mpflags
& TMPF_TCP_FALLBACK
) &&
803 !(mp_tp
->mpt_flags
& MPTCPF_POST_FALLBACK_SYNC
) &&
804 MPTCP_SEQ_GT(mp_tp
->mpt_sndnxt
, mp_tp
->mpt_snduna
)) {
805 /* bring back sndnxt to retransmit MPTCP data */
806 mp_tp
->mpt_sndnxt
= mp_tp
->mpt_dsn_at_csum_fail
;
807 mp_tp
->mpt_flags
|= MPTCPF_POST_FALLBACK_SYNC
;
808 tp
->t_inpcb
->inp_socket
->so_flags1
|=
809 SOF1_POST_FALLBACK_SYNC
;
/* Data ACK covers everything sent (incl. Data FIN space): progress the close FSM. */
812 if ((full_dack
== mp_tp
->mpt_sndmax
) &&
813 (mp_tp
->mpt_state
>= MPTCPS_FIN_WAIT_1
)) {
814 mptcp_close_fsm(mp_tp
, MPCE_RECV_DATA_ACK
);
815 tp
->t_mpflags
&= ~TMPF_SEND_DFIN
;
819 /* If you change this function, match up mptcp_update_rcv_state_f */
/*
 * mptcp_update_dss_rcv_state: byte-swap the fields of a 32-bit DSS
 * option, extend the 32-bit DSN to 64 bits against mpt_rcvnxt, and
 * hand off to mptcp_update_rcv_state_meat().
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 825 -> 827, 831 -> 833), e.g. the trailing checksum argument of the
 * _meat() call.  Verify against upstream xnu before editing.
 */
821 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt
*dss_info
, struct tcpcb
*tp
,
824 struct mptcb
*mp_tp
= tptomptp(tp
);
825 u_int64_t full_dsn
= 0;
/* Option fields arrive in network byte order; convert in place. */
827 NTOHL(dss_info
->mdss_dsn
);
828 NTOHL(dss_info
->mdss_subflow_seqn
);
829 NTOHS(dss_info
->mdss_data_len
);
831 /* XXX for autosndbuf grow sb here */
833 MPTCP_EXTEND_DSN(mp_tp
->mpt_rcvnxt
, dss_info
->mdss_dsn
, full_dsn
);
835 mptcp_update_rcv_state_meat(mp_tp
, tp
,
836 full_dsn
, dss_info
->mdss_subflow_seqn
, dss_info
->mdss_data_len
,
/*
 * mptcp_update_rcv_state_meat: common receive-side DSS processing.
 * Rejects infinite mappings (len == 0), fails the checksum path when
 * DSS checksums are on and csum != 0, recognizes a Data FIN
 * (seqn == 0 && len == 1), and otherwise stashes the mapping into
 * tp->t_rcv_map and flags TMPF_EMBED_DSN.
 *
 * NOTE(review): extraction dropped lines here (orig. numbering jumps
 * 848 -> 850, 868 -> 873), including early returns after the error
 * branches.  Verify against upstream xnu before editing.
 */
842 mptcp_update_rcv_state_meat(struct mptcb
*mp_tp
, struct tcpcb
*tp
,
843 u_int64_t full_dsn
, u_int32_t seqn
, u_int16_t mdss_data_len
,
846 if (mdss_data_len
== 0) {
847 mptcplog((LOG_INFO
, "MPTCP Receiver: Infinite Mapping.\n"),
848 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
850 if ((mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
) && (csum
!= 0)) {
851 mptcplog((LOG_ERR
, "MPTCP Receiver: Bad checksum %x \n",
852 csum
), MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_ERR
);
854 mptcp_notify_mpfail(tp
->t_inpcb
->inp_socket
);
859 "MPTCP Receiver: seqn = %x len = %x full = %llx "
861 seqn
, mdss_data_len
, full_dsn
, mp_tp
->mpt_rcvnxt
),
862 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
864 /* Process a Data FIN packet , handled in mptcp_do_fin_opt */
865 if ((seqn
== 0) && (mdss_data_len
== 1)) {
866 mptcplog((LOG_INFO
, "MPTCP Receiver: Data FIN in %s state \n",
867 mptcp_state_to_str(mp_tp
->mpt_state
)),
868 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
/* Valid mapping: record it on the subflow tcpcb for the input path. */
873 mptcp_notify_mpready(tp
->t_inpcb
->inp_socket
);
874 tp
->t_rcv_map
.mpt_dsn
= full_dsn
;
875 tp
->t_rcv_map
.mpt_sseq
= seqn
;
876 tp
->t_rcv_map
.mpt_len
= mdss_data_len
;
877 tp
->t_rcv_map
.mpt_csum
= csum
;
878 tp
->t_mpflags
|= TMPF_EMBED_DSN
;
/*
 * mptcp_update_rcv_state_f: handle a DSS+ACK option with a 32-bit DSN.
 * Byte-swaps the fields, extends the DSN to 64 bits, and forwards to
 * mptcp_update_rcv_state_meat().  Keep in sync with
 * mptcp_update_dss_rcv_state() (see the comment preceding that
 * function).
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 891 -> 896, 902 -> 904), e.g. the NULL mp_tp guard the comment at
 * orig. 890-891 refers to.  Verify against upstream xnu.
 */
883 mptcp_update_rcv_state_f(struct mptcp_dss_ack_opt
*dss_info
, struct tcpcb
*tp
,
886 u_int64_t full_dsn
= 0;
887 struct mptcb
*mp_tp
= tptomptp(tp
);
890 * May happen, because the caller of this function does an soevent.
891 * Review after rdar://problem/24083886
896 NTOHL(dss_info
->mdss_dsn
);
897 NTOHL(dss_info
->mdss_subflow_seqn
);
898 NTOHS(dss_info
->mdss_data_len
);
900 MPTCP_EXTEND_DSN(mp_tp
->mpt_rcvnxt
, dss_info
->mdss_dsn
, full_dsn
);
902 mptcp_update_rcv_state_meat(mp_tp
, tp
,
904 dss_info
->mdss_subflow_seqn
,
905 dss_info
->mdss_data_len
,
/*
 * mptcp_update_rcv_state_g: handle a DSS option carrying a 64-bit DSN
 * (with 32-bit ACK).  The DSN needs no extension -- only a 64-bit
 * byte-swap -- before forwarding to mptcp_update_rcv_state_meat().
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 914 -> 917, 925 -> 927), e.g. the NULL mp_tp guard referenced by the
 * comment at orig. 917-918 and the dsn argument of the _meat() call.
 * Verify against upstream xnu.
 */
910 mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt
*dss_info
,
911 struct tcpcb
*tp
, uint16_t csum
)
913 u_int64_t dsn
= mptcp_ntoh64(dss_info
->mdss_dsn
);
914 struct mptcb
*mp_tp
= tptomptp(tp
);
917 * May happen, because the caller of this function does an soevent.
918 * Review after rdar://problem/24083886
923 NTOHL(dss_info
->mdss_subflow_seqn
);
924 NTOHS(dss_info
->mdss_data_len
);
925 mptcp_update_rcv_state_meat(mp_tp
, tp
,
927 dss_info
->mdss_subflow_seqn
,
928 dss_info
->mdss_data_len
,
/*
 * mptcp_validate_dss_map: sanity-check an inbound segment against the
 * DSS mapping stored in its pkthdr.  If the mbuf carries more payload
 * than the mapping covers, the DSS option is unacceptable: flag
 * TMPF_SND_MPFAIL and notify for fallback to TCP.
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 942 -> 945, 955 -> 959), including return statements and, per the
 * comment at orig. 945, a test-only fallback branch.  Verify against
 * upstream xnu.
 */
933 mptcp_validate_dss_map(struct socket
*so
, struct tcpcb
*tp
, struct mbuf
*m
,
936 u_int32_t sseq
, datalen
;
938 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
))
/* Relative subflow seq in the pkthdr; rebase onto the initial receive seq. */
941 sseq
= m
->m_pkthdr
.mp_rseq
+ tp
->irs
;
942 datalen
= m
->m_pkthdr
.mp_rlen
;
945 /* enable this to test TCP fallback post connection establishment */
946 if (SEQ_GT(sseq
, (tp
->irs
+1)))
947 datalen
= m
->m_pkthdr
.len
- hdrlen
- 1;
950 /* unacceptable DSS option, fallback to TCP */
951 if (m
->m_pkthdr
.len
> ((int) datalen
+ hdrlen
)) {
952 mptcplog((LOG_ERR
, "MPTCP Receiver: "
953 "%s: mbuf len %d, MPTCP expected %d",
954 __func__
, m
->m_pkthdr
.len
, datalen
),
955 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
959 tp
->t_mpflags
|= TMPF_SND_MPFAIL
;
960 mptcp_notify_mpfail(so
);
/*
 * mptcp_input_preproc: per-segment receive preprocessing -- validate
 * the DSS checksum, insert the receive mapping into the mbuf pkthdr
 * (mptcp_insert_rmap), then validate the DSS map.
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 968 -> 971, 972 -> 979), including the early-return bodies and the
 * trailing arguments of the mptcp_validate_dss_map() call.  Verify
 * against upstream xnu.
 */
966 mptcp_input_preproc(struct tcpcb
*tp
, struct mbuf
*m
, int drop_hdrlen
)
968 if (mptcp_validate_csum(tp
, m
, drop_hdrlen
) != 0)
971 mptcp_insert_rmap(tp
, m
);
972 if (mptcp_validate_dss_map(tp
->t_inpcb
->inp_socket
, tp
, m
,
979 * MPTCP Checksum support
980 * The checksum is calculated whenever the MPTCP DSS option is included
981 * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
982 * header and the actual data indicated by the length specified in the
/*
 * mptcp_validate_csum: wrapper around mptcp_input_csum().  On a bad
 * checksum it arms MP_FAIL (TMPF_SND_MPFAIL), drops the embedded DSN
 * flag, notifies the MPTCP layer and bumps tcps_mp_badcsum.
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 990 -> 992, 994 -> 996), including the mptcp_csum test guarding this
 * error path and the return statements.  Verify against upstream xnu.
 */
987 mptcp_validate_csum(struct tcpcb
*tp
, struct mbuf
*m
, int drop_hdrlen
)
989 uint16_t mptcp_csum
= 0;
990 mptcp_csum
= mptcp_input_csum(tp
, m
, drop_hdrlen
);
992 tp
->t_mpflags
|= TMPF_SND_MPFAIL
;
993 tp
->t_mpflags
&= ~TMPF_EMBED_DSN
;
994 mptcp_notify_mpfail(tp
->t_inpcb
->inp_socket
);
996 tcpstat
.tcps_mp_badcsum
++;
/*
 * mptcp_input_csum: compute the DSS checksum over the received mapping.
 * Skips the computation (returning early) when checksums are disabled,
 * no DSN is embedded, the connection has fallen back to TCP, or the
 * mbuf is shorter than the claimed mapping.  Otherwise sums the payload
 * and folds in the DSS pseudo-header (64-bit DSN, subflow seq,
 * len + csum); a correct packet yields a return of 0.
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 1005 -> 1015, 1038 -> 1040), including the local declarations of
 * sum/dsn/sseq/len/csum, the early-return bodies and the carry-fold.
 * Verify against upstream xnu.
 */
1003 mptcp_input_csum(struct tcpcb
*tp
, struct mbuf
*m
, int off
)
1005 struct mptcb
*mp_tp
= tptomptp(tp
);
1015 if (!(mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
))
1018 if (!(tp
->t_mpflags
& TMPF_EMBED_DSN
))
1021 if (tp
->t_mpflags
& TMPF_TCP_FALLBACK
)
1025 * The remote side may send a packet with fewer bytes than the
1026 * claimed DSS checksum length.
1028 if ((int)m_length2(m
, NULL
) < (off
+ tp
->t_rcv_map
.mpt_len
))
1031 if (tp
->t_rcv_map
.mpt_len
!= 0)
1032 sum
= m_sum16(m
, off
, tp
->t_rcv_map
.mpt_len
);
/* DSS pseudo-header: 64-bit DSN + subflow seq + (length, checksum). */
1034 dsn
= mptcp_hton64(tp
->t_rcv_map
.mpt_dsn
);
1035 sseq
= htonl(tp
->t_rcv_map
.mpt_sseq
);
1036 len
= htons(tp
->t_rcv_map
.mpt_len
);
1037 csum
= tp
->t_rcv_map
.mpt_csum
;
1038 sum
+= in_pseudo64(dsn
, sseq
, (len
+ csum
));
1040 DTRACE_MPTCP3(checksum__result
, struct tcpcb
*, tp
, struct mbuf
*, m
,
1042 mptcplog((LOG_DEBUG
, "MPTCP Receiver: sum = %x \n", sum
),
1043 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
1044 return (~sum
& 0xffff);
/*
 * mptcp_output_csum: compute the DSS checksum for an outgoing segment
 * and store it into the DSS option (csump points just past the subflow
 * sequence number field that sseqp addresses).  No-op when DSS
 * checksums are disabled.
 *
 * NOTE(review): extraction dropped lines (orig. numbering jumps
 * 1061 -> 1068, 1073 -> 1076), including the declarations of
 * sum/sseq/dss_len/csum, the carry-fold, and the store through csump.
 * Verify against upstream xnu.
 */
1048 mptcp_output_csum(struct tcpcb
*tp
, struct mbuf
*m
, int32_t len
,
1049 unsigned hdrlen
, u_int64_t dss_val
, u_int32_t
*sseqp
)
1051 struct mptcb
*mp_tp
= tptomptp(tp
);
1056 uint16_t *csump
= NULL
;
1061 if (!(mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
))
1068 sum
= m_sum16(m
, hdrlen
, len
);
1070 dss_val
= mptcp_hton64(dss_val
);
/* The 16-bit data length sits one u_int32_t past the subflow seq field. */
1072 dss_len
= *(uint16_t *)(void *)((u_char
*)sseqp
+ sizeof (u_int32_t
));
1073 sum
+= in_pseudo64(dss_val
, sseq
, (dss_len
+ csum
));
1076 sum
= ~sum
& 0xffff;
1077 csump
= (uint16_t *)(void *)((u_char
*)sseqp
+ sizeof (u_int32_t
) +
1079 DTRACE_MPTCP3(checksum__result
, struct tcpcb
*, tp
, struct mbuf
*, m
,
1082 mptcplog((LOG_DEBUG
, "MPTCP Sender: sum = %x \n", sum
),
1083 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
1087 * When WiFi signal starts fading, there's more loss and RTT spikes.
1088 * Check if there has been a large spike by comparing against
1089 * a tolerable RTT spike threshold.
1092 mptcp_no_rto_spike(struct socket
*so
)
1094 struct tcpcb
*tp
= intotcpcb(sotoinpcb(so
));
1097 if (tp
->t_rxtcur
> MAX(mptcp_rtothresh
, MPTCP_RTO_MINTHRESH
)) {
1098 spike
= tp
->t_rxtcur
- mptcp_rtothresh
;
1100 mptcplog((LOG_DEBUG
, "MPTCP Socket: %s: spike = %d rto = %d"
1101 "best = %d cur = %d\n", __func__
, spike
,
1102 tp
->t_rxtcur
, tp
->t_rttbest
>> TCP_RTT_SHIFT
,
1104 (MPTCP_SOCKET_DBG
|MPTCP_SENDER_DBG
), MPTCP_LOGLVL_LOG
);