/*
 * Copyright (c) 2012-2015 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
33 #include <sys/mcache.h>
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 #include <sys/syslog.h>
37 #include <sys/protosw.h>
39 #include <kern/zalloc.h>
40 #include <kern/locks.h>
42 #include <mach/thread_act.h>
45 #include <dev/random/randomdev.h>
48 #include <netinet/in.h>
49 #include <netinet/in_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_seq.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/mptcp_var.h>
55 #include <netinet/mptcp.h>
56 #include <netinet/mptcp_seq.h>
57 #include <netinet/mptcp_opt.h>
58 #include <netinet/mptcp_timer.h>
61 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, enable
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
62 &mptcp_enable
, 0, "Enable Multipath TCP Support");
64 /* Number of times to try negotiating MPTCP on SYN retransmissions */
65 int mptcp_mpcap_retries
= MPTCP_CAPABLE_RETRIES
;
66 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, mptcp_cap_retr
,
67 CTLFLAG_RW
| CTLFLAG_LOCKED
,
68 &mptcp_mpcap_retries
, 0, "Number of MP Capable SYN Retries");
71 * By default, DSS checksum is turned off, revisit if we ever do
72 * MPTCP for non SSL Traffic.
74 int mptcp_dss_csum
= 0;
75 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, dss_csum
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
76 &mptcp_dss_csum
, 0, "Enable DSS checksum");
79 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
80 * is attempted on a different path.
82 int mptcp_fail_thresh
= 1;
83 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, fail
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
84 &mptcp_fail_thresh
, 0, "Failover threshold");
88 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
89 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
90 * Some carrier networks have a timeout of 10 or 15 minutes.
92 int mptcp_subflow_keeptime
= 60*14;
93 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, keepalive
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
94 &mptcp_subflow_keeptime
, 0, "Keepalive in seconds");
99 int mptcp_mpprio_enable
= 1;
100 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, mpprio
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
101 &mptcp_mpprio_enable
, 0, "Enable MP_PRIO option");
104 * REMOVE_ADDR option.
106 int mptcp_remaddr_enable
= 1;
107 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, remaddr
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
108 &mptcp_remaddr_enable
, 0, "Enable REMOVE_ADDR option");
113 int mptcp_fastjoin
= 1;
114 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, fastjoin
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
115 &mptcp_fastjoin
, 0, "Enable FastJoin Option");
117 int mptcp_zerortt_fastjoin
= 0;
118 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, zerortt_fastjoin
, CTLFLAG_RW
|
119 CTLFLAG_LOCKED
, &mptcp_zerortt_fastjoin
, 0,
120 "Enable Zero RTT Fast Join");
123 * R/W Notification on resume
125 int mptcp_rwnotify
= 0;
126 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rwnotify
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
127 &mptcp_rwnotify
, 0, "Enable RW notify on resume");
130 * Using RTT history for sending new data
132 int mptcp_use_rtthist
= 1;
133 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rtthist
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
134 &mptcp_use_rtthist
, 0, "Disable RTT History");
136 #define MPTCP_RTTHIST_MINTHRESH 500
137 int mptcp_rtthist_rtthresh
= 600;
138 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rtthist_thresh
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
139 &mptcp_rtthist_rtthresh
, 0, "Rtt threshold");
142 * Use RTO history for sending new data
144 int mptcp_use_rto
= 1;
145 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, userto
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
146 &mptcp_use_rto
, 0, "Disable RTO for subflow selection");
148 #define MPTCP_RTO_MINTHRESH 1000
149 int mptcp_rtothresh
= 1500;
150 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, rto_thresh
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
151 &mptcp_rtothresh
, 0, "RTO threshold");
154 * Use server's chosen path for sending new data
156 int mptcp_peerswitch
= 1;
157 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, use_peer
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
158 &mptcp_peerswitch
, 0, "Use peer");
160 #define MPTCP_PEERSWITCH_CNTMIN 3
161 uint32_t mptcp_peerswitch_cnt
= 3;
162 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, peerswitchno
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
163 &mptcp_peerswitch_cnt
, 0, "Set threshold based on peer's data arrival");
166 * Probe the preferred path, when it is not in use
168 #define MPTCP_PROBETO_MIN 500
169 uint32_t mptcp_probeto
= 1000;
170 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, probeto
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
171 &mptcp_probeto
, 0, "Disable probing by setting to 0");
173 #define MPTCP_PROBE_MX 15
174 uint32_t mptcp_probecnt
= 5;
175 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, probecnt
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
176 &mptcp_probecnt
, 0, "Number of probe writes");
179 * Static declarations
181 static int mptcp_validate_csum(struct tcpcb
*, struct mbuf
*, int);
182 static uint16_t mptcp_input_csum(struct tcpcb
*, struct mbuf
*, int);
185 * MPTCP input, called when data has been read from a subflow socket.
188 mptcp_input(struct mptses
*mpte
, struct mbuf
*m
)
190 struct socket
*mp_so
;
191 struct mptcb
*mp_tp
= NULL
;
193 u_int32_t mb_datalen
;
195 struct mbuf
*save
= NULL
, *prev
= NULL
;
196 struct mbuf
*freelist
= NULL
, *tail
= NULL
;
197 boolean_t in_fallback
= FALSE
;
199 VERIFY(m
->m_flags
& M_PKTHDR
);
201 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
202 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
207 * Each mbuf contains MPTCP Data Sequence Map
208 * Process the data for reassembly, delivery to MPTCP socket
212 count
= mp_so
->so_rcv
.sb_cc
;
215 mp_tp
= mpte
->mpte_mptcb
;
216 VERIFY(mp_tp
!= NULL
);
218 /* Ok to check for this flag without lock as its set in this thread */
219 in_fallback
= (mp_tp
->mpt_flags
& MPTCPF_FALLBACK_TO_TCP
);
222 * In the degraded fallback case, data is accepted without DSS map
227 * assume degraded flow as this may be the first packet
228 * without DSS, and the subflow state is not updated yet.
230 if (sbappendstream(&mp_so
->so_rcv
, m
))
232 DTRACE_MPTCP5(receive__degraded
, struct mbuf
*, m
,
233 struct socket
*, mp_so
,
234 struct sockbuf
*, &mp_so
->so_rcv
,
235 struct sockbuf
*, &mp_so
->so_snd
,
236 struct mptses
*, mpte
);
237 count
= mp_so
->so_rcv
.sb_cc
- count
;
238 mptcplog((LOG_DEBUG
, "MPTCP Receiver: Fallback read %d bytes\n",
239 count
), MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
245 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
246 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
)) {
253 * A single TCP packet formed of multiple mbufs
254 * holds DSS mapping in the first mbuf of the chain.
255 * Other mbufs in the chain may have M_PKTHDR set
256 * even though they belong to the same TCP packet
257 * and therefore use the DSS mapping stored in the
258 * first mbuf of the mbuf chain. mptcp_input() can
259 * get an mbuf chain with multiple TCP packets.
261 while (save
&& (!(save
->m_flags
& M_PKTHDR
) ||
262 !(save
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
))) {
271 mb_dsn
= m
->m_pkthdr
.mp_dsn
;
272 mb_datalen
= m
->m_pkthdr
.mp_rlen
;
274 if (MPTCP_SEQ_GT(mb_dsn
, mp_tp
->mpt_rcvatmark
)) {
275 tcpstat
.tcps_mp_oodata
++;
280 * Reassembly queue support here in future. Per spec,
281 * senders must implement retransmission timer to
282 * retransmit unacked data. Dropping out of order
283 * gives a slight hit on performance but allows us to
284 * deploy MPTCP and protects us against in-window DoS
285 * attacks that attempt to use up memory by sending
286 * out of order data. When doing load sharing across
287 * subflows, out of order support is a must.
291 if (MPTCP_SEQ_LT(mb_dsn
, mp_tp
->mpt_rcvatmark
)) {
292 if (MPTCP_SEQ_LEQ((mb_dsn
+ mb_datalen
),
293 mp_tp
->mpt_rcvatmark
)) {
294 if (freelist
== NULL
)
308 m_adj(m
, (mp_tp
->mpt_rcvatmark
- mb_dsn
));
310 mptcplog((LOG_INFO
, "MPTCP Receiver: Left Edge %llu\n",
311 mp_tp
->mpt_rcvatmark
),
312 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
316 if (sbappendstream(&mp_so
->so_rcv
, m
)) {
319 DTRACE_MPTCP6(receive
, struct mbuf
*, m
, struct socket
*, mp_so
,
320 struct sockbuf
*, &mp_so
->so_rcv
,
321 struct sockbuf
*, &mp_so
->so_snd
,
322 struct mptses
*, mpte
,
323 struct mptcb
*, mp_tp
);
325 count
= mp_so
->so_rcv
.sb_cc
- count
;
326 tcpstat
.tcps_mp_rcvtotal
++;
327 tcpstat
.tcps_mp_rcvbytes
+= count
;
328 mptcplog((LOG_DEBUG
, "MPTCP Receiver: Read %d bytes\n", count
),
329 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
332 * The data received at the MPTCP layer will never exceed the
333 * receive window because anything to the right of the
334 * receive window will be trimmed at the subflow level.
336 mp_tp
->mpt_rcvwnd
= mptcp_sbspace(mp_tp
);
337 mp_tp
->mpt_rcvatmark
+= count
;
340 count
= mp_so
->so_rcv
.sb_cc
;
352 mptcp_output(struct mptses
*mpte
)
355 struct mptsub
*mpts_tried
= NULL
;
356 struct socket
*mp_so
;
357 struct mptsub
*preferred_mpts
= NULL
;
360 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
361 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
362 if (mp_so
->so_state
& SS_CANTSENDMORE
) {
363 mptcplog((LOG_DEBUG
, "MPTCP Sender: cantsendmore\n"),
364 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
369 /* get the "best" subflow to be used for transmission */
370 mpts
= mptcp_get_subflow(mpte
, NULL
, &preferred_mpts
);
372 mptcplog((LOG_ERR
, "MPTCP Sender: mp_so 0x%llx no subflow\n",
373 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
)),
374 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
378 mptcplog((LOG_DEBUG
, "MPTCP Sender: mp_so 0x%llx using cid %d \n",
379 (uint64_t)VM_KERNEL_ADDRPERM(mp_so
), mpts
->mpts_connid
),
380 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
382 /* In case there's just one flow, we reattempt later */
384 if ((mpts_tried
!= NULL
) && ((mpts
== mpts_tried
) ||
385 (mpts
->mpts_flags
& MPTSF_FAILINGOVER
))) {
387 MPTS_LOCK(mpts_tried
);
388 mpts_tried
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
389 mpts_tried
->mpts_flags
|= MPTSF_ACTIVE
;
390 MPTS_UNLOCK(mpts_tried
);
391 mptcp_start_timer(mpte
, MPTT_REXMT
);
392 mptcplog((LOG_DEBUG
, "MPTCP Sender: mp_so 0x%llx retry later\n",
393 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
)),
394 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
398 DTRACE_MPTCP3(output
, struct mptses
*, mpte
, struct mptsub
*, mpts
,
399 struct socket
*, mp_so
);
400 error
= mptcp_subflow_output(mpte
, mpts
);
402 /* can be a temporary loss of source address or other error */
403 mpts
->mpts_flags
|= MPTSF_FAILINGOVER
;
404 mpts
->mpts_flags
&= ~MPTSF_ACTIVE
;
407 mptcplog((LOG_INFO
, "MPTCP Sender: Error = %d \n", error
),
408 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
411 /* The model is to have only one active flow at a time */
412 mpts
->mpts_flags
|= MPTSF_ACTIVE
;
413 mpts
->mpts_probesoon
= mpts
->mpts_probecnt
= 0;
416 /* Allows us to update the smoothed rtt */
417 if ((mptcp_probeto
) && (mptcp_probeto
>= MPTCP_PROBETO_MIN
) &&
418 (mpts
!= preferred_mpts
) && (preferred_mpts
!= NULL
)) {
419 MPTS_LOCK(preferred_mpts
);
420 if (preferred_mpts
->mpts_probesoon
) {
421 if ((tcp_now
- preferred_mpts
->mpts_probesoon
) >
423 (void) mptcp_subflow_output(mpte
, preferred_mpts
);
424 if (preferred_mpts
->mpts_probecnt
>=
425 MIN(mptcp_probecnt
, MPTCP_PROBE_MX
)) {
426 preferred_mpts
->mpts_probesoon
= 0;
427 preferred_mpts
->mpts_probecnt
= 0;
431 preferred_mpts
->mpts_probesoon
= tcp_now
;
432 preferred_mpts
->mpts_probecnt
= 0;
434 MPTS_UNLOCK(preferred_mpts
);
437 if (mpte
->mpte_active_sub
== NULL
) {
438 mpte
->mpte_active_sub
= mpts
;
439 } else if (mpte
->mpte_active_sub
!= mpts
) {
440 mptcplog((LOG_DEBUG
, "MPTCP Sender: switch [cid %d, srtt %d]"
441 "to [cid %d, srtt %d]\n",
442 mpte
->mpte_active_sub
->mpts_connid
,
443 mpte
->mpte_active_sub
->mpts_srtt
>> 5,
445 mpts
->mpts_srtt
>> 5),
446 MPTCP_SENDER_DBG
| MPTCP_SOCKET_DBG
, MPTCP_LOGLVL_LOG
);
448 MPTS_LOCK(mpte
->mpte_active_sub
);
449 mpte
->mpte_active_sub
->mpts_flags
&= ~MPTSF_ACTIVE
;
450 mpts
->mpts_peerswitch
= 0;
451 MPTS_UNLOCK(mpte
->mpte_active_sub
);
452 mpte
->mpte_active_sub
= mpts
;
453 tcpstat
.tcps_mp_switches
++;
456 /* subflow errors should not be percolated back up */
461 * Return the most eligible subflow to be used for sending data.
462 * This function also serves to check if any alternate subflow is available
463 * or not. best and second_best flows are chosen by their priority. third_best
464 * could be best or second_best but is under loss at the time of evaluation.
467 mptcp_get_subflow(struct mptses
*mpte
, struct mptsub
*ignore
, struct mptsub
**preferred
)
470 struct mptsub
*best
= NULL
;
471 struct mptsub
*second_best
= NULL
;
472 struct mptsub
*third_best
= NULL
;
473 struct mptsub
*symptoms_best
= NULL
;
474 struct socket
*so
= NULL
;
476 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
478 TAILQ_FOREACH(mpts
, &mpte
->mpte_subflows
, mpts_entry
) {
481 if ((ignore
) && (mpts
== ignore
)) {
486 /* There can only be one subflow in degraded state */
487 if (mpts
->mpts_flags
& MPTSF_MP_DEGRADED
) {
494 * Subflows with Fastjoin allow data to be written before
495 * the subflow is mp capable.
497 if (!(mpts
->mpts_flags
& MPTSF_MP_CAPABLE
) &&
498 !(mpts
->mpts_flags
& MPTSF_FASTJ_REQD
)) {
503 if (mpts
->mpts_flags
& MPTSF_SUSPENDED
) {
508 if ((mpts
->mpts_flags
& MPTSF_DISCONNECTED
) ||
509 (mpts
->mpts_flags
& MPTSF_DISCONNECTING
)) {
514 if (mpts
->mpts_flags
& MPTSF_FAILINGOVER
) {
515 so
= mpts
->mpts_socket
;
516 if ((so
) && (!(so
->so_flags
& SOF_PCBCLEARING
))) {
518 if ((so
->so_snd
.sb_cc
== 0) &&
519 (mptcp_no_rto_spike(so
))) {
520 mpts
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
521 so
->so_flags
&= ~SOF_MP_TRYFAILOVER
;
522 socket_unlock(so
, 1);
525 mptcplog((LOG_DEBUG
, "MPTCP Sender: "
526 "%s cid %d in failover\n",
527 __func__
, third_best
->mpts_connid
),
529 MPTCP_LOGLVL_VERBOSE
);
530 socket_unlock(so
, 1);
540 /* When there are no preferred flows, use first one in list */
541 if ((!second_best
) && !(mpts
->mpts_flags
& MPTSF_PREFERRED
))
544 if (mpts
->mpts_flags
& MPTSF_PREFERRED
) {
552 * If there is no preferred or backup subflow, and there is no active
553 * subflow use the last usable subflow.
556 return (second_best
? second_best
: third_best
);
559 if (second_best
== NULL
) {
560 return (best
? best
: third_best
);
563 if (preferred
!= NULL
)
566 /* Use a hint from symptomsd if it exists */
567 symptoms_best
= mptcp_use_symptoms_hints(best
, second_best
);
568 if (symptoms_best
!= NULL
)
569 return (symptoms_best
);
571 /* Compare RTTs, select second_best if best's rtt exceeds rttthresh */
572 if ((mptcp_use_rtthist
) &&
573 (best
->mpts_srtt
) && (second_best
->mpts_srtt
) &&
574 (best
->mpts_srtt
> second_best
->mpts_srtt
) &&
575 (best
->mpts_srtt
>= MAX((MPTCP_RTTHIST_MINTHRESH
<< 5),
576 (mptcp_rtthist_rtthresh
<< 5)))) {
577 tcpstat
.tcps_mp_sel_rtt
++;
578 mptcplog((LOG_DEBUG
, "MPTCP Sender: %s best cid %d"
579 " at rtt %d, second cid %d at rtt %d\n", __func__
,
580 best
->mpts_connid
, best
->mpts_srtt
>> 5,
581 second_best
->mpts_connid
,
582 second_best
->mpts_srtt
>> 5),
583 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
584 return (second_best
);
587 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
588 if ((mptcp_use_rto
) &&
589 (best
->mpts_rxtcur
) && (second_best
->mpts_rxtcur
) &&
590 (best
->mpts_rxtcur
> second_best
->mpts_rxtcur
) &&
591 (best
->mpts_rxtcur
>=
592 MAX(MPTCP_RTO_MINTHRESH
, mptcp_rtothresh
))) {
593 tcpstat
.tcps_mp_sel_rto
++;
594 mptcplog((LOG_DEBUG
, "MPTCP Sender: %s best cid %d"
595 " at rto %d, second cid %d at rto %d\n", __func__
,
596 best
->mpts_connid
, best
->mpts_rxtcur
,
597 second_best
->mpts_connid
, second_best
->mpts_rxtcur
),
598 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_LOG
);
600 return (second_best
);
603 /* If second_best received data, use second_best */
604 if (mptcp_peerswitch
&&
605 (second_best
->mpts_peerswitch
>
606 MAX(MPTCP_PEERSWITCH_CNTMIN
, mptcp_peerswitch_cnt
))) {
607 tcpstat
.tcps_mp_sel_peer
++;
608 mptcplog((LOG_DEBUG
, "MPTCP Sender: %s: best cid %d"
609 " but using cid %d after receiving %d segments\n",
610 __func__
, best
->mpts_connid
, second_best
->mpts_connid
,
611 second_best
->mpts_peerswitch
), MPTCP_SENDER_DBG
,
613 return (second_best
);
619 mptcp_get_pending_subflow(struct mptses
*mpte
, struct mptsub
*ignore
)
621 struct mptsub
*mpts
= NULL
;
623 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
625 TAILQ_FOREACH(mpts
, &mpte
->mpte_subflows
, mpts_entry
) {
628 if ((ignore
) && (mpts
== ignore
)) {
633 if (mpts
->mpts_flags
& MPTSF_CONNECT_PENDING
) {
644 mptcp_event_to_str(uint32_t event
)
646 const char *c
= "UNDEFINED";
651 case MPCE_RECV_DATA_ACK
:
652 c
= "MPCE_RECV_DATA_ACK";
654 case MPCE_RECV_DATA_FIN
:
655 c
= "MPCE_RECV_DATA_FIN";
662 mptcp_state_to_str(mptcp_state_t state
)
664 const char *c
= "UNDEFINED";
672 case MPTCPS_ESTABLISHED
:
673 c
= "MPTCPS_ESTABLISHED";
675 case MPTCPS_CLOSE_WAIT
:
676 c
= "MPTCPS_CLOSE_WAIT";
678 case MPTCPS_FIN_WAIT_1
:
679 c
= "MPTCPS_FIN_WAIT_1";
682 c
= "MPTCPS_CLOSING";
684 case MPTCPS_LAST_ACK
:
685 c
= "MPTCPS_LAST_ACK";
687 case MPTCPS_FIN_WAIT_2
:
688 c
= "MPTCPS_FIN_WAIT_2";
690 case MPTCPS_TIME_WAIT
:
691 c
= "MPTCPS_TIME_WAIT";
693 case MPTCPS_FASTCLOSE_WAIT
:
694 c
= "MPTCPS_FASTCLOSE_WAIT";
696 case MPTCPS_TERMINATE
:
697 c
= "MPTCPS_TERMINATE";
704 mptcp_close_fsm(struct mptcb
*mp_tp
, uint32_t event
)
706 MPT_LOCK_ASSERT_HELD(mp_tp
);
707 mptcp_state_t old_state
= mp_tp
->mpt_state
;
709 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
712 switch (mp_tp
->mpt_state
) {
715 mp_tp
->mpt_state
= MPTCPS_CLOSED
;
718 case MPTCPS_ESTABLISHED
:
719 if (event
== MPCE_CLOSE
) {
720 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_1
;
721 mp_tp
->mpt_sndmax
+= 1; /* adjust for Data FIN */
723 else if (event
== MPCE_RECV_DATA_FIN
) {
724 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
725 mp_tp
->mpt_state
= MPTCPS_CLOSE_WAIT
;
729 case MPTCPS_CLOSE_WAIT
:
730 if (event
== MPCE_CLOSE
) {
731 mp_tp
->mpt_state
= MPTCPS_LAST_ACK
;
732 mp_tp
->mpt_sndmax
+= 1; /* adjust for Data FIN */
736 case MPTCPS_FIN_WAIT_1
:
737 if (event
== MPCE_RECV_DATA_ACK
)
738 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_2
;
739 else if (event
== MPCE_RECV_DATA_FIN
) {
740 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
741 mp_tp
->mpt_state
= MPTCPS_CLOSING
;
746 if (event
== MPCE_RECV_DATA_ACK
)
747 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
750 case MPTCPS_LAST_ACK
:
751 if (event
== MPCE_RECV_DATA_ACK
)
752 mp_tp
->mpt_state
= MPTCPS_TERMINATE
;
755 case MPTCPS_FIN_WAIT_2
:
756 if (event
== MPCE_RECV_DATA_FIN
) {
757 mp_tp
->mpt_rcvnxt
+= 1; /* adj remote data FIN */
758 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
762 case MPTCPS_TIME_WAIT
:
765 case MPTCPS_FASTCLOSE_WAIT
:
766 if (event
== MPCE_CLOSE
) {
767 /* no need to adjust for data FIN */
768 mp_tp
->mpt_state
= MPTCPS_TERMINATE
;
771 case MPTCPS_TERMINATE
:
777 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
779 mptcplog((LOG_INFO
, "MPTCP State: %s to %s on event %s\n",
780 mptcp_state_to_str(old_state
),
781 mptcp_state_to_str(mp_tp
->mpt_state
),
782 mptcp_event_to_str(event
)),
783 MPTCP_STATE_DBG
, MPTCP_LOGLVL_LOG
);
787 * Update the mptcb send state variables, but the actual sbdrop occurs
791 mptcp_data_ack_rcvd(struct mptcb
*mp_tp
, struct tcpcb
*tp
, u_int64_t full_dack
)
795 acked
= full_dack
- mp_tp
->mpt_snduna
;
798 mp_tp
->mpt_snduna
+= acked
;
799 /* In degraded mode, we may get some Data ACKs */
800 if ((tp
->t_mpflags
& TMPF_TCP_FALLBACK
) &&
801 !(mp_tp
->mpt_flags
& MPTCPF_POST_FALLBACK_SYNC
) &&
802 MPTCP_SEQ_GT(mp_tp
->mpt_sndnxt
, mp_tp
->mpt_snduna
)) {
803 /* bring back sndnxt to retransmit MPTCP data */
804 mp_tp
->mpt_sndnxt
= mp_tp
->mpt_dsn_at_csum_fail
;
805 mp_tp
->mpt_flags
|= MPTCPF_POST_FALLBACK_SYNC
;
806 tp
->t_inpcb
->inp_socket
->so_flags1
|=
807 SOF1_POST_FALLBACK_SYNC
;
810 if ((full_dack
== mp_tp
->mpt_sndmax
) &&
811 (mp_tp
->mpt_state
>= MPTCPS_FIN_WAIT_1
)) {
812 mptcp_close_fsm(mp_tp
, MPCE_RECV_DATA_ACK
);
813 tp
->t_mpflags
&= ~TMPF_SEND_DFIN
;
817 /* If you change this function, match up mptcp_update_rcv_state_f */
819 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt
*dss_info
, struct tcpcb
*tp
,
822 struct mptcb
*mp_tp
= tptomptp(tp
);
823 u_int64_t full_dsn
= 0;
825 NTOHL(dss_info
->mdss_dsn
);
826 NTOHL(dss_info
->mdss_subflow_seqn
);
827 NTOHS(dss_info
->mdss_data_len
);
829 /* XXX for autosndbuf grow sb here */
831 MPTCP_EXTEND_DSN(mp_tp
->mpt_rcvnxt
, dss_info
->mdss_dsn
, full_dsn
);
833 mptcp_update_rcv_state_meat(mp_tp
, tp
,
834 full_dsn
, dss_info
->mdss_subflow_seqn
, dss_info
->mdss_data_len
,
840 mptcp_update_rcv_state_meat(struct mptcb
*mp_tp
, struct tcpcb
*tp
,
841 u_int64_t full_dsn
, u_int32_t seqn
, u_int16_t mdss_data_len
,
844 if (mdss_data_len
== 0) {
845 mptcplog((LOG_INFO
, "MPTCP Receiver: Infinite Mapping.\n"),
846 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
848 if ((mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
) && (csum
!= 0)) {
849 mptcplog((LOG_ERR
, "MPTCP Receiver: Bad checksum %x \n",
850 csum
), MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_ERR
);
852 mptcp_notify_mpfail(tp
->t_inpcb
->inp_socket
);
857 "MPTCP Receiver: seqn = %x len = %x full = %llx "
859 seqn
, mdss_data_len
, full_dsn
, mp_tp
->mpt_rcvnxt
),
860 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
862 /* Process a Data FIN packet , handled in mptcp_do_fin_opt */
863 if ((seqn
== 0) && (mdss_data_len
== 1)) {
864 mptcplog((LOG_INFO
, "MPTCP Receiver: Data FIN in %s state \n",
865 mptcp_state_to_str(mp_tp
->mpt_state
)),
866 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
871 mptcp_notify_mpready(tp
->t_inpcb
->inp_socket
);
872 tp
->t_rcv_map
.mpt_dsn
= full_dsn
;
873 tp
->t_rcv_map
.mpt_sseq
= seqn
;
874 tp
->t_rcv_map
.mpt_len
= mdss_data_len
;
875 tp
->t_rcv_map
.mpt_csum
= csum
;
876 tp
->t_mpflags
|= TMPF_EMBED_DSN
;
881 mptcp_update_rcv_state_f(struct mptcp_dss_ack_opt
*dss_info
, struct tcpcb
*tp
,
884 u_int64_t full_dsn
= 0;
885 struct mptcb
*mp_tp
= tptomptp(tp
);
887 NTOHL(dss_info
->mdss_dsn
);
888 NTOHL(dss_info
->mdss_subflow_seqn
);
889 NTOHS(dss_info
->mdss_data_len
);
891 MPTCP_EXTEND_DSN(mp_tp
->mpt_rcvnxt
, dss_info
->mdss_dsn
, full_dsn
);
893 mptcp_update_rcv_state_meat(mp_tp
, tp
,
895 dss_info
->mdss_subflow_seqn
,
896 dss_info
->mdss_data_len
,
901 mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt
*dss_info
,
902 struct tcpcb
*tp
, uint16_t csum
)
904 u_int64_t dsn
= mptcp_ntoh64(dss_info
->mdss_dsn
);
905 struct mptcb
*mp_tp
= tptomptp(tp
);
907 NTOHL(dss_info
->mdss_subflow_seqn
);
908 NTOHS(dss_info
->mdss_data_len
);
909 mptcp_update_rcv_state_meat(mp_tp
, tp
,
911 dss_info
->mdss_subflow_seqn
,
912 dss_info
->mdss_data_len
,
917 mptcp_validate_dss_map(struct socket
*so
, struct tcpcb
*tp
, struct mbuf
*m
,
920 u_int32_t sseq
, datalen
;
922 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
))
925 sseq
= m
->m_pkthdr
.mp_rseq
+ tp
->irs
;
926 datalen
= m
->m_pkthdr
.mp_rlen
;
929 /* enable this to test TCP fallback post connection establishment */
930 if (SEQ_GT(sseq
, (tp
->irs
+1)))
931 datalen
= m
->m_pkthdr
.len
- hdrlen
- 1;
934 /* unacceptable DSS option, fallback to TCP */
935 if (m
->m_pkthdr
.len
> ((int) datalen
+ hdrlen
)) {
936 mptcplog((LOG_ERR
, "MPTCP Receiver: "
937 "%s: mbuf len %d, MPTCP expected %d",
938 __func__
, m
->m_pkthdr
.len
, datalen
),
939 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_LOG
);
943 tp
->t_mpflags
|= TMPF_SND_MPFAIL
;
944 mptcp_notify_mpfail(so
);
950 mptcp_input_preproc(struct tcpcb
*tp
, struct mbuf
*m
, int drop_hdrlen
)
952 if (mptcp_validate_csum(tp
, m
, drop_hdrlen
) != 0)
955 mptcp_insert_rmap(tp
, m
);
956 if (mptcp_validate_dss_map(tp
->t_inpcb
->inp_socket
, tp
, m
,
963 * MPTCP Checksum support
964 * The checksum is calculated whenever the MPTCP DSS option is included
965 * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
966 * header and the actual data indicated by the length specified in the
971 mptcp_validate_csum(struct tcpcb
*tp
, struct mbuf
*m
, int drop_hdrlen
)
973 uint16_t mptcp_csum
= 0;
974 mptcp_csum
= mptcp_input_csum(tp
, m
, drop_hdrlen
);
976 tp
->t_mpflags
|= TMPF_SND_MPFAIL
;
977 tp
->t_mpflags
&= ~TMPF_EMBED_DSN
;
978 mptcp_notify_mpfail(tp
->t_inpcb
->inp_socket
);
980 tcpstat
.tcps_mp_badcsum
++;
987 mptcp_input_csum(struct tcpcb
*tp
, struct mbuf
*m
, int off
)
989 struct mptcb
*mp_tp
= tptomptp(tp
);
999 if (!(mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
))
1002 if (!(tp
->t_mpflags
& TMPF_EMBED_DSN
))
1005 if (tp
->t_mpflags
& TMPF_TCP_FALLBACK
)
1009 * The remote side may send a packet with fewer bytes than the
1010 * claimed DSS checksum length.
1012 if ((int)m_length2(m
, NULL
) < (off
+ tp
->t_rcv_map
.mpt_len
))
1015 if (tp
->t_rcv_map
.mpt_len
!= 0)
1016 sum
= m_sum16(m
, off
, tp
->t_rcv_map
.mpt_len
);
1018 dsn
= mptcp_hton64(tp
->t_rcv_map
.mpt_dsn
);
1019 sseq
= htonl(tp
->t_rcv_map
.mpt_sseq
);
1020 len
= htons(tp
->t_rcv_map
.mpt_len
);
1021 csum
= tp
->t_rcv_map
.mpt_csum
;
1022 sum
+= in_pseudo64(dsn
, sseq
, (len
+ csum
));
1024 DTRACE_MPTCP3(checksum__result
, struct tcpcb
*, tp
, struct mbuf
*, m
,
1026 mptcplog((LOG_DEBUG
, "MPTCP Receiver: sum = %x \n", sum
),
1027 MPTCP_RECEIVER_DBG
, MPTCP_LOGLVL_VERBOSE
);
1028 return (~sum
& 0xffff);
1032 mptcp_output_csum(struct tcpcb
*tp
, struct mbuf
*m
, int32_t len
,
1033 unsigned hdrlen
, u_int64_t dss_val
, u_int32_t
*sseqp
)
1035 struct mptcb
*mp_tp
= tptomptp(tp
);
1040 uint16_t *csump
= NULL
;
1045 if (!(mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
))
1052 sum
= m_sum16(m
, hdrlen
, len
);
1054 dss_val
= mptcp_hton64(dss_val
);
1056 dss_len
= *(uint16_t *)(void *)((u_char
*)sseqp
+ sizeof (u_int32_t
));
1057 sum
+= in_pseudo64(dss_val
, sseq
, (dss_len
+ csum
));
1060 sum
= ~sum
& 0xffff;
1061 csump
= (uint16_t *)(void *)((u_char
*)sseqp
+ sizeof (u_int32_t
) +
1063 DTRACE_MPTCP3(checksum__result
, struct tcpcb
*, tp
, struct mbuf
*, m
,
1066 mptcplog((LOG_DEBUG
, "MPTCP Sender: sum = %x \n", sum
),
1067 MPTCP_SENDER_DBG
, MPTCP_LOGLVL_VERBOSE
);
1071 * When WiFi signal starts fading, there's more loss and RTT spikes.
1072 * Check if there has been a large spike by comparing against
1073 * a tolerable RTT spike threshold.
1076 mptcp_no_rto_spike(struct socket
*so
)
1078 struct tcpcb
*tp
= intotcpcb(sotoinpcb(so
));
1081 if (tp
->t_rxtcur
> MAX(mptcp_rtothresh
, MPTCP_RTO_MINTHRESH
)) {
1082 spike
= tp
->t_rxtcur
- mptcp_rtothresh
;
1084 mptcplog((LOG_DEBUG
, "MPTCP Socket: %s: spike = %d rto = %d"
1085 "best = %d cur = %d\n", __func__
, spike
,
1086 tp
->t_rxtcur
, tp
->t_rttbest
>> TCP_RTT_SHIFT
,
1088 (MPTCP_SOCKET_DBG
|MPTCP_SENDER_DBG
), MPTCP_LOGLVL_LOG
);