/*
 * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
33 #include <sys/mcache.h>
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 #include <sys/syslog.h>
37 #include <sys/protosw.h>
39 #include <kern/zalloc.h>
40 #include <kern/locks.h>
42 #include <mach/thread_act.h>
45 #include <dev/random/randomdev.h>
48 #include <netinet/in.h>
49 #include <netinet/in_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_seq.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/mptcp_var.h>
55 #include <netinet/mptcp.h>
56 #include <netinet/mptcp_seq.h>
57 #include <netinet/mptcp_opt.h>
58 #include <netinet/mptcp_timer.h>
61 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, enable
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
62 &mptcp_enable
, 0, "Enable Multipath TCP Support");
65 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, debug
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
66 &mptcp_dbg
, 0, "Enable Multipath TCP Debugging");
68 /* Number of times to try negotiating MPTCP on SYN retransmissions */
69 int mptcp_mpcap_retries
= MPTCP_CAPABLE_RETRIES
;
70 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, mptcp_cap_retr
,
71 CTLFLAG_RW
| CTLFLAG_LOCKED
,
72 &mptcp_mpcap_retries
, 0, "Number of MP Capable SYN Retries");
75 * By default, DSS checksum is turned off, revisit if we ever do
76 * MPTCP for non SSL Traffic.
78 int mptcp_dss_csum
= 0;
79 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, dss_csum
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
80 &mptcp_dss_csum
, 0, "Enable DSS checksum");
83 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
84 * is attempted on a different path.
86 int mptcp_fail_thresh
= 1;
87 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, fail
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
88 &mptcp_fail_thresh
, 0, "Failover threshold");
92 * MPTCP subflows have TCP keepalives set to ON
94 int mptcp_subflow_keeptime
= 60;
95 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, keepalive
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
96 &mptcp_subflow_keeptime
, 0, "Keepalive in seconds");
101 int mptcp_mpprio_enable
= 1;
102 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, mpprio
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
103 &mptcp_mpprio_enable
, 0, "Enable MP_PRIO option");
106 * REMOVE_ADDR option.
108 int mptcp_remaddr_enable
= 1;
109 SYSCTL_INT(_net_inet_mptcp
, OID_AUTO
, remaddr
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
110 &mptcp_remaddr_enable
, 0, "Enable REMOVE_ADDR option");
113 * MPTCP input, called when data has been read from a subflow socket.
116 mptcp_input(struct mptses
*mpte
, struct mbuf
*m
)
118 struct socket
*mp_so
;
119 struct mptcb
*mp_tp
= NULL
;
121 u_int32_t mb_datalen
;
123 struct mbuf
*save
= NULL
;
124 struct mbuf
*freelist
= NULL
, *tail
= NULL
;
126 VERIFY(m
->m_flags
& M_PKTHDR
);
128 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
129 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
134 * Each mbuf contains MPTCP Data Sequence Map
135 * Process the data for reassembly, delivery to MPTCP socket
139 count
= mp_so
->so_rcv
.sb_cc
;
143 * In the degraded fallback case, data is accepted without DSS map
145 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
)) {
146 /* XXX need a check that this is indeed degraded */
147 if (sbappendstream(&mp_so
->so_rcv
, m
))
149 DTRACE_MPTCP5(receive__degraded
, struct mbuf
*, m
,
150 struct socket
*, mp_so
,
151 struct sockbuf
*, &mp_so
->so_rcv
,
152 struct sockbuf
*, &mp_so
->so_snd
,
153 struct mptses
*, mpte
);
154 count
= mp_so
->so_rcv
.sb_cc
- count
;
155 mptcplog3((LOG_DEBUG
, "%s: fread %d bytes\n", __func__
, count
));
159 mp_tp
= mpte
->mpte_mptcb
;
160 VERIFY(mp_tp
!= NULL
);
167 mb_dsn
= m
->m_pkthdr
.mp_dsn
;
168 mb_datalen
= m
->m_pkthdr
.mp_rlen
;
170 if (MPTCP_SEQ_GT(mb_dsn
, mp_tp
->mpt_rcvatmark
)) {
171 tcpstat
.tcps_mp_oodata
++;
176 * Reassembly queue support here in future. Per spec,
177 * senders must implement retransmission timer to
178 * retransmit unacked data. Dropping out of order
179 * gives a slight hit on performance but allows us to
180 * deploy MPTCP and protects us against in-window DoS
181 * attacks that attempt to use up memory by sending
182 * out of order data. When doing load sharing across
183 * subflows, out of order support is a must.
187 if (MPTCP_SEQ_LT(mb_dsn
, mp_tp
->mpt_rcvatmark
)) {
188 VERIFY(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
);
189 VERIFY(m
->m_flags
& M_PKTHDR
);
190 VERIFY(m
->m_len
>= (int)mb_datalen
);
191 VERIFY(m
->m_pkthdr
.len
>= (int)mb_datalen
);
192 if (MPTCP_SEQ_LEQ((mb_dsn
+ mb_datalen
),
193 mp_tp
->mpt_rcvatmark
)) {
194 if (freelist
== NULL
)
203 m_adj(m
, (mp_tp
->mpt_rcvatmark
- mb_dsn
));
205 mptcplog((LOG_INFO
, "%s: %llu %d 2 \n", __func__
,
206 mp_tp
->mpt_rcvatmark
, m
->m_pkthdr
.len
));
210 if (sbappendstream(&mp_so
->so_rcv
, m
)) {
213 DTRACE_MPTCP6(receive
, struct mbuf
*, m
, struct socket
*, mp_so
,
214 struct sockbuf
*, &mp_so
->so_rcv
,
215 struct sockbuf
*, &mp_so
->so_snd
,
216 struct mptses
*, mpte
,
217 struct mptcb
*, mp_tp
);
219 count
= mp_so
->so_rcv
.sb_cc
- count
;
220 tcpstat
.tcps_mp_rcvtotal
++;
221 tcpstat
.tcps_mp_rcvbytes
+= count
;
222 mptcplog3((LOG_DEBUG
, "%s: read %d bytes\n", __func__
, count
));
224 * The data received at the MPTCP layer will never exceed the
225 * receive window because anything to the right of the
226 * receive window will be trimmed at the subflow level.
228 mp_tp
->mpt_rcvwnd
= mptcp_sbspace(mp_tp
);
229 mp_tp
->mpt_rcvatmark
+= count
;
231 count
= mp_so
->so_rcv
.sb_cc
;
243 mptcp_output(struct mptses
*mpte
)
246 struct mptsub
*mpts_tried
= NULL
;
247 struct socket
*mp_so
;
250 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
251 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
252 if (mp_so
->so_state
& SS_CANTSENDMORE
) {
257 /* get the "best" subflow to be used for transmission */
258 mpts
= mptcp_get_subflow(mpte
, NULL
);
260 mptcplog((LOG_ERR
, "%s: mp_so 0x%llx has no usable subflow\n",
261 __func__
, (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
)));
265 mptcplog3((LOG_INFO
, "%s: mp_so 0x%llx cid %d \n", __func__
,
266 (uint64_t)VM_KERNEL_ADDRPERM(mp_so
), mpts
->mpts_connid
));
268 /* In case there's just one flow, we reattempt later */
270 if ((mpts_tried
!= NULL
) && ((mpts
== mpts_tried
) ||
271 (mpts
->mpts_flags
& MPTSF_FAILINGOVER
))) {
273 MPTS_LOCK(mpts_tried
);
274 mpts_tried
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
275 mpts_tried
->mpts_flags
|= MPTSF_ACTIVE
;
276 MPTS_UNLOCK(mpts_tried
);
277 MPT_LOCK(mpte
->mpte_mptcb
);
278 mptcp_start_timer(mpte
->mpte_mptcb
, MPTT_REXMT
);
279 MPT_UNLOCK(mpte
->mpte_mptcb
);
280 mptcplog((LOG_INFO
, "%s: mp_so 0x%llx retry later\n",
281 __func__
, (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
)));
285 DTRACE_MPTCP3(output
, struct mptses
*, mpte
, struct mptsub
*, mpts
,
286 struct socket
*, mp_so
);
287 error
= mptcp_subflow_output(mpte
, mpts
);
289 /* can be a temporary loss of source address or other error */
290 mpts
->mpts_flags
|= MPTSF_FAILINGOVER
;
291 mpts
->mpts_flags
&= ~MPTSF_ACTIVE
;
294 mptcplog((LOG_INFO
, "%s: error = %d \n", __func__
, error
));
297 /* The model is to have only one active flow at a time */
298 mpts
->mpts_flags
|= MPTSF_ACTIVE
;
300 if (mpte
->mpte_active_sub
== NULL
) {
301 mpte
->mpte_active_sub
= mpts
;
302 } else if (mpte
->mpte_active_sub
!= mpts
) {
303 MPTS_LOCK(mpte
->mpte_active_sub
);
304 mpte
->mpte_active_sub
->mpts_flags
&= ~MPTSF_ACTIVE
;
305 MPTS_UNLOCK(mpte
->mpte_active_sub
);
306 mpte
->mpte_active_sub
= mpts
;
309 /* subflow errors should not be percolated back up */
314 * Return the most eligible subflow to be used for sending data.
315 * This function also serves to check if any alternate subflow is available
319 mptcp_get_subflow(struct mptses
*mpte
, struct mptsub
*ignore
)
322 struct mptsub
*fallback
= NULL
;
323 struct socket
*so
= NULL
;
325 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
327 TAILQ_FOREACH(mpts
, &mpte
->mpte_subflows
, mpts_entry
) {
328 MPTS_LOCK_SPIN(mpts
);
330 if ((ignore
) && (mpts
== ignore
)) {
335 /* There can only be one subflow in degraded state */
336 if (mpts
->mpts_flags
& MPTSF_MP_DEGRADED
) {
341 if (!(mpts
->mpts_flags
& MPTSF_MP_CAPABLE
)) {
346 if (mpts
->mpts_flags
& MPTSF_SUSPENDED
) {
351 if (mpts
->mpts_flags
& MPTSF_FAILINGOVER
) {
352 so
= mpts
->mpts_socket
;
353 if ((so
) && (!(so
->so_flags
& SOF_PCBCLEARING
))) {
355 if (so
->so_snd
.sb_cc
== 0) {
356 mpts
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
357 so
->so_flags
&= ~SOF_MP_TRYFAILOVER
;
359 socket_unlock(so
, 1);
362 socket_unlock(so
, 1);
372 if (mpts
->mpts_flags
& MPTSF_PREFERRED
) {
377 /* When there are no preferred flows, use first one in list */
378 if (fallback
== NULL
)
384 * If there is no preferred or backup subflow, and there is no active
385 * subflow use the last usable subflow.
395 mptcp_close_fsm(struct mptcb
*mp_tp
, uint32_t event
)
397 MPT_LOCK_ASSERT_HELD(mp_tp
);
399 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
402 switch (mp_tp
->mpt_state
) {
405 mp_tp
->mpt_state
= MPTCPS_CLOSED
;
408 case MPTCPS_ESTABLISHED
:
409 if (event
== MPCE_CLOSE
)
410 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_1
;
411 else if (event
== MPCE_RECV_DATA_FIN
)
412 mp_tp
->mpt_state
= MPTCPS_CLOSE_WAIT
;
415 case MPTCPS_CLOSE_WAIT
:
416 if (event
== MPCE_CLOSE
)
417 mp_tp
->mpt_state
= MPTCPS_LAST_ACK
;
420 case MPTCPS_FIN_WAIT_1
:
421 if (event
== MPCE_RECV_DATA_ACK
)
422 mp_tp
->mpt_state
= MPTCPS_FIN_WAIT_2
;
423 else if (event
== MPCE_RECV_DATA_FIN
)
424 mp_tp
->mpt_state
= MPTCPS_CLOSING
;
428 if (event
== MPCE_RECV_DATA_ACK
)
429 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
432 case MPTCPS_LAST_ACK
:
433 if (event
== MPCE_RECV_DATA_ACK
)
434 mp_tp
->mpt_state
= MPTCPS_CLOSED
;
437 case MPTCPS_FIN_WAIT_2
:
438 if (event
== MPCE_RECV_DATA_FIN
)
439 mp_tp
->mpt_state
= MPTCPS_TIME_WAIT
;
442 case MPTCPS_TIME_WAIT
:
445 case MPTCPS_FASTCLOSE_WAIT
:
446 if (event
== MPCE_CLOSE
)
447 mp_tp
->mpt_state
= MPTCPS_CLOSED
;
454 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
456 mptcplog((LOG_INFO
, "%s: state = %d\n",
457 __func__
, mp_tp
->mpt_state
));
461 * Update the mptcb send state variables, but the actual sbdrop occurs
465 mptcp_data_ack_rcvd(struct mptcb
*mp_tp
, struct tcpcb
*tp
, u_int64_t full_dack
)
469 acked
= full_dack
- mp_tp
->mpt_snduna
;
472 mp_tp
->mpt_snduna
+= acked
;
474 if ((full_dack
== mp_tp
->mpt_sndmax
) &&
475 (mp_tp
->mpt_state
>= MPTCPS_FIN_WAIT_1
)) {
476 mptcp_close_fsm(mp_tp
, MPCE_RECV_DATA_ACK
);
477 tp
->t_mpflags
&= ~TMPF_SEND_DFIN
;
481 /* If you change this function, match up mptcp_update_rcv_state_f */
483 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt
*dss_info
, struct tcpcb
*tp
,
486 struct mptcb
*mp_tp
= tptomptp(tp
);
487 u_int64_t full_dsn
= 0;
489 NTOHL(dss_info
->mdss_dsn
);
490 NTOHL(dss_info
->mdss_subflow_seqn
);
491 NTOHS(dss_info
->mdss_data_len
);
493 /* XXX for autosndbuf grow sb here */
495 MPTCP_EXTEND_DSN(mp_tp
->mpt_rcvnxt
, dss_info
->mdss_dsn
, full_dsn
);
497 mptcp_update_rcv_state_meat(mp_tp
, tp
,
498 full_dsn
, dss_info
->mdss_subflow_seqn
, dss_info
->mdss_data_len
,
504 mptcp_update_rcv_state_meat(struct mptcb
*mp_tp
, struct tcpcb
*tp
,
505 u_int64_t full_dsn
, u_int32_t seqn
, u_int16_t mdss_data_len
,
508 if (mdss_data_len
== 0) {
509 mptcplog((LOG_INFO
, "%s: Received infinite mapping.",
511 if ((mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
) && (csum
!= 0)) {
512 mptcplog((LOG_ERR
, "%s: Bad checksum value %x \n",
515 mptcp_notify_mpfail(tp
->t_inpcb
->inp_socket
);
519 if (mptcp_dbg
>= MP_VERBOSE_DEBUG_1
)
520 printf("%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n",
521 __func__
, seqn
, mdss_data_len
, full_dsn
,
524 /* Process a Data FIN packet , handled in mptcp_do_fin_opt */
525 if ((seqn
== 0) && (mdss_data_len
== 1)) {
526 mptcplog((LOG_INFO
, "%s: Data FIN DSS opt state = %d \n",
527 __func__
, mp_tp
->mpt_state
));
532 mptcp_notify_mpready(tp
->t_inpcb
->inp_socket
);
533 tp
->t_rcv_map
.mpt_dsn
= full_dsn
;
534 tp
->t_rcv_map
.mpt_sseq
= seqn
;
535 tp
->t_rcv_map
.mpt_len
= mdss_data_len
;
536 tp
->t_rcv_map
.mpt_csum
= csum
;
537 tp
->t_mpflags
|= TMPF_EMBED_DSN
;
542 mptcp_update_rcv_state_f(struct mptcp_dss_ack_opt
*dss_info
, struct tcpcb
*tp
,
545 u_int64_t full_dsn
= 0;
546 struct mptcb
*mp_tp
= tptomptp(tp
);
548 NTOHL(dss_info
->mdss_dsn
);
549 NTOHL(dss_info
->mdss_subflow_seqn
);
550 NTOHS(dss_info
->mdss_data_len
);
552 MPTCP_EXTEND_DSN(mp_tp
->mpt_rcvnxt
, dss_info
->mdss_dsn
, full_dsn
);
554 mptcp_update_rcv_state_meat(mp_tp
, tp
,
556 dss_info
->mdss_subflow_seqn
,
557 dss_info
->mdss_data_len
,
562 mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt
*dss_info
,
563 struct tcpcb
*tp
, uint16_t csum
)
565 u_int64_t dsn
= mptcp_ntoh64(dss_info
->mdss_dsn
);
566 struct mptcb
*mp_tp
= tptomptp(tp
);
568 NTOHL(dss_info
->mdss_subflow_seqn
);
569 NTOHS(dss_info
->mdss_data_len
);
570 mptcp_update_rcv_state_meat(mp_tp
, tp
,
572 dss_info
->mdss_subflow_seqn
,
573 dss_info
->mdss_data_len
,
578 * MPTCP Checksum support
579 * The checksum is calculated whenever the MPTCP DSS option is included
580 * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
581 * header and the actual data indicated by the length specified in the
586 mptcp_input_csum(struct tcpcb
*tp
, struct mbuf
*m
, int off
)
588 struct mptcb
*mp_tp
= tptomptp(tp
);
598 if (!(mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
))
601 if (!(tp
->t_mpflags
& TMPF_EMBED_DSN
))
604 if (tp
->t_mpflags
& TMPF_TCP_FALLBACK
)
608 * The remote side may send a packet with fewer bytes than the
609 * claimed DSS checksum length.
611 if ((int)m_length2(m
, NULL
) < (off
+ tp
->t_rcv_map
.mpt_len
))
614 if (tp
->t_rcv_map
.mpt_len
!= 0)
615 sum
= m_sum16(m
, off
, tp
->t_rcv_map
.mpt_len
);
617 dsn
= mptcp_hton64(tp
->t_rcv_map
.mpt_dsn
);
618 sseq
= htonl(tp
->t_rcv_map
.mpt_sseq
);
619 len
= htons(tp
->t_rcv_map
.mpt_len
);
620 csum
= tp
->t_rcv_map
.mpt_csum
;
621 sum
+= in_pseudo64(dsn
, sseq
, (len
+ csum
));
623 DTRACE_MPTCP3(checksum__result
, struct tcpcb
*, tp
, struct mbuf
*, m
,
625 mptcplog((LOG_INFO
, "%s: sum = %x \n", __func__
, sum
));
626 return (~sum
& 0xffff);
630 mptcp_output_csum(struct tcpcb
*tp
, struct mbuf
*m
, int32_t len
,
631 unsigned hdrlen
, u_int64_t dss_val
, u_int32_t
*sseqp
)
633 struct mptcb
*mp_tp
= tptomptp(tp
);
638 uint16_t *csump
= NULL
;
643 if (!(mp_tp
->mpt_flags
& MPTCPF_CHECKSUM
))
650 sum
= m_sum16(m
, hdrlen
, len
);
652 dss_val
= mptcp_hton64(dss_val
);
654 dss_len
= *(uint16_t *)(void *)((u_char
*)sseqp
+ sizeof (u_int32_t
));
655 sum
+= in_pseudo64(dss_val
, sseq
, (dss_len
+ csum
));
659 csump
= (uint16_t *)(void *)((u_char
*)sseqp
+ sizeof (u_int32_t
) +
661 DTRACE_MPTCP3(checksum__result
, struct tcpcb
*, tp
, struct mbuf
*, m
,
664 mptcplog3((LOG_INFO
, "%s: sum = %x \n", __func__
, sum
));