/*
 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <kern/locks.h>
#include <kern/policy_internal.h>
#include <kern/zalloc.h>

#include <sys/domain.h>
#include <sys/kdebug.h>
#include <sys/kern_control.h>
#include <sys/kernel.h>
#include <sys/mcache.h>
#include <sys/param.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <net/content_filter.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_timer.h>
#include <libkern/crypto/sha1.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#include <dev/random/randomdev.h>
/*
 * Notes on MPTCP implementation.
 *
 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
 * communication domain.  The structure mtcbinfo describes the MPTCP instance
 * of a Multipath protocol in that domain.  It is used to keep track of all
 * MPTCP PCB instances in the system, and is protected by the global lock
 * mppi_lock.
 *
 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
 * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
 * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
 * allocated from the same memory block, and each structure has a pointer
 * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
 * PCB (mppcb) as well as the MPTCP Session (mptses).
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets.  Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure.  Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow.  This gets decremented prior to the subflow's destruction.
 *
 * To handle events (read, write, control) from the subflows, we do direct
 * upcalls into the specific function.
 *
 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
 * lock.  Incoming data on a subflow also ends up taking this single lock.  To
 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
 * of the MPTCP-socket.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector.  This process will take place once all
 * of the subflows have been destroyed.
 */
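
/*
 * For illustration only (not part of the kernel code below): a minimal,
 * hypothetical userspace sketch of how such a socket is opened and
 * connected; connectx(2) is the user-visible entry point, and the local
 * variable names here are assumptions:
 *
 *	int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	sa_endpoints_t eps = {
 *		.sae_dstaddr = res->ai_addr,	    (from getaddrinfo(3))
 *		.sae_dstaddrlen = res->ai_addrlen,
 *	};
 *	connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, NULL);
 */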
static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);

static uint32_t mptcp_gc(struct mppcbinfo *);
static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
	struct uio *, struct mbuf **, struct mbuf **, int *);
static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
	struct uio *, struct mbuf *, struct mbuf *, int);
static void mptcp_subflow_rupcall(struct socket *, void *, int);
static void mptcp_subflow_input(struct mptses *, struct mptsub *);
static void mptcp_subflow_wupcall(struct socket *, void *, int);
static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);

static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);
/*
 * Possible return values for subflow event handlers.  Note that success
 * values must be greater than or equal to MPTS_EVRET_OK.  Values less than
 * that indicate errors or actions which require immediate attention; they
 * will prevent the rest of the handlers from processing their respective
 * events until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE		= 1,	/* delete this subflow */
	MPTS_EVRET_OK			= 2,	/* OK */
	MPTS_EVRET_CONNECT_PENDING	= 3,	/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,	/* abort all but preferred */
} ev_ret_t;
static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *,
	uint64_t *);
static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *,
	uint64_t *, uint64_t);

static const char *mptcp_evret2str(ev_ret_t);

static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_init_local_parms(struct mptses *);
static unsigned int mptsub_zone_size;		/* size of mptsub */
static struct zone *mptsub_zone;		/* zone for mptsub */

static unsigned int mptopt_zone_size;		/* size of mptopt */
static struct zone *mptopt_zone;		/* zone for mptopt */

static unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
static struct zone *mpt_subauth_zone;		/* zone of subf auth entry */

struct mppcbinfo mtcbinfo;

#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */
SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_dbg_area = 31;		/* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
	&mptcp_dbg_area, 0, "MPTCP debug area");

uint32_t mptcp_dbg_level = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_dbg_level, 0, "MPTCP debug level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
	&mtcbinfo.mppi_count, 0, "Number of active PCBs");

static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
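
/*
 * The knobs above are runtime-tunable.  For example, assuming the standard
 * sysctl(8) interface, "sysctl net.inet.mptcp.alternate_port=8080" would
 * make mptcp_sescreate() below record 8080 as the alternate destination
 * port for newly created MPTCP sessions.
 */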
static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
#if INET6
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;
#endif /* INET6 */

static uint8_t mptcp_create_subflows_scheduled;

typedef struct mptcp_subflow_event_entry {
	uint64_t	sofilt_hint_mask;
	ev_ret_t	(*sofilt_hint_ev_hdlr)(
			    struct mptses *mpte,
			    struct mptsub *mpts,
			    uint64_t *p_mpsofilt_hint,
			    uint64_t event);
} mptsub_ev_entry_t;

static uint8_t mptcp_cellicon_is_set;
static uint32_t mptcp_last_cellicon_set;
#define	MPTCP_CELLICON_TOGGLE_RATE	(5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr =	mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
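
/*
 * Note on the ordering warning above: the table is consumed in array order
 * by mptcp_subflow_events(), so subflow-fatal hints such as
 * SO_FILT_HINT_MPCANTRCVMORE, SO_FILT_HINT_MPFAILOVER and
 * SO_FILT_HINT_MUSTRST are presumably meant to be acted upon before the
 * more generic connection-state hints further down; this reading of why the
 * order is load-bearing is an editorial assumption, not stated in the code.
 */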
os_log_t mptcp_log_handle;
/*
 * Protocol pr_init callback.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	mptcp_last_cellicon_set = tcp_now;

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
static int
mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
{
	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

	int i, index = -1;

	if (ifp == NULL) {
		mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		return (-1);
	}

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (stats[i].ifindex == IFSCOPE_NONE) {
			if (index < 0)
				index = i;
			continue;
		}

		if (stats[i].ifindex == ifp->if_index) {
			index = i;
			return (index);
		}
	}

	if (index != -1) {
		stats[index].ifindex = ifp->if_index;
		if (stats[index].is_expensive == 0)
			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
	}

	return (index);
}
void
mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
{
	int index;

	tcpstat.tcps_mp_switches++;
	mpte->mpte_subflow_switches++;

	index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);

	if (index != -1)
		mpte->mpte_itfstats[index].switches++;
}
/*
 * Flushes all recorded socket options from an MP socket.
 */
static void
mptcp_flush_sopts(struct mptses *mpte)
{
	struct mptopt *mpo, *tmpo;

	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		mptcp_sopt_remove(mpte, mpo);
		mptcp_sopt_free(mpo);
	}
	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 */
int
mptcp_sescreate(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	if (mptcp_alternate_port)
		mpte->mpte_alternate_port = htons(mptcp_alternate_port);

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return (0);
}
static void
mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
    uint64_t *cellbytes, uint64_t *allbytes)
{
	int64_t mycellbytes = 0;
	uint64_t myallbytes = 0;
	int i;

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (mpte->mpte_itfstats[i].is_expensive) {
			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
		}

		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
	}

	if (initial_cell) {
		mycellbytes -= mpte->mpte_init_txbytes;
		mycellbytes -= mpte->mpte_init_rxbytes;
	}

	if (mycellbytes < 0) {
		mptcplog((LOG_ERR, "%s cellbytes is %lld\n", __func__, mycellbytes),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		*cellbytes = 0;
		*allbytes = 0;
	} else {
		*cellbytes = mycellbytes;
		*allbytes = myallbytes;
	}
}
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success)
				tcpstat.tcps_mptcp_fp_aggregate_success++;
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
		tcpstat.tcps_mptcp_back_to_wifi++;

	if (mpte->mpte_triggered_cell)
		tcpstat.tcps_mptcp_triggered_cell++;
}
/*
 * Destroy an MPTCP session.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	mptcpstats_session_wrapup(mpte);

	mptcp_unset_cellicon();

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
		_FREE(mpte->mpte_itfinfo, M_TEMP);

	mpte->mpte_itfinfo = NULL;

	m_freem_list(mpte->mpte_reinjectq);

	/*
	 * MPTCP Protocol Control Block section
	 */
	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}
boolean_t
mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
{
	return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
		mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
		!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
}
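
/*
 * In other words, mptcp_ok_to_create_subflows() permits new subflows only
 * while the MP connection is fully established, has not yet started
 * shutting down, and has not fallen back to plain TCP.
 */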
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00},
	};
	char buf[MAX_IPv6_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	char *ptr = (char *)addr;

	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return (-1);
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) // 100.64.0.0/10 Shared Address Space
			return (-1);
	}

	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u\n", len);
	}

	os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
	    __func__, len,
	    inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));

	return (0);
}
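
/*
 * The byte offsets in the switch in mptcp_synthesize_nat64() follow the
 * RFC 6052 address layout: byte 8 (bits 64..71, "u") is reserved and must
 * stay zero, which is why the copies skip from ptr + 7 straight to ptr + 9.
 * Per prefix length (PL), the embedded IPv4 bytes land as follows:
 *
 *	PL=32:	prefix[0..3]  v4[0..3]	u  suffix
 *	PL=40:	prefix[0..4]  v4[0..2]	u  v4[3]    suffix
 *	PL=48:	prefix[0..5]  v4[0..1]	u  v4[2..3] suffix
 *	PL=56:	prefix[0..6]  v4[0]	u  v4[1..3] suffix
 *	PL=64:	prefix[0..7]  u	 v4[0..3]   suffix
 *	PL=96:	prefix[0..11] v4[0..3]
 */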
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
							  TRUE);
		if (err == 0)
			mpte->mpte_triggered_cell = 1;

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__);
	}
}
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp))
		return;

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;
		int found = 0;

		info = &mpte->mpte_itfinfo[i];

		if (info->no_mptcp_support)
			continue;

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE)
			continue;

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL)
			continue;

		if (IFNET_IS_CELLULAR(ifp))
			cellular_viable = TRUE;

		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

			if (subifp == NULL)
				continue;

			/*
			 * In Handover mode, only create cell subflow if
			 * 1. Wi-Fi Assist is active
			 * 2. Symptoms marked WiFi as weak
			 * 3. We are experiencing RTOs or we are not sending data.
			 *
			 * This covers the scenario, where:
			 * 1. We send and get retransmission timeouts (thus,
			 *    we confirmed that WiFi is indeed bad).
			 * 2. We are not sending and the server tries to send.
			 *    Establishing a cell-subflow gives the server a
			 *    chance to send us some data over cell if WiFi
			 *    is dead. We establish the subflow with the
			 *    backup-bit set, so the server is not allowed to
			 *    send on this subflow as long as WiFi is providing
			 *    good performance.
			 */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
			    !IFNET_IS_CELLULAR(subifp) &&
			    !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
			    (mptcp_is_wifi_unusable(mpte) == 0 ||
			     (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 &&
			      ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) {
				os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n",
				    __func__, mptcp_is_wifi_unusable(mpte),
				    sototcpcb(mpts->mpts_socket)->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index);

				/* We found a proper subflow on WiFi - no need for cell */
				want_cellular = FALSE;
				break;
			}

			os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n",
			    __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
			    mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift,
			    !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc);

			if (subifp->if_index == ifindex &&
			    !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
			    sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = 1;
				break;
			}
		}

		if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		if (!found) {
			struct sockaddr *dst = &mpte->mpte_dst;
			struct sockaddr_in6 nat64pre;

			if (mpte->mpte_dst.sa_family == AF_INET &&
			    !info->has_v4_conn && info->has_nat64_conn) {
				struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
				int error, j;

				bzero(&nat64pre, sizeof(struct sockaddr_in6));

				error = ifnet_get_nat64prefix(ifp, nat64prefixes);
				if (error) {
					os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n",
					    __func__, ifp->if_name, error);
					continue;
				}

				for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
					if (nat64prefixes[j].prefix_len != 0)
						break;
				}

				VERIFY(j < NAT64_MAX_NUM_PREFIXES);

				error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
				    nat64prefixes[j].prefix_len,
				    &mpte->__mpte_dst_v4.sin_addr);
				if (error != 0) {
					os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n",
					    __func__);
					continue;
				}

				memcpy(&nat64pre.sin6_addr,
				    &nat64prefixes[j].ipv6_prefix,
				    sizeof(nat64pre.sin6_addr));
				nat64pre.sin6_len = sizeof(struct sockaddr_in6);
				nat64pre.sin6_family = AF_INET6;
				nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
				nat64pre.sin6_flowinfo = 0;
				nat64pre.sin6_scope_id = 0;

				dst = (struct sockaddr *)&nat64pre;
			}

			/* Initial subflow started on a NAT64'd address? */
			if (mpte->mpte_dst.sa_family == AF_INET6 &&
			    mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
				dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
			}

			if (dst->sa_family == AF_INET && !info->has_v4_conn)
				continue;
			if (dst->sa_family == AF_INET6 && !info->has_v6_conn)
				continue;

			mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
		}
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
/*
 * Based on the MPTCP Service-type and the state of the subflows, we
 * will destroy subflows here.
 */
static void
mptcp_check_subflows_and_remove(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;
	int found_working_subflow = 0, removed_some = 0;
	int wifi_unusable = mptcp_is_wifi_unusable(mpte);

	if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
		return;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
			continue;

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED)
			continue;

		/* Is this subflow in good condition? */
		if (tp->t_rxtshift == 0)
			found_working_subflow = 1;

		/* Or WiFi is fine */
		if (!wifi_unusable)
			found_working_subflow = 1;
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow)
		return;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Only remove cellular subflows */
		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp))
			continue;

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
		removed_some = 1;
	}

	if (removed_some)
		mptcp_unset_cellicon();
}
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
		mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
			continue;

		mpp_lock(mpp);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		VERIFY(mp_so->so_usecount > 0);

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
/*
 * We need this because we are coming from an NECP-event. This event gets posted
 * while holding NECP-locks. The creation of the subflow however leads us back
 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
 * So, we would deadlock there as we already hold the NECP-lock.
 *
 * So, let's schedule this separately. It also gives NECP the chance to make
 * progress, without having to wait for MPTCP to finish its subflow creation.
 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
		    __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
		return;
	}

	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled))
		return;

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz/10);
}
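
/*
 * For reference, the resulting control flow of this deferral:
 *
 *	NECP event
 *	  -> mptcp_sched_create_subflows()   (marks MPP_CREATE_SUBFLOWS and
 *					      takes a so_usecount hold)
 *	  -> timeout(..., hz/10)	     (~100ms later, outside NECP locks)
 *	  -> mptcp_create_subflows()	     (walks mppi_pcbs and runs
 *					      mptcp_check_subflows_and_add()
 *					      and mptcp_remove_subflows())
 */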
/*
 * Allocate an MPTCP socket option structure.
 */
struct mptopt *
mptcp_sopt_alloc(int how)
{
	struct mptopt *mpo;

	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
	    zalloc_noblock(mptopt_zone);
	if (mpo != NULL) {
		bzero(mpo, mptopt_zone_size);
	}

	return (mpo);
}
/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}
/*
 * Add a socket option to the MPTCP socket option list.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}
/*
 * Remove a socket option from the MPTCP socket option list.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}
/*
 * Search for an existing <sopt_level,sopt_name> socket option.
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
		if (mpo->mpo_level == sopt->sopt_level &&
		    mpo->mpo_name == sopt->sopt_name)
			break;
	}
	return (mpo);
}
/*
 * Allocate a MPTCP subflow structure.
 */
static struct mptsub *
mptcp_subflow_alloc(void)
{
	struct mptsub *mpts = zalloc(mptsub_zone);

	if (mpts == NULL)
		return (NULL);

	bzero(mpts, mptsub_zone_size);
	return (mpts);
}
/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released.  This implies that the subflow has been deleted.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	if (mpts->mpts_src != NULL) {
		FREE(mpts->mpts_src, M_SONAME);
		mpts->mpts_src = NULL;
	}

	zfree(mptsub_zone, mpts);
}
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0)
		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
}
static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0)
		return;

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;

	mptcp_subflow_addref(mpts);	/* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);	/* for subflow socket */
}
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	if (low_power)
		action = NECP_CLIENT_CBACTION_NONVIABLE;

	if (action != NECP_CLIENT_CBACTION_NONVIABLE)
		return;

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (so->so_usecount == 0)
		return;

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (so->so_usecount == 0)
		goto out;

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u",
	    __func__, mpts->mpts_ifscope, low_power);

	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL)
		*viable = 1;

out:
	socket_unlock(so, 1);
}
/*
 * Create an MPTCP subflow socket.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	int error;

	*so = NULL;
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mp_so = mptetoso(mpte);

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		return (ESRCH);
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	mpte_unlock(mpte);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_ASYNC, PROC_NULL);
	mpte_lock(mpte);
	if (error) {
		mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return (error);
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we do multiple times a
		 * necp_client_register_socket_flow. This is problematic,
		 * because now the lock-ordering guarantee (first necp-locks,
		 * then socket-locks) is no more respected. So, we need to
		 * unlock here.
		 */
		mpte_unlock(mpte);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
		mpte_lock(mpte);

		if (error)
			goto out_err;

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
			error = EINVAL;
			goto out_err;
		}

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
	} else {
		mptcplog((LOG_NOTICE, "%s: uuid is not set!\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	/* enable keepalive */
	smpo.mpo_name = SO_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = mptcp_subflow_keeptime;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
			goto out_err;
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
			continue;

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE))
			continue;

		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
			    " sopt %s val %d interim record removed\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function.  We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
#if INET6
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return (0);

out_err:
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
	    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	return (error);
}
/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 */
static void
mptcp_subflow_soclose(struct mptsub *mpts)
{
	struct socket *so = mpts->mpts_socket;

	if (mpts->mpts_flags & MPTSF_CLOSED)
		return;

	VERIFY(so != NULL);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	mpts->mpts_flags |= MPTSF_CLOSED;

	if (so->so_retaincnt == 0) {
		soclose_locked(so);

		return;
	} else {
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
	}
}
/*
 * Connect an MPTCP subflow socket.
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof (dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof (dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log_info(mptcp_log_handle,
	    "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
	    dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		return (ESRCH);
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error)
		mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
		    __func__, error, mpts->mpts_ifscope),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	return (error);
}
/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	int flags, error = 0;
	struct proc *p = current_proc();
	struct mbuf *m, **mp = mp0;
	boolean_t proc_held = FALSE;

	mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket\n", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL)
		return (EINVAL);

	*mp = NULL;
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
		return (EOPNOTSUPP);

	flags |= (MSG_DONTWAIT|MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		return (error);
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
		return (0);

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0)
		return (error);

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	while (m != NULL) {
		int dlen = 0, dfin = 0, error_out = 0;
		struct mbuf *start = m;
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;
		} else {
			/* We did fallback */
			mptcp_adj_rmap(so, m, 0, 0, 0, 0);

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		}

		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)
			dfin = 1;

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen - dfin) {
			mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
			    __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);

			if (*mp0 == NULL)
				error = EWOULDBLOCK;
			goto release;
		}

		/* Now, get the full mapping */
		while (dlen > 0) {
			if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
				break;
			}

			dlen -= m->m_len;
			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (dlen - dfin == 0)
				dlen = 0;

			VERIFY(dlen <= 0 || m);
		}

		VERIFY(dlen == 0);

		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out)
			goto release;

		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL)
		*flagsp |= flags;

release:
	sbunlock(&so->so_rcv, TRUE);

	if (proc_held)
		proc_rele(p);

	return (error);
}
/*
 * MPTCP subflow socket send routine, derived from sosend().
 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
	if (error)
		goto out;

	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	if (top != NULL)
		m_freem(top);

	if (proc_held)
		proc_rele(p);

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return (error);
}
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
    struct socket *mp_so, *so = NULL;
    struct mptcb *mp_tp;
    struct mptsub *mpts = NULL;
    int af, error = 0;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;

    if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
        /* If the remote end sends Data FIN, refuse subflow adds */
        mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
        error = ENOTCONN;
        goto out_err;
    }

    mpts = mptcp_subflow_alloc();
    if (mpts == NULL) {
        mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
        error = ENOMEM;
        goto out_err;
    }

    if (src != NULL) {
        int len = src->sa_len;

        MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
            M_WAITOK | M_ZERO);
        if (mpts->mpts_src == NULL) {
            mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
            error = ENOMEM;
            goto out_err;
        }
        bcopy(src, mpts->mpts_src, len);
    }

    memcpy(&mpts->mpts_dst, dst, dst->sa_len);

    af = mpts->mpts_dst.sa_family;

    mpts->mpts_ifscope = ifscope;

    /* create the subflow socket */
    if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
        /*
         * Returning (error) and not cleaning up, because up to here
         * all we did is creating mpts.
         *
         * And the contract is that the call to mptcp_subflow_socreate,
         * moves ownership of mpts to mptcp_subflow_socreate.
         */
        return (error);

    /*
     * We may be called from within the kernel. Still need to account this
     * one to the real app.
     */
    mptcp_update_last_owner(mpts->mpts_socket, mp_so);

    /*
     * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
     * -1 (SAE_CONNID_ALL).
     */
    mpte->mpte_connid_last++;
    if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
        mpte->mpte_connid_last == SAE_CONNID_ANY)
        mpte->mpte_connid_last++;

    mpts->mpts_connid = mpte->mpte_connid_last;

    mpts->mpts_rel_seq = 1;

    /* Allocate a unique address id per subflow */
    mpte->mpte_addrid_last++;
    if (mpte->mpte_addrid_last == 0)
        mpte->mpte_addrid_last++;

    /* register for subflow socket read/write events */
    sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);

    /* Register for subflow socket control events */
    sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
        SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
        SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
        SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
        SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
        SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
        SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
        SO_FILT_HINT_ADAPTIVE_WTIMO);

    /* sanity check */
    VERIFY(!(mpts->mpts_flags &
        (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));

    /*
     * Indicate to the TCP subflow whether or not it should establish
     * the initial MPTCP connection, or join an existing one. Fill
     * in the connection request structure with additional info needed
     * by the underlying TCP (to be used in the TCP options, etc.)
     */
    if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
        mpts->mpts_flags |= MPTSF_INITIAL_SUB;

        if (mp_tp->mpt_state == MPTCPS_CLOSED) {
            mptcp_init_local_parms(mpte);
        }
        soisconnecting(mp_so);

        /* If fastopen is requested, set state in mpts */
        if (so->so_flags1 & SOF1_PRECONNECT_DATA)
            mpts->mpts_flags |= MPTSF_TFO_REQD;
    } else {
        if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
            mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
    }

    mpts->mpts_flags |= MPTSF_CONNECTING;

    if (af == AF_INET || af == AF_INET6) {
        char dbuf[MAX_IPv6_STR_LEN];

        mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
            "mp_so 0x%llx dst %s[%d] cid %d "
            "[pending %s]\n", __func__,
            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
            inet_ntop(af, ((af == AF_INET) ?
            (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
            (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
            dbuf, sizeof (dbuf)), ((af == AF_INET) ?
            ntohs(SIN(&mpts->mpts_dst)->sin_port) :
            ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
            mpts->mpts_connid,
            ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
            "YES" : "NO")),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
    }

    /* connect right away if first attempt, or if join can be done now */
    if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
        error = mptcp_subflow_soconnectx(mpte, mpts);

    if (error)
        goto out_err_close;

    if (pcid)
        *pcid = mpts->mpts_connid;

    return (0);

out_err_close:
    mptcp_subflow_abort(mpts, error);

    return (error);

out_err:
    if (mpts)
        mptcp_subflow_free(mpts);

    return (error);
}
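/*
 * For context, a hedged user-space sketch: subflows normally reach
 * mptcp_subflow_add() via connectx(2) on a PF_MULTIPATH socket, and the
 * sae_connid_t assigned above is surfaced to the caller.  Assuming a
 * filled-in sockaddr_storage "dst" (error handling omitted):
 *
 *	sa_endpoints_t eps = {
 *		.sae_dstaddr	= (struct sockaddr *)&dst,
 *		.sae_dstaddrlen	= dst.ss_len,
 *	};
 *	sae_connid_t cid;
 *	connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid);
 *
 * A later connectx() with a different source address or interface scope
 * adds a join; once a Data FIN has been received the code above refuses
 * further adds.
 */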
static void
mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
{
    int index = mptcp_get_statsindex(stats, mpts);

    if (index != -1) {
        struct inpcb *inp = sotoinpcb(mpts->mpts_socket);

        stats[index].mpis_txbytes += inp->inp_stat->txbytes;
        stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
    }
}
/*
 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
    struct socket *mp_so = mptetoso(mpte);
    struct socket *so = mpts->mpts_socket;
    struct tcpcb *tp = sototcpcb(so);

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    VERIFY(mpts->mpts_mpte == mpte);
    VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
    VERIFY(mpte->mpte_numflows != 0);
    VERIFY(mp_so->so_usecount > 0);

    mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
        __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
        mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
        mpts->mpts_flags, mp_so->so_error),
        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

    mptcpstats_update(mpte->mpte_itfstats, mpts);
    mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
    mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

    atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
    TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
    mpte->mpte_numflows--;
    if (mpte->mpte_active_sub == mpts)
        mpte->mpte_active_sub = NULL;

    /*
     * Drop references held by this subflow socket; there
     * will be no further upcalls made from this point.
     */
    sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
    sock_catchevents_locked(so, NULL, NULL, 0);

    mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

    mp_so->so_usecount--;		/* for subflow socket */
    mpts->mpts_mpte = NULL;
    mpts->mpts_socket = NULL;

    mptcp_subflow_remref(mpts);		/* for MPTCP subflow list */
    mptcp_subflow_remref(mpts);		/* for subflow socket */

    so->so_flags &= ~SOF_MP_SUBFLOW;
    tp->t_mptcb = NULL;
    tp->t_mpsub = NULL;
}
static void
mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
{
    struct socket *so = mpts->mpts_socket;
    struct mptcb *mp_tp = mpte->mpte_mptcb;
    int send_dfin = 0;

    if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
        send_dfin = 1;

    if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
        (so->so_state & SS_ISCONNECTED)) {
        mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
            __func__, mpts->mpts_connid, send_dfin),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

        if (send_dfin)
            mptcp_send_dfin(so);
        soshutdownlock(so, SHUT_WR);
    }
}
static void
mptcp_subflow_abort(struct mptsub *mpts, int error)
{
    struct socket *so = mpts->mpts_socket;
    struct tcpcb *tp = sototcpcb(so);

    if (mpts->mpts_flags & MPTSF_DISCONNECTED)
        return;

    mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

    if (tp->t_state != TCPS_CLOSED)
        tcp_drop(tp, error);

    mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
/*
 * Disconnect a subflow socket.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
    struct socket *so;
    struct mptcb *mp_tp;
    int send_dfin = 0;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */

    VERIFY(mpts->mpts_mpte == mpte);
    VERIFY(mpts->mpts_socket != NULL);

    if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
        return;

    mpts->mpts_flags |= MPTSF_DISCONNECTING;

    so = mpts->mpts_socket;
    mp_tp = mpte->mpte_mptcb;
    if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
        send_dfin = 1;

    if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
        (so->so_state & SS_ISCONNECTED)) {
        mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
            __func__, mpts->mpts_connid, send_dfin),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

        if (send_dfin)
            mptcp_send_dfin(so);
        (void) soshutdownlock(so, SHUT_RD);
        (void) soshutdownlock(so, SHUT_WR);
        (void) sodisconnectlocked(so);
    }
    /*
     * Generate a disconnect event for this subflow socket, in case
     * the lower layer doesn't do it; this is needed because the
     * subflow socket deletion relies on it.
     */
    mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
/*
 * Called when the associated subflow socket posted a read event.
 */
static void
mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
{
#pragma unused(so, waitf)
    struct mptsub *mpts = arg, *tmpts;
    struct mptses *mpte = mpts->mpts_mpte;

    VERIFY(mpte != NULL);

    if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
        if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
            mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
        return;
    }

    mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
    TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
        if (mpts->mpts_socket->so_usecount == 0) {
            /* Will be removed soon by tcp_garbage_collect */
            continue;
        }

        mptcp_subflow_addref(mpts);
        mpts->mpts_socket->so_usecount++;

        mptcp_subflow_input(mpte, mpts);

        mptcp_subflow_remref(mpts);		/* ours */

        VERIFY(mpts->mpts_socket->so_usecount != 0);
        mpts->mpts_socket->so_usecount--;
    }

    mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
}
/*
 * Subflow socket input.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
    struct socket *mp_so = mptetoso(mpte);
    struct mbuf *m = NULL;
    struct socket *so;
    int error, wakeup = 0;

    VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
    mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

    DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
        struct mptsub *, mpts);

    if (!(mpts->mpts_flags & MPTSF_CONNECTED))
        goto out;

    so = mpts->mpts_socket;

    error = sock_receive_internal(so, NULL, &m, 0, NULL);
    if (error != 0 && error != EWOULDBLOCK) {
        mptcplog((LOG_ERR, "%s: cid %d error %d\n",
            __func__, mpts->mpts_connid, error),
            MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
        if (error == ENODATA) {
            /*
             * Don't ignore ENODATA so as to discover
             * nasty middleboxes.
             */
            mp_so->so_error = ENODATA;

            wakeup = 1;
            goto out;
        }
    } else if (error == 0) {
        mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
            MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
    }

    /* In fallback, make sure to accept data on all but one subflow */
    if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
        !(mpts->mpts_flags & MPTSF_ACTIVE)) {
        mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
            __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
        m_freem(m);
        goto out;
    }

    if (m != NULL) {
        if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
            mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;

            mpte->mpte_used_cell = 1;
        } else {
            mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;

            mpte->mpte_used_wifi = 1;
        }

        mptcp_input(mpte, m);
    }

    /* notify protocol that we drained all the data */
    if (error == 0 && m != NULL &&
        (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
        (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);

out:
    if (wakeup)
        mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;

    mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
/*
 * Subflow socket write upcall.
 *
 * Called when the associated subflow socket posted a write event.
 */
static void
mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
{
#pragma unused(so, waitf)
    struct mptsub *mpts = arg;
    struct mptses *mpte = mpts->mpts_mpte;

    VERIFY(mpte != NULL);

    if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
        if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
            mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
        return;
    }

    mptcp_output(mpte);
}
static boolean_t
mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
{
    struct mbuf *so_m = so->so_snd.sb_mb;
    uint64_t dsn = m->m_pkthdr.mp_dsn;

    while (so_m) {
        VERIFY(so_m->m_flags & M_PKTHDR);
        VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);

        /* Part of the segment is covered, don't reinject here */
        if (so_m->m_pkthdr.mp_dsn <= dsn &&
            so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn)
            return (TRUE);

        so_m = so_m->m_next;
    }

    return (FALSE);
}
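/*
 * The check above treats each mapping as the half-open DSN interval
 * [mp_dsn, mp_dsn + mp_rlen).  Worked example: a queued mapping with
 * mp_dsn 1000 and mp_rlen 500 covers DSNs 1000..1499, so dsn == 1499 is
 * reported as covered (no reinjection) while dsn == 1500 is not.
 */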
/*
 * Subflow socket output.
 *
 * Called for sending data from MPTCP to the underlying subflow socket.
 */
int
mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
{
    struct mptcb *mp_tp = mpte->mpte_mptcb;
    struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
    struct socket *mp_so, *so;
    struct tcpcb *tp;
    uint64_t mpt_dsn = 0, off = 0;
    int sb_cc = 0, error = 0, wakeup = 0;
    uint32_t dss_csum = 0;
    uint16_t tot_sent = 0;
    boolean_t reinjected = FALSE;

    mpte_lock_assert_held(mpte);

    mp_so = mptetoso(mpte);
    so = mpts->mpts_socket;
    tp = sototcpcb(so);

    VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
    mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;

    VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
    VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
        (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
        (mpts->mpts_flags & MPTSF_TFO_REQD));
    VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);

    mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
        __func__, mpts->mpts_flags, mpte->mpte_flags,
        mptcp_subflow_cwnd_space(so)),
        MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
    DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
        struct mptsub *, mpts);

    /* Remove Addr Option is not sent reliably as per I-D */
    if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
        tp->t_rem_aid = mpte->mpte_lost_aid;
        tp->t_mpflags |= TMPF_SND_REM_ADDR;
        mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
    }

    /*
     * The mbuf chains containing the metadata (as well as pointing to
     * the user data sitting at the MPTCP output queue) would then be
     * sent down to the subflow socket.
     *
     * Some notes on data sequencing:
     *
     *   a. Each mbuf must be a M_PKTHDR.
     *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
     *	in the mbuf pkthdr structure.
     *   c. Each mbuf containing the MPTCP metadata must have its
     *	pkt_flags marked with the PKTF_MPTCP flag.
     */

    if (mpte->mpte_reinjectq)
        sb_mb = mpte->mpte_reinjectq;
    else
        sb_mb = mp_so->so_snd.sb_mb;

    if (sb_mb == NULL) {
        mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
            __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
            (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);

        /* Fix it to prevent looping */
        if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
            mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
        goto out;
    }

    VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);

    if (sb_mb->m_pkthdr.mp_rlen == 0 &&
        !(so->so_state & SS_ISCONNECTED) &&
        (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
        tp->t_mpflags |= TMPF_TFO_REQUEST;
        goto zero_len_write;
    }

    mpt_dsn = sb_mb->m_pkthdr.mp_dsn;

    /* First, drop acknowledged data */
    if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
        mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
            "dsn %u suna %u reinject? %u\n",
            __func__, (uint32_t)mpt_dsn,
            (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
        if (mpte->mpte_reinjectq) {
            mptcp_clean_reinjectq(mpte);
        } else {
            uint64_t len = 0;
            len = mp_tp->mpt_snduna - mpt_dsn;
            sbdrop(&mp_so->so_snd, (int)len);
            wakeup = 1;
        }
    }

    /* Check again because of above sbdrop */
    if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
        mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
        goto out;
    }

    /*
     * In degraded mode, we don't receive data acks, so force free
     * mbufs less than snd_nxt
     */
    if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
        (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
        mp_so->so_snd.sb_mb) {
        mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
        if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
            uint64_t len = 0;
            len = mp_tp->mpt_snduna - mpt_dsn;
            sbdrop(&mp_so->so_snd, (int)len);
            wakeup = 1;

            mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
                __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
        }
    }

    if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
        !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
        mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
        so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
    }

    /*
     * Adjust the top level notion of next byte used for retransmissions
     * and sending FIN.
     */
    if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
        mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

    /* Now determine the offset from which to start transmitting data */
    if (mpte->mpte_reinjectq)
        sb_mb = mpte->mpte_reinjectq;
    else
dont_reinject:
        sb_mb = mp_so->so_snd.sb_mb;
    if (sb_mb == NULL) {
        mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
        goto out;
    }

    if (sb_mb == mpte->mpte_reinjectq) {
        sb_cc = sb_mb->m_pkthdr.mp_rlen;
        off = 0;

        if (mptcp_search_seq_in_sub(sb_mb, so)) {
            if (mptcp_can_send_more(mp_tp, TRUE)) {
                goto dont_reinject;
            }

            error = ECANCELED;
            goto out;
        }

        reinjected = TRUE;
    } else if (flags & MPTCP_SUBOUT_PROBING) {
        sb_cc = sb_mb->m_pkthdr.mp_rlen;
        off = 0;
    } else {
        sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);

        /*
         * With TFO, there might be no data at all, thus still go into this
         * code-path here.
         */
        if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
            MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
            off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
            sb_cc -= off;
        } else {
            mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
                __func__, (uint32_t)mp_tp->mpt_sndnxt,
                (uint32_t)mp_tp->mpt_sndmax),
                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);

            goto out;
        }
    }

    sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
    if (sb_cc <= 0) {
        mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
            __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
            (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
            mptcp_subflow_cwnd_space(so)),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
    }

    sb_cc = min(sb_cc, UINT16_MAX);

    /*
     * Create a DSN mapping for the data we are about to send. It all
     * has the same mapping.
     */
    if (reinjected)
        mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
    else
        mpt_dsn = mp_tp->mpt_snduna + off;

    mpt_mbuf = sb_mb;
    while (mpt_mbuf && reinjected == FALSE &&
        (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
        mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
        off -= mpt_mbuf->m_pkthdr.mp_rlen;
        mpt_mbuf = mpt_mbuf->m_next;
    }
    if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
        mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
            __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
            mpts->mpts_probecnt),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

    VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));

    head = tail = NULL;

    while (tot_sent < sb_cc) {
        ssize_t mlen;

        mlen = mpt_mbuf->m_len;
        mlen -= off;
        mlen = min(mlen, sb_cc - tot_sent);

        if (mlen < 0) {
            mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
                __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
                (uint32_t)off, sb_cc, tot_sent),
                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
            goto out;
        }

        if (mlen == 0)
            goto next;

        m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
            M_COPYM_MUST_COPY_HDR);
        if (m == NULL) {
            mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
                MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
            error = ENOBUFS;
            break;
        }

        /* Create a DSN mapping for the data (m_copym does it) */
        VERIFY(m->m_flags & M_PKTHDR);
        VERIFY(m->m_next == NULL);

        m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
        m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
        m->m_pkthdr.mp_dsn = mpt_dsn;
        m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
        m->m_pkthdr.len = mlen;

        if (head == NULL) {
            head = tail = m;
        } else {
            tail->m_next = m;
            tail = m;
        }

        tot_sent += mlen;
        off = 0;
next:
        mpt_mbuf = mpt_mbuf->m_next;
    }

    if (reinjected) {
        if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
            struct mbuf *n = sb_mb;

            while (n) {
                n->m_pkthdr.mp_dsn += sb_cc;
                n->m_pkthdr.mp_rlen -= sb_cc;
                n = n->m_next;
            }
            m_adj(sb_mb, sb_cc);
        } else {
            mpte->mpte_reinjectq = sb_mb->m_nextpkt;
            m_freem(sb_mb);
        }
    }

    mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
        __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
        tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

    if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
        dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
            tot_sent);
    }

    /* Now, let's update rel-seq and the data-level length */
    mpts->mpts_rel_seq += tot_sent;
    m = head;
    while (m) {
        if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
            m->m_pkthdr.mp_csum = dss_csum;
        m->m_pkthdr.mp_rlen = tot_sent;
        m = m->m_next;
    }

    if (head != NULL) {
        if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
            (tp->t_tfo_stats == 0))
            tp->t_mpflags |= TMPF_TFO_REQUEST;

        error = sock_sendmbuf(so, NULL, head, 0, NULL);

        DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
            struct sockbuf *, &so->so_rcv,
            struct sockbuf *, &so->so_snd,
            struct mptses *, mpte, struct mptsub *, mpts,
            size_t, tot_sent);
    }

done_sending:
    if (error == 0 ||
        (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
        uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;

        if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
            tcpstat.tcps_mp_num_probes++;
            if ((uint32_t)tot_sent < mpts->mpts_maxseg)
                mpts->mpts_probecnt += 1;
            else
                mpts->mpts_probecnt +=
                    tot_sent/mpts->mpts_maxseg;
        }

        if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
            if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
                MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
                mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
            mp_tp->mpt_sndnxt = new_sndnxt;
        }

        mptcp_cancel_timer(mp_tp, MPTT_REXMT);

        /* Must be here as mptcp_can_send_more() checks for this */
        soclearfastopen(mp_so);

        if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
            (mpts->mpts_probesoon != 0))
            mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
                __func__, mpts->mpts_connid,
                !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
                tot_sent, (int) sb_cc, mpts->mpts_probecnt,
                (tcp_now - mpts->mpts_probesoon)),
                MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

        if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
            mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;

            mpte->mpte_used_cell = 1;
        } else {
            mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;

            mpte->mpte_used_wifi = 1;
        }

        /*
         * Don't propagate EWOULDBLOCK - it's already taken care of
         * in mptcp_usr_send for TFO.
         */
        error = 0;
    } else {
        mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
            __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
    }

out:
    if (wakeup)
        mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;

    mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
    return (error);

zero_len_write:
    /* Opting to call pru_send as no mbuf at subflow level */
    error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
        NULL, current_proc());

    goto done_sending;
}
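/*
 * Illustration of the one-mapping-per-burst invariant enforced above:
 * for a burst of tot_sent bytes starting at DSN d with relative
 * sequence r, every mbuf handed to sock_sendmbuf() ends up with
 *
 *	m->m_pkthdr.mp_dsn  == d
 *	m->m_pkthdr.mp_rseq == r
 *	m->m_pkthdr.mp_rlen == tot_sent
 *
 * and, when MPTCPF_CHECKSUM is enabled, mp_csum carries the DSS
 * checksum of the whole mapping.  The subflow TCP derives its DSS
 * option directly from these fields.
 */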
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
    struct mbuf *n, *prev = NULL;

    mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
        __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
        m->m_pkthdr.mp_rseq),
        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

    n = mpte->mpte_reinjectq;

    /* First, look for an mbuf n, whose data-sequence-number is bigger or
     * equal than m's sequence number.
     */
    while (n) {
        if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
            break;

        prev = n;

        n = n->m_nextpkt;
    }

    if (n) {
        /* m is already fully covered by the next mbuf in the queue */
        if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
            n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
            mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
                __func__, n->m_pkthdr.mp_rlen),
                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
            goto dont_queue;
        }

        /* m is covering the next mbuf entirely, thus we remove this guy */
        if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
            struct mbuf *tmp = n->m_nextpkt;

            mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
                __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
                n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

            m->m_nextpkt = NULL;
            if (prev == NULL)
                mpte->mpte_reinjectq = tmp;
            else
                prev->m_nextpkt = tmp;

            m_freem(n);
            n = tmp;
        }
    }

    if (prev) {
        /* m is already fully covered by the previous mbuf in the queue */
        if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
            mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
                __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
            goto dont_queue;
        }
    }

    if (prev == NULL)
        mpte->mpte_reinjectq = m;
    else
        prev->m_nextpkt = m;

    m->m_nextpkt = n;

    return;

dont_queue:
    m_freem(m);
    return;
}
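/*
 * The reinject queue is therefore kept sorted by mp_dsn with no
 * fully-overlapped entries.  Example: inserting [100, 300) into a queue
 * holding [0, 100) and [200, 250) frees [200, 250) (fully covered by
 * the newcomer) and links the new mbuf right after [0, 100).
 */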
static struct mbuf *
mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
{
    struct socket *mp_so = mptetoso(mpte);
    struct mbuf *m;

    m = mp_so->so_snd.sb_mb;

    while (m) {
        /* If this segment covers what we are looking for, return it. */
        if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
            MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
            break;

        /* Segment is no more in the queue */
        if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
            return (NULL);

        m = m->m_next;
    }

    return (m);
}
static struct mbuf *
mptcp_copy_mbuf_list(struct mbuf *m, int len)
{
    struct mbuf *top = NULL, *tail = NULL;
    uint64_t dsn;
    uint32_t dlen, rseq;

    dsn = m->m_pkthdr.mp_dsn;
    dlen = m->m_pkthdr.mp_rlen;
    rseq = m->m_pkthdr.mp_rseq;

    while (len > 0) {
        struct mbuf *n;

        VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

        n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
        if (n == NULL) {
            mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
            goto err;
        }

        VERIFY(n->m_flags & M_PKTHDR);
        VERIFY(n->m_next == NULL);
        VERIFY(n->m_pkthdr.mp_dsn == dsn);
        VERIFY(n->m_pkthdr.mp_rlen == dlen);
        VERIFY(n->m_pkthdr.mp_rseq == rseq);
        VERIFY(n->m_len == m->m_len);

        n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);

        if (top == NULL)
            top = n;

        if (tail != NULL)
            tail->m_next = n;

        tail = n;

        len -= m->m_len;
        m = m->m_next;
    }

    return (top);

err:
    if (top)
        m_freem(top);

    return (NULL);
}
static void
mptcp_reinject_mbufs(struct socket *so)
{
    struct tcpcb *tp = sototcpcb(so);
    struct mptsub *mpts = tp->t_mpsub;
    struct mptcb *mp_tp = tptomptp(tp);
    struct mptses *mpte = mp_tp->mpt_mpte;
    struct sockbuf *sb = &so->so_snd;
    struct mbuf *m;

    m = sb->sb_mb;
    while (m) {
        struct mbuf *n = m->m_next, *orig = m;

        mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
            __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
            m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
            MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

        VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

        if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
            goto next;

        /* Has it all already been acknowledged at the data-level? */
        if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
            goto next;

        /* Part of this has already been acknowledged - lookup in the
         * MPTCP-socket for the segment.
         */
        if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
            m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
            if (m == NULL)
                goto next;
        }

        /* Copy the mbuf with headers (aka, DSN-numbers) */
        m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
        if (m == NULL)
            break;

        VERIFY(m->m_nextpkt == NULL);

        /* Now, add to the reinject-queue, eliminating overlapping
         * segments
         */
        mptcp_add_reinjectq(mpte, m);

        orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
        /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
        while (n) {
            VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

            if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
                break;

            n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
            n = n->m_next;
        }

        m = n;
    }
}
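/*
 * Worked example of the marking above: if the subflow send buffer holds
 * one 2000-byte mapping split across two mbufs, copying it into the
 * reinject queue tags both mbufs with PKTF_MPTCP_REINJ, so a second
 * failover pass skips the pair instead of queueing the same DSN range
 * twice.
 */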
void
mptcp_clean_reinjectq(struct mptses *mpte)
{
    struct mptcb *mp_tp = mpte->mpte_mptcb;

    mpte_lock_assert_held(mpte);

    while (mpte->mpte_reinjectq) {
        struct mbuf *m = mpte->mpte_reinjectq;

        if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
            MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
            break;

        mpte->mpte_reinjectq = m->m_nextpkt;
        m->m_nextpkt = NULL;
        m_freem(m);
    }
}
/*
 * Subflow socket control event upcall.
 */
static void
mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
{
#pragma unused(so)
    struct mptsub *mpts = arg;
    struct mptses *mpte = mpts->mpts_mpte;

    VERIFY(mpte != NULL);
    mpte_lock_assert_held(mpte);

    if ((mpts->mpts_evctl & events) == events)
        return;

    mpts->mpts_evctl |= events;

    if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
        mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
        return;
    }

    mptcp_subflow_workloop(mpte);
}
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint)
{
    ev_ret_t ret = MPTS_EVRET_OK;
    int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
        sizeof(mpsub_ev_entry_tbl[0]);

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */

    /* bail if there's nothing to process */
    if (!mpts->mpts_evctl)
        return (ret);

    if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
        SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
        SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
        SO_FILT_HINT_DISCONNECTED)) {
        mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
    }

    DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
        struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

    mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
        mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);

    /*
     * Process all the socket filter hints and reset the hint
     * once it is handled
     */
    for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
        /*
         * Always execute the DISCONNECTED event, because it will wakeup
         * the app.
         */
        if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
            (ret >= MPTS_EVRET_OK ||
            mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
            mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
            ev_ret_t error =
                mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
            ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
        }
    }

    /*
     * We should be getting only events specified via sock_catchevents(),
     * so loudly complain if we have any unprocessed one(s).
     */
    if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
        mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
            (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
            mpts->mpts_connid,
            mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
    else
        mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
            mpts->mpts_evctl, SO_FILT_HINT_BITS),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);

    return (ret);
}
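/*
 * The loop above is driven by mpsub_ev_entry_tbl, defined earlier in
 * this file, which pairs each SO_FILT_HINT_* mask with a handler; a
 * representative entry looks like this (sketch):
 *
 *	{
 *		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
 *		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
 *	},
 *
 * Handlers return an ev_ret_t; a value below MPTS_EVRET_OK stops
 * further dispatch, except that SO_FILT_HINT_DISCONNECTED always runs
 * so the application is woken up.
 */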
static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
    struct socket *mp_so, *so;
    struct mptcb *mp_tp;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    VERIFY(mpte->mpte_mppcb != NULL);
    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;
    so = mpts->mpts_socket;

    mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
        mpts->mpts_connid, event),
        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

    /*
     * We got an event for this subflow that might need to be propagated,
     * based on the state of the MPTCP connection.
     */
    if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
        ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
        mp_so->so_error = so->so_error;
        *p_mpsofilt_hint |= event;
    }

    return (MPTS_EVRET_OK);
}
/*
 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
 */
static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(p_mpsofilt_hint, event)
    struct socket *mp_so;
    struct tcpcb *tp;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */

    VERIFY(mpte->mpte_mppcb != NULL);
    mp_so = mptetoso(mpte);
    tp = intotcpcb(sotoinpcb(mpts->mpts_socket));

    /*
     * This overwrites any previous mpte_lost_aid to avoid storing
     * too much state when the typical case has only two subflows.
     */
    mpte->mpte_flags |= MPTE_SND_REM_ADDR;
    mpte->mpte_lost_aid = tp->t_local_aid;

    mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

    /*
     * The subflow connection has lost its source address.
     */
    mptcp_subflow_abort(mpts, EADDRNOTAVAIL);

    if (mp_so->so_flags & SOF_NOADDRAVAIL)
        mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);

    return (MPTS_EVRET_DELETE);
}
/*
 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
 * indicates that the remote side sent a Data FIN
 */
static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event)
    struct mptcb *mp_tp;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    mp_tp = mpte->mpte_mptcb;

    mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

    /*
     * We got a Data FIN for the MPTCP connection.
     * The FIN may arrive with data. The data is handed up to the
     * mptcp socket and the user is notified so that it may close
     * the socket if needed.
     */
    if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
        *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;

    return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}
/*
 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
 */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
    struct mptsub *mpts_alt = NULL;
    struct socket *alt_so = NULL;
    struct socket *mp_so;
    int altpath_exists = 0;

    mpte_lock_assert_held(mpte);
    mp_so = mptetoso(mpte);
    mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
        (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

    mptcp_reinject_mbufs(mpts->mpts_socket);

    mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
    /*
     * If there is no alternate eligible subflow, ignore the
     * failover hint.
     */
    if (mpts_alt == NULL) {
        mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

        goto done;
    }

    altpath_exists = 1;
    alt_so = mpts_alt->mpts_socket;
    if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
        /* All data acknowledged and no RTT spike */
        if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
            mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
        } else {
            /* no alternate path available */
            altpath_exists = 0;
        }
    }

    if (altpath_exists) {
        mpts_alt->mpts_flags |= MPTSF_ACTIVE;

        mpte->mpte_active_sub = mpts_alt;
        mpts->mpts_flags |= MPTSF_FAILINGOVER;
        mpts->mpts_flags &= ~MPTSF_ACTIVE;

        mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
            __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

        mptcpstats_inc_switch(mpte, mpts);
    } else {
        mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
            mpts->mpts_connid),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
done:
        mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
    }

    return (MPTS_EVRET_OK);
}
/*
 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    VERIFY(mpte->mpte_mppcb != NULL);

    mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
        mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

    /*
     * The subflow connection cannot use the outgoing interface, let's
     * close this subflow.
     */
    mptcp_subflow_abort(mpts, EPERM);

    mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);

    return (MPTS_EVRET_DELETE);
}
/*
 * https://tools.ietf.org/html/rfc6052#section-2
 * https://tools.ietf.org/html/rfc6147#section-5.2
 */
static boolean_t
mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
    const struct ipv6_prefix *prefix,
    struct in_addr *addrv4)
{
    char buf[MAX_IPv4_STR_LEN];
    char *ptrv4 = (char *)addrv4;
    const char *ptr = (const char *)addr;

    if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0)
        return (FALSE);

    switch (prefix->prefix_len) {
    case NAT64_PREFIX_LEN_96:
        memcpy(ptrv4, ptr + 12, 4);
        break;
    case NAT64_PREFIX_LEN_64:
        memcpy(ptrv4, ptr + 9, 4);
        break;
    case NAT64_PREFIX_LEN_56:
        memcpy(ptrv4, ptr + 7, 1);
        memcpy(ptrv4 + 1, ptr + 9, 3);
        break;
    case NAT64_PREFIX_LEN_48:
        memcpy(ptrv4, ptr + 6, 2);
        memcpy(ptrv4 + 2, ptr + 9, 2);
        break;
    case NAT64_PREFIX_LEN_40:
        memcpy(ptrv4, ptr + 5, 3);
        memcpy(ptrv4 + 3, ptr + 9, 1);
        break;
    case NAT64_PREFIX_LEN_32:
        memcpy(ptrv4, ptr + 4, 4);
        break;
    default:
        panic("NAT64-prefix len is wrong: %u\n",
            prefix->prefix_len);
    }

    os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
        inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));

    return (TRUE);
}

static void
mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
{
    struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
    struct socket *so = mpts->mpts_socket;
    struct ifnet *ifp;
    int j;

    ifp = sotoinpcb(so)->inp_last_outifp;

    if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
        mptcp_ask_for_nat64(ifp);
        return;
    }

    for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
        int success;

        if (nat64prefixes[j].prefix_len == 0)
            continue;

        success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
            &nat64prefixes[j],
            &mpte->mpte_dst_v4_nat64.sin_addr);
        if (success) {
            mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
            mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
            mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
            break;
        }
    }
}
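/*
 * RFC 6052 example for the common 96-bit case: the synthesized address
 * 64:ff9b::c000:0201 keeps the IPv4 address in its last four bytes, so
 * memcpy(ptrv4, ptr + 12, 4) above recovers 192.0.2.1 (0xc0000201).
 * Prefixes shorter than 96 bits skip byte 8, which RFC 6052 reserves as
 * the always-zero "u" octet, hence the split copies around ptr + 9.
 */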
/*
 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
    struct socket *mp_so, *so;
    struct inpcb *inp;
    struct tcpcb *tp;
    struct mptcb *mp_tp;
    int af;
    boolean_t mpok = FALSE;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    VERIFY(mpte->mpte_mppcb != NULL);

    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;
    so = mpts->mpts_socket;
    tp = sototcpcb(so);
    af = mpts->mpts_dst.sa_family;

    if (mpts->mpts_flags & MPTSF_CONNECTED)
        return (MPTS_EVRET_OK);

    if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
        (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
        if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
            (so->so_state & SS_ISCONNECTED)) {
            mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
                __func__, mpts->mpts_connid),
                MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
            (void) soshutdownlock(so, SHUT_RD);
            (void) soshutdownlock(so, SHUT_WR);
            (void) sodisconnectlocked(so);
        }
        return (MPTS_EVRET_OK);
    }

    /*
     * The subflow connection has been connected. Find out whether it
     * is connected as a regular TCP or as a MPTCP subflow. The idea is:
     *
     *   a. If MPTCP connection is not yet established, then this must be
     *	the first subflow connection. If MPTCP failed to negotiate,
     *	fallback to regular TCP by degrading this subflow.
     *
     *   b. If MPTCP connection has been established, then this must be
     *	one of the subsequent subflow connections. If MPTCP failed
     *	to negotiate, disconnect the connection.
     *
     * Right now, we simply unblock any waiters at the MPTCP socket layer
     * if the MPTCP connection has not been established.
     */

    if (so->so_state & SS_ISDISCONNECTED) {
        /*
         * With MPTCP joins, a connection is connected at the subflow
         * level, but the 4th ACK from the server elevates the MPTCP
         * subflow to connected state. So there is a small window
         * where the subflow could get disconnected before the
         * connected event is processed.
         */
        return (MPTS_EVRET_OK);
    }

    if (mpts->mpts_flags & MPTSF_TFO_REQD)
        mptcp_drop_tfo_data(mpte, mpts);

    mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
    mpts->mpts_flags |= MPTSF_CONNECTED;

    if (tp->t_mpflags & TMPF_MPTCP_TRUE)
        mpts->mpts_flags |= MPTSF_MP_CAPABLE;

    tp->t_mpflags &= ~TMPF_TFO_REQUEST;

    /* get/verify the outbound interface */
    inp = sotoinpcb(so);

    mpts->mpts_maxseg = tp->t_maxseg;

    mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
        ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
        ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
        (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);

    mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

    if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
        mp_tp->mpt_state = MPTCPS_ESTABLISHED;
        mpte->mpte_associd = mpts->mpts_connid;
        DTRACE_MPTCP2(state__change,
            struct mptcb *, mp_tp,
            uint32_t, 0 /* event */);

        if (SOCK_DOM(so) == AF_INET) {
            in_getsockaddr_s(so, &mpte->__mpte_src_v4);
        } else {
            in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
        }

        mpts->mpts_flags |= MPTSF_ACTIVE;

        /* case (a) above */
        if (!mpok) {
            tcpstat.tcps_mpcap_fallback++;

            tp->t_mpflags |= TMPF_INFIN_SENT;
            mptcp_notify_mpfail(so);
        } else {
            if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
                mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
                tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
            } else {
                mpts->mpts_flags |= MPTSF_PREFERRED;
            }
            mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
            mpte->mpte_nummpcapflows++;

            if (SOCK_DOM(so) == AF_INET6)
                mptcp_handle_ipv6_connection(mpte, mpts);

            mptcp_check_subflows_and_add(mpte);

            if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
                mpte->mpte_initial_cell = 1;

            mpte->mpte_handshake_success = 1;
        }

        mp_tp->mpt_sndwnd = tp->snd_wnd;
        mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
        mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
        soisconnected(mp_so);

        mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
            __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
            MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
    } else if (mpok) {
        /*
         * case (b) above
         * In case of additional flows, the MPTCP socket is not
         * MPTSF_MP_CAPABLE until an ACK is received from server
         * for 3-way handshake. TCP would have guaranteed that this
         * is an MPTCP subflow.
         */
        if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
            !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
            mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
            tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
            mpts->mpts_flags &= ~MPTSF_PREFERRED;
        } else {
            mpts->mpts_flags |= MPTSF_PREFERRED;
        }

        mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
        mpte->mpte_nummpcapflows++;

        mpts->mpts_rel_seq = 1;

        mptcp_check_subflows_and_remove(mpte);
    } else {
        uint32_t i;

        /* Should we try the alternate port? */
        if (mpte->mpte_alternate_port &&
            inp->inp_fport != mpte->mpte_alternate_port) {
            union sockaddr_in_4_6 dst;
            struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;

            memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);

            dst_in->sin_port = mpte->mpte_alternate_port;

            mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
                mpts->mpts_ifscope, NULL);
        } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
            for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
                struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];

                if (inp->inp_last_outifp->if_index == info->ifindex) {
                    info->no_mptcp_support = 1;
                    break;
                }
            }
        }

        tcpstat.tcps_join_fallback++;
        if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
            tcpstat.tcps_mptcp_cell_proxy++;
        else
            tcpstat.tcps_mptcp_wifi_proxy++;

        soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

        return (MPTS_EVRET_OK);
    }

    /* This call, just to "book" an entry in the stats-table for this ifindex */
    mptcp_get_statsindex(mpte->mpte_itfstats, mpts);

    mptcp_output(mpte);

    return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}
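/*
 * Join-fallback illustration for case (b): when a join fails MPTCP
 * negotiation and an alternate port is configured, the code above
 * retries the same destination with only the port swapped:
 *
 *	memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
 *	dst_in->sin_port = mpte->mpte_alternate_port;
 *	mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst, ...);
 *
 * Otherwise the egress interface is marked no_mptcp_support; in either
 * case the failed subflow itself is reset via SO_FILT_HINT_MUSTRST.
 */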
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
    struct socket *mp_so, *so;
    struct mptcb *mp_tp;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    VERIFY(mpte->mpte_mppcb != NULL);
    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;
    so = mpts->mpts_socket;

    mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
        __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
        !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
        !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

    if (mpts->mpts_flags & MPTSF_DISCONNECTED)
        return (MPTS_EVRET_DELETE);

    mpts->mpts_flags |= MPTSF_DISCONNECTED;

    /* The subflow connection has been disconnected. */

    if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
        mpte->mpte_nummpcapflows--;
        if (mpte->mpte_active_sub == mpts) {
            mpte->mpte_active_sub = NULL;
            mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
                __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
        }
        mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
    }

    if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
        ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
        (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
        mptcp_drop(mpte, mp_tp, so->so_error);
    }

    /*
     * Clear flags that are used by getconninfo to return state.
     * Retain like MPTSF_DELETEOK for internal purposes.
     */
    mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
        MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
        MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);

    return (MPTS_EVRET_DELETE);
}
/*
 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
    struct socket *mp_so, *so;
    struct mptcb *mp_tp;
    ev_ret_t ret = MPTS_EVRET_OK;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    VERIFY(mpte->mpte_mppcb != NULL);
    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;
    so = mpts->mpts_socket;

    if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
        mpts->mpts_flags |= MPTSF_MP_CAPABLE;
    else
        mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;

    if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
        if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
            goto done;
        mpts->mpts_flags |= MPTSF_MP_DEGRADED;
    } else {
        mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
    }

    if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
        mpts->mpts_flags |= MPTSF_MP_READY;
    else
        mpts->mpts_flags &= ~MPTSF_MP_READY;

    if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
        mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
        mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
    }

    if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
        VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
        ret = MPTS_EVRET_DISCONNECT_FALLBACK;

        m_freem_list(mpte->mpte_reinjectq);
        mpte->mpte_reinjectq = NULL;
    } else if (mpts->mpts_flags & MPTSF_MP_READY) {
        mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
        ret = MPTS_EVRET_CONNECT_PENDING;
    }

    mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
        __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
        mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
        mpts->mpts_flags, MPTSF_BITS),
        MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

done:
    return (ret);
}
/*
 * Handle SO_FILT_HINT_MUSTRST subflow socket event
 */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event)
    struct socket *mp_so, *so;
    struct mptcb *mp_tp;
    boolean_t is_fastclose;

    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    VERIFY(mpte->mpte_mppcb != NULL);
    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;
    so = mpts->mpts_socket;

    /* We got an invalid option or a fast close */
    struct tcptemp *t_template;
    struct inpcb *inp = sotoinpcb(so);
    struct tcpcb *tp = NULL;

    tp = intotcpcb(inp);
    so->so_error = ECONNABORTED;

    is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

    t_template = tcp_maketemplate(tp);
    if (t_template) {
        struct tcp_respond_args tra;

        bzero(&tra, sizeof(tra));
        if (inp->inp_flags & INP_BOUND_IF)
            tra.ifscope = inp->inp_boundifp->if_index;
        else
            tra.ifscope = IFSCOPE_NONE;
        tra.awdl_unrestricted = 1;

        tcp_respond(tp, t_template->tt_ipgen,
            &t_template->tt_t, (struct mbuf *)NULL,
            tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
        (void) m_free(dtom(t_template));
        mptcplog((LOG_DEBUG, "MPTCP Events: "
            "%s: mp_so 0x%llx cid %d \n",
            __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
            mpts->mpts_connid),
            MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
    }
    mptcp_subflow_abort(mpts, ECONNABORTED);

    if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
        *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

        if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
            mp_so->so_error = ECONNABORTED;
        else
            mp_so->so_error = ECONNRESET;

        /*
         * mptcp_drop is being called after processing the events, to fully
         * close the MPTCP connection
         */
    }

    if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
        mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;

    return (MPTS_EVRET_DELETE);
}
static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event)
    bool found_active = false;

    mpts->mpts_flags |= MPTSF_READ_STALL;

    TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
        struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

        if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
            TCPS_HAVERCVDFIN2(tp->t_state))
            continue;

        if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
            found_active = true;
            break;
        }
    }

    if (!found_active)
        *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;

    return (MPTS_EVRET_OK);
}

static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event)
    bool found_active = false;

    mpts->mpts_flags |= MPTSF_WRITE_STALL;

    TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
        struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

        if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
            tp->t_state > TCPS_CLOSE_WAIT)
            continue;

        if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
            found_active = true;
            break;
        }
    }

    if (!found_active)
        *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;

    return (MPTS_EVRET_OK);
}
static const char *
mptcp_evret2str(ev_ret_t ret)
{
    const char *c = "UNKNOWN";

    switch (ret) {
    case MPTS_EVRET_DELETE:
        c = "MPTS_EVRET_DELETE";
        break;
    case MPTS_EVRET_CONNECT_PENDING:
        c = "MPTS_EVRET_CONNECT_PENDING";
        break;
    case MPTS_EVRET_DISCONNECT_FALLBACK:
        c = "MPTS_EVRET_DISCONNECT_FALLBACK";
        break;
    case MPTS_EVRET_OK:
        c = "MPTS_EVRET_OK";
        break;
    default:
        break;
    }
    return (c);
}
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
    struct socket *mp_so, *so;
    struct sockopt sopt;
    int error;

    VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
    mpte_lock_assert_held(mpte);

    mp_so = mptetoso(mpte);
    so = mpts->mpts_socket;

    if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
        mpo->mpo_level == SOL_SOCKET &&
        mpo->mpo_name == SO_MARK_CELLFALLBACK) {
        struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

        mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
            __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte),
            sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
            mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

        /*
         * When we open a new subflow, mark it as cell fallback, if
         * this subflow goes over cell.
         *
         * (except for first-party apps)
         */

        if (mpte->mpte_flags & MPTE_FIRSTPARTY)
            return (0);

        if (sotoinpcb(so)->inp_last_outifp &&
            !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
            return (0);

        /*
         * This here is an OR, because if the app is not binding to the
         * interface, then it definitely is not a cell-fallback
         * connection.
         */
        if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
            !IFNET_IS_CELLULAR(ifp))
            return (0);
    }

    mpo->mpo_flags &= ~MPOF_INTERIM;

    bzero(&sopt, sizeof (sopt));
    sopt.sopt_dir = SOPT_SET;
    sopt.sopt_level = mpo->mpo_level;
    sopt.sopt_name = mpo->mpo_name;
    sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
    sopt.sopt_valsize = sizeof (int);
    sopt.sopt_p = kernproc;

    error = sosetoptlock(so, &sopt, 0);
    if (error == 0) {
        mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
            "val %d set successful\n", __func__,
            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
            mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
            mpo->mpo_intval),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
    } else {
        mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
            "val %d set error %d\n", __func__,
            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
            mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
            mpo->mpo_intval, error),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
    }
    return (error);
}
/*
 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 */
int
mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
    struct mptopt *mpo)
{
    struct socket *mp_so;
    struct sockopt sopt;
    int error;

    VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
    mpte_lock_assert_held(mpte);	/* same as MP socket lock */
    mp_so = mptetoso(mpte);

    bzero(&sopt, sizeof (sopt));
    sopt.sopt_dir = SOPT_GET;
    sopt.sopt_level = mpo->mpo_level;
    sopt.sopt_name = mpo->mpo_name;
    sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
    sopt.sopt_valsize = sizeof (int);
    sopt.sopt_p = kernproc;

    error = sogetoptlock(so, &sopt, 0);	/* already locked */
    if (error == 0) {
        mptcplog((LOG_DEBUG, "MPTCP Socket: "
            "%s: mp_so 0x%llx sopt %s "
            "val %d get successful\n", __func__,
            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
            mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
            mpo->mpo_intval),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
    } else {
        mptcplog((LOG_ERR, "MPTCP Socket: "
            "%s: mp_so 0x%llx sopt %s get error %d\n",
            __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
            mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
    }
    return (error);
}
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed. The callout will
 * repeat as long as this routine returns a non-zero value.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
    struct mppcb *mpp, *tmpp;
    uint32_t active = 0;

    LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

    TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
        struct socket *mp_so;
        struct mptses *mpte;
        struct mptcb *mp_tp;

        VERIFY(mpp->mpp_flags & MPP_ATTACHED);
        mp_so = mpp->mpp_socket;
        VERIFY(mp_so != NULL);
        mpte = mptompte(mpp);
        VERIFY(mpte != NULL);
        mp_tp = mpte->mpte_mptcb;
        VERIFY(mp_tp != NULL);

        mptcplog((LOG_DEBUG, "MPTCP Socket: "
            "%s: mp_so 0x%llx found "
            "(u=%d,r=%d,s=%d)\n", __func__,
            (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
            mp_so->so_retaincnt, mpp->mpp_state),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

        if (!mpte_try_lock(mpte)) {
            mptcplog((LOG_DEBUG, "MPTCP Socket: "
                "%s: mp_so 0x%llx skipped lock "
                "(u=%d,r=%d)\n", __func__,
                (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
                mp_so->so_usecount, mp_so->so_retaincnt),
                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
            active++;
            continue;
        }

        /* check again under the lock */
        if (mp_so->so_usecount > 0) {
            boolean_t wakeup = FALSE;
            struct mptsub *mpts, *tmpts;

            mptcplog((LOG_DEBUG, "MPTCP Socket: "
                "%s: mp_so 0x%llx skipped usecount "
                "[u=%d,r=%d] %d %d\n", __func__,
                (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
                mp_so->so_usecount, mp_so->so_retaincnt,
                mp_tp->mpt_gc_ticks,
                mp_tp->mpt_state),
                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

            if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
                if (mp_tp->mpt_gc_ticks > 0)
                    mp_tp->mpt_gc_ticks--;
                if (mp_tp->mpt_gc_ticks == 0) {
                    wakeup = TRUE;
                }
            }
            if (wakeup) {
                TAILQ_FOREACH_SAFE(mpts,
                    &mpte->mpte_subflows, mpts_entry, tmpts) {
                    mptcp_subflow_eupcall1(mpts->mpts_socket,
                        mpts, SO_FILT_HINT_DISCONNECTED);
                }
            }
            mpte_unlock(mpte);
            active++;
            continue;
        }

        if (mpp->mpp_state != MPPCB_STATE_DEAD) {
            panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
                "[u=%d,r=%d,s=%d]\n", __func__,
                (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
                mp_so->so_usecount, mp_so->so_retaincnt,
                mpp->mpp_state);
        }

        if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
            mptcp_close(mpte, mp_tp);

        mptcp_session_destroy(mpte);

        mptcplog((LOG_DEBUG, "MPTCP Socket: "
            "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
            __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
            mp_so->so_usecount, mp_so->so_retaincnt),
            MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

        DTRACE_MPTCP4(dispose, struct socket *, mp_so,
            struct sockbuf *, &mp_so->so_rcv,
            struct sockbuf *, &mp_so->so_snd,
            struct mppcb *, mpp);

        mp_pcbdispose(mpp);
        sodealloc(mp_so);
    }

    return (active);
}
/*
 * Drop an MPTCP connection, reporting the specified error.
 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
{
	struct socket *mp_so;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mptetoso(mpte);

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, 0 /* event */);

	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
		errno = mp_tp->mpt_softerror;
	mp_so->so_error = errno;

	return (mptcp_close(mpte, mp_tp));
}
/*
 * Close an MPTCP control block.
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct socket *mp_so = NULL;
	struct mptsub *mpts = NULL, *tmpts = NULL;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mptetoso(mpte);

	mp_tp->mpt_state = MPTCPS_TERMINATE;

	soisdisconnected(mp_so);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		mptcp_subflow_disconnect(mpte, mpts);
	}

	return (NULL);
}
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;

	mpte_lock_assert_held(mpte);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mptetoso(mpte);
	VERIFY(mp_so != NULL);

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If the MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING)
			mptcp_subflow_disconnect(mpte, mpts);

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			mptcplog((LOG_DEBUG,
			    "MPTCP Socket: %s: mptcp_subflow_events "
			    "returned invalid value: %d\n", __func__,
			    ret),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			break;
		}
		mptcp_subflow_remref(mpts);		/* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback)
		return;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
				continue;

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
			    MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
				continue;

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback.  This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			if (mpts->mpts_flags & MPTSF_ACTIVE) {
				continue;
			}
			tp->t_mpflags |= TMPF_RESET;
			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error)
					mptcp_subflow_abort(mpts, error);
			}
		}
	}
}
/*
 * Protocol pr_lock callback.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock(mpp);

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0)
		mp_so->so_usecount++;
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return (0);
}
/*
 * Protocol pr_unlock callback.
 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock_assert_held(mpp);

	if (refcount != 0)
		mp_so->so_usecount--;

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	mpp_unlock(mpp);

	return (0);
}
/*
 * Protocol pr_getlock callback.
 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return (mpp_getlock(mpp, flags));
}
/*
 * MPTCP Join support
 */
void
mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
    uint8_t addr_id)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/*
	 * The address ID of the first flow is implicitly 0.
	 */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		tp->t_local_aid = 0;
	} else {
		tp->t_local_aid = addr_id;
		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
		so->so_flags |= SOF_MP_SEC_SUBFLOW;
	}
	sauth_entry = zalloc(mpt_subauth_zone);
	sauth_entry->msae_laddr_id = tp->t_local_aid;
	sauth_entry->msae_raddr_id = 0;
	sauth_entry->msae_raddr_rand = 0;
try_again:
	sauth_entry->msae_laddr_rand = RandomULong();
	if (sauth_entry->msae_laddr_rand == 0)
		goto try_again;
	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
}
static void
mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	struct tcpcb *tp = NULL;
	int found = 0;

	tp = sototcpcb(so);
	if (tp == NULL)
		return;

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
			found = 1;
			break;
		}
	}
	if (found) {
		LIST_REMOVE(sauth_entry, msae_next);
		zfree(mpt_subauth_zone, sauth_entry);
	}
}
void
mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
    u_int32_t *rrand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == addr_id) {
			if (lrand)
				*lrand = sauth_entry->msae_laddr_rand;
			if (rrand)
				*rrand = sauth_entry->msae_raddr_rand;
			break;
		}
	}
}
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
				    " address ids %d %d \n", __func__, raddr_id,
				    sauth_entry->msae_raddr_id),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				mptcplog((LOG_ERR, "MPTCP Socket: "
				    "%s: dup SYN_ACK %d %d \n",
				    __func__, raddr_rand,
				    sauth_entry->msae_raddr_rand),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
/*
 * SHA1 support for MPTCP
 */
static void
mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
{
	SHA1_CTX sha1ctxt;
	const unsigned char *sha1_base;
	int sha1_size;

	sha1_base = (const unsigned char *) key;
	sha1_size = sizeof (mptcp_key_t);
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
	SHA1Final(sha_digest, &sha1ctxt);
}
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0};	/* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0};	/* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
/*
 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
 */
void
mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
{
	uint32_t lrand, rrand;

	mpte_lock_assert_held(mp_tp->mpt_mpte);

	lrand = rrand = 0;
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
	mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
	    digest);
}
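
/*
 * Illustrative user-space sketch (not part of the kernel build, hence the
 * #if 0): the construction in mptcp_hmac_sha1() above is plain RFC 2104
 * HMAC-SHA1 with a 16-byte key (the two 64-bit keys laid out back to back
 * and zero-padded to the 64-byte block) and an 8-byte message (the two
 * 32-bit randoms back to back).  A library HMAC over the same bytes should
 * therefore produce the same digest; the sketch below uses CommonCrypto's
 * CCHmac() for comparison.  The key/rand values are made up, and the
 * memcpy()s assume the same in-memory layout the kernel code uses.
 */
#if 0
#include <CommonCrypto/CommonDigest.h>
#include <CommonCrypto/CommonHMAC.h>
#include <stdint.h>
#include <string.h>

static void
mptcp_hmac_example(void)
{
	unsigned char key[16], msg[8], mac[CC_SHA1_DIGEST_LENGTH];
	uint64_t key_a = 0x0102030405060708ULL;	/* hypothetical Key-A */
	uint64_t key_b = 0x1112131415161718ULL;	/* hypothetical Key-B */
	uint32_t r_a = 0xdeadbeef;		/* hypothetical R-A */
	uint32_t r_b = 0xcafebabe;		/* hypothetical R-B */

	memcpy(key, &key_a, 8);			/* Key = Key-A || Key-B */
	memcpy(key + 8, &key_b, 8);
	memcpy(msg, &r_a, 4);			/* Msg = R-A || R-B */
	memcpy(msg + 4, &r_b, 4);

	/* Should match mptcp_hmac_sha1(key_a, key_b, r_a, r_b, mac) */
	CCHmac(kCCHmacAlgSHA1, key, sizeof (key), msg, sizeof (msg), mac);
}
#endif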
/*
 * Authentication data generation
 */
static void
mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
    int token_len)
{
	VERIFY(token_len == sizeof (u_int32_t));
	VERIFY(sha_digest_len == SHA1_RESULTLEN);

	/* Most significant 32 bits of the SHA1 hash */
	bcopy(sha_digest, token, sizeof (u_int32_t));
}
static void
mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
    int idsn_len)
{
	VERIFY(idsn_len == sizeof (u_int64_t));
	VERIFY(sha_digest_len == SHA1_RESULTLEN);

	/*
	 * Least significant 64 bits of the SHA1 hash
	 */

	idsn[7] = sha_digest[12];
	idsn[6] = sha_digest[13];
	idsn[5] = sha_digest[14];
	idsn[4] = sha_digest[15];
	idsn[3] = sha_digest[16];
	idsn[2] = sha_digest[17];
	idsn[1] = sha_digest[18];
	idsn[0] = sha_digest[19];
}
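
/*
 * Illustrative sketch (example only, fenced off with #if 0): per the two
 * generators above, the token is the most significant 32 bits of SHA1(key)
 * and the IDSN is the least significant 64 bits.  The byte reversal in
 * mptcp_generate_idsn() loads digest bytes 12..19 as one big-endian 64-bit
 * value on the little-endian machines XNU runs on, which is what the loop
 * below computes explicitly.
 */
#if 0
#include <stdint.h>

static uint64_t
idsn_from_sha1_digest(const unsigned char sha_digest[20])
{
	uint64_t idsn = 0;
	int i;

	/* digest[12..19] interpreted as one big-endian 64-bit number */
	for (i = 12; i < 20; i++)
		idsn = (idsn << 8) | sha_digest[i];
	return (idsn);
}
#endif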
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* There is only Version 0 at this time */
	mp_tp->mpt_version = MPTCP_STD_VERSION_0;

	/* Set DSS checksum flag */
	if (mptcp_dss_csum)
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;

	/* Set up receive window */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
void
mptcp_init_local_parms(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[SHA1_RESULTLEN];

	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);

	mptcp_generate_token(key_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));

	/* The subflow SYN is also the first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
int
mptcp_init_remote_parms(struct mptcb *mp_tp)
{
	char remote_digest[SHA1_RESULTLEN];
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/* Only Version 0 is supported for auth purposes */
	if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
		return (-1);

	/* Setup local and remote tokens and Initial DSNs */
	mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
	mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
	mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;

	return (0);
}
static void
mptcp_send_dfin(struct socket *so)
{
	struct tcpcb *tp = NULL;
	struct inpcb *inp = NULL;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return;

	tp = intotcpcb(inp);
	if (tp == NULL)
		return;

	if (!(tp->t_mpflags & TMPF_RESET))
		tp->t_mpflags |= TMPF_SEND_DFIN;
}
/*
 * Data Sequence Mapping routines
 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL)
		return;

	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		m->m_pkthdr.mp_rlen = m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
}
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	if (!m || len == 0)
		return;

	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_pkthdr.mp_rlen;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	} else if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
	mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
}
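
/*
 * Worked example for the inference above (hypothetical numbers): the send
 * buffer holds two mappings, (mp_dsn 100, mp_rlen 10) and (mp_dsn 110,
 * mp_rlen 10), and the subflow just dropped len = 15 acked bytes.  The loop
 * optimistically sets data_ack = 120 but exits with len = -5, so the
 * (len < 0) branch corrects data_ack down to dsn = 110: only the first
 * mapping is known to be fully acked at the MPTCP level.  With len = 10 the
 * loop exits with len == 0 and m pointing at the second mbuf, and data_ack
 * is likewise corrected down to m->m_pkthdr.mp_dsn = 110.
 */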
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0)
				m->m_pkthdr.mp_dsn += len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0)
					m->m_pkthdr.mp_rseq += len;
			}
			mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
			    __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
			    m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
/* Obtain the DSN mapping stored in the mbuf */
void
mptcp_output_getm_dsnmap32(struct socket *so, int off,
    uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
{
	u_int64_t dsn64;

	mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
}
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;
	int off_orig = off;

	VERIFY(off >= 0);

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and the corresponding
	 * dsn mapping.
	 */
	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	VERIFY(m);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;

	mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
	    __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
}
/*
 * Note that this is called only from tcp_input(), via mptcp_input_preproc().
 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
 * When it trims data, tcp_input() calls m_adj(), which does not remove the
 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
 * The dsn map insertion cannot be delayed until after the trim, because data
 * can sit in the reassembly queue for a while and the DSN option info in tp
 * would be overwritten for every new packet received.
 * The dsn map will be adjusted just prior to appending to the subflow sockbuf
 * with mptcp_adj_rmap().
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin)
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		if (th->th_flags & TH_FIN)
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
	}
}
int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	if (m_pktlen(m) == 0)
		return (0);

	if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen)) {
			mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
			    __func__, dsn, m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen),
			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
			return (-1);
		}
		m->m_pkthdr.mp_dsn += off;
		m->m_pkthdr.mp_rseq += off;
		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
	} else {
		if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
			/* data arrived without a DSS option mapping */

			/* initial subflow can fallback right after SYN handshake */
			mptcp_notify_mpfail(so);
		}
	}

	mpts->mpts_flags |= MPTSF_CONFIRMED;

	return (0);
}
/*
 * The following routines help with failure detection and failover of data
 * transfer from one subflow to another.
 */
void
mptcp_act_on_txfail(struct socket *so)
{
	struct tcpcb *tp = NULL;
	struct inpcb *inp = sotoinpcb(so);

	if (inp == NULL)
		return;

	tp = intotcpcb(inp);
	if (tp == NULL)
		return;

	if (so->so_flags & SOF_MP_TRYFAILOVER)
		return;

	so->so_flags |= SOF_MP_TRYFAILOVER;
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
}
/*
 * Support for MP_FAIL option
 */
int
mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	u_int64_t dsn;
	int off = 0;
	u_int32_t datalen;

	if (m == NULL)
		return (-1);

	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			off = dsn_fail - dsn;
			*tcp_seq = m->m_pkthdr.mp_rseq + off;
			mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
			    dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
			return (0);
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */
	mptcplog((LOG_ERR, "MPTCP Sender: "
	    "%s: %llu not found \n", __func__, dsn_fail),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
	return (-1);
}
/*
 * Support for sending contiguous MPTCP bytes in a subflow.
 * Also used to prevent sending data with the ACK of the 3-way handshake.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the
	 * relative seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST)
		mdss_subflow_off--;

	if (off < mdss_subflow_off)
		printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
		    off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
	VERIFY(off >= mdss_subflow_off);

	mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
	    __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
	    mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	return (mdss_data_len - (off - mdss_subflow_off));
}
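
/*
 * Worked example (hypothetical numbers): a mapping of mdss_data_len = 1000
 * bytes starts at subflow-relative sequence 500 with mpts_iss = 4000, and
 * snd_una = 4400, so the mapping begins mdss_subflow_off = 100 bytes into
 * the send buffer.  When asked to send from off = 250, we are
 * off - mdss_subflow_off = 150 bytes into the mapping, and the function
 * returns 1000 - 150 = 850 contiguous bytes left to send from it.
 */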
uint32_t
mptcp_get_maxseg(struct mptses *mpte)
{
	struct mptsub *mpts;
	uint32_t maxseg = 0;

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
		    TCPS_HAVERCVDFIN2(tp->t_state))
			continue;

		if (tp->t_maxseg > maxseg)
			maxseg = tp->t_maxseg;
	}

	return (maxseg);
}
uint8_t
mptcp_get_rcvscale(struct mptses *mpte)
{
	struct mptsub *mpts;
	uint8_t rcvscale = UINT8_MAX;

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
		    TCPS_HAVERCVDFIN2(tp->t_state))
			continue;

		if (tp->rcv_scale < rcvscale)
			rcvscale = tp->rcv_scale;
	}

	return (rcvscale);
}
/* Similar to tcp_sbrcv_reserve */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window size
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and the receive side is being
	 *   throttled
	 * - there are segments in the reassembly queue indicating loss;
	 *   there is no need to increase the recv window during recovery,
	 *   as more data is not going to be sent, and a duplicate ack sent
	 *   during recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations,
	 * which we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflows' receive-buffers. It's too low, but that's all we can
	 * get for now.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
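
/*
 * Illustrative arithmetic for the comment above (hypothetical numbers):
 * with two subflows receiving at bw_1 = 10 MB/s and bw_2 = 2 MB/s and a
 * worst-case rtt_max = 50 ms, the ideal receive buffer would be
 * (10 + 2) MB/s * 0.050 s * 2 = 1.2 MB.  Summing two hypothetical 128 KB
 * subflow receive buffers instead yields only 256 KB, which is why the
 * dummy algorithm is described as "too low".
 */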
/*
 * Determine if we can grow the receive socket buffer to avoid sending
 * a zero window update to the peer. We allow even socket buffers that
 * have fixed size (set by the application) to grow if the resource
 * constraints are met. They will also be trimmed after the application
 * reads data.
 *
 * Similar to tcp_sbrcv_grow_rwin
 */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
		return;

	if (tcp_do_autorcvbuf == 1 &&
	    tcp_cansbgrow(sb) &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
/* Similar to tcp_sbspace */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	mpte_lock_assert_held(mp_tp->mpt_mpte);

	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	space = ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0)
		space = 0;

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space)
		space = 0;
	else
		space -= pending;

	return (space);
}
5361 mptcp_notify_mpready(struct socket
*so
)
5363 struct tcpcb
*tp
= NULL
;
5368 tp
= intotcpcb(sotoinpcb(so
));
5373 DTRACE_MPTCP4(multipath__ready
, struct socket
*, so
,
5374 struct sockbuf
*, &so
->so_rcv
, struct sockbuf
*, &so
->so_snd
,
5375 struct tcpcb
*, tp
);
5377 if (!(tp
->t_mpflags
& TMPF_MPTCP_TRUE
))
5380 if (tp
->t_mpflags
& TMPF_MPTCP_READY
)
5383 tp
->t_mpflags
&= ~TMPF_TCP_FALLBACK
;
5384 tp
->t_mpflags
|= TMPF_MPTCP_READY
;
5386 soevent(so
, (SO_FILT_HINT_LOCKED
| SO_FILT_HINT_MPSTATUS
));
void
mptcp_notify_mpfail(struct socket *so)
{
	struct tcpcb *tp = NULL;

	if (so == NULL)
		return;

	tp = intotcpcb(sotoinpcb(so));

	if (tp == NULL)
		return;

	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
	    struct tcpcb *, tp);

	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
		return;

	tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
	tp->t_mpflags |= TMPF_TCP_FALLBACK;

	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
}
/*
 * Keepalive helper function
 */
boolean_t
mptcp_ok_to_keepalive(struct mptcb *mp_tp)
{
	boolean_t ret = 1;

	mpte_lock_assert_held(mp_tp->mpt_mpte);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		ret = 0;
	}
	return (ret);
}
/*
 * MPTCP t_maxseg adjustment function
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

#define	MPTCP_COMPUTE_LEN {				\
	mss_lower = sizeof (struct mptcp_dss_ack_opt);	\
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)		\
		mss_lower += 2;				\
	else						\
		/* adjust to 32-bit boundary + EOL */	\
		mss_lower += 2;				\
}
	if (mp_tp == NULL)
		return (0);

	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * the most common MPTCP option size, for the case where tcp_mss
	 * is called during option processing and MTU discovery.
	 */
	if (mtudisc == FALSE) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return (mss_lower);
}
/*
 * Update the pid, upid, uuid of the subflow so, based on the parent so
 */
void
mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
{
	if (so->last_pid != mp_so->last_pid ||
	    so->last_upid != mp_so->last_upid) {
		so->last_upid = mp_so->last_upid;
		so->last_pid = mp_so->last_pid;
		uuid_copy(so->last_uuid, mp_so->last_uuid);
	}
	so_update_policy(so);
}
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else
#endif
	if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL)
		return (EPERM);

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n/8) * sizeof(mptcp_flow_t);
		return (0);
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		mpp_lock(mpp);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mpte_lock_assert_held(mpte);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
			if (flows == NULL) {
				mpp_unlock(mpp);
				break;
			}
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			mpp_unlock(mpp);
			FREE(flows, M_TEMP);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		mpp_unlock(mpp);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			FREE(flows, M_TEMP);
			if (error)
				break;
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return (error);
}

SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
/*
 * Set the notsent lowat mark on the MPTCB
 */
int
mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
{
	struct mptcb *mp_tp = NULL;
	int error = 0;

	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
		mp_tp = mpte->mpte_mptcb;

	if (mp_tp)
		mp_tp->mpt_notsent_lowat = optval;
	else
		error = EINVAL;

	return (error);
}
u_int32_t
mptcp_get_notsent_lowat(struct mptses *mpte)
{
	struct mptcb *mp_tp = NULL;

	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
		mp_tp = mpte->mpte_mptcb;

	if (mp_tp)
		return (mp_tp->mpt_notsent_lowat);
	else
		return (0);
}
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return (0);
	}

	mpte = mptompte(mpp);
	mpte_lock_assert_held(mpte);
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		mptcplog((LOG_DEBUG, "MPTCP Sender: "
		    "lowat %d notsent %d actual %d \n",
		    mp_tp->mpt_notsent_lowat, notsent,
		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
		return (1);
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wake up the client even before there is at least one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
			    " nodelay false \n",
			    mp_tp->mpt_notsent_lowat, notsent),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			return (retval);
		}
	}
	return (0);
}
/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
symptoms_advisory_t mptcp_advisory;

static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
		os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__);

	mptcp_kern_skt_unit = sac->sc_unit;

	return (0);
}
static void
mptcp_allow_uuid(uuid_t uuid)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		mpp_lock(mpp);

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid))
			goto next;
		else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid))
			goto next;

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;

next:
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
static void
mptcp_wifi_status_changed(void)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		mpp_lock(mpp);

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		/* Only handover-mode is purely driven by Symptom's Wi-Fi status */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
			goto next;

		mptcp_check_subflows_and_add(mpte);
		mptcp_check_subflows_and_remove(mpte);

next:
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p;
	int pid, prio, err;

	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__);
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED)
		pid = mp_so->e_pid;
	else
		pid = mp_so->last_pid;

	p = proc_find(pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid);
		return;
	}

	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	if (mp_so->so_flags & SOF_DELEGATED)
		uuid_copy(ask.uuid, mp_so->e_uuid);
	else
		uuid_copy(ask.uuid, mp_so->last_uuid);

	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	else if (prio == TASK_FOREGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	else
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n",
	    __func__, pid, ask.priority, err);

	proc_rele(p);
}
static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return (0);
}
static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
    mbuf_t m, int flags)
{
#pragma unused(kctlref, unitinfo, flags)
	symptoms_advisory_t *sa = NULL;

	if (kcunit != mptcp_kern_skt_unit)
		os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n",
		    __func__, kcunit, mptcp_kern_skt_unit);

	if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
		mbuf_freem(m);
		return (EINVAL);
	}

	if (mbuf_len(m) < sizeof(*sa)) {
		mbuf_freem(m);
		return (EINVAL);
	}

	sa = mbuf_data(m);

	if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
	    sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
		uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;

		mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
		    __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);

		if ((sa->sa_wifi_status &
		    (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
		    (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
			mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;

		if (old_wifi_status != mptcp_advisory.sa_wifi_status)
			mptcp_wifi_status_changed();
	} else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
		mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
		    mptcp_advisory.sa_wifi_status),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
	} else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
		uuid_t uuid;

		mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		uuid_copy(uuid, (unsigned char *)(sa + 1));

		mptcp_allow_uuid(uuid);
	}

	mbuf_freem(m);
	return (0);
}
void
mptcp_control_register(void)
{
	/* Set up the advisory control socket */
	struct kern_ctl_reg mptcp_kern_ctl;

	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
	    sizeof(mptcp_kern_ctl.ctl_name));
	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;

	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
}
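
/*
 * Illustrative user-space sketch (example only, fenced off with #if 0):
 * a privileged daemon reaches the control registered above the standard
 * way for kernel-control sockets -- resolve the control name to an id
 * with CTLIOCGINFO, then connect a PF_SYSTEM/SYSPROTO_CONTROL socket.
 * Connecting triggers mptcp_symptoms_ctl_connect() above.  The
 * MPTCP_KERN_CTL_NAME macro is assumed visible from the kernel headers,
 * and error handling is elided.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/kern_control.h>
#include <sys/sys_domain.h>
#include <string.h>

static int
connect_mptcp_symptoms_ctl(void)
{
	struct ctl_info info;
	struct sockaddr_ctl addr;
	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);

	memset(&info, 0, sizeof (info));
	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof (info.ctl_name));
	ioctl(fd, CTLIOCGINFO, &info);	/* resolve name -> ctl_id */

	memset(&addr, 0, sizeof (addr));
	addr.sc_len = sizeof (addr);
	addr.sc_family = AF_SYSTEM;
	addr.ss_sysaddr = AF_SYS_CONTROL;
	addr.sc_id = info.ctl_id;
	addr.sc_unit = 0;		/* let the kernel pick the unit */

	connect(fd, (struct sockaddr *)&addr, sizeof (addr));
	return (fd);
}
#endif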
/*
 * Three return-values have to be distinguished:
 *  1 : Wi-Fi is bad
 *  0 : Wi-Fi is fine
 * -1 : WiFi-state is unknown, use subflow-only heuristics
 */
int
mptcp_is_wifi_unusable(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		if (mptcp_advisory.sa_wifi_status)
			return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return (-1);
	}

	return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
}
boolean_t
mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int fail_thresh = mptcp_fail_thresh;

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
		fail_thresh *= 2;

	return (tp->t_rxtshift >= fail_thresh &&
	    (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq));
}
/* If TFO data is successfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_connid, tcp_droplen, mp_droplen),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	}
}
int
mptcp_freeq(struct mptcb *mp_tp)
{
	struct tseg_qent *q;
	int rv = 0;

	while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		zfree(tcp_reass_zone, q);
		rv = 1;
	}
	mp_tp->mpt_reassqlen = 0;
	return (rv);
}
static int
mptcp_post_event(u_int32_t event_code, int value)
{
	struct kev_mptcp_data event_data;
	struct kev_msg ev_msg;

	memset(&ev_msg, 0, sizeof(ev_msg));

	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
	ev_msg.event_code = event_code;

	event_data.value = value;

	ev_msg.dv[0].data_ptr = &event_data;
	ev_msg.dv[0].data_length = sizeof(event_data);

	return kev_post_msg(&ev_msg);
}
void
mptcp_set_cellicon(struct mptses *mpte)
{
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY)
		return;

	/* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
	mptcp_last_cellicon_set = tcp_now;

	/* If the cellicon is already set, get out of here! */
	if (OSTestAndSet(7, &mptcp_cellicon_is_set))
		return;

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error)
		mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
		    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	else
		mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}
void
mptcp_unset_cellicon(void)
{
	int error;

	/* If the cellicon is already unset, get out of here! */
	if (OSTestAndClear(7, &mptcp_cellicon_is_set))
		return;

	/*
	 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
	 * explicitly set the cellicon (see mptcp_set_cellicon()), then we
	 * unset it again.
	 */
	if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
	    tcp_now)) {
		OSTestAndSet(7, &mptcp_cellicon_is_set);
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);

	if (error)
		mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
		    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	else
		mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}
void
mptcp_reset_rexmit_state(struct tcpcb *tp)
{
	struct mptsub *mpts;
	struct inpcb *inp;
	struct socket *so;

	inp = tp->t_inpcb;
	if (inp == NULL)
		return;

	so = inp->inp_socket;
	if (so == NULL)
		return;

	if (!(so->so_flags & SOF_MP_SUBFLOW))
		return;

	mpts = tp->t_mpsub;

	mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
	so->so_flags &= ~SOF_MP_TRYFAILOVER;
}
void
mptcp_reset_keepalive(struct tcpcb *tp)
{
	struct mptsub *mpts = tp->t_mpsub;

	mpts->mpts_flags &= ~MPTSF_READ_STALL;
}