2 * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
34 #include <sys/mcache.h>
35 #include <sys/resourcevar.h>
36 #include <sys/socket.h>
37 #include <sys/socketvar.h>
38 #include <sys/syslog.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/sysctl.h>
43 #include <kern/zalloc.h>
44 #include <kern/locks.h>
46 #include <mach/thread_act.h>
50 #include <netinet/in.h>
51 #include <netinet/in_pcb.h>
52 #include <netinet/in_var.h>
53 #include <netinet/tcp.h>
54 #include <netinet/tcp_fsm.h>
55 #include <netinet/tcp_seq.h>
56 #include <netinet/tcp_var.h>
57 #include <netinet/mptcp_var.h>
58 #include <netinet/mptcp.h>
59 #include <netinet/mptcp_seq.h>
60 #include <netinet/mptcp_timer.h>
61 #include <libkern/crypto/sha1.h>
63 #include <netinet6/in6_pcb.h>
64 #include <netinet6/ip6protosw.h>
66 #include <dev/random/randomdev.h>
69 * Notes on MPTCP implementation.
71 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
72 * communication domain. The structure mtcbinfo describes the MPTCP instance
73 * of a Multipath protocol in that domain. It is used to keep track of all
74 * MPTCP PCB instances in the system, and is protected by the global lock
77 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
78 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
79 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
80 * allocated from the same memory block, and each structure has a pointer
81 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
82 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
83 * PCB (mppcb) as well as the MPTCP Session (mptses).
85 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
86 * in particular, the list of subflows as well as the MPTCP thread.
88 * A functioning MPTCP Session consists of one or more subflow sockets. Each
89 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
90 * represented by the mptsub structure. Because each subflow requires access
91 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
92 * subflow. This gets decremented prior to the subflow's destruction. The
93 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
95 * To handle events (read, write, control) from the subflows, an MPTCP thread
96 * is created; currently, there is one thread per MPTCP Session. In order to
97 * prevent the MPTCP socket from being destroyed while being accessed by the
98 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
99 * which will be decremented prior to the thread's termination. The thread
100 * lock (mpte_thread_lock) is used to synchronize its signalling.
102 * Lock ordering is defined as follows:
104 * mtcbinfo (mppi_lock)
110 * It is not a requirement that all of the above locks need to be acquired
111 * in succession, but the correct lock ordering must be followed when there
112 * are more than one locks that need to be held. The MPTCP thread lock is
113 * is not constrained by this arrangement, because none of the other locks
114 * is ever acquired while holding mpte_thread_lock; therefore it may be called
115 * at any moment to signal the thread.
117 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
118 * work is done by the MPTCP garbage collector which is invoked on demand by
119 * the PF_MULTIPATH garbage collector. This process will take place once all
120 * of the subflows have been destroyed, and the MPTCP thread be instructed to
124 static void mptcp_sesdestroy(struct mptses
*);
125 static void mptcp_thread_signal_locked(struct mptses
*);
126 static void mptcp_thread_terminate_signal(struct mptses
*);
127 static void mptcp_thread_dowork(struct mptses
*);
128 static void mptcp_thread_func(void *, wait_result_t
);
129 static void mptcp_thread_destroy(struct mptses
*);
130 static void mptcp_key_pool_init(void);
131 static void mptcp_attach_to_subf(struct socket
*, struct mptcb
*, connid_t
);
132 static void mptcp_detach_mptcb_from_subf(struct mptcb
*, struct socket
*);
133 static void mptcp_conn_properties(struct mptcb
*);
134 static void mptcp_init_statevars(struct mptcb
*);
136 static uint32_t mptcp_gc(struct mppcbinfo
*);
137 static int mptcp_subflow_socreate(struct mptses
*, struct mptsub
*,
138 int, struct proc
*, struct socket
**);
139 static int mptcp_subflow_soclose(struct mptsub
*, struct socket
*);
140 static int mptcp_subflow_soconnectx(struct mptses
*, struct mptsub
*);
141 static int mptcp_subflow_soreceive(struct socket
*, struct sockaddr
**,
142 struct uio
*, struct mbuf
**, struct mbuf
**, int *);
143 static void mptcp_subflow_rupcall(struct socket
*, void *, int);
144 static void mptcp_subflow_input(struct mptses
*, struct mptsub
*);
145 static void mptcp_subflow_wupcall(struct socket
*, void *, int);
146 static void mptcp_subflow_eupcall(struct socket
*, void *, uint32_t);
147 static void mptcp_update_last_owner(struct mptsub
*, struct socket
*);
150 * Possible return values for subflow event handlers. Note that success
151 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
152 * indicate errors or actions which require immediate attention; they will
153 * prevent the rest of the handlers from processing their respective events
154 * until the next round of events processing.
157 MPTS_EVRET_DELETE
= 1, /* delete this subflow */
158 MPTS_EVRET_OK
= 2, /* OK */
159 MPTS_EVRET_CONNECT_PENDING
= 3, /* resume pended connects */
160 MPTS_EVRET_DISCONNECT_FALLBACK
= 4, /* abort all but preferred */
161 MPTS_EVRET_OK_UPDATE
= 5, /* OK with conninfo update */
164 static ev_ret_t
mptcp_subflow_events(struct mptses
*, struct mptsub
*);
165 static ev_ret_t
mptcp_subflow_connreset_ev(struct mptses
*, struct mptsub
*);
166 static ev_ret_t
mptcp_subflow_cantrcvmore_ev(struct mptses
*, struct mptsub
*);
167 static ev_ret_t
mptcp_subflow_cantsendmore_ev(struct mptses
*, struct mptsub
*);
168 static ev_ret_t
mptcp_subflow_timeout_ev(struct mptses
*, struct mptsub
*);
169 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses
*, struct mptsub
*);
170 static ev_ret_t
mptcp_subflow_failover_ev(struct mptses
*, struct mptsub
*);
171 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses
*, struct mptsub
*);
172 static ev_ret_t
mptcp_subflow_suspend_ev(struct mptses
*, struct mptsub
*);
173 static ev_ret_t
mptcp_subflow_resume_ev(struct mptses
*, struct mptsub
*);
174 static ev_ret_t
mptcp_subflow_connected_ev(struct mptses
*, struct mptsub
*);
175 static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses
*, struct mptsub
*);
176 static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses
*, struct mptsub
*);
177 static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses
*, struct mptsub
*);
178 static const char *mptcp_evret2str(ev_ret_t
);
180 static mptcp_key_t
*mptcp_reserve_key(void);
181 static int mptcp_do_sha1(mptcp_key_t
*, char *, int);
182 static int mptcp_init_authparms(struct mptcb
*);
183 static int mptcp_delete_ok(struct mptses
*mpte
, struct mptsub
*mpts
);
185 static unsigned int mptsub_zone_size
; /* size of mptsub */
186 static struct zone
*mptsub_zone
; /* zone for mptsub */
188 static unsigned int mptopt_zone_size
; /* size of mptopt */
189 static struct zone
*mptopt_zone
; /* zone for mptopt */
191 static unsigned int mpt_subauth_entry_size
; /* size of subf auth entry */
192 static struct zone
*mpt_subauth_zone
; /* zone of subf auth entry */
194 struct mppcbinfo mtcbinfo
;
196 static struct mptcp_keys_pool_head mptcp_keys_pool
;
198 #define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
199 #define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
201 SYSCTL_DECL(_net_inet
);
203 SYSCTL_NODE(_net_inet
, OID_AUTO
, mptcp
, CTLFLAG_RW
|CTLFLAG_LOCKED
, 0, "MPTCP");
205 uint32_t mptcp_verbose
= 0; /* more noise if greater than 1 */
206 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, verbose
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
207 &mptcp_verbose
, 0, "MPTCP verbosity level");
209 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, pcbcount
, CTLFLAG_RD
|CTLFLAG_LOCKED
,
210 &mtcbinfo
.mppi_count
, 0, "Number of active PCBs");
213 * Since there is one kernel thread per mptcp socket, imposing an artificial
214 * limit on number of allowed mptcp sockets.
216 uint32_t mptcp_socket_limit
= MPPCB_LIMIT
;
217 SYSCTL_UINT(_net_inet_mptcp
, OID_AUTO
, sk_lim
, CTLFLAG_RW
|CTLFLAG_LOCKED
,
218 &mptcp_socket_limit
, 0, "MPTCP socket limit");
220 static struct protosw mptcp_subflow_protosw
;
221 static struct pr_usrreqs mptcp_subflow_usrreqs
;
223 static struct ip6protosw mptcp_subflow_protosw6
;
224 static struct pr_usrreqs mptcp_subflow_usrreqs6
;
228 * Protocol pr_init callback.
231 mptcp_init(struct protosw
*pp
, struct domain
*dp
)
234 static int mptcp_initialized
= 0;
237 struct ip6protosw
*prp6
;
240 VERIFY((pp
->pr_flags
& (PR_INITIALIZED
|PR_ATTACHED
)) == PR_ATTACHED
);
242 /* do this only once */
243 if (mptcp_initialized
)
245 mptcp_initialized
= 1;
248 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
249 * we must be able to find IPPROTO_TCP entries for both.
251 prp
= pffindproto_locked(PF_INET
, IPPROTO_TCP
, SOCK_STREAM
);
253 bcopy(prp
, &mptcp_subflow_protosw
, sizeof (*prp
));
254 bcopy(prp
->pr_usrreqs
, &mptcp_subflow_usrreqs
,
255 sizeof (mptcp_subflow_usrreqs
));
256 mptcp_subflow_protosw
.pr_entry
.tqe_next
= NULL
;
257 mptcp_subflow_protosw
.pr_entry
.tqe_prev
= NULL
;
258 mptcp_subflow_protosw
.pr_usrreqs
= &mptcp_subflow_usrreqs
;
259 mptcp_subflow_usrreqs
.pru_soreceive
= mptcp_subflow_soreceive
;
260 mptcp_subflow_usrreqs
.pru_rcvoob
= pru_rcvoob_notsupp
;
262 * Socket filters shouldn't attach/detach to/from this protosw
263 * since pr_protosw is to be used instead, which points to the
264 * real protocol; if they do, it is a bug and we should panic.
266 mptcp_subflow_protosw
.pr_filter_head
.tqh_first
=
267 (struct socket_filter
*)(uintptr_t)0xdeadbeefdeadbeef;
268 mptcp_subflow_protosw
.pr_filter_head
.tqh_last
=
269 (struct socket_filter
**)(uintptr_t)0xdeadbeefdeadbeef;
272 prp6
= (struct ip6protosw
*)pffindproto_locked(PF_INET6
,
273 IPPROTO_TCP
, SOCK_STREAM
);
274 VERIFY(prp6
!= NULL
);
275 bcopy(prp6
, &mptcp_subflow_protosw6
, sizeof (*prp6
));
276 bcopy(prp6
->pr_usrreqs
, &mptcp_subflow_usrreqs6
,
277 sizeof (mptcp_subflow_usrreqs6
));
278 mptcp_subflow_protosw6
.pr_entry
.tqe_next
= NULL
;
279 mptcp_subflow_protosw6
.pr_entry
.tqe_prev
= NULL
;
280 mptcp_subflow_protosw6
.pr_usrreqs
= &mptcp_subflow_usrreqs6
;
281 mptcp_subflow_usrreqs6
.pru_soreceive
= mptcp_subflow_soreceive
;
282 mptcp_subflow_usrreqs6
.pru_rcvoob
= pru_rcvoob_notsupp
;
284 * Socket filters shouldn't attach/detach to/from this protosw
285 * since pr_protosw is to be used instead, which points to the
286 * real protocol; if they do, it is a bug and we should panic.
288 mptcp_subflow_protosw6
.pr_filter_head
.tqh_first
=
289 (struct socket_filter
*)(uintptr_t)0xdeadbeefdeadbeef;
290 mptcp_subflow_protosw6
.pr_filter_head
.tqh_last
=
291 (struct socket_filter
**)(uintptr_t)0xdeadbeefdeadbeef;
294 bzero(&mtcbinfo
, sizeof (mtcbinfo
));
295 TAILQ_INIT(&mtcbinfo
.mppi_pcbs
);
296 mtcbinfo
.mppi_size
= sizeof (struct mpp_mtp
);
297 if ((mtcbinfo
.mppi_zone
= zinit(mtcbinfo
.mppi_size
,
298 1024 * mtcbinfo
.mppi_size
, 8192, "mptcb")) == NULL
) {
299 panic("%s: unable to allocate MPTCP PCB zone\n", __func__
);
302 zone_change(mtcbinfo
.mppi_zone
, Z_CALLERACCT
, FALSE
);
303 zone_change(mtcbinfo
.mppi_zone
, Z_EXPAND
, TRUE
);
305 mtcbinfo
.mppi_lock_grp_attr
= lck_grp_attr_alloc_init();
306 mtcbinfo
.mppi_lock_grp
= lck_grp_alloc_init("mppcb",
307 mtcbinfo
.mppi_lock_grp_attr
);
308 mtcbinfo
.mppi_lock_attr
= lck_attr_alloc_init();
309 lck_mtx_init(&mtcbinfo
.mppi_lock
, mtcbinfo
.mppi_lock_grp
,
310 mtcbinfo
.mppi_lock_attr
);
311 mtcbinfo
.mppi_gc
= mptcp_gc
;
313 mtcbinfo
.mppi_timer
= mptcp_timer
;
315 /* attach to MP domain for garbage collection to take place */
316 mp_pcbinfo_attach(&mtcbinfo
);
318 mptsub_zone_size
= sizeof (struct mptsub
);
319 if ((mptsub_zone
= zinit(mptsub_zone_size
, 1024 * mptsub_zone_size
,
320 8192, "mptsub")) == NULL
) {
321 panic("%s: unable to allocate MPTCP subflow zone\n", __func__
);
324 zone_change(mptsub_zone
, Z_CALLERACCT
, FALSE
);
325 zone_change(mptsub_zone
, Z_EXPAND
, TRUE
);
327 mptopt_zone_size
= sizeof (struct mptopt
);
328 if ((mptopt_zone
= zinit(mptopt_zone_size
, 128 * mptopt_zone_size
,
329 1024, "mptopt")) == NULL
) {
330 panic("%s: unable to allocate MPTCP option zone\n", __func__
);
333 zone_change(mptopt_zone
, Z_CALLERACCT
, FALSE
);
334 zone_change(mptopt_zone
, Z_EXPAND
, TRUE
);
336 mpt_subauth_entry_size
= sizeof (struct mptcp_subf_auth_entry
);
337 if ((mpt_subauth_zone
= zinit(mpt_subauth_entry_size
,
338 1024 * mpt_subauth_entry_size
, 8192, "mptauth")) == NULL
) {
339 panic("%s: unable to allocate MPTCP address auth zone \n",
343 zone_change(mpt_subauth_zone
, Z_CALLERACCT
, FALSE
);
344 zone_change(mpt_subauth_zone
, Z_EXPAND
, TRUE
);
346 /* Set up a list of unique keys */
347 mptcp_key_pool_init();
352 * Create an MPTCP session, called as a result of opening a MPTCP socket.
355 mptcp_sescreate(struct socket
*mp_so
, struct mppcb
*mpp
)
357 struct mppcbinfo
*mppi
;
363 mppi
= mpp
->mpp_pcbinfo
;
364 VERIFY(mppi
!= NULL
);
366 mpte
= &((struct mpp_mtp
*)mpp
)->mpp_ses
;
367 mp_tp
= &((struct mpp_mtp
*)mpp
)->mtcb
;
369 /* MPTCP Multipath PCB Extension */
370 bzero(mpte
, sizeof (*mpte
));
371 VERIFY(mpp
->mpp_pcbe
== NULL
);
372 mpp
->mpp_pcbe
= mpte
;
373 mpte
->mpte_mppcb
= mpp
;
374 mpte
->mpte_mptcb
= mp_tp
;
376 TAILQ_INIT(&mpte
->mpte_sopts
);
377 TAILQ_INIT(&mpte
->mpte_subflows
);
378 mpte
->mpte_associd
= ASSOCID_ANY
;
379 mpte
->mpte_connid_last
= CONNID_ANY
;
381 lck_mtx_init(&mpte
->mpte_thread_lock
, mppi
->mppi_lock_grp
,
382 mppi
->mppi_lock_attr
);
387 * This can be rather expensive if we have lots of MPTCP sockets,
388 * but we need a kernel thread for this model to work. Perhaps we
389 * could amortize the costs by having one worker thread per a group
392 if (kernel_thread_start(mptcp_thread_func
, mpte
,
393 &mpte
->mpte_thread
) != KERN_SUCCESS
) {
397 mp_so
->so_usecount
++; /* for thread */
399 /* MPTCP Protocol Control Block */
400 bzero(mp_tp
, sizeof (*mp_tp
));
401 lck_mtx_init(&mp_tp
->mpt_lock
, mppi
->mppi_lock_grp
,
402 mppi
->mppi_lock_attr
);
403 mp_tp
->mpt_mpte
= mpte
;
407 lck_mtx_destroy(&mpte
->mpte_thread_lock
, mppi
->mppi_lock_grp
);
408 DTRACE_MPTCP5(session__create
, struct socket
*, mp_so
,
409 struct sockbuf
*, &mp_so
->so_rcv
,
410 struct sockbuf
*, &mp_so
->so_snd
,
411 struct mppcb
*, mpp
, int, error
);
413 return ((error
!= 0) ? NULL
: mpte
);
417 * Destroy an MPTCP session.
420 mptcp_sesdestroy(struct mptses
*mpte
)
424 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
426 mp_tp
= mpte
->mpte_mptcb
;
427 VERIFY(mp_tp
!= NULL
);
430 * MPTCP Multipath PCB Extension section
432 mptcp_flush_sopts(mpte
);
433 VERIFY(TAILQ_EMPTY(&mpte
->mpte_subflows
) && mpte
->mpte_numflows
== 0);
435 lck_mtx_destroy(&mpte
->mpte_thread_lock
,
436 mpte
->mpte_mppcb
->mpp_pcbinfo
->mppi_lock_grp
);
439 * MPTCP Protocol Control Block section
441 lck_mtx_destroy(&mp_tp
->mpt_lock
,
442 mpte
->mpte_mppcb
->mpp_pcbinfo
->mppi_lock_grp
);
444 DTRACE_MPTCP2(session__destroy
, struct mptses
*, mpte
,
445 struct mptcb
*, mp_tp
);
449 * Allocate an MPTCP socket option structure.
452 mptcp_sopt_alloc(int how
)
456 mpo
= (how
== M_WAITOK
) ? zalloc(mptopt_zone
) :
457 zalloc_noblock(mptopt_zone
);
459 bzero(mpo
, mptopt_zone_size
);
466 * Free an MPTCP socket option structure.
469 mptcp_sopt_free(struct mptopt
*mpo
)
471 VERIFY(!(mpo
->mpo_flags
& MPOF_ATTACHED
));
473 zfree(mptopt_zone
, mpo
);
477 * Add a socket option to the MPTCP socket option list.
480 mptcp_sopt_insert(struct mptses
*mpte
, struct mptopt
*mpo
)
482 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
483 VERIFY(!(mpo
->mpo_flags
& MPOF_ATTACHED
));
484 mpo
->mpo_flags
|= MPOF_ATTACHED
;
485 TAILQ_INSERT_TAIL(&mpte
->mpte_sopts
, mpo
, mpo_entry
);
489 * Remove a socket option from the MPTCP socket option list.
492 mptcp_sopt_remove(struct mptses
*mpte
, struct mptopt
*mpo
)
494 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
495 VERIFY(mpo
->mpo_flags
& MPOF_ATTACHED
);
496 mpo
->mpo_flags
&= ~MPOF_ATTACHED
;
497 TAILQ_REMOVE(&mpte
->mpte_sopts
, mpo
, mpo_entry
);
501 * Search for an existing <sopt_level,sopt_name> socket option.
504 mptcp_sopt_find(struct mptses
*mpte
, struct sockopt
*sopt
)
508 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
510 TAILQ_FOREACH(mpo
, &mpte
->mpte_sopts
, mpo_entry
) {
511 if (mpo
->mpo_level
== sopt
->sopt_level
&&
512 mpo
->mpo_name
== sopt
->sopt_name
)
515 VERIFY(mpo
== NULL
|| sopt
->sopt_valsize
== sizeof (int));
521 * Flushes all recorded socket options from an MP socket.
524 mptcp_flush_sopts(struct mptses
*mpte
)
526 struct mptopt
*mpo
, *tmpo
;
528 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
530 TAILQ_FOREACH_SAFE(mpo
, &mpte
->mpte_sopts
, mpo_entry
, tmpo
) {
531 mptcp_sopt_remove(mpte
, mpo
);
532 mptcp_sopt_free(mpo
);
534 VERIFY(TAILQ_EMPTY(&mpte
->mpte_sopts
));
538 * Allocate a MPTCP subflow structure.
541 mptcp_subflow_alloc(int how
)
545 mpts
= (how
== M_WAITOK
) ? zalloc(mptsub_zone
) :
546 zalloc_noblock(mptsub_zone
);
548 bzero(mpts
, mptsub_zone_size
);
549 lck_mtx_init(&mpts
->mpts_lock
, mtcbinfo
.mppi_lock_grp
,
550 mtcbinfo
.mppi_lock_attr
);
557 * Deallocate a subflow structure, called when all of the references held
558 * on it have been released. This implies that the subflow has been deleted.
561 mptcp_subflow_free(struct mptsub
*mpts
)
563 MPTS_LOCK_ASSERT_HELD(mpts
);
565 VERIFY(mpts
->mpts_refcnt
== 0);
566 VERIFY(!(mpts
->mpts_flags
& MPTSF_ATTACHED
));
567 VERIFY(mpts
->mpts_mpte
== NULL
);
568 VERIFY(mpts
->mpts_socket
== NULL
);
570 if (mpts
->mpts_src_sl
!= NULL
) {
571 sockaddrlist_free(mpts
->mpts_src_sl
);
572 mpts
->mpts_src_sl
= NULL
;
574 if (mpts
->mpts_dst_sl
!= NULL
) {
575 sockaddrlist_free(mpts
->mpts_dst_sl
);
576 mpts
->mpts_dst_sl
= NULL
;
579 lck_mtx_destroy(&mpts
->mpts_lock
, mtcbinfo
.mppi_lock_grp
);
581 zfree(mptsub_zone
, mpts
);
585 * Create an MPTCP subflow socket.
588 mptcp_subflow_socreate(struct mptses
*mpte
, struct mptsub
*mpts
, int dom
,
589 struct proc
*p
, struct socket
**so
)
591 struct mptopt smpo
, *mpo
, *tmpo
;
592 struct socket
*mp_so
;
596 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
597 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
600 * Create the subflow socket (multipath subflow, non-blocking.)
602 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
603 * socket; it will be cleared when the socket is peeled off or closed.
604 * It also indicates to the underlying TCP to handle MPTCP options.
605 * A multipath subflow socket implies SS_NOFDREF state.
607 if ((error
= socreate_internal(dom
, so
, SOCK_STREAM
,
608 IPPROTO_TCP
, p
, SOCF_ASYNC
| SOCF_MP_SUBFLOW
, PROC_NULL
)) != 0) {
609 mptcplog((LOG_ERR
, "MPTCP ERROR %s: mp_so 0x%llx unable to "
610 "create subflow socket error %d\n", __func__
,
611 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
), error
));
616 VERIFY((*so
)->so_flags
& SOF_MP_SUBFLOW
);
617 VERIFY(((*so
)->so_state
& (SS_NBIO
|SS_NOFDREF
)) ==
618 (SS_NBIO
|SS_NOFDREF
));
620 /* prevent the socket buffers from being compressed */
621 (*so
)->so_rcv
.sb_flags
|= SB_NOCOMPRESS
;
622 (*so
)->so_snd
.sb_flags
|= SB_NOCOMPRESS
;
624 bzero(&smpo
, sizeof (smpo
));
625 smpo
.mpo_flags
|= MPOF_SUBFLOW_OK
;
626 smpo
.mpo_level
= SOL_SOCKET
;
629 /* disable SIGPIPE */
630 smpo
.mpo_name
= SO_NOSIGPIPE
;
631 if ((error
= mptcp_subflow_sosetopt(mpte
, *so
, &smpo
)) != 0)
634 /* find out if the subflow's source address goes away */
635 smpo
.mpo_name
= SO_NOADDRERR
;
636 if ((error
= mptcp_subflow_sosetopt(mpte
, *so
, &smpo
)) != 0)
639 /* enable keepalive */
640 smpo
.mpo_name
= SO_KEEPALIVE
;
641 if ((error
= mptcp_subflow_sosetopt(mpte
, *so
, &smpo
)) != 0)
645 * Limit the receive socket buffer size to 64k.
647 * We need to take into consideration the window scale option
648 * which could be negotiated in one subflow but disabled in
650 * XXX This can be improved in the future.
652 smpo
.mpo_name
= SO_RCVBUF
;
653 smpo
.mpo_intval
= MPTCP_RWIN_MAX
;
654 if ((error
= mptcp_subflow_sosetopt(mpte
, *so
, &smpo
)) != 0)
657 /* N.B.: set by sosetopt */
658 VERIFY(!((*so
)->so_rcv
.sb_flags
& SB_AUTOSIZE
));
659 /* Prevent automatic socket buffer sizing. */
660 (*so
)->so_snd
.sb_flags
&= ~SB_AUTOSIZE
;
662 smpo
.mpo_level
= IPPROTO_TCP
;
663 smpo
.mpo_intval
= mptcp_subflow_keeptime
;
664 smpo
.mpo_name
= TCP_KEEPALIVE
;
665 if ((error
= mptcp_subflow_sosetopt(mpte
, *so
, &smpo
)) != 0)
668 /* replay setsockopt(2) on the subflow sockets for eligible options */
669 TAILQ_FOREACH_SAFE(mpo
, &mpte
->mpte_sopts
, mpo_entry
, tmpo
) {
672 if (!(mpo
->mpo_flags
& MPOF_SUBFLOW_OK
))
676 * Skip those that are handled internally; these options
677 * should not have been recorded and marked with the
678 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
680 if (mpo
->mpo_level
== SOL_SOCKET
&&
681 (mpo
->mpo_name
== SO_NOSIGPIPE
||
682 mpo
->mpo_name
== SO_NOADDRERR
||
683 mpo
->mpo_name
== SO_KEEPALIVE
))
686 interim
= (mpo
->mpo_flags
& MPOF_INTERIM
);
687 if (mptcp_subflow_sosetopt(mpte
, *so
, mpo
) != 0 && interim
) {
689 mptcplog((LOG_ERR
, "%s: mp_so 0x%llx sopt %s val %d "
690 "interim record removed\n", __func__
,
691 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
692 mptcp_sopt2str(mpo
->mpo_level
, mpo
->mpo_name
,
693 buf
, sizeof (buf
)), mpo
->mpo_intval
));
694 mptcp_sopt_remove(mpte
, mpo
);
695 mptcp_sopt_free(mpo
);
701 * We need to receive everything that the subflow socket has,
702 * so use a customized socket receive function. We will undo
703 * this when the socket is peeled off or closed.
705 mpts
->mpts_oprotosw
= (*so
)->so_proto
;
708 (*so
)->so_proto
= &mptcp_subflow_protosw
;
712 (*so
)->so_proto
= (struct protosw
*)&mptcp_subflow_protosw6
;
721 socket_unlock(*so
, 0);
723 DTRACE_MPTCP4(subflow__create
, struct mptses
*, mpte
,
724 struct mptsub
*, mpts
, int, dom
, int, error
);
730 * Close an MPTCP subflow socket.
732 * Note that this may be called on an embryonic subflow, and the only
733 * thing that is guaranteed valid is the protocol-user request.
736 mptcp_subflow_soclose(struct mptsub
*mpts
, struct socket
*so
)
738 MPTS_LOCK_ASSERT_HELD(mpts
);
741 VERIFY(so
->so_flags
& SOF_MP_SUBFLOW
);
742 VERIFY((so
->so_state
& (SS_NBIO
|SS_NOFDREF
)) == (SS_NBIO
|SS_NOFDREF
));
744 /* restore protocol-user requests */
745 VERIFY(mpts
->mpts_oprotosw
!= NULL
);
746 so
->so_proto
= mpts
->mpts_oprotosw
;
747 socket_unlock(so
, 0);
749 mpts
->mpts_socket
= NULL
; /* may already be NULL */
751 DTRACE_MPTCP5(subflow__close
, struct mptsub
*, mpts
,
753 struct sockbuf
*, &so
->so_rcv
,
754 struct sockbuf
*, &so
->so_snd
,
755 struct mptses
*, mpts
->mpts_mpte
);
757 return (soclose(so
));
761 * Connect an MPTCP subflow socket.
763 * This may be called inline as part of adding a subflow, or asynchronously
764 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
765 * pending connect case, the subflow socket may have been bound to an interface
766 * and/or a source IP address which may no longer be around by the time this
767 * routine is called; in that case the connect attempt will most likely fail.
770 mptcp_subflow_soconnectx(struct mptses
*mpte
, struct mptsub
*mpts
)
775 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
776 MPTS_LOCK_ASSERT_HELD(mpts
);
778 VERIFY((mpts
->mpts_flags
& (MPTSF_CONNECTING
|MPTSF_CONNECTED
)) ==
780 VERIFY(mpts
->mpts_socket
!= NULL
);
781 so
= mpts
->mpts_socket
;
782 af
= mpts
->mpts_family
;
784 if (af
== AF_INET
|| af
== AF_INET6
) {
785 struct sockaddr_entry
*dst_se
;
786 char dbuf
[MAX_IPv6_STR_LEN
];
788 dst_se
= TAILQ_FIRST(&mpts
->mpts_dst_sl
->sl_head
);
789 VERIFY(dst_se
!= NULL
);
791 mptcplog((LOG_DEBUG
, "%s: mp_so 0x%llx dst %s[%d] cid %d "
792 "[pended %s]\n", __func__
,
793 (u_int64_t
)VM_KERNEL_ADDRPERM(mpte
->mpte_mppcb
->mpp_socket
),
794 inet_ntop(af
, ((af
== AF_INET
) ?
795 (void *)&SIN(dst_se
->se_addr
)->sin_addr
.s_addr
:
796 (void *)&SIN6(dst_se
->se_addr
)->sin6_addr
),
797 dbuf
, sizeof (dbuf
)), ((af
== AF_INET
) ?
798 ntohs(SIN(dst_se
->se_addr
)->sin_port
) :
799 ntohs(SIN6(dst_se
->se_addr
)->sin6_port
)),
801 ((mpts
->mpts_flags
& MPTSF_CONNECT_PENDING
) ?
805 mpts
->mpts_flags
&= ~MPTSF_CONNECT_PENDING
;
808 mptcp_attach_to_subf(so
, mpte
->mpte_mptcb
, mpts
->mpts_connid
);
809 /* connect the subflow socket */
810 error
= soconnectxlocked(so
, &mpts
->mpts_src_sl
, &mpts
->mpts_dst_sl
,
811 mpts
->mpts_mpcr
.mpcr_proc
, mpts
->mpts_mpcr
.mpcr_ifscope
,
812 mpte
->mpte_associd
, NULL
, TCP_CONNREQF_MPTCP
,
813 &mpts
->mpts_mpcr
, sizeof (mpts
->mpts_mpcr
));
814 socket_unlock(so
, 0);
816 DTRACE_MPTCP3(subflow__connect
, struct mptses
*, mpte
,
817 struct mptsub
*, mpts
, int, error
);
823 * MPTCP subflow socket receive routine, derived from soreceive().
826 mptcp_subflow_soreceive(struct socket
*so
, struct sockaddr
**psa
,
827 struct uio
*uio
, struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
830 int flags
, error
= 0;
831 struct proc
*p
= current_proc();
832 struct mbuf
*m
, **mp
= mp0
;
833 struct mbuf
*nextrecord
;
836 VERIFY(so
->so_proto
->pr_flags
& PR_CONNREQUIRED
);
838 #ifdef MORE_LOCKING_DEBUG
839 if (so
->so_usecount
== 1) {
840 panic("%s: so=%x no other reference on socket\n", __func__
, so
);
845 * We return all that is there in the subflow's socket receive buffer
846 * to the MPTCP layer, so we require that the caller passes in the
847 * expected parameters.
849 if (mp
== NULL
|| controlp
!= NULL
) {
850 socket_unlock(so
, 1);
857 flags
= *flagsp
&~ MSG_EOR
;
861 if (flags
& (MSG_PEEK
|MSG_OOB
|MSG_NEEDSA
|MSG_WAITALL
|MSG_WAITSTREAM
)) {
862 socket_unlock(so
, 1);
865 flags
|= (MSG_DONTWAIT
|MSG_NBIO
);
868 * If a recv attempt is made on a previously-accepted socket
869 * that has been marked as inactive (disconnected), reject
872 if (so
->so_flags
& SOF_DEFUNCT
) {
873 struct sockbuf
*sb
= &so
->so_rcv
;
876 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
877 __func__
, proc_pid(p
), (uint64_t)VM_KERNEL_ADDRPERM(so
),
878 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
880 * This socket should have been disconnected and flushed
881 * prior to being returned from sodefunct(); there should
882 * be no data on its receive list, so panic otherwise.
884 if (so
->so_state
& SS_DEFUNCT
)
885 sb_empty_assert(sb
, __func__
);
886 socket_unlock(so
, 1);
891 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
892 * and if so just return to the caller. This could happen when
893 * soreceive() is called by a socket upcall function during the
894 * time the socket is freed. The socket buffer would have been
895 * locked across the upcall, therefore we cannot put this thread
896 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
897 * we may livelock), because the lock on the socket buffer will
898 * only be released when the upcall routine returns to its caller.
899 * Because the socket has been officially closed, there can be
900 * no further read on it.
902 * A multipath subflow socket would have its SS_NOFDREF set by
903 * default, so check for SOF_MP_SUBFLOW socket flag; when the
904 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
906 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
907 (SS_NOFDREF
| SS_CANTRCVMORE
) && !(so
->so_flags
& SOF_MP_SUBFLOW
)) {
908 socket_unlock(so
, 1);
913 * For consistency with soreceive() semantics, we need to obey
914 * SB_LOCK in case some other code path has locked the buffer.
916 error
= sblock(&so
->so_rcv
, 0);
918 socket_unlock(so
, 1);
922 m
= so
->so_rcv
.sb_mb
;
925 * Panic if we notice inconsistencies in the socket's
926 * receive list; both sb_mb and sb_cc should correctly
927 * reflect the contents of the list, otherwise we may
928 * end up with false positives during select() or poll()
929 * which could put the application in a bad state.
931 SB_MB_CHECK(&so
->so_rcv
);
933 if (so
->so_error
!= 0) {
934 error
= so
->so_error
;
939 if (so
->so_state
& SS_CANTRCVMORE
) {
943 if (!(so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
))) {
949 * MSG_DONTWAIT is implicitly defined and this routine will
950 * never block, so return EWOULDBLOCK when there is nothing.
956 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
957 SBLASTRECORDCHK(&so
->so_rcv
, "mptcp_subflow_soreceive 1");
958 SBLASTMBUFCHK(&so
->so_rcv
, "mptcp_subflow_soreceive 1");
961 nextrecord
= m
->m_nextpkt
;
962 sbfree(&so
->so_rcv
, m
);
967 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
972 m
->m_nextpkt
= nextrecord
;
973 if (nextrecord
== NULL
)
974 so
->so_rcv
.sb_lastrecord
= m
;
976 m
= so
->so_rcv
.sb_mb
= nextrecord
;
977 SB_EMPTY_FIXUP(&so
->so_rcv
);
979 SBLASTRECORDCHK(&so
->so_rcv
, "mptcp_subflow_soreceive 2");
980 SBLASTMBUFCHK(&so
->so_rcv
, "mptcp_subflow_soreceive 2");
983 DTRACE_MPTCP3(subflow__receive
, struct socket
*, so
,
984 struct sockbuf
*, &so
->so_rcv
, struct sockbuf
*, &so
->so_snd
);
985 /* notify protocol that we drained all the data */
986 if ((so
->so_proto
->pr_flags
& PR_WANTRCVD
) && so
->so_pcb
!= NULL
)
987 (*so
->so_proto
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
993 sbunlock(&so
->so_rcv
, FALSE
); /* will unlock socket */
/*
 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
 * the work done earlier when the subflow socket was created.
 *
 * NOTE(review): this excerpt omits several original source lines (the
 * function's return type, the remainder of the parameter list — the
 * subflow socket "so" — and the local declarations for "smpo", "p" and
 * "c", plus enclosing braces).  Only the visible code is reproduced.
 */
mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
	struct socket *mp_so;

	/* caller must hold both the MP socket lock and the subflow lock */
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* sanity: still a subflow socket, non-blocking and with no fd ref */
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	/* inherit MPTCP socket states */
	if (!(mp_so->so_state & SS_NBIO))
		so->so_state &= ~SS_NBIO;

	/*
	 * At this point, the socket is not yet closed, as there is at least
	 * one outstanding usecount previously held by mpts_socket from
	 * socreate().  Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
	 */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	so->so_state &= ~SS_NOFDREF;
	/*
	 * NOTE(review): SOF_MPTCP_TRUE is an SOF_* (so_flags) bit, yet it is
	 * cleared from so_state, which holds SS_* bits.  This looks like a
	 * bug — presumably it should be "so->so_flags &= ~SOF_MPTCP_TRUE";
	 * confirm against the full source before changing.
	 */
	so->so_state &= ~SOF_MPTCP_TRUE;

	/* allow socket buffers to be compressed */
	so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
	so->so_snd.sb_flags &= ~SB_NOCOMPRESS;

	/*
	 * Allow socket buffer auto sizing.
	 *
	 * This will increase the current 64k buffer size to whatever is best.
	 */
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;

	/* restore protocol-user requests */
	VERIFY(mpts->mpts_oprotosw != NULL);
	so->so_proto = mpts->mpts_oprotosw;

	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;

	/* inherit SOF_NOSIGPIPE from parent MP socket */
	p = (mp_so->so_flags & SOF_NOSIGPIPE);
	c = (so->so_flags & SOF_NOSIGPIPE);
	/* (p - c) > 0 iff the parent has the bit and the child does not */
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_NOSIGPIPE;
	(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* inherit SOF_NOADDRAVAIL from parent MP socket */
	p = (mp_so->so_flags & SOF_NOADDRAVAIL);
	c = (so->so_flags & SOF_NOADDRAVAIL);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_NOADDRERR;
	(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* inherit SO_KEEPALIVE from parent MP socket */
	p = (mp_so->so_options & SO_KEEPALIVE);
	c = (so->so_options & SO_KEEPALIVE);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_KEEPALIVE;
	(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* unset TCP level default keepalive option */
	p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
	c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = 0;
	smpo.mpo_name = TCP_KEEPALIVE;
	(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
	socket_unlock(so, 0);

	DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
	    struct mptsub *, mpts, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * NOTE(review): this excerpt omits a number of original source lines
 * (the return type, local declarations such as "af", "error" and "lport",
 * error-path statements/returns and several braces).  Only the visible
 * code is reproduced; brace balance below reflects the omissions.
 */
mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
    struct proc *p, uint32_t ifscope)
	struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
	struct socket *mp_so, *so = NULL;
	struct mptsub_connreq mpcr;
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	/*
	 * The subflow must be brand new: not connecting/connected, not yet
	 * associated with a session or socket, with a destination list but
	 * no connection ID assigned.
	 */
	VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);
	VERIFY(mpts->mpts_dst_sl != NULL);
	VERIFY(mpts->mpts_connid == CONNID_ANY);

	/* select source (if specified) and destination addresses */
	if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
	    &mpts->mpts_dst_sl, &dst_se)) != 0)

	VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
	VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
	af = mpts->mpts_family = dst_se->se_addr->sa_family;
	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
	VERIFY(af == AF_INET || af == AF_INET6);

	/*
	 * If the source address is not specified, allocate a storage for
	 * it, so that later on we can fill it in with the actual source
	 * IP address chosen by the underlying layer for the subflow after
	 * it is connected.
	 */
	if (mpts->mpts_src_sl == NULL) {
		/* assumes the missing line assigns the duplicated list to
		 * mpts->mpts_src_sl — TODO confirm against the full source */
		sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
		if (mpts->mpts_src_sl == NULL) {
		/* blank the copied address, keeping only length and family,
		 * so the stack can fill in the chosen source later */
		se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
		VERIFY(se != NULL && se->se_addr != NULL &&
		    se->se_addr->sa_len == dst_se->se_addr->sa_len);
		bzero(se->se_addr, se->se_addr->sa_len);
		se->se_addr->sa_len = dst_se->se_addr->sa_len;
		se->se_addr->sa_family = dst_se->se_addr->sa_family;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)

	/*
	 * XXX: adi@apple.com
	 *
	 * This probably needs to be made smarter, but for now simply
	 * increment the counter, while avoiding 0 (CONNID_ANY) and
	 * -1 (CONNID_ALL).  Assume that an MPTCP connection will not
	 * live too long with (2^32)-2 subflow connection attempts.
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == CONNID_ALL ||
	    mpte->mpte_connid_last == CONNID_ANY)
		mpte->mpte_connid_last++;

	mpts->mpts_connid = mpte->mpte_connid_last;
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* bind subflow socket to the specified interface */
	if (ifscope != IFSCOPE_NONE) {
		error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
		/* on bind failure: drop the subflow socket (guard lines for
		 * the error path are not visible in this excerpt) */
		socket_unlock(so, 0);
		(void) mptcp_subflow_soclose(mpts, so);
		VERIFY(mpts->mpts_outif != NULL);
		mpts->mpts_flags |= MPTSF_BOUND_IF;

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindif %s[%d] "
		    "cid %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_outif->if_xname,
		    ifscope, mpts->mpts_connid));
		socket_unlock(so, 0);

	/* if source address and/or port is specified, bind to it */
	if (src_se != NULL) {
		struct sockaddr *sa = src_se->se_addr;
		uint32_t mpts_flags = 0;

		/* IPv4: record whether an explicit address/port was given */
		if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
			mpts_flags |= MPTSF_BOUND_IP;
		if ((lport = SIN(sa)->sin_port) != 0)
			mpts_flags |= MPTSF_BOUND_PORT;

		/* IPv6 variant of the same checks */
		VERIFY(af == AF_INET6);
		if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
			mpts_flags |= MPTSF_BOUND_IP;
		if ((lport = SIN6(sa)->sin6_port) != 0)
			mpts_flags |= MPTSF_BOUND_PORT;

		error = sobindlock(so, sa, 1);	/* will lock/unlock socket */
		(void) mptcp_subflow_soclose(mpts, so);
		mpts->mpts_flags |= mpts_flags;

		if (af == AF_INET || af == AF_INET6) {
			char sbuf[MAX_IPv6_STR_LEN];

			mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindip %s[%d] "
			    "cid %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    inet_ntop(af, ((af == AF_INET) ?
			    (void *)&SIN(sa)->sin_addr.s_addr :
			    (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
			    ntohs(lport), mpts->mpts_connid));

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	MPTS_ADDREF_LOCKED(mpts);	/* for being in MPTCP subflow list */
	MPTS_ADDREF_LOCKED(mpts);	/* for subflow socket */
	mp_so->so_usecount++;		/* for subflow socket */

	/* register for subflow socket read/write events */
	(void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
	    mptcp_subflow_wupcall, mpts);

	/*
	 * Register for subflow socket control events; ignore
	 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
	 * will generate it here.
	 */
	(void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
	    SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
	    SO_FILT_HINT_MUSTRST);

	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));

	bzero(&mpcr, sizeof (mpcr));
	mpcr.mpcr_ifscope = ifscope;

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		/* first subflow: reserve the local key, set properties */
		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mp_tp->mpt_localkey = mptcp_reserve_key();
			mptcp_conn_properties(mp_tp);
		soisconnecting(mp_so);
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
		/* additional subflow: defer the join until allowed */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;

	mpts->mpts_mpcr = mpcr;
	mpts->mpts_flags |= MPTSF_CONNECTING;

	if (af == AF_INET || af == AF_INET6) {
		char dbuf[MAX_IPv6_STR_LEN];

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pending %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
		error = mptcp_subflow_soconnectx(mpte, mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED |
	    SO_FILT_HINT_CONNINFO_UPDATED);
/*
 * Decide whether a subflow may be deleted right now.
 *
 * NOTE(review): the return type, opening brace and the return statements
 * of this function are not visible in this excerpt; only the guard
 * condition is shown.
 */
mptcp_delete_ok(struct mptses *mpte, struct mptsub *mpts)
	struct mptcb *mp_tp = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);
	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	/*
	 * A subflow with no pending error, still marked active, whose MPTCP
	 * connection is in a live state (not CLOSED and not past TIME_WAIT)
	 * is still in use; presumably deletion is refused in that case —
	 * confirm against the full source (body of this "if" not visible).
	 */
	if ((mpts->mpts_soerror == 0) &&
	    (mpts->mpts_flags & MPTSF_ACTIVE) &&
	    (mp_tp->mpt_state != MPTCPS_CLOSED) &&
	    (mp_tp->mpt_state <= MPTCPS_TIME_WAIT))
/*
 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 * If peeloff(2) is called, leave the socket open.
 *
 * NOTE(review): some original lines (subflow locking, part of the debug
 * log arguments, the "if (close)" guard before soclose, and braces) are
 * not visible in this excerpt.
 */
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
	struct socket *mp_so, *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d "
	    "[close %s] %d %x\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
	    mp_so->so_retaincnt, mpts->mpts_connid,
	    (close ? "YES" : "NO"), mpts->mpts_soerror,

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* detach the subflow from the session's list */
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	VERIFY(mpte->mpte_numflows != 0);
	mpte->mpte_numflows--;

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	(void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
	(void) sock_catchevents(so, NULL, NULL, 0);
	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
	/* close the subflow socket (the "close" guard is not visible here) */
	(void) mptcp_subflow_soclose(mpts, so);

	/* release the MP socket's use count taken for the subflow socket */
	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	MPTS_REMREF(mpts);		/* for MPTCP subflow list */
	MPTS_REMREF(mpts);		/* for subflow socket */

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
/*
 * Disconnect a subflow socket.
 *
 * NOTE(review): the remainder of the parameter list (a third parameter,
 * referenced below as "deleteok"), the declarations of "so" and
 * "send_dfin", several guards, returns and braces fall on lines not
 * visible in this excerpt.
 */
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* nothing to do if a disconnect is already in progress or done */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/*
	 * If this is coming from disconnectx(2) or issued as part of
	 * closing the MPTCP socket, the subflow shouldn't stick around.
	 * Otherwise let it linger around in case the upper layers need
	 * to retrieve its conninfo.
	 */
	mpts->mpts_flags |= MPTSF_DELETEOK;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;

	/* past ESTABLISHED a DATA_FIN is in order; presumably the missing
	 * body sets "send_dfin" — confirm against the full source */
	if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)

	/* only shut down subflows that are actually connected */
	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		mptcplog((LOG_DEBUG, "%s: cid %d fin %d [linger %s]\n",
		    __func__, mpts->mpts_connid, send_dfin,
		    (deleteok ? "NO" : "YES")));

		/* send the MPTCP DATA_FIN, then shut both directions down */
		mptcp_send_dfin(so);
		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	socket_unlock(so, 0);
	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.  This will also end up
	 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
	 * we cannot do that here because subflow lock is currently held.
	 */
	mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
/*
 * Subflow socket read upcall.
 *
 * Called when the associated subflow socket posted a read event.  The subflow
 * socket lock has been released prior to invoking the callback.  Note that the
 * upcall may occur synchronously as a result of MPTCP performing an action on
 * it, or asynchronously as a result of an event happening at the subflow layer.
 * Therefore, to maintain lock ordering, the only lock that can be acquired
 * here is the thread lock, for signalling purposes.
 *
 * NOTE(review): the return type and enclosing braces are on lines not
 * visible in this excerpt.
 */
mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
#pragma unused(so, waitf)
	struct mptsub *mpts = arg;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	/* wake the MPTCP service thread; actual input happens there */
	lck_mtx_lock(&mpte->mpte_thread_lock);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
/*
 * Subflow socket input.
 *
 * Called in the context of the MPTCP thread, for reading data from the
 * underlying subflow socket and delivering it to MPTCP.
 *
 * NOTE(review): several original lines (the declarations of "so" and
 * "error", early returns, the fallback-handling body and braces) are not
 * visible in this excerpt.
 */
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
	struct mbuf *m = NULL;
	struct mptsub *mpts_alt = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* only read from subflows that have completed the handshake */
	if (!(mpts->mpts_flags & MPTSF_CONNECTED))

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		/* hard receive error: look for an alternate subflow */
		mptcplog((LOG_ERR, "%s: cid %d error %d\n",
		    __func__, mpts->mpts_connid, error));
		mpts_alt = mptcp_get_subflow(mpte, mpts);
		if (mpts_alt == NULL) {
			/* no alternate path: surface the error on MP socket */
			mptcplog((LOG_ERR, "%s: no alt path cid %d\n",
			    __func__, mpts->mpts_connid));
			mpte->mpte_mppcb->mpp_socket->so_error = error;
	} else if (error == 0) {
		mptcplog3((LOG_DEBUG, "%s: cid %d \n",
		    __func__, mpts->mpts_connid));

	/* In fallback, make sure to accept data on all but one subflow */
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    (!(mpts->mpts_flags & MPTSF_ACTIVE))) {

	/*
	 * Release subflow lock since this may trigger MPTCP to send,
	 * possibly on a different subflow.  An extra reference has
	 * been held on the subflow by the MPTCP thread before coming
	 * here, so we can be sure that it won't go away, in the event
	 * the MP socket lock gets released.
	 */
	mptcp_input(mpte, m);
/*
 * Subflow socket write upcall.
 *
 * Called when the associated subflow socket posted a write event.  The subflow
 * socket lock has been released prior to invoking the callback.  Note that the
 * upcall may occur synchronously as a result of MPTCP performing an action on
 * it, or asynchronously as a result of an event happening at the subflow layer.
 * Therefore, to maintain lock ordering, the only lock that can be acquired
 * here is the thread lock, for signalling purposes.
 *
 * NOTE(review): the return type and enclosing braces are on lines not
 * visible in this excerpt.  (The original comment said "read event" here,
 * apparently copy-pasted from the read upcall.)
 */
mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
#pragma unused(so, waitf)
	struct mptsub *mpts = arg;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	/* wake the MPTCP service thread; actual output happens there */
	lck_mtx_lock(&mpte->mpte_thread_lock);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
/*
 * Subflow socket output.
 *
 * Called for sending data from MPTCP to the underlying subflow socket.
 *
 * NOTE(review): numerous original lines (declarations of "sb_mb", "error"
 * and "len", early returns, parts of panic/log argument lists, and many
 * braces) are not visible in this excerpt.  Only the visible code is
 * reproduced.
 */
mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
	struct socket *mp_so, *so;
	size_t sb_cc = 0, tot_sent = 0;
	u_int64_t mpt_dsn = 0;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct mbuf *mpt_mbuf = NULL;
	unsigned int off = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	so = mpts->mpts_socket;

	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* subflow socket is suspended? */
	if (mpts->mpts_flags & MPTSF_SUSPENDED) {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d is flow "
		    "controlled\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));

	/* subflow socket is not MPTCP capable? */
	if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
	    !(mpts->mpts_flags & MPTSF_MP_DEGRADED)) {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d not "
		    "MPTCP capable\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));

	/* Remove Addr Option is not sent reliably as per I-D */
	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
		tp->t_rem_aid = mpte->mpte_lost_aid;
		if (mptcp_remaddr_enable)
			tp->t_mpflags |= TMPF_SND_REM_ADDR;
		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;

	/*
	 * The mbuf chains containing the metadata (as well as pointing to
	 * the user data sitting at the MPTCP output queue) would then be
	 * sent down to the subflow socket.
	 *
	 * Some notes on data sequencing:
	 *
	 *   a. Each mbuf must be a M_PKTHDR.
	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
	 *	in the mbuf pkthdr structure.
	 *   c. Each mbuf containing the MPTCP metadata must have its
	 *	pkt_flags marked with the PKTF_MPTCP flag.
	 */

	/* First, drop acknowledged data */
	sb_mb = mp_so->so_snd.sb_mb;
	if (sb_mb == NULL) {
	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/* skip over zero-length metadata mbufs */
	while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
		mpt_mbuf = mpt_mbuf->m_next;
	if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;

	/* drop data already acknowledged at the MPTCP level (< snduna) */
	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
		len = mp_tp->mpt_snduna - mpt_dsn;
		sbdrop(&mp_so->so_snd, len);

	/*
	 * In degraded mode, we don't receive data acks, so force free
	 * mbufs less than snd_nxt
	 */
	mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
		len = mp_tp->mpt_sndnxt - mpt_dsn;
		sbdrop(&mp_so->so_snd, len);
		mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;

	/*
	 * Adjust the subflow's notion of next byte to send based on
	 * the last unacknowledged byte
	 */
	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
		mpts->mpts_sndnxt = mp_tp->mpt_snduna;

	/*
	 * Adjust the top level notion of next byte used for retransmissions
	 */
	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	/* Now determine the offset from which to start transmitting data */
	sb_mb = mp_so->so_snd.sb_mb;
	sb_cc = mp_so->so_snd.sb_cc;
	if (sb_mb == NULL) {
	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
		off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;

	mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;

	/* walk to the mbuf containing the first byte at offset "off" */
	while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
	    (mpt_mbuf->m_pkthdr.mp_rlen <= off))) {
		off -= mpt_mbuf->m_pkthdr.mp_rlen;
		mpt_mbuf = mpt_mbuf->m_next;
		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
	if ((mpts->mpts_connid == 2) || (mpts->mpts_flags & MPTSF_MP_DEGRADED))
		mptcplog((LOG_INFO, "%s: snduna = %llu off = %d id = %d"
		    mp_tp->mpt_snduna, off, mpts->mpts_connid,
		    mpts->mpts_sndnxt));

	VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));

	/* copy and send one DSN mapping at a time until sb_cc is consumed */
	while (tot_sent < sb_cc) {
		size_t mlen, len = 0;

		mlen = mpt_mbuf->m_pkthdr.mp_rlen;
		panic("%s: unexpected %lu %lu \n", __func__,

		m = m_copym_mode(mpt_mbuf, off, mlen, M_DONTWAIT,

		/* Create a DSN mapping for the data (m_copym does it) */
		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
		m->m_pkthdr.mp_dsn = mpt_dsn + off;
		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
		m->m_pkthdr.mp_rlen = mlen;
		mpts->mpts_rel_seq += mlen;
		m->m_pkthdr.len = mlen;

		/* last contiguous mapping is stored for error cases */
		if (mpts->mpts_lastmap.mptsl_dsn +
		    mpts->mpts_lastmap.mptsl_len == mpt_dsn) {
			mpts->mpts_lastmap.mptsl_len += tot_sent;
		} else if (MPTCP_SEQ_LT((mpts->mpts_lastmap.mptsl_dsn +
		    mpts->mpts_lastmap.mptsl_len), mpt_dsn)) {
			if (m->m_pkthdr.mp_dsn == 0)
				panic("%s %llu", __func__, mpt_dsn);
			mpts->mpts_lastmap.mptsl_dsn = m->m_pkthdr.mp_dsn;
			mpts->mpts_lastmap.mptsl_sseq = m->m_pkthdr.mp_rseq;
			mpts->mpts_lastmap.mptsl_len = m->m_pkthdr.mp_rlen;

		error = sock_sendmbuf(so, NULL, m, 0, &len);
		DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
		    struct sockbuf *, &so->so_rcv,
		    struct sockbuf *, &so->so_snd,
		    struct mptses *, mpte, struct mptsub *, mpts,
		mptcplog((LOG_ERR, "%s: len = %zd error = %d \n",
		    __func__, len, error));

		mpts->mpts_sndnxt += mlen;

		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
			/* track a 32-bit rollover of the 64-bit DSN space */
			if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
			mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;

		/*
		 * NOTE(review): "len" and "mlen" are size_t but are printed
		 * with %d here — format/argument mismatch; confirm and use
		 * %zu (or cast) in the full source.
		 */
		mptcplog((LOG_ERR, "%s: cid %d wrote %d "
		    "(expected %d)\n", __func__,
		    mpts->mpts_connid, len, mlen));

		mpt_mbuf = mpt_mbuf->m_next;

	if (error != 0 && error != EWOULDBLOCK) {
		mptcplog((LOG_ERR, "MPTCP ERROR %s: cid %d error %d\n",
		    __func__, mpts->mpts_connid, error));
	if ((mpts->mpts_connid == 2) ||
	    (mpts->mpts_flags & MPTSF_MP_DEGRADED))
		mptcplog((LOG_DEBUG, "%s: cid %d wrote %d %d\n",
		    __func__, mpts->mpts_connid, tot_sent,

	/* data went out; a retransmit timer is no longer needed */
	mptcp_cancel_timer(mp_tp, MPTT_REXMT);
/*
 * Subflow socket control event upcall.
 *
 * Called when the associated subflow socket posted one or more control events.
 * The subflow socket lock has been released prior to invoking the callback.
 * Note that the upcall may occur synchronously as a result of MPTCP performing
 * an action on it, or asynchronously as a result of an event happening at the
 * subflow layer.  Therefore, to maintain lock ordering, the only lock that can
 * be acquired here is the thread lock, for signalling purposes.
 *
 * NOTE(review): the return type and enclosing braces are on lines not
 * visible in this excerpt.
 */
mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
	struct mptsub *mpts = arg;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	/* record the pending events, then wake the MPTCP service thread,
	 * which will dispatch them from mptcp_subflow_events() */
	lck_mtx_lock(&mpte->mpte_thread_lock);
	atomic_bitset_32(&mpts->mpts_evctl, events);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 *
 * NOTE(review): the return type, the declaration of "events", the early
 * return after the empty-events check, the final "return (ret)" and some
 * braces are on lines not visible in this excerpt.
 */
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts)
	ev_ret_t ret = MPTS_EVRET_OK;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* bail if there's nothing to process */
	if ((events = mpts->mpts_evctl) == 0)

	/* any of these subflow-degrading events also implies a failover check */
	if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
	    SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
	    SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
	    SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
	    SO_FILT_HINT_DISCONNECTED)) {
		events |= SO_FILT_HINT_MPFAILOVER;

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, events);

	mptcplog2((LOG_DEBUG, "%s: cid %d events=%b\n", __func__,
	    mpts->mpts_connid, events, SO_FILT_HINT_BITS));

	/*
	 * Dispatch each pending event to its handler.  Each handler clears
	 * its bit from "events"; the fold keeps the maximum non-negative
	 * verdict, but a negative (error) verdict overrides and stops
	 * further dispatch via the "ret >= MPTS_EVRET_OK" guards.
	 */
	if ((events & SO_FILT_HINT_MPFAILOVER) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_failover_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_MPFAILOVER;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_CONNRESET) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_connreset_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CONNRESET;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_MUSTRST) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_mustrst_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_MUSTRST;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_CANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_cantrcvmore_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CANTRCVMORE;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_CANTSENDMORE) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_cantsendmore_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CANTSENDMORE;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_TIMEOUT) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_timeout_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_TIMEOUT;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_NOSRCADDR) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_nosrcaddr_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_NOSRCADDR;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_IFDENIED) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_ifdenied_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_IFDENIED;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_SUSPEND) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_suspend_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_SUSPEND;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_RESUME) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_resume_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_RESUME;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_CONNECTED) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_connected_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CONNECTED;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_MPSTATUS) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_mpstatus_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_MPSTATUS;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	if ((events & SO_FILT_HINT_DISCONNECTED) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_disconnected_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_DISCONNECTED;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);

	/*
	 * We should be getting only events specified via sock_catchevents(),
	 * so loudly complain if we have any unprocessed one(s).
	 */
	if (events != 0 || ret < MPTS_EVRET_OK) {
		mptcplog((LOG_ERR, "%s%s: cid %d evret %s (%d)"
		    " unhandled events=%b\n",
		    (events != 0) ? "MPTCP_ERROR " : "",
		    __func__, mpts->mpts_connid,
		    mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS));

	/* clear the ones we've processed */
	atomic_bitclear_32(&mpts->mpts_evctl, ~events);
/*
 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
 *
 * NOTE(review): the return type, the declaration of "linger" and some
 * braces are on lines not visible in this excerpt.
 */
mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* linger unless the subflow may be deleted or the PCB is clearing */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = ECONNREFUSED;

	/*
	 * We got a TCP RST for this subflow connection.
	 *
	 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
	 * client if the MPTCP connection has not been established. Otherwise
	 * we close the socket.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = ECONNREFUSED;

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
/*
 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
 *
 * NOTE(review): the return type, the declaration of "so" and braces are
 * on lines not visible in this excerpt.
 */
mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));

	/*
	 * We got a FIN for this subflow connection.  This subflow socket
	 * is no longer available for receiving data;
	 * The FIN may arrive with data. The data is handed up to the
	 * mptcp socket and the subflow is disconnected.
	 */

	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
/*
 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
 *
 * Currently only logs the event; the subflow is retained.
 *
 * NOTE(review): the return type, the declaration of "so" and braces are
 * on lines not visible in this excerpt.
 */
mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts)
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
/*
 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
 *
 * Mirrors the CONNRESET handler, but records ETIMEDOUT instead.
 *
 * NOTE(review): the return type, the declaration of "linger" and some
 * braces/comment lines are not visible in this excerpt.
 */
mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* linger unless the subflow may be deleted or the PCB is clearing */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_NOTICE, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = ETIMEDOUT;

	/*
	 * The subflow connection has timed out.
	 *
	 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
	 * client if the MPTCP connection has not been established. Otherwise
	 * (continuation of this comment not visible in this excerpt)
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = ETIMEDOUT;

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2109 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2112 mptcp_subflow_nosrcaddr_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2114 struct socket
*mp_so
, *so
;
2115 struct mptcb
*mp_tp
;
2117 struct tcpcb
*tp
= NULL
;
2119 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2120 MPTS_LOCK_ASSERT_HELD(mpts
);
2122 VERIFY(mpte
->mpte_mppcb
!= NULL
);
2123 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2124 mp_tp
= mpte
->mpte_mptcb
;
2125 so
= mpts
->mpts_socket
;
2127 /* Not grabbing socket lock as t_local_aid is write once only */
2128 tp
= intotcpcb(sotoinpcb(so
));
2130 * This overwrites any previous mpte_lost_aid to avoid storing
2131 * too much state when the typical case has only two subflows.
2133 mpte
->mpte_flags
|= MPTE_SND_REM_ADDR
;
2134 mpte
->mpte_lost_aid
= tp
->t_local_aid
;
2136 linger
= (!(mpts
->mpts_flags
& MPTSF_DELETEOK
) &&
2137 !(mp_so
->so_flags
& SOF_PCBCLEARING
));
2139 mptcplog((LOG_DEBUG
, "%s: cid %d [linger %s]\n", __func__
,
2140 mpts
->mpts_connid
, (linger
? "YES" : "NO")));
2142 if (mpts
->mpts_soerror
== 0)
2143 mpts
->mpts_soerror
= EADDRNOTAVAIL
;
2146 * The subflow connection has lost its source address.
2148 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2149 * client if the MPTCP connection has not been established. If it
2150 * has been established with one subflow , we keep the MPTCP
2151 * connection valid without any subflows till closed by application.
2152 * This lets tcp connection manager decide whether to close this or
2153 * not as it reacts to reachability changes too.
2155 mptcp_subflow_disconnect(mpte
, mpts
, !linger
);
2158 if ((mp_tp
->mpt_state
< MPTCPS_ESTABLISHED
) &&
2159 (mp_so
->so_flags
& SOF_NOADDRAVAIL
)) {
2160 mp_so
->so_error
= EADDRNOTAVAIL
;
2165 * Keep the subflow socket around, unless the MPTCP socket has
2166 * been detached or the subflow has been disconnected explicitly,
2167 * in which case it should be deleted right away.
2169 return (linger
? MPTS_EVRET_OK
: MPTS_EVRET_DELETE
);
2173 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2176 mptcp_subflow_failover_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2178 struct mptsub
*mpts_alt
= NULL
;
2179 struct socket
*so
= NULL
;
2180 struct socket
*mp_so
;
2181 int altpath_exists
= 0;
2183 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2184 MPTS_LOCK_ASSERT_HELD(mpts
);
2185 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2186 mptcplog2((LOG_NOTICE
, "%s: mp_so 0x%llx\n", __func__
,
2187 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
)));
2190 mpts_alt
= mptcp_get_subflow(mpte
, mpts
);
2193 * If there is no alternate eligible subflow, ignore the
2196 if (mpts_alt
== NULL
) {
2197 mptcplog2((LOG_WARNING
, "%s: no alternate path\n", __func__
));
2201 MPTS_LOCK(mpts_alt
);
2203 so
= mpts_alt
->mpts_socket
;
2204 if (mpts_alt
->mpts_flags
& MPTSF_FAILINGOVER
) {
2206 /* All data acknowledged */
2207 if (so
->so_snd
.sb_cc
== 0) {
2208 so
->so_flags
&= ~SOF_MP_TRYFAILOVER
;
2209 mpts_alt
->mpts_flags
&= ~MPTSF_FAILINGOVER
;
2211 /* no alternate path available */
2214 socket_unlock(so
, 1);
2216 if (altpath_exists
) {
2217 mpts_alt
->mpts_flags
|= MPTSF_ACTIVE
;
2218 struct mptcb
*mp_tp
= mpte
->mpte_mptcb
;
2219 /* Bring the subflow's notion of snd_nxt into the send window */
2221 mpts_alt
->mpts_sndnxt
= mp_tp
->mpt_snduna
;
2223 mpte
->mpte_active_sub
= mpts_alt
;
2226 socket_unlock(so
, 1);
2228 MPTS_UNLOCK(mpts_alt
);
2230 if (altpath_exists
) {
2232 SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CONNINFO_UPDATED
);
2233 mptcplog((LOG_NOTICE
, "%s: mp_so 0x%llx switched from "
2234 "%d to %d\n", __func__
,
2235 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
2236 mpts
->mpts_connid
, mpts_alt
->mpts_connid
));
2237 tcpstat
.tcps_mp_switches
++;
2241 if (altpath_exists
) {
2242 mpts
->mpts_flags
|= MPTSF_FAILINGOVER
;
2243 mpts
->mpts_flags
&= ~MPTSF_ACTIVE
;
2245 so
= mpts
->mpts_socket
;
2247 so
->so_flags
&= ~SOF_MP_TRYFAILOVER
;
2248 socket_unlock(so
, 1);
2251 MPTS_LOCK_ASSERT_HELD(mpts
);
2252 return (MPTS_EVRET_OK
);
2256 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2259 mptcp_subflow_ifdenied_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2261 struct socket
*mp_so
, *so
;
2262 struct mptcb
*mp_tp
;
2265 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2266 MPTS_LOCK_ASSERT_HELD(mpts
);
2267 VERIFY(mpte
->mpte_mppcb
!= NULL
);
2268 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2269 mp_tp
= mpte
->mpte_mptcb
;
2270 so
= mpts
->mpts_socket
;
2272 linger
= (!(mpts
->mpts_flags
& MPTSF_DELETEOK
) &&
2273 !(mp_so
->so_flags
& SOF_PCBCLEARING
));
2275 mptcplog((LOG_DEBUG
, "%s: cid %d [linger %s]\n", __func__
,
2276 mpts
->mpts_connid
, (linger
? "YES" : "NO")));
2278 if (mpts
->mpts_soerror
== 0)
2279 mpts
->mpts_soerror
= EHOSTUNREACH
;
2282 * The subflow connection cannot use the outgoing interface.
2284 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2285 * client if the MPTCP connection has not been established. If it
2286 * has been established, let the upper layer call disconnectx.
2288 mptcp_subflow_disconnect(mpte
, mpts
, !linger
);
2291 soevent(mp_so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_IFDENIED
);
2294 if (mp_tp
->mpt_state
< MPTCPS_ESTABLISHED
) {
2295 mp_so
->so_error
= EHOSTUNREACH
;
2301 * Keep the subflow socket around, unless the MPTCP socket has
2302 * been detached or the subflow has been disconnected explicitly,
2303 * in which case it should be deleted right away.
2305 return (linger
? MPTS_EVRET_OK
: MPTS_EVRET_DELETE
);
2309 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2312 mptcp_subflow_suspend_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2316 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2317 MPTS_LOCK_ASSERT_HELD(mpts
);
2319 so
= mpts
->mpts_socket
;
2321 /* the subflow connection is being flow controlled */
2322 mpts
->mpts_flags
|= MPTSF_SUSPENDED
;
2324 mptcplog((LOG_DEBUG
, "%s: cid %d\n", __func__
,
2325 mpts
->mpts_connid
));
2327 return (MPTS_EVRET_OK
); /* keep the subflow socket around */
2331 * Handle SO_FILT_HINT_RESUME subflow socket event.
2334 mptcp_subflow_resume_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2338 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2339 MPTS_LOCK_ASSERT_HELD(mpts
);
2341 so
= mpts
->mpts_socket
;
2343 /* the subflow connection is no longer flow controlled */
2344 mpts
->mpts_flags
&= ~MPTSF_SUSPENDED
;
2346 mptcplog((LOG_DEBUG
, "%s: cid %d\n", __func__
, mpts
->mpts_connid
));
2348 return (MPTS_EVRET_OK
); /* keep the subflow socket around */
2352 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2355 mptcp_subflow_connected_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2357 char buf0
[MAX_IPv6_STR_LEN
], buf1
[MAX_IPv6_STR_LEN
];
2358 struct sockaddr_entry
*src_se
, *dst_se
;
2359 struct sockaddr_storage src
;
2360 struct socket
*mp_so
, *so
;
2361 struct mptcb
*mp_tp
;
2362 struct ifnet
*outifp
;
2364 boolean_t mpok
= FALSE
;
2366 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2367 VERIFY(mpte
->mpte_mppcb
!= NULL
);
2368 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2369 mp_tp
= mpte
->mpte_mptcb
;
2371 MPTS_LOCK_ASSERT_HELD(mpts
);
2372 so
= mpts
->mpts_socket
;
2373 af
= mpts
->mpts_family
;
2375 if (mpts
->mpts_flags
& MPTSF_CONNECTED
)
2376 return (MPTS_EVRET_OK
);
2378 if ((mpts
->mpts_flags
& MPTSF_DISCONNECTED
) ||
2379 (mpts
->mpts_flags
& MPTSF_DISCONNECTING
)) {
2380 return (MPTS_EVRET_OK
);
2384 * The subflow connection has been connected. Find out whether it
2385 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
2387 * a. If MPTCP connection is not yet established, then this must be
2388 * the first subflow connection. If MPTCP failed to negotiate,
2389 * indicate to the MPTCP socket client via EPROTO, that the
2390 * underlying TCP connection may be peeled off via peeloff(2).
2391 * Otherwise, mark the MPTCP socket as connected.
2393 * b. If MPTCP connection has been established, then this must be
2394 * one of the subsequent subflow connections. If MPTCP failed
2395 * to negotiate, disconnect the connection since peeloff(2)
2396 * is no longer possible.
2398 * Right now, we simply unblock any waiters at the MPTCP socket layer
2399 * if the MPTCP connection has not been established.
2403 if (so
->so_state
& SS_ISDISCONNECTED
) {
2405 * With MPTCP joins, a connection is connected at the subflow
2406 * level, but the 4th ACK from the server elevates the MPTCP
2407 * subflow to connected state. So there is a small window
2408 * where the subflow could get disconnected before the
2409 * connected event is processed.
2411 socket_unlock(so
, 0);
2412 return (MPTS_EVRET_OK
);
2415 mpts
->mpts_soerror
= 0;
2416 mpts
->mpts_flags
&= ~MPTSF_CONNECTING
;
2417 mpts
->mpts_flags
|= MPTSF_CONNECTED
;
2418 if (sototcpcb(so
)->t_mpflags
& TMPF_MPTCP_TRUE
)
2419 mpts
->mpts_flags
|= MPTSF_MP_CAPABLE
;
2421 VERIFY(mpts
->mpts_dst_sl
!= NULL
);
2422 dst_se
= TAILQ_FIRST(&mpts
->mpts_dst_sl
->sl_head
);
2423 VERIFY(dst_se
!= NULL
&& dst_se
->se_addr
!= NULL
&&
2424 dst_se
->se_addr
->sa_family
== af
);
2426 VERIFY(mpts
->mpts_src_sl
!= NULL
);
2427 src_se
= TAILQ_FIRST(&mpts
->mpts_src_sl
->sl_head
);
2428 VERIFY(src_se
!= NULL
&& src_se
->se_addr
!= NULL
&&
2429 src_se
->se_addr
->sa_family
== af
);
2431 /* get/check source IP address */
2434 error
= in_getsockaddr_s(so
, &src
);
2436 struct sockaddr_in
*ms
= SIN(src_se
->se_addr
);
2437 struct sockaddr_in
*s
= SIN(&src
);
2439 VERIFY(s
->sin_len
== ms
->sin_len
);
2440 VERIFY(ms
->sin_family
== AF_INET
);
2442 if ((mpts
->mpts_flags
& MPTSF_BOUND_IP
) &&
2443 bcmp(&ms
->sin_addr
, &s
->sin_addr
,
2444 sizeof (ms
->sin_addr
)) != 0) {
2445 mptcplog((LOG_ERR
, "%s: cid %d local "
2446 "address %s (expected %s)\n", __func__
,
2447 mpts
->mpts_connid
, inet_ntop(AF_INET
,
2448 (void *)&s
->sin_addr
.s_addr
, buf0
,
2449 sizeof (buf0
)), inet_ntop(AF_INET
,
2450 (void *)&ms
->sin_addr
.s_addr
, buf1
,
2453 bcopy(s
, ms
, sizeof (*s
));
2459 error
= in6_getsockaddr_s(so
, &src
);
2461 struct sockaddr_in6
*ms
= SIN6(src_se
->se_addr
);
2462 struct sockaddr_in6
*s
= SIN6(&src
);
2464 VERIFY(s
->sin6_len
== ms
->sin6_len
);
2465 VERIFY(ms
->sin6_family
== AF_INET6
);
2467 if ((mpts
->mpts_flags
& MPTSF_BOUND_IP
) &&
2468 bcmp(&ms
->sin6_addr
, &s
->sin6_addr
,
2469 sizeof (ms
->sin6_addr
)) != 0) {
2470 mptcplog((LOG_ERR
, "%s: cid %d local "
2471 "address %s (expected %s)\n", __func__
,
2472 mpts
->mpts_connid
, inet_ntop(AF_INET6
,
2473 (void *)&s
->sin6_addr
, buf0
,
2474 sizeof (buf0
)), inet_ntop(AF_INET6
,
2475 (void *)&ms
->sin6_addr
, buf1
,
2478 bcopy(s
, ms
, sizeof (*s
));
2489 mptcplog((LOG_ERR
, "%s: cid %d getsockaddr failed (%d)\n",
2490 __func__
, mpts
->mpts_connid
, error
));
2493 /* get/verify the outbound interface */
2494 outifp
= sotoinpcb(so
)->inp_last_outifp
; /* could be NULL */
2495 if (mpts
->mpts_flags
& MPTSF_BOUND_IF
) {
2496 VERIFY(mpts
->mpts_outif
!= NULL
);
2497 if (mpts
->mpts_outif
!= outifp
) {
2498 mptcplog((LOG_ERR
, "%s: cid %d outif %s "
2499 "(expected %s)\n", __func__
, mpts
->mpts_connid
,
2500 ((outifp
!= NULL
) ? outifp
->if_xname
: "NULL"),
2501 mpts
->mpts_outif
->if_xname
));
2503 outifp
= mpts
->mpts_outif
;
2506 mpts
->mpts_outif
= outifp
;
2509 socket_unlock(so
, 0);
2511 mptcplog((LOG_DEBUG
, "%s: cid %d outif %s %s[%d] -> %s[%d] "
2512 "is %s\n", __func__
, mpts
->mpts_connid
, ((outifp
!= NULL
) ?
2513 outifp
->if_xname
: "NULL"), inet_ntop(af
, (af
== AF_INET
) ?
2514 (void *)&SIN(src_se
->se_addr
)->sin_addr
.s_addr
:
2515 (void *)&SIN6(src_se
->se_addr
)->sin6_addr
, buf0
, sizeof (buf0
)),
2516 ((af
== AF_INET
) ? ntohs(SIN(src_se
->se_addr
)->sin_port
) :
2517 ntohs(SIN6(src_se
->se_addr
)->sin6_port
)),
2518 inet_ntop(af
, ((af
== AF_INET
) ?
2519 (void *)&SIN(dst_se
->se_addr
)->sin_addr
.s_addr
:
2520 (void *)&SIN6(dst_se
->se_addr
)->sin6_addr
), buf1
, sizeof (buf1
)),
2521 ((af
== AF_INET
) ? ntohs(SIN(dst_se
->se_addr
)->sin_port
) :
2522 ntohs(SIN6(dst_se
->se_addr
)->sin6_port
)),
2523 ((mpts
->mpts_flags
& MPTSF_MP_CAPABLE
) ?
2524 "MPTCP capable" : "a regular TCP")));
2526 mpok
= (mpts
->mpts_flags
& MPTSF_MP_CAPABLE
);
2529 soevent(mp_so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CONNINFO_UPDATED
);
2532 if (mp_tp
->mpt_state
< MPTCPS_ESTABLISHED
) {
2533 /* case (a) above */
2535 mp_tp
->mpt_flags
|= MPTCPF_PEEL_OFF
;
2536 (void) mptcp_drop(mpte
, mp_tp
, EPROTO
);
2539 if (mptcp_init_authparms(mp_tp
) != 0) {
2540 mp_tp
->mpt_flags
|= MPTCPF_PEEL_OFF
;
2541 (void) mptcp_drop(mpte
, mp_tp
, EPROTO
);
2545 mp_tp
->mpt_state
= MPTCPS_ESTABLISHED
;
2546 mpte
->mpte_associd
= mpts
->mpts_connid
;
2547 DTRACE_MPTCP2(state__change
,
2548 struct mptcb
*, mp_tp
,
2549 uint32_t, 0 /* event */);
2550 mptcp_init_statevars(mp_tp
);
2553 (void) mptcp_setconnorder(mpte
,
2554 mpts
->mpts_connid
, 1);
2555 soisconnected(mp_so
);
2560 /* Initialize the relative sequence number */
2561 mpts
->mpts_rel_seq
= 1;
2562 mpts
->mpts_flags
|= MPTSF_MPCAP_CTRSET
;
2563 mpte
->mpte_nummpcapflows
++;
2564 MPT_LOCK_SPIN(mp_tp
);
2565 mpts
->mpts_sndnxt
= mp_tp
->mpt_snduna
;
2572 * In case of additional flows, the MPTCP socket is not
2573 * MPTSF_MP_CAPABLE until an ACK is received from server
2574 * for 3-way handshake. TCP would have guaranteed that this
2575 * is an MPTCP subflow.
2578 mpts
->mpts_flags
|= MPTSF_MPCAP_CTRSET
;
2579 mpte
->mpte_nummpcapflows
++;
2580 mpts
->mpts_rel_seq
= 1;
2581 MPT_LOCK_SPIN(mp_tp
);
2582 mpts
->mpts_sndnxt
= mp_tp
->mpt_snduna
;
2585 MPTS_LOCK_ASSERT_HELD(mpts
);
2587 return (MPTS_EVRET_OK
); /* keep the subflow socket around */
2591 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2594 mptcp_subflow_disconnected_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2596 struct socket
*mp_so
, *so
;
2597 struct mptcb
*mp_tp
;
2600 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2601 MPTS_LOCK_ASSERT_HELD(mpts
);
2602 VERIFY(mpte
->mpte_mppcb
!= NULL
);
2603 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2604 mp_tp
= mpte
->mpte_mptcb
;
2605 so
= mpts
->mpts_socket
;
2607 linger
= (!(mpts
->mpts_flags
& MPTSF_DELETEOK
) &&
2608 !(mp_so
->so_flags
& SOF_PCBCLEARING
));
2610 mptcplog2((LOG_DEBUG
, "%s: cid %d [linger %s]\n", __func__
,
2611 mpts
->mpts_connid
, (linger
? "YES" : "NO")));
2613 if (mpts
->mpts_flags
& MPTSF_DISCONNECTED
)
2614 return (linger
? MPTS_EVRET_OK
: MPTS_EVRET_DELETE
);
2617 * Clear flags that are used by getconninfo to return state.
2618 * Retain like MPTSF_DELETEOK, MPTSF_ACTIVE for internal purposes.
2620 mpts
->mpts_flags
&= ~(MPTSF_CONNECTING
|MPTSF_CONNECT_PENDING
|
2621 MPTSF_CONNECTED
|MPTSF_DISCONNECTING
|MPTSF_PREFERRED
|
2622 MPTSF_MP_CAPABLE
|MPTSF_MP_READY
|MPTSF_MP_DEGRADED
|
2623 MPTSF_SUSPENDED
|MPTSF_ACTIVE
);
2624 mpts
->mpts_flags
|= MPTSF_DISCONNECTED
;
2627 * The subflow connection has been disconnected.
2629 * Right now, we simply unblock any waiters at the MPTCP socket layer
2630 * if the MPTCP connection has not been established.
2634 soevent(mp_so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CONNINFO_UPDATED
);
2636 if (mpts
->mpts_flags
& MPTSF_MPCAP_CTRSET
) {
2637 mpte
->mpte_nummpcapflows
--;
2638 mpts
->mpts_flags
&= ~MPTSF_MPCAP_CTRSET
;
2642 if (mp_tp
->mpt_state
< MPTCPS_ESTABLISHED
) {
2644 soisdisconnected(mp_so
);
2651 * The underlying subflow socket has been disconnected;
2652 * it is no longer useful to us. Keep the subflow socket
2653 * around, unless the MPTCP socket has been detached or
2654 * the subflow has been disconnected explicitly, in which
2655 * case it should be deleted right away.
2657 return (linger
? MPTS_EVRET_OK
: MPTS_EVRET_DELETE
);
2661 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
2664 mptcp_subflow_mpstatus_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2666 struct socket
*mp_so
, *so
;
2667 struct mptcb
*mp_tp
;
2668 ev_ret_t ret
= MPTS_EVRET_OK_UPDATE
;
2670 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2671 VERIFY(mpte
->mpte_mppcb
!= NULL
);
2672 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2673 mp_tp
= mpte
->mpte_mptcb
;
2675 MPTS_LOCK_ASSERT_HELD(mpts
);
2676 so
= mpts
->mpts_socket
;
2681 if (sototcpcb(so
)->t_mpflags
& TMPF_MPTCP_TRUE
)
2682 mpts
->mpts_flags
|= MPTSF_MP_CAPABLE
;
2684 mpts
->mpts_flags
&= ~MPTSF_MP_CAPABLE
;
2686 if (sototcpcb(so
)->t_mpflags
& TMPF_TCP_FALLBACK
) {
2687 if (mpts
->mpts_flags
& MPTSF_MP_DEGRADED
)
2689 mpts
->mpts_flags
|= MPTSF_MP_DEGRADED
;
2692 mpts
->mpts_flags
&= ~MPTSF_MP_DEGRADED
;
2694 if (sototcpcb(so
)->t_mpflags
& TMPF_MPTCP_READY
)
2695 mpts
->mpts_flags
|= MPTSF_MP_READY
;
2697 mpts
->mpts_flags
&= ~MPTSF_MP_READY
;
2699 if (mpts
->mpts_flags
& MPTSF_MP_DEGRADED
) {
2700 mp_tp
->mpt_flags
|= MPTCPF_FALLBACK_TO_TCP
;
2701 mp_tp
->mpt_flags
&= ~MPTCPF_JOIN_READY
;
2704 if (mp_tp
->mpt_flags
& MPTCPF_FALLBACK_TO_TCP
) {
2705 VERIFY(!(mp_tp
->mpt_flags
& MPTCPF_JOIN_READY
));
2706 ret
= MPTS_EVRET_DISCONNECT_FALLBACK
;
2707 } else if (mpts
->mpts_flags
& MPTSF_MP_READY
) {
2708 mp_tp
->mpt_flags
|= MPTCPF_JOIN_READY
;
2709 ret
= MPTS_EVRET_CONNECT_PENDING
;
2712 mptcplog2((LOG_DEBUG
, "%s: mp_so 0x%llx mpt_flags=%b cid %d "
2713 "mptsf=%b\n", __func__
,
2714 (u_int64_t
)VM_KERNEL_ADDRPERM(mpte
->mpte_mppcb
->mpp_socket
),
2715 mp_tp
->mpt_flags
, MPTCPF_BITS
, mpts
->mpts_connid
,
2716 mpts
->mpts_flags
, MPTSF_BITS
));
2719 socket_unlock(so
, 0);
2725 * Handle SO_FILT_HINT_MUSTRST subflow socket event
2728 mptcp_subflow_mustrst_ev(struct mptses
*mpte
, struct mptsub
*mpts
)
2730 struct socket
*mp_so
, *so
;
2731 struct mptcb
*mp_tp
;
2735 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2736 MPTS_LOCK_ASSERT_HELD(mpts
);
2737 VERIFY(mpte
->mpte_mppcb
!= NULL
);
2738 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2739 mp_tp
= mpte
->mpte_mptcb
;
2740 so
= mpts
->mpts_socket
;
2742 linger
= (!(mpts
->mpts_flags
& MPTSF_DELETEOK
) &&
2743 !(mp_so
->so_flags
& SOF_PCBCLEARING
));
2745 if (mpts
->mpts_soerror
== 0)
2746 mpts
->mpts_soerror
= ECONNABORTED
;
2748 so
->so_error
= ECONNABORTED
;
2750 /* We got an invalid option or a fast close */
2752 struct tcptemp
*t_template
;
2753 struct inpcb
*inp
= sotoinpcb(so
);
2754 struct tcpcb
*tp
= NULL
;
2756 tp
= intotcpcb(inp
);
2758 t_template
= tcp_maketemplate(tp
);
2760 unsigned int ifscope
, nocell
= 0;
2762 if (inp
->inp_flags
& INP_BOUND_IF
)
2763 ifscope
= inp
->inp_boundifp
->if_index
;
2765 ifscope
= IFSCOPE_NONE
;
2767 if (inp
->inp_flags
& INP_NO_IFT_CELLULAR
)
2770 tcp_respond(tp
, t_template
->tt_ipgen
,
2771 &t_template
->tt_t
, (struct mbuf
*)NULL
,
2772 tp
->rcv_nxt
, tp
->snd_una
, TH_RST
, ifscope
, nocell
);
2773 (void) m_free(dtom(t_template
));
2774 mptcplog((LOG_DEBUG
, "%s: mp_so 0x%llx cid %d \n",
2775 __func__
, (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
2776 so
, mpts
->mpts_connid
));
2778 socket_unlock(so
, 0);
2779 mptcp_subflow_disconnect(mpte
, mpts
, !linger
);
2782 soevent(mp_so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CONNINFO_UPDATED
);
2785 if (mp_tp
->mpt_state
< MPTCPS_ESTABLISHED
) {
2786 mp_so
->so_error
= ECONNABORTED
;
2792 * Keep the subflow socket around unless the subflow has been
2793 * disconnected explicitly.
2795 return (linger
? MPTS_EVRET_OK
: MPTS_EVRET_DELETE
);
2799 mptcp_evret2str(ev_ret_t ret
)
2801 const char *c
= "UNKNOWN";
2804 case MPTS_EVRET_DELETE
:
2805 c
= "MPTS_EVRET_DELETE";
2807 case MPTS_EVRET_CONNECT_PENDING
:
2808 c
= "MPTS_EVRET_CONNECT_PENDING";
2810 case MPTS_EVRET_DISCONNECT_FALLBACK
:
2811 c
= "MPTS_EVRET_DISCONNECT_FALLBACK";
2814 c
= "MPTS_EVRET_OK";
2816 case MPTS_EVRET_OK_UPDATE
:
2817 c
= "MPTS_EVRET_OK_UPDATE";
2824 * Add a reference to a subflow structure; used by MPTS_ADDREF().
2827 mptcp_subflow_addref(struct mptsub
*mpts
, int locked
)
2832 MPTS_LOCK_ASSERT_HELD(mpts
);
2834 if (++mpts
->mpts_refcnt
== 0) {
2835 panic("%s: mpts %p wraparound refcnt\n", __func__
, mpts
);
2843 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
2846 mptcp_subflow_remref(struct mptsub
*mpts
)
2849 if (mpts
->mpts_refcnt
== 0) {
2850 panic("%s: mpts %p negative refcnt\n", __func__
, mpts
);
2853 if (--mpts
->mpts_refcnt
> 0) {
2857 /* callee will unlock and destroy lock */
2858 mptcp_subflow_free(mpts
);
2862 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
2863 * caller must ensure that the option can be issued on subflow sockets, via
2864 * MPOF_SUBFLOW_OK flag.
2867 mptcp_subflow_sosetopt(struct mptses
*mpte
, struct socket
*so
,
2870 struct socket
*mp_so
;
2871 struct sockopt sopt
;
2875 VERIFY(mpo
->mpo_flags
& MPOF_SUBFLOW_OK
);
2876 mpo
->mpo_flags
&= ~MPOF_INTERIM
;
2878 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2879 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2881 bzero(&sopt
, sizeof (sopt
));
2882 sopt
.sopt_dir
= SOPT_SET
;
2883 sopt
.sopt_level
= mpo
->mpo_level
;
2884 sopt
.sopt_name
= mpo
->mpo_name
;
2885 sopt
.sopt_val
= CAST_USER_ADDR_T(&mpo
->mpo_intval
);
2886 sopt
.sopt_valsize
= sizeof (int);
2887 sopt
.sopt_p
= kernproc
;
2889 error
= sosetoptlock(so
, &sopt
, 0); /* already locked */
2891 mptcplog2((LOG_DEBUG
, "%s: mp_so 0x%llx sopt %s "
2892 "val %d set successful\n", __func__
,
2893 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
2894 mptcp_sopt2str(mpo
->mpo_level
, mpo
->mpo_name
,
2895 buf
, sizeof (buf
)), mpo
->mpo_intval
));
2897 mptcplog((LOG_ERR
, "%s: mp_so 0x%llx sopt %s "
2898 "val %d set error %d\n", __func__
,
2899 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
2900 mptcp_sopt2str(mpo
->mpo_level
, mpo
->mpo_name
,
2901 buf
, sizeof (buf
)), mpo
->mpo_intval
, error
));
2907 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
2908 * caller must ensure that the option can be issued on subflow sockets, via
2909 * MPOF_SUBFLOW_OK flag.
2912 mptcp_subflow_sogetopt(struct mptses
*mpte
, struct socket
*so
,
2915 struct socket
*mp_so
;
2916 struct sockopt sopt
;
2920 VERIFY(mpo
->mpo_flags
& MPOF_SUBFLOW_OK
);
2921 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
2922 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
2924 bzero(&sopt
, sizeof (sopt
));
2925 sopt
.sopt_dir
= SOPT_GET
;
2926 sopt
.sopt_level
= mpo
->mpo_level
;
2927 sopt
.sopt_name
= mpo
->mpo_name
;
2928 sopt
.sopt_val
= CAST_USER_ADDR_T(&mpo
->mpo_intval
);
2929 sopt
.sopt_valsize
= sizeof (int);
2930 sopt
.sopt_p
= kernproc
;
2932 error
= sogetoptlock(so
, &sopt
, 0); /* already locked */
2934 mptcplog2((LOG_DEBUG
, "%s: mp_so 0x%llx sopt %s "
2935 "val %d get successful\n", __func__
,
2936 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
2937 mptcp_sopt2str(mpo
->mpo_level
, mpo
->mpo_name
,
2938 buf
, sizeof (buf
)), mpo
->mpo_intval
));
2940 mptcplog((LOG_ERR
, "%s: mp_so 0x%llx sopt %s get error %d\n",
2941 __func__
, (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
2942 mptcp_sopt2str(mpo
->mpo_level
,
2943 mpo
->mpo_name
, buf
, sizeof (buf
)), error
));
2950 * MPTCP garbage collector.
2952 * This routine is called by the MP domain on-demand, periodic callout,
2953 * which is triggered when a MPTCP socket is closed. The callout will
2954 * repeat as long as this routine returns a non-zero value.
2957 mptcp_gc(struct mppcbinfo
*mppi
)
2959 struct mppcb
*mpp
, *tmpp
;
2960 uint32_t active
= 0;
2962 lck_mtx_assert(&mppi
->mppi_lock
, LCK_MTX_ASSERT_OWNED
);
2964 mptcplog3((LOG_DEBUG
, "%s: running\n", __func__
));
2966 TAILQ_FOREACH_SAFE(mpp
, &mppi
->mppi_pcbs
, mpp_entry
, tmpp
) {
2967 struct socket
*mp_so
;
2968 struct mptses
*mpte
;
2969 struct mptcb
*mp_tp
;
2971 VERIFY(mpp
->mpp_flags
& MPP_ATTACHED
);
2972 mp_so
= mpp
->mpp_socket
;
2973 VERIFY(mp_so
!= NULL
);
2974 mpte
= mptompte(mpp
);
2975 VERIFY(mpte
!= NULL
);
2976 mp_tp
= mpte
->mpte_mptcb
;
2977 VERIFY(mp_tp
!= NULL
);
2979 mptcplog3((LOG_DEBUG
, "%s: mp_so 0x%llx found "
2980 "(u=%d,r=%d,s=%d)\n", __func__
,
2981 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
), mp_so
->so_usecount
,
2982 mp_so
->so_retaincnt
, mpp
->mpp_state
));
2984 if (!lck_mtx_try_lock(&mpp
->mpp_lock
)) {
2985 mptcplog3((LOG_DEBUG
, "%s: mp_so 0x%llx skipped "
2986 "(u=%d,r=%d)\n", __func__
,
2987 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
2988 mp_so
->so_usecount
, mp_so
->so_retaincnt
));
2993 /* check again under the lock */
2994 if (mp_so
->so_usecount
> 1) {
2995 boolean_t wakeup
= FALSE
;
2996 struct mptsub
*mpts
, *tmpts
;
2998 mptcplog3((LOG_DEBUG
, "%s: mp_so 0x%llx skipped "
2999 "[u=%d,r=%d] %d %d\n", __func__
,
3000 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
3001 mp_so
->so_usecount
, mp_so
->so_retaincnt
,
3002 mp_tp
->mpt_gc_ticks
,
3005 if (mp_tp
->mpt_state
>= MPTCPS_FIN_WAIT_1
) {
3006 if (mp_tp
->mpt_gc_ticks
> 0)
3007 mp_tp
->mpt_gc_ticks
--;
3008 if (mp_tp
->mpt_gc_ticks
== 0) {
3010 if (mp_tp
->mpt_localkey
!= NULL
) {
3012 mp_tp
->mpt_localkey
);
3013 mp_tp
->mpt_localkey
= NULL
;
3019 TAILQ_FOREACH_SAFE(mpts
,
3020 &mpte
->mpte_subflows
, mpts_entry
, tmpts
) {
3022 mpts
->mpts_flags
|= MPTSF_DELETEOK
;
3023 if (mpts
->mpts_soerror
== 0)
3024 mpts
->mpts_soerror
= ETIMEDOUT
;
3025 mptcp_subflow_eupcall(mpts
->mpts_socket
,
3026 mpts
, SO_FILT_HINT_DISCONNECTED
);
3030 lck_mtx_unlock(&mpp
->mpp_lock
);
3035 if (mpp
->mpp_state
!= MPPCB_STATE_DEAD
) {
3036 mptcplog3((LOG_DEBUG
, "%s: mp_so 0x%llx skipped "
3037 "[u=%d,r=%d,s=%d]\n", __func__
,
3038 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
3039 mp_so
->so_usecount
, mp_so
->so_retaincnt
,
3041 lck_mtx_unlock(&mpp
->mpp_lock
);
3047 * The PCB has been detached, and there is exactly 1 refnct
3048 * held by the MPTCP thread. Signal that thread to terminate,
3049 * after which the last refcnt will be released. That will
3050 * allow it to be destroyed below during the next round.
3052 if (mp_so
->so_usecount
== 1) {
3053 mptcplog2((LOG_DEBUG
, "%s: mp_so 0x%llx scheduled for "
3054 "termination [u=%d,r=%d]\n", __func__
,
3055 (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
3056 mp_so
->so_usecount
, mp_so
->so_retaincnt
));
3057 /* signal MPTCP thread to terminate */
3058 mptcp_thread_terminate_signal(mpte
);
3059 lck_mtx_unlock(&mpp
->mpp_lock
);
3064 mptcplog((LOG_DEBUG
, "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
3065 __func__
, (u_int64_t
)VM_KERNEL_ADDRPERM(mp_so
),
3066 mp_so
->so_usecount
, mp_so
->so_retaincnt
));
3067 DTRACE_MPTCP4(dispose
, struct socket
*, mp_so
,
3068 struct sockbuf
*, &mp_so
->so_rcv
,
3069 struct sockbuf
*, &mp_so
->so_snd
,
3070 struct mppcb
*, mpp
);
3079 * Drop a MPTCP connection, reporting the specified error.
3082 mptcp_drop(struct mptses
*mpte
, struct mptcb
*mp_tp
, int errno
)
3084 struct socket
*mp_so
;
3086 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
3087 MPT_LOCK_ASSERT_HELD(mp_tp
);
3088 VERIFY(mpte
->mpte_mptcb
== mp_tp
);
3089 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
3091 mp_tp
->mpt_state
= MPTCPS_CLOSED
;
3092 DTRACE_MPTCP2(state__change
, struct mptcb
*, mp_tp
,
3093 uint32_t, 0 /* event */);
3095 if (errno
== ETIMEDOUT
&& mp_tp
->mpt_softerror
!= 0)
3096 errno
= mp_tp
->mpt_softerror
;
3097 mp_so
->so_error
= errno
;
3099 return (mptcp_close(mpte
, mp_tp
));
3103 * Close a MPTCP control block.
3106 mptcp_close(struct mptses
*mpte
, struct mptcb
*mp_tp
)
3108 struct socket
*mp_so
;
3109 struct mptsub
*mpts
, *tmpts
;
3111 MPTE_LOCK_ASSERT_HELD(mpte
); /* same as MP socket lock */
3112 MPT_LOCK_ASSERT_HELD(mp_tp
);
3113 VERIFY(mpte
->mpte_mptcb
== mp_tp
);
3114 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
3115 if (mp_tp
->mpt_localkey
!= NULL
) {
3116 mptcp_free_key(mp_tp
->mpt_localkey
);
3117 mp_tp
->mpt_localkey
= NULL
;
3121 soisdisconnected(mp_so
);
3124 if (mp_tp
->mpt_flags
& MPTCPF_PEEL_OFF
) {
3129 /* Clean up all subflows */
3130 TAILQ_FOREACH_SAFE(mpts
, &mpte
->mpte_subflows
, mpts_entry
, tmpts
) {
3132 mptcp_subflow_disconnect(mpte
, mpts
, TRUE
);
3134 mptcp_subflow_del(mpte
, mpts
, TRUE
);
3142 mptcp_notify_close(struct socket
*so
)
3144 soevent(so
, (SO_FILT_HINT_LOCKED
| SO_FILT_HINT_DISCONNECTED
));
3148 * Signal MPTCP thread to wake up.
3151 mptcp_thread_signal(struct mptses
*mpte
)
3153 lck_mtx_lock(&mpte
->mpte_thread_lock
);
3154 mptcp_thread_signal_locked(mpte
);
3155 lck_mtx_unlock(&mpte
->mpte_thread_lock
);
3159 * Signal MPTCP thread to wake up (locked version)
3162 mptcp_thread_signal_locked(struct mptses
*mpte
)
3164 lck_mtx_assert(&mpte
->mpte_thread_lock
, LCK_MTX_ASSERT_OWNED
);
3166 mpte
->mpte_thread_reqs
++;
3167 if (!mpte
->mpte_thread_active
&& mpte
->mpte_thread
!= THREAD_NULL
)
3168 wakeup_one((caddr_t
)&mpte
->mpte_thread
);
3172 * Signal MPTCP thread to terminate.
3175 mptcp_thread_terminate_signal(struct mptses
*mpte
)
3177 lck_mtx_lock(&mpte
->mpte_thread_lock
);
3178 if (mpte
->mpte_thread
!= THREAD_NULL
) {
3179 mpte
->mpte_thread
= THREAD_NULL
;
3180 mpte
->mpte_thread_reqs
++;
3181 if (!mpte
->mpte_thread_active
)
3182 wakeup_one((caddr_t
)&mpte
->mpte_thread
);
3184 lck_mtx_unlock(&mpte
->mpte_thread_lock
);
3188 * MPTCP thread workloop.
3191 mptcp_thread_dowork(struct mptses
*mpte
)
3193 struct socket
*mp_so
;
3194 struct mptsub
*mpts
, *tmpts
;
3195 boolean_t connect_pending
= FALSE
, disconnect_fallback
= FALSE
;
3196 boolean_t conninfo_update
= FALSE
;
3198 MPTE_LOCK(mpte
); /* same as MP socket lock */
3199 VERIFY(mpte
->mpte_mppcb
!= NULL
);
3200 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
3201 VERIFY(mp_so
!= NULL
);
3203 TAILQ_FOREACH_SAFE(mpts
, &mpte
->mpte_subflows
, mpts_entry
, tmpts
) {
3207 MPTS_ADDREF_LOCKED(mpts
); /* for us */
3209 /* Update process ownership based on parent mptcp socket */
3210 mptcp_update_last_owner(mpts
, mp_so
);
3212 mptcp_subflow_input(mpte
, mpts
);
3213 ret
= mptcp_subflow_events(mpte
, mpts
);
3215 if (mpts
->mpts_flags
& MPTSF_ACTIVE
) {
3216 mptcplog3((LOG_INFO
, "%s: cid %d \n", __func__
,
3217 mpts
->mpts_connid
));
3218 (void) mptcp_subflow_output(mpte
, mpts
);
3222 * If MPTCP socket is closed, disconnect all subflows.
3223 * This will generate a disconnect event which will
3224 * be handled during the next iteration, causing a
3225 * non-zero error to be returned above.
3227 if (mp_so
->so_flags
& SOF_PCBCLEARING
)
3228 mptcp_subflow_disconnect(mpte
, mpts
, FALSE
);
3232 case MPTS_EVRET_OK_UPDATE
:
3233 conninfo_update
= TRUE
;
3238 case MPTS_EVRET_DELETE
:
3239 if (mptcp_delete_ok(mpte
, mpts
)) {
3240 mptcp_subflow_del(mpte
, mpts
, TRUE
);
3243 case MPTS_EVRET_CONNECT_PENDING
:
3244 connect_pending
= TRUE
;
3246 case MPTS_EVRET_DISCONNECT_FALLBACK
:
3247 disconnect_fallback
= TRUE
;
3250 MPTS_REMREF(mpts
); /* ours */
3253 if (conninfo_update
) {
3254 soevent(mp_so
, SO_FILT_HINT_LOCKED
|
3255 SO_FILT_HINT_CONNINFO_UPDATED
);
3258 if (!connect_pending
&& !disconnect_fallback
) {
3263 TAILQ_FOREACH_SAFE(mpts
, &mpte
->mpte_subflows
, mpts_entry
, tmpts
) {
3265 if (disconnect_fallback
) {
3266 struct socket
*so
= NULL
;
3267 struct inpcb
*inp
= NULL
;
3268 struct tcpcb
*tp
= NULL
;
3270 if (mpts
->mpts_flags
& MPTSF_MP_DEGRADED
) {
3275 mpts
->mpts_flags
|= MPTSF_MP_DEGRADED
;
3277 if (mpts
->mpts_flags
& (MPTSF_DISCONNECTING
|
3278 MPTSF_DISCONNECTED
)) {
3282 so
= mpts
->mpts_socket
;
3285 * The MPTCP connection has degraded to a fallback
3286 * mode, so there is no point in keeping this subflow
3287 * regardless of its MPTCP-readiness state, unless it
3288 * is the primary one which we use for fallback. This
3289 * assumes that the subflow used for fallback is the
3294 inp
= sotoinpcb(so
);
3295 tp
= intotcpcb(inp
);
3297 ~(TMPF_MPTCP_READY
|TMPF_MPTCP_TRUE
);
3298 tp
->t_mpflags
|= TMPF_TCP_FALLBACK
;
3299 if (mpts
->mpts_flags
& MPTSF_ACTIVE
) {
3300 socket_unlock(so
, 1);
3304 tp
->t_mpflags
|= TMPF_RESET
;
3305 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_MUSTRST
);
3306 socket_unlock(so
, 1);
3308 } else if (connect_pending
) {
3310 * The MPTCP connection has progressed to a state
3311 * where it supports full multipath semantics; allow
3312 * additional joins to be attempted for all subflows
3313 * that are in the PENDING state.
3315 if (mpts
->mpts_flags
& MPTSF_CONNECT_PENDING
) {
3316 (void) mptcp_subflow_soconnectx(mpte
, mpts
);
3329 mptcp_thread_func(void *v
, wait_result_t w
)
3332 struct mptses
*mpte
= v
;
3333 struct timespec
*ts
= NULL
;
3335 VERIFY(mpte
!= NULL
);
3337 lck_mtx_lock_spin(&mpte
->mpte_thread_lock
);
3340 lck_mtx_assert(&mpte
->mpte_thread_lock
, LCK_MTX_ASSERT_OWNED
);
3342 if (mpte
->mpte_thread
!= THREAD_NULL
) {
3343 (void) msleep(&mpte
->mpte_thread
,
3344 &mpte
->mpte_thread_lock
, (PZERO
- 1) | PSPIN
,
3348 /* MPTCP socket is closed? */
3349 if (mpte
->mpte_thread
== THREAD_NULL
) {
3350 lck_mtx_unlock(&mpte
->mpte_thread_lock
);
3351 /* callee will destroy thread lock */
3352 mptcp_thread_destroy(mpte
);
3357 mpte
->mpte_thread_active
= 1;
3359 uint32_t reqs
= mpte
->mpte_thread_reqs
;
3361 lck_mtx_unlock(&mpte
->mpte_thread_lock
);
3362 mptcp_thread_dowork(mpte
);
3363 lck_mtx_lock_spin(&mpte
->mpte_thread_lock
);
3365 /* if there's no pending request, we're done */
3366 if (reqs
== mpte
->mpte_thread_reqs
||
3367 mpte
->mpte_thread
== THREAD_NULL
)
3370 mpte
->mpte_thread_reqs
= 0;
3371 mpte
->mpte_thread_active
= 0;
3376 * Destroy a MTCP thread, to be called in the MPTCP thread context
3377 * upon receiving an indication to self-terminate. This routine
3378 * will not return, as the current thread is terminated at the end.
3381 mptcp_thread_destroy(struct mptses
*mpte
)
3383 struct socket
*mp_so
;
3385 MPTE_LOCK(mpte
); /* same as MP socket lock */
3386 VERIFY(mpte
->mpte_thread
== THREAD_NULL
);
3387 VERIFY(mpte
->mpte_mppcb
!= NULL
);
3389 mptcp_sesdestroy(mpte
);
3391 mp_so
= mpte
->mpte_mppcb
->mpp_socket
;
3392 VERIFY(mp_so
!= NULL
);
3393 VERIFY(mp_so
->so_usecount
!= 0);
3394 mp_so
->so_usecount
--; /* for thread */
3395 mpte
->mpte_mppcb
->mpp_flags
|= MPP_DEFUNCT
;
3398 /* for the extra refcnt from kernel_thread_start() */
3399 thread_deallocate(current_thread());
3400 /* this is the end */
3401 thread_terminate(current_thread());
3406 * Protocol pr_lock callback.
3409 mptcp_lock(struct socket
*mp_so
, int refcount
, void *lr
)
3411 struct mppcb
*mpp
= sotomppcb(mp_so
);
3415 lr_saved
= __builtin_return_address(0);
3420 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__
,
3421 mp_so
, lr_saved
, solockhistory_nr(mp_so
));
3424 lck_mtx_lock(&mpp
->mpp_lock
);
3426 if (mp_so
->so_usecount
< 0) {
3427 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__
,
3428 mp_so
, mp_so
->so_pcb
, lr_saved
, mp_so
->so_usecount
,
3429 solockhistory_nr(mp_so
));
3433 mp_so
->so_usecount
++;
3434 mp_so
->lock_lr
[mp_so
->next_lock_lr
] = lr_saved
;
3435 mp_so
->next_lock_lr
= (mp_so
->next_lock_lr
+ 1) % SO_LCKDBG_MAX
;
3441 * Protocol pr_unlock callback.
3444 mptcp_unlock(struct socket
*mp_so
, int refcount
, void *lr
)
3446 struct mppcb
*mpp
= sotomppcb(mp_so
);
3450 lr_saved
= __builtin_return_address(0);
3455 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__
,
3456 mp_so
, mp_so
->so_usecount
, lr_saved
,
3457 solockhistory_nr(mp_so
));
3460 lck_mtx_assert(&mpp
->mpp_lock
, LCK_MTX_ASSERT_OWNED
);
3463 mp_so
->so_usecount
--;
3465 if (mp_so
->so_usecount
< 0) {
3466 panic("%s: so=%p usecount=%x lrh= %s\n", __func__
,
3467 mp_so
, mp_so
->so_usecount
, solockhistory_nr(mp_so
));
3470 mp_so
->unlock_lr
[mp_so
->next_unlock_lr
] = lr_saved
;
3471 mp_so
->next_unlock_lr
= (mp_so
->next_unlock_lr
+ 1) % SO_LCKDBG_MAX
;
3472 lck_mtx_unlock(&mpp
->mpp_lock
);
3478 * Protocol pr_getlock callback.
3481 mptcp_getlock(struct socket
*mp_so
, int locktype
)
3483 #pragma unused(locktype)
3484 struct mppcb
*mpp
= sotomppcb(mp_so
);
3487 panic("%s: so=%p NULL so_pcb %s\n", __func__
, mp_so
,
3488 solockhistory_nr(mp_so
));
3491 if (mp_so
->so_usecount
< 0) {
3492 panic("%s: so=%p usecount=%x lrh= %s\n", __func__
,
3493 mp_so
, mp_so
->so_usecount
, solockhistory_nr(mp_so
));
3496 return (&mpp
->mpp_lock
);
3500 * Key generation functions
3503 mptcp_generate_unique_key(struct mptcp_key_entry
*key_entry
)
3505 struct mptcp_key_entry
*key_elm
;
3507 read_random(&key_entry
->mkey_value
, sizeof (key_entry
->mkey_value
));
3508 if (key_entry
->mkey_value
== 0)
3510 mptcp_do_sha1(&key_entry
->mkey_value
, key_entry
->mkey_digest
,
3511 sizeof (key_entry
->mkey_digest
));
3513 LIST_FOREACH(key_elm
, &mptcp_keys_pool
, mkey_next
) {
3514 if (key_elm
->mkey_value
== key_entry
->mkey_value
) {
3517 if (bcmp(key_elm
->mkey_digest
, key_entry
->mkey_digest
, 4) ==
3524 static mptcp_key_t
*
3525 mptcp_reserve_key(void)
3527 struct mptcp_key_entry
*key_elm
;
3528 struct mptcp_key_entry
*found_elm
= NULL
;
3530 lck_mtx_lock(&mptcp_keys_pool
.mkph_lock
);
3531 LIST_FOREACH(key_elm
, &mptcp_keys_pool
, mkey_next
) {
3532 if (key_elm
->mkey_flags
== MKEYF_FREE
) {
3533 key_elm
->mkey_flags
= MKEYF_INUSE
;
3534 found_elm
= key_elm
;
3538 lck_mtx_unlock(&mptcp_keys_pool
.mkph_lock
);
3541 return (&found_elm
->mkey_value
);
3544 key_elm
= (struct mptcp_key_entry
*)
3545 zalloc(mptcp_keys_pool
.mkph_key_entry_zone
);
3546 key_elm
->mkey_flags
= MKEYF_INUSE
;
3548 lck_mtx_lock(&mptcp_keys_pool
.mkph_lock
);
3549 mptcp_generate_unique_key(key_elm
);
3550 LIST_INSERT_HEAD(&mptcp_keys_pool
, key_elm
, mkey_next
);
3551 mptcp_keys_pool
.mkph_count
+= 1;
3552 lck_mtx_unlock(&mptcp_keys_pool
.mkph_lock
);
3553 return (&key_elm
->mkey_value
);
3557 mptcp_get_stored_digest(mptcp_key_t
*key
)
3559 struct mptcp_key_entry
*key_holder
;
3560 caddr_t digest
= NULL
;
3562 lck_mtx_lock(&mptcp_keys_pool
.mkph_lock
);
3563 key_holder
= (struct mptcp_key_entry
*)(void *)((caddr_t
)key
-
3564 offsetof(struct mptcp_key_entry
, mkey_value
));
3565 if (key_holder
->mkey_flags
!= MKEYF_INUSE
)
3566 panic_plain("%s", __func__
);
3567 digest
= &key_holder
->mkey_digest
[0];
3568 lck_mtx_unlock(&mptcp_keys_pool
.mkph_lock
);
3573 mptcp_free_key(mptcp_key_t
*key
)
3575 struct mptcp_key_entry
*key_holder
;
3576 struct mptcp_key_entry
*key_elm
;
3577 int pt
= RandomULong();
3579 mptcplog((LOG_INFO
, "%s\n", __func__
));
3581 lck_mtx_lock(&mptcp_keys_pool
.mkph_lock
);
3582 key_holder
= (struct mptcp_key_entry
*)(void*)((caddr_t
)key
-
3583 offsetof(struct mptcp_key_entry
, mkey_value
));
3584 key_holder
->mkey_flags
= MKEYF_FREE
;
3586 LIST_REMOVE(key_holder
, mkey_next
);
3587 mptcp_keys_pool
.mkph_count
-= 1;
3589 /* Free half the time */
3591 zfree(mptcp_keys_pool
.mkph_key_entry_zone
, key_holder
);
3593 /* Insert it at random point to avoid early reuse */
3595 if (mptcp_keys_pool
.mkph_count
> 1) {
3596 pt
= pt
% (mptcp_keys_pool
.mkph_count
- 1);
3597 LIST_FOREACH(key_elm
, &mptcp_keys_pool
, mkey_next
) {
3599 LIST_INSERT_AFTER(key_elm
, key_holder
,
3605 panic("missed insertion");
3607 LIST_INSERT_HEAD(&mptcp_keys_pool
, key_holder
,
3610 mptcp_keys_pool
.mkph_count
+= 1;
3612 lck_mtx_unlock(&mptcp_keys_pool
.mkph_lock
);
3616 mptcp_key_pool_init(void)
3619 struct mptcp_key_entry
*key_entry
;
3621 LIST_INIT(&mptcp_keys_pool
);
3622 mptcp_keys_pool
.mkph_count
= 0;
3624 mptcp_keys_pool
.mkph_key_elm_sz
= (vm_size_t
)
3625 (sizeof (struct mptcp_key_entry
));
3626 mptcp_keys_pool
.mkph_key_entry_zone
= zinit(
3627 mptcp_keys_pool
.mkph_key_elm_sz
,
3628 MPTCP_MX_KEY_ALLOCS
* mptcp_keys_pool
.mkph_key_elm_sz
,
3629 MPTCP_MX_PREALLOC_ZONE_SZ
, "mptkeys");
3630 if (mptcp_keys_pool
.mkph_key_entry_zone
== NULL
) {
3631 panic("%s: unable to allocate MPTCP keys zone \n", __func__
);
3634 zone_change(mptcp_keys_pool
.mkph_key_entry_zone
, Z_CALLERACCT
, FALSE
);
3635 zone_change(mptcp_keys_pool
.mkph_key_entry_zone
, Z_EXPAND
, TRUE
);
3637 for (i
= 0; i
< MPTCP_KEY_PREALLOCS_MX
; i
++) {
3638 key_entry
= (struct mptcp_key_entry
*)
3639 zalloc(mptcp_keys_pool
.mkph_key_entry_zone
);
3640 key_entry
->mkey_flags
= MKEYF_FREE
;
3641 mptcp_generate_unique_key(key_entry
);
3642 LIST_INSERT_HEAD(&mptcp_keys_pool
, key_entry
, mkey_next
);
3643 mptcp_keys_pool
.mkph_count
+= 1;
3645 lck_mtx_init(&mptcp_keys_pool
.mkph_lock
, mtcbinfo
.mppi_lock_grp
,
3646 mtcbinfo
.mppi_lock_attr
);
3650 * MPTCP Join support
3654 mptcp_attach_to_subf(struct socket
*so
, struct mptcb
*mp_tp
,
3657 struct tcpcb
*tp
= sototcpcb(so
);
3658 struct mptcp_subf_auth_entry
*sauth_entry
;
3659 MPT_LOCK_ASSERT_NOTHELD(mp_tp
);
3661 MPT_LOCK_SPIN(mp_tp
);
3662 tp
->t_mptcb
= mp_tp
;
3665 * As long as the mpts_connid is unique it can be used as the
3666 * address ID for additional subflows.
3667 * The address ID of the first flow is implicitly 0.
3669 if (mp_tp
->mpt_state
== MPTCPS_CLOSED
) {
3670 tp
->t_local_aid
= 0;
3672 tp
->t_local_aid
= conn_id
;
3673 tp
->t_mpflags
|= (TMPF_PREESTABLISHED
| TMPF_JOINED_FLOW
);
3674 so
->so_flags
|= SOF_MP_SEC_SUBFLOW
;
3676 sauth_entry
= zalloc(mpt_subauth_zone
);
3677 sauth_entry
->msae_laddr_id
= tp
->t_local_aid
;
3678 sauth_entry
->msae_raddr_id
= 0;
3679 sauth_entry
->msae_raddr_rand
= 0;
3681 sauth_entry
->msae_laddr_rand
= RandomULong();
3682 if (sauth_entry
->msae_laddr_rand
== 0)
3684 LIST_INSERT_HEAD(&mp_tp
->mpt_subauth_list
, sauth_entry
, msae_next
);
3688 mptcp_detach_mptcb_from_subf(struct mptcb
*mp_tp
, struct socket
*so
)
3690 struct mptcp_subf_auth_entry
*sauth_entry
;
3691 struct tcpcb
*tp
= sototcpcb(so
);
3698 LIST_FOREACH(sauth_entry
, &mp_tp
->mpt_subauth_list
, msae_next
) {
3699 if (sauth_entry
->msae_laddr_id
== tp
->t_local_aid
) {
3705 LIST_REMOVE(sauth_entry
, msae_next
);
3706 zfree(mpt_subauth_zone
, sauth_entry
);
3713 mptcp_get_rands(mptcp_addr_id addr_id
, struct mptcb
*mp_tp
, u_int32_t
*lrand
,
3716 struct mptcp_subf_auth_entry
*sauth_entry
;
3717 MPT_LOCK_ASSERT_NOTHELD(mp_tp
);
3720 LIST_FOREACH(sauth_entry
, &mp_tp
->mpt_subauth_list
, msae_next
) {
3721 if (sauth_entry
->msae_laddr_id
== addr_id
) {
3723 *lrand
= sauth_entry
->msae_laddr_rand
;
3725 *rrand
= sauth_entry
->msae_raddr_rand
;
3733 mptcp_set_raddr_rand(mptcp_addr_id laddr_id
, struct mptcb
*mp_tp
,
3734 mptcp_addr_id raddr_id
, u_int32_t raddr_rand
)
3736 struct mptcp_subf_auth_entry
*sauth_entry
;
3737 MPT_LOCK_ASSERT_NOTHELD(mp_tp
);
3740 LIST_FOREACH(sauth_entry
, &mp_tp
->mpt_subauth_list
, msae_next
) {
3741 if (sauth_entry
->msae_laddr_id
== laddr_id
) {
3742 if ((sauth_entry
->msae_raddr_id
!= 0) &&
3743 (sauth_entry
->msae_raddr_id
!= raddr_id
)) {
3744 mptcplog((LOG_ERR
, "MPTCP ERROR %s: mismatched"
3745 " address ids %d %d \n", __func__
, raddr_id
,
3746 sauth_entry
->msae_raddr_id
));
3750 sauth_entry
->msae_raddr_id
= raddr_id
;
3751 if ((sauth_entry
->msae_raddr_rand
!= 0) &&
3752 (sauth_entry
->msae_raddr_rand
!= raddr_rand
)) {
3753 mptcplog((LOG_ERR
, "%s: dup SYN_ACK %d %d \n",
3754 __func__
, raddr_rand
,
3755 sauth_entry
->msae_raddr_rand
));
3759 sauth_entry
->msae_raddr_rand
= raddr_rand
;
3768 * SHA1 support for MPTCP
3771 mptcp_do_sha1(mptcp_key_t
*key
, char *sha_digest
, int digest_len
)
3774 const unsigned char *sha1_base
;
3777 if (digest_len
!= SHA1_RESULTLEN
) {
3781 sha1_base
= (const unsigned char *) key
;
3782 sha1_size
= sizeof (mptcp_key_t
);
3783 SHA1Init(&sha1ctxt
);
3784 SHA1Update(&sha1ctxt
, sha1_base
, sha1_size
);
3785 SHA1Final(sha_digest
, &sha1ctxt
);
3790 mptcp_hmac_sha1(mptcp_key_t key1
, mptcp_key_t key2
,
3791 u_int32_t rand1
, u_int32_t rand2
, u_char
*digest
, int digest_len
)
3794 mptcp_key_t key_ipad
[8] = {0}; /* key XOR'd with inner pad */
3795 mptcp_key_t key_opad
[8] = {0}; /* key XOR'd with outer pad */
3799 bzero(digest
, digest_len
);
3801 /* Set up the Key for HMAC */
3808 /* Set up the message for HMAC */
3812 /* Key is 512 block length, so no need to compute hash */
3814 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
3816 for (i
= 0; i
< 8; i
++) {
3817 key_ipad
[i
] ^= 0x3636363636363636;
3818 key_opad
[i
] ^= 0x5c5c5c5c5c5c5c5c;
3821 /* Perform inner SHA1 */
3822 SHA1Init(&sha1ctxt
);
3823 SHA1Update(&sha1ctxt
, (unsigned char *)key_ipad
, sizeof (key_ipad
));
3824 SHA1Update(&sha1ctxt
, (unsigned char *)data
, sizeof (data
));
3825 SHA1Final(digest
, &sha1ctxt
);
3827 /* Perform outer SHA1 */
3828 SHA1Init(&sha1ctxt
);
3829 SHA1Update(&sha1ctxt
, (unsigned char *)key_opad
, sizeof (key_opad
));
3830 SHA1Update(&sha1ctxt
, (unsigned char *)digest
, SHA1_RESULTLEN
);
3831 SHA1Final(digest
, &sha1ctxt
);
3835 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
3836 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
3839 mptcp_get_hmac(mptcp_addr_id aid
, struct mptcb
*mp_tp
, u_char
*digest
,
3842 uint32_t lrand
, rrand
;
3843 mptcp_key_t localkey
, remotekey
;
3844 MPT_LOCK_ASSERT_NOTHELD(mp_tp
);
3846 if (digest_len
!= SHA1_RESULTLEN
)
3850 mptcp_get_rands(aid
, mp_tp
, &lrand
, &rrand
);
3851 MPT_LOCK_SPIN(mp_tp
);
3852 localkey
= *mp_tp
->mpt_localkey
;
3853 remotekey
= mp_tp
->mpt_remotekey
;
3855 mptcp_hmac_sha1(localkey
, remotekey
, lrand
, rrand
, digest
,
3860 mptcp_get_trunced_hmac(mptcp_addr_id aid
, struct mptcb
*mp_tp
)
3862 u_char digest
[SHA1_RESULTLEN
];
3863 u_int64_t trunced_digest
;
3865 mptcp_get_hmac(aid
, mp_tp
, &digest
[0], sizeof (digest
));
3866 bcopy(digest
, &trunced_digest
, 8);
3867 return (trunced_digest
);
3871 * Authentication data generation
3874 mptcp_generate_token(char *sha_digest
, int sha_digest_len
, caddr_t token
,
3877 VERIFY(token_len
== sizeof (u_int32_t
));
3878 VERIFY(sha_digest_len
== SHA1_RESULTLEN
);
3880 /* Most significant 32 bits of the SHA1 hash */
3881 bcopy(sha_digest
, token
, sizeof (u_int32_t
));
3886 mptcp_generate_idsn(char *sha_digest
, int sha_digest_len
, caddr_t idsn
,
3889 VERIFY(idsn_len
== sizeof (u_int64_t
));
3890 VERIFY(sha_digest_len
== SHA1_RESULTLEN
);
3893 * Least significant 64 bits of the SHA1 hash
3896 idsn
[7] = sha_digest
[12];
3897 idsn
[6] = sha_digest
[13];
3898 idsn
[5] = sha_digest
[14];
3899 idsn
[4] = sha_digest
[15];
3900 idsn
[3] = sha_digest
[16];
3901 idsn
[2] = sha_digest
[17];
3902 idsn
[1] = sha_digest
[18];
3903 idsn
[0] = sha_digest
[19];
3908 mptcp_init_authparms(struct mptcb
*mp_tp
)
3910 caddr_t local_digest
= NULL
;
3911 char remote_digest
[MPTCP_SHA1_RESULTLEN
];
3912 MPT_LOCK_ASSERT_HELD(mp_tp
);
3914 /* Only Version 0 is supported for auth purposes */
3915 if (mp_tp
->mpt_version
!= MP_DRAFT_VERSION_12
)
3918 /* Setup local and remote tokens and Initial DSNs */
3919 local_digest
= mptcp_get_stored_digest(mp_tp
->mpt_localkey
);
3920 mptcp_generate_token(local_digest
, SHA1_RESULTLEN
,
3921 (caddr_t
)&mp_tp
->mpt_localtoken
, sizeof (mp_tp
->mpt_localtoken
));
3922 mptcp_generate_idsn(local_digest
, SHA1_RESULTLEN
,
3923 (caddr_t
)&mp_tp
->mpt_local_idsn
, sizeof (u_int64_t
));
3925 if (!mptcp_do_sha1(&mp_tp
->mpt_remotekey
, remote_digest
,
3927 mptcplog((LOG_ERR
, "MPTCP ERROR %s: unexpected failure",
3931 mptcp_generate_token(remote_digest
, SHA1_RESULTLEN
,
3932 (caddr_t
)&mp_tp
->mpt_remotetoken
, sizeof (mp_tp
->mpt_localtoken
));
3933 mptcp_generate_idsn(remote_digest
, SHA1_RESULTLEN
,
3934 (caddr_t
)&mp_tp
->mpt_remote_idsn
, sizeof (u_int64_t
));
3939 mptcp_init_statevars(struct mptcb
*mp_tp
)
3941 MPT_LOCK_ASSERT_HELD(mp_tp
);
3943 /* The subflow SYN is also first MPTCP byte */
3944 mp_tp
->mpt_snduna
= mp_tp
->mpt_sndmax
= mp_tp
->mpt_local_idsn
+ 1;
3945 mp_tp
->mpt_sndnxt
= mp_tp
->mpt_snduna
;
3947 mp_tp
->mpt_rcvatmark
= mp_tp
->mpt_rcvnxt
= mp_tp
->mpt_remote_idsn
+ 1;
3951 mptcp_conn_properties(struct mptcb
*mp_tp
)
3953 /* There is only Version 0 at this time */
3954 mp_tp
->mpt_version
= MP_DRAFT_VERSION_12
;
3956 /* Set DSS checksum flag */
3958 mp_tp
->mpt_flags
|= MPTCPF_CHECKSUM
;
3960 /* Set up receive window */
3961 mp_tp
->mpt_rcvwnd
= mptcp_sbspace(mp_tp
);
3963 /* Set up gc ticks */
3964 mp_tp
->mpt_gc_ticks
= MPT_GC_TICKS
;
3971 mptcp_get_localtoken(void* mptcb_arg
)
3973 struct mptcb
*mp_tp
= (struct mptcb
*)mptcb_arg
;
3974 return (mp_tp
->mpt_localtoken
);
3978 mptcp_get_remotetoken(void* mptcb_arg
)
3980 struct mptcb
*mp_tp
= (struct mptcb
*)mptcb_arg
;
3981 return (mp_tp
->mpt_remotetoken
);
3985 mptcp_get_localkey(void* mptcb_arg
)
3987 struct mptcb
*mp_tp
= (struct mptcb
*)mptcb_arg
;
3988 if (mp_tp
->mpt_localkey
!= NULL
)
3989 return (*mp_tp
->mpt_localkey
);
3995 mptcp_get_remotekey(void* mptcb_arg
)
3997 struct mptcb
*mp_tp
= (struct mptcb
*)mptcb_arg
;
3998 return (mp_tp
->mpt_remotekey
);
4002 mptcp_send_dfin(struct socket
*so
)
4004 struct tcpcb
*tp
= NULL
;
4005 struct inpcb
*inp
= NULL
;
4007 inp
= sotoinpcb(so
);
4011 tp
= intotcpcb(inp
);
4015 if (!(tp
->t_mpflags
& TMPF_RESET
))
4016 tp
->t_mpflags
|= TMPF_SEND_DFIN
;
4020 * Data Sequence Mapping routines
4023 mptcp_insert_dsn(struct mppcb
*mpp
, struct mbuf
*m
)
4025 struct mptcb
*mp_tp
;
4030 mp_tp
= &((struct mpp_mtp
*)mpp
)->mtcb
;
4032 if (mp_tp
->mpt_state
< MPTCPS_ESTABLISHED
) {
4034 panic("%s: data write before establishment.",
4040 VERIFY(m
->m_flags
& M_PKTHDR
);
4041 m
->m_pkthdr
.pkt_flags
|= (PKTF_MPTCP
| PKTF_MPSO
);
4042 m
->m_pkthdr
.mp_dsn
= mp_tp
->mpt_sndmax
;
4043 m
->m_pkthdr
.mp_rlen
= m_pktlen(m
);
4044 mp_tp
->mpt_sndmax
+= m_pktlen(m
);
4051 mptcp_preproc_sbdrop(struct mbuf
*m
, unsigned int len
)
4053 u_int32_t sub_len
= 0;
4056 VERIFY(m
->m_flags
& M_PKTHDR
);
4058 if (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
) {
4059 sub_len
= m
->m_pkthdr
.mp_rlen
;
4061 if (sub_len
< len
) {
4062 m
->m_pkthdr
.mp_dsn
+= sub_len
;
4063 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPSO
)) {
4064 m
->m_pkthdr
.mp_rseq
+= sub_len
;
4066 m
->m_pkthdr
.mp_rlen
= 0;
4069 /* sub_len >= len */
4070 m
->m_pkthdr
.mp_dsn
+= len
;
4071 if (!(m
->m_pkthdr
.pkt_flags
& PKTF_MPSO
)) {
4072 m
->m_pkthdr
.mp_rseq
+= len
;
4074 mptcplog3((LOG_INFO
,
4075 "%s: %llu %u %d %d\n", __func__
,
4076 m
->m_pkthdr
.mp_dsn
, m
->m_pkthdr
.mp_rseq
,
4077 m
->m_pkthdr
.mp_rlen
, len
));
4078 m
->m_pkthdr
.mp_rlen
-= len
;
4082 panic("%s: MPTCP tag not set", __func__
);
4089 /* Obtain the DSN mapping stored in the mbuf */
4091 mptcp_output_getm_dsnmap32(struct socket
*so
, int off
, uint32_t datalen
,
4092 u_int32_t
*dsn
, u_int32_t
*relseq
, u_int16_t
*data_len
, u_int64_t
*dsn64p
)
4096 mptcp_output_getm_dsnmap64(so
, off
, datalen
, &dsn64
, relseq
, data_len
);
4097 *dsn
= (u_int32_t
)MPTCP_DATASEQ_LOW32(dsn64
);
4102 mptcp_output_getm_dsnmap64(struct socket
*so
, int off
, uint32_t datalen
,
4103 u_int64_t
*dsn
, u_int32_t
*relseq
, u_int16_t
*data_len
)
4105 struct mbuf
*m
= so
->so_snd
.sb_mb
;
4106 struct mbuf
*mnext
= NULL
;
4107 uint32_t runlen
= 0;
4109 uint32_t contig_len
= 0;
4117 * In the subflow socket, the DSN sequencing can be discontiguous,
4118 * but the subflow sequence mapping is contiguous. Use the subflow
4119 * sequence property to find the right mbuf and corresponding dsn
4124 VERIFY(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
);
4125 VERIFY(m
->m_flags
& M_PKTHDR
);
4127 if ((unsigned int)off
>= m
->m_pkthdr
.mp_rlen
) {
4128 off
-= m
->m_pkthdr
.mp_rlen
;
4136 panic("%s: bad offset", __func__
);
4140 dsn64
= m
->m_pkthdr
.mp_dsn
+ off
;
4142 *relseq
= m
->m_pkthdr
.mp_rseq
+ off
;
4145 * Now find the last contiguous byte and its length from
4148 runlen
= m
->m_pkthdr
.mp_rlen
- off
;
4149 contig_len
= runlen
;
4151 /* If datalen does not span multiple mbufs, return */
4152 if (datalen
<= runlen
) {
4153 *data_len
= min(datalen
, UINT16_MAX
);
4158 while (datalen
> runlen
) {
4159 if (mnext
== NULL
) {
4160 panic("%s: bad datalen = %d, %d %d", __func__
, datalen
,
4164 VERIFY(mnext
->m_flags
& M_PKTHDR
);
4165 VERIFY(mnext
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
);
4168 * case A. contiguous DSN stream
4169 * case B. discontiguous DSN stream
4171 if (mnext
->m_pkthdr
.mp_dsn
== (dsn64
+ runlen
)) {
4173 runlen
+= mnext
->m_pkthdr
.mp_rlen
;
4174 contig_len
+= mnext
->m_pkthdr
.mp_rlen
;
4175 mptcplog3((LOG_INFO
, "%s: contig \n",
4179 mptcplog((LOG_INFO
, "%s: discontig %d %d \n",
4180 __func__
, datalen
, contig_len
));
4183 mnext
= mnext
->m_next
;
4185 datalen
= min(datalen
, UINT16_MAX
);
4186 *data_len
= min(datalen
, contig_len
);
4187 mptcplog3((LOG_INFO
, "%s: %llu %u %d %d \n", __func__
,
4188 *dsn
, *relseq
, *data_len
, off
));
4192 * MPTCP's notion of the next insequence Data Sequence number is adjusted
4193 * here. It must be called from mptcp_adj_rmap() which is called only after
4194 * reassembly of out of order data. The rcvnxt variable must
4195 * be updated only when atleast some insequence new data is received.
4198 mptcp_adj_rcvnxt(struct tcpcb
*tp
, struct mbuf
*m
)
4200 struct mptcb
*mp_tp
= tptomptp(tp
);
4205 if ((MPTCP_SEQ_GEQ(mp_tp
->mpt_rcvnxt
, m
->m_pkthdr
.mp_dsn
)) &&
4206 (MPTCP_SEQ_LEQ(mp_tp
->mpt_rcvnxt
, (m
->m_pkthdr
.mp_dsn
+
4207 m
->m_pkthdr
.mp_rlen
)))) {
4208 mp_tp
->mpt_rcvnxt
= m
->m_pkthdr
.mp_dsn
+ m
->m_pkthdr
.mp_rlen
;
4214 * Note that this is called only from tcp_input() which may trim data
4215 * after the dsn mapping is inserted into the mbuf. When it trims data
4216 * tcp_input calls m_adj() which does not remove the m_pkthdr even if the
4217 * m_len becomes 0 as a result of trimming the mbuf. The dsn map insertion
4218 * cannot be delayed after trim, because data can be in the reassembly
4219 * queue for a while and the DSN option info in tp will be overwritten for
4220 * every new packet received.
4221 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4222 * with mptcp_adj_rmap()
4225 mptcp_insert_rmap(struct tcpcb
*tp
, struct mbuf
*m
)
4227 VERIFY(!(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
));
4229 if (tp
->t_mpflags
& TMPF_EMBED_DSN
) {
4230 VERIFY(m
->m_flags
& M_PKTHDR
);
4231 m
->m_pkthdr
.mp_dsn
= tp
->t_rcv_map
.mpt_dsn
;
4232 m
->m_pkthdr
.mp_rseq
= tp
->t_rcv_map
.mpt_sseq
;
4233 m
->m_pkthdr
.mp_rlen
= tp
->t_rcv_map
.mpt_len
;
4234 m
->m_pkthdr
.pkt_flags
|= PKTF_MPTCP
;
4235 tp
->t_mpflags
&= ~TMPF_EMBED_DSN
;
4236 tp
->t_mpflags
|= TMPF_MPTCP_ACKNOW
;
4241 mptcp_adj_rmap(struct socket
*so
, struct mbuf
*m
)
4244 u_int32_t sseq
, datalen
;
4245 struct tcpcb
*tp
= intotcpcb(sotoinpcb(so
));
4246 u_int32_t old_rcvnxt
= 0;
4248 if (m_pktlen(m
) == 0)
4251 if (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
) {
4252 VERIFY(m
->m_flags
& M_PKTHDR
);
4254 dsn
= m
->m_pkthdr
.mp_dsn
;
4255 sseq
= m
->m_pkthdr
.mp_rseq
+ tp
->irs
;
4256 datalen
= m
->m_pkthdr
.mp_rlen
;
4258 /* data arrived without an DSS option mapping */
4259 mptcp_notify_mpfail(so
);
4263 /* In the common case, data is in window and in sequence */
4264 if (m
->m_pkthdr
.len
== (int)datalen
) {
4265 mptcp_adj_rcvnxt(tp
, m
);
4269 if (m
->m_pkthdr
.len
> (int)datalen
) {
4270 panic("%s: mbuf len = %d expected = %d", __func__
,
4271 m
->m_pkthdr
.len
, datalen
);
4274 old_rcvnxt
= tp
->rcv_nxt
- m
->m_pkthdr
.len
;
4275 if (SEQ_GT(old_rcvnxt
, sseq
)) {
4276 /* data trimmed from the left */
4277 int off
= old_rcvnxt
- sseq
;
4278 m
->m_pkthdr
.mp_dsn
+= off
;
4279 m
->m_pkthdr
.mp_rseq
+= off
;
4280 m
->m_pkthdr
.mp_rlen
-= off
;
4281 } else if (old_rcvnxt
== sseq
) {
4283 * Data was trimmed from the right
4285 m
->m_pkthdr
.mp_rlen
= m
->m_pkthdr
.len
;
4287 /* XXX handle gracefully with reass or fallback in January */
4288 panic("%s: partial map %u %u", __func__
, old_rcvnxt
, sseq
);
4291 mptcp_adj_rcvnxt(tp
, m
);
4296 * Following routines help with failure detection and failover of data
4297 * transfer from one subflow to another.
4300 mptcp_act_on_txfail(struct socket
*so
)
4302 struct tcpcb
*tp
= NULL
;
4303 struct inpcb
*inp
= sotoinpcb(so
);
4308 tp
= intotcpcb(inp
);
4312 if (tp
->t_state
!= TCPS_ESTABLISHED
)
4313 mptcplog((LOG_INFO
, "%s: state = %d \n", __func__
,
4316 if (so
->so_flags
& SOF_MP_TRYFAILOVER
) {
4320 so
->so_flags
|= SOF_MP_TRYFAILOVER
;
4321 soevent(so
, (SO_FILT_HINT_LOCKED
| SO_FILT_HINT_MPFAILOVER
));
4325 * Support for MP_FAIL option
4328 mptcp_get_map_for_dsn(struct socket
*so
, u_int64_t dsn_fail
, u_int32_t
*tcp_seq
)
4330 struct mbuf
*m
= so
->so_snd
.sb_mb
;
4339 VERIFY(m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP
);
4340 VERIFY(m
->m_flags
& M_PKTHDR
);
4341 dsn
= m
->m_pkthdr
.mp_dsn
;
4342 datalen
= m
->m_pkthdr
.mp_rlen
;
4343 if (MPTCP_SEQ_LEQ(dsn
, dsn_fail
) &&
4344 (MPTCP_SEQ_GEQ(dsn
+ datalen
, dsn_fail
))) {
4345 off
= dsn_fail
- dsn
;
4346 *tcp_seq
= m
->m_pkthdr
.mp_rseq
+ off
;
4354 * If there was no mbuf data and a fallback to TCP occurred, there's
4355 * not much else to do.
4358 mptcplog((LOG_ERR
, "%s: %llu not found \n", __func__
, dsn_fail
));
4363 * Support for sending contiguous MPTCP bytes in subflow
4366 mptcp_adj_sendlen(struct socket
*so
, int32_t off
, int32_t len
)
4368 u_int64_t mdss_dsn
= 0;
4369 u_int32_t mdss_subflow_seq
= 0;
4370 u_int16_t mdss_data_len
= 0;
4375 mptcp_output_getm_dsnmap64(so
, off
, (u_int32_t
)len
,
4376 &mdss_dsn
, &mdss_subflow_seq
, &mdss_data_len
);
4378 return (mdss_data_len
);
4382 mptcp_sbspace(struct mptcb
*mpt
)
4388 MPT_LOCK_ASSERT_HELD(mpt
);
4389 MPTE_LOCK_ASSERT_HELD(mpt
->mpt_mpte
);
4391 sb
= &mpt
->mpt_mpte
->mpte_mppcb
->mpp_socket
->so_rcv
;
4392 rcvbuf
= sb
->sb_hiwat
;
4393 space
= ((int32_t)imin((rcvbuf
- sb
->sb_cc
),
4394 (sb
->sb_mbmax
- sb
->sb_mbcnt
)));
4397 /* XXX check if it's too small? */
4403 * Support Fallback to Regular TCP
4406 mptcp_notify_mpready(struct socket
*so
)
4408 struct tcpcb
*tp
= NULL
;
4413 tp
= intotcpcb(sotoinpcb(so
));
4418 DTRACE_MPTCP4(multipath__ready
, struct socket
*, so
,
4419 struct sockbuf
*, &so
->so_rcv
, struct sockbuf
*, &so
->so_snd
,
4420 struct tcpcb
*, tp
);
4422 if (!(tp
->t_mpflags
& TMPF_MPTCP_TRUE
))
4425 if (tp
->t_mpflags
& TMPF_MPTCP_READY
)
4428 tp
->t_mpflags
&= ~TMPF_TCP_FALLBACK
;
4429 tp
->t_mpflags
|= TMPF_MPTCP_READY
;
4431 soevent(so
, (SO_FILT_HINT_LOCKED
| SO_FILT_HINT_MPSTATUS
));
4435 mptcp_notify_mpfail(struct socket
*so
)
4437 struct tcpcb
*tp
= NULL
;
4442 tp
= intotcpcb(sotoinpcb(so
));
4447 DTRACE_MPTCP4(multipath__failed
, struct socket
*, so
,
4448 struct sockbuf
*, &so
->so_rcv
, struct sockbuf
*, &so
->so_snd
,
4449 struct tcpcb
*, tp
);
4451 if (tp
->t_mpflags
& TMPF_TCP_FALLBACK
)
4454 tp
->t_mpflags
&= ~(TMPF_MPTCP_READY
|TMPF_MPTCP_TRUE
);
4455 tp
->t_mpflags
|= TMPF_TCP_FALLBACK
;
4457 soevent(so
, (SO_FILT_HINT_LOCKED
| SO_FILT_HINT_MPSTATUS
));
4461 * Keepalive helper function
4464 mptcp_ok_to_keepalive(struct mptcb
*mp_tp
)
4467 VERIFY(mp_tp
!= NULL
);
4469 if (mp_tp
->mpt_state
>= MPTCPS_CLOSE_WAIT
) {
4477 * MPTCP t_maxseg adjustment function
4480 mptcp_adj_mss(struct tcpcb
*tp
, boolean_t mtudisc
)
4483 struct mptcb
*mp_tp
= tptomptp(tp
);
4485 #define MPTCP_COMPUTE_LEN { \
4486 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
4488 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
4491 /* adjust to 32-bit boundary + EOL */ \
4493 MPT_UNLOCK(mp_tp); \
4499 * For the first subflow and subsequent subflows, adjust mss for
4500 * most common MPTCP option size, for case where tcp_mss is called
4501 * during option processing and MTU discovery.
4503 if ((tp
->t_mpflags
& TMPF_PREESTABLISHED
) &&
4504 (!(tp
->t_mpflags
& TMPF_JOINED_FLOW
))) {
4508 if ((tp
->t_mpflags
& TMPF_PREESTABLISHED
) &&
4509 (tp
->t_mpflags
& TMPF_SENT_JOIN
)) {
4513 if ((mtudisc
) && (tp
->t_mpflags
& TMPF_MPTCP_TRUE
)) {
4521 * Update the pid, upid, uuid of the subflow so, based on parent so
4524 mptcp_update_last_owner(struct mptsub
*mpts
, struct socket
*parent_mpso
)
4526 struct socket
*subflow_so
= mpts
->mpts_socket
;
4528 MPTS_LOCK_ASSERT_HELD(mpts
);
4530 socket_lock(subflow_so
, 0);
4531 if ((subflow_so
->last_pid
!= parent_mpso
->last_pid
) ||
4532 (subflow_so
->last_upid
!= parent_mpso
->last_upid
)) {
4533 subflow_so
->last_upid
= parent_mpso
->last_upid
;
4534 subflow_so
->last_pid
= parent_mpso
->last_pid
;
4535 uuid_copy(subflow_so
->last_uuid
, parent_mpso
->last_uuid
);
4537 so_update_policy(subflow_so
);
4538 socket_unlock(subflow_so
, 0);
4542 fill_mptcp_subflow(struct socket
*so
, mptcp_flow_t
*flow
, struct mptsub
*mpts
)
4546 tcp_getconninfo(so
, &flow
->flow_ci
);
4547 inp
= sotoinpcb(so
);
4549 if ((inp
->inp_vflag
& INP_IPV6
) != 0) {
4550 flow
->flow_src
.ss_family
= AF_INET6
;
4551 flow
->flow_dst
.ss_family
= AF_INET6
;
4552 flow
->flow_src
.ss_len
= sizeof(struct sockaddr_in6
);
4553 flow
->flow_dst
.ss_len
= sizeof(struct sockaddr_in6
);
4554 SIN6(&flow
->flow_src
)->sin6_port
= inp
->in6p_lport
;
4555 SIN6(&flow
->flow_dst
)->sin6_port
= inp
->in6p_fport
;
4556 SIN6(&flow
->flow_src
)->sin6_addr
= inp
->in6p_laddr
;
4557 SIN6(&flow
->flow_dst
)->sin6_addr
= inp
->in6p_faddr
;
4561 flow
->flow_src
.ss_family
= AF_INET
;
4562 flow
->flow_dst
.ss_family
= AF_INET
;
4563 flow
->flow_src
.ss_len
= sizeof(struct sockaddr_in
);
4564 flow
->flow_dst
.ss_len
= sizeof(struct sockaddr_in
);
4565 SIN(&flow
->flow_src
)->sin_port
= inp
->inp_lport
;
4566 SIN(&flow
->flow_dst
)->sin_port
= inp
->inp_fport
;
4567 SIN(&flow
->flow_src
)->sin_addr
= inp
->inp_laddr
;
4568 SIN(&flow
->flow_dst
)->sin_addr
= inp
->inp_faddr
;
4570 flow
->flow_flags
= mpts
->mpts_flags
;
4571 flow
->flow_cid
= mpts
->mpts_connid
;
4575 mptcp_pcblist SYSCTL_HANDLER_ARGS
4577 #pragma unused(oidp, arg1, arg2)
4581 struct mptses
*mpte
;
4582 struct mptcb
*mp_tp
;
4583 struct mptsub
*mpts
;
4585 conninfo_mptcp_t mptcpci
;
4586 mptcp_flow_t
*flows
;
4588 if (req
->newptr
!= USER_ADDR_NULL
)
4591 lck_mtx_lock(&mtcbinfo
.mppi_lock
);
4592 n
= mtcbinfo
.mppi_count
;
4593 if (req
->oldptr
== USER_ADDR_NULL
) {
4594 lck_mtx_unlock(&mtcbinfo
.mppi_lock
);
4595 req
->oldidx
= (n
+ n
/8) * sizeof(conninfo_mptcp_t
) +
4596 4 * (n
+ n
/8) * sizeof(mptcp_flow_t
);
4599 TAILQ_FOREACH(mpp
, &mtcbinfo
.mppi_pcbs
, mpp_entry
) {
4600 bzero(&mptcpci
, sizeof(mptcpci
));
4601 lck_mtx_lock(&mpp
->mpp_lock
);
4602 VERIFY(mpp
->mpp_flags
& MPP_ATTACHED
);
4603 mpte
= mptompte(mpp
);
4604 VERIFY(mpte
!= NULL
);
4605 mp_tp
= mpte
->mpte_mptcb
;
4606 VERIFY(mp_tp
!= NULL
);
4607 len
= sizeof(*flows
) * mpte
->mpte_numflows
;
4608 flows
= _MALLOC(len
, M_TEMP
, M_WAITOK
| M_ZERO
);
4609 if (flows
== NULL
) {
4610 lck_mtx_unlock(&mpp
->mpp_lock
);
4613 /* N.B. we don't take the mpt_lock just for the state. */
4614 mptcpci
.mptcpci_state
= mp_tp
->mpt_state
;
4615 mptcpci
.mptcpci_nflows
= mpte
->mpte_numflows
;
4616 mptcpci
.mptcpci_len
= sizeof(mptcpci
) +
4617 sizeof(*flows
) * (mptcpci
.mptcpci_nflows
- 1);
4618 error
= SYSCTL_OUT(req
, &mptcpci
,
4619 sizeof(mptcpci
) - sizeof(*flows
));
4621 lck_mtx_unlock(&mpp
->mpp_lock
);
4622 FREE(flows
, M_TEMP
);
4626 TAILQ_FOREACH(mpts
, &mpte
->mpte_subflows
, mpts_entry
) {
4628 so
= mpts
->mpts_socket
;
4630 fill_mptcp_subflow(so
, &flows
[f
], mpts
);
4631 socket_unlock(so
, 0);
4635 lck_mtx_unlock(&mpp
->mpp_lock
);
4636 error
= SYSCTL_OUT(req
, flows
, len
);
4637 FREE(flows
, M_TEMP
);
4641 lck_mtx_unlock(&mtcbinfo
.mppi_lock
);
4646 SYSCTL_PROC(_net_inet_mptcp
, OID_AUTO
, pcblist
, CTLFLAG_RD
| CTLFLAG_LOCKED
,
4647 0, 0, mptcp_pcblist
, "S,conninfo_mptcp_t",
4648 "List of active MPTCP connections");