]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/mptcp_subr.c
xnu-2782.10.72.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
CommitLineData
39236c6e 1/*
fe8ab488 2 * Copyright (c) 2012-2014 Apple Inc. All rights reserved.
39236c6e
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
50#include <netinet/in.h>
51#include <netinet/in_pcb.h>
52#include <netinet/in_var.h>
53#include <netinet/tcp.h>
54#include <netinet/tcp_fsm.h>
55#include <netinet/tcp_seq.h>
56#include <netinet/tcp_var.h>
57#include <netinet/mptcp_var.h>
58#include <netinet/mptcp.h>
59#include <netinet/mptcp_seq.h>
60#include <netinet/mptcp_timer.h>
61#include <libkern/crypto/sha1.h>
62#if INET6
63#include <netinet6/in6_pcb.h>
64#include <netinet6/ip6protosw.h>
65#endif /* INET6 */
66#include <dev/random/randomdev.h>
67
68/*
69 * Notes on MPTCP implementation.
70 *
71 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
72 * communication domain. The structure mtcbinfo describes the MPTCP instance
73 * of a Multipath protocol in that domain. It is used to keep track of all
74 * MPTCP PCB instances in the system, and is protected by the global lock
75 * mppi_lock.
76 *
77 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
78 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
79 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
80 * allocated from the same memory block, and each structure has a pointer
81 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
82 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
83 * PCB (mppcb) as well as the MPTCP Session (mptses).
84 *
85 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
86 * in particular, the list of subflows as well as the MPTCP thread.
87 *
88 * A functioning MPTCP Session consists of one or more subflow sockets. Each
89 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
90 * represented by the mptsub structure. Because each subflow requires access
91 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
92 * subflow. This gets decremented prior to the subflow's destruction. The
93 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
94 *
95 * To handle events (read, write, control) from the subflows, an MPTCP thread
96 * is created; currently, there is one thread per MPTCP Session. In order to
97 * prevent the MPTCP socket from being destroyed while being accessed by the
98 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
99 * which will be decremented prior to the thread's termination. The thread
100 * lock (mpte_thread_lock) is used to synchronize its signalling.
101 *
102 * Lock ordering is defined as follows:
103 *
104 * mtcbinfo (mppi_lock)
105 * mp_so (mpp_lock)
106 * mpts (mpts_lock)
107 * so (inpcb_mtx)
108 * mptcb (mpt_lock)
109 *
 * It is not a requirement that all of the above locks need to be acquired
 * in succession, but the correct lock ordering must be followed when there
 * is more than one lock that needs to be held. The MPTCP thread lock is
 * not constrained by this arrangement, because none of the other locks
 * is ever acquired while holding mpte_thread_lock; therefore it may be called
 * at any moment to signal the thread.
116 *
117 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
118 * work is done by the MPTCP garbage collector which is invoked on demand by
119 * the PF_MULTIPATH garbage collector. This process will take place once all
120 * of the subflows have been destroyed, and the MPTCP thread be instructed to
121 * self-terminate.
122 */
123
124static void mptcp_sesdestroy(struct mptses *);
125static void mptcp_thread_signal_locked(struct mptses *);
126static void mptcp_thread_terminate_signal(struct mptses *);
127static void mptcp_thread_dowork(struct mptses *);
128static void mptcp_thread_func(void *, wait_result_t);
129static void mptcp_thread_destroy(struct mptses *);
130static void mptcp_key_pool_init(void);
fe8ab488 131static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
39236c6e
A
132static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
133static void mptcp_conn_properties(struct mptcb *);
134static void mptcp_init_statevars(struct mptcb *);
135
136static uint32_t mptcp_gc(struct mppcbinfo *);
137static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
138 int, struct proc *, struct socket **);
139static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
140static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
141static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
142 struct uio *, struct mbuf **, struct mbuf **, int *);
143static void mptcp_subflow_rupcall(struct socket *, void *, int);
144static void mptcp_subflow_input(struct mptses *, struct mptsub *);
145static void mptcp_subflow_wupcall(struct socket *, void *, int);
146static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
147static void mptcp_update_last_owner(struct mptsub *, struct socket *);
fe8ab488 148static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
39236c6e
A
149
150/*
151 * Possible return values for subflow event handlers. Note that success
152 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
153 * indicate errors or actions which require immediate attention; they will
154 * prevent the rest of the handlers from processing their respective events
155 * until the next round of events processing.
156 */
157typedef enum {
158 MPTS_EVRET_DELETE = 1, /* delete this subflow */
159 MPTS_EVRET_OK = 2, /* OK */
160 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
161 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
162 MPTS_EVRET_OK_UPDATE = 5, /* OK with conninfo update */
163} ev_ret_t;
164
165static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *);
166static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *);
167static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *);
168static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *);
169static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *);
170static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *);
171static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *);
172static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *);
173static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *);
174static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *);
175static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *);
176static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *);
177static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *);
178static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *);
fe8ab488
A
179static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *);
180static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *);
181static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *);
182
39236c6e
A
183static const char *mptcp_evret2str(ev_ret_t);
184
185static mptcp_key_t *mptcp_reserve_key(void);
186static int mptcp_do_sha1(mptcp_key_t *, char *, int);
187static int mptcp_init_authparms(struct mptcb *);
39236c6e
A
188
189static unsigned int mptsub_zone_size; /* size of mptsub */
190static struct zone *mptsub_zone; /* zone for mptsub */
191
192static unsigned int mptopt_zone_size; /* size of mptopt */
193static struct zone *mptopt_zone; /* zone for mptopt */
194
195static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
196static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
197
198struct mppcbinfo mtcbinfo;
199
200static struct mptcp_keys_pool_head mptcp_keys_pool;
201
202#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
203#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
204
205SYSCTL_DECL(_net_inet);
206
207SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
208
209uint32_t mptcp_verbose = 0; /* more noise if greater than 1 */
210SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED,
211 &mptcp_verbose, 0, "MPTCP verbosity level");
212
213SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
214 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
215
216/*
217 * Since there is one kernel thread per mptcp socket, imposing an artificial
218 * limit on number of allowed mptcp sockets.
219 */
220uint32_t mptcp_socket_limit = MPPCB_LIMIT;
221SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
222 &mptcp_socket_limit, 0, "MPTCP socket limit");
223
fe8ab488
A
224/*
225 * SYSCTL to turn on delayed cellular subflow start.
226 */
227uint32_t mptcp_delayed_subf_start = 0;
228SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
229 &mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
230
231/*
232 * SYSCTL for RTT spike measurement threshold in msecs.
233 */
234int32_t mptcp_rto_spike_thresh = 3000;
235SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, rto_spikethresh,
236 CTLFLAG_RW|CTLFLAG_LOCKED, &mptcp_rto_spike_thresh, 0,
237 "MPTCP RTT spike thresh");
238
39236c6e
A
239static struct protosw mptcp_subflow_protosw;
240static struct pr_usrreqs mptcp_subflow_usrreqs;
241#if INET6
242static struct ip6protosw mptcp_subflow_protosw6;
243static struct pr_usrreqs mptcp_subflow_usrreqs6;
244#endif /* INET6 */
245
/*
 * Protocol pr_init callback.
 *
 * Runs once, when the PF_MULTIPATH domain attaches this protosw.  It
 * clones the existing IPPROTO_TCP protosw/usrreqs entries (v4 and,
 * when built, v6) so subflow sockets can use a customized soreceive,
 * initializes the global MPTCP PCB info (mtcbinfo) with its zone and
 * lock, registers with the MP domain for garbage collection, creates
 * the mptsub/mptopt/mptauth zones, and seeds the MPTCP key pool.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	/* unlink the clone from the real TCP protosw's list entry */
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	/* override receive so MPTCP can drain the subflow's buffer whole */
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	/* global PCB info: one mpp_mtp element per MPTCP socket */
	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);
	mtcbinfo.mppi_gc = mptcp_gc;

	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	/* per-subflow control blocks */
	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	/* recorded socket options replayed onto subflows */
	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	/* per-subflow authentication entries */
	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	/* Set up a list of unique keys */
	mptcp_key_pool_init();

}
369
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The Multipath PCB (mppcb), the MPTCP session (mptses) and the MPTCP
 * protocol control block (mptcb) all live inside the same mpp_mtp
 * allocation; this routine wires the three together, initializes the
 * session's lists and locks, and spawns the per-session worker thread.
 *
 * Returns the session pointer on success, or NULL on failure (currently
 * only if the kernel thread cannot be started).
 */
struct mptses *
mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int error = 0;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* both extensions are embedded in the same mpp_mtp block */
	mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = ASSOCID_ANY;
	mpte->mpte_connid_last = CONNID_ANY;

	lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);

	/*
	 * XXX: adi@apple.com
	 *
	 * This can be rather expensive if we have lots of MPTCP sockets,
	 * but we need a kernel thread for this model to work. Perhaps we
	 * could amortize the costs by having one worker thread per a group
	 * of MPTCP sockets.
	 */
	if (kernel_thread_start(mptcp_thread_func, mpte,
	    &mpte->mpte_thread) != KERN_SUCCESS) {
		error = ENOBUFS;
		goto out;
	}
	mp_so->so_usecount++;		/* for thread */

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);
	mp_tp->mpt_mpte = mpte;

out:
	if (error != 0)
		lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
	/*
	 * NOTE(review): on failure, mpp->mpp_pcbe is left pointing at the
	 * partially-initialized mpte (same allocation, so not dangling);
	 * presumably the caller tears down the whole mppcb when we return
	 * NULL -- confirm against the PF_MULTIPATH attach path.
	 */
	DTRACE_MPTCP5(session__create, struct socket *, mp_so,
	    struct sockbuf *, &mp_so->so_rcv,
	    struct sockbuf *, &mp_so->so_snd,
	    struct mppcb *, mpp, int, error);

	return ((error != 0) ? NULL : mpte);
}
434
/*
 * Destroy an MPTCP session.
 *
 * Tears down the session-level state embedded in the mpp_mtp block:
 * flushes any recorded socket options and destroys the thread and mptcb
 * locks.  All subflows must already be gone (asserted below), and the
 * memory itself is released by the enclosing PCB's destruction, not here.
 */
static void
mptcp_sesdestroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	lck_mtx_destroy(&mpte->mpte_thread_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	/*
	 * MPTCP Protocol Control Block section
	 */
	lck_mtx_destroy(&mp_tp->mpt_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}
466
467/*
468 * Allocate an MPTCP socket option structure.
469 */
470struct mptopt *
471mptcp_sopt_alloc(int how)
472{
473 struct mptopt *mpo;
474
475 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
476 zalloc_noblock(mptopt_zone);
477 if (mpo != NULL) {
478 bzero(mpo, mptopt_zone_size);
479 }
480
481 return (mpo);
482}
483
/*
 * Free an MPTCP socket option structure.
 *
 * The option must have been detached from its session's list first
 * (MPOF_ATTACHED cleared; see mptcp_sopt_remove()).
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}
494
/*
 * Add a socket option to the MPTCP socket option list.
 *
 * Caller holds the MP socket lock.  An option may be attached to at
 * most one session at a time; membership is tracked via MPOF_ATTACHED.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}
506
/*
 * Remove a socket option from the MPTCP socket option list.
 *
 * Caller holds the MP socket lock.  The option must currently be
 * attached (MPOF_ATTACHED set); the flag is cleared so the entry can
 * subsequently be freed via mptcp_sopt_free().
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}
518
519/*
520 * Search for an existing <sopt_level,sopt_name> socket option.
521 */
522struct mptopt *
523mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
524{
525 struct mptopt *mpo;
526
527 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
528
529 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
530 if (mpo->mpo_level == sopt->sopt_level &&
531 mpo->mpo_name == sopt->sopt_name)
532 break;
533 }
534 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
535
536 return (mpo);
537}
538
539/*
540 * Flushes all recorded socket options from an MP socket.
541 */
542void
543mptcp_flush_sopts(struct mptses *mpte)
544{
545 struct mptopt *mpo, *tmpo;
546
547 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
548
549 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
550 mptcp_sopt_remove(mpte, mpo);
551 mptcp_sopt_free(mpo);
552 }
553 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
554}
555
556/*
557 * Allocate a MPTCP subflow structure.
558 */
559struct mptsub *
560mptcp_subflow_alloc(int how)
561{
562 struct mptsub *mpts;
563
564 mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
565 zalloc_noblock(mptsub_zone);
566 if (mpts != NULL) {
567 bzero(mpts, mptsub_zone_size);
568 lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
569 mtcbinfo.mppi_lock_attr);
570 }
571
572 return (mpts);
573}
574
/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released. This implies that the subflow has been deleted.
 *
 * Entered with mpts_lock held; the lock is dropped and destroyed here
 * before the element is returned to its zone.
 */
void
mptcp_subflow_free(struct mptsub *mpts)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* must be fully detached from session and socket before freeing */
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* release cached source/destination address lists, if any */
	if (mpts->mpts_src_sl != NULL) {
		sockaddrlist_free(mpts->mpts_src_sl);
		mpts->mpts_src_sl = NULL;
	}
	if (mpts->mpts_dst_sl != NULL) {
		sockaddrlist_free(mpts->mpts_dst_sl);
		mpts->mpts_dst_sl = NULL;
	}
	/* drop the lock before tearing it down and freeing the element */
	MPTS_UNLOCK(mpts);
	lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);

	zfree(mptsub_zone, mpts);
}
602
/*
 * Create an MPTCP subflow socket.
 *
 * Allocates a TCP socket in the given domain and marks it as a multipath
 * subflow, applies the internally mandated socket options (SO_NOSIGPIPE,
 * SO_NOADDRERR, SO_KEEPALIVE, SO_RCVBUF, TCP_KEEPALIVE), replays any
 * application-recorded options eligible for subflows, and finally
 * overrides the socket's protosw so MPTCP's customized soreceive is used.
 *
 * Returns 0 on success with *so set to the new (unlocked) socket.  On a
 * setsockopt failure after creation, a nonzero error is returned while
 * *so remains non-NULL; the caller is responsible for closing it.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct proc *p, struct socket **so)
{
	struct mptopt smpo, *mpo, *tmpo;
	struct socket *mp_so;
	int error;

	*so = NULL;
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */
	if ((error = socreate_internal(dom, so, SOCK_STREAM,
	    IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
		mptcplog((LOG_ERR, "MPTCP ERROR %s: mp_so 0x%llx unable to "
		    "create subflow socket error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error));
		return (error);
	}

	socket_lock(*so, 0);
	VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
	VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
	    (SS_NBIO|SS_NOFDREF));

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* reusable template for the internal options applied below */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* enable keepalive */
	smpo.mpo_name = SO_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/*
	 * Limit the receive socket buffer size to 64k.
	 *
	 * We need to take into consideration the window scale option
	 * which could be negotiated in one subflow but disabled in
	 * another subflow.
	 * XXX This can be improved in the future.
	 */
	smpo.mpo_name = SO_RCVBUF;
	smpo.mpo_intval = MPTCP_RWIN_MAX;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* N.B.: set by sosetopt */
	VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
	/* Prevent automatic socket buffer sizing. */
	(*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;

	/* per-subflow TCP keepalive interval */
	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = mptcp_subflow_keeptime;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
			continue;

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE))
			continue;

		/* interim records are best-effort: drop them on failure */
		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
			char buf[32];
			mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d "
			    "interim record removed\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
			    buf, sizeof (buf)), mpo->mpo_intval));
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function. We will undo
	 * this when the socket is peeled off or closed.
	 */
	mpts->mpts_oprotosw = (*so)->so_proto;	/* saved for restore at close */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
#if INET6
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

out:
	socket_unlock(*so, 0);

	DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
	    struct mptsub *, mpts, int, dom, int, error);

	return (error);
}
747
/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 *
 * Restores the protosw that mptcp_subflow_socreate() overrode, detaches
 * the socket from the subflow structure, and then closes it for real.
 */
static int
mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(so, 0);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	/* restore protocol-user requests */
	VERIFY(mpts->mpts_oprotosw != NULL);
	so->so_proto = mpts->mpts_oprotosw;
	socket_unlock(so, 0);

	mpts->mpts_socket = NULL;	/* may already be NULL */

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	/* soclose() drops the final reference on the subflow socket */
	return (soclose(so));
}
778
/*
 * Connect an MPTCP subflow socket.
 *
 * This may be called inline as part of adding a subflow, or asynchronously
 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
 * pending connect case, the subflow socket may have been bound to an interface
 * and/or a source IP address which may no longer be around by the time this
 * routine is called; in that case the connect attempt will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;
	int af, error;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* caller must have marked us connecting, but not yet connected */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
	    MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	so = mpts->mpts_socket;
	af = mpts->mpts_family;

	if (af == AF_INET || af == AF_INET6) {
		struct sockaddr_entry *dst_se;
		char dbuf[MAX_IPv6_STR_LEN];

		dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
		VERIFY(dst_se != NULL);

		/* debug log: destination address/port and pending status */
		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pended %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")));
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	socket_lock(so, 0);
	/* associate this subflow's TCP PCB with the shared mptcb */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
	    mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
	    mpte->mpte_associd, NULL, TCP_CONNREQF_MPTCP,
	    &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr));
	socket_unlock(so, 0);

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;	/* skip 0 on wraparound */

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);

	return (error);
}
846
/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 *
 * Drains everything in the subflow socket's receive buffer and hands the
 * mbuf chain back to the caller (the MPTCP layer) through *mp0.  This
 * routine never blocks: MSG_DONTWAIT|MSG_NBIO are forced on below, and
 * any blocking/peeking mode requested by the caller is rejected.
 *
 * Returns 0 on success; EINVAL for unexpected parameters; EOPNOTSUPP for
 * unsupported flags; ENOTCONN/EWOULDBLOCK/so_error as appropriate when
 * the buffer is empty.
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	int flags, error = 0;
	struct proc *p = current_proc();
	struct mbuf *m, **mp = mp0;
	struct mbuf *nextrecord;

	socket_lock(so, 1);
	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket\n", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		socket_unlock(so, 1);
		return (EINVAL);
	}
	*mp = NULL;
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/* none of the blocking/peeking receive modes are supported here */
	if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
		socket_unlock(so, 1);
		return (EOPNOTSUPP);
	}
	flags |= (MSG_DONTWAIT|MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		socket_unlock(so, 1);
		return (error);
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return (0);
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		socket_unlock(so, 1);
		return (error);
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/*
	 * Walk every record in the receive buffer, detaching each mbuf
	 * from the buffer accounting (sbfree) and appending it to the
	 * caller's chain via mp, while keeping sb_mb/sb_lastrecord
	 * consistent at every step.
	 */
	while (m != NULL) {
		nextrecord = m->m_nextpkt;
		sbfree(&so->so_rcv, m);

		if (mp != NULL) {
			*mp = m;
			mp = &m->m_next;
			so->so_rcv.sb_mb = m = m->m_next;
			*mp = NULL;
		}

		if (m != NULL) {
			m->m_nextpkt = nextrecord;
			if (nextrecord == NULL)
				so->so_rcv.sb_lastrecord = m;
		} else {
			/* record fully consumed; advance to the next one */
			m = so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
	/* notify protocol that we drained all the data */
	if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);

	if (flagsp != NULL)
		*flagsp |= flags;

release:
	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
	return (error);

}
1022
1023
/*
 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
 * the work done earlier when the subflow socket was created.
 *
 * Restores the socket to a normal (non-subflow) TCP socket: clears the
 * subflow flags/state, re-enables socket buffer compression and
 * autosizing, restores the original protosw, and propagates a handful
 * of socket options from the parent MP socket to the peeled-off socket.
 * Caller must hold the MP socket lock and the subflow lock.
 */
void
mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
    struct socket *so)
{
	struct mptopt smpo;
	struct socket *mp_so;
	int p, c;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(so, 0);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	/* inherit MPTCP socket states */
	if (!(mp_so->so_state & SS_NBIO))
		so->so_state &= ~SS_NBIO;

	/*
	 * At this point, the socket is not yet closed, as there is at least
	 * one outstanding usecount previously held by mpts_socket from
	 * socreate().  Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
	 */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	so->so_state &= ~SS_NOFDREF;
	so->so_flags &= ~SOF_MPTCP_TRUE;

	/* allow socket buffers to be compressed */
	so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
	so->so_snd.sb_flags &= ~SB_NOCOMPRESS;

	/*
	 * Allow socket buffer auto sizing.
	 *
	 * This will increase the current 64k buffer size to whatever is best.
	 * SB_USRSIZE means the user explicitly sized the buffer; honor that.
	 */
	if (!(so->so_rcv.sb_flags & SB_USRSIZE))
		so->so_rcv.sb_flags |= SB_AUTOSIZE;
	if (!(so->so_snd.sb_flags & SB_USRSIZE))
		so->so_snd.sb_flags |= SB_AUTOSIZE;

	/* restore protocol-user requests */
	VERIFY(mpts->mpts_oprotosw != NULL);
	so->so_proto = mpts->mpts_oprotosw;

	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;

	/*
	 * For each inherited option below, p and c hold the parent's and
	 * child's current setting of a single flag bit; (p - c) != 0 means
	 * they disagree, and (p - c) > 0 means only the parent has it set
	 * (so the child should turn it on, else turn it off).
	 */

	/* inherit SOF_NOSIGPIPE from parent MP socket */
	p = (mp_so->so_flags & SOF_NOSIGPIPE);
	c = (so->so_flags & SOF_NOSIGPIPE);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* inherit SOF_NOADDRAVAIL from parent MP socket */
	p = (mp_so->so_flags & SOF_NOADDRAVAIL);
	c = (so->so_flags & SOF_NOADDRAVAIL);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_NOADDRERR;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* inherit SO_KEEPALIVE from parent MP socket */
	p = (mp_so->so_options & SO_KEEPALIVE);
	c = (so->so_options & SO_KEEPALIVE);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_KEEPALIVE;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* unset TCP level default keepalive option */
	p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
	c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = 0;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
	socket_unlock(so, 0);

	DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
	    struct mptsub *, mpts, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
}
1117
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * Sequence: validate state, select/allocate addresses, create the
 * subflow socket, assign a connection id and address id, bind to the
 * requested interface/address, insert the subflow into the session's
 * list (after which teardown must go through mptcp_subflow_del()),
 * register upcalls/events, and finally issue the connect unless it is
 * left pending.  Caller must hold the MP socket lock.
 *
 * Returns 0 on success, or an errno (ENOTCONN if the peer already sent
 * a DATA_FIN, ENOBUFS, or an error from address selection/bind/connect).
 */
int
mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
    struct proc *p, uint32_t ifscope)
{
	struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
	struct socket *mp_so, *so = NULL;
	struct mptsub_connreq mpcr;
	struct mptcb *mp_tp;
	int af, error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		error = ENOTCONN;
		MPT_UNLOCK(mp_tp);
		return (error);
	}
	MPT_UNLOCK(mp_tp);

	MPTS_LOCK(mpts);
	VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);
	VERIFY(mpts->mpts_dst_sl != NULL);
	VERIFY(mpts->mpts_connid == CONNID_ANY);

	/* select source (if specified) and destination addresses */
	if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
	    &mpts->mpts_dst_sl, &dst_se)) != 0)
		goto out;

	VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
	VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
	af = mpts->mpts_family = dst_se->se_addr->sa_family;
	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
	VERIFY(af == AF_INET || af == AF_INET6);

	/*
	 * If the source address is not specified, allocate a storage for
	 * it, so that later on we can fill it in with the actual source
	 * IP address chosen by the underlying layer for the subflow after
	 * it is connected.
	 */
	if (mpts->mpts_src_sl == NULL) {
		mpts->mpts_src_sl =
		    sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
		if (mpts->mpts_src_sl == NULL) {
			error = ENOBUFS;
			goto out;
		}
		se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
		VERIFY(se != NULL && se->se_addr != NULL &&
		    se->se_addr->sa_len == dst_se->se_addr->sa_len);
		bzero(se->se_addr, se->se_addr->sa_len);
		se->se_addr->sa_len = dst_se->se_addr->sa_len;
		se->se_addr->sa_family = dst_se->se_addr->sa_family;
	}

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
		goto out;

	/*
	 * If fastjoin is requested, set state in mpts.
	 *
	 * NOTE(review): mpt_state and mpt_snduna are read here; mpt_state
	 * is read before MPT_LOCK is taken — presumably benign under the
	 * MP socket lock, but confirm against the locking model.
	 */
	if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
	    (mp_tp->mpt_state == MPTCPS_ESTABLISHED) &&
	    (mpte->mpte_nummpcapflows == 0)) {
		mpts->mpts_flags |= MPTSF_FASTJ_REQD;
		mpts->mpts_rel_seq = 1;
		MPT_LOCK(mp_tp);
		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
		MPT_UNLOCK(mp_tp);
	}

	/*
	 * Increment the counter, while avoiding 0 (CONNID_ANY) and
	 * -1 (CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == CONNID_ALL ||
	    mpte->mpte_connid_last == CONNID_ANY)
		mpte->mpte_connid_last++;

	mpts->mpts_connid = mpte->mpte_connid_last;
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	/* bind subflow socket to the specified interface */
	if (ifscope != IFSCOPE_NONE) {
		socket_lock(so, 0);
		error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
		if (error != 0) {
			socket_unlock(so, 0);
			(void) mptcp_subflow_soclose(mpts, so);
			goto out;
		}
		VERIFY(mpts->mpts_outif != NULL);
		mpts->mpts_flags |= MPTSF_BOUND_IF;

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindif %s[%d] "
		    "cid %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_outif->if_xname,
		    ifscope, mpts->mpts_connid));
		socket_unlock(so, 0);
	}

	/* if source address and/or port is specified, bind to it */
	if (src_se != NULL) {
		struct sockaddr *sa = src_se->se_addr;
		uint32_t mpts_flags = 0;
		in_port_t lport;

		switch (af) {
		case AF_INET:
			if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
				mpts_flags |= MPTSF_BOUND_IP;
			if ((lport = SIN(sa)->sin_port) != 0)
				mpts_flags |= MPTSF_BOUND_PORT;
			break;
#if INET6
		case AF_INET6:
			VERIFY(af == AF_INET6);
			if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
				mpts_flags |= MPTSF_BOUND_IP;
			if ((lport = SIN6(sa)->sin6_port) != 0)
				mpts_flags |= MPTSF_BOUND_PORT;
			break;
#endif /* INET6 */
		}

		error = sobindlock(so, sa, 1);	/* will lock/unlock socket */
		if (error != 0) {
			(void) mptcp_subflow_soclose(mpts, so);
			goto out;
		}
		mpts->mpts_flags |= mpts_flags;

		if (af == AF_INET || af == AF_INET6) {
			char sbuf[MAX_IPv6_STR_LEN];

			mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindip %s[%d] "
			    "cid %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    inet_ntop(af, ((af == AF_INET) ?
			    (void *)&SIN(sa)->sin_addr.s_addr :
			    (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
			    ntohs(lport), mpts->mpts_connid));
		}
	}

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	MPTS_ADDREF_LOCKED(mpts);	/* for being in MPTCP subflow list */
	MPTS_ADDREF_LOCKED(mpts);	/* for subflow socket */
	mp_so->so_usecount++;		/* for subflow socket */

	/* register for subflow socket read/write events */
	(void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
	    mptcp_subflow_wupcall, mpts);

	/*
	 * Register for subflow socket control events; ignore
	 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
	 * will generate it here.
	 */
	(void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
	    SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
	    SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
	    SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));

	bzero(&mpcr, sizeof (mpcr));
	mpcr.mpcr_proc = p;
	mpcr.mpcr_ifscope = ifscope;
	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		/* first subflow: this one carries MP_CAPABLE */
		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mp_tp->mpt_localkey = mptcp_reserve_key();
			mptcp_conn_properties(mp_tp);
		}
		MPT_UNLOCK(mp_tp);
		soisconnecting(mp_so);
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
	} else {
		/* joining subflow: MP_JOIN; may need to wait for keys */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;

		/* avoid starting up cellular subflow unless required */
		if ((mptcp_delayed_subf_start) &&
		    (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
		MPT_UNLOCK(mp_tp);
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
	}

	mpts->mpts_mpcr = mpcr;
	mpts->mpts_flags |= MPTSF_CONNECTING;

	if (af == AF_INET || af == AF_INET6) {
		char dbuf[MAX_IPv6_STR_LEN];

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pending %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")));
	}

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
		error = mptcp_subflow_soconnectx(mpte, mpts);

out:
	MPTS_UNLOCK(mpts);
	if (error == 0) {
		soevent(mp_so, SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_CONNINFO_UPDATED);
	}
	return (error);
}
1382
39236c6e
A
/*
 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 * If peeloff(2) is called, leave the socket open.
 *
 * Undoes mptcp_subflow_add(): detaches the subflow from the session
 * list, tears down upcall/event registrations, drops the two subflow
 * references and the MP socket usecount taken at add time, and
 * (optionally) closes the subflow socket.  Caller must hold the MP
 * socket lock; the subflow lock is taken here.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
{
	struct socket *mp_so, *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	MPTS_LOCK(mpts);
	so = mpts->mpts_socket;
	VERIFY(so != NULL);

	/*
	 * A close is only honored once the subflow is both marked
	 * delete-OK and user-disconnected; otherwise leave it alone.
	 * NOTE(review): the log below reads mpts fields after
	 * MPTS_UNLOCK — looks racy against concurrent mutators; confirm.
	 */
	if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
	    (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
		MPTS_UNLOCK(mpts);
		mptcplog((LOG_DEBUG, "%s: %d %x\n", __func__,
		    mpts->mpts_soerror, mpts->mpts_flags));
		return;
	}

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d "
	    "[close %s] %d %x\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
	    mp_so->so_usecount,
	    mp_so->so_retaincnt, mpts->mpts_connid,
	    (close ? "YES" : "NO"), mpts->mpts_soerror,
	    mpts->mpts_flags));

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	VERIFY(mpte->mpte_numflows != 0);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts)
		mpte->mpte_active_sub = NULL;

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	(void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
	(void) sock_catchevents(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	if (close)
		(void) mptcp_subflow_soclose(mpts, so);

	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;
	MPTS_UNLOCK(mpts);

	MPTS_REMREF(mpts);		/* for MPTCP subflow list */
	MPTS_REMREF(mpts);		/* for subflow socket */

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
}
1452
/*
 * Disconnect a subflow socket.
 *
 * Marks the subflow as disconnecting (and delete-OK when requested),
 * sends a DATA_FIN first when the MPTCP connection is past ESTABLISHED,
 * then shuts down and disconnects the underlying TCP socket.  Finally a
 * DISCONNECTED event is injected so subflow deletion can proceed even
 * if the lower layer never posts one.  Caller must hold the MP socket
 * lock and the subflow lock.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
    boolean_t deleteok)
{
	struct socket *so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* already disconnecting/disconnected; nothing more to do */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
		return;

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/*
	 * If this is coming from disconnectx(2) or issued as part of
	 * closing the MPTCP socket, the subflow shouldn't stick around.
	 * Otherwise let it linger around in case the upper layers need
	 * to retrieve its conninfo.
	 */
	if (deleteok)
		mpts->mpts_flags |= MPTSF_DELETEOK;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
		send_dfin = 1;
	MPT_UNLOCK(mp_tp);

	socket_lock(so, 0);
	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		mptcplog((LOG_DEBUG, "%s: cid %d fin %d [linger %s]\n",
		    __func__, mpts->mpts_connid, send_dfin,
		    (deleteok ? "NO" : "YES")));

		if (send_dfin)
			mptcp_send_dfin(so);
		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}
	socket_unlock(so, 0);
	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.  This will also end up
	 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
	 * we cannot do that here because subflow lock is currently held.
	 */
	mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
1516
1517/*
1518 * Subflow socket read upcall.
1519 *
1520 * Called when the associated subflow socket posted a read event. The subflow
1521 * socket lock has been released prior to invoking the callback. Note that the
1522 * upcall may occur synchronously as a result of MPTCP performing an action on
1523 * it, or asynchronously as a result of an event happening at the subflow layer.
1524 * Therefore, to maintain lock ordering, the only lock that can be acquired
1525 * here is the thread lock, for signalling purposes.
1526 */
1527static void
1528mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1529{
1530#pragma unused(so, waitf)
1531 struct mptsub *mpts = arg;
1532 struct mptses *mpte = mpts->mpts_mpte;
1533
fe8ab488
A
1534 /*
1535 * mpte should never be NULL, except in a race with
1536 * mptcp_subflow_del
1537 */
1538 if (mpte == NULL)
1539 return;
39236c6e
A
1540
1541 lck_mtx_lock(&mpte->mpte_thread_lock);
1542 mptcp_thread_signal_locked(mpte);
1543 lck_mtx_unlock(&mpte->mpte_thread_lock);
1544}
1545
1546/*
1547 * Subflow socket input.
1548 *
1549 * Called in the context of the MPTCP thread, for reading data from the
1550 * underlying subflow socket and delivering it to MPTCP.
1551 */
1552static void
1553mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1554{
1555 struct mbuf *m = NULL;
1556 struct socket *so;
1557 int error;
1558 struct mptsub *mpts_alt = NULL;
1559
1560 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1561 MPTS_LOCK_ASSERT_HELD(mpts);
1562
1563 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
1564 struct mptsub *, mpts);
1565
1566 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1567 return;
1568
1569 so = mpts->mpts_socket;
1570
1571 error = sock_receive_internal(so, NULL, &m, 0, NULL);
1572 if (error != 0 && error != EWOULDBLOCK) {
1573 mptcplog((LOG_ERR, "%s: cid %d error %d\n",
1574 __func__, mpts->mpts_connid, error));
1575 MPTS_UNLOCK(mpts);
1576 mpts_alt = mptcp_get_subflow(mpte, mpts);
1577 if (mpts_alt == NULL) {
fe8ab488
A
1578 if (mptcp_delayed_subf_start) {
1579 mpts_alt = mptcp_get_pending_subflow(mpte,
1580 mpts);
1581 if (mpts_alt) {
1582 mptcplog((LOG_INFO,"%s: pending %d\n",
1583 __func__, mpts_alt->mpts_connid));
1584 } else {
1585 mptcplog((LOG_ERR, "%s: no pending",
1586 "%d\n", __func__,
1587 mpts->mpts_connid));
1588 mpte->mpte_mppcb->mpp_socket->so_error =
1589 error;
1590 }
1591 } else {
1592 mptcplog((LOG_ERR, "%s: no alt path cid %d\n",
1593 __func__, mpts->mpts_connid));
1594 mpte->mpte_mppcb->mpp_socket->so_error = error;
1595 }
39236c6e
A
1596 }
1597 MPTS_LOCK(mpts);
1598 } else if (error == 0) {
1599 mptcplog3((LOG_DEBUG, "%s: cid %d \n",
1600 __func__, mpts->mpts_connid));
1601 }
1602
1603 /* In fallback, make sure to accept data on all but one subflow */
1604 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1605 (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1606 m_freem(m);
1607 return;
1608 }
1609
1610 if (m != NULL) {
1611 /*
1612 * Release subflow lock since this may trigger MPTCP to send,
1613 * possibly on a different subflow. An extra reference has
1614 * been held on the subflow by the MPTCP thread before coming
1615 * here, so we can be sure that it won't go away, in the event
1616 * the MP socket lock gets released.
1617 */
1618 MPTS_UNLOCK(mpts);
1619 mptcp_input(mpte, m);
1620 MPTS_LOCK(mpts);
1621 }
1622}
1623
1624/*
1625 * Subflow socket write upcall.
1626 *
1627 * Called when the associated subflow socket posted a read event. The subflow
1628 * socket lock has been released prior to invoking the callback. Note that the
1629 * upcall may occur synchronously as a result of MPTCP performing an action on
1630 * it, or asynchronously as a result of an event happening at the subflow layer.
1631 * Therefore, to maintain lock ordering, the only lock that can be acquired
1632 * here is the thread lock, for signalling purposes.
1633 */
1634static void
1635mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1636{
1637#pragma unused(so, waitf)
1638 struct mptsub *mpts = arg;
1639 struct mptses *mpte = mpts->mpts_mpte;
1640
fe8ab488
A
1641 /*
1642 * mpte should never be NULL except in a race with
1643 * mptcp_subflow_del which doesn't hold socket lock across critical
1644 * section. This upcall is made after releasing the socket lock.
1645 * Interleaving of socket operations becomes possible therefore.
1646 */
1647 if (mpte == NULL)
1648 return;
39236c6e
A
1649
1650 lck_mtx_lock(&mpte->mpte_thread_lock);
1651 mptcp_thread_signal_locked(mpte);
1652 lck_mtx_unlock(&mpte->mpte_thread_lock);
1653}
1654
1655/*
1656 * Subflow socket output.
1657 *
1658 * Called for sending data from MPTCP to the underlying subflow socket.
1659 */
1660int
1661mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1662{
1663 struct socket *mp_so, *so;
1664 size_t sb_cc = 0, tot_sent = 0;
1665 struct mbuf *sb_mb;
1666 int error = 0;
1667 u_int64_t mpt_dsn = 0;
1668 struct mptcb *mp_tp = mpte->mpte_mptcb;
1669 struct mbuf *mpt_mbuf = NULL;
fe8ab488
A
1670 u_int64_t off = 0;
1671 struct mbuf *head, *tail;
39236c6e
A
1672
1673 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1674 MPTS_LOCK_ASSERT_HELD(mpts);
1675 mp_so = mpte->mpte_mppcb->mpp_socket;
1676 so = mpts->mpts_socket;
1677
1678 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
1679 struct mptsub *, mpts);
1680
1681 /* subflow socket is suspended? */
1682 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
1683 mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d is flow "
1684 "controlled\n", __func__,
1685 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1686 goto out;
1687 }
1688
1689 /* subflow socket is not MPTCP capable? */
1690 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
fe8ab488
A
1691 !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1692 !(mpts->mpts_flags & MPTSF_FASTJ_SEND)) {
39236c6e
A
1693 mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d not "
1694 "MPTCP capable\n", __func__,
1695 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1696 goto out;
1697 }
1698
1699 /* Remove Addr Option is not sent reliably as per I-D */
1700 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1701 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1702 tp->t_rem_aid = mpte->mpte_lost_aid;
1703 if (mptcp_remaddr_enable)
1704 tp->t_mpflags |= TMPF_SND_REM_ADDR;
1705 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1706 }
1707
1708 /*
1709 * The mbuf chains containing the metadata (as well as pointing to
1710 * the user data sitting at the MPTCP output queue) would then be
1711 * sent down to the subflow socket.
1712 *
1713 * Some notes on data sequencing:
1714 *
1715 * a. Each mbuf must be a M_PKTHDR.
1716 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
1717 * in the mbuf pkthdr structure.
1718 * c. Each mbuf containing the MPTCP metadata must have its
1719 * pkt_flags marked with the PKTF_MPTCP flag.
1720 */
1721
1722 /* First, drop acknowledged data */
1723 sb_mb = mp_so->so_snd.sb_mb;
1724 if (sb_mb == NULL) {
1725 goto out;
1726 }
1727
1728 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1729
1730 mpt_mbuf = sb_mb;
1731 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
1732 mpt_mbuf = mpt_mbuf->m_next;
1733 }
1734 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1735 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1736 } else {
1737 goto out;
1738 }
1739
1740 MPT_LOCK(mp_tp);
1741 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
fe8ab488 1742 u_int64_t len = 0;
39236c6e 1743 len = mp_tp->mpt_snduna - mpt_dsn;
fe8ab488 1744 sbdrop(&mp_so->so_snd, (int)len);
39236c6e
A
1745
1746 }
1747
1748 /*
1749 * In degraded mode, we don't receive data acks, so force free
1750 * mbufs less than snd_nxt
1751 */
fe8ab488
A
1752 if (mp_so->so_snd.sb_mb == NULL) {
1753 MPT_UNLOCK(mp_tp);
1754 goto out;
1755 }
1756
39236c6e
A
1757 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1758 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 1759 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
39236c6e 1760 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
fe8ab488 1761 u_int64_t len = 0;
39236c6e 1762 len = mp_tp->mpt_sndnxt - mpt_dsn;
fe8ab488 1763 sbdrop(&mp_so->so_snd, (int)len);
39236c6e
A
1764 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1765 }
1766
fe8ab488
A
1767 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1768 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
1769 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
1770 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
1771 if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
1772 mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
1773 }
1774
39236c6e
A
1775 /*
1776 * Adjust the subflow's notion of next byte to send based on
1777 * the last unacknowledged byte
1778 */
1779 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1780 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
fe8ab488
A
1781 /*
1782 * With FastJoin, a write before the fastjoin event will use
1783 * an uninitialized relative sequence number.
1784 */
1785 if (mpts->mpts_rel_seq == 0)
1786 mpts->mpts_rel_seq = 1;
39236c6e
A
1787 }
1788
1789 /*
1790 * Adjust the top level notion of next byte used for retransmissions
1791 * and sending FINs.
1792 */
1793 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1794 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1795 }
1796
1797
1798 /* Now determine the offset from which to start transmitting data */
1799 sb_mb = mp_so->so_snd.sb_mb;
1800 sb_cc = mp_so->so_snd.sb_cc;
1801 if (sb_mb == NULL) {
1802 MPT_UNLOCK(mp_tp);
1803 goto out;
1804 }
1805 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1806 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
fe8ab488 1807 sb_cc -= (size_t)off;
39236c6e
A
1808 } else {
1809 MPT_UNLOCK(mp_tp);
1810 goto out;
1811 }
1812 MPT_UNLOCK(mp_tp);
1813
1814 mpt_mbuf = sb_mb;
1815 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1816
1817 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
fe8ab488 1818 (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
39236c6e
A
1819 off -= mpt_mbuf->m_pkthdr.mp_rlen;
1820 mpt_mbuf = mpt_mbuf->m_next;
1821 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1822 }
1823 if ((mpts->mpts_connid == 2) || (mpts->mpts_flags & MPTSF_MP_DEGRADED))
fe8ab488 1824 mptcplog2((LOG_INFO, "%s: snduna = %llu off = %lld id = %d"
39236c6e
A
1825 " %llu \n",
1826 __func__,
1827 mp_tp->mpt_snduna, off, mpts->mpts_connid,
1828 mpts->mpts_sndnxt));
1829
1830 VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
1831
fe8ab488
A
1832 head = tail = NULL;
1833
39236c6e
A
1834 while (tot_sent < sb_cc) {
1835 struct mbuf *m;
fe8ab488 1836 size_t mlen;
39236c6e
A
1837
1838 mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1839 mlen -= off;
1840 if (mlen == 0)
1841 goto out;
1842
1843 if (mlen > sb_cc) {
1844 panic("%s: unexpected %lu %lu \n", __func__,
1845 mlen, sb_cc);
1846 }
1847
fe8ab488
A
1848 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
1849 M_COPYM_MUST_COPY_HDR);
39236c6e
A
1850 if (m == NULL) {
1851 error = ENOBUFS;
1852 break;
1853 }
1854
1855 /* Create a DSN mapping for the data (m_copym does it) */
1856 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
fe8ab488 1857 VERIFY(m->m_flags & M_PKTHDR);
39236c6e
A
1858 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
1859 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
1860 m->m_pkthdr.mp_dsn = mpt_dsn + off;
1861 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
1862 m->m_pkthdr.mp_rlen = mlen;
1863 mpts->mpts_rel_seq += mlen;
1864 m->m_pkthdr.len = mlen;
1865
fe8ab488
A
1866 if (head == NULL) {
1867 head = tail = m;
1868 } else {
1869 tail->m_next = m;
1870 tail = m;
1871 }
1872
39236c6e
A
1873 /* last contiguous mapping is stored for error cases */
1874 if (mpts->mpts_lastmap.mptsl_dsn +
1875 mpts->mpts_lastmap.mptsl_len == mpt_dsn) {
1876 mpts->mpts_lastmap.mptsl_len += tot_sent;
1877 } else if (MPTCP_SEQ_LT((mpts->mpts_lastmap.mptsl_dsn +
1878 mpts->mpts_lastmap.mptsl_len), mpt_dsn)) {
1879 if (m->m_pkthdr.mp_dsn == 0)
1880 panic("%s %llu", __func__, mpt_dsn);
1881 mpts->mpts_lastmap.mptsl_dsn = m->m_pkthdr.mp_dsn;
1882 mpts->mpts_lastmap.mptsl_sseq = m->m_pkthdr.mp_rseq;
1883 mpts->mpts_lastmap.mptsl_len = m->m_pkthdr.mp_rlen;
1884 }
1885
fe8ab488
A
1886 tot_sent += mlen;
1887 off = 0;
1888 mpt_mbuf = mpt_mbuf->m_next;
1889 }
1890
1891 if (head != NULL) {
1892
1893 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
1894 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1895 tp->t_mpflags |= TMPF_FASTJOIN_SEND;
1896 }
1897
1898 error = sock_sendmbuf(so, NULL, head, 0, NULL);
1899
1900 DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
39236c6e
A
1901 struct sockbuf *, &so->so_rcv,
1902 struct sockbuf *, &so->so_snd,
1903 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488
A
1904 size_t, tot_sent);
1905 }
1906
1907 if (error == 0) {
1908 mpts->mpts_sndnxt += tot_sent;
39236c6e
A
1909 MPT_LOCK(mp_tp);
1910 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
1911 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
1912 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
1913 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
1914 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
1915 }
fe8ab488 1916 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
39236c6e 1917 MPT_UNLOCK(mp_tp);
fe8ab488
A
1918
1919 /* Send once in SYN_SENT state to avoid sending SYN spam */
1920 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
1921 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
1922 mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
39236c6e 1923 }
39236c6e 1924
fe8ab488 1925 if ((mpts->mpts_connid >= 2) ||
39236c6e 1926 (mpts->mpts_flags & MPTSF_MP_DEGRADED))
fe8ab488
A
1927 mptcplog2((LOG_DEBUG, "%s: cid %d wrote %d %d\n",
1928 __func__, mpts->mpts_connid, (int)tot_sent,
1929 (int) sb_cc));
1930 } else {
1931 mptcplog((LOG_ERR, "MPTCP ERROR %s: cid %d error %d len %zd\n",
1932 __func__, mpts->mpts_connid, error, tot_sent));
39236c6e
A
1933 }
1934out:
1935 return (error);
1936}
1937
1938/*
1939 * Subflow socket control event upcall.
1940 *
1941 * Called when the associated subflow socket posted one or more control events.
1942 * The subflow socket lock has been released prior to invoking the callback.
1943 * Note that the upcall may occur synchronously as a result of MPTCP performing
1944 * an action on it, or asynchronously as a result of an event happening at the
1945 * subflow layer. Therefore, to maintain lock ordering, the only lock that can
1946 * be acquired here is the thread lock, for signalling purposes.
1947 */
1948static void
1949mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
1950{
1951#pragma unused(so)
1952 struct mptsub *mpts = arg;
1953 struct mptses *mpte = mpts->mpts_mpte;
1954
1955 VERIFY(mpte != NULL);
1956
1957 lck_mtx_lock(&mpte->mpte_thread_lock);
1958 atomic_bitset_32(&mpts->mpts_evctl, events);
1959 mptcp_thread_signal_locked(mpte);
1960 lck_mtx_unlock(&mpte->mpte_thread_lock);
1961}
1962
1963/*
1964 * Subflow socket control events.
1965 *
1966 * Called for handling events related to the underlying subflow socket.
1967 */
1968static ev_ret_t
1969mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts)
1970{
fe8ab488 1971 uint32_t events, save_events;
39236c6e
A
1972 ev_ret_t ret = MPTS_EVRET_OK;
1973
1974 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1975 MPTS_LOCK_ASSERT_HELD(mpts);
1976
1977 /* bail if there's nothing to process */
1978 if ((events = mpts->mpts_evctl) == 0)
1979 return (ret);
1980
1981 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
1982 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
1983 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
1984 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
1985 SO_FILT_HINT_DISCONNECTED)) {
1986 events |= SO_FILT_HINT_MPFAILOVER;
1987 }
1988
fe8ab488
A
1989 save_events = events;
1990
39236c6e
A
1991 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
1992 struct mptsub *, mpts, uint32_t, events);
1993
1994 mptcplog2((LOG_DEBUG, "%s: cid %d events=%b\n", __func__,
1995 mpts->mpts_connid, events, SO_FILT_HINT_BITS));
1996
fe8ab488
A
1997 if ((events & SO_FILT_HINT_MPCANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
1998 ev_ret_t error = mptcp_subflow_mpcantrcvmore_ev(mpte, mpts);
1999 events &= ~SO_FILT_HINT_MPCANTRCVMORE;
2000 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2001 }
39236c6e
A
2002 if ((events & SO_FILT_HINT_MPFAILOVER) && (ret >= MPTS_EVRET_OK)) {
2003 ev_ret_t error = mptcp_subflow_failover_ev(mpte, mpts);
2004 events &= ~SO_FILT_HINT_MPFAILOVER;
2005 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2006 }
2007 if ((events & SO_FILT_HINT_CONNRESET) && (ret >= MPTS_EVRET_OK)) {
2008 ev_ret_t error = mptcp_subflow_connreset_ev(mpte, mpts);
2009 events &= ~SO_FILT_HINT_CONNRESET;
2010 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2011 }
2012 if ((events & SO_FILT_HINT_MUSTRST) && (ret >= MPTS_EVRET_OK)) {
2013 ev_ret_t error = mptcp_subflow_mustrst_ev(mpte, mpts);
2014 events &= ~SO_FILT_HINT_MUSTRST;
2015 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2016 }
2017 if ((events & SO_FILT_HINT_CANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
2018 ev_ret_t error = mptcp_subflow_cantrcvmore_ev(mpte, mpts);
2019 events &= ~SO_FILT_HINT_CANTRCVMORE;
2020 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2021 }
2022 if ((events & SO_FILT_HINT_CANTSENDMORE) && (ret >= MPTS_EVRET_OK)) {
2023 ev_ret_t error = mptcp_subflow_cantsendmore_ev(mpte, mpts);
2024 events &= ~SO_FILT_HINT_CANTSENDMORE;
2025 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2026 }
2027 if ((events & SO_FILT_HINT_TIMEOUT) && (ret >= MPTS_EVRET_OK)) {
2028 ev_ret_t error = mptcp_subflow_timeout_ev(mpte, mpts);
2029 events &= ~SO_FILT_HINT_TIMEOUT;
2030 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2031 }
2032 if ((events & SO_FILT_HINT_NOSRCADDR) && (ret >= MPTS_EVRET_OK)) {
2033 ev_ret_t error = mptcp_subflow_nosrcaddr_ev(mpte, mpts);
2034 events &= ~SO_FILT_HINT_NOSRCADDR;
2035 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2036 }
2037 if ((events & SO_FILT_HINT_IFDENIED) && (ret >= MPTS_EVRET_OK)) {
2038 ev_ret_t error = mptcp_subflow_ifdenied_ev(mpte, mpts);
2039 events &= ~SO_FILT_HINT_IFDENIED;
2040 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2041 }
2042 if ((events & SO_FILT_HINT_SUSPEND) && (ret >= MPTS_EVRET_OK)) {
2043 ev_ret_t error = mptcp_subflow_suspend_ev(mpte, mpts);
2044 events &= ~SO_FILT_HINT_SUSPEND;
2045 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2046 }
2047 if ((events & SO_FILT_HINT_RESUME) && (ret >= MPTS_EVRET_OK)) {
2048 ev_ret_t error = mptcp_subflow_resume_ev(mpte, mpts);
2049 events &= ~SO_FILT_HINT_RESUME;
2050 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2051 }
2052 if ((events & SO_FILT_HINT_CONNECTED) && (ret >= MPTS_EVRET_OK)) {
2053 ev_ret_t error = mptcp_subflow_connected_ev(mpte, mpts);
2054 events &= ~SO_FILT_HINT_CONNECTED;
2055 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2056 }
2057 if ((events & SO_FILT_HINT_MPSTATUS) && (ret >= MPTS_EVRET_OK)) {
2058 ev_ret_t error = mptcp_subflow_mpstatus_ev(mpte, mpts);
2059 events &= ~SO_FILT_HINT_MPSTATUS;
2060 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2061 }
fe8ab488
A
2062 if ((events & SO_FILT_HINT_DELETEOK) && (ret >= MPTS_EVRET_OK)) {
2063 ev_ret_t error = mptcp_deleteok_ev(mpte, mpts);
2064 events &= ~SO_FILT_HINT_DELETEOK;
2065 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2066 }
39236c6e
A
2067 if ((events & SO_FILT_HINT_DISCONNECTED) && (ret >= MPTS_EVRET_OK)) {
2068 ev_ret_t error = mptcp_subflow_disconnected_ev(mpte, mpts);
2069 events &= ~SO_FILT_HINT_DISCONNECTED;
2070 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2071 }
fe8ab488
A
2072 if ((events & SO_FILT_HINT_MPFASTJ) && (ret >= MPTS_EVRET_OK)) {
2073 ev_ret_t error = mptcp_fastjoin_ev(mpte, mpts);
2074 events &= ~SO_FILT_HINT_MPFASTJ;
2075 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2076 }
2077
39236c6e
A
2078 /*
2079 * We should be getting only events specified via sock_catchevents(),
2080 * so loudly complain if we have any unprocessed one(s).
2081 */
2082 if (events != 0 || ret < MPTS_EVRET_OK) {
2083 mptcplog((LOG_ERR, "%s%s: cid %d evret %s (%d)"
2084 " unhandled events=%b\n",
2085 (events != 0) ? "MPTCP_ERROR " : "",
2086 __func__, mpts->mpts_connid,
2087 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS));
2088 }
2089
2090 /* clear the ones we've processed */
fe8ab488
A
2091 atomic_bitclear_32(&mpts->mpts_evctl, save_events);
2092
39236c6e
A
2093 return (ret);
2094}
2095
/*
 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
 */
static ev_ret_t
mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t keep_subflow;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* linger unless the subflow may be deleted or the PCB is clearing */
	keep_subflow = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (keep_subflow ? "YES" : "NO")));

	/*
	 * We got a TCP RST for this subflow connection.
	 *
	 * Propagate ECONNREFUSED to the MPTCP socket client if the MPTCP
	 * connection has not yet been established; propagate ECONNRESET
	 * (and post SO_FILT_HINT_CONNRESET) if no MP-capable subflow
	 * remains.  Otherwise just disconnect this subflow.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !keep_subflow);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
	} else if (mpte->mpte_nummpcapflows < 1) {
		mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
		/* drop locks around soevent() to respect lock ordering */
		MPT_UNLOCK(mp_tp);
		MPTS_UNLOCK(mpts);
		soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET);
		MPTS_LOCK(mpts);
		MPT_LOCK(mp_tp);
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (keep_subflow ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}
2149
2150/*
2151 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2152 */
2153static ev_ret_t
2154mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
2155{
2156 struct socket *so;
2157
2158 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2159 MPTS_LOCK_ASSERT_HELD(mpts);
2160
2161 so = mpts->mpts_socket;
2162
2163 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2164
2165 /*
2166 * We got a FIN for this subflow connection. This subflow socket
2167 * is no longer available for receiving data;
2168 * The FIN may arrive with data. The data is handed up to the
2169 * mptcp socket and the subflow is disconnected.
2170 */
2171
2172 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2173}
2174
2175/*
2176 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2177 */
2178static ev_ret_t
2179mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts)
2180{
2181 struct socket *so;
2182
2183 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2184 MPTS_LOCK_ASSERT_HELD(mpts);
2185
2186 so = mpts->mpts_socket;
2187
2188 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2189 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2190}
2191
2192/*
2193 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2194 */
2195static ev_ret_t
2196mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts)
2197{
2198 struct socket *mp_so, *so;
2199 struct mptcb *mp_tp;
2200 boolean_t linger;
2201
2202 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2203 MPTS_LOCK_ASSERT_HELD(mpts);
2204 VERIFY(mpte->mpte_mppcb != NULL);
2205 mp_so = mpte->mpte_mppcb->mpp_socket;
2206 mp_tp = mpte->mpte_mptcb;
2207 so = mpts->mpts_socket;
2208
2209 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2210 !(mp_so->so_flags & SOF_PCBCLEARING));
2211
2212 mptcplog((LOG_NOTICE, "%s: cid %d [linger %s]\n", __func__,
2213 mpts->mpts_connid, (linger ? "YES" : "NO")));
2214
2215 if (mpts->mpts_soerror == 0)
2216 mpts->mpts_soerror = ETIMEDOUT;
2217
2218 /*
2219 * The subflow connection has timed out.
2220 *
2221 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2222 * client if the MPTCP connection has not been established. Otherwise
2223 * drop it.
2224 */
2225 mptcp_subflow_disconnect(mpte, mpts, !linger);
2226
2227 MPT_LOCK(mp_tp);
2228 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2229 mp_so->so_error = ETIMEDOUT;
2230 }
2231 MPT_UNLOCK(mp_tp);
2232
2233 /*
2234 * Keep the subflow socket around, unless the MPTCP socket has
2235 * been detached or the subflow has been disconnected explicitly,
2236 * in which case it should be deleted right away.
2237 */
2238 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2239}
2240
2241/*
2242 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2243 */
2244static ev_ret_t
2245mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts)
2246{
2247 struct socket *mp_so, *so;
2248 struct mptcb *mp_tp;
2249 boolean_t linger;
2250 struct tcpcb *tp = NULL;
2251
2252 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2253 MPTS_LOCK_ASSERT_HELD(mpts);
2254
2255 VERIFY(mpte->mpte_mppcb != NULL);
2256 mp_so = mpte->mpte_mppcb->mpp_socket;
2257 mp_tp = mpte->mpte_mptcb;
2258 so = mpts->mpts_socket;
2259
2260 /* Not grabbing socket lock as t_local_aid is write once only */
2261 tp = intotcpcb(sotoinpcb(so));
2262 /*
2263 * This overwrites any previous mpte_lost_aid to avoid storing
2264 * too much state when the typical case has only two subflows.
2265 */
2266 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2267 mpte->mpte_lost_aid = tp->t_local_aid;
2268
2269 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2270 !(mp_so->so_flags & SOF_PCBCLEARING));
2271
2272 mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2273 mpts->mpts_connid, (linger ? "YES" : "NO")));
2274
2275 if (mpts->mpts_soerror == 0)
2276 mpts->mpts_soerror = EADDRNOTAVAIL;
2277
2278 /*
2279 * The subflow connection has lost its source address.
2280 *
2281 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2282 * client if the MPTCP connection has not been established. If it
2283 * has been established with one subflow , we keep the MPTCP
2284 * connection valid without any subflows till closed by application.
2285 * This lets tcp connection manager decide whether to close this or
2286 * not as it reacts to reachability changes too.
2287 */
2288 mptcp_subflow_disconnect(mpte, mpts, !linger);
2289
2290 MPT_LOCK(mp_tp);
2291 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2292 (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2293 mp_so->so_error = EADDRNOTAVAIL;
2294 }
2295 MPT_UNLOCK(mp_tp);
2296
2297 /*
2298 * Keep the subflow socket around, unless the MPTCP socket has
2299 * been detached or the subflow has been disconnected explicitly,
2300 * in which case it should be deleted right away.
2301 */
2302 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2303}
2304
fe8ab488
A
2305/*
2306 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
2307 * indicates that the remote side sent a Data FIN
2308 */
2309static ev_ret_t
2310mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
2311{
2312 struct socket *so, *mp_so;
2313 struct mptcb *mp_tp;
2314
2315 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2316 MPTS_LOCK_ASSERT_HELD(mpts);
2317 mp_so = mpte->mpte_mppcb->mpp_socket;
2318 so = mpts->mpts_socket;
2319 mp_tp = mpte->mpte_mptcb;
2320
2321 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2322
2323 /*
2324 * We got a Data FIN for the MPTCP connection.
2325 * The FIN may arrive with data. The data is handed up to the
2326 * mptcp socket and the user is notified so that it may close
2327 * the socket if needed.
2328 */
2329 MPT_LOCK(mp_tp);
2330 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
2331 MPT_UNLOCK(mp_tp);
2332 MPTS_UNLOCK(mpts);
2333 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE);
2334 MPTS_LOCK(mpts);
2335 MPT_LOCK(mp_tp);
2336 }
2337 MPT_UNLOCK(mp_tp);
2338 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2339}
2340
39236c6e
A
2341/*
2342 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2343 */
2344static ev_ret_t
2345mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts)
2346{
2347 struct mptsub *mpts_alt = NULL;
2348 struct socket *so = NULL;
2349 struct socket *mp_so;
2350 int altpath_exists = 0;
2351
2352 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2353 MPTS_LOCK_ASSERT_HELD(mpts);
2354 mp_so = mpte->mpte_mppcb->mpp_socket;
2355 mptcplog2((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
2356 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
2357
2358 MPTS_UNLOCK(mpts);
2359 mpts_alt = mptcp_get_subflow(mpte, mpts);
2360
2361 /*
2362 * If there is no alternate eligible subflow, ignore the
2363 * failover hint.
2364 */
2365 if (mpts_alt == NULL) {
2366 mptcplog2((LOG_WARNING, "%s: no alternate path\n", __func__));
fe8ab488
A
2367 if (mptcp_delayed_subf_start) {
2368 mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
2369 if (mpts_alt != NULL) {
2370 MPTS_LOCK(mpts_alt);
2371 (void) mptcp_subflow_soconnectx(mpte,
2372 mpts_alt);
2373 MPTS_UNLOCK(mpts_alt);
2374 }
2375 }
39236c6e
A
2376 MPTS_LOCK(mpts);
2377 goto done;
2378 }
2379 MPTS_LOCK(mpts_alt);
2380 altpath_exists = 1;
2381 so = mpts_alt->mpts_socket;
2382 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2383 socket_lock(so, 1);
fe8ab488
A
2384 /* All data acknowledged and no RTT spike */
2385 if ((so->so_snd.sb_cc == 0) &&
2386 (mptcp_no_rto_spike(so))) {
39236c6e
A
2387 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2388 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2389 } else {
2390 /* no alternate path available */
2391 altpath_exists = 0;
2392 }
2393 socket_unlock(so, 1);
2394 }
2395 if (altpath_exists) {
fe8ab488
A
2396 mptcplog2((LOG_INFO, "%s: cid = %d\n",
2397 __func__, mpts_alt->mpts_connid));
39236c6e
A
2398 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
2399 struct mptcb *mp_tp = mpte->mpte_mptcb;
2400 /* Bring the subflow's notion of snd_nxt into the send window */
2401 MPT_LOCK(mp_tp);
2402 mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2403 MPT_UNLOCK(mp_tp);
2404 mpte->mpte_active_sub = mpts_alt;
2405 socket_lock(so, 1);
2406 sowwakeup(so);
2407 socket_unlock(so, 1);
2408 }
2409 MPTS_UNLOCK(mpts_alt);
2410
2411 if (altpath_exists) {
2412 soevent(mp_so,
2413 SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2414 mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx switched from "
2415 "%d to %d\n", __func__,
2416 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2417 mpts->mpts_connid, mpts_alt->mpts_connid));
2418 tcpstat.tcps_mp_switches++;
2419 }
2420
2421 MPTS_LOCK(mpts);
2422 if (altpath_exists) {
2423 mpts->mpts_flags |= MPTSF_FAILINGOVER;
2424 mpts->mpts_flags &= ~MPTSF_ACTIVE;
2425 } else {
fe8ab488
A
2426 mptcplog2((LOG_INFO, "%s: no alt cid = %d\n",
2427 __func__, mpts->mpts_connid));
2428done:
39236c6e
A
2429 so = mpts->mpts_socket;
2430 socket_lock(so, 1);
2431 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2432 socket_unlock(so, 1);
2433 }
39236c6e
A
2434 MPTS_LOCK_ASSERT_HELD(mpts);
2435 return (MPTS_EVRET_OK);
2436}
2437
2438/*
2439 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2440 */
2441static ev_ret_t
2442mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts)
2443{
2444 struct socket *mp_so, *so;
2445 struct mptcb *mp_tp;
2446 boolean_t linger;
2447
2448 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2449 MPTS_LOCK_ASSERT_HELD(mpts);
2450 VERIFY(mpte->mpte_mppcb != NULL);
2451 mp_so = mpte->mpte_mppcb->mpp_socket;
2452 mp_tp = mpte->mpte_mptcb;
2453 so = mpts->mpts_socket;
2454
2455 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2456 !(mp_so->so_flags & SOF_PCBCLEARING));
2457
2458 mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2459 mpts->mpts_connid, (linger ? "YES" : "NO")));
2460
2461 if (mpts->mpts_soerror == 0)
2462 mpts->mpts_soerror = EHOSTUNREACH;
2463
2464 /*
2465 * The subflow connection cannot use the outgoing interface.
2466 *
2467 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2468 * client if the MPTCP connection has not been established. If it
2469 * has been established, let the upper layer call disconnectx.
2470 */
2471 mptcp_subflow_disconnect(mpte, mpts, !linger);
2472 MPTS_UNLOCK(mpts);
2473
2474 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED);
2475
2476 MPT_LOCK(mp_tp);
2477 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2478 mp_so->so_error = EHOSTUNREACH;
2479 }
2480 MPT_UNLOCK(mp_tp);
2481
2482 MPTS_LOCK(mpts);
2483 /*
2484 * Keep the subflow socket around, unless the MPTCP socket has
2485 * been detached or the subflow has been disconnected explicitly,
2486 * in which case it should be deleted right away.
2487 */
2488 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2489}
2490
2491/*
2492 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2493 */
2494static ev_ret_t
2495mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts)
2496{
2497 struct socket *so;
2498
2499 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2500 MPTS_LOCK_ASSERT_HELD(mpts);
2501
2502 so = mpts->mpts_socket;
2503
2504 /* the subflow connection is being flow controlled */
2505 mpts->mpts_flags |= MPTSF_SUSPENDED;
2506
2507 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
2508 mpts->mpts_connid));
2509
2510 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2511}
2512
2513/*
2514 * Handle SO_FILT_HINT_RESUME subflow socket event.
2515 */
2516static ev_ret_t
2517mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts)
2518{
2519 struct socket *so;
2520
2521 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2522 MPTS_LOCK_ASSERT_HELD(mpts);
2523
2524 so = mpts->mpts_socket;
2525
2526 /* the subflow connection is no longer flow controlled */
2527 mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2528
2529 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2530
2531 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2532}
2533
2534/*
2535 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2536 */
2537static ev_ret_t
2538mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts)
2539{
2540 char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
2541 struct sockaddr_entry *src_se, *dst_se;
2542 struct sockaddr_storage src;
2543 struct socket *mp_so, *so;
2544 struct mptcb *mp_tp;
2545 struct ifnet *outifp;
2546 int af, error = 0;
2547 boolean_t mpok = FALSE;
2548
2549 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2550 VERIFY(mpte->mpte_mppcb != NULL);
2551 mp_so = mpte->mpte_mppcb->mpp_socket;
2552 mp_tp = mpte->mpte_mptcb;
2553
2554 MPTS_LOCK_ASSERT_HELD(mpts);
2555 so = mpts->mpts_socket;
2556 af = mpts->mpts_family;
2557
2558 if (mpts->mpts_flags & MPTSF_CONNECTED)
2559 return (MPTS_EVRET_OK);
2560
2561 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2562 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
fe8ab488
A
2563 socket_lock(so, 0);
2564 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2565 (so->so_state & SS_ISCONNECTED)) {
2566 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
2567 __func__, mpts->mpts_connid));
2568 (void) soshutdownlock(so, SHUT_RD);
2569 (void) soshutdownlock(so, SHUT_WR);
2570 (void) sodisconnectlocked(so);
2571 }
2572 socket_unlock(so, 0);
39236c6e
A
2573 return (MPTS_EVRET_OK);
2574 }
2575
2576 /*
2577 * The subflow connection has been connected. Find out whether it
2578 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
2579 *
2580 * a. If MPTCP connection is not yet established, then this must be
2581 * the first subflow connection. If MPTCP failed to negotiate,
2582 * indicate to the MPTCP socket client via EPROTO, that the
2583 * underlying TCP connection may be peeled off via peeloff(2).
2584 * Otherwise, mark the MPTCP socket as connected.
2585 *
2586 * b. If MPTCP connection has been established, then this must be
2587 * one of the subsequent subflow connections. If MPTCP failed
2588 * to negotiate, disconnect the connection since peeloff(2)
2589 * is no longer possible.
2590 *
2591 * Right now, we simply unblock any waiters at the MPTCP socket layer
2592 * if the MPTCP connection has not been established.
2593 */
2594 socket_lock(so, 0);
2595
2596 if (so->so_state & SS_ISDISCONNECTED) {
2597 /*
2598 * With MPTCP joins, a connection is connected at the subflow
2599 * level, but the 4th ACK from the server elevates the MPTCP
2600 * subflow to connected state. So there is a small window
2601 * where the subflow could get disconnected before the
2602 * connected event is processed.
2603 */
2604 socket_unlock(so, 0);
2605 return (MPTS_EVRET_OK);
2606 }
2607
2608 mpts->mpts_soerror = 0;
2609 mpts->mpts_flags &= ~MPTSF_CONNECTING;
2610 mpts->mpts_flags |= MPTSF_CONNECTED;
2611 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
2612 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2613
2614 VERIFY(mpts->mpts_dst_sl != NULL);
2615 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
2616 VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
2617 dst_se->se_addr->sa_family == af);
2618
2619 VERIFY(mpts->mpts_src_sl != NULL);
2620 src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
2621 VERIFY(src_se != NULL && src_se->se_addr != NULL &&
2622 src_se->se_addr->sa_family == af);
2623
2624 /* get/check source IP address */
2625 switch (af) {
2626 case AF_INET: {
2627 error = in_getsockaddr_s(so, &src);
2628 if (error == 0) {
2629 struct sockaddr_in *ms = SIN(src_se->se_addr);
2630 struct sockaddr_in *s = SIN(&src);
2631
2632 VERIFY(s->sin_len == ms->sin_len);
2633 VERIFY(ms->sin_family == AF_INET);
2634
2635 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2636 bcmp(&ms->sin_addr, &s->sin_addr,
2637 sizeof (ms->sin_addr)) != 0) {
2638 mptcplog((LOG_ERR, "%s: cid %d local "
2639 "address %s (expected %s)\n", __func__,
2640 mpts->mpts_connid, inet_ntop(AF_INET,
2641 (void *)&s->sin_addr.s_addr, buf0,
2642 sizeof (buf0)), inet_ntop(AF_INET,
2643 (void *)&ms->sin_addr.s_addr, buf1,
2644 sizeof (buf1))));
2645 }
2646 bcopy(s, ms, sizeof (*s));
2647 }
2648 break;
2649 }
2650#if INET6
2651 case AF_INET6: {
2652 error = in6_getsockaddr_s(so, &src);
2653 if (error == 0) {
2654 struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
2655 struct sockaddr_in6 *s = SIN6(&src);
2656
2657 VERIFY(s->sin6_len == ms->sin6_len);
2658 VERIFY(ms->sin6_family == AF_INET6);
2659
2660 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2661 bcmp(&ms->sin6_addr, &s->sin6_addr,
2662 sizeof (ms->sin6_addr)) != 0) {
2663 mptcplog((LOG_ERR, "%s: cid %d local "
2664 "address %s (expected %s)\n", __func__,
2665 mpts->mpts_connid, inet_ntop(AF_INET6,
2666 (void *)&s->sin6_addr, buf0,
2667 sizeof (buf0)), inet_ntop(AF_INET6,
2668 (void *)&ms->sin6_addr, buf1,
2669 sizeof (buf1))));
2670 }
2671 bcopy(s, ms, sizeof (*s));
2672 }
2673 break;
2674 }
2675#endif /* INET6 */
2676 default:
2677 VERIFY(0);
2678 /* NOTREACHED */
2679 }
2680
2681 if (error != 0) {
2682 mptcplog((LOG_ERR, "%s: cid %d getsockaddr failed (%d)\n",
2683 __func__, mpts->mpts_connid, error));
2684 }
2685
2686 /* get/verify the outbound interface */
2687 outifp = sotoinpcb(so)->inp_last_outifp; /* could be NULL */
2688 if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2689 VERIFY(mpts->mpts_outif != NULL);
2690 if (mpts->mpts_outif != outifp) {
2691 mptcplog((LOG_ERR, "%s: cid %d outif %s "
2692 "(expected %s)\n", __func__, mpts->mpts_connid,
2693 ((outifp != NULL) ? outifp->if_xname : "NULL"),
2694 mpts->mpts_outif->if_xname));
2695 if (outifp == NULL)
2696 outifp = mpts->mpts_outif;
2697 }
2698 } else {
2699 mpts->mpts_outif = outifp;
2700 }
2701
2702 socket_unlock(so, 0);
2703
2704 mptcplog((LOG_DEBUG, "%s: cid %d outif %s %s[%d] -> %s[%d] "
2705 "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2706 outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
2707 (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
2708 (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
2709 ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
2710 ntohs(SIN6(src_se->se_addr)->sin6_port)),
2711 inet_ntop(af, ((af == AF_INET) ?
2712 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
2713 (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
2714 ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
2715 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
2716 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
2717 "MPTCP capable" : "a regular TCP")));
2718
2719 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2720 MPTS_UNLOCK(mpts);
2721
2722 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2723
2724 MPT_LOCK(mp_tp);
2725 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2726 /* case (a) above */
2727 if (!mpok) {
2728 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2729 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2730 MPT_UNLOCK(mp_tp);
2731 } else {
2732 if (mptcp_init_authparms(mp_tp) != 0) {
2733 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2734 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2735 MPT_UNLOCK(mp_tp);
2736 mpok = FALSE;
2737 } else {
2738 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2739 mpte->mpte_associd = mpts->mpts_connid;
2740 DTRACE_MPTCP2(state__change,
2741 struct mptcb *, mp_tp,
2742 uint32_t, 0 /* event */);
2743 mptcp_init_statevars(mp_tp);
2744 MPT_UNLOCK(mp_tp);
2745
2746 (void) mptcp_setconnorder(mpte,
2747 mpts->mpts_connid, 1);
2748 soisconnected(mp_so);
2749 }
2750 }
2751 MPTS_LOCK(mpts);
2752 if (mpok) {
2753 /* Initialize the relative sequence number */
2754 mpts->mpts_rel_seq = 1;
2755 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2756 mpte->mpte_nummpcapflows++;
2757 MPT_LOCK_SPIN(mp_tp);
2758 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2759 MPT_UNLOCK(mp_tp);
2760 }
2761 } else if (mpok) {
2762 MPT_UNLOCK(mp_tp);
fe8ab488
A
2763 if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
2764 /* Experimental code, disabled by default. */
2765 sorwakeup(mp_so);
2766 sowwakeup(mp_so);
2767 }
39236c6e
A
2768 /*
2769 * case (b) above
2770 * In case of additional flows, the MPTCP socket is not
2771 * MPTSF_MP_CAPABLE until an ACK is received from server
2772 * for 3-way handshake. TCP would have guaranteed that this
2773 * is an MPTCP subflow.
2774 */
2775 MPTS_LOCK(mpts);
2776 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
fe8ab488 2777 mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
39236c6e 2778 mpte->mpte_nummpcapflows++;
fe8ab488
A
2779 /* With Fastjoin, rel sequence will be nonzero */
2780 if (mpts->mpts_rel_seq == 0)
2781 mpts->mpts_rel_seq = 1;
39236c6e 2782 MPT_LOCK_SPIN(mp_tp);
fe8ab488
A
2783 /* With Fastjoin, sndnxt is updated before connected_ev */
2784 if (mpts->mpts_sndnxt == 0) {
2785 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2786 }
39236c6e 2787 MPT_UNLOCK(mp_tp);
fe8ab488
A
2788 mptcp_output_needed(mpte, mpts);
2789 } else {
2790 MPT_UNLOCK(mp_tp);
2791 MPTS_LOCK(mpts);
39236c6e 2792 }
fe8ab488 2793
39236c6e
A
2794 MPTS_LOCK_ASSERT_HELD(mpts);
2795
2796 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2797}
2798
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 *
 * Marks the subflow MPTSF_DISCONNECTED, undoes its MP-capable accounting
 * on the session, and tells the caller (via the return value) whether the
 * subflow structure should be kept around or deleted.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/*
	 * Linger (keep the subflow structure) unless it was explicitly
	 * marked deletable or the MP socket itself is being torn down.
	 */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog2((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	/* Disconnect already processed for this subflow; nothing to redo */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
		return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
	    MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
	    MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
	    MPTSF_SUSPENDED|MPTSF_ACTIVE);
	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/*
	 * The subflow connection has been disconnected.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);

	/* Undo the MP-capable accounting done when this subflow joined */
	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
			mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
			    __func__));
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	}

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* MP connection never established; wake up any waiters */
		MPT_UNLOCK(mp_tp);
		soisdisconnected(mp_so);
	} else {
		MPT_UNLOCK(mp_tp);
	}

	MPTS_LOCK(mpts);
	/*
	 * The underlying subflow socket has been disconnected;
	 * it is no longer useful to us. Keep the subflow socket
	 * around, unless the MPTCP socket has been detached or
	 * the subflow has been disconnected explicitly, in which
	 * case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}
2873
/*
 * Handle SO_FILT_HINT_MPSTATUS subflow socket event.
 *
 * Mirrors the subflow TCP PCB's MPTCP status bits (capable/degraded/ready)
 * into the subflow flags, and propagates a degraded subflow into a
 * connection-wide fallback to plain TCP.
 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	ev_ret_t ret = MPTS_EVRET_OK_UPDATE;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPTS_LOCK_ASSERT_HELD(mpts);
	so = mpts->mpts_socket;

	socket_lock(so, 0);
	MPT_LOCK(mp_tp);

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	else
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* Degradation already recorded; skip the rest */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
			goto done;
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	}
	else
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
		mpts->mpts_flags |= MPTSF_MP_READY;
	else
		mpts->mpts_flags &= ~MPTSF_MP_READY;

	/* A degraded subflow forces the whole connection to plain TCP */
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

	mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d "
	    "mptsf=%b\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
	    mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
	    mpts->mpts_flags, MPTSF_BITS));
done:
	MPT_UNLOCK(mp_tp);
	socket_unlock(so, 0);

	return (ret);
}
2936
2937/*
2938 * Handle SO_FILT_HINT_MUSTRST subflow socket event
2939 */
2940static ev_ret_t
2941mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts)
2942{
2943 struct socket *mp_so, *so;
2944 struct mptcb *mp_tp;
2945 boolean_t linger;
2946
2947
2948 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2949 MPTS_LOCK_ASSERT_HELD(mpts);
2950 VERIFY(mpte->mpte_mppcb != NULL);
2951 mp_so = mpte->mpte_mppcb->mpp_socket;
2952 mp_tp = mpte->mpte_mptcb;
2953 so = mpts->mpts_socket;
2954
2955 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2956 !(mp_so->so_flags & SOF_PCBCLEARING));
2957
2958 if (mpts->mpts_soerror == 0)
2959 mpts->mpts_soerror = ECONNABORTED;
2960
39236c6e
A
2961 /* We got an invalid option or a fast close */
2962 socket_lock(so, 0);
2963 struct tcptemp *t_template;
2964 struct inpcb *inp = sotoinpcb(so);
2965 struct tcpcb *tp = NULL;
2966
2967 tp = intotcpcb(inp);
fe8ab488 2968 so->so_error = ECONNABORTED;
39236c6e
A
2969
2970 t_template = tcp_maketemplate(tp);
2971 if (t_template) {
fe8ab488 2972 struct tcp_respond_args tra;
39236c6e 2973
fe8ab488 2974 bzero(&tra, sizeof(tra));
39236c6e 2975 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 2976 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 2977 else
fe8ab488
A
2978 tra.ifscope = IFSCOPE_NONE;
2979 tra.awdl_unrestricted = 1;
39236c6e
A
2980
2981 tcp_respond(tp, t_template->tt_ipgen,
2982 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 2983 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e
A
2984 (void) m_free(dtom(t_template));
2985 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d \n",
2986 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2987 so, mpts->mpts_connid));
2988 }
2989 socket_unlock(so, 0);
2990 mptcp_subflow_disconnect(mpte, mpts, !linger);
2991 MPTS_UNLOCK(mpts);
2992
fe8ab488
A
2993 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED |
2994 SO_FILT_HINT_CONNRESET);
39236c6e
A
2995
2996 MPT_LOCK(mp_tp);
fe8ab488
A
2997 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) ||
2998 (mp_tp->mpt_state == MPTCPS_FASTCLOSE_WAIT)) {
39236c6e
A
2999 mp_so->so_error = ECONNABORTED;
3000 }
3001 MPT_UNLOCK(mp_tp);
3002
3003 MPTS_LOCK(mpts);
3004 /*
3005 * Keep the subflow socket around unless the subflow has been
3006 * disconnected explicitly.
3007 */
3008 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3009}
3010
fe8ab488
A
/*
 * Handle a fast-join subflow socket event: when no MP-capable flow is
 * currently established, promote this subflow to the active one and
 * prime it for fast-join sends.
 */
static ev_ret_t
mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);

	if (mpte->mpte_nummpcapflows == 0) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;
		mptcplog((LOG_DEBUG,"%s %llx %llx \n",
		    __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt));
		mpte->mpte_active_sub = mpts;
		mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
		MPT_LOCK(mp_tp);
		/*
		 * If mptcp_subflow_output is called before fastjoin_ev
		 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
		 * and further mpts->mpts_sndnxt is incremented by len copied.
		 */
		if (mpts->mpts_sndnxt == 0) {
			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
			mpts->mpts_rel_seq = 1;
		}
		MPT_UNLOCK(mp_tp);
	}

	return (MPTS_EVRET_OK);
}
3039
3040static ev_ret_t
3041mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts)
3042{
3043 MPTE_LOCK_ASSERT_HELD(mpte);
3044 MPTS_LOCK_ASSERT_HELD(mpts);
3045 VERIFY(mpte->mpte_mppcb != NULL);
3046 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid));
3047
3048 mpts->mpts_flags |= MPTSF_DELETEOK;
3049 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3050 return (MPTS_EVRET_DELETE);
3051 else
3052 return (MPTS_EVRET_OK);
3053}
3054
39236c6e
A
3055static const char *
3056mptcp_evret2str(ev_ret_t ret)
3057{
3058 const char *c = "UNKNOWN";
3059
3060 switch (ret) {
3061 case MPTS_EVRET_DELETE:
3062 c = "MPTS_EVRET_DELETE";
3063 break;
3064 case MPTS_EVRET_CONNECT_PENDING:
3065 c = "MPTS_EVRET_CONNECT_PENDING";
3066 break;
3067 case MPTS_EVRET_DISCONNECT_FALLBACK:
3068 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3069 break;
3070 case MPTS_EVRET_OK:
3071 c = "MPTS_EVRET_OK";
3072 break;
3073 case MPTS_EVRET_OK_UPDATE:
3074 c = "MPTS_EVRET_OK_UPDATE";
3075 break;
3076 }
3077 return (c);
3078}
3079
/*
 * Add a reference to a subflow structure; used by MPTS_ADDREF().
 *
 * If `locked' is set the caller already holds the subflow lock and must
 * keep holding it; otherwise the lock is taken and released here.
 * Panics on refcount wraparound.
 */
void
mptcp_subflow_addref(struct mptsub *mpts, int locked)
{
	if (!locked)
		MPTS_LOCK(mpts);
	else
		MPTS_LOCK_ASSERT_HELD(mpts);

	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (!locked)
		MPTS_UNLOCK(mpts);
}
3098
/*
 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
 *
 * Drops the last reference by freeing the subflow; in that case the
 * callee (mptcp_subflow_free) unlocks and destroys the subflow lock.
 * Panics on refcount underflow.
 */
void
mptcp_subflow_remref(struct mptsub *mpts)
{
	MPTS_LOCK(mpts);
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		MPTS_UNLOCK(mpts);
		return;
	}
	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}
3117
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * Returns 0 on success or the sosetoptlock() error; either way the
 * outcome is logged.  Clears MPOF_INTERIM since the option is now being
 * applied for real.
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
    struct mptopt *mpo)
{
	struct socket *mp_so;
	struct sockopt sopt;
	char buf[32];
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	mpo->mpo_flags &= ~MPOF_INTERIM;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/* Build an in-kernel sockopt pointing at the stored int value */
	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);	/* already locked */
	if (error == 0) {
		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
		    "val %d set successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval));
	} else {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s "
		    "val %d set error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval, error));
	}
	return (error);
}
3162
/*
 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * On success the retrieved int value is stored in mpo->mpo_intval.
 * Returns 0 or the sogetoptlock() error; either way the outcome is logged.
 */
int
mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
    struct mptopt *mpo)
{
	struct socket *mp_so;
	struct sockopt sopt;
	char buf[32];
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/* Build an in-kernel sockopt that reads back into mpo_intval */
	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sogetoptlock(so, &sopt, 0);	/* already locked */
	if (error == 0) {
		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
		    "val %d get successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval));
	} else {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s get error %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level,
		    mpo->mpo_name, buf, sizeof (buf)), error));
	}
	return (error);
}
3204
3205
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed. The callout will
 * repeat as long as this routine returns a non-zero value.
 *
 * Walks every MPTCP PCB and, depending on its state, either counts it as
 * still active (so the callout repeats), signals its worker thread to
 * terminate, or disposes of it entirely.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;

	lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	mptcplog3((LOG_DEBUG, "%s: running\n", __func__));

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mp_so = mpp->mpp_socket;
		VERIFY(mp_so != NULL);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx found "
		    "(u=%d,r=%d,s=%d)\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
		    mp_so->so_retaincnt, mpp->mpp_state));

		/*
		 * Use try-lock to avoid blocking the GC; a busy PCB is
		 * simply revisited on the next callout pass.
		 */
		if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "(u=%d,r=%d)\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			active++;
			continue;
		}

		/* check again under the lock */
		if (mp_so->so_usecount > 1) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d] %d %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mp_tp->mpt_gc_ticks,
			    mp_tp->mpt_state));
			MPT_LOCK(mp_tp);
			/*
			 * Closing connection: count down the grace period;
			 * once expired, release the local key and force all
			 * subflows to report a (timed-out) disconnect.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0)
					mp_tp->mpt_gc_ticks--;
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
					if (mp_tp->mpt_localkey != NULL) {
						mptcp_free_key(
						    mp_tp->mpt_localkey);
						mp_tp->mpt_localkey = NULL;
					}
				}
			}
			MPT_UNLOCK(mp_tp);
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					MPTS_LOCK(mpts);
					mpts->mpts_flags |= MPTSF_DELETEOK;
					if (mpts->mpts_soerror == 0)
						mpts->mpts_soerror = ETIMEDOUT;
					mptcp_subflow_eupcall(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
					MPTS_UNLOCK(mpts);
				}
			}
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state));
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		/*
		 * The PCB has been detached, and there is exactly 1 refnct
		 * held by the MPTCP thread. Signal that thread to terminate,
		 * after which the last refcnt will be released. That will
		 * allow it to be destroyed below during the next round.
		 */
		if (mp_so->so_usecount == 1) {
			mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx scheduled for "
			    "termination [u=%d,r=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			/* signal MPTCP thread to terminate */
			mptcp_thread_terminate_signal(mpte);
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mp_so->so_usecount, mp_so->so_retaincnt));
		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		/* Note: mp_pcbdispose() drops mpp_lock */
		mp_pcbdispose(mpp);
	}

	return (active);
}
3334
/*
 * Drop a MPTCP connection, reporting the specified error.
 *
 * Moves the connection to MPTCPS_TERMINATE, records the error on the MP
 * socket, and closes the control block via mptcp_close().  Returns the
 * result of mptcp_close() (NULL).
 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
{
	struct socket *mp_so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPT_LOCK_ASSERT_HELD(mp_tp);
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mpte->mpte_mppcb->mpp_socket;

	mp_tp->mpt_state = MPTCPS_TERMINATE;
	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, 0 /* event */);

	/* Prefer a previously recorded soft error over a plain timeout */
	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
		errno = mp_tp->mpt_softerror;
	mp_so->so_error = errno;

	return (mptcp_close(mpte, mp_tp));
}
3358
/*
 * Close a MPTCP control block.
 *
 * Releases the local key, marks the MP socket disconnected, and — unless
 * the connection has been peeled off — disconnects and deletes every
 * subflow.  Always returns NULL.  Called and returns with the MP-session
 * lock and the mp_tp lock held.
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPT_LOCK_ASSERT_HELD(mp_tp);
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	if (mp_tp->mpt_localkey != NULL) {
		mptcp_free_key(mp_tp->mpt_localkey);
		mp_tp->mpt_localkey = NULL;
	}

	/* Drop mp_tp lock across soisdisconnected() (lock ordering) */
	MPT_UNLOCK(mp_tp);
	soisdisconnected(mp_so);

	MPT_LOCK(mp_tp);
	/* Peeled-off connection: subflows are no longer ours to tear down */
	if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
		return (NULL);
	}
	MPT_UNLOCK(mp_tp);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
		mptcp_subflow_disconnect(mpte, mpts, TRUE);
		MPTS_UNLOCK(mpts);
		mptcp_subflow_del(mpte, mpts, TRUE);
	}
	MPT_LOCK(mp_tp);

	return (NULL);
}
3398
/* Post a disconnected event on the given socket. */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
3404
/*
 * Signal MPTCP thread to wake up.
 *
 * Convenience wrapper that takes the thread lock around the locked
 * variant below.
 */
void
mptcp_thread_signal(struct mptses *mpte)
{
	lck_mtx_lock(&mpte->mpte_thread_lock);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}
3415
/*
 * Signal MPTCP thread to wake up (locked version)
 *
 * Caller holds mpte_thread_lock.  Bumps the request counter so that an
 * already-active worker will loop again, and wakes the worker only if it
 * is currently idle and still alive.
 */
static void
mptcp_thread_signal_locked(struct mptses *mpte)
{
	lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);

	mpte->mpte_thread_reqs++;
	if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
		wakeup_one((caddr_t)&mpte->mpte_thread);
}
3428
/*
 * Signal MPTCP thread to terminate.
 *
 * Clearing mpte_thread to THREAD_NULL is the termination request; the
 * worker observes it in mptcp_thread_func() and self-destructs.
 */
static void
mptcp_thread_terminate_signal(struct mptses *mpte)
{
	lck_mtx_lock(&mpte->mpte_thread_lock);
	if (mpte->mpte_thread != THREAD_NULL) {
		mpte->mpte_thread = THREAD_NULL;
		mpte->mpte_thread_reqs++;
		if (!mpte->mpte_thread_active)
			wakeup_one((caddr_t)&mpte->mpte_thread);
	}
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}
3444
/*
 * MPTCP thread workloop.
 *
 * One pass of the worker thread: pump input/events/output on every
 * subflow, then perform any deferred per-subflow actions (delete,
 * fallback disconnect, pending joins) that the event handlers requested.
 */
static void
mptcp_thread_dowork(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	boolean_t conninfo_update = FALSE;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		MPTS_LOCK(mpts);
		MPTS_ADDREF_LOCKED(mpts);	/* for us */

		/* Update process ownership based on parent mptcp socket */
		mptcp_update_last_owner(mpts, mp_so);

		mptcp_subflow_input(mpte, mpts);
		ret = mptcp_subflow_events(mpte, mpts);

		/* Only the active subflow carries outbound data */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			mptcplog3((LOG_INFO, "%s: cid %d \n", __func__,
			    mpts->mpts_connid));
			(void) mptcp_subflow_output(mpte, mpts);
		}

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING)
			mptcp_subflow_disconnect(mpte, mpts, FALSE);
		MPTS_UNLOCK(mpts);

		/* Record deferred actions requested by the event handlers */
		switch (ret) {
		case MPTS_EVRET_OK_UPDATE:
			conninfo_update = TRUE;
			break;
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_del(mpte, mpts, TRUE);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		}
		MPTS_REMREF(mpts);		/* ours */
	}

	if (conninfo_update) {
		soevent(mp_so, SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_CONNINFO_UPDATED);
	}

	if (!connect_pending && !disconnect_fallback) {
		MPTE_UNLOCK(mpte);
		return;
	}

	/* Second pass: apply fallback disconnect / pending connects */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				MPTS_UNLOCK(mpts);
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
			    MPTSF_DISCONNECTED)) {
				MPTS_UNLOCK(mpts);
				continue;
			}
			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback. This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			socket_lock(so, 1);
			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;
			if (mpts->mpts_flags & MPTSF_ACTIVE) {
				socket_unlock(so, 1);
				MPTS_UNLOCK(mpts);
				continue;
			}
			tp->t_mpflags |= TMPF_RESET;
			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			socket_unlock(so, 1);

		} else if (connect_pending) {
			/*
			 * If delayed subflow start is set and cellular,
			 * delay the connect till a retransmission timeout
			 */

			if ((mptcp_delayed_subf_start) &&
			    (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
				MPTS_UNLOCK(mpts);
				continue;
			}

			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				(void) mptcp_subflow_soconnectx(mpte, mpts);
			}
		}
		MPTS_UNLOCK(mpts);
	}

	MPTE_UNLOCK(mpte);
}
3591
/*
 * MPTCP thread.
 *
 * Per-session worker: sleeps on &mpte->mpte_thread, and on each wakeup
 * runs mptcp_thread_dowork() until the request counter stops changing.
 * Exits (never returns) via mptcp_thread_destroy() once mpte_thread is
 * cleared by mptcp_thread_terminate_signal().
 */
static void
mptcp_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct mptses *mpte = v;
	struct timespec *ts = NULL;

	VERIFY(mpte != NULL);

	lck_mtx_lock_spin(&mpte->mpte_thread_lock);

	for (;;) {
		lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);

		/* Sleep until signalled (no timeout: ts is NULL) */
		if (mpte->mpte_thread != THREAD_NULL) {
			(void) msleep(&mpte->mpte_thread,
			    &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
			    __func__, ts);
		}

		/* MPTCP socket is closed? */
		if (mpte->mpte_thread == THREAD_NULL) {
			lck_mtx_unlock(&mpte->mpte_thread_lock);
			/* callee will destroy thread lock */
			mptcp_thread_destroy(mpte);
			/* NOTREACHED */
			return;
		}

		mpte->mpte_thread_active = 1;
		/* Re-run dowork until no new requests arrived during it */
		for (;;) {
			uint32_t reqs = mpte->mpte_thread_reqs;

			lck_mtx_unlock(&mpte->mpte_thread_lock);
			mptcp_thread_dowork(mpte);
			lck_mtx_lock_spin(&mpte->mpte_thread_lock);

			/* if there's no pending request, we're done */
			if (reqs == mpte->mpte_thread_reqs ||
			    mpte->mpte_thread == THREAD_NULL)
				break;
		}
		mpte->mpte_thread_reqs = 0;
		mpte->mpte_thread_active = 0;
	}
}
3641
/*
 * Destroy a MTCP thread, to be called in the MPTCP thread context
 * upon receiving an indication to self-terminate. This routine
 * will not return, as the current thread is terminated at the end.
 */
static void
mptcp_thread_destroy(struct mptses *mpte)
{
	struct socket *mp_so;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_thread == THREAD_NULL);
	VERIFY(mpte->mpte_mppcb != NULL);

	mptcp_sesdestroy(mpte);

	/* Release the socket reference that was held for this thread */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);
	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for thread */
	mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
	MPTE_UNLOCK(mpte);

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());
	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
3671
/*
 * Protocol pr_lock callback.
 *
 * Takes the MP PCB lock, optionally bumps the socket use count, and
 * records the caller's return address in the lock-history ring for
 * debugging.  Panics if the PCB is gone or the use count is negative.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	void *lr_saved;

	/* Capture our caller if no explicit return address was supplied */
	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	lck_mtx_lock(&mpp->mpp_lock);

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0)
		mp_so->so_usecount++;
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return (0);
}
3706
/*
 * Protocol pr_unlock callback.
 *
 * Counterpart of mptcp_lock(): optionally drops a socket use count,
 * records the caller in the unlock-history ring, and releases the MP
 * PCB lock.  Panics on missing PCB or use-count underflow.
 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	void *lr_saved;

	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);

	if (refcount != 0)
		mp_so->so_usecount--;

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	lck_mtx_unlock(&mpp->mpp_lock);

	return (0);
}
3743
/*
 * Protocol pr_getlock callback.
 *
 * Returns the mutex protecting this MP socket's PCB; panics if the PCB
 * is gone or the use count has gone negative.
 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int locktype)
{
#pragma unused(locktype)
	struct mppcb *mpp = sotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return (&mpp->mpp_lock);
}
3765
/*
 * Key generation functions
 */
/*
 * Fill in a random, non-zero key and its SHA-1 digest, retrying until
 * both the key and the (truncated) digest are unique within the pool.
 * Caller holds the key-pool lock so the uniqueness scan is stable.
 */
static void
mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
{
	struct mptcp_key_entry *key_elm;
try_again:
	read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
	if (key_entry->mkey_value == 0)
		goto try_again;
	mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
	    sizeof (key_entry->mkey_digest));

	/* Reject collisions on either the key or its first 4 digest bytes */
	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
		if (key_elm->mkey_value == key_entry->mkey_value) {
			goto try_again;
		}
		if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
		    0) {
			goto try_again;
		}
	}
}
3790
/*
 * Reserve a local MPTCP key: reuse a free entry from the pool if one is
 * available, otherwise allocate a new entry, generate a unique key for
 * it and insert it into the pool.  Returns a pointer to the key value
 * embedded in the (now in-use) pool entry.
 */
static mptcp_key_t *
mptcp_reserve_key(void)
{
	struct mptcp_key_entry *key_elm;
	struct mptcp_key_entry *found_elm = NULL;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
		if (key_elm->mkey_flags == MKEYF_FREE) {
			key_elm->mkey_flags = MKEYF_INUSE;
			found_elm = key_elm;
			break;
		}
	}
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);

	if (found_elm) {
		return (&found_elm->mkey_value);
	}

	/* Pool exhausted: allocate outside the lock, then insert under it */
	key_elm = (struct mptcp_key_entry *)
	    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
	key_elm->mkey_flags = MKEYF_INUSE;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	mptcp_generate_unique_key(key_elm);
	LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
	mptcp_keys_pool.mkph_count += 1;
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
	return (&key_elm->mkey_value);
}
3822
/*
 * Return the stored SHA-1 digest for a key previously handed out by
 * mptcp_reserve_key().  Recovers the containing pool entry from the key
 * pointer via offsetof; panics if the entry is not marked in-use.
 */
static caddr_t
mptcp_get_stored_digest(mptcp_key_t *key)
{
	struct mptcp_key_entry *key_holder;
	caddr_t digest = NULL;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	/* key points into the entry; back up to the entry's base address */
	key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
	    offsetof(struct mptcp_key_entry, mkey_value));
	if (key_holder->mkey_flags != MKEYF_INUSE)
		panic_plain("%s", __func__);
	digest = &key_holder->mkey_digest[0];
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
	return (digest);
}
3838
/*
 * Release a reserved key back to the pool.  Half the time the entry is
 * freed outright; otherwise it is reinserted at a random position in
 * the pool so a freed key is not immediately handed out again.
 */
void
mptcp_free_key(mptcp_key_t *key)
{
	struct mptcp_key_entry *key_holder;
	struct mptcp_key_entry *key_elm;
	int pt = RandomULong();

	mptcplog((LOG_INFO, "%s\n", __func__));

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	/* Recover the containing pool entry from the key pointer */
	key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
	    offsetof(struct mptcp_key_entry, mkey_value));
	key_holder->mkey_flags = MKEYF_FREE;

	LIST_REMOVE(key_holder, mkey_next);
	mptcp_keys_pool.mkph_count -= 1;

	/* Free half the time */
	if (pt & 0x01) {
		zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
	} else {
		/* Insert it at random point to avoid early reuse */
		int i = 0;
		if (mptcp_keys_pool.mkph_count > 1) {
			pt = pt % (mptcp_keys_pool.mkph_count - 1);
			LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
				if (++i >= pt) {
					LIST_INSERT_AFTER(key_elm, key_holder,
					    mkey_next);
					break;
				}
			}
			if (i < pt)
				panic("missed insertion");
		} else {
			LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
			    mkey_next);
		}
		mptcp_keys_pool.mkph_count += 1;
	}
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
}
3881
/*
 * One-time initialization of the MPTCP key pool: creates the backing
 * zone, pre-generates a batch of free key entries, and initializes the
 * pool lock.  Panics if the zone cannot be created.
 */
static void
mptcp_key_pool_init(void)
{
	int i;
	struct mptcp_key_entry *key_entry;

	LIST_INIT(&mptcp_keys_pool);
	mptcp_keys_pool.mkph_count = 0;

	mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
	    (sizeof (struct mptcp_key_entry));
	mptcp_keys_pool.mkph_key_entry_zone = zinit(
	    mptcp_keys_pool.mkph_key_elm_sz,
	    MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
	    MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
	if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
		panic("%s: unable to allocate MPTCP keys zone \n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);

	/* Pre-populate the pool with ready-to-use unique keys */
	for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
		key_entry = (struct mptcp_key_entry *)
		    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
		key_entry->mkey_flags = MKEYF_FREE;
		mptcp_generate_unique_key(key_entry);
		LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
		mptcp_keys_pool.mkph_count += 1;
	}
	lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);
}
3915
3916/*
3917 * MPTCP Join support
3918 */
3919
/*
 * Bind a subflow TCP PCB to its MPTCP control block and allocate the
 * per-subflow authentication entry (local address id + nonzero local
 * random nonce used later for MP_JOIN HMACs).
 * Must be called without the MPT lock held; takes it internally.
 */
static void
mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
    uint8_t addr_id)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK_SPIN(mp_tp);
	tp->t_mptcb = mp_tp;
	/*
	 * The address ID of the first flow is implicitly 0.
	 */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		tp->t_local_aid = 0;
	} else {
		/* Any later subflow is a JOIN and gets the caller's id */
		tp->t_local_aid = addr_id;
		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
		so->so_flags |= SOF_MP_SEC_SUBFLOW;
	}
	MPT_UNLOCK(mp_tp);
	/* zalloc may block, so drop the lock around it */
	sauth_entry = zalloc(mpt_subauth_zone);
	sauth_entry->msae_laddr_id = tp->t_local_aid;
	sauth_entry->msae_raddr_id = 0;
	sauth_entry->msae_raddr_rand = 0;
try_again:
	/* 0 is reserved to mean "unset", so retry until nonzero */
	sauth_entry->msae_laddr_rand = RandomULong();
	if (sauth_entry->msae_laddr_rand == 0)
		goto try_again;
	MPT_LOCK_SPIN(mp_tp);
	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
	MPT_UNLOCK(mp_tp);
}
3953
/*
 * Undo mptcp_attach_to_subf(): remove and free the subflow's auth entry
 * (matched by local address id) and clear the subflow's back-pointer to
 * the MPTCP control block. Locks the subflow socket around the whole
 * operation; takes the MPT lock only for the list manipulation.
 */
static void
mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	struct tcpcb *tp = NULL;
	int found = 0;

	socket_lock(so, 0);
	tp = sototcpcb(so);
	if (tp == NULL) {
		/* PCB already gone; nothing to detach */
		socket_unlock(so, 0);
		return;
	}

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
			found = 1;
			break;
		}
	}
	if (found) {
		LIST_REMOVE(sauth_entry, msae_next);
		zfree(mpt_subauth_zone, sauth_entry);
	}
	MPT_UNLOCK(mp_tp);

	tp->t_mptcb = NULL;
	socket_unlock(so, 0);
}
3984
3985void
3986mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
3987 u_int32_t *rrand)
3988{
3989 struct mptcp_subf_auth_entry *sauth_entry;
3990 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3991
3992 MPT_LOCK(mp_tp);
3993 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3994 if (sauth_entry->msae_laddr_id == addr_id) {
3995 if (lrand)
3996 *lrand = sauth_entry->msae_laddr_rand;
3997 if (rrand)
3998 *rrand = sauth_entry->msae_raddr_rand;
3999 break;
4000 }
4001 }
4002 MPT_UNLOCK(mp_tp);
4003}
4004
/*
 * Record the remote address id and the remote random nonce (from the
 * peer's MP_JOIN SYN/ACK) on the auth entry matched by our local address
 * id. Rejects a mismatched remote id and a duplicate SYN/ACK carrying a
 * different nonce; both are logged and silently ignored.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* A different remote id than recorded is an error */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				mptcplog((LOG_ERR, "MPTCP ERROR %s: mismatched"
				    " address ids %d %d \n", __func__, raddr_id,
				    sauth_entry->msae_raddr_id));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* Same flow must always present the same nonce */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				mptcplog((LOG_ERR, "%s: dup SYN_ACK %d %d \n",
				    __func__, raddr_rand,
				    sauth_entry->msae_raddr_rand));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			MPT_UNLOCK(mp_tp);
			return;
		}
	}
	MPT_UNLOCK(mp_tp);
}
4039
4040/*
4041 * SHA1 support for MPTCP
4042 */
4043static int
4044mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
4045{
4046 SHA1_CTX sha1ctxt;
4047 const unsigned char *sha1_base;
4048 int sha1_size;
4049
4050 if (digest_len != SHA1_RESULTLEN) {
4051 return (FALSE);
4052 }
4053
4054 sha1_base = (const unsigned char *) key;
4055 sha1_size = sizeof (mptcp_key_t);
4056 SHA1Init(&sha1ctxt);
4057 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4058 SHA1Final(sha_digest, &sha1ctxt);
4059 return (TRUE);
4060}
4061
4062void
4063mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4064 u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
4065{
4066 SHA1_CTX sha1ctxt;
4067 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4068 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4069 u_int32_t data[2];
4070 int i;
4071
4072 bzero(digest, digest_len);
4073
4074 /* Set up the Key for HMAC */
4075 key_ipad[0] = key1;
4076 key_ipad[1] = key2;
4077
4078 key_opad[0] = key1;
4079 key_opad[1] = key2;
4080
4081 /* Set up the message for HMAC */
4082 data[0] = rand1;
4083 data[1] = rand2;
4084
4085 /* Key is 512 block length, so no need to compute hash */
4086
4087 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4088
4089 for (i = 0; i < 8; i++) {
4090 key_ipad[i] ^= 0x3636363636363636;
4091 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4092 }
4093
4094 /* Perform inner SHA1 */
4095 SHA1Init(&sha1ctxt);
4096 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4097 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4098 SHA1Final(digest, &sha1ctxt);
4099
4100 /* Perform outer SHA1 */
4101 SHA1Init(&sha1ctxt);
4102 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4103 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4104 SHA1Final(digest, &sha1ctxt);
4105}
4106
4107/*
4108 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4109 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4110 */
/*
 * Compute this side's MP_JOIN HMAC for the subflow with address id aid:
 * MAC(Key=(local||remote), Msg=(lrand||rrand)). Writes SHA1_RESULTLEN
 * bytes to digest; silently does nothing for any other digest_len.
 * Snapshots the keys under the MPT lock, then computes unlocked.
 */
void
mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
    int digest_len)
{
	uint32_t lrand, rrand;
	mptcp_key_t localkey, remotekey;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	if (digest_len != SHA1_RESULTLEN)
		return;

	lrand = rrand = 0;
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
	MPT_LOCK_SPIN(mp_tp);
	localkey = *mp_tp->mpt_localkey;
	remotekey = mp_tp->mpt_remotekey;
	MPT_UNLOCK(mp_tp);
	mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
	    digest_len);
}
4131
4132u_int64_t
4133mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
4134{
4135 u_char digest[SHA1_RESULTLEN];
4136 u_int64_t trunced_digest;
4137
4138 mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
4139 bcopy(digest, &trunced_digest, 8);
4140 return (trunced_digest);
4141}
4142
4143/*
4144 * Authentication data generation
4145 */
4146int
4147mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4148 int token_len)
4149{
4150 VERIFY(token_len == sizeof (u_int32_t));
4151 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4152
4153 /* Most significant 32 bits of the SHA1 hash */
4154 bcopy(sha_digest, token, sizeof (u_int32_t));
4155 return (TRUE);
4156}
4157
4158int
4159mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4160 int idsn_len)
4161{
4162 VERIFY(idsn_len == sizeof (u_int64_t));
4163 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4164
4165 /*
4166 * Least significant 64 bits of the SHA1 hash
4167 */
4168
4169 idsn[7] = sha_digest[12];
4170 idsn[6] = sha_digest[13];
4171 idsn[5] = sha_digest[14];
4172 idsn[4] = sha_digest[15];
4173 idsn[3] = sha_digest[16];
4174 idsn[2] = sha_digest[17];
4175 idsn[1] = sha_digest[18];
4176 idsn[0] = sha_digest[19];
4177 return (TRUE);
4178}
4179
4180static int
4181mptcp_init_authparms(struct mptcb *mp_tp)
4182{
4183 caddr_t local_digest = NULL;
4184 char remote_digest[MPTCP_SHA1_RESULTLEN];
4185 MPT_LOCK_ASSERT_HELD(mp_tp);
4186
4187 /* Only Version 0 is supported for auth purposes */
4188 if (mp_tp->mpt_version != MP_DRAFT_VERSION_12)
4189 return (-1);
4190
4191 /* Setup local and remote tokens and Initial DSNs */
4192 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
4193 mptcp_generate_token(local_digest, SHA1_RESULTLEN,
4194 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4195 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
4196 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4197
4198 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
4199 SHA1_RESULTLEN)) {
4200 mptcplog((LOG_ERR, "MPTCP ERROR %s: unexpected failure",
4201 __func__));
4202 return (-1);
4203 }
4204 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
4205 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_localtoken));
4206 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4207 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
4208 return (0);
4209}
4210
/*
 * Seed the data-level sequence variables from the derived IDSNs.
 * Caller must hold the MPT lock.
 */
static void
mptcp_init_statevars(struct mptcb *mp_tp)
{
	MPT_LOCK_ASSERT_HELD(mp_tp);

	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
}
4222
/*
 * Set per-connection MPTCP properties at establishment time: protocol
 * version, DSS checksum (per sysctl), receive window, and GC timeout.
 */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* There is only Version 0 at this time */
	mp_tp->mpt_version = MP_DRAFT_VERSION_12;

	/* Set DSS checksum flag */
	if (mptcp_dss_csum)
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;

	/* Set up receive window */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
4239
4240/*
4241 * Helper Functions
4242 */
4243mptcp_token_t
4244mptcp_get_localtoken(void* mptcb_arg)
4245{
4246 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4247 return (mp_tp->mpt_localtoken);
4248}
4249
4250mptcp_token_t
4251mptcp_get_remotetoken(void* mptcb_arg)
4252{
4253 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4254 return (mp_tp->mpt_remotetoken);
4255}
4256
4257u_int64_t
4258mptcp_get_localkey(void* mptcb_arg)
4259{
4260 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4261 if (mp_tp->mpt_localkey != NULL)
4262 return (*mp_tp->mpt_localkey);
4263 else
4264 return (0);
4265}
4266
4267u_int64_t
4268mptcp_get_remotekey(void* mptcb_arg)
4269{
4270 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4271 return (mp_tp->mpt_remotekey);
4272}
4273
4274void
4275mptcp_send_dfin(struct socket *so)
4276{
4277 struct tcpcb *tp = NULL;
4278 struct inpcb *inp = NULL;
4279
4280 inp = sotoinpcb(so);
4281 if (!inp)
4282 return;
4283
4284 tp = intotcpcb(inp);
4285 if (!tp)
4286 return;
4287
4288 if (!(tp->t_mpflags & TMPF_RESET))
4289 tp->t_mpflags |= TMPF_SEND_DFIN;
4290}
4291
4292/*
4293 * Data Sequence Mapping routines
4294 */
/*
 * Stamp every mbuf in the chain with its data-level sequence number
 * (DSN) and length, advancing mpt_sndmax as the mapping is assigned.
 * Writing data before the connection is established is a logic error
 * and panics. Takes the MPT lock internally.
 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL)
		return;

	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		MPT_UNLOCK(mp_tp);
		panic("%s: data write before establishment.",
		    __func__);
		return;
	}

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		/* PKTF_MPSO: mapping was stamped at the MP socket layer */
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		m->m_pkthdr.mp_rlen = m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
	MPT_UNLOCK(mp_tp);
}
4322
/*
 * Adjust the DSN mappings on an mbuf chain before `len` bytes are
 * dropped from the front of the send sockbuf (i.e. acked data): fully
 * consumed mappings are zeroed and skipped; the mapping containing the
 * cut point is advanced in place. Every mbuf must carry a mapping.
 */
void
mptcp_preproc_sbdrop(struct mbuf *m, unsigned int len)
{
	u_int32_t sub_len = 0;

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);

		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
			sub_len = m->m_pkthdr.mp_rlen;

			if (sub_len < len) {
				/* This mapping is consumed entirely */
				m->m_pkthdr.mp_dsn += sub_len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					/* only subflow-level data has rseq */
					m->m_pkthdr.mp_rseq += sub_len;
				}
				m->m_pkthdr.mp_rlen = 0;
				len -= sub_len;
			} else {
				/* sub_len >= len: partial drop, then done */
				m->m_pkthdr.mp_dsn += len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					m->m_pkthdr.mp_rseq += len;
				}
				mptcplog3((LOG_INFO,
				    "%s: %llu %u %d %d\n", __func__,
				    m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
				    m->m_pkthdr.mp_rlen, len));
				m->m_pkthdr.mp_rlen -= len;
				return;
			}
		} else {
			panic("%s: MPTCP tag not set", __func__);
			/* NOTREACHED */
		}
		m = m->m_next;
	}
}
4361
4362/* Obtain the DSN mapping stored in the mbuf */
4363void
4364mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4365 u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4366{
4367 u_int64_t dsn64;
4368
4369 mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4370 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4371 *dsn64p = dsn64;
4372}
4373
/*
 * Given a byte offset into the subflow send buffer, return the DSN,
 * relative subflow sequence, and the length of contiguous DSN-mapped
 * data starting there (capped at datalen and at UINT16_MAX, the DSS
 * length field width). The subflow sequence space is contiguous even
 * when the DSN space is not, so the walk is driven by mp_rlen.
 * Outputs are untouched when the buffer is empty or off < 0.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
    u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
{
	struct mbuf *m = so->so_snd.sb_mb;
	struct mbuf *mnext = NULL;
	uint32_t runlen = 0;
	u_int64_t dsn64;
	uint32_t contig_len = 0;

	if (m == NULL)
		return;

	if (off < 0)
		return;
	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);

		if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
			off -= m->m_pkthdr.mp_rlen;
			m = m->m_next;
		} else {
			break;
		}
	}

	if (m == NULL) {
		/* offset beyond all mappings: caller passed garbage */
		panic("%s: bad offset", __func__);
		/* NOTREACHED */
	}

	dsn64 = m->m_pkthdr.mp_dsn + off;
	*dsn = dsn64;
	*relseq = m->m_pkthdr.mp_rseq + off;

	/*
	 * Now find the last contiguous byte and its length from
	 * start.
	 */
	runlen = m->m_pkthdr.mp_rlen - off;
	contig_len = runlen;

	/* If datalen does not span multiple mbufs, return */
	if (datalen <= runlen) {
		*data_len = min(datalen, UINT16_MAX);
		return;
	}

	mnext = m->m_next;
	while (datalen > runlen) {
		if (mnext == NULL) {
			panic("%s: bad datalen = %d, %d %d", __func__, datalen,
			    runlen, off);
			/* NOTREACHED */
		}
		VERIFY(mnext->m_flags & M_PKTHDR);
		VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);

		/*
		 * case A. contiguous DSN stream
		 * case B. discontiguous DSN stream
		 */
		if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
			/* case A */
			runlen += mnext->m_pkthdr.mp_rlen;
			contig_len += mnext->m_pkthdr.mp_rlen;
			mptcplog3((LOG_INFO, "%s: contig \n",
			    __func__));
		} else {
			/* case B: stop at the DSN discontinuity */
			mptcplog((LOG_INFO,
			    "%s: discontig datalen %d contig_len %d cc %d \n",
			    __func__, datalen, contig_len, so->so_snd.sb_cc));
			break;
		}
		mnext = mnext->m_next;
	}
	datalen = min(datalen, UINT16_MAX);
	*data_len = min(datalen, contig_len);
	mptcplog3((LOG_INFO, "%s: %llu %u %d %d \n", __func__,
	    *dsn, *relseq, *data_len, off));
}
4464
4465/*
4466 * MPTCP's notion of the next insequence Data Sequence number is adjusted
4467 * here. It must be called from mptcp_adj_rmap() which is called only after
4468 * reassembly of out of order data. The rcvnxt variable must
4469 * be updated only when atleast some insequence new data is received.
4470 */
/*
 * Advance the data-level rcvnxt to the end of this mbuf's mapping, but
 * only when the mapping overlaps or abuts the current rcvnxt (i.e. the
 * data is in-sequence at the data level). See block comment above.
 */
static void
mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
{
	struct mptcb *mp_tp = tptomptp(tp);

	if (mp_tp == NULL)
		return;
	MPT_LOCK(mp_tp);
	if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
	    (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
	    m->m_pkthdr.mp_rlen)))) {
		mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
	}
	MPT_UNLOCK(mp_tp);
}
4486
4487/*
4488 * Note that this is called only from tcp_input() which may trim data
4489 * after the dsn mapping is inserted into the mbuf. When it trims data
4490 * tcp_input calls m_adj() which does not remove the m_pkthdr even if the
4491 * m_len becomes 0 as a result of trimming the mbuf. The dsn map insertion
4492 * cannot be delayed after trim, because data can be in the reassembly
4493 * queue for a while and the DSN option info in tp will be overwritten for
4494 * every new packet received.
4495 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4496 * with mptcp_adj_rmap()
4497 */
/*
 * Copy the DSS mapping most recently parsed into tp->t_rcv_map onto the
 * inbound mbuf and mark it PKTF_MPTCP. Consumes the TMPF_EMBED_DSN flag
 * and requests a data-level ACK. See block comment above for why this
 * must happen before tcp_input() trims the mbuf.
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
{
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	}
}
4513
fe8ab488 4514int
39236c6e
A
4515mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4516{
4517 u_int64_t dsn;
4518 u_int32_t sseq, datalen;
4519 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4520 u_int32_t old_rcvnxt = 0;
4521
4522 if (m_pktlen(m) == 0)
fe8ab488 4523 return 0;
39236c6e
A
4524
4525 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4526 VERIFY(m->m_flags & M_PKTHDR);
4527
4528 dsn = m->m_pkthdr.mp_dsn;
4529 sseq = m->m_pkthdr.mp_rseq + tp->irs;
4530 datalen = m->m_pkthdr.mp_rlen;
4531 } else {
4532 /* data arrived without an DSS option mapping */
fe8ab488
A
4533
4534 /* initial subflow can fallback right after SYN handshake */
39236c6e 4535 mptcp_notify_mpfail(so);
fe8ab488 4536 return 0;
39236c6e
A
4537 }
4538
4539 /* In the common case, data is in window and in sequence */
4540 if (m->m_pkthdr.len == (int)datalen) {
4541 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4542 return 0;
39236c6e
A
4543 }
4544
4545 if (m->m_pkthdr.len > (int)datalen) {
4546 panic("%s: mbuf len = %d expected = %d", __func__,
4547 m->m_pkthdr.len, datalen);
4548 }
4549
4550 old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4551 if (SEQ_GT(old_rcvnxt, sseq)) {
4552 /* data trimmed from the left */
4553 int off = old_rcvnxt - sseq;
4554 m->m_pkthdr.mp_dsn += off;
4555 m->m_pkthdr.mp_rseq += off;
fe8ab488 4556 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
39236c6e
A
4557 } else if (old_rcvnxt == sseq) {
4558 /*
4559 * Data was trimmed from the right
4560 */
4561 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4562 } else {
fe8ab488
A
4563 /* handle gracefully with reass or fallback */
4564 mptcp_notify_mpfail(so);
4565 m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP;
4566 m_freem(m);
4567 return -1;
39236c6e
A
4568 }
4569 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4570 return 0;
39236c6e
A
4571}
4572
4573/*
4574 * Following routines help with failure detection and failover of data
4575 * transfer from one subflow to another.
4576 */
/*
 * React to a transmission failure on a subflow: flag it with
 * SOF_MP_TRYFAILOVER (once) and raise an MPFAILOVER socket event so the
 * MPTCP session layer can move traffic to another subflow.
 */
void
mptcp_act_on_txfail(struct socket *so)
{
	struct tcpcb *tp = NULL;
	struct inpcb *inp = sotoinpcb(so);

	if (inp == NULL)
		return;

	tp = intotcpcb(inp);
	if (tp == NULL)
		return;

	if (tp->t_state != TCPS_ESTABLISHED)
		mptcplog((LOG_INFO, "%s: state = %d \n", __func__,
		    tp->t_state));

	mptcplog((LOG_INFO, "%s: Failover = %d \n", __func__,
	    (so->so_flags & SOF_MP_TRYFAILOVER) ? 1 : 0));

	/* failover already in progress; don't raise the event twice */
	if (so->so_flags & SOF_MP_TRYFAILOVER) {
		return;
	}

	so->so_flags |= SOF_MP_TRYFAILOVER;
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
}
4604
4605/*
4606 * Support for MP_FAIL option
4607 */
/*
 * MP_FAIL support: translate a failing data-level sequence number into
 * the corresponding subflow sequence number by scanning the send buffer
 * mappings. Returns 0 with *tcp_seq set, or -1 when no mapping covers
 * dsn_fail (e.g. after a fallback drained the buffer).
 */
int
mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	u_int64_t dsn;
	int off = 0;
	u_int32_t datalen;

	if (m == NULL)
		return (-1);

	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			off = dsn_fail - dsn;
			*tcp_seq = m->m_pkthdr.mp_rseq + off;
			mptcplog((LOG_INFO, "%s: %llu %llu \n",
			    __func__, dsn, dsn_fail));
			return (0);
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */

	mptcplog((LOG_ERR, "%s: %llu not found \n", __func__, dsn_fail));
	return (-1);
}
4644
4645/*
4646 * Support for sending contiguous MPTCP bytes in subflow
fe8ab488 4647 * Also for preventing sending data with ACK in 3-way handshake
39236c6e
A
4648 */
/*
 * Clamp a proposed send length to the amount of contiguously DSN-mapped
 * data at `off`, so one DSS option can describe the whole transmission.
 * Also suppresses data on the third ACK of an MP_JOIN handshake (Fast
 * Join): the first transmission after the handshake is forced to length
 * 0 and TMPF_FASTJOINBY2_SEND is set so data flows on the next call.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
{
	u_int64_t mdss_dsn = 0;
	u_int32_t mdss_subflow_seq = 0;
	u_int16_t mdss_data_len = 0;

	if (len == 0)
		return (len);

	mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
	    &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);

	/*
	 * Special case handling for Fast Join. We want to send data right
	 * after ACK of the 3-way handshake, but not piggyback the data
	 * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
	 * mdss_data_len control this.
	 */
	struct tcpcb *tp = NULL;
	tp = intotcpcb(sotoinpcb(so));
	if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
	    (tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
	    (tp->t_mpflags & TMPF_SENT_JOIN) &&
	    (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
	    (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
		mdss_data_len = 0;
		tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
	}
	return (mdss_data_len);
}
4681
/*
 * Available space in the MPTCP (session-level) receive buffer, limited
 * by both byte count and mbuf accounting; never negative.
 * Caller must hold both the MPT and MPTE locks.
 */
int32_t
mptcp_sbspace(struct mptcb *mpt)
{
	struct sockbuf *sb;
	uint32_t rcvbuf;
	int32_t space;

	MPT_LOCK_ASSERT_HELD(mpt);
	MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);

	sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	rcvbuf = sb->sb_hiwat;
	space = ((int32_t)imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0)
		space = 0;
	/* XXX check if it's too small? */

	return (space);
}
4702
4703/*
4704 * Support Fallback to Regular TCP
4705 */
4706void
4707mptcp_notify_mpready(struct socket *so)
4708{
4709 struct tcpcb *tp = NULL;
4710
4711 if (so == NULL)
4712 return;
4713
4714 tp = intotcpcb(sotoinpcb(so));
4715
4716 if (tp == NULL)
4717 return;
4718
4719 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
4720 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4721 struct tcpcb *, tp);
4722
4723 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
4724 return;
4725
4726 if (tp->t_mpflags & TMPF_MPTCP_READY)
4727 return;
4728
4729 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
4730 tp->t_mpflags |= TMPF_MPTCP_READY;
4731
4732 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4733}
4734
4735void
4736mptcp_notify_mpfail(struct socket *so)
4737{
4738 struct tcpcb *tp = NULL;
4739
4740 if (so == NULL)
4741 return;
4742
4743 tp = intotcpcb(sotoinpcb(so));
4744
4745 if (tp == NULL)
4746 return;
4747
4748 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
4749 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4750 struct tcpcb *, tp);
4751
4752 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
4753 return;
4754
4755 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4756 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4757
4758 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4759}
4760
4761/*
4762 * Keepalive helper function
4763 */
4764boolean_t
4765mptcp_ok_to_keepalive(struct mptcb *mp_tp)
4766{
4767 boolean_t ret = 1;
4768 VERIFY(mp_tp != NULL);
4769 MPT_LOCK(mp_tp);
4770 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
4771 ret = 0;
4772 }
4773 MPT_UNLOCK(mp_tp);
4774 return (ret);
4775}
4776
4777/*
4778 * MPTCP t_maxseg adjustment function
4779 */
/*
 * Return the number of bytes to subtract from t_maxseg to leave room
 * for the most common MPTCP option (DSS+ack). Applied during option
 * processing on pre-established flows and, when mtudisc is set, during
 * MTU discovery on established MPTCP-true flows. Returns 0 when the
 * tcpcb has no MPTCP control block.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/*
 * NOTE: both branches add 2 on purpose — either for the DSS checksum
 * field or for 32-bit alignment padding + EOL.
 */
#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	MPT_LOCK(mp_tp); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
		mss_lower += 2; \
	else \
		/* adjust to 32-bit boundary + EOL */ \
		mss_lower += 2; \
	MPT_UNLOCK(mp_tp); \
}
	if (mp_tp == NULL)
		return (0);

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
		MPTCP_COMPUTE_LEN;
	}

	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (tp->t_mpflags & TMPF_SENT_JOIN)) {
		MPTCP_COMPUTE_LEN;
	}

	if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
		MPTCP_COMPUTE_LEN;
	}

	return (mss_lower);
}
4820
4821/*
4822 * Update the pid, upid, uuid of the subflow so, based on parent so
4823 */
/*
 * Propagate the owning process identity (pid/upid/uuid) from the MPTCP
 * session socket to a subflow socket, then refresh the subflow's
 * necp/socket policy. Caller must hold the MPTS lock.
 */
void
mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
{
	struct socket *subflow_so = mpts->mpts_socket;

	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(subflow_so, 0);
	if ((subflow_so->last_pid != parent_mpso->last_pid) ||
	    (subflow_so->last_upid != parent_mpso->last_upid)) {
		subflow_so->last_upid = parent_mpso->last_upid;
		subflow_so->last_pid = parent_mpso->last_pid;
		uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
	}
	so_update_policy(subflow_so);
	socket_unlock(subflow_so, 0);
}
4841
/*
 * Populate one mptcp_flow_t for the pcblist sysctl from a subflow's
 * socket: connection info plus source/destination endpoints (v4 or v6),
 * flags, and connection id. Caller must hold the socket lock.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else
#endif
	{
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
}
4874
/*
 * sysctl handler that dumps all MPTCP connections. For each PCB it
 * emits a conninfo_mptcp_t header followed by one mptcp_flow_t per
 * subflow. The struct embeds its first flow, so with flows the header
 * is written short by one flow and the flow array follows. Read-only;
 * a sizing probe (oldptr == NULL) returns an over-estimate.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t n, len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL)
		return (EPERM);

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	n = mtcbinfo.mppi_count;
	if (req->oldptr == USER_ADDR_NULL) {
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		/* size estimate with slack for connections created later */
		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n/8) * sizeof(mptcp_flow_t);
		return (0);
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		bzero(&mptcpci, sizeof(mptcpci));
		lck_mtx_lock(&mpp->mpp_lock);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);
		/* N.B. we don't take the mpt_lock just for the state. */
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
			if (flows == NULL) {
				lck_mtx_unlock(&mpp->mpp_lock);
				break;
			}
			/* header embeds one flow; count only the extras */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci));
		}
		if (error) {
			lck_mtx_unlock(&mpp->mpp_lock);
			FREE(flows, M_TEMP);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts);
			so = mpts->mpts_socket;
			socket_lock(so, 0);
			fill_mptcp_subflow(so, &flows[f], mpts);
			socket_unlock(so, 0);
			MPTS_UNLOCK(mpts);
			f++;
		}
		lck_mtx_unlock(&mpp->mpp_lock);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			FREE(flows, M_TEMP);
			if (error)
				break;
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return (error);
}
4955
/* sysctl net.inet.mptcp.pcblist: read-only dump served by mptcp_pcblist() */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
4959
4960/*
4961 * Check the health of the other subflows and do an mptcp_output if
4962 * there is no other active or functional subflow at the time of
4963 * call of this function.
4964 */
/*
 * If no other subflow is active and usable at the time a subflow event
 * fires, kick mptcp_output() so pending data isn't stranded. Drops and
 * re-takes the MPTS lock on to_mpts around the check (caller holds it
 * on entry and expects it held on return). MPTE lock must be held.
 */
static void
mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
{
	struct mptsub *from_mpts = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);

	MPTS_UNLOCK(to_mpts);

	from_mpts = mpte->mpte_active_sub;

	if (from_mpts == NULL)
		goto output_needed;

	MPTS_LOCK(from_mpts);

	/* the active subflow is going away; push output now */
	if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		MPTS_UNLOCK(from_mpts);
		goto output_needed;
	}

	MPTS_UNLOCK(from_mpts);
	MPTS_LOCK(to_mpts);
	return;

output_needed:
	mptcp_output(mpte);
	MPTS_LOCK(to_mpts);
}
4995
4996
4997/*
4998 * When WiFi signal starts fading, there's more loss and RTT spikes.
4999 * Check if there has been a large spike by comparing against
5000 * a tolerable RTT spike threshold.
5001 */
5002boolean_t
5003mptcp_no_rto_spike(struct socket *so)
5004{
5005 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5006 int32_t spike = 0;
5007
5008 if (tp->t_rxtcur > mptcp_rto_spike_thresh) {
5009 spike = tp->t_rxtcur - mptcp_rto_spike_thresh;
5010
5011 mptcplog2((LOG_INFO, "%s: spike = %d rto = %d",
5012 "best = %d cur = %d\n", __func__, spike,
5013 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
5014 tp->t_rttcur));
5015
5016 }
5017
5018 if (spike > 0 ) {
5019 return (FALSE);
5020 } else {
5021 return (TRUE);
5022 }
5023}
5024
5025/*
5026 * Set notsent lowat mark on the MPTCB
5027 */
5028int
5029mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5030{
5031 struct mptcb *mp_tp = NULL;
5032 int error = 0;
5033
5034 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5035 mp_tp = mpte->mpte_mptcb;
5036
5037 if (mp_tp)
5038 mp_tp->mpt_notsent_lowat = optval;
5039 else
5040 error = EINVAL;
5041
5042 return error;
5043}
5044
5045u_int32_t
5046mptcp_get_notsent_lowat(struct mptses *mpte)
5047{
5048 struct mptcb *mp_tp = NULL;
5049
5050 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5051 mp_tp = mpte->mpte_mptcb;
5052
5053 if (mp_tp)
5054 return mp_tp->mpt_notsent_lowat;
5055 else
5056 return 0;
5057}
5058
/*
 * NOTSENT_LOWAT check for MPTCP: returns 1 when the amount of unsent
 * data is at or below the configured low-water mark (i.e. the client
 * should be woken for more data), else 0. When Nagle is enabled on the
 * active subflow, also wakes the client once less than one maxseg of
 * unsent data remains.
 */
int
mptcp_notsent_lowat_check(struct socket *so) {
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = sotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return (0);
	}

	mpte = mptompte(mpp);
	mp_tp = mpte->mpte_mptcb;

	MPT_LOCK(mp_tp);
	notsent = so->so_snd.sb_cc;

	/* sb_cc minus in-flight (sndnxt - snduna) = truly unsent bytes */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		mptcplog3((LOG_INFO, "%s: lowat %d notsent %d actual %d \n",
		    __func__, mp_tp->mpt_notsent_lowat, notsent,
		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)));
		MPT_UNLOCK(mp_tp);
		return (1);
	}
	MPT_UNLOCK(mp_tp);

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		MPTS_LOCK(mpts);
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			socket_lock(subf_so, 0);
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			mptcplog3((LOG_INFO, "%s: lowat %d notsent %d"
			    " nodelay false \n",
			    __func__, mp_tp->mpt_notsent_lowat, notsent));
			socket_unlock(subf_so, 0);
			MPTS_UNLOCK(mpts);
			/* only the (single) active subflow is consulted */
			return (retval);
		}
		MPTS_UNLOCK(mpts);
	}
	return (0);
}
5120