]> git.saurik.com Git - apple/xnu.git/blame_incremental - bsd/netinet/mptcp_subr.c
xnu-2422.1.72.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
50#include <netinet/in.h>
51#include <netinet/in_pcb.h>
52#include <netinet/in_var.h>
53#include <netinet/tcp.h>
54#include <netinet/tcp_fsm.h>
55#include <netinet/tcp_seq.h>
56#include <netinet/tcp_var.h>
57#include <netinet/mptcp_var.h>
58#include <netinet/mptcp.h>
59#include <netinet/mptcp_seq.h>
60#include <netinet/mptcp_timer.h>
61#include <libkern/crypto/sha1.h>
62#if INET6
63#include <netinet6/in6_pcb.h>
64#include <netinet6/ip6protosw.h>
65#endif /* INET6 */
66#include <dev/random/randomdev.h>
67
68/*
69 * Notes on MPTCP implementation.
70 *
71 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
72 * communication domain. The structure mtcbinfo describes the MPTCP instance
73 * of a Multipath protocol in that domain. It is used to keep track of all
74 * MPTCP PCB instances in the system, and is protected by the global lock
75 * mppi_lock.
76 *
77 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
78 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
79 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
80 * allocated from the same memory block, and each structure has a pointer
81 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
82 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
83 * PCB (mppcb) as well as the MPTCP Session (mptses).
84 *
85 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
86 * in particular, the list of subflows as well as the MPTCP thread.
87 *
88 * A functioning MPTCP Session consists of one or more subflow sockets. Each
89 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
90 * represented by the mptsub structure. Because each subflow requires access
91 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
92 * subflow. This gets decremented prior to the subflow's destruction. The
93 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
94 *
95 * To handle events (read, write, control) from the subflows, an MPTCP thread
96 * is created; currently, there is one thread per MPTCP Session. In order to
97 * prevent the MPTCP socket from being destroyed while being accessed by the
98 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
99 * which will be decremented prior to the thread's termination. The thread
100 * lock (mpte_thread_lock) is used to synchronize its signalling.
101 *
102 * Lock ordering is defined as follows:
103 *
104 * mtcbinfo (mppi_lock)
105 * mp_so (mpp_lock)
106 * mpts (mpts_lock)
107 * so (inpcb_mtx)
108 * mptcb (mpt_lock)
109 *
110 * It is not a requirement that all of the above locks need to be acquired
 111 * in succession, but the correct lock ordering must be followed when more
 112 * than one lock needs to be held. The MPTCP thread lock is not constrained
 113 * by this arrangement, because none of the other locks is ever acquired
 114 * while holding mpte_thread_lock; therefore it may be acquired at any
 115 * moment to signal the thread.
116 *
117 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
118 * work is done by the MPTCP garbage collector which is invoked on demand by
119 * the PF_MULTIPATH garbage collector. This process will take place once all
120 * of the subflows have been destroyed, and the MPTCP thread be instructed to
121 * self-terminate.
122 */
123
124static void mptcp_sesdestroy(struct mptses *);
125static void mptcp_thread_signal_locked(struct mptses *);
126static void mptcp_thread_terminate_signal(struct mptses *);
127static void mptcp_thread_dowork(struct mptses *);
128static void mptcp_thread_func(void *, wait_result_t);
129static void mptcp_thread_destroy(struct mptses *);
130static void mptcp_key_pool_init(void);
131static void mptcp_attach_to_subf(struct socket *, struct mptcb *, connid_t);
132static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
133static void mptcp_conn_properties(struct mptcb *);
134static void mptcp_init_statevars(struct mptcb *);
135
136static uint32_t mptcp_gc(struct mppcbinfo *);
137static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
138 int, struct proc *, struct socket **);
139static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
140static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
141static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
142 struct uio *, struct mbuf **, struct mbuf **, int *);
143static void mptcp_subflow_rupcall(struct socket *, void *, int);
144static void mptcp_subflow_input(struct mptses *, struct mptsub *);
145static void mptcp_subflow_wupcall(struct socket *, void *, int);
146static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
147static void mptcp_update_last_owner(struct mptsub *, struct socket *);
148
149/*
150 * Possible return values for subflow event handlers. Note that success
151 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
152 * indicate errors or actions which require immediate attention; they will
153 * prevent the rest of the handlers from processing their respective events
154 * until the next round of events processing.
155 */
156typedef enum {
157 MPTS_EVRET_DELETE = 1, /* delete this subflow */
158 MPTS_EVRET_OK = 2, /* OK */
159 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
160 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
161 MPTS_EVRET_OK_UPDATE = 5, /* OK with conninfo update */
162} ev_ret_t;
163
164static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *);
165static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *);
166static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *);
167static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *);
168static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *);
169static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *);
170static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *);
171static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *);
172static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *);
173static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *);
174static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *);
175static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *);
176static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *);
177static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *);
178static const char *mptcp_evret2str(ev_ret_t);
179
180static mptcp_key_t *mptcp_reserve_key(void);
181static int mptcp_do_sha1(mptcp_key_t *, char *, int);
182static int mptcp_init_authparms(struct mptcb *);
183static int mptcp_delete_ok(struct mptses *mpte, struct mptsub *mpts);
184
185static unsigned int mptsub_zone_size; /* size of mptsub */
186static struct zone *mptsub_zone; /* zone for mptsub */
187
188static unsigned int mptopt_zone_size; /* size of mptopt */
189static struct zone *mptopt_zone; /* zone for mptopt */
190
191static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
192static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
193
194struct mppcbinfo mtcbinfo;
195
196static struct mptcp_keys_pool_head mptcp_keys_pool;
197
198#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
199#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
200
201SYSCTL_DECL(_net_inet);
202
203SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
204
205uint32_t mptcp_verbose = 0; /* more noise if greater than 1 */
206SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED,
207 &mptcp_verbose, 0, "MPTCP verbosity level");
208
209SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
210 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
211
212/*
213 * Since there is one kernel thread per mptcp socket, imposing an artificial
214 * limit on number of allowed mptcp sockets.
215 */
216uint32_t mptcp_socket_limit = MPPCB_LIMIT;
217SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
218 &mptcp_socket_limit, 0, "MPTCP socket limit");
219
220static struct protosw mptcp_subflow_protosw;
221static struct pr_usrreqs mptcp_subflow_usrreqs;
222#if INET6
223static struct ip6protosw mptcp_subflow_protosw6;
224static struct pr_usrreqs mptcp_subflow_usrreqs6;
225#endif /* INET6 */
226
/*
 * Protocol pr_init callback.
 *
 * Called when the PF_MULTIPATH domain attaches this protocol.  Clones
 * the PF_INET (and PF_INET6) TCP protosw/usrreqs and overrides the
 * receive path with mptcp_subflow_soreceive, initializes the global
 * MPTCP PCB info (mtcbinfo) with its zone, lock, GC and timer hooks,
 * and creates the zones for subflow, option and auth-entry structures.
 * Guarded so the expensive setup runs only once.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	/* copy the real TCP protosw/usrreqs, then override the recv path */
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	/* the copy must not look linked into the global protosw list */
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	/* same treatment for the IPv6 TCP protosw */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	/* global MPTCP PCB info: zone, mutex, GC and timer callbacks */
	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);
	mtcbinfo.mppi_gc = mptcp_gc;

	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	/* zone for subflow (struct mptsub) structures */
	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	/* zone for recorded socket options (struct mptopt) */
	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	/* zone for subflow authentication entries */
	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	/* Set up a list of unique keys */
	mptcp_key_pool_init();

}
350
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The MPTCP session (mptses) and the MPTCP PCB (mptcb) live in the same
 * memory block as the Multipath PCB (struct mpp_mtp), so no allocation
 * happens here; this routine initializes both extensions, links them to
 * the Multipath PCB, and starts the per-session kernel thread.
 * Returns the session on success, or NULL if the thread could not be
 * started.
 */
struct mptses *
mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int error = 0;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* both extensions are adjacent to mpp inside struct mpp_mtp */
	mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = ASSOCID_ANY;
	mpte->mpte_connid_last = CONNID_ANY;

	lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);

	/*
	 * XXX: adi@apple.com
	 *
	 * This can be rather expensive if we have lots of MPTCP sockets,
	 * but we need a kernel thread for this model to work. Perhaps we
	 * could amortize the costs by having one worker thread per a group
	 * of MPTCP sockets.
	 */
	if (kernel_thread_start(mptcp_thread_func, mpte,
	    &mpte->mpte_thread) != KERN_SUCCESS) {
		error = ENOBUFS;
		goto out;
	}
	mp_so->so_usecount++;		/* for thread */

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);
	mp_tp->mpt_mpte = mpte;

out:
	/* undo the thread lock init if thread creation failed */
	if (error != 0)
		lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
	DTRACE_MPTCP5(session__create, struct socket *, mp_so,
	    struct sockbuf *, &mp_so->so_rcv,
	    struct sockbuf *, &mp_so->so_snd,
	    struct mppcb *, mpp, int, error);

	return ((error != 0) ? NULL : mpte);
}
415
/*
 * Destroy an MPTCP session.
 *
 * Called with the MP socket lock held, after every subflow has been
 * destroyed.  Flushes any recorded socket options and tears down the
 * session's and PCB's locks; the memory itself is part of the mpp_mtp
 * block and is released together with the Multipath PCB.
 */
static void
mptcp_sesdestroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	/* release any socket options still recorded on the session */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	lck_mtx_destroy(&mpte->mpte_thread_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	/*
	 * MPTCP Protocol Control Block section
	 */
	lck_mtx_destroy(&mp_tp->mpt_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}
447
448/*
449 * Allocate an MPTCP socket option structure.
450 */
451struct mptopt *
452mptcp_sopt_alloc(int how)
453{
454 struct mptopt *mpo;
455
456 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
457 zalloc_noblock(mptopt_zone);
458 if (mpo != NULL) {
459 bzero(mpo, mptopt_zone_size);
460 }
461
462 return (mpo);
463}
464
465/*
466 * Free an MPTCP socket option structure.
467 */
468void
469mptcp_sopt_free(struct mptopt *mpo)
470{
471 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
472
473 zfree(mptopt_zone, mpo);
474}
475
476/*
477 * Add a socket option to the MPTCP socket option list.
478 */
479void
480mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
481{
482 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
483 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
484 mpo->mpo_flags |= MPOF_ATTACHED;
485 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
486}
487
488/*
489 * Remove a socket option from the MPTCP socket option list.
490 */
491void
492mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
493{
494 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
495 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
496 mpo->mpo_flags &= ~MPOF_ATTACHED;
497 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
498}
499
500/*
501 * Search for an existing <sopt_level,sopt_name> socket option.
502 */
503struct mptopt *
504mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
505{
506 struct mptopt *mpo;
507
508 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
509
510 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
511 if (mpo->mpo_level == sopt->sopt_level &&
512 mpo->mpo_name == sopt->sopt_name)
513 break;
514 }
515 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
516
517 return (mpo);
518}
519
520/*
521 * Flushes all recorded socket options from an MP socket.
522 */
523void
524mptcp_flush_sopts(struct mptses *mpte)
525{
526 struct mptopt *mpo, *tmpo;
527
528 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
529
530 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
531 mptcp_sopt_remove(mpte, mpo);
532 mptcp_sopt_free(mpo);
533 }
534 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
535}
536
537/*
538 * Allocate a MPTCP subflow structure.
539 */
540struct mptsub *
541mptcp_subflow_alloc(int how)
542{
543 struct mptsub *mpts;
544
545 mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
546 zalloc_noblock(mptsub_zone);
547 if (mpts != NULL) {
548 bzero(mpts, mptsub_zone_size);
549 lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
550 mtcbinfo.mppi_lock_attr);
551 }
552
553 return (mpts);
554}
555
/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released.  This implies that the subflow has been deleted.
 *
 * Entered with mpts_lock held; the lock is dropped and then destroyed
 * here, before the structure is returned to its zone.
 */
void
mptcp_subflow_free(struct mptsub *mpts)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* caller must have dropped all refs and detached the subflow */
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* release the source/destination address lists, if any */
	if (mpts->mpts_src_sl != NULL) {
		sockaddrlist_free(mpts->mpts_src_sl);
		mpts->mpts_src_sl = NULL;
	}
	if (mpts->mpts_dst_sl != NULL) {
		sockaddrlist_free(mpts->mpts_dst_sl);
		mpts->mpts_dst_sl = NULL;
	}
	/* the mutex must be unlocked before it can be destroyed */
	MPTS_UNLOCK(mpts);
	lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);

	zfree(mptsub_zone, mpts);
}
583
/*
 * Create an MPTCP subflow socket.
 *
 * Creates a TCP socket in the requested domain, marks it as a multipath
 * subflow (SOF_MP_SUBFLOW, non-blocking, no file-descriptor reference),
 * applies the socket options MPTCP requires plus any eligible options
 * previously recorded on the MP socket, and finally swaps in the MPTCP
 * subflow protocol switch so the subflow's receive path goes through
 * mptcp_subflow_soreceive.  Returns 0 with *so set on success, or an
 * errno value on failure.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct proc *p, struct socket **so)
{
	struct mptopt smpo, *mpo, *tmpo;
	struct socket *mp_so;
	int error;

	*so = NULL;
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */
	if ((error = socreate_internal(dom, so, SOCK_STREAM,
	    IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
		mptcplog((LOG_ERR, "MPTCP ERROR %s: mp_so 0x%llx unable to "
		    "create subflow socket error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error));
		return (error);
	}

	socket_lock(*so, 0);
	VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
	VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
	    (SS_NBIO|SS_NOFDREF));

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* reusable option template; mpo_name varies per setsockopt below */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* enable keepalive */
	smpo.mpo_name = SO_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/*
	 * Limit the receive socket buffer size to 64k.
	 *
	 * We need to take into consideration the window scale option
	 * which could be negotiated in one subflow but disabled in
	 * another subflow.
	 * XXX This can be improved in the future.
	 */
	smpo.mpo_name = SO_RCVBUF;
	smpo.mpo_intval = MPTCP_RWIN_MAX;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* N.B.: set by sosetopt */
	VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
	/* Prevent automatic socket buffer sizing. */
	(*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;

	/* set the TCP-level keepalive interval for this subflow */
	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = mptcp_subflow_keeptime;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
			continue;

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE))
			continue;

		/* interim records are discarded if the replay fails */
		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
			char buf[32];
			mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d "
			    "interim record removed\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
			    buf, sizeof (buf)), mpo->mpo_intval));
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function. We will undo
	 * this when the socket is peeled off or closed.
	 */
	mpts->mpts_oprotosw = (*so)->so_proto;
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
#if INET6
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

out:
	socket_unlock(*so, 0);

	DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
	    struct mptsub *, mpts, int, dom, int, error);

	return (error);
}
728
/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 *
 * Restores the protocol switch that mptcp_subflow_socreate() replaced,
 * detaches the socket pointer from the subflow, and closes the socket.
 */
static int
mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(so, 0);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	/* restore protocol-user requests */
	VERIFY(mpts->mpts_oprotosw != NULL);
	so->so_proto = mpts->mpts_oprotosw;
	socket_unlock(so, 0);

	mpts->mpts_socket = NULL;	/* may already be NULL */

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	return (soclose(so));
}
759
/*
 * Connect an MPTCP subflow socket.
 *
 * This may be called inline as part of adding a subflow, or asynchronously
 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
 * pending connect case, the subflow socket may have been bound to an interface
 * and/or a source IP address which may no longer be around by the time this
 * routine is called; in that case the connect attempt will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;
	int af, error;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* the subflow must be marked connecting but not yet connected */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
	    MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	so = mpts->mpts_socket;
	af = mpts->mpts_family;

	if (af == AF_INET || af == AF_INET6) {
		struct sockaddr_entry *dst_se;
		char dbuf[MAX_IPv6_STR_LEN];

		/* log the first (primary) destination address and port */
		dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
		VERIFY(dst_se != NULL);

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pended %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")));
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	socket_lock(so, 0);
	/* hook the subflow's TCP into the MPTCP PCB before connecting */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpts->mpts_connid);
	/* connect the subflow socket */
	error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
	    mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
	    mpte->mpte_associd, NULL, TCP_CONNREQF_MPTCP,
	    &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr));
	socket_unlock(so, 0);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);

	return (error);
}
821
822/*
823 * MPTCP subflow socket receive routine, derived from soreceive().
824 */
825static int
826mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
827 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
828{
829#pragma unused(uio)
830 int flags, error = 0;
831 struct proc *p = current_proc();
832 struct mbuf *m, **mp = mp0;
833 struct mbuf *nextrecord;
834
835 socket_lock(so, 1);
836 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
837
838#ifdef MORE_LOCKING_DEBUG
839 if (so->so_usecount == 1) {
840 panic("%s: so=%x no other reference on socket\n", __func__, so);
841 /* NOTREACHED */
842 }
843#endif
844 /*
845 * We return all that is there in the subflow's socket receive buffer
846 * to the MPTCP layer, so we require that the caller passes in the
847 * expected parameters.
848 */
849 if (mp == NULL || controlp != NULL) {
850 socket_unlock(so, 1);
851 return (EINVAL);
852 }
853 *mp = NULL;
854 if (psa != NULL)
855 *psa = NULL;
856 if (flagsp != NULL)
857 flags = *flagsp &~ MSG_EOR;
858 else
859 flags = 0;
860
861 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
862 socket_unlock(so, 1);
863 return (EOPNOTSUPP);
864 }
865 flags |= (MSG_DONTWAIT|MSG_NBIO);
866
867 /*
868 * If a recv attempt is made on a previously-accepted socket
869 * that has been marked as inactive (disconnected), reject
870 * the request.
871 */
872 if (so->so_flags & SOF_DEFUNCT) {
873 struct sockbuf *sb = &so->so_rcv;
874
875 error = ENOTCONN;
876 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
877 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
878 SOCK_DOM(so), SOCK_TYPE(so), error));
879 /*
880 * This socket should have been disconnected and flushed
881 * prior to being returned from sodefunct(); there should
882 * be no data on its receive list, so panic otherwise.
883 */
884 if (so->so_state & SS_DEFUNCT)
885 sb_empty_assert(sb, __func__);
886 socket_unlock(so, 1);
887 return (error);
888 }
889
890 /*
891 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
892 * and if so just return to the caller. This could happen when
893 * soreceive() is called by a socket upcall function during the
894 * time the socket is freed. The socket buffer would have been
895 * locked across the upcall, therefore we cannot put this thread
896 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
897 * we may livelock), because the lock on the socket buffer will
898 * only be released when the upcall routine returns to its caller.
899 * Because the socket has been officially closed, there can be
900 * no further read on it.
901 *
902 * A multipath subflow socket would have its SS_NOFDREF set by
903 * default, so check for SOF_MP_SUBFLOW socket flag; when the
904 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
905 */
906 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
907 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
908 socket_unlock(so, 1);
909 return (0);
910 }
911
912 /*
913 * For consistency with soreceive() semantics, we need to obey
914 * SB_LOCK in case some other code path has locked the buffer.
915 */
916 error = sblock(&so->so_rcv, 0);
917 if (error != 0) {
918 socket_unlock(so, 1);
919 return (error);
920 }
921
922 m = so->so_rcv.sb_mb;
923 if (m == NULL) {
924 /*
925 * Panic if we notice inconsistencies in the socket's
926 * receive list; both sb_mb and sb_cc should correctly
927 * reflect the contents of the list, otherwise we may
928 * end up with false positives during select() or poll()
929 * which could put the application in a bad state.
930 */
931 SB_MB_CHECK(&so->so_rcv);
932
933 if (so->so_error != 0) {
934 error = so->so_error;
935 so->so_error = 0;
936 goto release;
937 }
938
939 if (so->so_state & SS_CANTRCVMORE) {
940 goto release;
941 }
942
943 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
944 error = ENOTCONN;
945 goto release;
946 }
947
948 /*
949 * MSG_DONTWAIT is implicitly defined and this routine will
950 * never block, so return EWOULDBLOCK when there is nothing.
951 */
952 error = EWOULDBLOCK;
953 goto release;
954 }
955
956 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
957 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
958 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
959
960 while (m != NULL) {
961 nextrecord = m->m_nextpkt;
962 sbfree(&so->so_rcv, m);
963
964 if (mp != NULL) {
965 *mp = m;
966 mp = &m->m_next;
967 so->so_rcv.sb_mb = m = m->m_next;
968 *mp = NULL;
969 }
970
971 if (m != NULL) {
972 m->m_nextpkt = nextrecord;
973 if (nextrecord == NULL)
974 so->so_rcv.sb_lastrecord = m;
975 } else {
976 m = so->so_rcv.sb_mb = nextrecord;
977 SB_EMPTY_FIXUP(&so->so_rcv);
978 }
979 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
980 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
981 }
982
983 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
984 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
985 /* notify protocol that we drained all the data */
986 if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
987 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
988
989 if (flagsp != NULL)
990 *flagsp |= flags;
991
992release:
993 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
994 return (error);
995
996}
997
998
999/*
1000 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1001 * the work done earlier when the subflow socket was created.
1002 */
1003void
1004mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1005 struct socket *so)
1006{
1007 struct mptopt smpo;
1008 struct socket *mp_so;
1009 int p, c;
1010
1011 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1012 mp_so = mpte->mpte_mppcb->mpp_socket;
1013 MPTS_LOCK_ASSERT_HELD(mpts);
1014
1015 socket_lock(so, 0);
1016 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1017 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1018
1019 /* inherit MPTCP socket states */
1020 if (!(mp_so->so_state & SS_NBIO))
1021 so->so_state &= ~SS_NBIO;
1022
1023 /*
1024 * At this point, the socket is not yet closed, as there is at least
1025 * one outstanding usecount previously held by mpts_socket from
1026 * socreate(). Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1027 */
1028 so->so_flags &= ~SOF_MP_SUBFLOW;
1029 so->so_state &= ~SS_NOFDREF;
1030 so->so_state &= ~SOF_MPTCP_TRUE;
1031
1032 /* allow socket buffers to be compressed */
1033 so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1034 so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1035
1036 /*
1037 * Allow socket buffer auto sizing.
1038 *
1039 * This will increase the current 64k buffer size to whatever is best.
1040 */
1041 so->so_rcv.sb_flags |= SB_AUTOSIZE;
1042 so->so_snd.sb_flags |= SB_AUTOSIZE;
1043
1044 /* restore protocol-user requests */
1045 VERIFY(mpts->mpts_oprotosw != NULL);
1046 so->so_proto = mpts->mpts_oprotosw;
1047
1048 bzero(&smpo, sizeof (smpo));
1049 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1050 smpo.mpo_level = SOL_SOCKET;
1051
1052 /* inherit SOF_NOSIGPIPE from parent MP socket */
1053 p = (mp_so->so_flags & SOF_NOSIGPIPE);
1054 c = (so->so_flags & SOF_NOSIGPIPE);
1055 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1056 smpo.mpo_name = SO_NOSIGPIPE;
1057 if ((p - c) != 0)
1058 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1059
1060 /* inherit SOF_NOADDRAVAIL from parent MP socket */
1061 p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1062 c = (so->so_flags & SOF_NOADDRAVAIL);
1063 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1064 smpo.mpo_name = SO_NOADDRERR;
1065 if ((p - c) != 0)
1066 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1067
1068 /* inherit SO_KEEPALIVE from parent MP socket */
1069 p = (mp_so->so_options & SO_KEEPALIVE);
1070 c = (so->so_options & SO_KEEPALIVE);
1071 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1072 smpo.mpo_name = SO_KEEPALIVE;
1073 if ((p - c) != 0)
1074 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1075
1076 /* unset TCP level default keepalive option */
1077 p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1078 c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1079 smpo.mpo_level = IPPROTO_TCP;
1080 smpo.mpo_intval = 0;
1081 smpo.mpo_name = TCP_KEEPALIVE;
1082 if ((p - c) != 0)
1083 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1084 socket_unlock(so, 0);
1085
1086 DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1087 struct mptsub *, mpts, struct socket *, so,
1088 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1089}
1090
1091/*
1092 * Establish an initial MPTCP connection (if first subflow and not yet
1093 * connected), or add a subflow to an existing MPTCP connection.
1094 */
1095int
1096mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1097 struct proc *p, uint32_t ifscope)
1098{
1099 struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
1100 struct socket *mp_so, *so = NULL;
1101 struct mptsub_connreq mpcr;
1102 struct mptcb *mp_tp;
1103 int af, error = 0;
1104
1105 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1106 mp_so = mpte->mpte_mppcb->mpp_socket;
1107 mp_tp = mpte->mpte_mptcb;
1108
1109 MPTS_LOCK(mpts);
1110 VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1111 VERIFY(mpts->mpts_mpte == NULL);
1112 VERIFY(mpts->mpts_socket == NULL);
1113 VERIFY(mpts->mpts_dst_sl != NULL);
1114 VERIFY(mpts->mpts_connid == CONNID_ANY);
1115
1116 /* select source (if specified) and destination addresses */
1117 if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
1118 &mpts->mpts_dst_sl, &dst_se)) != 0)
1119 goto out;
1120
1121 VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
1122 VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
1123 af = mpts->mpts_family = dst_se->se_addr->sa_family;
1124 VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
1125 VERIFY(af == AF_INET || af == AF_INET6);
1126
1127 /*
1128 * If the source address is not specified, allocate a storage for
1129 * it, so that later on we can fill it in with the actual source
1130 * IP address chosen by the underlying layer for the subflow after
1131 * it is connected.
1132 */
1133 if (mpts->mpts_src_sl == NULL) {
1134 mpts->mpts_src_sl =
1135 sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
1136 if (mpts->mpts_src_sl == NULL) {
1137 error = ENOBUFS;
1138 goto out;
1139 }
1140 se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
1141 VERIFY(se != NULL && se->se_addr != NULL &&
1142 se->se_addr->sa_len == dst_se->se_addr->sa_len);
1143 bzero(se->se_addr, se->se_addr->sa_len);
1144 se->se_addr->sa_len = dst_se->se_addr->sa_len;
1145 se->se_addr->sa_family = dst_se->se_addr->sa_family;
1146 }
1147
1148 /* create the subflow socket */
1149 if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1150 goto out;
1151
1152 /*
1153 * XXX: adi@apple.com
1154 *
1155 * This probably needs to be made smarter, but for now simply
1156 * increment the counter, while avoiding 0 (CONNID_ANY) and
1157 * -1 (CONNID_ALL). Assume that an MPTCP connection will not
1158 * live too long with (2^32)-2 subflow connection attempts.
1159 */
1160 mpte->mpte_connid_last++;
1161 if (mpte->mpte_connid_last == CONNID_ALL ||
1162 mpte->mpte_connid_last == CONNID_ANY)
1163 mpte->mpte_connid_last++;
1164
1165 mpts->mpts_connid = mpte->mpte_connid_last;
1166 VERIFY(mpts->mpts_connid != CONNID_ANY &&
1167 mpts->mpts_connid != CONNID_ALL);
1168
1169 /* bind subflow socket to the specified interface */
1170 if (ifscope != IFSCOPE_NONE) {
1171 socket_lock(so, 0);
1172 error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1173 if (error != 0) {
1174 socket_unlock(so, 0);
1175 (void) mptcp_subflow_soclose(mpts, so);
1176 goto out;
1177 }
1178 VERIFY(mpts->mpts_outif != NULL);
1179 mpts->mpts_flags |= MPTSF_BOUND_IF;
1180
1181 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindif %s[%d] "
1182 "cid %d\n", __func__,
1183 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1184 mpts->mpts_outif->if_xname,
1185 ifscope, mpts->mpts_connid));
1186 socket_unlock(so, 0);
1187 }
1188
1189 /* if source address and/or port is specified, bind to it */
1190 if (src_se != NULL) {
1191 struct sockaddr *sa = src_se->se_addr;
1192 uint32_t mpts_flags = 0;
1193 in_port_t lport;
1194
1195 switch (af) {
1196 case AF_INET:
1197 if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1198 mpts_flags |= MPTSF_BOUND_IP;
1199 if ((lport = SIN(sa)->sin_port) != 0)
1200 mpts_flags |= MPTSF_BOUND_PORT;
1201 break;
1202#if INET6
1203 case AF_INET6:
1204 VERIFY(af == AF_INET6);
1205 if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1206 mpts_flags |= MPTSF_BOUND_IP;
1207 if ((lport = SIN6(sa)->sin6_port) != 0)
1208 mpts_flags |= MPTSF_BOUND_PORT;
1209 break;
1210#endif /* INET6 */
1211 }
1212
1213 error = sobindlock(so, sa, 1); /* will lock/unlock socket */
1214 if (error != 0) {
1215 (void) mptcp_subflow_soclose(mpts, so);
1216 goto out;
1217 }
1218 mpts->mpts_flags |= mpts_flags;
1219
1220 if (af == AF_INET || af == AF_INET6) {
1221 char sbuf[MAX_IPv6_STR_LEN];
1222
1223 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindip %s[%d] "
1224 "cid %d\n", __func__,
1225 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1226 inet_ntop(af, ((af == AF_INET) ?
1227 (void *)&SIN(sa)->sin_addr.s_addr :
1228 (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
1229 ntohs(lport), mpts->mpts_connid));
1230 }
1231 }
1232
1233 /*
1234 * Insert the subflow into the list, and associate the MPTCP PCB
1235 * as well as the the subflow socket. From this point on, removing
1236 * the subflow needs to be done via mptcp_subflow_del().
1237 */
1238 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1239 mpte->mpte_numflows++;
1240
1241 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1242 mpts->mpts_mpte = mpte;
1243 mpts->mpts_socket = so;
1244 MPTS_ADDREF_LOCKED(mpts); /* for being in MPTCP subflow list */
1245 MPTS_ADDREF_LOCKED(mpts); /* for subflow socket */
1246 mp_so->so_usecount++; /* for subflow socket */
1247
1248 /* register for subflow socket read/write events */
1249 (void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1250 mptcp_subflow_wupcall, mpts);
1251
1252 /*
1253 * Register for subflow socket control events; ignore
1254 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1255 * will generate it here.
1256 */
1257 (void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1258 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1259 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1260 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1261 SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1262 SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1263 SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
1264 SO_FILT_HINT_MUSTRST);
1265
1266 /* sanity check */
1267 VERIFY(!(mpts->mpts_flags &
1268 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1269
1270 bzero(&mpcr, sizeof (mpcr));
1271 mpcr.mpcr_proc = p;
1272 mpcr.mpcr_ifscope = ifscope;
1273 /*
1274 * Indicate to the TCP subflow whether or not it should establish
1275 * the initial MPTCP connection, or join an existing one. Fill
1276 * in the connection request structure with additional info needed
1277 * by the underlying TCP (to be used in the TCP options, etc.)
1278 */
1279 MPT_LOCK(mp_tp);
1280 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1281 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1282 mp_tp->mpt_localkey = mptcp_reserve_key();
1283 mptcp_conn_properties(mp_tp);
1284 }
1285 MPT_UNLOCK(mp_tp);
1286 soisconnecting(mp_so);
1287 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1288 } else {
1289 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1290 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1291 MPT_UNLOCK(mp_tp);
1292 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1293 }
1294
1295 mpts->mpts_mpcr = mpcr;
1296 mpts->mpts_flags |= MPTSF_CONNECTING;
1297
1298 if (af == AF_INET || af == AF_INET6) {
1299 char dbuf[MAX_IPv6_STR_LEN];
1300
1301 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
1302 "[pending %s]\n", __func__,
1303 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1304 inet_ntop(af, ((af == AF_INET) ?
1305 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
1306 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
1307 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1308 ntohs(SIN(dst_se->se_addr)->sin_port) :
1309 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
1310 mpts->mpts_connid,
1311 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
1312 "YES" : "NO")));
1313 }
1314
1315 /* connect right away if first attempt, or if join can be done now */
1316 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1317 error = mptcp_subflow_soconnectx(mpte, mpts);
1318
1319out:
1320 MPTS_UNLOCK(mpts);
1321 if (error == 0) {
1322 soevent(mp_so, SO_FILT_HINT_LOCKED |
1323 SO_FILT_HINT_CONNINFO_UPDATED);
1324 }
1325 return (error);
1326}
1327
1328static int
1329mptcp_delete_ok(struct mptses *mpte, struct mptsub *mpts)
1330{
1331 int ret = 1;
1332 struct mptcb *mp_tp = NULL;
1333
1334 MPTE_LOCK_ASSERT_HELD(mpte);
1335 mp_tp = mpte->mpte_mptcb;
1336 VERIFY(mp_tp != NULL);
1337 MPTS_LOCK(mpts);
1338 MPT_LOCK(mp_tp);
1339 if ((mpts->mpts_soerror == 0) &&
1340 (mpts->mpts_flags & MPTSF_ACTIVE) &&
1341 (mp_tp->mpt_state != MPTCPS_CLOSED) &&
1342 (mp_tp->mpt_state <= MPTCPS_TIME_WAIT))
1343 ret = 0;
1344 MPT_UNLOCK(mp_tp);
1345 MPTS_UNLOCK(mpts);
1346 return (ret);
1347}
1348
1349/*
1350 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
1351 * will no longer be accessible after a subflow is deleted, thus this
1352 * should occur only after the subflow socket has been disconnected.
1353 * If peeloff(2) is called, leave the socket open.
1354 */
1355void
1356mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1357{
1358 struct socket *mp_so, *so;
1359
1360 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1361 mp_so = mpte->mpte_mppcb->mpp_socket;
1362
1363 MPTS_LOCK(mpts);
1364 so = mpts->mpts_socket;
1365 VERIFY(so != NULL);
1366
1367 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d "
1368 "[close %s] %d %x\n", __func__,
1369 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1370 mp_so->so_usecount,
1371 mp_so->so_retaincnt, mpts->mpts_connid,
1372 (close ? "YES" : "NO"), mpts->mpts_soerror,
1373 mpts->mpts_flags));
1374
1375 VERIFY(mpts->mpts_mpte == mpte);
1376 VERIFY(mpts->mpts_connid != CONNID_ANY &&
1377 mpts->mpts_connid != CONNID_ALL);
1378
1379 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1380 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1381 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1382 VERIFY(mpte->mpte_numflows != 0);
1383 mpte->mpte_numflows--;
1384
1385 /*
1386 * Drop references held by this subflow socket; there
1387 * will be no further upcalls made from this point.
1388 */
1389 (void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1390 (void) sock_catchevents(so, NULL, NULL, 0);
1391 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
1392 if (close)
1393 (void) mptcp_subflow_soclose(mpts, so);
1394
1395 VERIFY(mp_so->so_usecount != 0);
1396 mp_so->so_usecount--; /* for subflow socket */
1397 mpts->mpts_mpte = NULL;
1398 mpts->mpts_socket = NULL;
1399 MPTS_UNLOCK(mpts);
1400
1401 MPTS_REMREF(mpts); /* for MPTCP subflow list */
1402 MPTS_REMREF(mpts); /* for subflow socket */
1403
1404 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1405}
1406
1407/*
1408 * Disconnect a subflow socket.
1409 */
1410void
1411mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1412 boolean_t deleteok)
1413{
1414 struct socket *so;
1415 struct mptcb *mp_tp;
1416 int send_dfin = 0;
1417
1418 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1419 MPTS_LOCK_ASSERT_HELD(mpts);
1420
1421 VERIFY(mpts->mpts_mpte == mpte);
1422 VERIFY(mpts->mpts_socket != NULL);
1423 VERIFY(mpts->mpts_connid != CONNID_ANY &&
1424 mpts->mpts_connid != CONNID_ALL);
1425
1426 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1427 return;
1428
1429 mpts->mpts_flags |= MPTSF_DISCONNECTING;
1430
1431 /*
1432 * If this is coming from disconnectx(2) or issued as part of
1433 * closing the MPTCP socket, the subflow shouldn't stick around.
1434 * Otherwise let it linger around in case the upper layers need
1435 * to retrieve its conninfo.
1436 */
1437 if (deleteok)
1438 mpts->mpts_flags |= MPTSF_DELETEOK;
1439
1440 so = mpts->mpts_socket;
1441 mp_tp = mpte->mpte_mptcb;
1442 MPT_LOCK(mp_tp);
1443 if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1444 send_dfin = 1;
1445 MPT_UNLOCK(mp_tp);
1446
1447 socket_lock(so, 0);
1448 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1449 (so->so_state & SS_ISCONNECTED)) {
1450 mptcplog((LOG_DEBUG, "%s: cid %d fin %d [linger %s]\n",
1451 __func__, mpts->mpts_connid, send_dfin,
1452 (deleteok ? "NO" : "YES")));
1453
1454 if (send_dfin)
1455 mptcp_send_dfin(so);
1456 (void) soshutdownlock(so, SHUT_RD);
1457 (void) soshutdownlock(so, SHUT_WR);
1458 (void) sodisconnectlocked(so);
1459 }
1460 socket_unlock(so, 0);
1461 /*
1462 * Generate a disconnect event for this subflow socket, in case
1463 * the lower layer doesn't do it; this is needed because the
1464 * subflow socket deletion relies on it. This will also end up
1465 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1466 * we cannot do that here because subflow lock is currently held.
1467 */
1468 mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1469}
1470
1471/*
1472 * Subflow socket read upcall.
1473 *
1474 * Called when the associated subflow socket posted a read event. The subflow
1475 * socket lock has been released prior to invoking the callback. Note that the
1476 * upcall may occur synchronously as a result of MPTCP performing an action on
1477 * it, or asynchronously as a result of an event happening at the subflow layer.
1478 * Therefore, to maintain lock ordering, the only lock that can be acquired
1479 * here is the thread lock, for signalling purposes.
1480 */
1481static void
1482mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1483{
1484#pragma unused(so, waitf)
1485 struct mptsub *mpts = arg;
1486 struct mptses *mpte = mpts->mpts_mpte;
1487
1488 VERIFY(mpte != NULL);
1489
1490 lck_mtx_lock(&mpte->mpte_thread_lock);
1491 mptcp_thread_signal_locked(mpte);
1492 lck_mtx_unlock(&mpte->mpte_thread_lock);
1493}
1494
1495/*
1496 * Subflow socket input.
1497 *
1498 * Called in the context of the MPTCP thread, for reading data from the
1499 * underlying subflow socket and delivering it to MPTCP.
1500 */
1501static void
1502mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1503{
1504 struct mbuf *m = NULL;
1505 struct socket *so;
1506 int error;
1507 struct mptsub *mpts_alt = NULL;
1508
1509 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1510 MPTS_LOCK_ASSERT_HELD(mpts);
1511
1512 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
1513 struct mptsub *, mpts);
1514
1515 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1516 return;
1517
1518 so = mpts->mpts_socket;
1519
1520 error = sock_receive_internal(so, NULL, &m, 0, NULL);
1521 if (error != 0 && error != EWOULDBLOCK) {
1522 mptcplog((LOG_ERR, "%s: cid %d error %d\n",
1523 __func__, mpts->mpts_connid, error));
1524 MPTS_UNLOCK(mpts);
1525 mpts_alt = mptcp_get_subflow(mpte, mpts);
1526 if (mpts_alt == NULL) {
1527 mptcplog((LOG_ERR, "%s: no alt path cid %d\n",
1528 __func__, mpts->mpts_connid));
1529 mpte->mpte_mppcb->mpp_socket->so_error = error;
1530 }
1531 MPTS_LOCK(mpts);
1532 } else if (error == 0) {
1533 mptcplog3((LOG_DEBUG, "%s: cid %d \n",
1534 __func__, mpts->mpts_connid));
1535 }
1536
1537 /* In fallback, make sure to accept data on all but one subflow */
1538 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1539 (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1540 m_freem(m);
1541 return;
1542 }
1543
1544 if (m != NULL) {
1545 /*
1546 * Release subflow lock since this may trigger MPTCP to send,
1547 * possibly on a different subflow. An extra reference has
1548 * been held on the subflow by the MPTCP thread before coming
1549 * here, so we can be sure that it won't go away, in the event
1550 * the MP socket lock gets released.
1551 */
1552 MPTS_UNLOCK(mpts);
1553 mptcp_input(mpte, m);
1554 MPTS_LOCK(mpts);
1555 }
1556}
1557
1558/*
1559 * Subflow socket write upcall.
1560 *
1561 * Called when the associated subflow socket posted a read event. The subflow
1562 * socket lock has been released prior to invoking the callback. Note that the
1563 * upcall may occur synchronously as a result of MPTCP performing an action on
1564 * it, or asynchronously as a result of an event happening at the subflow layer.
1565 * Therefore, to maintain lock ordering, the only lock that can be acquired
1566 * here is the thread lock, for signalling purposes.
1567 */
1568static void
1569mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1570{
1571#pragma unused(so, waitf)
1572 struct mptsub *mpts = arg;
1573 struct mptses *mpte = mpts->mpts_mpte;
1574
1575 VERIFY(mpte != NULL);
1576
1577 lck_mtx_lock(&mpte->mpte_thread_lock);
1578 mptcp_thread_signal_locked(mpte);
1579 lck_mtx_unlock(&mpte->mpte_thread_lock);
1580}
1581
1582/*
1583 * Subflow socket output.
1584 *
1585 * Called for sending data from MPTCP to the underlying subflow socket.
1586 */
1587int
1588mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1589{
1590 struct socket *mp_so, *so;
1591 size_t sb_cc = 0, tot_sent = 0;
1592 struct mbuf *sb_mb;
1593 int error = 0;
1594 u_int64_t mpt_dsn = 0;
1595 struct mptcb *mp_tp = mpte->mpte_mptcb;
1596 struct mbuf *mpt_mbuf = NULL;
1597 unsigned int off = 0;
1598
1599 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1600 MPTS_LOCK_ASSERT_HELD(mpts);
1601 mp_so = mpte->mpte_mppcb->mpp_socket;
1602 so = mpts->mpts_socket;
1603
1604 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
1605 struct mptsub *, mpts);
1606
1607 /* subflow socket is suspended? */
1608 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
1609 mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d is flow "
1610 "controlled\n", __func__,
1611 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1612 goto out;
1613 }
1614
1615 /* subflow socket is not MPTCP capable? */
1616 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
1617 !(mpts->mpts_flags & MPTSF_MP_DEGRADED)) {
1618 mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d not "
1619 "MPTCP capable\n", __func__,
1620 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1621 goto out;
1622 }
1623
1624 /* Remove Addr Option is not sent reliably as per I-D */
1625 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1626 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1627 tp->t_rem_aid = mpte->mpte_lost_aid;
1628 if (mptcp_remaddr_enable)
1629 tp->t_mpflags |= TMPF_SND_REM_ADDR;
1630 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1631 }
1632
1633 /*
1634 * The mbuf chains containing the metadata (as well as pointing to
1635 * the user data sitting at the MPTCP output queue) would then be
1636 * sent down to the subflow socket.
1637 *
1638 * Some notes on data sequencing:
1639 *
1640 * a. Each mbuf must be a M_PKTHDR.
1641 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
1642 * in the mbuf pkthdr structure.
1643 * c. Each mbuf containing the MPTCP metadata must have its
1644 * pkt_flags marked with the PKTF_MPTCP flag.
1645 */
1646
1647 /* First, drop acknowledged data */
1648 sb_mb = mp_so->so_snd.sb_mb;
1649 if (sb_mb == NULL) {
1650 goto out;
1651 }
1652
1653 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1654
1655 mpt_mbuf = sb_mb;
1656 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
1657 mpt_mbuf = mpt_mbuf->m_next;
1658 }
1659 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1660 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1661 } else {
1662 goto out;
1663 }
1664
1665 MPT_LOCK(mp_tp);
1666 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
1667 int len = 0;
1668 len = mp_tp->mpt_snduna - mpt_dsn;
1669 sbdrop(&mp_so->so_snd, len);
1670
1671 }
1672
1673 /*
1674 * In degraded mode, we don't receive data acks, so force free
1675 * mbufs less than snd_nxt
1676 */
1677 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1678 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1679 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
1680 int len = 0;
1681 len = mp_tp->mpt_sndnxt - mpt_dsn;
1682 sbdrop(&mp_so->so_snd, len);
1683 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1684 }
1685
1686 /*
1687 * Adjust the subflow's notion of next byte to send based on
1688 * the last unacknowledged byte
1689 */
1690 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1691 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1692 }
1693
1694 /*
1695 * Adjust the top level notion of next byte used for retransmissions
1696 * and sending FINs.
1697 */
1698 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1699 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1700 }
1701
1702
1703 /* Now determine the offset from which to start transmitting data */
1704 sb_mb = mp_so->so_snd.sb_mb;
1705 sb_cc = mp_so->so_snd.sb_cc;
1706 if (sb_mb == NULL) {
1707 MPT_UNLOCK(mp_tp);
1708 goto out;
1709 }
1710 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1711 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
1712 sb_cc -= off;
1713 } else {
1714 MPT_UNLOCK(mp_tp);
1715 goto out;
1716 }
1717 MPT_UNLOCK(mp_tp);
1718
1719 mpt_mbuf = sb_mb;
1720 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1721
1722 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
1723 (mpt_mbuf->m_pkthdr.mp_rlen <= off))) {
1724 off -= mpt_mbuf->m_pkthdr.mp_rlen;
1725 mpt_mbuf = mpt_mbuf->m_next;
1726 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1727 }
1728 if ((mpts->mpts_connid == 2) || (mpts->mpts_flags & MPTSF_MP_DEGRADED))
1729 mptcplog((LOG_INFO, "%s: snduna = %llu off = %d id = %d"
1730 " %llu \n",
1731 __func__,
1732 mp_tp->mpt_snduna, off, mpts->mpts_connid,
1733 mpts->mpts_sndnxt));
1734
1735 VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
1736
1737 while (tot_sent < sb_cc) {
1738 struct mbuf *m;
1739 size_t mlen, len = 0;
1740
1741 mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1742 mlen -= off;
1743 if (mlen == 0)
1744 goto out;
1745
1746 if (mlen > sb_cc) {
1747 panic("%s: unexpected %lu %lu \n", __func__,
1748 mlen, sb_cc);
1749 }
1750
1751 m = m_copym_mode(mpt_mbuf, off, mlen, M_DONTWAIT,
1752 M_COPYM_COPY_HDR);
1753 if (m == NULL) {
1754 error = ENOBUFS;
1755 break;
1756 }
1757
1758 /* Create a DSN mapping for the data (m_copym does it) */
1759 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1760 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
1761 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
1762 m->m_pkthdr.mp_dsn = mpt_dsn + off;
1763 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
1764 m->m_pkthdr.mp_rlen = mlen;
1765 mpts->mpts_rel_seq += mlen;
1766 m->m_pkthdr.len = mlen;
1767
1768 /* last contiguous mapping is stored for error cases */
1769 if (mpts->mpts_lastmap.mptsl_dsn +
1770 mpts->mpts_lastmap.mptsl_len == mpt_dsn) {
1771 mpts->mpts_lastmap.mptsl_len += tot_sent;
1772 } else if (MPTCP_SEQ_LT((mpts->mpts_lastmap.mptsl_dsn +
1773 mpts->mpts_lastmap.mptsl_len), mpt_dsn)) {
1774 if (m->m_pkthdr.mp_dsn == 0)
1775 panic("%s %llu", __func__, mpt_dsn);
1776 mpts->mpts_lastmap.mptsl_dsn = m->m_pkthdr.mp_dsn;
1777 mpts->mpts_lastmap.mptsl_sseq = m->m_pkthdr.mp_rseq;
1778 mpts->mpts_lastmap.mptsl_len = m->m_pkthdr.mp_rlen;
1779 }
1780
1781 error = sock_sendmbuf(so, NULL, m, 0, &len);
1782 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
1783 struct sockbuf *, &so->so_rcv,
1784 struct sockbuf *, &so->so_snd,
1785 struct mptses *, mpte, struct mptsub *, mpts,
1786 size_t, mlen);
1787 if (error != 0) {
1788 mptcplog((LOG_ERR, "%s: len = %zd error = %d \n",
1789 __func__, len, error));
1790 break;
1791 }
1792 mpts->mpts_sndnxt += mlen;
1793 MPT_LOCK(mp_tp);
1794 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
1795 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
1796 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
1797 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
1798 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
1799 }
1800 MPT_UNLOCK(mp_tp);
1801 if (len != mlen) {
1802 mptcplog((LOG_ERR, "%s: cid %d wrote %d "
1803 "(expected %d)\n", __func__,
1804 mpts->mpts_connid, len, mlen));
1805 }
1806 tot_sent += mlen;
1807 off = 0;
1808 mpt_mbuf = mpt_mbuf->m_next;
1809 }
1810
1811 if (error != 0 && error != EWOULDBLOCK) {
1812 mptcplog((LOG_ERR, "MPTCP ERROR %s: cid %d error %d\n",
1813 __func__, mpts->mpts_connid, error));
1814 } if (error == 0) {
1815 if ((mpts->mpts_connid == 2) ||
1816 (mpts->mpts_flags & MPTSF_MP_DEGRADED))
1817 mptcplog((LOG_DEBUG, "%s: cid %d wrote %d %d\n",
1818 __func__, mpts->mpts_connid, tot_sent,
1819 sb_cc));
1820 MPT_LOCK(mp_tp);
1821 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
1822 MPT_UNLOCK(mp_tp);
1823 }
1824out:
1825 return (error);
1826}
1827
1828/*
1829 * Subflow socket control event upcall.
1830 *
1831 * Called when the associated subflow socket posted one or more control events.
1832 * The subflow socket lock has been released prior to invoking the callback.
1833 * Note that the upcall may occur synchronously as a result of MPTCP performing
1834 * an action on it, or asynchronously as a result of an event happening at the
1835 * subflow layer. Therefore, to maintain lock ordering, the only lock that can
1836 * be acquired here is the thread lock, for signalling purposes.
1837 */
1838static void
1839mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
1840{
1841#pragma unused(so)
1842 struct mptsub *mpts = arg;
1843 struct mptses *mpte = mpts->mpts_mpte;
1844
1845 VERIFY(mpte != NULL);
1846
1847 lck_mtx_lock(&mpte->mpte_thread_lock);
1848 atomic_bitset_32(&mpts->mpts_evctl, events);
1849 mptcp_thread_signal_locked(mpte);
1850 lck_mtx_unlock(&mpte->mpte_thread_lock);
1851}
1852
1853/*
1854 * Subflow socket control events.
1855 *
1856 * Called for handling events related to the underlying subflow socket.
1857 */
1858static ev_ret_t
1859mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts)
1860{
1861 uint32_t events;
1862 ev_ret_t ret = MPTS_EVRET_OK;
1863
1864 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1865 MPTS_LOCK_ASSERT_HELD(mpts);
1866
1867 /* bail if there's nothing to process */
1868 if ((events = mpts->mpts_evctl) == 0)
1869 return (ret);
1870
1871 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
1872 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
1873 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
1874 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
1875 SO_FILT_HINT_DISCONNECTED)) {
1876 events |= SO_FILT_HINT_MPFAILOVER;
1877 }
1878
1879 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
1880 struct mptsub *, mpts, uint32_t, events);
1881
1882 mptcplog2((LOG_DEBUG, "%s: cid %d events=%b\n", __func__,
1883 mpts->mpts_connid, events, SO_FILT_HINT_BITS));
1884
1885 if ((events & SO_FILT_HINT_MPFAILOVER) && (ret >= MPTS_EVRET_OK)) {
1886 ev_ret_t error = mptcp_subflow_failover_ev(mpte, mpts);
1887 events &= ~SO_FILT_HINT_MPFAILOVER;
1888 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1889 }
1890 if ((events & SO_FILT_HINT_CONNRESET) && (ret >= MPTS_EVRET_OK)) {
1891 ev_ret_t error = mptcp_subflow_connreset_ev(mpte, mpts);
1892 events &= ~SO_FILT_HINT_CONNRESET;
1893 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1894 }
1895 if ((events & SO_FILT_HINT_MUSTRST) && (ret >= MPTS_EVRET_OK)) {
1896 ev_ret_t error = mptcp_subflow_mustrst_ev(mpte, mpts);
1897 events &= ~SO_FILT_HINT_MUSTRST;
1898 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1899 }
1900 if ((events & SO_FILT_HINT_CANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
1901 ev_ret_t error = mptcp_subflow_cantrcvmore_ev(mpte, mpts);
1902 events &= ~SO_FILT_HINT_CANTRCVMORE;
1903 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1904 }
1905 if ((events & SO_FILT_HINT_CANTSENDMORE) && (ret >= MPTS_EVRET_OK)) {
1906 ev_ret_t error = mptcp_subflow_cantsendmore_ev(mpte, mpts);
1907 events &= ~SO_FILT_HINT_CANTSENDMORE;
1908 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1909 }
1910 if ((events & SO_FILT_HINT_TIMEOUT) && (ret >= MPTS_EVRET_OK)) {
1911 ev_ret_t error = mptcp_subflow_timeout_ev(mpte, mpts);
1912 events &= ~SO_FILT_HINT_TIMEOUT;
1913 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1914 }
1915 if ((events & SO_FILT_HINT_NOSRCADDR) && (ret >= MPTS_EVRET_OK)) {
1916 ev_ret_t error = mptcp_subflow_nosrcaddr_ev(mpte, mpts);
1917 events &= ~SO_FILT_HINT_NOSRCADDR;
1918 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1919 }
1920 if ((events & SO_FILT_HINT_IFDENIED) && (ret >= MPTS_EVRET_OK)) {
1921 ev_ret_t error = mptcp_subflow_ifdenied_ev(mpte, mpts);
1922 events &= ~SO_FILT_HINT_IFDENIED;
1923 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1924 }
1925 if ((events & SO_FILT_HINT_SUSPEND) && (ret >= MPTS_EVRET_OK)) {
1926 ev_ret_t error = mptcp_subflow_suspend_ev(mpte, mpts);
1927 events &= ~SO_FILT_HINT_SUSPEND;
1928 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1929 }
1930 if ((events & SO_FILT_HINT_RESUME) && (ret >= MPTS_EVRET_OK)) {
1931 ev_ret_t error = mptcp_subflow_resume_ev(mpte, mpts);
1932 events &= ~SO_FILT_HINT_RESUME;
1933 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1934 }
1935 if ((events & SO_FILT_HINT_CONNECTED) && (ret >= MPTS_EVRET_OK)) {
1936 ev_ret_t error = mptcp_subflow_connected_ev(mpte, mpts);
1937 events &= ~SO_FILT_HINT_CONNECTED;
1938 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1939 }
1940 if ((events & SO_FILT_HINT_MPSTATUS) && (ret >= MPTS_EVRET_OK)) {
1941 ev_ret_t error = mptcp_subflow_mpstatus_ev(mpte, mpts);
1942 events &= ~SO_FILT_HINT_MPSTATUS;
1943 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1944 }
1945 if ((events & SO_FILT_HINT_DISCONNECTED) && (ret >= MPTS_EVRET_OK)) {
1946 ev_ret_t error = mptcp_subflow_disconnected_ev(mpte, mpts);
1947 events &= ~SO_FILT_HINT_DISCONNECTED;
1948 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1949 }
1950 /*
1951 * We should be getting only events specified via sock_catchevents(),
1952 * so loudly complain if we have any unprocessed one(s).
1953 */
1954 if (events != 0 || ret < MPTS_EVRET_OK) {
1955 mptcplog((LOG_ERR, "%s%s: cid %d evret %s (%d)"
1956 " unhandled events=%b\n",
1957 (events != 0) ? "MPTCP_ERROR " : "",
1958 __func__, mpts->mpts_connid,
1959 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS));
1960 }
1961
1962 /* clear the ones we've processed */
1963 atomic_bitclear_32(&mpts->mpts_evctl, ~events);
1964
1965 return (ret);
1966}
1967
1968/*
1969 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
1970 */
static ev_ret_t
mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/*
	 * Linger (keep the subflow structure around) unless the subflow is
	 * already deletable or the MP socket's PCB is being torn down.
	 */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	/* record the first subflow error only; don't overwrite it later */
	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = ECONNREFUSED;

	/*
	 * We got a TCP RST for this subflow connection.
	 *
	 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
	 * client if the MPTCP connection has not been established. Otherwise
	 * we close the socket.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = ECONNREFUSED;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}
2016
2017/*
2018 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2019 */
2020static ev_ret_t
2021mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
2022{
2023 struct socket *so;
2024
2025 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2026 MPTS_LOCK_ASSERT_HELD(mpts);
2027
2028 so = mpts->mpts_socket;
2029
2030 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2031
2032 /*
2033 * We got a FIN for this subflow connection. This subflow socket
2034 * is no longer available for receiving data;
2035 * The FIN may arrive with data. The data is handed up to the
2036 * mptcp socket and the subflow is disconnected.
2037 */
2038
2039 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2040}
2041
2042/*
2043 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2044 */
2045static ev_ret_t
2046mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts)
2047{
2048 struct socket *so;
2049
2050 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2051 MPTS_LOCK_ASSERT_HELD(mpts);
2052
2053 so = mpts->mpts_socket;
2054
2055 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2056 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2057}
2058
2059/*
2060 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2061 */
static ev_ret_t
mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/*
	 * Linger (keep the subflow structure around) unless the subflow is
	 * already deletable or the MP socket's PCB is being torn down.
	 */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_NOTICE, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	/* record the first subflow error only; don't overwrite it later */
	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = ETIMEDOUT;

	/*
	 * The subflow connection has timed out.
	 *
	 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
	 * client if the MPTCP connection has not been established. Otherwise
	 * drop it.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = ETIMEDOUT;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}
2107
2108/*
2109 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2110 */
static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;
	struct tcpcb *tp = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Not grabbing socket lock as t_local_aid is write once only */
	tp = intotcpcb(sotoinpcb(so));
	/*
	 * This overwrites any previous mpte_lost_aid to avoid storing
	 * too much state when the typical case has only two subflows.
	 */
	/*
	 * NOTE(review): tp is dereferenced below without a NULL check;
	 * presumably a subflow socket delivering this event always has a
	 * live tcpcb -- confirm against the detach path.
	 */
	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
	mpte->mpte_lost_aid = tp->t_local_aid;

	/*
	 * Linger (keep the subflow structure around) unless the subflow is
	 * already deletable or the MP socket's PCB is being torn down.
	 */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	/* record the first subflow error only; don't overwrite it later */
	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = EADDRNOTAVAIL;

	/*
	 * The subflow connection has lost its source address.
	 *
	 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
	 * client if the MPTCP connection has not been established. If it
	 * has been established with one subflow , we keep the MPTCP
	 * connection valid without any subflows till closed by application.
	 * This lets tcp connection manager decide whether to close this or
	 * not as it reacts to reachability changes too.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
	    (mp_so->so_flags & SOF_NOADDRAVAIL)) {
		mp_so->so_error = EADDRNOTAVAIL;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}
2171
2172/*
2173 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2174 */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct mptsub *mpts_alt = NULL;
	struct socket *so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mptcplog2((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));

	/*
	 * Drop the subflow lock before searching for an alternate subflow;
	 * mptcp_get_subflow() takes other subflow locks, and holding ours
	 * across that call would violate lock ordering.
	 */
	MPTS_UNLOCK(mpts);
	mpts_alt = mptcp_get_subflow(mpte, mpts);

	/*
	 * If there is no alternate eligible subflow, ignore the
	 * failover hint.
	 */
	if (mpts_alt == NULL) {
		mptcplog2((LOG_WARNING, "%s: no alternate path\n", __func__));
		/* reacquire before returning; caller expects it held */
		MPTS_LOCK(mpts);
		goto done;
	}
	MPTS_LOCK(mpts_alt);
	altpath_exists = 1;
	so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		socket_lock(so, 1);
		/* All data acknowledged */
		if (so->so_snd.sb_cc == 0) {
			so->so_flags &= ~SOF_MP_TRYFAILOVER;
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
		socket_unlock(so, 1);
	}
	if (altpath_exists) {
		/* switch the active subflow over to the alternate */
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
		struct mptcb *mp_tp = mpte->mpte_mptcb;
		/* Bring the subflow's notion of snd_nxt into the send window */
		MPT_LOCK(mp_tp);
		mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
		MPT_UNLOCK(mp_tp);
		mpte->mpte_active_sub = mpts_alt;
		socket_lock(so, 1);
		sowwakeup(so);
		socket_unlock(so, 1);
	}
	MPTS_UNLOCK(mpts_alt);

	if (altpath_exists) {
		/* let the MP socket client know the connection info changed */
		soevent(mp_so,
		    SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
		mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx switched from "
		    "%d to %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_connid, mpts_alt->mpts_connid));
		tcpstat.tcps_mp_switches++;
	}

	MPTS_LOCK(mpts);
	if (altpath_exists) {
		/* mark the old subflow as failing over and no longer active */
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;
	} else {
		/* no switch happened; clear the try-failover hint */
		so = mpts->mpts_socket;
		socket_lock(so, 1);
		so->so_flags &= ~SOF_MP_TRYFAILOVER;
		socket_unlock(so, 1);
	}
done:
	MPTS_LOCK_ASSERT_HELD(mpts);
	return (MPTS_EVRET_OK);
}
2254
2255/*
2256 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2257 */
static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/*
	 * Linger (keep the subflow structure around) unless the subflow is
	 * already deletable or the MP socket's PCB is being torn down.
	 */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	/* record the first subflow error only; don't overwrite it later */
	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = EHOSTUNREACH;

	/*
	 * The subflow connection cannot use the outgoing interface.
	 *
	 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
	 * client if the MPTCP connection has not been established. If it
	 * has been established, let the upper layer call disconnectx.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);
	/* drop the subflow lock before posting an event on the MP socket */
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = EHOSTUNREACH;
	}
	MPT_UNLOCK(mp_tp);

	/* reacquire; caller expects the subflow lock held on return */
	MPTS_LOCK(mpts);
	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}
2307
2308/*
2309 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2310 */
2311static ev_ret_t
2312mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts)
2313{
2314 struct socket *so;
2315
2316 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2317 MPTS_LOCK_ASSERT_HELD(mpts);
2318
2319 so = mpts->mpts_socket;
2320
2321 /* the subflow connection is being flow controlled */
2322 mpts->mpts_flags |= MPTSF_SUSPENDED;
2323
2324 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
2325 mpts->mpts_connid));
2326
2327 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2328}
2329
2330/*
2331 * Handle SO_FILT_HINT_RESUME subflow socket event.
2332 */
2333static ev_ret_t
2334mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts)
2335{
2336 struct socket *so;
2337
2338 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2339 MPTS_LOCK_ASSERT_HELD(mpts);
2340
2341 so = mpts->mpts_socket;
2342
2343 /* the subflow connection is no longer flow controlled */
2344 mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2345
2346 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2347
2348 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2349}
2350
2351/*
2352 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2353 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts)
{
	char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
	struct sockaddr_entry *src_se, *dst_se;
	struct sockaddr_storage src;
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct ifnet *outifp;
	int af, error = 0;
	boolean_t mpok = FALSE;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPTS_LOCK_ASSERT_HELD(mpts);
	so = mpts->mpts_socket;
	af = mpts->mpts_family;

	/* already handled a connected event for this subflow; nothing to do */
	if (mpts->mpts_flags & MPTSF_CONNECTED)
		return (MPTS_EVRET_OK);

	/* a disconnect raced ahead of us; ignore the stale connected event */
	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		return (MPTS_EVRET_OK);
	}

	/*
	 * The subflow connection has been connected. Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
	 *
	 * a. If MPTCP connection is not yet established, then this must be
	 * the first subflow connection. If MPTCP failed to negotiate,
	 * indicate to the MPTCP socket client via EPROTO, that the
	 * underlying TCP connection may be peeled off via peeloff(2).
	 * Otherwise, mark the MPTCP socket as connected.
	 *
	 * b. If MPTCP connection has been established, then this must be
	 * one of the subsequent subflow connections. If MPTCP failed
	 * to negotiate, disconnect the connection since peeloff(2)
	 * is no longer possible.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */
	socket_lock(so, 0);

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state. So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		socket_unlock(so, 0);
		return (MPTS_EVRET_OK);
	}

	mpts->mpts_soerror = 0;
	mpts->mpts_flags &= ~MPTSF_CONNECTING;
	mpts->mpts_flags |= MPTSF_CONNECTED;
	/* TMPF_MPTCP_TRUE means the peer completed MPTCP negotiation */
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;

	VERIFY(mpts->mpts_dst_sl != NULL);
	dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
	VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
	    dst_se->se_addr->sa_family == af);

	VERIFY(mpts->mpts_src_sl != NULL);
	src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
	VERIFY(src_se != NULL && src_se->se_addr != NULL &&
	    src_se->se_addr->sa_family == af);

	/* get/check source IP address */
	switch (af) {
	case AF_INET: {
		error = in_getsockaddr_s(so, &src);
		if (error == 0) {
			struct sockaddr_in *ms = SIN(src_se->se_addr);
			struct sockaddr_in *s = SIN(&src);

			VERIFY(s->sin_len == ms->sin_len);
			VERIFY(ms->sin_family == AF_INET);

			/*
			 * A bound subflow that ended up with a different
			 * local address than requested is logged, but the
			 * actual address is still adopted below.
			 */
			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
			    bcmp(&ms->sin_addr, &s->sin_addr,
			    sizeof (ms->sin_addr)) != 0) {
				mptcplog((LOG_ERR, "%s: cid %d local "
				    "address %s (expected %s)\n", __func__,
				    mpts->mpts_connid, inet_ntop(AF_INET,
				    (void *)&s->sin_addr.s_addr, buf0,
				    sizeof (buf0)), inet_ntop(AF_INET,
				    (void *)&ms->sin_addr.s_addr, buf1,
				    sizeof (buf1))));
			}
			bcopy(s, ms, sizeof (*s));
		}
		break;
	}
#if INET6
	case AF_INET6: {
		error = in6_getsockaddr_s(so, &src);
		if (error == 0) {
			struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
			struct sockaddr_in6 *s = SIN6(&src);

			VERIFY(s->sin6_len == ms->sin6_len);
			VERIFY(ms->sin6_family == AF_INET6);

			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
			    bcmp(&ms->sin6_addr, &s->sin6_addr,
			    sizeof (ms->sin6_addr)) != 0) {
				mptcplog((LOG_ERR, "%s: cid %d local "
				    "address %s (expected %s)\n", __func__,
				    mpts->mpts_connid, inet_ntop(AF_INET6,
				    (void *)&s->sin6_addr, buf0,
				    sizeof (buf0)), inet_ntop(AF_INET6,
				    (void *)&ms->sin6_addr, buf1,
				    sizeof (buf1))));
			}
			bcopy(s, ms, sizeof (*s));
		}
		break;
	}
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	if (error != 0) {
		mptcplog((LOG_ERR, "%s: cid %d getsockaddr failed (%d)\n",
		    __func__, mpts->mpts_connid, error));
	}

	/* get/verify the outbound interface */
	outifp = sotoinpcb(so)->inp_last_outifp;	/* could be NULL */
	if (mpts->mpts_flags & MPTSF_BOUND_IF) {
		VERIFY(mpts->mpts_outif != NULL);
		if (mpts->mpts_outif != outifp) {
			mptcplog((LOG_ERR, "%s: cid %d outif %s "
			    "(expected %s)\n", __func__, mpts->mpts_connid,
			    ((outifp != NULL) ? outifp->if_xname : "NULL"),
			    mpts->mpts_outif->if_xname));
			if (outifp == NULL)
				outifp = mpts->mpts_outif;
		}
	} else {
		mpts->mpts_outif = outifp;
	}

	socket_unlock(so, 0);

	mptcplog((LOG_DEBUG, "%s: cid %d outif %s %s[%d] -> %s[%d] "
	    "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
	    outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
	    (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
	    (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
	    ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
	    ntohs(SIN6(src_se->se_addr)->sin6_port)),
	    inet_ntop(af, ((af == AF_INET) ?
	    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
	    (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
	    ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
	    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
	    ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
	    "MPTCP capable" : "a regular TCP")));

	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
	/* drop the subflow lock before posting an event on the MP socket */
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* case (a) above */
		if (!mpok) {
			/* MPTCP negotiation failed; allow peeloff(2) */
			mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
			(void) mptcp_drop(mpte, mp_tp, EPROTO);
			MPT_UNLOCK(mp_tp);
		} else {
			if (mptcp_init_authparms(mp_tp) != 0) {
				/* auth setup failed; treat as non-MPTCP */
				mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
				(void) mptcp_drop(mpte, mp_tp, EPROTO);
				MPT_UNLOCK(mp_tp);
				mpok = FALSE;
			} else {
				/* first subflow established the MP connection */
				mp_tp->mpt_state = MPTCPS_ESTABLISHED;
				mpte->mpte_associd = mpts->mpts_connid;
				DTRACE_MPTCP2(state__change,
				    struct mptcb *, mp_tp,
				    uint32_t, 0 /* event */);
				mptcp_init_statevars(mp_tp);
				MPT_UNLOCK(mp_tp);

				(void) mptcp_setconnorder(mpte,
				    mpts->mpts_connid, 1);
				soisconnected(mp_so);
			}
		}
		MPTS_LOCK(mpts);
		if (mpok) {
			/* Initialize the relative sequence number */
			mpts->mpts_rel_seq = 1;
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;
			MPT_LOCK_SPIN(mp_tp);
			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
			MPT_UNLOCK(mp_tp);
		}
	} else if (mpok) {
		MPT_UNLOCK(mp_tp);
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake. TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		MPTS_LOCK(mpts);
		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpte->mpte_nummpcapflows++;
		mpts->mpts_rel_seq = 1;
		MPT_LOCK_SPIN(mp_tp);
		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
		MPT_UNLOCK(mp_tp);
	}
	MPTS_LOCK_ASSERT_HELD(mpts);

	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}
2589
2590/*
2591 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2592 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/*
	 * Linger (keep the subflow structure around) unless the subflow is
	 * already deletable or the MP socket's PCB is being torn down.
	 */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog2((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	/* already processed a disconnect for this subflow; nothing more */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
		return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK, MPTSF_ACTIVE for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
	    MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
	    MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
	    MPTSF_SUSPENDED|MPTSF_ACTIVE);
	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/*
	 * The subflow connection has been disconnected.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);

	/*
	 * NOTE(review): mpts_flags is read and modified here after
	 * MPTS_UNLOCK above; presumably this is safe because the MPTE
	 * lock (held throughout) serializes these updates -- confirm.
	 */
	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		mpte->mpte_nummpcapflows--;
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	}

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		MPT_UNLOCK(mp_tp);
		soisdisconnected(mp_so);
	} else {
		MPT_UNLOCK(mp_tp);
	}

	MPTS_LOCK(mpts);
	/*
	 * The underlying subflow socket has been disconnected;
	 * it is no longer useful to us. Keep the subflow socket
	 * around, unless the MPTCP socket has been detached or
	 * the subflow has been disconnected explicitly, in which
	 * case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}
2659
2660/*
2661 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
2662 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	ev_ret_t ret = MPTS_EVRET_OK_UPDATE;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPTS_LOCK_ASSERT_HELD(mpts);
	so = mpts->mpts_socket;

	socket_lock(so, 0);
	MPT_LOCK(mp_tp);

	/* mirror the TCP-level MPTCP negotiation state into subflow flags */
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	else
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* already degraded; no state change, skip the logging too */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
			goto done;
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	}
	else
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
		mpts->mpts_flags |= MPTSF_MP_READY;
	else
		mpts->mpts_flags &= ~MPTSF_MP_READY;

	/* a degraded subflow forces the whole connection to plain TCP */
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

	mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d "
	    "mptsf=%b\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
	    mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
	    mpts->mpts_flags, MPTSF_BITS));
done:
	MPT_UNLOCK(mp_tp);
	socket_unlock(so, 0);

	return (ret);
}
2723
2724/*
2725 * Handle SO_FILT_HINT_MUSTRST subflow socket event
2726 */
2727static ev_ret_t
2728mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts)
2729{
2730 struct socket *mp_so, *so;
2731 struct mptcb *mp_tp;
2732 boolean_t linger;
2733
2734
2735 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2736 MPTS_LOCK_ASSERT_HELD(mpts);
2737 VERIFY(mpte->mpte_mppcb != NULL);
2738 mp_so = mpte->mpte_mppcb->mpp_socket;
2739 mp_tp = mpte->mpte_mptcb;
2740 so = mpts->mpts_socket;
2741
2742 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2743 !(mp_so->so_flags & SOF_PCBCLEARING));
2744
2745 if (mpts->mpts_soerror == 0)
2746 mpts->mpts_soerror = ECONNABORTED;
2747
2748 so->so_error = ECONNABORTED;
2749
2750 /* We got an invalid option or a fast close */
2751 socket_lock(so, 0);
2752 struct tcptemp *t_template;
2753 struct inpcb *inp = sotoinpcb(so);
2754 struct tcpcb *tp = NULL;
2755
2756 tp = intotcpcb(inp);
2757
2758 t_template = tcp_maketemplate(tp);
2759 if (t_template) {
2760 unsigned int ifscope, nocell = 0;
2761
2762 if (inp->inp_flags & INP_BOUND_IF)
2763 ifscope = inp->inp_boundifp->if_index;
2764 else
2765 ifscope = IFSCOPE_NONE;
2766
2767 if (inp->inp_flags & INP_NO_IFT_CELLULAR)
2768 nocell = 1;
2769
2770 tcp_respond(tp, t_template->tt_ipgen,
2771 &t_template->tt_t, (struct mbuf *)NULL,
2772 tp->rcv_nxt, tp->snd_una, TH_RST, ifscope, nocell);
2773 (void) m_free(dtom(t_template));
2774 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d \n",
2775 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2776 so, mpts->mpts_connid));
2777 }
2778 socket_unlock(so, 0);
2779 mptcp_subflow_disconnect(mpte, mpts, !linger);
2780 MPTS_UNLOCK(mpts);
2781
2782 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2783
2784 MPT_LOCK(mp_tp);
2785 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2786 mp_so->so_error = ECONNABORTED;
2787 }
2788 MPT_UNLOCK(mp_tp);
2789
2790 MPTS_LOCK(mpts);
2791 /*
2792 * Keep the subflow socket around unless the subflow has been
2793 * disconnected explicitly.
2794 */
2795 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2796}
2797
2798static const char *
2799mptcp_evret2str(ev_ret_t ret)
2800{
2801 const char *c = "UNKNOWN";
2802
2803 switch (ret) {
2804 case MPTS_EVRET_DELETE:
2805 c = "MPTS_EVRET_DELETE";
2806 break;
2807 case MPTS_EVRET_CONNECT_PENDING:
2808 c = "MPTS_EVRET_CONNECT_PENDING";
2809 break;
2810 case MPTS_EVRET_DISCONNECT_FALLBACK:
2811 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
2812 break;
2813 case MPTS_EVRET_OK:
2814 c = "MPTS_EVRET_OK";
2815 break;
2816 case MPTS_EVRET_OK_UPDATE:
2817 c = "MPTS_EVRET_OK_UPDATE";
2818 break;
2819 }
2820 return (c);
2821}
2822
2823/*
2824 * Add a reference to a subflow structure; used by MPTS_ADDREF().
2825 */
2826void
2827mptcp_subflow_addref(struct mptsub *mpts, int locked)
2828{
2829 if (!locked)
2830 MPTS_LOCK(mpts);
2831 else
2832 MPTS_LOCK_ASSERT_HELD(mpts);
2833
2834 if (++mpts->mpts_refcnt == 0) {
2835 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
2836 /* NOTREACHED */
2837 }
2838 if (!locked)
2839 MPTS_UNLOCK(mpts);
2840}
2841
2842/*
2843 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
2844 */
2845void
2846mptcp_subflow_remref(struct mptsub *mpts)
2847{
2848 MPTS_LOCK(mpts);
2849 if (mpts->mpts_refcnt == 0) {
2850 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
2851 /* NOTREACHED */
2852 }
2853 if (--mpts->mpts_refcnt > 0) {
2854 MPTS_UNLOCK(mpts);
2855 return;
2856 }
2857 /* callee will unlock and destroy lock */
2858 mptcp_subflow_free(mpts);
2859}
2860
2861/*
2862 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
2863 * caller must ensure that the option can be issued on subflow sockets, via
2864 * MPOF_SUBFLOW_OK flag.
2865 */
2866int
2867mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
2868 struct mptopt *mpo)
2869{
2870 struct socket *mp_so;
2871 struct sockopt sopt;
2872 char buf[32];
2873 int error;
2874
2875 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
2876 mpo->mpo_flags &= ~MPOF_INTERIM;
2877
2878 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2879 mp_so = mpte->mpte_mppcb->mpp_socket;
2880
2881 bzero(&sopt, sizeof (sopt));
2882 sopt.sopt_dir = SOPT_SET;
2883 sopt.sopt_level = mpo->mpo_level;
2884 sopt.sopt_name = mpo->mpo_name;
2885 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
2886 sopt.sopt_valsize = sizeof (int);
2887 sopt.sopt_p = kernproc;
2888
2889 error = sosetoptlock(so, &sopt, 0); /* already locked */
2890 if (error == 0) {
2891 mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
2892 "val %d set successful\n", __func__,
2893 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2894 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
2895 buf, sizeof (buf)), mpo->mpo_intval));
2896 } else {
2897 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s "
2898 "val %d set error %d\n", __func__,
2899 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2900 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
2901 buf, sizeof (buf)), mpo->mpo_intval, error));
2902 }
2903 return (error);
2904}
2905
2906/*
2907 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
2908 * caller must ensure that the option can be issued on subflow sockets, via
2909 * MPOF_SUBFLOW_OK flag.
2910 */
2911int
2912mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
2913 struct mptopt *mpo)
2914{
2915 struct socket *mp_so;
2916 struct sockopt sopt;
2917 char buf[32];
2918 int error;
2919
2920 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
2921 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2922 mp_so = mpte->mpte_mppcb->mpp_socket;
2923
2924 bzero(&sopt, sizeof (sopt));
2925 sopt.sopt_dir = SOPT_GET;
2926 sopt.sopt_level = mpo->mpo_level;
2927 sopt.sopt_name = mpo->mpo_name;
2928 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
2929 sopt.sopt_valsize = sizeof (int);
2930 sopt.sopt_p = kernproc;
2931
2932 error = sogetoptlock(so, &sopt, 0); /* already locked */
2933 if (error == 0) {
2934 mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
2935 "val %d get successful\n", __func__,
2936 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2937 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
2938 buf, sizeof (buf)), mpo->mpo_intval));
2939 } else {
2940 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s get error %d\n",
2941 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2942 mptcp_sopt2str(mpo->mpo_level,
2943 mpo->mpo_name, buf, sizeof (buf)), error));
2944 }
2945 return (error);
2946}
2947
2948
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed. The callout will
 * repeat as long as this routine returns a non-zero value.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;	/* PCBs not yet collectable this round */

	/* Caller holds the global MP PCB-info lock for the whole sweep */
	lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	mptcplog3((LOG_DEBUG, "%s: running\n", __func__));

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mp_so = mpp->mpp_socket;
		VERIFY(mp_so != NULL);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx found "
		    "(u=%d,r=%d,s=%d)\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
		    mp_so->so_retaincnt, mpp->mpp_state));

		/*
		 * Never block while holding mppi_lock; if the per-PCB
		 * lock is contended, revisit this entry next callout.
		 */
		if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "(u=%d,r=%d)\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			active++;
			continue;
		}

		/* check again under the lock */
		if (mp_so->so_usecount > 1) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d] %d %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mp_tp->mpt_gc_ticks,
			    mp_tp->mpt_state));
			MPT_LOCK(mp_tp);
			/*
			 * For connections past FIN_WAIT_1, count down the
			 * grace period; once it expires, release the local
			 * key and force-disconnect all subflows below.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0)
					mp_tp->mpt_gc_ticks--;
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
					if (mp_tp->mpt_localkey != NULL) {
						mptcp_free_key(
						    mp_tp->mpt_localkey);
						mp_tp->mpt_localkey = NULL;
					}
				}
			}
			MPT_UNLOCK(mp_tp);
			if (wakeup) {
				/*
				 * Mark each subflow deletable and post a
				 * timed-out disconnect event so the MPTCP
				 * thread tears it down.
				 */
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					MPTS_LOCK(mpts);
					mpts->mpts_flags |= MPTSF_DELETEOK;
					if (mpts->mpts_soerror == 0)
						mpts->mpts_soerror = ETIMEDOUT;
					mptcp_subflow_eupcall(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
					MPTS_UNLOCK(mpts);
				}
			}
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		/* PCB not yet detached; keep the callout alive */
		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state));
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		/*
		 * The PCB has been detached, and there is exactly 1 refnct
		 * held by the MPTCP thread. Signal that thread to terminate,
		 * after which the last refcnt will be released. That will
		 * allow it to be destroyed below during the next round.
		 */
		if (mp_so->so_usecount == 1) {
			mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx scheduled for "
			    "termination [u=%d,r=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			/* signal MPTCP thread to terminate */
			mptcp_thread_terminate_signal(mpte);
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		/* No references left: dispose of the PCB for good */
		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mp_so->so_usecount, mp_so->so_retaincnt));
		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mp_pcbdispose(mpp);
	}

	return (active);
}
3077
3078/*
3079 * Drop a MPTCP connection, reporting the specified error.
3080 */
3081struct mptses *
3082mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3083{
3084 struct socket *mp_so;
3085
3086 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3087 MPT_LOCK_ASSERT_HELD(mp_tp);
3088 VERIFY(mpte->mpte_mptcb == mp_tp);
3089 mp_so = mpte->mpte_mppcb->mpp_socket;
3090
3091 mp_tp->mpt_state = MPTCPS_CLOSED;
3092 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
3093 uint32_t, 0 /* event */);
3094
3095 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3096 errno = mp_tp->mpt_softerror;
3097 mp_so->so_error = errno;
3098
3099 return (mptcp_close(mpte, mp_tp));
3100}
3101
/*
 * Close a MPTCP control block.
 *
 * Called with both the MP socket lock (MPTE) and the MPTCP PCB lock (MPT)
 * held; returns with MPT held on every path, including the MPTCPF_PEEL_OFF
 * early return.  Always returns NULL.
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPT_LOCK_ASSERT_HELD(mp_tp);
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	/* Return the local key to the shared key pool */
	if (mp_tp->mpt_localkey != NULL) {
		mptcp_free_key(mp_tp->mpt_localkey);
		mp_tp->mpt_localkey = NULL;
	}

	/* Drop MPT across soisdisconnected() to respect lock ordering */
	MPT_UNLOCK(mp_tp);
	soisdisconnected(mp_so);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
		/* Peeled-off socket: leave the subflows untouched */
		return (NULL);
	}
	MPT_UNLOCK(mp_tp);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		mptcp_subflow_disconnect(mpte, mpts, TRUE);
		MPTS_UNLOCK(mpts);
		mptcp_subflow_del(mpte, mpts, TRUE);
	}
	MPT_LOCK(mp_tp);

	return (NULL);
}
3140
3141void
3142mptcp_notify_close(struct socket *so)
3143{
3144 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3145}
3146
3147/*
3148 * Signal MPTCP thread to wake up.
3149 */
3150void
3151mptcp_thread_signal(struct mptses *mpte)
3152{
3153 lck_mtx_lock(&mpte->mpte_thread_lock);
3154 mptcp_thread_signal_locked(mpte);
3155 lck_mtx_unlock(&mpte->mpte_thread_lock);
3156}
3157
3158/*
3159 * Signal MPTCP thread to wake up (locked version)
3160 */
3161static void
3162mptcp_thread_signal_locked(struct mptses *mpte)
3163{
3164 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3165
3166 mpte->mpte_thread_reqs++;
3167 if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3168 wakeup_one((caddr_t)&mpte->mpte_thread);
3169}
3170
3171/*
3172 * Signal MPTCP thread to terminate.
3173 */
3174static void
3175mptcp_thread_terminate_signal(struct mptses *mpte)
3176{
3177 lck_mtx_lock(&mpte->mpte_thread_lock);
3178 if (mpte->mpte_thread != THREAD_NULL) {
3179 mpte->mpte_thread = THREAD_NULL;
3180 mpte->mpte_thread_reqs++;
3181 if (!mpte->mpte_thread_active)
3182 wakeup_one((caddr_t)&mpte->mpte_thread);
3183 }
3184 lck_mtx_unlock(&mpte->mpte_thread_lock);
3185}
3186
/*
 * MPTCP thread workloop.
 *
 * One pass of the worker thread: service input/events/output on each
 * subflow, then (if flagged by the events) apply a connection-wide
 * transition -- either a fallback to plain TCP or connecting all
 * pending subflows.
 */
static void
mptcp_thread_dowork(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	boolean_t conninfo_update = FALSE;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		MPTS_LOCK(mpts);
		MPTS_ADDREF_LOCKED(mpts);	/* for us */

		/* Update process ownership based on parent mptcp socket */
		mptcp_update_last_owner(mpts, mp_so);

		/* Drain input first, then process pending subflow events */
		mptcp_subflow_input(mpte, mpts);
		ret = mptcp_subflow_events(mpte, mpts);

		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			mptcplog3((LOG_INFO, "%s: cid %d \n", __func__,
			    mpts->mpts_connid));
			(void) mptcp_subflow_output(mpte, mpts);
		}

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING)
			mptcp_subflow_disconnect(mpte, mpts, FALSE);
		MPTS_UNLOCK(mpts);

		/* Fold this subflow's event verdict into the pass result */
		switch (ret) {
		case MPTS_EVRET_OK_UPDATE:
			conninfo_update = TRUE;
			break;
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			if (mptcp_delete_ok(mpte, mpts)) {
				mptcp_subflow_del(mpte, mpts, TRUE);
			}
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		}
		MPTS_REMREF(mpts);		/* ours */
	}

	if (conninfo_update) {
		soevent(mp_so, SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_CONNINFO_UPDATED);
	}

	/* Fast path: no connection-wide transition required */
	if (!connect_pending && !disconnect_fallback) {
		MPTE_UNLOCK(mpte);
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			/* Already degraded; nothing more to do */
			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				MPTS_UNLOCK(mpts);
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
			    MPTSF_DISCONNECTED)) {
				MPTS_UNLOCK(mpts);
				continue;
			}
			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback. This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			socket_lock(so, 1);
			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;
			if (mpts->mpts_flags & MPTSF_ACTIVE) {
				/* Primary subflow carries the fallback */
				socket_unlock(so, 1);
				MPTS_UNLOCK(mpts);
				continue;
			}
			/* Non-primary subflow: force a reset */
			tp->t_mpflags |= TMPF_RESET;
			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			socket_unlock(so, 1);

		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				(void) mptcp_subflow_soconnectx(mpte, mpts);
			}
		}
		MPTS_UNLOCK(mpts);
	}

	MPTE_UNLOCK(mpte);
}
3324
/*
 * MPTCP thread.
 *
 * Body of the per-connection worker thread: sleep until signalled via
 * mpte_thread_reqs, run mptcp_thread_dowork() until no new requests
 * arrive, and self-destruct once mpte_thread has been cleared.
 */
static void
mptcp_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct mptses *mpte = v;
	struct timespec *ts = NULL;	/* NULL timespec: sleep indefinitely */

	VERIFY(mpte != NULL);

	lck_mtx_lock_spin(&mpte->mpte_thread_lock);

	for (;;) {
		lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);

		if (mpte->mpte_thread != THREAD_NULL) {
			(void) msleep(&mpte->mpte_thread,
			    &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
			    __func__, ts);
		}

		/* MPTCP socket is closed? */
		if (mpte->mpte_thread == THREAD_NULL) {
			lck_mtx_unlock(&mpte->mpte_thread_lock);
			/* callee will destroy thread lock */
			mptcp_thread_destroy(mpte);
			/* NOTREACHED */
			return;
		}

		mpte->mpte_thread_active = 1;
		for (;;) {
			uint32_t reqs = mpte->mpte_thread_reqs;

			/* Drop the lock while doing the actual work */
			lck_mtx_unlock(&mpte->mpte_thread_lock);
			mptcp_thread_dowork(mpte);
			lck_mtx_lock_spin(&mpte->mpte_thread_lock);

			/* if there's no pending request, we're done */
			if (reqs == mpte->mpte_thread_reqs ||
			    mpte->mpte_thread == THREAD_NULL)
				break;
		}
		mpte->mpte_thread_reqs = 0;
		mpte->mpte_thread_active = 0;
	}
}
3374
/*
 * Destroy a MTCP thread, to be called in the MPTCP thread context
 * upon receiving an indication to self-terminate. This routine
 * will not return, as the current thread is terminated at the end.
 */
static void
mptcp_thread_destroy(struct mptses *mpte)
{
	struct socket *mp_so;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_thread == THREAD_NULL);
	VERIFY(mpte->mpte_mppcb != NULL);

	/* Tear down the MPTCP session state first */
	mptcp_sesdestroy(mpte);

	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);
	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for thread */
	mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
	MPTE_UNLOCK(mpte);

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());
	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
3404
3405/*
3406 * Protocol pr_lock callback.
3407 */
3408int
3409mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3410{
3411 struct mppcb *mpp = sotomppcb(mp_so);
3412 void *lr_saved;
3413
3414 if (lr == NULL)
3415 lr_saved = __builtin_return_address(0);
3416 else
3417 lr_saved = lr;
3418
3419 if (mpp == NULL) {
3420 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3421 mp_so, lr_saved, solockhistory_nr(mp_so));
3422 /* NOTREACHED */
3423 }
3424 lck_mtx_lock(&mpp->mpp_lock);
3425
3426 if (mp_so->so_usecount < 0) {
3427 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3428 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3429 solockhistory_nr(mp_so));
3430 /* NOTREACHED */
3431 }
3432 if (refcount != 0)
3433 mp_so->so_usecount++;
3434 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3435 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3436
3437 return (0);
3438}
3439
3440/*
3441 * Protocol pr_unlock callback.
3442 */
3443int
3444mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3445{
3446 struct mppcb *mpp = sotomppcb(mp_so);
3447 void *lr_saved;
3448
3449 if (lr == NULL)
3450 lr_saved = __builtin_return_address(0);
3451 else
3452 lr_saved = lr;
3453
3454 if (mpp == NULL) {
3455 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
3456 mp_so, mp_so->so_usecount, lr_saved,
3457 solockhistory_nr(mp_so));
3458 /* NOTREACHED */
3459 }
3460 lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
3461
3462 if (refcount != 0)
3463 mp_so->so_usecount--;
3464
3465 if (mp_so->so_usecount < 0) {
3466 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3467 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3468 /* NOTREACHED */
3469 }
3470 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
3471 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3472 lck_mtx_unlock(&mpp->mpp_lock);
3473
3474 return (0);
3475}
3476
3477/*
3478 * Protocol pr_getlock callback.
3479 */
3480lck_mtx_t *
3481mptcp_getlock(struct socket *mp_so, int locktype)
3482{
3483#pragma unused(locktype)
3484 struct mppcb *mpp = sotomppcb(mp_so);
3485
3486 if (mpp == NULL) {
3487 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
3488 solockhistory_nr(mp_so));
3489 /* NOTREACHED */
3490 }
3491 if (mp_so->so_usecount < 0) {
3492 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3493 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3494 /* NOTREACHED */
3495 }
3496 return (&mpp->mpp_lock);
3497}
3498
3499/*
3500 * Key generation functions
3501 */
3502static void
3503mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
3504{
3505 struct mptcp_key_entry *key_elm;
3506try_again:
3507 read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
3508 if (key_entry->mkey_value == 0)
3509 goto try_again;
3510 mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
3511 sizeof (key_entry->mkey_digest));
3512
3513 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3514 if (key_elm->mkey_value == key_entry->mkey_value) {
3515 goto try_again;
3516 }
3517 if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
3518 0) {
3519 goto try_again;
3520 }
3521 }
3522}
3523
3524static mptcp_key_t *
3525mptcp_reserve_key(void)
3526{
3527 struct mptcp_key_entry *key_elm;
3528 struct mptcp_key_entry *found_elm = NULL;
3529
3530 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3531 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3532 if (key_elm->mkey_flags == MKEYF_FREE) {
3533 key_elm->mkey_flags = MKEYF_INUSE;
3534 found_elm = key_elm;
3535 break;
3536 }
3537 }
3538 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3539
3540 if (found_elm) {
3541 return (&found_elm->mkey_value);
3542 }
3543
3544 key_elm = (struct mptcp_key_entry *)
3545 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
3546 key_elm->mkey_flags = MKEYF_INUSE;
3547
3548 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3549 mptcp_generate_unique_key(key_elm);
3550 LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
3551 mptcp_keys_pool.mkph_count += 1;
3552 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3553 return (&key_elm->mkey_value);
3554}
3555
3556static caddr_t
3557mptcp_get_stored_digest(mptcp_key_t *key)
3558{
3559 struct mptcp_key_entry *key_holder;
3560 caddr_t digest = NULL;
3561
3562 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3563 key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
3564 offsetof(struct mptcp_key_entry, mkey_value));
3565 if (key_holder->mkey_flags != MKEYF_INUSE)
3566 panic_plain("%s", __func__);
3567 digest = &key_holder->mkey_digest[0];
3568 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3569 return (digest);
3570}
3571
/*
 * Return a local key to the shared pool.
 *
 * Half the time the backing entry is freed outright; otherwise it is
 * re-inserted at a random position in the pool list so that a key just
 * released is not immediately handed out again.
 */
void
mptcp_free_key(mptcp_key_t *key)
{
	struct mptcp_key_entry *key_holder;
	struct mptcp_key_entry *key_elm;
	int pt = RandomULong();	/* randomness drives both decisions below */

	mptcplog((LOG_INFO, "%s\n", __func__));

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	/* Recover the pool entry embedding this key value */
	key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
	    offsetof(struct mptcp_key_entry, mkey_value));
	key_holder->mkey_flags = MKEYF_FREE;

	LIST_REMOVE(key_holder, mkey_next);
	mptcp_keys_pool.mkph_count -= 1;

	/* Free half the time */
	if (pt & 0x01) {
		zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
	} else {
		/* Insert it at random point to avoid early reuse */
		int i = 0;
		if (mptcp_keys_pool.mkph_count > 1) {
			pt = pt % (mptcp_keys_pool.mkph_count - 1);
			LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
				if (++i >= pt) {
					LIST_INSERT_AFTER(key_elm, key_holder,
					    mkey_next);
					break;
				}
			}
			/* The walk above must have found a slot */
			if (i < pt)
				panic("missed insertion");
		} else {
			LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
			    mkey_next);
		}
		mptcp_keys_pool.mkph_count += 1;
	}
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
}
3614
3615static void
3616mptcp_key_pool_init(void)
3617{
3618 int i;
3619 struct mptcp_key_entry *key_entry;
3620
3621 LIST_INIT(&mptcp_keys_pool);
3622 mptcp_keys_pool.mkph_count = 0;
3623
3624 mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
3625 (sizeof (struct mptcp_key_entry));
3626 mptcp_keys_pool.mkph_key_entry_zone = zinit(
3627 mptcp_keys_pool.mkph_key_elm_sz,
3628 MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
3629 MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
3630 if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
3631 panic("%s: unable to allocate MPTCP keys zone \n", __func__);
3632 /* NOTREACHED */
3633 }
3634 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
3635 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
3636
3637 for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
3638 key_entry = (struct mptcp_key_entry *)
3639 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
3640 key_entry->mkey_flags = MKEYF_FREE;
3641 mptcp_generate_unique_key(key_entry);
3642 LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
3643 mptcp_keys_pool.mkph_count += 1;
3644 }
3645 lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
3646 mtcbinfo.mppi_lock_attr);
3647}
3648
3649/*
3650 * MPTCP Join support
3651 */
3652
3653static void
3654mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
3655 connid_t conn_id)
3656{
3657 struct tcpcb *tp = sototcpcb(so);
3658 struct mptcp_subf_auth_entry *sauth_entry;
3659 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3660
3661 MPT_LOCK_SPIN(mp_tp);
3662 tp->t_mptcb = mp_tp;
3663 MPT_UNLOCK(mp_tp);
3664 /*
3665 * As long as the mpts_connid is unique it can be used as the
3666 * address ID for additional subflows.
3667 * The address ID of the first flow is implicitly 0.
3668 */
3669 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
3670 tp->t_local_aid = 0;
3671 } else {
3672 tp->t_local_aid = conn_id;
3673 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
3674 so->so_flags |= SOF_MP_SEC_SUBFLOW;
3675 }
3676 sauth_entry = zalloc(mpt_subauth_zone);
3677 sauth_entry->msae_laddr_id = tp->t_local_aid;
3678 sauth_entry->msae_raddr_id = 0;
3679 sauth_entry->msae_raddr_rand = 0;
3680try_again:
3681 sauth_entry->msae_laddr_rand = RandomULong();
3682 if (sauth_entry->msae_laddr_rand == 0)
3683 goto try_again;
3684 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
3685}
3686
3687static void
3688mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
3689{
3690 struct mptcp_subf_auth_entry *sauth_entry;
3691 struct tcpcb *tp = sototcpcb(so);
3692 int found = 0;
3693
3694 if (tp == NULL)
3695 return;
3696
3697 MPT_LOCK(mp_tp);
3698 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3699 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
3700 found = 1;
3701 break;
3702 }
3703 }
3704 if (found) {
3705 LIST_REMOVE(sauth_entry, msae_next);
3706 zfree(mpt_subauth_zone, sauth_entry);
3707 }
3708 tp->t_mptcb = NULL;
3709 MPT_UNLOCK(mp_tp);
3710}
3711
3712void
3713mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
3714 u_int32_t *rrand)
3715{
3716 struct mptcp_subf_auth_entry *sauth_entry;
3717 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3718
3719 MPT_LOCK(mp_tp);
3720 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3721 if (sauth_entry->msae_laddr_id == addr_id) {
3722 if (lrand)
3723 *lrand = sauth_entry->msae_laddr_rand;
3724 if (rrand)
3725 *rrand = sauth_entry->msae_raddr_rand;
3726 break;
3727 }
3728 }
3729 MPT_UNLOCK(mp_tp);
3730}
3731
/*
 * Record the remote address ID and remote random for the subflow whose
 * local address ID is laddr_id.  Logs and bails out when a conflicting
 * remote ID or a duplicate SYN/ACK random is observed.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* A different remote ID was already recorded */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				mptcplog((LOG_ERR, "MPTCP ERROR %s: mismatched"
				    " address ids %d %d \n", __func__, raddr_id,
				    sauth_entry->msae_raddr_id));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* A different random implies a duplicate SYN/ACK */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				mptcplog((LOG_ERR, "%s: dup SYN_ACK %d %d \n",
				    __func__, raddr_rand,
				    sauth_entry->msae_raddr_rand));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			MPT_UNLOCK(mp_tp);
			return;
		}
	}
	MPT_UNLOCK(mp_tp);
}
3766
3767/*
3768 * SHA1 support for MPTCP
3769 */
3770static int
3771mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
3772{
3773 SHA1_CTX sha1ctxt;
3774 const unsigned char *sha1_base;
3775 int sha1_size;
3776
3777 if (digest_len != SHA1_RESULTLEN) {
3778 return (FALSE);
3779 }
3780
3781 sha1_base = (const unsigned char *) key;
3782 sha1_size = sizeof (mptcp_key_t);
3783 SHA1Init(&sha1ctxt);
3784 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
3785 SHA1Final(sha_digest, &sha1ctxt);
3786 return (TRUE);
3787}
3788
3789void
3790mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
3791 u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
3792{
3793 SHA1_CTX sha1ctxt;
3794 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
3795 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
3796 u_int32_t data[2];
3797 int i;
3798
3799 bzero(digest, digest_len);
3800
3801 /* Set up the Key for HMAC */
3802 key_ipad[0] = key1;
3803 key_ipad[1] = key2;
3804
3805 key_opad[0] = key1;
3806 key_opad[1] = key2;
3807
3808 /* Set up the message for HMAC */
3809 data[0] = rand1;
3810 data[1] = rand2;
3811
3812 /* Key is 512 block length, so no need to compute hash */
3813
3814 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
3815
3816 for (i = 0; i < 8; i++) {
3817 key_ipad[i] ^= 0x3636363636363636;
3818 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
3819 }
3820
3821 /* Perform inner SHA1 */
3822 SHA1Init(&sha1ctxt);
3823 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
3824 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
3825 SHA1Final(digest, &sha1ctxt);
3826
3827 /* Perform outer SHA1 */
3828 SHA1Init(&sha1ctxt);
3829 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
3830 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
3831 SHA1Final(digest, &sha1ctxt);
3832}
3833
/*
 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
 */
void
mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
    int digest_len)
{
	uint32_t lrand, rrand;
	mptcp_key_t localkey, remotekey;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	/* Only a full-size SHA1 output buffer is supported */
	if (digest_len != SHA1_RESULTLEN)
		return;

	lrand = rrand = 0;
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
	/* Snapshot both keys under the lock before computing the HMAC */
	MPT_LOCK_SPIN(mp_tp);
	/*
	 * NOTE(review): assumes mpt_localkey is non-NULL here; confirm
	 * callers never reach this after the local key has been freed.
	 */
	localkey = *mp_tp->mpt_localkey;
	remotekey = mp_tp->mpt_remotekey;
	MPT_UNLOCK(mp_tp);
	mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
	    digest_len);
}
3858
3859u_int64_t
3860mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
3861{
3862 u_char digest[SHA1_RESULTLEN];
3863 u_int64_t trunced_digest;
3864
3865 mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
3866 bcopy(digest, &trunced_digest, 8);
3867 return (trunced_digest);
3868}
3869
3870/*
3871 * Authentication data generation
3872 */
3873int
3874mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
3875 int token_len)
3876{
3877 VERIFY(token_len == sizeof (u_int32_t));
3878 VERIFY(sha_digest_len == SHA1_RESULTLEN);
3879
3880 /* Most significant 32 bits of the SHA1 hash */
3881 bcopy(sha_digest, token, sizeof (u_int32_t));
3882 return (TRUE);
3883}
3884
3885int
3886mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
3887 int idsn_len)
3888{
3889 VERIFY(idsn_len == sizeof (u_int64_t));
3890 VERIFY(sha_digest_len == SHA1_RESULTLEN);
3891
3892 /*
3893 * Least significant 64 bits of the SHA1 hash
3894 */
3895
3896 idsn[7] = sha_digest[12];
3897 idsn[6] = sha_digest[13];
3898 idsn[5] = sha_digest[14];
3899 idsn[4] = sha_digest[15];
3900 idsn[3] = sha_digest[16];
3901 idsn[2] = sha_digest[17];
3902 idsn[1] = sha_digest[18];
3903 idsn[0] = sha_digest[19];
3904 return (TRUE);
3905}
3906
3907static int
3908mptcp_init_authparms(struct mptcb *mp_tp)
3909{
3910 caddr_t local_digest = NULL;
3911 char remote_digest[MPTCP_SHA1_RESULTLEN];
3912 MPT_LOCK_ASSERT_HELD(mp_tp);
3913
3914 /* Only Version 0 is supported for auth purposes */
3915 if (mp_tp->mpt_version != MP_DRAFT_VERSION_12)
3916 return (-1);
3917
3918 /* Setup local and remote tokens and Initial DSNs */
3919 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
3920 mptcp_generate_token(local_digest, SHA1_RESULTLEN,
3921 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
3922 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
3923 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
3924
3925 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
3926 SHA1_RESULTLEN)) {
3927 mptcplog((LOG_ERR, "MPTCP ERROR %s: unexpected failure",
3928 __func__));
3929 return (-1);
3930 }
3931 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
3932 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_localtoken));
3933 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
3934 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
3935 return (0);
3936}
3937
3938static void
3939mptcp_init_statevars(struct mptcb *mp_tp)
3940{
3941 MPT_LOCK_ASSERT_HELD(mp_tp);
3942
3943 /* The subflow SYN is also first MPTCP byte */
3944 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
3945 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3946
3947 mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
3948}
3949
3950static void
3951mptcp_conn_properties(struct mptcb *mp_tp)
3952{
3953 /* There is only Version 0 at this time */
3954 mp_tp->mpt_version = MP_DRAFT_VERSION_12;
3955
3956 /* Set DSS checksum flag */
3957 if (mptcp_dss_csum)
3958 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
3959
3960 /* Set up receive window */
3961 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
3962
3963 /* Set up gc ticks */
3964 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
3965}
3966
3967/*
3968 * Helper Functions
3969 */
3970mptcp_token_t
3971mptcp_get_localtoken(void* mptcb_arg)
3972{
3973 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3974 return (mp_tp->mpt_localtoken);
3975}
3976
3977mptcp_token_t
3978mptcp_get_remotetoken(void* mptcb_arg)
3979{
3980 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3981 return (mp_tp->mpt_remotetoken);
3982}
3983
3984u_int64_t
3985mptcp_get_localkey(void* mptcb_arg)
3986{
3987 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3988 if (mp_tp->mpt_localkey != NULL)
3989 return (*mp_tp->mpt_localkey);
3990 else
3991 return (0);
3992}
3993
3994u_int64_t
3995mptcp_get_remotekey(void* mptcb_arg)
3996{
3997 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3998 return (mp_tp->mpt_remotekey);
3999}
4000
4001void
4002mptcp_send_dfin(struct socket *so)
4003{
4004 struct tcpcb *tp = NULL;
4005 struct inpcb *inp = NULL;
4006
4007 inp = sotoinpcb(so);
4008 if (!inp)
4009 return;
4010
4011 tp = intotcpcb(inp);
4012 if (!tp)
4013 return;
4014
4015 if (!(tp->t_mpflags & TMPF_RESET))
4016 tp->t_mpflags |= TMPF_SEND_DFIN;
4017}
4018
/*
 * Data Sequence Mapping routines
 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL)
		return;

	/* The MPTCP PCB is embedded alongside the MP PCB */
	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		MPT_UNLOCK(mp_tp);
		panic("%s: data write before establishment.",
		    __func__);
		return;
	}

	/*
	 * Stamp each packet in the chain with the next data sequence
	 * number and its length, advancing mpt_sndmax as we go.
	 */
	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		m->m_pkthdr.mp_rlen = m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
	MPT_UNLOCK(mp_tp);
}
4049
/*
 * Adjust the DSS mappings on an mbuf chain before `len' bytes are
 * dropped from its front (e.g. on sbdrop of acked data).  Each mbuf's
 * mapping is advanced past the dropped bytes so the remaining mappings
 * stay consistent with the data left in the buffer.
 */
void
mptcp_preproc_sbdrop(struct mbuf *m, unsigned int len)
{
	u_int32_t sub_len = 0;

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);

		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
			sub_len = m->m_pkthdr.mp_rlen;

			if (sub_len < len) {
				/*
				 * This mapping is fully consumed by the
				 * drop: exhaust it (rlen = 0) and keep
				 * walking with the remainder of len.
				 */
				m->m_pkthdr.mp_dsn += sub_len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					/*
					 * mp_rseq is only meaningful for
					 * mappings not generated at the
					 * MP-socket layer (PKTF_MPSO).
					 */
					m->m_pkthdr.mp_rseq += sub_len;
				}
				m->m_pkthdr.mp_rlen = 0;
				len -= sub_len;
			} else {
				/* sub_len >= len */
				/* Partial drop: shrink this mapping and stop. */
				m->m_pkthdr.mp_dsn += len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					m->m_pkthdr.mp_rseq += len;
				}
				mptcplog3((LOG_INFO,
				    "%s: %llu %u %d %d\n", __func__,
				    m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
				    m->m_pkthdr.mp_rlen, len));
				m->m_pkthdr.mp_rlen -= len;
				return;
			}
		} else {
			/* Every mbuf in a subflow sndbuf must carry a mapping */
			panic("%s: MPTCP tag not set", __func__);
			/* NOTREACHED */
		}
		m = m->m_next;
	}
}
4088
4089/* Obtain the DSN mapping stored in the mbuf */
4090void
4091mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4092 u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4093{
4094 u_int64_t dsn64;
4095
4096 mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4097 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4098 *dsn64p = dsn64;
4099}
4100
/*
 * Locate the DSN mapping covering byte `off' of the subflow send
 * buffer and report the 64-bit DSN, the relative subflow sequence
 * number, and the length of the contiguous-DSN run starting there
 * (capped at both `datalen' and UINT16_MAX).
 *
 * NOTE(review): returns without writing *dsn/*relseq/*data_len when
 * the send buffer is empty or off < 0 — callers must pre-initialize
 * their output variables.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
    u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
{
	struct mbuf *m = so->so_snd.sb_mb;
	struct mbuf *mnext = NULL;
	uint32_t runlen = 0;
	u_int64_t dsn64;
	uint32_t contig_len = 0;

	if (m == NULL)
		return;

	if (off < 0)
		return;
	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	/* Skip whole mappings until the one containing `off' */
	while (m) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);

		if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
			off -= m->m_pkthdr.mp_rlen;
			m = m->m_next;
		} else {
			break;
		}
	}

	if (m == NULL) {
		/* off exceeded the data in the send buffer */
		panic("%s: bad offset", __func__);
		/* NOTREACHED */
	}

	dsn64 = m->m_pkthdr.mp_dsn + off;
	*dsn = dsn64;
	*relseq = m->m_pkthdr.mp_rseq + off;

	/*
	 * Now find the last contiguous byte and its length from
	 * start.
	 */
	runlen = m->m_pkthdr.mp_rlen - off;
	contig_len = runlen;

	/* If datalen does not span multiple mbufs, return */
	if (datalen <= runlen) {
		*data_len = min(datalen, UINT16_MAX);
		return;
	}

	/* Extend the run across mbufs whose DSNs continue seamlessly */
	mnext = m->m_next;
	while (datalen > runlen) {
		if (mnext == NULL) {
			panic("%s: bad datalen = %d, %d %d", __func__, datalen,
			    runlen, off);
			/* NOTREACHED */
		}
		VERIFY(mnext->m_flags & M_PKTHDR);
		VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);

		/*
		 * case A. contiguous DSN stream
		 * case B. discontiguous DSN stream
		 */
		if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
			/* case A */
			runlen += mnext->m_pkthdr.mp_rlen;
			contig_len += mnext->m_pkthdr.mp_rlen;
			mptcplog3((LOG_INFO, "%s: contig \n",
			    __func__));
		} else {
			/* case B */
			mptcplog((LOG_INFO, "%s: discontig %d %d \n",
			    __func__, datalen, contig_len));
			break;
		}
		mnext = mnext->m_next;
	}
	/* DSS length field is 16 bits wide; clamp before comparing */
	datalen = min(datalen, UINT16_MAX);
	*data_len = min(datalen, contig_len);
	mptcplog3((LOG_INFO, "%s: %llu %u %d %d \n", __func__,
	    *dsn, *relseq, *data_len, off));
}
4190
/*
 * MPTCP's notion of the next insequence Data Sequence number is adjusted
 * here. It must be called from mptcp_adj_rmap() which is called only after
 * reassembly of out of order data. The rcvnxt variable must
 * be updated only when at least some insequence new data is received.
 */
4197static void
4198mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4199{
4200 struct mptcb *mp_tp = tptomptp(tp);
4201
4202 if (mp_tp == NULL)
4203 return;
4204 MPT_LOCK(mp_tp);
4205 if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4206 (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4207 m->m_pkthdr.mp_rlen)))) {
4208 mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4209 }
4210 MPT_UNLOCK(mp_tp);
4211}
4212
4213/*
4214 * Note that this is called only from tcp_input() which may trim data
4215 * after the dsn mapping is inserted into the mbuf. When it trims data
4216 * tcp_input calls m_adj() which does not remove the m_pkthdr even if the
4217 * m_len becomes 0 as a result of trimming the mbuf. The dsn map insertion
4218 * cannot be delayed after trim, because data can be in the reassembly
4219 * queue for a while and the DSN option info in tp will be overwritten for
4220 * every new packet received.
4221 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4222 * with mptcp_adj_rmap()
4223 */
4224void
4225mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4226{
4227 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4228
4229 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4230 VERIFY(m->m_flags & M_PKTHDR);
4231 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4232 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4233 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4234 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4235 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4236 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4237 }
4238}
4239
/*
 * Reconcile an mbuf's DSN mapping with data trimmed by tcp_input()
 * after the mapping was inserted, just before the mbuf is appended to
 * the subflow receive buffer.  Updates mpt_rcvnxt via
 * mptcp_adj_rcvnxt() once the mapping is consistent.
 */
void
mptcp_adj_rmap(struct socket *so, struct mbuf *m)
{
	u_int64_t dsn;
	u_int32_t sseq, datalen;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	u_int32_t old_rcvnxt = 0;

	if (m_pktlen(m) == 0)
		return;

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		VERIFY(m->m_flags & M_PKTHDR);

		/* Absolute subflow seq = relative seq + initial recv seq */
		dsn = m->m_pkthdr.mp_dsn;
		sseq = m->m_pkthdr.mp_rseq + tp->irs;
		datalen = m->m_pkthdr.mp_rlen;
	} else {
		/* data arrived without an DSS option mapping */
		mptcp_notify_mpfail(so);
		return;
	}

	/* In the common case, data is in window and in sequence */
	if (m->m_pkthdr.len == (int)datalen) {
		mptcp_adj_rcvnxt(tp, m);
		return;
	}

	if (m->m_pkthdr.len > (int)datalen) {
		/* The mapping can never cover less than the mbuf payload */
		panic("%s: mbuf len = %d expected = %d", __func__,
		    m->m_pkthdr.len, datalen);
	}

	/* Subflow sequence the mbuf's first byte originally had */
	old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
	if (SEQ_GT(old_rcvnxt, sseq)) {
		/* data trimmed from the left */
		int off = old_rcvnxt - sseq;
		m->m_pkthdr.mp_dsn += off;
		m->m_pkthdr.mp_rseq += off;
		m->m_pkthdr.mp_rlen -= off;
	} else if (old_rcvnxt == sseq) {
		/*
		 * Data was trimmed from the right
		 */
		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
	} else {
		/* XXX handle gracefully with reass or fallback in January */
		panic("%s: partial map %u %u", __func__, old_rcvnxt, sseq);
		/* NOTREACHED */
	}
	mptcp_adj_rcvnxt(tp, m);

}
4294
4295/*
4296 * Following routines help with failure detection and failover of data
4297 * transfer from one subflow to another.
4298 */
4299void
4300mptcp_act_on_txfail(struct socket *so)
4301{
4302 struct tcpcb *tp = NULL;
4303 struct inpcb *inp = sotoinpcb(so);
4304
4305 if (inp == NULL)
4306 return;
4307
4308 tp = intotcpcb(inp);
4309 if (tp == NULL)
4310 return;
4311
4312 if (tp->t_state != TCPS_ESTABLISHED)
4313 mptcplog((LOG_INFO, "%s: state = %d \n", __func__,
4314 tp->t_state));
4315
4316 if (so->so_flags & SOF_MP_TRYFAILOVER) {
4317 return;
4318 }
4319
4320 so->so_flags |= SOF_MP_TRYFAILOVER;
4321 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4322}
4323
4324/*
4325 * Support for MP_FAIL option
4326 */
4327int
4328mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4329{
4330 struct mbuf *m = so->so_snd.sb_mb;
4331 u_int64_t dsn;
4332 int off = 0;
4333 u_int32_t datalen;
4334
4335 if (m == NULL)
4336 return (-1);
4337
4338 while (m != NULL) {
4339 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4340 VERIFY(m->m_flags & M_PKTHDR);
4341 dsn = m->m_pkthdr.mp_dsn;
4342 datalen = m->m_pkthdr.mp_rlen;
4343 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4344 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4345 off = dsn_fail - dsn;
4346 *tcp_seq = m->m_pkthdr.mp_rseq + off;
4347 return (0);
4348 }
4349
4350 m = m->m_next;
4351 }
4352
4353 /*
4354 * If there was no mbuf data and a fallback to TCP occurred, there's
4355 * not much else to do.
4356 */
4357
4358 mptcplog((LOG_ERR, "%s: %llu not found \n", __func__, dsn_fail));
4359 return (-1);
4360}
4361
4362/*
4363 * Support for sending contiguous MPTCP bytes in subflow
4364 */
4365int32_t
4366mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4367{
4368 u_int64_t mdss_dsn = 0;
4369 u_int32_t mdss_subflow_seq = 0;
4370 u_int16_t mdss_data_len = 0;
4371
4372 if (len == 0)
4373 return (len);
4374
4375 mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4376 &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4377
4378 return (mdss_data_len);
4379}
4380
4381int32_t
4382mptcp_sbspace(struct mptcb *mpt)
4383{
4384 struct sockbuf *sb;
4385 uint32_t rcvbuf;
4386 int32_t space;
4387
4388 MPT_LOCK_ASSERT_HELD(mpt);
4389 MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4390
4391 sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4392 rcvbuf = sb->sb_hiwat;
4393 space = ((int32_t)imin((rcvbuf - sb->sb_cc),
4394 (sb->sb_mbmax - sb->sb_mbcnt)));
4395 if (space < 0)
4396 space = 0;
4397 /* XXX check if it's too small? */
4398
4399 return (space);
4400}
4401
4402/*
4403 * Support Fallback to Regular TCP
4404 */
4405void
4406mptcp_notify_mpready(struct socket *so)
4407{
4408 struct tcpcb *tp = NULL;
4409
4410 if (so == NULL)
4411 return;
4412
4413 tp = intotcpcb(sotoinpcb(so));
4414
4415 if (tp == NULL)
4416 return;
4417
4418 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
4419 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4420 struct tcpcb *, tp);
4421
4422 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
4423 return;
4424
4425 if (tp->t_mpflags & TMPF_MPTCP_READY)
4426 return;
4427
4428 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
4429 tp->t_mpflags |= TMPF_MPTCP_READY;
4430
4431 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4432}
4433
4434void
4435mptcp_notify_mpfail(struct socket *so)
4436{
4437 struct tcpcb *tp = NULL;
4438
4439 if (so == NULL)
4440 return;
4441
4442 tp = intotcpcb(sotoinpcb(so));
4443
4444 if (tp == NULL)
4445 return;
4446
4447 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
4448 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4449 struct tcpcb *, tp);
4450
4451 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
4452 return;
4453
4454 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4455 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4456
4457 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4458}
4459
4460/*
4461 * Keepalive helper function
4462 */
4463boolean_t
4464mptcp_ok_to_keepalive(struct mptcb *mp_tp)
4465{
4466 boolean_t ret = 1;
4467 VERIFY(mp_tp != NULL);
4468 MPT_LOCK(mp_tp);
4469 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
4470 ret = 0;
4471 }
4472 MPT_UNLOCK(mp_tp);
4473 return (ret);
4474}
4475
4476/*
4477 * MPTCP t_maxseg adjustment function
4478 */
/*
 * Compute how much t_maxseg must be lowered to leave room for the most
 * common MPTCP option (DSS+ACK).  Applied for the first subflow during
 * pre-establishment, for joining subflows, and on MTU discovery once
 * MPTCP is confirmed.  Returns 0 when the tcpcb has no MPTCP PCB.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/*
 * NOTE(review): both branches below add 2 — with checksum the option
 * carries a 2-byte checksum field; without it 2 bytes of padding align
 * the option to a 32-bit boundary (per the in-macro comment).  The
 * macro assigns (not accumulates) mss_lower, so invoking it more than
 * once is idempotent.
 */
#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	MPT_LOCK(mp_tp); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
		mss_lower += 2; \
	else \
		/* adjust to 32-bit boundary + EOL */ \
		mss_lower += 2; \
	MPT_UNLOCK(mp_tp); \
}
	if (mp_tp == NULL)
		return (0);

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
		MPTCP_COMPUTE_LEN;
	}

	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (tp->t_mpflags & TMPF_SENT_JOIN)) {
		MPTCP_COMPUTE_LEN;
	}

	if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
		MPTCP_COMPUTE_LEN;
	}

	return (mss_lower);
}
4519
4520/*
4521 * Update the pid, upid, uuid of the subflow so, based on parent so
4522 */
4523void
4524mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
4525{
4526 struct socket *subflow_so = mpts->mpts_socket;
4527
4528 MPTS_LOCK_ASSERT_HELD(mpts);
4529
4530 socket_lock(subflow_so, 0);
4531 if ((subflow_so->last_pid != parent_mpso->last_pid) ||
4532 (subflow_so->last_upid != parent_mpso->last_upid)) {
4533 subflow_so->last_upid = parent_mpso->last_upid;
4534 subflow_so->last_pid = parent_mpso->last_pid;
4535 uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
4536 }
4537 so_update_policy(subflow_so);
4538 socket_unlock(subflow_so, 0);
4539}
4540
/*
 * Populate an mptcp_flow_t record for one subflow: connection info,
 * local/remote endpoint addresses (v4 or v6), subflow flags and
 * connection id.  Caller holds the subflow socket lock.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else
#endif
	{
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
}
4573
/*
 * sysctl handler: export every MPTCP connection as a conninfo_mptcp_t
 * header followed by one mptcp_flow_t per subflow.  Read-only; a size
 * probe (oldptr == NULL) returns a padded estimate so userland can
 * allocate before the real copy-out.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t n, len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows;

	/* This sysctl is read-only */
	if (req->newptr != USER_ADDR_NULL)
		return (EPERM);

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	n = mtcbinfo.mppi_count;
	if (req->oldptr == USER_ADDR_NULL) {
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		/* Size estimate, padded by 1/8 for growth between calls */
		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n/8) * sizeof(mptcp_flow_t);
		return (0);
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		bzero(&mptcpci, sizeof(mptcpci));
		lck_mtx_lock(&mpp->mpp_lock);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);
		len = sizeof(*flows) * mpte->mpte_numflows;
		flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
		if (flows == NULL) {
			lck_mtx_unlock(&mpp->mpp_lock);
			break;
		}
		/* N.B. we don't take the mpt_lock just for the state. */
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		/* Record length covers the header plus its trailing flows */
		mptcpci.mptcpci_len = sizeof(mptcpci) +
		    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
		/*
		 * Copy out the header minus its embedded first flow slot;
		 * all flows are copied out as a separate array below.
		 */
		error = SYSCTL_OUT(req, &mptcpci,
		    sizeof(mptcpci) - sizeof(*flows));
		if (error) {
			lck_mtx_unlock(&mpp->mpp_lock);
			FREE(flows, M_TEMP);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts);
			so = mpts->mpts_socket;
			socket_lock(so, 0);
			fill_mptcp_subflow(so, &flows[f], mpts);
			socket_unlock(so, 0);
			MPTS_UNLOCK(mpts);
			f++;
		}
		lck_mtx_unlock(&mpp->mpp_lock);
		error = SYSCTL_OUT(req, flows, len);
		FREE(flows, M_TEMP);
		if (error)
			break;
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return (error);
}
4645
/* net.inet.mptcp.pcblist: dump all active MPTCP connections and subflows */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");