]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/mptcp_subr.c
xnu-3248.20.55.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
CommitLineData
39236c6e 1/*
3e170ce0 2 * Copyright (c) 2012-2015 Apple Inc. All rights reserved.
39236c6e
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
3e170ce0 50#include <net/if_var.h>
39236c6e
A
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/in_var.h>
54#include <netinet/tcp.h>
55#include <netinet/tcp_fsm.h>
56#include <netinet/tcp_seq.h>
57#include <netinet/tcp_var.h>
58#include <netinet/mptcp_var.h>
59#include <netinet/mptcp.h>
60#include <netinet/mptcp_seq.h>
61#include <netinet/mptcp_timer.h>
62#include <libkern/crypto/sha1.h>
63#if INET6
64#include <netinet6/in6_pcb.h>
65#include <netinet6/ip6protosw.h>
66#endif /* INET6 */
67#include <dev/random/randomdev.h>
68
69/*
70 * Notes on MPTCP implementation.
71 *
72 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
73 * communication domain. The structure mtcbinfo describes the MPTCP instance
74 * of a Multipath protocol in that domain. It is used to keep track of all
75 * MPTCP PCB instances in the system, and is protected by the global lock
76 * mppi_lock.
77 *
78 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
79 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
80 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
81 * allocated from the same memory block, and each structure has a pointer
82 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
83 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
84 * PCB (mppcb) as well as the MPTCP Session (mptses).
85 *
86 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
87 * in particular, the list of subflows as well as the MPTCP thread.
88 *
89 * A functioning MPTCP Session consists of one or more subflow sockets. Each
90 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
91 * represented by the mptsub structure. Because each subflow requires access
92 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
93 * subflow. This gets decremented prior to the subflow's destruction. The
94 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
95 *
96 * To handle events (read, write, control) from the subflows, an MPTCP thread
97 * is created; currently, there is one thread per MPTCP Session. In order to
98 * prevent the MPTCP socket from being destroyed while being accessed by the
99 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
100 * which will be decremented prior to the thread's termination. The thread
101 * lock (mpte_thread_lock) is used to synchronize its signalling.
102 *
103 * Lock ordering is defined as follows:
104 *
105 * mtcbinfo (mppi_lock)
106 * mp_so (mpp_lock)
107 * mpts (mpts_lock)
108 * so (inpcb_mtx)
109 * mptcb (mpt_lock)
110 *
111 * It is not a requirement that all of the above locks need to be acquired
112 * in succession, but the correct lock ordering must be followed when more
113 * than one lock needs to be held. The MPTCP thread lock is not
114 * constrained by this arrangement, because none of the other locks
115 * is ever acquired while holding mpte_thread_lock; therefore it may be taken
116 * at any moment to signal the thread.
117 *
118 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
119 * work is done by the MPTCP garbage collector which is invoked on demand by
120 * the PF_MULTIPATH garbage collector. This process will take place once all
121 * of the subflows have been destroyed, and the MPTCP thread has been
122 * instructed to self-terminate.
123 */
124
125static void mptcp_sesdestroy(struct mptses *);
126static void mptcp_thread_signal_locked(struct mptses *);
127static void mptcp_thread_terminate_signal(struct mptses *);
128static void mptcp_thread_dowork(struct mptses *);
129static void mptcp_thread_func(void *, wait_result_t);
130static void mptcp_thread_destroy(struct mptses *);
131static void mptcp_key_pool_init(void);
fe8ab488 132static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
39236c6e
A
133static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
134static void mptcp_conn_properties(struct mptcb *);
135static void mptcp_init_statevars(struct mptcb *);
136
137static uint32_t mptcp_gc(struct mppcbinfo *);
138static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
139 int, struct proc *, struct socket **);
140static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
141static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
142static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
143 struct uio *, struct mbuf **, struct mbuf **, int *);
144static void mptcp_subflow_rupcall(struct socket *, void *, int);
145static void mptcp_subflow_input(struct mptses *, struct mptsub *);
146static void mptcp_subflow_wupcall(struct socket *, void *, int);
147static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
148static void mptcp_update_last_owner(struct mptsub *, struct socket *);
fe8ab488 149static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
3e170ce0 150static void mptcp_get_rtt_measurement(struct mptsub *, struct mptses *);
39236c6e
A
151
/*
 * Return values of the subflow event handlers.
 *
 * Success values are greater than or equal to MPTS_EVRET_OK.  Anything
 * smaller signals an error or an action that needs immediate attention;
 * such a value stops the remaining handlers from processing their events
 * until the next round of event processing.
 */
typedef enum {
	/* delete this subflow */
	MPTS_EVRET_DELETE		= 1,
	/* OK */
	MPTS_EVRET_OK			= 2,
	/* resume pended connects */
	MPTS_EVRET_CONNECT_PENDING	= 3,
	/* abort all but preferred */
	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,
} ev_ret_t;
165
3e170ce0
A
166static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
167static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *, uint64_t *);
168static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
169static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *, uint64_t *);
170static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *, uint64_t *);
171static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *);
172static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *);
173static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *);
174static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *, uint64_t *);
175static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *, uint64_t *);
176static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *);
177static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *);
178static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *);
179static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *);
180static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *, uint64_t *);
181static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *, uint64_t *);
182static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
fe8ab488 183
39236c6e
A
184static const char *mptcp_evret2str(ev_ret_t);
185
186static mptcp_key_t *mptcp_reserve_key(void);
187static int mptcp_do_sha1(mptcp_key_t *, char *, int);
188static int mptcp_init_authparms(struct mptcb *);
39236c6e
A
189
190static unsigned int mptsub_zone_size; /* size of mptsub */
191static struct zone *mptsub_zone; /* zone for mptsub */
192
193static unsigned int mptopt_zone_size; /* size of mptopt */
194static struct zone *mptopt_zone; /* zone for mptopt */
195
196static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
197static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
198
199struct mppcbinfo mtcbinfo;
200
201static struct mptcp_keys_pool_head mptcp_keys_pool;
202
203#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
204#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
205
206SYSCTL_DECL(_net_inet);
207
208SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
209
3e170ce0
A
210uint32_t mptcp_dbg_area = 0; /* more noise if greater than 1 */
211SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
212 &mptcp_dbg_area, 0, "MPTCP debug area");
213
214uint32_t mptcp_dbg_level = 0;
215SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
216 &mptcp_dbg_level, 0, "MPTCP debug level");
217
39236c6e
A
218
219SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
220 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
221
222/*
223 * Since there is one kernel thread per mptcp socket, imposing an artificial
224 * limit on number of allowed mptcp sockets.
225 */
226uint32_t mptcp_socket_limit = MPPCB_LIMIT;
227SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
228 &mptcp_socket_limit, 0, "MPTCP socket limit");
229
fe8ab488
A
230/*
231 * SYSCTL to turn on delayed cellular subflow start.
232 */
233uint32_t mptcp_delayed_subf_start = 0;
234SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
235 &mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
236
237/*
3e170ce0 238 * sysctl to use network status hints from symptomsd
fe8ab488 239 */
3e170ce0
A
240uint32_t mptcp_use_symptomsd = 1;
241SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, usesymptoms, CTLFLAG_RW|CTLFLAG_LOCKED,
242 &mptcp_use_symptomsd, 0, "MPTCP Use SymptomsD");
fe8ab488 243
39236c6e
A
244static struct protosw mptcp_subflow_protosw;
245static struct pr_usrreqs mptcp_subflow_usrreqs;
246#if INET6
247static struct ip6protosw mptcp_subflow_protosw6;
248static struct pr_usrreqs mptcp_subflow_usrreqs6;
249#endif /* INET6 */
250
3e170ce0
A
251typedef struct mptcp_subflow_event_entry {
252 uint64_t sofilt_hint_mask;
253 ev_ret_t (*sofilt_hint_ev_hdlr)(
254 struct mptses *mpte,
255 struct mptsub *mpts,
256 uint64_t *p_mpsofilt_hint);
257} mptsub_ev_entry_t;
258
259static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
260 {
261 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
262 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
263 },
264 {
265 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
266 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
267 },
268 {
269 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
270 .sofilt_hint_ev_hdlr = mptcp_subflow_connreset_ev,
271 },
272 {
273 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
274 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
275 },
276 {
277 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
278 .sofilt_hint_ev_hdlr = mptcp_subflow_cantrcvmore_ev,
279 },
280 { .sofilt_hint_mask = SO_FILT_HINT_CANTSENDMORE,
281 .sofilt_hint_ev_hdlr = mptcp_subflow_cantsendmore_ev,
282 },
283 {
284 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
285 .sofilt_hint_ev_hdlr = mptcp_subflow_timeout_ev,
286 },
287 {
288 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
289 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
290 },
291 {
292 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
293 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
294 },
295 {
296 .sofilt_hint_mask = SO_FILT_HINT_SUSPEND,
297 .sofilt_hint_ev_hdlr = mptcp_subflow_suspend_ev,
298 },
299 {
300 .sofilt_hint_mask = SO_FILT_HINT_RESUME,
301 .sofilt_hint_ev_hdlr = mptcp_subflow_resume_ev,
302 },
303 {
304 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
305 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
306 },
307 {
308 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
309 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
310 },
311 {
312 .sofilt_hint_mask = SO_FILT_HINT_DELETEOK,
313 .sofilt_hint_ev_hdlr = mptcp_deleteok_ev,
314 },
315 {
316 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
317 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
318 },
319 {
320 .sofilt_hint_mask = SO_FILT_HINT_MPFASTJ,
321 .sofilt_hint_ev_hdlr = mptcp_fastjoin_ev,
322 }
323};
324
39236c6e
A
325/*
326 * Protocol pr_init callback.
327 */
328void
329mptcp_init(struct protosw *pp, struct domain *dp)
330{
331#pragma unused(dp)
332 static int mptcp_initialized = 0;
333 struct protosw *prp;
334#if INET6
335 struct ip6protosw *prp6;
336#endif /* INET6 */
337
338 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
339
340 /* do this only once */
341 if (mptcp_initialized)
342 return;
343 mptcp_initialized = 1;
344
345 /*
346 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
347 * we must be able to find IPPROTO_TCP entries for both.
348 */
349 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
350 VERIFY(prp != NULL);
351 bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
352 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
353 sizeof (mptcp_subflow_usrreqs));
354 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
355 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
356 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
357 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
358 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
359 /*
360 * Socket filters shouldn't attach/detach to/from this protosw
361 * since pr_protosw is to be used instead, which points to the
362 * real protocol; if they do, it is a bug and we should panic.
363 */
364 mptcp_subflow_protosw.pr_filter_head.tqh_first =
365 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
366 mptcp_subflow_protosw.pr_filter_head.tqh_last =
367 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
368
369#if INET6
370 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
371 IPPROTO_TCP, SOCK_STREAM);
372 VERIFY(prp6 != NULL);
373 bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
374 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
375 sizeof (mptcp_subflow_usrreqs6));
376 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
377 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
378 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
379 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
380 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
381 /*
382 * Socket filters shouldn't attach/detach to/from this protosw
383 * since pr_protosw is to be used instead, which points to the
384 * real protocol; if they do, it is a bug and we should panic.
385 */
386 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
387 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
388 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
389 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
390#endif /* INET6 */
391
392 bzero(&mtcbinfo, sizeof (mtcbinfo));
393 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
394 mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
395 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
396 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
397 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
398 /* NOTREACHED */
399 }
400 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
401 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
402
403 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
404 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
405 mtcbinfo.mppi_lock_grp_attr);
406 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
407 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
408 mtcbinfo.mppi_lock_attr);
39236c6e 409
3e170ce0 410 mtcbinfo.mppi_gc = mptcp_gc;
39236c6e 411 mtcbinfo.mppi_timer = mptcp_timer;
3e170ce0 412 mtcbinfo.mppi_pcbe_create = mptcp_sescreate;
39236c6e
A
413
414 /* attach to MP domain for garbage collection to take place */
415 mp_pcbinfo_attach(&mtcbinfo);
416
417 mptsub_zone_size = sizeof (struct mptsub);
418 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
419 8192, "mptsub")) == NULL) {
420 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
421 /* NOTREACHED */
422 }
423 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
424 zone_change(mptsub_zone, Z_EXPAND, TRUE);
425
426 mptopt_zone_size = sizeof (struct mptopt);
427 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
428 1024, "mptopt")) == NULL) {
429 panic("%s: unable to allocate MPTCP option zone\n", __func__);
430 /* NOTREACHED */
431 }
432 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
433 zone_change(mptopt_zone, Z_EXPAND, TRUE);
434
435 mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
436 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
437 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
438 panic("%s: unable to allocate MPTCP address auth zone \n",
439 __func__);
440 /* NOTREACHED */
441 }
442 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
443 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
444
445 /* Set up a list of unique keys */
446 mptcp_key_pool_init();
39236c6e
A
447}
448
449/*
450 * Create an MPTCP session, called as a result of opening a MPTCP socket.
451 */
3e170ce0 452void *
39236c6e
A
453mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
454{
455 struct mppcbinfo *mppi;
456 struct mptses *mpte;
457 struct mptcb *mp_tp;
458 int error = 0;
459
460 VERIFY(mpp != NULL);
461 mppi = mpp->mpp_pcbinfo;
462 VERIFY(mppi != NULL);
463
3e170ce0
A
464 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
465 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
39236c6e
A
466
467 /* MPTCP Multipath PCB Extension */
468 bzero(mpte, sizeof (*mpte));
469 VERIFY(mpp->mpp_pcbe == NULL);
470 mpp->mpp_pcbe = mpte;
471 mpte->mpte_mppcb = mpp;
472 mpte->mpte_mptcb = mp_tp;
473
474 TAILQ_INIT(&mpte->mpte_sopts);
475 TAILQ_INIT(&mpte->mpte_subflows);
3e170ce0
A
476 mpte->mpte_associd = SAE_ASSOCID_ANY;
477 mpte->mpte_connid_last = SAE_CONNID_ANY;
39236c6e
A
478
479 lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
480 mppi->mppi_lock_attr);
481
482 /*
483 * XXX: adi@apple.com
484 *
485 * This can be rather expensive if we have lots of MPTCP sockets,
486 * but we need a kernel thread for this model to work. Perhaps we
487 * could amortize the costs by having one worker thread per a group
488 * of MPTCP sockets.
489 */
490 if (kernel_thread_start(mptcp_thread_func, mpte,
491 &mpte->mpte_thread) != KERN_SUCCESS) {
492 error = ENOBUFS;
493 goto out;
494 }
495 mp_so->so_usecount++; /* for thread */
496
497 /* MPTCP Protocol Control Block */
498 bzero(mp_tp, sizeof (*mp_tp));
499 lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
500 mppi->mppi_lock_attr);
501 mp_tp->mpt_mpte = mpte;
3e170ce0 502 mp_tp->mpt_state = MPTCPS_CLOSED;
39236c6e
A
503out:
504 if (error != 0)
505 lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
506 DTRACE_MPTCP5(session__create, struct socket *, mp_so,
507 struct sockbuf *, &mp_so->so_rcv,
508 struct sockbuf *, &mp_so->so_snd,
509 struct mppcb *, mpp, int, error);
510
511 return ((error != 0) ? NULL : mpte);
512}
513
514/*
515 * Destroy an MPTCP session.
516 */
517static void
518mptcp_sesdestroy(struct mptses *mpte)
519{
520 struct mptcb *mp_tp;
521
522 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
523
524 mp_tp = mpte->mpte_mptcb;
525 VERIFY(mp_tp != NULL);
526
527 /*
528 * MPTCP Multipath PCB Extension section
529 */
530 mptcp_flush_sopts(mpte);
531 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
532
533 lck_mtx_destroy(&mpte->mpte_thread_lock,
534 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
535
536 /*
537 * MPTCP Protocol Control Block section
538 */
539 lck_mtx_destroy(&mp_tp->mpt_lock,
540 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
541
542 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
543 struct mptcb *, mp_tp);
544}
545
546/*
547 * Allocate an MPTCP socket option structure.
548 */
549struct mptopt *
550mptcp_sopt_alloc(int how)
551{
552 struct mptopt *mpo;
553
554 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
555 zalloc_noblock(mptopt_zone);
556 if (mpo != NULL) {
557 bzero(mpo, mptopt_zone_size);
558 }
559
560 return (mpo);
561}
562
563/*
564 * Free an MPTCP socket option structure.
565 */
566void
567mptcp_sopt_free(struct mptopt *mpo)
568{
569 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
570
571 zfree(mptopt_zone, mpo);
572}
573
574/*
575 * Add a socket option to the MPTCP socket option list.
576 */
577void
578mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
579{
580 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
581 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
582 mpo->mpo_flags |= MPOF_ATTACHED;
583 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
584}
585
586/*
587 * Remove a socket option from the MPTCP socket option list.
588 */
589void
590mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
591{
592 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
593 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
594 mpo->mpo_flags &= ~MPOF_ATTACHED;
595 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
596}
597
598/*
599 * Search for an existing <sopt_level,sopt_name> socket option.
600 */
601struct mptopt *
602mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
603{
604 struct mptopt *mpo;
605
606 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
607
608 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
609 if (mpo->mpo_level == sopt->sopt_level &&
610 mpo->mpo_name == sopt->sopt_name)
611 break;
612 }
613 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
614
615 return (mpo);
616}
617
618/*
619 * Flushes all recorded socket options from an MP socket.
620 */
621void
622mptcp_flush_sopts(struct mptses *mpte)
623{
624 struct mptopt *mpo, *tmpo;
625
626 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
627
628 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
629 mptcp_sopt_remove(mpte, mpo);
630 mptcp_sopt_free(mpo);
631 }
632 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
633}
634
635/*
636 * Allocate a MPTCP subflow structure.
637 */
638struct mptsub *
639mptcp_subflow_alloc(int how)
640{
641 struct mptsub *mpts;
642
643 mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
644 zalloc_noblock(mptsub_zone);
645 if (mpts != NULL) {
646 bzero(mpts, mptsub_zone_size);
647 lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
648 mtcbinfo.mppi_lock_attr);
649 }
650
651 return (mpts);
652}
653
654/*
655 * Deallocate a subflow structure, called when all of the references held
656 * on it have been released. This implies that the subflow has been deleted.
657 */
658void
659mptcp_subflow_free(struct mptsub *mpts)
660{
661 MPTS_LOCK_ASSERT_HELD(mpts);
662
663 VERIFY(mpts->mpts_refcnt == 0);
664 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
665 VERIFY(mpts->mpts_mpte == NULL);
666 VERIFY(mpts->mpts_socket == NULL);
667
668 if (mpts->mpts_src_sl != NULL) {
669 sockaddrlist_free(mpts->mpts_src_sl);
670 mpts->mpts_src_sl = NULL;
671 }
672 if (mpts->mpts_dst_sl != NULL) {
673 sockaddrlist_free(mpts->mpts_dst_sl);
674 mpts->mpts_dst_sl = NULL;
675 }
676 MPTS_UNLOCK(mpts);
677 lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
678
679 zfree(mptsub_zone, mpts);
680}
681
682/*
683 * Create an MPTCP subflow socket.
684 */
685static int
686mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
687 struct proc *p, struct socket **so)
688{
689 struct mptopt smpo, *mpo, *tmpo;
690 struct socket *mp_so;
691 int error;
692
693 *so = NULL;
694 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
695 mp_so = mpte->mpte_mppcb->mpp_socket;
696
697 /*
698 * Create the subflow socket (multipath subflow, non-blocking.)
699 *
700 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
701 * socket; it will be cleared when the socket is peeled off or closed.
702 * It also indicates to the underlying TCP to handle MPTCP options.
703 * A multipath subflow socket implies SS_NOFDREF state.
704 */
705 if ((error = socreate_internal(dom, so, SOCK_STREAM,
706 IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
3e170ce0
A
707 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate mp_so 0x%llx"
708 " unable to create subflow socket error %d\n",
709 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
710 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
711 return (error);
712 }
713
714 socket_lock(*so, 0);
715 VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
716 VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
717 (SS_NBIO|SS_NOFDREF));
718
719 /* prevent the socket buffers from being compressed */
720 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
721 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
722
723 bzero(&smpo, sizeof (smpo));
724 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
725 smpo.mpo_level = SOL_SOCKET;
726 smpo.mpo_intval = 1;
727
728 /* disable SIGPIPE */
729 smpo.mpo_name = SO_NOSIGPIPE;
730 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
731 goto out;
732
733 /* find out if the subflow's source address goes away */
734 smpo.mpo_name = SO_NOADDRERR;
735 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
736 goto out;
737
738 /* enable keepalive */
739 smpo.mpo_name = SO_KEEPALIVE;
740 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
741 goto out;
742
743 /*
744 * Limit the receive socket buffer size to 64k.
745 *
746 * We need to take into consideration the window scale option
747 * which could be negotiated in one subflow but disabled in
748 * another subflow.
749 * XXX This can be improved in the future.
750 */
751 smpo.mpo_name = SO_RCVBUF;
752 smpo.mpo_intval = MPTCP_RWIN_MAX;
753 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
754 goto out;
755
756 /* N.B.: set by sosetopt */
757 VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
758 /* Prevent automatic socket buffer sizing. */
759 (*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
760
761 smpo.mpo_level = IPPROTO_TCP;
762 smpo.mpo_intval = mptcp_subflow_keeptime;
763 smpo.mpo_name = TCP_KEEPALIVE;
764 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
765 goto out;
766
767 /* replay setsockopt(2) on the subflow sockets for eligible options */
768 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
769 int interim;
770
771 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
772 continue;
773
774 /*
775 * Skip those that are handled internally; these options
776 * should not have been recorded and marked with the
777 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
778 */
779 if (mpo->mpo_level == SOL_SOCKET &&
780 (mpo->mpo_name == SO_NOSIGPIPE ||
781 mpo->mpo_name == SO_NOADDRERR ||
782 mpo->mpo_name == SO_KEEPALIVE))
783 continue;
784
785 interim = (mpo->mpo_flags & MPOF_INTERIM);
786 if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
787 char buf[32];
3e170ce0
A
788 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate"
789 " mp_so 0x%llx"
790 " sopt %s val %d interim record removed\n",
39236c6e
A
791 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
792 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
793 buf, sizeof (buf)), mpo->mpo_intval),
794 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
795 mptcp_sopt_remove(mpte, mpo);
796 mptcp_sopt_free(mpo);
797 continue;
798 }
799 }
800
801 /*
802 * We need to receive everything that the subflow socket has,
803 * so use a customized socket receive function. We will undo
804 * this when the socket is peeled off or closed.
805 */
806 mpts->mpts_oprotosw = (*so)->so_proto;
807 switch (dom) {
808 case PF_INET:
809 (*so)->so_proto = &mptcp_subflow_protosw;
810 break;
811#if INET6
812 case PF_INET6:
813 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
814 break;
815#endif /* INET6 */
816 default:
817 VERIFY(0);
818 /* NOTREACHED */
819 }
820
821out:
822 socket_unlock(*so, 0);
823
824 DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
825 struct mptsub *, mpts, int, dom, int, error);
826
827 return (error);
828}
829
830/*
831 * Close an MPTCP subflow socket.
832 *
833 * Note that this may be called on an embryonic subflow, and the only
834 * thing that is guaranteed valid is the protocol-user request.
835 */
836static int
837mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
838{
839 MPTS_LOCK_ASSERT_HELD(mpts);
840
841 socket_lock(so, 0);
842 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
843 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
844
845 /* restore protocol-user requests */
846 VERIFY(mpts->mpts_oprotosw != NULL);
847 so->so_proto = mpts->mpts_oprotosw;
848 socket_unlock(so, 0);
849
850 mpts->mpts_socket = NULL; /* may already be NULL */
851
852 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
853 struct socket *, so,
854 struct sockbuf *, &so->so_rcv,
855 struct sockbuf *, &so->so_snd,
856 struct mptses *, mpts->mpts_mpte);
857
858 return (soclose(so));
859}
860
861/*
862 * Connect an MPTCP subflow socket.
863 *
864 * This may be called inline as part of adding a subflow, or asynchronously
865 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
866 * pending connect case, the subflow socket may have been bound to an interface
867 * and/or a source IP address which may no longer be around by the time this
868 * routine is called; in that case the connect attempt will most likely fail.
869 */
870static int
871mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
872{
873 struct socket *so;
874 int af, error;
875
876 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
877 MPTS_LOCK_ASSERT_HELD(mpts);
878
879 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
880 MPTSF_CONNECTING);
881 VERIFY(mpts->mpts_socket != NULL);
882 so = mpts->mpts_socket;
883 af = mpts->mpts_family;
884
885 if (af == AF_INET || af == AF_INET6) {
886 struct sockaddr_entry *dst_se;
887 char dbuf[MAX_IPv6_STR_LEN];
888
889 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
890 VERIFY(dst_se != NULL);
891
3e170ce0
A
892 mptcplog((LOG_DEBUG, "MPTCP Socket: connectx mp_so 0x%llx "
893 "dst %s[%d] cid %d [pended %s]\n",
39236c6e
A
894 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
895 inet_ntop(af, ((af == AF_INET) ?
896 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
897 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
898 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
899 ntohs(SIN(dst_se->se_addr)->sin_port) :
900 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
901 mpts->mpts_connid,
902 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
3e170ce0
A
903 "YES" : "NO")),
904 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
905 }
906
907 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
908
909 socket_lock(so, 0);
fe8ab488
A
910 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
911
39236c6e
A
912 /* connect the subflow socket */
913 error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
914 mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
3e170ce0
A
915 mpte->mpte_associd, NULL, CONNREQF_MPTCP,
916 &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr), NULL, NULL);
39236c6e
A
917 socket_unlock(so, 0);
918
fe8ab488
A
919 /* Allocate a unique address id per subflow */
920 mpte->mpte_addrid_last++;
921 if (mpte->mpte_addrid_last == 0)
922 mpte->mpte_addrid_last++;
923
39236c6e
A
924 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
925 struct mptsub *, mpts, int, error);
926
927 return (error);
928}
929
930/*
931 * MPTCP subflow socket receive routine, derived from soreceive().
932 */
933static int
934mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
935 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
936{
937#pragma unused(uio)
938 int flags, error = 0;
939 struct proc *p = current_proc();
940 struct mbuf *m, **mp = mp0;
941 struct mbuf *nextrecord;
942
943 socket_lock(so, 1);
944 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
945
946#ifdef MORE_LOCKING_DEBUG
947 if (so->so_usecount == 1) {
948 panic("%s: so=%x no other reference on socket\n", __func__, so);
949 /* NOTREACHED */
950 }
951#endif
952 /*
953 * We return all that is there in the subflow's socket receive buffer
954 * to the MPTCP layer, so we require that the caller passes in the
955 * expected parameters.
956 */
957 if (mp == NULL || controlp != NULL) {
958 socket_unlock(so, 1);
959 return (EINVAL);
960 }
961 *mp = NULL;
962 if (psa != NULL)
963 *psa = NULL;
964 if (flagsp != NULL)
965 flags = *flagsp &~ MSG_EOR;
966 else
967 flags = 0;
968
969 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
970 socket_unlock(so, 1);
971 return (EOPNOTSUPP);
972 }
973 flags |= (MSG_DONTWAIT|MSG_NBIO);
974
975 /*
976 * If a recv attempt is made on a previously-accepted socket
977 * that has been marked as inactive (disconnected), reject
978 * the request.
979 */
980 if (so->so_flags & SOF_DEFUNCT) {
981 struct sockbuf *sb = &so->so_rcv;
982
983 error = ENOTCONN;
984 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
985 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
986 SOCK_DOM(so), SOCK_TYPE(so), error));
987 /*
988 * This socket should have been disconnected and flushed
989 * prior to being returned from sodefunct(); there should
990 * be no data on its receive list, so panic otherwise.
991 */
992 if (so->so_state & SS_DEFUNCT)
993 sb_empty_assert(sb, __func__);
994 socket_unlock(so, 1);
995 return (error);
996 }
997
998 /*
999 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1000 * and if so just return to the caller. This could happen when
1001 * soreceive() is called by a socket upcall function during the
1002 * time the socket is freed. The socket buffer would have been
1003 * locked across the upcall, therefore we cannot put this thread
1004 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1005 * we may livelock), because the lock on the socket buffer will
1006 * only be released when the upcall routine returns to its caller.
1007 * Because the socket has been officially closed, there can be
1008 * no further read on it.
1009 *
1010 * A multipath subflow socket would have its SS_NOFDREF set by
1011 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1012 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1013 */
1014 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1015 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
1016 socket_unlock(so, 1);
1017 return (0);
1018 }
1019
1020 /*
1021 * For consistency with soreceive() semantics, we need to obey
1022 * SB_LOCK in case some other code path has locked the buffer.
1023 */
1024 error = sblock(&so->so_rcv, 0);
1025 if (error != 0) {
1026 socket_unlock(so, 1);
1027 return (error);
1028 }
1029
1030 m = so->so_rcv.sb_mb;
1031 if (m == NULL) {
1032 /*
1033 * Panic if we notice inconsistencies in the socket's
1034 * receive list; both sb_mb and sb_cc should correctly
1035 * reflect the contents of the list, otherwise we may
1036 * end up with false positives during select() or poll()
1037 * which could put the application in a bad state.
1038 */
1039 SB_MB_CHECK(&so->so_rcv);
1040
1041 if (so->so_error != 0) {
1042 error = so->so_error;
1043 so->so_error = 0;
1044 goto release;
1045 }
1046
1047 if (so->so_state & SS_CANTRCVMORE) {
1048 goto release;
1049 }
1050
1051 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1052 error = ENOTCONN;
1053 goto release;
1054 }
1055
1056 /*
1057 * MSG_DONTWAIT is implicitly defined and this routine will
1058 * never block, so return EWOULDBLOCK when there is nothing.
1059 */
1060 error = EWOULDBLOCK;
1061 goto release;
1062 }
1063
1064 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1065 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1066 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1067
1068 while (m != NULL) {
1069 nextrecord = m->m_nextpkt;
1070 sbfree(&so->so_rcv, m);
1071
1072 if (mp != NULL) {
1073 *mp = m;
1074 mp = &m->m_next;
1075 so->so_rcv.sb_mb = m = m->m_next;
1076 *mp = NULL;
1077 }
1078
1079 if (m != NULL) {
1080 m->m_nextpkt = nextrecord;
1081 if (nextrecord == NULL)
1082 so->so_rcv.sb_lastrecord = m;
1083 } else {
1084 m = so->so_rcv.sb_mb = nextrecord;
1085 SB_EMPTY_FIXUP(&so->so_rcv);
1086 }
1087 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1088 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1089 }
1090
1091 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1092 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1093 /* notify protocol that we drained all the data */
1094 if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
1095 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1096
1097 if (flagsp != NULL)
1098 *flagsp |= flags;
1099
1100release:
1101 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
1102 return (error);
1103
1104}
1105
1106
1107/*
1108 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1109 * the work done earlier when the subflow socket was created.
1110 */
1111void
1112mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1113 struct socket *so)
1114{
1115 struct mptopt smpo;
1116 struct socket *mp_so;
1117 int p, c;
1118
1119 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1120 mp_so = mpte->mpte_mppcb->mpp_socket;
1121 MPTS_LOCK_ASSERT_HELD(mpts);
1122
1123 socket_lock(so, 0);
1124 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1125 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1126
1127 /* inherit MPTCP socket states */
1128 if (!(mp_so->so_state & SS_NBIO))
1129 so->so_state &= ~SS_NBIO;
1130
1131 /*
1132 * At this point, the socket is not yet closed, as there is at least
1133 * one outstanding usecount previously held by mpts_socket from
1134 * socreate(). Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1135 */
1136 so->so_flags &= ~SOF_MP_SUBFLOW;
1137 so->so_state &= ~SS_NOFDREF;
fe8ab488 1138 so->so_flags &= ~SOF_MPTCP_TRUE;
39236c6e
A
1139
1140 /* allow socket buffers to be compressed */
1141 so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1142 so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1143
1144 /*
1145 * Allow socket buffer auto sizing.
1146 *
1147 * This will increase the current 64k buffer size to whatever is best.
1148 */
fe8ab488
A
1149 if (!(so->so_rcv.sb_flags & SB_USRSIZE))
1150 so->so_rcv.sb_flags |= SB_AUTOSIZE;
1151 if (!(so->so_snd.sb_flags & SB_USRSIZE))
1152 so->so_snd.sb_flags |= SB_AUTOSIZE;
39236c6e
A
1153
1154 /* restore protocol-user requests */
1155 VERIFY(mpts->mpts_oprotosw != NULL);
1156 so->so_proto = mpts->mpts_oprotosw;
1157
1158 bzero(&smpo, sizeof (smpo));
1159 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1160 smpo.mpo_level = SOL_SOCKET;
1161
1162 /* inherit SOF_NOSIGPIPE from parent MP socket */
1163 p = (mp_so->so_flags & SOF_NOSIGPIPE);
1164 c = (so->so_flags & SOF_NOSIGPIPE);
1165 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1166 smpo.mpo_name = SO_NOSIGPIPE;
1167 if ((p - c) != 0)
1168 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1169
1170 /* inherit SOF_NOADDRAVAIL from parent MP socket */
1171 p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1172 c = (so->so_flags & SOF_NOADDRAVAIL);
1173 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1174 smpo.mpo_name = SO_NOADDRERR;
1175 if ((p - c) != 0)
1176 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1177
1178 /* inherit SO_KEEPALIVE from parent MP socket */
1179 p = (mp_so->so_options & SO_KEEPALIVE);
1180 c = (so->so_options & SO_KEEPALIVE);
1181 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1182 smpo.mpo_name = SO_KEEPALIVE;
1183 if ((p - c) != 0)
1184 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1185
1186 /* unset TCP level default keepalive option */
1187 p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1188 c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1189 smpo.mpo_level = IPPROTO_TCP;
1190 smpo.mpo_intval = 0;
1191 smpo.mpo_name = TCP_KEEPALIVE;
1192 if ((p - c) != 0)
1193 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1194 socket_unlock(so, 0);
1195
1196 DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1197 struct mptsub *, mpts, struct socket *, so,
1198 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1199}
1200
1201/*
1202 * Establish an initial MPTCP connection (if first subflow and not yet
1203 * connected), or add a subflow to an existing MPTCP connection.
1204 */
1205int
1206mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1207 struct proc *p, uint32_t ifscope)
1208{
1209 struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
1210 struct socket *mp_so, *so = NULL;
1211 struct mptsub_connreq mpcr;
1212 struct mptcb *mp_tp;
1213 int af, error = 0;
1214
1215 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1216 mp_so = mpte->mpte_mppcb->mpp_socket;
1217 mp_tp = mpte->mpte_mptcb;
1218
fe8ab488
A
1219 MPT_LOCK(mp_tp);
1220 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1221 /* If the remote end sends Data FIN, refuse subflow adds */
1222 error = ENOTCONN;
1223 MPT_UNLOCK(mp_tp);
1224 return (error);
1225 }
1226 MPT_UNLOCK(mp_tp);
1227
39236c6e
A
1228 MPTS_LOCK(mpts);
1229 VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1230 VERIFY(mpts->mpts_mpte == NULL);
1231 VERIFY(mpts->mpts_socket == NULL);
1232 VERIFY(mpts->mpts_dst_sl != NULL);
3e170ce0 1233 VERIFY(mpts->mpts_connid == SAE_CONNID_ANY);
39236c6e
A
1234
1235 /* select source (if specified) and destination addresses */
1236 if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
1237 &mpts->mpts_dst_sl, &dst_se)) != 0)
1238 goto out;
1239
1240 VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
1241 VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
1242 af = mpts->mpts_family = dst_se->se_addr->sa_family;
1243 VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
1244 VERIFY(af == AF_INET || af == AF_INET6);
1245
1246 /*
1247 * If the source address is not specified, allocate a storage for
1248 * it, so that later on we can fill it in with the actual source
1249 * IP address chosen by the underlying layer for the subflow after
1250 * it is connected.
1251 */
1252 if (mpts->mpts_src_sl == NULL) {
1253 mpts->mpts_src_sl =
1254 sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
1255 if (mpts->mpts_src_sl == NULL) {
1256 error = ENOBUFS;
1257 goto out;
1258 }
1259 se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
1260 VERIFY(se != NULL && se->se_addr != NULL &&
1261 se->se_addr->sa_len == dst_se->se_addr->sa_len);
1262 bzero(se->se_addr, se->se_addr->sa_len);
1263 se->se_addr->sa_len = dst_se->se_addr->sa_len;
1264 se->se_addr->sa_family = dst_se->se_addr->sa_family;
1265 }
1266
1267 /* create the subflow socket */
1268 if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1269 goto out;
1270
fe8ab488
A
1271 /* If fastjoin is requested, set state in mpts */
1272 if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
1273 (mp_tp->mpt_state == MPTCPS_ESTABLISHED) &&
1274 (mpte->mpte_nummpcapflows == 0)) {
1275 mpts->mpts_flags |= MPTSF_FASTJ_REQD;
1276 mpts->mpts_rel_seq = 1;
1277 MPT_LOCK(mp_tp);
1278 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1279 MPT_UNLOCK(mp_tp);
1280 }
1281
39236c6e 1282 /*
3e170ce0
A
1283 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
1284 * -1 (SAE_CONNID_ALL).
39236c6e
A
1285 */
1286 mpte->mpte_connid_last++;
3e170ce0
A
1287 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
1288 mpte->mpte_connid_last == SAE_CONNID_ANY)
39236c6e
A
1289 mpte->mpte_connid_last++;
1290
1291 mpts->mpts_connid = mpte->mpte_connid_last;
3e170ce0
A
1292 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1293 mpts->mpts_connid != SAE_CONNID_ALL);
fe8ab488
A
1294
1295 /* Allocate a unique address id per subflow */
1296 mpte->mpte_addrid_last++;
1297 if (mpte->mpte_addrid_last == 0)
1298 mpte->mpte_addrid_last++;
39236c6e
A
1299
1300 /* bind subflow socket to the specified interface */
1301 if (ifscope != IFSCOPE_NONE) {
1302 socket_lock(so, 0);
1303 error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1304 if (error != 0) {
1305 socket_unlock(so, 0);
1306 (void) mptcp_subflow_soclose(mpts, so);
1307 goto out;
1308 }
1309 VERIFY(mpts->mpts_outif != NULL);
1310 mpts->mpts_flags |= MPTSF_BOUND_IF;
1311
3e170ce0
A
1312 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add mp_so 0x%llx "
1313 "bindif %s[%d] cid d\n",
39236c6e
A
1314 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1315 mpts->mpts_outif->if_xname,
3e170ce0
A
1316 ifscope, mpts->mpts_connid),
1317 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
1318 socket_unlock(so, 0);
1319 }
1320
1321 /* if source address and/or port is specified, bind to it */
1322 if (src_se != NULL) {
1323 struct sockaddr *sa = src_se->se_addr;
1324 uint32_t mpts_flags = 0;
1325 in_port_t lport;
1326
1327 switch (af) {
1328 case AF_INET:
1329 if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1330 mpts_flags |= MPTSF_BOUND_IP;
1331 if ((lport = SIN(sa)->sin_port) != 0)
1332 mpts_flags |= MPTSF_BOUND_PORT;
1333 break;
1334#if INET6
1335 case AF_INET6:
1336 VERIFY(af == AF_INET6);
1337 if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1338 mpts_flags |= MPTSF_BOUND_IP;
1339 if ((lport = SIN6(sa)->sin6_port) != 0)
1340 mpts_flags |= MPTSF_BOUND_PORT;
1341 break;
1342#endif /* INET6 */
1343 }
1344
1345 error = sobindlock(so, sa, 1); /* will lock/unlock socket */
1346 if (error != 0) {
1347 (void) mptcp_subflow_soclose(mpts, so);
1348 goto out;
1349 }
1350 mpts->mpts_flags |= mpts_flags;
1351
1352 if (af == AF_INET || af == AF_INET6) {
1353 char sbuf[MAX_IPv6_STR_LEN];
1354
3e170ce0
A
1355 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add "
1356 "mp_so 0x%llx bindip %s[%d] cid %d\n",
39236c6e
A
1357 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1358 inet_ntop(af, ((af == AF_INET) ?
1359 (void *)&SIN(sa)->sin_addr.s_addr :
1360 (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
3e170ce0
A
1361 ntohs(lport), mpts->mpts_connid),
1362 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1363 }
1364 }
1365
1366 /*
1367 * Insert the subflow into the list, and associate the MPTCP PCB
1368 * as well as the the subflow socket. From this point on, removing
1369 * the subflow needs to be done via mptcp_subflow_del().
1370 */
1371 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1372 mpte->mpte_numflows++;
1373
1374 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1375 mpts->mpts_mpte = mpte;
1376 mpts->mpts_socket = so;
1377 MPTS_ADDREF_LOCKED(mpts); /* for being in MPTCP subflow list */
1378 MPTS_ADDREF_LOCKED(mpts); /* for subflow socket */
1379 mp_so->so_usecount++; /* for subflow socket */
1380
1381 /* register for subflow socket read/write events */
1382 (void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1383 mptcp_subflow_wupcall, mpts);
1384
1385 /*
1386 * Register for subflow socket control events; ignore
1387 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1388 * will generate it here.
1389 */
1390 (void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1391 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1392 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1393 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1394 SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1395 SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1396 SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
fe8ab488
A
1397 SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
1398 SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);
39236c6e
A
1399
1400 /* sanity check */
1401 VERIFY(!(mpts->mpts_flags &
1402 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1403
1404 bzero(&mpcr, sizeof (mpcr));
1405 mpcr.mpcr_proc = p;
1406 mpcr.mpcr_ifscope = ifscope;
1407 /*
1408 * Indicate to the TCP subflow whether or not it should establish
1409 * the initial MPTCP connection, or join an existing one. Fill
1410 * in the connection request structure with additional info needed
1411 * by the underlying TCP (to be used in the TCP options, etc.)
1412 */
1413 MPT_LOCK(mp_tp);
1414 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1415 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1416 mp_tp->mpt_localkey = mptcp_reserve_key();
1417 mptcp_conn_properties(mp_tp);
1418 }
1419 MPT_UNLOCK(mp_tp);
1420 soisconnecting(mp_so);
1421 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1422 } else {
1423 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1424 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
fe8ab488
A
1425
1426 /* avoid starting up cellular subflow unless required */
1427 if ((mptcp_delayed_subf_start) &&
1428 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
1429 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1430 }
39236c6e
A
1431 MPT_UNLOCK(mp_tp);
1432 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1433 }
1434
1435 mpts->mpts_mpcr = mpcr;
1436 mpts->mpts_flags |= MPTSF_CONNECTING;
1437
1438 if (af == AF_INET || af == AF_INET6) {
1439 char dbuf[MAX_IPv6_STR_LEN];
1440
3e170ce0
A
1441 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
1442 "mp_so 0x%llx dst %s[%d] cid %d "
39236c6e
A
1443 "[pending %s]\n", __func__,
1444 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1445 inet_ntop(af, ((af == AF_INET) ?
1446 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
1447 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
1448 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1449 ntohs(SIN(dst_se->se_addr)->sin_port) :
1450 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
1451 mpts->mpts_connid,
1452 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
3e170ce0
A
1453 "YES" : "NO")),
1454 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1455 }
1456
1457 /* connect right away if first attempt, or if join can be done now */
1458 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1459 error = mptcp_subflow_soconnectx(mpte, mpts);
1460
1461out:
1462 MPTS_UNLOCK(mpts);
1463 if (error == 0) {
1464 soevent(mp_so, SO_FILT_HINT_LOCKED |
1465 SO_FILT_HINT_CONNINFO_UPDATED);
1466 }
1467 return (error);
1468}
1469
39236c6e
A
1470/*
1471 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
1472 * will no longer be accessible after a subflow is deleted, thus this
1473 * should occur only after the subflow socket has been disconnected.
1474 * If peeloff(2) is called, leave the socket open.
1475 */
1476void
1477mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1478{
1479 struct socket *mp_so, *so;
1480
1481 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1482 mp_so = mpte->mpte_mppcb->mpp_socket;
1483
1484 MPTS_LOCK(mpts);
1485 so = mpts->mpts_socket;
1486 VERIFY(so != NULL);
fe8ab488
A
1487
1488 if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
1489 (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
1490 MPTS_UNLOCK(mpts);
3e170ce0
A
1491 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del returning"
1492 " mp_so 0x%llx flags %x\n",
1493 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_flags),
1494 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
1495 return;
1496 }
39236c6e 1497
3e170ce0
A
1498 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del mp_so 0x%llx "
1499 "[u=%d,r=%d] cid %d [close %s] %d %x error %d\n",
39236c6e
A
1500 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1501 mp_so->so_usecount,
1502 mp_so->so_retaincnt, mpts->mpts_connid,
1503 (close ? "YES" : "NO"), mpts->mpts_soerror,
3e170ce0
A
1504 mpts->mpts_flags,
1505 mp_so->so_error),
1506 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1507
1508 VERIFY(mpts->mpts_mpte == mpte);
3e170ce0
A
1509 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1510 mpts->mpts_connid != SAE_CONNID_ALL);
39236c6e
A
1511
1512 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1513 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1514 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1515 VERIFY(mpte->mpte_numflows != 0);
1516 mpte->mpte_numflows--;
fe8ab488
A
1517 if (mpte->mpte_active_sub == mpts)
1518 mpte->mpte_active_sub = NULL;
39236c6e
A
1519
1520 /*
1521 * Drop references held by this subflow socket; there
1522 * will be no further upcalls made from this point.
1523 */
1524 (void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1525 (void) sock_catchevents(so, NULL, NULL, 0);
fe8ab488 1526
39236c6e 1527 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
fe8ab488 1528
39236c6e
A
1529 if (close)
1530 (void) mptcp_subflow_soclose(mpts, so);
1531
1532 VERIFY(mp_so->so_usecount != 0);
1533 mp_so->so_usecount--; /* for subflow socket */
1534 mpts->mpts_mpte = NULL;
1535 mpts->mpts_socket = NULL;
1536 MPTS_UNLOCK(mpts);
1537
1538 MPTS_REMREF(mpts); /* for MPTCP subflow list */
1539 MPTS_REMREF(mpts); /* for subflow socket */
1540
1541 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1542}
1543
1544/*
1545 * Disconnect a subflow socket.
1546 */
1547void
1548mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1549 boolean_t deleteok)
1550{
1551 struct socket *so;
1552 struct mptcb *mp_tp;
1553 int send_dfin = 0;
1554
1555 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1556 MPTS_LOCK_ASSERT_HELD(mpts);
1557
1558 VERIFY(mpts->mpts_mpte == mpte);
1559 VERIFY(mpts->mpts_socket != NULL);
3e170ce0
A
1560 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1561 mpts->mpts_connid != SAE_CONNID_ALL);
39236c6e
A
1562
1563 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1564 return;
1565
1566 mpts->mpts_flags |= MPTSF_DISCONNECTING;
1567
1568 /*
1569 * If this is coming from disconnectx(2) or issued as part of
1570 * closing the MPTCP socket, the subflow shouldn't stick around.
1571 * Otherwise let it linger around in case the upper layers need
1572 * to retrieve its conninfo.
1573 */
1574 if (deleteok)
1575 mpts->mpts_flags |= MPTSF_DELETEOK;
1576
1577 so = mpts->mpts_socket;
1578 mp_tp = mpte->mpte_mptcb;
1579 MPT_LOCK(mp_tp);
1580 if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1581 send_dfin = 1;
1582 MPT_UNLOCK(mp_tp);
1583
1584 socket_lock(so, 0);
1585 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1586 (so->so_state & SS_ISCONNECTED)) {
3e170ce0
A
1587 mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d "
1588 "[linger %s]\n", __func__, mpts->mpts_connid, send_dfin,
1589 (deleteok ? "NO" : "YES")),
1590 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1591
1592 if (send_dfin)
1593 mptcp_send_dfin(so);
1594 (void) soshutdownlock(so, SHUT_RD);
1595 (void) soshutdownlock(so, SHUT_WR);
1596 (void) sodisconnectlocked(so);
1597 }
1598 socket_unlock(so, 0);
1599 /*
1600 * Generate a disconnect event for this subflow socket, in case
1601 * the lower layer doesn't do it; this is needed because the
1602 * subflow socket deletion relies on it. This will also end up
1603 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1604 * we cannot do that here because subflow lock is currently held.
1605 */
1606 mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1607}
1608
1609/*
1610 * Subflow socket read upcall.
1611 *
1612 * Called when the associated subflow socket posted a read event. The subflow
1613 * socket lock has been released prior to invoking the callback. Note that the
1614 * upcall may occur synchronously as a result of MPTCP performing an action on
1615 * it, or asynchronously as a result of an event happening at the subflow layer.
1616 * Therefore, to maintain lock ordering, the only lock that can be acquired
1617 * here is the thread lock, for signalling purposes.
1618 */
1619static void
1620mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1621{
1622#pragma unused(so, waitf)
1623 struct mptsub *mpts = arg;
1624 struct mptses *mpte = mpts->mpts_mpte;
1625
fe8ab488
A
1626 /*
1627 * mpte should never be NULL, except in a race with
1628 * mptcp_subflow_del
1629 */
1630 if (mpte == NULL)
1631 return;
39236c6e
A
1632
1633 lck_mtx_lock(&mpte->mpte_thread_lock);
1634 mptcp_thread_signal_locked(mpte);
1635 lck_mtx_unlock(&mpte->mpte_thread_lock);
1636}
1637
1638/*
1639 * Subflow socket input.
1640 *
1641 * Called in the context of the MPTCP thread, for reading data from the
1642 * underlying subflow socket and delivering it to MPTCP.
1643 */
1644static void
1645mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1646{
1647 struct mbuf *m = NULL;
1648 struct socket *so;
1649 int error;
1650 struct mptsub *mpts_alt = NULL;
1651
1652 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1653 MPTS_LOCK_ASSERT_HELD(mpts);
1654
1655 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
1656 struct mptsub *, mpts);
1657
1658 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1659 return;
1660
1661 so = mpts->mpts_socket;
1662
1663 error = sock_receive_internal(so, NULL, &m, 0, NULL);
1664 if (error != 0 && error != EWOULDBLOCK) {
3e170ce0
A
1665 mptcplog((LOG_ERR, "MPTCP Receiver: %s cid %d error %d\n",
1666 __func__, mpts->mpts_connid, error),
1667 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
39236c6e 1668 MPTS_UNLOCK(mpts);
3e170ce0 1669 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e 1670 if (mpts_alt == NULL) {
fe8ab488
A
1671 if (mptcp_delayed_subf_start) {
1672 mpts_alt = mptcp_get_pending_subflow(mpte,
1673 mpts);
1674 if (mpts_alt) {
3e170ce0
A
1675 mptcplog((LOG_DEBUG,"MPTCP Receiver:"
1676 " %s: pending %d\n",
1677 __func__, mpts_alt->mpts_connid),
1678 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1679 } else {
3e170ce0
A
1680 mptcplog((LOG_ERR, "MPTCP Receiver:"
1681 " %s: no pending flow for cid %d",
1682 __func__, mpts->mpts_connid),
1683 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488
A
1684 }
1685 } else {
3e170ce0
A
1686 mptcplog((LOG_ERR, "MPTCP Receiver: %s: no alt"
1687 " path for cid %d\n", __func__,
1688 mpts->mpts_connid),
1689 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1690 }
39236c6e
A
1691 }
1692 MPTS_LOCK(mpts);
1693 } else if (error == 0) {
3e170ce0
A
1694 mptcplog((LOG_DEBUG, "MPTCP Receiver: %s: cid %d \n",
1695 __func__, mpts->mpts_connid),
1696 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
1697 }
1698
1699 /* In fallback, make sure to accept data on all but one subflow */
1700 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1701 (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1702 m_freem(m);
1703 return;
1704 }
1705
1706 if (m != NULL) {
3e170ce0
A
1707
1708 /* Did we receive data on the backup subflow? */
1709 if (!(mpts->mpts_flags & MPTSF_ACTIVE))
1710 mpts->mpts_peerswitch++;
1711 else
1712 mpts->mpts_peerswitch = 0;
1713
39236c6e
A
1714 /*
1715 * Release subflow lock since this may trigger MPTCP to send,
1716 * possibly on a different subflow. An extra reference has
1717 * been held on the subflow by the MPTCP thread before coming
1718 * here, so we can be sure that it won't go away, in the event
1719 * the MP socket lock gets released.
1720 */
1721 MPTS_UNLOCK(mpts);
1722 mptcp_input(mpte, m);
1723 MPTS_LOCK(mpts);
1724 }
1725}
1726
1727/*
1728 * Subflow socket write upcall.
1729 *
1730 * Called when the associated subflow socket posted a read event. The subflow
1731 * socket lock has been released prior to invoking the callback. Note that the
1732 * upcall may occur synchronously as a result of MPTCP performing an action on
1733 * it, or asynchronously as a result of an event happening at the subflow layer.
1734 * Therefore, to maintain lock ordering, the only lock that can be acquired
1735 * here is the thread lock, for signalling purposes.
1736 */
1737static void
1738mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1739{
1740#pragma unused(so, waitf)
1741 struct mptsub *mpts = arg;
1742 struct mptses *mpte = mpts->mpts_mpte;
1743
fe8ab488
A
1744 /*
1745 * mpte should never be NULL except in a race with
1746 * mptcp_subflow_del which doesn't hold socket lock across critical
1747 * section. This upcall is made after releasing the socket lock.
1748 * Interleaving of socket operations becomes possible therefore.
1749 */
1750 if (mpte == NULL)
1751 return;
39236c6e
A
1752
1753 lck_mtx_lock(&mpte->mpte_thread_lock);
1754 mptcp_thread_signal_locked(mpte);
1755 lck_mtx_unlock(&mpte->mpte_thread_lock);
1756}
1757
1758/*
1759 * Subflow socket output.
1760 *
1761 * Called for sending data from MPTCP to the underlying subflow socket.
1762 */
1763int
1764mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1765{
1766 struct socket *mp_so, *so;
1767 size_t sb_cc = 0, tot_sent = 0;
1768 struct mbuf *sb_mb;
1769 int error = 0;
1770 u_int64_t mpt_dsn = 0;
1771 struct mptcb *mp_tp = mpte->mpte_mptcb;
1772 struct mbuf *mpt_mbuf = NULL;
fe8ab488
A
1773 u_int64_t off = 0;
1774 struct mbuf *head, *tail;
39236c6e
A
1775
1776 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1777 MPTS_LOCK_ASSERT_HELD(mpts);
1778 mp_so = mpte->mpte_mppcb->mpp_socket;
1779 so = mpts->mpts_socket;
1780
1781 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
1782 struct mptsub *, mpts);
1783
1784 /* subflow socket is suspended? */
1785 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
3e170ce0
A
1786 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d is "
1787 "flow controlled\n", __func__,
1788 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1789 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1790 goto out;
1791 }
1792
1793 /* subflow socket is not MPTCP capable? */
1794 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
fe8ab488
A
1795 !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1796 !(mpts->mpts_flags & MPTSF_FASTJ_SEND)) {
3e170ce0 1797 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d not "
39236c6e 1798 "MPTCP capable\n", __func__,
3e170ce0
A
1799 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1800 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1801 goto out;
1802 }
1803
1804 /* Remove Addr Option is not sent reliably as per I-D */
1805 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1806 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1807 tp->t_rem_aid = mpte->mpte_lost_aid;
1808 if (mptcp_remaddr_enable)
1809 tp->t_mpflags |= TMPF_SND_REM_ADDR;
1810 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1811 }
1812
1813 /*
1814 * The mbuf chains containing the metadata (as well as pointing to
1815 * the user data sitting at the MPTCP output queue) would then be
1816 * sent down to the subflow socket.
1817 *
1818 * Some notes on data sequencing:
1819 *
1820 * a. Each mbuf must be a M_PKTHDR.
1821 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
1822 * in the mbuf pkthdr structure.
1823 * c. Each mbuf containing the MPTCP metadata must have its
1824 * pkt_flags marked with the PKTF_MPTCP flag.
1825 */
1826
1827 /* First, drop acknowledged data */
1828 sb_mb = mp_so->so_snd.sb_mb;
1829 if (sb_mb == NULL) {
1830 goto out;
1831 }
1832
1833 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1834
1835 mpt_mbuf = sb_mb;
1836 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
1837 mpt_mbuf = mpt_mbuf->m_next;
1838 }
1839 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1840 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1841 } else {
1842 goto out;
1843 }
1844
1845 MPT_LOCK(mp_tp);
1846 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
fe8ab488 1847 u_int64_t len = 0;
39236c6e 1848 len = mp_tp->mpt_snduna - mpt_dsn;
3e170ce0 1849 MPT_UNLOCK(mp_tp);
fe8ab488 1850 sbdrop(&mp_so->so_snd, (int)len);
3e170ce0 1851 MPT_LOCK(mp_tp);
39236c6e
A
1852 }
1853
1854 /*
1855 * In degraded mode, we don't receive data acks, so force free
1856 * mbufs less than snd_nxt
1857 */
fe8ab488
A
1858 if (mp_so->so_snd.sb_mb == NULL) {
1859 MPT_UNLOCK(mp_tp);
1860 goto out;
1861 }
1862
39236c6e
A
1863 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1864 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 1865 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
39236c6e 1866 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
fe8ab488 1867 u_int64_t len = 0;
39236c6e 1868 len = mp_tp->mpt_sndnxt - mpt_dsn;
fe8ab488 1869 sbdrop(&mp_so->so_snd, (int)len);
39236c6e
A
1870 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1871 }
1872
fe8ab488
A
1873 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1874 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
1875 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
1876 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
1877 if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
1878 mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
1879 }
1880
39236c6e
A
1881 /*
1882 * Adjust the subflow's notion of next byte to send based on
1883 * the last unacknowledged byte
1884 */
1885 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1886 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
fe8ab488
A
1887 /*
1888 * With FastJoin, a write before the fastjoin event will use
1889 * an uninitialized relative sequence number.
1890 */
1891 if (mpts->mpts_rel_seq == 0)
1892 mpts->mpts_rel_seq = 1;
39236c6e
A
1893 }
1894
1895 /*
1896 * Adjust the top level notion of next byte used for retransmissions
1897 * and sending FINs.
1898 */
1899 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1900 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1901 }
1902
1903
1904 /* Now determine the offset from which to start transmitting data */
1905 sb_mb = mp_so->so_snd.sb_mb;
1906 sb_cc = mp_so->so_snd.sb_cc;
1907 if (sb_mb == NULL) {
1908 MPT_UNLOCK(mp_tp);
1909 goto out;
1910 }
1911 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1912 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
fe8ab488 1913 sb_cc -= (size_t)off;
39236c6e
A
1914 } else {
1915 MPT_UNLOCK(mp_tp);
1916 goto out;
1917 }
1918 MPT_UNLOCK(mp_tp);
1919
1920 mpt_mbuf = sb_mb;
1921 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1922
1923 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
fe8ab488 1924 (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
39236c6e
A
1925 off -= mpt_mbuf->m_pkthdr.mp_rlen;
1926 mpt_mbuf = mpt_mbuf->m_next;
1927 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1928 }
3e170ce0
A
1929 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
1930 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid = %d "
1931 "snduna = %llu sndnxt = %llu probe %d\n",
1932 __func__, mpts->mpts_connid,
1933 mp_tp->mpt_snduna, mpts->mpts_sndnxt,
1934 mpts->mpts_probecnt),
1935 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1936
1937 VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
1938
fe8ab488
A
1939 head = tail = NULL;
1940
39236c6e
A
1941 while (tot_sent < sb_cc) {
1942 struct mbuf *m;
fe8ab488 1943 size_t mlen;
39236c6e
A
1944
1945 mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1946 mlen -= off;
1947 if (mlen == 0)
1948 goto out;
1949
1950 if (mlen > sb_cc) {
1951 panic("%s: unexpected %lu %lu \n", __func__,
1952 mlen, sb_cc);
1953 }
1954
fe8ab488
A
1955 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
1956 M_COPYM_MUST_COPY_HDR);
39236c6e
A
1957 if (m == NULL) {
1958 error = ENOBUFS;
1959 break;
1960 }
1961
1962 /* Create a DSN mapping for the data (m_copym does it) */
1963 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
fe8ab488 1964 VERIFY(m->m_flags & M_PKTHDR);
39236c6e
A
1965 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
1966 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
1967 m->m_pkthdr.mp_dsn = mpt_dsn + off;
1968 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
1969 m->m_pkthdr.mp_rlen = mlen;
1970 mpts->mpts_rel_seq += mlen;
1971 m->m_pkthdr.len = mlen;
1972
fe8ab488
A
1973 if (head == NULL) {
1974 head = tail = m;
1975 } else {
1976 tail->m_next = m;
1977 tail = m;
1978 }
1979
fe8ab488
A
1980 tot_sent += mlen;
1981 off = 0;
1982 mpt_mbuf = mpt_mbuf->m_next;
1983 }
1984
1985 if (head != NULL) {
1986
1987 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
1988 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1989 tp->t_mpflags |= TMPF_FASTJOIN_SEND;
1990 }
1991
1992 error = sock_sendmbuf(so, NULL, head, 0, NULL);
1993
1994 DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
39236c6e
A
1995 struct sockbuf *, &so->so_rcv,
1996 struct sockbuf *, &so->so_snd,
1997 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488
A
1998 size_t, tot_sent);
1999 }
2000
2001 if (error == 0) {
2002 mpts->mpts_sndnxt += tot_sent;
3e170ce0
A
2003
2004 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2005 tcpstat.tcps_mp_num_probes++;
2006 if (tot_sent < mpts->mpts_maxseg)
2007 mpts->mpts_probecnt += 1;
2008 else
2009 mpts->mpts_probecnt +=
2010 tot_sent/mpts->mpts_maxseg;
2011 }
2012
39236c6e 2013 MPT_LOCK(mp_tp);
3e170ce0 2014
39236c6e
A
2015 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
2016 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
2017 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2018 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2019 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
2020 }
fe8ab488 2021 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
39236c6e 2022 MPT_UNLOCK(mp_tp);
fe8ab488
A
2023
2024 /* Send once in SYN_SENT state to avoid sending SYN spam */
2025 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
2026 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
2027 mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
39236c6e 2028 }
39236c6e 2029
3e170ce0
A
2030 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2031 (mpts->mpts_probesoon != 0))
2032 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid %d "
2033 "wrote %d %d probe %d probedelta %d\n",
fe8ab488 2034 __func__, mpts->mpts_connid, (int)tot_sent,
3e170ce0
A
2035 (int) sb_cc, mpts->mpts_probecnt,
2036 (tcp_now - mpts->mpts_probesoon)),
2037 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2038 } else {
3e170ce0
A
2039 mptcplog((LOG_ERR, "MPTCP Sender: %s cid %d error %d len %zd\n",
2040 __func__, mpts->mpts_connid, error, tot_sent),
2041 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2042 }
2043out:
2044 return (error);
2045}
2046
2047/*
2048 * Subflow socket control event upcall.
2049 *
2050 * Called when the associated subflow socket posted one or more control events.
2051 * The subflow socket lock has been released prior to invoking the callback.
2052 * Note that the upcall may occur synchronously as a result of MPTCP performing
2053 * an action on it, or asynchronously as a result of an event happening at the
2054 * subflow layer. Therefore, to maintain lock ordering, the only lock that can
2055 * be acquired here is the thread lock, for signalling purposes.
2056 */
2057static void
2058mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
2059{
2060#pragma unused(so)
2061 struct mptsub *mpts = arg;
2062 struct mptses *mpte = mpts->mpts_mpte;
2063
2064 VERIFY(mpte != NULL);
2065
2066 lck_mtx_lock(&mpte->mpte_thread_lock);
2067 atomic_bitset_32(&mpts->mpts_evctl, events);
2068 mptcp_thread_signal_locked(mpte);
2069 lck_mtx_unlock(&mpte->mpte_thread_lock);
2070}
2071
2072/*
2073 * Subflow socket control events.
2074 *
2075 * Called for handling events related to the underlying subflow socket.
2076 */
2077static ev_ret_t
3e170ce0
A
2078mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
2079 uint64_t *p_mpsofilt_hint)
39236c6e 2080{
fe8ab488 2081 uint32_t events, save_events;
39236c6e 2082 ev_ret_t ret = MPTS_EVRET_OK;
3e170ce0
A
2083 int i = 0;
2084 int mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl)/
2085 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e
A
2086 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2087 MPTS_LOCK_ASSERT_HELD(mpts);
2088
2089 /* bail if there's nothing to process */
2090 if ((events = mpts->mpts_evctl) == 0)
2091 return (ret);
2092
2093 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
2094 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
2095 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
2096 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
2097 SO_FILT_HINT_DISCONNECTED)) {
2098 events |= SO_FILT_HINT_MPFAILOVER;
2099 }
2100
fe8ab488
A
2101 save_events = events;
2102
39236c6e
A
2103 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
2104 struct mptsub *, mpts, uint32_t, events);
2105
3e170ce0
A
2106 mptcplog((LOG_DEBUG, "MPTCP Events: %s cid %d events=%b\n", __func__,
2107 mpts->mpts_connid, events, SO_FILT_HINT_BITS),
2108 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
2109
2110 /*
2111 * Process all the socket filter hints and reset the hint
2112 * once it is handled
2113 */
2114 for (i = 0; (i < mpsub_ev_entry_count) && events; i++) {
2115 if ((events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
2116 (ret >= MPTS_EVRET_OK)) {
2117 ev_ret_t error =
2118 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint);
2119 events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
2120 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2121 }
fe8ab488
A
2122 }
2123
39236c6e
A
2124 /*
2125 * We should be getting only events specified via sock_catchevents(),
2126 * so loudly complain if we have any unprocessed one(s).
2127 */
2128 if (events != 0 || ret < MPTS_EVRET_OK) {
3e170ce0 2129 mptcplog((LOG_ERR, "MPTCP Events %s%s: cid %d evret %s (%d)"
39236c6e 2130 " unhandled events=%b\n",
3e170ce0 2131 (events != 0) && (ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
39236c6e 2132 __func__, mpts->mpts_connid,
3e170ce0
A
2133 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS),
2134 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2135 }
2136
2137 /* clear the ones we've processed */
fe8ab488 2138 atomic_bitclear_32(&mpts->mpts_evctl, save_events);
39236c6e
A
2139 return (ret);
2140}
2141
2142/*
2143 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
2144 */
2145static ev_ret_t
3e170ce0
A
2146mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts,
2147 uint64_t *p_mpsofilt_hint)
39236c6e
A
2148{
2149 struct socket *mp_so, *so;
2150 struct mptcb *mp_tp;
2151 boolean_t linger;
2152
2153 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2154 MPTS_LOCK_ASSERT_HELD(mpts);
2155 VERIFY(mpte->mpte_mppcb != NULL);
2156 mp_so = mpte->mpte_mppcb->mpp_socket;
2157 mp_tp = mpte->mpte_mptcb;
2158 so = mpts->mpts_socket;
2159
2160 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2161 !(mp_so->so_flags & SOF_PCBCLEARING));
2162
3e170ce0
A
2163 mptcplog((LOG_DEBUG, "MPTCP Events: "
2164 "%s: cid %d [linger %s]\n", __func__,
2165 mpts->mpts_connid, (linger ? "YES" : "NO")),
2166 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2167
39236c6e
A
2168 /*
2169 * We got a TCP RST for this subflow connection.
2170 *
2171 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
fe8ab488
A
2172 * client if the MPTCP connection has not been established or
2173 * if the connection has only one subflow and is a connection being
2174 * resumed. Otherwise we close the socket.
39236c6e
A
2175 */
2176 mptcp_subflow_disconnect(mpte, mpts, !linger);
2177
2178 MPT_LOCK(mp_tp);
2179 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
fe8ab488
A
2180 mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
2181 } else if (mpte->mpte_nummpcapflows < 1) {
2182 mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
3e170ce0 2183 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET;
39236c6e
A
2184 }
2185 MPT_UNLOCK(mp_tp);
2186
2187 /*
2188 * Keep the subflow socket around, unless the MPTCP socket has
2189 * been detached or the subflow has been disconnected explicitly,
2190 * in which case it should be deleted right away.
2191 */
2192 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2193}
2194
2195/*
2196 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2197 */
2198static ev_ret_t
3e170ce0
A
2199mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2200 uint64_t *p_mpsofilt_hint)
39236c6e 2201{
3e170ce0 2202#pragma unused(p_mpsofilt_hint)
39236c6e
A
2203 struct socket *so;
2204
2205 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2206 MPTS_LOCK_ASSERT_HELD(mpts);
2207
2208 so = mpts->mpts_socket;
2209
3e170ce0
A
2210 mptcplog((LOG_DEBUG, "MPTCP Events: "
2211 "%s: cid %d\n", __func__, mpts->mpts_connid),
2212 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2213
2214 /*
2215 * We got a FIN for this subflow connection. This subflow socket
2216 * is no longer available for receiving data;
2217 * The FIN may arrive with data. The data is handed up to the
2218 * mptcp socket and the subflow is disconnected.
2219 */
2220
2221 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2222}
2223
2224/*
2225 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2226 */
2227static ev_ret_t
3e170ce0
A
2228mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts,
2229 uint64_t *p_mpsofilt_hint)
39236c6e 2230{
3e170ce0 2231#pragma unused(p_mpsofilt_hint)
39236c6e
A
2232 struct socket *so;
2233
2234 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2235 MPTS_LOCK_ASSERT_HELD(mpts);
2236
2237 so = mpts->mpts_socket;
2238
3e170ce0
A
2239 mptcplog((LOG_DEBUG, "MPTCP Events: "
2240 "%s: cid %d\n", __func__, mpts->mpts_connid),
2241 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2242
39236c6e
A
2243 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2244}
2245
2246/*
2247 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2248 */
2249static ev_ret_t
3e170ce0
A
2250mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts,
2251 uint64_t *p_mpsofilt_hint)
39236c6e 2252{
3e170ce0 2253#pragma unused(p_mpsofilt_hint)
39236c6e
A
2254 struct socket *mp_so, *so;
2255 struct mptcb *mp_tp;
2256 boolean_t linger;
2257
2258 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2259 MPTS_LOCK_ASSERT_HELD(mpts);
2260 VERIFY(mpte->mpte_mppcb != NULL);
2261 mp_so = mpte->mpte_mppcb->mpp_socket;
2262 mp_tp = mpte->mpte_mptcb;
2263 so = mpts->mpts_socket;
2264
2265 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2266 !(mp_so->so_flags & SOF_PCBCLEARING));
2267
3e170ce0
A
2268 mptcplog((LOG_NOTICE, "MPTCP Events: "
2269 "%s: cid %d [linger %s]\n", __func__,
2270 mpts->mpts_connid, (linger ? "YES" : "NO")),
2271 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2272
2273 if (mpts->mpts_soerror == 0)
2274 mpts->mpts_soerror = ETIMEDOUT;
2275
2276 /*
2277 * The subflow connection has timed out.
2278 *
2279 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2280 * client if the MPTCP connection has not been established. Otherwise
2281 * drop it.
2282 */
2283 mptcp_subflow_disconnect(mpte, mpts, !linger);
2284
2285 MPT_LOCK(mp_tp);
2286 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2287 mp_so->so_error = ETIMEDOUT;
2288 }
2289 MPT_UNLOCK(mp_tp);
2290
2291 /*
2292 * Keep the subflow socket around, unless the MPTCP socket has
2293 * been detached or the subflow has been disconnected explicitly,
2294 * in which case it should be deleted right away.
2295 */
2296 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2297}
2298
2299/*
2300 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2301 */
2302static ev_ret_t
3e170ce0
A
2303mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
2304 uint64_t *p_mpsofilt_hint)
39236c6e 2305{
3e170ce0 2306#pragma unused(p_mpsofilt_hint)
39236c6e
A
2307 struct socket *mp_so, *so;
2308 struct mptcb *mp_tp;
2309 boolean_t linger;
2310 struct tcpcb *tp = NULL;
2311
2312 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2313 MPTS_LOCK_ASSERT_HELD(mpts);
2314
2315 VERIFY(mpte->mpte_mppcb != NULL);
2316 mp_so = mpte->mpte_mppcb->mpp_socket;
2317 mp_tp = mpte->mpte_mptcb;
2318 so = mpts->mpts_socket;
2319
2320 /* Not grabbing socket lock as t_local_aid is write once only */
2321 tp = intotcpcb(sotoinpcb(so));
2322 /*
2323 * This overwrites any previous mpte_lost_aid to avoid storing
2324 * too much state when the typical case has only two subflows.
2325 */
2326 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2327 mpte->mpte_lost_aid = tp->t_local_aid;
2328
2329 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2330 !(mp_so->so_flags & SOF_PCBCLEARING));
2331
3e170ce0
A
2332 mptcplog((LOG_DEBUG, "MPTCP Events: "
2333 "%s cid %d [linger %s]\n", __func__,
2334 mpts->mpts_connid, (linger ? "YES" : "NO")),
2335 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2336
2337 if (mpts->mpts_soerror == 0)
2338 mpts->mpts_soerror = EADDRNOTAVAIL;
2339
2340 /*
2341 * The subflow connection has lost its source address.
2342 *
2343 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2344 * client if the MPTCP connection has not been established. If it
2345 * has been established with one subflow , we keep the MPTCP
2346 * connection valid without any subflows till closed by application.
2347 * This lets tcp connection manager decide whether to close this or
2348 * not as it reacts to reachability changes too.
2349 */
2350 mptcp_subflow_disconnect(mpte, mpts, !linger);
2351
2352 MPT_LOCK(mp_tp);
2353 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2354 (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2355 mp_so->so_error = EADDRNOTAVAIL;
2356 }
2357 MPT_UNLOCK(mp_tp);
2358
2359 /*
2360 * Keep the subflow socket around, unless the MPTCP socket has
2361 * been detached or the subflow has been disconnected explicitly,
2362 * in which case it should be deleted right away.
2363 */
2364 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2365}
2366
fe8ab488
A
2367/*
2368 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
2369 * indicates that the remote side sent a Data FIN
2370 */
2371static ev_ret_t
3e170ce0
A
2372mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2373 uint64_t *p_mpsofilt_hint)
fe8ab488
A
2374{
2375 struct socket *so, *mp_so;
2376 struct mptcb *mp_tp;
2377
2378 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2379 MPTS_LOCK_ASSERT_HELD(mpts);
2380 mp_so = mpte->mpte_mppcb->mpp_socket;
2381 so = mpts->mpts_socket;
2382 mp_tp = mpte->mpte_mptcb;
2383
3e170ce0
A
2384 mptcplog((LOG_DEBUG, "MPTCP Events: "
2385 "%s: cid %d\n", __func__, mpts->mpts_connid),
2386 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
2387
2388 /*
2389 * We got a Data FIN for the MPTCP connection.
2390 * The FIN may arrive with data. The data is handed up to the
2391 * mptcp socket and the user is notified so that it may close
2392 * the socket if needed.
2393 */
2394 MPT_LOCK(mp_tp);
2395 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3e170ce0 2396 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
fe8ab488
A
2397 }
2398 MPT_UNLOCK(mp_tp);
2399 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2400}
2401
39236c6e
A
2402/*
2403 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2404 */
2405static ev_ret_t
3e170ce0
A
2406mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
2407 uint64_t *p_mpsofilt_hint)
39236c6e
A
2408{
2409 struct mptsub *mpts_alt = NULL;
2410 struct socket *so = NULL;
2411 struct socket *mp_so;
2412 int altpath_exists = 0;
2413
2414 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2415 MPTS_LOCK_ASSERT_HELD(mpts);
2416 mp_so = mpte->mpte_mppcb->mpp_socket;
3e170ce0
A
2417 mptcplog((LOG_NOTICE, "MPTCP Events: "
2418 "%s: mp_so 0x%llx\n", __func__,
2419 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2420 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2421
2422 MPTS_UNLOCK(mpts);
3e170ce0 2423 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e
A
2424
2425 /*
2426 * If there is no alternate eligible subflow, ignore the
2427 * failover hint.
2428 */
2429 if (mpts_alt == NULL) {
3e170ce0
A
2430 mptcplog((LOG_WARNING, "MPTCP Events: "
2431 "%s: no alternate path\n", __func__),
2432 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2433
fe8ab488
A
2434 if (mptcp_delayed_subf_start) {
2435 mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
2436 if (mpts_alt != NULL) {
2437 MPTS_LOCK(mpts_alt);
2438 (void) mptcp_subflow_soconnectx(mpte,
2439 mpts_alt);
2440 MPTS_UNLOCK(mpts_alt);
2441 }
2442 }
39236c6e
A
2443 MPTS_LOCK(mpts);
2444 goto done;
2445 }
2446 MPTS_LOCK(mpts_alt);
2447 altpath_exists = 1;
2448 so = mpts_alt->mpts_socket;
2449 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2450 socket_lock(so, 1);
fe8ab488
A
2451 /* All data acknowledged and no RTT spike */
2452 if ((so->so_snd.sb_cc == 0) &&
2453 (mptcp_no_rto_spike(so))) {
39236c6e
A
2454 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2455 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2456 } else {
2457 /* no alternate path available */
2458 altpath_exists = 0;
2459 }
2460 socket_unlock(so, 1);
2461 }
2462 if (altpath_exists) {
3e170ce0
A
2463 mptcplog((LOG_INFO, "MPTCP Events: "
2464 "%s: cid = %d\n",
2465 __func__, mpts_alt->mpts_connid),
2466 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2467 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3e170ce0 2468 mpts_alt->mpts_peerswitch = 0;
39236c6e
A
2469 struct mptcb *mp_tp = mpte->mpte_mptcb;
2470 /* Bring the subflow's notion of snd_nxt into the send window */
2471 MPT_LOCK(mp_tp);
2472 mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2473 MPT_UNLOCK(mp_tp);
2474 mpte->mpte_active_sub = mpts_alt;
2475 socket_lock(so, 1);
2476 sowwakeup(so);
2477 socket_unlock(so, 1);
2478 }
2479 MPTS_UNLOCK(mpts_alt);
2480
2481 if (altpath_exists) {
3e170ce0
A
2482 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
2483 mptcplog((LOG_NOTICE, "MPTCP Events: "
2484 "%s: mp_so 0x%llx switched from "
39236c6e
A
2485 "%d to %d\n", __func__,
2486 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
2487 mpts->mpts_connid, mpts_alt->mpts_connid),
2488 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2489 tcpstat.tcps_mp_switches++;
2490 }
2491
2492 MPTS_LOCK(mpts);
2493 if (altpath_exists) {
2494 mpts->mpts_flags |= MPTSF_FAILINGOVER;
2495 mpts->mpts_flags &= ~MPTSF_ACTIVE;
2496 } else {
3e170ce0
A
2497 mptcplog((LOG_DEBUG, "MPTCP Events %s: no alt cid = %d\n",
2498 __func__, mpts->mpts_connid),
2499 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2500done:
39236c6e
A
2501 so = mpts->mpts_socket;
2502 socket_lock(so, 1);
2503 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2504 socket_unlock(so, 1);
2505 }
39236c6e
A
2506 MPTS_LOCK_ASSERT_HELD(mpts);
2507 return (MPTS_EVRET_OK);
2508}
2509
2510/*
2511 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2512 */
2513static ev_ret_t
3e170ce0
A
2514mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
2515 uint64_t *p_mpsofilt_hint)
39236c6e
A
2516{
2517 struct socket *mp_so, *so;
2518 struct mptcb *mp_tp;
2519 boolean_t linger;
2520
2521 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2522 MPTS_LOCK_ASSERT_HELD(mpts);
2523 VERIFY(mpte->mpte_mppcb != NULL);
2524 mp_so = mpte->mpte_mppcb->mpp_socket;
2525 mp_tp = mpte->mpte_mptcb;
2526 so = mpts->mpts_socket;
2527
2528 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2529 !(mp_so->so_flags & SOF_PCBCLEARING));
2530
3e170ce0
A
2531 mptcplog((LOG_DEBUG, "MPTCP Events: "
2532 "%s: cid %d [linger %s]\n", __func__,
2533 mpts->mpts_connid, (linger ? "YES" : "NO")),
2534 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2535
2536 if (mpts->mpts_soerror == 0)
2537 mpts->mpts_soerror = EHOSTUNREACH;
2538
2539 /*
2540 * The subflow connection cannot use the outgoing interface.
2541 *
2542 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2543 * client if the MPTCP connection has not been established. If it
2544 * has been established, let the upper layer call disconnectx.
2545 */
2546 mptcp_subflow_disconnect(mpte, mpts, !linger);
3e170ce0 2547 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED;
39236c6e
A
2548
2549 MPT_LOCK(mp_tp);
2550 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2551 mp_so->so_error = EHOSTUNREACH;
2552 }
2553 MPT_UNLOCK(mp_tp);
2554
39236c6e
A
2555 /*
2556 * Keep the subflow socket around, unless the MPTCP socket has
2557 * been detached or the subflow has been disconnected explicitly,
2558 * in which case it should be deleted right away.
2559 */
2560 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2561}
2562
2563/*
2564 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2565 */
2566static ev_ret_t
3e170ce0
A
2567mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts,
2568 uint64_t *p_mpsofilt_hint)
39236c6e 2569{
3e170ce0 2570#pragma unused(p_mpsofilt_hint)
39236c6e
A
2571 struct socket *so;
2572
2573 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2574 MPTS_LOCK_ASSERT_HELD(mpts);
2575
2576 so = mpts->mpts_socket;
2577
2578 /* the subflow connection is being flow controlled */
2579 mpts->mpts_flags |= MPTSF_SUSPENDED;
2580
3e170ce0
A
2581 mptcplog((LOG_DEBUG, "MPTCP Events: "
2582 "%s: cid %d\n", __func__,
2583 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2584
2585 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2586}
2587
2588/*
2589 * Handle SO_FILT_HINT_RESUME subflow socket event.
2590 */
2591static ev_ret_t
3e170ce0
A
2592mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts,
2593 uint64_t *p_mpsofilt_hint)
39236c6e 2594{
3e170ce0 2595#pragma unused(p_mpsofilt_hint)
39236c6e
A
2596 struct socket *so;
2597
2598 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2599 MPTS_LOCK_ASSERT_HELD(mpts);
2600
2601 so = mpts->mpts_socket;
2602
2603 /* the subflow connection is no longer flow controlled */
2604 mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2605
3e170ce0
A
2606 mptcplog((LOG_DEBUG, "MPTCP Events: "
2607 "%s: cid %d\n", __func__, mpts->mpts_connid),
2608 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2609
2610 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2611}
2612
2613/*
2614 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2615 */
2616static ev_ret_t
3e170ce0
A
2617mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
2618 uint64_t *p_mpsofilt_hint)
39236c6e
A
2619{
2620 char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
2621 struct sockaddr_entry *src_se, *dst_se;
2622 struct sockaddr_storage src;
2623 struct socket *mp_so, *so;
2624 struct mptcb *mp_tp;
2625 struct ifnet *outifp;
2626 int af, error = 0;
2627 boolean_t mpok = FALSE;
3e170ce0
A
2628 boolean_t cell = FALSE;
2629 boolean_t wifi = FALSE;
2630 boolean_t wired = FALSE;
39236c6e
A
2631
2632 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2633 VERIFY(mpte->mpte_mppcb != NULL);
2634 mp_so = mpte->mpte_mppcb->mpp_socket;
2635 mp_tp = mpte->mpte_mptcb;
2636
2637 MPTS_LOCK_ASSERT_HELD(mpts);
2638 so = mpts->mpts_socket;
2639 af = mpts->mpts_family;
2640
2641 if (mpts->mpts_flags & MPTSF_CONNECTED)
2642 return (MPTS_EVRET_OK);
2643
2644 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2645 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
fe8ab488
A
2646 socket_lock(so, 0);
2647 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2648 (so->so_state & SS_ISCONNECTED)) {
3e170ce0
A
2649 mptcplog((LOG_DEBUG, "MPTCP Events: "
2650 "%s: cid %d disconnect before tcp connect\n",
2651 __func__, mpts->mpts_connid),
2652 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
2653 (void) soshutdownlock(so, SHUT_RD);
2654 (void) soshutdownlock(so, SHUT_WR);
2655 (void) sodisconnectlocked(so);
2656 }
2657 socket_unlock(so, 0);
39236c6e
A
2658 return (MPTS_EVRET_OK);
2659 }
2660
2661 /*
2662 * The subflow connection has been connected. Find out whether it
2663 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
2664 *
2665 * a. If MPTCP connection is not yet established, then this must be
2666 * the first subflow connection. If MPTCP failed to negotiate,
2667 * indicate to the MPTCP socket client via EPROTO, that the
2668 * underlying TCP connection may be peeled off via peeloff(2).
2669 * Otherwise, mark the MPTCP socket as connected.
2670 *
2671 * b. If MPTCP connection has been established, then this must be
2672 * one of the subsequent subflow connections. If MPTCP failed
2673 * to negotiate, disconnect the connection since peeloff(2)
2674 * is no longer possible.
2675 *
2676 * Right now, we simply unblock any waiters at the MPTCP socket layer
2677 * if the MPTCP connection has not been established.
2678 */
2679 socket_lock(so, 0);
2680
2681 if (so->so_state & SS_ISDISCONNECTED) {
2682 /*
2683 * With MPTCP joins, a connection is connected at the subflow
2684 * level, but the 4th ACK from the server elevates the MPTCP
2685 * subflow to connected state. So there is a small window
2686 * where the subflow could get disconnected before the
2687 * connected event is processed.
2688 */
2689 socket_unlock(so, 0);
2690 return (MPTS_EVRET_OK);
2691 }
2692
2693 mpts->mpts_soerror = 0;
2694 mpts->mpts_flags &= ~MPTSF_CONNECTING;
2695 mpts->mpts_flags |= MPTSF_CONNECTED;
2696 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
2697 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2698
2699 VERIFY(mpts->mpts_dst_sl != NULL);
2700 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
2701 VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
2702 dst_se->se_addr->sa_family == af);
2703
2704 VERIFY(mpts->mpts_src_sl != NULL);
2705 src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
2706 VERIFY(src_se != NULL && src_se->se_addr != NULL &&
2707 src_se->se_addr->sa_family == af);
2708
2709 /* get/check source IP address */
2710 switch (af) {
2711 case AF_INET: {
2712 error = in_getsockaddr_s(so, &src);
2713 if (error == 0) {
2714 struct sockaddr_in *ms = SIN(src_se->se_addr);
2715 struct sockaddr_in *s = SIN(&src);
2716
2717 VERIFY(s->sin_len == ms->sin_len);
2718 VERIFY(ms->sin_family == AF_INET);
2719
2720 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2721 bcmp(&ms->sin_addr, &s->sin_addr,
2722 sizeof (ms->sin_addr)) != 0) {
3e170ce0
A
2723 mptcplog((LOG_ERR, "MPTCP Events: "
2724 "%s: cid %d local "
39236c6e
A
2725 "address %s (expected %s)\n", __func__,
2726 mpts->mpts_connid, inet_ntop(AF_INET,
2727 (void *)&s->sin_addr.s_addr, buf0,
2728 sizeof (buf0)), inet_ntop(AF_INET,
2729 (void *)&ms->sin_addr.s_addr, buf1,
3e170ce0
A
2730 sizeof (buf1))),
2731 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2732 }
2733 bcopy(s, ms, sizeof (*s));
2734 }
2735 break;
2736 }
2737#if INET6
2738 case AF_INET6: {
2739 error = in6_getsockaddr_s(so, &src);
2740 if (error == 0) {
2741 struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
2742 struct sockaddr_in6 *s = SIN6(&src);
2743
2744 VERIFY(s->sin6_len == ms->sin6_len);
2745 VERIFY(ms->sin6_family == AF_INET6);
2746
2747 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2748 bcmp(&ms->sin6_addr, &s->sin6_addr,
2749 sizeof (ms->sin6_addr)) != 0) {
3e170ce0
A
2750 mptcplog((LOG_ERR, "MPTCP Events: "
2751 "%s: cid %d local "
39236c6e
A
2752 "address %s (expected %s)\n", __func__,
2753 mpts->mpts_connid, inet_ntop(AF_INET6,
2754 (void *)&s->sin6_addr, buf0,
2755 sizeof (buf0)), inet_ntop(AF_INET6,
2756 (void *)&ms->sin6_addr, buf1,
3e170ce0
A
2757 sizeof (buf1))),
2758 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2759 }
2760 bcopy(s, ms, sizeof (*s));
2761 }
2762 break;
2763 }
2764#endif /* INET6 */
2765 default:
2766 VERIFY(0);
2767 /* NOTREACHED */
2768 }
2769
2770 if (error != 0) {
3e170ce0
A
2771 mptcplog((LOG_ERR, "MPTCP Events "
2772 "%s: cid %d getsockaddr failed (%d)\n",
2773 __func__, mpts->mpts_connid, error),
2774 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2775 }
2776
2777 /* get/verify the outbound interface */
2778 outifp = sotoinpcb(so)->inp_last_outifp; /* could be NULL */
2779 if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2780 VERIFY(mpts->mpts_outif != NULL);
2781 if (mpts->mpts_outif != outifp) {
3e170ce0 2782 mptcplog((LOG_ERR, "MPTCP Events: %s: cid %d outif %s "
39236c6e
A
2783 "(expected %s)\n", __func__, mpts->mpts_connid,
2784 ((outifp != NULL) ? outifp->if_xname : "NULL"),
3e170ce0
A
2785 mpts->mpts_outif->if_xname),
2786 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2787
39236c6e
A
2788 if (outifp == NULL)
2789 outifp = mpts->mpts_outif;
2790 }
2791 } else {
2792 mpts->mpts_outif = outifp;
2793 }
2794
3e170ce0
A
2795 mpts->mpts_srtt = (intotcpcb(sotoinpcb(so)))->t_srtt;
2796 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(so)))->t_rxtcur;
2797 mpts->mpts_maxseg = (intotcpcb(sotoinpcb(so)))->t_maxseg;
2798
2799 cell = IFNET_IS_CELLULAR(mpts->mpts_outif);
2800 wifi = (!cell && IFNET_IS_WIFI(mpts->mpts_outif));
2801 wired = (!wifi && IFNET_IS_WIRED(mpts->mpts_outif));
2802
2803 if (cell)
2804 mpts->mpts_linktype |= MPTSL_CELL;
2805 else if (wifi)
2806 mpts->mpts_linktype |= MPTSL_WIFI;
2807 else if (wired)
2808 mpts->mpts_linktype |= MPTSL_WIRED;
2809
39236c6e
A
2810 socket_unlock(so, 0);
2811
3e170ce0
A
2812 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: cid %d "
2813 "establishment srtt %d \n", __func__,
2814 mpts->mpts_connid, (mpts->mpts_srtt >> 5)),
2815 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
2816
2817
2818 mptcplog((LOG_DEBUG, "MPTCP Socket: "
2819 "%s: cid %d outif %s %s[%d] -> %s[%d] "
39236c6e
A
2820 "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2821 outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
2822 (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
2823 (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
2824 ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
2825 ntohs(SIN6(src_se->se_addr)->sin6_port)),
2826 inet_ntop(af, ((af == AF_INET) ?
2827 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
2828 (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
2829 ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
2830 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
2831 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
3e170ce0
A
2832 "MPTCP capable" : "a regular TCP")),
2833 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
2834
2835 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2836 MPTS_UNLOCK(mpts);
2837
3e170ce0 2838 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
2839
2840 MPT_LOCK(mp_tp);
2841 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2842 /* case (a) above */
2843 if (!mpok) {
2844 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2845 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2846 MPT_UNLOCK(mp_tp);
2847 } else {
2848 if (mptcp_init_authparms(mp_tp) != 0) {
2849 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2850 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2851 MPT_UNLOCK(mp_tp);
2852 mpok = FALSE;
2853 } else {
3e170ce0
A
2854 mptcplog((LOG_DEBUG, "MPTCP State: "
2855 "MPTCPS_ESTABLISHED for mp_so 0x%llx \n",
2856 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2857 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2858 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2859 mpte->mpte_associd = mpts->mpts_connid;
2860 DTRACE_MPTCP2(state__change,
2861 struct mptcb *, mp_tp,
2862 uint32_t, 0 /* event */);
2863 mptcp_init_statevars(mp_tp);
2864 MPT_UNLOCK(mp_tp);
2865
2866 (void) mptcp_setconnorder(mpte,
2867 mpts->mpts_connid, 1);
2868 soisconnected(mp_so);
2869 }
2870 }
2871 MPTS_LOCK(mpts);
2872 if (mpok) {
2873 /* Initialize the relative sequence number */
2874 mpts->mpts_rel_seq = 1;
2875 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2876 mpte->mpte_nummpcapflows++;
2877 MPT_LOCK_SPIN(mp_tp);
2878 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2879 MPT_UNLOCK(mp_tp);
2880 }
2881 } else if (mpok) {
2882 MPT_UNLOCK(mp_tp);
fe8ab488
A
2883 if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
2884 /* Experimental code, disabled by default. */
2885 sorwakeup(mp_so);
2886 sowwakeup(mp_so);
2887 }
39236c6e
A
2888 /*
2889 * case (b) above
2890 * In case of additional flows, the MPTCP socket is not
2891 * MPTSF_MP_CAPABLE until an ACK is received from server
2892 * for 3-way handshake. TCP would have guaranteed that this
2893 * is an MPTCP subflow.
2894 */
2895 MPTS_LOCK(mpts);
2896 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
fe8ab488 2897 mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
39236c6e 2898 mpte->mpte_nummpcapflows++;
fe8ab488
A
2899 /* With Fastjoin, rel sequence will be nonzero */
2900 if (mpts->mpts_rel_seq == 0)
2901 mpts->mpts_rel_seq = 1;
39236c6e 2902 MPT_LOCK_SPIN(mp_tp);
fe8ab488
A
2903 /* With Fastjoin, sndnxt is updated before connected_ev */
2904 if (mpts->mpts_sndnxt == 0) {
2905 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2906 }
39236c6e 2907 MPT_UNLOCK(mp_tp);
fe8ab488
A
2908 mptcp_output_needed(mpte, mpts);
2909 } else {
2910 MPT_UNLOCK(mp_tp);
2911 MPTS_LOCK(mpts);
39236c6e 2912 }
fe8ab488 2913
39236c6e
A
2914 MPTS_LOCK_ASSERT_HELD(mpts);
2915
2916 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2917}
2918
2919/*
2920 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2921 */
2922static ev_ret_t
3e170ce0
A
2923mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
2924 uint64_t *p_mpsofilt_hint)
39236c6e
A
2925{
2926 struct socket *mp_so, *so;
2927 struct mptcb *mp_tp;
2928 boolean_t linger;
2929
2930 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2931 MPTS_LOCK_ASSERT_HELD(mpts);
2932 VERIFY(mpte->mpte_mppcb != NULL);
2933 mp_so = mpte->mpte_mppcb->mpp_socket;
2934 mp_tp = mpte->mpte_mptcb;
2935 so = mpts->mpts_socket;
2936
2937 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2938 !(mp_so->so_flags & SOF_PCBCLEARING));
2939
3e170ce0
A
2940 mptcplog((LOG_DEBUG, "MPTCP Events: "
2941 "%s: cid %d [linger %s]\n", __func__,
2942 mpts->mpts_connid, (linger ? "YES" : "NO")),
2943 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2944
2945 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2946 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2947
2948 /*
2949 * Clear flags that are used by getconninfo to return state.
fe8ab488 2950 * Retain like MPTSF_DELETEOK for internal purposes.
39236c6e
A
2951 */
2952 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
2953 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
2954 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
2955 MPTSF_SUSPENDED|MPTSF_ACTIVE);
2956 mpts->mpts_flags |= MPTSF_DISCONNECTED;
2957
2958 /*
2959 * The subflow connection has been disconnected.
2960 *
2961 * Right now, we simply unblock any waiters at the MPTCP socket layer
2962 * if the MPTCP connection has not been established.
2963 */
3e170ce0 2964 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
2965
2966 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
2967 mpte->mpte_nummpcapflows--;
fe8ab488
A
2968 if (mpte->mpte_active_sub == mpts) {
2969 mpte->mpte_active_sub = NULL;
3e170ce0
A
2970 mptcplog((LOG_DEBUG, "MPTCP Events: "
2971 "%s: resetting active subflow \n",
2972 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2973 }
39236c6e
A
2974 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
2975 }
2976
2977 MPT_LOCK(mp_tp);
2978 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2979 MPT_UNLOCK(mp_tp);
3e170ce0 2980 MPTS_UNLOCK(mpts);
39236c6e 2981 soisdisconnected(mp_so);
3e170ce0 2982 MPTS_LOCK(mpts);
39236c6e
A
2983 } else {
2984 MPT_UNLOCK(mp_tp);
2985 }
2986
39236c6e
A
2987 /*
2988 * The underlying subflow socket has been disconnected;
2989 * it is no longer useful to us. Keep the subflow socket
2990 * around, unless the MPTCP socket has been detached or
2991 * the subflow has been disconnected explicitly, in which
2992 * case it should be deleted right away.
2993 */
2994 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2995}
2996
2997/*
2998 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
2999 */
3000static ev_ret_t
3e170ce0
A
3001mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3002 uint64_t *p_mpsofilt_hint)
39236c6e
A
3003{
3004 struct socket *mp_so, *so;
3005 struct mptcb *mp_tp;
3e170ce0 3006 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e
A
3007
3008 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3009 VERIFY(mpte->mpte_mppcb != NULL);
3010 mp_so = mpte->mpte_mppcb->mpp_socket;
3011 mp_tp = mpte->mpte_mptcb;
3012
3013 MPTS_LOCK_ASSERT_HELD(mpts);
3014 so = mpts->mpts_socket;
3015
3016 socket_lock(so, 0);
3017 MPT_LOCK(mp_tp);
3018
3019 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3020 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3021 else
3022 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3023
3024 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3025 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3026 goto done;
3027 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3028 }
3029 else
3030 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3031
3032 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3033 mpts->mpts_flags |= MPTSF_MP_READY;
3034 else
3035 mpts->mpts_flags &= ~MPTSF_MP_READY;
3036
3037 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3038 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3039 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3040 }
3041
3042 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3043 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3044 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3e170ce0
A
3045 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3046 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3047 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3048 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3049 ret = MPTS_EVRET_CONNECT_PENDING;
3e170ce0
A
3050 } else {
3051 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3052 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3053 }
3054
3e170ce0
A
3055 mptcplog((LOG_DEBUG, "MPTCP Events: "
3056 "%s: mp_so 0x%llx mpt_flags=%b cid %d "
39236c6e
A
3057 "mptsf=%b\n", __func__,
3058 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
3059 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3e170ce0
A
3060 mpts->mpts_flags, MPTSF_BITS),
3061 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3062
39236c6e
A
3063done:
3064 MPT_UNLOCK(mp_tp);
3065 socket_unlock(so, 0);
39236c6e
A
3066 return (ret);
3067}
3068
3069/*
3070 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3071 */
3072static ev_ret_t
3e170ce0
A
3073mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3074 uint64_t *p_mpsofilt_hint)
39236c6e
A
3075{
3076 struct socket *mp_so, *so;
3077 struct mptcb *mp_tp;
3078 boolean_t linger;
3079
3080
3081 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3082 MPTS_LOCK_ASSERT_HELD(mpts);
3083 VERIFY(mpte->mpte_mppcb != NULL);
3084 mp_so = mpte->mpte_mppcb->mpp_socket;
3085 mp_tp = mpte->mpte_mptcb;
3086 so = mpts->mpts_socket;
3087
3088 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
3089 !(mp_so->so_flags & SOF_PCBCLEARING));
3090
3091 if (mpts->mpts_soerror == 0)
3092 mpts->mpts_soerror = ECONNABORTED;
3093
39236c6e
A
3094 /* We got an invalid option or a fast close */
3095 socket_lock(so, 0);
3096 struct tcptemp *t_template;
3097 struct inpcb *inp = sotoinpcb(so);
3098 struct tcpcb *tp = NULL;
3099
3100 tp = intotcpcb(inp);
fe8ab488 3101 so->so_error = ECONNABORTED;
39236c6e
A
3102
3103 t_template = tcp_maketemplate(tp);
3104 if (t_template) {
fe8ab488 3105 struct tcp_respond_args tra;
39236c6e 3106
fe8ab488 3107 bzero(&tra, sizeof(tra));
39236c6e 3108 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 3109 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 3110 else
fe8ab488
A
3111 tra.ifscope = IFSCOPE_NONE;
3112 tra.awdl_unrestricted = 1;
39236c6e
A
3113
3114 tcp_respond(tp, t_template->tt_ipgen,
3115 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 3116 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e 3117 (void) m_free(dtom(t_template));
3e170ce0
A
3118 mptcplog((LOG_DEBUG, "MPTCP Events: "
3119 "%s: mp_so 0x%llx cid %d \n",
39236c6e 3120 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3121 so, mpts->mpts_connid),
3122 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3123 }
3124 socket_unlock(so, 0);
3125 mptcp_subflow_disconnect(mpte, mpts, !linger);
39236c6e 3126
3e170ce0
A
3127 *p_mpsofilt_hint |= (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
3128
3129 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
3130 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e
A
3131
3132 MPT_LOCK(mp_tp);
fe8ab488
A
3133 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) ||
3134 (mp_tp->mpt_state == MPTCPS_FASTCLOSE_WAIT)) {
39236c6e
A
3135 mp_so->so_error = ECONNABORTED;
3136 }
3e170ce0
A
3137 /*
3138 * Ideally there should be a state transition for when a FASTCLOSE
3139 * is received. Right now we keep the connection in MPTCPS_ESTABLISHED
3140 * state and only go to terminal state when the user level code calls
3141 * close after processing the SO_FILT_HINT_CONNRESET event.
3142 */
3143 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3144 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
39236c6e
A
3145 MPT_UNLOCK(mp_tp);
3146
39236c6e
A
3147 /*
3148 * Keep the subflow socket around unless the subflow has been
3149 * disconnected explicitly.
3150 */
3151 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3152}
3153
fe8ab488 3154static ev_ret_t
3e170ce0
A
3155mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts,
3156 uint64_t *p_mpsofilt_hint)
fe8ab488 3157{
3e170ce0 3158#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3159 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3160 MPTS_LOCK_ASSERT_HELD(mpts);
3161 VERIFY(mpte->mpte_mppcb != NULL);
3162
3163 if (mpte->mpte_nummpcapflows == 0) {
3164 struct mptcb *mp_tp = mpte->mpte_mptcb;
3e170ce0
A
3165 mptcplog((LOG_DEBUG,"MPTCP Events: %s: %llx %llx \n",
3166 __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt),
3167 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3168
fe8ab488
A
3169 mpte->mpte_active_sub = mpts;
3170 mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
3171 MPT_LOCK(mp_tp);
3172 /*
3173 * If mptcp_subflow_output is called before fastjoin_ev
3174 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
3175 * and further mpts->mpts_sndnxt is incremented by len copied.
3176 */
3177 if (mpts->mpts_sndnxt == 0) {
3178 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
3179 mpts->mpts_rel_seq = 1;
3180 }
3181 MPT_UNLOCK(mp_tp);
3182 }
3183
3184 return (MPTS_EVRET_OK);
3185}
3186
3187static ev_ret_t
3e170ce0
A
3188mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts,
3189 uint64_t *p_mpsofilt_hint)
fe8ab488 3190{
3e170ce0 3191#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3192 MPTE_LOCK_ASSERT_HELD(mpte);
3193 MPTS_LOCK_ASSERT_HELD(mpts);
3194 VERIFY(mpte->mpte_mppcb != NULL);
3e170ce0
A
3195
3196 mptcplog((LOG_DEBUG, "MPTCP Events: "
3197 "%s cid %d\n", __func__, mpts->mpts_connid),
3198 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
3199
3200 mpts->mpts_flags |= MPTSF_DELETEOK;
3201 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3202 return (MPTS_EVRET_DELETE);
3203 else
3204 return (MPTS_EVRET_OK);
3205}
3206
39236c6e
A
3207static const char *
3208mptcp_evret2str(ev_ret_t ret)
3209{
3210 const char *c = "UNKNOWN";
3211
3212 switch (ret) {
3213 case MPTS_EVRET_DELETE:
3214 c = "MPTS_EVRET_DELETE";
3215 break;
3216 case MPTS_EVRET_CONNECT_PENDING:
3217 c = "MPTS_EVRET_CONNECT_PENDING";
3218 break;
3219 case MPTS_EVRET_DISCONNECT_FALLBACK:
3220 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3221 break;
3222 case MPTS_EVRET_OK:
3223 c = "MPTS_EVRET_OK";
3224 break;
3e170ce0 3225 default:
39236c6e
A
3226 break;
3227 }
3228 return (c);
3229}
3230
3231/*
3232 * Add a reference to a subflow structure; used by MPTS_ADDREF().
3233 */
3234void
3235mptcp_subflow_addref(struct mptsub *mpts, int locked)
3236{
3237 if (!locked)
3238 MPTS_LOCK(mpts);
3239 else
3240 MPTS_LOCK_ASSERT_HELD(mpts);
3241
3242 if (++mpts->mpts_refcnt == 0) {
3243 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
3244 /* NOTREACHED */
3245 }
3246 if (!locked)
3247 MPTS_UNLOCK(mpts);
3248}
3249
3250/*
3251 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
3252 */
3253void
3254mptcp_subflow_remref(struct mptsub *mpts)
3255{
3256 MPTS_LOCK(mpts);
3257 if (mpts->mpts_refcnt == 0) {
3258 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
3259 /* NOTREACHED */
3260 }
3261 if (--mpts->mpts_refcnt > 0) {
3262 MPTS_UNLOCK(mpts);
3263 return;
3264 }
3265 /* callee will unlock and destroy lock */
3266 mptcp_subflow_free(mpts);
3267}
3268
3269/*
3270 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3271 * caller must ensure that the option can be issued on subflow sockets, via
3272 * MPOF_SUBFLOW_OK flag.
3273 */
3274int
3275mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
3276 struct mptopt *mpo)
3277{
3278 struct socket *mp_so;
3279 struct sockopt sopt;
3280 char buf[32];
3281 int error;
3282
3283 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3284 mpo->mpo_flags &= ~MPOF_INTERIM;
3285
3286 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3287 mp_so = mpte->mpte_mppcb->mpp_socket;
3288
3289 bzero(&sopt, sizeof (sopt));
3290 sopt.sopt_dir = SOPT_SET;
3291 sopt.sopt_level = mpo->mpo_level;
3292 sopt.sopt_name = mpo->mpo_name;
3293 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3294 sopt.sopt_valsize = sizeof (int);
3295 sopt.sopt_p = kernproc;
3296
3297 error = sosetoptlock(so, &sopt, 0); /* already locked */
3298 if (error == 0) {
3e170ce0
A
3299 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3300 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3301 "val %d set successful\n", __func__,
3302 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3303 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3304 buf, sizeof (buf)), mpo->mpo_intval),
3305 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3306 } else {
3e170ce0
A
3307 mptcplog((LOG_ERR, "MPTCP Socket: "
3308 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3309 "val %d set error %d\n", __func__,
3310 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3311 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3312 buf, sizeof (buf)), mpo->mpo_intval, error),
3313 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3314 }
3315 return (error);
3316}
3317
3318/*
3319 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
3320 * caller must ensure that the option can be issued on subflow sockets, via
3321 * MPOF_SUBFLOW_OK flag.
3322 */
3323int
3324mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
3325 struct mptopt *mpo)
3326{
3327 struct socket *mp_so;
3328 struct sockopt sopt;
3329 char buf[32];
3330 int error;
3331
3332 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3333 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3334 mp_so = mpte->mpte_mppcb->mpp_socket;
3335
3336 bzero(&sopt, sizeof (sopt));
3337 sopt.sopt_dir = SOPT_GET;
3338 sopt.sopt_level = mpo->mpo_level;
3339 sopt.sopt_name = mpo->mpo_name;
3340 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3341 sopt.sopt_valsize = sizeof (int);
3342 sopt.sopt_p = kernproc;
3343
3344 error = sogetoptlock(so, &sopt, 0); /* already locked */
3345 if (error == 0) {
3e170ce0
A
3346 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3347 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3348 "val %d get successful\n", __func__,
3349 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3350 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3351 buf, sizeof (buf)), mpo->mpo_intval),
3352 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3353 } else {
3e170ce0
A
3354 mptcplog((LOG_ERR, "MPTCP Socket: "
3355 "%s: mp_so 0x%llx sopt %s get error %d\n",
39236c6e
A
3356 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3357 mptcp_sopt2str(mpo->mpo_level,
3e170ce0
A
3358 mpo->mpo_name, buf, sizeof (buf)), error),
3359 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
3360 }
3361 return (error);
3362}
3363
3364
3365/*
3366 * MPTCP garbage collector.
3367 *
3368 * This routine is called by the MP domain on-demand, periodic callout,
3369 * which is triggered when a MPTCP socket is closed. The callout will
3370 * repeat as long as this routine returns a non-zero value.
3371 */
3372static uint32_t
3373mptcp_gc(struct mppcbinfo *mppi)
3374{
3375 struct mppcb *mpp, *tmpp;
3376 uint32_t active = 0;
3377
3378 lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
3379
39236c6e
A
3380 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
3381 struct socket *mp_so;
3382 struct mptses *mpte;
3383 struct mptcb *mp_tp;
3384
3385 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3386 mp_so = mpp->mpp_socket;
3387 VERIFY(mp_so != NULL);
3388 mpte = mptompte(mpp);
3389 VERIFY(mpte != NULL);
3390 mp_tp = mpte->mpte_mptcb;
3391 VERIFY(mp_tp != NULL);
3392
3e170ce0
A
3393 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3394 "%s: mp_so 0x%llx found "
39236c6e
A
3395 "(u=%d,r=%d,s=%d)\n", __func__,
3396 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3e170ce0
A
3397 mp_so->so_retaincnt, mpp->mpp_state),
3398 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3399
3400 if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
3e170ce0
A
3401 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3402 "%s: mp_so 0x%llx skipped "
39236c6e
A
3403 "(u=%d,r=%d)\n", __func__,
3404 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3405 mp_so->so_usecount, mp_so->so_retaincnt),
3406 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3407 active++;
3408 continue;
3409 }
3410
3411 /* check again under the lock */
3412 if (mp_so->so_usecount > 1) {
3413 boolean_t wakeup = FALSE;
3414 struct mptsub *mpts, *tmpts;
3415
3e170ce0
A
3416 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3417 "%s: mp_so 0x%llx skipped "
39236c6e
A
3418 "[u=%d,r=%d] %d %d\n", __func__,
3419 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3420 mp_so->so_usecount, mp_so->so_retaincnt,
3421 mp_tp->mpt_gc_ticks,
3e170ce0
A
3422 mp_tp->mpt_state),
3423 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3424
39236c6e
A
3425 MPT_LOCK(mp_tp);
3426 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3427 if (mp_tp->mpt_gc_ticks > 0)
3428 mp_tp->mpt_gc_ticks--;
3429 if (mp_tp->mpt_gc_ticks == 0) {
3430 wakeup = TRUE;
3431 if (mp_tp->mpt_localkey != NULL) {
3432 mptcp_free_key(
3433 mp_tp->mpt_localkey);
3434 mp_tp->mpt_localkey = NULL;
3435 }
3436 }
3437 }
3438 MPT_UNLOCK(mp_tp);
3439 if (wakeup) {
3440 TAILQ_FOREACH_SAFE(mpts,
3441 &mpte->mpte_subflows, mpts_entry, tmpts) {
3442 MPTS_LOCK(mpts);
3443 mpts->mpts_flags |= MPTSF_DELETEOK;
3444 if (mpts->mpts_soerror == 0)
3445 mpts->mpts_soerror = ETIMEDOUT;
3446 mptcp_subflow_eupcall(mpts->mpts_socket,
3447 mpts, SO_FILT_HINT_DISCONNECTED);
3448 MPTS_UNLOCK(mpts);
3449 }
3450 }
3451 lck_mtx_unlock(&mpp->mpp_lock);
3452 active++;
3453 continue;
3454 }
3455
3456 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
3e170ce0
A
3457 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3458 "%s: mp_so 0x%llx skipped "
39236c6e
A
3459 "[u=%d,r=%d,s=%d]\n", __func__,
3460 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3461 mp_so->so_usecount, mp_so->so_retaincnt,
3e170ce0
A
3462 mpp->mpp_state),
3463 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3464 lck_mtx_unlock(&mpp->mpp_lock);
3465 active++;
3466 continue;
3467 }
3468
3469 /*
3470 * The PCB has been detached, and there is exactly 1 refnct
3471 * held by the MPTCP thread. Signal that thread to terminate,
3472 * after which the last refcnt will be released. That will
3473 * allow it to be destroyed below during the next round.
3474 */
3475 if (mp_so->so_usecount == 1) {
3e170ce0
A
3476 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3477 "%s: mp_so 0x%llx scheduled for "
39236c6e
A
3478 "termination [u=%d,r=%d]\n", __func__,
3479 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3480 mp_so->so_usecount, mp_so->so_retaincnt),
3481 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3482
39236c6e
A
3483 /* signal MPTCP thread to terminate */
3484 mptcp_thread_terminate_signal(mpte);
3485 lck_mtx_unlock(&mpp->mpp_lock);
3486 active++;
3487 continue;
3488 }
3489
3e170ce0
A
3490 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3491 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
39236c6e 3492 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3493 mp_so->so_usecount, mp_so->so_retaincnt),
3494 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3495
39236c6e
A
3496 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
3497 struct sockbuf *, &mp_so->so_rcv,
3498 struct sockbuf *, &mp_so->so_snd,
3499 struct mppcb *, mpp);
3500
3501 mp_pcbdispose(mpp);
3502 }
3503
3504 return (active);
3505}
3506
3507/*
3508 * Drop a MPTCP connection, reporting the specified error.
3509 */
3510struct mptses *
3511mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3512{
3513 struct socket *mp_so;
3514
3515 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3516 MPT_LOCK_ASSERT_HELD(mp_tp);
3517 VERIFY(mpte->mpte_mptcb == mp_tp);
3518 mp_so = mpte->mpte_mppcb->mpp_socket;
3519
fe8ab488 3520 mp_tp->mpt_state = MPTCPS_TERMINATE;
39236c6e
A
3521 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
3522 uint32_t, 0 /* event */);
3523
3524 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3525 errno = mp_tp->mpt_softerror;
3526 mp_so->so_error = errno;
3527
3528 return (mptcp_close(mpte, mp_tp));
3529}
3530
3531/*
3532 * Close a MPTCP control block.
3533 */
3534struct mptses *
3535mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
3536{
3e170ce0
A
3537 struct socket *mp_so = NULL;
3538 struct mptsub *mpts = NULL, *tmpts = NULL;
39236c6e
A
3539
3540 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3541 MPT_LOCK_ASSERT_HELD(mp_tp);
3542 VERIFY(mpte->mpte_mptcb == mp_tp);
3543 mp_so = mpte->mpte_mppcb->mpp_socket;
3544 if (mp_tp->mpt_localkey != NULL) {
3545 mptcp_free_key(mp_tp->mpt_localkey);
3546 mp_tp->mpt_localkey = NULL;
3547 }
3548
3549 MPT_UNLOCK(mp_tp);
3550 soisdisconnected(mp_so);
3551
3552 MPT_LOCK(mp_tp);
3553 if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
3554 return (NULL);
3555 }
3556 MPT_UNLOCK(mp_tp);
3557
3558 /* Clean up all subflows */
3559 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3560 MPTS_LOCK(mpts);
fe8ab488 3561 mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
39236c6e
A
3562 mptcp_subflow_disconnect(mpte, mpts, TRUE);
3563 MPTS_UNLOCK(mpts);
3564 mptcp_subflow_del(mpte, mpts, TRUE);
3565 }
3566 MPT_LOCK(mp_tp);
3567
3568 return (NULL);
3569}
3570
3571void
3572mptcp_notify_close(struct socket *so)
3573{
3574 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3575}
3576
3577/*
3578 * Signal MPTCP thread to wake up.
3579 */
3580void
3581mptcp_thread_signal(struct mptses *mpte)
3582{
3583 lck_mtx_lock(&mpte->mpte_thread_lock);
3584 mptcp_thread_signal_locked(mpte);
3585 lck_mtx_unlock(&mpte->mpte_thread_lock);
3586}
3587
3588/*
3589 * Signal MPTCP thread to wake up (locked version)
3590 */
3591static void
3592mptcp_thread_signal_locked(struct mptses *mpte)
3593{
3594 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3595
3596 mpte->mpte_thread_reqs++;
3597 if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3598 wakeup_one((caddr_t)&mpte->mpte_thread);
3599}
3600
3601/*
3602 * Signal MPTCP thread to terminate.
3603 */
3604static void
3605mptcp_thread_terminate_signal(struct mptses *mpte)
3606{
3607 lck_mtx_lock(&mpte->mpte_thread_lock);
3608 if (mpte->mpte_thread != THREAD_NULL) {
3609 mpte->mpte_thread = THREAD_NULL;
3610 mpte->mpte_thread_reqs++;
3611 if (!mpte->mpte_thread_active)
3612 wakeup_one((caddr_t)&mpte->mpte_thread);
3613 }
3614 lck_mtx_unlock(&mpte->mpte_thread_lock);
3615}
3616
3617/*
3618 * MPTCP thread workloop.
3619 */
3620static void
3621mptcp_thread_dowork(struct mptses *mpte)
3622{
3623 struct socket *mp_so;
3624 struct mptsub *mpts, *tmpts;
3625 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
3e170ce0 3626 uint64_t mpsofilt_hint_mask = 0;
39236c6e
A
3627
3628 MPTE_LOCK(mpte); /* same as MP socket lock */
3629 VERIFY(mpte->mpte_mppcb != NULL);
3630 mp_so = mpte->mpte_mppcb->mpp_socket;
3631 VERIFY(mp_so != NULL);
3632
3633 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3634 ev_ret_t ret;
3635
3636 MPTS_LOCK(mpts);
3637 MPTS_ADDREF_LOCKED(mpts); /* for us */
3638
3639 /* Update process ownership based on parent mptcp socket */
3640 mptcp_update_last_owner(mpts, mp_so);
3641
3642 mptcp_subflow_input(mpte, mpts);
3e170ce0
A
3643
3644 mptcp_get_rtt_measurement(mpts, mpte);
3645
3646 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
39236c6e
A
3647
3648 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3e170ce0
A
3649 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3650 "%s: cid %d \n", __func__,
3651 mpts->mpts_connid),
3652 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3653 (void) mptcp_subflow_output(mpte, mpts);
3654 }
3655
3656 /*
3657 * If MPTCP socket is closed, disconnect all subflows.
3658 * This will generate a disconnect event which will
3659 * be handled during the next iteration, causing a
3660 * non-zero error to be returned above.
3661 */
3662 if (mp_so->so_flags & SOF_PCBCLEARING)
3663 mptcp_subflow_disconnect(mpte, mpts, FALSE);
3664 MPTS_UNLOCK(mpts);
3665
3666 switch (ret) {
39236c6e
A
3667 case MPTS_EVRET_OK:
3668 /* nothing to do */
3669 break;
3670 case MPTS_EVRET_DELETE:
fe8ab488 3671 mptcp_subflow_del(mpte, mpts, TRUE);
39236c6e
A
3672 break;
3673 case MPTS_EVRET_CONNECT_PENDING:
3674 connect_pending = TRUE;
3675 break;
3676 case MPTS_EVRET_DISCONNECT_FALLBACK:
3677 disconnect_fallback = TRUE;
3678 break;
3e170ce0
A
3679 default:
3680 mptcplog((LOG_DEBUG,
3681 "MPTCP Socket: %s: mptcp_subflow_events "
3682 "returned invalid value: %d\n", __func__,
3683 ret),
3684 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3685 break;
39236c6e
A
3686 }
3687 MPTS_REMREF(mpts); /* ours */
3688 }
3689
3e170ce0
A
3690 if (mpsofilt_hint_mask) {
3691 soevent(mp_so, mpsofilt_hint_mask);
39236c6e
A
3692 }
3693
3694 if (!connect_pending && !disconnect_fallback) {
3695 MPTE_UNLOCK(mpte);
3696 return;
3697 }
3698
3699 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3700 MPTS_LOCK(mpts);
3701 if (disconnect_fallback) {
3702 struct socket *so = NULL;
3703 struct inpcb *inp = NULL;
3704 struct tcpcb *tp = NULL;
3705
3706 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3707 MPTS_UNLOCK(mpts);
3708 continue;
3709 }
3710
3711 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3712
3713 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
3e170ce0 3714 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING)) {
39236c6e
A
3715 MPTS_UNLOCK(mpts);
3716 continue;
3717 }
3718 so = mpts->mpts_socket;
3719
3720 /*
3721 * The MPTCP connection has degraded to a fallback
3722 * mode, so there is no point in keeping this subflow
3723 * regardless of its MPTCP-readiness state, unless it
3724 * is the primary one which we use for fallback. This
3725 * assumes that the subflow used for fallback is the
3726 * ACTIVE one.
3727 */
3728
3729 socket_lock(so, 1);
3730 inp = sotoinpcb(so);
3731 tp = intotcpcb(inp);
3732 tp->t_mpflags &=
3733 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
3734 tp->t_mpflags |= TMPF_TCP_FALLBACK;
3735 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3736 socket_unlock(so, 1);
3737 MPTS_UNLOCK(mpts);
3738 continue;
3739 }
3740 tp->t_mpflags |= TMPF_RESET;
3741 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3742 socket_unlock(so, 1);
3743
3744 } else if (connect_pending) {
fe8ab488
A
3745 /*
3746 * If delayed subflow start is set and cellular,
3747 * delay the connect till a retransmission timeout
3748 */
3749
3750 if ((mptcp_delayed_subf_start) &&
3751 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
3752 MPTS_UNLOCK(mpts);
3753 continue;
3754 }
3755
39236c6e
A
3756 /*
3757 * The MPTCP connection has progressed to a state
3758 * where it supports full multipath semantics; allow
3759 * additional joins to be attempted for all subflows
3760 * that are in the PENDING state.
3761 */
3762 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
3763 (void) mptcp_subflow_soconnectx(mpte, mpts);
3764 }
3765 }
3766 MPTS_UNLOCK(mpts);
3767 }
3768
3769 MPTE_UNLOCK(mpte);
3770}
3771
3772/*
3773 * MPTCP thread.
3774 */
3775static void
3776mptcp_thread_func(void *v, wait_result_t w)
3777{
3778#pragma unused(w)
3779 struct mptses *mpte = v;
3780 struct timespec *ts = NULL;
3781
3782 VERIFY(mpte != NULL);
3783
3784 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3785
3786 for (;;) {
3787 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3788
3789 if (mpte->mpte_thread != THREAD_NULL) {
3790 (void) msleep(&mpte->mpte_thread,
3791 &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
3792 __func__, ts);
3793 }
3794
3795 /* MPTCP socket is closed? */
3796 if (mpte->mpte_thread == THREAD_NULL) {
3797 lck_mtx_unlock(&mpte->mpte_thread_lock);
3798 /* callee will destroy thread lock */
3799 mptcp_thread_destroy(mpte);
3800 /* NOTREACHED */
3801 return;
3802 }
3803
3804 mpte->mpte_thread_active = 1;
3805 for (;;) {
3806 uint32_t reqs = mpte->mpte_thread_reqs;
3807
3808 lck_mtx_unlock(&mpte->mpte_thread_lock);
3809 mptcp_thread_dowork(mpte);
3810 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3811
3812 /* if there's no pending request, we're done */
3813 if (reqs == mpte->mpte_thread_reqs ||
3814 mpte->mpte_thread == THREAD_NULL)
3815 break;
3816 }
3817 mpte->mpte_thread_reqs = 0;
3818 mpte->mpte_thread_active = 0;
3819 }
3820}
3821
3822/*
3823 * Destroy a MTCP thread, to be called in the MPTCP thread context
3824 * upon receiving an indication to self-terminate. This routine
3825 * will not return, as the current thread is terminated at the end.
3826 */
3827static void
3828mptcp_thread_destroy(struct mptses *mpte)
3829{
3830 struct socket *mp_so;
3831
3832 MPTE_LOCK(mpte); /* same as MP socket lock */
3833 VERIFY(mpte->mpte_thread == THREAD_NULL);
3834 VERIFY(mpte->mpte_mppcb != NULL);
3835
3836 mptcp_sesdestroy(mpte);
3837
3838 mp_so = mpte->mpte_mppcb->mpp_socket;
3839 VERIFY(mp_so != NULL);
3840 VERIFY(mp_so->so_usecount != 0);
3841 mp_so->so_usecount--; /* for thread */
3842 mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
3843 MPTE_UNLOCK(mpte);
3844
3845 /* for the extra refcnt from kernel_thread_start() */
3846 thread_deallocate(current_thread());
3847 /* this is the end */
3848 thread_terminate(current_thread());
3849 /* NOTREACHED */
3850}
3851
3852/*
3853 * Protocol pr_lock callback.
3854 */
3855int
3856mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3857{
3858 struct mppcb *mpp = sotomppcb(mp_so);
3859 void *lr_saved;
3860
3861 if (lr == NULL)
3862 lr_saved = __builtin_return_address(0);
3863 else
3864 lr_saved = lr;
3865
3866 if (mpp == NULL) {
3867 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3868 mp_so, lr_saved, solockhistory_nr(mp_so));
3869 /* NOTREACHED */
3870 }
3871 lck_mtx_lock(&mpp->mpp_lock);
3872
3873 if (mp_so->so_usecount < 0) {
3874 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3875 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3876 solockhistory_nr(mp_so));
3877 /* NOTREACHED */
3878 }
3879 if (refcount != 0)
3880 mp_so->so_usecount++;
3881 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3882 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3883
3884 return (0);
3885}
3886
3887/*
3888 * Protocol pr_unlock callback.
3889 */
3890int
3891mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3892{
3893 struct mppcb *mpp = sotomppcb(mp_so);
3894 void *lr_saved;
3895
3896 if (lr == NULL)
3897 lr_saved = __builtin_return_address(0);
3898 else
3899 lr_saved = lr;
3900
3901 if (mpp == NULL) {
3902 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
3903 mp_so, mp_so->so_usecount, lr_saved,
3904 solockhistory_nr(mp_so));
3905 /* NOTREACHED */
3906 }
3907 lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
3908
3909 if (refcount != 0)
3910 mp_so->so_usecount--;
3911
3912 if (mp_so->so_usecount < 0) {
3913 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3914 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3915 /* NOTREACHED */
3916 }
3917 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
3918 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3919 lck_mtx_unlock(&mpp->mpp_lock);
3920
3921 return (0);
3922}
3923
3924/*
3925 * Protocol pr_getlock callback.
3926 */
3927lck_mtx_t *
3928mptcp_getlock(struct socket *mp_so, int locktype)
3929{
3930#pragma unused(locktype)
3931 struct mppcb *mpp = sotomppcb(mp_so);
3932
3933 if (mpp == NULL) {
3934 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
3935 solockhistory_nr(mp_so));
3936 /* NOTREACHED */
3937 }
3938 if (mp_so->so_usecount < 0) {
3939 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3940 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3941 /* NOTREACHED */
3942 }
3943 return (&mpp->mpp_lock);
3944}
3945
3946/*
3947 * Key generation functions
3948 */
3949static void
3950mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
3951{
3952 struct mptcp_key_entry *key_elm;
3953try_again:
3954 read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
3955 if (key_entry->mkey_value == 0)
3956 goto try_again;
3957 mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
3958 sizeof (key_entry->mkey_digest));
3959
3960 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3961 if (key_elm->mkey_value == key_entry->mkey_value) {
3962 goto try_again;
3963 }
3964 if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
3965 0) {
3966 goto try_again;
3967 }
3968 }
3969}
3970
3971static mptcp_key_t *
3972mptcp_reserve_key(void)
3973{
3974 struct mptcp_key_entry *key_elm;
3975 struct mptcp_key_entry *found_elm = NULL;
3976
3977 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3978 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3979 if (key_elm->mkey_flags == MKEYF_FREE) {
3980 key_elm->mkey_flags = MKEYF_INUSE;
3981 found_elm = key_elm;
3982 break;
3983 }
3984 }
3985 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3986
3987 if (found_elm) {
3988 return (&found_elm->mkey_value);
3989 }
3990
3991 key_elm = (struct mptcp_key_entry *)
3992 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
3993 key_elm->mkey_flags = MKEYF_INUSE;
3994
3995 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3996 mptcp_generate_unique_key(key_elm);
3997 LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
3998 mptcp_keys_pool.mkph_count += 1;
3999 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4000 return (&key_elm->mkey_value);
4001}
4002
4003static caddr_t
4004mptcp_get_stored_digest(mptcp_key_t *key)
4005{
4006 struct mptcp_key_entry *key_holder;
4007 caddr_t digest = NULL;
4008
4009 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4010 key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
4011 offsetof(struct mptcp_key_entry, mkey_value));
4012 if (key_holder->mkey_flags != MKEYF_INUSE)
4013 panic_plain("%s", __func__);
4014 digest = &key_holder->mkey_digest[0];
4015 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4016 return (digest);
4017}
4018
4019void
4020mptcp_free_key(mptcp_key_t *key)
4021{
4022 struct mptcp_key_entry *key_holder;
4023 struct mptcp_key_entry *key_elm;
4024 int pt = RandomULong();
4025
39236c6e
A
4026 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4027 key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
4028 offsetof(struct mptcp_key_entry, mkey_value));
4029 key_holder->mkey_flags = MKEYF_FREE;
4030
4031 LIST_REMOVE(key_holder, mkey_next);
4032 mptcp_keys_pool.mkph_count -= 1;
4033
4034 /* Free half the time */
4035 if (pt & 0x01) {
4036 zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
4037 } else {
4038 /* Insert it at random point to avoid early reuse */
4039 int i = 0;
4040 if (mptcp_keys_pool.mkph_count > 1) {
4041 pt = pt % (mptcp_keys_pool.mkph_count - 1);
4042 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4043 if (++i >= pt) {
4044 LIST_INSERT_AFTER(key_elm, key_holder,
4045 mkey_next);
4046 break;
4047 }
4048 }
4049 if (i < pt)
4050 panic("missed insertion");
4051 } else {
4052 LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
4053 mkey_next);
4054 }
4055 mptcp_keys_pool.mkph_count += 1;
4056 }
4057 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4058}
4059
4060static void
4061mptcp_key_pool_init(void)
4062{
4063 int i;
4064 struct mptcp_key_entry *key_entry;
4065
4066 LIST_INIT(&mptcp_keys_pool);
4067 mptcp_keys_pool.mkph_count = 0;
4068
4069 mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
4070 (sizeof (struct mptcp_key_entry));
4071 mptcp_keys_pool.mkph_key_entry_zone = zinit(
4072 mptcp_keys_pool.mkph_key_elm_sz,
4073 MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
4074 MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
4075 if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
4076 panic("%s: unable to allocate MPTCP keys zone \n", __func__);
4077 /* NOTREACHED */
4078 }
4079 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
4080 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
4081
4082 for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
4083 key_entry = (struct mptcp_key_entry *)
4084 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4085 key_entry->mkey_flags = MKEYF_FREE;
4086 mptcp_generate_unique_key(key_entry);
4087 LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
4088 mptcp_keys_pool.mkph_count += 1;
4089 }
4090 lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
4091 mtcbinfo.mppi_lock_attr);
4092}
4093
4094/*
4095 * MPTCP Join support
4096 */
4097
4098static void
4099mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
fe8ab488 4100 uint8_t addr_id)
39236c6e
A
4101{
4102 struct tcpcb *tp = sototcpcb(so);
4103 struct mptcp_subf_auth_entry *sauth_entry;
4104 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4105
4106 MPT_LOCK_SPIN(mp_tp);
4107 tp->t_mptcb = mp_tp;
39236c6e 4108 /*
39236c6e
A
4109 * The address ID of the first flow is implicitly 0.
4110 */
4111 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4112 tp->t_local_aid = 0;
4113 } else {
fe8ab488 4114 tp->t_local_aid = addr_id;
39236c6e
A
4115 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4116 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4117 }
fe8ab488 4118 MPT_UNLOCK(mp_tp);
39236c6e
A
4119 sauth_entry = zalloc(mpt_subauth_zone);
4120 sauth_entry->msae_laddr_id = tp->t_local_aid;
4121 sauth_entry->msae_raddr_id = 0;
4122 sauth_entry->msae_raddr_rand = 0;
4123try_again:
4124 sauth_entry->msae_laddr_rand = RandomULong();
4125 if (sauth_entry->msae_laddr_rand == 0)
4126 goto try_again;
fe8ab488 4127 MPT_LOCK_SPIN(mp_tp);
39236c6e 4128 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
fe8ab488 4129 MPT_UNLOCK(mp_tp);
39236c6e
A
4130}
4131
4132static void
4133mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4134{
4135 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 4136 struct tcpcb *tp = NULL;
39236c6e
A
4137 int found = 0;
4138
fe8ab488
A
4139 socket_lock(so, 0);
4140 tp = sototcpcb(so);
4141 if (tp == NULL) {
4142 socket_unlock(so, 0);
39236c6e 4143 return;
fe8ab488 4144 }
39236c6e
A
4145
4146 MPT_LOCK(mp_tp);
4147 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4148 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4149 found = 1;
4150 break;
4151 }
4152 }
4153 if (found) {
4154 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 4155 }
39236c6e 4156 MPT_UNLOCK(mp_tp);
fe8ab488 4157
3e170ce0
A
4158 if (found)
4159 zfree(mpt_subauth_zone, sauth_entry);
4160
fe8ab488
A
4161 tp->t_mptcb = NULL;
4162 socket_unlock(so, 0);
39236c6e
A
4163}
4164
4165void
4166mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4167 u_int32_t *rrand)
4168{
4169 struct mptcp_subf_auth_entry *sauth_entry;
4170 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4171
4172 MPT_LOCK(mp_tp);
4173 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4174 if (sauth_entry->msae_laddr_id == addr_id) {
4175 if (lrand)
4176 *lrand = sauth_entry->msae_laddr_rand;
4177 if (rrand)
4178 *rrand = sauth_entry->msae_raddr_rand;
4179 break;
4180 }
4181 }
4182 MPT_UNLOCK(mp_tp);
4183}
4184
4185void
4186mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4187 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4188{
4189 struct mptcp_subf_auth_entry *sauth_entry;
4190 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4191
4192 MPT_LOCK(mp_tp);
4193 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4194 if (sauth_entry->msae_laddr_id == laddr_id) {
4195 if ((sauth_entry->msae_raddr_id != 0) &&
4196 (sauth_entry->msae_raddr_id != raddr_id)) {
3e170ce0 4197 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
39236c6e 4198 " address ids %d %d \n", __func__, raddr_id,
3e170ce0
A
4199 sauth_entry->msae_raddr_id),
4200 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4201 MPT_UNLOCK(mp_tp);
4202 return;
4203 }
4204 sauth_entry->msae_raddr_id = raddr_id;
4205 if ((sauth_entry->msae_raddr_rand != 0) &&
4206 (sauth_entry->msae_raddr_rand != raddr_rand)) {
3e170ce0
A
4207 mptcplog((LOG_ERR, "MPTCP Socket: "
4208 "%s: dup SYN_ACK %d %d \n",
39236c6e 4209 __func__, raddr_rand,
3e170ce0
A
4210 sauth_entry->msae_raddr_rand),
4211 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4212 MPT_UNLOCK(mp_tp);
4213 return;
4214 }
4215 sauth_entry->msae_raddr_rand = raddr_rand;
4216 MPT_UNLOCK(mp_tp);
4217 return;
4218 }
4219 }
4220 MPT_UNLOCK(mp_tp);
4221}
4222
4223/*
4224 * SHA1 support for MPTCP
4225 */
4226static int
4227mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
4228{
4229 SHA1_CTX sha1ctxt;
4230 const unsigned char *sha1_base;
4231 int sha1_size;
4232
4233 if (digest_len != SHA1_RESULTLEN) {
4234 return (FALSE);
4235 }
4236
4237 sha1_base = (const unsigned char *) key;
4238 sha1_size = sizeof (mptcp_key_t);
4239 SHA1Init(&sha1ctxt);
4240 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4241 SHA1Final(sha_digest, &sha1ctxt);
4242 return (TRUE);
4243}
4244
4245void
4246mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4247 u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
4248{
4249 SHA1_CTX sha1ctxt;
4250 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4251 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4252 u_int32_t data[2];
4253 int i;
4254
4255 bzero(digest, digest_len);
4256
4257 /* Set up the Key for HMAC */
4258 key_ipad[0] = key1;
4259 key_ipad[1] = key2;
4260
4261 key_opad[0] = key1;
4262 key_opad[1] = key2;
4263
4264 /* Set up the message for HMAC */
4265 data[0] = rand1;
4266 data[1] = rand2;
4267
4268 /* Key is 512 block length, so no need to compute hash */
4269
4270 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4271
4272 for (i = 0; i < 8; i++) {
4273 key_ipad[i] ^= 0x3636363636363636;
4274 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4275 }
4276
4277 /* Perform inner SHA1 */
4278 SHA1Init(&sha1ctxt);
4279 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4280 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4281 SHA1Final(digest, &sha1ctxt);
4282
4283 /* Perform outer SHA1 */
4284 SHA1Init(&sha1ctxt);
4285 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4286 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4287 SHA1Final(digest, &sha1ctxt);
4288}
4289
4290/*
4291 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4292 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4293 */
4294void
4295mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
4296 int digest_len)
4297{
4298 uint32_t lrand, rrand;
4299 mptcp_key_t localkey, remotekey;
4300 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4301
4302 if (digest_len != SHA1_RESULTLEN)
4303 return;
4304
4305 lrand = rrand = 0;
4306 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4307 MPT_LOCK_SPIN(mp_tp);
4308 localkey = *mp_tp->mpt_localkey;
4309 remotekey = mp_tp->mpt_remotekey;
4310 MPT_UNLOCK(mp_tp);
4311 mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
4312 digest_len);
4313}
4314
4315u_int64_t
4316mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
4317{
4318 u_char digest[SHA1_RESULTLEN];
4319 u_int64_t trunced_digest;
4320
4321 mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
4322 bcopy(digest, &trunced_digest, 8);
4323 return (trunced_digest);
4324}
4325
4326/*
4327 * Authentication data generation
4328 */
4329int
4330mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4331 int token_len)
4332{
4333 VERIFY(token_len == sizeof (u_int32_t));
4334 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4335
4336 /* Most significant 32 bits of the SHA1 hash */
4337 bcopy(sha_digest, token, sizeof (u_int32_t));
4338 return (TRUE);
4339}
4340
4341int
4342mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4343 int idsn_len)
4344{
4345 VERIFY(idsn_len == sizeof (u_int64_t));
4346 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4347
4348 /*
4349 * Least significant 64 bits of the SHA1 hash
4350 */
4351
4352 idsn[7] = sha_digest[12];
4353 idsn[6] = sha_digest[13];
4354 idsn[5] = sha_digest[14];
4355 idsn[4] = sha_digest[15];
4356 idsn[3] = sha_digest[16];
4357 idsn[2] = sha_digest[17];
4358 idsn[1] = sha_digest[18];
4359 idsn[0] = sha_digest[19];
4360 return (TRUE);
4361}
4362
4363static int
4364mptcp_init_authparms(struct mptcb *mp_tp)
4365{
4366 caddr_t local_digest = NULL;
4367 char remote_digest[MPTCP_SHA1_RESULTLEN];
4368 MPT_LOCK_ASSERT_HELD(mp_tp);
4369
4370 /* Only Version 0 is supported for auth purposes */
3e170ce0 4371 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
39236c6e
A
4372 return (-1);
4373
4374 /* Setup local and remote tokens and Initial DSNs */
4375 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
4376 mptcp_generate_token(local_digest, SHA1_RESULTLEN,
4377 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4378 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
4379 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4380
4381 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
4382 SHA1_RESULTLEN)) {
3e170ce0
A
4383 mptcplog((LOG_ERR, "MPTCP Socket: %s: unexpected failure",
4384 __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4385 return (-1);
4386 }
4387 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
4388 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_localtoken));
4389 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4390 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
4391 return (0);
4392}
4393
4394static void
4395mptcp_init_statevars(struct mptcb *mp_tp)
4396{
4397 MPT_LOCK_ASSERT_HELD(mp_tp);
4398
4399 /* The subflow SYN is also first MPTCP byte */
4400 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4401 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4402
4403 mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
4404}
4405
4406static void
4407mptcp_conn_properties(struct mptcb *mp_tp)
4408{
4409 /* There is only Version 0 at this time */
3e170ce0 4410 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
39236c6e
A
4411
4412 /* Set DSS checksum flag */
4413 if (mptcp_dss_csum)
4414 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4415
4416 /* Set up receive window */
4417 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4418
4419 /* Set up gc ticks */
4420 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4421}
4422
4423/*
4424 * Helper Functions
4425 */
4426mptcp_token_t
4427mptcp_get_localtoken(void* mptcb_arg)
4428{
4429 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4430 return (mp_tp->mpt_localtoken);
4431}
4432
4433mptcp_token_t
4434mptcp_get_remotetoken(void* mptcb_arg)
4435{
4436 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4437 return (mp_tp->mpt_remotetoken);
4438}
4439
4440u_int64_t
4441mptcp_get_localkey(void* mptcb_arg)
4442{
4443 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4444 if (mp_tp->mpt_localkey != NULL)
4445 return (*mp_tp->mpt_localkey);
4446 else
4447 return (0);
4448}
4449
4450u_int64_t
4451mptcp_get_remotekey(void* mptcb_arg)
4452{
4453 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4454 return (mp_tp->mpt_remotekey);
4455}
4456
4457void
4458mptcp_send_dfin(struct socket *so)
4459{
4460 struct tcpcb *tp = NULL;
4461 struct inpcb *inp = NULL;
4462
4463 inp = sotoinpcb(so);
4464 if (!inp)
4465 return;
4466
4467 tp = intotcpcb(inp);
4468 if (!tp)
4469 return;
4470
4471 if (!(tp->t_mpflags & TMPF_RESET))
4472 tp->t_mpflags |= TMPF_SEND_DFIN;
4473}
4474
4475/*
4476 * Data Sequence Mapping routines
4477 */
4478void
4479mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4480{
4481 struct mptcb *mp_tp;
4482
4483 if (m == NULL)
4484 return;
4485
3e170ce0 4486 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
39236c6e
A
4487 MPT_LOCK(mp_tp);
4488 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4489 MPT_UNLOCK(mp_tp);
4490 panic("%s: data write before establishment.",
4491 __func__);
4492 return;
4493 }
4494
4495 while (m) {
4496 VERIFY(m->m_flags & M_PKTHDR);
4497 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4498 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4499 m->m_pkthdr.mp_rlen = m_pktlen(m);
4500 mp_tp->mpt_sndmax += m_pktlen(m);
4501 m = m->m_next;
4502 }
4503 MPT_UNLOCK(mp_tp);
4504}
4505
4506void
4507mptcp_preproc_sbdrop(struct mbuf *m, unsigned int len)
4508{
4509 u_int32_t sub_len = 0;
4510
4511 while (m) {
4512 VERIFY(m->m_flags & M_PKTHDR);
4513
4514 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4515 sub_len = m->m_pkthdr.mp_rlen;
4516
4517 if (sub_len < len) {
4518 m->m_pkthdr.mp_dsn += sub_len;
4519 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4520 m->m_pkthdr.mp_rseq += sub_len;
4521 }
4522 m->m_pkthdr.mp_rlen = 0;
4523 len -= sub_len;
4524 } else {
4525 /* sub_len >= len */
4526 m->m_pkthdr.mp_dsn += len;
4527 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4528 m->m_pkthdr.mp_rseq += len;
4529 }
3e170ce0
A
4530 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4531 "%s: dsn 0x%llu ssn %u len %d %d\n",
4532 __func__,
39236c6e 4533 m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
3e170ce0
A
4534 m->m_pkthdr.mp_rlen, len),
4535 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4536 m->m_pkthdr.mp_rlen -= len;
4537 return;
4538 }
4539 } else {
4540 panic("%s: MPTCP tag not set", __func__);
4541 /* NOTREACHED */
4542 }
4543 m = m->m_next;
4544 }
4545}
4546
4547/* Obtain the DSN mapping stored in the mbuf */
4548void
4549mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4550 u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4551{
4552 u_int64_t dsn64;
4553
4554 mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4555 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4556 *dsn64p = dsn64;
4557}
4558
4559void
4560mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
4561 u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
4562{
4563 struct mbuf *m = so->so_snd.sb_mb;
4564 struct mbuf *mnext = NULL;
4565 uint32_t runlen = 0;
4566 u_int64_t dsn64;
4567 uint32_t contig_len = 0;
4568
4569 if (m == NULL)
4570 return;
4571
4572 if (off < 0)
4573 return;
4574 /*
4575 * In the subflow socket, the DSN sequencing can be discontiguous,
4576 * but the subflow sequence mapping is contiguous. Use the subflow
4577 * sequence property to find the right mbuf and corresponding dsn
4578 * mapping.
4579 */
4580
4581 while (m) {
4582 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4583 VERIFY(m->m_flags & M_PKTHDR);
4584
4585 if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
4586 off -= m->m_pkthdr.mp_rlen;
4587 m = m->m_next;
4588 } else {
4589 break;
4590 }
4591 }
4592
4593 if (m == NULL) {
4594 panic("%s: bad offset", __func__);
4595 /* NOTREACHED */
4596 }
4597
4598 dsn64 = m->m_pkthdr.mp_dsn + off;
4599 *dsn = dsn64;
4600 *relseq = m->m_pkthdr.mp_rseq + off;
4601
4602 /*
4603 * Now find the last contiguous byte and its length from
4604 * start.
4605 */
4606 runlen = m->m_pkthdr.mp_rlen - off;
4607 contig_len = runlen;
4608
4609 /* If datalen does not span multiple mbufs, return */
4610 if (datalen <= runlen) {
4611 *data_len = min(datalen, UINT16_MAX);
4612 return;
4613 }
4614
4615 mnext = m->m_next;
4616 while (datalen > runlen) {
4617 if (mnext == NULL) {
4618 panic("%s: bad datalen = %d, %d %d", __func__, datalen,
4619 runlen, off);
4620 /* NOTREACHED */
4621 }
4622 VERIFY(mnext->m_flags & M_PKTHDR);
4623 VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
4624
4625 /*
4626 * case A. contiguous DSN stream
4627 * case B. discontiguous DSN stream
4628 */
4629 if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
4630 /* case A */
4631 runlen += mnext->m_pkthdr.mp_rlen;
4632 contig_len += mnext->m_pkthdr.mp_rlen;
3e170ce0
A
4633 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: contig \n",
4634 __func__), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4635 } else {
4636 /* case B */
3e170ce0 4637 mptcplog((LOG_DEBUG, "MPTCP Sender: "
fe8ab488 4638 "%s: discontig datalen %d contig_len %d cc %d \n",
3e170ce0
A
4639 __func__, datalen, contig_len, so->so_snd.sb_cc),
4640 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4641 break;
4642 }
4643 mnext = mnext->m_next;
4644 }
4645 datalen = min(datalen, UINT16_MAX);
4646 *data_len = min(datalen, contig_len);
3e170ce0
A
4647 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4648 "%s: %llu %u %d %d \n", __func__,
4649 *dsn, *relseq, *data_len, off),
4650 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4651}
4652
4653/*
4654 * MPTCP's notion of the next insequence Data Sequence number is adjusted
4655 * here. It must be called from mptcp_adj_rmap() which is called only after
4656 * reassembly of out of order data. The rcvnxt variable must
4657 * be updated only when atleast some insequence new data is received.
4658 */
4659static void
4660mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4661{
4662 struct mptcb *mp_tp = tptomptp(tp);
4663
4664 if (mp_tp == NULL)
4665 return;
4666 MPT_LOCK(mp_tp);
4667 if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4668 (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4669 m->m_pkthdr.mp_rlen)))) {
4670 mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4671 }
4672 MPT_UNLOCK(mp_tp);
4673}
4674
4675/*
3e170ce0
A
4676 * Note that this is called only from tcp_input() via mptcp_input_preproc()
4677 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4678 * When it trims data tcp_input calls m_adj() which does not remove the
4679 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4680 * The dsn map insertion cannot be delayed after trim, because data can be in
4681 * the reassembly queue for a while and the DSN option info in tp will be
4682 * overwritten for every new packet received.
39236c6e
A
4683 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4684 * with mptcp_adj_rmap()
4685 */
4686void
4687mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4688{
4689 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4690
4691 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4692 VERIFY(m->m_flags & M_PKTHDR);
4693 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4694 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4695 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4696 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4697 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4698 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4699 }
4700}
4701
fe8ab488 4702int
39236c6e
A
4703mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4704{
4705 u_int64_t dsn;
4706 u_int32_t sseq, datalen;
4707 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4708 u_int32_t old_rcvnxt = 0;
4709
4710 if (m_pktlen(m) == 0)
fe8ab488 4711 return 0;
39236c6e
A
4712
4713 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4714 VERIFY(m->m_flags & M_PKTHDR);
4715
4716 dsn = m->m_pkthdr.mp_dsn;
4717 sseq = m->m_pkthdr.mp_rseq + tp->irs;
4718 datalen = m->m_pkthdr.mp_rlen;
4719 } else {
4720 /* data arrived without an DSS option mapping */
fe8ab488
A
4721
4722 /* initial subflow can fallback right after SYN handshake */
39236c6e 4723 mptcp_notify_mpfail(so);
fe8ab488 4724 return 0;
39236c6e
A
4725 }
4726
4727 /* In the common case, data is in window and in sequence */
4728 if (m->m_pkthdr.len == (int)datalen) {
4729 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4730 return 0;
39236c6e
A
4731 }
4732
39236c6e
A
4733 old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4734 if (SEQ_GT(old_rcvnxt, sseq)) {
4735 /* data trimmed from the left */
4736 int off = old_rcvnxt - sseq;
4737 m->m_pkthdr.mp_dsn += off;
4738 m->m_pkthdr.mp_rseq += off;
fe8ab488 4739 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
39236c6e
A
4740 } else if (old_rcvnxt == sseq) {
4741 /*
3e170ce0 4742 * data was trimmed from the right
39236c6e
A
4743 */
4744 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4745 } else {
fe8ab488 4746 mptcp_notify_mpfail(so);
3e170ce0 4747 return (-1);
39236c6e
A
4748 }
4749 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4750 return 0;
39236c6e
A
4751}
4752
4753/*
4754 * Following routines help with failure detection and failover of data
4755 * transfer from one subflow to another.
4756 */
4757void
4758mptcp_act_on_txfail(struct socket *so)
4759{
4760 struct tcpcb *tp = NULL;
4761 struct inpcb *inp = sotoinpcb(so);
4762
4763 if (inp == NULL)
4764 return;
4765
4766 tp = intotcpcb(inp);
4767 if (tp == NULL)
4768 return;
4769
39236c6e
A
4770 if (so->so_flags & SOF_MP_TRYFAILOVER) {
4771 return;
4772 }
4773
4774 so->so_flags |= SOF_MP_TRYFAILOVER;
4775 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4776}
4777
4778/*
4779 * Support for MP_FAIL option
4780 */
4781int
4782mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4783{
4784 struct mbuf *m = so->so_snd.sb_mb;
4785 u_int64_t dsn;
4786 int off = 0;
4787 u_int32_t datalen;
4788
4789 if (m == NULL)
4790 return (-1);
4791
4792 while (m != NULL) {
4793 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4794 VERIFY(m->m_flags & M_PKTHDR);
4795 dsn = m->m_pkthdr.mp_dsn;
4796 datalen = m->m_pkthdr.mp_rlen;
4797 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4798 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4799 off = dsn_fail - dsn;
4800 *tcp_seq = m->m_pkthdr.mp_rseq + off;
3e170ce0
A
4801 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: %llu %llu \n",
4802 __func__, dsn, dsn_fail),
4803 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4804 return (0);
4805 }
4806
4807 m = m->m_next;
4808 }
4809
4810 /*
4811 * If there was no mbuf data and a fallback to TCP occurred, there's
4812 * not much else to do.
4813 */
4814
3e170ce0
A
4815 mptcplog((LOG_ERR, "MPTCP Sender: "
4816 "%s: %llu not found \n", __func__, dsn_fail),
4817 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4818 return (-1);
4819}
4820
4821/*
4822 * Support for sending contiguous MPTCP bytes in subflow
fe8ab488 4823 * Also for preventing sending data with ACK in 3-way handshake
39236c6e
A
4824 */
4825int32_t
4826mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4827{
4828 u_int64_t mdss_dsn = 0;
4829 u_int32_t mdss_subflow_seq = 0;
4830 u_int16_t mdss_data_len = 0;
4831
4832 if (len == 0)
4833 return (len);
4834
4835 mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4836 &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4837
fe8ab488
A
4838 /*
4839 * Special case handling for Fast Join. We want to send data right
4840 * after ACK of the 3-way handshake, but not piggyback the data
4841 * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
4842 * mdss_data_len control this.
4843 */
4844 struct tcpcb *tp = NULL;
4845 tp = intotcpcb(sotoinpcb(so));
4846 if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
4847 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
4848 (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
4849 (tp->t_mpflags & TMPF_SENT_JOIN) &&
4850 (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
4851 (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
4852 mdss_data_len = 0;
4853 tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
4854 }
39236c6e
A
4855 return (mdss_data_len);
4856}
4857
4858int32_t
4859mptcp_sbspace(struct mptcb *mpt)
4860{
4861 struct sockbuf *sb;
4862 uint32_t rcvbuf;
4863 int32_t space;
4864
4865 MPT_LOCK_ASSERT_HELD(mpt);
4866 MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4867
4868 sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4869 rcvbuf = sb->sb_hiwat;
4870 space = ((int32_t)imin((rcvbuf - sb->sb_cc),
4871 (sb->sb_mbmax - sb->sb_mbcnt)));
4872 if (space < 0)
4873 space = 0;
4874 /* XXX check if it's too small? */
4875
4876 return (space);
4877}
4878
4879/*
4880 * Support Fallback to Regular TCP
4881 */
4882void
4883mptcp_notify_mpready(struct socket *so)
4884{
4885 struct tcpcb *tp = NULL;
4886
4887 if (so == NULL)
4888 return;
4889
4890 tp = intotcpcb(sotoinpcb(so));
4891
4892 if (tp == NULL)
4893 return;
4894
4895 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
4896 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4897 struct tcpcb *, tp);
4898
4899 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
4900 return;
4901
4902 if (tp->t_mpflags & TMPF_MPTCP_READY)
4903 return;
4904
4905 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
4906 tp->t_mpflags |= TMPF_MPTCP_READY;
4907
4908 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4909}
4910
4911void
4912mptcp_notify_mpfail(struct socket *so)
4913{
4914 struct tcpcb *tp = NULL;
4915
4916 if (so == NULL)
4917 return;
4918
4919 tp = intotcpcb(sotoinpcb(so));
4920
4921 if (tp == NULL)
4922 return;
4923
4924 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
4925 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4926 struct tcpcb *, tp);
4927
4928 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
4929 return;
4930
4931 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4932 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4933
4934 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4935}
4936
4937/*
4938 * Keepalive helper function
4939 */
4940boolean_t
4941mptcp_ok_to_keepalive(struct mptcb *mp_tp)
4942{
4943 boolean_t ret = 1;
4944 VERIFY(mp_tp != NULL);
4945 MPT_LOCK(mp_tp);
4946 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
4947 ret = 0;
4948 }
4949 MPT_UNLOCK(mp_tp);
4950 return (ret);
4951}
4952
4953/*
4954 * MPTCP t_maxseg adjustment function
4955 */
4956int
4957mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
4958{
4959 int mss_lower = 0;
4960 struct mptcb *mp_tp = tptomptp(tp);
4961
4962#define MPTCP_COMPUTE_LEN { \
4963 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
4964 MPT_LOCK(mp_tp); \
4965 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
4966 mss_lower += 2; \
4967 else \
4968 /* adjust to 32-bit boundary + EOL */ \
4969 mss_lower += 2; \
4970 MPT_UNLOCK(mp_tp); \
4971}
4972 if (mp_tp == NULL)
4973 return (0);
4974
4975 /*
4976 * For the first subflow and subsequent subflows, adjust mss for
4977 * most common MPTCP option size, for case where tcp_mss is called
4978 * during option processing and MTU discovery.
4979 */
4980 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
4981 (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
4982 MPTCP_COMPUTE_LEN;
4983 }
4984
4985 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
4986 (tp->t_mpflags & TMPF_SENT_JOIN)) {
4987 MPTCP_COMPUTE_LEN;
4988 }
4989
4990 if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
4991 MPTCP_COMPUTE_LEN;
4992 }
4993
4994 return (mss_lower);
4995}
4996
4997/*
4998 * Update the pid, upid, uuid of the subflow so, based on parent so
4999 */
5000void
5001mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
5002{
5003 struct socket *subflow_so = mpts->mpts_socket;
5004
5005 MPTS_LOCK_ASSERT_HELD(mpts);
5006
5007 socket_lock(subflow_so, 0);
5008 if ((subflow_so->last_pid != parent_mpso->last_pid) ||
5009 (subflow_so->last_upid != parent_mpso->last_upid)) {
5010 subflow_so->last_upid = parent_mpso->last_upid;
5011 subflow_so->last_pid = parent_mpso->last_pid;
5012 uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
5013 }
5014 so_update_policy(subflow_so);
5015 socket_unlock(subflow_so, 0);
5016}
5017
5018static void
5019fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5020{
5021 struct inpcb *inp;
5022
5023 tcp_getconninfo(so, &flow->flow_ci);
5024 inp = sotoinpcb(so);
5025#if INET6
5026 if ((inp->inp_vflag & INP_IPV6) != 0) {
5027 flow->flow_src.ss_family = AF_INET6;
5028 flow->flow_dst.ss_family = AF_INET6;
5029 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5030 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5031 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5032 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5033 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5034 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5035 } else
5036#endif
3e170ce0 5037 if ((inp->inp_vflag & INP_IPV4) != 0) {
39236c6e
A
5038 flow->flow_src.ss_family = AF_INET;
5039 flow->flow_dst.ss_family = AF_INET;
5040 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5041 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5042 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5043 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5044 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5045 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5046 }
3e170ce0
A
5047 flow->flow_len = sizeof(*flow);
5048 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
39236c6e
A
5049 flow->flow_flags = mpts->mpts_flags;
5050 flow->flow_cid = mpts->mpts_connid;
3e170ce0
A
5051 flow->flow_sndnxt = mpts->mpts_sndnxt;
5052 flow->flow_relseq = mpts->mpts_rel_seq;
5053 flow->flow_soerror = mpts->mpts_soerror;
5054 flow->flow_probecnt = mpts->mpts_probecnt;
5055 flow->flow_peerswitch = mpts->mpts_peerswitch;
39236c6e
A
5056}
5057
5058static int
5059mptcp_pcblist SYSCTL_HANDLER_ARGS
5060{
5061#pragma unused(oidp, arg1, arg2)
5062 int error = 0, f;
5063 size_t n, len;
5064 struct mppcb *mpp;
5065 struct mptses *mpte;
5066 struct mptcb *mp_tp;
5067 struct mptsub *mpts;
5068 struct socket *so;
5069 conninfo_mptcp_t mptcpci;
fe8ab488 5070 mptcp_flow_t *flows = NULL;
39236c6e
A
5071
5072 if (req->newptr != USER_ADDR_NULL)
5073 return (EPERM);
5074
5075 lck_mtx_lock(&mtcbinfo.mppi_lock);
5076 n = mtcbinfo.mppi_count;
5077 if (req->oldptr == USER_ADDR_NULL) {
5078 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5079 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
5080 4 * (n + n/8) * sizeof(mptcp_flow_t);
5081 return (0);
5082 }
5083 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 5084 flows = NULL;
39236c6e
A
5085 lck_mtx_lock(&mpp->mpp_lock);
5086 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3e170ce0
A
5087 if (mpp->mpp_flags & MPP_DEFUNCT) {
5088 lck_mtx_unlock(&mpp->mpp_lock);
5089 continue;
5090 }
39236c6e
A
5091 mpte = mptompte(mpp);
5092 VERIFY(mpte != NULL);
5093 mp_tp = mpte->mpte_mptcb;
5094 VERIFY(mp_tp != NULL);
3e170ce0
A
5095
5096 bzero(&mptcpci, sizeof(mptcpci));
5097 MPT_LOCK(mp_tp);
39236c6e 5098 mptcpci.mptcpci_state = mp_tp->mpt_state;
3e170ce0
A
5099 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5100 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5101 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5102 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5103 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5104 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5105 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5106 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5107 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5108 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5109 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvatmark;
5110 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5111 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5112 MPT_UNLOCK(mp_tp);
5113
39236c6e 5114 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
3e170ce0
A
5115 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5116 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5117 mptcpci.mptcpci_flow_offset =
5118 offsetof(conninfo_mptcp_t, mptcpci_flows);
5119
fe8ab488
A
5120 len = sizeof(*flows) * mpte->mpte_numflows;
5121 if (mpte->mpte_numflows != 0) {
5122 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5123 if (flows == NULL) {
5124 lck_mtx_unlock(&mpp->mpp_lock);
5125 break;
5126 }
5127 mptcpci.mptcpci_len = sizeof(mptcpci) +
5128 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5129 error = SYSCTL_OUT(req, &mptcpci,
5130 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5131 } else {
5132 mptcpci.mptcpci_len = sizeof(mptcpci);
3e170ce0 5133 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
fe8ab488 5134 }
39236c6e
A
5135 if (error) {
5136 lck_mtx_unlock(&mpp->mpp_lock);
5137 FREE(flows, M_TEMP);
5138 break;
5139 }
5140 f = 0;
5141 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5142 MPTS_LOCK(mpts);
5143 so = mpts->mpts_socket;
5144 socket_lock(so, 0);
5145 fill_mptcp_subflow(so, &flows[f], mpts);
5146 socket_unlock(so, 0);
5147 MPTS_UNLOCK(mpts);
5148 f++;
5149 }
5150 lck_mtx_unlock(&mpp->mpp_lock);
fe8ab488
A
5151 if (flows) {
5152 error = SYSCTL_OUT(req, flows, len);
5153 FREE(flows, M_TEMP);
5154 if (error)
5155 break;
5156 }
39236c6e
A
5157 }
5158 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5159
5160 return (error);
5161}
5162
5163SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
5164 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
5165 "List of active MPTCP connections");
fe8ab488
A
5166
5167/*
5168 * Check the health of the other subflows and do an mptcp_output if
5169 * there is no other active or functional subflow at the time of
5170 * call of this function.
5171 */
5172static void
5173mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
5174{
5175 struct mptsub *from_mpts = NULL;
5176
5177 MPTE_LOCK_ASSERT_HELD(mpte);
5178
5179 MPTS_UNLOCK(to_mpts);
5180
5181 from_mpts = mpte->mpte_active_sub;
5182
5183 if (from_mpts == NULL)
5184 goto output_needed;
5185
5186 MPTS_LOCK(from_mpts);
5187
5188 if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
5189 (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
5190 MPTS_UNLOCK(from_mpts);
5191 goto output_needed;
5192 }
5193
5194 MPTS_UNLOCK(from_mpts);
5195 MPTS_LOCK(to_mpts);
5196 return;
5197
5198output_needed:
5199 mptcp_output(mpte);
5200 MPTS_LOCK(to_mpts);
5201}
5202
fe8ab488
A
5203/*
5204 * Set notsent lowat mark on the MPTCB
5205 */
5206int
5207mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5208{
5209 struct mptcb *mp_tp = NULL;
5210 int error = 0;
5211
5212 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5213 mp_tp = mpte->mpte_mptcb;
5214
5215 if (mp_tp)
5216 mp_tp->mpt_notsent_lowat = optval;
5217 else
5218 error = EINVAL;
5219
5220 return error;
5221}
5222
5223u_int32_t
5224mptcp_get_notsent_lowat(struct mptses *mpte)
5225{
5226 struct mptcb *mp_tp = NULL;
5227
5228 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5229 mp_tp = mpte->mpte_mptcb;
5230
5231 if (mp_tp)
5232 return mp_tp->mpt_notsent_lowat;
5233 else
5234 return 0;
5235}
5236
5237int
5238mptcp_notsent_lowat_check(struct socket *so) {
5239 struct mptses *mpte;
5240 struct mppcb *mpp;
5241 struct mptcb *mp_tp;
5242 struct mptsub *mpts;
5243
5244 int notsent = 0;
5245
5246 mpp = sotomppcb(so);
5247 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5248 return (0);
5249 }
5250
5251 mpte = mptompte(mpp);
5252 mp_tp = mpte->mpte_mptcb;
5253
5254 MPT_LOCK(mp_tp);
5255 notsent = so->so_snd.sb_cc;
5256
5257 if ((notsent == 0) ||
5258 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5259 mp_tp->mpt_notsent_lowat)) {
3e170ce0
A
5260 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5261 "lowat %d notsent %d actual %d \n",
5262 mp_tp->mpt_notsent_lowat, notsent,
5263 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5264 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
fe8ab488
A
5265 MPT_UNLOCK(mp_tp);
5266 return (1);
5267 }
5268 MPT_UNLOCK(mp_tp);
5269
5270 /* When Nagle's algorithm is not disabled, it is better
5271 * to wakeup the client even before there is atleast one
5272 * maxseg of data to write.
5273 */
5274 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5275 int retval = 0;
5276 MPTS_LOCK(mpts);
5277 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5278 struct socket *subf_so = mpts->mpts_socket;
5279 socket_lock(subf_so, 0);
5280 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
5281
5282 notsent = so->so_snd.sb_cc -
5283 (tp->snd_nxt - tp->snd_una);
5284
5285 if ((tp->t_flags & TF_NODELAY) == 0 &&
5286 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5287 retval = 1;
5288 }
3e170ce0 5289 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
fe8ab488 5290 " nodelay false \n",
3e170ce0
A
5291 mp_tp->mpt_notsent_lowat, notsent),
5292 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
fe8ab488
A
5293 socket_unlock(subf_so, 0);
5294 MPTS_UNLOCK(mpts);
5295 return (retval);
5296 }
5297 MPTS_UNLOCK(mpts);
5298 }
5299 return (0);
5300}
5301
3e170ce0
A
5302static void
5303mptcp_get_rtt_measurement(struct mptsub *mpts, struct mptses *mpte)
5304{
5305 MPTE_LOCK_ASSERT_HELD(mpte);
5306 MPTS_LOCK_ASSERT_HELD(mpts);
5307
5308 struct socket *subflow_so = mpts->mpts_socket;
5309 socket_lock(subflow_so, 0);
5310 mpts->mpts_srtt = (intotcpcb(sotoinpcb(subflow_so)))->t_srtt;
5311 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(subflow_so)))->t_rxtcur;
5312 socket_unlock(subflow_so, 0);
5313}
5314
5315/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5316static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5317static uint32_t mptcp_kern_skt_inuse = 0;
5318symptoms_advisory_t mptcp_advisory;
5319
5320static errno_t
5321mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5322 void **unitinfo)
5323{
5324#pragma unused(kctlref, sac, unitinfo)
5325 /*
5326 * We don't need to do anything here. But we can atleast ensure
5327 * only one user opens the MPTCP_KERN_CTL_NAME control socket.
5328 */
5329 if (OSCompareAndSwap(0, 1, &mptcp_kern_skt_inuse))
5330 return (0);
5331 else
5332 return (EALREADY);
5333}
5334
5335static errno_t
5336mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5337 void *unitinfo)
5338{
5339#pragma unused(kctlref, kcunit, unitinfo)
5340 if (OSCompareAndSwap(1, 0, &mptcp_kern_skt_inuse)) {
5341 /* TBD needs to be locked if the size grows more than an int */
5342 bzero(&mptcp_advisory, sizeof(mptcp_advisory));
5343 return (0);
5344 }
5345 else {
5346 return (EINVAL);
5347 }
5348}
5349
5350static errno_t
5351mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5352 mbuf_t m, int flags)
5353{
5354#pragma unused(kctlref, kcunit, unitinfo, flags)
5355 symptoms_advisory_t *sa = NULL;
5356
5357 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5358 mbuf_freem(m);
5359 return (EINVAL);
5360 }
5361
5362 if (mbuf_len(m) >= sizeof(*sa))
5363 sa = mbuf_data(m);
5364 else
5365 return (EINVAL);
5366
5367 if (mptcp_advisory.sa_nwk_status_int != sa->sa_nwk_status_int) {
5368 /*
5369 * we could use this notification to notify all mptcp pcbs
5370 * of the change in network status. But its difficult to
5371 * define if sending REMOVE_ADDR or MP_PRIO is appropriate
5372 * given that these are only soft indicators of the network
5373 * state. Leaving this as TBD for now.
5374 */
5375 }
5376
5377 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT) {
5378 mptcplog((LOG_DEBUG, "MPTCP Events: %s wifi %d,%d cell %d,%d\n",
5379 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
5380 sa->sa_cell_status, mptcp_advisory.sa_cell_status),
5381 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG,
5382 MPTCP_LOGLVL_LOG);
5383
5384 if ((sa->sa_wifi_status &
5385 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5386 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
5387 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5388 }
5389
5390 if ((sa->sa_cell_status &
5391 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) !=
5392 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) {
5393 mptcp_advisory.sa_cell_status = sa->sa_cell_status;
5394 }
5395 } else {
5396 mptcplog((LOG_DEBUG, "MPTCP Events: %s NOCOMMENT "
5397 "wifi %d cell %d\n", __func__,
5398 mptcp_advisory.sa_wifi_status,
5399 mptcp_advisory.sa_cell_status),
5400 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
5401 }
5402 return (0);
5403}
5404
5405void
5406mptcp_control_register(void)
5407{
5408 /* Set up the advisory control socket */
5409 struct kern_ctl_reg mptcp_kern_ctl;
5410
5411 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5412 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5413 sizeof(mptcp_kern_ctl.ctl_name));
5414 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5415 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5416 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5417 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5418
5419 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5420}
5421
5422int
5423mptcp_is_wifi_unusable(void)
5424{
5425 /* a false return val indicates there is no info or wifi is ok */
5426 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5427}
5428
5429int
5430mptcp_is_cell_unusable(void)
5431{
5432 /* a false return val indicates there is no info or cell is ok */
5433 return (mptcp_advisory.sa_cell_status & SYMPTOMS_ADVISORY_CELL_BAD);
5434}
5435
5436struct mptsub*
5437mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best)
5438{
5439 struct mptsub *cellsub = NULL;
5440 struct mptsub *wifisub = NULL;
5441 struct mptsub *wiredsub = NULL;
5442
5443 VERIFY ((best != NULL) && (second_best != NULL));
5444
5445 if (!mptcp_use_symptomsd)
5446 return (NULL);
5447
5448 if (!mptcp_kern_skt_inuse)
5449 return (NULL);
5450
5451 /*
5452 * There could be devices with more than one wifi interface or
5453 * more than one wired or cell interfaces.
5454 * TBD: SymptomsD is unavailable on such platforms as of now.
5455 * Try to prefer best when possible in general.
5456 * Also, SymptomsD sends notifications about wifi only when it
5457 * is primary.
5458 */
5459 if (best->mpts_linktype & MPTSL_WIFI)
5460 wifisub = best;
5461 else if (best->mpts_linktype & MPTSL_CELL)
5462 cellsub = best;
5463 else if (best->mpts_linktype & MPTSL_WIRED)
5464 wiredsub = best;
5465
5466 /*
5467 * On platforms with wired paths, don't use hints about wifi or cell.
5468 * Currently, SymptomsD is not available on platforms with wired paths.
5469 */
5470 if (wiredsub)
5471 return (NULL);
5472
5473 if ((wifisub == NULL) && (second_best->mpts_linktype & MPTSL_WIFI))
5474 wifisub = second_best;
5475
5476 if ((cellsub == NULL) && (second_best->mpts_linktype & MPTSL_CELL))
5477 cellsub = second_best;
5478
5479 if ((wiredsub == NULL) && (second_best->mpts_linktype & MPTSL_WIRED))
5480 wiredsub = second_best;
5481
5482 if ((wifisub == best) && mptcp_is_wifi_unusable()) {
5483 tcpstat.tcps_mp_sel_symtomsd++;
5484 if (mptcp_is_cell_unusable()) {
5485 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5486 " suggests both Wifi and Cell are bad. Wired %s.",
5487 (wiredsub == NULL) ? "none" : "present"),
5488 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5489 return (wiredsub);
5490 } else {
5491 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5492 " suggests Wifi bad, Cell good. Wired %s.",
5493 (wiredsub == NULL) ? "none" : "present"),
5494 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5495 return ((wiredsub != NULL) ? wiredsub : cellsub);
5496 }
5497 }
5498
5499 if ((cellsub == best) && (mptcp_is_cell_unusable())) {
5500 tcpstat.tcps_mp_sel_symtomsd++;
5501 if (mptcp_is_wifi_unusable()) {
5502 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5503 " suggests both Cell and Wifi are bad. Wired %s.",
5504 (wiredsub == NULL) ? "none" : "present"),
5505 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5506 return (wiredsub);
5507 } else {
5508 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5509 " suggests Cell bad, Wifi good. Wired %s.",
5510 (wiredsub == NULL) ? "none" : "present"),
5511 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5512 return ((wiredsub != NULL) ? wiredsub : wifisub);
5513 }
5514 }
5515
5516 /* little is known about the state of the network or wifi is good */
5517 return (NULL);
5518}