apple/xnu.git (xnu-3789.70.16) - bsd/netinet/mptcp_subr.c
39236c6e 1/*
490019cf 2 * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
3e170ce0 50#include <net/if_var.h>
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/in_var.h>
54#include <netinet/tcp.h>
55#include <netinet/tcp_fsm.h>
56#include <netinet/tcp_seq.h>
57#include <netinet/tcp_var.h>
58#include <netinet/mptcp_var.h>
59#include <netinet/mptcp.h>
60#include <netinet/mptcp_seq.h>
61#include <netinet/mptcp_timer.h>
62#include <libkern/crypto/sha1.h>
63#if INET6
64#include <netinet6/in6_pcb.h>
65#include <netinet6/ip6protosw.h>
66#endif /* INET6 */
67#include <dev/random/randomdev.h>
68
69extern char *proc_best_name(proc_t);
70
71/*
72 * Notes on MPTCP implementation.
73 *
74 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
75 * communication domain. The structure mtcbinfo describes the MPTCP instance
76 * of a Multipath protocol in that domain. It is used to keep track of all
77 * MPTCP PCB instances in the system, and is protected by the global lock
78 * mppi_lock.
79 *
80 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
81 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
82 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
83 * allocated from the same memory block, and each structure has a pointer
84 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
85 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
86 * PCB (mppcb) as well as the MPTCP Session (mptses).
87 *
 88 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
 89 * in particular, it holds the list of subflows as well as the MPTCP thread.
90 *
91 * A functioning MPTCP Session consists of one or more subflow sockets. Each
92 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
93 * represented by the mptsub structure. Because each subflow requires access
94 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
95 * subflow. This gets decremented prior to the subflow's destruction. The
96 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
97 *
98 * To handle events (read, write, control) from the subflows, an MPTCP thread
99 * is created; currently, there is one thread per MPTCP Session. In order to
100 * prevent the MPTCP socket from being destroyed while being accessed by the
101 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
102 * which will be decremented prior to the thread's termination. The thread
103 * lock (mpte_thread_lock) is used to synchronize its signalling.
104 *
105 * Lock ordering is defined as follows:
106 *
107 * mtcbinfo (mppi_lock)
108 * mp_so (mpp_lock)
109 * mpts (mpts_lock)
110 * so (inpcb_mtx)
111 * mptcb (mpt_lock)
112 *
 113 * It is not a requirement that all of the above locks be acquired in
 114 * succession, but the correct lock ordering must be followed when more
 115 * than one of them needs to be held. The MPTCP thread lock is not
 116 * constrained by this arrangement, because none of the other locks is
 117 * ever acquired while holding mpte_thread_lock; therefore it may be
 118 * acquired at any moment to signal the thread.
119 *
120 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
121 * work is done by the MPTCP garbage collector which is invoked on demand by
122 * the PF_MULTIPATH garbage collector. This process will take place once all
 123 * of the subflows have been destroyed, and the MPTCP thread has been
 124 * instructed to self-terminate.
125 */
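
/*
 * Illustrative userland sketch (not part of this file; variable names are
 * hypothetical): per the notes above, an MPTCP socket is obtained through
 * socket(2) in the PF_MULTIPATH domain, and subflows are then established
 * via connectx(2).  Endpoint setup and error handling are omitted here.
 *
 *	int mp_fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	if (mp_fd == -1)
 *		return (errno);
 */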
126
127static void mptcp_sesdestroy(struct mptses *);
128static void mptcp_thread_signal_locked(struct mptses *);
129static void mptcp_thread_terminate_signal(struct mptses *);
130static void mptcp_thread_dowork(struct mptses *);
131static void mptcp_thread_func(void *, wait_result_t);
132static void mptcp_thread_destroy(struct mptses *);
133static void mptcp_key_pool_init(void);
fe8ab488 134static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
39236c6e 135static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
136
137static uint32_t mptcp_gc(struct mppcbinfo *);
138static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
139static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
140static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
141 struct uio *, struct mbuf **, struct mbuf **, int *);
142static void mptcp_subflow_rupcall(struct socket *, void *, int);
143static void mptcp_subflow_input(struct mptses *, struct mptsub *);
144static void mptcp_subflow_wupcall(struct socket *, void *, int);
145static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
146static void mptcp_update_last_owner(struct mptsub *, struct socket *);
fe8ab488 147static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
3e170ce0 148static void mptcp_get_rtt_measurement(struct mptsub *, struct mptses *);
39037602 149static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *, int *);
150
151/*
152 * Possible return values for subflow event handlers. Note that success
 153 * values must be greater than or equal to MPTS_EVRET_OK. Values less than
 154 * that indicate errors or actions which require immediate attention; they
 155 * will prevent the rest of the handlers from processing their respective
 156 * events until the next round of event processing.
157 */
158typedef enum {
159 MPTS_EVRET_DELETE = 1, /* delete this subflow */
160 MPTS_EVRET_OK = 2, /* OK */
161 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
162 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
163} ev_ret_t;
164
165static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
166static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *, uint64_t *);
167static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
168static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *, uint64_t *);
169static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *, uint64_t *);
170static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *);
171static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *);
172static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *);
173static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *, uint64_t *);
174static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *, uint64_t *);
175static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *);
176static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *);
177static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *);
178static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *);
179static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *, uint64_t *);
180static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *, uint64_t *);
181static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
fe8ab488 182
183static const char *mptcp_evret2str(ev_ret_t);
184
185static mptcp_key_t *mptcp_reserve_key(void);
186static int mptcp_do_sha1(mptcp_key_t *, char *, int);
490019cf 187static void mptcp_init_local_parms(struct mptcb *);
188
189static unsigned int mptsub_zone_size; /* size of mptsub */
190static struct zone *mptsub_zone; /* zone for mptsub */
191
192static unsigned int mptopt_zone_size; /* size of mptopt */
193static struct zone *mptopt_zone; /* zone for mptopt */
194
195static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
196static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
197
198struct mppcbinfo mtcbinfo;
199
200static struct mptcp_keys_pool_head mptcp_keys_pool;
201
202#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
203#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
204
205SYSCTL_DECL(_net_inet);
206
207SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
208
209uint32_t mptcp_dbg_area = 0; /* more noise if greater than 1 */
210SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
211 &mptcp_dbg_area, 0, "MPTCP debug area");
212
213uint32_t mptcp_dbg_level = 0;
214SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &mptcp_dbg_level, 0, "MPTCP debug level");
216
217
218SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
219 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
220
221/*
 222 * Since there is one kernel thread per MPTCP socket, we impose an artificial
 223 * limit on the number of allowed MPTCP sockets.
224 */
225uint32_t mptcp_socket_limit = MPPCB_LIMIT;
226SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
227 &mptcp_socket_limit, 0, "MPTCP socket limit");
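
/*
 * Illustrative userland sketch (hypothetical, not part of this file): the
 * knob above is exported as "net.inet.mptcp.sk_lim" and can be read with
 * sysctlbyname(3), e.g.:
 *
 *	uint32_t lim;
 *	size_t len = sizeof (lim);
 *	if (sysctlbyname("net.inet.mptcp.sk_lim", &lim, &len, NULL, 0) == 0)
 *		printf("MPTCP socket limit: %u\n", lim);
 */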
228
229/*
230 * SYSCTL to turn on delayed cellular subflow start.
231 */
232uint32_t mptcp_delayed_subf_start = 0;
233SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
234 &mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
235
236/*
3e170ce0 237 * sysctl to use network status hints from symptomsd
fe8ab488 238 */
239uint32_t mptcp_use_symptomsd = 1;
240SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, usesymptoms, CTLFLAG_RW|CTLFLAG_LOCKED,
241 &mptcp_use_symptomsd, 0, "MPTCP Use SymptomsD");
fe8ab488 242
243static struct protosw mptcp_subflow_protosw;
244static struct pr_usrreqs mptcp_subflow_usrreqs;
245#if INET6
246static struct ip6protosw mptcp_subflow_protosw6;
247static struct pr_usrreqs mptcp_subflow_usrreqs6;
248#endif /* INET6 */
249
250typedef struct mptcp_subflow_event_entry {
251 uint64_t sofilt_hint_mask;
252 ev_ret_t (*sofilt_hint_ev_hdlr)(
253 struct mptses *mpte,
254 struct mptsub *mpts,
255 uint64_t *p_mpsofilt_hint);
256} mptsub_ev_entry_t;
257
258/*
259 * XXX The order of the event handlers below is really
260 * really important.
261 * SO_FILT_HINT_DELETEOK event has to be handled first,
 262 * else we may end up missing this event.
263 * Please read radar://24043716 for more details.
264 */
3e170ce0 265static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
266 {
267 .sofilt_hint_mask = SO_FILT_HINT_DELETEOK,
268 .sofilt_hint_ev_hdlr = mptcp_deleteok_ev,
269 },
270 {
271 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
272 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
273 },
274 {
275 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
276 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
277 },
278 {
279 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
280 .sofilt_hint_ev_hdlr = mptcp_subflow_connreset_ev,
281 },
282 {
283 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
284 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
285 },
286 {
287 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
288 .sofilt_hint_ev_hdlr = mptcp_subflow_cantrcvmore_ev,
289 },
290 { .sofilt_hint_mask = SO_FILT_HINT_CANTSENDMORE,
291 .sofilt_hint_ev_hdlr = mptcp_subflow_cantsendmore_ev,
292 },
293 {
294 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
295 .sofilt_hint_ev_hdlr = mptcp_subflow_timeout_ev,
296 },
297 {
298 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
299 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
300 },
301 {
302 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
303 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
304 },
305 {
306 .sofilt_hint_mask = SO_FILT_HINT_SUSPEND,
307 .sofilt_hint_ev_hdlr = mptcp_subflow_suspend_ev,
308 },
309 {
310 .sofilt_hint_mask = SO_FILT_HINT_RESUME,
311 .sofilt_hint_ev_hdlr = mptcp_subflow_resume_ev,
312 },
313 {
314 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
315 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
316 },
317 {
318 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
319 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
320 },
321 {
322 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
323 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
324 },
325 {
326 .sofilt_hint_mask = SO_FILT_HINT_MPFASTJ,
327 .sofilt_hint_ev_hdlr = mptcp_fastjoin_ev,
328 }
329};
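
/*
 * A minimal sketch of how the table above is meant to be consumed by the
 * subflow event dispatcher (the actual logic lives in mptcp_subflow_events(),
 * later in this file); it is illustrative only and the parameter names
 * (events, mpte, mpts, p_mpsofilt_hint) are assumed.  Handlers are walked
 * in table order, each one invoked only when its hint bit is set in the
 * pending event mask, and any return value below MPTS_EVRET_OK cuts the
 * current round short, as documented for ev_ret_t above:
 *
 *	ev_ret_t ret = MPTS_EVRET_OK;
 *	uint32_t i;
 *
 *	for (i = 0; i < sizeof (mpsub_ev_entry_tbl) /
 *	    sizeof (mpsub_ev_entry_tbl[0]); i++) {
 *		if (!(events & mpsub_ev_entry_tbl[i].sofilt_hint_mask))
 *			continue;
 *		ret = mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts,
 *		    p_mpsofilt_hint);
 *		if (ret < MPTS_EVRET_OK)
 *			break;
 *	}
 */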
330
331/*
332 * Protocol pr_init callback.
333 */
334void
335mptcp_init(struct protosw *pp, struct domain *dp)
336{
337#pragma unused(dp)
338 static int mptcp_initialized = 0;
339 struct protosw *prp;
340#if INET6
341 struct ip6protosw *prp6;
342#endif /* INET6 */
343
344 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
345
346 /* do this only once */
347 if (mptcp_initialized)
348 return;
349 mptcp_initialized = 1;
350
351 /*
352 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
353 * we must be able to find IPPROTO_TCP entries for both.
354 */
355 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
356 VERIFY(prp != NULL);
357 bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
358 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
359 sizeof (mptcp_subflow_usrreqs));
360 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
361 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
362 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
363 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
364 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
365 /*
366 * Socket filters shouldn't attach/detach to/from this protosw
367 * since pr_protosw is to be used instead, which points to the
368 * real protocol; if they do, it is a bug and we should panic.
369 */
370 mptcp_subflow_protosw.pr_filter_head.tqh_first =
371 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
372 mptcp_subflow_protosw.pr_filter_head.tqh_last =
373 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
374
375#if INET6
376 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
377 IPPROTO_TCP, SOCK_STREAM);
378 VERIFY(prp6 != NULL);
379 bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
380 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
381 sizeof (mptcp_subflow_usrreqs6));
382 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
383 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
384 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
385 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
386 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
387 /*
388 * Socket filters shouldn't attach/detach to/from this protosw
389 * since pr_protosw is to be used instead, which points to the
390 * real protocol; if they do, it is a bug and we should panic.
391 */
392 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
393 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
394 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
395 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
396#endif /* INET6 */
397
398 bzero(&mtcbinfo, sizeof (mtcbinfo));
399 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
400 mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
401 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
402 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
403 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
404 /* NOTREACHED */
405 }
406 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
407 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
408
409 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
410 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
411 mtcbinfo.mppi_lock_grp_attr);
412 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
413 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
414 mtcbinfo.mppi_lock_attr);
39236c6e 415
3e170ce0 416 mtcbinfo.mppi_gc = mptcp_gc;
39236c6e 417 mtcbinfo.mppi_timer = mptcp_timer;
3e170ce0 418 mtcbinfo.mppi_pcbe_create = mptcp_sescreate;
419
420 /* attach to MP domain for garbage collection to take place */
421 mp_pcbinfo_attach(&mtcbinfo);
422
423 mptsub_zone_size = sizeof (struct mptsub);
424 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
425 8192, "mptsub")) == NULL) {
426 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
427 /* NOTREACHED */
428 }
429 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
430 zone_change(mptsub_zone, Z_EXPAND, TRUE);
431
432 mptopt_zone_size = sizeof (struct mptopt);
433 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
434 1024, "mptopt")) == NULL) {
435 panic("%s: unable to allocate MPTCP option zone\n", __func__);
436 /* NOTREACHED */
437 }
438 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
439 zone_change(mptopt_zone, Z_EXPAND, TRUE);
440
441 mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
442 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
443 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
444 panic("%s: unable to allocate MPTCP address auth zone \n",
445 __func__);
446 /* NOTREACHED */
447 }
448 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
449 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
450
451 /* Set up a list of unique keys */
452 mptcp_key_pool_init();
453}
454
455/*
 456 * Create an MPTCP session, called as a result of opening an MPTCP socket.
457 */
3e170ce0 458void *
459mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
460{
461 struct mppcbinfo *mppi;
462 struct mptses *mpte;
463 struct mptcb *mp_tp;
464 int error = 0;
465
466 VERIFY(mpp != NULL);
467 mppi = mpp->mpp_pcbinfo;
468 VERIFY(mppi != NULL);
469
470 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
471 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
472
473 /* MPTCP Multipath PCB Extension */
474 bzero(mpte, sizeof (*mpte));
475 VERIFY(mpp->mpp_pcbe == NULL);
476 mpp->mpp_pcbe = mpte;
477 mpte->mpte_mppcb = mpp;
478 mpte->mpte_mptcb = mp_tp;
479
480 TAILQ_INIT(&mpte->mpte_sopts);
481 TAILQ_INIT(&mpte->mpte_subflows);
482 mpte->mpte_associd = SAE_ASSOCID_ANY;
483 mpte->mpte_connid_last = SAE_CONNID_ANY;
484
485 lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
486 mppi->mppi_lock_attr);
487
488 /*
489 * XXX: adi@apple.com
490 *
491 * This can be rather expensive if we have lots of MPTCP sockets,
492 * but we need a kernel thread for this model to work. Perhaps we
493 * could amortize the costs by having one worker thread per a group
494 * of MPTCP sockets.
495 */
496 if (kernel_thread_start(mptcp_thread_func, mpte,
497 &mpte->mpte_thread) != KERN_SUCCESS) {
498 error = ENOBUFS;
499 goto out;
500 }
501 mp_so->so_usecount++; /* for thread */
502
503 /* MPTCP Protocol Control Block */
504 bzero(mp_tp, sizeof (*mp_tp));
505 lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
506 mppi->mppi_lock_attr);
507 mp_tp->mpt_mpte = mpte;
3e170ce0 508 mp_tp->mpt_state = MPTCPS_CLOSED;
509out:
510 if (error != 0)
511 lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
512 DTRACE_MPTCP5(session__create, struct socket *, mp_so,
513 struct sockbuf *, &mp_so->so_rcv,
514 struct sockbuf *, &mp_so->so_snd,
515 struct mppcb *, mpp, int, error);
516
517 return ((error != 0) ? NULL : mpte);
518}
519
520/*
521 * Destroy an MPTCP session.
522 */
523static void
524mptcp_sesdestroy(struct mptses *mpte)
525{
526 struct mptcb *mp_tp;
527
528 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
529
530 mp_tp = mpte->mpte_mptcb;
531 VERIFY(mp_tp != NULL);
532
533 /*
534 * MPTCP Multipath PCB Extension section
535 */
536 mptcp_flush_sopts(mpte);
537 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
538
539 lck_mtx_destroy(&mpte->mpte_thread_lock,
540 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
541
542 /*
543 * MPTCP Protocol Control Block section
544 */
545 lck_mtx_destroy(&mp_tp->mpt_lock,
546 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
547
548 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
549 struct mptcb *, mp_tp);
550}
551
552/*
553 * Allocate an MPTCP socket option structure.
554 */
555struct mptopt *
556mptcp_sopt_alloc(int how)
557{
558 struct mptopt *mpo;
559
560 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
561 zalloc_noblock(mptopt_zone);
562 if (mpo != NULL) {
563 bzero(mpo, mptopt_zone_size);
564 }
565
566 return (mpo);
567}
568
569/*
570 * Free an MPTCP socket option structure.
571 */
572void
573mptcp_sopt_free(struct mptopt *mpo)
574{
575 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
576
577 zfree(mptopt_zone, mpo);
578}
579
580/*
581 * Add a socket option to the MPTCP socket option list.
582 */
583void
584mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
585{
586 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
587 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
588 mpo->mpo_flags |= MPOF_ATTACHED;
589 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
590}
591
592/*
593 * Remove a socket option from the MPTCP socket option list.
594 */
595void
596mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
597{
598 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
599 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
600 mpo->mpo_flags &= ~MPOF_ATTACHED;
601 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
602}
603
604/*
605 * Search for an existing <sopt_level,sopt_name> socket option.
606 */
607struct mptopt *
608mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
609{
610 struct mptopt *mpo;
611
612 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
613
614 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
615 if (mpo->mpo_level == sopt->sopt_level &&
616 mpo->mpo_name == sopt->sopt_name)
617 break;
618 }
619 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
620
621 return (mpo);
622}
623
624/*
625 * Flushes all recorded socket options from an MP socket.
626 */
627void
628mptcp_flush_sopts(struct mptses *mpte)
629{
630 struct mptopt *mpo, *tmpo;
631
632 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
633
634 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
635 mptcp_sopt_remove(mpte, mpo);
636 mptcp_sopt_free(mpo);
637 }
638 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
639}
640
641/*
 642 * Allocate an MPTCP subflow structure.
643 */
644struct mptsub *
645mptcp_subflow_alloc(int how)
646{
647 struct mptsub *mpts;
648
649 mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
650 zalloc_noblock(mptsub_zone);
651 if (mpts != NULL) {
652 bzero(mpts, mptsub_zone_size);
653 lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
654 mtcbinfo.mppi_lock_attr);
655 }
656
657 return (mpts);
658}
659
660/*
661 * Deallocate a subflow structure, called when all of the references held
662 * on it have been released. This implies that the subflow has been deleted.
663 */
664void
665mptcp_subflow_free(struct mptsub *mpts)
666{
667 MPTS_LOCK_ASSERT_HELD(mpts);
668
669 VERIFY(mpts->mpts_refcnt == 0);
670 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
671 VERIFY(mpts->mpts_mpte == NULL);
672 VERIFY(mpts->mpts_socket == NULL);
673
674 if (mpts->mpts_src != NULL) {
675 FREE(mpts->mpts_src, M_SONAME);
676 mpts->mpts_src = NULL;
39236c6e 677 }
678 if (mpts->mpts_dst != NULL) {
679 FREE(mpts->mpts_dst, M_SONAME);
680 mpts->mpts_dst = NULL;
681 }
682 MPTS_UNLOCK(mpts);
683 lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
684
685 zfree(mptsub_zone, mpts);
686}
687
688/*
689 * Create an MPTCP subflow socket.
690 */
691static int
692mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
693 struct proc *p, struct socket **so)
694{
695 struct mptopt smpo, *mpo, *tmpo;
696 struct socket *mp_so;
697 int error;
698
699 *so = NULL;
700 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
701 mp_so = mpte->mpte_mppcb->mpp_socket;
702
703 /*
704 * Create the subflow socket (multipath subflow, non-blocking.)
705 *
706 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
707 * socket; it will be cleared when the socket is peeled off or closed.
708 * It also indicates to the underlying TCP to handle MPTCP options.
709 * A multipath subflow socket implies SS_NOFDREF state.
710 */
711 if ((error = socreate_internal(dom, so, SOCK_STREAM,
712 IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
713 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate mp_so 0x%llx"
714 " unable to create subflow socket error %d\n",
715 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
716 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
717 return (error);
718 }
719
720 socket_lock(*so, 0);
721 VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
722 VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
723 (SS_NBIO|SS_NOFDREF));
724
725 /* prevent the socket buffers from being compressed */
726 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
727 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
728
729 /* Inherit preconnect and TFO data flags */
730 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
731 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
732
733 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
734 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
735
736 bzero(&smpo, sizeof (smpo));
737 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
738 smpo.mpo_level = SOL_SOCKET;
739 smpo.mpo_intval = 1;
740
741 /* disable SIGPIPE */
742 smpo.mpo_name = SO_NOSIGPIPE;
743 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
744 goto out;
745
746 /* find out if the subflow's source address goes away */
747 smpo.mpo_name = SO_NOADDRERR;
748 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
749 goto out;
750
751 /* enable keepalive */
752 smpo.mpo_name = SO_KEEPALIVE;
753 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
754 goto out;
755
756 /*
757 * Limit the receive socket buffer size to 64k.
758 *
759 * We need to take into consideration the window scale option
760 * which could be negotiated in one subflow but disabled in
761 * another subflow.
762 * XXX This can be improved in the future.
763 */
764 smpo.mpo_name = SO_RCVBUF;
765 smpo.mpo_intval = MPTCP_RWIN_MAX;
766 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
767 goto out;
768
769 /* N.B.: set by sosetopt */
770 VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
771 /* Prevent automatic socket buffer sizing. */
772 (*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
773
774 smpo.mpo_level = IPPROTO_TCP;
775 smpo.mpo_intval = mptcp_subflow_keeptime;
776 smpo.mpo_name = TCP_KEEPALIVE;
777 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
778 goto out;
779
780 /* replay setsockopt(2) on the subflow sockets for eligible options */
781 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
782 int interim;
783
784 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
785 continue;
786
787 /*
788 * Skip those that are handled internally; these options
789 * should not have been recorded and marked with the
790 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
791 */
792 if (mpo->mpo_level == SOL_SOCKET &&
793 (mpo->mpo_name == SO_NOSIGPIPE ||
794 mpo->mpo_name == SO_NOADDRERR ||
795 mpo->mpo_name == SO_KEEPALIVE))
796 continue;
797
798 interim = (mpo->mpo_flags & MPOF_INTERIM);
799 if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
800 char buf[32];
801 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate"
802 " mp_so 0x%llx"
803 " sopt %s val %d interim record removed\n",
804 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
805 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
806 buf, sizeof (buf)), mpo->mpo_intval),
807 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
808 mptcp_sopt_remove(mpte, mpo);
809 mptcp_sopt_free(mpo);
810 continue;
811 }
812 }
813
814 /*
815 * We need to receive everything that the subflow socket has,
816 * so use a customized socket receive function. We will undo
817 * this when the socket is peeled off or closed.
818 */
819 mpts->mpts_oprotosw = (*so)->so_proto;
820 switch (dom) {
821 case PF_INET:
822 (*so)->so_proto = &mptcp_subflow_protosw;
823 break;
824#if INET6
825 case PF_INET6:
826 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
827 break;
828#endif /* INET6 */
829 default:
830 VERIFY(0);
831 /* NOTREACHED */
832 }
833
834out:
835 socket_unlock(*so, 0);
836
837 DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
838 struct mptsub *, mpts, int, dom, int, error);
839
840 return (error);
841}
842
843/*
844 * Close an MPTCP subflow socket.
845 *
846 * Note that this may be called on an embryonic subflow, and the only
847 * thing that is guaranteed valid is the protocol-user request.
848 */
849static int
850mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
851{
852 MPTS_LOCK_ASSERT_HELD(mpts);
853
854 socket_lock(so, 0);
855 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
856 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
857
858 /* restore protocol-user requests */
859 VERIFY(mpts->mpts_oprotosw != NULL);
860 so->so_proto = mpts->mpts_oprotosw;
861 socket_unlock(so, 0);
862
863 mpts->mpts_socket = NULL; /* may already be NULL */
864
865 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
866 struct socket *, so,
867 struct sockbuf *, &so->so_rcv,
868 struct sockbuf *, &so->so_snd,
869 struct mptses *, mpts->mpts_mpte);
870
871 return (soclose(so));
872}
873
874/*
875 * Connect an MPTCP subflow socket.
876 *
877 * This may be called inline as part of adding a subflow, or asynchronously
878 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
879 * pending connect case, the subflow socket may have been bound to an interface
880 * and/or a source IP address which may no longer be around by the time this
881 * routine is called; in that case the connect attempt will most likely fail.
882 */
883static int
884mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
885{
886 struct socket *so;
887 int af, error;
888
889 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
890 MPTS_LOCK_ASSERT_HELD(mpts);
891
892 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
893 MPTSF_CONNECTING);
894 VERIFY(mpts->mpts_socket != NULL);
895 so = mpts->mpts_socket;
896 af = mpts->mpts_family;
897
898 if (af == AF_INET || af == AF_INET6) {
813fb2f6 899 struct sockaddr *dst;
900 char dbuf[MAX_IPv6_STR_LEN];
901
813fb2f6 902 dst = mpts->mpts_dst;
39236c6e 903
904 mptcplog((LOG_DEBUG, "MPTCP Socket: connectx mp_so 0x%llx "
905 "dst %s[%d] cid %d [pended %s]\n",
906 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
907 inet_ntop(af, ((af == AF_INET) ?
908 (void *)&SIN(dst)->sin_addr.s_addr :
909 (void *)&SIN6(dst)->sin6_addr),
39236c6e 910 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
911 ntohs(SIN(dst)->sin_port) :
912 ntohs(SIN6(dst)->sin6_port)),
913 mpts->mpts_connid,
914 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
915 "YES" : "NO")),
916 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
917 }
918
919 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
920
921 socket_lock(so, 0);
fe8ab488 922 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
39037602 923
39236c6e 924 /* connect the subflow socket */
813fb2f6 925 error = soconnectxlocked(so, mpts->mpts_src, mpts->mpts_dst,
39236c6e 926 mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
927 mpte->mpte_associd, NULL, CONNREQF_MPTCP,
928 &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr), NULL, NULL);
929 socket_unlock(so, 0);
930
931 /* Allocate a unique address id per subflow */
932 mpte->mpte_addrid_last++;
933 if (mpte->mpte_addrid_last == 0)
934 mpte->mpte_addrid_last++;
935
936 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
937 struct mptsub *, mpts, int, error);
938
939 return (error);
940}
941
942/*
943 * MPTCP subflow socket receive routine, derived from soreceive().
944 */
945static int
946mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
947 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
948{
949#pragma unused(uio)
950 int flags, error = 0;
951 struct proc *p = current_proc();
952 struct mbuf *m, **mp = mp0;
953 struct mbuf *nextrecord;
954
955 socket_lock(so, 1);
956 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
957
958#ifdef MORE_LOCKING_DEBUG
959 if (so->so_usecount == 1) {
960 panic("%s: so=%x no other reference on socket\n", __func__, so);
961 /* NOTREACHED */
962 }
963#endif
964 /*
965 * We return all that is there in the subflow's socket receive buffer
966 * to the MPTCP layer, so we require that the caller passes in the
967 * expected parameters.
968 */
969 if (mp == NULL || controlp != NULL) {
970 socket_unlock(so, 1);
971 return (EINVAL);
972 }
973 *mp = NULL;
974 if (psa != NULL)
975 *psa = NULL;
976 if (flagsp != NULL)
977 flags = *flagsp &~ MSG_EOR;
978 else
979 flags = 0;
980
981 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
982 socket_unlock(so, 1);
983 return (EOPNOTSUPP);
984 }
985 flags |= (MSG_DONTWAIT|MSG_NBIO);
986
987 /*
988 * If a recv attempt is made on a previously-accepted socket
989 * that has been marked as inactive (disconnected), reject
990 * the request.
991 */
992 if (so->so_flags & SOF_DEFUNCT) {
993 struct sockbuf *sb = &so->so_rcv;
994
995 error = ENOTCONN;
996 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
997 __func__, proc_pid(p), proc_best_name(p),
998 (uint64_t)VM_KERNEL_ADDRPERM(so),
999 SOCK_DOM(so), SOCK_TYPE(so), error);
1000 /*
1001 * This socket should have been disconnected and flushed
1002 * prior to being returned from sodefunct(); there should
1003 * be no data on its receive list, so panic otherwise.
1004 */
1005 if (so->so_state & SS_DEFUNCT)
1006 sb_empty_assert(sb, __func__);
1007 socket_unlock(so, 1);
1008 return (error);
1009 }
1010
1011 /*
1012 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1013 * and if so just return to the caller. This could happen when
1014 * soreceive() is called by a socket upcall function during the
1015 * time the socket is freed. The socket buffer would have been
1016 * locked across the upcall, therefore we cannot put this thread
1017 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1018 * we may livelock), because the lock on the socket buffer will
1019 * only be released when the upcall routine returns to its caller.
1020 * Because the socket has been officially closed, there can be
1021 * no further read on it.
1022 *
1023 * A multipath subflow socket would have its SS_NOFDREF set by
1024 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1025 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1026 */
1027 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1028 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
1029 socket_unlock(so, 1);
1030 return (0);
1031 }
1032
1033 /*
1034 * For consistency with soreceive() semantics, we need to obey
1035 * SB_LOCK in case some other code path has locked the buffer.
1036 */
1037 error = sblock(&so->so_rcv, 0);
1038 if (error != 0) {
1039 socket_unlock(so, 1);
1040 return (error);
1041 }
1042
1043 m = so->so_rcv.sb_mb;
1044 if (m == NULL) {
1045 /*
1046 * Panic if we notice inconsistencies in the socket's
1047 * receive list; both sb_mb and sb_cc should correctly
1048 * reflect the contents of the list, otherwise we may
1049 * end up with false positives during select() or poll()
1050 * which could put the application in a bad state.
1051 */
1052 SB_MB_CHECK(&so->so_rcv);
1053
1054 if (so->so_error != 0) {
1055 error = so->so_error;
1056 so->so_error = 0;
1057 goto release;
1058 }
1059
1060 if (so->so_state & SS_CANTRCVMORE) {
1061 goto release;
1062 }
1063
1064 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1065 error = ENOTCONN;
1066 goto release;
1067 }
1068
1069 /*
1070 * MSG_DONTWAIT is implicitly defined and this routine will
1071 * never block, so return EWOULDBLOCK when there is nothing.
1072 */
1073 error = EWOULDBLOCK;
1074 goto release;
1075 }
1076
1077 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1078 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1079 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1080
1081 while (m != NULL) {
1082 nextrecord = m->m_nextpkt;
1083 sbfree(&so->so_rcv, m);
1084
1085 if (mp != NULL) {
1086 *mp = m;
1087 mp = &m->m_next;
1088 so->so_rcv.sb_mb = m = m->m_next;
1089 *mp = NULL;
1090 }
1091
1092 if (m != NULL) {
1093 m->m_nextpkt = nextrecord;
1094 if (nextrecord == NULL)
1095 so->so_rcv.sb_lastrecord = m;
1096 } else {
1097 m = so->so_rcv.sb_mb = nextrecord;
1098 SB_EMPTY_FIXUP(&so->so_rcv);
1099 }
1100 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1101 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1102 }
1103
1104 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1105 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1106 /* notify protocol that we drained all the data */
1107 if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
1108 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1109
1110 if (flagsp != NULL)
1111 *flagsp |= flags;
1112
1113release:
1114 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
1115 return (error);
1116
1117}
1118
1119
1120/*
1121 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1122 * the work done earlier when the subflow socket was created.
1123 */
1124void
1125mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1126 struct socket *so)
1127{
1128 struct mptopt smpo;
1129 struct socket *mp_so;
1130 int p, c;
1131
1132 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1133 mp_so = mpte->mpte_mppcb->mpp_socket;
1134 MPTS_LOCK_ASSERT_HELD(mpts);
1135
1136 socket_lock(so, 0);
1137 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1138 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1139
1140 /* inherit MPTCP socket states */
1141 if (!(mp_so->so_state & SS_NBIO))
1142 so->so_state &= ~SS_NBIO;
1143
1144 /*
1145 * At this point, the socket is not yet closed, as there is at least
1146 * one outstanding usecount previously held by mpts_socket from
1147 * socreate(). Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1148 */
1149 so->so_flags &= ~SOF_MP_SUBFLOW;
1150 so->so_state &= ~SS_NOFDREF;
fe8ab488 1151 so->so_flags &= ~SOF_MPTCP_TRUE;
1152
1153 /* allow socket buffers to be compressed */
1154 so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1155 so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1156
1157 /*
1158 * Allow socket buffer auto sizing.
1159 *
1160 * This will increase the current 64k buffer size to whatever is best.
1161 */
39037602 1162 if (!(so->so_rcv.sb_flags & SB_USRSIZE))
1163 so->so_rcv.sb_flags |= SB_AUTOSIZE;
1164 if (!(so->so_snd.sb_flags & SB_USRSIZE))
1165 so->so_snd.sb_flags |= SB_AUTOSIZE;
1166
1167 /* restore protocol-user requests */
1168 VERIFY(mpts->mpts_oprotosw != NULL);
1169 so->so_proto = mpts->mpts_oprotosw;
1170
1171 bzero(&smpo, sizeof (smpo));
1172 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1173 smpo.mpo_level = SOL_SOCKET;
1174
1175 /* inherit SOF_NOSIGPIPE from parent MP socket */
1176 p = (mp_so->so_flags & SOF_NOSIGPIPE);
1177 c = (so->so_flags & SOF_NOSIGPIPE);
1178 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1179 smpo.mpo_name = SO_NOSIGPIPE;
1180 if ((p - c) != 0)
1181 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1182
1183 /* inherit SOF_NOADDRAVAIL from parent MP socket */
1184 p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1185 c = (so->so_flags & SOF_NOADDRAVAIL);
1186 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1187 smpo.mpo_name = SO_NOADDRERR;
1188 if ((p - c) != 0)
1189 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1190
1191 /* inherit SO_KEEPALIVE from parent MP socket */
1192 p = (mp_so->so_options & SO_KEEPALIVE);
1193 c = (so->so_options & SO_KEEPALIVE);
1194 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1195 smpo.mpo_name = SO_KEEPALIVE;
1196 if ((p - c) != 0)
1197 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1198
1199 /* unset TCP level default keepalive option */
1200 p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1201 c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1202 smpo.mpo_level = IPPROTO_TCP;
1203 smpo.mpo_intval = 0;
1204 smpo.mpo_name = TCP_KEEPALIVE;
1205 if ((p - c) != 0)
1206 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1207 socket_unlock(so, 0);
1208
1209 DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1210 struct mptsub *, mpts, struct socket *, so,
1211 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1212}
1213
1214/*
1215 * Establish an initial MPTCP connection (if first subflow and not yet
1216 * connected), or add a subflow to an existing MPTCP connection.
1217 */
1218int
1219mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1220 struct proc *p, uint32_t ifscope)
1221{
1222 struct socket *mp_so, *so = NULL;
1223 struct mptsub_connreq mpcr;
1224 struct mptcb *mp_tp;
1225 int af, error = 0;
1226
1227 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1228 mp_so = mpte->mpte_mppcb->mpp_socket;
1229 mp_tp = mpte->mpte_mptcb;
1230
1231 MPT_LOCK(mp_tp);
1232 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1233 /* If the remote end sends Data FIN, refuse subflow adds */
1234 error = ENOTCONN;
1235 MPT_UNLOCK(mp_tp);
1236 return (error);
1237 }
1238 MPT_UNLOCK(mp_tp);
1239
1240 MPTS_LOCK(mpts);
1241 VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1242 VERIFY(mpts->mpts_mpte == NULL);
1243 VERIFY(mpts->mpts_socket == NULL);
813fb2f6 1244 VERIFY(mpts->mpts_dst != NULL);
3e170ce0 1245 VERIFY(mpts->mpts_connid == SAE_CONNID_ANY);
39236c6e 1246
813fb2f6 1247 af = mpts->mpts_family = mpts->mpts_dst->sa_family;
1248
1249 /*
 1250 * If the source address is not specified, allocate storage for
1251 * it, so that later on we can fill it in with the actual source
1252 * IP address chosen by the underlying layer for the subflow after
1253 * it is connected.
1254 */
1255 if (mpts->mpts_src == NULL) {
1256 int len = mpts->mpts_dst->sa_len;
1257
1258 MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
1259 M_WAITOK | M_ZERO);
1260 if (mpts->mpts_src == NULL) {
1261 error = ENOBUFS;
1262 goto out;
1263 }
1264 bzero(mpts->mpts_src, len);
1265 mpts->mpts_src->sa_len = len;
1266 mpts->mpts_src->sa_family = mpts->mpts_dst->sa_family;
1267 }
1268
1269 /* create the subflow socket */
1270 if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1271 goto out;
1272
1273 /*
1274 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
1275 * -1 (SAE_CONNID_ALL).
1276 */
1277 mpte->mpte_connid_last++;
1278 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
1279 mpte->mpte_connid_last == SAE_CONNID_ANY)
1280 mpte->mpte_connid_last++;
1281
1282 mpts->mpts_connid = mpte->mpte_connid_last;
1283 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1284 mpts->mpts_connid != SAE_CONNID_ALL);
1285
1286 mpts->mpts_rel_seq = 1;
1287
1288 /* Allocate a unique address id per subflow */
1289 mpte->mpte_addrid_last++;
1290 if (mpte->mpte_addrid_last == 0)
1291 mpte->mpte_addrid_last++;
1292
1293 /* bind subflow socket to the specified interface */
1294 if (ifscope != IFSCOPE_NONE) {
1295 socket_lock(so, 0);
1296 error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1297 if (error != 0) {
1298 socket_unlock(so, 0);
1299 (void) mptcp_subflow_soclose(mpts, so);
1300 goto out;
1301 }
1302 VERIFY(mpts->mpts_outif != NULL);
1303 mpts->mpts_flags |= MPTSF_BOUND_IF;
1304
1305 if (IFNET_IS_EXPENSIVE(mpts->mpts_outif)) {
1306 sototcpcb(so)->t_mpflags |= TMPF_BACKUP_PATH;
1307 } else {
1308 mpts->mpts_flags |= MPTSF_PREFERRED;
1309 }
1310
3e170ce0 1311 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add mp_so 0x%llx "
39037602 1312 "bindif %s[%d] cid %d expensive %d\n",
1313 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1314 mpts->mpts_outif->if_xname,
1315 ifscope, mpts->mpts_connid,
1316 IFNET_IS_EXPENSIVE(mpts->mpts_outif)),
3e170ce0 1317 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1318 socket_unlock(so, 0);
1319 }
1320
1321 /* if source address and/or port is specified, bind to it */
1322 if (mpts->mpts_src != NULL) {
1323 struct sockaddr *sa = mpts->mpts_src;
1324 uint32_t mpts_flags = 0;
1325 in_port_t lport;
1326
1327 switch (af) {
1328 case AF_INET:
1329 if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1330 mpts_flags |= MPTSF_BOUND_IP;
1331 if ((lport = SIN(sa)->sin_port) != 0)
1332 mpts_flags |= MPTSF_BOUND_PORT;
1333 break;
1334#if INET6
1335 case AF_INET6:
1336 VERIFY(af == AF_INET6);
1337 if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1338 mpts_flags |= MPTSF_BOUND_IP;
1339 if ((lport = SIN6(sa)->sin6_port) != 0)
1340 mpts_flags |= MPTSF_BOUND_PORT;
1341 break;
1342#endif /* INET6 */
1343 }
1344
1345 error = sobindlock(so, sa, 1); /* will lock/unlock socket */
1346 if (error != 0) {
1347 (void) mptcp_subflow_soclose(mpts, so);
1348 goto out;
1349 }
1350 mpts->mpts_flags |= mpts_flags;
1351
1352 if (af == AF_INET || af == AF_INET6) {
1353 char sbuf[MAX_IPv6_STR_LEN];
1354
1355 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add "
1356 "mp_so 0x%llx bindip %s[%d] cid %d\n",
1357 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1358 inet_ntop(af, ((af == AF_INET) ?
1359 (void *)&SIN(sa)->sin_addr.s_addr :
1360 (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
1361 ntohs(lport), mpts->mpts_connid),
1362 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1363 }
1364 }
1365
1366 /*
1367 * Insert the subflow into the list, and associate the MPTCP PCB
 1368 * as well as the subflow socket. From this point on, removing
1369 * the subflow needs to be done via mptcp_subflow_del().
1370 */
1371 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1372 mpte->mpte_numflows++;
1373
1374 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1375 mpts->mpts_mpte = mpte;
1376 mpts->mpts_socket = so;
1377 MPTS_ADDREF_LOCKED(mpts); /* for being in MPTCP subflow list */
1378 MPTS_ADDREF_LOCKED(mpts); /* for subflow socket */
1379 mp_so->so_usecount++; /* for subflow socket */
1380
1381 /* register for subflow socket read/write events */
1382 (void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1383 mptcp_subflow_wupcall, mpts);
1384
1385 /*
1386 * Register for subflow socket control events; ignore
1387 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1388 * will generate it here.
1389 */
1390 (void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1391 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1392 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1393 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1394 SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1395 SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1396 SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
1397 SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
1398 SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);
1399
1400 /* sanity check */
1401 VERIFY(!(mpts->mpts_flags &
1402 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1403
1404 bzero(&mpcr, sizeof (mpcr));
1405 mpcr.mpcr_proc = p;
1406 mpcr.mpcr_ifscope = ifscope;
1407 /*
1408 * Indicate to the TCP subflow whether or not it should establish
1409 * the initial MPTCP connection, or join an existing one. Fill
1410 * in the connection request structure with additional info needed
1411 * by the underlying TCP (to be used in the TCP options, etc.)
1412 */
1413 MPT_LOCK(mp_tp);
1414 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1415 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
490019cf 1416 mptcp_init_local_parms(mp_tp);
1417 }
1418 MPT_UNLOCK(mp_tp);
1419 soisconnecting(mp_so);
1420 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1421 } else {
1422 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1423 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1424
1425 /* avoid starting up cellular subflow unless required */
1426 if ((mptcp_delayed_subf_start) &&
1427 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
1428 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1429 }
1430 MPT_UNLOCK(mp_tp);
1431 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1432 }
1433
1434 /* If fastjoin or fastopen is requested, set state in mpts */
1435 if (mpte->mpte_nummpcapflows == 0) {
1436 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1437 MPT_LOCK(mp_tp);
1438 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
1439 mpts->mpts_flags |= MPTSF_TFO_REQD;
1440 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1441 }
1442 MPT_UNLOCK(mp_tp);
1443 }
1444
1445 if (so->so_flags & SOF_MPTCP_FASTJOIN) {
1446 MPT_LOCK(mp_tp);
1447 if (mp_tp->mpt_state == MPTCPS_ESTABLISHED) {
1448 mpts->mpts_flags |= MPTSF_FASTJ_REQD;
1449 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1450 }
1451 MPT_UNLOCK(mp_tp);
1452 }
1453 }
1454
1455 mpts->mpts_mpcr = mpcr;
1456 mpts->mpts_flags |= MPTSF_CONNECTING;
1457
1458 if (af == AF_INET || af == AF_INET6) {
1459 char dbuf[MAX_IPv6_STR_LEN];
1460
1461 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
1462 "mp_so 0x%llx dst %s[%d] cid %d "
1463 "[pending %s]\n", __func__,
1464 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1465 inet_ntop(af, ((af == AF_INET) ?
1466 (void *)&SIN(mpts->mpts_dst)->sin_addr.s_addr :
1467 (void *)&SIN6(mpts->mpts_dst)->sin6_addr),
39236c6e 1468 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1469 ntohs(SIN(mpts->mpts_dst)->sin_port) :
1470 ntohs(SIN6(mpts->mpts_dst)->sin6_port)),
1471 mpts->mpts_connid,
1472 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
1473 "YES" : "NO")),
1474 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1475 }
1476
1477 /* connect right away if first attempt, or if join can be done now */
1478 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1479 error = mptcp_subflow_soconnectx(mpte, mpts);
1480
1481out:
1482 MPTS_UNLOCK(mpts);
1483 if (error == 0) {
1484 soevent(mp_so, SO_FILT_HINT_LOCKED |
1485 SO_FILT_HINT_CONNINFO_UPDATED);
1486 }
1487 return (error);
1488}
1489
1490/*
 1491 * Delete/remove a subflow from an MPTCP session. The underlying subflow socket
1492 * will no longer be accessible after a subflow is deleted, thus this
1493 * should occur only after the subflow socket has been disconnected.
1494 * If peeloff(2) is called, leave the socket open.
1495 */
1496void
1497mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1498{
1499 struct socket *mp_so, *so;
1500
1501 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1502 mp_so = mpte->mpte_mppcb->mpp_socket;
1503
1504 MPTS_LOCK(mpts);
1505 so = mpts->mpts_socket;
1506 VERIFY(so != NULL);
39037602 1507
1508 if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
1509 (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
1510 MPTS_UNLOCK(mpts);
1511 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del returning"
1512 " mp_so 0x%llx flags %x\n",
1513 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_flags),
1514 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1515 return;
1516 }
39236c6e 1517
1518 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del mp_so 0x%llx "
1519 "[u=%d,r=%d] cid %d [close %s] %d %x error %d\n",
1520 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1521 mp_so->so_usecount,
1522 mp_so->so_retaincnt, mpts->mpts_connid,
1523 (close ? "YES" : "NO"), mpts->mpts_soerror,
1524 mpts->mpts_flags,
1525 mp_so->so_error),
1526 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1527
1528 VERIFY(mpts->mpts_mpte == mpte);
1529 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1530 mpts->mpts_connid != SAE_CONNID_ALL);
1531
1532 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1533 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1534 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1535 VERIFY(mpte->mpte_numflows != 0);
1536 mpte->mpte_numflows--;
1537 if (mpte->mpte_active_sub == mpts)
1538 mpte->mpte_active_sub = NULL;
1539
1540 /*
1541 * Drop references held by this subflow socket; there
1542 * will be no further upcalls made from this point.
1543 */
1544 (void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1545 (void) sock_catchevents(so, NULL, NULL, 0);
fe8ab488 1546
39236c6e 1547 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
39037602 1548
1549 if (close)
1550 (void) mptcp_subflow_soclose(mpts, so);
1551
d190cdc3 1552 VERIFY(mp_so->so_usecount > 0);
1553 mp_so->so_usecount--; /* for subflow socket */
1554 mpts->mpts_mpte = NULL;
1555 mpts->mpts_socket = NULL;
1556 MPTS_UNLOCK(mpts);
1557
1558 MPTS_REMREF(mpts); /* for MPTCP subflow list */
1559 MPTS_REMREF(mpts); /* for subflow socket */
1560
1561 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1562}
1563
1564/*
1565 * Disconnect a subflow socket.
1566 */
1567void
1568mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1569 boolean_t deleteok)
1570{
1571 struct socket *so;
1572 struct mptcb *mp_tp;
1573 int send_dfin = 0;
1574
1575 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1576 MPTS_LOCK_ASSERT_HELD(mpts);
1577
1578 VERIFY(mpts->mpts_mpte == mpte);
1579 VERIFY(mpts->mpts_socket != NULL);
1580 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1581 mpts->mpts_connid != SAE_CONNID_ALL);
1582
1583 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1584 return;
1585
1586 mpts->mpts_flags |= MPTSF_DISCONNECTING;
1587
1588 /*
1589 * If this is coming from disconnectx(2) or issued as part of
1590 * closing the MPTCP socket, the subflow shouldn't stick around.
1591 * Otherwise let it linger around in case the upper layers need
1592 * to retrieve its conninfo.
1593 */
1594 if (deleteok)
1595 mpts->mpts_flags |= MPTSF_DELETEOK;
1596
1597 so = mpts->mpts_socket;
1598 mp_tp = mpte->mpte_mptcb;
1599 MPT_LOCK(mp_tp);
1600 if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1601 send_dfin = 1;
1602 MPT_UNLOCK(mp_tp);
1603
1604 socket_lock(so, 0);
1605 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1606 (so->so_state & SS_ISCONNECTED)) {
1607 mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d "
1608 "[linger %s]\n", __func__, mpts->mpts_connid, send_dfin,
1609 (deleteok ? "NO" : "YES")),
1610 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1611
1612 if (send_dfin)
1613 mptcp_send_dfin(so);
1614 (void) soshutdownlock(so, SHUT_RD);
1615 (void) soshutdownlock(so, SHUT_WR);
1616 (void) sodisconnectlocked(so);
1617 }
1618 socket_unlock(so, 0);
1619 /*
1620 * Generate a disconnect event for this subflow socket, in case
1621 * the lower layer doesn't do it; this is needed because the
1622 * subflow socket deletion relies on it. This will also end up
1623 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1624 * we cannot do that here because subflow lock is currently held.
1625 */
1626 mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1627}
1628
1629/*
1630 * Subflow socket read upcall.
1631 *
1632 * Called when the associated subflow socket posted a read event. The subflow
1633 * socket lock has been released prior to invoking the callback. Note that the
1634 * upcall may occur synchronously as a result of MPTCP performing an action on
1635 * it, or asynchronously as a result of an event happening at the subflow layer.
1636 * Therefore, to maintain lock ordering, the only lock that can be acquired
1637 * here is the thread lock, for signalling purposes.
1638 */
1639static void
1640mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1641{
1642#pragma unused(so, waitf)
1643 struct mptsub *mpts = arg;
1644 struct mptses *mpte = mpts->mpts_mpte;
1645
39037602
A
1646 /*
1647 * mpte should never be NULL, except in a race with
1648 * mptcp_subflow_del
fe8ab488
A
1649 */
1650 if (mpte == NULL)
1651 return;
39236c6e
A
1652
1653 lck_mtx_lock(&mpte->mpte_thread_lock);
1654 mptcp_thread_signal_locked(mpte);
1655 lck_mtx_unlock(&mpte->mpte_thread_lock);
1656}
1657
1658/*
1659 * Subflow socket input.
1660 *
1661 * Called in the context of the MPTCP thread, for reading data from the
1662 * underlying subflow socket and delivering it to MPTCP.
1663 */
1664static void
1665mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1666{
1667 struct mbuf *m = NULL;
1668 struct socket *so;
1669 int error;
1670 struct mptsub *mpts_alt = NULL;
1671
1672 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1673 MPTS_LOCK_ASSERT_HELD(mpts);
1674
39037602 1675 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
39236c6e
A
1676 struct mptsub *, mpts);
1677
1678 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1679 return;
1680
1681 so = mpts->mpts_socket;
1682
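/*
 * Pull whatever the subflow has buffered; sock_receive_internal()
 * returns the mbuf chain in m. A persistent error (anything other
 * than EWOULDBLOCK) triggers the failover handling below, which
 * looks for an alternate or pending subflow.
 */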
1683 error = sock_receive_internal(so, NULL, &m, 0, NULL);
1684 if (error != 0 && error != EWOULDBLOCK) {
3e170ce0
A
1685 mptcplog((LOG_ERR, "MPTCP Receiver: %s cid %d error %d\n",
1686 __func__, mpts->mpts_connid, error),
1687 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
39236c6e 1688 MPTS_UNLOCK(mpts);
3e170ce0 1689 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e 1690 if (mpts_alt == NULL) {
fe8ab488
A
1691 if (mptcp_delayed_subf_start) {
1692 mpts_alt = mptcp_get_pending_subflow(mpte,
1693 mpts);
1694 if (mpts_alt) {
3e170ce0
A
1695 mptcplog((LOG_DEBUG,"MPTCP Receiver:"
1696 " %s: pending %d\n",
1697 __func__, mpts_alt->mpts_connid),
1698 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1699 } else {
3e170ce0
A
1700 mptcplog((LOG_ERR, "MPTCP Receiver:"
1701 " %s: no pending flow for cid %d",
1702 __func__, mpts->mpts_connid),
1703 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488
A
1704 }
1705 } else {
3e170ce0 1706 mptcplog((LOG_ERR, "MPTCP Receiver: %s: no alt"
39037602 1707 " path for cid %d\n", __func__,
3e170ce0
A
1708 mpts->mpts_connid),
1709 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1710 }
490019cf
A
1711 if (error == ENODATA) {
1712 /*
1713 * Don't swallow ENODATA; surface it to the application so
1714 * that misbehaving middleboxes can be discovered.
1715 */
1716 struct socket *mp_so =
1717 mpte->mpte_mppcb->mpp_socket;
1718 mp_so->so_error = ENODATA;
1719 sorwakeup(mp_so);
1720 }
39236c6e
A
1721 }
1722 MPTS_LOCK(mpts);
1723 } else if (error == 0) {
3e170ce0
A
1724 mptcplog((LOG_DEBUG, "MPTCP Receiver: %s: cid %d \n",
1725 __func__, mpts->mpts_connid),
1726 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
1727 }
1728
1729 /* In fallback, accept data only on the active subflow; drop it on all others */
1730 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1731 (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1732 m_freem(m);
1733 return;
1734 }
1735
1736 if (m != NULL) {
3e170ce0
A
1737
1738 /* Did we receive data on the backup subflow? */
1739 if (!(mpts->mpts_flags & MPTSF_ACTIVE))
1740 mpts->mpts_peerswitch++;
1741 else
1742 mpts->mpts_peerswitch = 0;
1743
39236c6e
A
1744 /*
1745 * Release subflow lock since this may trigger MPTCP to send,
1746 * possibly on a different subflow. An extra reference has
1747 * been held on the subflow by the MPTCP thread before coming
1748 * here, so we can be sure that it won't go away, in the event
1749 * the MP socket lock gets released.
1750 */
1751 MPTS_UNLOCK(mpts);
1752 mptcp_input(mpte, m);
1753 MPTS_LOCK(mpts);
1754 }
1755}
1756
1757/*
1758 * Subflow socket write upcall.
1759 *
1760 * Called when the associated subflow socket posted a write event. The subflow
1761 * socket lock has been released prior to invoking the callback. Note that the
1762 * upcall may occur synchronously as a result of MPTCP performing an action on
1763 * it, or asynchronously as a result of an event happening at the subflow layer.
1764 * Therefore, to maintain lock ordering, the only lock that can be acquired
1765 * here is the thread lock, for signalling purposes.
1766 */
1767static void
1768mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1769{
1770#pragma unused(so, waitf)
1771 struct mptsub *mpts = arg;
1772 struct mptses *mpte = mpts->mpts_mpte;
1773
fe8ab488 1774 /*
490019cf 1775 * mpte should never be NULL except in a race with
fe8ab488
A
1776 * mptcp_subflow_del, which doesn't hold the socket lock across its
1777 * critical section. This upcall is made after the socket lock is
1778 * released, so interleaving of socket operations becomes possible.
1779 */
1780 if (mpte == NULL)
1781 return;
39236c6e
A
1782
1783 lck_mtx_lock(&mpte->mpte_thread_lock);
1784 mptcp_thread_signal_locked(mpte);
1785 lck_mtx_unlock(&mpte->mpte_thread_lock);
1786}
1787
1788/*
1789 * Subflow socket output.
1790 *
1791 * Called for sending data from MPTCP to the underlying subflow socket.
1792 */
1793int
1794mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1795{
1796 struct socket *mp_so, *so;
1797 size_t sb_cc = 0, tot_sent = 0;
1798 struct mbuf *sb_mb;
39037602 1799 int error = 0, wakeup = 0;
39236c6e
A
1800 u_int64_t mpt_dsn = 0;
1801 struct mptcb *mp_tp = mpte->mpte_mptcb;
1802 struct mbuf *mpt_mbuf = NULL;
fe8ab488
A
1803 u_int64_t off = 0;
1804 struct mbuf *head, *tail;
490019cf 1805 int tcp_zero_len_write = 0;
39236c6e
A
1806
1807 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1808 MPTS_LOCK_ASSERT_HELD(mpts);
1809 mp_so = mpte->mpte_mppcb->mpp_socket;
1810 so = mpts->mpts_socket;
1811
39037602 1812 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
39236c6e
A
1813 struct mptsub *, mpts);
1814
1815 /* subflow socket is suspended? */
1816 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
3e170ce0
A
1817 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d is "
1818 "flow controlled\n", __func__,
1819 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1820 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1821 goto out;
1822 }
1823
1824 /* subflow socket is not MPTCP capable? */
1825 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
fe8ab488 1826 !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
490019cf
A
1827 !(mpts->mpts_flags & MPTSF_FASTJ_SEND) &&
1828 !(mpts->mpts_flags & MPTSF_TFO_REQD)) {
3e170ce0 1829 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d not "
39236c6e 1830 "MPTCP capable\n", __func__,
3e170ce0
A
1831 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1832 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1833 goto out;
1834 }
1835
1836 /* Remove Addr Option is not sent reliably as per I-D */
1837 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1838 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1839 tp->t_rem_aid = mpte->mpte_lost_aid;
1840 if (mptcp_remaddr_enable)
1841 tp->t_mpflags |= TMPF_SND_REM_ADDR;
1842 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1843 }
1844
490019cf 1845 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
39037602 1846 mptcp_drop_tfo_data(mpte, mpts, &wakeup);
490019cf
A
1847 }
1848
39236c6e
A
1849 /*
1850 * The mbuf chains containing the metadata (as well as pointing to
1851 * the user data sitting at the MPTCP output queue) would then be
1852 * sent down to the subflow socket.
1853 *
1854 * Some notes on data sequencing:
1855 *
1856 * a. Each mbuf must have the M_PKTHDR flag set.
1857 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
1858 * in the mbuf pkthdr structure.
1859 * c. Each mbuf containing the MPTCP metadata must have its
1860 * pkt_flags marked with the PKTF_MPTCP flag.
1861 */
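/*
 * Illustrative example (hypothetical values): for an mbuf carrying
 * 500 bytes whose first byte sits at DSN 1000, the copy handed to
 * the subflow below is stamped with mp_dsn = 1000, mp_rlen = 500 and
 * mp_rseq = the subflow-relative sequence number at that point, with
 * PKTF_MPTCP set in pkt_flags to mark the mapping as valid.
 */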
1862
1863 /* First, drop acknowledged data */
1864 sb_mb = mp_so->so_snd.sb_mb;
1865 if (sb_mb == NULL) {
1866 goto out;
1867 }
1868
1869 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1870
1871 mpt_mbuf = sb_mb;
1872 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
490019cf
A
1873 if (((so->so_state & SS_ISCONNECTED) == 0) &&
1874 (mpt_mbuf->m_next == NULL) &&
1875 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1876 /*
1877 * If TFO, allow connection establishment with zero
1878 * length write.
1879 */
1880 tcp_zero_len_write = 1;
1881 goto zero_len_write;
1882 }
39236c6e
A
1883 mpt_mbuf = mpt_mbuf->m_next;
1884 }
1885 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1886 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1887 } else {
1888 goto out;
1889 }
1890
1891 MPT_LOCK(mp_tp);
1892 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
fe8ab488 1893 u_int64_t len = 0;
39236c6e 1894 len = mp_tp->mpt_snduna - mpt_dsn;
3e170ce0 1895 MPT_UNLOCK(mp_tp);
fe8ab488 1896 sbdrop(&mp_so->so_snd, (int)len);
39037602 1897 wakeup = 1;
3e170ce0 1898 MPT_LOCK(mp_tp);
39236c6e
A
1899 }
1900
1901 /*
1902 * In degraded mode, we don't receive DATA_ACKs, so force-free
1903 * mbufs whose DSNs fall below snd_nxt
1904 */
fe8ab488
A
1905 if (mp_so->so_snd.sb_mb == NULL) {
1906 MPT_UNLOCK(mp_tp);
1907 goto out;
1908 }
1909
39236c6e
A
1910 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1911 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 1912 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
39236c6e 1913 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
fe8ab488 1914 u_int64_t len = 0;
39236c6e 1915 len = mp_tp->mpt_sndnxt - mpt_dsn;
fe8ab488 1916 sbdrop(&mp_so->so_snd, (int)len);
39037602 1917 wakeup = 1;
39236c6e
A
1918 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1919 }
1920
fe8ab488
A
1921 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1922 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
1923 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
1924 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
1925 if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
1926 mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
1927 }
1928
39236c6e
A
1929 /*
1930 * Adjust the subflow's notion of next byte to send based on
1931 * the last unacknowledged byte
1932 */
1933 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1934 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1935 }
1936
1937 /*
1938 * Adjust the top level notion of next byte used for retransmissions
1939 * and sending FINs.
1940 */
1941 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1942 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1943 }
1944
1945
1946 /* Now determine the offset from which to start transmitting data */
1947 sb_mb = mp_so->so_snd.sb_mb;
1948 sb_cc = mp_so->so_snd.sb_cc;
1949 if (sb_mb == NULL) {
1950 MPT_UNLOCK(mp_tp);
1951 goto out;
1952 }
1953 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1954 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
fe8ab488 1955 sb_cc -= (size_t)off;
39236c6e
A
1956 } else {
1957 MPT_UNLOCK(mp_tp);
1958 goto out;
1959 }
1960 MPT_UNLOCK(mp_tp);
1961
1962 mpt_mbuf = sb_mb;
39236c6e
A
1963
1964 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
fe8ab488 1965 (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
39236c6e
A
1966 off -= mpt_mbuf->m_pkthdr.mp_rlen;
1967 mpt_mbuf = mpt_mbuf->m_next;
39236c6e 1968 }
3e170ce0
A
1969 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
1970 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid = %d "
1971 "snduna = %llu sndnxt = %llu probe %d\n",
1972 __func__, mpts->mpts_connid,
1973 mp_tp->mpt_snduna, mpts->mpts_sndnxt,
1974 mpts->mpts_probecnt),
1975 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e 1976
ecc0ceb4 1977 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 1978
fe8ab488
A
1979 head = tail = NULL;
1980
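/*
 * Build the chain to hand to the subflow: each segment is copied
 * with m_copym_mode(M_COPYM_MUST_COPY_HDR) so it gets its own
 * pkthdr, the DSN mapping fields are stamped on the copy, and the
 * copies are linked head..tail for a single sock_sendmbuf() call.
 */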
39236c6e
A
1981 while (tot_sent < sb_cc) {
1982 struct mbuf *m;
fe8ab488 1983 size_t mlen;
39236c6e
A
1984
1985 mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1986 mlen -= off;
1987 if (mlen == 0)
1988 goto out;
1989
1990 if (mlen > sb_cc) {
1991 panic("%s: unexpected %lu %lu \n", __func__,
1992 mlen, sb_cc);
1993 }
1994
fe8ab488
A
1995 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
1996 M_COPYM_MUST_COPY_HDR);
39236c6e
A
1997 if (m == NULL) {
1998 error = ENOBUFS;
1999 break;
2000 }
2001
2002 /* Create a DSN mapping for the data (m_copym does it) */
2003 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
fe8ab488 2004 VERIFY(m->m_flags & M_PKTHDR);
39236c6e
A
2005 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2006 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
2007 m->m_pkthdr.mp_dsn = mpt_dsn + off;
2008 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
2009 m->m_pkthdr.mp_rlen = mlen;
2010 mpts->mpts_rel_seq += mlen;
2011 m->m_pkthdr.len = mlen;
2012
fe8ab488
A
2013 if (head == NULL) {
2014 head = tail = m;
2015 } else {
2016 tail->m_next = m;
2017 tail = m;
2018 }
2019
fe8ab488
A
2020 tot_sent += mlen;
2021 off = 0;
2022 mpt_mbuf = mpt_mbuf->m_next;
2023 }
2024
2025 if (head != NULL) {
490019cf 2026 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
fe8ab488 2027
490019cf
A
2028 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
2029 (tp->t_tfo_stats == 0)) {
39037602 2030 tp->t_mpflags |= TMPF_TFO_REQUEST;
490019cf 2031 } else if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
fe8ab488
A
2032 tp->t_mpflags |= TMPF_FASTJOIN_SEND;
2033 }
2034
2035 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2036
39037602 2037 DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
39236c6e
A
2038 struct sockbuf *, &so->so_rcv,
2039 struct sockbuf *, &so->so_snd,
2040 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488 2041 size_t, tot_sent);
490019cf
A
2042 } else if (tcp_zero_len_write == 1) {
2043zero_len_write:
2044 socket_lock(so, 1);
2045 /* Opting to call pru_send as no mbuf at subflow level */
2046 error = (*so->so_proto->pr_usrreqs->pru_send)
2047 (so, 0, NULL, NULL, NULL, current_proc());
2048 socket_unlock(so, 1);
fe8ab488
A
2049 }
2050
490019cf 2051 if ((error == 0) || (error == EWOULDBLOCK)) {
fe8ab488 2052 mpts->mpts_sndnxt += tot_sent;
3e170ce0
A
2053
2054 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2055 tcpstat.tcps_mp_num_probes++;
2056 if (tot_sent < mpts->mpts_maxseg)
2057 mpts->mpts_probecnt += 1;
2058 else
2059 mpts->mpts_probecnt +=
2060 tot_sent/mpts->mpts_maxseg;
2061 }
2062
39236c6e 2063 MPT_LOCK(mp_tp);
3e170ce0 2064
39236c6e
A
2065 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
2066 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
2067 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2068 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2069 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
2070 }
fe8ab488 2071 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
39236c6e 2072 MPT_UNLOCK(mp_tp);
fe8ab488 2073
490019cf
A
2074 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2075 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2076
fe8ab488
A
2077 /* Send once in SYN_SENT state to avoid sending SYN spam */
2078 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
490019cf 2079 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
fe8ab488 2080 mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
39236c6e 2081 }
39236c6e 2082
3e170ce0
A
2083 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2084 (mpts->mpts_probesoon != 0))
2085 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid %d "
2086 "wrote %d %d probe %d probedelta %d\n",
fe8ab488 2087 __func__, mpts->mpts_connid, (int)tot_sent,
3e170ce0
A
2088 (int) sb_cc, mpts->mpts_probecnt,
2089 (tcp_now - mpts->mpts_probesoon)),
2090 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2091 } else {
3e170ce0
A
2092 mptcplog((LOG_ERR, "MPTCP Sender: %s cid %d error %d len %zd\n",
2093 __func__, mpts->mpts_connid, error, tot_sent),
2094 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2095 }
2096out:
39037602
A
2097 if (wakeup)
2098 sowwakeup(mp_so);
2099
39236c6e
A
2100 return (error);
2101}
2102
2103/*
2104 * Subflow socket control event upcall.
2105 *
2106 * Called when the associated subflow socket posted one or more control events.
2107 * The subflow socket lock has been released prior to invoking the callback.
2108 * Note that the upcall may occur synchronously as a result of MPTCP performing
2109 * an action on it, or asynchronously as a result of an event happening at the
2110 * subflow layer. Therefore, to maintain lock ordering, the only lock that can
2111 * be acquired here is the thread lock, for signalling purposes.
2112 */
2113static void
2114mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
2115{
2116#pragma unused(so)
2117 struct mptsub *mpts = arg;
2118 struct mptses *mpte = mpts->mpts_mpte;
2119
2120 VERIFY(mpte != NULL);
2121
2122 lck_mtx_lock(&mpte->mpte_thread_lock);
2123 atomic_bitset_32(&mpts->mpts_evctl, events);
2124 mptcp_thread_signal_locked(mpte);
2125 lck_mtx_unlock(&mpte->mpte_thread_lock);
2126}
2127
2128/*
2129 * Subflow socket control events.
2130 *
2131 * Called for handling events related to the underlying subflow socket.
2132 */
2133static ev_ret_t
3e170ce0
A
2134mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
2135 uint64_t *p_mpsofilt_hint)
39236c6e 2136{
fe8ab488 2137 uint32_t events, save_events;
39236c6e 2138 ev_ret_t ret = MPTS_EVRET_OK;
3e170ce0
A
2139 int i = 0;
2140 int mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl)/
2141 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e
A
2142 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2143 MPTS_LOCK_ASSERT_HELD(mpts);
2144
2145 /* bail if there's nothing to process */
2146 if ((events = mpts->mpts_evctl) == 0)
2147 return (ret);
2148
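/*
 * Any error-type hint (reset, timeout, source address loss, etc.)
 * also schedules a failover evaluation by OR-ing in
 * SO_FILT_HINT_MPFAILOVER below.
 */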
2149 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
2150 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
2151 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
2152 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
2153 SO_FILT_HINT_DISCONNECTED)) {
2154 events |= SO_FILT_HINT_MPFAILOVER;
2155 }
2156
fe8ab488
A
2157 save_events = events;
2158
39236c6e
A
2159 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
2160 struct mptsub *, mpts, uint32_t, events);
2161
3e170ce0
A
2162 mptcplog((LOG_DEBUG, "MPTCP Events: %s cid %d events=%b\n", __func__,
2163 mpts->mpts_connid, events, SO_FILT_HINT_BITS),
2164 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
2165
2166 /*
2167 * Process all the socket filter hints and reset the hint
2168 * once it is handled
2169 */
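/*
 * Each mpsub_ev_entry_tbl entry pairs a SO_FILT_HINT_* mask with its
 * handler. Handled bits are cleared from 'events' as we go; the
 * aggregate return value keeps the most consequential (numerically
 * largest) non-error result, and SO_FILT_HINT_DISCONNECTED is run
 * even after an error so that the application gets woken up.
 */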
2170 for (i = 0; (i < mpsub_ev_entry_count) && events; i++) {
490019cf
A
2171 /*
2172 * Always execute the DISCONNECTED event, because it will wakeup
2173 * the app.
2174 */
3e170ce0 2175 if ((events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
490019cf
A
2176 (ret >= MPTS_EVRET_OK ||
2177 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3e170ce0
A
2178 ev_ret_t error =
2179 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint);
2180 events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
2181 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2182 }
fe8ab488
A
2183 }
2184
39236c6e
A
2185 /*
2186 * We should be getting only events specified via sock_catchevents(),
2187 * so loudly complain if we have any unprocessed one(s).
2188 */
2189 if (events != 0 || ret < MPTS_EVRET_OK) {
3e170ce0 2190 mptcplog((LOG_ERR, "MPTCP Events %s%s: cid %d evret %s (%d)"
39236c6e 2191 " unhandled events=%b\n",
39037602 2192 (events != 0) && (ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
39236c6e 2193 __func__, mpts->mpts_connid,
3e170ce0
A
2194 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS),
2195 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2196 }
2197
2198 /* clear the ones we've processed */
fe8ab488 2199 atomic_bitclear_32(&mpts->mpts_evctl, save_events);
39236c6e
A
2200 return (ret);
2201}
2202
2203/*
2204 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
2205 */
2206static ev_ret_t
3e170ce0
A
2207mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts,
2208 uint64_t *p_mpsofilt_hint)
39236c6e
A
2209{
2210 struct socket *mp_so, *so;
2211 struct mptcb *mp_tp;
2212 boolean_t linger;
2213
2214 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2215 MPTS_LOCK_ASSERT_HELD(mpts);
2216 VERIFY(mpte->mpte_mppcb != NULL);
2217 mp_so = mpte->mpte_mppcb->mpp_socket;
2218 mp_tp = mpte->mpte_mptcb;
2219 so = mpts->mpts_socket;
2220
2221 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2222 !(mp_so->so_flags & SOF_PCBCLEARING));
2223
3e170ce0
A
2224 mptcplog((LOG_DEBUG, "MPTCP Events: "
2225 "%s: cid %d [linger %s]\n", __func__,
2226 mpts->mpts_connid, (linger ? "YES" : "NO")),
2227 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2228
39236c6e
A
2229 /*
2230 * We got a TCP RST for this subflow connection.
2231 *
2232 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
fe8ab488
A
2233 * client if the MPTCP connection has not been established or
2234 * if the connection has only one subflow and is a connection being
2235 * resumed. Otherwise we close the socket.
39236c6e
A
2236 */
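/*
 * Below: ECONNREFUSED is reported when the reset arrives before the
 * MPTCP connection is established; ECONNRESET (plus a CONNRESET
 * hint) when no MP-capable subflows remain or the active subflow of
 * a fallen-back connection is reset.
 */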
2237 mptcp_subflow_disconnect(mpte, mpts, !linger);
2238
2239 MPT_LOCK(mp_tp);
2240 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
fe8ab488 2241 mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
39037602
A
2242 } else if (mpte->mpte_nummpcapflows < 1 ||
2243 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) &&
2244 (mpts->mpts_flags & MPTSF_ACTIVE))) {
fe8ab488 2245 mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
3e170ce0 2246 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET;
39236c6e
A
2247 }
2248 MPT_UNLOCK(mp_tp);
2249
2250 /*
2251 * Keep the subflow socket around, unless the MPTCP socket has
2252 * been detached or the subflow has been disconnected explicitly,
2253 * in which case it should be deleted right away.
2254 */
2255 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2256}
2257
2258/*
2259 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2260 */
2261static ev_ret_t
3e170ce0
A
2262mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2263 uint64_t *p_mpsofilt_hint)
39236c6e 2264{
39037602 2265 struct mptcb *mp_tp;
39236c6e
A
2266 struct socket *so;
2267
2268 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2269 MPTS_LOCK_ASSERT_HELD(mpts);
2270
39037602 2271 mp_tp = mpte->mpte_mptcb;
39236c6e
A
2272 so = mpts->mpts_socket;
2273
3e170ce0
A
2274 mptcplog((LOG_DEBUG, "MPTCP Events: "
2275 "%s: cid %d\n", __func__, mpts->mpts_connid),
2276 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2277
2278 /*
39037602
A
2279 * A FIN on a fallen-back MPTCP connection should be treated like a
2280 * DATA_FIN.
2281 */
2282 MPT_LOCK(mp_tp);
2283 if ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) &&
2284 (mpts->mpts_flags & MPTSF_ACTIVE)) {
2285 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
2286 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
2287 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
2288 }
2289 }
2290 MPT_UNLOCK(mp_tp);
39236c6e
A
2291
2292 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2293}
2294
2295/*
2296 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2297 */
2298static ev_ret_t
3e170ce0
A
2299mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts,
2300 uint64_t *p_mpsofilt_hint)
39236c6e 2301{
3e170ce0 2302#pragma unused(p_mpsofilt_hint)
39236c6e
A
2303 struct socket *so;
2304
2305 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2306 MPTS_LOCK_ASSERT_HELD(mpts);
2307
2308 so = mpts->mpts_socket;
2309
3e170ce0
A
2310 mptcplog((LOG_DEBUG, "MPTCP Events: "
2311 "%s: cid %d\n", __func__, mpts->mpts_connid),
2312 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2313
39236c6e
A
2314 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2315}
2316
2317/*
2318 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2319 */
2320static ev_ret_t
3e170ce0
A
2321mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts,
2322 uint64_t *p_mpsofilt_hint)
39236c6e 2323{
3e170ce0 2324#pragma unused(p_mpsofilt_hint)
39236c6e
A
2325 struct socket *mp_so, *so;
2326 struct mptcb *mp_tp;
2327 boolean_t linger;
2328
2329 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2330 MPTS_LOCK_ASSERT_HELD(mpts);
2331 VERIFY(mpte->mpte_mppcb != NULL);
2332 mp_so = mpte->mpte_mppcb->mpp_socket;
2333 mp_tp = mpte->mpte_mptcb;
2334 so = mpts->mpts_socket;
2335
2336 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2337 !(mp_so->so_flags & SOF_PCBCLEARING));
2338
3e170ce0
A
2339 mptcplog((LOG_NOTICE, "MPTCP Events: "
2340 "%s: cid %d [linger %s]\n", __func__,
2341 mpts->mpts_connid, (linger ? "YES" : "NO")),
2342 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2343
2344 if (mpts->mpts_soerror == 0)
2345 mpts->mpts_soerror = ETIMEDOUT;
2346
2347 /*
2348 * The subflow connection has timed out.
2349 *
2350 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2351 * client if the MPTCP connection has not been established. Otherwise
2352 * drop it.
2353 */
2354 mptcp_subflow_disconnect(mpte, mpts, !linger);
2355
2356 MPT_LOCK(mp_tp);
2357 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2358 mp_so->so_error = ETIMEDOUT;
2359 }
2360 MPT_UNLOCK(mp_tp);
2361
2362 /*
2363 * Keep the subflow socket around, unless the MPTCP socket has
2364 * been detached or the subflow has been disconnected explicitly,
2365 * in which case it should be deleted right away.
2366 */
2367 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2368}
2369
2370/*
2371 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2372 */
2373static ev_ret_t
3e170ce0
A
2374mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
2375 uint64_t *p_mpsofilt_hint)
39236c6e 2376{
3e170ce0 2377#pragma unused(p_mpsofilt_hint)
39236c6e
A
2378 struct socket *mp_so, *so;
2379 struct mptcb *mp_tp;
2380 boolean_t linger;
2381 struct tcpcb *tp = NULL;
2382
2383 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2384 MPTS_LOCK_ASSERT_HELD(mpts);
2385
2386 VERIFY(mpte->mpte_mppcb != NULL);
2387 mp_so = mpte->mpte_mppcb->mpp_socket;
2388 mp_tp = mpte->mpte_mptcb;
2389 so = mpts->mpts_socket;
2390
2391 /* Not grabbing socket lock as t_local_aid is write once only */
2392 tp = intotcpcb(sotoinpcb(so));
2393 /*
2394 * This overwrites any previous mpte_lost_aid to avoid storing
2395 * too much state when the typical case has only two subflows.
2396 */
2397 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2398 mpte->mpte_lost_aid = tp->t_local_aid;
2399
2400 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2401 !(mp_so->so_flags & SOF_PCBCLEARING));
2402
3e170ce0
A
2403 mptcplog((LOG_DEBUG, "MPTCP Events: "
2404 "%s cid %d [linger %s]\n", __func__,
2405 mpts->mpts_connid, (linger ? "YES" : "NO")),
2406 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2407
2408 if (mpts->mpts_soerror == 0)
2409 mpts->mpts_soerror = EADDRNOTAVAIL;
2410
2411 /*
2412 * The subflow connection has lost its source address.
2413 *
2414 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2415 * client if the MPTCP connection has not been established. If it
2416 * has been established with one subflow, we keep the MPTCP
2417 * connection valid without any subflows until the application closes it.
2418 * This lets the TCP connection manager decide whether to close it,
2419 * since it reacts to reachability changes as well.
2420 */
2421 mptcp_subflow_disconnect(mpte, mpts, !linger);
2422
2423 MPT_LOCK(mp_tp);
2424 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2425 (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2426 mp_so->so_error = EADDRNOTAVAIL;
2427 }
2428 MPT_UNLOCK(mp_tp);
2429
2430 /*
2431 * Keep the subflow socket around, unless the MPTCP socket has
2432 * been detached or the subflow has been disconnected explicitly,
2433 * in which case it should be deleted right away.
2434 */
2435 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2436}
2437
fe8ab488
A
2438/*
2439 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
2440 * indicates that the remote side sent a Data FIN
2441 */
2442static ev_ret_t
3e170ce0
A
2443mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2444 uint64_t *p_mpsofilt_hint)
fe8ab488
A
2445{
2446 struct socket *so, *mp_so;
2447 struct mptcb *mp_tp;
2448
2449 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2450 MPTS_LOCK_ASSERT_HELD(mpts);
2451 mp_so = mpte->mpte_mppcb->mpp_socket;
2452 so = mpts->mpts_socket;
2453 mp_tp = mpte->mpte_mptcb;
2454
3e170ce0
A
2455 mptcplog((LOG_DEBUG, "MPTCP Events: "
2456 "%s: cid %d\n", __func__, mpts->mpts_connid),
2457 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39037602 2458
fe8ab488 2459 /*
39037602 2460 * We got a Data FIN for the MPTCP connection.
fe8ab488
A
2461 * The FIN may arrive with data. The data is handed up to the
2462 * mptcp socket and the user is notified so that it may close
2463 * the socket if needed.
2464 */
2465 MPT_LOCK(mp_tp);
39037602 2466 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
3e170ce0 2467 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
39037602 2468
fe8ab488
A
2469 MPT_UNLOCK(mp_tp);
2470 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2471}
2472
39236c6e
A
2473/*
2474 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2475 */
2476static ev_ret_t
3e170ce0
A
2477mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
2478 uint64_t *p_mpsofilt_hint)
39236c6e
A
2479{
2480 struct mptsub *mpts_alt = NULL;
2481 struct socket *so = NULL;
2482 struct socket *mp_so;
2483 int altpath_exists = 0;
2484
2485 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2486 MPTS_LOCK_ASSERT_HELD(mpts);
2487 mp_so = mpte->mpte_mppcb->mpp_socket;
3e170ce0
A
2488 mptcplog((LOG_NOTICE, "MPTCP Events: "
2489 "%s: mp_so 0x%llx\n", __func__,
2490 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2491 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2492
2493 MPTS_UNLOCK(mpts);
3e170ce0 2494 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e
A
2495
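/*
 * mptcp_get_subflow() above selects an eligible alternate subflow to
 * fail over to. If none exists and delayed subflow start is enabled
 * (mptcp_delayed_subf_start), a pending subflow is connected via
 * mptcp_subflow_soconnectx() instead.
 */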
2496 /*
2497 * If there is no alternate eligible subflow, ignore the
2498 * failover hint.
2499 */
2500 if (mpts_alt == NULL) {
3e170ce0
A
2501 mptcplog((LOG_WARNING, "MPTCP Events: "
2502 "%s: no alternate path\n", __func__),
2503 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2504
fe8ab488
A
2505 if (mptcp_delayed_subf_start) {
2506 mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
2507 if (mpts_alt != NULL) {
2508 MPTS_LOCK(mpts_alt);
2509 (void) mptcp_subflow_soconnectx(mpte,
2510 mpts_alt);
39037602 2511 MPTS_UNLOCK(mpts_alt);
fe8ab488
A
2512 }
2513 }
39236c6e
A
2514 MPTS_LOCK(mpts);
2515 goto done;
2516 }
2517 MPTS_LOCK(mpts_alt);
2518 altpath_exists = 1;
2519 so = mpts_alt->mpts_socket;
2520 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2521 socket_lock(so, 1);
fe8ab488
A
2522 /* All data acknowledged and no RTT spike */
2523 if ((so->so_snd.sb_cc == 0) &&
2524 (mptcp_no_rto_spike(so))) {
39236c6e
A
2525 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2526 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2527 } else {
2528 /* no alternate path available */
2529 altpath_exists = 0;
2530 }
2531 socket_unlock(so, 1);
2532 }
2533 if (altpath_exists) {
3e170ce0
A
2534 mptcplog((LOG_INFO, "MPTCP Events: "
2535 "%s: cid = %d\n",
2536 __func__, mpts_alt->mpts_connid),
2537 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2538 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3e170ce0 2539 mpts_alt->mpts_peerswitch = 0;
39236c6e
A
2540 struct mptcb *mp_tp = mpte->mpte_mptcb;
2541 /* Bring the subflow's notion of snd_nxt into the send window */
2542 MPT_LOCK(mp_tp);
2543 mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2544 MPT_UNLOCK(mp_tp);
2545 mpte->mpte_active_sub = mpts_alt;
2546 socket_lock(so, 1);
2547 sowwakeup(so);
2548 socket_unlock(so, 1);
2549 }
2550 MPTS_UNLOCK(mpts_alt);
2551
2552 if (altpath_exists) {
3e170ce0
A
2553 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
2554 mptcplog((LOG_NOTICE, "MPTCP Events: "
2555 "%s: mp_so 0x%llx switched from "
39236c6e
A
2556 "%d to %d\n", __func__,
2557 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
2558 mpts->mpts_connid, mpts_alt->mpts_connid),
2559 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2560 tcpstat.tcps_mp_switches++;
2561 }
2562
2563 MPTS_LOCK(mpts);
2564 if (altpath_exists) {
2565 mpts->mpts_flags |= MPTSF_FAILINGOVER;
2566 mpts->mpts_flags &= ~MPTSF_ACTIVE;
2567 } else {
3e170ce0
A
2568 mptcplog((LOG_DEBUG, "MPTCP Events %s: no alt cid = %d\n",
2569 __func__, mpts->mpts_connid),
2570 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2571done:
39236c6e
A
2572 so = mpts->mpts_socket;
2573 socket_lock(so, 1);
2574 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2575 socket_unlock(so, 1);
2576 }
39236c6e
A
2577 MPTS_LOCK_ASSERT_HELD(mpts);
2578 return (MPTS_EVRET_OK);
2579}
2580
2581/*
2582 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2583 */
2584static ev_ret_t
3e170ce0
A
2585mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
2586 uint64_t *p_mpsofilt_hint)
39236c6e
A
2587{
2588 struct socket *mp_so, *so;
2589 struct mptcb *mp_tp;
2590 boolean_t linger;
2591
2592 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2593 MPTS_LOCK_ASSERT_HELD(mpts);
2594 VERIFY(mpte->mpte_mppcb != NULL);
2595 mp_so = mpte->mpte_mppcb->mpp_socket;
2596 mp_tp = mpte->mpte_mptcb;
2597 so = mpts->mpts_socket;
2598
2599 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2600 !(mp_so->so_flags & SOF_PCBCLEARING));
2601
3e170ce0
A
2602 mptcplog((LOG_DEBUG, "MPTCP Events: "
2603 "%s: cid %d [linger %s]\n", __func__,
2604 mpts->mpts_connid, (linger ? "YES" : "NO")),
2605 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2606
2607 if (mpts->mpts_soerror == 0)
2608 mpts->mpts_soerror = EHOSTUNREACH;
2609
2610 /*
2611 * The subflow connection cannot use the outgoing interface.
2612 *
2613 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2614 * client if the MPTCP connection has not been established. If it
2615 * has been established, let the upper layer call disconnectx.
2616 */
2617 mptcp_subflow_disconnect(mpte, mpts, !linger);
3e170ce0 2618 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED;
39236c6e
A
2619
2620 MPT_LOCK(mp_tp);
2621 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2622 mp_so->so_error = EHOSTUNREACH;
2623 }
2624 MPT_UNLOCK(mp_tp);
2625
39236c6e
A
2626 /*
2627 * Keep the subflow socket around, unless the MPTCP socket has
2628 * been detached or the subflow has been disconnected explicitly,
2629 * in which case it should be deleted right away.
2630 */
2631 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2632}
2633
2634/*
2635 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2636 */
2637static ev_ret_t
3e170ce0
A
2638mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts,
2639 uint64_t *p_mpsofilt_hint)
39236c6e 2640{
3e170ce0 2641#pragma unused(p_mpsofilt_hint)
39236c6e
A
2642 struct socket *so;
2643
2644 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2645 MPTS_LOCK_ASSERT_HELD(mpts);
2646
2647 so = mpts->mpts_socket;
2648
2649 /* the subflow connection is being flow controlled */
2650 mpts->mpts_flags |= MPTSF_SUSPENDED;
2651
3e170ce0
A
2652 mptcplog((LOG_DEBUG, "MPTCP Events: "
2653 "%s: cid %d\n", __func__,
2654 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2655
2656 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2657}
2658
2659/*
2660 * Handle SO_FILT_HINT_RESUME subflow socket event.
2661 */
2662static ev_ret_t
3e170ce0
A
2663mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts,
2664 uint64_t *p_mpsofilt_hint)
39236c6e 2665{
3e170ce0 2666#pragma unused(p_mpsofilt_hint)
39236c6e
A
2667 struct socket *so;
2668
2669 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2670 MPTS_LOCK_ASSERT_HELD(mpts);
2671
2672 so = mpts->mpts_socket;
2673
2674 /* the subflow connection is no longer flow controlled */
2675 mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2676
3e170ce0
A
2677 mptcplog((LOG_DEBUG, "MPTCP Events: "
2678 "%s: cid %d\n", __func__, mpts->mpts_connid),
2679 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2680
2681 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2682}
2683
2684/*
2685 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2686 */
2687static ev_ret_t
3e170ce0
A
2688mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
2689 uint64_t *p_mpsofilt_hint)
39236c6e
A
2690{
2691 char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
39236c6e
A
2692 struct sockaddr_storage src;
2693 struct socket *mp_so, *so;
2694 struct mptcb *mp_tp;
2695 struct ifnet *outifp;
2696 int af, error = 0;
2697 boolean_t mpok = FALSE;
3e170ce0
A
2698 boolean_t cell = FALSE;
2699 boolean_t wifi = FALSE;
2700 boolean_t wired = FALSE;
39236c6e
A
2701
2702 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2703 VERIFY(mpte->mpte_mppcb != NULL);
2704 mp_so = mpte->mpte_mppcb->mpp_socket;
2705 mp_tp = mpte->mpte_mptcb;
2706
2707 MPTS_LOCK_ASSERT_HELD(mpts);
2708 so = mpts->mpts_socket;
2709 af = mpts->mpts_family;
2710
2711 if (mpts->mpts_flags & MPTSF_CONNECTED)
2712 return (MPTS_EVRET_OK);
2713
2714 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2715 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
490019cf 2716 socket_lock(so, 0);
fe8ab488
A
2717 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2718 (so->so_state & SS_ISCONNECTED)) {
3e170ce0
A
2719 mptcplog((LOG_DEBUG, "MPTCP Events: "
2720 "%s: cid %d disconnect before tcp connect\n",
2721 __func__, mpts->mpts_connid),
2722 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
2723 (void) soshutdownlock(so, SHUT_RD);
2724 (void) soshutdownlock(so, SHUT_WR);
2725 (void) sodisconnectlocked(so);
2726 }
2727 socket_unlock(so, 0);
39236c6e
A
2728 return (MPTS_EVRET_OK);
2729 }
2730
2731 /*
2732 * The subflow connection has been connected. Find out whether it
2733 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
2734 *
2735 * a. If MPTCP connection is not yet established, then this must be
2736 * the first subflow connection. If MPTCP failed to negotiate,
2737 * indicate to the MPTCP socket client via EPROTO, that the
2738 * underlying TCP connection may be peeled off via peeloff(2).
2739 * Otherwise, mark the MPTCP socket as connected.
2740 *
2741 * b. If MPTCP connection has been established, then this must be
2742 * one of the subsequent subflow connections. If MPTCP failed
2743 * to negotiate, disconnect the connection since peeloff(2)
2744 * is no longer possible.
2745 *
2746 * Right now, we simply unblock any waiters at the MPTCP socket layer
2747 * if the MPTCP connection has not been established.
2748 */
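/*
 * Whether the subflow actually negotiated MPTCP is captured in mpok
 * further below (MPTSF_MP_CAPABLE, derived from TMPF_MPTCP_TRUE on
 * the subflow tcpcb); that is what selects between cases (a) and (b).
 */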
2749 socket_lock(so, 0);
2750
2751 if (so->so_state & SS_ISDISCONNECTED) {
2752 /*
2753 * With MPTCP joins, a connection is connected at the subflow
2754 * level, but the 4th ACK from the server elevates the MPTCP
490019cf
A
2755 * subflow to connected state. So there is a small window
2756 * where the subflow could get disconnected before the
39236c6e
A
2757 * connected event is processed.
2758 */
2759 socket_unlock(so, 0);
2760 return (MPTS_EVRET_OK);
2761 }
2762
2763 mpts->mpts_soerror = 0;
2764 mpts->mpts_flags &= ~MPTSF_CONNECTING;
2765 mpts->mpts_flags |= MPTSF_CONNECTED;
490019cf
A
2766
2767 if (!(so->so_flags1 & SOF1_DATA_IDEMPOTENT))
2768 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
2769
2770 struct tcpcb *tp = sototcpcb(so);
2771 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
39236c6e
A
2772 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2773
490019cf
A
2774 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
2775
813fb2f6 2776 VERIFY(mpts->mpts_dst != NULL);
39236c6e 2777
813fb2f6 2778 VERIFY(mpts->mpts_src != NULL);
39236c6e
A
2779
2780 /* get/check source IP address */
2781 switch (af) {
2782 case AF_INET: {
2783 error = in_getsockaddr_s(so, &src);
2784 if (error == 0) {
813fb2f6 2785 struct sockaddr_in *ms = SIN(mpts->mpts_src);
39236c6e
A
2786 struct sockaddr_in *s = SIN(&src);
2787
2788 VERIFY(s->sin_len == ms->sin_len);
2789 VERIFY(ms->sin_family == AF_INET);
2790
2791 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2792 bcmp(&ms->sin_addr, &s->sin_addr,
2793 sizeof (ms->sin_addr)) != 0) {
3e170ce0
A
2794 mptcplog((LOG_ERR, "MPTCP Events: "
2795 "%s: cid %d local "
39236c6e
A
2796 "address %s (expected %s)\n", __func__,
2797 mpts->mpts_connid, inet_ntop(AF_INET,
2798 (void *)&s->sin_addr.s_addr, buf0,
2799 sizeof (buf0)), inet_ntop(AF_INET,
2800 (void *)&ms->sin_addr.s_addr, buf1,
3e170ce0
A
2801 sizeof (buf1))),
2802 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2803 }
2804 bcopy(s, ms, sizeof (*s));
2805 }
2806 break;
2807 }
2808#if INET6
2809 case AF_INET6: {
2810 error = in6_getsockaddr_s(so, &src);
2811 if (error == 0) {
813fb2f6 2812 struct sockaddr_in6 *ms = SIN6(mpts->mpts_src);
39236c6e
A
2813 struct sockaddr_in6 *s = SIN6(&src);
2814
2815 VERIFY(s->sin6_len == ms->sin6_len);
2816 VERIFY(ms->sin6_family == AF_INET6);
2817
2818 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2819 bcmp(&ms->sin6_addr, &s->sin6_addr,
2820 sizeof (ms->sin6_addr)) != 0) {
3e170ce0
A
2821 mptcplog((LOG_ERR, "MPTCP Events: "
2822 "%s: cid %d local "
39236c6e
A
2823 "address %s (expected %s)\n", __func__,
2824 mpts->mpts_connid, inet_ntop(AF_INET6,
2825 (void *)&s->sin6_addr, buf0,
2826 sizeof (buf0)), inet_ntop(AF_INET6,
2827 (void *)&ms->sin6_addr, buf1,
3e170ce0
A
2828 sizeof (buf1))),
2829 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2830 }
2831 bcopy(s, ms, sizeof (*s));
2832 }
2833 break;
2834 }
2835#endif /* INET6 */
2836 default:
2837 VERIFY(0);
2838 /* NOTREACHED */
2839 }
2840
2841 if (error != 0) {
3e170ce0
A
2842 mptcplog((LOG_ERR, "MPTCP Events "
2843 "%s: cid %d getsockaddr failed (%d)\n",
2844 __func__, mpts->mpts_connid, error),
2845 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2846 }
2847
2848 /* get/verify the outbound interface */
2849 outifp = sotoinpcb(so)->inp_last_outifp; /* could be NULL */
2850 if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2851 VERIFY(mpts->mpts_outif != NULL);
2852 if (mpts->mpts_outif != outifp) {
3e170ce0 2853 mptcplog((LOG_ERR, "MPTCP Events: %s: cid %d outif %s "
39236c6e
A
2854 "(expected %s)\n", __func__, mpts->mpts_connid,
2855 ((outifp != NULL) ? outifp->if_xname : "NULL"),
3e170ce0
A
2856 mpts->mpts_outif->if_xname),
2857 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2858
39236c6e
A
2859 if (outifp == NULL)
2860 outifp = mpts->mpts_outif;
2861 }
2862 } else {
2863 mpts->mpts_outif = outifp;
2864 }
2865
3e170ce0
A
2866 mpts->mpts_srtt = (intotcpcb(sotoinpcb(so)))->t_srtt;
2867 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(so)))->t_rxtcur;
2868 mpts->mpts_maxseg = (intotcpcb(sotoinpcb(so)))->t_maxseg;
2869
2870 cell = IFNET_IS_CELLULAR(mpts->mpts_outif);
2871 wifi = (!cell && IFNET_IS_WIFI(mpts->mpts_outif));
2872 wired = (!wifi && IFNET_IS_WIRED(mpts->mpts_outif));
2873
2874 if (cell)
2875 mpts->mpts_linktype |= MPTSL_CELL;
2876 else if (wifi)
2877 mpts->mpts_linktype |= MPTSL_WIFI;
2878 else if (wired)
2879 mpts->mpts_linktype |= MPTSL_WIRED;
2880
39236c6e
A
2881 socket_unlock(so, 0);
2882
3e170ce0
A
2883 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: cid %d "
2884 "establishment srtt %d \n", __func__,
2885 mpts->mpts_connid, (mpts->mpts_srtt >> 5)),
2886 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
2887
2888
2889 mptcplog((LOG_DEBUG, "MPTCP Socket: "
2890 "%s: cid %d outif %s %s[%d] -> %s[%d] "
39236c6e
A
2891 "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2892 outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
813fb2f6
A
2893 (void *)&SIN(mpts->mpts_src)->sin_addr.s_addr :
2894 (void *)&SIN6(mpts->mpts_src)->sin6_addr, buf0, sizeof (buf0)),
2895 ((af == AF_INET) ? ntohs(SIN(mpts->mpts_src)->sin_port) :
2896 ntohs(SIN6(mpts->mpts_src)->sin6_port)),
39236c6e 2897 inet_ntop(af, ((af == AF_INET) ?
813fb2f6
A
2898 (void *)&SIN(mpts->mpts_dst)->sin_addr.s_addr :
2899 (void *)&SIN6(mpts->mpts_dst)->sin6_addr), buf1, sizeof (buf1)),
2900 ((af == AF_INET) ? ntohs(SIN(mpts->mpts_dst)->sin_port) :
2901 ntohs(SIN6(mpts->mpts_dst)->sin6_port)),
39236c6e 2902 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
3e170ce0
A
2903 "MPTCP capable" : "a regular TCP")),
2904 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
2905
2906 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2907 MPTS_UNLOCK(mpts);
2908
3e170ce0 2909 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
2910
2911 MPT_LOCK(mp_tp);
2912 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2913 /* case (a) above */
2914 if (!mpok) {
2915 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2916 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2917 MPT_UNLOCK(mp_tp);
2918 } else {
490019cf
A
2919 MPT_UNLOCK(mp_tp);
2920 mptcplog((LOG_DEBUG, "MPTCP State: "
2921 "MPTCPS_ESTABLISHED for mp_so 0x%llx \n",
2922 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2923 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
2924 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2925 mpte->mpte_associd = mpts->mpts_connid;
39037602
A
2926 DTRACE_MPTCP2(state__change,
2927 struct mptcb *, mp_tp,
490019cf
A
2928 uint32_t, 0 /* event */);
2929
39037602
A
2930 if (mpts->mpts_outif &&
2931 IFNET_IS_EXPENSIVE(mpts->mpts_outif)) {
2932 sototcpcb(so)->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
2933 } else {
2934 mpts->mpts_flags |= MPTSF_PREFERRED;
2935 }
813fb2f6 2936 mpts->mpts_flags |= MPTSF_ACTIVE;
490019cf 2937 soisconnected(mp_so);
39236c6e
A
2938 }
2939 MPTS_LOCK(mpts);
2940 if (mpok) {
39236c6e
A
2941 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2942 mpte->mpte_nummpcapflows++;
2943 MPT_LOCK_SPIN(mp_tp);
490019cf
A
2944 /* With TFO, sndnxt may be initialized earlier */
2945 if (mpts->mpts_sndnxt == 0)
2946 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
39236c6e
A
2947 MPT_UNLOCK(mp_tp);
2948 }
2949 } else if (mpok) {
2950 MPT_UNLOCK(mp_tp);
fe8ab488
A
2951 if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
2952 /* Experimental code, disabled by default. */
2953 sorwakeup(mp_so);
2954 sowwakeup(mp_so);
2955 }
39236c6e
A
2956 /*
2957 * case (b) above
2958 * In case of additional flows, the MPTCP socket is not
2959 * MPTSF_MP_CAPABLE until the final ACK of the 3-way handshake is
2960 * received from the server. TCP would have guaranteed that this
2961 * is an MPTCP subflow.
2962 */
2963 MPTS_LOCK(mpts);
2964 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
fe8ab488 2965 mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
39236c6e 2966 mpte->mpte_nummpcapflows++;
39236c6e 2967 MPT_LOCK_SPIN(mp_tp);
fe8ab488
A
2968 /* With Fastjoin, sndnxt is updated before connected_ev */
2969 if (mpts->mpts_sndnxt == 0) {
2970 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
490019cf 2971 mpts->mpts_rel_seq = 1;
39037602 2972 }
39236c6e 2973 MPT_UNLOCK(mp_tp);
fe8ab488
A
2974 mptcp_output_needed(mpte, mpts);
2975 } else {
2976 MPT_UNLOCK(mp_tp);
2977 MPTS_LOCK(mpts);
39236c6e 2978 }
fe8ab488 2979
39236c6e
A
2980 MPTS_LOCK_ASSERT_HELD(mpts);
2981
2982 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2983}
2984
2985/*
2986 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2987 */
2988static ev_ret_t
3e170ce0
A
2989mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
2990 uint64_t *p_mpsofilt_hint)
39236c6e
A
2991{
2992 struct socket *mp_so, *so;
2993 struct mptcb *mp_tp;
2994 boolean_t linger;
2995
2996 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2997 MPTS_LOCK_ASSERT_HELD(mpts);
2998 VERIFY(mpte->mpte_mppcb != NULL);
2999 mp_so = mpte->mpte_mppcb->mpp_socket;
3000 mp_tp = mpte->mpte_mptcb;
3001 so = mpts->mpts_socket;
3002
3003 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
3004 !(mp_so->so_flags & SOF_PCBCLEARING));
3005
3e170ce0
A
3006 mptcplog((LOG_DEBUG, "MPTCP Events: "
3007 "%s: cid %d [linger %s]\n", __func__,
3008 mpts->mpts_connid, (linger ? "YES" : "NO")),
3009 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3010
3011 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3012 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3013
3014 /*
3015 * Clear flags that are used by getconninfo to return state.
fe8ab488 3016 * Retain flags such as MPTSF_DELETEOK for internal purposes.
39236c6e
A
3017 */
3018 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3019 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3020 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
3021 MPTSF_SUSPENDED|MPTSF_ACTIVE);
3022 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3023
3024 /*
3025 * The subflow connection has been disconnected.
3026 *
3027 * Right now, we simply unblock any waiters at the MPTCP socket layer
3028 * if the MPTCP connection has not been established.
3029 */
3e170ce0 3030 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3031
3032 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3033 mpte->mpte_nummpcapflows--;
fe8ab488
A
3034 if (mpte->mpte_active_sub == mpts) {
3035 mpte->mpte_active_sub = NULL;
3e170ce0
A
3036 mptcplog((LOG_DEBUG, "MPTCP Events: "
3037 "%s: resetting active subflow \n",
3038 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3039 }
39236c6e
A
3040 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3041 }
3042
3043 MPT_LOCK(mp_tp);
3044 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
3045 MPT_UNLOCK(mp_tp);
3e170ce0 3046 MPTS_UNLOCK(mpts);
39236c6e 3047 soisdisconnected(mp_so);
3e170ce0 3048 MPTS_LOCK(mpts);
39236c6e
A
3049 } else {
3050 MPT_UNLOCK(mp_tp);
3051 }
3052
39236c6e
A
3053 /*
3054 * The underlying subflow socket has been disconnected;
3055 * it is no longer useful to us. Keep the subflow socket
3056 * around, unless the MPTCP socket has been detached or
3057 * the subflow has been disconnected explicitly, in which
3058 * case it should be deleted right away.
3059 */
3060 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3061}
3062
3063/*
3064 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3065 */
3066static ev_ret_t
3e170ce0
A
3067mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3068 uint64_t *p_mpsofilt_hint)
39236c6e
A
3069{
3070 struct socket *mp_so, *so;
3071 struct mptcb *mp_tp;
3e170ce0 3072 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e
A
3073
3074 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3075 VERIFY(mpte->mpte_mppcb != NULL);
3076 mp_so = mpte->mpte_mppcb->mpp_socket;
3077 mp_tp = mpte->mpte_mptcb;
3078
3079 MPTS_LOCK_ASSERT_HELD(mpts);
3080 so = mpts->mpts_socket;
3081
3082 socket_lock(so, 0);
3083 MPT_LOCK(mp_tp);
3084
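/*
 * Translate the subflow tcpcb state into subflow flags:
 * TMPF_MPTCP_TRUE <-> MPTSF_MP_CAPABLE, TMPF_TCP_FALLBACK ->
 * MPTSF_MP_DEGRADED and TMPF_MPTCP_READY <-> MPTSF_MP_READY.
 * A degraded subflow marks the whole connection as fallen back
 * to TCP.
 */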
3085 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3086 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3087 else
3088 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3089
3090 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3091 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3092 goto done;
3093 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3094 }
3095 else
3096 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3097
3098 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3099 mpts->mpts_flags |= MPTSF_MP_READY;
3100 else
3101 mpts->mpts_flags &= ~MPTSF_MP_READY;
3102
3103 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3104 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3105 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3106 }
3107
3108 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3109 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3110 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3e170ce0
A
3111 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3112 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3113 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3114 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3115 ret = MPTS_EVRET_CONNECT_PENDING;
3e170ce0
A
3116 } else {
3117 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3118 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3119 }
3120
3e170ce0
A
3121 mptcplog((LOG_DEBUG, "MPTCP Events: "
3122 "%s: mp_so 0x%llx mpt_flags=%b cid %d "
39236c6e
A
3123 "mptsf=%b\n", __func__,
3124 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
3125 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3e170ce0
A
3126 mpts->mpts_flags, MPTSF_BITS),
3127 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3128
39236c6e
A
3129done:
3130 MPT_UNLOCK(mp_tp);
3131 socket_unlock(so, 0);
39236c6e
A
3132 return (ret);
3133}
3134
3135/*
3136 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3137 */
3138static ev_ret_t
3e170ce0
A
3139mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3140 uint64_t *p_mpsofilt_hint)
39236c6e
A
3141{
3142 struct socket *mp_so, *so;
3143 struct mptcb *mp_tp;
39037602 3144 boolean_t linger, is_fastclose;
39236c6e
A
3145
3146
3147 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3148 MPTS_LOCK_ASSERT_HELD(mpts);
3149 VERIFY(mpte->mpte_mppcb != NULL);
3150 mp_so = mpte->mpte_mppcb->mpp_socket;
3151 mp_tp = mpte->mpte_mptcb;
3152 so = mpts->mpts_socket;
3153
3154 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
3155 !(mp_so->so_flags & SOF_PCBCLEARING));
3156
3157 if (mpts->mpts_soerror == 0)
3158 mpts->mpts_soerror = ECONNABORTED;
3159
39236c6e
A
3160 /* We got an invalid option or a fast close */
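/*
 * A TH_RST is generated locally on this subflow: tcp_maketemplate()
 * builds a template segment and tcp_respond() sends the reset,
 * scoped to the bound interface when INP_BOUND_IF is set.
 */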
3161 socket_lock(so, 0);
3162 struct tcptemp *t_template;
3163 struct inpcb *inp = sotoinpcb(so);
3164 struct tcpcb *tp = NULL;
3165
3166 tp = intotcpcb(inp);
fe8ab488 3167 so->so_error = ECONNABORTED;
39236c6e 3168
39037602
A
3169 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3170
39236c6e
A
3171 t_template = tcp_maketemplate(tp);
3172 if (t_template) {
fe8ab488 3173 struct tcp_respond_args tra;
39236c6e 3174
fe8ab488 3175 bzero(&tra, sizeof(tra));
39236c6e 3176 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 3177 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 3178 else
fe8ab488
A
3179 tra.ifscope = IFSCOPE_NONE;
3180 tra.awdl_unrestricted = 1;
39236c6e
A
3181
3182 tcp_respond(tp, t_template->tt_ipgen,
3183 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 3184 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e 3185 (void) m_free(dtom(t_template));
3e170ce0
A
3186 mptcplog((LOG_DEBUG, "MPTCP Events: "
3187 "%s: mp_so 0x%llx cid %d \n",
39236c6e 3188 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3189 mpts->mpts_connid),
3190 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3191 }
3192 socket_unlock(so, 0);
3193 mptcp_subflow_disconnect(mpte, mpts, !linger);
39236c6e 3194
3e170ce0
A
3195 *p_mpsofilt_hint |= (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
3196
39037602
A
3197 MPT_LOCK(mp_tp);
3198
3199 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3e170ce0 3200 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e 3201
39037602
A
3202 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3203 mp_so->so_error = ECONNABORTED;
3204 else
3205 mp_so->so_error = ECONNRESET;
3206
3207 /*
3208 * mptcp_drop is being called after processing the events, to fully
3209 * close the MPTCP connection
3210 */
39236c6e 3211 }
39037602 3212
3e170ce0
A
3213 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3214 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
39236c6e
A
3215 MPT_UNLOCK(mp_tp);
3216
39236c6e
A
3217 /*
3218 * Keep the subflow socket around unless the subflow has been
3219 * disconnected explicitly.
3220 */
3221 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3222}
3223
fe8ab488 3224static ev_ret_t
3e170ce0
A
3225mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts,
3226 uint64_t *p_mpsofilt_hint)
fe8ab488 3227{
3e170ce0 3228#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3229 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3230 MPTS_LOCK_ASSERT_HELD(mpts);
3231 VERIFY(mpte->mpte_mppcb != NULL);
39037602 3232
fe8ab488
A
3233 if (mpte->mpte_nummpcapflows == 0) {
3234 struct mptcb *mp_tp = mpte->mpte_mptcb;
3e170ce0
A
3235 mptcplog((LOG_DEBUG,"MPTCP Events: %s: %llx %llx \n",
3236 __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt),
3237 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3238
fe8ab488
A
3239 mpte->mpte_active_sub = mpts;
3240 mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
3241 MPT_LOCK(mp_tp);
3242 /*
3243 * If mptcp_subflow_output is called before fastjoin_ev
3244 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
3245 * and further mpts->mpts_sndnxt is incremented by len copied.
3246 */
3247 if (mpts->mpts_sndnxt == 0) {
3248 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
fe8ab488
A
3249 }
3250 MPT_UNLOCK(mp_tp);
3251 }
3252
3253 return (MPTS_EVRET_OK);
3254}
3255
3256static ev_ret_t
3e170ce0
A
3257mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts,
3258 uint64_t *p_mpsofilt_hint)
fe8ab488 3259{
3e170ce0 3260#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3261 MPTE_LOCK_ASSERT_HELD(mpte);
3262 MPTS_LOCK_ASSERT_HELD(mpts);
3263 VERIFY(mpte->mpte_mppcb != NULL);
3e170ce0
A
3264
3265 mptcplog((LOG_DEBUG, "MPTCP Events: "
3266 "%s cid %d\n", __func__, mpts->mpts_connid),
3267 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
3268
3269 mpts->mpts_flags |= MPTSF_DELETEOK;
3270 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3271 return (MPTS_EVRET_DELETE);
3272 else
3273 return (MPTS_EVRET_OK);
3274}
3275
39236c6e
A
3276static const char *
3277mptcp_evret2str(ev_ret_t ret)
3278{
3279 const char *c = "UNKNOWN";
3280
3281 switch (ret) {
3282 case MPTS_EVRET_DELETE:
3283 c = "MPTS_EVRET_DELETE";
3284 break;
3285 case MPTS_EVRET_CONNECT_PENDING:
3286 c = "MPTS_EVRET_CONNECT_PENDING";
3287 break;
3288 case MPTS_EVRET_DISCONNECT_FALLBACK:
3289 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3290 break;
3291 case MPTS_EVRET_OK:
3292 c = "MPTS_EVRET_OK";
3293 break;
3e170ce0 3294 default:
39236c6e
A
3295 break;
3296 }
3297 return (c);
3298}
3299
3300/*
3301 * Add a reference to a subflow structure; used by MPTS_ADDREF().
3302 */
3303void
3304mptcp_subflow_addref(struct mptsub *mpts, int locked)
3305{
3306 if (!locked)
3307 MPTS_LOCK(mpts);
3308 else
3309 MPTS_LOCK_ASSERT_HELD(mpts);
3310
3311 if (++mpts->mpts_refcnt == 0) {
3312 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
3313 /* NOTREACHED */
3314 }
3315 if (!locked)
3316 MPTS_UNLOCK(mpts);
3317}
3318
3319/*
3320 * Remove a reference held on a subflow structure; used by MPTS_REMREF().
3321 */
3322void
3323mptcp_subflow_remref(struct mptsub *mpts)
3324{
3325 MPTS_LOCK(mpts);
3326 if (mpts->mpts_refcnt == 0) {
3327 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
3328 /* NOTREACHED */
3329 }
3330 if (--mpts->mpts_refcnt > 0) {
3331 MPTS_UNLOCK(mpts);
3332 return;
3333 }
3334 /* callee will unlock and destroy lock */
3335 mptcp_subflow_free(mpts);
3336}
3337
3338/*
3339 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3340 * caller must ensure that the option can be issued on subflow sockets, via
3341 * MPOF_SUBFLOW_OK flag.
3342 */
3343int
3344mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
3345 struct mptopt *mpo)
3346{
3347 struct socket *mp_so;
3348 struct sockopt sopt;
3349 char buf[32];
3350 int error;
3351
3352 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3353 mpo->mpo_flags &= ~MPOF_INTERIM;
3354
3355 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3356 mp_so = mpte->mpte_mppcb->mpp_socket;
3357
3358 bzero(&sopt, sizeof (sopt));
3359 sopt.sopt_dir = SOPT_SET;
3360 sopt.sopt_level = mpo->mpo_level;
3361 sopt.sopt_name = mpo->mpo_name;
3362 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3363 sopt.sopt_valsize = sizeof (int);
3364 sopt.sopt_p = kernproc;
3365
3366 error = sosetoptlock(so, &sopt, 0); /* already locked */
3367 if (error == 0) {
3e170ce0 3368 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3369 "%s: mp_so 0x%llx sopt %s "
39236c6e 3370 "val %d set successful\n", __func__,
3371 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3372 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0 3373 buf, sizeof (buf)), mpo->mpo_intval),
3374 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3375 } else {
3e170ce0 3376 mptcplog((LOG_ERR, "MPTCP Socket: "
3377 "%s: mp_so 0x%llx sopt %s "
39236c6e 3378 "val %d set error %d\n", __func__,
3379 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3380 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0 3381 buf, sizeof (buf)), mpo->mpo_intval, error),
3382 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3383 }
3384 return (error);
3385}
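/*
 * Illustrative sketch (editor's example, not part of the original file):
 * how a caller might describe an integer-valued option that is safe to
 * replay onto a subflow socket with mptcp_subflow_sosetopt() above.  The
 * struct mptopt field names follow the usage in this file; the wrapper
 * function itself is hypothetical.
 */
static int
mptcp_example_subflow_nodelay(struct mptses *mpte, struct socket *subflow_so)
{
	struct mptopt mpo;

	bzero(&mpo, sizeof (mpo));
	mpo.mpo_flags = MPOF_SUBFLOW_OK;	/* required by the VERIFY above */
	mpo.mpo_level = IPPROTO_TCP;
	mpo.mpo_name = TCP_NODELAY;
	mpo.mpo_intval = 1;

	/* MP socket lock held, subflow socket already locked by the caller */
	return (mptcp_subflow_sosetopt(mpte, subflow_so, &mpo));
}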
3386
3387/*
3388 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
3389 * caller must ensure that the option can be issued on subflow sockets, via
3390 * MPOF_SUBFLOW_OK flag.
3391 */
3392int
3393mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
3394 struct mptopt *mpo)
3395{
3396 struct socket *mp_so;
3397 struct sockopt sopt;
3398 char buf[32];
3399 int error;
3400
3401 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3402 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3403 mp_so = mpte->mpte_mppcb->mpp_socket;
3404
3405 bzero(&sopt, sizeof (sopt));
3406 sopt.sopt_dir = SOPT_GET;
3407 sopt.sopt_level = mpo->mpo_level;
3408 sopt.sopt_name = mpo->mpo_name;
3409 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3410 sopt.sopt_valsize = sizeof (int);
3411 sopt.sopt_p = kernproc;
3412
3413 error = sogetoptlock(so, &sopt, 0); /* already locked */
3414 if (error == 0) {
3e170ce0 3415 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3416 "%s: mp_so 0x%llx sopt %s "
39236c6e 3417 "val %d get successful\n", __func__,
3418 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3419 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0 3420 buf, sizeof (buf)), mpo->mpo_intval),
3421 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3422 } else {
3e170ce0 3423 mptcplog((LOG_ERR, "MPTCP Socket: "
3424 "%s: mp_so 0x%llx sopt %s get error %d\n",
39236c6e 3425 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3426 mptcp_sopt2str(mpo->mpo_level,
3e170ce0 3427 mpo->mpo_name, buf, sizeof (buf)), error),
3428 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e 3429 }
3430 return (error);
3431}
3432
3433
3434/*
3435 * MPTCP garbage collector.
3436 *
3437 * This routine is called by the MP domain's on-demand periodic callout,
3438 * which is triggered when an MPTCP socket is closed. The callout will
3439 * repeat as long as this routine returns a non-zero value.
3440 */
3441static uint32_t
3442mptcp_gc(struct mppcbinfo *mppi)
3443{
3444 struct mppcb *mpp, *tmpp;
3445 uint32_t active = 0;
3446
3447 lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
3448
39236c6e
A
3449 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
3450 struct socket *mp_so;
3451 struct mptses *mpte;
3452 struct mptcb *mp_tp;
3453
3454 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3455 mp_so = mpp->mpp_socket;
3456 VERIFY(mp_so != NULL);
3457 mpte = mptompte(mpp);
3458 VERIFY(mpte != NULL);
3459 mp_tp = mpte->mpte_mptcb;
3460 VERIFY(mp_tp != NULL);
3461
3e170ce0 3462 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3463 "%s: mp_so 0x%llx found "
39236c6e 3464 "(u=%d,r=%d,s=%d)\n", __func__,
3465 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3e170ce0 3466 mp_so->so_retaincnt, mpp->mpp_state),
3467 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3468
3469 if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
3e170ce0 3470 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3471 "%s: mp_so 0x%llx skipped "
39236c6e 3472 "(u=%d,r=%d)\n", __func__,
3473 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0 3474 mp_so->so_usecount, mp_so->so_retaincnt),
3475 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3476 active++;
3477 continue;
3478 }
3479
3480 /* check again under the lock */
3481 if (mp_so->so_usecount > 1) {
3482 boolean_t wakeup = FALSE;
3483 struct mptsub *mpts, *tmpts;
3484
3e170ce0 3485 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3486 "%s: mp_so 0x%llx skipped "
39236c6e 3487 "[u=%d,r=%d] %d %d\n", __func__,
3488 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3489 mp_so->so_usecount, mp_so->so_retaincnt,
3490 mp_tp->mpt_gc_ticks,
3e170ce0 3491 mp_tp->mpt_state),
3492 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3493
39236c6e 3494 MPT_LOCK(mp_tp);
3495 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3496 if (mp_tp->mpt_gc_ticks > 0)
3497 mp_tp->mpt_gc_ticks--;
3498 if (mp_tp->mpt_gc_ticks == 0) {
3499 wakeup = TRUE;
3500 if (mp_tp->mpt_localkey != NULL) {
3501 mptcp_free_key(
3502 mp_tp->mpt_localkey);
3503 mp_tp->mpt_localkey = NULL;
3504 }
3505 }
3506 }
3507 MPT_UNLOCK(mp_tp);
3508 if (wakeup) {
3509 TAILQ_FOREACH_SAFE(mpts,
3510 &mpte->mpte_subflows, mpts_entry, tmpts) {
3511 MPTS_LOCK(mpts);
3512 mpts->mpts_flags |= MPTSF_DELETEOK;
3513 if (mpts->mpts_soerror == 0)
3514 mpts->mpts_soerror = ETIMEDOUT;
3515 mptcp_subflow_eupcall(mpts->mpts_socket,
3516 mpts, SO_FILT_HINT_DISCONNECTED);
3517 MPTS_UNLOCK(mpts);
3518 }
3519 }
3520 lck_mtx_unlock(&mpp->mpp_lock);
3521 active++;
3522 continue;
3523 }
3524
3525 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
3e170ce0 3526 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3527 "%s: mp_so 0x%llx skipped "
39236c6e 3528 "[u=%d,r=%d,s=%d]\n", __func__,
3529 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3530 mp_so->so_usecount, mp_so->so_retaincnt,
3e170ce0 3531 mpp->mpp_state),
3532 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3533 lck_mtx_unlock(&mpp->mpp_lock);
3534 active++;
3535 continue;
3536 }
3537
3538 /*
3539 * The PCB has been detached, and there is exactly 1 refcnt
3540 * held by the MPTCP thread. Signal that thread to terminate,
3541 * after which the last refcnt will be released. That will
3542 * allow it to be destroyed below during the next round.
3543 */
3544 if (mp_so->so_usecount == 1) {
3e170ce0 3545 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3546 "%s: mp_so 0x%llx scheduled for "
39236c6e 3547 "termination [u=%d,r=%d]\n", __func__,
3548 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0 3549 mp_so->so_usecount, mp_so->so_retaincnt),
3550 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3551
39236c6e 3552 /* signal MPTCP thread to terminate */
3553 mptcp_thread_terminate_signal(mpte);
3554 lck_mtx_unlock(&mpp->mpp_lock);
3555 active++;
3556 continue;
3557 }
3558
3e170ce0 3559 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3560 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
39236c6e 3561 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0 3562 mp_so->so_usecount, mp_so->so_retaincnt),
3563 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3564
39037602 3565 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
39236c6e
A
3566 struct sockbuf *, &mp_so->so_rcv,
3567 struct sockbuf *, &mp_so->so_snd,
3568 struct mppcb *, mpp);
3569
3570 mp_pcbdispose(mpp);
39037602 3571 sodealloc(mp_so);
39236c6e
A
3572 }
3573
3574 return (active);
3575}
3576
3577/*
3578 * Drop an MPTCP connection, reporting the specified error.
3579 */
3580struct mptses *
3581mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3582{
3583 struct socket *mp_so;
3584
3585 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3586 MPT_LOCK_ASSERT_HELD(mp_tp);
3587 VERIFY(mpte->mpte_mptcb == mp_tp);
3588 mp_so = mpte->mpte_mppcb->mpp_socket;
3589
fe8ab488 3590 mp_tp->mpt_state = MPTCPS_TERMINATE;
39037602 3591 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
39236c6e
A
3592 uint32_t, 0 /* event */);
3593
3594 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3595 errno = mp_tp->mpt_softerror;
3596 mp_so->so_error = errno;
3597
3598 return (mptcp_close(mpte, mp_tp));
3599}
3600
3601/*
3602 * Close an MPTCP control block.
3603 */
3604struct mptses *
3605mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
3606{
3e170ce0
A
3607 struct socket *mp_so = NULL;
3608 struct mptsub *mpts = NULL, *tmpts = NULL;
39236c6e
A
3609
3610 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3611 MPT_LOCK_ASSERT_HELD(mp_tp);
3612 VERIFY(mpte->mpte_mptcb == mp_tp);
3613 mp_so = mpte->mpte_mppcb->mpp_socket;
3614 if (mp_tp->mpt_localkey != NULL) {
3615 mptcp_free_key(mp_tp->mpt_localkey);
3616 mp_tp->mpt_localkey = NULL;
3617 }
3618
3619 MPT_UNLOCK(mp_tp);
3620 soisdisconnected(mp_so);
3621
3622 MPT_LOCK(mp_tp);
3623 if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
3624 return (NULL);
3625 }
3626 MPT_UNLOCK(mp_tp);
3627
3628 /* Clean up all subflows */
3629 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3630 MPTS_LOCK(mpts);
fe8ab488 3631 mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
39236c6e
A
3632 mptcp_subflow_disconnect(mpte, mpts, TRUE);
3633 MPTS_UNLOCK(mpts);
3634 mptcp_subflow_del(mpte, mpts, TRUE);
3635 }
3636 MPT_LOCK(mp_tp);
3637
3638 return (NULL);
3639}
3640
3641void
3642mptcp_notify_close(struct socket *so)
3643{
3644 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3645}
3646
3647/*
3648 * Signal MPTCP thread to wake up.
3649 */
3650void
3651mptcp_thread_signal(struct mptses *mpte)
3652{
3653 lck_mtx_lock(&mpte->mpte_thread_lock);
3654 mptcp_thread_signal_locked(mpte);
3655 lck_mtx_unlock(&mpte->mpte_thread_lock);
3656}
3657
3658/*
3659 * Signal MPTCP thread to wake up (locked version)
3660 */
3661static void
3662mptcp_thread_signal_locked(struct mptses *mpte)
3663{
3664 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3665
3666 mpte->mpte_thread_reqs++;
3667 if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3668 wakeup_one((caddr_t)&mpte->mpte_thread);
3669}
3670
3671/*
3672 * Signal MPTCP thread to terminate.
3673 */
3674static void
3675mptcp_thread_terminate_signal(struct mptses *mpte)
3676{
3677 lck_mtx_lock(&mpte->mpte_thread_lock);
3678 if (mpte->mpte_thread != THREAD_NULL) {
3679 mpte->mpte_thread = THREAD_NULL;
3680 mpte->mpte_thread_reqs++;
3681 if (!mpte->mpte_thread_active)
3682 wakeup_one((caddr_t)&mpte->mpte_thread);
3683 }
3684 lck_mtx_unlock(&mpte->mpte_thread_lock);
3685}
3686
3687/*
3688 * MPTCP thread workloop.
3689 */
3690static void
3691mptcp_thread_dowork(struct mptses *mpte)
3692{
3693 struct socket *mp_so;
3694 struct mptsub *mpts, *tmpts;
3695 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
3e170ce0 3696 uint64_t mpsofilt_hint_mask = 0;
39236c6e
A
3697
3698 MPTE_LOCK(mpte); /* same as MP socket lock */
3699 VERIFY(mpte->mpte_mppcb != NULL);
3700 mp_so = mpte->mpte_mppcb->mpp_socket;
3701 VERIFY(mp_so != NULL);
3702
3703 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3704 ev_ret_t ret;
3705
3706 MPTS_LOCK(mpts);
3707 MPTS_ADDREF_LOCKED(mpts); /* for us */
490019cf 3708
39236c6e
A
3709 /* Update process ownership based on parent mptcp socket */
3710 mptcp_update_last_owner(mpts, mp_so);
490019cf 3711
39236c6e 3712 mptcp_subflow_input(mpte, mpts);
3e170ce0
A
3713
3714 mptcp_get_rtt_measurement(mpts, mpte);
3715
3716 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
39236c6e
A
3717
3718 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3e170ce0 3719 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3720 "%s: cid %d \n", __func__,
3721 mpts->mpts_connid),
3722 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3723 (void) mptcp_subflow_output(mpte, mpts);
3724 }
3725
3726 /*
3727 * If MPTCP socket is closed, disconnect all subflows.
3728 * This will generate a disconnect event which will
3729 * be handled during the next iteration, causing a
3730 * non-zero error to be returned above.
3731 */
3732 if (mp_so->so_flags & SOF_PCBCLEARING)
3733 mptcp_subflow_disconnect(mpte, mpts, FALSE);
3734 MPTS_UNLOCK(mpts);
3735
3736 switch (ret) {
39236c6e
A
3737 case MPTS_EVRET_OK:
3738 /* nothing to do */
3739 break;
3740 case MPTS_EVRET_DELETE:
fe8ab488 3741 mptcp_subflow_del(mpte, mpts, TRUE);
39236c6e
A
3742 break;
3743 case MPTS_EVRET_CONNECT_PENDING:
3744 connect_pending = TRUE;
3745 break;
3746 case MPTS_EVRET_DISCONNECT_FALLBACK:
3747 disconnect_fallback = TRUE;
3748 break;
3e170ce0 3749 default:
3750 mptcplog((LOG_DEBUG,
3751 "MPTCP Socket: %s: mptcp_subflow_events "
3752 "returned invalid value: %d\n", __func__,
3753 ret),
3754 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3755 break;
39236c6e 3756 }
3757 MPTS_REMREF(mpts); /* ours */
3758 }
3759
3e170ce0 3760 if (mpsofilt_hint_mask) {
39037602
A
3761 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
3762 socantrcvmore(mp_so);
3763 mpsofilt_hint_mask &= ~SO_FILT_HINT_CANTRCVMORE;
3764 }
3765
3766 if (mpsofilt_hint_mask & SO_FILT_HINT_CONNRESET) {
3767 struct mptcb *mp_tp = mpte->mpte_mptcb;
3768
3769 MPT_LOCK(mp_tp);
3770 mptcp_drop(mpte, mp_tp, ECONNRESET);
3771 MPT_UNLOCK(mp_tp);
3772 }
3773
3e170ce0 3774 soevent(mp_so, mpsofilt_hint_mask);
39236c6e
A
3775 }
3776
3777 if (!connect_pending && !disconnect_fallback) {
3778 MPTE_UNLOCK(mpte);
3779 return;
3780 }
3781
3782 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3783 MPTS_LOCK(mpts);
3784 if (disconnect_fallback) {
3785 struct socket *so = NULL;
3786 struct inpcb *inp = NULL;
3787 struct tcpcb *tp = NULL;
3788
3789 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3790 MPTS_UNLOCK(mpts);
3791 continue;
3792 }
3793
3794 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3795
3796 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
3e170ce0 3797 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING)) {
39236c6e
A
3798 MPTS_UNLOCK(mpts);
3799 continue;
3800 }
490019cf
A
3801
3802 if (mpts->mpts_flags & MPTSF_TFO_REQD)
39037602 3803 mptcp_drop_tfo_data(mpte, mpts, NULL);
490019cf 3804
39236c6e
A
3805 so = mpts->mpts_socket;
3806
3807 /*
3808 * The MPTCP connection has degraded to a fallback
3809 * mode, so there is no point in keeping this subflow
3810 * regardless of its MPTCP-readiness state, unless it
3811 * is the primary one which we use for fallback. This
3812 * assumes that the subflow used for fallback is the
3813 * ACTIVE one.
3814 */
3815
3816 socket_lock(so, 1);
3817 inp = sotoinpcb(so);
3818 tp = intotcpcb(inp);
3819 tp->t_mpflags &=
3820 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
3821 tp->t_mpflags |= TMPF_TCP_FALLBACK;
490019cf 3822
39236c6e
A
3823 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3824 socket_unlock(so, 1);
3825 MPTS_UNLOCK(mpts);
3826 continue;
3827 }
3828 tp->t_mpflags |= TMPF_RESET;
3829 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3830 socket_unlock(so, 1);
3831
3832 } else if (connect_pending) {
fe8ab488
A
3833 /*
3834 * If delayed subflow start is set and cellular,
3835 * delay the connect till a retransmission timeout
3836 */
3837
3838 if ((mptcp_delayed_subf_start) &&
3839 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
3840 MPTS_UNLOCK(mpts);
3841 continue;
3842 }
3843
39236c6e
A
3844 /*
3845 * The MPTCP connection has progressed to a state
3846 * where it supports full multipath semantics; allow
3847 * additional joins to be attempted for all subflows
3848 * that are in the PENDING state.
3849 */
3850 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
3851 (void) mptcp_subflow_soconnectx(mpte, mpts);
3852 }
3853 }
3854 MPTS_UNLOCK(mpts);
3855 }
3856
3857 MPTE_UNLOCK(mpte);
3858}
3859
3860/*
3861 * MPTCP thread.
3862 */
3863static void
3864mptcp_thread_func(void *v, wait_result_t w)
3865{
3866#pragma unused(w)
3867 struct mptses *mpte = v;
3868 struct timespec *ts = NULL;
3869
3870 VERIFY(mpte != NULL);
3871
3872 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3873
3874 for (;;) {
3875 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3876
3877 if (mpte->mpte_thread != THREAD_NULL) {
3878 (void) msleep(&mpte->mpte_thread,
3879 &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
3880 __func__, ts);
3881 }
3882
3883 /* MPTCP socket is closed? */
3884 if (mpte->mpte_thread == THREAD_NULL) {
3885 lck_mtx_unlock(&mpte->mpte_thread_lock);
3886 /* callee will destroy thread lock */
3887 mptcp_thread_destroy(mpte);
3888 /* NOTREACHED */
3889 return;
3890 }
3891
3892 mpte->mpte_thread_active = 1;
3893 for (;;) {
3894 uint32_t reqs = mpte->mpte_thread_reqs;
3895
3896 lck_mtx_unlock(&mpte->mpte_thread_lock);
3897 mptcp_thread_dowork(mpte);
3898 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3899
3900 /* if there's no pending request, we're done */
3901 if (reqs == mpte->mpte_thread_reqs ||
3902 mpte->mpte_thread == THREAD_NULL)
3903 break;
3904 }
3905 mpte->mpte_thread_reqs = 0;
3906 mpte->mpte_thread_active = 0;
3907 }
3908}
3909
3910/*
3911 * Destroy an MPTCP thread, to be called in the MPTCP thread context
3912 * upon receiving an indication to self-terminate. This routine
3913 * will not return, as the current thread is terminated at the end.
3914 */
3915static void
3916mptcp_thread_destroy(struct mptses *mpte)
3917{
3918 struct socket *mp_so;
3919
3920 MPTE_LOCK(mpte); /* same as MP socket lock */
3921 VERIFY(mpte->mpte_thread == THREAD_NULL);
3922 VERIFY(mpte->mpte_mppcb != NULL);
3923
3924 mptcp_sesdestroy(mpte);
3925
3926 mp_so = mpte->mpte_mppcb->mpp_socket;
3927 VERIFY(mp_so != NULL);
d190cdc3 3928 VERIFY(mp_so->so_usecount > 0);
39236c6e
A
3929 mp_so->so_usecount--; /* for thread */
3930 mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
3931 MPTE_UNLOCK(mpte);
3932
3933 /* for the extra refcnt from kernel_thread_start() */
3934 thread_deallocate(current_thread());
3935 /* this is the end */
3936 thread_terminate(current_thread());
3937 /* NOTREACHED */
3938}
3939
3940/*
3941 * Protocol pr_lock callback.
3942 */
3943int
3944mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3945{
3946 struct mppcb *mpp = sotomppcb(mp_so);
3947 void *lr_saved;
3948
3949 if (lr == NULL)
3950 lr_saved = __builtin_return_address(0);
3951 else
3952 lr_saved = lr;
3953
3954 if (mpp == NULL) {
3955 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3956 mp_so, lr_saved, solockhistory_nr(mp_so));
3957 /* NOTREACHED */
3958 }
3959 lck_mtx_lock(&mpp->mpp_lock);
3960
3961 if (mp_so->so_usecount < 0) {
3962 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3963 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3964 solockhistory_nr(mp_so));
3965 /* NOTREACHED */
3966 }
3967 if (refcount != 0)
3968 mp_so->so_usecount++;
3969 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3970 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3971
3972 return (0);
3973}
3974
3975/*
3976 * Protocol pr_unlock callback.
3977 */
3978int
3979mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3980{
3981 struct mppcb *mpp = sotomppcb(mp_so);
3982 void *lr_saved;
3983
3984 if (lr == NULL)
3985 lr_saved = __builtin_return_address(0);
3986 else
3987 lr_saved = lr;
3988
3989 if (mpp == NULL) {
3990 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
3991 mp_so, mp_so->so_usecount, lr_saved,
3992 solockhistory_nr(mp_so));
3993 /* NOTREACHED */
3994 }
3995 lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
3996
3997 if (refcount != 0)
3998 mp_so->so_usecount--;
3999
4000 if (mp_so->so_usecount < 0) {
4001 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4002 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4003 /* NOTREACHED */
4004 }
4005 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4006 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4007 lck_mtx_unlock(&mpp->mpp_lock);
4008
4009 return (0);
4010}
4011
4012/*
4013 * Protocol pr_getlock callback.
4014 */
4015lck_mtx_t *
4016mptcp_getlock(struct socket *mp_so, int locktype)
4017{
4018#pragma unused(locktype)
4019 struct mppcb *mpp = sotomppcb(mp_so);
4020
4021 if (mpp == NULL) {
4022 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4023 solockhistory_nr(mp_so));
4024 /* NOTREACHED */
4025 }
4026 if (mp_so->so_usecount < 0) {
4027 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4028 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4029 /* NOTREACHED */
4030 }
4031 return (&mpp->mpp_lock);
4032}
4033
4034/*
4035 * Key generation functions
4036 */
4037static void
4038mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
4039{
4040 struct mptcp_key_entry *key_elm;
4041try_again:
4042 read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
4043 if (key_entry->mkey_value == 0)
4044 goto try_again;
4045 mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
4046 sizeof (key_entry->mkey_digest));
4047
4048 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4049 if (key_elm->mkey_value == key_entry->mkey_value) {
4050 goto try_again;
4051 }
4052 if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
4053 0) {
4054 goto try_again;
4055 }
4056 }
4057}
4058
4059static mptcp_key_t *
4060mptcp_reserve_key(void)
4061{
4062 struct mptcp_key_entry *key_elm;
4063 struct mptcp_key_entry *found_elm = NULL;
4064
4065 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4066 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4067 if (key_elm->mkey_flags == MKEYF_FREE) {
4068 key_elm->mkey_flags = MKEYF_INUSE;
4069 found_elm = key_elm;
4070 break;
4071 }
4072 }
4073 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4074
4075 if (found_elm) {
4076 return (&found_elm->mkey_value);
4077 }
4078
4079 key_elm = (struct mptcp_key_entry *)
4080 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4081 key_elm->mkey_flags = MKEYF_INUSE;
4082
4083 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4084 mptcp_generate_unique_key(key_elm);
4085 LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
4086 mptcp_keys_pool.mkph_count += 1;
4087 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4088 return (&key_elm->mkey_value);
4089}
4090
4091static caddr_t
4092mptcp_get_stored_digest(mptcp_key_t *key)
4093{
4094 struct mptcp_key_entry *key_holder;
4095 caddr_t digest = NULL;
4096
4097 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4098 key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
4099 offsetof(struct mptcp_key_entry, mkey_value));
4100 if (key_holder->mkey_flags != MKEYF_INUSE)
4101 panic_plain("%s", __func__);
4102 digest = &key_holder->mkey_digest[0];
4103 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4104 return (digest);
4105}
4106
4107void
4108mptcp_free_key(mptcp_key_t *key)
4109{
4110 struct mptcp_key_entry *key_holder;
4111 struct mptcp_key_entry *key_elm;
4112 int pt = RandomULong();
4113
39236c6e
A
4114 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4115 key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
4116 offsetof(struct mptcp_key_entry, mkey_value));
4117 key_holder->mkey_flags = MKEYF_FREE;
4118
4119 LIST_REMOVE(key_holder, mkey_next);
4120 mptcp_keys_pool.mkph_count -= 1;
4121
4122 /* Free half the time */
4123 if (pt & 0x01) {
4124 zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
4125 } else {
4126 /* Insert it at random point to avoid early reuse */
4127 int i = 0;
4128 if (mptcp_keys_pool.mkph_count > 1) {
4129 pt = pt % (mptcp_keys_pool.mkph_count - 1);
4130 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4131 if (++i >= pt) {
4132 LIST_INSERT_AFTER(key_elm, key_holder,
4133 mkey_next);
4134 break;
4135 }
4136 }
4137 if (i < pt)
4138 panic("missed insertion");
4139 } else {
4140 LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
4141 mkey_next);
4142 }
4143 mptcp_keys_pool.mkph_count += 1;
4144 }
4145 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4146}
4147
4148static void
4149mptcp_key_pool_init(void)
4150{
4151 int i;
4152 struct mptcp_key_entry *key_entry;
4153
4154 LIST_INIT(&mptcp_keys_pool);
4155 mptcp_keys_pool.mkph_count = 0;
4156
4157 mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
4158 (sizeof (struct mptcp_key_entry));
4159 mptcp_keys_pool.mkph_key_entry_zone = zinit(
4160 mptcp_keys_pool.mkph_key_elm_sz,
4161 MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
4162 MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
4163 if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
4164 panic("%s: unable to allocate MPTCP keys zone \n", __func__);
4165 /* NOTREACHED */
4166 }
4167 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
4168 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
4169
4170 for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
4171 key_entry = (struct mptcp_key_entry *)
4172 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4173 key_entry->mkey_flags = MKEYF_FREE;
4174 mptcp_generate_unique_key(key_entry);
4175 LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
4176 mptcp_keys_pool.mkph_count += 1;
4177 }
4178 lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
4179 mtcbinfo.mppi_lock_attr);
4180}
4181
4182/*
4183 * MPTCP Join support
4184 */
4185
4186static void
4187mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
fe8ab488 4188 uint8_t addr_id)
39236c6e
A
4189{
4190 struct tcpcb *tp = sototcpcb(so);
4191 struct mptcp_subf_auth_entry *sauth_entry;
4192 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4193
4194 MPT_LOCK_SPIN(mp_tp);
4195 tp->t_mptcb = mp_tp;
39236c6e 4196 /*
39236c6e
A
4197 * The address ID of the first flow is implicitly 0.
4198 */
4199 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4200 tp->t_local_aid = 0;
4201 } else {
fe8ab488 4202 tp->t_local_aid = addr_id;
39236c6e
A
4203 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4204 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4205 }
fe8ab488 4206 MPT_UNLOCK(mp_tp);
39236c6e
A
4207 sauth_entry = zalloc(mpt_subauth_zone);
4208 sauth_entry->msae_laddr_id = tp->t_local_aid;
4209 sauth_entry->msae_raddr_id = 0;
4210 sauth_entry->msae_raddr_rand = 0;
4211try_again:
4212 sauth_entry->msae_laddr_rand = RandomULong();
4213 if (sauth_entry->msae_laddr_rand == 0)
4214 goto try_again;
fe8ab488 4215 MPT_LOCK_SPIN(mp_tp);
39236c6e 4216 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
fe8ab488 4217 MPT_UNLOCK(mp_tp);
39236c6e
A
4218}
4219
4220static void
4221mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4222{
4223 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 4224 struct tcpcb *tp = NULL;
39236c6e
A
4225 int found = 0;
4226
fe8ab488
A
4227 socket_lock(so, 0);
4228 tp = sototcpcb(so);
4229 if (tp == NULL) {
4230 socket_unlock(so, 0);
39236c6e 4231 return;
fe8ab488 4232 }
39236c6e
A
4233
4234 MPT_LOCK(mp_tp);
4235 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4236 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4237 found = 1;
4238 break;
4239 }
4240 }
4241 if (found) {
4242 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 4243 }
39236c6e 4244 MPT_UNLOCK(mp_tp);
fe8ab488 4245
3e170ce0
A
4246 if (found)
4247 zfree(mpt_subauth_zone, sauth_entry);
4248
fe8ab488
A
4249 tp->t_mptcb = NULL;
4250 socket_unlock(so, 0);
39236c6e
A
4251}
4252
4253void
4254mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4255 u_int32_t *rrand)
4256{
4257 struct mptcp_subf_auth_entry *sauth_entry;
4258 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4259
4260 MPT_LOCK(mp_tp);
4261 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4262 if (sauth_entry->msae_laddr_id == addr_id) {
4263 if (lrand)
4264 *lrand = sauth_entry->msae_laddr_rand;
4265 if (rrand)
4266 *rrand = sauth_entry->msae_raddr_rand;
4267 break;
4268 }
4269 }
4270 MPT_UNLOCK(mp_tp);
4271}
4272
4273void
4274mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4275 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4276{
4277 struct mptcp_subf_auth_entry *sauth_entry;
4278 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4279
4280 MPT_LOCK(mp_tp);
4281 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4282 if (sauth_entry->msae_laddr_id == laddr_id) {
4283 if ((sauth_entry->msae_raddr_id != 0) &&
4284 (sauth_entry->msae_raddr_id != raddr_id)) {
3e170ce0 4285 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
39236c6e 4286 " address ids %d %d \n", __func__, raddr_id,
3e170ce0 4287 sauth_entry->msae_raddr_id),
4288 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e 4289 MPT_UNLOCK(mp_tp);
4290 return;
4291 }
4292 sauth_entry->msae_raddr_id = raddr_id;
4293 if ((sauth_entry->msae_raddr_rand != 0) &&
4294 (sauth_entry->msae_raddr_rand != raddr_rand)) {
3e170ce0 4295 mptcplog((LOG_ERR, "MPTCP Socket: "
4296 "%s: dup SYN_ACK %d %d \n",
39236c6e 4297 __func__, raddr_rand,
3e170ce0 4298 sauth_entry->msae_raddr_rand),
4299 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e 4300 MPT_UNLOCK(mp_tp);
4301 return;
4302 }
4303 sauth_entry->msae_raddr_rand = raddr_rand;
4304 MPT_UNLOCK(mp_tp);
4305 return;
4306 }
4307 }
4308 MPT_UNLOCK(mp_tp);
4309}
4310
4311/*
4312 * SHA1 support for MPTCP
4313 */
4314static int
4315mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
4316{
4317 SHA1_CTX sha1ctxt;
4318 const unsigned char *sha1_base;
4319 int sha1_size;
4320
4321 if (digest_len != SHA1_RESULTLEN) {
4322 return (FALSE);
4323 }
4324
4325 sha1_base = (const unsigned char *) key;
4326 sha1_size = sizeof (mptcp_key_t);
4327 SHA1Init(&sha1ctxt);
4328 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4329 SHA1Final(sha_digest, &sha1ctxt);
4330 return (TRUE);
4331}
4332
4333void
4334mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4335 u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
4336{
4337 SHA1_CTX sha1ctxt;
4338 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4339 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4340 u_int32_t data[2];
4341 int i;
4342
4343 bzero(digest, digest_len);
4344
4345 /* Set up the Key for HMAC */
4346 key_ipad[0] = key1;
4347 key_ipad[1] = key2;
4348
4349 key_opad[0] = key1;
4350 key_opad[1] = key2;
4351
4352 /* Set up the message for HMAC */
4353 data[0] = rand1;
4354 data[1] = rand2;
4355
4356 /* Key fits within SHA1's 512-bit block length, so no need to pre-hash it */
4357
4358 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4359
4360 for (i = 0; i < 8; i++) {
4361 key_ipad[i] ^= 0x3636363636363636;
4362 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4363 }
4364
4365 /* Perform inner SHA1 */
4366 SHA1Init(&sha1ctxt);
4367 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4368 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4369 SHA1Final(digest, &sha1ctxt);
4370
4371 /* Perform outer SHA1 */
4372 SHA1Init(&sha1ctxt);
4373 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4374 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4375 SHA1Final(digest, &sha1ctxt);
4376}
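/*
 * Illustrative sketch (editor's example, not part of the original file):
 * computing the two MP_JOIN MACs with mptcp_hmac_sha1() above.  Only the
 * ordering of keys and randoms differs between MAC-A and MAC-B, as the
 * comment that follows spells out; mac_a and mac_b must each point to
 * SHA1_RESULTLEN bytes of storage.
 */
static void
mptcp_example_join_macs(mptcp_key_t key_a, mptcp_key_t key_b,
    u_int32_t r_a, u_int32_t r_b, u_char *mac_a, u_char *mac_b)
{
	/* MAC-A = HMAC(Key=(Key-A || Key-B), Msg=(R-A || R-B)) */
	mptcp_hmac_sha1(key_a, key_b, r_a, r_b, mac_a, SHA1_RESULTLEN);
	/* MAC-B = HMAC(Key=(Key-B || Key-A), Msg=(R-B || R-A)) */
	mptcp_hmac_sha1(key_b, key_a, r_b, r_a, mac_b, SHA1_RESULTLEN);
}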
4377
4378/*
4379 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4380 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4381 */
4382void
4383mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
4384 int digest_len)
4385{
4386 uint32_t lrand, rrand;
4387 mptcp_key_t localkey, remotekey;
4388 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4389
4390 if (digest_len != SHA1_RESULTLEN)
4391 return;
4392
4393 lrand = rrand = 0;
4394 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4395 MPT_LOCK_SPIN(mp_tp);
4396 localkey = *mp_tp->mpt_localkey;
4397 remotekey = mp_tp->mpt_remotekey;
4398 MPT_UNLOCK(mp_tp);
4399 mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
4400 digest_len);
4401}
4402
4403u_int64_t
4404mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
4405{
4406 u_char digest[SHA1_RESULTLEN];
4407 u_int64_t trunced_digest;
4408
4409 mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
4410 bcopy(digest, &trunced_digest, 8);
4411 return (trunced_digest);
4412}
4413
4414/*
4415 * Authentication data generation
4416 */
490019cf 4417void
39236c6e
A
4418mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4419 int token_len)
4420{
4421 VERIFY(token_len == sizeof (u_int32_t));
4422 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4423
4424 /* Most significant 32 bits of the SHA1 hash */
4425 bcopy(sha_digest, token, sizeof (u_int32_t));
490019cf 4426 return;
39236c6e
A
4427}
4428
490019cf 4429void
39236c6e
A
4430mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4431 int idsn_len)
4432{
4433 VERIFY(idsn_len == sizeof (u_int64_t));
4434 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4435
4436 /*
4437 * Least significant 64 bits of the SHA1 hash
4438 */
4439
4440 idsn[7] = sha_digest[12];
4441 idsn[6] = sha_digest[13];
4442 idsn[5] = sha_digest[14];
4443 idsn[4] = sha_digest[15];
4444 idsn[3] = sha_digest[16];
4445 idsn[2] = sha_digest[17];
4446 idsn[1] = sha_digest[18];
4447 idsn[0] = sha_digest[19];
490019cf 4448 return;
39236c6e
A
4449}
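/*
 * Illustrative sketch (editor's example, not part of the original file):
 * deriving a token and an IDSN from a single key with the helpers above.
 * The token is the most significant 32 bits of SHA1(key) and the IDSN is
 * the least significant 64 bits; the wrapper function is hypothetical.
 */
static void
mptcp_example_derive_token_idsn(mptcp_key_t *key, u_int32_t *token,
    u_int64_t *idsn)
{
	char digest[SHA1_RESULTLEN];

	if (!mptcp_do_sha1(key, digest, sizeof (digest)))
		return;
	mptcp_generate_token(digest, sizeof (digest), (caddr_t)token,
	    sizeof (*token));
	mptcp_generate_idsn(digest, sizeof (digest), (caddr_t)idsn,
	    sizeof (*idsn));
}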
4450
490019cf
A
4451static void
4452mptcp_conn_properties(struct mptcb *mp_tp)
4453{
4454 /* There is only Version 0 at this time */
4455 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4456
4457 /* Set DSS checksum flag */
4458 if (mptcp_dss_csum)
4459 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4460
4461 /* Set up receive window */
4462 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4463
4464 /* Set up gc ticks */
4465 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4466}
4467
4468static void
4469mptcp_init_local_parms(struct mptcb *mp_tp)
39236c6e
A
4470{
4471 caddr_t local_digest = NULL;
490019cf
A
4472
4473 mp_tp->mpt_localkey = mptcp_reserve_key();
4474 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
4475 mptcp_generate_token(local_digest, SHA1_RESULTLEN,
4476 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4477 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
4478 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4479
4480 /* The subflow SYN is also first MPTCP byte */
4481 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4482 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4483
4484 mptcp_conn_properties(mp_tp);
4485}
4486
4487int
4488mptcp_init_remote_parms(struct mptcb *mp_tp)
4489{
39236c6e
A
4490 char remote_digest[MPTCP_SHA1_RESULTLEN];
4491 MPT_LOCK_ASSERT_HELD(mp_tp);
4492
4493 /* Only Version 0 is supported for auth purposes */
3e170ce0 4494 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
39236c6e
A
4495 return (-1);
4496
4497 /* Setup local and remote tokens and Initial DSNs */
39236c6e
A
4498
4499 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
4500 SHA1_RESULTLEN)) {
3e170ce0 4501 mptcplog((LOG_ERR, "MPTCP Socket: %s: unexpected failure",
4502 __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e 4503 return (-1);
4504 }
4505 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
490019cf 4506 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
39236c6e
A
4507 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4508 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
39236c6e 4509 mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
39236c6e 4510
490019cf 4511 return (0);
39236c6e
A
4512}
4513
4514/*
4515 * Helper Functions
4516 */
4517mptcp_token_t
4518mptcp_get_localtoken(void* mptcb_arg)
4519{
4520 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4521 return (mp_tp->mpt_localtoken);
4522}
4523
4524mptcp_token_t
4525mptcp_get_remotetoken(void* mptcb_arg)
4526{
4527 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4528 return (mp_tp->mpt_remotetoken);
4529}
4530
4531u_int64_t
4532mptcp_get_localkey(void* mptcb_arg)
4533{
4534 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4535 if (mp_tp->mpt_localkey != NULL)
4536 return (*mp_tp->mpt_localkey);
4537 else
4538 return (0);
4539}
4540
4541u_int64_t
4542mptcp_get_remotekey(void* mptcb_arg)
4543{
4544 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4545 return (mp_tp->mpt_remotekey);
4546}
4547
4548void
4549mptcp_send_dfin(struct socket *so)
4550{
4551 struct tcpcb *tp = NULL;
4552 struct inpcb *inp = NULL;
4553
4554 inp = sotoinpcb(so);
4555 if (!inp)
4556 return;
4557
4558 tp = intotcpcb(inp);
4559 if (!tp)
4560 return;
4561
4562 if (!(tp->t_mpflags & TMPF_RESET))
4563 tp->t_mpflags |= TMPF_SEND_DFIN;
4564}
4565
4566/*
4567 * Data Sequence Mapping routines
4568 */
4569void
4570mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4571{
4572 struct mptcb *mp_tp;
4573
4574 if (m == NULL)
4575 return;
4576
3e170ce0 4577 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
39236c6e 4578 MPT_LOCK(mp_tp);
39236c6e
A
4579 while (m) {
4580 VERIFY(m->m_flags & M_PKTHDR);
4581 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4582 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4583 m->m_pkthdr.mp_rlen = m_pktlen(m);
4584 mp_tp->mpt_sndmax += m_pktlen(m);
4585 m = m->m_next;
4586 }
4587 MPT_UNLOCK(mp_tp);
4588}
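/*
 * Illustrative example (editor's note, not part of the original file):
 * with mpt_sndmax at 1001, inserting a chain of three packet-header mbufs
 * of lengths 100, 200 and 60 stamps them with mp_dsn 1001, 1101 and 1301
 * (mp_rlen 100, 200 and 60 respectively) and advances mpt_sndmax to 1361.
 */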
4589
4590void
490019cf 4591mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
39236c6e
A
4592{
4593 u_int32_t sub_len = 0;
490019cf
A
4594 int rewinding = 0;
4595
4596 if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
4597 /* TFO makes things complicated. */
4598 if (so->so_flags1 & SOF1_TFO_REWIND) {
4599 rewinding = 1;
4600 so->so_flags1 &= ~SOF1_TFO_REWIND;
4601 }
4602 }
39236c6e
A
4603
4604 while (m) {
4605 VERIFY(m->m_flags & M_PKTHDR);
4606
4607 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4608 sub_len = m->m_pkthdr.mp_rlen;
4609
4610 if (sub_len < len) {
4611 m->m_pkthdr.mp_dsn += sub_len;
4612 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4613 m->m_pkthdr.mp_rseq += sub_len;
4614 }
4615 m->m_pkthdr.mp_rlen = 0;
4616 len -= sub_len;
4617 } else {
4618 /* sub_len >= len */
490019cf
A
4619 if (rewinding == 0)
4620 m->m_pkthdr.mp_dsn += len;
39236c6e 4621 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
490019cf
A
4622 if (rewinding == 0)
4623 m->m_pkthdr.mp_rseq += len;
39236c6e 4624 }
3e170ce0 4625 mptcplog((LOG_DEBUG, "MPTCP Sender: "
490019cf 4626 "%s: dsn 0x%llx ssn %u len %d %d\n",
3e170ce0 4627 __func__,
39236c6e 4628 m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
3e170ce0 4629 m->m_pkthdr.mp_rlen, len),
4630 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 4631 m->m_pkthdr.mp_rlen -= len;
39037602 4632 break;
39236c6e
A
4633 }
4634 } else {
4635 panic("%s: MPTCP tag not set", __func__);
4636 /* NOTREACHED */
4637 }
4638 m = m->m_next;
4639 }
39037602
A
4640
4641 if (so->so_flags & SOF_MP_SUBFLOW &&
4642 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
4643 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
4644 /*
4645 * Received an ack without receiving a DATA_ACK.
4646 * Need to fallback to regular TCP (or destroy this subflow).
4647 */
4648 mptcp_notify_mpfail(so);
4649 }
39236c6e
A
4650}
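/*
 * Illustrative example (editor's note, not part of the original file):
 * dropping len = 250 bytes when the first mapping has mp_rlen = 100 and
 * the second mp_rlen = 400 (neither PKTF_MPSO, no TFO rewind): the first
 * map is consumed (mp_dsn/mp_rseq advance by 100, mp_rlen becomes 0), and
 * the second advances its mp_dsn/mp_rseq by the remaining 150 bytes,
 * leaving mp_rlen = 250.
 */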
4651
4652/* Obtain the DSN mapping stored in the mbuf */
4653void
4654mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4655 u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4656{
4657 u_int64_t dsn64;
4658
4659 mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4660 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4661 *dsn64p = dsn64;
4662}
4663
4664void
4665mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
4666 u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
4667{
4668 struct mbuf *m = so->so_snd.sb_mb;
4669 struct mbuf *mnext = NULL;
4670 uint32_t runlen = 0;
4671 u_int64_t dsn64;
4672 uint32_t contig_len = 0;
4673
4674 if (m == NULL)
4675 return;
4676
4677 if (off < 0)
4678 return;
4679 /*
4680 * In the subflow socket, the DSN sequencing can be discontiguous,
4681 * but the subflow sequence mapping is contiguous. Use the subflow
4682 * sequence property to find the right mbuf and corresponding dsn
4683 * mapping.
4684 */
4685
4686 while (m) {
4687 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4688 VERIFY(m->m_flags & M_PKTHDR);
4689
4690 if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
4691 off -= m->m_pkthdr.mp_rlen;
4692 m = m->m_next;
4693 } else {
4694 break;
4695 }
4696 }
4697
4698 if (m == NULL) {
4699 panic("%s: bad offset", __func__);
4700 /* NOTREACHED */
4701 }
4702
4703 dsn64 = m->m_pkthdr.mp_dsn + off;
4704 *dsn = dsn64;
4705 *relseq = m->m_pkthdr.mp_rseq + off;
4706
4707 /*
4708 * Now find the last contiguous byte and its length from
4709 * start.
4710 */
4711 runlen = m->m_pkthdr.mp_rlen - off;
4712 contig_len = runlen;
4713
4714 /* If datalen does not span multiple mbufs, return */
4715 if (datalen <= runlen) {
4716 *data_len = min(datalen, UINT16_MAX);
4717 return;
4718 }
4719
4720 mnext = m->m_next;
4721 while (datalen > runlen) {
4722 if (mnext == NULL) {
4723 panic("%s: bad datalen = %d, %d %d", __func__, datalen,
4724 runlen, off);
4725 /* NOTREACHED */
4726 }
4727 VERIFY(mnext->m_flags & M_PKTHDR);
4728 VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
4729
4730 /*
4731 * case A. contiguous DSN stream
4732 * case B. discontiguous DSN stream
4733 */
4734 if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
4735 /* case A */
4736 runlen += mnext->m_pkthdr.mp_rlen;
4737 contig_len += mnext->m_pkthdr.mp_rlen;
3e170ce0 4738 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: contig \n",
4739 __func__), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 4740 } else {
4741 /* case B */
3e170ce0 4742 mptcplog((LOG_DEBUG, "MPTCP Sender: "
fe8ab488 4743 "%s: discontig datalen %d contig_len %d cc %d \n",
3e170ce0 4744 __func__, datalen, contig_len, so->so_snd.sb_cc),
4745 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 4746 break;
4747 }
4748 mnext = mnext->m_next;
4749 }
4750 datalen = min(datalen, UINT16_MAX);
4751 *data_len = min(datalen, contig_len);
3e170ce0 4752 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4753 "%s: %llu %u %d %d \n", __func__,
4754 *dsn, *relseq, *data_len, off),
4755 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 4756}
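/*
 * Illustrative example (editor's note, not part of the original file):
 * with two queued mbufs mapped as {dsn 7000, rlen 400} and {dsn 7400,
 * rlen 300}, a request for off = 100, datalen = 500 returns dsn 7100;
 * since the second mapping is contiguous (7000 + 400 == 7400) the full
 * 500 bytes are reported in *data_len.  Had the second mapping started
 * at, say, 7500, *data_len would be clamped to the 300 contiguous bytes
 * available from offset 100.
 */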
4757
4758/*
4759 * MPTCP's notion of the next in-sequence Data Sequence Number is adjusted
4760 * here. It must be called from mptcp_adj_rmap(), which is called only after
4761 * reassembly of out-of-order data. The rcvnxt variable must
4762 * be updated only when at least some in-sequence new data is received.
4763 */
4764static void
4765mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4766{
4767 struct mptcb *mp_tp = tptomptp(tp);
4768
4769 if (mp_tp == NULL)
4770 return;
4771 MPT_LOCK(mp_tp);
4772 if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4773 (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4774 m->m_pkthdr.mp_rlen)))) {
4775 mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4776 }
4777 MPT_UNLOCK(mp_tp);
4778}
4779
4780/*
3e170ce0 4781 * Note that this is called only from tcp_input() via mptcp_input_preproc().
4782 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4783 * When it trims data, tcp_input() calls m_adj(), which does not remove the
4784 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4785 * The dsn map insertion cannot be delayed until after the trim, because data
4786 * can sit in the reassembly queue for a while and the DSN option info in tp
4787 * will be overwritten for every new packet received.
39236c6e 4788 * The dsn map will be adjusted just prior to appending to the subflow sockbuf
4789 * with mptcp_adj_rmap().
4790 */
4791void
4792mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4793{
4794 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4795
4796 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4797 VERIFY(m->m_flags & M_PKTHDR);
4798 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4799 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4800 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4801 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4802 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4803 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4804 }
4805}
4806
fe8ab488 4807int
39236c6e
A
4808mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4809{
4810 u_int64_t dsn;
4811 u_int32_t sseq, datalen;
4812 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4813 u_int32_t old_rcvnxt = 0;
4814
4815 if (m_pktlen(m) == 0)
fe8ab488 4816 return 0;
39236c6e
A
4817
4818 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4819 VERIFY(m->m_flags & M_PKTHDR);
4820
4821 dsn = m->m_pkthdr.mp_dsn;
4822 sseq = m->m_pkthdr.mp_rseq + tp->irs;
4823 datalen = m->m_pkthdr.mp_rlen;
4824 } else {
4825 /* data arrived without a DSS option mapping */
fe8ab488
A
4826
4827 /* initial subflow can fallback right after SYN handshake */
39236c6e 4828 mptcp_notify_mpfail(so);
fe8ab488 4829 return 0;
39236c6e
A
4830 }
4831
4832 /* In the common case, data is in window and in sequence */
4833 if (m->m_pkthdr.len == (int)datalen) {
4834 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4835 return 0;
39236c6e
A
4836 }
4837
39236c6e
A
4838 old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4839 if (SEQ_GT(old_rcvnxt, sseq)) {
4840 /* data trimmed from the left */
4841 int off = old_rcvnxt - sseq;
4842 m->m_pkthdr.mp_dsn += off;
4843 m->m_pkthdr.mp_rseq += off;
fe8ab488 4844 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
39236c6e
A
4845 } else if (old_rcvnxt == sseq) {
4846 /*
3e170ce0 4847 * data was trimmed from the right
39236c6e
A
4848 */
4849 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4850 } else {
fe8ab488 4851 mptcp_notify_mpfail(so);
3e170ce0 4852 return (-1);
39236c6e
A
4853 }
4854 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4855 return 0;
39236c6e
A
4856}
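/*
 * Illustrative example (editor's note, not part of the original file):
 * if a segment carried a mapping {mp_dsn = 5000, mp_rseq = 100,
 * mp_rlen = 500} but tcp_input() already trimmed 200 bytes from its left
 * edge (old_rcvnxt is 200 bytes past sseq), the map is advanced to
 * mp_dsn = 5200, mp_rseq = 300, and mp_rlen is clamped to the remaining
 * m_pkthdr.len.
 */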
4857
4858/*
4859 * Following routines help with failure detection and failover of data
4860 * transfer from one subflow to another.
4861 */
4862void
4863mptcp_act_on_txfail(struct socket *so)
4864{
4865 struct tcpcb *tp = NULL;
4866 struct inpcb *inp = sotoinpcb(so);
4867
4868 if (inp == NULL)
4869 return;
4870
4871 tp = intotcpcb(inp);
4872 if (tp == NULL)
4873 return;
4874
39236c6e
A
4875 if (so->so_flags & SOF_MP_TRYFAILOVER) {
4876 return;
4877 }
4878
4879 so->so_flags |= SOF_MP_TRYFAILOVER;
4880 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4881}
4882
4883/*
4884 * Support for MP_FAIL option
4885 */
4886int
4887mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4888{
4889 struct mbuf *m = so->so_snd.sb_mb;
4890 u_int64_t dsn;
4891 int off = 0;
4892 u_int32_t datalen;
4893
4894 if (m == NULL)
4895 return (-1);
4896
4897 while (m != NULL) {
4898 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4899 VERIFY(m->m_flags & M_PKTHDR);
4900 dsn = m->m_pkthdr.mp_dsn;
4901 datalen = m->m_pkthdr.mp_rlen;
4902 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4903 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4904 off = dsn_fail - dsn;
4905 *tcp_seq = m->m_pkthdr.mp_rseq + off;
3e170ce0 4906 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: %llu %llu \n",
4907 __func__, dsn, dsn_fail),
4908 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e 4909 return (0);
4910 }
4911
4912 m = m->m_next;
4913 }
4914
4915 /*
4916 * If there was no mbuf data and a fallback to TCP occurred, there's
4917 * not much else to do.
4918 */
4919
3e170ce0 4920 mptcplog((LOG_ERR, "MPTCP Sender: "
4921 "%s: %llu not found \n", __func__, dsn_fail),
4922 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e 4923 return (-1);
4924}
4925
4926/*
4927 * Support for sending contiguous MPTCP bytes in subflow
fe8ab488 4928 * Also for preventing sending data with ACK in 3-way handshake
39236c6e
A
4929 */
4930int32_t
4931mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4932{
4933 u_int64_t mdss_dsn = 0;
4934 u_int32_t mdss_subflow_seq = 0;
4935 u_int16_t mdss_data_len = 0;
4936
4937 if (len == 0)
4938 return (len);
4939
4940 mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4941 &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4942
39037602 4943 /*
fe8ab488
A
4944 * Special case handling for Fast Join. We want to send data right
4945 * after ACK of the 3-way handshake, but not piggyback the data
4946 * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
4947 * mdss_data_len control this.
4948 */
4949 struct tcpcb *tp = NULL;
39037602 4950 tp = intotcpcb(sotoinpcb(so));
fe8ab488
A
4951 if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
4952 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
4953 (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
4954 (tp->t_mpflags & TMPF_SENT_JOIN) &&
4955 (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
4956 (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
490019cf
A
4957 mdss_data_len = 0;
4958 tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
4959 }
4960
4961 if ((tp->t_state > TCPS_SYN_SENT) &&
4962 (tp->t_mpflags & TMPF_TFO_REQUEST)) {
4963 mdss_data_len = 0;
4964 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4965 }
39236c6e
A
4966 return (mdss_data_len);
4967}
4968
4969int32_t
4970mptcp_sbspace(struct mptcb *mpt)
4971{
4972 struct sockbuf *sb;
4973 uint32_t rcvbuf;
4974 int32_t space;
4975
4976 MPT_LOCK_ASSERT_HELD(mpt);
4977 MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4978
4979 sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4980 rcvbuf = sb->sb_hiwat;
4981 space = ((int32_t)imin((rcvbuf - sb->sb_cc),
4982 (sb->sb_mbmax - sb->sb_mbcnt)));
4983 if (space < 0)
4984 space = 0;
4985 /* XXX check if it's too small? */
4986
4987 return (space);
4988}
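/*
 * Illustrative example (editor's note, not part of the original file):
 * with sb_hiwat = 131072, sb_cc = 8192, sb_mbmax = 262144 and
 * sb_mbcnt = 16384, mptcp_sbspace() returns
 * imin(131072 - 8192, 262144 - 16384) = 122880, i.e. the advertised
 * MPTCP receive window is bounded both by unread data and by mbuf
 * accounting on the MP socket's receive buffer.
 */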
4989
4990/*
4991 * Support Fallback to Regular TCP
4992 */
4993void
4994mptcp_notify_mpready(struct socket *so)
4995{
4996 struct tcpcb *tp = NULL;
4997
4998 if (so == NULL)
4999 return;
5000
5001 tp = intotcpcb(sotoinpcb(so));
5002
5003 if (tp == NULL)
5004 return;
5005
5006 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5007 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5008 struct tcpcb *, tp);
5009
5010 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5011 return;
5012
5013 if (tp->t_mpflags & TMPF_MPTCP_READY)
5014 return;
5015
5016 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5017 tp->t_mpflags |= TMPF_MPTCP_READY;
5018
5019 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5020}
5021
5022void
5023mptcp_notify_mpfail(struct socket *so)
5024{
5025 struct tcpcb *tp = NULL;
5026
5027 if (so == NULL)
5028 return;
5029
5030 tp = intotcpcb(sotoinpcb(so));
5031
5032 if (tp == NULL)
5033 return;
5034
5035 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5036 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5037 struct tcpcb *, tp);
5038
5039 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5040 return;
5041
5042 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5043 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5044
5045 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5046}
5047
5048/*
5049 * Keepalive helper function
5050 */
5051boolean_t
5052mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5053{
5054 boolean_t ret = 1;
5055 VERIFY(mp_tp != NULL);
5056 MPT_LOCK(mp_tp);
5057 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5058 ret = 0;
5059 }
5060 MPT_UNLOCK(mp_tp);
5061 return (ret);
5062}
5063
5064/*
5065 * MPTCP t_maxseg adjustment function
5066 */
5067int
5068mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5069{
5070 int mss_lower = 0;
5071 struct mptcb *mp_tp = tptomptp(tp);
5072
5073#define MPTCP_COMPUTE_LEN { \
5074 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5075 MPT_LOCK(mp_tp); \
5076 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5077 mss_lower += 2; \
5078 else \
5079 /* adjust to 32-bit boundary + EOL */ \
5080 mss_lower += 2; \
5081 MPT_UNLOCK(mp_tp); \
5082}
5083 if (mp_tp == NULL)
5084 return (0);
5085
5086 /*
5087 * For the first subflow and subsequent subflows, adjust mss for
5088 * most common MPTCP option size, for case where tcp_mss is called
5089 * during option processing and MTU discovery.
5090 */
5091 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5092 (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
5093 MPTCP_COMPUTE_LEN;
5094 }
5095
5096 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5097 (tp->t_mpflags & TMPF_SENT_JOIN)) {
5098 MPTCP_COMPUTE_LEN;
5099 }
5100
5101 if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5102 MPTCP_COMPUTE_LEN;
5103 }
5104
5105 return (mss_lower);
5106}
5107
5108/*
5109 * Update the pid, upid, uuid of the subflow so, based on parent so
5110 */
5111void
5112mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
5113{
5114 struct socket *subflow_so = mpts->mpts_socket;
39037602 5115
39236c6e
A
5116 MPTS_LOCK_ASSERT_HELD(mpts);
5117
5118 socket_lock(subflow_so, 0);
5119 if ((subflow_so->last_pid != parent_mpso->last_pid) ||
5120 (subflow_so->last_upid != parent_mpso->last_upid)) {
5121 subflow_so->last_upid = parent_mpso->last_upid;
5122 subflow_so->last_pid = parent_mpso->last_pid;
5123 uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
5124 }
5125 so_update_policy(subflow_so);
5126 socket_unlock(subflow_so, 0);
5127}
5128
5129static void
5130fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5131{
5132 struct inpcb *inp;
5133
5134 tcp_getconninfo(so, &flow->flow_ci);
5135 inp = sotoinpcb(so);
5136#if INET6
5137 if ((inp->inp_vflag & INP_IPV6) != 0) {
5138 flow->flow_src.ss_family = AF_INET6;
5139 flow->flow_dst.ss_family = AF_INET6;
5140 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5141 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5142 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5143 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5144 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5145 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
39037602 5146 } else
39236c6e 5147#endif
3e170ce0 5148 if ((inp->inp_vflag & INP_IPV4) != 0) {
39236c6e
A
5149 flow->flow_src.ss_family = AF_INET;
5150 flow->flow_dst.ss_family = AF_INET;
5151 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5152 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5153 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5154 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5155 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5156 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5157 }
3e170ce0
A
5158 flow->flow_len = sizeof(*flow);
5159 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
39236c6e
A
5160 flow->flow_flags = mpts->mpts_flags;
5161 flow->flow_cid = mpts->mpts_connid;
3e170ce0
A
5162 flow->flow_sndnxt = mpts->mpts_sndnxt;
5163 flow->flow_relseq = mpts->mpts_rel_seq;
5164 flow->flow_soerror = mpts->mpts_soerror;
5165 flow->flow_probecnt = mpts->mpts_probecnt;
5166 flow->flow_peerswitch = mpts->mpts_peerswitch;
39236c6e
A
5167}
5168
5169static int
5170mptcp_pcblist SYSCTL_HANDLER_ARGS
5171{
5172#pragma unused(oidp, arg1, arg2)
5173 int error = 0, f;
5174 size_t n, len;
5175 struct mppcb *mpp;
5176 struct mptses *mpte;
5177 struct mptcb *mp_tp;
5178 struct mptsub *mpts;
5179 struct socket *so;
5180 conninfo_mptcp_t mptcpci;
fe8ab488 5181 mptcp_flow_t *flows = NULL;
39236c6e
A
5182
5183 if (req->newptr != USER_ADDR_NULL)
5184 return (EPERM);
5185
5186 lck_mtx_lock(&mtcbinfo.mppi_lock);
5187 n = mtcbinfo.mppi_count;
5188 if (req->oldptr == USER_ADDR_NULL) {
5189 lck_mtx_unlock(&mtcbinfo.mppi_lock);
39037602 5190 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
39236c6e
A
5191 4 * (n + n/8) * sizeof(mptcp_flow_t);
5192 return (0);
5193 }
5194 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 5195 flows = NULL;
39236c6e
A
5196 lck_mtx_lock(&mpp->mpp_lock);
5197 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3e170ce0
A
5198 if (mpp->mpp_flags & MPP_DEFUNCT) {
5199 lck_mtx_unlock(&mpp->mpp_lock);
5200 continue;
5201 }
39236c6e
A
5202 mpte = mptompte(mpp);
5203 VERIFY(mpte != NULL);
5204 mp_tp = mpte->mpte_mptcb;
5205 VERIFY(mp_tp != NULL);
3e170ce0
A
5206
5207 bzero(&mptcpci, sizeof(mptcpci));
5208 MPT_LOCK(mp_tp);
39236c6e 5209 mptcpci.mptcpci_state = mp_tp->mpt_state;
3e170ce0
A
5210 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5211 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5212 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5213 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5214 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5215 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5216 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5217 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5218 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5219 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5220 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvatmark;
5221 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5222 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5223 MPT_UNLOCK(mp_tp);
5224
39236c6e 5225 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
5226 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5227 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5228 mptcpci.mptcpci_flow_offset =
5229 offsetof(conninfo_mptcp_t, mptcpci_flows);
5230
5231 len = sizeof(*flows) * mpte->mpte_numflows;
5232 if (mpte->mpte_numflows != 0) {
5233 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5234 if (flows == NULL) {
5235 lck_mtx_unlock(&mpp->mpp_lock);
5236 break;
5237 }
5238 mptcpci.mptcpci_len = sizeof(mptcpci) +
5239 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5240 error = SYSCTL_OUT(req, &mptcpci,
5241 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5242 } else {
5243 mptcpci.mptcpci_len = sizeof(mptcpci);
3e170ce0 5244 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
39037602 5245 }
5246 if (error) {
5247 lck_mtx_unlock(&mpp->mpp_lock);
5248 FREE(flows, M_TEMP);
5249 break;
5250 }
5251 f = 0;
5252 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5253 MPTS_LOCK(mpts);
5254 so = mpts->mpts_socket;
5255 socket_lock(so, 0);
5256 fill_mptcp_subflow(so, &flows[f], mpts);
5257 socket_unlock(so, 0);
5258 MPTS_UNLOCK(mpts);
5259 f++;
5260 }
5261 lck_mtx_unlock(&mpp->mpp_lock);
5262 if (flows) {
5263 error = SYSCTL_OUT(req, flows, len);
5264 FREE(flows, M_TEMP);
5265 if (error)
5266 break;
5267 }
5268 }
5269 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5270
5271 return (error);
5272}
5273
5274SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
39037602 5275 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
39236c6e 5276 "List of active MPTCP connections");
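/*
 * Illustrative sketch only, not part of the original source: a
 * privileged userspace reader of the "net.inet.mptcp.pcblist" sysctl
 * could walk the records roughly as below, assuming conninfo_mptcp_t
 * and mptcp_flow_t are visible via the private MPTCP headers.  Each
 * record is mptcpci_len bytes long and carries mptcpci_nflows flow
 * entries starting at mptcpci_flow_offset.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>
#include <stdlib.h>
#include <sys/sysctl.h>

static void
dump_mptcp_pcblist(void)
{
	size_t len = 0;
	char *buf, *p;

	/* First call sizes the buffer; the kernel pads it via req->oldidx. */
	if (sysctlbyname("net.inet.mptcp.pcblist", NULL, &len, NULL, 0) < 0)
		return;
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctlbyname("net.inet.mptcp.pcblist", buf, &len, NULL, 0) < 0) {
		free(buf);
		return;
	}
	for (p = buf; p + sizeof(conninfo_mptcp_t) <= buf + len;) {
		conninfo_mptcp_t *ci = (conninfo_mptcp_t *)(void *)p;

		printf("mptcp state %u, %u subflow(s)\n",
		    (unsigned int)ci->mptcpci_state,
		    (unsigned int)ci->mptcpci_nflows);
		/* mptcpci_len covers the header plus the inline flow array */
		p += ci->mptcpci_len;
	}
	free(buf);
}
#endif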
5277
5278/*
5279 * Check the health of the other subflows and do an mptcp_output if
5280 * there is no other active or functional subflow at the time this
5281 * function is called.
5282 */
5283static void
5284mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
5285{
5286 struct mptsub *from_mpts = NULL;
5287
5288 MPTE_LOCK_ASSERT_HELD(mpte);
5289
5290 MPTS_UNLOCK(to_mpts);
5291
5292 from_mpts = mpte->mpte_active_sub;
5293
5294 if (from_mpts == NULL)
5295 goto output_needed;
5296
5297 MPTS_LOCK(from_mpts);
5298
5299 if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
5300 (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
5301 MPTS_UNLOCK(from_mpts);
5302 goto output_needed;
5303 }
5304
5305 MPTS_UNLOCK(from_mpts);
5306 MPTS_LOCK(to_mpts);
5307 return;
5308
5309output_needed:
5310 mptcp_output(mpte);
5311 MPTS_LOCK(to_mpts);
5312}
5313
5314/*
5315 * Set notsent lowat mark on the MPTCB
5316 */
5317int
5318mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5319{
5320 struct mptcb *mp_tp = NULL;
5321 int error = 0;
5322
5323 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5324 mp_tp = mpte->mpte_mptcb;
5325
5326 if (mp_tp)
5327 mp_tp->mpt_notsent_lowat = optval;
5328 else
5329 error = EINVAL;
5330
39037602 5331 return error;
5332}
5333
5334u_int32_t
5335mptcp_get_notsent_lowat(struct mptses *mpte)
5336{
5337 struct mptcb *mp_tp = NULL;
5338
5339 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5340 mp_tp = mpte->mpte_mptcb;
5341
5342 if (mp_tp)
5343 return mp_tp->mpt_notsent_lowat;
5344 else
5345 return 0;
5346}
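/*
 * Illustrative sketch only, not part of the original source: these two
 * helpers back the not-sent low-water mark that userspace configures,
 * presumably through the TCP_NOTSENT_LOWAT socket option handled in the
 * MPTCP socket-option path (not shown here).  Setting the mark from an
 * application would look roughly like this, where mp_fd is assumed to
 * be an already-created AF_MULTIPATH stream socket.
 */
#if 0	/* userspace illustration only */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
set_mptcp_notsent_lowat(int mp_fd, int bytes)
{
	/* Wake the writer only once unsent data drops below 'bytes'. */
	return (setsockopt(mp_fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
	    &bytes, sizeof(bytes)));
}
#endif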
5347
39037602 5348int
5349mptcp_notsent_lowat_check(struct socket *so) {
5350 struct mptses *mpte;
5351 struct mppcb *mpp;
5352 struct mptcb *mp_tp;
5353 struct mptsub *mpts;
5354
5355 int notsent = 0;
5356
5357 mpp = sotomppcb(so);
5358 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5359 return (0);
5360 }
5361
5362 mpte = mptompte(mpp);
5363 mp_tp = mpte->mpte_mptcb;
5364
5365 MPT_LOCK(mp_tp);
5366 notsent = so->so_snd.sb_cc;
5367
5368 if ((notsent == 0) ||
5369 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5370 mp_tp->mpt_notsent_lowat)) {
5371 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5372 "lowat %d notsent %d actual %d \n",
5373 mp_tp->mpt_notsent_lowat, notsent,
5374 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5375 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5376 MPT_UNLOCK(mp_tp);
5377 return (1);
5378 }
5379 MPT_UNLOCK(mp_tp);
5380
5381 /* When Nagle's algorithm is not disabled, it is better
5382 * to wake up the client even before there is at least one
5383 * maxseg of data to write.
5384 */
5385 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5386 int retval = 0;
5387 MPTS_LOCK(mpts);
5388 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5389 struct socket *subf_so = mpts->mpts_socket;
5390 socket_lock(subf_so, 0);
5391 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
39037602 5392
5393 notsent = so->so_snd.sb_cc -
5394 (tp->snd_nxt - tp->snd_una);
39037602 5395
5396 if ((tp->t_flags & TF_NODELAY) == 0 &&
5397 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5398 retval = 1;
5399 }
3e170ce0 5400 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
fe8ab488 5401 " nodelay false \n",
5402 mp_tp->mpt_notsent_lowat, notsent),
5403 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5404 socket_unlock(subf_so, 0);
5405 MPTS_UNLOCK(mpts);
5406 return (retval);
5407 }
5408 MPTS_UNLOCK(mpts);
5409 }
5410 return (0);
5411}
5412
5413static void
5414mptcp_get_rtt_measurement(struct mptsub *mpts, struct mptses *mpte)
5415{
5416 MPTE_LOCK_ASSERT_HELD(mpte);
5417 MPTS_LOCK_ASSERT_HELD(mpts);
5418
5419 struct socket *subflow_so = mpts->mpts_socket;
5420 socket_lock(subflow_so, 0);
5421 mpts->mpts_srtt = (intotcpcb(sotoinpcb(subflow_so)))->t_srtt;
5422 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(subflow_so)))->t_rxtcur;
5423 socket_unlock(subflow_so, 0);
5424}
5425
5426/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5427static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5428static uint32_t mptcp_kern_skt_inuse = 0;
5429symptoms_advisory_t mptcp_advisory;
5430
5431static errno_t
5432mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5433 void **unitinfo)
5434{
5435#pragma unused(kctlref, sac, unitinfo)
5436 /*
5437 * We don't need to do anything here. But we can at least ensure
5438 * that only one user opens the MPTCP_KERN_CTL_NAME control socket.
5439 */
5440 if (OSCompareAndSwap(0, 1, &mptcp_kern_skt_inuse))
5441 return (0);
5442 else
5443 return (EALREADY);
5444}
5445
5446static errno_t
5447mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5448 void *unitinfo)
5449{
5450#pragma unused(kctlref, kcunit, unitinfo)
5451 if (OSCompareAndSwap(1, 0, &mptcp_kern_skt_inuse)) {
5452 /* TBD needs to be locked if the size grows more than an int */
5453 bzero(&mptcp_advisory, sizeof(mptcp_advisory));
5454 return (0);
5455 }
5456 else {
5457 return (EINVAL);
5458 }
5459}
5460
5461static errno_t
5462mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5463 mbuf_t m, int flags)
5464{
5465#pragma unused(kctlref, kcunit, unitinfo, flags)
5466 symptoms_advisory_t *sa = NULL;
5467
5468 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5469 mbuf_freem(m);
5470 return (EINVAL);
5471 }
5472
5473 if (mbuf_len(m) >= sizeof(*sa))
5474 sa = mbuf_data(m);
5475 else
5476 return (EINVAL);
5477
5478 if (mptcp_advisory.sa_nwk_status_int != sa->sa_nwk_status_int) {
5479 /*
5480 * We could use this notification to notify all MPTCP PCBs
5481 * of the change in network status. But it is difficult to
5482 * decide whether sending REMOVE_ADDR or MP_PRIO is appropriate,
5483 * given that these are only soft indicators of the network
5484 * state. Leaving this as TBD for now.
5485 */
5486 }
5487
5488 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT) {
5489 mptcplog((LOG_DEBUG, "MPTCP Events: %s wifi %d,%d cell %d,%d\n",
5490 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
5491 sa->sa_cell_status, mptcp_advisory.sa_cell_status),
5492 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG,
5493 MPTCP_LOGLVL_LOG);
5494
5495 if ((sa->sa_wifi_status &
5496 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5497 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
5498 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5499 }
5500
5501 if ((sa->sa_cell_status &
5502 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) !=
5503 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) {
5504 mptcp_advisory.sa_cell_status = sa->sa_cell_status;
5505 }
5506 } else {
5507 mptcplog((LOG_DEBUG, "MPTCP Events: %s NOCOMMENT "
5508 "wifi %d cell %d\n", __func__,
5509 mptcp_advisory.sa_wifi_status,
5510 mptcp_advisory.sa_cell_status),
5511 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
5512 }
5513 return (0);
5514}
5515
5516void
5517mptcp_control_register(void)
5518{
5519 /* Set up the advisory control socket */
5520 struct kern_ctl_reg mptcp_kern_ctl;
5521
5522 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5523 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5524 sizeof(mptcp_kern_ctl.ctl_name));
5525 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5526 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5527 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5528 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5529
5530 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5531}
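/*
 * Illustrative sketch only, not part of the original source: the
 * control registered above is meant to be driven by a single
 * privileged userspace client (SymptomsD).  Assuming the private
 * headers expose MPTCP_KERN_CTL_NAME and symptoms_advisory_t, such a
 * client would resolve and connect to the control roughly as below;
 * CTL_FLAG_PRIVILEGED means the connect requires root.
 */
#if 0	/* userspace illustration only */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sys_domain.h>
#include <sys/kern_control.h>

static int
send_symptoms_advisory(const symptoms_advisory_t *sa)
{
	struct ctl_info info;
	struct sockaddr_ctl addr;
	int fd;

	if ((fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL)) < 0)
		return (-1);

	/* Resolve the dynamically assigned control ID by name. */
	memset(&info, 0, sizeof(info));
	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(info.ctl_name));
	if (ioctl(fd, CTLIOCGINFO, &info) < 0) {
		close(fd);
		return (-1);
	}

	memset(&addr, 0, sizeof(addr));
	addr.sc_len = sizeof(addr);
	addr.sc_family = AF_SYSTEM;
	addr.ss_sysaddr = AF_SYS_CONTROL;
	addr.sc_id = info.ctl_id;
	addr.sc_unit = 0;	/* let the kernel pick the unit */

	/* mptcp_symptoms_ctl_connect() admits only one client at a time. */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    send(fd, sa, sizeof(*sa), 0) < 0) {
		close(fd);
		return (-1);
	}
	close(fd);
	return (0);
}
#endif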
5532
5533int
5534mptcp_is_wifi_unusable(void)
5535{
5536 /* a zero return value indicates either no info or that wifi is OK */
5537 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5538}
5539
5540int
5541mptcp_is_cell_unusable(void)
5542{
5543 /* a zero return value indicates either no info or that cell is OK */
5544 return (mptcp_advisory.sa_cell_status & SYMPTOMS_ADVISORY_CELL_BAD);
5545}
5546
5547struct mptsub*
5548mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best)
5549{
5550 struct mptsub *cellsub = NULL;
5551 struct mptsub *wifisub = NULL;
5552 struct mptsub *wiredsub = NULL;
5553
5554 VERIFY((best != NULL) && (second_best != NULL));
5555
5556 if (!mptcp_use_symptomsd)
5557 return (NULL);
5558
5559 if (!mptcp_kern_skt_inuse)
5560 return (NULL);
5561
5562 /*
5563 * There could be devices with more than one wifi interface or
5564 * more than one wired or cell interface.
5565 * TBD: SymptomsD is unavailable on such platforms as of now.
5566 * Try to prefer the best subflow when possible in general.
5567 * Also, SymptomsD sends notifications about wifi only when it
5568 * is primary.
5569 */
5570 if (best->mpts_linktype & MPTSL_WIFI)
5571 wifisub = best;
5572 else if (best->mpts_linktype & MPTSL_CELL)
5573 cellsub = best;
5574 else if (best->mpts_linktype & MPTSL_WIRED)
5575 wiredsub = best;
5576
5577 /*
5578 * On platforms with wired paths, don't use hints about wifi or cell.
5579 * Currently, SymptomsD is not available on platforms with wired paths.
5580 */
5581 if (wiredsub)
5582 return (NULL);
5583
5584 if ((wifisub == NULL) && (second_best->mpts_linktype & MPTSL_WIFI))
5585 wifisub = second_best;
5586
5587 if ((cellsub == NULL) && (second_best->mpts_linktype & MPTSL_CELL))
5588 cellsub = second_best;
5589
5590 if ((wiredsub == NULL) && (second_best->mpts_linktype & MPTSL_WIRED))
5591 wiredsub = second_best;
5592
5593 if ((wifisub == best) && mptcp_is_wifi_unusable()) {
5594 tcpstat.tcps_mp_sel_symtomsd++;
5595 if (mptcp_is_cell_unusable()) {
5596 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5597 " suggests both Wifi and Cell are bad. Wired %s.",
5598 (wiredsub == NULL) ? "none" : "present"),
5599 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5600 return (wiredsub);
5601 } else {
5602 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5603 " suggests Wifi bad, Cell good. Wired %s.",
5604 (wiredsub == NULL) ? "none" : "present"),
5605 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5606 return ((wiredsub != NULL) ? wiredsub : cellsub);
5607 }
5608 }
5609
5610 if ((cellsub == best) && (mptcp_is_cell_unusable())) {
5611 tcpstat.tcps_mp_sel_symtomsd++;
5612 if (mptcp_is_wifi_unusable()) {
5613 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5614 " suggests both Cell and Wifi are bad. Wired %s.",
5615 (wiredsub == NULL) ? "none" : "present"),
5616 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5617 return (wiredsub);
5618 } else {
5619 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5620 " suggests Cell bad, Wifi good. Wired %s.",
5621 (wiredsub == NULL) ? "none" : "present"),
5622 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5623 return ((wiredsub != NULL) ? wiredsub : wifisub);
5624 }
5625 }
5626
5627 /* little is known about the state of the network or wifi is good */
39037602 5628 return (NULL);
3e170ce0 5629}
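/*
 * Illustrative summary of the decisions above (derived from the code,
 * not from the original comments):
 *
 *   best is wired                        -> NULL (hints not used)
 *   best is wifi, wifi bad, cell bad     -> wired second_best, else NULL
 *   best is wifi, wifi bad, cell not bad -> wired second_best, else cell
 *   best is cell, cell bad, wifi bad     -> wired second_best, else NULL
 *   best is cell, cell bad, wifi not bad -> wired second_best, else wifi
 *   otherwise                            -> NULL (keep default selection)
 */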
5630
5631/* If TFO data is successfully acked, it must be dropped from the mptcp so */
5632static void
39037602 5633mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts, int *wakeup)
5634{
5635 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5636 struct socket *so = mpts->mpts_socket;
5637 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5638 struct mptcb *mp_tp = mpte->mpte_mptcb;
5639
5640 /* If data was sent with SYN, rewind state */
5641 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5642 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5643 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5644 MPT_LOCK(mp_tp);
5645 u_int64_t mp_droplen = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
5646 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
5647 VERIFY(mp_droplen <= (UINT_MAX));
5648 VERIFY(mp_droplen >= tcp_droplen);
5649
5650 if (mp_droplen > tcp_droplen) {
5651 /* handle partial TCP ack */
5652 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5653 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
5654 mpts->mpts_sndnxt = mp_tp->mpt_sndnxt;
5655 mp_droplen = tcp_droplen;
5656 } else {
5657 /* all data on SYN was acked */
5658 mpts->mpts_rel_seq = 1;
5659 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5660 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
5661 }
5662 mp_tp->mpt_sndmax -= tcp_droplen;
5663
5664 MPT_UNLOCK(mp_tp);
5665 if (mp_droplen != 0) {
5666 VERIFY(mp_so->so_snd.sb_mb != NULL);
5667 sbdrop(&mp_so->so_snd, (int)mp_droplen);
5668 if (wakeup)
5669 *wakeup = 1;
5670 }
5671 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d "
5672 "TFO tcp len %d mptcp len %d\n", __func__,
5673 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid,
5674 tcp_droplen, mp_droplen),
5675 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5676 }
5677}
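/*
 * Worked example for the rewind arithmetic above, with illustrative
 * numbers (not taken from the original source): suppose 10 bytes of
 * MPTCP data were mapped onto the subflow's TFO SYN, so mp_droplen =
 * mpts_sndnxt - mpt_snduna = 10, but the peer acknowledged only 5 of
 * them, so tcp_droplen = snd_una - iss - 1 = 5.  The partial-ack
 * branch then rewinds mpt_sndnxt to mpt_snduna + (10 - 5), clamps
 * mp_droplen to 5, trims mpt_sndmax by 5, and sbdrop()s those 5
 * acknowledged bytes from the MPTCP send buffer.  Had all 10 bytes
 * been acknowledged, the else branch would instead reset mpts_rel_seq
 * to 1, pull both send-next pointers back to mpt_snduna, and drop all
 * 10 bytes.
 */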