]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/mptcp_subr.c
xnu-3789.31.2.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
CommitLineData
39236c6e 1/*
490019cf 2 * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
39236c6e
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
3e170ce0 50#include <net/if_var.h>
39236c6e
A
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/in_var.h>
54#include <netinet/tcp.h>
55#include <netinet/tcp_fsm.h>
56#include <netinet/tcp_seq.h>
57#include <netinet/tcp_var.h>
58#include <netinet/mptcp_var.h>
59#include <netinet/mptcp.h>
60#include <netinet/mptcp_seq.h>
61#include <netinet/mptcp_timer.h>
62#include <libkern/crypto/sha1.h>
63#if INET6
64#include <netinet6/in6_pcb.h>
65#include <netinet6/ip6protosw.h>
66#endif /* INET6 */
67#include <dev/random/randomdev.h>
68
39037602
A
/*
 * Declared here directly rather than pulled in through a header.  The
 * prototype shows it takes a proc_t and returns a string; presumably a
 * printable name for the process -- confirm against its definition.
 */
extern char *proc_best_name(proc_t);

/*
 * Notes on MPTCP implementation.
 *
 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
 * communication domain. The structure mtcbinfo describes the MPTCP instance
 * of a Multipath protocol in that domain. It is used to keep track of all
 * MPTCP PCB instances in the system, and is protected by the global lock
 * mppi_lock.
 *
 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
 * allocated from the same memory block, and each structure has a pointer
 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
 * PCB (mppcb) as well as the MPTCP Session (mptses).
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
 * in particular, it holds the list of subflows as well as the MPTCP thread.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets. Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure. Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow. This gets decremented prior to the subflow's destruction. The
 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
 *
 * To handle events (read, write, control) from the subflows, an MPTCP thread
 * is created; currently, there is one thread per MPTCP Session. In order to
 * prevent the MPTCP socket from being destroyed while being accessed by the
 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
 * which will be decremented prior to the thread's termination. The thread
 * lock (mpte_thread_lock) is used to synchronize its signalling.
 *
 * Lock ordering is defined as follows:
 *
 *	mtcbinfo (mppi_lock)
 *	mp_so (mpp_lock)
 *	mpts (mpts_lock)
 *	so (inpcb_mtx)
 *	mptcb (mpt_lock)
 *
 * It is not a requirement that all of the above locks be acquired in
 * succession, but the correct lock ordering must be followed when more
 * than one lock needs to be held. The MPTCP thread lock is not constrained
 * by this arrangement, because none of the other locks is ever acquired
 * while holding mpte_thread_lock; therefore it may be acquired at any
 * moment to signal the thread.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector. This process will take place once all
 * of the subflows have been destroyed, and the MPTCP thread has been
 * instructed to self-terminate.
 */

127static void mptcp_sesdestroy(struct mptses *);
128static void mptcp_thread_signal_locked(struct mptses *);
129static void mptcp_thread_terminate_signal(struct mptses *);
130static void mptcp_thread_dowork(struct mptses *);
131static void mptcp_thread_func(void *, wait_result_t);
132static void mptcp_thread_destroy(struct mptses *);
133static void mptcp_key_pool_init(void);
fe8ab488 134static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
39236c6e 135static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
39236c6e
A
136
137static uint32_t mptcp_gc(struct mppcbinfo *);
39236c6e
A
138static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
139static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
140static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
141 struct uio *, struct mbuf **, struct mbuf **, int *);
142static void mptcp_subflow_rupcall(struct socket *, void *, int);
143static void mptcp_subflow_input(struct mptses *, struct mptsub *);
144static void mptcp_subflow_wupcall(struct socket *, void *, int);
145static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
146static void mptcp_update_last_owner(struct mptsub *, struct socket *);
fe8ab488 147static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
3e170ce0 148static void mptcp_get_rtt_measurement(struct mptsub *, struct mptses *);
39037602 149static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *, int *);
39236c6e
A
150
/*
 * Possible return values for subflow event handlers.  Note that success
 * values must be greater than or equal to MPTS_EVRET_OK.  Values less than
 * that indicate errors or actions which require immediate attention; they
 * will prevent the rest of the handlers from processing their respective
 * events until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE		= 1,	/* delete this subflow */
	MPTS_EVRET_OK			= 2,	/* OK */
	MPTS_EVRET_CONNECT_PENDING	= 3,	/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,	/* abort all but preferred */
} ev_ret_t;

165static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
166static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *, uint64_t *);
167static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
168static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *, uint64_t *);
169static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *, uint64_t *);
170static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *);
171static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *);
172static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *);
173static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *, uint64_t *);
174static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *, uint64_t *);
175static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *);
176static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *);
177static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *);
178static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *);
179static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *, uint64_t *);
180static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *, uint64_t *);
181static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
fe8ab488 182
39236c6e
A
183static const char *mptcp_evret2str(ev_ret_t);
184
185static mptcp_key_t *mptcp_reserve_key(void);
186static int mptcp_do_sha1(mptcp_key_t *, char *, int);
490019cf 187static void mptcp_init_local_parms(struct mptcb *);
39236c6e
A
188
189static unsigned int mptsub_zone_size; /* size of mptsub */
190static struct zone *mptsub_zone; /* zone for mptsub */
191
192static unsigned int mptopt_zone_size; /* size of mptopt */
193static struct zone *mptopt_zone; /* zone for mptopt */
194
195static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
196static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
197
198struct mppcbinfo mtcbinfo;
199
200static struct mptcp_keys_pool_head mptcp_keys_pool;
201
#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */

205SYSCTL_DECL(_net_inet);
206
207SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
208
3e170ce0
A
209uint32_t mptcp_dbg_area = 0; /* more noise if greater than 1 */
210SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
211 &mptcp_dbg_area, 0, "MPTCP debug area");
212
213uint32_t mptcp_dbg_level = 0;
214SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &mptcp_dbg_level, 0, "MPTCP debug level");
216
39236c6e
A
217
218SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
219 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
220
221/*
222 * Since there is one kernel thread per mptcp socket, imposing an artificial
223 * limit on number of allowed mptcp sockets.
224 */
225uint32_t mptcp_socket_limit = MPPCB_LIMIT;
226SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
227 &mptcp_socket_limit, 0, "MPTCP socket limit");
228
fe8ab488
A
229/*
230 * SYSCTL to turn on delayed cellular subflow start.
231 */
232uint32_t mptcp_delayed_subf_start = 0;
233SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
234 &mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
235
236/*
3e170ce0 237 * sysctl to use network status hints from symptomsd
fe8ab488 238 */
3e170ce0
A
239uint32_t mptcp_use_symptomsd = 1;
240SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, usesymptoms, CTLFLAG_RW|CTLFLAG_LOCKED,
241 &mptcp_use_symptomsd, 0, "MPTCP Use SymptomsD");
fe8ab488 242
39236c6e
A
243static struct protosw mptcp_subflow_protosw;
244static struct pr_usrreqs mptcp_subflow_usrreqs;
245#if INET6
246static struct ip6protosw mptcp_subflow_protosw6;
247static struct pr_usrreqs mptcp_subflow_usrreqs6;
248#endif /* INET6 */
249
3e170ce0
A
250typedef struct mptcp_subflow_event_entry {
251 uint64_t sofilt_hint_mask;
252 ev_ret_t (*sofilt_hint_ev_hdlr)(
253 struct mptses *mpte,
254 struct mptsub *mpts,
255 uint64_t *p_mpsofilt_hint);
256} mptsub_ev_entry_t;
257
490019cf
A
258/*
259 * XXX The order of the event handlers below is really
260 * really important.
261 * SO_FILT_HINT_DELETEOK event has to be handled first,
262 * else we may end up missing on this event.
263 * Please read radar://24043716 for more details.
264 */
3e170ce0 265static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
490019cf
A
266 {
267 .sofilt_hint_mask = SO_FILT_HINT_DELETEOK,
268 .sofilt_hint_ev_hdlr = mptcp_deleteok_ev,
269 },
3e170ce0
A
270 {
271 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
272 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
273 },
274 {
275 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
276 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
277 },
278 {
279 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
280 .sofilt_hint_ev_hdlr = mptcp_subflow_connreset_ev,
281 },
282 {
283 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
284 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
285 },
286 {
287 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
288 .sofilt_hint_ev_hdlr = mptcp_subflow_cantrcvmore_ev,
289 },
290 { .sofilt_hint_mask = SO_FILT_HINT_CANTSENDMORE,
291 .sofilt_hint_ev_hdlr = mptcp_subflow_cantsendmore_ev,
292 },
293 {
294 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
295 .sofilt_hint_ev_hdlr = mptcp_subflow_timeout_ev,
296 },
297 {
298 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
299 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
300 },
301 {
302 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
303 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
304 },
305 {
306 .sofilt_hint_mask = SO_FILT_HINT_SUSPEND,
307 .sofilt_hint_ev_hdlr = mptcp_subflow_suspend_ev,
308 },
309 {
310 .sofilt_hint_mask = SO_FILT_HINT_RESUME,
311 .sofilt_hint_ev_hdlr = mptcp_subflow_resume_ev,
312 },
313 {
314 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
315 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
316 },
317 {
318 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
319 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
320 },
3e170ce0
A
321 {
322 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
323 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
324 },
325 {
326 .sofilt_hint_mask = SO_FILT_HINT_MPFASTJ,
327 .sofilt_hint_ev_hdlr = mptcp_fastjoin_ev,
328 }
329};
330
39236c6e
A
331/*
332 * Protocol pr_init callback.
333 */
334void
335mptcp_init(struct protosw *pp, struct domain *dp)
336{
337#pragma unused(dp)
338 static int mptcp_initialized = 0;
339 struct protosw *prp;
340#if INET6
341 struct ip6protosw *prp6;
342#endif /* INET6 */
343
344 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
345
346 /* do this only once */
347 if (mptcp_initialized)
348 return;
349 mptcp_initialized = 1;
350
351 /*
352 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
353 * we must be able to find IPPROTO_TCP entries for both.
354 */
355 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
356 VERIFY(prp != NULL);
357 bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
358 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
359 sizeof (mptcp_subflow_usrreqs));
360 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
361 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
362 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
363 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
364 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
365 /*
366 * Socket filters shouldn't attach/detach to/from this protosw
367 * since pr_protosw is to be used instead, which points to the
368 * real protocol; if they do, it is a bug and we should panic.
369 */
370 mptcp_subflow_protosw.pr_filter_head.tqh_first =
371 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
372 mptcp_subflow_protosw.pr_filter_head.tqh_last =
373 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
374
375#if INET6
376 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
377 IPPROTO_TCP, SOCK_STREAM);
378 VERIFY(prp6 != NULL);
379 bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
380 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
381 sizeof (mptcp_subflow_usrreqs6));
382 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
383 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
384 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
385 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
386 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
387 /*
388 * Socket filters shouldn't attach/detach to/from this protosw
389 * since pr_protosw is to be used instead, which points to the
390 * real protocol; if they do, it is a bug and we should panic.
391 */
392 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
393 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
394 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
395 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
396#endif /* INET6 */
397
398 bzero(&mtcbinfo, sizeof (mtcbinfo));
399 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
400 mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
401 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
402 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
403 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
404 /* NOTREACHED */
405 }
406 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
407 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
408
409 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
410 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
411 mtcbinfo.mppi_lock_grp_attr);
412 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
413 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
414 mtcbinfo.mppi_lock_attr);
39236c6e 415
3e170ce0 416 mtcbinfo.mppi_gc = mptcp_gc;
39236c6e 417 mtcbinfo.mppi_timer = mptcp_timer;
3e170ce0 418 mtcbinfo.mppi_pcbe_create = mptcp_sescreate;
39236c6e
A
419
420 /* attach to MP domain for garbage collection to take place */
421 mp_pcbinfo_attach(&mtcbinfo);
422
423 mptsub_zone_size = sizeof (struct mptsub);
424 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
425 8192, "mptsub")) == NULL) {
426 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
427 /* NOTREACHED */
428 }
429 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
430 zone_change(mptsub_zone, Z_EXPAND, TRUE);
431
432 mptopt_zone_size = sizeof (struct mptopt);
433 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
434 1024, "mptopt")) == NULL) {
435 panic("%s: unable to allocate MPTCP option zone\n", __func__);
436 /* NOTREACHED */
437 }
438 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
439 zone_change(mptopt_zone, Z_EXPAND, TRUE);
440
441 mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
442 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
443 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
444 panic("%s: unable to allocate MPTCP address auth zone \n",
445 __func__);
446 /* NOTREACHED */
447 }
448 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
449 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
450
451 /* Set up a list of unique keys */
452 mptcp_key_pool_init();
39236c6e
A
453}
454
455/*
456 * Create an MPTCP session, called as a result of opening a MPTCP socket.
457 */
3e170ce0 458void *
39236c6e
A
459mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
460{
461 struct mppcbinfo *mppi;
462 struct mptses *mpte;
463 struct mptcb *mp_tp;
464 int error = 0;
465
466 VERIFY(mpp != NULL);
467 mppi = mpp->mpp_pcbinfo;
468 VERIFY(mppi != NULL);
469
3e170ce0
A
470 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
471 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
39236c6e
A
472
473 /* MPTCP Multipath PCB Extension */
474 bzero(mpte, sizeof (*mpte));
475 VERIFY(mpp->mpp_pcbe == NULL);
476 mpp->mpp_pcbe = mpte;
477 mpte->mpte_mppcb = mpp;
478 mpte->mpte_mptcb = mp_tp;
479
480 TAILQ_INIT(&mpte->mpte_sopts);
481 TAILQ_INIT(&mpte->mpte_subflows);
3e170ce0
A
482 mpte->mpte_associd = SAE_ASSOCID_ANY;
483 mpte->mpte_connid_last = SAE_CONNID_ANY;
39236c6e
A
484
485 lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
486 mppi->mppi_lock_attr);
487
488 /*
489 * XXX: adi@apple.com
490 *
491 * This can be rather expensive if we have lots of MPTCP sockets,
492 * but we need a kernel thread for this model to work. Perhaps we
493 * could amortize the costs by having one worker thread per a group
494 * of MPTCP sockets.
495 */
496 if (kernel_thread_start(mptcp_thread_func, mpte,
497 &mpte->mpte_thread) != KERN_SUCCESS) {
498 error = ENOBUFS;
499 goto out;
500 }
501 mp_so->so_usecount++; /* for thread */
502
503 /* MPTCP Protocol Control Block */
504 bzero(mp_tp, sizeof (*mp_tp));
505 lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
506 mppi->mppi_lock_attr);
507 mp_tp->mpt_mpte = mpte;
3e170ce0 508 mp_tp->mpt_state = MPTCPS_CLOSED;
39236c6e
A
509out:
510 if (error != 0)
511 lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
512 DTRACE_MPTCP5(session__create, struct socket *, mp_so,
513 struct sockbuf *, &mp_so->so_rcv,
514 struct sockbuf *, &mp_so->so_snd,
515 struct mppcb *, mpp, int, error);
516
517 return ((error != 0) ? NULL : mpte);
518}
519
520/*
521 * Destroy an MPTCP session.
522 */
523static void
524mptcp_sesdestroy(struct mptses *mpte)
525{
526 struct mptcb *mp_tp;
527
528 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
529
530 mp_tp = mpte->mpte_mptcb;
531 VERIFY(mp_tp != NULL);
532
533 /*
534 * MPTCP Multipath PCB Extension section
535 */
536 mptcp_flush_sopts(mpte);
537 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
538
539 lck_mtx_destroy(&mpte->mpte_thread_lock,
540 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
541
542 /*
543 * MPTCP Protocol Control Block section
544 */
545 lck_mtx_destroy(&mp_tp->mpt_lock,
546 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
547
548 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
549 struct mptcb *, mp_tp);
550}
551
552/*
553 * Allocate an MPTCP socket option structure.
554 */
555struct mptopt *
556mptcp_sopt_alloc(int how)
557{
558 struct mptopt *mpo;
559
560 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
561 zalloc_noblock(mptopt_zone);
562 if (mpo != NULL) {
563 bzero(mpo, mptopt_zone_size);
564 }
565
566 return (mpo);
567}
568
569/*
570 * Free an MPTCP socket option structure.
571 */
572void
573mptcp_sopt_free(struct mptopt *mpo)
574{
575 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
576
577 zfree(mptopt_zone, mpo);
578}
579
580/*
581 * Add a socket option to the MPTCP socket option list.
582 */
583void
584mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
585{
586 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
587 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
588 mpo->mpo_flags |= MPOF_ATTACHED;
589 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
590}
591
592/*
593 * Remove a socket option from the MPTCP socket option list.
594 */
595void
596mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
597{
598 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
599 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
600 mpo->mpo_flags &= ~MPOF_ATTACHED;
601 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
602}
603
604/*
605 * Search for an existing <sopt_level,sopt_name> socket option.
606 */
607struct mptopt *
608mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
609{
610 struct mptopt *mpo;
611
612 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
613
614 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
615 if (mpo->mpo_level == sopt->sopt_level &&
616 mpo->mpo_name == sopt->sopt_name)
617 break;
618 }
619 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
620
621 return (mpo);
622}
623
624/*
625 * Flushes all recorded socket options from an MP socket.
626 */
627void
628mptcp_flush_sopts(struct mptses *mpte)
629{
630 struct mptopt *mpo, *tmpo;
631
632 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
633
634 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
635 mptcp_sopt_remove(mpte, mpo);
636 mptcp_sopt_free(mpo);
637 }
638 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
639}
640
641/*
642 * Allocate a MPTCP subflow structure.
643 */
644struct mptsub *
645mptcp_subflow_alloc(int how)
646{
647 struct mptsub *mpts;
648
649 mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
650 zalloc_noblock(mptsub_zone);
651 if (mpts != NULL) {
652 bzero(mpts, mptsub_zone_size);
653 lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
654 mtcbinfo.mppi_lock_attr);
655 }
656
657 return (mpts);
658}
659
660/*
661 * Deallocate a subflow structure, called when all of the references held
662 * on it have been released. This implies that the subflow has been deleted.
663 */
664void
665mptcp_subflow_free(struct mptsub *mpts)
666{
667 MPTS_LOCK_ASSERT_HELD(mpts);
668
669 VERIFY(mpts->mpts_refcnt == 0);
670 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
671 VERIFY(mpts->mpts_mpte == NULL);
672 VERIFY(mpts->mpts_socket == NULL);
673
674 if (mpts->mpts_src_sl != NULL) {
675 sockaddrlist_free(mpts->mpts_src_sl);
676 mpts->mpts_src_sl = NULL;
677 }
678 if (mpts->mpts_dst_sl != NULL) {
679 sockaddrlist_free(mpts->mpts_dst_sl);
680 mpts->mpts_dst_sl = NULL;
681 }
682 MPTS_UNLOCK(mpts);
683 lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
684
685 zfree(mptsub_zone, mpts);
686}
687
688/*
689 * Create an MPTCP subflow socket.
690 */
691static int
692mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
693 struct proc *p, struct socket **so)
694{
695 struct mptopt smpo, *mpo, *tmpo;
696 struct socket *mp_so;
697 int error;
698
699 *so = NULL;
700 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
701 mp_so = mpte->mpte_mppcb->mpp_socket;
702
703 /*
704 * Create the subflow socket (multipath subflow, non-blocking.)
705 *
706 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
707 * socket; it will be cleared when the socket is peeled off or closed.
708 * It also indicates to the underlying TCP to handle MPTCP options.
709 * A multipath subflow socket implies SS_NOFDREF state.
710 */
711 if ((error = socreate_internal(dom, so, SOCK_STREAM,
712 IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
3e170ce0
A
713 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate mp_so 0x%llx"
714 " unable to create subflow socket error %d\n",
715 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
716 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
717 return (error);
718 }
719
720 socket_lock(*so, 0);
721 VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
722 VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
723 (SS_NBIO|SS_NOFDREF));
724
725 /* prevent the socket buffers from being compressed */
726 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
727 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
728
490019cf
A
729 /* Inherit preconnect and TFO data flags */
730 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
731 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
732
733 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
734 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
735
39236c6e
A
736 bzero(&smpo, sizeof (smpo));
737 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
738 smpo.mpo_level = SOL_SOCKET;
739 smpo.mpo_intval = 1;
740
741 /* disable SIGPIPE */
742 smpo.mpo_name = SO_NOSIGPIPE;
743 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
744 goto out;
745
746 /* find out if the subflow's source address goes away */
747 smpo.mpo_name = SO_NOADDRERR;
748 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
749 goto out;
750
751 /* enable keepalive */
752 smpo.mpo_name = SO_KEEPALIVE;
753 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
754 goto out;
755
756 /*
757 * Limit the receive socket buffer size to 64k.
758 *
759 * We need to take into consideration the window scale option
760 * which could be negotiated in one subflow but disabled in
761 * another subflow.
762 * XXX This can be improved in the future.
763 */
764 smpo.mpo_name = SO_RCVBUF;
765 smpo.mpo_intval = MPTCP_RWIN_MAX;
766 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
767 goto out;
768
769 /* N.B.: set by sosetopt */
770 VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
771 /* Prevent automatic socket buffer sizing. */
772 (*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
773
774 smpo.mpo_level = IPPROTO_TCP;
775 smpo.mpo_intval = mptcp_subflow_keeptime;
776 smpo.mpo_name = TCP_KEEPALIVE;
777 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
778 goto out;
779
780 /* replay setsockopt(2) on the subflow sockets for eligible options */
781 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
782 int interim;
783
784 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
785 continue;
786
787 /*
788 * Skip those that are handled internally; these options
789 * should not have been recorded and marked with the
790 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
791 */
792 if (mpo->mpo_level == SOL_SOCKET &&
793 (mpo->mpo_name == SO_NOSIGPIPE ||
794 mpo->mpo_name == SO_NOADDRERR ||
795 mpo->mpo_name == SO_KEEPALIVE))
796 continue;
797
798 interim = (mpo->mpo_flags & MPOF_INTERIM);
799 if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
800 char buf[32];
3e170ce0
A
801 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate"
802 " mp_so 0x%llx"
803 " sopt %s val %d interim record removed\n",
39236c6e
A
804 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
805 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
806 buf, sizeof (buf)), mpo->mpo_intval),
807 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
808 mptcp_sopt_remove(mpte, mpo);
809 mptcp_sopt_free(mpo);
810 continue;
811 }
812 }
813
814 /*
815 * We need to receive everything that the subflow socket has,
816 * so use a customized socket receive function. We will undo
817 * this when the socket is peeled off or closed.
818 */
819 mpts->mpts_oprotosw = (*so)->so_proto;
820 switch (dom) {
821 case PF_INET:
822 (*so)->so_proto = &mptcp_subflow_protosw;
823 break;
824#if INET6
825 case PF_INET6:
826 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
827 break;
828#endif /* INET6 */
829 default:
830 VERIFY(0);
831 /* NOTREACHED */
832 }
833
834out:
835 socket_unlock(*so, 0);
836
837 DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
838 struct mptsub *, mpts, int, dom, int, error);
839
840 return (error);
841}
842
843/*
844 * Close an MPTCP subflow socket.
845 *
846 * Note that this may be called on an embryonic subflow, and the only
847 * thing that is guaranteed valid is the protocol-user request.
848 */
849static int
850mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
851{
852 MPTS_LOCK_ASSERT_HELD(mpts);
853
854 socket_lock(so, 0);
855 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
856 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
857
858 /* restore protocol-user requests */
859 VERIFY(mpts->mpts_oprotosw != NULL);
860 so->so_proto = mpts->mpts_oprotosw;
861 socket_unlock(so, 0);
862
863 mpts->mpts_socket = NULL; /* may already be NULL */
864
865 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
866 struct socket *, so,
867 struct sockbuf *, &so->so_rcv,
868 struct sockbuf *, &so->so_snd,
869 struct mptses *, mpts->mpts_mpte);
870
871 return (soclose(so));
872}
873
874/*
875 * Connect an MPTCP subflow socket.
876 *
877 * This may be called inline as part of adding a subflow, or asynchronously
878 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
879 * pending connect case, the subflow socket may have been bound to an interface
880 * and/or a source IP address which may no longer be around by the time this
881 * routine is called; in that case the connect attempt will most likely fail.
882 */
883static int
884mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
885{
886 struct socket *so;
887 int af, error;
888
889 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
890 MPTS_LOCK_ASSERT_HELD(mpts);
891
892 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
893 MPTSF_CONNECTING);
894 VERIFY(mpts->mpts_socket != NULL);
895 so = mpts->mpts_socket;
896 af = mpts->mpts_family;
897
898 if (af == AF_INET || af == AF_INET6) {
899 struct sockaddr_entry *dst_se;
900 char dbuf[MAX_IPv6_STR_LEN];
901
902 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
903 VERIFY(dst_se != NULL);
904
3e170ce0
A
905 mptcplog((LOG_DEBUG, "MPTCP Socket: connectx mp_so 0x%llx "
906 "dst %s[%d] cid %d [pended %s]\n",
39236c6e
A
907 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
908 inet_ntop(af, ((af == AF_INET) ?
909 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
910 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
911 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
912 ntohs(SIN(dst_se->se_addr)->sin_port) :
913 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
914 mpts->mpts_connid,
915 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
3e170ce0
A
916 "YES" : "NO")),
917 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
918 }
919
920 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
921
922 socket_lock(so, 0);
fe8ab488 923 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
39037602 924
39236c6e
A
925 /* connect the subflow socket */
926 error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
927 mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
3e170ce0
A
928 mpte->mpte_associd, NULL, CONNREQF_MPTCP,
929 &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr), NULL, NULL);
39236c6e
A
930 socket_unlock(so, 0);
931
fe8ab488
A
932 /* Allocate a unique address id per subflow */
933 mpte->mpte_addrid_last++;
934 if (mpte->mpte_addrid_last == 0)
935 mpte->mpte_addrid_last++;
936
39236c6e
A
937 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
938 struct mptsub *, mpts, int, error);
939
940 return (error);
941}
942
943/*
944 * MPTCP subflow socket receive routine, derived from soreceive().
945 */
946static int
947mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
948 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
949{
950#pragma unused(uio)
951 int flags, error = 0;
952 struct proc *p = current_proc();
953 struct mbuf *m, **mp = mp0;
954 struct mbuf *nextrecord;
955
956 socket_lock(so, 1);
957 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
958
959#ifdef MORE_LOCKING_DEBUG
960 if (so->so_usecount == 1) {
961 panic("%s: so=%x no other reference on socket\n", __func__, so);
962 /* NOTREACHED */
963 }
964#endif
965 /*
966 * We return all that is there in the subflow's socket receive buffer
967 * to the MPTCP layer, so we require that the caller passes in the
968 * expected parameters.
969 */
970 if (mp == NULL || controlp != NULL) {
971 socket_unlock(so, 1);
972 return (EINVAL);
973 }
974 *mp = NULL;
975 if (psa != NULL)
976 *psa = NULL;
977 if (flagsp != NULL)
978 flags = *flagsp &~ MSG_EOR;
979 else
980 flags = 0;
981
982 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
983 socket_unlock(so, 1);
984 return (EOPNOTSUPP);
985 }
986 flags |= (MSG_DONTWAIT|MSG_NBIO);
987
988 /*
989 * If a recv attempt is made on a previously-accepted socket
990 * that has been marked as inactive (disconnected), reject
991 * the request.
992 */
993 if (so->so_flags & SOF_DEFUNCT) {
994 struct sockbuf *sb = &so->so_rcv;
995
996 error = ENOTCONN;
39037602
A
997 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
998 __func__, proc_pid(p), proc_best_name(p),
999 (uint64_t)VM_KERNEL_ADDRPERM(so),
1000 SOCK_DOM(so), SOCK_TYPE(so), error);
39236c6e
A
1001 /*
1002 * This socket should have been disconnected and flushed
1003 * prior to being returned from sodefunct(); there should
1004 * be no data on its receive list, so panic otherwise.
1005 */
1006 if (so->so_state & SS_DEFUNCT)
1007 sb_empty_assert(sb, __func__);
1008 socket_unlock(so, 1);
1009 return (error);
1010 }
1011
1012 /*
1013 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1014 * and if so just return to the caller. This could happen when
1015 * soreceive() is called by a socket upcall function during the
1016 * time the socket is freed. The socket buffer would have been
1017 * locked across the upcall, therefore we cannot put this thread
1018 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1019 * we may livelock), because the lock on the socket buffer will
1020 * only be released when the upcall routine returns to its caller.
1021 * Because the socket has been officially closed, there can be
1022 * no further read on it.
1023 *
1024 * A multipath subflow socket would have its SS_NOFDREF set by
1025 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1026 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1027 */
1028 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1029 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
1030 socket_unlock(so, 1);
1031 return (0);
1032 }
1033
1034 /*
1035 * For consistency with soreceive() semantics, we need to obey
1036 * SB_LOCK in case some other code path has locked the buffer.
1037 */
1038 error = sblock(&so->so_rcv, 0);
1039 if (error != 0) {
1040 socket_unlock(so, 1);
1041 return (error);
1042 }
1043
1044 m = so->so_rcv.sb_mb;
1045 if (m == NULL) {
1046 /*
1047 * Panic if we notice inconsistencies in the socket's
1048 * receive list; both sb_mb and sb_cc should correctly
1049 * reflect the contents of the list, otherwise we may
1050 * end up with false positives during select() or poll()
1051 * which could put the application in a bad state.
1052 */
1053 SB_MB_CHECK(&so->so_rcv);
1054
1055 if (so->so_error != 0) {
1056 error = so->so_error;
1057 so->so_error = 0;
1058 goto release;
1059 }
1060
1061 if (so->so_state & SS_CANTRCVMORE) {
1062 goto release;
1063 }
1064
1065 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1066 error = ENOTCONN;
1067 goto release;
1068 }
1069
1070 /*
1071 * MSG_DONTWAIT is implicitly defined and this routine will
1072 * never block, so return EWOULDBLOCK when there is nothing.
1073 */
1074 error = EWOULDBLOCK;
1075 goto release;
1076 }
1077
1078 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1079 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1080 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1081
1082 while (m != NULL) {
1083 nextrecord = m->m_nextpkt;
1084 sbfree(&so->so_rcv, m);
1085
1086 if (mp != NULL) {
1087 *mp = m;
1088 mp = &m->m_next;
1089 so->so_rcv.sb_mb = m = m->m_next;
1090 *mp = NULL;
1091 }
1092
1093 if (m != NULL) {
1094 m->m_nextpkt = nextrecord;
1095 if (nextrecord == NULL)
1096 so->so_rcv.sb_lastrecord = m;
1097 } else {
1098 m = so->so_rcv.sb_mb = nextrecord;
1099 SB_EMPTY_FIXUP(&so->so_rcv);
1100 }
1101 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1102 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1103 }
1104
1105 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1106 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1107 /* notify protocol that we drained all the data */
1108 if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
1109 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1110
1111 if (flagsp != NULL)
1112 *flagsp |= flags;
1113
1114release:
1115 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
1116 return (error);
1117
1118}
1119
1120
1121/*
1122 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1123 * the work done earlier when the subflow socket was created.
1124 */
1125void
1126mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1127 struct socket *so)
1128{
1129 struct mptopt smpo;
1130 struct socket *mp_so;
1131 int p, c;
1132
1133 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1134 mp_so = mpte->mpte_mppcb->mpp_socket;
1135 MPTS_LOCK_ASSERT_HELD(mpts);
1136
1137 socket_lock(so, 0);
1138 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1139 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1140
1141 /* inherit MPTCP socket states */
1142 if (!(mp_so->so_state & SS_NBIO))
1143 so->so_state &= ~SS_NBIO;
1144
1145 /*
1146 * At this point, the socket is not yet closed, as there is at least
1147 * one outstanding usecount previously held by mpts_socket from
1148 * socreate(). Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1149 */
1150 so->so_flags &= ~SOF_MP_SUBFLOW;
1151 so->so_state &= ~SS_NOFDREF;
fe8ab488 1152 so->so_flags &= ~SOF_MPTCP_TRUE;
39236c6e
A
1153
1154 /* allow socket buffers to be compressed */
1155 so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1156 so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1157
1158 /*
1159 * Allow socket buffer auto sizing.
1160 *
1161 * This will increase the current 64k buffer size to whatever is best.
1162 */
39037602 1163 if (!(so->so_rcv.sb_flags & SB_USRSIZE))
fe8ab488
A
1164 so->so_rcv.sb_flags |= SB_AUTOSIZE;
1165 if (!(so->so_snd.sb_flags & SB_USRSIZE))
1166 so->so_snd.sb_flags |= SB_AUTOSIZE;
39236c6e
A
1167
1168 /* restore protocol-user requests */
1169 VERIFY(mpts->mpts_oprotosw != NULL);
1170 so->so_proto = mpts->mpts_oprotosw;
1171
1172 bzero(&smpo, sizeof (smpo));
1173 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1174 smpo.mpo_level = SOL_SOCKET;
1175
1176 /* inherit SOF_NOSIGPIPE from parent MP socket */
1177 p = (mp_so->so_flags & SOF_NOSIGPIPE);
1178 c = (so->so_flags & SOF_NOSIGPIPE);
1179 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1180 smpo.mpo_name = SO_NOSIGPIPE;
1181 if ((p - c) != 0)
1182 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1183
1184 /* inherit SOF_NOADDRAVAIL from parent MP socket */
1185 p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1186 c = (so->so_flags & SOF_NOADDRAVAIL);
1187 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1188 smpo.mpo_name = SO_NOADDRERR;
1189 if ((p - c) != 0)
1190 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1191
1192 /* inherit SO_KEEPALIVE from parent MP socket */
1193 p = (mp_so->so_options & SO_KEEPALIVE);
1194 c = (so->so_options & SO_KEEPALIVE);
1195 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1196 smpo.mpo_name = SO_KEEPALIVE;
1197 if ((p - c) != 0)
1198 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1199
1200 /* unset TCP level default keepalive option */
1201 p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1202 c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1203 smpo.mpo_level = IPPROTO_TCP;
1204 smpo.mpo_intval = 0;
1205 smpo.mpo_name = TCP_KEEPALIVE;
1206 if ((p - c) != 0)
1207 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1208 socket_unlock(so, 0);
1209
1210 DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1211 struct mptsub *, mpts, struct socket *, so,
1212 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1213}
1214
1215/*
1216 * Establish an initial MPTCP connection (if first subflow and not yet
1217 * connected), or add a subflow to an existing MPTCP connection.
1218 */
1219int
1220mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1221 struct proc *p, uint32_t ifscope)
1222{
1223 struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
1224 struct socket *mp_so, *so = NULL;
1225 struct mptsub_connreq mpcr;
1226 struct mptcb *mp_tp;
1227 int af, error = 0;
1228
1229 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1230 mp_so = mpte->mpte_mppcb->mpp_socket;
1231 mp_tp = mpte->mpte_mptcb;
1232
fe8ab488
A
1233 MPT_LOCK(mp_tp);
1234 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1235 /* If the remote end sends Data FIN, refuse subflow adds */
1236 error = ENOTCONN;
1237 MPT_UNLOCK(mp_tp);
1238 return (error);
1239 }
1240 MPT_UNLOCK(mp_tp);
1241
39236c6e
A
1242 MPTS_LOCK(mpts);
1243 VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1244 VERIFY(mpts->mpts_mpte == NULL);
1245 VERIFY(mpts->mpts_socket == NULL);
1246 VERIFY(mpts->mpts_dst_sl != NULL);
3e170ce0 1247 VERIFY(mpts->mpts_connid == SAE_CONNID_ANY);
39236c6e
A
1248
1249 /* select source (if specified) and destination addresses */
1250 if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
1251 &mpts->mpts_dst_sl, &dst_se)) != 0)
1252 goto out;
1253
1254 VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
1255 VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
1256 af = mpts->mpts_family = dst_se->se_addr->sa_family;
1257 VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
1258 VERIFY(af == AF_INET || af == AF_INET6);
1259
1260 /*
1261 * If the source address is not specified, allocate a storage for
1262 * it, so that later on we can fill it in with the actual source
1263 * IP address chosen by the underlying layer for the subflow after
1264 * it is connected.
1265 */
1266 if (mpts->mpts_src_sl == NULL) {
1267 mpts->mpts_src_sl =
1268 sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
1269 if (mpts->mpts_src_sl == NULL) {
1270 error = ENOBUFS;
1271 goto out;
1272 }
1273 se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
1274 VERIFY(se != NULL && se->se_addr != NULL &&
1275 se->se_addr->sa_len == dst_se->se_addr->sa_len);
1276 bzero(se->se_addr, se->se_addr->sa_len);
1277 se->se_addr->sa_len = dst_se->se_addr->sa_len;
1278 se->se_addr->sa_family = dst_se->se_addr->sa_family;
1279 }
1280
1281 /* create the subflow socket */
1282 if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1283 goto out;
1284
1285 /*
3e170ce0
A
1286 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
1287 * -1 (SAE_CONNID_ALL).
39236c6e
A
1288 */
1289 mpte->mpte_connid_last++;
3e170ce0
A
1290 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
1291 mpte->mpte_connid_last == SAE_CONNID_ANY)
39236c6e
A
1292 mpte->mpte_connid_last++;
1293
1294 mpts->mpts_connid = mpte->mpte_connid_last;
3e170ce0
A
1295 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1296 mpts->mpts_connid != SAE_CONNID_ALL);
490019cf
A
1297
1298 mpts->mpts_rel_seq = 1;
1299
fe8ab488
A
1300 /* Allocate a unique address id per subflow */
1301 mpte->mpte_addrid_last++;
1302 if (mpte->mpte_addrid_last == 0)
1303 mpte->mpte_addrid_last++;
39236c6e
A
1304
1305 /* bind subflow socket to the specified interface */
1306 if (ifscope != IFSCOPE_NONE) {
1307 socket_lock(so, 0);
1308 error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1309 if (error != 0) {
1310 socket_unlock(so, 0);
1311 (void) mptcp_subflow_soclose(mpts, so);
1312 goto out;
1313 }
1314 VERIFY(mpts->mpts_outif != NULL);
1315 mpts->mpts_flags |= MPTSF_BOUND_IF;
1316
39037602
A
1317 if (IFNET_IS_EXPENSIVE(mpts->mpts_outif)) {
1318 sototcpcb(so)->t_mpflags |= TMPF_BACKUP_PATH;
1319 } else {
1320 mpts->mpts_flags |= MPTSF_PREFERRED;
1321 }
1322
3e170ce0 1323 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add mp_so 0x%llx "
39037602 1324 "bindif %s[%d] cid %d expensive %d\n",
39236c6e
A
1325 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1326 mpts->mpts_outif->if_xname,
39037602
A
1327 ifscope, mpts->mpts_connid,
1328 IFNET_IS_EXPENSIVE(mpts->mpts_outif)),
3e170ce0 1329 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
1330 socket_unlock(so, 0);
1331 }
1332
1333 /* if source address and/or port is specified, bind to it */
1334 if (src_se != NULL) {
1335 struct sockaddr *sa = src_se->se_addr;
1336 uint32_t mpts_flags = 0;
1337 in_port_t lport;
1338
1339 switch (af) {
1340 case AF_INET:
1341 if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1342 mpts_flags |= MPTSF_BOUND_IP;
1343 if ((lport = SIN(sa)->sin_port) != 0)
1344 mpts_flags |= MPTSF_BOUND_PORT;
1345 break;
1346#if INET6
1347 case AF_INET6:
1348 VERIFY(af == AF_INET6);
1349 if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1350 mpts_flags |= MPTSF_BOUND_IP;
1351 if ((lport = SIN6(sa)->sin6_port) != 0)
1352 mpts_flags |= MPTSF_BOUND_PORT;
1353 break;
1354#endif /* INET6 */
1355 }
1356
1357 error = sobindlock(so, sa, 1); /* will lock/unlock socket */
1358 if (error != 0) {
1359 (void) mptcp_subflow_soclose(mpts, so);
1360 goto out;
1361 }
1362 mpts->mpts_flags |= mpts_flags;
1363
1364 if (af == AF_INET || af == AF_INET6) {
1365 char sbuf[MAX_IPv6_STR_LEN];
1366
3e170ce0
A
1367 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add "
1368 "mp_so 0x%llx bindip %s[%d] cid %d\n",
39236c6e
A
1369 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1370 inet_ntop(af, ((af == AF_INET) ?
1371 (void *)&SIN(sa)->sin_addr.s_addr :
1372 (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
3e170ce0
A
1373 ntohs(lport), mpts->mpts_connid),
1374 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1375 }
1376 }
1377
1378 /*
1379 * Insert the subflow into the list, and associate the MPTCP PCB
1380 * as well as the the subflow socket. From this point on, removing
1381 * the subflow needs to be done via mptcp_subflow_del().
1382 */
1383 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1384 mpte->mpte_numflows++;
1385
1386 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1387 mpts->mpts_mpte = mpte;
1388 mpts->mpts_socket = so;
1389 MPTS_ADDREF_LOCKED(mpts); /* for being in MPTCP subflow list */
1390 MPTS_ADDREF_LOCKED(mpts); /* for subflow socket */
1391 mp_so->so_usecount++; /* for subflow socket */
1392
1393 /* register for subflow socket read/write events */
1394 (void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1395 mptcp_subflow_wupcall, mpts);
1396
1397 /*
1398 * Register for subflow socket control events; ignore
1399 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1400 * will generate it here.
1401 */
1402 (void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1403 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1404 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1405 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1406 SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1407 SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1408 SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
fe8ab488
A
1409 SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
1410 SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);
39236c6e
A
1411
1412 /* sanity check */
1413 VERIFY(!(mpts->mpts_flags &
1414 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1415
1416 bzero(&mpcr, sizeof (mpcr));
1417 mpcr.mpcr_proc = p;
1418 mpcr.mpcr_ifscope = ifscope;
1419 /*
1420 * Indicate to the TCP subflow whether or not it should establish
1421 * the initial MPTCP connection, or join an existing one. Fill
1422 * in the connection request structure with additional info needed
1423 * by the underlying TCP (to be used in the TCP options, etc.)
1424 */
1425 MPT_LOCK(mp_tp);
1426 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1427 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
490019cf 1428 mptcp_init_local_parms(mp_tp);
39236c6e
A
1429 }
1430 MPT_UNLOCK(mp_tp);
1431 soisconnecting(mp_so);
1432 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1433 } else {
1434 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1435 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
fe8ab488
A
1436
1437 /* avoid starting up cellular subflow unless required */
1438 if ((mptcp_delayed_subf_start) &&
1439 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
1440 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1441 }
39236c6e
A
1442 MPT_UNLOCK(mp_tp);
1443 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1444 }
1445
490019cf
A
1446 /* If fastjoin or fastopen is requested, set state in mpts */
1447 if (mpte->mpte_nummpcapflows == 0) {
1448 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1449 MPT_LOCK(mp_tp);
1450 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
1451 mpts->mpts_flags |= MPTSF_TFO_REQD;
1452 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1453 }
1454 MPT_UNLOCK(mp_tp);
1455 }
1456
1457 if (so->so_flags & SOF_MPTCP_FASTJOIN) {
1458 MPT_LOCK(mp_tp);
1459 if (mp_tp->mpt_state == MPTCPS_ESTABLISHED) {
1460 mpts->mpts_flags |= MPTSF_FASTJ_REQD;
1461 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1462 }
1463 MPT_UNLOCK(mp_tp);
1464 }
1465 }
1466
39236c6e
A
1467 mpts->mpts_mpcr = mpcr;
1468 mpts->mpts_flags |= MPTSF_CONNECTING;
1469
1470 if (af == AF_INET || af == AF_INET6) {
1471 char dbuf[MAX_IPv6_STR_LEN];
1472
3e170ce0
A
1473 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
1474 "mp_so 0x%llx dst %s[%d] cid %d "
39236c6e
A
1475 "[pending %s]\n", __func__,
1476 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1477 inet_ntop(af, ((af == AF_INET) ?
1478 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
1479 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
1480 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1481 ntohs(SIN(dst_se->se_addr)->sin_port) :
1482 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
1483 mpts->mpts_connid,
1484 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
3e170ce0
A
1485 "YES" : "NO")),
1486 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1487 }
1488
1489 /* connect right away if first attempt, or if join can be done now */
1490 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1491 error = mptcp_subflow_soconnectx(mpte, mpts);
1492
1493out:
1494 MPTS_UNLOCK(mpts);
1495 if (error == 0) {
1496 soevent(mp_so, SO_FILT_HINT_LOCKED |
1497 SO_FILT_HINT_CONNINFO_UPDATED);
1498 }
1499 return (error);
1500}
1501
39236c6e
A
1502/*
1503 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
1504 * will no longer be accessible after a subflow is deleted, thus this
1505 * should occur only after the subflow socket has been disconnected.
1506 * If peeloff(2) is called, leave the socket open.
1507 */
1508void
1509mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1510{
1511 struct socket *mp_so, *so;
1512
1513 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1514 mp_so = mpte->mpte_mppcb->mpp_socket;
1515
1516 MPTS_LOCK(mpts);
1517 so = mpts->mpts_socket;
1518 VERIFY(so != NULL);
39037602 1519
fe8ab488
A
1520 if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
1521 (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
1522 MPTS_UNLOCK(mpts);
3e170ce0
A
1523 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del returning"
1524 " mp_so 0x%llx flags %x\n",
1525 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_flags),
1526 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
1527 return;
1528 }
39236c6e 1529
3e170ce0
A
1530 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del mp_so 0x%llx "
1531 "[u=%d,r=%d] cid %d [close %s] %d %x error %d\n",
39236c6e
A
1532 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1533 mp_so->so_usecount,
1534 mp_so->so_retaincnt, mpts->mpts_connid,
1535 (close ? "YES" : "NO"), mpts->mpts_soerror,
3e170ce0
A
1536 mpts->mpts_flags,
1537 mp_so->so_error),
1538 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1539
1540 VERIFY(mpts->mpts_mpte == mpte);
3e170ce0
A
1541 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1542 mpts->mpts_connid != SAE_CONNID_ALL);
39236c6e
A
1543
1544 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1545 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1546 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1547 VERIFY(mpte->mpte_numflows != 0);
1548 mpte->mpte_numflows--;
fe8ab488
A
1549 if (mpte->mpte_active_sub == mpts)
1550 mpte->mpte_active_sub = NULL;
39236c6e
A
1551
1552 /*
1553 * Drop references held by this subflow socket; there
1554 * will be no further upcalls made from this point.
1555 */
1556 (void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1557 (void) sock_catchevents(so, NULL, NULL, 0);
fe8ab488 1558
39236c6e 1559 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
39037602 1560
39236c6e
A
1561 if (close)
1562 (void) mptcp_subflow_soclose(mpts, so);
1563
d190cdc3 1564 VERIFY(mp_so->so_usecount > 0);
39236c6e
A
1565 mp_so->so_usecount--; /* for subflow socket */
1566 mpts->mpts_mpte = NULL;
1567 mpts->mpts_socket = NULL;
1568 MPTS_UNLOCK(mpts);
1569
1570 MPTS_REMREF(mpts); /* for MPTCP subflow list */
1571 MPTS_REMREF(mpts); /* for subflow socket */
1572
1573 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1574}
1575
1576/*
1577 * Disconnect a subflow socket.
1578 */
1579void
1580mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1581 boolean_t deleteok)
1582{
1583 struct socket *so;
1584 struct mptcb *mp_tp;
1585 int send_dfin = 0;
1586
1587 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1588 MPTS_LOCK_ASSERT_HELD(mpts);
1589
1590 VERIFY(mpts->mpts_mpte == mpte);
1591 VERIFY(mpts->mpts_socket != NULL);
3e170ce0
A
1592 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1593 mpts->mpts_connid != SAE_CONNID_ALL);
39236c6e
A
1594
1595 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1596 return;
1597
1598 mpts->mpts_flags |= MPTSF_DISCONNECTING;
1599
1600 /*
1601 * If this is coming from disconnectx(2) or issued as part of
1602 * closing the MPTCP socket, the subflow shouldn't stick around.
1603 * Otherwise let it linger around in case the upper layers need
1604 * to retrieve its conninfo.
1605 */
1606 if (deleteok)
1607 mpts->mpts_flags |= MPTSF_DELETEOK;
1608
1609 so = mpts->mpts_socket;
1610 mp_tp = mpte->mpte_mptcb;
1611 MPT_LOCK(mp_tp);
1612 if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1613 send_dfin = 1;
1614 MPT_UNLOCK(mp_tp);
1615
1616 socket_lock(so, 0);
1617 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1618 (so->so_state & SS_ISCONNECTED)) {
3e170ce0
A
1619 mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d "
1620 "[linger %s]\n", __func__, mpts->mpts_connid, send_dfin,
1621 (deleteok ? "NO" : "YES")),
1622 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
1623
1624 if (send_dfin)
1625 mptcp_send_dfin(so);
1626 (void) soshutdownlock(so, SHUT_RD);
1627 (void) soshutdownlock(so, SHUT_WR);
1628 (void) sodisconnectlocked(so);
1629 }
1630 socket_unlock(so, 0);
1631 /*
1632 * Generate a disconnect event for this subflow socket, in case
1633 * the lower layer doesn't do it; this is needed because the
1634 * subflow socket deletion relies on it. This will also end up
1635 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1636 * we cannot do that here because subflow lock is currently held.
1637 */
1638 mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1639}
1640
1641/*
1642 * Subflow socket read upcall.
1643 *
1644 * Called when the associated subflow socket posted a read event. The subflow
1645 * socket lock has been released prior to invoking the callback. Note that the
1646 * upcall may occur synchronously as a result of MPTCP performing an action on
1647 * it, or asynchronously as a result of an event happening at the subflow layer.
1648 * Therefore, to maintain lock ordering, the only lock that can be acquired
1649 * here is the thread lock, for signalling purposes.
1650 */
1651static void
1652mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1653{
1654#pragma unused(so, waitf)
1655 struct mptsub *mpts = arg;
1656 struct mptses *mpte = mpts->mpts_mpte;
1657
39037602
A
1658 /*
1659 * mpte should never be NULL, except in a race with
1660 * mptcp_subflow_del
fe8ab488
A
1661 */
1662 if (mpte == NULL)
1663 return;
39236c6e
A
1664
1665 lck_mtx_lock(&mpte->mpte_thread_lock);
1666 mptcp_thread_signal_locked(mpte);
1667 lck_mtx_unlock(&mpte->mpte_thread_lock);
1668}
1669
1670/*
1671 * Subflow socket input.
1672 *
1673 * Called in the context of the MPTCP thread, for reading data from the
1674 * underlying subflow socket and delivering it to MPTCP.
1675 */
1676static void
1677mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1678{
1679 struct mbuf *m = NULL;
1680 struct socket *so;
1681 int error;
1682 struct mptsub *mpts_alt = NULL;
1683
1684 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1685 MPTS_LOCK_ASSERT_HELD(mpts);
1686
39037602 1687 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
39236c6e
A
1688 struct mptsub *, mpts);
1689
1690 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1691 return;
1692
1693 so = mpts->mpts_socket;
1694
1695 error = sock_receive_internal(so, NULL, &m, 0, NULL);
1696 if (error != 0 && error != EWOULDBLOCK) {
3e170ce0
A
1697 mptcplog((LOG_ERR, "MPTCP Receiver: %s cid %d error %d\n",
1698 __func__, mpts->mpts_connid, error),
1699 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
39236c6e 1700 MPTS_UNLOCK(mpts);
3e170ce0 1701 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e 1702 if (mpts_alt == NULL) {
fe8ab488
A
1703 if (mptcp_delayed_subf_start) {
1704 mpts_alt = mptcp_get_pending_subflow(mpte,
1705 mpts);
1706 if (mpts_alt) {
3e170ce0
A
1707 mptcplog((LOG_DEBUG,"MPTCP Receiver:"
1708 " %s: pending %d\n",
1709 __func__, mpts_alt->mpts_connid),
1710 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1711 } else {
3e170ce0
A
1712 mptcplog((LOG_ERR, "MPTCP Receiver:"
1713 " %s: no pending flow for cid %d",
1714 __func__, mpts->mpts_connid),
1715 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488
A
1716 }
1717 } else {
3e170ce0 1718 mptcplog((LOG_ERR, "MPTCP Receiver: %s: no alt"
39037602 1719 " path for cid %d\n", __func__,
3e170ce0
A
1720 mpts->mpts_connid),
1721 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1722 }
490019cf
A
1723 if (error == ENODATA) {
1724 /*
1725 * Don't ignore ENODATA so as to discover
1726 * nasty middleboxes.
1727 */
1728 struct socket *mp_so =
1729 mpte->mpte_mppcb->mpp_socket;
1730 mp_so->so_error = ENODATA;
1731 sorwakeup(mp_so);
1732 }
39236c6e
A
1733 }
1734 MPTS_LOCK(mpts);
1735 } else if (error == 0) {
3e170ce0
A
1736 mptcplog((LOG_DEBUG, "MPTCP Receiver: %s: cid %d \n",
1737 __func__, mpts->mpts_connid),
1738 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
1739 }
1740
1741 /* In fallback, make sure to accept data on all but one subflow */
1742 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1743 (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1744 m_freem(m);
1745 return;
1746 }
1747
1748 if (m != NULL) {
3e170ce0
A
1749
1750 /* Did we receive data on the backup subflow? */
1751 if (!(mpts->mpts_flags & MPTSF_ACTIVE))
1752 mpts->mpts_peerswitch++;
1753 else
1754 mpts->mpts_peerswitch = 0;
1755
39236c6e
A
1756 /*
1757 * Release subflow lock since this may trigger MPTCP to send,
1758 * possibly on a different subflow. An extra reference has
1759 * been held on the subflow by the MPTCP thread before coming
1760 * here, so we can be sure that it won't go away, in the event
1761 * the MP socket lock gets released.
1762 */
1763 MPTS_UNLOCK(mpts);
1764 mptcp_input(mpte, m);
1765 MPTS_LOCK(mpts);
1766 }
1767}
1768
1769/*
1770 * Subflow socket write upcall.
1771 *
1772 * Called when the associated subflow socket posted a read event. The subflow
1773 * socket lock has been released prior to invoking the callback. Note that the
1774 * upcall may occur synchronously as a result of MPTCP performing an action on
1775 * it, or asynchronously as a result of an event happening at the subflow layer.
1776 * Therefore, to maintain lock ordering, the only lock that can be acquired
1777 * here is the thread lock, for signalling purposes.
1778 */
1779static void
1780mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1781{
1782#pragma unused(so, waitf)
1783 struct mptsub *mpts = arg;
1784 struct mptses *mpte = mpts->mpts_mpte;
1785
fe8ab488 1786 /*
490019cf 1787 * mpte should never be NULL except in a race with
fe8ab488
A
1788 * mptcp_subflow_del which doesn't hold socket lock across critical
1789 * section. This upcall is made after releasing the socket lock.
1790 * Interleaving of socket operations becomes possible therefore.
1791 */
1792 if (mpte == NULL)
1793 return;
39236c6e
A
1794
1795 lck_mtx_lock(&mpte->mpte_thread_lock);
1796 mptcp_thread_signal_locked(mpte);
1797 lck_mtx_unlock(&mpte->mpte_thread_lock);
1798}
1799
1800/*
1801 * Subflow socket output.
1802 *
1803 * Called for sending data from MPTCP to the underlying subflow socket.
1804 */
1805int
1806mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1807{
1808 struct socket *mp_so, *so;
1809 size_t sb_cc = 0, tot_sent = 0;
1810 struct mbuf *sb_mb;
39037602 1811 int error = 0, wakeup = 0;
39236c6e
A
1812 u_int64_t mpt_dsn = 0;
1813 struct mptcb *mp_tp = mpte->mpte_mptcb;
1814 struct mbuf *mpt_mbuf = NULL;
fe8ab488
A
1815 u_int64_t off = 0;
1816 struct mbuf *head, *tail;
490019cf 1817 int tcp_zero_len_write = 0;
39236c6e
A
1818
1819 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1820 MPTS_LOCK_ASSERT_HELD(mpts);
1821 mp_so = mpte->mpte_mppcb->mpp_socket;
1822 so = mpts->mpts_socket;
1823
39037602 1824 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
39236c6e
A
1825 struct mptsub *, mpts);
1826
1827 /* subflow socket is suspended? */
1828 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
3e170ce0
A
1829 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d is "
1830 "flow controlled\n", __func__,
1831 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1832 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1833 goto out;
1834 }
1835
1836 /* subflow socket is not MPTCP capable? */
1837 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
fe8ab488 1838 !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
490019cf
A
1839 !(mpts->mpts_flags & MPTSF_FASTJ_SEND) &&
1840 !(mpts->mpts_flags & MPTSF_TFO_REQD)) {
3e170ce0 1841 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d not "
39236c6e 1842 "MPTCP capable\n", __func__,
3e170ce0
A
1843 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1844 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1845 goto out;
1846 }
1847
1848 /* Remove Addr Option is not sent reliably as per I-D */
1849 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1850 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1851 tp->t_rem_aid = mpte->mpte_lost_aid;
1852 if (mptcp_remaddr_enable)
1853 tp->t_mpflags |= TMPF_SND_REM_ADDR;
1854 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1855 }
1856
490019cf 1857 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
39037602 1858 mptcp_drop_tfo_data(mpte, mpts, &wakeup);
490019cf
A
1859 }
1860
39236c6e
A
1861 /*
1862 * The mbuf chains containing the metadata (as well as pointing to
1863 * the user data sitting at the MPTCP output queue) would then be
1864 * sent down to the subflow socket.
1865 *
1866 * Some notes on data sequencing:
1867 *
1868 * a. Each mbuf must be a M_PKTHDR.
1869 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
1870 * in the mbuf pkthdr structure.
1871 * c. Each mbuf containing the MPTCP metadata must have its
1872 * pkt_flags marked with the PKTF_MPTCP flag.
1873 */
1874
1875 /* First, drop acknowledged data */
1876 sb_mb = mp_so->so_snd.sb_mb;
1877 if (sb_mb == NULL) {
1878 goto out;
1879 }
1880
1881 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1882
1883 mpt_mbuf = sb_mb;
1884 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
490019cf
A
1885 if (((so->so_state & SS_ISCONNECTED) == 0) &&
1886 (mpt_mbuf->m_next == NULL) &&
1887 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1888 /*
1889 * If TFO, allow connection establishment with zero
1890 * length write.
1891 */
1892 tcp_zero_len_write = 1;
1893 goto zero_len_write;
1894 }
39236c6e
A
1895 mpt_mbuf = mpt_mbuf->m_next;
1896 }
1897 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1898 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1899 } else {
1900 goto out;
1901 }
1902
1903 MPT_LOCK(mp_tp);
1904 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
fe8ab488 1905 u_int64_t len = 0;
39236c6e 1906 len = mp_tp->mpt_snduna - mpt_dsn;
3e170ce0 1907 MPT_UNLOCK(mp_tp);
fe8ab488 1908 sbdrop(&mp_so->so_snd, (int)len);
39037602 1909 wakeup = 1;
3e170ce0 1910 MPT_LOCK(mp_tp);
39236c6e
A
1911 }
1912
1913 /*
1914 * In degraded mode, we don't receive data acks, so force free
1915 * mbufs less than snd_nxt
1916 */
fe8ab488
A
1917 if (mp_so->so_snd.sb_mb == NULL) {
1918 MPT_UNLOCK(mp_tp);
1919 goto out;
1920 }
1921
39236c6e
A
1922 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1923 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 1924 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
39236c6e 1925 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
fe8ab488 1926 u_int64_t len = 0;
39236c6e 1927 len = mp_tp->mpt_sndnxt - mpt_dsn;
fe8ab488 1928 sbdrop(&mp_so->so_snd, (int)len);
39037602 1929 wakeup = 1;
39236c6e
A
1930 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1931 }
1932
fe8ab488
A
1933 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1934 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
1935 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
1936 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
1937 if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
1938 mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
1939 }
1940
39236c6e
A
1941 /*
1942 * Adjust the subflow's notion of next byte to send based on
1943 * the last unacknowledged byte
1944 */
1945 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1946 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1947 }
1948
1949 /*
1950 * Adjust the top level notion of next byte used for retransmissions
1951 * and sending FINs.
1952 */
1953 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1954 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1955 }
1956
1957
1958 /* Now determine the offset from which to start transmitting data */
1959 sb_mb = mp_so->so_snd.sb_mb;
1960 sb_cc = mp_so->so_snd.sb_cc;
1961 if (sb_mb == NULL) {
1962 MPT_UNLOCK(mp_tp);
1963 goto out;
1964 }
1965 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1966 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
fe8ab488 1967 sb_cc -= (size_t)off;
39236c6e
A
1968 } else {
1969 MPT_UNLOCK(mp_tp);
1970 goto out;
1971 }
1972 MPT_UNLOCK(mp_tp);
1973
1974 mpt_mbuf = sb_mb;
39236c6e
A
1975
1976 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
fe8ab488 1977 (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
39236c6e
A
1978 off -= mpt_mbuf->m_pkthdr.mp_rlen;
1979 mpt_mbuf = mpt_mbuf->m_next;
39236c6e 1980 }
3e170ce0
A
1981 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
1982 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid = %d "
1983 "snduna = %llu sndnxt = %llu probe %d\n",
1984 __func__, mpts->mpts_connid,
1985 mp_tp->mpt_snduna, mpts->mpts_sndnxt,
1986 mpts->mpts_probecnt),
1987 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e 1988
ecc0ceb4 1989 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 1990
fe8ab488
A
1991 head = tail = NULL;
1992
39236c6e
A
1993 while (tot_sent < sb_cc) {
1994 struct mbuf *m;
fe8ab488 1995 size_t mlen;
39236c6e
A
1996
1997 mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1998 mlen -= off;
1999 if (mlen == 0)
2000 goto out;
2001
2002 if (mlen > sb_cc) {
2003 panic("%s: unexpected %lu %lu \n", __func__,
2004 mlen, sb_cc);
2005 }
2006
fe8ab488
A
2007 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
2008 M_COPYM_MUST_COPY_HDR);
39236c6e
A
2009 if (m == NULL) {
2010 error = ENOBUFS;
2011 break;
2012 }
2013
2014 /* Create a DSN mapping for the data (m_copym does it) */
2015 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
fe8ab488 2016 VERIFY(m->m_flags & M_PKTHDR);
39236c6e
A
2017 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2018 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
2019 m->m_pkthdr.mp_dsn = mpt_dsn + off;
2020 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
2021 m->m_pkthdr.mp_rlen = mlen;
2022 mpts->mpts_rel_seq += mlen;
2023 m->m_pkthdr.len = mlen;
2024
fe8ab488
A
2025 if (head == NULL) {
2026 head = tail = m;
2027 } else {
2028 tail->m_next = m;
2029 tail = m;
2030 }
2031
fe8ab488
A
2032 tot_sent += mlen;
2033 off = 0;
2034 mpt_mbuf = mpt_mbuf->m_next;
2035 }
2036
2037 if (head != NULL) {
490019cf 2038 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
fe8ab488 2039
490019cf
A
2040 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
2041 (tp->t_tfo_stats == 0)) {
39037602 2042 tp->t_mpflags |= TMPF_TFO_REQUEST;
490019cf 2043 } else if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
fe8ab488
A
2044 tp->t_mpflags |= TMPF_FASTJOIN_SEND;
2045 }
2046
2047 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2048
39037602 2049 DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
39236c6e
A
2050 struct sockbuf *, &so->so_rcv,
2051 struct sockbuf *, &so->so_snd,
2052 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488 2053 size_t, tot_sent);
490019cf
A
2054 } else if (tcp_zero_len_write == 1) {
2055zero_len_write:
2056 socket_lock(so, 1);
2057 /* Opting to call pru_send as no mbuf at subflow level */
2058 error = (*so->so_proto->pr_usrreqs->pru_send)
2059 (so, 0, NULL, NULL, NULL, current_proc());
2060 socket_unlock(so, 1);
fe8ab488
A
2061 }
2062
490019cf 2063 if ((error == 0) || (error == EWOULDBLOCK)) {
fe8ab488 2064 mpts->mpts_sndnxt += tot_sent;
3e170ce0
A
2065
2066 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2067 tcpstat.tcps_mp_num_probes++;
2068 if (tot_sent < mpts->mpts_maxseg)
2069 mpts->mpts_probecnt += 1;
2070 else
2071 mpts->mpts_probecnt +=
2072 tot_sent/mpts->mpts_maxseg;
2073 }
2074
39236c6e 2075 MPT_LOCK(mp_tp);
3e170ce0 2076
39236c6e
A
2077 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
2078 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
2079 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2080 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2081 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
2082 }
fe8ab488 2083 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
39236c6e 2084 MPT_UNLOCK(mp_tp);
fe8ab488 2085
490019cf
A
2086 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2087 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2088
fe8ab488
A
2089 /* Send once in SYN_SENT state to avoid sending SYN spam */
2090 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
490019cf 2091 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
fe8ab488 2092 mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
39236c6e 2093 }
39236c6e 2094
3e170ce0
A
2095 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2096 (mpts->mpts_probesoon != 0))
2097 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid %d "
2098 "wrote %d %d probe %d probedelta %d\n",
fe8ab488 2099 __func__, mpts->mpts_connid, (int)tot_sent,
3e170ce0
A
2100 (int) sb_cc, mpts->mpts_probecnt,
2101 (tcp_now - mpts->mpts_probesoon)),
2102 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2103 } else {
3e170ce0
A
2104 mptcplog((LOG_ERR, "MPTCP Sender: %s cid %d error %d len %zd\n",
2105 __func__, mpts->mpts_connid, error, tot_sent),
2106 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2107 }
2108out:
39037602
A
2109 if (wakeup)
2110 sowwakeup(mp_so);
2111
39236c6e
A
2112 return (error);
2113}
2114
2115/*
2116 * Subflow socket control event upcall.
2117 *
2118 * Called when the associated subflow socket posted one or more control events.
2119 * The subflow socket lock has been released prior to invoking the callback.
2120 * Note that the upcall may occur synchronously as a result of MPTCP performing
2121 * an action on it, or asynchronously as a result of an event happening at the
2122 * subflow layer. Therefore, to maintain lock ordering, the only lock that can
2123 * be acquired here is the thread lock, for signalling purposes.
2124 */
2125static void
2126mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
2127{
2128#pragma unused(so)
2129 struct mptsub *mpts = arg;
2130 struct mptses *mpte = mpts->mpts_mpte;
2131
2132 VERIFY(mpte != NULL);
2133
2134 lck_mtx_lock(&mpte->mpte_thread_lock);
2135 atomic_bitset_32(&mpts->mpts_evctl, events);
2136 mptcp_thread_signal_locked(mpte);
2137 lck_mtx_unlock(&mpte->mpte_thread_lock);
2138}
2139
2140/*
2141 * Subflow socket control events.
2142 *
2143 * Called for handling events related to the underlying subflow socket.
2144 */
2145static ev_ret_t
3e170ce0
A
2146mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
2147 uint64_t *p_mpsofilt_hint)
39236c6e 2148{
fe8ab488 2149 uint32_t events, save_events;
39236c6e 2150 ev_ret_t ret = MPTS_EVRET_OK;
3e170ce0
A
2151 int i = 0;
2152 int mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl)/
2153 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e
A
2154 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2155 MPTS_LOCK_ASSERT_HELD(mpts);
2156
2157 /* bail if there's nothing to process */
2158 if ((events = mpts->mpts_evctl) == 0)
2159 return (ret);
2160
2161 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
2162 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
2163 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
2164 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
2165 SO_FILT_HINT_DISCONNECTED)) {
2166 events |= SO_FILT_HINT_MPFAILOVER;
2167 }
2168
fe8ab488
A
2169 save_events = events;
2170
39236c6e
A
2171 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
2172 struct mptsub *, mpts, uint32_t, events);
2173
3e170ce0
A
2174 mptcplog((LOG_DEBUG, "MPTCP Events: %s cid %d events=%b\n", __func__,
2175 mpts->mpts_connid, events, SO_FILT_HINT_BITS),
2176 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
2177
2178 /*
2179 * Process all the socket filter hints and reset the hint
2180 * once it is handled
2181 */
2182 for (i = 0; (i < mpsub_ev_entry_count) && events; i++) {
490019cf
A
2183 /*
2184 * Always execute the DISCONNECTED event, because it will wakeup
2185 * the app.
2186 */
3e170ce0 2187 if ((events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
490019cf
A
2188 (ret >= MPTS_EVRET_OK ||
2189 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3e170ce0
A
2190 ev_ret_t error =
2191 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint);
2192 events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
2193 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2194 }
fe8ab488
A
2195 }
2196
39236c6e
A
2197 /*
2198 * We should be getting only events specified via sock_catchevents(),
2199 * so loudly complain if we have any unprocessed one(s).
2200 */
2201 if (events != 0 || ret < MPTS_EVRET_OK) {
3e170ce0 2202 mptcplog((LOG_ERR, "MPTCP Events %s%s: cid %d evret %s (%d)"
39236c6e 2203 " unhandled events=%b\n",
39037602 2204 (events != 0) && (ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
39236c6e 2205 __func__, mpts->mpts_connid,
3e170ce0
A
2206 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS),
2207 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2208 }
2209
2210 /* clear the ones we've processed */
fe8ab488 2211 atomic_bitclear_32(&mpts->mpts_evctl, save_events);
39236c6e
A
2212 return (ret);
2213}
2214
2215/*
2216 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
2217 */
2218static ev_ret_t
3e170ce0
A
2219mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts,
2220 uint64_t *p_mpsofilt_hint)
39236c6e
A
2221{
2222 struct socket *mp_so, *so;
2223 struct mptcb *mp_tp;
2224 boolean_t linger;
2225
2226 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2227 MPTS_LOCK_ASSERT_HELD(mpts);
2228 VERIFY(mpte->mpte_mppcb != NULL);
2229 mp_so = mpte->mpte_mppcb->mpp_socket;
2230 mp_tp = mpte->mpte_mptcb;
2231 so = mpts->mpts_socket;
2232
2233 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2234 !(mp_so->so_flags & SOF_PCBCLEARING));
2235
3e170ce0
A
2236 mptcplog((LOG_DEBUG, "MPTCP Events: "
2237 "%s: cid %d [linger %s]\n", __func__,
2238 mpts->mpts_connid, (linger ? "YES" : "NO")),
2239 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2240
39236c6e
A
2241 /*
2242 * We got a TCP RST for this subflow connection.
2243 *
2244 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
fe8ab488
A
2245 * client if the MPTCP connection has not been established or
2246 * if the connection has only one subflow and is a connection being
2247 * resumed. Otherwise we close the socket.
39236c6e
A
2248 */
2249 mptcp_subflow_disconnect(mpte, mpts, !linger);
2250
2251 MPT_LOCK(mp_tp);
2252 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
fe8ab488 2253 mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
39037602
A
2254 } else if (mpte->mpte_nummpcapflows < 1 ||
2255 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) &&
2256 (mpts->mpts_flags & MPTSF_ACTIVE))) {
fe8ab488 2257 mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
3e170ce0 2258 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET;
39236c6e
A
2259 }
2260 MPT_UNLOCK(mp_tp);
2261
2262 /*
2263 * Keep the subflow socket around, unless the MPTCP socket has
2264 * been detached or the subflow has been disconnected explicitly,
2265 * in which case it should be deleted right away.
2266 */
2267 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2268}
2269
2270/*
2271 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2272 */
2273static ev_ret_t
3e170ce0
A
2274mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2275 uint64_t *p_mpsofilt_hint)
39236c6e 2276{
39037602 2277 struct mptcb *mp_tp;
39236c6e
A
2278 struct socket *so;
2279
2280 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2281 MPTS_LOCK_ASSERT_HELD(mpts);
2282
39037602 2283 mp_tp = mpte->mpte_mptcb;
39236c6e
A
2284 so = mpts->mpts_socket;
2285
3e170ce0
A
2286 mptcplog((LOG_DEBUG, "MPTCP Events: "
2287 "%s: cid %d\n", __func__, mpts->mpts_connid),
2288 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2289
2290 /*
39037602
A
2291 * A FIN on a fallen back MPTCP-connection should be treated like a
2292 * DATA_FIN.
2293 */
2294 MPT_LOCK(mp_tp);
2295 if ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) &&
2296 (mpts->mpts_flags & MPTSF_ACTIVE)) {
2297 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
2298 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
2299 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
2300 }
2301 }
2302 MPT_UNLOCK(mp_tp);
39236c6e
A
2303
2304 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2305}
2306
2307/*
2308 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2309 */
2310static ev_ret_t
3e170ce0
A
2311mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts,
2312 uint64_t *p_mpsofilt_hint)
39236c6e 2313{
3e170ce0 2314#pragma unused(p_mpsofilt_hint)
39236c6e
A
2315 struct socket *so;
2316
2317 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2318 MPTS_LOCK_ASSERT_HELD(mpts);
2319
2320 so = mpts->mpts_socket;
2321
3e170ce0
A
2322 mptcplog((LOG_DEBUG, "MPTCP Events: "
2323 "%s: cid %d\n", __func__, mpts->mpts_connid),
2324 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2325
39236c6e
A
2326 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2327}
2328
2329/*
2330 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2331 */
2332static ev_ret_t
3e170ce0
A
2333mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts,
2334 uint64_t *p_mpsofilt_hint)
39236c6e 2335{
3e170ce0 2336#pragma unused(p_mpsofilt_hint)
39236c6e
A
2337 struct socket *mp_so, *so;
2338 struct mptcb *mp_tp;
2339 boolean_t linger;
2340
2341 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2342 MPTS_LOCK_ASSERT_HELD(mpts);
2343 VERIFY(mpte->mpte_mppcb != NULL);
2344 mp_so = mpte->mpte_mppcb->mpp_socket;
2345 mp_tp = mpte->mpte_mptcb;
2346 so = mpts->mpts_socket;
2347
2348 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2349 !(mp_so->so_flags & SOF_PCBCLEARING));
2350
3e170ce0
A
2351 mptcplog((LOG_NOTICE, "MPTCP Events: "
2352 "%s: cid %d [linger %s]\n", __func__,
2353 mpts->mpts_connid, (linger ? "YES" : "NO")),
2354 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2355
2356 if (mpts->mpts_soerror == 0)
2357 mpts->mpts_soerror = ETIMEDOUT;
2358
2359 /*
2360 * The subflow connection has timed out.
2361 *
2362 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2363 * client if the MPTCP connection has not been established. Otherwise
2364 * drop it.
2365 */
2366 mptcp_subflow_disconnect(mpte, mpts, !linger);
2367
2368 MPT_LOCK(mp_tp);
2369 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2370 mp_so->so_error = ETIMEDOUT;
2371 }
2372 MPT_UNLOCK(mp_tp);
2373
2374 /*
2375 * Keep the subflow socket around, unless the MPTCP socket has
2376 * been detached or the subflow has been disconnected explicitly,
2377 * in which case it should be deleted right away.
2378 */
2379 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2380}
2381
2382/*
2383 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2384 */
2385static ev_ret_t
3e170ce0
A
2386mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
2387 uint64_t *p_mpsofilt_hint)
39236c6e 2388{
3e170ce0 2389#pragma unused(p_mpsofilt_hint)
39236c6e
A
2390 struct socket *mp_so, *so;
2391 struct mptcb *mp_tp;
2392 boolean_t linger;
2393 struct tcpcb *tp = NULL;
2394
2395 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2396 MPTS_LOCK_ASSERT_HELD(mpts);
2397
2398 VERIFY(mpte->mpte_mppcb != NULL);
2399 mp_so = mpte->mpte_mppcb->mpp_socket;
2400 mp_tp = mpte->mpte_mptcb;
2401 so = mpts->mpts_socket;
2402
2403 /* Not grabbing socket lock as t_local_aid is write once only */
2404 tp = intotcpcb(sotoinpcb(so));
2405 /*
2406 * This overwrites any previous mpte_lost_aid to avoid storing
2407 * too much state when the typical case has only two subflows.
2408 */
2409 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2410 mpte->mpte_lost_aid = tp->t_local_aid;
2411
2412 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2413 !(mp_so->so_flags & SOF_PCBCLEARING));
2414
3e170ce0
A
2415 mptcplog((LOG_DEBUG, "MPTCP Events: "
2416 "%s cid %d [linger %s]\n", __func__,
2417 mpts->mpts_connid, (linger ? "YES" : "NO")),
2418 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2419
2420 if (mpts->mpts_soerror == 0)
2421 mpts->mpts_soerror = EADDRNOTAVAIL;
2422
2423 /*
2424 * The subflow connection has lost its source address.
2425 *
2426 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2427 * client if the MPTCP connection has not been established. If it
2428 * has been established with one subflow , we keep the MPTCP
2429 * connection valid without any subflows till closed by application.
2430 * This lets tcp connection manager decide whether to close this or
2431 * not as it reacts to reachability changes too.
2432 */
2433 mptcp_subflow_disconnect(mpte, mpts, !linger);
2434
2435 MPT_LOCK(mp_tp);
2436 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2437 (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2438 mp_so->so_error = EADDRNOTAVAIL;
2439 }
2440 MPT_UNLOCK(mp_tp);
2441
2442 /*
2443 * Keep the subflow socket around, unless the MPTCP socket has
2444 * been detached or the subflow has been disconnected explicitly,
2445 * in which case it should be deleted right away.
2446 */
2447 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2448}
2449
fe8ab488
A
2450/*
2451 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
2452 * indicates that the remote side sent a Data FIN
2453 */
2454static ev_ret_t
3e170ce0
A
2455mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2456 uint64_t *p_mpsofilt_hint)
fe8ab488
A
2457{
2458 struct socket *so, *mp_so;
2459 struct mptcb *mp_tp;
2460
2461 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2462 MPTS_LOCK_ASSERT_HELD(mpts);
2463 mp_so = mpte->mpte_mppcb->mpp_socket;
2464 so = mpts->mpts_socket;
2465 mp_tp = mpte->mpte_mptcb;
2466
3e170ce0
A
2467 mptcplog((LOG_DEBUG, "MPTCP Events: "
2468 "%s: cid %d\n", __func__, mpts->mpts_connid),
2469 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39037602 2470
fe8ab488 2471 /*
39037602 2472 * We got a Data FIN for the MPTCP connection.
fe8ab488
A
2473 * The FIN may arrive with data. The data is handed up to the
2474 * mptcp socket and the user is notified so that it may close
2475 * the socket if needed.
2476 */
2477 MPT_LOCK(mp_tp);
39037602 2478 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
3e170ce0 2479 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
39037602 2480
fe8ab488
A
2481 MPT_UNLOCK(mp_tp);
2482 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2483}
2484
39236c6e
A
2485/*
2486 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2487 */
2488static ev_ret_t
3e170ce0
A
2489mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
2490 uint64_t *p_mpsofilt_hint)
39236c6e
A
2491{
2492 struct mptsub *mpts_alt = NULL;
2493 struct socket *so = NULL;
2494 struct socket *mp_so;
2495 int altpath_exists = 0;
2496
2497 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2498 MPTS_LOCK_ASSERT_HELD(mpts);
2499 mp_so = mpte->mpte_mppcb->mpp_socket;
3e170ce0
A
2500 mptcplog((LOG_NOTICE, "MPTCP Events: "
2501 "%s: mp_so 0x%llx\n", __func__,
2502 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2503 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2504
2505 MPTS_UNLOCK(mpts);
3e170ce0 2506 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e
A
2507
2508 /*
2509 * If there is no alternate eligible subflow, ignore the
2510 * failover hint.
2511 */
2512 if (mpts_alt == NULL) {
3e170ce0
A
2513 mptcplog((LOG_WARNING, "MPTCP Events: "
2514 "%s: no alternate path\n", __func__),
2515 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2516
fe8ab488
A
2517 if (mptcp_delayed_subf_start) {
2518 mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
2519 if (mpts_alt != NULL) {
2520 MPTS_LOCK(mpts_alt);
2521 (void) mptcp_subflow_soconnectx(mpte,
2522 mpts_alt);
39037602 2523 MPTS_UNLOCK(mpts_alt);
fe8ab488
A
2524 }
2525 }
39236c6e
A
2526 MPTS_LOCK(mpts);
2527 goto done;
2528 }
2529 MPTS_LOCK(mpts_alt);
2530 altpath_exists = 1;
2531 so = mpts_alt->mpts_socket;
2532 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2533 socket_lock(so, 1);
fe8ab488
A
2534 /* All data acknowledged and no RTT spike */
2535 if ((so->so_snd.sb_cc == 0) &&
2536 (mptcp_no_rto_spike(so))) {
39236c6e
A
2537 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2538 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2539 } else {
2540 /* no alternate path available */
2541 altpath_exists = 0;
2542 }
2543 socket_unlock(so, 1);
2544 }
2545 if (altpath_exists) {
3e170ce0
A
2546 mptcplog((LOG_INFO, "MPTCP Events: "
2547 "%s: cid = %d\n",
2548 __func__, mpts_alt->mpts_connid),
2549 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2550 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3e170ce0 2551 mpts_alt->mpts_peerswitch = 0;
39236c6e
A
2552 struct mptcb *mp_tp = mpte->mpte_mptcb;
2553 /* Bring the subflow's notion of snd_nxt into the send window */
2554 MPT_LOCK(mp_tp);
2555 mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2556 MPT_UNLOCK(mp_tp);
2557 mpte->mpte_active_sub = mpts_alt;
2558 socket_lock(so, 1);
2559 sowwakeup(so);
2560 socket_unlock(so, 1);
2561 }
2562 MPTS_UNLOCK(mpts_alt);
2563
2564 if (altpath_exists) {
3e170ce0
A
2565 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
2566 mptcplog((LOG_NOTICE, "MPTCP Events: "
2567 "%s: mp_so 0x%llx switched from "
39236c6e
A
2568 "%d to %d\n", __func__,
2569 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
2570 mpts->mpts_connid, mpts_alt->mpts_connid),
2571 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2572 tcpstat.tcps_mp_switches++;
2573 }
2574
2575 MPTS_LOCK(mpts);
2576 if (altpath_exists) {
2577 mpts->mpts_flags |= MPTSF_FAILINGOVER;
2578 mpts->mpts_flags &= ~MPTSF_ACTIVE;
2579 } else {
3e170ce0
A
2580 mptcplog((LOG_DEBUG, "MPTCP Events %s: no alt cid = %d\n",
2581 __func__, mpts->mpts_connid),
2582 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2583done:
39236c6e
A
2584 so = mpts->mpts_socket;
2585 socket_lock(so, 1);
2586 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2587 socket_unlock(so, 1);
2588 }
39236c6e
A
2589 MPTS_LOCK_ASSERT_HELD(mpts);
2590 return (MPTS_EVRET_OK);
2591}
2592
2593/*
2594 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2595 */
2596static ev_ret_t
3e170ce0
A
2597mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
2598 uint64_t *p_mpsofilt_hint)
39236c6e
A
2599{
2600 struct socket *mp_so, *so;
2601 struct mptcb *mp_tp;
2602 boolean_t linger;
2603
2604 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2605 MPTS_LOCK_ASSERT_HELD(mpts);
2606 VERIFY(mpte->mpte_mppcb != NULL);
2607 mp_so = mpte->mpte_mppcb->mpp_socket;
2608 mp_tp = mpte->mpte_mptcb;
2609 so = mpts->mpts_socket;
2610
2611 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2612 !(mp_so->so_flags & SOF_PCBCLEARING));
2613
3e170ce0
A
2614 mptcplog((LOG_DEBUG, "MPTCP Events: "
2615 "%s: cid %d [linger %s]\n", __func__,
2616 mpts->mpts_connid, (linger ? "YES" : "NO")),
2617 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2618
2619 if (mpts->mpts_soerror == 0)
2620 mpts->mpts_soerror = EHOSTUNREACH;
2621
2622 /*
2623 * The subflow connection cannot use the outgoing interface.
2624 *
2625 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2626 * client if the MPTCP connection has not been established. If it
2627 * has been established, let the upper layer call disconnectx.
2628 */
2629 mptcp_subflow_disconnect(mpte, mpts, !linger);
3e170ce0 2630 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED;
39236c6e
A
2631
2632 MPT_LOCK(mp_tp);
2633 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2634 mp_so->so_error = EHOSTUNREACH;
2635 }
2636 MPT_UNLOCK(mp_tp);
2637
39236c6e
A
2638 /*
2639 * Keep the subflow socket around, unless the MPTCP socket has
2640 * been detached or the subflow has been disconnected explicitly,
2641 * in which case it should be deleted right away.
2642 */
2643 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2644}
2645
2646/*
2647 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2648 */
2649static ev_ret_t
3e170ce0
A
2650mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts,
2651 uint64_t *p_mpsofilt_hint)
39236c6e 2652{
3e170ce0 2653#pragma unused(p_mpsofilt_hint)
39236c6e
A
2654 struct socket *so;
2655
2656 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2657 MPTS_LOCK_ASSERT_HELD(mpts);
2658
2659 so = mpts->mpts_socket;
2660
2661 /* the subflow connection is being flow controlled */
2662 mpts->mpts_flags |= MPTSF_SUSPENDED;
2663
3e170ce0
A
2664 mptcplog((LOG_DEBUG, "MPTCP Events: "
2665 "%s: cid %d\n", __func__,
2666 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2667
2668 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2669}
2670
2671/*
2672 * Handle SO_FILT_HINT_RESUME subflow socket event.
2673 */
2674static ev_ret_t
3e170ce0
A
2675mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts,
2676 uint64_t *p_mpsofilt_hint)
39236c6e 2677{
3e170ce0 2678#pragma unused(p_mpsofilt_hint)
39236c6e
A
2679 struct socket *so;
2680
2681 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2682 MPTS_LOCK_ASSERT_HELD(mpts);
2683
2684 so = mpts->mpts_socket;
2685
2686 /* the subflow connection is no longer flow controlled */
2687 mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2688
3e170ce0
A
2689 mptcplog((LOG_DEBUG, "MPTCP Events: "
2690 "%s: cid %d\n", __func__, mpts->mpts_connid),
2691 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2692
2693 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2694}
2695
2696/*
2697 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2698 */
2699static ev_ret_t
3e170ce0
A
2700mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
2701 uint64_t *p_mpsofilt_hint)
39236c6e
A
2702{
2703 char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
2704 struct sockaddr_entry *src_se, *dst_se;
2705 struct sockaddr_storage src;
2706 struct socket *mp_so, *so;
2707 struct mptcb *mp_tp;
2708 struct ifnet *outifp;
2709 int af, error = 0;
2710 boolean_t mpok = FALSE;
3e170ce0
A
2711 boolean_t cell = FALSE;
2712 boolean_t wifi = FALSE;
2713 boolean_t wired = FALSE;
39236c6e
A
2714
2715 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2716 VERIFY(mpte->mpte_mppcb != NULL);
2717 mp_so = mpte->mpte_mppcb->mpp_socket;
2718 mp_tp = mpte->mpte_mptcb;
2719
2720 MPTS_LOCK_ASSERT_HELD(mpts);
2721 so = mpts->mpts_socket;
2722 af = mpts->mpts_family;
2723
2724 if (mpts->mpts_flags & MPTSF_CONNECTED)
2725 return (MPTS_EVRET_OK);
2726
2727 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2728 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
490019cf 2729 socket_lock(so, 0);
fe8ab488
A
2730 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2731 (so->so_state & SS_ISCONNECTED)) {
3e170ce0
A
2732 mptcplog((LOG_DEBUG, "MPTCP Events: "
2733 "%s: cid %d disconnect before tcp connect\n",
2734 __func__, mpts->mpts_connid),
2735 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
2736 (void) soshutdownlock(so, SHUT_RD);
2737 (void) soshutdownlock(so, SHUT_WR);
2738 (void) sodisconnectlocked(so);
2739 }
2740 socket_unlock(so, 0);
39236c6e
A
2741 return (MPTS_EVRET_OK);
2742 }
2743
2744 /*
2745 * The subflow connection has been connected. Find out whether it
2746 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
2747 *
2748 * a. If MPTCP connection is not yet established, then this must be
2749 * the first subflow connection. If MPTCP failed to negotiate,
2750 * indicate to the MPTCP socket client via EPROTO, that the
2751 * underlying TCP connection may be peeled off via peeloff(2).
2752 * Otherwise, mark the MPTCP socket as connected.
2753 *
2754 * b. If MPTCP connection has been established, then this must be
2755 * one of the subsequent subflow connections. If MPTCP failed
2756 * to negotiate, disconnect the connection since peeloff(2)
2757 * is no longer possible.
2758 *
2759 * Right now, we simply unblock any waiters at the MPTCP socket layer
2760 * if the MPTCP connection has not been established.
2761 */
2762 socket_lock(so, 0);
2763
2764 if (so->so_state & SS_ISDISCONNECTED) {
2765 /*
2766 * With MPTCP joins, a connection is connected at the subflow
2767 * level, but the 4th ACK from the server elevates the MPTCP
490019cf
A
2768 * subflow to connected state. So there is a small window
2769 * where the subflow could get disconnected before the
39236c6e
A
2770 * connected event is processed.
2771 */
2772 socket_unlock(so, 0);
2773 return (MPTS_EVRET_OK);
2774 }
2775
2776 mpts->mpts_soerror = 0;
2777 mpts->mpts_flags &= ~MPTSF_CONNECTING;
2778 mpts->mpts_flags |= MPTSF_CONNECTED;
490019cf
A
2779
2780 if (!(so->so_flags1 & SOF1_DATA_IDEMPOTENT))
2781 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
2782
2783 struct tcpcb *tp = sototcpcb(so);
2784 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
39236c6e
A
2785 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2786
490019cf
A
2787 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
2788
39236c6e
A
2789 VERIFY(mpts->mpts_dst_sl != NULL);
2790 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
2791 VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
2792 dst_se->se_addr->sa_family == af);
2793
2794 VERIFY(mpts->mpts_src_sl != NULL);
2795 src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
2796 VERIFY(src_se != NULL && src_se->se_addr != NULL &&
2797 src_se->se_addr->sa_family == af);
2798
2799 /* get/check source IP address */
2800 switch (af) {
2801 case AF_INET: {
2802 error = in_getsockaddr_s(so, &src);
2803 if (error == 0) {
2804 struct sockaddr_in *ms = SIN(src_se->se_addr);
2805 struct sockaddr_in *s = SIN(&src);
2806
2807 VERIFY(s->sin_len == ms->sin_len);
2808 VERIFY(ms->sin_family == AF_INET);
2809
2810 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2811 bcmp(&ms->sin_addr, &s->sin_addr,
2812 sizeof (ms->sin_addr)) != 0) {
3e170ce0
A
2813 mptcplog((LOG_ERR, "MPTCP Events: "
2814 "%s: cid %d local "
39236c6e
A
2815 "address %s (expected %s)\n", __func__,
2816 mpts->mpts_connid, inet_ntop(AF_INET,
2817 (void *)&s->sin_addr.s_addr, buf0,
2818 sizeof (buf0)), inet_ntop(AF_INET,
2819 (void *)&ms->sin_addr.s_addr, buf1,
3e170ce0
A
2820 sizeof (buf1))),
2821 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2822 }
2823 bcopy(s, ms, sizeof (*s));
2824 }
2825 break;
2826 }
2827#if INET6
2828 case AF_INET6: {
2829 error = in6_getsockaddr_s(so, &src);
2830 if (error == 0) {
2831 struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
2832 struct sockaddr_in6 *s = SIN6(&src);
2833
2834 VERIFY(s->sin6_len == ms->sin6_len);
2835 VERIFY(ms->sin6_family == AF_INET6);
2836
2837 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2838 bcmp(&ms->sin6_addr, &s->sin6_addr,
2839 sizeof (ms->sin6_addr)) != 0) {
3e170ce0
A
2840 mptcplog((LOG_ERR, "MPTCP Events: "
2841 "%s: cid %d local "
39236c6e
A
2842 "address %s (expected %s)\n", __func__,
2843 mpts->mpts_connid, inet_ntop(AF_INET6,
2844 (void *)&s->sin6_addr, buf0,
2845 sizeof (buf0)), inet_ntop(AF_INET6,
2846 (void *)&ms->sin6_addr, buf1,
3e170ce0
A
2847 sizeof (buf1))),
2848 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2849 }
2850 bcopy(s, ms, sizeof (*s));
2851 }
2852 break;
2853 }
2854#endif /* INET6 */
2855 default:
2856 VERIFY(0);
2857 /* NOTREACHED */
2858 }
2859
2860 if (error != 0) {
3e170ce0
A
2861 mptcplog((LOG_ERR, "MPTCP Events "
2862 "%s: cid %d getsockaddr failed (%d)\n",
2863 __func__, mpts->mpts_connid, error),
2864 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2865 }
2866
2867 /* get/verify the outbound interface */
2868 outifp = sotoinpcb(so)->inp_last_outifp; /* could be NULL */
2869 if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2870 VERIFY(mpts->mpts_outif != NULL);
2871 if (mpts->mpts_outif != outifp) {
3e170ce0 2872 mptcplog((LOG_ERR, "MPTCP Events: %s: cid %d outif %s "
39236c6e
A
2873 "(expected %s)\n", __func__, mpts->mpts_connid,
2874 ((outifp != NULL) ? outifp->if_xname : "NULL"),
3e170ce0
A
2875 mpts->mpts_outif->if_xname),
2876 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2877
39236c6e
A
2878 if (outifp == NULL)
2879 outifp = mpts->mpts_outif;
2880 }
2881 } else {
2882 mpts->mpts_outif = outifp;
2883 }
2884
3e170ce0
A
2885 mpts->mpts_srtt = (intotcpcb(sotoinpcb(so)))->t_srtt;
2886 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(so)))->t_rxtcur;
2887 mpts->mpts_maxseg = (intotcpcb(sotoinpcb(so)))->t_maxseg;
2888
2889 cell = IFNET_IS_CELLULAR(mpts->mpts_outif);
2890 wifi = (!cell && IFNET_IS_WIFI(mpts->mpts_outif));
2891 wired = (!wifi && IFNET_IS_WIRED(mpts->mpts_outif));
2892
2893 if (cell)
2894 mpts->mpts_linktype |= MPTSL_CELL;
2895 else if (wifi)
2896 mpts->mpts_linktype |= MPTSL_WIFI;
2897 else if (wired)
2898 mpts->mpts_linktype |= MPTSL_WIRED;
2899
39236c6e
A
2900 socket_unlock(so, 0);
2901
3e170ce0
A
2902 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: cid %d "
2903 "establishment srtt %d \n", __func__,
2904 mpts->mpts_connid, (mpts->mpts_srtt >> 5)),
2905 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
2906
2907
2908 mptcplog((LOG_DEBUG, "MPTCP Socket: "
2909 "%s: cid %d outif %s %s[%d] -> %s[%d] "
39236c6e
A
2910 "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2911 outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
2912 (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
2913 (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
2914 ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
2915 ntohs(SIN6(src_se->se_addr)->sin6_port)),
2916 inet_ntop(af, ((af == AF_INET) ?
2917 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
2918 (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
2919 ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
2920 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
2921 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
3e170ce0
A
2922 "MPTCP capable" : "a regular TCP")),
2923 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
2924
2925 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2926 MPTS_UNLOCK(mpts);
2927
3e170ce0 2928 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
2929
2930 MPT_LOCK(mp_tp);
2931 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2932 /* case (a) above */
2933 if (!mpok) {
2934 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2935 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2936 MPT_UNLOCK(mp_tp);
2937 } else {
490019cf
A
2938 MPT_UNLOCK(mp_tp);
2939 mptcplog((LOG_DEBUG, "MPTCP State: "
2940 "MPTCPS_ESTABLISHED for mp_so 0x%llx \n",
2941 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2942 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
2943 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2944 mpte->mpte_associd = mpts->mpts_connid;
39037602
A
2945 DTRACE_MPTCP2(state__change,
2946 struct mptcb *, mp_tp,
490019cf
A
2947 uint32_t, 0 /* event */);
2948
39037602
A
2949 if (mpts->mpts_outif &&
2950 IFNET_IS_EXPENSIVE(mpts->mpts_outif)) {
2951 sototcpcb(so)->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
2952 } else {
2953 mpts->mpts_flags |= MPTSF_PREFERRED;
2954 }
490019cf 2955 soisconnected(mp_so);
39236c6e
A
2956 }
2957 MPTS_LOCK(mpts);
2958 if (mpok) {
39236c6e
A
2959 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2960 mpte->mpte_nummpcapflows++;
2961 MPT_LOCK_SPIN(mp_tp);
490019cf
A
2962 /* With TFO, sndnxt may be initialized earlier */
2963 if (mpts->mpts_sndnxt == 0)
2964 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
39236c6e
A
2965 MPT_UNLOCK(mp_tp);
2966 }
2967 } else if (mpok) {
2968 MPT_UNLOCK(mp_tp);
fe8ab488
A
2969 if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
2970 /* Experimental code, disabled by default. */
2971 sorwakeup(mp_so);
2972 sowwakeup(mp_so);
2973 }
39236c6e
A
2974 /*
2975 * case (b) above
2976 * In case of additional flows, the MPTCP socket is not
2977 * MPTSF_MP_CAPABLE until an ACK is received from server
2978 * for 3-way handshake. TCP would have guaranteed that this
2979 * is an MPTCP subflow.
2980 */
2981 MPTS_LOCK(mpts);
2982 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
fe8ab488 2983 mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
39236c6e 2984 mpte->mpte_nummpcapflows++;
39236c6e 2985 MPT_LOCK_SPIN(mp_tp);
fe8ab488
A
2986 /* With Fastjoin, sndnxt is updated before connected_ev */
2987 if (mpts->mpts_sndnxt == 0) {
2988 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
490019cf 2989 mpts->mpts_rel_seq = 1;
39037602 2990 }
39236c6e 2991 MPT_UNLOCK(mp_tp);
fe8ab488
A
2992 mptcp_output_needed(mpte, mpts);
2993 } else {
2994 MPT_UNLOCK(mp_tp);
2995 MPTS_LOCK(mpts);
39236c6e 2996 }
fe8ab488 2997
39236c6e
A
2998 MPTS_LOCK_ASSERT_HELD(mpts);
2999
3000 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3001}
3002
3003/*
3004 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
3005 */
3006static ev_ret_t
3e170ce0
A
3007mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
3008 uint64_t *p_mpsofilt_hint)
39236c6e
A
3009{
3010 struct socket *mp_so, *so;
3011 struct mptcb *mp_tp;
3012 boolean_t linger;
3013
3014 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3015 MPTS_LOCK_ASSERT_HELD(mpts);
3016 VERIFY(mpte->mpte_mppcb != NULL);
3017 mp_so = mpte->mpte_mppcb->mpp_socket;
3018 mp_tp = mpte->mpte_mptcb;
3019 so = mpts->mpts_socket;
3020
3021 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
3022 !(mp_so->so_flags & SOF_PCBCLEARING));
3023
3e170ce0
A
3024 mptcplog((LOG_DEBUG, "MPTCP Events: "
3025 "%s: cid %d [linger %s]\n", __func__,
3026 mpts->mpts_connid, (linger ? "YES" : "NO")),
3027 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3028
3029 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3030 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3031
3032 /*
3033 * Clear flags that are used by getconninfo to return state.
fe8ab488 3034 * Retain like MPTSF_DELETEOK for internal purposes.
39236c6e
A
3035 */
3036 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3037 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3038 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
3039 MPTSF_SUSPENDED|MPTSF_ACTIVE);
3040 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3041
3042 /*
3043 * The subflow connection has been disconnected.
3044 *
3045 * Right now, we simply unblock any waiters at the MPTCP socket layer
3046 * if the MPTCP connection has not been established.
3047 */
3e170ce0 3048 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3049
3050 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3051 mpte->mpte_nummpcapflows--;
fe8ab488
A
3052 if (mpte->mpte_active_sub == mpts) {
3053 mpte->mpte_active_sub = NULL;
3e170ce0
A
3054 mptcplog((LOG_DEBUG, "MPTCP Events: "
3055 "%s: resetting active subflow \n",
3056 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3057 }
39236c6e
A
3058 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3059 }
3060
3061 MPT_LOCK(mp_tp);
3062 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
3063 MPT_UNLOCK(mp_tp);
3e170ce0 3064 MPTS_UNLOCK(mpts);
39236c6e 3065 soisdisconnected(mp_so);
3e170ce0 3066 MPTS_LOCK(mpts);
39236c6e
A
3067 } else {
3068 MPT_UNLOCK(mp_tp);
3069 }
3070
39236c6e
A
3071 /*
3072 * The underlying subflow socket has been disconnected;
3073 * it is no longer useful to us. Keep the subflow socket
3074 * around, unless the MPTCP socket has been detached or
3075 * the subflow has been disconnected explicitly, in which
3076 * case it should be deleted right away.
3077 */
3078 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3079}
3080
3081/*
3082 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3083 */
3084static ev_ret_t
3e170ce0
A
3085mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3086 uint64_t *p_mpsofilt_hint)
39236c6e
A
3087{
3088 struct socket *mp_so, *so;
3089 struct mptcb *mp_tp;
3e170ce0 3090 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e
A
3091
3092 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3093 VERIFY(mpte->mpte_mppcb != NULL);
3094 mp_so = mpte->mpte_mppcb->mpp_socket;
3095 mp_tp = mpte->mpte_mptcb;
3096
3097 MPTS_LOCK_ASSERT_HELD(mpts);
3098 so = mpts->mpts_socket;
3099
3100 socket_lock(so, 0);
3101 MPT_LOCK(mp_tp);
3102
3103 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3104 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3105 else
3106 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3107
3108 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3109 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3110 goto done;
3111 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3112 }
3113 else
3114 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3115
3116 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3117 mpts->mpts_flags |= MPTSF_MP_READY;
3118 else
3119 mpts->mpts_flags &= ~MPTSF_MP_READY;
3120
3121 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3122 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3123 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3124 }
3125
3126 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3127 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3128 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3e170ce0
A
3129 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3130 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3131 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3132 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3133 ret = MPTS_EVRET_CONNECT_PENDING;
3e170ce0
A
3134 } else {
3135 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3136 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3137 }
3138
3e170ce0
A
3139 mptcplog((LOG_DEBUG, "MPTCP Events: "
3140 "%s: mp_so 0x%llx mpt_flags=%b cid %d "
39236c6e
A
3141 "mptsf=%b\n", __func__,
3142 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
3143 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3e170ce0
A
3144 mpts->mpts_flags, MPTSF_BITS),
3145 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3146
39236c6e
A
3147done:
3148 MPT_UNLOCK(mp_tp);
3149 socket_unlock(so, 0);
39236c6e
A
3150 return (ret);
3151}
3152
3153/*
3154 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3155 */
3156static ev_ret_t
3e170ce0
A
3157mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3158 uint64_t *p_mpsofilt_hint)
39236c6e
A
3159{
3160 struct socket *mp_so, *so;
3161 struct mptcb *mp_tp;
39037602 3162 boolean_t linger, is_fastclose;
39236c6e
A
3163
3164
3165 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3166 MPTS_LOCK_ASSERT_HELD(mpts);
3167 VERIFY(mpte->mpte_mppcb != NULL);
3168 mp_so = mpte->mpte_mppcb->mpp_socket;
3169 mp_tp = mpte->mpte_mptcb;
3170 so = mpts->mpts_socket;
3171
3172 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
3173 !(mp_so->so_flags & SOF_PCBCLEARING));
3174
3175 if (mpts->mpts_soerror == 0)
3176 mpts->mpts_soerror = ECONNABORTED;
3177
39236c6e
A
3178 /* We got an invalid option or a fast close */
3179 socket_lock(so, 0);
3180 struct tcptemp *t_template;
3181 struct inpcb *inp = sotoinpcb(so);
3182 struct tcpcb *tp = NULL;
3183
3184 tp = intotcpcb(inp);
fe8ab488 3185 so->so_error = ECONNABORTED;
39236c6e 3186
39037602
A
3187 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3188
39236c6e
A
3189 t_template = tcp_maketemplate(tp);
3190 if (t_template) {
fe8ab488 3191 struct tcp_respond_args tra;
39236c6e 3192
fe8ab488 3193 bzero(&tra, sizeof(tra));
39236c6e 3194 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 3195 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 3196 else
fe8ab488
A
3197 tra.ifscope = IFSCOPE_NONE;
3198 tra.awdl_unrestricted = 1;
39236c6e
A
3199
3200 tcp_respond(tp, t_template->tt_ipgen,
3201 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 3202 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e 3203 (void) m_free(dtom(t_template));
3e170ce0
A
3204 mptcplog((LOG_DEBUG, "MPTCP Events: "
3205 "%s: mp_so 0x%llx cid %d \n",
39236c6e 3206 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3207 so, mpts->mpts_connid),
3208 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3209 }
3210 socket_unlock(so, 0);
3211 mptcp_subflow_disconnect(mpte, mpts, !linger);
39236c6e 3212
3e170ce0
A
3213 *p_mpsofilt_hint |= (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
3214
39037602
A
3215 MPT_LOCK(mp_tp);
3216
3217 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3e170ce0 3218 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e 3219
39037602
A
3220 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3221 mp_so->so_error = ECONNABORTED;
3222 else
3223 mp_so->so_error = ECONNRESET;
3224
3225 /*
3226 * mptcp_drop is being called after processing the events, to fully
3227 * close the MPTCP connection
3228 */
39236c6e 3229 }
39037602 3230
3e170ce0
A
3231 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3232 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
39236c6e
A
3233 MPT_UNLOCK(mp_tp);
3234
39236c6e
A
3235 /*
3236 * Keep the subflow socket around unless the subflow has been
3237 * disconnected explicitly.
3238 */
3239 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3240}
3241
fe8ab488 3242static ev_ret_t
3e170ce0
A
3243mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts,
3244 uint64_t *p_mpsofilt_hint)
fe8ab488 3245{
3e170ce0 3246#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3247 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3248 MPTS_LOCK_ASSERT_HELD(mpts);
3249 VERIFY(mpte->mpte_mppcb != NULL);
39037602 3250
fe8ab488
A
3251 if (mpte->mpte_nummpcapflows == 0) {
3252 struct mptcb *mp_tp = mpte->mpte_mptcb;
3e170ce0
A
3253 mptcplog((LOG_DEBUG,"MPTCP Events: %s: %llx %llx \n",
3254 __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt),
3255 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3256
fe8ab488
A
3257 mpte->mpte_active_sub = mpts;
3258 mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
3259 MPT_LOCK(mp_tp);
3260 /*
3261 * If mptcp_subflow_output is called before fastjoin_ev
3262 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
3263 * and further mpts->mpts_sndnxt is incremented by len copied.
3264 */
3265 if (mpts->mpts_sndnxt == 0) {
3266 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
fe8ab488
A
3267 }
3268 MPT_UNLOCK(mp_tp);
3269 }
3270
3271 return (MPTS_EVRET_OK);
3272}
3273
3274static ev_ret_t
3e170ce0
A
3275mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts,
3276 uint64_t *p_mpsofilt_hint)
fe8ab488 3277{
3e170ce0 3278#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3279 MPTE_LOCK_ASSERT_HELD(mpte);
3280 MPTS_LOCK_ASSERT_HELD(mpts);
3281 VERIFY(mpte->mpte_mppcb != NULL);
3e170ce0
A
3282
3283 mptcplog((LOG_DEBUG, "MPTCP Events: "
3284 "%s cid %d\n", __func__, mpts->mpts_connid),
3285 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
3286
3287 mpts->mpts_flags |= MPTSF_DELETEOK;
3288 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3289 return (MPTS_EVRET_DELETE);
3290 else
3291 return (MPTS_EVRET_OK);
3292}
3293
39236c6e
A
3294static const char *
3295mptcp_evret2str(ev_ret_t ret)
3296{
3297 const char *c = "UNKNOWN";
3298
3299 switch (ret) {
3300 case MPTS_EVRET_DELETE:
3301 c = "MPTS_EVRET_DELETE";
3302 break;
3303 case MPTS_EVRET_CONNECT_PENDING:
3304 c = "MPTS_EVRET_CONNECT_PENDING";
3305 break;
3306 case MPTS_EVRET_DISCONNECT_FALLBACK:
3307 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3308 break;
3309 case MPTS_EVRET_OK:
3310 c = "MPTS_EVRET_OK";
3311 break;
3e170ce0 3312 default:
39236c6e
A
3313 break;
3314 }
3315 return (c);
3316}
3317
3318/*
3319 * Add a reference to a subflow structure; used by MPTS_ADDREF().
3320 */
3321void
3322mptcp_subflow_addref(struct mptsub *mpts, int locked)
3323{
3324 if (!locked)
3325 MPTS_LOCK(mpts);
3326 else
3327 MPTS_LOCK_ASSERT_HELD(mpts);
3328
3329 if (++mpts->mpts_refcnt == 0) {
3330 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
3331 /* NOTREACHED */
3332 }
3333 if (!locked)
3334 MPTS_UNLOCK(mpts);
3335}
3336
3337/*
3338 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
3339 */
3340void
3341mptcp_subflow_remref(struct mptsub *mpts)
3342{
3343 MPTS_LOCK(mpts);
3344 if (mpts->mpts_refcnt == 0) {
3345 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
3346 /* NOTREACHED */
3347 }
3348 if (--mpts->mpts_refcnt > 0) {
3349 MPTS_UNLOCK(mpts);
3350 return;
3351 }
3352 /* callee will unlock and destroy lock */
3353 mptcp_subflow_free(mpts);
3354}
3355
3356/*
3357 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3358 * caller must ensure that the option can be issued on subflow sockets, via
3359 * MPOF_SUBFLOW_OK flag.
3360 */
3361int
3362mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
3363 struct mptopt *mpo)
3364{
3365 struct socket *mp_so;
3366 struct sockopt sopt;
3367 char buf[32];
3368 int error;
3369
3370 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3371 mpo->mpo_flags &= ~MPOF_INTERIM;
3372
3373 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3374 mp_so = mpte->mpte_mppcb->mpp_socket;
3375
3376 bzero(&sopt, sizeof (sopt));
3377 sopt.sopt_dir = SOPT_SET;
3378 sopt.sopt_level = mpo->mpo_level;
3379 sopt.sopt_name = mpo->mpo_name;
3380 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3381 sopt.sopt_valsize = sizeof (int);
3382 sopt.sopt_p = kernproc;
3383
3384 error = sosetoptlock(so, &sopt, 0); /* already locked */
3385 if (error == 0) {
3e170ce0
A
3386 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3387 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3388 "val %d set successful\n", __func__,
3389 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3390 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3391 buf, sizeof (buf)), mpo->mpo_intval),
3392 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3393 } else {
3e170ce0
A
3394 mptcplog((LOG_ERR, "MPTCP Socket: "
3395 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3396 "val %d set error %d\n", __func__,
3397 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3398 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3399 buf, sizeof (buf)), mpo->mpo_intval, error),
3400 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3401 }
3402 return (error);
3403}
3404
3405/*
3406 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
3407 * caller must ensure that the option can be issued on subflow sockets, via
3408 * MPOF_SUBFLOW_OK flag.
3409 */
3410int
3411mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
3412 struct mptopt *mpo)
3413{
3414 struct socket *mp_so;
3415 struct sockopt sopt;
3416 char buf[32];
3417 int error;
3418
3419 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3420 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3421 mp_so = mpte->mpte_mppcb->mpp_socket;
3422
3423 bzero(&sopt, sizeof (sopt));
3424 sopt.sopt_dir = SOPT_GET;
3425 sopt.sopt_level = mpo->mpo_level;
3426 sopt.sopt_name = mpo->mpo_name;
3427 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3428 sopt.sopt_valsize = sizeof (int);
3429 sopt.sopt_p = kernproc;
3430
3431 error = sogetoptlock(so, &sopt, 0); /* already locked */
3432 if (error == 0) {
3e170ce0
A
3433 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3434 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3435 "val %d get successful\n", __func__,
3436 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3437 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3438 buf, sizeof (buf)), mpo->mpo_intval),
3439 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3440 } else {
3e170ce0
A
3441 mptcplog((LOG_ERR, "MPTCP Socket: "
3442 "%s: mp_so 0x%llx sopt %s get error %d\n",
39236c6e
A
3443 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3444 mptcp_sopt2str(mpo->mpo_level,
3e170ce0
A
3445 mpo->mpo_name, buf, sizeof (buf)), error),
3446 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
3447 }
3448 return (error);
3449}
3450
3451
3452/*
3453 * MPTCP garbage collector.
3454 *
3455 * This routine is called by the MP domain on-demand, periodic callout,
3456 * which is triggered when a MPTCP socket is closed. The callout will
3457 * repeat as long as this routine returns a non-zero value.
3458 */
3459static uint32_t
3460mptcp_gc(struct mppcbinfo *mppi)
3461{
3462 struct mppcb *mpp, *tmpp;
3463 uint32_t active = 0;
3464
3465 lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
3466
39236c6e
A
3467 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
3468 struct socket *mp_so;
3469 struct mptses *mpte;
3470 struct mptcb *mp_tp;
3471
3472 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3473 mp_so = mpp->mpp_socket;
3474 VERIFY(mp_so != NULL);
3475 mpte = mptompte(mpp);
3476 VERIFY(mpte != NULL);
3477 mp_tp = mpte->mpte_mptcb;
3478 VERIFY(mp_tp != NULL);
3479
3e170ce0
A
3480 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3481 "%s: mp_so 0x%llx found "
39236c6e
A
3482 "(u=%d,r=%d,s=%d)\n", __func__,
3483 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3e170ce0
A
3484 mp_so->so_retaincnt, mpp->mpp_state),
3485 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3486
3487 if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
3e170ce0
A
3488 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3489 "%s: mp_so 0x%llx skipped "
39236c6e
A
3490 "(u=%d,r=%d)\n", __func__,
3491 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3492 mp_so->so_usecount, mp_so->so_retaincnt),
3493 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3494 active++;
3495 continue;
3496 }
3497
3498 /* check again under the lock */
3499 if (mp_so->so_usecount > 1) {
3500 boolean_t wakeup = FALSE;
3501 struct mptsub *mpts, *tmpts;
3502
3e170ce0
A
3503 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3504 "%s: mp_so 0x%llx skipped "
39236c6e
A
3505 "[u=%d,r=%d] %d %d\n", __func__,
3506 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3507 mp_so->so_usecount, mp_so->so_retaincnt,
3508 mp_tp->mpt_gc_ticks,
3e170ce0
A
3509 mp_tp->mpt_state),
3510 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3511
39236c6e
A
3512 MPT_LOCK(mp_tp);
3513 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3514 if (mp_tp->mpt_gc_ticks > 0)
3515 mp_tp->mpt_gc_ticks--;
3516 if (mp_tp->mpt_gc_ticks == 0) {
3517 wakeup = TRUE;
3518 if (mp_tp->mpt_localkey != NULL) {
3519 mptcp_free_key(
3520 mp_tp->mpt_localkey);
3521 mp_tp->mpt_localkey = NULL;
3522 }
3523 }
3524 }
3525 MPT_UNLOCK(mp_tp);
3526 if (wakeup) {
3527 TAILQ_FOREACH_SAFE(mpts,
3528 &mpte->mpte_subflows, mpts_entry, tmpts) {
3529 MPTS_LOCK(mpts);
3530 mpts->mpts_flags |= MPTSF_DELETEOK;
3531 if (mpts->mpts_soerror == 0)
3532 mpts->mpts_soerror = ETIMEDOUT;
3533 mptcp_subflow_eupcall(mpts->mpts_socket,
3534 mpts, SO_FILT_HINT_DISCONNECTED);
3535 MPTS_UNLOCK(mpts);
3536 }
3537 }
3538 lck_mtx_unlock(&mpp->mpp_lock);
3539 active++;
3540 continue;
3541 }
3542
3543 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
3e170ce0
A
3544 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3545 "%s: mp_so 0x%llx skipped "
39236c6e
A
3546 "[u=%d,r=%d,s=%d]\n", __func__,
3547 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3548 mp_so->so_usecount, mp_so->so_retaincnt,
3e170ce0
A
3549 mpp->mpp_state),
3550 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3551 lck_mtx_unlock(&mpp->mpp_lock);
3552 active++;
3553 continue;
3554 }
3555
3556 /*
3557 * The PCB has been detached, and there is exactly 1 refnct
3558 * held by the MPTCP thread. Signal that thread to terminate,
3559 * after which the last refcnt will be released. That will
3560 * allow it to be destroyed below during the next round.
3561 */
3562 if (mp_so->so_usecount == 1) {
3e170ce0
A
3563 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3564 "%s: mp_so 0x%llx scheduled for "
39236c6e
A
3565 "termination [u=%d,r=%d]\n", __func__,
3566 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3567 mp_so->so_usecount, mp_so->so_retaincnt),
3568 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3569
39236c6e
A
3570 /* signal MPTCP thread to terminate */
3571 mptcp_thread_terminate_signal(mpte);
3572 lck_mtx_unlock(&mpp->mpp_lock);
3573 active++;
3574 continue;
3575 }
3576
3e170ce0
A
3577 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3578 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
39236c6e 3579 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3580 mp_so->so_usecount, mp_so->so_retaincnt),
3581 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3582
39037602 3583 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
39236c6e
A
3584 struct sockbuf *, &mp_so->so_rcv,
3585 struct sockbuf *, &mp_so->so_snd,
3586 struct mppcb *, mpp);
3587
3588 mp_pcbdispose(mpp);
39037602 3589 sodealloc(mp_so);
39236c6e
A
3590 }
3591
3592 return (active);
3593}
3594
3595/*
3596 * Drop a MPTCP connection, reporting the specified error.
3597 */
3598struct mptses *
3599mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3600{
3601 struct socket *mp_so;
3602
3603 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3604 MPT_LOCK_ASSERT_HELD(mp_tp);
3605 VERIFY(mpte->mpte_mptcb == mp_tp);
3606 mp_so = mpte->mpte_mppcb->mpp_socket;
3607
fe8ab488 3608 mp_tp->mpt_state = MPTCPS_TERMINATE;
39037602 3609 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
39236c6e
A
3610 uint32_t, 0 /* event */);
3611
3612 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3613 errno = mp_tp->mpt_softerror;
3614 mp_so->so_error = errno;
3615
3616 return (mptcp_close(mpte, mp_tp));
3617}
3618
3619/*
3620 * Close a MPTCP control block.
3621 */
3622struct mptses *
3623mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
3624{
3e170ce0
A
3625 struct socket *mp_so = NULL;
3626 struct mptsub *mpts = NULL, *tmpts = NULL;
39236c6e
A
3627
3628 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3629 MPT_LOCK_ASSERT_HELD(mp_tp);
3630 VERIFY(mpte->mpte_mptcb == mp_tp);
3631 mp_so = mpte->mpte_mppcb->mpp_socket;
3632 if (mp_tp->mpt_localkey != NULL) {
3633 mptcp_free_key(mp_tp->mpt_localkey);
3634 mp_tp->mpt_localkey = NULL;
3635 }
3636
3637 MPT_UNLOCK(mp_tp);
3638 soisdisconnected(mp_so);
3639
3640 MPT_LOCK(mp_tp);
3641 if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
3642 return (NULL);
3643 }
3644 MPT_UNLOCK(mp_tp);
3645
3646 /* Clean up all subflows */
3647 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3648 MPTS_LOCK(mpts);
fe8ab488 3649 mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
39236c6e
A
3650 mptcp_subflow_disconnect(mpte, mpts, TRUE);
3651 MPTS_UNLOCK(mpts);
3652 mptcp_subflow_del(mpte, mpts, TRUE);
3653 }
3654 MPT_LOCK(mp_tp);
3655
3656 return (NULL);
3657}
3658
3659void
3660mptcp_notify_close(struct socket *so)
3661{
3662 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3663}
3664
3665/*
3666 * Signal MPTCP thread to wake up.
3667 */
3668void
3669mptcp_thread_signal(struct mptses *mpte)
3670{
3671 lck_mtx_lock(&mpte->mpte_thread_lock);
3672 mptcp_thread_signal_locked(mpte);
3673 lck_mtx_unlock(&mpte->mpte_thread_lock);
3674}
3675
3676/*
3677 * Signal MPTCP thread to wake up (locked version)
3678 */
3679static void
3680mptcp_thread_signal_locked(struct mptses *mpte)
3681{
3682 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3683
3684 mpte->mpte_thread_reqs++;
3685 if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3686 wakeup_one((caddr_t)&mpte->mpte_thread);
3687}
3688
3689/*
3690 * Signal MPTCP thread to terminate.
3691 */
3692static void
3693mptcp_thread_terminate_signal(struct mptses *mpte)
3694{
3695 lck_mtx_lock(&mpte->mpte_thread_lock);
3696 if (mpte->mpte_thread != THREAD_NULL) {
3697 mpte->mpte_thread = THREAD_NULL;
3698 mpte->mpte_thread_reqs++;
3699 if (!mpte->mpte_thread_active)
3700 wakeup_one((caddr_t)&mpte->mpte_thread);
3701 }
3702 lck_mtx_unlock(&mpte->mpte_thread_lock);
3703}
3704
3705/*
3706 * MPTCP thread workloop.
3707 */
3708static void
3709mptcp_thread_dowork(struct mptses *mpte)
3710{
3711 struct socket *mp_so;
3712 struct mptsub *mpts, *tmpts;
3713 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
3e170ce0 3714 uint64_t mpsofilt_hint_mask = 0;
39236c6e
A
3715
3716 MPTE_LOCK(mpte); /* same as MP socket lock */
3717 VERIFY(mpte->mpte_mppcb != NULL);
3718 mp_so = mpte->mpte_mppcb->mpp_socket;
3719 VERIFY(mp_so != NULL);
3720
3721 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3722 ev_ret_t ret;
3723
3724 MPTS_LOCK(mpts);
3725 MPTS_ADDREF_LOCKED(mpts); /* for us */
490019cf 3726
39236c6e
A
3727 /* Update process ownership based on parent mptcp socket */
3728 mptcp_update_last_owner(mpts, mp_so);
490019cf 3729
39236c6e 3730 mptcp_subflow_input(mpte, mpts);
3e170ce0
A
3731
3732 mptcp_get_rtt_measurement(mpts, mpte);
3733
3734 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
39236c6e
A
3735
3736 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3e170ce0
A
3737 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3738 "%s: cid %d \n", __func__,
3739 mpts->mpts_connid),
3740 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3741 (void) mptcp_subflow_output(mpte, mpts);
3742 }
3743
3744 /*
3745 * If MPTCP socket is closed, disconnect all subflows.
3746 * This will generate a disconnect event which will
3747 * be handled during the next iteration, causing a
3748 * non-zero error to be returned above.
3749 */
3750 if (mp_so->so_flags & SOF_PCBCLEARING)
3751 mptcp_subflow_disconnect(mpte, mpts, FALSE);
3752 MPTS_UNLOCK(mpts);
3753
3754 switch (ret) {
39236c6e
A
3755 case MPTS_EVRET_OK:
3756 /* nothing to do */
3757 break;
3758 case MPTS_EVRET_DELETE:
fe8ab488 3759 mptcp_subflow_del(mpte, mpts, TRUE);
39236c6e
A
3760 break;
3761 case MPTS_EVRET_CONNECT_PENDING:
3762 connect_pending = TRUE;
3763 break;
3764 case MPTS_EVRET_DISCONNECT_FALLBACK:
3765 disconnect_fallback = TRUE;
3766 break;
3e170ce0
A
3767 default:
3768 mptcplog((LOG_DEBUG,
3769 "MPTCP Socket: %s: mptcp_subflow_events "
3770 "returned invalid value: %d\n", __func__,
3771 ret),
3772 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3773 break;
39236c6e
A
3774 }
3775 MPTS_REMREF(mpts); /* ours */
3776 }
3777
3e170ce0 3778 if (mpsofilt_hint_mask) {
39037602
A
3779 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
3780 socantrcvmore(mp_so);
3781 mpsofilt_hint_mask &= ~SO_FILT_HINT_CANTRCVMORE;
3782 }
3783
3784 if (mpsofilt_hint_mask & SO_FILT_HINT_CONNRESET) {
3785 struct mptcb *mp_tp = mpte->mpte_mptcb;
3786
3787 MPT_LOCK(mp_tp);
3788 mptcp_drop(mpte, mp_tp, ECONNRESET);
3789 MPT_UNLOCK(mp_tp);
3790 }
3791
3e170ce0 3792 soevent(mp_so, mpsofilt_hint_mask);
39236c6e
A
3793 }
3794
3795 if (!connect_pending && !disconnect_fallback) {
3796 MPTE_UNLOCK(mpte);
3797 return;
3798 }
3799
3800 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3801 MPTS_LOCK(mpts);
3802 if (disconnect_fallback) {
3803 struct socket *so = NULL;
3804 struct inpcb *inp = NULL;
3805 struct tcpcb *tp = NULL;
3806
3807 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3808 MPTS_UNLOCK(mpts);
3809 continue;
3810 }
3811
3812 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3813
3814 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
3e170ce0 3815 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING)) {
39236c6e
A
3816 MPTS_UNLOCK(mpts);
3817 continue;
3818 }
490019cf
A
3819
3820 if (mpts->mpts_flags & MPTSF_TFO_REQD)
39037602 3821 mptcp_drop_tfo_data(mpte, mpts, NULL);
490019cf 3822
39236c6e
A
3823 so = mpts->mpts_socket;
3824
3825 /*
3826 * The MPTCP connection has degraded to a fallback
3827 * mode, so there is no point in keeping this subflow
3828 * regardless of its MPTCP-readiness state, unless it
3829 * is the primary one which we use for fallback. This
3830 * assumes that the subflow used for fallback is the
3831 * ACTIVE one.
3832 */
3833
3834 socket_lock(so, 1);
3835 inp = sotoinpcb(so);
3836 tp = intotcpcb(inp);
3837 tp->t_mpflags &=
3838 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
3839 tp->t_mpflags |= TMPF_TCP_FALLBACK;
490019cf 3840
39236c6e
A
3841 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3842 socket_unlock(so, 1);
3843 MPTS_UNLOCK(mpts);
3844 continue;
3845 }
3846 tp->t_mpflags |= TMPF_RESET;
3847 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3848 socket_unlock(so, 1);
3849
3850 } else if (connect_pending) {
fe8ab488
A
3851 /*
3852 * If delayed subflow start is set and cellular,
3853 * delay the connect till a retransmission timeout
3854 */
3855
3856 if ((mptcp_delayed_subf_start) &&
3857 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
3858 MPTS_UNLOCK(mpts);
3859 continue;
3860 }
3861
39236c6e
A
3862 /*
3863 * The MPTCP connection has progressed to a state
3864 * where it supports full multipath semantics; allow
3865 * additional joins to be attempted for all subflows
3866 * that are in the PENDING state.
3867 */
3868 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
3869 (void) mptcp_subflow_soconnectx(mpte, mpts);
3870 }
3871 }
3872 MPTS_UNLOCK(mpts);
3873 }
3874
3875 MPTE_UNLOCK(mpte);
3876}
3877
3878/*
3879 * MPTCP thread.
3880 */
3881static void
3882mptcp_thread_func(void *v, wait_result_t w)
3883{
3884#pragma unused(w)
3885 struct mptses *mpte = v;
3886 struct timespec *ts = NULL;
3887
3888 VERIFY(mpte != NULL);
3889
3890 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3891
3892 for (;;) {
3893 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3894
3895 if (mpte->mpte_thread != THREAD_NULL) {
3896 (void) msleep(&mpte->mpte_thread,
3897 &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
3898 __func__, ts);
3899 }
3900
3901 /* MPTCP socket is closed? */
3902 if (mpte->mpte_thread == THREAD_NULL) {
3903 lck_mtx_unlock(&mpte->mpte_thread_lock);
3904 /* callee will destroy thread lock */
3905 mptcp_thread_destroy(mpte);
3906 /* NOTREACHED */
3907 return;
3908 }
3909
3910 mpte->mpte_thread_active = 1;
3911 for (;;) {
3912 uint32_t reqs = mpte->mpte_thread_reqs;
3913
3914 lck_mtx_unlock(&mpte->mpte_thread_lock);
3915 mptcp_thread_dowork(mpte);
3916 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3917
3918 /* if there's no pending request, we're done */
3919 if (reqs == mpte->mpte_thread_reqs ||
3920 mpte->mpte_thread == THREAD_NULL)
3921 break;
3922 }
3923 mpte->mpte_thread_reqs = 0;
3924 mpte->mpte_thread_active = 0;
3925 }
3926}
3927
3928/*
3929 * Destroy a MTCP thread, to be called in the MPTCP thread context
3930 * upon receiving an indication to self-terminate. This routine
3931 * will not return, as the current thread is terminated at the end.
3932 */
3933static void
3934mptcp_thread_destroy(struct mptses *mpte)
3935{
3936 struct socket *mp_so;
3937
3938 MPTE_LOCK(mpte); /* same as MP socket lock */
3939 VERIFY(mpte->mpte_thread == THREAD_NULL);
3940 VERIFY(mpte->mpte_mppcb != NULL);
3941
3942 mptcp_sesdestroy(mpte);
3943
3944 mp_so = mpte->mpte_mppcb->mpp_socket;
3945 VERIFY(mp_so != NULL);
d190cdc3 3946 VERIFY(mp_so->so_usecount > 0);
39236c6e
A
3947 mp_so->so_usecount--; /* for thread */
3948 mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
3949 MPTE_UNLOCK(mpte);
3950
3951 /* for the extra refcnt from kernel_thread_start() */
3952 thread_deallocate(current_thread());
3953 /* this is the end */
3954 thread_terminate(current_thread());
3955 /* NOTREACHED */
3956}
3957
3958/*
3959 * Protocol pr_lock callback.
3960 */
3961int
3962mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3963{
3964 struct mppcb *mpp = sotomppcb(mp_so);
3965 void *lr_saved;
3966
3967 if (lr == NULL)
3968 lr_saved = __builtin_return_address(0);
3969 else
3970 lr_saved = lr;
3971
3972 if (mpp == NULL) {
3973 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3974 mp_so, lr_saved, solockhistory_nr(mp_so));
3975 /* NOTREACHED */
3976 }
3977 lck_mtx_lock(&mpp->mpp_lock);
3978
3979 if (mp_so->so_usecount < 0) {
3980 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3981 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3982 solockhistory_nr(mp_so));
3983 /* NOTREACHED */
3984 }
3985 if (refcount != 0)
3986 mp_so->so_usecount++;
3987 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3988 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3989
3990 return (0);
3991}
3992
3993/*
3994 * Protocol pr_unlock callback.
3995 */
3996int
3997mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3998{
3999 struct mppcb *mpp = sotomppcb(mp_so);
4000 void *lr_saved;
4001
4002 if (lr == NULL)
4003 lr_saved = __builtin_return_address(0);
4004 else
4005 lr_saved = lr;
4006
4007 if (mpp == NULL) {
4008 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4009 mp_so, mp_so->so_usecount, lr_saved,
4010 solockhistory_nr(mp_so));
4011 /* NOTREACHED */
4012 }
4013 lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
4014
4015 if (refcount != 0)
4016 mp_so->so_usecount--;
4017
4018 if (mp_so->so_usecount < 0) {
4019 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4020 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4021 /* NOTREACHED */
4022 }
4023 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4024 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4025 lck_mtx_unlock(&mpp->mpp_lock);
4026
4027 return (0);
4028}
4029
4030/*
4031 * Protocol pr_getlock callback.
4032 */
4033lck_mtx_t *
4034mptcp_getlock(struct socket *mp_so, int locktype)
4035{
4036#pragma unused(locktype)
4037 struct mppcb *mpp = sotomppcb(mp_so);
4038
4039 if (mpp == NULL) {
4040 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4041 solockhistory_nr(mp_so));
4042 /* NOTREACHED */
4043 }
4044 if (mp_so->so_usecount < 0) {
4045 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4046 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4047 /* NOTREACHED */
4048 }
4049 return (&mpp->mpp_lock);
4050}
4051
4052/*
4053 * Key generation functions
4054 */
4055static void
4056mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
4057{
4058 struct mptcp_key_entry *key_elm;
4059try_again:
4060 read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
4061 if (key_entry->mkey_value == 0)
4062 goto try_again;
4063 mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
4064 sizeof (key_entry->mkey_digest));
4065
4066 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4067 if (key_elm->mkey_value == key_entry->mkey_value) {
4068 goto try_again;
4069 }
4070 if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
4071 0) {
4072 goto try_again;
4073 }
4074 }
4075}
4076
4077static mptcp_key_t *
4078mptcp_reserve_key(void)
4079{
4080 struct mptcp_key_entry *key_elm;
4081 struct mptcp_key_entry *found_elm = NULL;
4082
4083 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4084 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4085 if (key_elm->mkey_flags == MKEYF_FREE) {
4086 key_elm->mkey_flags = MKEYF_INUSE;
4087 found_elm = key_elm;
4088 break;
4089 }
4090 }
4091 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4092
4093 if (found_elm) {
4094 return (&found_elm->mkey_value);
4095 }
4096
4097 key_elm = (struct mptcp_key_entry *)
4098 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4099 key_elm->mkey_flags = MKEYF_INUSE;
4100
4101 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4102 mptcp_generate_unique_key(key_elm);
4103 LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
4104 mptcp_keys_pool.mkph_count += 1;
4105 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4106 return (&key_elm->mkey_value);
4107}
4108
4109static caddr_t
4110mptcp_get_stored_digest(mptcp_key_t *key)
4111{
4112 struct mptcp_key_entry *key_holder;
4113 caddr_t digest = NULL;
4114
4115 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4116 key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
4117 offsetof(struct mptcp_key_entry, mkey_value));
4118 if (key_holder->mkey_flags != MKEYF_INUSE)
4119 panic_plain("%s", __func__);
4120 digest = &key_holder->mkey_digest[0];
4121 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4122 return (digest);
4123}
4124
4125void
4126mptcp_free_key(mptcp_key_t *key)
4127{
4128 struct mptcp_key_entry *key_holder;
4129 struct mptcp_key_entry *key_elm;
4130 int pt = RandomULong();
4131
39236c6e
A
4132 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4133 key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
4134 offsetof(struct mptcp_key_entry, mkey_value));
4135 key_holder->mkey_flags = MKEYF_FREE;
4136
4137 LIST_REMOVE(key_holder, mkey_next);
4138 mptcp_keys_pool.mkph_count -= 1;
4139
4140 /* Free half the time */
4141 if (pt & 0x01) {
4142 zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
4143 } else {
4144 /* Insert it at random point to avoid early reuse */
4145 int i = 0;
4146 if (mptcp_keys_pool.mkph_count > 1) {
4147 pt = pt % (mptcp_keys_pool.mkph_count - 1);
4148 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4149 if (++i >= pt) {
4150 LIST_INSERT_AFTER(key_elm, key_holder,
4151 mkey_next);
4152 break;
4153 }
4154 }
4155 if (i < pt)
4156 panic("missed insertion");
4157 } else {
4158 LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
4159 mkey_next);
4160 }
4161 mptcp_keys_pool.mkph_count += 1;
4162 }
4163 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4164}
4165
4166static void
4167mptcp_key_pool_init(void)
4168{
4169 int i;
4170 struct mptcp_key_entry *key_entry;
4171
4172 LIST_INIT(&mptcp_keys_pool);
4173 mptcp_keys_pool.mkph_count = 0;
4174
4175 mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
4176 (sizeof (struct mptcp_key_entry));
4177 mptcp_keys_pool.mkph_key_entry_zone = zinit(
4178 mptcp_keys_pool.mkph_key_elm_sz,
4179 MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
4180 MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
4181 if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
4182 panic("%s: unable to allocate MPTCP keys zone \n", __func__);
4183 /* NOTREACHED */
4184 }
4185 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
4186 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
4187
4188 for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
4189 key_entry = (struct mptcp_key_entry *)
4190 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4191 key_entry->mkey_flags = MKEYF_FREE;
4192 mptcp_generate_unique_key(key_entry);
4193 LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
4194 mptcp_keys_pool.mkph_count += 1;
4195 }
4196 lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
4197 mtcbinfo.mppi_lock_attr);
4198}
4199
4200/*
4201 * MPTCP Join support
4202 */
4203
4204static void
4205mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
fe8ab488 4206 uint8_t addr_id)
39236c6e
A
4207{
4208 struct tcpcb *tp = sototcpcb(so);
4209 struct mptcp_subf_auth_entry *sauth_entry;
4210 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4211
4212 MPT_LOCK_SPIN(mp_tp);
4213 tp->t_mptcb = mp_tp;
39236c6e 4214 /*
39236c6e
A
4215 * The address ID of the first flow is implicitly 0.
4216 */
4217 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4218 tp->t_local_aid = 0;
4219 } else {
fe8ab488 4220 tp->t_local_aid = addr_id;
39236c6e
A
4221 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4222 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4223 }
fe8ab488 4224 MPT_UNLOCK(mp_tp);
39236c6e
A
4225 sauth_entry = zalloc(mpt_subauth_zone);
4226 sauth_entry->msae_laddr_id = tp->t_local_aid;
4227 sauth_entry->msae_raddr_id = 0;
4228 sauth_entry->msae_raddr_rand = 0;
4229try_again:
4230 sauth_entry->msae_laddr_rand = RandomULong();
4231 if (sauth_entry->msae_laddr_rand == 0)
4232 goto try_again;
fe8ab488 4233 MPT_LOCK_SPIN(mp_tp);
39236c6e 4234 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
fe8ab488 4235 MPT_UNLOCK(mp_tp);
39236c6e
A
4236}
4237
4238static void
4239mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4240{
4241 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 4242 struct tcpcb *tp = NULL;
39236c6e
A
4243 int found = 0;
4244
fe8ab488
A
4245 socket_lock(so, 0);
4246 tp = sototcpcb(so);
4247 if (tp == NULL) {
4248 socket_unlock(so, 0);
39236c6e 4249 return;
fe8ab488 4250 }
39236c6e
A
4251
4252 MPT_LOCK(mp_tp);
4253 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4254 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4255 found = 1;
4256 break;
4257 }
4258 }
4259 if (found) {
4260 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 4261 }
39236c6e 4262 MPT_UNLOCK(mp_tp);
fe8ab488 4263
3e170ce0
A
4264 if (found)
4265 zfree(mpt_subauth_zone, sauth_entry);
4266
fe8ab488
A
4267 tp->t_mptcb = NULL;
4268 socket_unlock(so, 0);
39236c6e
A
4269}
4270
4271void
4272mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4273 u_int32_t *rrand)
4274{
4275 struct mptcp_subf_auth_entry *sauth_entry;
4276 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4277
4278 MPT_LOCK(mp_tp);
4279 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4280 if (sauth_entry->msae_laddr_id == addr_id) {
4281 if (lrand)
4282 *lrand = sauth_entry->msae_laddr_rand;
4283 if (rrand)
4284 *rrand = sauth_entry->msae_raddr_rand;
4285 break;
4286 }
4287 }
4288 MPT_UNLOCK(mp_tp);
4289}
4290
4291void
4292mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4293 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4294{
4295 struct mptcp_subf_auth_entry *sauth_entry;
4296 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4297
4298 MPT_LOCK(mp_tp);
4299 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4300 if (sauth_entry->msae_laddr_id == laddr_id) {
4301 if ((sauth_entry->msae_raddr_id != 0) &&
4302 (sauth_entry->msae_raddr_id != raddr_id)) {
3e170ce0 4303 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
39236c6e 4304 " address ids %d %d \n", __func__, raddr_id,
3e170ce0
A
4305 sauth_entry->msae_raddr_id),
4306 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4307 MPT_UNLOCK(mp_tp);
4308 return;
4309 }
4310 sauth_entry->msae_raddr_id = raddr_id;
4311 if ((sauth_entry->msae_raddr_rand != 0) &&
4312 (sauth_entry->msae_raddr_rand != raddr_rand)) {
3e170ce0
A
4313 mptcplog((LOG_ERR, "MPTCP Socket: "
4314 "%s: dup SYN_ACK %d %d \n",
39236c6e 4315 __func__, raddr_rand,
3e170ce0
A
4316 sauth_entry->msae_raddr_rand),
4317 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4318 MPT_UNLOCK(mp_tp);
4319 return;
4320 }
4321 sauth_entry->msae_raddr_rand = raddr_rand;
4322 MPT_UNLOCK(mp_tp);
4323 return;
4324 }
4325 }
4326 MPT_UNLOCK(mp_tp);
4327}
4328
4329/*
4330 * SHA1 support for MPTCP
4331 */
4332static int
4333mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
4334{
4335 SHA1_CTX sha1ctxt;
4336 const unsigned char *sha1_base;
4337 int sha1_size;
4338
4339 if (digest_len != SHA1_RESULTLEN) {
4340 return (FALSE);
4341 }
4342
4343 sha1_base = (const unsigned char *) key;
4344 sha1_size = sizeof (mptcp_key_t);
4345 SHA1Init(&sha1ctxt);
4346 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4347 SHA1Final(sha_digest, &sha1ctxt);
4348 return (TRUE);
4349}
4350
4351void
4352mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4353 u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
4354{
4355 SHA1_CTX sha1ctxt;
4356 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4357 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4358 u_int32_t data[2];
4359 int i;
4360
4361 bzero(digest, digest_len);
4362
4363 /* Set up the Key for HMAC */
4364 key_ipad[0] = key1;
4365 key_ipad[1] = key2;
4366
4367 key_opad[0] = key1;
4368 key_opad[1] = key2;
4369
4370 /* Set up the message for HMAC */
4371 data[0] = rand1;
4372 data[1] = rand2;
4373
4374 /* Key is 512 block length, so no need to compute hash */
4375
4376 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4377
4378 for (i = 0; i < 8; i++) {
4379 key_ipad[i] ^= 0x3636363636363636;
4380 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4381 }
4382
4383 /* Perform inner SHA1 */
4384 SHA1Init(&sha1ctxt);
4385 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4386 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4387 SHA1Final(digest, &sha1ctxt);
4388
4389 /* Perform outer SHA1 */
4390 SHA1Init(&sha1ctxt);
4391 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4392 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4393 SHA1Final(digest, &sha1ctxt);
4394}
4395
4396/*
4397 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4398 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4399 */
4400void
4401mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
4402 int digest_len)
4403{
4404 uint32_t lrand, rrand;
4405 mptcp_key_t localkey, remotekey;
4406 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4407
4408 if (digest_len != SHA1_RESULTLEN)
4409 return;
4410
4411 lrand = rrand = 0;
4412 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4413 MPT_LOCK_SPIN(mp_tp);
4414 localkey = *mp_tp->mpt_localkey;
4415 remotekey = mp_tp->mpt_remotekey;
4416 MPT_UNLOCK(mp_tp);
4417 mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
4418 digest_len);
4419}
4420
4421u_int64_t
4422mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
4423{
4424 u_char digest[SHA1_RESULTLEN];
4425 u_int64_t trunced_digest;
4426
4427 mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
4428 bcopy(digest, &trunced_digest, 8);
4429 return (trunced_digest);
4430}
4431
4432/*
4433 * Authentication data generation
4434 */
490019cf 4435void
39236c6e
A
4436mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4437 int token_len)
4438{
4439 VERIFY(token_len == sizeof (u_int32_t));
4440 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4441
4442 /* Most significant 32 bits of the SHA1 hash */
4443 bcopy(sha_digest, token, sizeof (u_int32_t));
490019cf 4444 return;
39236c6e
A
4445}
4446
490019cf 4447void
39236c6e
A
4448mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4449 int idsn_len)
4450{
4451 VERIFY(idsn_len == sizeof (u_int64_t));
4452 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4453
4454 /*
4455 * Least significant 64 bits of the SHA1 hash
4456 */
4457
4458 idsn[7] = sha_digest[12];
4459 idsn[6] = sha_digest[13];
4460 idsn[5] = sha_digest[14];
4461 idsn[4] = sha_digest[15];
4462 idsn[3] = sha_digest[16];
4463 idsn[2] = sha_digest[17];
4464 idsn[1] = sha_digest[18];
4465 idsn[0] = sha_digest[19];
490019cf 4466 return;
39236c6e
A
4467}
4468
490019cf
A
4469static void
4470mptcp_conn_properties(struct mptcb *mp_tp)
4471{
4472 /* There is only Version 0 at this time */
4473 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4474
4475 /* Set DSS checksum flag */
4476 if (mptcp_dss_csum)
4477 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4478
4479 /* Set up receive window */
4480 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4481
4482 /* Set up gc ticks */
4483 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4484}
4485
4486static void
4487mptcp_init_local_parms(struct mptcb *mp_tp)
39236c6e
A
4488{
4489 caddr_t local_digest = NULL;
490019cf
A
4490
4491 mp_tp->mpt_localkey = mptcp_reserve_key();
4492 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
4493 mptcp_generate_token(local_digest, SHA1_RESULTLEN,
4494 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4495 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
4496 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4497
4498 /* The subflow SYN is also first MPTCP byte */
4499 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4500 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4501
4502 mptcp_conn_properties(mp_tp);
4503}
4504
4505int
4506mptcp_init_remote_parms(struct mptcb *mp_tp)
4507{
39236c6e
A
4508 char remote_digest[MPTCP_SHA1_RESULTLEN];
4509 MPT_LOCK_ASSERT_HELD(mp_tp);
4510
4511 /* Only Version 0 is supported for auth purposes */
3e170ce0 4512 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
39236c6e
A
4513 return (-1);
4514
4515 /* Setup local and remote tokens and Initial DSNs */
39236c6e
A
4516
4517 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
4518 SHA1_RESULTLEN)) {
3e170ce0
A
4519 mptcplog((LOG_ERR, "MPTCP Socket: %s: unexpected failure",
4520 __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4521 return (-1);
4522 }
4523 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
490019cf 4524 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
39236c6e
A
4525 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4526 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
39236c6e 4527 mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
39236c6e 4528
490019cf 4529 return (0);
39236c6e
A
4530}
4531
4532/*
4533 * Helper Functions
4534 */
4535mptcp_token_t
4536mptcp_get_localtoken(void* mptcb_arg)
4537{
4538 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4539 return (mp_tp->mpt_localtoken);
4540}
4541
4542mptcp_token_t
4543mptcp_get_remotetoken(void* mptcb_arg)
4544{
4545 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4546 return (mp_tp->mpt_remotetoken);
4547}
4548
4549u_int64_t
4550mptcp_get_localkey(void* mptcb_arg)
4551{
4552 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4553 if (mp_tp->mpt_localkey != NULL)
4554 return (*mp_tp->mpt_localkey);
4555 else
4556 return (0);
4557}
4558
4559u_int64_t
4560mptcp_get_remotekey(void* mptcb_arg)
4561{
4562 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4563 return (mp_tp->mpt_remotekey);
4564}
4565
4566void
4567mptcp_send_dfin(struct socket *so)
4568{
4569 struct tcpcb *tp = NULL;
4570 struct inpcb *inp = NULL;
4571
4572 inp = sotoinpcb(so);
4573 if (!inp)
4574 return;
4575
4576 tp = intotcpcb(inp);
4577 if (!tp)
4578 return;
4579
4580 if (!(tp->t_mpflags & TMPF_RESET))
4581 tp->t_mpflags |= TMPF_SEND_DFIN;
4582}
4583
4584/*
4585 * Data Sequence Mapping routines
4586 */
4587void
4588mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4589{
4590 struct mptcb *mp_tp;
4591
4592 if (m == NULL)
4593 return;
4594
3e170ce0 4595 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
39236c6e 4596 MPT_LOCK(mp_tp);
39236c6e
A
4597 while (m) {
4598 VERIFY(m->m_flags & M_PKTHDR);
4599 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4600 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4601 m->m_pkthdr.mp_rlen = m_pktlen(m);
4602 mp_tp->mpt_sndmax += m_pktlen(m);
4603 m = m->m_next;
4604 }
4605 MPT_UNLOCK(mp_tp);
4606}
4607
4608void
490019cf 4609mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
39236c6e
A
4610{
4611 u_int32_t sub_len = 0;
490019cf
A
4612 int rewinding = 0;
4613
4614 if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
4615 /* TFO makes things complicated. */
4616 if (so->so_flags1 & SOF1_TFO_REWIND) {
4617 rewinding = 1;
4618 so->so_flags1 &= ~SOF1_TFO_REWIND;
4619 }
4620 }
39236c6e
A
4621
4622 while (m) {
4623 VERIFY(m->m_flags & M_PKTHDR);
4624
4625 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4626 sub_len = m->m_pkthdr.mp_rlen;
4627
4628 if (sub_len < len) {
4629 m->m_pkthdr.mp_dsn += sub_len;
4630 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4631 m->m_pkthdr.mp_rseq += sub_len;
4632 }
4633 m->m_pkthdr.mp_rlen = 0;
4634 len -= sub_len;
4635 } else {
4636 /* sub_len >= len */
490019cf
A
4637 if (rewinding == 0)
4638 m->m_pkthdr.mp_dsn += len;
39236c6e 4639 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
490019cf
A
4640 if (rewinding == 0)
4641 m->m_pkthdr.mp_rseq += len;
39236c6e 4642 }
3e170ce0 4643 mptcplog((LOG_DEBUG, "MPTCP Sender: "
490019cf 4644 "%s: dsn 0x%llx ssn %u len %d %d\n",
3e170ce0 4645 __func__,
39236c6e 4646 m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
3e170ce0
A
4647 m->m_pkthdr.mp_rlen, len),
4648 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 4649 m->m_pkthdr.mp_rlen -= len;
39037602 4650 break;
39236c6e
A
4651 }
4652 } else {
4653 panic("%s: MPTCP tag not set", __func__);
4654 /* NOTREACHED */
4655 }
4656 m = m->m_next;
4657 }
39037602
A
4658
4659 if (so->so_flags & SOF_MP_SUBFLOW &&
4660 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
4661 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
4662 /*
4663 * Received an ack without receiving a DATA_ACK.
4664 * Need to fallback to regular TCP (or destroy this subflow).
4665 */
4666 mptcp_notify_mpfail(so);
4667 }
39236c6e
A
4668}
4669
4670/* Obtain the DSN mapping stored in the mbuf */
4671void
4672mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4673 u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4674{
4675 u_int64_t dsn64;
4676
4677 mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4678 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4679 *dsn64p = dsn64;
4680}
4681
4682void
4683mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
4684 u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
4685{
4686 struct mbuf *m = so->so_snd.sb_mb;
4687 struct mbuf *mnext = NULL;
4688 uint32_t runlen = 0;
4689 u_int64_t dsn64;
4690 uint32_t contig_len = 0;
4691
4692 if (m == NULL)
4693 return;
4694
4695 if (off < 0)
4696 return;
4697 /*
4698 * In the subflow socket, the DSN sequencing can be discontiguous,
4699 * but the subflow sequence mapping is contiguous. Use the subflow
4700 * sequence property to find the right mbuf and corresponding dsn
4701 * mapping.
4702 */
4703
4704 while (m) {
4705 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4706 VERIFY(m->m_flags & M_PKTHDR);
4707
4708 if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
4709 off -= m->m_pkthdr.mp_rlen;
4710 m = m->m_next;
4711 } else {
4712 break;
4713 }
4714 }
4715
4716 if (m == NULL) {
4717 panic("%s: bad offset", __func__);
4718 /* NOTREACHED */
4719 }
4720
4721 dsn64 = m->m_pkthdr.mp_dsn + off;
4722 *dsn = dsn64;
4723 *relseq = m->m_pkthdr.mp_rseq + off;
4724
4725 /*
4726 * Now find the last contiguous byte and its length from
4727 * start.
4728 */
4729 runlen = m->m_pkthdr.mp_rlen - off;
4730 contig_len = runlen;
4731
4732 /* If datalen does not span multiple mbufs, return */
4733 if (datalen <= runlen) {
4734 *data_len = min(datalen, UINT16_MAX);
4735 return;
4736 }
4737
4738 mnext = m->m_next;
4739 while (datalen > runlen) {
4740 if (mnext == NULL) {
4741 panic("%s: bad datalen = %d, %d %d", __func__, datalen,
4742 runlen, off);
4743 /* NOTREACHED */
4744 }
4745 VERIFY(mnext->m_flags & M_PKTHDR);
4746 VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
4747
4748 /*
4749 * case A. contiguous DSN stream
4750 * case B. discontiguous DSN stream
4751 */
4752 if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
4753 /* case A */
4754 runlen += mnext->m_pkthdr.mp_rlen;
4755 contig_len += mnext->m_pkthdr.mp_rlen;
3e170ce0
A
4756 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: contig \n",
4757 __func__), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4758 } else {
4759 /* case B */
3e170ce0 4760 mptcplog((LOG_DEBUG, "MPTCP Sender: "
fe8ab488 4761 "%s: discontig datalen %d contig_len %d cc %d \n",
3e170ce0
A
4762 __func__, datalen, contig_len, so->so_snd.sb_cc),
4763 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4764 break;
4765 }
4766 mnext = mnext->m_next;
4767 }
4768 datalen = min(datalen, UINT16_MAX);
4769 *data_len = min(datalen, contig_len);
3e170ce0
A
4770 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4771 "%s: %llu %u %d %d \n", __func__,
4772 *dsn, *relseq, *data_len, off),
4773 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4774}
4775
4776/*
4777 * MPTCP's notion of the next insequence Data Sequence number is adjusted
4778 * here. It must be called from mptcp_adj_rmap() which is called only after
4779 * reassembly of out of order data. The rcvnxt variable must
4780 * be updated only when atleast some insequence new data is received.
4781 */
4782static void
4783mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4784{
4785 struct mptcb *mp_tp = tptomptp(tp);
4786
4787 if (mp_tp == NULL)
4788 return;
4789 MPT_LOCK(mp_tp);
4790 if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4791 (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4792 m->m_pkthdr.mp_rlen)))) {
4793 mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4794 }
4795 MPT_UNLOCK(mp_tp);
4796}
4797
4798/*
3e170ce0
A
4799 * Note that this is called only from tcp_input() via mptcp_input_preproc()
4800 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4801 * When it trims data tcp_input calls m_adj() which does not remove the
4802 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4803 * The dsn map insertion cannot be delayed after trim, because data can be in
4804 * the reassembly queue for a while and the DSN option info in tp will be
4805 * overwritten for every new packet received.
39236c6e
A
4806 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4807 * with mptcp_adj_rmap()
4808 */
4809void
4810mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4811{
4812 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4813
4814 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4815 VERIFY(m->m_flags & M_PKTHDR);
4816 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4817 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4818 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4819 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4820 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4821 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4822 }
4823}
4824
fe8ab488 4825int
39236c6e
A
4826mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4827{
4828 u_int64_t dsn;
4829 u_int32_t sseq, datalen;
4830 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4831 u_int32_t old_rcvnxt = 0;
4832
4833 if (m_pktlen(m) == 0)
fe8ab488 4834 return 0;
39236c6e
A
4835
4836 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4837 VERIFY(m->m_flags & M_PKTHDR);
4838
4839 dsn = m->m_pkthdr.mp_dsn;
4840 sseq = m->m_pkthdr.mp_rseq + tp->irs;
4841 datalen = m->m_pkthdr.mp_rlen;
4842 } else {
4843 /* data arrived without an DSS option mapping */
fe8ab488
A
4844
4845 /* initial subflow can fallback right after SYN handshake */
39236c6e 4846 mptcp_notify_mpfail(so);
fe8ab488 4847 return 0;
39236c6e
A
4848 }
4849
4850 /* In the common case, data is in window and in sequence */
4851 if (m->m_pkthdr.len == (int)datalen) {
4852 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4853 return 0;
39236c6e
A
4854 }
4855
39236c6e
A
4856 old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4857 if (SEQ_GT(old_rcvnxt, sseq)) {
4858 /* data trimmed from the left */
4859 int off = old_rcvnxt - sseq;
4860 m->m_pkthdr.mp_dsn += off;
4861 m->m_pkthdr.mp_rseq += off;
fe8ab488 4862 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
39236c6e
A
4863 } else if (old_rcvnxt == sseq) {
4864 /*
3e170ce0 4865 * data was trimmed from the right
39236c6e
A
4866 */
4867 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4868 } else {
fe8ab488 4869 mptcp_notify_mpfail(so);
3e170ce0 4870 return (-1);
39236c6e
A
4871 }
4872 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4873 return 0;
39236c6e
A
4874}
4875
4876/*
4877 * Following routines help with failure detection and failover of data
4878 * transfer from one subflow to another.
4879 */
4880void
4881mptcp_act_on_txfail(struct socket *so)
4882{
4883 struct tcpcb *tp = NULL;
4884 struct inpcb *inp = sotoinpcb(so);
4885
4886 if (inp == NULL)
4887 return;
4888
4889 tp = intotcpcb(inp);
4890 if (tp == NULL)
4891 return;
4892
39236c6e
A
4893 if (so->so_flags & SOF_MP_TRYFAILOVER) {
4894 return;
4895 }
4896
4897 so->so_flags |= SOF_MP_TRYFAILOVER;
4898 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4899}
4900
4901/*
4902 * Support for MP_FAIL option
4903 */
4904int
4905mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4906{
4907 struct mbuf *m = so->so_snd.sb_mb;
4908 u_int64_t dsn;
4909 int off = 0;
4910 u_int32_t datalen;
4911
4912 if (m == NULL)
4913 return (-1);
4914
4915 while (m != NULL) {
4916 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4917 VERIFY(m->m_flags & M_PKTHDR);
4918 dsn = m->m_pkthdr.mp_dsn;
4919 datalen = m->m_pkthdr.mp_rlen;
4920 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4921 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4922 off = dsn_fail - dsn;
4923 *tcp_seq = m->m_pkthdr.mp_rseq + off;
3e170ce0
A
4924 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: %llu %llu \n",
4925 __func__, dsn, dsn_fail),
4926 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4927 return (0);
4928 }
4929
4930 m = m->m_next;
4931 }
4932
4933 /*
4934 * If there was no mbuf data and a fallback to TCP occurred, there's
4935 * not much else to do.
4936 */
4937
3e170ce0
A
4938 mptcplog((LOG_ERR, "MPTCP Sender: "
4939 "%s: %llu not found \n", __func__, dsn_fail),
4940 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4941 return (-1);
4942}
4943
4944/*
4945 * Support for sending contiguous MPTCP bytes in subflow
fe8ab488 4946 * Also for preventing sending data with ACK in 3-way handshake
39236c6e
A
4947 */
4948int32_t
4949mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4950{
4951 u_int64_t mdss_dsn = 0;
4952 u_int32_t mdss_subflow_seq = 0;
4953 u_int16_t mdss_data_len = 0;
4954
4955 if (len == 0)
4956 return (len);
4957
4958 mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4959 &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4960
39037602 4961 /*
fe8ab488
A
4962 * Special case handling for Fast Join. We want to send data right
4963 * after ACK of the 3-way handshake, but not piggyback the data
4964 * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
4965 * mdss_data_len control this.
4966 */
4967 struct tcpcb *tp = NULL;
39037602 4968 tp = intotcpcb(sotoinpcb(so));
fe8ab488
A
4969 if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
4970 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
4971 (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
4972 (tp->t_mpflags & TMPF_SENT_JOIN) &&
4973 (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
4974 (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
490019cf
A
4975 mdss_data_len = 0;
4976 tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
4977 }
4978
4979 if ((tp->t_state > TCPS_SYN_SENT) &&
4980 (tp->t_mpflags & TMPF_TFO_REQUEST)) {
4981 mdss_data_len = 0;
4982 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4983 }
39236c6e
A
4984 return (mdss_data_len);
4985}
4986
4987int32_t
4988mptcp_sbspace(struct mptcb *mpt)
4989{
4990 struct sockbuf *sb;
4991 uint32_t rcvbuf;
4992 int32_t space;
4993
4994 MPT_LOCK_ASSERT_HELD(mpt);
4995 MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4996
4997 sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4998 rcvbuf = sb->sb_hiwat;
4999 space = ((int32_t)imin((rcvbuf - sb->sb_cc),
5000 (sb->sb_mbmax - sb->sb_mbcnt)));
5001 if (space < 0)
5002 space = 0;
5003 /* XXX check if it's too small? */
5004
5005 return (space);
5006}
5007
5008/*
5009 * Support Fallback to Regular TCP
5010 */
5011void
5012mptcp_notify_mpready(struct socket *so)
5013{
5014 struct tcpcb *tp = NULL;
5015
5016 if (so == NULL)
5017 return;
5018
5019 tp = intotcpcb(sotoinpcb(so));
5020
5021 if (tp == NULL)
5022 return;
5023
5024 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5025 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5026 struct tcpcb *, tp);
5027
5028 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5029 return;
5030
5031 if (tp->t_mpflags & TMPF_MPTCP_READY)
5032 return;
5033
5034 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5035 tp->t_mpflags |= TMPF_MPTCP_READY;
5036
5037 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5038}
5039
5040void
5041mptcp_notify_mpfail(struct socket *so)
5042{
5043 struct tcpcb *tp = NULL;
5044
5045 if (so == NULL)
5046 return;
5047
5048 tp = intotcpcb(sotoinpcb(so));
5049
5050 if (tp == NULL)
5051 return;
5052
5053 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5054 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5055 struct tcpcb *, tp);
5056
5057 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5058 return;
5059
5060 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5061 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5062
5063 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5064}
5065
5066/*
5067 * Keepalive helper function
5068 */
5069boolean_t
5070mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5071{
5072 boolean_t ret = 1;
5073 VERIFY(mp_tp != NULL);
5074 MPT_LOCK(mp_tp);
5075 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5076 ret = 0;
5077 }
5078 MPT_UNLOCK(mp_tp);
5079 return (ret);
5080}
5081
5082/*
5083 * MPTCP t_maxseg adjustment function
5084 */
5085int
5086mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5087{
5088 int mss_lower = 0;
5089 struct mptcb *mp_tp = tptomptp(tp);
5090
5091#define MPTCP_COMPUTE_LEN { \
5092 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5093 MPT_LOCK(mp_tp); \
5094 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5095 mss_lower += 2; \
5096 else \
5097 /* adjust to 32-bit boundary + EOL */ \
5098 mss_lower += 2; \
5099 MPT_UNLOCK(mp_tp); \
5100}
5101 if (mp_tp == NULL)
5102 return (0);
5103
5104 /*
5105 * For the first subflow and subsequent subflows, adjust mss for
5106 * most common MPTCP option size, for case where tcp_mss is called
5107 * during option processing and MTU discovery.
5108 */
5109 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5110 (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
5111 MPTCP_COMPUTE_LEN;
5112 }
5113
5114 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5115 (tp->t_mpflags & TMPF_SENT_JOIN)) {
5116 MPTCP_COMPUTE_LEN;
5117 }
5118
5119 if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5120 MPTCP_COMPUTE_LEN;
5121 }
5122
5123 return (mss_lower);
5124}
5125
5126/*
5127 * Update the pid, upid, uuid of the subflow so, based on parent so
5128 */
5129void
5130mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
5131{
5132 struct socket *subflow_so = mpts->mpts_socket;
39037602 5133
39236c6e
A
5134 MPTS_LOCK_ASSERT_HELD(mpts);
5135
5136 socket_lock(subflow_so, 0);
5137 if ((subflow_so->last_pid != parent_mpso->last_pid) ||
5138 (subflow_so->last_upid != parent_mpso->last_upid)) {
5139 subflow_so->last_upid = parent_mpso->last_upid;
5140 subflow_so->last_pid = parent_mpso->last_pid;
5141 uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
5142 }
5143 so_update_policy(subflow_so);
5144 socket_unlock(subflow_so, 0);
5145}
5146
5147static void
5148fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5149{
5150 struct inpcb *inp;
5151
5152 tcp_getconninfo(so, &flow->flow_ci);
5153 inp = sotoinpcb(so);
5154#if INET6
5155 if ((inp->inp_vflag & INP_IPV6) != 0) {
5156 flow->flow_src.ss_family = AF_INET6;
5157 flow->flow_dst.ss_family = AF_INET6;
5158 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5159 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5160 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5161 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5162 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5163 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
39037602 5164 } else
39236c6e 5165#endif
3e170ce0 5166 if ((inp->inp_vflag & INP_IPV4) != 0) {
39236c6e
A
5167 flow->flow_src.ss_family = AF_INET;
5168 flow->flow_dst.ss_family = AF_INET;
5169 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5170 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5171 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5172 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5173 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5174 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5175 }
3e170ce0
A
5176 flow->flow_len = sizeof(*flow);
5177 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
39236c6e
A
5178 flow->flow_flags = mpts->mpts_flags;
5179 flow->flow_cid = mpts->mpts_connid;
3e170ce0
A
5180 flow->flow_sndnxt = mpts->mpts_sndnxt;
5181 flow->flow_relseq = mpts->mpts_rel_seq;
5182 flow->flow_soerror = mpts->mpts_soerror;
5183 flow->flow_probecnt = mpts->mpts_probecnt;
5184 flow->flow_peerswitch = mpts->mpts_peerswitch;
39236c6e
A
5185}
5186
5187static int
5188mptcp_pcblist SYSCTL_HANDLER_ARGS
5189{
5190#pragma unused(oidp, arg1, arg2)
5191 int error = 0, f;
5192 size_t n, len;
5193 struct mppcb *mpp;
5194 struct mptses *mpte;
5195 struct mptcb *mp_tp;
5196 struct mptsub *mpts;
5197 struct socket *so;
5198 conninfo_mptcp_t mptcpci;
fe8ab488 5199 mptcp_flow_t *flows = NULL;
39236c6e
A
5200
5201 if (req->newptr != USER_ADDR_NULL)
5202 return (EPERM);
5203
5204 lck_mtx_lock(&mtcbinfo.mppi_lock);
5205 n = mtcbinfo.mppi_count;
5206 if (req->oldptr == USER_ADDR_NULL) {
5207 lck_mtx_unlock(&mtcbinfo.mppi_lock);
39037602 5208 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
39236c6e
A
5209 4 * (n + n/8) * sizeof(mptcp_flow_t);
5210 return (0);
5211 }
5212 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 5213 flows = NULL;
39236c6e
A
5214 lck_mtx_lock(&mpp->mpp_lock);
5215 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3e170ce0
A
5216 if (mpp->mpp_flags & MPP_DEFUNCT) {
5217 lck_mtx_unlock(&mpp->mpp_lock);
5218 continue;
5219 }
39236c6e
A
5220 mpte = mptompte(mpp);
5221 VERIFY(mpte != NULL);
5222 mp_tp = mpte->mpte_mptcb;
5223 VERIFY(mp_tp != NULL);
3e170ce0
A
5224
5225 bzero(&mptcpci, sizeof(mptcpci));
5226 MPT_LOCK(mp_tp);
39236c6e 5227 mptcpci.mptcpci_state = mp_tp->mpt_state;
3e170ce0
A
5228 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5229 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5230 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5231 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5232 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5233 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5234 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5235 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5236 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5237 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5238 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvatmark;
5239 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5240 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5241 MPT_UNLOCK(mp_tp);
5242
39236c6e 5243 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
3e170ce0
A
5244 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5245 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5246 mptcpci.mptcpci_flow_offset =
5247 offsetof(conninfo_mptcp_t, mptcpci_flows);
5248
fe8ab488
A
5249 len = sizeof(*flows) * mpte->mpte_numflows;
5250 if (mpte->mpte_numflows != 0) {
5251 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5252 if (flows == NULL) {
5253 lck_mtx_unlock(&mpp->mpp_lock);
5254 break;
5255 }
5256 mptcpci.mptcpci_len = sizeof(mptcpci) +
5257 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5258 error = SYSCTL_OUT(req, &mptcpci,
5259 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5260 } else {
5261 mptcpci.mptcpci_len = sizeof(mptcpci);
3e170ce0 5262 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
39037602 5263 }
39236c6e
A
5264 if (error) {
5265 lck_mtx_unlock(&mpp->mpp_lock);
5266 FREE(flows, M_TEMP);
5267 break;
5268 }
5269 f = 0;
5270 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5271 MPTS_LOCK(mpts);
5272 so = mpts->mpts_socket;
5273 socket_lock(so, 0);
5274 fill_mptcp_subflow(so, &flows[f], mpts);
5275 socket_unlock(so, 0);
5276 MPTS_UNLOCK(mpts);
5277 f++;
5278 }
5279 lck_mtx_unlock(&mpp->mpp_lock);
fe8ab488
A
5280 if (flows) {
5281 error = SYSCTL_OUT(req, flows, len);
5282 FREE(flows, M_TEMP);
5283 if (error)
5284 break;
5285 }
39236c6e
A
5286 }
5287 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5288
5289 return (error);
5290}
5291
5292SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
39037602 5293 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
39236c6e 5294 "List of active MPTCP connections");
fe8ab488
A
5295
5296/*
5297 * Check the health of the other subflows and do an mptcp_output if
5298 * there is no other active or functional subflow at the time of
5299 * call of this function.
5300 */
5301static void
5302mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
5303{
5304 struct mptsub *from_mpts = NULL;
5305
5306 MPTE_LOCK_ASSERT_HELD(mpte);
5307
5308 MPTS_UNLOCK(to_mpts);
39037602
A
5309
5310 from_mpts = mpte->mpte_active_sub;
fe8ab488
A
5311
5312 if (from_mpts == NULL)
5313 goto output_needed;
5314
5315 MPTS_LOCK(from_mpts);
5316
5317 if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
5318 (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
5319 MPTS_UNLOCK(from_mpts);
5320 goto output_needed;
5321 }
5322
5323 MPTS_UNLOCK(from_mpts);
5324 MPTS_LOCK(to_mpts);
5325 return;
5326
39037602
A
5327output_needed:
5328 mptcp_output(mpte);
fe8ab488
A
5329 MPTS_LOCK(to_mpts);
5330}
5331
fe8ab488
A
5332/*
5333 * Set notsent lowat mark on the MPTCB
5334 */
5335int
5336mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5337{
5338 struct mptcb *mp_tp = NULL;
5339 int error = 0;
5340
5341 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5342 mp_tp = mpte->mpte_mptcb;
5343
5344 if (mp_tp)
5345 mp_tp->mpt_notsent_lowat = optval;
5346 else
5347 error = EINVAL;
5348
39037602 5349 return error;
fe8ab488
A
5350}
5351
5352u_int32_t
5353mptcp_get_notsent_lowat(struct mptses *mpte)
5354{
5355 struct mptcb *mp_tp = NULL;
5356
5357 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5358 mp_tp = mpte->mpte_mptcb;
5359
5360 if (mp_tp)
5361 return mp_tp->mpt_notsent_lowat;
5362 else
5363 return 0;
5364}
5365
39037602 5366int
fe8ab488
A
5367mptcp_notsent_lowat_check(struct socket *so) {
5368 struct mptses *mpte;
5369 struct mppcb *mpp;
5370 struct mptcb *mp_tp;
5371 struct mptsub *mpts;
5372
5373 int notsent = 0;
5374
5375 mpp = sotomppcb(so);
5376 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5377 return (0);
5378 }
5379
5380 mpte = mptompte(mpp);
5381 mp_tp = mpte->mpte_mptcb;
5382
5383 MPT_LOCK(mp_tp);
5384 notsent = so->so_snd.sb_cc;
5385
5386 if ((notsent == 0) ||
5387 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5388 mp_tp->mpt_notsent_lowat)) {
3e170ce0
A
5389 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5390 "lowat %d notsent %d actual %d \n",
5391 mp_tp->mpt_notsent_lowat, notsent,
5392 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5393 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
fe8ab488
A
5394 MPT_UNLOCK(mp_tp);
5395 return (1);
5396 }
5397 MPT_UNLOCK(mp_tp);
5398
5399 /* When Nagle's algorithm is not disabled, it is better
5400 * to wakeup the client even before there is atleast one
5401 * maxseg of data to write.
5402 */
5403 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5404 int retval = 0;
5405 MPTS_LOCK(mpts);
5406 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5407 struct socket *subf_so = mpts->mpts_socket;
5408 socket_lock(subf_so, 0);
5409 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
39037602 5410
fe8ab488
A
5411 notsent = so->so_snd.sb_cc -
5412 (tp->snd_nxt - tp->snd_una);
39037602 5413
fe8ab488
A
5414 if ((tp->t_flags & TF_NODELAY) == 0 &&
5415 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5416 retval = 1;
5417 }
3e170ce0 5418 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
fe8ab488 5419 " nodelay false \n",
3e170ce0
A
5420 mp_tp->mpt_notsent_lowat, notsent),
5421 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
fe8ab488
A
5422 socket_unlock(subf_so, 0);
5423 MPTS_UNLOCK(mpts);
5424 return (retval);
5425 }
5426 MPTS_UNLOCK(mpts);
5427 }
5428 return (0);
5429}
5430
3e170ce0
A
5431static void
5432mptcp_get_rtt_measurement(struct mptsub *mpts, struct mptses *mpte)
5433{
5434 MPTE_LOCK_ASSERT_HELD(mpte);
5435 MPTS_LOCK_ASSERT_HELD(mpts);
5436
5437 struct socket *subflow_so = mpts->mpts_socket;
5438 socket_lock(subflow_so, 0);
5439 mpts->mpts_srtt = (intotcpcb(sotoinpcb(subflow_so)))->t_srtt;
5440 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(subflow_so)))->t_rxtcur;
5441 socket_unlock(subflow_so, 0);
5442}
5443
5444/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5445static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5446static uint32_t mptcp_kern_skt_inuse = 0;
5447symptoms_advisory_t mptcp_advisory;
5448
5449static errno_t
5450mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5451 void **unitinfo)
5452{
5453#pragma unused(kctlref, sac, unitinfo)
5454 /*
5455 * We don't need to do anything here. But we can atleast ensure
5456 * only one user opens the MPTCP_KERN_CTL_NAME control socket.
5457 */
5458 if (OSCompareAndSwap(0, 1, &mptcp_kern_skt_inuse))
5459 return (0);
5460 else
5461 return (EALREADY);
5462}
5463
5464static errno_t
5465mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5466 void *unitinfo)
5467{
5468#pragma unused(kctlref, kcunit, unitinfo)
5469 if (OSCompareAndSwap(1, 0, &mptcp_kern_skt_inuse)) {
5470 /* TBD needs to be locked if the size grows more than an int */
5471 bzero(&mptcp_advisory, sizeof(mptcp_advisory));
5472 return (0);
5473 }
5474 else {
5475 return (EINVAL);
5476 }
5477}
5478
5479static errno_t
5480mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5481 mbuf_t m, int flags)
5482{
5483#pragma unused(kctlref, kcunit, unitinfo, flags)
5484 symptoms_advisory_t *sa = NULL;
5485
5486 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5487 mbuf_freem(m);
5488 return (EINVAL);
5489 }
5490
5491 if (mbuf_len(m) >= sizeof(*sa))
5492 sa = mbuf_data(m);
5493 else
5494 return (EINVAL);
5495
5496 if (mptcp_advisory.sa_nwk_status_int != sa->sa_nwk_status_int) {
5497 /*
5498 * we could use this notification to notify all mptcp pcbs
5499 * of the change in network status. But its difficult to
5500 * define if sending REMOVE_ADDR or MP_PRIO is appropriate
5501 * given that these are only soft indicators of the network
5502 * state. Leaving this as TBD for now.
5503 */
5504 }
5505
5506 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT) {
5507 mptcplog((LOG_DEBUG, "MPTCP Events: %s wifi %d,%d cell %d,%d\n",
5508 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
5509 sa->sa_cell_status, mptcp_advisory.sa_cell_status),
5510 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG,
5511 MPTCP_LOGLVL_LOG);
5512
5513 if ((sa->sa_wifi_status &
5514 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5515 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
5516 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5517 }
5518
5519 if ((sa->sa_cell_status &
5520 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) !=
5521 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) {
5522 mptcp_advisory.sa_cell_status = sa->sa_cell_status;
5523 }
5524 } else {
5525 mptcplog((LOG_DEBUG, "MPTCP Events: %s NOCOMMENT "
5526 "wifi %d cell %d\n", __func__,
5527 mptcp_advisory.sa_wifi_status,
5528 mptcp_advisory.sa_cell_status),
5529 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
5530 }
5531 return (0);
5532}
5533
5534void
5535mptcp_control_register(void)
5536{
5537 /* Set up the advisory control socket */
5538 struct kern_ctl_reg mptcp_kern_ctl;
5539
5540 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5541 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5542 sizeof(mptcp_kern_ctl.ctl_name));
5543 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5544 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5545 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5546 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5547
5548 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5549}
5550
5551int
5552mptcp_is_wifi_unusable(void)
5553{
5554 /* a false return val indicates there is no info or wifi is ok */
5555 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5556}
5557
5558int
5559mptcp_is_cell_unusable(void)
5560{
5561 /* a false return val indicates there is no info or cell is ok */
5562 return (mptcp_advisory.sa_cell_status & SYMPTOMS_ADVISORY_CELL_BAD);
5563}
5564
5565struct mptsub*
5566mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best)
5567{
5568 struct mptsub *cellsub = NULL;
5569 struct mptsub *wifisub = NULL;
5570 struct mptsub *wiredsub = NULL;
5571
5572 VERIFY ((best != NULL) && (second_best != NULL));
5573
5574 if (!mptcp_use_symptomsd)
5575 return (NULL);
5576
5577 if (!mptcp_kern_skt_inuse)
5578 return (NULL);
5579
5580 /*
39037602
A
5581 * There could be devices with more than one wifi interface or
5582 * more than one wired or cell interfaces.
3e170ce0
A
5583 * TBD: SymptomsD is unavailable on such platforms as of now.
5584 * Try to prefer best when possible in general.
5585 * Also, SymptomsD sends notifications about wifi only when it
5586 * is primary.
5587 */
5588 if (best->mpts_linktype & MPTSL_WIFI)
5589 wifisub = best;
5590 else if (best->mpts_linktype & MPTSL_CELL)
5591 cellsub = best;
5592 else if (best->mpts_linktype & MPTSL_WIRED)
5593 wiredsub = best;
5594
5595 /*
5596 * On platforms with wired paths, don't use hints about wifi or cell.
5597 * Currently, SymptomsD is not available on platforms with wired paths.
5598 */
5599 if (wiredsub)
5600 return (NULL);
5601
5602 if ((wifisub == NULL) && (second_best->mpts_linktype & MPTSL_WIFI))
5603 wifisub = second_best;
5604
5605 if ((cellsub == NULL) && (second_best->mpts_linktype & MPTSL_CELL))
5606 cellsub = second_best;
5607
5608 if ((wiredsub == NULL) && (second_best->mpts_linktype & MPTSL_WIRED))
5609 wiredsub = second_best;
5610
5611 if ((wifisub == best) && mptcp_is_wifi_unusable()) {
5612 tcpstat.tcps_mp_sel_symtomsd++;
5613 if (mptcp_is_cell_unusable()) {
5614 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5615 " suggests both Wifi and Cell are bad. Wired %s.",
5616 (wiredsub == NULL) ? "none" : "present"),
5617 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5618 return (wiredsub);
5619 } else {
5620 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5621 " suggests Wifi bad, Cell good. Wired %s.",
5622 (wiredsub == NULL) ? "none" : "present"),
5623 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5624 return ((wiredsub != NULL) ? wiredsub : cellsub);
5625 }
5626 }
5627
5628 if ((cellsub == best) && (mptcp_is_cell_unusable())) {
5629 tcpstat.tcps_mp_sel_symtomsd++;
5630 if (mptcp_is_wifi_unusable()) {
5631 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5632 " suggests both Cell and Wifi are bad. Wired %s.",
5633 (wiredsub == NULL) ? "none" : "present"),
5634 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5635 return (wiredsub);
5636 } else {
5637 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5638 " suggests Cell bad, Wifi good. Wired %s.",
5639 (wiredsub == NULL) ? "none" : "present"),
5640 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5641 return ((wiredsub != NULL) ? wiredsub : wifisub);
5642 }
5643 }
5644
5645 /* little is known about the state of the network or wifi is good */
39037602 5646 return (NULL);
3e170ce0 5647}
490019cf
A
5648
5649/* If TFO data is succesfully acked, it must be dropped from the mptcp so */
5650static void
39037602 5651mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts, int *wakeup)
490019cf
A
5652{
5653 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5654 struct socket *so = mpts->mpts_socket;
5655 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5656 struct mptcb *mp_tp = mpte->mpte_mptcb;
5657
5658 /* If data was sent with SYN, rewind state */
5659 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5660 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5661 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5662 MPT_LOCK(mp_tp);
5663 u_int64_t mp_droplen = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
5664 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
5665 VERIFY(mp_droplen <= (UINT_MAX));
5666 VERIFY(mp_droplen >= tcp_droplen);
5667
5668 if (mp_droplen > tcp_droplen) {
5669 /* handle partial TCP ack */
5670 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5671 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
5672 mpts->mpts_sndnxt = mp_tp->mpt_sndnxt;
5673 mp_droplen = tcp_droplen;
5674 } else {
5675 /* all data on SYN was acked */
5676 mpts->mpts_rel_seq = 1;
5677 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5678 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
5679 }
5680 mp_tp->mpt_sndmax -= tcp_droplen;
5681
5682 MPT_UNLOCK(mp_tp);
5683 if (mp_droplen != 0) {
5684 VERIFY(mp_so->so_snd.sb_mb != NULL);
5685 sbdrop(&mp_so->so_snd, (int)mp_droplen);
39037602
A
5686 if (wakeup)
5687 *wakeup = 1;
490019cf
A
5688 }
5689 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d "
5690 "TFO tcp len %d mptcp len %d\n", __func__,
5691 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid,
5692 tcp_droplen, mp_droplen),
5693 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5694 }
5695}