bsd/netinet/mptcp_subr.c (xnu-3248.60.10)
 1/*
 2 * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
 50#include <net/if_var.h>
51#include <netinet/in.h>
52#include <netinet/in_pcb.h>
53#include <netinet/in_var.h>
54#include <netinet/tcp.h>
55#include <netinet/tcp_fsm.h>
56#include <netinet/tcp_seq.h>
57#include <netinet/tcp_var.h>
58#include <netinet/mptcp_var.h>
59#include <netinet/mptcp.h>
60#include <netinet/mptcp_seq.h>
61#include <netinet/mptcp_timer.h>
62#include <libkern/crypto/sha1.h>
63#if INET6
64#include <netinet6/in6_pcb.h>
65#include <netinet6/ip6protosw.h>
66#endif /* INET6 */
67#include <dev/random/randomdev.h>
68
69/*
70 * Notes on MPTCP implementation.
71 *
72 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
73 * communication domain. The structure mtcbinfo describes the MPTCP instance
74 * of a Multipath protocol in that domain. It is used to keep track of all
75 * MPTCP PCB instances in the system, and is protected by the global lock
76 * mppi_lock.
77 *
78 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
79 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
80 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
81 * allocated from the same memory block, and each structure has a pointer
82 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
83 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
84 * PCB (mppcb) as well as the MPTCP Session (mptses).
85 *
86 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
87 * in particular, the list of subflows as well as the MPTCP thread.
88 *
89 * A functioning MPTCP Session consists of one or more subflow sockets. Each
90 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
91 * represented by the mptsub structure. Because each subflow requires access
92 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
93 * subflow. This gets decremented prior to the subflow's destruction. The
94 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
95 *
96 * To handle events (read, write, control) from the subflows, an MPTCP thread
97 * is created; currently, there is one thread per MPTCP Session. In order to
98 * prevent the MPTCP socket from being destroyed while being accessed by the
99 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
100 * which will be decremented prior to the thread's termination. The thread
101 * lock (mpte_thread_lock) is used to synchronize its signalling.
102 *
103 * Lock ordering is defined as follows:
104 *
105 * mtcbinfo (mppi_lock)
106 * mp_so (mpp_lock)
107 * mpts (mpts_lock)
108 * so (inpcb_mtx)
109 * mptcb (mpt_lock)
110 *
 111 * It is not a requirement that all of the above locks be acquired in
 112 * succession, but the correct lock ordering must be followed when more
 113 * than one lock needs to be held. The MPTCP thread lock is not
 114 * constrained by this arrangement, because none of the other locks is
 115 * ever acquired while holding mpte_thread_lock; therefore it may be acquired
116 * at any moment to signal the thread.
117 *
118 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
119 * work is done by the MPTCP garbage collector which is invoked on demand by
120 * the PF_MULTIPATH garbage collector. This process will take place once all
 121 * of the subflows have been destroyed and the MPTCP thread has been
 122 * instructed to self-terminate.
123 */
124
125static void mptcp_sesdestroy(struct mptses *);
126static void mptcp_thread_signal_locked(struct mptses *);
127static void mptcp_thread_terminate_signal(struct mptses *);
128static void mptcp_thread_dowork(struct mptses *);
129static void mptcp_thread_func(void *, wait_result_t);
130static void mptcp_thread_destroy(struct mptses *);
131static void mptcp_key_pool_init(void);
132static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
133static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
134static void mptcp_conn_properties(struct mptcb *);
135
136static uint32_t mptcp_gc(struct mppcbinfo *);
137static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
138 int, struct proc *, struct socket **);
139static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
140static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
141static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
142 struct uio *, struct mbuf **, struct mbuf **, int *);
143static void mptcp_subflow_rupcall(struct socket *, void *, int);
144static void mptcp_subflow_input(struct mptses *, struct mptsub *);
145static void mptcp_subflow_wupcall(struct socket *, void *, int);
146static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
147static void mptcp_update_last_owner(struct mptsub *, struct socket *);
148static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
149static void mptcp_get_rtt_measurement(struct mptsub *, struct mptses *);
150static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
151
152/*
153 * Possible return values for subflow event handlers. Note that success
 154 * values must be greater than or equal to MPTS_EVRET_OK. Values less than that
155 * indicate errors or actions which require immediate attention; they will
156 * prevent the rest of the handlers from processing their respective events
157 * until the next round of events processing.
158 */
159typedef enum {
160 MPTS_EVRET_DELETE = 1, /* delete this subflow */
161 MPTS_EVRET_OK = 2, /* OK */
162 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
163 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
164} ev_ret_t;
165
166static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
167static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *, uint64_t *);
168static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
169static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *, uint64_t *);
170static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *, uint64_t *);
171static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *);
172static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *);
173static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *);
174static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *, uint64_t *);
175static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *, uint64_t *);
176static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *);
177static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *);
178static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *);
179static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *);
180static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *, uint64_t *);
181static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *, uint64_t *);
182static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
 183
184static const char *mptcp_evret2str(ev_ret_t);
185
186static mptcp_key_t *mptcp_reserve_key(void);
187static int mptcp_do_sha1(mptcp_key_t *, char *, int);
188static void mptcp_init_local_parms(struct mptcb *);
189
190static unsigned int mptsub_zone_size; /* size of mptsub */
191static struct zone *mptsub_zone; /* zone for mptsub */
192
193static unsigned int mptopt_zone_size; /* size of mptopt */
194static struct zone *mptopt_zone; /* zone for mptopt */
195
196static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
197static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
198
199struct mppcbinfo mtcbinfo;
200
201static struct mptcp_keys_pool_head mptcp_keys_pool;
202
203#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
204#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
205
206SYSCTL_DECL(_net_inet);
207
208SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
209
210uint32_t mptcp_dbg_area = 0; /* more noise if greater than 1 */
211SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
212 &mptcp_dbg_area, 0, "MPTCP debug area");
213
214uint32_t mptcp_dbg_level = 0;
215SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
216 &mptcp_dbg_level, 0, "MPTCP debug level");
217
218
219SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
220 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
221
222/*
 223 * Since there is one kernel thread per mptcp socket, we impose an artificial
 224 * limit on the number of allowed mptcp sockets.
225 */
226uint32_t mptcp_socket_limit = MPPCB_LIMIT;
227SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
228 &mptcp_socket_limit, 0, "MPTCP socket limit");
229
230/*
231 * SYSCTL to turn on delayed cellular subflow start.
232 */
233uint32_t mptcp_delayed_subf_start = 0;
234SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
235 &mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
236
237/*
 238 * sysctl to use network status hints from symptomsd
 239 */
240uint32_t mptcp_use_symptomsd = 1;
241SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, usesymptoms, CTLFLAG_RW|CTLFLAG_LOCKED,
242 &mptcp_use_symptomsd, 0, "MPTCP Use SymptomsD");
 243
244static struct protosw mptcp_subflow_protosw;
245static struct pr_usrreqs mptcp_subflow_usrreqs;
246#if INET6
247static struct ip6protosw mptcp_subflow_protosw6;
248static struct pr_usrreqs mptcp_subflow_usrreqs6;
249#endif /* INET6 */
250
251typedef struct mptcp_subflow_event_entry {
252 uint64_t sofilt_hint_mask;
253 ev_ret_t (*sofilt_hint_ev_hdlr)(
254 struct mptses *mpte,
255 struct mptsub *mpts,
256 uint64_t *p_mpsofilt_hint);
257} mptsub_ev_entry_t;
258
259/*
260 * XXX The order of the event handlers below is really
261 * really important.
262 * SO_FILT_HINT_DELETEOK event has to be handled first,
263 * else we may end up missing on this event.
264 * Please read radar://24043716 for more details.
265 */
266static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
267 {
268 .sofilt_hint_mask = SO_FILT_HINT_DELETEOK,
269 .sofilt_hint_ev_hdlr = mptcp_deleteok_ev,
270 },
271 {
272 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
273 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
274 },
275 {
276 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
277 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
278 },
279 {
280 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
281 .sofilt_hint_ev_hdlr = mptcp_subflow_connreset_ev,
282 },
283 {
284 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
285 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
286 },
287 {
288 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
289 .sofilt_hint_ev_hdlr = mptcp_subflow_cantrcvmore_ev,
290 },
291 { .sofilt_hint_mask = SO_FILT_HINT_CANTSENDMORE,
292 .sofilt_hint_ev_hdlr = mptcp_subflow_cantsendmore_ev,
293 },
294 {
295 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
296 .sofilt_hint_ev_hdlr = mptcp_subflow_timeout_ev,
297 },
298 {
299 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
300 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
301 },
302 {
303 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
304 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
305 },
306 {
307 .sofilt_hint_mask = SO_FILT_HINT_SUSPEND,
308 .sofilt_hint_ev_hdlr = mptcp_subflow_suspend_ev,
309 },
310 {
311 .sofilt_hint_mask = SO_FILT_HINT_RESUME,
312 .sofilt_hint_ev_hdlr = mptcp_subflow_resume_ev,
313 },
314 {
315 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
316 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
317 },
318 {
319 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
320 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
321 },
322 {
323 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
324 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
325 },
326 {
327 .sofilt_hint_mask = SO_FILT_HINT_MPFASTJ,
328 .sofilt_hint_ev_hdlr = mptcp_fastjoin_ev,
329 }
330};
331
332/*
333 * Protocol pr_init callback.
334 */
335void
336mptcp_init(struct protosw *pp, struct domain *dp)
337{
338#pragma unused(dp)
339 static int mptcp_initialized = 0;
340 struct protosw *prp;
341#if INET6
342 struct ip6protosw *prp6;
343#endif /* INET6 */
344
345 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
346
347 /* do this only once */
348 if (mptcp_initialized)
349 return;
350 mptcp_initialized = 1;
351
352 /*
353 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
354 * we must be able to find IPPROTO_TCP entries for both.
355 */
356 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
357 VERIFY(prp != NULL);
358 bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
359 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
360 sizeof (mptcp_subflow_usrreqs));
361 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
362 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
363 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
364 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
365 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
366 /*
367 * Socket filters shouldn't attach/detach to/from this protosw
368 * since pr_protosw is to be used instead, which points to the
369 * real protocol; if they do, it is a bug and we should panic.
370 */
371 mptcp_subflow_protosw.pr_filter_head.tqh_first =
372 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
373 mptcp_subflow_protosw.pr_filter_head.tqh_last =
374 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
375
376#if INET6
377 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
378 IPPROTO_TCP, SOCK_STREAM);
379 VERIFY(prp6 != NULL);
380 bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
381 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
382 sizeof (mptcp_subflow_usrreqs6));
383 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
384 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
385 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
386 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
387 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
388 /*
389 * Socket filters shouldn't attach/detach to/from this protosw
390 * since pr_protosw is to be used instead, which points to the
391 * real protocol; if they do, it is a bug and we should panic.
392 */
393 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
394 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
395 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
396 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
397#endif /* INET6 */
398
399 bzero(&mtcbinfo, sizeof (mtcbinfo));
400 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
401 mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
402 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
403 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
404 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
405 /* NOTREACHED */
406 }
407 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
408 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
409
410 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
411 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
412 mtcbinfo.mppi_lock_grp_attr);
413 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
414 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
415 mtcbinfo.mppi_lock_attr);
 416
 417 mtcbinfo.mppi_gc = mptcp_gc;
 418 mtcbinfo.mppi_timer = mptcp_timer;
 419 mtcbinfo.mppi_pcbe_create = mptcp_sescreate;
420
421 /* attach to MP domain for garbage collection to take place */
422 mp_pcbinfo_attach(&mtcbinfo);
423
424 mptsub_zone_size = sizeof (struct mptsub);
425 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
426 8192, "mptsub")) == NULL) {
427 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
428 /* NOTREACHED */
429 }
430 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
431 zone_change(mptsub_zone, Z_EXPAND, TRUE);
432
433 mptopt_zone_size = sizeof (struct mptopt);
434 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
435 1024, "mptopt")) == NULL) {
436 panic("%s: unable to allocate MPTCP option zone\n", __func__);
437 /* NOTREACHED */
438 }
439 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
440 zone_change(mptopt_zone, Z_EXPAND, TRUE);
441
442 mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
443 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
444 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
445 panic("%s: unable to allocate MPTCP address auth zone \n",
446 __func__);
447 /* NOTREACHED */
448 }
449 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
450 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
451
452 /* Set up a list of unique keys */
453 mptcp_key_pool_init();
454}
455
456/*
 457 * Create an MPTCP session, called as a result of opening an MPTCP socket.
 458 */
459void *
460mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
461{
462 struct mppcbinfo *mppi;
463 struct mptses *mpte;
464 struct mptcb *mp_tp;
465 int error = 0;
466
467 VERIFY(mpp != NULL);
468 mppi = mpp->mpp_pcbinfo;
469 VERIFY(mppi != NULL);
470
471 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
472 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
473
474 /* MPTCP Multipath PCB Extension */
475 bzero(mpte, sizeof (*mpte));
476 VERIFY(mpp->mpp_pcbe == NULL);
477 mpp->mpp_pcbe = mpte;
478 mpte->mpte_mppcb = mpp;
479 mpte->mpte_mptcb = mp_tp;
480
481 TAILQ_INIT(&mpte->mpte_sopts);
482 TAILQ_INIT(&mpte->mpte_subflows);
483 mpte->mpte_associd = SAE_ASSOCID_ANY;
484 mpte->mpte_connid_last = SAE_CONNID_ANY;
485
486 lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
487 mppi->mppi_lock_attr);
488
489 /*
490 * XXX: adi@apple.com
491 *
492 * This can be rather expensive if we have lots of MPTCP sockets,
493 * but we need a kernel thread for this model to work. Perhaps we
494 * could amortize the costs by having one worker thread per a group
495 * of MPTCP sockets.
496 */
497 if (kernel_thread_start(mptcp_thread_func, mpte,
498 &mpte->mpte_thread) != KERN_SUCCESS) {
499 error = ENOBUFS;
500 goto out;
501 }
502 mp_so->so_usecount++; /* for thread */
503
504 /* MPTCP Protocol Control Block */
505 bzero(mp_tp, sizeof (*mp_tp));
506 lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
507 mppi->mppi_lock_attr);
508 mp_tp->mpt_mpte = mpte;
 509 mp_tp->mpt_state = MPTCPS_CLOSED;
510out:
511 if (error != 0)
512 lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
513 DTRACE_MPTCP5(session__create, struct socket *, mp_so,
514 struct sockbuf *, &mp_so->so_rcv,
515 struct sockbuf *, &mp_so->so_snd,
516 struct mppcb *, mpp, int, error);
517
518 return ((error != 0) ? NULL : mpte);
519}
520
521/*
522 * Destroy an MPTCP session.
523 */
524static void
525mptcp_sesdestroy(struct mptses *mpte)
526{
527 struct mptcb *mp_tp;
528
529 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
530
531 mp_tp = mpte->mpte_mptcb;
532 VERIFY(mp_tp != NULL);
533
534 /*
535 * MPTCP Multipath PCB Extension section
536 */
537 mptcp_flush_sopts(mpte);
538 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
539
540 lck_mtx_destroy(&mpte->mpte_thread_lock,
541 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
542
543 /*
544 * MPTCP Protocol Control Block section
545 */
546 lck_mtx_destroy(&mp_tp->mpt_lock,
547 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
548
549 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
550 struct mptcb *, mp_tp);
551}
552
553/*
554 * Allocate an MPTCP socket option structure.
555 */
556struct mptopt *
557mptcp_sopt_alloc(int how)
558{
559 struct mptopt *mpo;
560
561 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
562 zalloc_noblock(mptopt_zone);
563 if (mpo != NULL) {
564 bzero(mpo, mptopt_zone_size);
565 }
566
567 return (mpo);
568}
569
570/*
571 * Free an MPTCP socket option structure.
572 */
573void
574mptcp_sopt_free(struct mptopt *mpo)
575{
576 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
577
578 zfree(mptopt_zone, mpo);
579}
580
581/*
582 * Add a socket option to the MPTCP socket option list.
583 */
584void
585mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
586{
587 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
588 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
589 mpo->mpo_flags |= MPOF_ATTACHED;
590 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
591}
592
593/*
594 * Remove a socket option from the MPTCP socket option list.
595 */
596void
597mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
598{
599 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
600 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
601 mpo->mpo_flags &= ~MPOF_ATTACHED;
602 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
603}
604
605/*
606 * Search for an existing <sopt_level,sopt_name> socket option.
607 */
608struct mptopt *
609mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
610{
611 struct mptopt *mpo;
612
613 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
614
615 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
616 if (mpo->mpo_level == sopt->sopt_level &&
617 mpo->mpo_name == sopt->sopt_name)
618 break;
619 }
620 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
621
622 return (mpo);
623}
624
625/*
626 * Flushes all recorded socket options from an MP socket.
627 */
628void
629mptcp_flush_sopts(struct mptses *mpte)
630{
631 struct mptopt *mpo, *tmpo;
632
633 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
634
635 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
636 mptcp_sopt_remove(mpte, mpo);
637 mptcp_sopt_free(mpo);
638 }
639 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
640}
641
642/*
 643 * Allocate an MPTCP subflow structure.
644 */
645struct mptsub *
646mptcp_subflow_alloc(int how)
647{
648 struct mptsub *mpts;
649
650 mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
651 zalloc_noblock(mptsub_zone);
652 if (mpts != NULL) {
653 bzero(mpts, mptsub_zone_size);
654 lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
655 mtcbinfo.mppi_lock_attr);
656 }
657
658 return (mpts);
659}
660
661/*
662 * Deallocate a subflow structure, called when all of the references held
663 * on it have been released. This implies that the subflow has been deleted.
664 */
665void
666mptcp_subflow_free(struct mptsub *mpts)
667{
668 MPTS_LOCK_ASSERT_HELD(mpts);
669
670 VERIFY(mpts->mpts_refcnt == 0);
671 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
672 VERIFY(mpts->mpts_mpte == NULL);
673 VERIFY(mpts->mpts_socket == NULL);
674
675 if (mpts->mpts_src_sl != NULL) {
676 sockaddrlist_free(mpts->mpts_src_sl);
677 mpts->mpts_src_sl = NULL;
678 }
679 if (mpts->mpts_dst_sl != NULL) {
680 sockaddrlist_free(mpts->mpts_dst_sl);
681 mpts->mpts_dst_sl = NULL;
682 }
683 MPTS_UNLOCK(mpts);
684 lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
685
686 zfree(mptsub_zone, mpts);
687}
688
689/*
690 * Create an MPTCP subflow socket.
691 */
692static int
693mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
694 struct proc *p, struct socket **so)
695{
696 struct mptopt smpo, *mpo, *tmpo;
697 struct socket *mp_so;
698 int error;
699
700 *so = NULL;
701 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
702 mp_so = mpte->mpte_mppcb->mpp_socket;
703
704 /*
705 * Create the subflow socket (multipath subflow, non-blocking.)
706 *
707 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
708 * socket; it will be cleared when the socket is peeled off or closed.
709 * It also indicates to the underlying TCP to handle MPTCP options.
710 * A multipath subflow socket implies SS_NOFDREF state.
711 */
712 if ((error = socreate_internal(dom, so, SOCK_STREAM,
713 IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
714 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate mp_so 0x%llx"
715 " unable to create subflow socket error %d\n",
716 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
717 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
718 return (error);
719 }
720
721 socket_lock(*so, 0);
722 VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
723 VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
724 (SS_NBIO|SS_NOFDREF));
725
726 /* prevent the socket buffers from being compressed */
727 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
728 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
729
730 /* Inherit preconnect and TFO data flags */
731 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
732 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
733
734 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
735 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
736
737 bzero(&smpo, sizeof (smpo));
738 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
739 smpo.mpo_level = SOL_SOCKET;
740 smpo.mpo_intval = 1;
741
742 /* disable SIGPIPE */
743 smpo.mpo_name = SO_NOSIGPIPE;
744 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
745 goto out;
746
747 /* find out if the subflow's source address goes away */
748 smpo.mpo_name = SO_NOADDRERR;
749 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
750 goto out;
751
752 /* enable keepalive */
753 smpo.mpo_name = SO_KEEPALIVE;
754 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
755 goto out;
756
757 /*
758 * Limit the receive socket buffer size to 64k.
759 *
760 * We need to take into consideration the window scale option
761 * which could be negotiated in one subflow but disabled in
762 * another subflow.
763 * XXX This can be improved in the future.
764 */
765 smpo.mpo_name = SO_RCVBUF;
766 smpo.mpo_intval = MPTCP_RWIN_MAX;
767 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
768 goto out;
769
770 /* N.B.: set by sosetopt */
771 VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
772 /* Prevent automatic socket buffer sizing. */
773 (*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
774
775 smpo.mpo_level = IPPROTO_TCP;
776 smpo.mpo_intval = mptcp_subflow_keeptime;
777 smpo.mpo_name = TCP_KEEPALIVE;
778 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
779 goto out;
780
781 /* replay setsockopt(2) on the subflow sockets for eligible options */
782 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
783 int interim;
784
785 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
786 continue;
787
788 /*
789 * Skip those that are handled internally; these options
790 * should not have been recorded and marked with the
791 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
792 */
793 if (mpo->mpo_level == SOL_SOCKET &&
794 (mpo->mpo_name == SO_NOSIGPIPE ||
795 mpo->mpo_name == SO_NOADDRERR ||
796 mpo->mpo_name == SO_KEEPALIVE))
797 continue;
798
799 interim = (mpo->mpo_flags & MPOF_INTERIM);
800 if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
801 char buf[32];
802 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate"
803 " mp_so 0x%llx"
804 " sopt %s val %d interim record removed\n",
805 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
806 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
807 buf, sizeof (buf)), mpo->mpo_intval),
808 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
809 mptcp_sopt_remove(mpte, mpo);
810 mptcp_sopt_free(mpo);
811 continue;
812 }
813 }
814
815 /*
816 * We need to receive everything that the subflow socket has,
817 * so use a customized socket receive function. We will undo
818 * this when the socket is peeled off or closed.
819 */
820 mpts->mpts_oprotosw = (*so)->so_proto;
821 switch (dom) {
822 case PF_INET:
823 (*so)->so_proto = &mptcp_subflow_protosw;
824 break;
825#if INET6
826 case PF_INET6:
827 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
828 break;
829#endif /* INET6 */
830 default:
831 VERIFY(0);
832 /* NOTREACHED */
833 }
834
835out:
836 socket_unlock(*so, 0);
837
838 DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
839 struct mptsub *, mpts, int, dom, int, error);
840
841 return (error);
842}
843
844/*
845 * Close an MPTCP subflow socket.
846 *
847 * Note that this may be called on an embryonic subflow, and the only
848 * thing that is guaranteed valid is the protocol-user request.
849 */
850static int
851mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
852{
853 MPTS_LOCK_ASSERT_HELD(mpts);
854
855 socket_lock(so, 0);
856 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
857 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
858
859 /* restore protocol-user requests */
860 VERIFY(mpts->mpts_oprotosw != NULL);
861 so->so_proto = mpts->mpts_oprotosw;
862 socket_unlock(so, 0);
863
864 mpts->mpts_socket = NULL; /* may already be NULL */
865
866 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
867 struct socket *, so,
868 struct sockbuf *, &so->so_rcv,
869 struct sockbuf *, &so->so_snd,
870 struct mptses *, mpts->mpts_mpte);
871
872 return (soclose(so));
873}
874
875/*
876 * Connect an MPTCP subflow socket.
877 *
878 * This may be called inline as part of adding a subflow, or asynchronously
879 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
880 * pending connect case, the subflow socket may have been bound to an interface
881 * and/or a source IP address which may no longer be around by the time this
882 * routine is called; in that case the connect attempt will most likely fail.
883 */
884static int
885mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
886{
887 struct socket *so;
888 int af, error;
889
890 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
891 MPTS_LOCK_ASSERT_HELD(mpts);
892
893 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
894 MPTSF_CONNECTING);
895 VERIFY(mpts->mpts_socket != NULL);
896 so = mpts->mpts_socket;
897 af = mpts->mpts_family;
898
899 if (af == AF_INET || af == AF_INET6) {
900 struct sockaddr_entry *dst_se;
901 char dbuf[MAX_IPv6_STR_LEN];
902
903 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
904 VERIFY(dst_se != NULL);
905
906 mptcplog((LOG_DEBUG, "MPTCP Socket: connectx mp_so 0x%llx "
907 "dst %s[%d] cid %d [pended %s]\n",
908 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
909 inet_ntop(af, ((af == AF_INET) ?
910 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
911 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
912 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
913 ntohs(SIN(dst_se->se_addr)->sin_port) :
914 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
915 mpts->mpts_connid,
916 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
917 "YES" : "NO")),
918 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
919 }
920
921 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
922
923 socket_lock(so, 0);
924 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
925
926 /* connect the subflow socket */
927 error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
928 mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
929 mpte->mpte_associd, NULL, CONNREQF_MPTCP,
930 &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr), NULL, NULL);
931 socket_unlock(so, 0);
932
933 /* Allocate a unique address id per subflow */
934 mpte->mpte_addrid_last++;
935 if (mpte->mpte_addrid_last == 0)
936 mpte->mpte_addrid_last++;
937
938 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
939 struct mptsub *, mpts, int, error);
940
941 return (error);
942}
943
944/*
945 * MPTCP subflow socket receive routine, derived from soreceive().
946 */
947static int
948mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
949 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
950{
951#pragma unused(uio)
952 int flags, error = 0;
953 struct proc *p = current_proc();
954 struct mbuf *m, **mp = mp0;
955 struct mbuf *nextrecord;
956
957 socket_lock(so, 1);
958 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
959
960#ifdef MORE_LOCKING_DEBUG
961 if (so->so_usecount == 1) {
962 panic("%s: so=%x no other reference on socket\n", __func__, so);
963 /* NOTREACHED */
964 }
965#endif
966 /*
967 * We return all that is there in the subflow's socket receive buffer
968 * to the MPTCP layer, so we require that the caller passes in the
969 * expected parameters.
970 */
971 if (mp == NULL || controlp != NULL) {
972 socket_unlock(so, 1);
973 return (EINVAL);
974 }
975 *mp = NULL;
976 if (psa != NULL)
977 *psa = NULL;
978 if (flagsp != NULL)
979 flags = *flagsp &~ MSG_EOR;
980 else
981 flags = 0;
982
983 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
984 socket_unlock(so, 1);
985 return (EOPNOTSUPP);
986 }
987 flags |= (MSG_DONTWAIT|MSG_NBIO);
988
989 /*
990 * If a recv attempt is made on a previously-accepted socket
991 * that has been marked as inactive (disconnected), reject
992 * the request.
993 */
994 if (so->so_flags & SOF_DEFUNCT) {
995 struct sockbuf *sb = &so->so_rcv;
996
997 error = ENOTCONN;
998 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
999 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
1000 SOCK_DOM(so), SOCK_TYPE(so), error));
1001 /*
1002 * This socket should have been disconnected and flushed
1003 * prior to being returned from sodefunct(); there should
1004 * be no data on its receive list, so panic otherwise.
1005 */
1006 if (so->so_state & SS_DEFUNCT)
1007 sb_empty_assert(sb, __func__);
1008 socket_unlock(so, 1);
1009 return (error);
1010 }
1011
1012 /*
1013 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1014 * and if so just return to the caller. This could happen when
1015 * soreceive() is called by a socket upcall function during the
1016 * time the socket is freed. The socket buffer would have been
1017 * locked across the upcall, therefore we cannot put this thread
1018 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1019 * we may livelock), because the lock on the socket buffer will
1020 * only be released when the upcall routine returns to its caller.
1021 * Because the socket has been officially closed, there can be
1022 * no further read on it.
1023 *
1024 * A multipath subflow socket would have its SS_NOFDREF set by
1025 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1026 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1027 */
1028 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1029 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
1030 socket_unlock(so, 1);
1031 return (0);
1032 }
1033
1034 /*
1035 * For consistency with soreceive() semantics, we need to obey
1036 * SB_LOCK in case some other code path has locked the buffer.
1037 */
1038 error = sblock(&so->so_rcv, 0);
1039 if (error != 0) {
1040 socket_unlock(so, 1);
1041 return (error);
1042 }
1043
1044 m = so->so_rcv.sb_mb;
1045 if (m == NULL) {
1046 /*
1047 * Panic if we notice inconsistencies in the socket's
1048 * receive list; both sb_mb and sb_cc should correctly
1049 * reflect the contents of the list, otherwise we may
1050 * end up with false positives during select() or poll()
1051 * which could put the application in a bad state.
1052 */
1053 SB_MB_CHECK(&so->so_rcv);
1054
1055 if (so->so_error != 0) {
1056 error = so->so_error;
1057 so->so_error = 0;
1058 goto release;
1059 }
1060
1061 if (so->so_state & SS_CANTRCVMORE) {
1062 goto release;
1063 }
1064
1065 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1066 error = ENOTCONN;
1067 goto release;
1068 }
1069
1070 /*
1071 * MSG_DONTWAIT is implicitly defined and this routine will
1072 * never block, so return EWOULDBLOCK when there is nothing.
1073 */
1074 error = EWOULDBLOCK;
1075 goto release;
1076 }
1077
1078 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1079 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1080 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1081
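	/*
	 * Drain the entire receive buffer: unlink each mbuf from the
	 * subflow's sockbuf and chain it onto *mp for the MPTCP layer,
	 * keeping sb_mb/sb_lastrecord consistent as records empty out.
	 */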
1082 while (m != NULL) {
1083 nextrecord = m->m_nextpkt;
1084 sbfree(&so->so_rcv, m);
1085
1086 if (mp != NULL) {
1087 *mp = m;
1088 mp = &m->m_next;
1089 so->so_rcv.sb_mb = m = m->m_next;
1090 *mp = NULL;
1091 }
1092
1093 if (m != NULL) {
1094 m->m_nextpkt = nextrecord;
1095 if (nextrecord == NULL)
1096 so->so_rcv.sb_lastrecord = m;
1097 } else {
1098 m = so->so_rcv.sb_mb = nextrecord;
1099 SB_EMPTY_FIXUP(&so->so_rcv);
1100 }
1101 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1102 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1103 }
1104
1105 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1106 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1107 /* notify protocol that we drained all the data */
1108 if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
1109 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1110
1111 if (flagsp != NULL)
1112 *flagsp |= flags;
1113
1114release:
1115 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
1116 return (error);
1117
1118}
1119
1120
1121/*
1122 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1123 * the work done earlier when the subflow socket was created.
1124 */
1125void
1126mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1127 struct socket *so)
1128{
1129 struct mptopt smpo;
1130 struct socket *mp_so;
1131 int p, c;
1132
1133 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1134 mp_so = mpte->mpte_mppcb->mpp_socket;
1135 MPTS_LOCK_ASSERT_HELD(mpts);
1136
1137 socket_lock(so, 0);
1138 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1139 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1140
1141 /* inherit MPTCP socket states */
1142 if (!(mp_so->so_state & SS_NBIO))
1143 so->so_state &= ~SS_NBIO;
1144
1145 /*
1146 * At this point, the socket is not yet closed, as there is at least
1147 * one outstanding usecount previously held by mpts_socket from
1148 * socreate(). Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1149 */
1150 so->so_flags &= ~SOF_MP_SUBFLOW;
1151 so->so_state &= ~SS_NOFDREF;
 1152 so->so_flags &= ~SOF_MPTCP_TRUE;
1153
1154 /* allow socket buffers to be compressed */
1155 so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1156 so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1157
1158 /*
1159 * Allow socket buffer auto sizing.
1160 *
1161 * This will increase the current 64k buffer size to whatever is best.
1162 */
1163 if (!(so->so_rcv.sb_flags & SB_USRSIZE))
1164 so->so_rcv.sb_flags |= SB_AUTOSIZE;
1165 if (!(so->so_snd.sb_flags & SB_USRSIZE))
1166 so->so_snd.sb_flags |= SB_AUTOSIZE;
1167
1168 /* restore protocol-user requests */
1169 VERIFY(mpts->mpts_oprotosw != NULL);
1170 so->so_proto = mpts->mpts_oprotosw;
1171
1172 bzero(&smpo, sizeof (smpo));
1173 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1174 smpo.mpo_level = SOL_SOCKET;
1175
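	/*
	 * For each inheritable option below, p and c hold the parent (MP
	 * socket) and child (subflow) flag bits; the option is re-applied
	 * only when they differ, and it is enabled iff the parent has it.
	 */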
1176 /* inherit SOF_NOSIGPIPE from parent MP socket */
1177 p = (mp_so->so_flags & SOF_NOSIGPIPE);
1178 c = (so->so_flags & SOF_NOSIGPIPE);
1179 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1180 smpo.mpo_name = SO_NOSIGPIPE;
1181 if ((p - c) != 0)
1182 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1183
1184 /* inherit SOF_NOADDRAVAIL from parent MP socket */
1185 p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1186 c = (so->so_flags & SOF_NOADDRAVAIL);
1187 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1188 smpo.mpo_name = SO_NOADDRERR;
1189 if ((p - c) != 0)
1190 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1191
1192 /* inherit SO_KEEPALIVE from parent MP socket */
1193 p = (mp_so->so_options & SO_KEEPALIVE);
1194 c = (so->so_options & SO_KEEPALIVE);
1195 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1196 smpo.mpo_name = SO_KEEPALIVE;
1197 if ((p - c) != 0)
1198 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1199
1200 /* unset TCP level default keepalive option */
1201 p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1202 c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1203 smpo.mpo_level = IPPROTO_TCP;
1204 smpo.mpo_intval = 0;
1205 smpo.mpo_name = TCP_KEEPALIVE;
1206 if ((p - c) != 0)
1207 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1208 socket_unlock(so, 0);
1209
1210 DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1211 struct mptsub *, mpts, struct socket *, so,
1212 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1213}
1214
1215/*
1216 * Establish an initial MPTCP connection (if first subflow and not yet
1217 * connected), or add a subflow to an existing MPTCP connection.
1218 */
1219int
1220mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1221 struct proc *p, uint32_t ifscope)
1222{
1223 struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
1224 struct socket *mp_so, *so = NULL;
1225 struct mptsub_connreq mpcr;
1226 struct mptcb *mp_tp;
1227 int af, error = 0;
1228
1229 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1230 mp_so = mpte->mpte_mppcb->mpp_socket;
1231 mp_tp = mpte->mpte_mptcb;
1232
1233 MPT_LOCK(mp_tp);
1234 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1235 /* If the remote end sends Data FIN, refuse subflow adds */
1236 error = ENOTCONN;
1237 MPT_UNLOCK(mp_tp);
1238 return (error);
1239 }
1240 MPT_UNLOCK(mp_tp);
1241
1242 MPTS_LOCK(mpts);
1243 VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1244 VERIFY(mpts->mpts_mpte == NULL);
1245 VERIFY(mpts->mpts_socket == NULL);
1246 VERIFY(mpts->mpts_dst_sl != NULL);
 1247 VERIFY(mpts->mpts_connid == SAE_CONNID_ANY);
1248
1249 /* select source (if specified) and destination addresses */
1250 if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
1251 &mpts->mpts_dst_sl, &dst_se)) != 0)
1252 goto out;
1253
1254 VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
1255 VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
1256 af = mpts->mpts_family = dst_se->se_addr->sa_family;
1257 VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
1258 VERIFY(af == AF_INET || af == AF_INET6);
1259
1260 /*
1261 * If the source address is not specified, allocate a storage for
1262 * it, so that later on we can fill it in with the actual source
1263 * IP address chosen by the underlying layer for the subflow after
1264 * it is connected.
1265 */
1266 if (mpts->mpts_src_sl == NULL) {
1267 mpts->mpts_src_sl =
1268 sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
1269 if (mpts->mpts_src_sl == NULL) {
1270 error = ENOBUFS;
1271 goto out;
1272 }
1273 se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
1274 VERIFY(se != NULL && se->se_addr != NULL &&
1275 se->se_addr->sa_len == dst_se->se_addr->sa_len);
1276 bzero(se->se_addr, se->se_addr->sa_len);
1277 se->se_addr->sa_len = dst_se->se_addr->sa_len;
1278 se->se_addr->sa_family = dst_se->se_addr->sa_family;
1279 }
1280
1281 /* create the subflow socket */
1282 if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1283 goto out;
1284
1285 /*
1286 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
1287 * -1 (SAE_CONNID_ALL).
1288 */
1289 mpte->mpte_connid_last++;
1290 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
1291 mpte->mpte_connid_last == SAE_CONNID_ANY)
1292 mpte->mpte_connid_last++;
1293
1294 mpts->mpts_connid = mpte->mpte_connid_last;
1295 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1296 mpts->mpts_connid != SAE_CONNID_ALL);
1297
1298 mpts->mpts_rel_seq = 1;
1299
1300 /* Allocate a unique address id per subflow */
1301 mpte->mpte_addrid_last++;
1302 if (mpte->mpte_addrid_last == 0)
1303 mpte->mpte_addrid_last++;
1304
1305 /* bind subflow socket to the specified interface */
1306 if (ifscope != IFSCOPE_NONE) {
1307 socket_lock(so, 0);
1308 error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1309 if (error != 0) {
1310 socket_unlock(so, 0);
1311 (void) mptcp_subflow_soclose(mpts, so);
1312 goto out;
1313 }
1314 VERIFY(mpts->mpts_outif != NULL);
1315 mpts->mpts_flags |= MPTSF_BOUND_IF;
1316
 1317 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add mp_so 0x%llx "
 1318 "bindif %s[%d] cid %d\n",
1319 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1320 mpts->mpts_outif->if_xname,
1321 ifscope, mpts->mpts_connid),
1322 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1323 socket_unlock(so, 0);
1324 }
1325
1326 /* if source address and/or port is specified, bind to it */
1327 if (src_se != NULL) {
1328 struct sockaddr *sa = src_se->se_addr;
1329 uint32_t mpts_flags = 0;
1330 in_port_t lport;
1331
1332 switch (af) {
1333 case AF_INET:
1334 if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1335 mpts_flags |= MPTSF_BOUND_IP;
1336 if ((lport = SIN(sa)->sin_port) != 0)
1337 mpts_flags |= MPTSF_BOUND_PORT;
1338 break;
1339#if INET6
1340 case AF_INET6:
1341 VERIFY(af == AF_INET6);
1342 if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1343 mpts_flags |= MPTSF_BOUND_IP;
1344 if ((lport = SIN6(sa)->sin6_port) != 0)
1345 mpts_flags |= MPTSF_BOUND_PORT;
1346 break;
1347#endif /* INET6 */
1348 }
1349
1350 error = sobindlock(so, sa, 1); /* will lock/unlock socket */
1351 if (error != 0) {
1352 (void) mptcp_subflow_soclose(mpts, so);
1353 goto out;
1354 }
1355 mpts->mpts_flags |= mpts_flags;
1356
1357 if (af == AF_INET || af == AF_INET6) {
1358 char sbuf[MAX_IPv6_STR_LEN];
1359
1360 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add "
1361 "mp_so 0x%llx bindip %s[%d] cid %d\n",
1362 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1363 inet_ntop(af, ((af == AF_INET) ?
1364 (void *)&SIN(sa)->sin_addr.s_addr :
1365 (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
1366 ntohs(lport), mpts->mpts_connid),
1367 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1368 }
1369 }
1370
1371 /*
1372 * Insert the subflow into the list, and associate the MPTCP PCB
 1373 * as well as the subflow socket. From this point on, removing
1374 * the subflow needs to be done via mptcp_subflow_del().
1375 */
1376 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1377 mpte->mpte_numflows++;
1378
1379 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1380 mpts->mpts_mpte = mpte;
1381 mpts->mpts_socket = so;
1382 MPTS_ADDREF_LOCKED(mpts); /* for being in MPTCP subflow list */
1383 MPTS_ADDREF_LOCKED(mpts); /* for subflow socket */
1384 mp_so->so_usecount++; /* for subflow socket */
1385
1386 /* register for subflow socket read/write events */
1387 (void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1388 mptcp_subflow_wupcall, mpts);
1389
1390 /*
1391 * Register for subflow socket control events; ignore
1392 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1393 * will generate it here.
1394 */
1395 (void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1396 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1397 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1398 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1399 SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1400 SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1401 SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
1402 SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
1403 SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);
1404
1405 /* sanity check */
1406 VERIFY(!(mpts->mpts_flags &
1407 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1408
1409 bzero(&mpcr, sizeof (mpcr));
1410 mpcr.mpcr_proc = p;
1411 mpcr.mpcr_ifscope = ifscope;
1412 /*
1413 * Indicate to the TCP subflow whether or not it should establish
1414 * the initial MPTCP connection, or join an existing one. Fill
1415 * in the connection request structure with additional info needed
1416 * by the underlying TCP (to be used in the TCP options, etc.)
1417 */
1418 MPT_LOCK(mp_tp);
1419 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1420 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
 1421 mptcp_init_local_parms(mp_tp);
1422 }
1423 MPT_UNLOCK(mp_tp);
1424 soisconnecting(mp_so);
1425 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1426 } else {
1427 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1428 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1429
1430 /* avoid starting up cellular subflow unless required */
1431 if ((mptcp_delayed_subf_start) &&
1432 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
1433 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1434 }
1435 MPT_UNLOCK(mp_tp);
1436 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1437 }
1438
1439 /* If fastjoin or fastopen is requested, set state in mpts */
1440 if (mpte->mpte_nummpcapflows == 0) {
1441 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1442 MPT_LOCK(mp_tp);
1443 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
1444 mpts->mpts_flags |= MPTSF_TFO_REQD;
1445 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1446 }
1447 MPT_UNLOCK(mp_tp);
1448 }
1449
1450 if (so->so_flags & SOF_MPTCP_FASTJOIN) {
1451 MPT_LOCK(mp_tp);
1452 if (mp_tp->mpt_state == MPTCPS_ESTABLISHED) {
1453 mpts->mpts_flags |= MPTSF_FASTJ_REQD;
1454 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1455 }
1456 MPT_UNLOCK(mp_tp);
1457 }
1458 }
1459
1460 mpts->mpts_mpcr = mpcr;
1461 mpts->mpts_flags |= MPTSF_CONNECTING;
1462
1463 if (af == AF_INET || af == AF_INET6) {
1464 char dbuf[MAX_IPv6_STR_LEN];
1465
1466 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
1467 "mp_so 0x%llx dst %s[%d] cid %d "
1468 "[pending %s]\n", __func__,
1469 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1470 inet_ntop(af, ((af == AF_INET) ?
1471 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
1472 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
1473 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1474 ntohs(SIN(dst_se->se_addr)->sin_port) :
1475 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
1476 mpts->mpts_connid,
1477 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
1478 "YES" : "NO")),
1479 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1480 }
1481
1482 /* connect right away if first attempt, or if join can be done now */
1483 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1484 error = mptcp_subflow_soconnectx(mpte, mpts);
1485
1486out:
1487 MPTS_UNLOCK(mpts);
1488 if (error == 0) {
1489 soevent(mp_so, SO_FILT_HINT_LOCKED |
1490 SO_FILT_HINT_CONNINFO_UPDATED);
1491 }
1492 return (error);
1493}
1494
1495/*
 1496 * Delete/remove a subflow from an MPTCP session. The underlying subflow socket
1497 * will no longer be accessible after a subflow is deleted, thus this
1498 * should occur only after the subflow socket has been disconnected.
1499 * If peeloff(2) is called, leave the socket open.
1500 */
1501void
1502mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1503{
1504 struct socket *mp_so, *so;
1505
1506 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1507 mp_so = mpte->mpte_mppcb->mpp_socket;
1508
1509 MPTS_LOCK(mpts);
1510 so = mpts->mpts_socket;
1511 VERIFY(so != NULL);
1512
1513 if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
1514 (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
1515 MPTS_UNLOCK(mpts);
1516 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del returning"
1517 " mp_so 0x%llx flags %x\n",
1518 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_flags),
1519 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1520 return;
1521 }
 1522
1523 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del mp_so 0x%llx "
1524 "[u=%d,r=%d] cid %d [close %s] %d %x error %d\n",
1525 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1526 mp_so->so_usecount,
1527 mp_so->so_retaincnt, mpts->mpts_connid,
1528 (close ? "YES" : "NO"), mpts->mpts_soerror,
1529 mpts->mpts_flags,
1530 mp_so->so_error),
1531 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1532
1533 VERIFY(mpts->mpts_mpte == mpte);
1534 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1535 mpts->mpts_connid != SAE_CONNID_ALL);
1536
1537 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1538 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1539 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1540 VERIFY(mpte->mpte_numflows != 0);
1541 mpte->mpte_numflows--;
1542 if (mpte->mpte_active_sub == mpts)
1543 mpte->mpte_active_sub = NULL;
1544
1545 /*
1546 * Drop references held by this subflow socket; there
1547 * will be no further upcalls made from this point.
1548 */
1549 (void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1550 (void) sock_catchevents(so, NULL, NULL, 0);
 1551
 1552 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
 1553
1554 if (close)
1555 (void) mptcp_subflow_soclose(mpts, so);
1556
1557 VERIFY(mp_so->so_usecount != 0);
1558 mp_so->so_usecount--; /* for subflow socket */
1559 mpts->mpts_mpte = NULL;
1560 mpts->mpts_socket = NULL;
1561 MPTS_UNLOCK(mpts);
1562
1563 MPTS_REMREF(mpts); /* for MPTCP subflow list */
1564 MPTS_REMREF(mpts); /* for subflow socket */
1565
1566 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1567}
1568
1569/*
1570 * Disconnect a subflow socket.
1571 */
1572void
1573mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1574 boolean_t deleteok)
1575{
1576 struct socket *so;
1577 struct mptcb *mp_tp;
1578 int send_dfin = 0;
1579
1580 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1581 MPTS_LOCK_ASSERT_HELD(mpts);
1582
1583 VERIFY(mpts->mpts_mpte == mpte);
1584 VERIFY(mpts->mpts_socket != NULL);
1585 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1586 mpts->mpts_connid != SAE_CONNID_ALL);
1587
1588 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1589 return;
1590
1591 mpts->mpts_flags |= MPTSF_DISCONNECTING;
1592
1593 /*
1594 * If this is coming from disconnectx(2) or issued as part of
1595 * closing the MPTCP socket, the subflow shouldn't stick around.
1596 * Otherwise let it linger around in case the upper layers need
1597 * to retrieve its conninfo.
1598 */
1599 if (deleteok)
1600 mpts->mpts_flags |= MPTSF_DELETEOK;
1601
1602 so = mpts->mpts_socket;
1603 mp_tp = mpte->mpte_mptcb;
1604 MPT_LOCK(mp_tp);
1605 if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1606 send_dfin = 1;
1607 MPT_UNLOCK(mp_tp);
1608
1609 socket_lock(so, 0);
1610 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1611 (so->so_state & SS_ISCONNECTED)) {
1612 mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d "
1613 "[linger %s]\n", __func__, mpts->mpts_connid, send_dfin,
1614 (deleteok ? "NO" : "YES")),
1615 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1616
1617 if (send_dfin)
1618 mptcp_send_dfin(so);
1619 (void) soshutdownlock(so, SHUT_RD);
1620 (void) soshutdownlock(so, SHUT_WR);
1621 (void) sodisconnectlocked(so);
1622 }
1623 socket_unlock(so, 0);
1624 /*
1625 * Generate a disconnect event for this subflow socket, in case
1626 * the lower layer doesn't do it; this is needed because the
1627 * subflow socket deletion relies on it. This will also end up
1628 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1629 * we cannot do that here because subflow lock is currently held.
1630 */
1631 mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1632}
1633
1634/*
1635 * Subflow socket read upcall.
1636 *
1637 * Called when the associated subflow socket posted a read event. The subflow
1638 * socket lock has been released prior to invoking the callback. Note that the
1639 * upcall may occur synchronously as a result of MPTCP performing an action on
1640 * it, or asynchronously as a result of an event happening at the subflow layer.
1641 * Therefore, to maintain lock ordering, the only lock that can be acquired
1642 * here is the thread lock, for signalling purposes.
1643 */
1644static void
1645mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1646{
1647#pragma unused(so, waitf)
1648 struct mptsub *mpts = arg;
1649 struct mptses *mpte = mpts->mpts_mpte;
1650
fe8ab488
A
1651 /*
1652 * mpte should never be NULL, except in a race with
1653 * mptcp_subflow_del
1654 */
1655 if (mpte == NULL)
1656 return;
39236c6e
A
1657
1658 lck_mtx_lock(&mpte->mpte_thread_lock);
1659 mptcp_thread_signal_locked(mpte);
1660 lck_mtx_unlock(&mpte->mpte_thread_lock);
1661}
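/*
 * Illustrative sketch (not part of this file's flow): how the read/write
 * upcalls and the event upcall are presumably registered when the subflow
 * socket is set up elsewhere in this file.  The exact call site and event
 * mask are assumptions; the KPI signatures mirror the teardown calls made
 * in mptcp_subflow_del() above.
 */
#if 0
	/* deliver read/write upcalls with the subflow as context */
	(void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
	    mptcp_subflow_wupcall, mpts);
	/* request the control events dispatched by mptcp_subflow_events() */
	(void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
	    SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
	    SO_FILT_HINT_MUSTRST /* ... plus the other hints handled below */);
#endif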
1662
1663/*
1664 * Subflow socket input.
1665 *
1666 * Called in the context of the MPTCP thread, for reading data from the
1667 * underlying subflow socket and delivering it to MPTCP.
1668 */
1669static void
1670mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1671{
1672 struct mbuf *m = NULL;
1673 struct socket *so;
1674 int error;
1675 struct mptsub *mpts_alt = NULL;
1676
1677 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1678 MPTS_LOCK_ASSERT_HELD(mpts);
1679
1680 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
1681 struct mptsub *, mpts);
1682
1683 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1684 return;
1685
1686 so = mpts->mpts_socket;
1687
1688 error = sock_receive_internal(so, NULL, &m, 0, NULL);
1689 if (error != 0 && error != EWOULDBLOCK) {
3e170ce0
A
1690 mptcplog((LOG_ERR, "MPTCP Receiver: %s cid %d error %d\n",
1691 __func__, mpts->mpts_connid, error),
1692 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
39236c6e 1693 MPTS_UNLOCK(mpts);
3e170ce0 1694 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e 1695 if (mpts_alt == NULL) {
fe8ab488
A
1696 if (mptcp_delayed_subf_start) {
1697 mpts_alt = mptcp_get_pending_subflow(mpte,
1698 mpts);
1699 if (mpts_alt) {
3e170ce0
A
1700 mptcplog((LOG_DEBUG,"MPTCP Receiver:"
1701 " %s: pending %d\n",
1702 __func__, mpts_alt->mpts_connid),
1703 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1704 } else {
3e170ce0
A
1705 mptcplog((LOG_ERR, "MPTCP Receiver:"
1706 " %s: no pending flow for cid %d",
1707 __func__, mpts->mpts_connid),
1708 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488
A
1709 }
1710 } else {
3e170ce0
A
1711 mptcplog((LOG_ERR, "MPTCP Receiver: %s: no alt"
1712 " path for cid %d\n", __func__,
1713 mpts->mpts_connid),
1714 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1715 }
490019cf
A
1716 if (error == ENODATA) {
1717 /*
1718 * Don't ignore ENODATA so as to discover
1719 * nasty middleboxes.
1720 */
1721 struct socket *mp_so =
1722 mpte->mpte_mppcb->mpp_socket;
1723 mp_so->so_error = ENODATA;
1724 sorwakeup(mp_so);
1725 }
39236c6e
A
1726 }
1727 MPTS_LOCK(mpts);
1728 } else if (error == 0) {
3e170ce0
A
1729 mptcplog((LOG_DEBUG, "MPTCP Receiver: %s: cid %d \n",
1730 __func__, mpts->mpts_connid),
1731 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
1732 }
1733
 1734 /* In fallback, accept data only on the active subflow; drop data on the rest */
1735 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1736 (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1737 m_freem(m);
1738 return;
1739 }
1740
1741 if (m != NULL) {
3e170ce0
A
1742
1743 /* Did we receive data on the backup subflow? */
1744 if (!(mpts->mpts_flags & MPTSF_ACTIVE))
1745 mpts->mpts_peerswitch++;
1746 else
1747 mpts->mpts_peerswitch = 0;
1748
39236c6e
A
1749 /*
1750 * Release subflow lock since this may trigger MPTCP to send,
1751 * possibly on a different subflow. An extra reference has
1752 * been held on the subflow by the MPTCP thread before coming
1753 * here, so we can be sure that it won't go away, in the event
1754 * the MP socket lock gets released.
1755 */
1756 MPTS_UNLOCK(mpts);
1757 mptcp_input(mpte, m);
1758 MPTS_LOCK(mpts);
1759 }
1760}
1761
1762/*
1763 * Subflow socket write upcall.
1764 *
 1765 * Called when the associated subflow socket posted a write event. The subflow
1766 * socket lock has been released prior to invoking the callback. Note that the
1767 * upcall may occur synchronously as a result of MPTCP performing an action on
1768 * it, or asynchronously as a result of an event happening at the subflow layer.
1769 * Therefore, to maintain lock ordering, the only lock that can be acquired
1770 * here is the thread lock, for signalling purposes.
1771 */
1772static void
1773mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1774{
1775#pragma unused(so, waitf)
1776 struct mptsub *mpts = arg;
1777 struct mptses *mpte = mpts->mpts_mpte;
1778
fe8ab488 1779 /*
490019cf 1780 * mpte should never be NULL except in a race with
fe8ab488
A
 1781 * mptcp_subflow_del, which doesn't hold the socket lock across its
 1782 * critical section. This upcall is made after releasing the socket
 1783 * lock, so interleaving of socket operations becomes possible.
1784 */
1785 if (mpte == NULL)
1786 return;
39236c6e
A
1787
1788 lck_mtx_lock(&mpte->mpte_thread_lock);
1789 mptcp_thread_signal_locked(mpte);
1790 lck_mtx_unlock(&mpte->mpte_thread_lock);
1791}
1792
1793/*
1794 * Subflow socket output.
1795 *
1796 * Called for sending data from MPTCP to the underlying subflow socket.
1797 */
1798int
1799mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1800{
1801 struct socket *mp_so, *so;
1802 size_t sb_cc = 0, tot_sent = 0;
1803 struct mbuf *sb_mb;
1804 int error = 0;
1805 u_int64_t mpt_dsn = 0;
1806 struct mptcb *mp_tp = mpte->mpte_mptcb;
1807 struct mbuf *mpt_mbuf = NULL;
fe8ab488
A
1808 u_int64_t off = 0;
1809 struct mbuf *head, *tail;
490019cf 1810 int tcp_zero_len_write = 0;
39236c6e
A
1811
1812 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1813 MPTS_LOCK_ASSERT_HELD(mpts);
1814 mp_so = mpte->mpte_mppcb->mpp_socket;
1815 so = mpts->mpts_socket;
1816
1817 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
1818 struct mptsub *, mpts);
1819
1820 /* subflow socket is suspended? */
1821 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
3e170ce0
A
1822 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d is "
1823 "flow controlled\n", __func__,
1824 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1825 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1826 goto out;
1827 }
1828
1829 /* subflow socket is not MPTCP capable? */
1830 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
fe8ab488 1831 !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
490019cf
A
1832 !(mpts->mpts_flags & MPTSF_FASTJ_SEND) &&
1833 !(mpts->mpts_flags & MPTSF_TFO_REQD)) {
3e170ce0 1834 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d not "
39236c6e 1835 "MPTCP capable\n", __func__,
3e170ce0
A
1836 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1837 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1838 goto out;
1839 }
1840
1841 /* Remove Addr Option is not sent reliably as per I-D */
1842 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1843 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1844 tp->t_rem_aid = mpte->mpte_lost_aid;
1845 if (mptcp_remaddr_enable)
1846 tp->t_mpflags |= TMPF_SND_REM_ADDR;
1847 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1848 }
1849
490019cf
A
1850 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
1851 mptcp_drop_tfo_data(mpte, mpts);
1852 }
1853
39236c6e
A
1854 /*
1855 * The mbuf chains containing the metadata (as well as pointing to
1856 * the user data sitting at the MPTCP output queue) would then be
1857 * sent down to the subflow socket.
1858 *
1859 * Some notes on data sequencing:
1860 *
1861 * a. Each mbuf must be a M_PKTHDR.
1862 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
1863 * in the mbuf pkthdr structure.
1864 * c. Each mbuf containing the MPTCP metadata must have its
1865 * pkt_flags marked with the PKTF_MPTCP flag.
1866 */
1867
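	/*
	 * A minimal sketch of rules (a)-(c) above, using the same pkthdr
	 * fields this function sets further down (dsn/rel_seq/len are
	 * placeholders, not variables of this function):
	 */
#if 0
	VERIFY(m->m_flags & M_PKTHDR);			/* rule (a) */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;		/* rule (c) */
	m->m_pkthdr.mp_dsn = dsn;			/* rule (b): 64-bit DSN */
	m->m_pkthdr.mp_rseq = rel_seq;			/* subflow-relative seq */
	m->m_pkthdr.mp_rlen = len;			/* length of the mapping */
#endif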
1868 /* First, drop acknowledged data */
1869 sb_mb = mp_so->so_snd.sb_mb;
1870 if (sb_mb == NULL) {
1871 goto out;
1872 }
1873
1874 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1875
1876 mpt_mbuf = sb_mb;
1877 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
490019cf
A
1878 if (((so->so_state & SS_ISCONNECTED) == 0) &&
1879 (mpt_mbuf->m_next == NULL) &&
1880 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1881 /*
1882 * If TFO, allow connection establishment with zero
1883 * length write.
1884 */
1885 tcp_zero_len_write = 1;
1886 goto zero_len_write;
1887 }
39236c6e
A
1888 mpt_mbuf = mpt_mbuf->m_next;
1889 }
1890 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1891 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1892 } else {
1893 goto out;
1894 }
1895
1896 MPT_LOCK(mp_tp);
1897 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
fe8ab488 1898 u_int64_t len = 0;
39236c6e 1899 len = mp_tp->mpt_snduna - mpt_dsn;
3e170ce0 1900 MPT_UNLOCK(mp_tp);
fe8ab488 1901 sbdrop(&mp_so->so_snd, (int)len);
3e170ce0 1902 MPT_LOCK(mp_tp);
39236c6e
A
1903 }
1904
1905 /*
1906 * In degraded mode, we don't receive data acks, so force free
1907 * mbufs less than snd_nxt
1908 */
fe8ab488
A
1909 if (mp_so->so_snd.sb_mb == NULL) {
1910 MPT_UNLOCK(mp_tp);
1911 goto out;
1912 }
1913
39236c6e
A
1914 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1915 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 1916 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
39236c6e 1917 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
fe8ab488 1918 u_int64_t len = 0;
39236c6e 1919 len = mp_tp->mpt_sndnxt - mpt_dsn;
fe8ab488 1920 sbdrop(&mp_so->so_snd, (int)len);
39236c6e
A
1921 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1922 }
1923
fe8ab488
A
1924 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1925 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
1926 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
1927 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
1928 if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
1929 mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
1930 }
1931
39236c6e
A
1932 /*
1933 * Adjust the subflow's notion of next byte to send based on
1934 * the last unacknowledged byte
1935 */
1936 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1937 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1938 }
1939
1940 /*
1941 * Adjust the top level notion of next byte used for retransmissions
1942 * and sending FINs.
1943 */
1944 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1945 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1946 }
1947
1948
1949 /* Now determine the offset from which to start transmitting data */
1950 sb_mb = mp_so->so_snd.sb_mb;
1951 sb_cc = mp_so->so_snd.sb_cc;
1952 if (sb_mb == NULL) {
1953 MPT_UNLOCK(mp_tp);
1954 goto out;
1955 }
1956 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1957 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
fe8ab488 1958 sb_cc -= (size_t)off;
39236c6e
A
1959 } else {
1960 MPT_UNLOCK(mp_tp);
1961 goto out;
1962 }
1963 MPT_UNLOCK(mp_tp);
1964
1965 mpt_mbuf = sb_mb;
39236c6e
A
1966
1967 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
fe8ab488 1968 (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
39236c6e
A
1969 off -= mpt_mbuf->m_pkthdr.mp_rlen;
1970 mpt_mbuf = mpt_mbuf->m_next;
39236c6e 1971 }
3e170ce0
A
1972 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
1973 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid = %d "
1974 "snduna = %llu sndnxt = %llu probe %d\n",
1975 __func__, mpts->mpts_connid,
1976 mp_tp->mpt_snduna, mpts->mpts_sndnxt,
1977 mpts->mpts_probecnt),
1978 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e 1979
ecc0ceb4 1980 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 1981
fe8ab488
A
1982 head = tail = NULL;
1983
39236c6e
A
1984 while (tot_sent < sb_cc) {
1985 struct mbuf *m;
fe8ab488 1986 size_t mlen;
39236c6e
A
1987
1988 mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1989 mlen -= off;
1990 if (mlen == 0)
1991 goto out;
1992
1993 if (mlen > sb_cc) {
1994 panic("%s: unexpected %lu %lu \n", __func__,
1995 mlen, sb_cc);
1996 }
1997
fe8ab488
A
1998 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
1999 M_COPYM_MUST_COPY_HDR);
39236c6e
A
2000 if (m == NULL) {
2001 error = ENOBUFS;
2002 break;
2003 }
2004
2005 /* Create a DSN mapping for the data (m_copym does it) */
2006 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
fe8ab488 2007 VERIFY(m->m_flags & M_PKTHDR);
39236c6e
A
2008 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2009 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
2010 m->m_pkthdr.mp_dsn = mpt_dsn + off;
2011 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
2012 m->m_pkthdr.mp_rlen = mlen;
2013 mpts->mpts_rel_seq += mlen;
2014 m->m_pkthdr.len = mlen;
2015
fe8ab488
A
2016 if (head == NULL) {
2017 head = tail = m;
2018 } else {
2019 tail->m_next = m;
2020 tail = m;
2021 }
2022
fe8ab488
A
2023 tot_sent += mlen;
2024 off = 0;
2025 mpt_mbuf = mpt_mbuf->m_next;
2026 }
2027
2028 if (head != NULL) {
490019cf 2029 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
fe8ab488 2030
490019cf
A
2031 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
2032 (tp->t_tfo_stats == 0)) {
2033 tp->t_mpflags |= TMPF_TFO_REQUEST;
2034 } else if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
fe8ab488
A
2035 tp->t_mpflags |= TMPF_FASTJOIN_SEND;
2036 }
2037
2038 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2039
2040 DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
39236c6e
A
2041 struct sockbuf *, &so->so_rcv,
2042 struct sockbuf *, &so->so_snd,
2043 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488 2044 size_t, tot_sent);
490019cf
A
2045 } else if (tcp_zero_len_write == 1) {
2046zero_len_write:
2047 socket_lock(so, 1);
2048 /* Opting to call pru_send as no mbuf at subflow level */
2049 error = (*so->so_proto->pr_usrreqs->pru_send)
2050 (so, 0, NULL, NULL, NULL, current_proc());
2051 socket_unlock(so, 1);
fe8ab488
A
2052 }
2053
490019cf 2054 if ((error == 0) || (error == EWOULDBLOCK)) {
fe8ab488 2055 mpts->mpts_sndnxt += tot_sent;
3e170ce0
A
2056
2057 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2058 tcpstat.tcps_mp_num_probes++;
2059 if (tot_sent < mpts->mpts_maxseg)
2060 mpts->mpts_probecnt += 1;
2061 else
2062 mpts->mpts_probecnt +=
2063 tot_sent/mpts->mpts_maxseg;
2064 }
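			/*
			 * Example of the accounting above, with hypothetical
			 * numbers: for mpts_maxseg == 1448 and tot_sent ==
			 * 4344, probecnt is bumped by 4344 / 1448 == 3, i.e.
			 * one count per full-sized segment written.
			 */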
2065
39236c6e 2066 MPT_LOCK(mp_tp);
3e170ce0 2067
39236c6e
A
2068 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
2069 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
2070 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2071 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2072 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
2073 }
fe8ab488 2074 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
39236c6e 2075 MPT_UNLOCK(mp_tp);
fe8ab488 2076
490019cf
A
2077 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2078 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2079
fe8ab488
A
2080 /* Send once in SYN_SENT state to avoid sending SYN spam */
2081 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
490019cf 2082 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
fe8ab488 2083 mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
39236c6e 2084 }
39236c6e 2085
3e170ce0
A
2086 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2087 (mpts->mpts_probesoon != 0))
2088 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid %d "
2089 "wrote %d %d probe %d probedelta %d\n",
fe8ab488 2090 __func__, mpts->mpts_connid, (int)tot_sent,
3e170ce0
A
2091 (int) sb_cc, mpts->mpts_probecnt,
2092 (tcp_now - mpts->mpts_probesoon)),
2093 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2094 } else {
3e170ce0
A
2095 mptcplog((LOG_ERR, "MPTCP Sender: %s cid %d error %d len %zd\n",
2096 __func__, mpts->mpts_connid, error, tot_sent),
2097 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2098 }
2099out:
2100 return (error);
2101}
2102
2103/*
2104 * Subflow socket control event upcall.
2105 *
2106 * Called when the associated subflow socket posted one or more control events.
2107 * The subflow socket lock has been released prior to invoking the callback.
2108 * Note that the upcall may occur synchronously as a result of MPTCP performing
2109 * an action on it, or asynchronously as a result of an event happening at the
2110 * subflow layer. Therefore, to maintain lock ordering, the only lock that can
2111 * be acquired here is the thread lock, for signalling purposes.
2112 */
2113static void
2114mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
2115{
2116#pragma unused(so)
2117 struct mptsub *mpts = arg;
2118 struct mptses *mpte = mpts->mpts_mpte;
2119
2120 VERIFY(mpte != NULL);
2121
2122 lck_mtx_lock(&mpte->mpte_thread_lock);
2123 atomic_bitset_32(&mpts->mpts_evctl, events);
2124 mptcp_thread_signal_locked(mpte);
2125 lck_mtx_unlock(&mpte->mpte_thread_lock);
2126}
2127
2128/*
2129 * Subflow socket control events.
2130 *
2131 * Called for handling events related to the underlying subflow socket.
2132 */
2133static ev_ret_t
3e170ce0
A
2134mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
2135 uint64_t *p_mpsofilt_hint)
39236c6e 2136{
fe8ab488 2137 uint32_t events, save_events;
39236c6e 2138 ev_ret_t ret = MPTS_EVRET_OK;
3e170ce0
A
2139 int i = 0;
2140 int mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl)/
2141 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e
A
2142 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2143 MPTS_LOCK_ASSERT_HELD(mpts);
2144
2145 /* bail if there's nothing to process */
2146 if ((events = mpts->mpts_evctl) == 0)
2147 return (ret);
2148
2149 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
2150 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
2151 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
2152 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
2153 SO_FILT_HINT_DISCONNECTED)) {
2154 events |= SO_FILT_HINT_MPFAILOVER;
2155 }
2156
fe8ab488
A
2157 save_events = events;
2158
39236c6e
A
2159 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
2160 struct mptsub *, mpts, uint32_t, events);
2161
3e170ce0
A
2162 mptcplog((LOG_DEBUG, "MPTCP Events: %s cid %d events=%b\n", __func__,
2163 mpts->mpts_connid, events, SO_FILT_HINT_BITS),
2164 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
2165
2166 /*
2167 * Process all the socket filter hints and reset the hint
2168 * once it is handled
2169 */
2170 for (i = 0; (i < mpsub_ev_entry_count) && events; i++) {
490019cf
A
2171 /*
2172 * Always execute the DISCONNECTED event, because it will wakeup
2173 * the app.
2174 */
3e170ce0 2175 if ((events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
490019cf
A
2176 (ret >= MPTS_EVRET_OK ||
2177 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3e170ce0
A
2178 ev_ret_t error =
2179 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint);
2180 events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
2181 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2182 }
fe8ab488
A
2183 }
2184
39236c6e
A
2185 /*
2186 * We should be getting only events specified via sock_catchevents(),
2187 * so loudly complain if we have any unprocessed one(s).
2188 */
2189 if (events != 0 || ret < MPTS_EVRET_OK) {
3e170ce0 2190 mptcplog((LOG_ERR, "MPTCP Events %s%s: cid %d evret %s (%d)"
39236c6e 2191 " unhandled events=%b\n",
3e170ce0 2192 (events != 0) && (ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
39236c6e 2193 __func__, mpts->mpts_connid,
3e170ce0
A
2194 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS),
2195 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2196 }
2197
2198 /* clear the ones we've processed */
fe8ab488 2199 atomic_bitclear_32(&mpts->mpts_evctl, save_events);
39236c6e
A
2200 return (ret);
2201}
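/*
 * For reference, a plausible shape of the dispatch table iterated above.
 * The real mpsub_ev_entry_tbl[] is defined earlier in this file; the struct
 * tag and entry order shown here are assumptions, only the two field names
 * are taken from the accesses above.
 */
#if 0
static const struct mptsub_ev_entry {
	uint32_t	sofilt_hint_mask;
	ev_ret_t	(*sofilt_hint_ev_hdlr)(struct mptses *, struct mptsub *,
			    uint64_t *);
} mpsub_ev_entry_tbl[] = {
	{ SO_FILT_HINT_MPFAILOVER,	mptcp_subflow_failover_ev },
	{ SO_FILT_HINT_CONNRESET,	mptcp_subflow_connreset_ev },
	{ SO_FILT_HINT_MUSTRST,		mptcp_subflow_mustrst_ev },
	{ SO_FILT_HINT_DISCONNECTED,	mptcp_subflow_disconnected_ev },
	/* ... one entry per SO_FILT_HINT_* handler below ... */
};
#endif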
2202
2203/*
2204 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
2205 */
2206static ev_ret_t
3e170ce0
A
2207mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts,
2208 uint64_t *p_mpsofilt_hint)
39236c6e
A
2209{
2210 struct socket *mp_so, *so;
2211 struct mptcb *mp_tp;
2212 boolean_t linger;
2213
2214 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2215 MPTS_LOCK_ASSERT_HELD(mpts);
2216 VERIFY(mpte->mpte_mppcb != NULL);
2217 mp_so = mpte->mpte_mppcb->mpp_socket;
2218 mp_tp = mpte->mpte_mptcb;
2219 so = mpts->mpts_socket;
2220
2221 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2222 !(mp_so->so_flags & SOF_PCBCLEARING));
2223
3e170ce0
A
2224 mptcplog((LOG_DEBUG, "MPTCP Events: "
2225 "%s: cid %d [linger %s]\n", __func__,
2226 mpts->mpts_connid, (linger ? "YES" : "NO")),
2227 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2228
39236c6e
A
2229 /*
2230 * We got a TCP RST for this subflow connection.
2231 *
2232 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
fe8ab488
A
 2233 * client if the MPTCP connection has not yet been established;
 2234 * if it has been established but no MP-capable subflows remain,
 2235 * we propagate ECONNRESET instead.
39236c6e
A
2236 */
2237 mptcp_subflow_disconnect(mpte, mpts, !linger);
2238
2239 MPT_LOCK(mp_tp);
2240 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
fe8ab488
A
2241 mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
2242 } else if (mpte->mpte_nummpcapflows < 1) {
2243 mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
3e170ce0 2244 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET;
39236c6e
A
2245 }
2246 MPT_UNLOCK(mp_tp);
2247
2248 /*
2249 * Keep the subflow socket around, unless the MPTCP socket has
2250 * been detached or the subflow has been disconnected explicitly,
2251 * in which case it should be deleted right away.
2252 */
2253 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2254}
2255
2256/*
2257 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2258 */
2259static ev_ret_t
3e170ce0
A
2260mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2261 uint64_t *p_mpsofilt_hint)
39236c6e 2262{
3e170ce0 2263#pragma unused(p_mpsofilt_hint)
39236c6e
A
2264 struct socket *so;
2265
2266 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2267 MPTS_LOCK_ASSERT_HELD(mpts);
2268
2269 so = mpts->mpts_socket;
2270
3e170ce0
A
2271 mptcplog((LOG_DEBUG, "MPTCP Events: "
2272 "%s: cid %d\n", __func__, mpts->mpts_connid),
2273 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2274
2275 /*
2276 * We got a FIN for this subflow connection. This subflow socket
 2277 * is no longer available for receiving data. The FIN may arrive
 2278 * with data; that data is handed up to the MPTCP socket before
 2279 * the subflow is disconnected.
2280 */
2281
2282 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2283}
2284
2285/*
2286 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2287 */
2288static ev_ret_t
3e170ce0
A
2289mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts,
2290 uint64_t *p_mpsofilt_hint)
39236c6e 2291{
3e170ce0 2292#pragma unused(p_mpsofilt_hint)
39236c6e
A
2293 struct socket *so;
2294
2295 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2296 MPTS_LOCK_ASSERT_HELD(mpts);
2297
2298 so = mpts->mpts_socket;
2299
3e170ce0
A
2300 mptcplog((LOG_DEBUG, "MPTCP Events: "
2301 "%s: cid %d\n", __func__, mpts->mpts_connid),
2302 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2303
39236c6e
A
2304 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2305}
2306
2307/*
2308 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2309 */
2310static ev_ret_t
3e170ce0
A
2311mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts,
2312 uint64_t *p_mpsofilt_hint)
39236c6e 2313{
3e170ce0 2314#pragma unused(p_mpsofilt_hint)
39236c6e
A
2315 struct socket *mp_so, *so;
2316 struct mptcb *mp_tp;
2317 boolean_t linger;
2318
2319 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2320 MPTS_LOCK_ASSERT_HELD(mpts);
2321 VERIFY(mpte->mpte_mppcb != NULL);
2322 mp_so = mpte->mpte_mppcb->mpp_socket;
2323 mp_tp = mpte->mpte_mptcb;
2324 so = mpts->mpts_socket;
2325
2326 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2327 !(mp_so->so_flags & SOF_PCBCLEARING));
2328
3e170ce0
A
2329 mptcplog((LOG_NOTICE, "MPTCP Events: "
2330 "%s: cid %d [linger %s]\n", __func__,
2331 mpts->mpts_connid, (linger ? "YES" : "NO")),
2332 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2333
2334 if (mpts->mpts_soerror == 0)
2335 mpts->mpts_soerror = ETIMEDOUT;
2336
2337 /*
2338 * The subflow connection has timed out.
2339 *
2340 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2341 * client if the MPTCP connection has not been established. Otherwise
2342 * drop it.
2343 */
2344 mptcp_subflow_disconnect(mpte, mpts, !linger);
2345
2346 MPT_LOCK(mp_tp);
2347 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2348 mp_so->so_error = ETIMEDOUT;
2349 }
2350 MPT_UNLOCK(mp_tp);
2351
2352 /*
2353 * Keep the subflow socket around, unless the MPTCP socket has
2354 * been detached or the subflow has been disconnected explicitly,
2355 * in which case it should be deleted right away.
2356 */
2357 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2358}
2359
2360/*
2361 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2362 */
2363static ev_ret_t
3e170ce0
A
2364mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
2365 uint64_t *p_mpsofilt_hint)
39236c6e 2366{
3e170ce0 2367#pragma unused(p_mpsofilt_hint)
39236c6e
A
2368 struct socket *mp_so, *so;
2369 struct mptcb *mp_tp;
2370 boolean_t linger;
2371 struct tcpcb *tp = NULL;
2372
2373 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2374 MPTS_LOCK_ASSERT_HELD(mpts);
2375
2376 VERIFY(mpte->mpte_mppcb != NULL);
2377 mp_so = mpte->mpte_mppcb->mpp_socket;
2378 mp_tp = mpte->mpte_mptcb;
2379 so = mpts->mpts_socket;
2380
2381 /* Not grabbing socket lock as t_local_aid is write once only */
2382 tp = intotcpcb(sotoinpcb(so));
2383 /*
2384 * This overwrites any previous mpte_lost_aid to avoid storing
2385 * too much state when the typical case has only two subflows.
2386 */
2387 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2388 mpte->mpte_lost_aid = tp->t_local_aid;
2389
2390 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2391 !(mp_so->so_flags & SOF_PCBCLEARING));
2392
3e170ce0
A
2393 mptcplog((LOG_DEBUG, "MPTCP Events: "
2394 "%s cid %d [linger %s]\n", __func__,
2395 mpts->mpts_connid, (linger ? "YES" : "NO")),
2396 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2397
2398 if (mpts->mpts_soerror == 0)
2399 mpts->mpts_soerror = EADDRNOTAVAIL;
2400
2401 /*
2402 * The subflow connection has lost its source address.
2403 *
2404 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2405 * client if the MPTCP connection has not been established. If it
 2406 * has been established with one subflow, we keep the MPTCP
 2407 * connection valid without any subflows until the application
 2408 * closes it. This lets the TCP connection manager decide whether
 2409 * to close it or not, as it also reacts to reachability changes.
2410 */
2411 mptcp_subflow_disconnect(mpte, mpts, !linger);
2412
2413 MPT_LOCK(mp_tp);
2414 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2415 (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2416 mp_so->so_error = EADDRNOTAVAIL;
2417 }
2418 MPT_UNLOCK(mp_tp);
2419
2420 /*
2421 * Keep the subflow socket around, unless the MPTCP socket has
2422 * been detached or the subflow has been disconnected explicitly,
2423 * in which case it should be deleted right away.
2424 */
2425 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2426}
2427
fe8ab488
A
2428/*
2429 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
2430 * indicates that the remote side sent a Data FIN
2431 */
2432static ev_ret_t
3e170ce0
A
2433mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2434 uint64_t *p_mpsofilt_hint)
fe8ab488
A
2435{
2436 struct socket *so, *mp_so;
2437 struct mptcb *mp_tp;
2438
2439 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2440 MPTS_LOCK_ASSERT_HELD(mpts);
2441 mp_so = mpte->mpte_mppcb->mpp_socket;
2442 so = mpts->mpts_socket;
2443 mp_tp = mpte->mpte_mptcb;
2444
3e170ce0
A
2445 mptcplog((LOG_DEBUG, "MPTCP Events: "
2446 "%s: cid %d\n", __func__, mpts->mpts_connid),
2447 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
2448
2449 /*
2450 * We got a Data FIN for the MPTCP connection.
2451 * The FIN may arrive with data. The data is handed up to the
2452 * mptcp socket and the user is notified so that it may close
2453 * the socket if needed.
2454 */
2455 MPT_LOCK(mp_tp);
2456 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3e170ce0 2457 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
fe8ab488
A
2458 }
2459 MPT_UNLOCK(mp_tp);
2460 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2461}
2462
39236c6e
A
2463/*
2464 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2465 */
2466static ev_ret_t
3e170ce0
A
2467mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
2468 uint64_t *p_mpsofilt_hint)
39236c6e
A
2469{
2470 struct mptsub *mpts_alt = NULL;
2471 struct socket *so = NULL;
2472 struct socket *mp_so;
2473 int altpath_exists = 0;
2474
2475 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2476 MPTS_LOCK_ASSERT_HELD(mpts);
2477 mp_so = mpte->mpte_mppcb->mpp_socket;
3e170ce0
A
2478 mptcplog((LOG_NOTICE, "MPTCP Events: "
2479 "%s: mp_so 0x%llx\n", __func__,
2480 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2481 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2482
2483 MPTS_UNLOCK(mpts);
3e170ce0 2484 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e
A
2485
2486 /*
2487 * If there is no alternate eligible subflow, ignore the
2488 * failover hint.
2489 */
2490 if (mpts_alt == NULL) {
3e170ce0
A
2491 mptcplog((LOG_WARNING, "MPTCP Events: "
2492 "%s: no alternate path\n", __func__),
2493 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2494
fe8ab488
A
2495 if (mptcp_delayed_subf_start) {
2496 mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
2497 if (mpts_alt != NULL) {
2498 MPTS_LOCK(mpts_alt);
2499 (void) mptcp_subflow_soconnectx(mpte,
2500 mpts_alt);
2501 MPTS_UNLOCK(mpts_alt);
2502 }
2503 }
39236c6e
A
2504 MPTS_LOCK(mpts);
2505 goto done;
2506 }
2507 MPTS_LOCK(mpts_alt);
2508 altpath_exists = 1;
2509 so = mpts_alt->mpts_socket;
2510 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2511 socket_lock(so, 1);
fe8ab488
A
2512 /* All data acknowledged and no RTT spike */
2513 if ((so->so_snd.sb_cc == 0) &&
2514 (mptcp_no_rto_spike(so))) {
39236c6e
A
2515 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2516 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2517 } else {
2518 /* no alternate path available */
2519 altpath_exists = 0;
2520 }
2521 socket_unlock(so, 1);
2522 }
2523 if (altpath_exists) {
3e170ce0
A
2524 mptcplog((LOG_INFO, "MPTCP Events: "
2525 "%s: cid = %d\n",
2526 __func__, mpts_alt->mpts_connid),
2527 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 2528 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3e170ce0 2529 mpts_alt->mpts_peerswitch = 0;
39236c6e
A
2530 struct mptcb *mp_tp = mpte->mpte_mptcb;
2531 /* Bring the subflow's notion of snd_nxt into the send window */
2532 MPT_LOCK(mp_tp);
2533 mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2534 MPT_UNLOCK(mp_tp);
2535 mpte->mpte_active_sub = mpts_alt;
2536 socket_lock(so, 1);
2537 sowwakeup(so);
2538 socket_unlock(so, 1);
2539 }
2540 MPTS_UNLOCK(mpts_alt);
2541
2542 if (altpath_exists) {
3e170ce0
A
2543 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
2544 mptcplog((LOG_NOTICE, "MPTCP Events: "
2545 "%s: mp_so 0x%llx switched from "
39236c6e
A
2546 "%d to %d\n", __func__,
2547 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
2548 mpts->mpts_connid, mpts_alt->mpts_connid),
2549 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2550 tcpstat.tcps_mp_switches++;
2551 }
2552
2553 MPTS_LOCK(mpts);
2554 if (altpath_exists) {
2555 mpts->mpts_flags |= MPTSF_FAILINGOVER;
2556 mpts->mpts_flags &= ~MPTSF_ACTIVE;
2557 } else {
3e170ce0
A
2558 mptcplog((LOG_DEBUG, "MPTCP Events %s: no alt cid = %d\n",
2559 __func__, mpts->mpts_connid),
2560 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 2561done:
39236c6e
A
2562 so = mpts->mpts_socket;
2563 socket_lock(so, 1);
2564 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2565 socket_unlock(so, 1);
2566 }
39236c6e
A
2567 MPTS_LOCK_ASSERT_HELD(mpts);
2568 return (MPTS_EVRET_OK);
2569}
2570
2571/*
2572 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2573 */
2574static ev_ret_t
3e170ce0
A
2575mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
2576 uint64_t *p_mpsofilt_hint)
39236c6e
A
2577{
2578 struct socket *mp_so, *so;
2579 struct mptcb *mp_tp;
2580 boolean_t linger;
2581
2582 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2583 MPTS_LOCK_ASSERT_HELD(mpts);
2584 VERIFY(mpte->mpte_mppcb != NULL);
2585 mp_so = mpte->mpte_mppcb->mpp_socket;
2586 mp_tp = mpte->mpte_mptcb;
2587 so = mpts->mpts_socket;
2588
2589 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2590 !(mp_so->so_flags & SOF_PCBCLEARING));
2591
3e170ce0
A
2592 mptcplog((LOG_DEBUG, "MPTCP Events: "
2593 "%s: cid %d [linger %s]\n", __func__,
2594 mpts->mpts_connid, (linger ? "YES" : "NO")),
2595 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2596
2597 if (mpts->mpts_soerror == 0)
2598 mpts->mpts_soerror = EHOSTUNREACH;
2599
2600 /*
2601 * The subflow connection cannot use the outgoing interface.
2602 *
2603 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2604 * client if the MPTCP connection has not been established. If it
2605 * has been established, let the upper layer call disconnectx.
2606 */
2607 mptcp_subflow_disconnect(mpte, mpts, !linger);
3e170ce0 2608 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED;
39236c6e
A
2609
2610 MPT_LOCK(mp_tp);
2611 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2612 mp_so->so_error = EHOSTUNREACH;
2613 }
2614 MPT_UNLOCK(mp_tp);
2615
39236c6e
A
2616 /*
2617 * Keep the subflow socket around, unless the MPTCP socket has
2618 * been detached or the subflow has been disconnected explicitly,
2619 * in which case it should be deleted right away.
2620 */
2621 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2622}
2623
2624/*
2625 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2626 */
2627static ev_ret_t
3e170ce0
A
2628mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts,
2629 uint64_t *p_mpsofilt_hint)
39236c6e 2630{
3e170ce0 2631#pragma unused(p_mpsofilt_hint)
39236c6e
A
2632 struct socket *so;
2633
2634 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2635 MPTS_LOCK_ASSERT_HELD(mpts);
2636
2637 so = mpts->mpts_socket;
2638
2639 /* the subflow connection is being flow controlled */
2640 mpts->mpts_flags |= MPTSF_SUSPENDED;
2641
3e170ce0
A
2642 mptcplog((LOG_DEBUG, "MPTCP Events: "
2643 "%s: cid %d\n", __func__,
2644 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2645
2646 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2647}
2648
2649/*
2650 * Handle SO_FILT_HINT_RESUME subflow socket event.
2651 */
2652static ev_ret_t
3e170ce0
A
2653mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts,
2654 uint64_t *p_mpsofilt_hint)
39236c6e 2655{
3e170ce0 2656#pragma unused(p_mpsofilt_hint)
39236c6e
A
2657 struct socket *so;
2658
2659 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2660 MPTS_LOCK_ASSERT_HELD(mpts);
2661
2662 so = mpts->mpts_socket;
2663
2664 /* the subflow connection is no longer flow controlled */
2665 mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2666
3e170ce0
A
2667 mptcplog((LOG_DEBUG, "MPTCP Events: "
2668 "%s: cid %d\n", __func__, mpts->mpts_connid),
2669 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
2670
2671 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2672}
2673
2674/*
2675 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2676 */
2677static ev_ret_t
3e170ce0
A
2678mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
2679 uint64_t *p_mpsofilt_hint)
39236c6e
A
2680{
2681 char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
2682 struct sockaddr_entry *src_se, *dst_se;
2683 struct sockaddr_storage src;
2684 struct socket *mp_so, *so;
2685 struct mptcb *mp_tp;
2686 struct ifnet *outifp;
2687 int af, error = 0;
2688 boolean_t mpok = FALSE;
3e170ce0
A
2689 boolean_t cell = FALSE;
2690 boolean_t wifi = FALSE;
2691 boolean_t wired = FALSE;
39236c6e
A
2692
2693 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2694 VERIFY(mpte->mpte_mppcb != NULL);
2695 mp_so = mpte->mpte_mppcb->mpp_socket;
2696 mp_tp = mpte->mpte_mptcb;
2697
2698 MPTS_LOCK_ASSERT_HELD(mpts);
2699 so = mpts->mpts_socket;
2700 af = mpts->mpts_family;
2701
2702 if (mpts->mpts_flags & MPTSF_CONNECTED)
2703 return (MPTS_EVRET_OK);
2704
2705 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2706 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
490019cf 2707 socket_lock(so, 0);
fe8ab488
A
2708 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2709 (so->so_state & SS_ISCONNECTED)) {
3e170ce0
A
2710 mptcplog((LOG_DEBUG, "MPTCP Events: "
2711 "%s: cid %d disconnect before tcp connect\n",
2712 __func__, mpts->mpts_connid),
2713 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
2714 (void) soshutdownlock(so, SHUT_RD);
2715 (void) soshutdownlock(so, SHUT_WR);
2716 (void) sodisconnectlocked(so);
2717 }
2718 socket_unlock(so, 0);
39236c6e
A
2719 return (MPTS_EVRET_OK);
2720 }
2721
2722 /*
2723 * The subflow connection has been connected. Find out whether it
2724 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
2725 *
2726 * a. If MPTCP connection is not yet established, then this must be
2727 * the first subflow connection. If MPTCP failed to negotiate,
2728 * indicate to the MPTCP socket client via EPROTO, that the
2729 * underlying TCP connection may be peeled off via peeloff(2).
2730 * Otherwise, mark the MPTCP socket as connected.
2731 *
2732 * b. If MPTCP connection has been established, then this must be
2733 * one of the subsequent subflow connections. If MPTCP failed
2734 * to negotiate, disconnect the connection since peeloff(2)
2735 * is no longer possible.
2736 *
2737 * Right now, we simply unblock any waiters at the MPTCP socket layer
2738 * if the MPTCP connection has not been established.
2739 */
2740 socket_lock(so, 0);
2741
2742 if (so->so_state & SS_ISDISCONNECTED) {
2743 /*
2744 * With MPTCP joins, a connection is connected at the subflow
2745 * level, but the 4th ACK from the server elevates the MPTCP
490019cf
A
2746 * subflow to connected state. So there is a small window
2747 * where the subflow could get disconnected before the
39236c6e
A
2748 * connected event is processed.
2749 */
2750 socket_unlock(so, 0);
2751 return (MPTS_EVRET_OK);
2752 }
2753
2754 mpts->mpts_soerror = 0;
2755 mpts->mpts_flags &= ~MPTSF_CONNECTING;
2756 mpts->mpts_flags |= MPTSF_CONNECTED;
490019cf
A
2757
2758 if (!(so->so_flags1 & SOF1_DATA_IDEMPOTENT))
2759 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
2760
2761 struct tcpcb *tp = sototcpcb(so);
2762 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
39236c6e
A
2763 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2764
490019cf
A
2765 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
2766
39236c6e
A
2767 VERIFY(mpts->mpts_dst_sl != NULL);
2768 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
2769 VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
2770 dst_se->se_addr->sa_family == af);
2771
2772 VERIFY(mpts->mpts_src_sl != NULL);
2773 src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
2774 VERIFY(src_se != NULL && src_se->se_addr != NULL &&
2775 src_se->se_addr->sa_family == af);
2776
2777 /* get/check source IP address */
2778 switch (af) {
2779 case AF_INET: {
2780 error = in_getsockaddr_s(so, &src);
2781 if (error == 0) {
2782 struct sockaddr_in *ms = SIN(src_se->se_addr);
2783 struct sockaddr_in *s = SIN(&src);
2784
2785 VERIFY(s->sin_len == ms->sin_len);
2786 VERIFY(ms->sin_family == AF_INET);
2787
2788 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2789 bcmp(&ms->sin_addr, &s->sin_addr,
2790 sizeof (ms->sin_addr)) != 0) {
3e170ce0
A
2791 mptcplog((LOG_ERR, "MPTCP Events: "
2792 "%s: cid %d local "
39236c6e
A
2793 "address %s (expected %s)\n", __func__,
2794 mpts->mpts_connid, inet_ntop(AF_INET,
2795 (void *)&s->sin_addr.s_addr, buf0,
2796 sizeof (buf0)), inet_ntop(AF_INET,
2797 (void *)&ms->sin_addr.s_addr, buf1,
3e170ce0
A
2798 sizeof (buf1))),
2799 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2800 }
2801 bcopy(s, ms, sizeof (*s));
2802 }
2803 break;
2804 }
2805#if INET6
2806 case AF_INET6: {
2807 error = in6_getsockaddr_s(so, &src);
2808 if (error == 0) {
2809 struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
2810 struct sockaddr_in6 *s = SIN6(&src);
2811
2812 VERIFY(s->sin6_len == ms->sin6_len);
2813 VERIFY(ms->sin6_family == AF_INET6);
2814
2815 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2816 bcmp(&ms->sin6_addr, &s->sin6_addr,
2817 sizeof (ms->sin6_addr)) != 0) {
3e170ce0
A
2818 mptcplog((LOG_ERR, "MPTCP Events: "
2819 "%s: cid %d local "
39236c6e
A
2820 "address %s (expected %s)\n", __func__,
2821 mpts->mpts_connid, inet_ntop(AF_INET6,
2822 (void *)&s->sin6_addr, buf0,
2823 sizeof (buf0)), inet_ntop(AF_INET6,
2824 (void *)&ms->sin6_addr, buf1,
3e170ce0
A
2825 sizeof (buf1))),
2826 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2827 }
2828 bcopy(s, ms, sizeof (*s));
2829 }
2830 break;
2831 }
2832#endif /* INET6 */
2833 default:
2834 VERIFY(0);
2835 /* NOTREACHED */
2836 }
2837
2838 if (error != 0) {
3e170ce0
A
2839 mptcplog((LOG_ERR, "MPTCP Events "
2840 "%s: cid %d getsockaddr failed (%d)\n",
2841 __func__, mpts->mpts_connid, error),
2842 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2843 }
2844
2845 /* get/verify the outbound interface */
2846 outifp = sotoinpcb(so)->inp_last_outifp; /* could be NULL */
2847 if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2848 VERIFY(mpts->mpts_outif != NULL);
2849 if (mpts->mpts_outif != outifp) {
3e170ce0 2850 mptcplog((LOG_ERR, "MPTCP Events: %s: cid %d outif %s "
39236c6e
A
2851 "(expected %s)\n", __func__, mpts->mpts_connid,
2852 ((outifp != NULL) ? outifp->if_xname : "NULL"),
3e170ce0
A
2853 mpts->mpts_outif->if_xname),
2854 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2855
39236c6e
A
2856 if (outifp == NULL)
2857 outifp = mpts->mpts_outif;
2858 }
2859 } else {
2860 mpts->mpts_outif = outifp;
2861 }
2862
3e170ce0
A
2863 mpts->mpts_srtt = (intotcpcb(sotoinpcb(so)))->t_srtt;
2864 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(so)))->t_rxtcur;
2865 mpts->mpts_maxseg = (intotcpcb(sotoinpcb(so)))->t_maxseg;
2866
2867 cell = IFNET_IS_CELLULAR(mpts->mpts_outif);
2868 wifi = (!cell && IFNET_IS_WIFI(mpts->mpts_outif));
2869 wired = (!wifi && IFNET_IS_WIRED(mpts->mpts_outif));
2870
2871 if (cell)
2872 mpts->mpts_linktype |= MPTSL_CELL;
2873 else if (wifi)
2874 mpts->mpts_linktype |= MPTSL_WIFI;
2875 else if (wired)
2876 mpts->mpts_linktype |= MPTSL_WIRED;
2877
39236c6e
A
2878 socket_unlock(so, 0);
2879
3e170ce0
A
2880 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: cid %d "
2881 "establishment srtt %d \n", __func__,
2882 mpts->mpts_connid, (mpts->mpts_srtt >> 5)),
2883 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
2884
2885
2886 mptcplog((LOG_DEBUG, "MPTCP Socket: "
2887 "%s: cid %d outif %s %s[%d] -> %s[%d] "
39236c6e
A
2888 "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2889 outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
2890 (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
2891 (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
2892 ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
2893 ntohs(SIN6(src_se->se_addr)->sin6_port)),
2894 inet_ntop(af, ((af == AF_INET) ?
2895 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
2896 (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
2897 ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
2898 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
2899 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
3e170ce0
A
2900 "MPTCP capable" : "a regular TCP")),
2901 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
2902
2903 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2904 MPTS_UNLOCK(mpts);
2905
3e170ce0 2906 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
2907
2908 MPT_LOCK(mp_tp);
2909 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2910 /* case (a) above */
2911 if (!mpok) {
2912 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2913 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2914 MPT_UNLOCK(mp_tp);
2915 } else {
490019cf
A
2916 MPT_UNLOCK(mp_tp);
2917 mptcplog((LOG_DEBUG, "MPTCP State: "
2918 "MPTCPS_ESTABLISHED for mp_so 0x%llx \n",
2919 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2920 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
2921 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2922 mpte->mpte_associd = mpts->mpts_connid;
2923 DTRACE_MPTCP2(state__change,
2924 struct mptcb *, mp_tp,
2925 uint32_t, 0 /* event */);
2926
2927 (void) mptcp_setconnorder(mpte, mpts->mpts_connid, 1);
2928 soisconnected(mp_so);
39236c6e
A
2929 }
2930 MPTS_LOCK(mpts);
2931 if (mpok) {
39236c6e
A
2932 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2933 mpte->mpte_nummpcapflows++;
2934 MPT_LOCK_SPIN(mp_tp);
490019cf
A
2935 /* With TFO, sndnxt may be initialized earlier */
2936 if (mpts->mpts_sndnxt == 0)
2937 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
39236c6e
A
2938 MPT_UNLOCK(mp_tp);
2939 }
2940 } else if (mpok) {
2941 MPT_UNLOCK(mp_tp);
fe8ab488
A
2942 if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
2943 /* Experimental code, disabled by default. */
2944 sorwakeup(mp_so);
2945 sowwakeup(mp_so);
2946 }
39236c6e
A
2947 /*
2948 * case (b) above
2949 * In case of additional flows, the MPTCP socket is not
2950 * MPTSF_MP_CAPABLE until an ACK is received from server
2951 * for 3-way handshake. TCP would have guaranteed that this
2952 * is an MPTCP subflow.
2953 */
2954 MPTS_LOCK(mpts);
2955 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
fe8ab488 2956 mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
39236c6e 2957 mpte->mpte_nummpcapflows++;
39236c6e 2958 MPT_LOCK_SPIN(mp_tp);
fe8ab488
A
2959 /* With Fastjoin, sndnxt is updated before connected_ev */
2960 if (mpts->mpts_sndnxt == 0) {
2961 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
490019cf 2962 mpts->mpts_rel_seq = 1;
fe8ab488 2963 }
39236c6e 2964 MPT_UNLOCK(mp_tp);
fe8ab488
A
2965 mptcp_output_needed(mpte, mpts);
2966 } else {
2967 MPT_UNLOCK(mp_tp);
2968 MPTS_LOCK(mpts);
39236c6e 2969 }
fe8ab488 2970
39236c6e
A
2971 MPTS_LOCK_ASSERT_HELD(mpts);
2972
2973 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2974}
2975
2976/*
2977 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2978 */
2979static ev_ret_t
3e170ce0
A
2980mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
2981 uint64_t *p_mpsofilt_hint)
39236c6e
A
2982{
2983 struct socket *mp_so, *so;
2984 struct mptcb *mp_tp;
2985 boolean_t linger;
2986
2987 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2988 MPTS_LOCK_ASSERT_HELD(mpts);
2989 VERIFY(mpte->mpte_mppcb != NULL);
2990 mp_so = mpte->mpte_mppcb->mpp_socket;
2991 mp_tp = mpte->mpte_mptcb;
2992 so = mpts->mpts_socket;
2993
2994 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2995 !(mp_so->so_flags & SOF_PCBCLEARING));
2996
3e170ce0
A
2997 mptcplog((LOG_DEBUG, "MPTCP Events: "
2998 "%s: cid %d [linger %s]\n", __func__,
2999 mpts->mpts_connid, (linger ? "YES" : "NO")),
3000 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3001
3002 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3003 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3004
3005 /*
3006 * Clear flags that are used by getconninfo to return state.
fe8ab488 3007 * Retain like MPTSF_DELETEOK for internal purposes.
39236c6e
A
3008 */
3009 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3010 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3011 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
3012 MPTSF_SUSPENDED|MPTSF_ACTIVE);
3013 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3014
3015 /*
3016 * The subflow connection has been disconnected.
3017 *
3018 * Right now, we simply unblock any waiters at the MPTCP socket layer
3019 * if the MPTCP connection has not been established.
3020 */
3e170ce0 3021 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3022
3023 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3024 mpte->mpte_nummpcapflows--;
fe8ab488
A
3025 if (mpte->mpte_active_sub == mpts) {
3026 mpte->mpte_active_sub = NULL;
3e170ce0
A
3027 mptcplog((LOG_DEBUG, "MPTCP Events: "
3028 "%s: resetting active subflow \n",
3029 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3030 }
39236c6e
A
3031 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3032 }
3033
3034 MPT_LOCK(mp_tp);
3035 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
3036 MPT_UNLOCK(mp_tp);
3e170ce0 3037 MPTS_UNLOCK(mpts);
39236c6e 3038 soisdisconnected(mp_so);
3e170ce0 3039 MPTS_LOCK(mpts);
39236c6e
A
3040 } else {
3041 MPT_UNLOCK(mp_tp);
3042 }
3043
39236c6e
A
3044 /*
3045 * The underlying subflow socket has been disconnected;
3046 * it is no longer useful to us. Keep the subflow socket
3047 * around, unless the MPTCP socket has been detached or
3048 * the subflow has been disconnected explicitly, in which
3049 * case it should be deleted right away.
3050 */
3051 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3052}
3053
3054/*
3055 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3056 */
3057static ev_ret_t
3e170ce0
A
3058mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3059 uint64_t *p_mpsofilt_hint)
39236c6e
A
3060{
3061 struct socket *mp_so, *so;
3062 struct mptcb *mp_tp;
3e170ce0 3063 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e
A
3064
3065 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3066 VERIFY(mpte->mpte_mppcb != NULL);
3067 mp_so = mpte->mpte_mppcb->mpp_socket;
3068 mp_tp = mpte->mpte_mptcb;
3069
3070 MPTS_LOCK_ASSERT_HELD(mpts);
3071 so = mpts->mpts_socket;
3072
3073 socket_lock(so, 0);
3074 MPT_LOCK(mp_tp);
3075
3076 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3077 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3078 else
3079 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3080
3081 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3082 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3083 goto done;
3084 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3085 }
3086 else
3087 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3088
3089 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3090 mpts->mpts_flags |= MPTSF_MP_READY;
3091 else
3092 mpts->mpts_flags &= ~MPTSF_MP_READY;
3093
3094 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3095 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3096 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3097 }
3098
3099 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3100 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3101 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3e170ce0
A
3102 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3103 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3104 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3105 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3106 ret = MPTS_EVRET_CONNECT_PENDING;
3e170ce0
A
3107 } else {
3108 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3109 SO_FILT_HINT_CONNINFO_UPDATED;
39236c6e
A
3110 }
3111
3e170ce0
A
3112 mptcplog((LOG_DEBUG, "MPTCP Events: "
3113 "%s: mp_so 0x%llx mpt_flags=%b cid %d "
39236c6e
A
3114 "mptsf=%b\n", __func__,
3115 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
3116 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3e170ce0
A
3117 mpts->mpts_flags, MPTSF_BITS),
3118 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3119
39236c6e
A
3120done:
3121 MPT_UNLOCK(mp_tp);
3122 socket_unlock(so, 0);
39236c6e
A
3123 return (ret);
3124}
3125
3126/*
3127 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3128 */
3129static ev_ret_t
3e170ce0
A
3130mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3131 uint64_t *p_mpsofilt_hint)
39236c6e
A
3132{
3133 struct socket *mp_so, *so;
3134 struct mptcb *mp_tp;
3135 boolean_t linger;
3136
3137
3138 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3139 MPTS_LOCK_ASSERT_HELD(mpts);
3140 VERIFY(mpte->mpte_mppcb != NULL);
3141 mp_so = mpte->mpte_mppcb->mpp_socket;
3142 mp_tp = mpte->mpte_mptcb;
3143 so = mpts->mpts_socket;
3144
3145 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
3146 !(mp_so->so_flags & SOF_PCBCLEARING));
3147
3148 if (mpts->mpts_soerror == 0)
3149 mpts->mpts_soerror = ECONNABORTED;
3150
39236c6e
A
3151 /* We got an invalid option or a fast close */
3152 socket_lock(so, 0);
3153 struct tcptemp *t_template;
3154 struct inpcb *inp = sotoinpcb(so);
3155 struct tcpcb *tp = NULL;
3156
3157 tp = intotcpcb(inp);
fe8ab488 3158 so->so_error = ECONNABORTED;
39236c6e
A
3159
3160 t_template = tcp_maketemplate(tp);
3161 if (t_template) {
fe8ab488 3162 struct tcp_respond_args tra;
39236c6e 3163
fe8ab488 3164 bzero(&tra, sizeof(tra));
39236c6e 3165 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 3166 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 3167 else
fe8ab488
A
3168 tra.ifscope = IFSCOPE_NONE;
3169 tra.awdl_unrestricted = 1;
39236c6e
A
3170
3171 tcp_respond(tp, t_template->tt_ipgen,
3172 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 3173 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e 3174 (void) m_free(dtom(t_template));
3e170ce0
A
3175 mptcplog((LOG_DEBUG, "MPTCP Events: "
3176 "%s: mp_so 0x%llx cid %d \n",
39236c6e 3177 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
 3178 mpts->mpts_connid),
3179 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3180 }
3181 socket_unlock(so, 0);
3182 mptcp_subflow_disconnect(mpte, mpts, !linger);
39236c6e 3183
3e170ce0
A
3184 *p_mpsofilt_hint |= (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
3185
3186 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
3187 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e
A
3188
3189 MPT_LOCK(mp_tp);
fe8ab488
A
3190 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) ||
3191 (mp_tp->mpt_state == MPTCPS_FASTCLOSE_WAIT)) {
39236c6e
A
3192 mp_so->so_error = ECONNABORTED;
3193 }
3e170ce0
A
3194 /*
3195 * Ideally there should be a state transition for when a FASTCLOSE
3196 * is received. Right now we keep the connection in MPTCPS_ESTABLISHED
3197 * state and only go to terminal state when the user level code calls
3198 * close after processing the SO_FILT_HINT_CONNRESET event.
3199 */
3200 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3201 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
39236c6e
A
3202 MPT_UNLOCK(mp_tp);
3203
39236c6e
A
3204 /*
3205 * Keep the subflow socket around unless the subflow has been
3206 * disconnected explicitly.
3207 */
3208 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3209}
3210
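/*
 * Handle the fast-join subflow socket event: if no MP-capable subflow is
 * carrying data yet, make this subflow the active one and flag it so that
 * data may be sent along with the join (MPTSF_FASTJ_SEND).
 */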
fe8ab488 3211static ev_ret_t
3e170ce0
A
3212mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts,
3213 uint64_t *p_mpsofilt_hint)
fe8ab488 3214{
3e170ce0 3215#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3216 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3217 MPTS_LOCK_ASSERT_HELD(mpts);
3218 VERIFY(mpte->mpte_mppcb != NULL);
3219
3220 if (mpte->mpte_nummpcapflows == 0) {
3221 struct mptcb *mp_tp = mpte->mpte_mptcb;
3e170ce0
A
3222 mptcplog((LOG_DEBUG,"MPTCP Events: %s: %llx %llx \n",
3223 __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt),
3224 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3225
fe8ab488
A
3226 mpte->mpte_active_sub = mpts;
3227 mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
3228 MPT_LOCK(mp_tp);
3229 /*
3230 * If mptcp_subflow_output is called before fastjoin_ev
3231 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
3232 * and further mpts->mpts_sndnxt is incremented by len copied.
3233 */
3234 if (mpts->mpts_sndnxt == 0) {
3235 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
fe8ab488
A
3236 }
3237 MPT_UNLOCK(mp_tp);
3238 }
3239
3240 return (MPTS_EVRET_OK);
3241}
3242
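/*
 * Handle the event indicating that this subflow may now be deleted: mark it
 * MPTSF_DELETEOK and request deletion right away if it has already been
 * disconnected.
 */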
3243static ev_ret_t
3e170ce0
A
3244mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts,
3245 uint64_t *p_mpsofilt_hint)
fe8ab488 3246{
3e170ce0 3247#pragma unused(p_mpsofilt_hint)
fe8ab488
A
3248 MPTE_LOCK_ASSERT_HELD(mpte);
3249 MPTS_LOCK_ASSERT_HELD(mpts);
3250 VERIFY(mpte->mpte_mppcb != NULL);
3e170ce0
A
3251
3252 mptcplog((LOG_DEBUG, "MPTCP Events: "
3253 "%s cid %d\n", __func__, mpts->mpts_connid),
3254 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
3255
3256 mpts->mpts_flags |= MPTSF_DELETEOK;
3257 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3258 return (MPTS_EVRET_DELETE);
3259 else
3260 return (MPTS_EVRET_OK);
3261}
3262
39236c6e
A
3263static const char *
3264mptcp_evret2str(ev_ret_t ret)
3265{
3266 const char *c = "UNKNOWN";
3267
3268 switch (ret) {
3269 case MPTS_EVRET_DELETE:
3270 c = "MPTS_EVRET_DELETE";
3271 break;
3272 case MPTS_EVRET_CONNECT_PENDING:
3273 c = "MPTS_EVRET_CONNECT_PENDING";
3274 break;
3275 case MPTS_EVRET_DISCONNECT_FALLBACK:
3276 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3277 break;
3278 case MPTS_EVRET_OK:
3279 c = "MPTS_EVRET_OK";
3280 break;
3e170ce0 3281 default:
39236c6e
A
3282 break;
3283 }
3284 return (c);
3285}
3286
3287/*
3288 * Add a reference to a subflow structure; used by MPTS_ADDREF().
3289 */
3290void
3291mptcp_subflow_addref(struct mptsub *mpts, int locked)
3292{
3293 if (!locked)
3294 MPTS_LOCK(mpts);
3295 else
3296 MPTS_LOCK_ASSERT_HELD(mpts);
3297
3298 if (++mpts->mpts_refcnt == 0) {
3299 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
3300 /* NOTREACHED */
3301 }
3302 if (!locked)
3303 MPTS_UNLOCK(mpts);
3304}
3305
3306/*
3307 * Remove a reference held on a subflow structure; used by MPTS_REMREF().
3308 */
3309void
3310mptcp_subflow_remref(struct mptsub *mpts)
3311{
3312 MPTS_LOCK(mpts);
3313 if (mpts->mpts_refcnt == 0) {
3314 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
3315 /* NOTREACHED */
3316 }
3317 if (--mpts->mpts_refcnt > 0) {
3318 MPTS_UNLOCK(mpts);
3319 return;
3320 }
3321 /* callee will unlock and destroy lock */
3322 mptcp_subflow_free(mpts);
3323}
3324
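/*
 * Usage sketch (illustrative only, not part of the build): the typical
 * hold-use-release pattern around a subflow, mirroring what
 * mptcp_thread_dowork() does further below.  The "work" placeholder is
 * hypothetical; the point is the ADDREF/REMREF pairing that keeps the
 * mptsub alive while its lock may be dropped and retaken.
 *
 *	MPTS_LOCK(mpts);
 *	MPTS_ADDREF_LOCKED(mpts);	// take a reference while locked
 *	// ... work that may drop and re-acquire MPTS_LOCK ...
 *	MPTS_UNLOCK(mpts);
 *	MPTS_REMREF(mpts);		// drop it; may free the subflow
 */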
3325/*
3326 * Issues SOPT_SET on an MPTCP subflow socket; the socket must already be
3327 * locked, and the caller must ensure that the option may be issued on
3328 * subflow sockets (MPOF_SUBFLOW_OK flag).
3329 */
3330int
3331mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
3332 struct mptopt *mpo)
3333{
3334 struct socket *mp_so;
3335 struct sockopt sopt;
3336 char buf[32];
3337 int error;
3338
3339 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3340 mpo->mpo_flags &= ~MPOF_INTERIM;
3341
3342 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3343 mp_so = mpte->mpte_mppcb->mpp_socket;
3344
3345 bzero(&sopt, sizeof (sopt));
3346 sopt.sopt_dir = SOPT_SET;
3347 sopt.sopt_level = mpo->mpo_level;
3348 sopt.sopt_name = mpo->mpo_name;
3349 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3350 sopt.sopt_valsize = sizeof (int);
3351 sopt.sopt_p = kernproc;
3352
3353 error = sosetoptlock(so, &sopt, 0); /* already locked */
3354 if (error == 0) {
3e170ce0
A
3355 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3356 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3357 "val %d set successful\n", __func__,
3358 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3359 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3360 buf, sizeof (buf)), mpo->mpo_intval),
3361 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3362 } else {
3e170ce0
A
3363 mptcplog((LOG_ERR, "MPTCP Socket: "
3364 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3365 "val %d set error %d\n", __func__,
3366 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3367 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3368 buf, sizeof (buf)), mpo->mpo_intval, error),
3369 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3370 }
3371 return (error);
3372}
3373
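/*
 * Usage sketch (hypothetical, not part of the build): pushing an MP-level
 * option that was flagged as subflow-safe down to one subflow socket.
 * SO_KEEPALIVE is only an example option; the MP socket lock and the
 * subflow socket lock are assumed to be held as required above.
 *
 *	struct mptopt mpo;
 *
 *	bzero(&mpo, sizeof (mpo));
 *	mpo.mpo_flags = MPOF_SUBFLOW_OK;
 *	mpo.mpo_level = SOL_SOCKET;
 *	mpo.mpo_name = SO_KEEPALIVE;
 *	mpo.mpo_intval = 1;
 *	(void) mptcp_subflow_sosetopt(mpte, mpts->mpts_socket, &mpo);
 */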
3374/*
3375 * Issues SOPT_GET on an MPTCP subflow socket; the socket must already be
3376 * locked, and the caller must ensure that the option may be issued on
3377 * subflow sockets (MPOF_SUBFLOW_OK flag).
3378 */
3379int
3380mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
3381 struct mptopt *mpo)
3382{
3383 struct socket *mp_so;
3384 struct sockopt sopt;
3385 char buf[32];
3386 int error;
3387
3388 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3389 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3390 mp_so = mpte->mpte_mppcb->mpp_socket;
3391
3392 bzero(&sopt, sizeof (sopt));
3393 sopt.sopt_dir = SOPT_GET;
3394 sopt.sopt_level = mpo->mpo_level;
3395 sopt.sopt_name = mpo->mpo_name;
3396 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3397 sopt.sopt_valsize = sizeof (int);
3398 sopt.sopt_p = kernproc;
3399
3400 error = sogetoptlock(so, &sopt, 0); /* already locked */
3401 if (error == 0) {
3e170ce0
A
3402 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3403 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3404 "val %d get successful\n", __func__,
3405 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3406 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3e170ce0
A
3407 buf, sizeof (buf)), mpo->mpo_intval),
3408 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3409 } else {
3e170ce0
A
3410 mptcplog((LOG_ERR, "MPTCP Socket: "
3411 "%s: mp_so 0x%llx sopt %s get error %d\n",
39236c6e
A
3412 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3413 mptcp_sopt2str(mpo->mpo_level,
3e170ce0
A
3414 mpo->mpo_name, buf, sizeof (buf)), error),
3415 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
3416 }
3417 return (error);
3418}
3419
3420
3421/*
3422 * MPTCP garbage collector.
3423 *
3424 * This routine is called by the MP domain's on-demand periodic callout,
3425 * which is triggered when an MPTCP socket is closed. The callout repeats
3426 * as long as this routine returns a non-zero value.
3427 */
3428static uint32_t
3429mptcp_gc(struct mppcbinfo *mppi)
3430{
3431 struct mppcb *mpp, *tmpp;
3432 uint32_t active = 0;
3433
3434 lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
3435
39236c6e
A
3436 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
3437 struct socket *mp_so;
3438 struct mptses *mpte;
3439 struct mptcb *mp_tp;
3440
3441 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3442 mp_so = mpp->mpp_socket;
3443 VERIFY(mp_so != NULL);
3444 mpte = mptompte(mpp);
3445 VERIFY(mpte != NULL);
3446 mp_tp = mpte->mpte_mptcb;
3447 VERIFY(mp_tp != NULL);
3448
3e170ce0
A
3449 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3450 "%s: mp_so 0x%llx found "
39236c6e
A
3451 "(u=%d,r=%d,s=%d)\n", __func__,
3452 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3e170ce0
A
3453 mp_so->so_retaincnt, mpp->mpp_state),
3454 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3455
3456 if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
3e170ce0
A
3457 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3458 "%s: mp_so 0x%llx skipped "
39236c6e
A
3459 "(u=%d,r=%d)\n", __func__,
3460 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3461 mp_so->so_usecount, mp_so->so_retaincnt),
3462 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3463 active++;
3464 continue;
3465 }
3466
3467 /* check again under the lock */
3468 if (mp_so->so_usecount > 1) {
3469 boolean_t wakeup = FALSE;
3470 struct mptsub *mpts, *tmpts;
3471
3e170ce0
A
3472 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3473 "%s: mp_so 0x%llx skipped "
39236c6e
A
3474 "[u=%d,r=%d] %d %d\n", __func__,
3475 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3476 mp_so->so_usecount, mp_so->so_retaincnt,
3477 mp_tp->mpt_gc_ticks,
3e170ce0
A
3478 mp_tp->mpt_state),
3479 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3480
39236c6e
A
3481 MPT_LOCK(mp_tp);
3482 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3483 if (mp_tp->mpt_gc_ticks > 0)
3484 mp_tp->mpt_gc_ticks--;
3485 if (mp_tp->mpt_gc_ticks == 0) {
3486 wakeup = TRUE;
3487 if (mp_tp->mpt_localkey != NULL) {
3488 mptcp_free_key(
3489 mp_tp->mpt_localkey);
3490 mp_tp->mpt_localkey = NULL;
3491 }
3492 }
3493 }
3494 MPT_UNLOCK(mp_tp);
3495 if (wakeup) {
3496 TAILQ_FOREACH_SAFE(mpts,
3497 &mpte->mpte_subflows, mpts_entry, tmpts) {
3498 MPTS_LOCK(mpts);
3499 mpts->mpts_flags |= MPTSF_DELETEOK;
3500 if (mpts->mpts_soerror == 0)
3501 mpts->mpts_soerror = ETIMEDOUT;
3502 mptcp_subflow_eupcall(mpts->mpts_socket,
3503 mpts, SO_FILT_HINT_DISCONNECTED);
3504 MPTS_UNLOCK(mpts);
3505 }
3506 }
3507 lck_mtx_unlock(&mpp->mpp_lock);
3508 active++;
3509 continue;
3510 }
3511
3512 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
3e170ce0
A
3513 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3514 "%s: mp_so 0x%llx skipped "
39236c6e
A
3515 "[u=%d,r=%d,s=%d]\n", __func__,
3516 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3517 mp_so->so_usecount, mp_so->so_retaincnt,
3e170ce0
A
3518 mpp->mpp_state),
3519 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3520 lck_mtx_unlock(&mpp->mpp_lock);
3521 active++;
3522 continue;
3523 }
3524
3525 /*
3526 * The PCB has been detached, and there is exactly 1 refcnt
3527 * held by the MPTCP thread. Signal that thread to terminate,
3528 * after which the last refcnt will be released. That will
3529 * allow it to be destroyed below during the next round.
3530 */
3531 if (mp_so->so_usecount == 1) {
3e170ce0
A
3532 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3533 "%s: mp_so 0x%llx scheduled for "
39236c6e
A
3534 "termination [u=%d,r=%d]\n", __func__,
3535 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3536 mp_so->so_usecount, mp_so->so_retaincnt),
3537 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3538
39236c6e
A
3539 /* signal MPTCP thread to terminate */
3540 mptcp_thread_terminate_signal(mpte);
3541 lck_mtx_unlock(&mpp->mpp_lock);
3542 active++;
3543 continue;
3544 }
3545
3e170ce0
A
3546 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3547 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
39236c6e 3548 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3549 mp_so->so_usecount, mp_so->so_retaincnt),
3550 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3551
39236c6e
A
3552 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
3553 struct sockbuf *, &mp_so->so_rcv,
3554 struct sockbuf *, &mp_so->so_snd,
3555 struct mppcb *, mpp);
3556
3557 mp_pcbdispose(mpp);
3558 }
3559
3560 return (active);
3561}
3562
3563/*
3564 * Drop an MPTCP connection, reporting the specified error.
3565 */
3566struct mptses *
3567mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3568{
3569 struct socket *mp_so;
3570
3571 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3572 MPT_LOCK_ASSERT_HELD(mp_tp);
3573 VERIFY(mpte->mpte_mptcb == mp_tp);
3574 mp_so = mpte->mpte_mppcb->mpp_socket;
3575
fe8ab488 3576 mp_tp->mpt_state = MPTCPS_TERMINATE;
39236c6e
A
3577 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
3578 uint32_t, 0 /* event */);
3579
3580 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3581 errno = mp_tp->mpt_softerror;
3582 mp_so->so_error = errno;
3583
3584 return (mptcp_close(mpte, mp_tp));
3585}
3586
3587/*
3588 * Close an MPTCP control block.
3589 */
3590struct mptses *
3591mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
3592{
3e170ce0
A
3593 struct socket *mp_so = NULL;
3594 struct mptsub *mpts = NULL, *tmpts = NULL;
39236c6e
A
3595
3596 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3597 MPT_LOCK_ASSERT_HELD(mp_tp);
3598 VERIFY(mpte->mpte_mptcb == mp_tp);
3599 mp_so = mpte->mpte_mppcb->mpp_socket;
3600 if (mp_tp->mpt_localkey != NULL) {
3601 mptcp_free_key(mp_tp->mpt_localkey);
3602 mp_tp->mpt_localkey = NULL;
3603 }
3604
3605 MPT_UNLOCK(mp_tp);
3606 soisdisconnected(mp_so);
3607
3608 MPT_LOCK(mp_tp);
3609 if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
3610 return (NULL);
3611 }
3612 MPT_UNLOCK(mp_tp);
3613
3614 /* Clean up all subflows */
3615 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3616 MPTS_LOCK(mpts);
fe8ab488 3617 mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
39236c6e
A
3618 mptcp_subflow_disconnect(mpte, mpts, TRUE);
3619 MPTS_UNLOCK(mpts);
3620 mptcp_subflow_del(mpte, mpts, TRUE);
3621 }
3622 MPT_LOCK(mp_tp);
3623
3624 return (NULL);
3625}
3626
3627void
3628mptcp_notify_close(struct socket *so)
3629{
3630 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3631}
3632
3633/*
3634 * Signal MPTCP thread to wake up.
3635 */
3636void
3637mptcp_thread_signal(struct mptses *mpte)
3638{
3639 lck_mtx_lock(&mpte->mpte_thread_lock);
3640 mptcp_thread_signal_locked(mpte);
3641 lck_mtx_unlock(&mpte->mpte_thread_lock);
3642}
3643
3644/*
3645 * Signal MPTCP thread to wake up (locked version)
3646 */
3647static void
3648mptcp_thread_signal_locked(struct mptses *mpte)
3649{
3650 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3651
3652 mpte->mpte_thread_reqs++;
3653 if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3654 wakeup_one((caddr_t)&mpte->mpte_thread);
3655}
3656
3657/*
3658 * Signal MPTCP thread to terminate.
3659 */
3660static void
3661mptcp_thread_terminate_signal(struct mptses *mpte)
3662{
3663 lck_mtx_lock(&mpte->mpte_thread_lock);
3664 if (mpte->mpte_thread != THREAD_NULL) {
3665 mpte->mpte_thread = THREAD_NULL;
3666 mpte->mpte_thread_reqs++;
3667 if (!mpte->mpte_thread_active)
3668 wakeup_one((caddr_t)&mpte->mpte_thread);
3669 }
3670 lck_mtx_unlock(&mpte->mpte_thread_lock);
3671}
3672
3673/*
3674 * MPTCP thread workloop.
3675 */
3676static void
3677mptcp_thread_dowork(struct mptses *mpte)
3678{
3679 struct socket *mp_so;
3680 struct mptsub *mpts, *tmpts;
3681 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
3e170ce0 3682 uint64_t mpsofilt_hint_mask = 0;
39236c6e
A
3683
3684 MPTE_LOCK(mpte); /* same as MP socket lock */
3685 VERIFY(mpte->mpte_mppcb != NULL);
3686 mp_so = mpte->mpte_mppcb->mpp_socket;
3687 VERIFY(mp_so != NULL);
3688
3689 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3690 ev_ret_t ret;
3691
3692 MPTS_LOCK(mpts);
3693 MPTS_ADDREF_LOCKED(mpts); /* for us */
490019cf 3694
39236c6e
A
3695 /* Update process ownership based on parent mptcp socket */
3696 mptcp_update_last_owner(mpts, mp_so);
490019cf 3697
39236c6e 3698 mptcp_subflow_input(mpte, mpts);
3e170ce0
A
3699
3700 mptcp_get_rtt_measurement(mpts, mpte);
3701
3702 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
39236c6e
A
3703
3704 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3e170ce0
A
3705 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3706 "%s: cid %d \n", __func__,
3707 mpts->mpts_connid),
3708 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3709 (void) mptcp_subflow_output(mpte, mpts);
3710 }
3711
3712 /*
3713 * If the MPTCP socket is closed, disconnect all subflows.
3714 * This will generate a disconnect event which will
3715 * be handled during the next iteration, causing a
3716 * non-zero error to be returned above.
3717 */
3718 if (mp_so->so_flags & SOF_PCBCLEARING)
3719 mptcp_subflow_disconnect(mpte, mpts, FALSE);
3720 MPTS_UNLOCK(mpts);
3721
3722 switch (ret) {
39236c6e
A
3723 case MPTS_EVRET_OK:
3724 /* nothing to do */
3725 break;
3726 case MPTS_EVRET_DELETE:
fe8ab488 3727 mptcp_subflow_del(mpte, mpts, TRUE);
39236c6e
A
3728 break;
3729 case MPTS_EVRET_CONNECT_PENDING:
3730 connect_pending = TRUE;
3731 break;
3732 case MPTS_EVRET_DISCONNECT_FALLBACK:
3733 disconnect_fallback = TRUE;
3734 break;
3e170ce0
A
3735 default:
3736 mptcplog((LOG_DEBUG,
3737 "MPTCP Socket: %s: mptcp_subflow_events "
3738 "returned invalid value: %d\n", __func__,
3739 ret),
3740 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3741 break;
39236c6e
A
3742 }
3743 MPTS_REMREF(mpts); /* ours */
3744 }
3745
3e170ce0
A
3746 if (mpsofilt_hint_mask) {
3747 soevent(mp_so, mpsofilt_hint_mask);
39236c6e
A
3748 }
3749
3750 if (!connect_pending && !disconnect_fallback) {
3751 MPTE_UNLOCK(mpte);
3752 return;
3753 }
3754
3755 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3756 MPTS_LOCK(mpts);
3757 if (disconnect_fallback) {
3758 struct socket *so = NULL;
3759 struct inpcb *inp = NULL;
3760 struct tcpcb *tp = NULL;
3761
3762 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3763 MPTS_UNLOCK(mpts);
3764 continue;
3765 }
3766
3767 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3768
3769 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
3e170ce0 3770 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING)) {
39236c6e
A
3771 MPTS_UNLOCK(mpts);
3772 continue;
3773 }
490019cf
A
3774
3775 if (mpts->mpts_flags & MPTSF_TFO_REQD)
3776 mptcp_drop_tfo_data(mpte, mpts);
3777
39236c6e
A
3778 so = mpts->mpts_socket;
3779
3780 /*
3781 * The MPTCP connection has degraded to a fallback
3782 * mode, so there is no point in keeping this subflow
3783 * regardless of its MPTCP-readiness state, unless it
3784 * is the primary one which we use for fallback. This
3785 * assumes that the subflow used for fallback is the
3786 * ACTIVE one.
3787 */
3788
3789 socket_lock(so, 1);
3790 inp = sotoinpcb(so);
3791 tp = intotcpcb(inp);
3792 tp->t_mpflags &=
3793 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
3794 tp->t_mpflags |= TMPF_TCP_FALLBACK;
490019cf 3795
39236c6e
A
3796 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3797 socket_unlock(so, 1);
3798 MPTS_UNLOCK(mpts);
3799 continue;
3800 }
3801 tp->t_mpflags |= TMPF_RESET;
3802 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3803 socket_unlock(so, 1);
3804
3805 } else if (connect_pending) {
fe8ab488
A
3806 /*
3807 * If delayed subflow start is set and the interface is cellular,
3808 * delay the connect until a retransmission timeout
3809 */
3810
3811 if ((mptcp_delayed_subf_start) &&
3812 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
3813 MPTS_UNLOCK(mpts);
3814 continue;
3815 }
3816
39236c6e
A
3817 /*
3818 * The MPTCP connection has progressed to a state
3819 * where it supports full multipath semantics; allow
3820 * additional joins to be attempted for all subflows
3821 * that are in the PENDING state.
3822 */
3823 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
3824 (void) mptcp_subflow_soconnectx(mpte, mpts);
3825 }
3826 }
3827 MPTS_UNLOCK(mpts);
3828 }
3829
3830 MPTE_UNLOCK(mpte);
3831}
3832
3833/*
3834 * MPTCP thread.
3835 */
3836static void
3837mptcp_thread_func(void *v, wait_result_t w)
3838{
3839#pragma unused(w)
3840 struct mptses *mpte = v;
3841 struct timespec *ts = NULL;
3842
3843 VERIFY(mpte != NULL);
3844
3845 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3846
3847 for (;;) {
3848 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3849
3850 if (mpte->mpte_thread != THREAD_NULL) {
3851 (void) msleep(&mpte->mpte_thread,
3852 &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
3853 __func__, ts);
3854 }
3855
3856 /* MPTCP socket is closed? */
3857 if (mpte->mpte_thread == THREAD_NULL) {
3858 lck_mtx_unlock(&mpte->mpte_thread_lock);
3859 /* callee will destroy thread lock */
3860 mptcp_thread_destroy(mpte);
3861 /* NOTREACHED */
3862 return;
3863 }
3864
3865 mpte->mpte_thread_active = 1;
3866 for (;;) {
3867 uint32_t reqs = mpte->mpte_thread_reqs;
3868
3869 lck_mtx_unlock(&mpte->mpte_thread_lock);
3870 mptcp_thread_dowork(mpte);
3871 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3872
3873 /* if there's no pending request, we're done */
3874 if (reqs == mpte->mpte_thread_reqs ||
3875 mpte->mpte_thread == THREAD_NULL)
3876 break;
3877 }
3878 mpte->mpte_thread_reqs = 0;
3879 mpte->mpte_thread_active = 0;
3880 }
3881}
3882
3883/*
3884 * Destroy an MPTCP thread, to be called in the MPTCP thread context
3885 * upon receiving an indication to self-terminate. This routine
3886 * will not return, as the current thread is terminated at the end.
3887 */
3888static void
3889mptcp_thread_destroy(struct mptses *mpte)
3890{
3891 struct socket *mp_so;
3892
3893 MPTE_LOCK(mpte); /* same as MP socket lock */
3894 VERIFY(mpte->mpte_thread == THREAD_NULL);
3895 VERIFY(mpte->mpte_mppcb != NULL);
3896
3897 mptcp_sesdestroy(mpte);
3898
3899 mp_so = mpte->mpte_mppcb->mpp_socket;
3900 VERIFY(mp_so != NULL);
3901 VERIFY(mp_so->so_usecount != 0);
3902 mp_so->so_usecount--; /* for thread */
3903 mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
3904 MPTE_UNLOCK(mpte);
3905
3906 /* for the extra refcnt from kernel_thread_start() */
3907 thread_deallocate(current_thread());
3908 /* this is the end */
3909 thread_terminate(current_thread());
3910 /* NOTREACHED */
3911}
3912
3913/*
3914 * Protocol pr_lock callback.
3915 */
3916int
3917mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3918{
3919 struct mppcb *mpp = sotomppcb(mp_so);
3920 void *lr_saved;
3921
3922 if (lr == NULL)
3923 lr_saved = __builtin_return_address(0);
3924 else
3925 lr_saved = lr;
3926
3927 if (mpp == NULL) {
3928 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3929 mp_so, lr_saved, solockhistory_nr(mp_so));
3930 /* NOTREACHED */
3931 }
3932 lck_mtx_lock(&mpp->mpp_lock);
3933
3934 if (mp_so->so_usecount < 0) {
3935 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3936 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3937 solockhistory_nr(mp_so));
3938 /* NOTREACHED */
3939 }
3940 if (refcount != 0)
3941 mp_so->so_usecount++;
3942 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3943 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3944
3945 return (0);
3946}
3947
3948/*
3949 * Protocol pr_unlock callback.
3950 */
3951int
3952mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3953{
3954 struct mppcb *mpp = sotomppcb(mp_so);
3955 void *lr_saved;
3956
3957 if (lr == NULL)
3958 lr_saved = __builtin_return_address(0);
3959 else
3960 lr_saved = lr;
3961
3962 if (mpp == NULL) {
3963 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
3964 mp_so, mp_so->so_usecount, lr_saved,
3965 solockhistory_nr(mp_so));
3966 /* NOTREACHED */
3967 }
3968 lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
3969
3970 if (refcount != 0)
3971 mp_so->so_usecount--;
3972
3973 if (mp_so->so_usecount < 0) {
3974 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3975 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3976 /* NOTREACHED */
3977 }
3978 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
3979 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3980 lck_mtx_unlock(&mpp->mpp_lock);
3981
3982 return (0);
3983}
3984
3985/*
3986 * Protocol pr_getlock callback.
3987 */
3988lck_mtx_t *
3989mptcp_getlock(struct socket *mp_so, int locktype)
3990{
3991#pragma unused(locktype)
3992 struct mppcb *mpp = sotomppcb(mp_so);
3993
3994 if (mpp == NULL) {
3995 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
3996 solockhistory_nr(mp_so));
3997 /* NOTREACHED */
3998 }
3999 if (mp_so->so_usecount < 0) {
4000 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4001 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4002 /* NOTREACHED */
4003 }
4004 return (&mpp->mpp_lock);
4005}
4006
4007/*
4008 * Key generation functions
4009 */
4010static void
4011mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
4012{
4013 struct mptcp_key_entry *key_elm;
4014try_again:
4015 read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
4016 if (key_entry->mkey_value == 0)
4017 goto try_again;
4018 mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
4019 sizeof (key_entry->mkey_digest));
4020
4021 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4022 if (key_elm->mkey_value == key_entry->mkey_value) {
4023 goto try_again;
4024 }
4025 if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
4026 0) {
4027 goto try_again;
4028 }
4029 }
4030}
4031
4032static mptcp_key_t *
4033mptcp_reserve_key(void)
4034{
4035 struct mptcp_key_entry *key_elm;
4036 struct mptcp_key_entry *found_elm = NULL;
4037
4038 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4039 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4040 if (key_elm->mkey_flags == MKEYF_FREE) {
4041 key_elm->mkey_flags = MKEYF_INUSE;
4042 found_elm = key_elm;
4043 break;
4044 }
4045 }
4046 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4047
4048 if (found_elm) {
4049 return (&found_elm->mkey_value);
4050 }
4051
4052 key_elm = (struct mptcp_key_entry *)
4053 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4054 key_elm->mkey_flags = MKEYF_INUSE;
4055
4056 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4057 mptcp_generate_unique_key(key_elm);
4058 LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
4059 mptcp_keys_pool.mkph_count += 1;
4060 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4061 return (&key_elm->mkey_value);
4062}
4063
4064static caddr_t
4065mptcp_get_stored_digest(mptcp_key_t *key)
4066{
4067 struct mptcp_key_entry *key_holder;
4068 caddr_t digest = NULL;
4069
4070 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4071 key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
4072 offsetof(struct mptcp_key_entry, mkey_value));
4073 if (key_holder->mkey_flags != MKEYF_INUSE)
4074 panic_plain("%s", __func__);
4075 digest = &key_holder->mkey_digest[0];
4076 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4077 return (digest);
4078}
4079
4080void
4081mptcp_free_key(mptcp_key_t *key)
4082{
4083 struct mptcp_key_entry *key_holder;
4084 struct mptcp_key_entry *key_elm;
4085 int pt = RandomULong();
4086
39236c6e
A
4087 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4088 key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
4089 offsetof(struct mptcp_key_entry, mkey_value));
4090 key_holder->mkey_flags = MKEYF_FREE;
4091
4092 LIST_REMOVE(key_holder, mkey_next);
4093 mptcp_keys_pool.mkph_count -= 1;
4094
4095 /* Free half the time */
4096 if (pt & 0x01) {
4097 zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
4098 } else {
4099 /* Insert it at a random point to avoid early reuse */
4100 int i = 0;
4101 if (mptcp_keys_pool.mkph_count > 1) {
4102 pt = pt % (mptcp_keys_pool.mkph_count - 1);
4103 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4104 if (++i >= pt) {
4105 LIST_INSERT_AFTER(key_elm, key_holder,
4106 mkey_next);
4107 break;
4108 }
4109 }
4110 if (i < pt)
4111 panic("missed insertion");
4112 } else {
4113 LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
4114 mkey_next);
4115 }
4116 mptcp_keys_pool.mkph_count += 1;
4117 }
4118 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4119}
4120
4121static void
4122mptcp_key_pool_init(void)
4123{
4124 int i;
4125 struct mptcp_key_entry *key_entry;
4126
4127 LIST_INIT(&mptcp_keys_pool);
4128 mptcp_keys_pool.mkph_count = 0;
4129
4130 mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
4131 (sizeof (struct mptcp_key_entry));
4132 mptcp_keys_pool.mkph_key_entry_zone = zinit(
4133 mptcp_keys_pool.mkph_key_elm_sz,
4134 MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
4135 MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
4136 if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
4137 panic("%s: unable to allocate MPTCP keys zone \n", __func__);
4138 /* NOTREACHED */
4139 }
4140 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
4141 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
4142
4143 for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
4144 key_entry = (struct mptcp_key_entry *)
4145 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4146 key_entry->mkey_flags = MKEYF_FREE;
4147 mptcp_generate_unique_key(key_entry);
4148 LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
4149 mptcp_keys_pool.mkph_count += 1;
4150 }
4151 lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
4152 mtcbinfo.mppi_lock_attr);
4153}
4154
4155/*
4156 * MPTCP Join support
4157 */
4158
4159static void
4160mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
fe8ab488 4161 uint8_t addr_id)
39236c6e
A
4162{
4163 struct tcpcb *tp = sototcpcb(so);
4164 struct mptcp_subf_auth_entry *sauth_entry;
4165 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4166
4167 MPT_LOCK_SPIN(mp_tp);
4168 tp->t_mptcb = mp_tp;
39236c6e 4169 /*
39236c6e
A
4170 * The address ID of the first flow is implicitly 0.
4171 */
4172 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4173 tp->t_local_aid = 0;
4174 } else {
fe8ab488 4175 tp->t_local_aid = addr_id;
39236c6e
A
4176 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4177 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4178 }
fe8ab488 4179 MPT_UNLOCK(mp_tp);
39236c6e
A
4180 sauth_entry = zalloc(mpt_subauth_zone);
4181 sauth_entry->msae_laddr_id = tp->t_local_aid;
4182 sauth_entry->msae_raddr_id = 0;
4183 sauth_entry->msae_raddr_rand = 0;
4184try_again:
4185 sauth_entry->msae_laddr_rand = RandomULong();
4186 if (sauth_entry->msae_laddr_rand == 0)
4187 goto try_again;
fe8ab488 4188 MPT_LOCK_SPIN(mp_tp);
39236c6e 4189 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
fe8ab488 4190 MPT_UNLOCK(mp_tp);
39236c6e
A
4191}
4192
4193static void
4194mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4195{
4196 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 4197 struct tcpcb *tp = NULL;
39236c6e
A
4198 int found = 0;
4199
fe8ab488
A
4200 socket_lock(so, 0);
4201 tp = sototcpcb(so);
4202 if (tp == NULL) {
4203 socket_unlock(so, 0);
39236c6e 4204 return;
fe8ab488 4205 }
39236c6e
A
4206
4207 MPT_LOCK(mp_tp);
4208 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4209 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4210 found = 1;
4211 break;
4212 }
4213 }
4214 if (found) {
4215 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 4216 }
39236c6e 4217 MPT_UNLOCK(mp_tp);
fe8ab488 4218
3e170ce0
A
4219 if (found)
4220 zfree(mpt_subauth_zone, sauth_entry);
4221
fe8ab488
A
4222 tp->t_mptcb = NULL;
4223 socket_unlock(so, 0);
39236c6e
A
4224}
4225
4226void
4227mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4228 u_int32_t *rrand)
4229{
4230 struct mptcp_subf_auth_entry *sauth_entry;
4231 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4232
4233 MPT_LOCK(mp_tp);
4234 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4235 if (sauth_entry->msae_laddr_id == addr_id) {
4236 if (lrand)
4237 *lrand = sauth_entry->msae_laddr_rand;
4238 if (rrand)
4239 *rrand = sauth_entry->msae_raddr_rand;
4240 break;
4241 }
4242 }
4243 MPT_UNLOCK(mp_tp);
4244}
4245
4246void
4247mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4248 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4249{
4250 struct mptcp_subf_auth_entry *sauth_entry;
4251 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4252
4253 MPT_LOCK(mp_tp);
4254 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4255 if (sauth_entry->msae_laddr_id == laddr_id) {
4256 if ((sauth_entry->msae_raddr_id != 0) &&
4257 (sauth_entry->msae_raddr_id != raddr_id)) {
3e170ce0 4258 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
39236c6e 4259 " address ids %d %d \n", __func__, raddr_id,
3e170ce0
A
4260 sauth_entry->msae_raddr_id),
4261 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4262 MPT_UNLOCK(mp_tp);
4263 return;
4264 }
4265 sauth_entry->msae_raddr_id = raddr_id;
4266 if ((sauth_entry->msae_raddr_rand != 0) &&
4267 (sauth_entry->msae_raddr_rand != raddr_rand)) {
3e170ce0
A
4268 mptcplog((LOG_ERR, "MPTCP Socket: "
4269 "%s: dup SYN_ACK %d %d \n",
39236c6e 4270 __func__, raddr_rand,
3e170ce0
A
4271 sauth_entry->msae_raddr_rand),
4272 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4273 MPT_UNLOCK(mp_tp);
4274 return;
4275 }
4276 sauth_entry->msae_raddr_rand = raddr_rand;
4277 MPT_UNLOCK(mp_tp);
4278 return;
4279 }
4280 }
4281 MPT_UNLOCK(mp_tp);
4282}
4283
4284/*
4285 * SHA1 support for MPTCP
4286 */
4287static int
4288mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
4289{
4290 SHA1_CTX sha1ctxt;
4291 const unsigned char *sha1_base;
4292 int sha1_size;
4293
4294 if (digest_len != SHA1_RESULTLEN) {
4295 return (FALSE);
4296 }
4297
4298 sha1_base = (const unsigned char *) key;
4299 sha1_size = sizeof (mptcp_key_t);
4300 SHA1Init(&sha1ctxt);
4301 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4302 SHA1Final(sha_digest, &sha1ctxt);
4303 return (TRUE);
4304}
4305
4306void
4307mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4308 u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
4309{
4310 SHA1_CTX sha1ctxt;
4311 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4312 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4313 u_int32_t data[2];
4314 int i;
4315
4316 bzero(digest, digest_len);
4317
4318 /* Set up the Key for HMAC */
4319 key_ipad[0] = key1;
4320 key_ipad[1] = key2;
4321
4322 key_opad[0] = key1;
4323 key_opad[1] = key2;
4324
4325 /* Set up the message for HMAC */
4326 data[0] = rand1;
4327 data[1] = rand2;
4328
4329 /* Key fits within the 512-bit block length, so no need to hash it first */
4330
4331 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4332
4333 for (i = 0; i < 8; i++) {
4334 key_ipad[i] ^= 0x3636363636363636;
4335 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4336 }
4337
4338 /* Perform inner SHA1 */
4339 SHA1Init(&sha1ctxt);
4340 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4341 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4342 SHA1Final(digest, &sha1ctxt);
4343
4344 /* Perform outer SHA1 */
4345 SHA1Init(&sha1ctxt);
4346 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4347 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4348 SHA1Final(digest, &sha1ctxt);
4349}
4350
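/*
 * Illustrative note on the construction above: the two 64-bit keys occupy
 * only the first 16 bytes of the 64-byte (512-bit) HMAC block, so the
 * remaining words of key_ipad and key_opad start out as zero and, after
 * the XOR loop, hold the bare 0x36.../0x5c... pad constants.  The message
 * is simply the two 32-bit randoms, rand1 || rand2, and the result is a
 * full 20-byte HMAC-SHA1.
 */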
4351/*
4352 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4353 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4354 */
4355void
4356mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
4357 int digest_len)
4358{
4359 uint32_t lrand, rrand;
4360 mptcp_key_t localkey, remotekey;
4361 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4362
4363 if (digest_len != SHA1_RESULTLEN)
4364 return;
4365
4366 lrand = rrand = 0;
4367 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4368 MPT_LOCK_SPIN(mp_tp);
4369 localkey = *mp_tp->mpt_localkey;
4370 remotekey = mp_tp->mpt_remotekey;
4371 MPT_UNLOCK(mp_tp);
4372 mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
4373 digest_len);
4374}
4375
4376u_int64_t
4377mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
4378{
4379 u_char digest[SHA1_RESULTLEN];
4380 u_int64_t trunced_digest;
4381
4382 mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
4383 bcopy(digest, &trunced_digest, 8);
4384 return (trunced_digest);
4385}
4386
4387/*
4388 * Authentication data generation
4389 */
490019cf 4390void
39236c6e
A
4391mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4392 int token_len)
4393{
4394 VERIFY(token_len == sizeof (u_int32_t));
4395 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4396
4397 /* Most significant 32 bits of the SHA1 hash */
4398 bcopy(sha_digest, token, sizeof (u_int32_t));
490019cf 4399 return;
39236c6e
A
4400}
4401
490019cf 4402void
39236c6e
A
4403mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4404 int idsn_len)
4405{
4406 VERIFY(idsn_len == sizeof (u_int64_t));
4407 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4408
4409 /*
4410 * Least significant 64 bits of the SHA1 hash
4411 */
4412
4413 idsn[7] = sha_digest[12];
4414 idsn[6] = sha_digest[13];
4415 idsn[5] = sha_digest[14];
4416 idsn[4] = sha_digest[15];
4417 idsn[3] = sha_digest[16];
4418 idsn[2] = sha_digest[17];
4419 idsn[1] = sha_digest[18];
4420 idsn[0] = sha_digest[19];
490019cf 4421 return;
39236c6e
A
4422}
4423
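/*
 * Worked example (hypothetical digest): if SHA1(key) yields the 20-byte
 * digest d[0..19], the 32-bit token is d[0..3] (the most significant bits
 * of the hash), while the 64-bit IDSN is assembled from the trailing bytes
 * in reverse order, idsn[0] = d[19] ... idsn[7] = d[12], exactly as laid
 * out by mptcp_generate_idsn() above.
 */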
490019cf
A
4424static void
4425mptcp_conn_properties(struct mptcb *mp_tp)
4426{
4427 /* There is only Version 0 at this time */
4428 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4429
4430 /* Set DSS checksum flag */
4431 if (mptcp_dss_csum)
4432 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4433
4434 /* Set up receive window */
4435 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4436
4437 /* Set up gc ticks */
4438 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4439}
4440
4441static void
4442mptcp_init_local_parms(struct mptcb *mp_tp)
39236c6e
A
4443{
4444 caddr_t local_digest = NULL;
490019cf
A
4445
4446 mp_tp->mpt_localkey = mptcp_reserve_key();
4447 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
4448 mptcp_generate_token(local_digest, SHA1_RESULTLEN,
4449 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4450 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
4451 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4452
4453 /* The subflow SYN is also first MPTCP byte */
4454 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4455 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4456
4457 mptcp_conn_properties(mp_tp);
4458}
4459
4460int
4461mptcp_init_remote_parms(struct mptcb *mp_tp)
4462{
39236c6e
A
4463 char remote_digest[MPTCP_SHA1_RESULTLEN];
4464 MPT_LOCK_ASSERT_HELD(mp_tp);
4465
4466 /* Only Version 0 is supported for auth purposes */
3e170ce0 4467 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
39236c6e
A
4468 return (-1);
4469
4470 /* Setup local and remote tokens and Initial DSNs */
39236c6e
A
4471
4472 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
4473 SHA1_RESULTLEN)) {
3e170ce0
A
4474 mptcplog((LOG_ERR, "MPTCP Socket: %s: unexpected failure",
4475 __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4476 return (-1);
4477 }
4478 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
490019cf 4479 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
39236c6e
A
4480 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4481 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
39236c6e 4482 mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
39236c6e 4483
490019cf 4484 return (0);
39236c6e
A
4485}
4486
4487/*
4488 * Helper Functions
4489 */
4490mptcp_token_t
4491mptcp_get_localtoken(void* mptcb_arg)
4492{
4493 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4494 return (mp_tp->mpt_localtoken);
4495}
4496
4497mptcp_token_t
4498mptcp_get_remotetoken(void* mptcb_arg)
4499{
4500 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4501 return (mp_tp->mpt_remotetoken);
4502}
4503
4504u_int64_t
4505mptcp_get_localkey(void* mptcb_arg)
4506{
4507 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4508 if (mp_tp->mpt_localkey != NULL)
4509 return (*mp_tp->mpt_localkey);
4510 else
4511 return (0);
4512}
4513
4514u_int64_t
4515mptcp_get_remotekey(void* mptcb_arg)
4516{
4517 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4518 return (mp_tp->mpt_remotekey);
4519}
4520
4521void
4522mptcp_send_dfin(struct socket *so)
4523{
4524 struct tcpcb *tp = NULL;
4525 struct inpcb *inp = NULL;
4526
4527 inp = sotoinpcb(so);
4528 if (!inp)
4529 return;
4530
4531 tp = intotcpcb(inp);
4532 if (!tp)
4533 return;
4534
4535 if (!(tp->t_mpflags & TMPF_RESET))
4536 tp->t_mpflags |= TMPF_SEND_DFIN;
4537}
4538
4539/*
4540 * Data Sequence Mapping routines
4541 */
4542void
4543mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4544{
4545 struct mptcb *mp_tp;
4546
4547 if (m == NULL)
4548 return;
4549
3e170ce0 4550 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
39236c6e 4551 MPT_LOCK(mp_tp);
39236c6e
A
4552 while (m) {
4553 VERIFY(m->m_flags & M_PKTHDR);
4554 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4555 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4556 m->m_pkthdr.mp_rlen = m_pktlen(m);
4557 mp_tp->mpt_sndmax += m_pktlen(m);
4558 m = m->m_next;
4559 }
4560 MPT_UNLOCK(mp_tp);
4561}
4562
4563void
490019cf 4564mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
39236c6e
A
4565{
4566 u_int32_t sub_len = 0;
490019cf
A
4567 int rewinding = 0;
4568
4569 if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
4570 /* TFO makes things complicated. */
4571 if (so->so_flags1 & SOF1_TFO_REWIND) {
4572 rewinding = 1;
4573 so->so_flags1 &= ~SOF1_TFO_REWIND;
4574 }
4575 }
39236c6e
A
4576
4577 while (m) {
4578 VERIFY(m->m_flags & M_PKTHDR);
4579
4580 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4581 sub_len = m->m_pkthdr.mp_rlen;
4582
4583 if (sub_len < len) {
4584 m->m_pkthdr.mp_dsn += sub_len;
4585 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4586 m->m_pkthdr.mp_rseq += sub_len;
4587 }
4588 m->m_pkthdr.mp_rlen = 0;
4589 len -= sub_len;
4590 } else {
4591 /* sub_len >= len */
490019cf
A
4592 if (rewinding == 0)
4593 m->m_pkthdr.mp_dsn += len;
39236c6e 4594 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
490019cf
A
4595 if (rewinding == 0)
4596 m->m_pkthdr.mp_rseq += len;
39236c6e 4597 }
3e170ce0 4598 mptcplog((LOG_DEBUG, "MPTCP Sender: "
490019cf 4599 "%s: dsn 0x%llx ssn %u len %d %d\n",
3e170ce0 4600 __func__,
39236c6e 4601 m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
3e170ce0
A
4602 m->m_pkthdr.mp_rlen, len),
4603 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4604 m->m_pkthdr.mp_rlen -= len;
4605 return;
4606 }
4607 } else {
4608 panic("%s: MPTCP tag not set", __func__);
4609 /* NOTREACHED */
4610 }
4611 m = m->m_next;
4612 }
4613}
4614
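/*
 * Worked example (hypothetical values): the send sockbuf holds two
 * mappings of 500 bytes each, at DSN 1000 and DSN 1500, and an ACK frees
 * len = 700 bytes.  The first mapping is consumed entirely (mp_dsn
 * advances to 1500, mp_rlen drops to 0, len becomes 200), and the second
 * mapping is then advanced by the remaining 200 bytes to
 * {mp_dsn = 1700, mp_rlen = 300}.
 */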
4615/* Obtain the DSN mapping stored in the mbuf */
4616void
4617mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4618 u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4619{
4620 u_int64_t dsn64;
4621
4622 mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4623 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4624 *dsn64p = dsn64;
4625}
4626
4627void
4628mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
4629 u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
4630{
4631 struct mbuf *m = so->so_snd.sb_mb;
4632 struct mbuf *mnext = NULL;
4633 uint32_t runlen = 0;
4634 u_int64_t dsn64;
4635 uint32_t contig_len = 0;
4636
4637 if (m == NULL)
4638 return;
4639
4640 if (off < 0)
4641 return;
4642 /*
4643 * In the subflow socket, the DSN sequencing can be discontiguous,
4644 * but the subflow sequence mapping is contiguous. Use the subflow
4645 * sequence property to find the right mbuf and corresponding dsn
4646 * mapping.
4647 */
4648
4649 while (m) {
4650 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4651 VERIFY(m->m_flags & M_PKTHDR);
4652
4653 if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
4654 off -= m->m_pkthdr.mp_rlen;
4655 m = m->m_next;
4656 } else {
4657 break;
4658 }
4659 }
4660
4661 if (m == NULL) {
4662 panic("%s: bad offset", __func__);
4663 /* NOTREACHED */
4664 }
4665
4666 dsn64 = m->m_pkthdr.mp_dsn + off;
4667 *dsn = dsn64;
4668 *relseq = m->m_pkthdr.mp_rseq + off;
4669
4670 /*
4671 * Now find the last contiguous byte and its length from
4672 * start.
4673 */
4674 runlen = m->m_pkthdr.mp_rlen - off;
4675 contig_len = runlen;
4676
4677 /* If datalen does not span multiple mbufs, return */
4678 if (datalen <= runlen) {
4679 *data_len = min(datalen, UINT16_MAX);
4680 return;
4681 }
4682
4683 mnext = m->m_next;
4684 while (datalen > runlen) {
4685 if (mnext == NULL) {
4686 panic("%s: bad datalen = %d, %d %d", __func__, datalen,
4687 runlen, off);
4688 /* NOTREACHED */
4689 }
4690 VERIFY(mnext->m_flags & M_PKTHDR);
4691 VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
4692
4693 /*
4694 * case A. contiguous DSN stream
4695 * case B. discontiguous DSN stream
4696 */
4697 if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
4698 /* case A */
4699 runlen += mnext->m_pkthdr.mp_rlen;
4700 contig_len += mnext->m_pkthdr.mp_rlen;
3e170ce0
A
4701 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: contig \n",
4702 __func__), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4703 } else {
4704 /* case B */
3e170ce0 4705 mptcplog((LOG_DEBUG, "MPTCP Sender: "
fe8ab488 4706 "%s: discontig datalen %d contig_len %d cc %d \n",
3e170ce0
A
4707 __func__, datalen, contig_len, so->so_snd.sb_cc),
4708 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4709 break;
4710 }
4711 mnext = mnext->m_next;
4712 }
4713 datalen = min(datalen, UINT16_MAX);
4714 *data_len = min(datalen, contig_len);
3e170ce0
A
4715 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4716 "%s: %llu %u %d %d \n", __func__,
4717 *dsn, *relseq, *data_len, off),
4718 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4719}
4720
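/*
 * Worked example (hypothetical values): two mbufs are queued with
 * mappings {mp_dsn = 1000, mp_rlen = 500} and {mp_dsn = 1500,
 * mp_rlen = 500}.  A request for off = 200, datalen = 600 resolves in the
 * first mbuf, so *dsn = 1200 and *relseq = mp_rseq + 200; because the
 * second mapping is DSN-contiguous (1000 + 500 == 1500), the run extends
 * across it and *data_len = 600.  Had the second mapping started at 2000
 * instead, the run would stop at the discontinuity and *data_len would
 * be 300.
 */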
4721/*
4722 * MPTCP's notion of the next in-sequence Data Sequence number is adjusted
4723 * here. It must be called from mptcp_adj_rmap(), which is called only after
4724 * reassembly of out-of-order data. The rcvnxt variable must
4725 * be updated only when at least some in-sequence new data is received.
4726 */
4727static void
4728mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4729{
4730 struct mptcb *mp_tp = tptomptp(tp);
4731
4732 if (mp_tp == NULL)
4733 return;
4734 MPT_LOCK(mp_tp);
4735 if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4736 (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4737 m->m_pkthdr.mp_rlen)))) {
4738 mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4739 }
4740 MPT_UNLOCK(mp_tp);
4741}
4742
4743/*
3e170ce0
A
4744 * Note that this is called only from tcp_input() via mptcp_input_preproc().
4745 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4746 * When it trims data, tcp_input() calls m_adj(), which does not remove the
4747 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4748 * The dsn map insertion cannot be delayed until after the trim, because data
4749 * can sit in the reassembly queue for a while and the DSN option info in tp
4750 * will be overwritten for every new packet received.
39236c6e
A
4751 * The dsn map will be adjusted just prior to appending to the subflow
4752 * sockbuf with mptcp_adj_rmap().
4753 */
4754void
4755mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4756{
4757 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4758
4759 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4760 VERIFY(m->m_flags & M_PKTHDR);
4761 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4762 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4763 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4764 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4765 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4766 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4767 }
4768}
4769
fe8ab488 4770int
39236c6e
A
4771mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4772{
4773 u_int64_t dsn;
4774 u_int32_t sseq, datalen;
4775 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4776 u_int32_t old_rcvnxt = 0;
4777
4778 if (m_pktlen(m) == 0)
fe8ab488 4779 return 0;
39236c6e
A
4780
4781 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4782 VERIFY(m->m_flags & M_PKTHDR);
4783
4784 dsn = m->m_pkthdr.mp_dsn;
4785 sseq = m->m_pkthdr.mp_rseq + tp->irs;
4786 datalen = m->m_pkthdr.mp_rlen;
4787 } else {
4788 /* data arrived without a DSS option mapping */
fe8ab488
A
4789
4790 /* initial subflow can fallback right after SYN handshake */
39236c6e 4791 mptcp_notify_mpfail(so);
fe8ab488 4792 return 0;
39236c6e
A
4793 }
4794
4795 /* In the common case, data is in window and in sequence */
4796 if (m->m_pkthdr.len == (int)datalen) {
4797 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4798 return 0;
39236c6e
A
4799 }
4800
39236c6e
A
4801 old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4802 if (SEQ_GT(old_rcvnxt, sseq)) {
4803 /* data trimmed from the left */
4804 int off = old_rcvnxt - sseq;
4805 m->m_pkthdr.mp_dsn += off;
4806 m->m_pkthdr.mp_rseq += off;
fe8ab488 4807 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
39236c6e
A
4808 } else if (old_rcvnxt == sseq) {
4809 /*
3e170ce0 4810 * data was trimmed from the right
39236c6e
A
4811 */
4812 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4813 } else {
fe8ab488 4814 mptcp_notify_mpfail(so);
3e170ce0 4815 return (-1);
39236c6e
A
4816 }
4817 mptcp_adj_rcvnxt(tp, m);
fe8ab488 4818 return 0;
39236c6e
A
4819}
4820
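/*
 * Worked example (hypothetical values): a segment carried the mapping
 * {mp_dsn = 5000, mp_rseq = 100, mp_rlen = 400}, but tcp_input() trimmed
 * 100 duplicate bytes from the front, leaving m_pkthdr.len at 300.
 * old_rcvnxt then sits 100 bytes past sseq, so the mapping is advanced to
 * {mp_dsn = 5100, mp_rseq = 200, mp_rlen = 300} and again describes
 * exactly the bytes that reach the subflow sockbuf.
 */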
4821/*
4822 * The following routines help with failure detection and failover of data
4823 * transfer from one subflow to another.
4824 */
4825void
4826mptcp_act_on_txfail(struct socket *so)
4827{
4828 struct tcpcb *tp = NULL;
4829 struct inpcb *inp = sotoinpcb(so);
4830
4831 if (inp == NULL)
4832 return;
4833
4834 tp = intotcpcb(inp);
4835 if (tp == NULL)
4836 return;
4837
39236c6e
A
4838 if (so->so_flags & SOF_MP_TRYFAILOVER) {
4839 return;
4840 }
4841
4842 so->so_flags |= SOF_MP_TRYFAILOVER;
4843 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4844}
4845
4846/*
4847 * Support for MP_FAIL option
4848 */
4849int
4850mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4851{
4852 struct mbuf *m = so->so_snd.sb_mb;
4853 u_int64_t dsn;
4854 int off = 0;
4855 u_int32_t datalen;
4856
4857 if (m == NULL)
4858 return (-1);
4859
4860 while (m != NULL) {
4861 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4862 VERIFY(m->m_flags & M_PKTHDR);
4863 dsn = m->m_pkthdr.mp_dsn;
4864 datalen = m->m_pkthdr.mp_rlen;
4865 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4866 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4867 off = dsn_fail - dsn;
4868 *tcp_seq = m->m_pkthdr.mp_rseq + off;
3e170ce0
A
4869 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: %llu %llu \n",
4870 __func__, dsn, dsn_fail),
4871 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4872 return (0);
4873 }
4874
4875 m = m->m_next;
4876 }
4877
4878 /*
4879 * If there was no mbuf data and a fallback to TCP occurred, there's
4880 * not much else to do.
4881 */
4882
3e170ce0
A
4883 mptcplog((LOG_ERR, "MPTCP Sender: "
4884 "%s: %llu not found \n", __func__, dsn_fail),
4885 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4886 return (-1);
4887}
4888
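/*
 * Worked example (hypothetical values): an MP_FAIL reports dsn_fail = 1250
 * while the send sockbuf holds a mapping {mp_dsn = 1000, mp_rseq = 40,
 * mp_rlen = 500}.  Since 1000 <= 1250 <= 1500, the offset into the mapping
 * is 250 and *tcp_seq is set to 40 + 250 = 290.
 */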
4889/*
4890 * Support for sending contiguous MPTCP bytes in a subflow.
fe8ab488 4891 * Also prevents sending data along with the ACK of the 3-way handshake.
39236c6e
A
4892 */
4893int32_t
4894mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4895{
4896 u_int64_t mdss_dsn = 0;
4897 u_int32_t mdss_subflow_seq = 0;
4898 u_int16_t mdss_data_len = 0;
4899
4900 if (len == 0)
4901 return (len);
4902
4903 mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4904 &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4905
fe8ab488
A
4906 /*
4907 * Special case handling for Fast Join. We want to send data right
4908 * after the ACK of the 3-way handshake, but not piggyback the data
4909 * on the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
4910 * mdss_data_len control this.
4911 */
4912 struct tcpcb *tp = NULL;
4913 tp = intotcpcb(sotoinpcb(so));
4914 if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
4915 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
4916 (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
4917 (tp->t_mpflags & TMPF_SENT_JOIN) &&
4918 (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
4919 (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
490019cf
A
4920 mdss_data_len = 0;
4921 tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
4922 }
4923
4924 if ((tp->t_state > TCPS_SYN_SENT) &&
4925 (tp->t_mpflags & TMPF_TFO_REQUEST)) {
4926 mdss_data_len = 0;
4927 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4928 }
39236c6e
A
4929 return (mdss_data_len);
4930}
4931
4932int32_t
4933mptcp_sbspace(struct mptcb *mpt)
4934{
4935 struct sockbuf *sb;
4936 uint32_t rcvbuf;
4937 int32_t space;
4938
4939 MPT_LOCK_ASSERT_HELD(mpt);
4940 MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4941
4942 sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4943 rcvbuf = sb->sb_hiwat;
4944 space = ((int32_t)imin((rcvbuf - sb->sb_cc),
4945 (sb->sb_mbmax - sb->sb_mbcnt)));
4946 if (space < 0)
4947 space = 0;
4948 /* XXX check if it's too small? */
4949
4950 return (space);
4951}
4952
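/*
 * Worked example (hypothetical numbers): with sb_hiwat = 128 KB, sb_cc =
 * 32 KB of undelivered data, sb_mbmax = 256 KB and sb_mbcnt = 48 KB of
 * mbuf storage in use, the space reported for the MPTCP receive window is
 * min(128K - 32K, 256K - 48K) = 96 KB.  A negative intermediate result is
 * clamped to 0.
 */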
4953/*
4954 * Support Fallback to Regular TCP
4955 */
4956void
4957mptcp_notify_mpready(struct socket *so)
4958{
4959 struct tcpcb *tp = NULL;
4960
4961 if (so == NULL)
4962 return;
4963
4964 tp = intotcpcb(sotoinpcb(so));
4965
4966 if (tp == NULL)
4967 return;
4968
4969 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
4970 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4971 struct tcpcb *, tp);
4972
4973 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
4974 return;
4975
4976 if (tp->t_mpflags & TMPF_MPTCP_READY)
4977 return;
4978
4979 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
4980 tp->t_mpflags |= TMPF_MPTCP_READY;
4981
4982 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4983}
4984
4985void
4986mptcp_notify_mpfail(struct socket *so)
4987{
4988 struct tcpcb *tp = NULL;
4989
4990 if (so == NULL)
4991 return;
4992
4993 tp = intotcpcb(sotoinpcb(so));
4994
4995 if (tp == NULL)
4996 return;
4997
4998 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
4999 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5000 struct tcpcb *, tp);
5001
5002 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5003 return;
5004
5005 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5006 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5007
5008 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5009}
5010
5011/*
5012 * Keepalive helper function
5013 */
5014boolean_t
5015mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5016{
5017 boolean_t ret = 1;
5018 VERIFY(mp_tp != NULL);
5019 MPT_LOCK(mp_tp);
5020 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5021 ret = 0;
5022 }
5023 MPT_UNLOCK(mp_tp);
5024 return (ret);
5025}
5026
5027/*
5028 * MPTCP t_maxseg adjustment function
5029 */
5030int
5031mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5032{
5033 int mss_lower = 0;
5034 struct mptcb *mp_tp = tptomptp(tp);
5035
5036#define MPTCP_COMPUTE_LEN { \
5037 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5038 MPT_LOCK(mp_tp); \
5039 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5040 mss_lower += 2; \
5041 else \
5042 /* adjust to 32-bit boundary + EOL */ \
5043 mss_lower += 2; \
5044 MPT_UNLOCK(mp_tp); \
5045}
5046 if (mp_tp == NULL)
5047 return (0);
5048
5049 /*
5050 * For the first subflow and subsequent subflows, adjust the mss for the
5051 * most common MPTCP option size, for the case where tcp_mss is called
5052 * during option processing and MTU discovery.
5053 */
5054 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5055 (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
5056 MPTCP_COMPUTE_LEN;
5057 }
5058
5059 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5060 (tp->t_mpflags & TMPF_SENT_JOIN)) {
5061 MPTCP_COMPUTE_LEN;
5062 }
5063
5064 if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5065 MPTCP_COMPUTE_LEN;
5066 }
5067
5068 return (mss_lower);
5069}
5070
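/*
 * Note on MPTCP_COMPUTE_LEN above (illustrative): both branches lower the
 * mss by sizeof (struct mptcp_dss_ack_opt) plus 2 bytes, either for the
 * DSS checksum field or for padding to a 32-bit boundary plus EOL, so the
 * reservation is the same whether or not MPTCPF_CHECKSUM is set.
 */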
5071/*
5072 * Update the pid, upid, and uuid of the subflow so, based on the parent so
5073 */
5074void
5075mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
5076{
5077 struct socket *subflow_so = mpts->mpts_socket;
5078
5079 MPTS_LOCK_ASSERT_HELD(mpts);
5080
5081 socket_lock(subflow_so, 0);
5082 if ((subflow_so->last_pid != parent_mpso->last_pid) ||
5083 (subflow_so->last_upid != parent_mpso->last_upid)) {
5084 subflow_so->last_upid = parent_mpso->last_upid;
5085 subflow_so->last_pid = parent_mpso->last_pid;
5086 uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
5087 }
5088 so_update_policy(subflow_so);
5089 socket_unlock(subflow_so, 0);
5090}
5091
5092static void
5093fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5094{
5095 struct inpcb *inp;
5096
5097 tcp_getconninfo(so, &flow->flow_ci);
5098 inp = sotoinpcb(so);
5099#if INET6
5100 if ((inp->inp_vflag & INP_IPV6) != 0) {
5101 flow->flow_src.ss_family = AF_INET6;
5102 flow->flow_dst.ss_family = AF_INET6;
5103 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5104 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5105 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5106 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5107 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5108 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5109 } else
5110#endif
3e170ce0 5111 if ((inp->inp_vflag & INP_IPV4) != 0) {
39236c6e
A
5112 flow->flow_src.ss_family = AF_INET;
5113 flow->flow_dst.ss_family = AF_INET;
5114 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5115 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5116 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5117 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5118 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5119 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5120 }
3e170ce0
A
5121 flow->flow_len = sizeof(*flow);
5122 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
39236c6e
A
5123 flow->flow_flags = mpts->mpts_flags;
5124 flow->flow_cid = mpts->mpts_connid;
3e170ce0
A
5125 flow->flow_sndnxt = mpts->mpts_sndnxt;
5126 flow->flow_relseq = mpts->mpts_rel_seq;
5127 flow->flow_soerror = mpts->mpts_soerror;
5128 flow->flow_probecnt = mpts->mpts_probecnt;
5129 flow->flow_peerswitch = mpts->mpts_peerswitch;
39236c6e
A
5130}
5131
5132static int
5133mptcp_pcblist SYSCTL_HANDLER_ARGS
5134{
5135#pragma unused(oidp, arg1, arg2)
5136 int error = 0, f;
5137 size_t n, len;
5138 struct mppcb *mpp;
5139 struct mptses *mpte;
5140 struct mptcb *mp_tp;
5141 struct mptsub *mpts;
5142 struct socket *so;
5143 conninfo_mptcp_t mptcpci;
fe8ab488 5144 mptcp_flow_t *flows = NULL;
39236c6e
A
5145
5146 if (req->newptr != USER_ADDR_NULL)
5147 return (EPERM);
5148
5149 lck_mtx_lock(&mtcbinfo.mppi_lock);
5150 n = mtcbinfo.mppi_count;
5151 if (req->oldptr == USER_ADDR_NULL) {
5152 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5153 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
5154 4 * (n + n/8) * sizeof(mptcp_flow_t);
5155 return (0);
5156 }
5157 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 5158 flows = NULL;
39236c6e
A
5159 lck_mtx_lock(&mpp->mpp_lock);
5160 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3e170ce0
A
5161 if (mpp->mpp_flags & MPP_DEFUNCT) {
5162 lck_mtx_unlock(&mpp->mpp_lock);
5163 continue;
5164 }
39236c6e
A
5165 mpte = mptompte(mpp);
5166 VERIFY(mpte != NULL);
5167 mp_tp = mpte->mpte_mptcb;
5168 VERIFY(mp_tp != NULL);
3e170ce0
A
5169
5170 bzero(&mptcpci, sizeof(mptcpci));
5171 MPT_LOCK(mp_tp);
39236c6e 5172 mptcpci.mptcpci_state = mp_tp->mpt_state;
3e170ce0
A
5173 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5174 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5175 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5176 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5177 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5178 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5179 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5180 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5181 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5182 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5183 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvatmark;
5184 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5185 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5186 MPT_UNLOCK(mp_tp);
5187
39236c6e 5188 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
3e170ce0
A
5189 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5190 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5191 mptcpci.mptcpci_flow_offset =
5192 offsetof(conninfo_mptcp_t, mptcpci_flows);
5193
fe8ab488
A
5194 len = sizeof(*flows) * mpte->mpte_numflows;
5195 if (mpte->mpte_numflows != 0) {
5196 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5197 if (flows == NULL) {
5198 lck_mtx_unlock(&mpp->mpp_lock);
5199 break;
5200 }
5201 mptcpci.mptcpci_len = sizeof(mptcpci) +
5202 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5203 error = SYSCTL_OUT(req, &mptcpci,
5204 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5205 } else {
5206 mptcpci.mptcpci_len = sizeof(mptcpci);
 5207 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
 5208 }
5209 if (error) {
5210 lck_mtx_unlock(&mpp->mpp_lock);
5211 FREE(flows, M_TEMP);
5212 break;
5213 }
5214 f = 0;
5215 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5216 MPTS_LOCK(mpts);
5217 so = mpts->mpts_socket;
5218 socket_lock(so, 0);
5219 fill_mptcp_subflow(so, &flows[f], mpts);
5220 socket_unlock(so, 0);
5221 MPTS_UNLOCK(mpts);
5222 f++;
5223 }
5224 lck_mtx_unlock(&mpp->mpp_lock);
5225 if (flows) {
5226 error = SYSCTL_OUT(req, flows, len);
5227 FREE(flows, M_TEMP);
5228 if (error)
5229 break;
5230 }
5231 }
5232 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5233
5234 return (error);
5235}
5236
5237SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
5238 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
5239 "List of active MPTCP connections");
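/*
 * Usage sketch (userland, not part of this file): the pcblist sysctl
 * registered above can be read with sysctlbyname(3) by sizing the
 * buffer first and then walking the variable-length conninfo_mptcp_t
 * records; mptcpci_len covers the header plus the trailing flow
 * records.  Error handling below is illustrative only.
 *
 *	size_t len = 0;
 *	if (sysctlbyname("net.inet.mptcp.pcblist", NULL, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 *	char *buf = malloc(len);
 *	if (buf == NULL)
 *		err(1, "malloc");
 *	if (sysctlbyname("net.inet.mptcp.pcblist", buf, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 *	char *p = buf;
 *	while (p < buf + len) {
 *		conninfo_mptcp_t *ci = (conninfo_mptcp_t *)(void *)p;
 *		printf("state %u flows %u\n", ci->mptcpci_state,
 *		    ci->mptcpci_nflows);
 *		p += ci->mptcpci_len;
 *	}
 *	free(buf);
 */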
5240
5241/*
 5242 * Check the health of the other subflows and perform an mptcp_output()
 5243 * if there is no other active or functional subflow at the time this
 5244 * function is called.
5245 */
5246static void
5247mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
5248{
5249 struct mptsub *from_mpts = NULL;
5250
5251 MPTE_LOCK_ASSERT_HELD(mpte);
5252
5253 MPTS_UNLOCK(to_mpts);
5254
5255 from_mpts = mpte->mpte_active_sub;
5256
5257 if (from_mpts == NULL)
5258 goto output_needed;
5259
5260 MPTS_LOCK(from_mpts);
5261
5262 if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
5263 (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
5264 MPTS_UNLOCK(from_mpts);
5265 goto output_needed;
5266 }
5267
5268 MPTS_UNLOCK(from_mpts);
5269 MPTS_LOCK(to_mpts);
5270 return;
5271
5272output_needed:
5273 mptcp_output(mpte);
5274 MPTS_LOCK(to_mpts);
5275}
5276
5277/*
5278 * Set notsent lowat mark on the MPTCB
5279 */
5280int
5281mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5282{
5283 struct mptcb *mp_tp = NULL;
5284 int error = 0;
5285
5286 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5287 mp_tp = mpte->mpte_mptcb;
5288
5289 if (mp_tp)
5290 mp_tp->mpt_notsent_lowat = optval;
5291 else
5292 error = EINVAL;
5293
5294 return error;
5295}
5296
5297u_int32_t
5298mptcp_get_notsent_lowat(struct mptses *mpte)
5299{
5300 struct mptcb *mp_tp = NULL;
5301
5302 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5303 mp_tp = mpte->mpte_mptcb;
5304
5305 if (mp_tp)
5306 return mp_tp->mpt_notsent_lowat;
5307 else
5308 return 0;
5309}
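/*
 * Usage sketch (userland, not part of this file), assuming the MPTCP
 * socket layer maps the TCP-level TCP_NOTSENT_LOWAT socket option onto
 * the two routines above:
 *
 *	int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	int lowat = 16 * 1024;
 *	(void)setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *	    &lowat, sizeof(lowat));
 *
 * With this in place, the socket is reported writable only while the
 * amount of data not yet sent to the peer is at or below 16KB; see
 * mptcp_notsent_lowat_check() below for the exact test.
 */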
5310
5311int
 5312 mptcp_notsent_lowat_check(struct socket *so)
 {
5313 struct mptses *mpte;
5314 struct mppcb *mpp;
5315 struct mptcb *mp_tp;
5316 struct mptsub *mpts;
5317
5318 int notsent = 0;
5319
5320 mpp = sotomppcb(so);
5321 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5322 return (0);
5323 }
5324
5325 mpte = mptompte(mpp);
5326 mp_tp = mpte->mpte_mptcb;
5327
5328 MPT_LOCK(mp_tp);
5329 notsent = so->so_snd.sb_cc;
5330
5331 if ((notsent == 0) ||
5332 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5333 mp_tp->mpt_notsent_lowat)) {
5334 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5335 "lowat %d notsent %d actual %d \n",
5336 mp_tp->mpt_notsent_lowat, notsent,
5337 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5338 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5339 MPT_UNLOCK(mp_tp);
5340 return (1);
5341 }
5342 MPT_UNLOCK(mp_tp);
5343
5344 /* When Nagle's algorithm is not disabled, it is better
 5345 * to wake up the client even before there is at least one
5346 * maxseg of data to write.
5347 */
5348 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5349 int retval = 0;
5350 MPTS_LOCK(mpts);
5351 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5352 struct socket *subf_so = mpts->mpts_socket;
5353 socket_lock(subf_so, 0);
5354 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
5355
5356 notsent = so->so_snd.sb_cc -
5357 (tp->snd_nxt - tp->snd_una);
5358
5359 if ((tp->t_flags & TF_NODELAY) == 0 &&
5360 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5361 retval = 1;
5362 }
 5363 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
 5364 " nodelay false \n",
5365 mp_tp->mpt_notsent_lowat, notsent),
5366 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5367 socket_unlock(subf_so, 0);
5368 MPTS_UNLOCK(mpts);
5369 return (retval);
5370 }
5371 MPTS_UNLOCK(mpts);
5372 }
5373 return (0);
5374}
5375
5376static void
5377mptcp_get_rtt_measurement(struct mptsub *mpts, struct mptses *mpte)
5378{
5379 MPTE_LOCK_ASSERT_HELD(mpte);
5380 MPTS_LOCK_ASSERT_HELD(mpts);
5381
5382 struct socket *subflow_so = mpts->mpts_socket;
5383 socket_lock(subflow_so, 0);
5384 mpts->mpts_srtt = (intotcpcb(sotoinpcb(subflow_so)))->t_srtt;
5385 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(subflow_so)))->t_rxtcur;
5386 socket_unlock(subflow_so, 0);
5387}
5388
5389/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5390static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5391static uint32_t mptcp_kern_skt_inuse = 0;
5392symptoms_advisory_t mptcp_advisory;
5393
5394static errno_t
5395mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5396 void **unitinfo)
5397{
5398#pragma unused(kctlref, sac, unitinfo)
5399 /*
 5400 * We don't need to do anything here. But we can at least ensure
 5401 * that only one user opens the MPTCP_KERN_CTL_NAME control socket.
5402 */
5403 if (OSCompareAndSwap(0, 1, &mptcp_kern_skt_inuse))
5404 return (0);
5405 else
5406 return (EALREADY);
5407}
5408
5409static errno_t
5410mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5411 void *unitinfo)
5412{
5413#pragma unused(kctlref, kcunit, unitinfo)
5414 if (OSCompareAndSwap(1, 0, &mptcp_kern_skt_inuse)) {
 5415 /* TBD: needs to be locked if the size grows beyond an int */
5416 bzero(&mptcp_advisory, sizeof(mptcp_advisory));
5417 return (0);
5418 }
5419 else {
5420 return (EINVAL);
5421 }
5422}
5423
5424static errno_t
5425mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5426 mbuf_t m, int flags)
5427{
5428#pragma unused(kctlref, kcunit, unitinfo, flags)
5429 symptoms_advisory_t *sa = NULL;
5430
5431 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5432 mbuf_freem(m);
5433 return (EINVAL);
5434 }
5435
 5436 if (mbuf_len(m) < sizeof(*sa)) {
 5437 mbuf_freem(m); /* free the mbuf on this error path too, as above */
 5438 return (EINVAL);
 5439 }
 5440 sa = mbuf_data(m);
5441 if (mptcp_advisory.sa_nwk_status_int != sa->sa_nwk_status_int) {
5442 /*
 5443 * We could use this notification to notify all MPTCP PCBs
 5444 * of the change in network status. But it is difficult to
 5445 * decide whether sending REMOVE_ADDR or MP_PRIO is appropriate,
5446 * given that these are only soft indicators of the network
5447 * state. Leaving this as TBD for now.
5448 */
5449 }
5450
5451 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT) {
5452 mptcplog((LOG_DEBUG, "MPTCP Events: %s wifi %d,%d cell %d,%d\n",
5453 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
5454 sa->sa_cell_status, mptcp_advisory.sa_cell_status),
5455 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG,
5456 MPTCP_LOGLVL_LOG);
5457
5458 if ((sa->sa_wifi_status &
5459 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5460 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
5461 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5462 }
5463
5464 if ((sa->sa_cell_status &
5465 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) !=
5466 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) {
5467 mptcp_advisory.sa_cell_status = sa->sa_cell_status;
5468 }
5469 } else {
5470 mptcplog((LOG_DEBUG, "MPTCP Events: %s NOCOMMENT "
5471 "wifi %d cell %d\n", __func__,
5472 mptcp_advisory.sa_wifi_status,
5473 mptcp_advisory.sa_cell_status),
5474 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
5475 }
 mbuf_freem(m); /* this callback is responsible for the mbuf; release it */
 5476 return (0);
5477}
5478
5479void
5480mptcp_control_register(void)
5481{
5482 /* Set up the advisory control socket */
5483 struct kern_ctl_reg mptcp_kern_ctl;
5484
5485 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5486 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5487 sizeof(mptcp_kern_ctl.ctl_name));
5488 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5489 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5490 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5491 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5492
5493 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5494}
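/*
 * Usage sketch (userland, not part of this file): a privileged daemon
 * such as symptomsd would reach this control through the generic
 * kernel-control interface.  MPTCP_KERN_CTL_NAME, symptoms_advisory_t
 * and the SYMPTOMS_ADVISORY_* flags come from the MPTCP headers; the
 * rest is illustrative.  Note that mptcp_symptoms_ctl_connect() above
 * admits only one client at a time.
 *
 *	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
 *
 *	struct ctl_info info;
 *	bzero(&info, sizeof(info));
 *	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(info.ctl_name));
 *	ioctl(fd, CTLIOCGINFO, &info);		-- resolve the name to a ctl_id
 *
 *	struct sockaddr_ctl sc;
 *	bzero(&sc, sizeof(sc));
 *	sc.sc_len = sizeof(sc);
 *	sc.sc_family = AF_SYSTEM;
 *	sc.ss_sysaddr = AF_SYS_CONTROL;
 *	sc.sc_id = info.ctl_id;
 *	sc.sc_unit = 0;
 *	connect(fd, (struct sockaddr *)&sc, sizeof(sc));
 *
 *	symptoms_advisory_t sa;
 *	bzero(&sa, sizeof(sa));
 *	sa.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_BAD;
 *	send(fd, &sa, sizeof(sa), 0);		-- handled by mptcp_symptoms_ctl_send()
 */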
5495
5496int
5497mptcp_is_wifi_unusable(void)
5498{
 5499 /* a zero return value indicates either no info or that wifi is ok */
5500 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5501}
5502
5503int
5504mptcp_is_cell_unusable(void)
5505{
 5506 /* a zero return value indicates either no info or that cell is ok */
5507 return (mptcp_advisory.sa_cell_status & SYMPTOMS_ADVISORY_CELL_BAD);
5508}
5509
5510struct mptsub*
5511mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best)
5512{
5513 struct mptsub *cellsub = NULL;
5514 struct mptsub *wifisub = NULL;
5515 struct mptsub *wiredsub = NULL;
5516
5517 VERIFY ((best != NULL) && (second_best != NULL));
5518
5519 if (!mptcp_use_symptomsd)
5520 return (NULL);
5521
5522 if (!mptcp_kern_skt_inuse)
5523 return (NULL);
5524
5525 /*
 5526 * There could be devices with more than one wifi interface, or
 5527 * with more than one wired or cell interface.
 5528 * TBD: SymptomsD is unavailable on such platforms as of now.
 5529 * In general, try to prefer the best subflow when possible.
 5530 * Also, SymptomsD sends notifications about wifi only when it
 5531 * is the primary interface.
5532 */
5533 if (best->mpts_linktype & MPTSL_WIFI)
5534 wifisub = best;
5535 else if (best->mpts_linktype & MPTSL_CELL)
5536 cellsub = best;
5537 else if (best->mpts_linktype & MPTSL_WIRED)
5538 wiredsub = best;
5539
5540 /*
5541 * On platforms with wired paths, don't use hints about wifi or cell.
5542 * Currently, SymptomsD is not available on platforms with wired paths.
5543 */
5544 if (wiredsub)
5545 return (NULL);
5546
5547 if ((wifisub == NULL) && (second_best->mpts_linktype & MPTSL_WIFI))
5548 wifisub = second_best;
5549
5550 if ((cellsub == NULL) && (second_best->mpts_linktype & MPTSL_CELL))
5551 cellsub = second_best;
5552
5553 if ((wiredsub == NULL) && (second_best->mpts_linktype & MPTSL_WIRED))
5554 wiredsub = second_best;
5555
5556 if ((wifisub == best) && mptcp_is_wifi_unusable()) {
5557 tcpstat.tcps_mp_sel_symtomsd++;
5558 if (mptcp_is_cell_unusable()) {
5559 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5560 " suggests both Wifi and Cell are bad. Wired %s.",
5561 (wiredsub == NULL) ? "none" : "present"),
5562 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5563 return (wiredsub);
5564 } else {
5565 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5566 " suggests Wifi bad, Cell good. Wired %s.",
5567 (wiredsub == NULL) ? "none" : "present"),
5568 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5569 return ((wiredsub != NULL) ? wiredsub : cellsub);
5570 }
5571 }
5572
5573 if ((cellsub == best) && (mptcp_is_cell_unusable())) {
5574 tcpstat.tcps_mp_sel_symtomsd++;
5575 if (mptcp_is_wifi_unusable()) {
5576 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5577 " suggests both Cell and Wifi are bad. Wired %s.",
5578 (wiredsub == NULL) ? "none" : "present"),
5579 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5580 return (wiredsub);
5581 } else {
5582 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5583 " suggests Cell bad, Wifi good. Wired %s.",
5584 (wiredsub == NULL) ? "none" : "present"),
5585 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5586 return ((wiredsub != NULL) ? wiredsub : wifisub);
5587 }
5588 }
5589
 5590 /* either little is known about the state of the network, or wifi is good */
5591 return (NULL);
5592}
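/*
 * Summary of the hint logic above, derived from the code for reference:
 *
 *	best subflow	advisory state		returned override
 *	wired		any			NULL (hints are not used)
 *	wifi		wifi bad, cell ok	wired second_best if any, else the cell subflow
 *	wifi		wifi bad, cell bad	wired second_best if any, else NULL
 *	cell		cell bad, wifi ok	wired second_best if any, else the wifi subflow
 *	cell		cell bad, wifi bad	wired second_best if any, else NULL
 *	otherwise	-			NULL (no override)
 */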
5593
 5594 /* If TFO data is successfully acked, it must be dropped from the MPTCP socket */
5595static void
5596mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
5597{
5598 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5599 struct socket *so = mpts->mpts_socket;
5600 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5601 struct mptcb *mp_tp = mpte->mpte_mptcb;
5602
5603 /* If data was sent with SYN, rewind state */
5604 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5605 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5606 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5607 MPT_LOCK(mp_tp);
5608 u_int64_t mp_droplen = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
5609 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
5610 VERIFY(mp_droplen <= (UINT_MAX));
5611 VERIFY(mp_droplen >= tcp_droplen);
5612
5613 if (mp_droplen > tcp_droplen) {
5614 /* handle partial TCP ack */
5615 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5616 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
5617 mpts->mpts_sndnxt = mp_tp->mpt_sndnxt;
5618 mp_droplen = tcp_droplen;
5619 } else {
5620 /* all data on SYN was acked */
5621 mpts->mpts_rel_seq = 1;
5622 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5623 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
5624 }
5625 mp_tp->mpt_sndmax -= tcp_droplen;
5626
5627 MPT_UNLOCK(mp_tp);
5628 if (mp_droplen != 0) {
5629 VERIFY(mp_so->so_snd.sb_mb != NULL);
5630 sbdrop(&mp_so->so_snd, (int)mp_droplen);
5631 }
5632 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d "
5633 "TFO tcp len %d mptcp len %d\n", __func__,
5634 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid,
5635 tcp_droplen, mp_droplen),
5636 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5637 }
5638}
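/*
 * Illustrative numbers for the rewind above: with mp_droplen = 100 and
 * tcp_droplen = 60 (only 60 of the 100 bytes that rode on the SYN were
 * covered by the peer's ACK), the partial-ack path rewinds mpt_sndnxt
 * and mpts_sndnxt to mpt_snduna + 40, clamps mp_droplen to 60, trims
 * 60 bytes from mpt_sndmax and from the MPTCP send buffer, and sets
 * SOF1_TFO_REWIND so that the unacked 40 bytes get transmitted again.
 * Had all 100 bytes been acked, mpts_rel_seq would be set to 1 and both
 * send pointers reset to mpt_snduna instead.
 */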
5639