]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/mptcp_subr.c
xnu-4570.31.3.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
CommitLineData
39236c6e 1/*
5ba3f43e 2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
39236c6e
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
5ba3f43e
A
29#include <kern/locks.h>
30#include <kern/policy_internal.h>
31#include <kern/zalloc.h>
32
33#include <mach/sdt.h>
34
35#include <sys/domain.h>
36#include <sys/kdebug.h>
37#include <sys/kern_control.h>
39236c6e
A
38#include <sys/kernel.h>
39#include <sys/mbuf.h>
40#include <sys/mcache.h>
5ba3f43e
A
41#include <sys/param.h>
42#include <sys/proc.h>
43#include <sys/protosw.h>
39236c6e
A
44#include <sys/resourcevar.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
39236c6e 47#include <sys/sysctl.h>
5ba3f43e
A
48#include <sys/syslog.h>
49#include <sys/systm.h>
39236c6e 50
5ba3f43e 51#include <net/content_filter.h>
39236c6e 52#include <net/if.h>
3e170ce0 53#include <net/if_var.h>
39236c6e
A
54#include <netinet/in.h>
55#include <netinet/in_pcb.h>
56#include <netinet/in_var.h>
57#include <netinet/tcp.h>
58#include <netinet/tcp_fsm.h>
59#include <netinet/tcp_seq.h>
60#include <netinet/tcp_var.h>
61#include <netinet/mptcp_var.h>
62#include <netinet/mptcp.h>
5ba3f43e 63#include <netinet/mptcp_opt.h>
39236c6e
A
64#include <netinet/mptcp_seq.h>
65#include <netinet/mptcp_timer.h>
66#include <libkern/crypto/sha1.h>
67#if INET6
68#include <netinet6/in6_pcb.h>
69#include <netinet6/ip6protosw.h>
70#endif /* INET6 */
71#include <dev/random/randomdev.h>
72
73/*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
39236c6e
A
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
5ba3f43e 96 * subflow. This gets decremented prior to the subflow's destruction.
39236c6e 97 *
5ba3f43e
A
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
39236c6e 100 *
5ba3f43e
A
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104 * of the MPTCP-socket.
39236c6e
A
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
5ba3f43e 109 * of the subflows have been destroyed.
39236c6e
A
110 */
111
fe8ab488 112static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
39236c6e 113static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
39236c6e
A
114
115static uint32_t mptcp_gc(struct mppcbinfo *);
39236c6e
A
116static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
5ba3f43e
A
118static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
39236c6e
A
120static void mptcp_subflow_rupcall(struct socket *, void *, int);
121static void mptcp_subflow_input(struct mptses *, struct mptsub *);
122static void mptcp_subflow_wupcall(struct socket *, void *, int);
5ba3f43e
A
123static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
124static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
125static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
126
127static void mptcp_subflow_abort(struct mptsub *, int);
128
129static void mptcp_send_dfin(struct socket *so);
39236c6e
A
130
/*
 * Possible return values for subflow event handlers. Note that success
 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
 * indicate errors or actions which require immediate attention; they will
 * prevent the rest of the handlers from processing their respective events
 * until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE		= 1,	/* delete this subflow */
	MPTS_EVRET_OK			= 2,	/* OK */
	MPTS_EVRET_CONNECT_PENDING	= 3,	/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,	/* abort all but preferred */
} ev_ret_t;
144
3e170ce0 145static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
5ba3f43e
A
146static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
147static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
148static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
149static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
150static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
151static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
152static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
153static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
154static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
155static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
156static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
fe8ab488 157
39236c6e
A
158static const char *mptcp_evret2str(ev_ret_t);
159
5ba3f43e
A
160static void mptcp_do_sha1(mptcp_key_t *, char *);
161static void mptcp_init_local_parms(struct mptses *);
39236c6e
A
162
163static unsigned int mptsub_zone_size; /* size of mptsub */
164static struct zone *mptsub_zone; /* zone for mptsub */
165
166static unsigned int mptopt_zone_size; /* size of mptopt */
167static struct zone *mptopt_zone; /* zone for mptopt */
168
169static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
170static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
171
172struct mppcbinfo mtcbinfo;
173
39236c6e
A
174#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
175#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
176
177SYSCTL_DECL(_net_inet);
178
179SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
180
5ba3f43e 181uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
3e170ce0
A
182SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
183 &mptcp_dbg_area, 0, "MPTCP debug area");
184
5ba3f43e 185uint32_t mptcp_dbg_level = 1;
3e170ce0
A
186SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
187 &mptcp_dbg_level, 0, "MPTCP debug level");
188
39236c6e
A
189SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
190 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
191
39236c6e
A
192static struct protosw mptcp_subflow_protosw;
193static struct pr_usrreqs mptcp_subflow_usrreqs;
194#if INET6
195static struct ip6protosw mptcp_subflow_protosw6;
196static struct pr_usrreqs mptcp_subflow_usrreqs6;
197#endif /* INET6 */
198
5ba3f43e
A
199static uint8_t mptcp_create_subflows_scheduled;
200
3e170ce0
A
/*
 * One entry in the subflow-event dispatch table: a sofilt hint bit
 * (SO_FILT_HINT_*) paired with the handler invoked when that bit is set.
 * Handlers receive the session, the subflow, an in/out hint mask to
 * propagate events upward, and the specific event bit being processed.
 */
typedef struct mptcp_subflow_event_entry {
	uint64_t	sofilt_hint_mask;	/* SO_FILT_HINT_* bit this entry handles */
	ev_ret_t	(*sofilt_hint_ev_hdlr)(
			    struct mptses *mpte,
			    struct mptsub *mpts,
			    uint64_t *p_mpsofilt_hint,
			    uint64_t event);
} mptsub_ev_entry_t;
209
5ba3f43e
A
210static uint8_t mptcp_cellicon_is_set;
211static uint32_t mptcp_last_cellicon_set;
212#define MPTCP_CELLICON_TOGGLE_RATE (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
213
490019cf
A
/*
 * Table mapping subflow events to their handlers; entries are evaluated
 * in array order, so earlier entries take effect first.
 *
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		/* connection reset is simply propagated to the MP socket */
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
272
39236c6e
A
/*
 * Protocol pr_init callback.
 *
 * One-time initialization of the MPTCP subsystem: clones the TCP protosw
 * entries (v4 and v6) into subflow-specific copies with overridden
 * soreceive/sosend/rcvoob user requests, sets up the global mtcbinfo PCB
 * info (zone, lock, GC and timer callbacks), and creates the zones used
 * for subflows, socket options and subflow-auth entries.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	/* clone the TCP protosw, then override the subflow-specific hooks */
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	/* baseline for the cell-icon toggle rate limiter */
	mptcp_last_cellicon_set = tcp_now;
}
396
397int
398mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
399{
400 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
401
402 int i, index = -1;
403
404 if (ifp == NULL) {
405 mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
406 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
407 return (-1);
408 }
409
410 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
411 if (stats[i].ifindex == IFSCOPE_NONE) {
412 if (index < 0)
413 index = i;
414 continue;
415 }
416
417 if (stats[i].ifindex == ifp->if_index) {
418 index = i;
419 return (index);
420 }
421 }
422
423 if (index != -1) {
424 stats[index].ifindex = ifp->if_index;
425 if (stats[index].is_expensive == 0)
426 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
427 }
428
429 return (index);
430}
431
432void
433mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
434{
435 int index;
436
437 tcpstat.tcps_mp_switches++;
438 mpte->mpte_subflow_switches++;
439
440 index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
441
442 if (index != -1)
443 mpte->mpte_itfstats[index].switches++;
444}
445
446/*
447 * Flushes all recorded socket options from an MP socket.
448 */
449static void
450mptcp_flush_sopts(struct mptses *mpte)
451{
452 struct mptopt *mpo, *tmpo;
453
454 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
455 mptcp_sopt_remove(mpte, mpo);
456 mptcp_sopt_free(mpo);
457 }
458 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
39236c6e
A
459}
460
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The session (mptses) and the MPTCP PCB (mptcb) are not allocated here;
 * they live inside the same mpp_mtp memory block as the Multipath PCB,
 * so this only zero-fills and wires up the three structures.  Always
 * returns 0.
 */
int
mptcp_sescreate(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* carve session and PCB out of the enclosing mpp_mtp block */
	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	/* start with the embedded interface-info array; grown on demand */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return (0);
}
502
503static void
504mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
505 uint64_t *cellbytes, uint64_t *allbytes)
506{
507 int64_t mycellbytes = 0;
508 uint64_t myallbytes = 0;
509 int i;
510
511 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
512 if (mpte->mpte_itfstats[i].is_expensive) {
513 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
514 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
515 }
516
517 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
518 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
519 }
520
521 if (initial_cell) {
522 mycellbytes -= mpte->mpte_init_txbytes;
523 mycellbytes -= mpte->mpte_init_txbytes;
524 }
525
526 if (mycellbytes < 0) {
527 mptcplog((LOG_ERR, "%s cellbytes is %d\n", __func__, mycellbytes),
528 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
529 *cellbytes = 0;
530 *allbytes = 0;
531 } else {
532 *cellbytes = mycellbytes;
533 *allbytes = myallbytes;
534 }
535}
536
/*
 * Record end-of-session statistics, keyed by the session's service type
 * (handover / interactive / aggregate).  For each type a first-party and
 * a regular attempt counter is bumped; success counters are split by
 * whether the initial subflow was cellular, and cross-interface usage
 * (wifi-from-cell / cell-from-wifi) is tracked.  On a successful
 * handshake, the session's cellular and total byte counts are folded
 * into the per-type aggregate counters.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success)
				tcpstat.tcps_mptcp_fp_aggregate_success++;
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	/* started on cell but managed to use wifi at some point */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
		tcpstat.tcps_mptcp_back_to_wifi++;
}
644
/*
 * Destroy an MPTCP session.
 *
 * Records final statistics, clears the cellular status-bar icon, frees
 * all recorded socket options and any over-sized interface-info array,
 * and drops any pending reinjection mbufs.  All subflows must already
 * be gone by the time this runs (verified below).
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	mptcpstats_session_wrapup(mpte);

	mptcp_unset_cellicon();

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	/* only free if it outgrew the embedded array (see mptcp_sescreate) */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
		_FREE(mpte->mpte_itfinfo, M_TEMP);

	mpte->mpte_itfinfo = NULL;

	m_freem_list(mpte->mpte_reinjectq);

	/*
	 * MPTCP Protocol Control Block section
	 */
	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}
681
5ba3f43e
A
682static boolean_t
683mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
39236c6e 684{
5ba3f43e
A
685 return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
686 mp_tp->mpt_state < MPTCPS_TIME_WAIT &&
687 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
688}
39236c6e 689
5ba3f43e
A
/*
 * Synthesize an IPv6 address for an IPv4 destination by embedding the
 * IPv4 address into the given NAT64 prefix (in place, into *addr).
 * The byte offsets below follow the IPv4-embedded IPv6 address layout
 * of RFC 6052: for prefixes shorter than /96 the embedded address
 * straddles octet 8 (the reserved 'u' octet), which is skipped —
 * hence the writes resume at ptr + 9.
 *
 * Returns 0 on success, -1 if the IPv4 address must not be embedded
 * (special-use ranges, or private/shared space with the well-known
 * prefix).  Panics on an invalid prefix length.
 */
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00},
	};
	char buf[MAX_IPv6_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	char *ptr = (char *)addr;

	/* never embed special-use IPv4 ranges */
	if (IN_ZERONET(addrv4->s_addr) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(addrv4->s_addr) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(addrv4->s_addr) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(addrv4->s_addr) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(addrv4->s_addr) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(addrv4->s_addr) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return (-1);
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(addrv4->s_addr) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(addrv4->s_addr)) // 100.64.0.0/10 Shared Address Space
			return (-1);
	}

	/* embed the 4 IPv4 octets at the RFC 6052 position for this prefix */
	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u\n", len);
	}

	mptcplog((LOG_DEBUG, "%s: nat64prefix-len %u synthesized %s\n", __func__,
	    len, inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf))),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	return (0);
}
752
/*
 * Walk the session's known interfaces and create a subflow on every
 * usable interface that does not yet carry one.  In handover mode a
 * healthy non-cellular subflow suppresses creation of new ones.  For an
 * IPv4 destination on a v6-only interface, a NAT64-synthesized IPv6
 * destination is used instead.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp))
		return;

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info;
		struct mptsub *mpts;
		uint32_t ifindex;
		int found = 0;

		info = &mpte->mpte_itfinfo[i];

		if (info->no_mptcp_support)
			continue;

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE)
			continue;

		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

			if (ifp == NULL)
				continue;

			if (ifp->if_index == ifindex &&
			    !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = 1;
				break;
			}

			/*
			 * In Handover mode, only create cell subflow if
			 * 1. Wi-Fi Assist is active
			 * 2. Symptoms marked WiFi as weak
			 * 3. We are experiencing RTOs or we are not sending data.
			 *
			 * This covers the scenario, where:
			 * 1. We send and get retransmission timeouts (thus,
			 *    we confirmed that WiFi is indeed bad).
			 * 2. We are not sending and the server tries to send.
			 *    Establishing a cell-subflow gives the server a
			 *    chance to send us some data over cell if WiFi
			 *    is dead. We establish the subflow with the
			 *    backup-bit set, so the server is not allowed to
			 *    send on this subflow as long as WiFi is providing
			 *    good performance.
			 */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
			    !IFNET_IS_CELLULAR(ifp) &&
			    !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
			    (!mptcp_is_wifi_unusable() ||
			    (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh &&
			    mptetoso(mpte)->so_snd.sb_cc))) {
				mptcplog((LOG_DEBUG, "%s handover, wifi state %u rxt %u ifindex %u this %u\n",
				    __func__, mptcp_is_wifi_unusable(), sototcpcb(mpts->mpts_socket)->t_rxtshift, ifindex,
				    ifp->if_index),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
				found = 1;
				break;
			}
		}

		/* third-party apps need Symptoms' permission before adding */
		if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		if (!found) {
			struct sockaddr *dst = &mpte->mpte_dst;
			struct sockaddr_in6 nat64pre;

			/* v4 destination on a v6-only interface: go via NAT64 */
			if (mpte->mpte_dst.sa_family == AF_INET &&
			    !info->has_v4_conn && info->has_v6_conn) {
				struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
				struct ifnet *ifp;
				int error, j;

				bzero(&nat64pre, sizeof(struct sockaddr_in6));

				ifnet_head_lock_shared();
				ifp = ifindex2ifnet[ifindex];
				ifnet_head_done();

				error = ifnet_get_nat64prefix(ifp, nat64prefixes);
				if (error) {
					mptcplog((LOG_ERR, "%s: no NAT64-prefix on itf %s, error %d\n",
					    __func__, ifp->if_name, error),
					    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
					continue;
				}

				/* pick the first valid prefix entry */
				for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
					if (nat64prefixes[j].prefix_len != 0)
						break;
				}

				/*
				 * NOTE(review): panics if ifnet_get_nat64prefix()
				 * succeeded yet no entry has a non-zero prefix_len —
				 * presumably success guarantees at least one; confirm.
				 */
				VERIFY(j < NAT64_MAX_NUM_PREFIXES);

				error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
				    nat64prefixes[j].prefix_len,
				    &mpte->__mpte_dst_v4.sin_addr);
				if (error != 0) {
					mptcplog((LOG_INFO, "%s: cannot synthesize this addr\n", __func__),
					    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
					continue;
				}

				memcpy(&nat64pre.sin6_addr,
				    &nat64prefixes[j].ipv6_prefix,
				    sizeof(nat64pre.sin6_addr));
				nat64pre.sin6_len = sizeof(struct sockaddr_in6);
				nat64pre.sin6_family = AF_INET6;
				nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
				nat64pre.sin6_flowinfo = 0;
				nat64pre.sin6_scope_id = 0;

				dst = (struct sockaddr *)&nat64pre;
			}

			mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
		}
	}
}
888
/*
 * Based on the MPTCP Service-type and the state of the subflows, we
 * will destroy subflows here.
 *
 * Handover mode only: once at least one non-cellular subflow is working
 * (established, no retransmission backoff while sending, or WiFi is
 * reported fine), every cellular subflow is torn down via a MUSTRST
 * event and the cellular status-bar icon is cleared.
 */
static void
mptcp_check_subflows_and_remove(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;
	int found_working_subflow = 0, removed_some = 0;
	int wifi_unusable = mptcp_is_wifi_unusable();

	if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
		return;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
			continue;

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED)
			continue;

		/* Either this subflow is in good condition while we try to send */
		if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc)
			found_working_subflow = 1;

		/* Or WiFi is fine */
		if (!wifi_unusable)
			found_working_subflow = 1;
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow)
		return;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Only remove cellular subflows */
		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp))
			continue;

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
		removed_some = 1;
	}

	if (removed_some)
		mptcp_unset_cellicon();
}
952
953static void
954mptcp_remove_subflows(struct mptses *mpte)
955{
956 struct mptsub *mpts, *tmpts;
957
958 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
959 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
960 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
961
962 soevent(mpts->mpts_socket,
963 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
964 }
965 }
966}
967
/*
 * Deferred worker (scheduled via timeout() from mptcp_sched_create_subflows)
 * that walks all MPTCP PCBs and services those marked MPP_CREATE_SUBFLOWS.
 * Runs outside of NECP locks; see the comment above
 * mptcp_sched_create_subflows() for why this indirection exists.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
		mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	/* Iterate over all MPTCP connections */

	/* Global PCB-list lock first, then the per-connection lock below */
	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
			continue;

		mpp_lock(mpp);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		/* The scheduler took a use-count; it must still be there */
		VERIFY(mp_so->so_usecount > 0);

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1010
1011/*
1012 * We need this because we are coming from an NECP-event. This event gets posted
1013 * while holding NECP-locks. The creation of the subflow however leads us back
1014 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1015 * So, we would deadlock there as we already hold the NECP-lock.
1016 *
1017 * So, let's schedule this separately. It also gives NECP the chance to make
1018 * progress, without having to wait for MPTCP to finish its subflow creation.
1019 */
1020void
1021mptcp_sched_create_subflows(struct mptses *mpte)
1022{
1023 struct mppcb *mpp = mpte->mpte_mppcb;
1024 struct mptcb *mp_tp = mpte->mpte_mptcb;
1025 struct socket *mp_so = mpp->mpp_socket;
1026
1027 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1028 mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
1029 __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
1030 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1031 return;
1032 }
1033
1034 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1035 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1036 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1037 }
1038
1039 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled))
1040 return;
1041
1042 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1043 timeout(mptcp_create_subflows, NULL, hz/10);
1044}
1045
1046/*
1047 * Allocate an MPTCP socket option structure.
1048 */
1049struct mptopt *
1050mptcp_sopt_alloc(int how)
1051{
1052 struct mptopt *mpo;
1053
1054 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
1055 zalloc_noblock(mptopt_zone);
1056 if (mpo != NULL) {
1057 bzero(mpo, mptopt_zone_size);
1058 }
1059
1060 return (mpo);
1061}
1062
1063/*
1064 * Free an MPTCP socket option structure.
1065 */
1066void
1067mptcp_sopt_free(struct mptopt *mpo)
1068{
1069 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1070
1071 zfree(mptopt_zone, mpo);
1072}
1073
1074/*
1075 * Add a socket option to the MPTCP socket option list.
1076 */
1077void
1078mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1079{
1080 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1081 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1082 mpo->mpo_flags |= MPOF_ATTACHED;
1083 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1084}
1085
1086/*
1087 * Remove a socket option from the MPTCP socket option list.
1088 */
1089void
1090mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1091{
1092 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1093 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1094 mpo->mpo_flags &= ~MPOF_ATTACHED;
1095 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1096}
1097
1098/*
1099 * Search for an existing <sopt_level,sopt_name> socket option.
1100 */
1101struct mptopt *
1102mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1103{
1104 struct mptopt *mpo;
1105
1106 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1107
1108 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1109 if (mpo->mpo_level == sopt->sopt_level &&
1110 mpo->mpo_name == sopt->sopt_name)
1111 break;
1112 }
1113 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
1114
1115 return (mpo);
1116}
1117
1118/*
1119 * Allocate a MPTCP subflow structure.
1120 */
1121static struct mptsub *
1122mptcp_subflow_alloc(void)
1123{
1124 struct mptsub *mpts = zalloc(mptsub_zone);
1125
1126 if (mpts == NULL)
1127 return (NULL);
1128
1129 bzero(mpts, mptsub_zone_size);
39236c6e
A
1130 return (mpts);
1131}
1132
1133/*
1134 * Deallocate a subflow structure, called when all of the references held
1135 * on it have been released. This implies that the subflow has been deleted.
1136 */
5ba3f43e 1137static void
39236c6e
A
1138mptcp_subflow_free(struct mptsub *mpts)
1139{
39236c6e
A
1140 VERIFY(mpts->mpts_refcnt == 0);
1141 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1142 VERIFY(mpts->mpts_mpte == NULL);
1143 VERIFY(mpts->mpts_socket == NULL);
1144
813fb2f6
A
1145 if (mpts->mpts_src != NULL) {
1146 FREE(mpts->mpts_src, M_SONAME);
1147 mpts->mpts_src = NULL;
39236c6e 1148 }
39236c6e
A
1149
1150 zfree(mptsub_zone, mpts);
1151}
1152
5ba3f43e
A
1153static void
1154mptcp_subflow_addref(struct mptsub *mpts)
1155{
1156 if (++mpts->mpts_refcnt == 0)
1157 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1158 /* NOTREACHED */
1159}
1160
1161static void
1162mptcp_subflow_remref(struct mptsub *mpts)
1163{
1164 if (mpts->mpts_refcnt == 0) {
1165 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
1166 /* NOTREACHED */
1167 }
1168 if (--mpts->mpts_refcnt > 0)
1169 return;
1170
1171 /* callee will unlock and destroy lock */
1172 mptcp_subflow_free(mpts);
1173}
1174
/*
 * Link a freshly created subflow socket to its MPTCP connection: mark the
 * TCP PCB and socket as MPTCP-owned, insert the subflow into the
 * connection's list, and take the two references (list + socket).
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts); /* for subflow socket */
}
1204
1205static void
1206mptcp_subflow_necp_cb(void *handle, __unused int action,
1207 __unused struct necp_client_flow *flow)
1208{
1209 struct inpcb *inp = (struct inpcb *)handle;
1210 struct socket *so = inp->inp_socket;
1211 struct mptsub *mpts;
1212 struct mptses *mpte;
1213
1214 if (action != NECP_CLIENT_CBACTION_NONVIABLE)
1215 return;
1216
1217 /*
1218 * The socket is being garbage-collected. There is nothing to be done
1219 * here.
1220 */
1221 if (so->so_usecount == 0)
1222 return;
1223
1224 socket_lock(so, 1);
1225
1226 /* Check again after we acquired the lock. */
1227 if (so->so_usecount == 0)
1228 goto out;
1229
1230 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1231 mpts = sototcpcb(so)->t_mpsub;
1232
1233 mptcplog((LOG_DEBUG, "%s: Subflow became non-viable", __func__),
1234 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
1235
1236 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1237
1238 mptcp_sched_create_subflows(mpte);
1239
1240 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
1241 flow->viable = 1;
1242
1243out:
1244 socket_unlock(so, 1);
1245}
1246
39236c6e
A
1247/*
1248 * Create an MPTCP subflow socket.
1249 */
1250static int
1251mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
5ba3f43e 1252 struct socket **so)
39236c6e 1253{
5ba3f43e 1254 lck_mtx_t *subflow_mtx;
39236c6e 1255 struct mptopt smpo, *mpo, *tmpo;
5ba3f43e 1256 struct proc *p;
39236c6e
A
1257 struct socket *mp_so;
1258 int error;
1259
1260 *so = NULL;
5ba3f43e
A
1261 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1262 mp_so = mptetoso(mpte);
1263
1264 p = proc_find(mp_so->last_pid);
1265 if (p == PROC_NULL) {
1266 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1267 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1268
1269 return (ESRCH);
1270 }
39236c6e
A
1271
1272 /*
1273 * Create the subflow socket (multipath subflow, non-blocking.)
1274 *
1275 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1276 * socket; it will be cleared when the socket is peeled off or closed.
1277 * It also indicates to the underlying TCP to handle MPTCP options.
1278 * A multipath subflow socket implies SS_NOFDREF state.
1279 */
5ba3f43e
A
1280
1281 /*
1282 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1283 * the ipi-lock. We cannot hold the socket-lock at that point.
1284 */
1285 mpte_unlock(mpte);
1286 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1287 SOCF_ASYNC, PROC_NULL);
1288 mpte_lock(mpte);
1289 if (error) {
1290 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
1291 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
1292 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1293
1294 proc_rele(p);
1295
1296 mptcp_subflow_free(mpts);
39236c6e
A
1297 return (error);
1298 }
1299
5ba3f43e
A
1300 /*
1301 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1302 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1303 * Which is why we also need to get the lock with pr_getlock, as after
1304 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1305 */
1306 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1307 lck_mtx_lock(subflow_mtx);
1308
1309 /*
1310 * Must be the first thing we do, to make sure all pointers for this
1311 * subflow are set.
1312 */
1313 mptcp_subflow_attach(mpte, mpts, *so);
1314
1315 /*
1316 * A multipath subflow socket is used internally in the kernel,
1317 * therefore it does not have a file desciptor associated by
1318 * default.
1319 */
1320 (*so)->so_state |= SS_NOFDREF;
1321
1322 lck_mtx_unlock(subflow_mtx);
39236c6e
A
1323
1324 /* prevent the socket buffers from being compressed */
1325 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1326 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1327
490019cf
A
1328 /* Inherit preconnect and TFO data flags */
1329 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
1330 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
490019cf
A
1331 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
1332 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1333
5ba3f43e
A
1334 /* Inherit uuid and create the related flow. */
1335 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1336 struct mptcb *mp_tp = mpte->mpte_mptcb;
1337
1338 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1339
1340 /*
1341 * A note on the unlock: With MPTCP, we do multiple times a
1342 * necp_client_register_socket_flow. This is problematic,
1343 * because now the lock-ordering guarantee (first necp-locks,
1344 * then socket-locks) is no more respected. So, we need to
1345 * unlock here.
1346 */
1347 mpte_unlock(mpte);
1348 error = necp_client_register_socket_flow(mp_so->last_pid,
1349 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1350 mpte_lock(mpte);
1351
1352 if (error)
1353 goto out_err;
1354
1355 /* Possible state-change during the unlock above */
1356 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1357 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
1358 goto out_err;
1359
1360 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1361 } else {
1362 mptcplog((LOG_NOTICE, "%s: uuid is not set!\n"),
1363 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1364 }
1365
1366 /* inherit the other socket options */
39236c6e
A
1367 bzero(&smpo, sizeof (smpo));
1368 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1369 smpo.mpo_level = SOL_SOCKET;
1370 smpo.mpo_intval = 1;
1371
1372 /* disable SIGPIPE */
1373 smpo.mpo_name = SO_NOSIGPIPE;
5ba3f43e
A
1374 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1375 goto out_err;
39236c6e
A
1376
1377 /* find out if the subflow's source address goes away */
1378 smpo.mpo_name = SO_NOADDRERR;
5ba3f43e
A
1379 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1380 goto out_err;
39236c6e
A
1381
1382 /* enable keepalive */
1383 smpo.mpo_name = SO_KEEPALIVE;
5ba3f43e
A
1384 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1385 goto out_err;
39236c6e
A
1386
1387 smpo.mpo_level = IPPROTO_TCP;
1388 smpo.mpo_intval = mptcp_subflow_keeptime;
1389 smpo.mpo_name = TCP_KEEPALIVE;
5ba3f43e
A
1390 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1391 goto out_err;
1392
1393 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1394 /*
1395 * On secondary subflows we might need to set the cell-fallback
1396 * flag (see conditions in mptcp_subflow_sosetopt).
1397 */
1398 smpo.mpo_level = SOL_SOCKET;
1399 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1400 smpo.mpo_intval = 1;
1401 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1402 goto out_err;
1403 }
39236c6e
A
1404
1405 /* replay setsockopt(2) on the subflow sockets for eligible options */
1406 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1407 int interim;
1408
1409 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
1410 continue;
1411
1412 /*
1413 * Skip those that are handled internally; these options
1414 * should not have been recorded and marked with the
1415 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1416 */
1417 if (mpo->mpo_level == SOL_SOCKET &&
1418 (mpo->mpo_name == SO_NOSIGPIPE ||
1419 mpo->mpo_name == SO_NOADDRERR ||
1420 mpo->mpo_name == SO_KEEPALIVE))
1421 continue;
1422
1423 interim = (mpo->mpo_flags & MPOF_INTERIM);
5ba3f43e
A
1424 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1425 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
1426 " sopt %s val %d interim record removed\n", __func__,
39236c6e 1427 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
1428 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1429 mpo->mpo_intval),
3e170ce0 1430 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1431 mptcp_sopt_remove(mpte, mpo);
1432 mptcp_sopt_free(mpo);
1433 continue;
1434 }
1435 }
1436
1437 /*
1438 * We need to receive everything that the subflow socket has,
1439 * so use a customized socket receive function. We will undo
1440 * this when the socket is peeled off or closed.
1441 */
39236c6e
A
1442 switch (dom) {
1443 case PF_INET:
1444 (*so)->so_proto = &mptcp_subflow_protosw;
1445 break;
1446#if INET6
1447 case PF_INET6:
1448 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1449 break;
1450#endif /* INET6 */
1451 default:
1452 VERIFY(0);
1453 /* NOTREACHED */
1454 }
1455
5ba3f43e
A
1456 proc_rele(p);
1457
1458 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1459 int, dom, int, error);
1460
1461 return (0);
39236c6e 1462
5ba3f43e
A
1463out_err:
1464 mptcp_subflow_abort(mpts, error);
1465
1466 proc_rele(p);
1467
1468 mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
1469 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1470
1471 return (error);
1472}
1473
1474/*
1475 * Close an MPTCP subflow socket.
1476 *
1477 * Note that this may be called on an embryonic subflow, and the only
1478 * thing that is guaranteed valid is the protocol-user request.
1479 */
5ba3f43e
A
1480static void
1481mptcp_subflow_soclose(struct mptsub *mpts)
39236c6e 1482{
5ba3f43e
A
1483 struct socket *so = mpts->mpts_socket;
1484
1485 if (mpts->mpts_flags & MPTSF_CLOSED)
1486 return;
39236c6e 1487
5ba3f43e 1488 VERIFY(so != NULL);
39236c6e
A
1489 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1490 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1491
39236c6e
A
1492 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1493 struct socket *, so,
1494 struct sockbuf *, &so->so_rcv,
1495 struct sockbuf *, &so->so_snd,
1496 struct mptses *, mpts->mpts_mpte);
1497
5ba3f43e
A
1498 mpts->mpts_flags |= MPTSF_CLOSED;
1499
1500 if (so->so_retaincnt == 0) {
1501 soclose_locked(so);
1502
1503 return;
1504 } else {
1505 VERIFY(so->so_usecount > 0);
1506 so->so_usecount--;
1507 }
1508
1509 return;
39236c6e
A
1510}
1511
1512/*
1513 * Connect an MPTCP subflow socket.
1514 *
5ba3f43e
A
1515 * Note that in the pending connect case, the subflow socket may have been
1516 * bound to an interface and/or a source IP address which may no longer be
1517 * around by the time this routine is called; in that case the connect attempt
1518 * will most likely fail.
39236c6e
A
1519 */
1520static int
1521mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1522{
5ba3f43e
A
1523 char dbuf[MAX_IPv6_STR_LEN];
1524 struct socket *mp_so, *so;
1525 struct mptcb *mp_tp;
1526 struct sockaddr *dst;
1527 struct proc *p;
39236c6e
A
1528 int af, error;
1529
5ba3f43e 1530 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 1531
5ba3f43e
A
1532 mp_so = mptetoso(mpte);
1533 mp_tp = mpte->mpte_mptcb;
39236c6e 1534
5ba3f43e
A
1535 p = proc_find(mp_so->last_pid);
1536 if (p == PROC_NULL) {
1537 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1538 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e 1539
5ba3f43e 1540 return (ESRCH);
39236c6e
A
1541 }
1542
5ba3f43e
A
1543 so = mpts->mpts_socket;
1544 af = mpts->mpts_dst.sa_family;
1545
1546 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1547 VERIFY(mpts->mpts_socket != NULL);
1548 VERIFY(af == AF_INET || af == AF_INET6);
1549
1550 dst = &mpts->mpts_dst;
1551 mptcplog((LOG_DEBUG, "%s: connectx mp_so 0x%llx dst %s[%d] cid %d [pended %s]\n",
1552 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1553 inet_ntop(af, ((af == AF_INET) ? (void *)&SIN(dst)->sin_addr.s_addr :
1554 (void *)&SIN6(dst)->sin6_addr),
1555 dbuf, sizeof (dbuf)),
1556 ((af == AF_INET) ? ntohs(SIN(dst)->sin_port) : ntohs(SIN6(dst)->sin6_port)),
1557 mpts->mpts_connid,
1558 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ? "YES" : "NO")),
1559 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1560
39236c6e
A
1561 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1562
fe8ab488 1563 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
39037602 1564
39236c6e 1565 /* connect the subflow socket */
5ba3f43e
A
1566 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1567 p, mpts->mpts_ifscope,
1568 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1569
1570 mpts->mpts_iss = sototcpcb(so)->iss;
1571
1572 /* See tcp_connect_complete */
1573 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1574 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1575 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1576 }
39236c6e 1577
fe8ab488
A
1578 /* Allocate a unique address id per subflow */
1579 mpte->mpte_addrid_last++;
1580 if (mpte->mpte_addrid_last == 0)
1581 mpte->mpte_addrid_last++;
1582
5ba3f43e
A
1583 proc_rele(p);
1584
39236c6e
A
1585 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1586 struct mptsub *, mpts, int, error);
5ba3f43e
A
1587 if (error)
1588 mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
1589 __func__, error, mpts->mpts_ifscope),
1590 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1591
1592 return (error);
1593}
1594
1595/*
1596 * MPTCP subflow socket receive routine, derived from soreceive().
1597 */
1598static int
1599mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1600 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1601{
1602#pragma unused(uio)
5ba3f43e 1603 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
39236c6e
A
1604 int flags, error = 0;
1605 struct proc *p = current_proc();
1606 struct mbuf *m, **mp = mp0;
5ba3f43e 1607 boolean_t proc_held = FALSE;
39236c6e 1608
5ba3f43e 1609 mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
39236c6e
A
1610 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1611
1612#ifdef MORE_LOCKING_DEBUG
1613 if (so->so_usecount == 1) {
1614 panic("%s: so=%x no other reference on socket\n", __func__, so);
1615 /* NOTREACHED */
1616 }
1617#endif
1618 /*
1619 * We return all that is there in the subflow's socket receive buffer
1620 * to the MPTCP layer, so we require that the caller passes in the
1621 * expected parameters.
1622 */
5ba3f43e 1623 if (mp == NULL || controlp != NULL)
39236c6e 1624 return (EINVAL);
5ba3f43e 1625
39236c6e
A
1626 *mp = NULL;
1627 if (psa != NULL)
1628 *psa = NULL;
1629 if (flagsp != NULL)
1630 flags = *flagsp &~ MSG_EOR;
1631 else
1632 flags = 0;
1633
5ba3f43e 1634 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
39236c6e 1635 return (EOPNOTSUPP);
5ba3f43e 1636
39236c6e
A
1637 flags |= (MSG_DONTWAIT|MSG_NBIO);
1638
1639 /*
1640 * If a recv attempt is made on a previously-accepted socket
1641 * that has been marked as inactive (disconnected), reject
1642 * the request.
1643 */
1644 if (so->so_flags & SOF_DEFUNCT) {
1645 struct sockbuf *sb = &so->so_rcv;
1646
1647 error = ENOTCONN;
39236c6e
A
1648 /*
1649 * This socket should have been disconnected and flushed
1650 * prior to being returned from sodefunct(); there should
1651 * be no data on its receive list, so panic otherwise.
1652 */
1653 if (so->so_state & SS_DEFUNCT)
1654 sb_empty_assert(sb, __func__);
39236c6e
A
1655 return (error);
1656 }
1657
1658 /*
1659 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1660 * and if so just return to the caller. This could happen when
1661 * soreceive() is called by a socket upcall function during the
1662 * time the socket is freed. The socket buffer would have been
1663 * locked across the upcall, therefore we cannot put this thread
1664 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1665 * we may livelock), because the lock on the socket buffer will
1666 * only be released when the upcall routine returns to its caller.
1667 * Because the socket has been officially closed, there can be
1668 * no further read on it.
1669 *
1670 * A multipath subflow socket would have its SS_NOFDREF set by
1671 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1672 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1673 */
1674 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
5ba3f43e 1675 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
39236c6e 1676 return (0);
39236c6e
A
1677
1678 /*
1679 * For consistency with soreceive() semantics, we need to obey
1680 * SB_LOCK in case some other code path has locked the buffer.
1681 */
1682 error = sblock(&so->so_rcv, 0);
5ba3f43e 1683 if (error != 0)
39236c6e 1684 return (error);
39236c6e
A
1685
1686 m = so->so_rcv.sb_mb;
1687 if (m == NULL) {
1688 /*
1689 * Panic if we notice inconsistencies in the socket's
1690 * receive list; both sb_mb and sb_cc should correctly
1691 * reflect the contents of the list, otherwise we may
1692 * end up with false positives during select() or poll()
1693 * which could put the application in a bad state.
1694 */
1695 SB_MB_CHECK(&so->so_rcv);
1696
1697 if (so->so_error != 0) {
1698 error = so->so_error;
1699 so->so_error = 0;
1700 goto release;
1701 }
1702
5ba3f43e
A
1703 if (so->so_state & SS_CANTRCVMORE) {
1704 goto release;
1705 }
1706
1707 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1708 error = ENOTCONN;
1709 goto release;
1710 }
1711
1712 /*
1713 * MSG_DONTWAIT is implicitly defined and this routine will
1714 * never block, so return EWOULDBLOCK when there is nothing.
1715 */
1716 error = EWOULDBLOCK;
1717 goto release;
1718 }
1719
1720 mptcp_update_last_owner(so, mp_so);
1721
1722 if (mp_so->last_pid != proc_pid(p)) {
1723 p = proc_find(mp_so->last_pid);
1724 if (p == PROC_NULL) {
1725 p = current_proc();
1726 } else {
1727 proc_held = TRUE;
1728 }
1729 }
1730
1731 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1732 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1733 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1734
1735 while (m != NULL) {
5c9f4661 1736 int dlen = 0, dfin = 0, error_out = 0;
5ba3f43e
A
1737 struct mbuf *start = m;
1738 uint64_t dsn;
1739 uint32_t sseq;
1740 uint16_t orig_dlen;
1741 uint16_t csum;
1742
1743 VERIFY(m->m_nextpkt == NULL);
1744
1745 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1746 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
1747 dsn = m->m_pkthdr.mp_dsn;
1748 sseq = m->m_pkthdr.mp_rseq;
1749 csum = m->m_pkthdr.mp_csum;
1750 } else {
1751 /* We did fallback */
5c9f4661 1752 mptcp_adj_rmap(so, m, 0, 0, 0, 0);
5ba3f43e
A
1753
1754 sbfree(&so->so_rcv, m);
1755
1756 if (mp != NULL) {
1757 *mp = m;
1758 mp = &m->m_next;
1759 so->so_rcv.sb_mb = m = m->m_next;
1760 *mp = NULL;
1761
1762 }
1763
1764 if (m != NULL) {
1765 so->so_rcv.sb_lastrecord = m;
1766 } else {
1767 SB_EMPTY_FIXUP(&so->so_rcv);
1768 }
1769
1770 continue;
39236c6e
A
1771 }
1772
5c9f4661
A
1773 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)
1774 dfin = 1;
1775
5ba3f43e
A
1776 /*
1777 * Check if the full mapping is now present
1778 */
5c9f4661 1779 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
5ba3f43e
A
1780 mptcplog((LOG_INFO, "%s not enough data (%u) need %u\n",
1781 __func__, so->so_rcv.sb_cc, dlen),
1782 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1783
1784 if (*mp0 == NULL)
1785 error = EWOULDBLOCK;
39236c6e
A
1786 goto release;
1787 }
1788
5ba3f43e
A
1789 /* Now, get the full mapping */
1790 while (dlen > 0) {
5c9f4661
A
1791 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
1792 error_out = 1;
1793 error = EIO;
1794 dlen = 0;
1795 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1796 break;
1797 }
39236c6e 1798
5ba3f43e
A
1799 dlen -= m->m_len;
1800 sbfree(&so->so_rcv, m);
39236c6e 1801
5ba3f43e
A
1802 if (mp != NULL) {
1803 *mp = m;
1804 mp = &m->m_next;
1805 so->so_rcv.sb_mb = m = m->m_next;
1806 *mp = NULL;
1807 }
1808
5c9f4661
A
1809 if (dlen - dfin == 0)
1810 dlen = 0;
1811
5ba3f43e 1812 VERIFY(dlen <= 0 || m);
39236c6e
A
1813 }
1814
5ba3f43e
A
1815 VERIFY(dlen == 0);
1816
39236c6e 1817 if (m != NULL) {
5ba3f43e 1818 so->so_rcv.sb_lastrecord = m;
39236c6e 1819 } else {
39236c6e
A
1820 SB_EMPTY_FIXUP(&so->so_rcv);
1821 }
5ba3f43e 1822
5c9f4661
A
1823 if (error_out)
1824 goto release;
1825
1826
1827 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
5ba3f43e
A
1828 error = EIO;
1829 *mp0 = NULL;
1830 goto release;
1831 }
1832
39236c6e
A
1833 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1834 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1835 }
1836
1837 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1838 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
39236c6e
A
1839
1840 if (flagsp != NULL)
1841 *flagsp |= flags;
1842
1843release:
5ba3f43e
A
1844 sbunlock(&so->so_rcv, TRUE);
1845
1846 if (proc_held)
1847 proc_rele(p);
1848
39236c6e
A
1849 return (error);
1850
1851}
1852
/*
 * MPTCP subflow socket send routine, derived from sosend().
 *
 * Only the mbuf-chain form is supported: 'top' must be a single
 * PKTF_MPTCP-tagged record of at most UINT16_MAX bytes; addr, uio,
 * control and flags must all be unset (VERIFY'd below).  On failure
 * 'top' is freed here; on success ownership passed to pru_send.
 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/* Account the send to the app owning the MP socket, if we can */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
	if (error)
		goto out;

	/* pru_send consumes 'top'; clear it so the error path won't double-free */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	if (top != NULL)
		m_freem(top);

	if (proc_held)
		proc_rele(p);

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return (error);

}
1937
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * mpte:	MPTCP session; caller holds the MP socket lock.
 * src:		optional local address for the subflow; copied, may be NULL.
 * dst:		remote address; copied into mpts->mpts_dst (sa_len bytes).
 * ifscope:	interface scope to pin the subflow to (semantics of 0 not
 *		visible here -- presumably "unscoped"; confirm against callers).
 * pcid:	optional out-parameter receiving the new subflow's connid.
 *
 * Returns 0 on success or an errno (ENOTCONN if the peer already sent a
 * Data FIN, ENOMEM on allocation failure, or an error from subflow socket
 * creation / connect).
 *
 * Ownership note: once mptcp_subflow_socreate() has been called, ownership
 * of `mpts` has moved to it; the early-return path after that call must NOT
 * free mpts here (see comment at the call site).
 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	struct socket *mp_so, *so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		error = ENOTCONN;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		error = ENOMEM;
		goto out_err;
	}

	/* Take a private copy of the (optional) source address. */
	if (src != NULL) {
		int len = src->sa_len;

		MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
		    M_WAITOK | M_ZERO);
		if (mpts->mpts_src == NULL) {
			mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
			error = ENOMEM;
			goto out_err;
		}
		bcopy(src, mpts->mpts_src, len);
	}

	/* Destination is stored inline in the subflow descriptor. */
	memcpy(&mpts->mpts_dst, dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return (error);

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY)
		mpte->mpte_connid_last++;

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one. Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA)
			mpts->mpts_flags |= MPTSF_TFO_REQD;
	} else {
		/* Joins must wait until the peer is ready for MP_JOIN. */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	if (af == AF_INET || af == AF_INET6) {
		char dbuf[MAX_IPv6_STR_LEN];

		mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
		    "mp_so 0x%llx dst %s[%d] cid %d "
		    "[pending %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
		    (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(&mpts->mpts_dst)->sin_port) :
		    ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
	}

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
		error = mptcp_subflow_soconnectx(mpte, mpts);

	if (error)
		goto out_err_close;

	if (pcid)
		*pcid = mpts->mpts_connid;

	return (0);

out_err_close:
	/* Subflow socket exists by now: abort tears it down properly. */
	mptcp_subflow_abort(mpts, error);

	return (error);

out_err:
	/* Before socreate: we still own mpts and may free it directly. */
	if (mpts)
		mptcp_subflow_free(mpts);

	return (error);
}
2109
5ba3f43e
A
2110void
2111mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
2112{
2113 int index = mptcp_get_statsindex(stats, mpts);
2114
2115 if (index != -1) {
2116 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2117
2118 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2119 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2120 }
2121}
2122
39236c6e
A
/*
 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 *
 * Caller holds the MP socket lock. The subflow must still be ATTACHED;
 * this routine unlinks it, stops all upcalls, and drops the two
 * references noted below. Teardown order matters: upcalls are cleared
 * before the back-pointers are severed so no callback can run against
 * a half-dismantled subflow.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
	    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
	    mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
	    mpts->mpts_flags, mp_so->so_error),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	/* Account this subflow's traffic before the inpcb goes away. */
	mptcpstats_update(mpte->mpte_itfstats, mpts);
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

	/* Unlink from the session's subflow list. */
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts)
		mpte->mpte_active_sub = NULL;

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--;		/* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts);	/* for MPTCP subflow list */
	mptcp_subflow_remref(mpts);	/* for subflow socket */

	/* Sever the TCP pcb's back-pointers into the MPTCP layer. */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2177
2178void
2179mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2180{
2181 struct socket *so = mpts->mpts_socket;
2182 struct mptcb *mp_tp = mpte->mpte_mptcb;
2183 int send_dfin = 0;
2184
2185 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2186 send_dfin = 1;
2187
2188 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2189 (so->so_state & SS_ISCONNECTED)) {
2190 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2191 __func__, mpts->mpts_connid, send_dfin),
2192 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2193
2194 if (send_dfin)
2195 mptcp_send_dfin(so);
2196 soshutdownlock(so, SHUT_WR);
2197 }
2198
2199}
2200
2201static void
2202mptcp_subflow_abort(struct mptsub *mpts, int error)
2203{
2204 struct socket *so = mpts->mpts_socket;
2205 struct tcpcb *tp = sototcpcb(so);
2206
2207 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2208 return;
2209
2210 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2211 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2212
5ba3f43e
A
2213 if (tp->t_state != TCPS_CLOSED)
2214 tcp_drop(tp, error);
2215
2216 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
2217}
2218
/*
 * Disconnect a subflow socket.
 *
 * Caller holds the MP socket lock. Idempotent: a subflow already
 * disconnecting/disconnected returns immediately. For a connected
 * subflow this optionally sends a Data FIN (when the MPTCP state is
 * past CLOSE_WAIT), then shuts down both directions and disconnects.
 * A DISCONNECTED event is always posted at the end, even when the
 * socket path did nothing, because subflow deletion depends on it.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_socket != NULL);

	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
		return;

	/* Mark first so a re-entrant call bails out above. */
	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
		send_dfin = 1;

	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d\n",
		    __func__, mpts->mpts_connid, send_dfin),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		if (send_dfin)
			mptcp_send_dfin(so);
		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}
	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2263
/*
 * Called when the associated subflow socket posted a read event.
 *
 * If the MPTCP pcb is busy, the wakeup is deferred (MPP_SHOULD_RWAKEUP)
 * and replayed later by mptcp_handle_deferred_upcalls(). Otherwise all
 * subflows of the session are drained, not only the one that fired.
 * Note that `mpts` is deliberately reused as the iteration variable.
 */
static void
mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
{
#pragma unused(so, waitf)
	struct mptsub *mpts = arg, *tmpts;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/*
		 * Hold both the subflow descriptor and the socket across
		 * the input call; mptcp_subflow_input() may drop state.
		 */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);		/* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
}
2302
/*
 * Subflow socket input.
 *
 * Drains pending data from one subflow socket and feeds it to
 * mptcp_input() for data-level reassembly. Runs with the MP socket
 * lock held (caller is the read upcall path); MPP_INSIDE_INPUT guards
 * against re-entry. ENODATA from the subflow is promoted to a
 * socket-level error on the MP socket to surface broken middleboxes.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct mbuf *m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	if (!(mpts->mpts_flags & MPTSF_CONNECTED))
		goto out;

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		mptcplog((LOG_ERR, "%s: cid %d error %d\n",
		    __func__, mpts->mpts_connid, error),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			wakeup = 1;
			goto out;
		}
	} else if (error == 0) {
		mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
		    __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		/* Track which link type carried data (for the cell icon). */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;

			mpte->mpte_used_cell = 1;
		} else {
			mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;

			mpte->mpte_used_wifi = 1;
		}

		/* mptcp_input() consumes the chain; m must not be reused. */
		mptcp_input(mpte, m);
	}

	/* notify protocol that we drained all the data */
	if (error == 0 && m != NULL &&
	    (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);

out:
	if (wakeup)
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2379
2380/*
2381 * Subflow socket write upcall.
2382 *
5ba3f43e 2383 * Called when the associated subflow socket posted a read event.
39236c6e
A
2384 */
2385static void
2386mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2387{
2388#pragma unused(so, waitf)
2389 struct mptsub *mpts = arg;
2390 struct mptses *mpte = mpts->mpts_mpte;
2391
5ba3f43e
A
2392 VERIFY(mpte != NULL);
2393
2394 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2395 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
2396 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
fe8ab488 2397 return;
5ba3f43e 2398 }
39236c6e 2399
5ba3f43e 2400 mptcp_output(mpte);
39236c6e
A
2401}
2402
/*
 * Subflow socket output.
 *
 * Called for sending data from MPTCP to the underlying subflow socket.
 *
 * Picks the next chunk to transmit -- from the reinject queue if one is
 * pending, else from the MP send buffer -- builds a copy of the mbuf
 * chain carrying a single DSN mapping, and hands it to the subflow via
 * sock_sendmbuf(). Handles TFO zero-length writes, degraded-mode
 * cleanup, and probe transmissions (MPTCP_SUBOUT_PROBING).
 *
 * Returns 0 on success (EWOULDBLOCK in the TFO case is deliberately
 * swallowed), otherwise the send error. Caller holds the MP socket
 * lock; MPP_INSIDE_OUTPUT guards against re-entry.
 */
int
mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
	struct socket *mp_so, *so;
	struct tcpcb *tp;
	uint64_t mpt_dsn = 0, off = 0;
	int sb_cc = 0, error = 0, wakeup = 0;
	uint32_t dss_csum;
	uint16_t tot_sent = 0;	/* per-call mapping length; capped at UINT16_MAX below */
	boolean_t reinjected = FALSE;

	mpte_lock_assert_held(mpte);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;
	tp = sototcpcb(so);

	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;

	VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
	VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
	    (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
	    (mpts->mpts_flags & MPTSF_TFO_REQD));
	VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);

	mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
	    __func__, mpts->mpts_flags, mpte->mpte_flags,
	    mptcp_subflow_cwnd_space(so)),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* Remove Addr Option is not sent reliably as per I-D */
	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
		tp->t_rem_aid = mpte->mpte_lost_aid;
		tp->t_mpflags |= TMPF_SND_REM_ADDR;
		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
	}

	/*
	 * The mbuf chains containing the metadata (as well as pointing to
	 * the user data sitting at the MPTCP output queue) would then be
	 * sent down to the subflow socket.
	 *
	 * Some notes on data sequencing:
	 *
	 *   a. Each mbuf must be a M_PKTHDR.
	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
	 *	in the mbuf pkthdr structure.
	 *   c. Each mbuf containing the MPTCP metadata must have its
	 *	pkt_flags marked with the PKTF_MPTCP flag.
	 */

	/* Reinjected data always takes priority over fresh data. */
	if (mpte->mpte_reinjectq)
		sb_mb = mpte->mpte_reinjectq;
	else
		sb_mb = mp_so->so_snd.sb_mb;

	if (sb_mb == NULL) {
		mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u\n",
		    __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		goto out;
	}

	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/* TFO with no payload yet: issue a zero-length write. */
	if (sb_mb->m_pkthdr.mp_rlen == 0 &&
	    !(so->so_state & SS_ISCONNECTED) &&
	    (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		tp->t_mpflags |= TMPF_TFO_REQUEST;
		goto zero_len_write;
	}

	mpt_dsn = sb_mb->m_pkthdr.mp_dsn;

	/* First, drop acknowledged data */
	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
		mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
		    "dsn %u suna %u reinject? %u\n",
		    __func__, (uint32_t)mpt_dsn,
		    (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		if (mpte->mpte_reinjectq) {
			mptcp_clean_reinjectq(mpte);
		} else {
			uint64_t len = 0;
			len = mp_tp->mpt_snduna - mpt_dsn;
			sbdrop(&mp_so->so_snd, (int)len);
			wakeup = 1;
		}
	}

	/* Check again because of above sbdrop */
	if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
		mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		goto out;
	}

	/*
	 * In degraded mode, we don't receive data acks, so force free
	 * mbufs less than snd_nxt
	 */
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
	    mp_so->so_snd.sb_mb) {
		mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
		if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
			uint64_t len = 0;
			len = mp_tp->mpt_snduna - mpt_dsn;
			sbdrop(&mp_so->so_snd, (int)len);
			wakeup = 1;

			mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
			    __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		}
	}

	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
	}

	/*
	 * Adjust the top level notion of next byte used for retransmissions
	 * and sending FINs.
	 */
	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	/* Now determine the offset from which to start transmitting data */
	if (mpte->mpte_reinjectq)
		sb_mb = mpte->mpte_reinjectq;
	else
		sb_mb = mp_so->so_snd.sb_mb;
	if (sb_mb == NULL) {
		mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		goto out;
	}

	if (mpte->mpte_reinjectq) {
		/* Reinjections always restart at the mapping's beginning. */
		sb_cc = sb_mb->m_pkthdr.mp_rlen;
	} else if (flags & MPTCP_SUBOUT_PROBING) {
		sb_cc = sb_mb->m_pkthdr.mp_rlen;
		off = 0;
	} else {
		sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);

		/*
		 * With TFO, there might be no data at all, thus still go into this
		 * code-path here.
		 */
		if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
		    MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
			off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
			sb_cc -= off;
		} else {
			mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
			    __func__, (uint32_t)mp_tp->mpt_sndnxt,
			    (uint32_t)mp_tp->mpt_sndmax),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);

			goto out;
		}
	}

	/* Clamp to the subflow's congestion-window space. */
	sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
	if (sb_cc <= 0) {
		mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
		    __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
		    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
		    mptcp_subflow_cwnd_space(so)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
	}

	/* A DSS mapping's data-level length field is 16 bits wide. */
	sb_cc = min(sb_cc, UINT16_MAX);

	/*
	 * Create a DSN mapping for the data we are about to send. It all
	 * has the same mapping.
	 */
	if (mpte->mpte_reinjectq)
		mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
	else
		mpt_dsn = mp_tp->mpt_snduna + off;

	/* Skip whole mbufs that lie entirely before the send offset. */
	mpt_mbuf = sb_mb;
	while (mpt_mbuf && mpte->mpte_reinjectq == NULL &&
	    (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
	     mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
		off -= mpt_mbuf->m_pkthdr.mp_rlen;
		mpt_mbuf = mpt_mbuf->m_next;
	}
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
		mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
		    __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
		    mpts->mpts_probecnt),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));

	head = tail = NULL;

	/* Copy up to sb_cc bytes into a fresh chain tagged for this mapping. */
	while (tot_sent < sb_cc) {
		ssize_t mlen;

		mlen = mpt_mbuf->m_len;
		mlen -= off;
		mlen = min(mlen, sb_cc - tot_sent);

		if (mlen < 0) {
			mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
			    __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
			    (uint32_t)off, sb_cc, tot_sent),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
			goto out;
		}

		if (mlen == 0)
			goto next;

		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
		    M_COPYM_MUST_COPY_HDR);
		if (m == NULL) {
			mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
			error = ENOBUFS;
			break;
		}

		/* Create a DSN mapping for the data (m_copym does it) */
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_next == NULL);

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
		m->m_pkthdr.mp_dsn = mpt_dsn;
		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
		m->m_pkthdr.len = mlen;

		if (head == NULL) {
			head = tail = m;
		} else {
			tail->m_next = m;
			tail = m;
		}

		tot_sent += mlen;
		off = 0;	/* offset only applies to the first mbuf */
next:
		mpt_mbuf = mpt_mbuf->m_next;
	}

	if (mpte->mpte_reinjectq) {
		reinjected = TRUE;

		if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
			/* Partial reinject: trim off what was just sent. */
			struct mbuf *n = sb_mb;

			while (n) {
				n->m_pkthdr.mp_dsn += sb_cc;
				n->m_pkthdr.mp_rlen -= sb_cc;
				n = n->m_next;
			}
			m_adj(sb_mb, sb_cc);
		} else {
			/* Whole mapping reinjected: unlink and free it. */
			mpte->mpte_reinjectq = sb_mb->m_nextpkt;
			m_freem(sb_mb);
		}
	}

	mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
	    __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
	    tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
		dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
		    tot_sent);
	}

	/* Now, let's update rel-seq and the data-level length */
	mpts->mpts_rel_seq += tot_sent;
	m = head;
	while (m) {
		if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
			m->m_pkthdr.mp_csum = dss_csum;
		m->m_pkthdr.mp_rlen = tot_sent;
		m = m->m_next;
	}

	if (head != NULL) {
		if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
		    (tp->t_tfo_stats == 0))
			tp->t_mpflags |= TMPF_TFO_REQUEST;

		/* Ownership of `head` passes to sock_sendmbuf(). */
		error = sock_sendmbuf(so, NULL, head, 0, NULL);

		DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
		    struct sockbuf *, &so->so_rcv,
		    struct sockbuf *, &so->so_snd,
		    struct mptses *, mpte, struct mptsub *, mpts,
		    size_t, tot_sent);
	}

done_sending:
	if (error == 0 ||
	    (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
		uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;

		if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
			tcpstat.tcps_mp_num_probes++;
			if ((uint32_t)tot_sent < mpts->mpts_maxseg)
				mpts->mpts_probecnt += 1;
			else
				mpts->mpts_probecnt +=
				    tot_sent/mpts->mpts_maxseg;
		}

		/* Only fresh (non-reinject, non-probe) data advances sndnxt. */
		if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
			if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
			mp_tp->mpt_sndnxt = new_sndnxt;
		}

		mptcp_cancel_timer(mp_tp, MPTT_REXMT);

		/* Must be here as mptcp_can_send_more() checks for this */
		soclearfastopen(mp_so);

		if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
		    (mpts->mpts_probesoon != 0))
			mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
			    __func__, mpts->mpts_connid,
			    !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
			    tot_sent, (int) sb_cc, mpts->mpts_probecnt,
			    (tcp_now - mpts->mpts_probesoon)),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;

			mpte->mpte_used_cell = 1;
		} else {
			mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;

			mpte->mpte_used_wifi = 1;
		}

		/*
		 * Don't propagate EWOULDBLOCK - it's already taken care of
		 * in mptcp_usr_send for TFO.
		 */
		error = 0;
	} else {
		mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
		    __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
	}
out:

	if (wakeup)
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
	return (error);

zero_len_write:
	/* Opting to call pru_send as no mbuf at subflow level */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
	    NULL, current_proc());

	goto done_sending;
}
2789
/*
 * Insert mbuf chain `m` into the session's reinject queue, which is kept
 * sorted by data-sequence number (mp_dsn). A chain whose mapping is
 * already fully covered by an existing neighbor is dropped; an existing
 * entry fully covered by `m` is removed in its favor. Ownership of `m`
 * always passes to this function (queued or freed).
 *
 * NOTE(review): the coverage checks below compare DSN sums with plain
 * ==/>= instead of the MPTCP_SEQ_* wraparound-safe macros -- confirm
 * that 64-bit DSN wraparound cannot occur here.
 */
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
	    __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
	    m->m_pkthdr.mp_rseq),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
			break;

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
			    __func__, n->m_pkthdr.mp_rlen),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
			    __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

			m->m_nextpkt = NULL;
			if (prev == NULL)
				mpte->mpte_reinjectq = tmp;
			else
				prev->m_nextpkt = tmp;

			m_freem(n);
			n = tmp;
		}

	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue */
		/*
		 * NOTE(review): this mixes m_pkthdr.len (pkthdr byte count)
		 * with mp_rlen (mapping length) on the right-hand side --
		 * verify the two are identical for reinjected chains.
		 */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
			    __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			goto dont_queue;
		}
	}

	/* Link m between prev and n (either may be absent). */
	if (prev == NULL)
		mpte->mpte_reinjectq = m;
	else
		prev->m_nextpkt = m;

	m->m_nextpkt = n;

	return;

dont_queue:
	m_freem(m);
	return;
}
2868
5ba3f43e
A
2869static struct mbuf *
2870mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
39236c6e 2871{
5ba3f43e
A
2872 struct socket *mp_so = mptetoso(mpte);
2873 struct mbuf *m;
39236c6e 2874
5ba3f43e 2875 m = mp_so->so_snd.sb_mb;
39236c6e 2876
5ba3f43e
A
2877 while (m) {
2878 /* If this segment covers what we are looking for, return it. */
2879 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
2880 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
2881 break;
2882
2883
2884 /* Segment is no more in the queue */
2885 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
2886 return NULL;
2887
2888 m = m->m_next;
39236c6e
A
2889 }
2890
5ba3f43e
A
2891 return m;
2892}
fe8ab488 2893
5ba3f43e
A
/*
 * Duplicate `len` bytes worth of an MPTCP-tagged mbuf chain, mbuf by
 * mbuf, preserving each buffer's DSS metadata (mp_dsn/mp_rlen/mp_rseq,
 * taken from the first mbuf and VERIFYed identical on every copy).
 * Copies are marked PKTF_MPSO | PKTF_MPTCP. Returns the new chain, or
 * NULL on allocation failure (any partial chain is freed).
 *
 * NOTE(review): `len` is decremented by whole m_len steps; presumably
 * callers pass a len that falls on an mbuf boundary -- confirm.
 */
static struct mbuf *
mptcp_copy_mbuf_list(struct mbuf *m, int len)
{
	struct mbuf *top = NULL, *tail = NULL;
	uint64_t dsn;
	uint32_t dlen, rseq;

	/* Mapping metadata of the first mbuf applies to the whole chain. */
	dsn = m->m_pkthdr.mp_dsn;
	dlen = m->m_pkthdr.mp_rlen;
	rseq = m->m_pkthdr.mp_rseq;

	while (len > 0) {
		struct mbuf *n;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
		if (n == NULL) {
			mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
			goto err;
		}

		/* The copy must carry the same mapping as the original. */
		VERIFY(n->m_flags & M_PKTHDR);
		VERIFY(n->m_next == NULL);
		VERIFY(n->m_pkthdr.mp_dsn == dsn);
		VERIFY(n->m_pkthdr.mp_rlen == dlen);
		VERIFY(n->m_pkthdr.mp_rseq == rseq);
		VERIFY(n->m_len == m->m_len);

		n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);

		if (top == NULL)
			top = n;

		if (tail != NULL)
			tail->m_next = n;

		tail = n;

		len -= m->m_len;
		m = m->m_next;
	}

	return top;

err:
	if (top)
		m_freem(top);

	return NULL;
}
2946
5ba3f43e
A
2947static void
2948mptcp_reinject_mbufs(struct socket *so)
39236c6e 2949{
5ba3f43e
A
2950 struct tcpcb *tp = sototcpcb(so);
2951 struct mptsub *mpts = tp->t_mpsub;
2952 struct mptcb *mp_tp = tptomptp(tp);
2953 struct mptses *mpte = mp_tp->mpt_mpte;;
2954 struct sockbuf *sb = &so->so_snd;
2955 struct mbuf *m;
39236c6e 2956
5ba3f43e
A
2957 m = sb->sb_mb;
2958 while (m) {
2959 struct mbuf *n = m->m_next, *orig = m;
39236c6e 2960
5ba3f43e
A
2961 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
2962 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
2963 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
2964 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2965
5ba3f43e 2966 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 2967
5ba3f43e
A
2968 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
2969 goto next;
39236c6e 2970
5ba3f43e
A
2971 /* Has it all already been acknowledged at the data-level? */
2972 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
2973 goto next;
2974
2975 /* Part of this has already been acknowledged - lookup in the
2976 * MPTCP-socket for the segment.
2977 */
2978 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
2979 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
2980 if (m == NULL)
2981 goto next;
2982 }
2983
2984 /* Copy the mbuf with headers (aka, DSN-numbers) */
2985 m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
2986 if (m == NULL)
2987 break;
2988
2989 VERIFY(m->m_nextpkt == NULL);
2990
2991 /* Now, add to the reinject-queue, eliminating overlapping
2992 * segments
2993 */
2994 mptcp_add_reinjectq(mpte, m);
2995
2996 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
2997
2998next:
2999 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3000 while (n) {
3001 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3002
3003 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
3004 break;
3005
3006 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3007 n = n->m_next;
3008 }
3009
3010 m = n;
39236c6e 3011 }
5ba3f43e 3012}
39236c6e 3013
5ba3f43e
A
/*
 * Free, from the head of the MPTCP reinject queue, every segment whose
 * data has already been fully acknowledged at the MPTCP (data) level,
 * i.e. whose DSN-range ends below mpt_snduna.
 */
3014void
3015mptcp_clean_reinjectq(struct mptses *mpte)
3016{
3017	struct mptcb *mp_tp = mpte->mpte_mptcb;
3018
	/* Caller must hold the MPTCP-socket lock */
3019	mpte_lock_assert_held(mpte);
3020
3021	while (mpte->mpte_reinjectq) {
3022		struct mbuf *m = mpte->mpte_reinjectq;
3023
		/*
		 * Stop at the first segment that is not fully acknowledged.
		 * NOTE(review): the first clause (mp_dsn >= snduna) looks
		 * redundant — for non-wrapping ranges it is implied by the
		 * second (mp_dsn + mp_rlen >= snduna); confirm whether it
		 * guards a sequence-wrap corner case.
		 */
3024		if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3025		    MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
3026			break;
3027
		/* Fully acked: unlink from the queue and free */
3028		mpte->mpte_reinjectq = m->m_nextpkt;
3029		m->m_nextpkt = NULL;
3030		m_freem(m);
3031	}
39236c6e
A
3032}
3033
3034/*
5ba3f43e 3035 * Subflow socket control event upcall.
39236c6e 3036 */
5ba3f43e
A
3037static void
3038mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
39236c6e 3039{
5ba3f43e
A
3040#pragma unused(so)
3041 struct mptsub *mpts = arg;
3042 struct mptses *mpte = mpts->mpts_mpte;
39236c6e 3043
5ba3f43e
A
3044 VERIFY(mpte != NULL);
3045 mpte_lock_assert_held(mpte);
39236c6e 3046
5ba3f43e
A
3047 if ((mpts->mpts_evctl & events) == events)
3048 return;
39236c6e 3049
5ba3f43e
A
3050 mpts->mpts_evctl |= events;
3051
3052 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3053 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3054 return;
39037602 3055 }
39236c6e 3056
5ba3f43e 3057 mptcp_subflow_workloop(mpte);
39236c6e
A
3058}
3059
3060/*
5ba3f43e
A
3061 * Subflow socket control events.
3062 *
3063 * Called for handling events related to the underlying subflow socket.
39236c6e
A
3064 */
3065static ev_ret_t
5ba3f43e 3066mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3e170ce0 3067 uint64_t *p_mpsofilt_hint)
39236c6e 3068{
5ba3f43e
A
3069 ev_ret_t ret = MPTS_EVRET_OK;
3070 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3071 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e 3072
5ba3f43e 3073 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3074
5ba3f43e
A
3075 /* bail if there's nothing to process */
3076 if (!mpts->mpts_evctl)
3077 return (ret);
39236c6e 3078
5ba3f43e
A
3079 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
3080 SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
3081 SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
3082 SO_FILT_HINT_DISCONNECTED)) {
3083 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3084 }
3e170ce0 3085
5ba3f43e
A
3086 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3087 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3088
3089 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3090 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3091 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3092
3093 /*
3094 * Process all the socket filter hints and reset the hint
3095 * once it is handled
3096 */
3097 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3098 /*
3099 * Always execute the DISCONNECTED event, because it will wakeup
3100 * the app.
3101 */
3102 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3103 (ret >= MPTS_EVRET_OK ||
3104 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3105 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3106 ev_ret_t error =
3107 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3108 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3109 }
3110 }
3111
3112 /*
3113 * We should be getting only events specified via sock_catchevents(),
3114 * so loudly complain if we have any unprocessed one(s).
3115 */
3116 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
3117 mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
3118 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3119 mpts->mpts_connid,
3120 mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3121 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3122 else
3123 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3124 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3125 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3126
3127 return (ret);
39236c6e
A
3128}
3129
39236c6e 3130static ev_ret_t
5ba3f43e
A
3131mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3132 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e
A
3133{
3134 struct socket *mp_so, *so;
3135 struct mptcb *mp_tp;
39236c6e 3136
5ba3f43e 3137 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3138 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3139 mp_so = mptetoso(mpte);
39236c6e
A
3140 mp_tp = mpte->mpte_mptcb;
3141 so = mpts->mpts_socket;
3142
5ba3f43e
A
3143 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3144 mpts->mpts_connid, event),
3e170ce0 3145 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3146
39236c6e 3147 /*
5ba3f43e
A
3148 * We got an event for this subflow that might need to be propagated,
3149 * based on the state of the MPTCP connection.
39236c6e 3150 */
5ba3f43e
A
3151 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3152 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3153 mp_so->so_error = so->so_error;
3154 *p_mpsofilt_hint |= event;
39236c6e 3155 }
39236c6e 3156
5ba3f43e 3157 return (MPTS_EVRET_OK);
39236c6e
A
3158}
3159
3160/*
3161 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3162 */
3163static ev_ret_t
3e170ce0 3164mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3165 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3166{
5ba3f43e
A
3167#pragma unused(p_mpsofilt_hint, event)
3168 struct socket *mp_so;
3169 struct tcpcb *tp;
39236c6e 3170
5ba3f43e 3171 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e
A
3172
3173 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e
A
3174 mp_so = mptetoso(mpte);
3175 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
39236c6e 3176
39236c6e
A
3177 /*
3178 * This overwrites any previous mpte_lost_aid to avoid storing
3179 * too much state when the typical case has only two subflows.
3180 */
3181 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3182 mpte->mpte_lost_aid = tp->t_local_aid;
3183
5ba3f43e
A
3184 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3185 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3186
3187 /*
3188 * The subflow connection has lost its source address.
39236c6e 3189 */
5ba3f43e 3190 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
39236c6e 3191
5ba3f43e
A
3192 if (mp_so->so_flags & SOF_NOADDRAVAIL)
3193 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
39236c6e 3194
5ba3f43e 3195 return (MPTS_EVRET_DELETE);
39236c6e
A
3196}
3197
fe8ab488
A
3198/*
3199 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3200 * indicates that the remote side sent a Data FIN
3201 */
3202static ev_ret_t
3e170ce0 3203mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3204 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3205{
5ba3f43e 3206#pragma unused(event)
fe8ab488
A
3207 struct mptcb *mp_tp;
3208
5ba3f43e 3209 mpte_lock_assert_held(mpte); /* same as MP socket lock */
fe8ab488
A
3210 mp_tp = mpte->mpte_mptcb;
3211
5ba3f43e 3212 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3e170ce0 3213 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39037602 3214
fe8ab488 3215 /*
39037602 3216 * We got a Data FIN for the MPTCP connection.
fe8ab488
A
3217 * The FIN may arrive with data. The data is handed up to the
3218 * mptcp socket and the user is notified so that it may close
3219 * the socket if needed.
3220 */
39037602 3221 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
5ba3f43e 3222 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
39037602 3223
fe8ab488
A
3224 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3225}
3226
39236c6e
A
3227/*
3228 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3229 */
/*
 * Handle SO_FILT_HINT_MPFAILOVER: this subflow appears unusable, so
 * queue its not-yet-acked data for reinjection and, when an eligible
 * alternate subflow exists, switch the active subflow over to it.
 */
3230static ev_ret_t
3e170ce0 3231mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3232    uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3233{
5ba3f43e 3234#pragma unused(event, p_mpsofilt_hint)
39236c6e 3235	struct mptsub *mpts_alt = NULL;
5ba3f43e 3236	struct socket *alt_so = NULL;
39236c6e
A
3237	struct socket *mp_so;
3238	int altpath_exists = 0;
3239
5ba3f43e
A
3240	mpte_lock_assert_held(mpte);
3241	mp_so = mptetoso(mpte);
3242	mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
3243	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
3244	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3245
	/* Queue this subflow's unacked data for retransmission elsewhere */
5ba3f43e 3246	mptcp_reinject_mbufs(mpts->mpts_socket);
39236c6e 3247
5ba3f43e 3248	mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e
A
3249	/*
3250	 * If there is no alternate eligible subflow, ignore the
3251	 * failover hint.
3252	 */
3253	if (mpts_alt == NULL) {
5ba3f43e
A
3254		mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
3255		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3256
39236c6e
A
3257		goto done;
3258	}
5ba3f43e 3259
39236c6e 3260	altpath_exists = 1;
5ba3f43e 3261	alt_so = mpts_alt->mpts_socket;
39236c6e 3262	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
fe8ab488 3263		/* All data acknowledged and no RTT spike */
5ba3f43e 3264		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
39236c6e
A
3265			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3266		} else {
3267			/* no alternate path available */
3268			altpath_exists = 0;
3269		}
39236c6e 3270	}
39236c6e
A
3271
3272	if (altpath_exists) {
		/* Promote the alternate subflow to active */
5ba3f43e 3273		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
39236c6e 3274
5ba3f43e 3275		mpte->mpte_active_sub = mpts_alt;
39236c6e
A
3276		mpts->mpts_flags |= MPTSF_FAILINGOVER;
3277		mpts->mpts_flags &= ~MPTSF_ACTIVE;
5ba3f43e
A
3278
3279		mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
3280		    __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
3281		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3282
3283		mptcpstats_inc_switch(mpte, mpts);
3284
3285		sowwakeup(alt_so);
39236c6e 3286	} else {
5ba3f43e
A
3287		mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3288		    mpts->mpts_connid),
3289		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
		/*
		 * NOTE: the "done:" label deliberately sits inside the else
		 * branch, so the early "goto done" above (no alternate
		 * subflow) also clears SOF_MP_TRYFAILOVER below.
		 */
fe8ab488 3290done:
5ba3f43e 3291		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
39236c6e 3292	}
5ba3f43e 3293
39236c6e
A
3294	return (MPTS_EVRET_OK);
3295}
3296
3297/*
3298 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3299 */
3300static ev_ret_t
3e170ce0 3301mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3302 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3303{
5ba3f43e 3304 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3305 VERIFY(mpte->mpte_mppcb != NULL);
39236c6e 3306
5ba3f43e
A
3307 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3308 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3309
39236c6e 3310 /*
5ba3f43e
A
3311 * The subflow connection cannot use the outgoing interface, let's
3312 * close this subflow.
39236c6e 3313 */
5ba3f43e 3314 mptcp_subflow_abort(mpts, EPERM);
39236c6e 3315
5ba3f43e 3316 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
39236c6e 3317
5ba3f43e 3318 return (MPTS_EVRET_DELETE);
39236c6e
A
3319}
3320
3321/*
3322 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3323 */
3324static ev_ret_t
3e170ce0 3325mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3326    uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3327{
5ba3f43e 3328#pragma unused(event, p_mpsofilt_hint)
39236c6e 3329	struct socket *mp_so, *so;
5ba3f43e
A
3330	struct inpcb *inp;
3331	struct tcpcb *tp;
39236c6e 3332	struct mptcb *mp_tp;
5ba3f43e 3333	int af;
39236c6e
A
3334	boolean_t mpok = FALSE;
3335
5ba3f43e 3336	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
39236c6e 3337	VERIFY(mpte->mpte_mppcb != NULL);
39236c6e 3338
5ba3f43e
A
3339	mp_so = mptetoso(mpte);
3340	mp_tp = mpte->mpte_mptcb;
39236c6e 3341	so = mpts->mpts_socket;
5ba3f43e
A
3342	tp = sototcpcb(so);
	/* NOTE(review): 'af' is assigned but not used in this function body */
3343	af = mpts->mpts_dst.sa_family;
39236c6e
A
3344
	/* Connected event already processed for this subflow */
3345	if (mpts->mpts_flags & MPTSF_CONNECTED)
3346		return (MPTS_EVRET_OK);
3347
3348	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
3349	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
fe8ab488
A
3350		if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
3351		    (so->so_state & SS_ISCONNECTED)) {
5ba3f43e 3352			mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
3e170ce0
A
3353			    __func__, mpts->mpts_connid),
3354			    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
3355			(void) soshutdownlock(so, SHUT_RD);
3356			(void) soshutdownlock(so, SHUT_WR);
3357			(void) sodisconnectlocked(so);
3358		}
39236c6e
A
3359		return (MPTS_EVRET_OK);
3360	}
3361
3362	/*
3363	 * The subflow connection has been connected. Find out whether it
3364	 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
3365	 *
3366	 * a. If MPTCP connection is not yet established, then this must be
3367	 *    the first subflow connection. If MPTCP failed to negotiate,
5ba3f43e 3368	 *    fallback to regular TCP by degrading this subflow.
39236c6e
A
3369	 *
3370	 * b. If MPTCP connection has been established, then this must be
3371	 *    one of the subsequent subflow connections. If MPTCP failed
5ba3f43e 3372	 *    to negotiate, disconnect the connection.
39236c6e
A
3373	 *
3374	 * Right now, we simply unblock any waiters at the MPTCP socket layer
3375	 * if the MPTCP connection has not been established.
3376	 */
39236c6e
A
3377
3378	if (so->so_state & SS_ISDISCONNECTED) {
3379		/*
3380		 * With MPTCP joins, a connection is connected at the subflow
3381		 * level, but the 4th ACK from the server elevates the MPTCP
490019cf
A
3382		 * subflow to connected state. So there is a small window
3383		 * where the subflow could get disconnected before the
39236c6e
A
3384		 * connected event is processed.
3385		 */
39236c6e
A
3386		return (MPTS_EVRET_OK);
3387	}
3388
5ba3f43e
A
3389	if (mpts->mpts_flags & MPTSF_TFO_REQD)
3390		mptcp_drop_tfo_data(mpte, mpts);
490019cf 3391
5ba3f43e
A
3392	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
3393	mpts->mpts_flags |= MPTSF_CONNECTED;
490019cf 3394
490019cf 3395	if (tp->t_mpflags & TMPF_MPTCP_TRUE)
39236c6e
A
3396		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3397
490019cf
A
3398	tp->t_mpflags &= ~TMPF_TFO_REQUEST;
3399
39236c6e 3400	/* get/verify the outbound interface */
5ba3f43e 3401	inp = sotoinpcb(so);
3e170ce0 3402
5ba3f43e 3403	mpts->mpts_maxseg = tp->t_maxseg;
3e170ce0 3404
5ba3f43e
A
3405	mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
3406	    ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
3407	    ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
3e170ce0 3408	    (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
3409
3410	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
39236c6e 3411
	/* First subflow to complete: the MPTCP connection becomes established */
39236c6e 3412	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
5ba3f43e
A
3413		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
3414		mpte->mpte_associd = mpts->mpts_connid;
3415		DTRACE_MPTCP2(state__change,
3416		    struct mptcb *, mp_tp,
3417		    uint32_t, 0 /* event */);
3418
		/* Remember the local source address of the initial subflow */
3419		if (SOCK_DOM(so) == AF_INET) {
3420			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
3421		} else {
3422			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
3423		}
3424
39236c6e
A
3425		/* case (a) above */
3426		if (!mpok) {
5ba3f43e
A
3427			tcpstat.tcps_mpcap_fallback++;

3428
3429			tp->t_mpflags |= TMPF_INFIN_SENT;
3430			mptcp_notify_mpfail(so);
39236c6e 3431		} else {
5ba3f43e
A
3432			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3433			    mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3434				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
39037602
A
3435			} else {
3436				mpts->mpts_flags |= MPTSF_PREFERRED;
3437			}
813fb2f6 3438			mpts->mpts_flags |= MPTSF_ACTIVE;
5ba3f43e 3439
39236c6e
A
3440			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3441			mpte->mpte_nummpcapflows++;
5ba3f43e
A
3442
3443			mptcp_check_subflows_and_add(mpte);
3444
3445			if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3446				mpte->mpte_initial_cell = 1;
3447
3448			mpte->mpte_handshake_success = 1;
39236c6e 3449		}
5ba3f43e
A
3450
		/* Initialize the MPTCP-level send window from the subflow */
3451		mp_tp->mpt_sndwnd = tp->snd_wnd;
3452		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
3453		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
3454		soisconnected(mp_so);
3455
3456		mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
3457		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
3458		    MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3459	} else if (mpok) {
39236c6e
A
3460		/*
3461		 * case (b) above
3462		 * In case of additional flows, the MPTCP socket is not
3463		 * MPTSF_MP_CAPABLE until an ACK is received from server
3464		 * for 3-way handshake. TCP would have guaranteed that this
3465		 * is an MPTCP subflow.
3466		 */
5ba3f43e
A
3467		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3468		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
3469		    mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3470			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3471			mpts->mpts_flags &= ~MPTSF_PREFERRED;
3472		} else {
3473			mpts->mpts_flags |= MPTSF_PREFERRED;
3474		}
3475
39236c6e
A
3476		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3477		mpte->mpte_nummpcapflows++;
5ba3f43e
A
3478
3479		mpts->mpts_rel_seq = 1;
3480
3481		mptcp_check_subflows_and_remove(mpte);
fe8ab488 3482	} else {
		/* Joined subflow failed to negotiate MPTCP: reset it */
5ba3f43e
A
3483		unsigned int i;
3484
3485		/* Mark this interface as non-MPTCP */
3486		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3487			struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3488
3489			if (inp->inp_last_outifp->if_index == info->ifindex) {
3490				info->no_mptcp_support = 1;
3491				break;
3492			}
3493		}
3494
3495		tcpstat.tcps_join_fallback++;
3496		if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3497			tcpstat.tcps_mptcp_cell_proxy++;
3498		else
3499			tcpstat.tcps_mptcp_wifi_proxy++;
3500
3501		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3502
3503		return (MPTS_EVRET_OK);
39236c6e 3504	}
fe8ab488 3505
5ba3f43e
A
3506	/* This call, just to "book" an entry in the stats-table for this ifindex */
3507	mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
3508
3509	mptcp_output(mpte);
39236c6e
A
3510
3511	return (MPTS_EVRET_OK); /* keep the subflow socket around */
3512}
3513
3514/*
3515 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
3516 */
3517static ev_ret_t
3e170ce0 3518mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3519 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3520{
5ba3f43e 3521#pragma unused(event, p_mpsofilt_hint)
39236c6e
A
3522 struct socket *mp_so, *so;
3523 struct mptcb *mp_tp;
39236c6e 3524
5ba3f43e 3525 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3526 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3527 mp_so = mptetoso(mpte);
39236c6e
A
3528 mp_tp = mpte->mpte_mptcb;
3529 so = mpts->mpts_socket;
3530
5ba3f43e
A
3531 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
3532 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
3533 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
3534 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
3e170ce0 3535 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3536
3537 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
5ba3f43e 3538 return (MPTS_EVRET_DELETE);
39236c6e 3539
39236c6e
A
3540 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3541
5ba3f43e 3542 /* The subflow connection has been disconnected. */
39236c6e
A
3543
3544 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3545 mpte->mpte_nummpcapflows--;
fe8ab488
A
3546 if (mpte->mpte_active_sub == mpts) {
3547 mpte->mpte_active_sub = NULL;
5ba3f43e 3548 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
3e170ce0 3549 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3550 }
39236c6e
A
3551 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3552 }
3553
5ba3f43e
A
3554 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3555 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
3556 (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
3557 mptcp_drop(mpte, mp_tp, so->so_error);
39236c6e
A
3558 }
3559
39236c6e 3560 /*
5ba3f43e
A
3561 * Clear flags that are used by getconninfo to return state.
3562 * Retain like MPTSF_DELETEOK for internal purposes.
39236c6e 3563 */
5ba3f43e
A
3564 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3565 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3566 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);
3567
3568 return (MPTS_EVRET_DELETE);
39236c6e
A
3569}
3570
3571/*
3572 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3573 */
3574static ev_ret_t
3e170ce0 3575mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3576 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3577{
5ba3f43e 3578#pragma unused(event, p_mpsofilt_hint)
39236c6e
A
3579 struct socket *mp_so, *so;
3580 struct mptcb *mp_tp;
3e170ce0 3581 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e 3582
5ba3f43e 3583 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3584 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3585 mp_so = mptetoso(mpte);
39236c6e 3586 mp_tp = mpte->mpte_mptcb;
39236c6e
A
3587 so = mpts->mpts_socket;
3588
39236c6e
A
3589 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3590 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3591 else
3592 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3593
3594 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3595 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3596 goto done;
3597 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3598 }
3599 else
3600 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3601
3602 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3603 mpts->mpts_flags |= MPTSF_MP_READY;
3604 else
3605 mpts->mpts_flags &= ~MPTSF_MP_READY;
3606
3607 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3608 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3609 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3610 }
3611
3612 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3613 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3614 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3615 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3616 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3617 ret = MPTS_EVRET_CONNECT_PENDING;
3618 }
3619
5ba3f43e
A
3620 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
3621 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3622 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3623 mpts->mpts_flags, MPTSF_BITS),
3624 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3e170ce0 3625
39236c6e 3626done:
39236c6e
A
3627 return (ret);
3628}
3629
3630/*
3631 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3632 */
3633static ev_ret_t
3e170ce0 3634mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3635 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3636{
5ba3f43e 3637#pragma unused(event)
39236c6e
A
3638 struct socket *mp_so, *so;
3639 struct mptcb *mp_tp;
5ba3f43e 3640 boolean_t is_fastclose;
39236c6e 3641
5ba3f43e 3642 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3643 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3644 mp_so = mptetoso(mpte);
39236c6e
A
3645 mp_tp = mpte->mpte_mptcb;
3646 so = mpts->mpts_socket;
3647
39236c6e 3648 /* We got an invalid option or a fast close */
39236c6e
A
3649 struct tcptemp *t_template;
3650 struct inpcb *inp = sotoinpcb(so);
3651 struct tcpcb *tp = NULL;
3652
3653 tp = intotcpcb(inp);
fe8ab488 3654 so->so_error = ECONNABORTED;
39236c6e 3655
39037602
A
3656 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3657
39236c6e
A
3658 t_template = tcp_maketemplate(tp);
3659 if (t_template) {
fe8ab488 3660 struct tcp_respond_args tra;
39236c6e 3661
fe8ab488 3662 bzero(&tra, sizeof(tra));
39236c6e 3663 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 3664 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 3665 else
fe8ab488
A
3666 tra.ifscope = IFSCOPE_NONE;
3667 tra.awdl_unrestricted = 1;
39236c6e
A
3668
3669 tcp_respond(tp, t_template->tt_ipgen,
3670 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 3671 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e 3672 (void) m_free(dtom(t_template));
3e170ce0
A
3673 mptcplog((LOG_DEBUG, "MPTCP Events: "
3674 "%s: mp_so 0x%llx cid %d \n",
39236c6e 3675 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3676 so, mpts->mpts_connid),
3677 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3678 }
5ba3f43e 3679 mptcp_subflow_abort(mpts, ECONNABORTED);
39037602
A
3680
3681 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3e170ce0 3682 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e 3683
39037602
A
3684 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3685 mp_so->so_error = ECONNABORTED;
3686 else
3687 mp_so->so_error = ECONNRESET;
3688
3689 /*
3690 * mptcp_drop is being called after processing the events, to fully
3691 * close the MPTCP connection
3692 */
39236c6e 3693 }
39037602 3694
3e170ce0
A
3695 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3696 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
39236c6e 3697
5ba3f43e 3698 return (MPTS_EVRET_DELETE);
39236c6e
A
3699}
3700
fe8ab488 3701static ev_ret_t
5ba3f43e
A
3702mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3703 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3704{
5ba3f43e
A
3705#pragma unused(event)
3706 bool found_active = false;
3707
3708 mpts->mpts_flags |= MPTSF_READ_STALL;
39037602 3709
5ba3f43e
A
3710 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3711 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3e170ce0 3712
5ba3f43e
A
3713 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3714 TCPS_HAVERCVDFIN2(tp->t_state))
3715 continue;
3716
3717 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
3718 found_active = true;
3719 break;
fe8ab488 3720 }
fe8ab488
A
3721 }
3722
5ba3f43e
A
3723 if (!found_active)
3724 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
3725
fe8ab488
A
3726 return (MPTS_EVRET_OK);
3727}
3728
3729static ev_ret_t
5ba3f43e
A
3730mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3731 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3732{
5ba3f43e
A
3733#pragma unused(event)
3734 bool found_active = false;
3e170ce0 3735
5ba3f43e 3736 mpts->mpts_flags |= MPTSF_WRITE_STALL;
fe8ab488 3737
5ba3f43e
A
3738 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3739 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3740
3741 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3742 tp->t_state > TCPS_CLOSE_WAIT)
3743 continue;
3744
3745 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
3746 found_active = true;
3747 break;
3748 }
3749 }
3750
3751 if (!found_active)
3752 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
3753
3754 return (MPTS_EVRET_OK);
fe8ab488
A
3755}
3756
39236c6e
A
3757static const char *
3758mptcp_evret2str(ev_ret_t ret)
3759{
3760 const char *c = "UNKNOWN";
3761
3762 switch (ret) {
3763 case MPTS_EVRET_DELETE:
3764 c = "MPTS_EVRET_DELETE";
3765 break;
3766 case MPTS_EVRET_CONNECT_PENDING:
3767 c = "MPTS_EVRET_CONNECT_PENDING";
3768 break;
3769 case MPTS_EVRET_DISCONNECT_FALLBACK:
3770 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3771 break;
3772 case MPTS_EVRET_OK:
3773 c = "MPTS_EVRET_OK";
3774 break;
3e170ce0 3775 default:
39236c6e
A
3776 break;
3777 }
3778 return (c);
3779}
3780
39236c6e
A
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * Returns 0 on success (or when the option is deliberately skipped for the
 * cell-fallback case below), otherwise the error from sosetoptlock().
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	/*
	 * SO_MARK_CELLFALLBACK is only propagated to a subflow when the
	 * subflow really is a cell-fallback one; the checks below filter
	 * out all other cases and silently succeed.
	 */
	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %u lastcell? %d boundcell? %d\n",
		    __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(),
		    sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
		    mpts->mpts_ifscope != IFSCOPE_NONE ? IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]) : -1),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY)
			return (0);

		/* Subflow already routed over a non-cell interface: skip */
		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
			return (0);

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE ||
		    !IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]))
			return (0);
	}

	/* Option is being applied now; it is no longer pending */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);	/* subflow already locked */
	if (error == 0) {
		mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
		    "val %d set successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
	} else {
		mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
		    "val %d set error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	}
	return (error);
}
3860
/*
 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * The retrieved integer value is stored back into mpo->mpo_intval.
 * Returns 0 on success, otherwise the error from sogetoptlock().
 */
int
mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
    struct mptopt *mpo)
{
	struct socket *mp_so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mp_so = mptetoso(mpte);

	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sogetoptlock(so, &sopt, 0);	/* already locked */
	if (error == 0) {
		mptcplog((LOG_DEBUG, "MPTCP Socket: "
		    "%s: mp_so 0x%llx sopt %s "
		    "val %d get successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
	} else {
		mptcplog((LOG_ERR, "MPTCP Socket: "
		    "%s: mp_so 0x%llx sopt %s get error %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	}
	return (error);
}
3904
3905
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed. The callout will
 * repeat as long as this routine returns a non-zero value.
 *
 * Returns the number of PCBs that are still "active" (could not be
 * reclaimed this round); 0 means the callout can stop.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;

	/* caller must hold the global PCB-info lock */
	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mp_so = mpp->mpp_socket;
		VERIFY(mp_so != NULL);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		mptcplog((LOG_DEBUG, "MPTCP Socket: "
		    "%s: mp_so 0x%llx found "
		    "(u=%d,r=%d,s=%d)\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
		    mp_so->so_retaincnt, mpp->mpp_state),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * Never block here: if the per-session lock is contended,
		 * count the PCB as active and retry on the next callout.
		 */
		if (!mpte_try_lock(mpte)) {
			mptcplog((LOG_DEBUG, "MPTCP Socket: "
			    "%s: mp_so 0x%llx skipped lock "
			    "(u=%d,r=%d)\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			active++;
			continue;
		}

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			mptcplog((LOG_DEBUG, "MPTCP Socket: "
			    "%s: mp_so 0x%llx skipped usecount "
			    "[u=%d,r=%d] %d %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mp_tp->mpt_gc_ticks,
			    mp_tp->mpt_state),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

			/*
			 * Past FIN_WAIT_1, count down the grace period;
			 * when it expires, force-disconnect all subflows
			 * so the remaining references get dropped.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0)
					mp_tp->mpt_gc_ticks--;
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			mpte_unlock(mpte);
			active++;
			continue;
		}

		/* usecount dropped to zero: the PCB must already be dead */
		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
			mptcp_close(mpte, mp_tp);

		mptcp_session_destroy(mpte);

		mptcplog((LOG_DEBUG, "MPTCP Socket: "
		    "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mp_so->so_usecount, mp_so->so_retaincnt),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		/* unlink from the global list and free socket memory */
		mp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return (active);
}
4015
/*
 * Drop a MPTCP connection, reporting the specified error.
 *
 * If the connection timed out and a soft error was previously recorded,
 * report the soft error instead of ETIMEDOUT.  Always tears down the
 * connection via mptcp_close(); returns its result (NULL).
 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
{
	struct socket *mp_so;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mptetoso(mpte);

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, 0 /* event */);

	/* prefer the stored soft error over a generic timeout */
	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
		errno = mp_tp->mpt_softerror;
	mp_so->so_error = errno;

	return (mptcp_close(mpte, mp_tp));
}
4037
/*
 * Close a MPTCP control block.
 *
 * Moves the MP connection to TERMINATE state, frees the MPTCP reassembly
 * queue, marks the MP socket disconnected and disconnects every subflow.
 * Always returns NULL (callers use the return value to clear their
 * mptses pointer).
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct socket *mp_so = NULL;
	struct mptsub *mpts = NULL, *tmpts = NULL;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mptetoso(mpte);

	mp_tp->mpt_state = MPTCPS_TERMINATE;

	/* release any data still queued at the MPTCP level */
	mptcp_freeq(mp_tp);

	soisdisconnected(mp_so);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		mptcp_subflow_disconnect(mpte, mpts);
	}

	return (NULL);
}
4064
4065void
4066mptcp_notify_close(struct socket *so)
4067{
4068 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4069}
4070
/*
 * MPTCP workloop.
 *
 * Walks all subflows of the session, dispatches any pending subflow
 * events, then — in a second pass — applies session-wide decisions
 * (fallback to plain TCP, or connecting pending joins) that the first
 * pass accumulated.  Called with the MP socket lock held.
 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;

	mpte_lock_assert_held(mpte);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mptetoso(mpte);
	VERIFY(mp_so != NULL);

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* hold both the subflow and its socket across the event call */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING)
			mptcp_subflow_disconnect(mpte, mpts);

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			mptcplog((LOG_DEBUG,
			    "MPTCP Socket: %s: mptcp_subflow_events "
			    "returned invalid value: %d\n", __func__,
			    ret),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			break;
		}
		mptcp_subflow_remref(mpts);		/* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* propagate any accumulated hints to the MP socket's filters */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback)
		return;

	/* second pass: apply fallback/join decisions to each subflow */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
				continue;

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
			    MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
				continue;

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback. This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			if (mpts->mpts_flags & MPTSF_ACTIVE) {
				continue;
			}
			/* non-active subflows get reset */
			tp->t_mpflags |= TMPF_RESET;
			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error)
					mptcp_subflow_abort(mpts, error);
			}
		}
	}
}
4198
39236c6e
A
/*
 * Protocol pr_lock callback.
 *
 * Acquires the MP PCB lock, optionally taking a use-count reference,
 * and records the caller's return address for lock debugging.
 * Panics on a missing PCB or a negative use count.  Always returns 0.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* capture caller's return address when none was supplied */
	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock(mpp);

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0)
		mp_so->so_usecount++;
	/* record lock site in the socket's circular debug history */
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return (0);
}
4233
/*
 * Protocol pr_unlock callback.
 *
 * Drops the use-count reference (if requested), records the caller's
 * return address for lock debugging, then releases the MP PCB lock.
 * Panics on a missing PCB or if the use count would go negative.
 * Always returns 0.
 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* capture caller's return address when none was supplied */
	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock_assert_held(mpp);

	if (refcount != 0)
		mp_so->so_usecount--;

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* record unlock site in the socket's circular debug history */
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	mpp_unlock(mpp);

	return (0);
}
4270
5ba3f43e
A
/*
 * Protocol pr_getlock callback.
 *
 * Returns the mutex backing the MP PCB, sanity-checking the PCB and the
 * socket use count first (panics on inconsistency).
 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return (mpp_getlock(mpp, flags));
}
4291
4292/*
4293 * MPTCP Join support
4294 */
4295
/*
 * Attach a subflow socket to the MP control block for authentication:
 * assign the subflow's local address ID and allocate/initialize its
 * per-subflow auth entry (local random, remote fields zeroed) on the
 * mpt_subauth_list.
 */
static void
mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
    uint8_t addr_id)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/*
	 * The address ID of the first flow is implicitly 0.
	 */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		tp->t_local_aid = 0;
	} else {
		/* secondary subflow: it joins an existing connection */
		tp->t_local_aid = addr_id;
		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
		so->so_flags |= SOF_MP_SEC_SUBFLOW;
	}
	sauth_entry = zalloc(mpt_subauth_zone);
	sauth_entry->msae_laddr_id = tp->t_local_aid;
	sauth_entry->msae_raddr_id = 0;
	sauth_entry->msae_raddr_rand = 0;
try_again:
	/* local random must be non-zero (0 means "not set") */
	sauth_entry->msae_laddr_rand = RandomULong();
	if (sauth_entry->msae_laddr_rand == 0)
		goto try_again;
	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
}
4324
4325static void
4326mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4327{
4328 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 4329 struct tcpcb *tp = NULL;
39236c6e
A
4330 int found = 0;
4331
fe8ab488 4332 tp = sototcpcb(so);
5ba3f43e 4333 if (tp == NULL)
39236c6e
A
4334 return;
4335
39236c6e
A
4336 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4337 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4338 found = 1;
4339 break;
4340 }
4341 }
4342 if (found) {
4343 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 4344 }
fe8ab488 4345
3e170ce0
A
4346 if (found)
4347 zfree(mpt_subauth_zone, sauth_entry);
39236c6e
A
4348}
4349
/*
 * Look up the local and remote random values recorded for the subflow
 * with the given local address ID.  Either output pointer may be NULL.
 * If no entry matches, the outputs are left untouched.
 */
void
mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
    u_int32_t *rrand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == addr_id) {
			if (lrand)
				*lrand = sauth_entry->msae_laddr_rand;
			if (rrand)
				*rrand = sauth_entry->msae_raddr_rand;
			break;
		}
	}
}
4367
/*
 * Record the peer's address ID and random value for the subflow with the
 * given local address ID.  Conflicting re-announcements (a different
 * remote address ID, or a different random — i.e. a duplicate SYN/ACK)
 * are logged and ignored.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* reject a change of the already-learned remote ID */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
				    " address ids %d %d \n", __func__, raddr_id,
				    sauth_entry->msae_raddr_id),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* a different random here indicates a dup SYN/ACK */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				mptcplog((LOG_ERR, "MPTCP Socket: "
				    "%s: dup SYN_ACK %d %d \n",
				    __func__, raddr_rand,
				    sauth_entry->msae_raddr_rand),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
4400
4401/*
4402 * SHA1 support for MPTCP
4403 */
5ba3f43e
A
4404static void
4405mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
39236c6e
A
4406{
4407 SHA1_CTX sha1ctxt;
4408 const unsigned char *sha1_base;
4409 int sha1_size;
4410
39236c6e
A
4411 sha1_base = (const unsigned char *) key;
4412 sha1_size = sizeof (mptcp_key_t);
4413 SHA1Init(&sha1ctxt);
4414 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4415 SHA1Final(sha_digest, &sha1ctxt);
39236c6e
A
4416}
4417
/*
 * HMAC-SHA1 over the two 32-bit randoms, keyed with the concatenation of
 * the two MPTCP keys (RFC 2104 construction with ipad/opad).  The 20-byte
 * result is written to 'digest'.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0};	/* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0};	/* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
4462
/*
 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
 *
 * Computes the MP_JOIN HMAC for the subflow identified by 'aid', using
 * the session keys and the per-subflow randoms looked up via
 * mptcp_get_rands().
 */
void
mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
{
	uint32_t lrand, rrand;

	mpte_lock_assert_held(mp_tp->mpt_mpte);

	lrand = rrand = 0;
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
	mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
	    digest);
}
4479
4480/*
4481 * Authentication data generation
4482 */
5ba3f43e 4483static void
39236c6e
A
4484mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4485 int token_len)
4486{
4487 VERIFY(token_len == sizeof (u_int32_t));
4488 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4489
4490 /* Most significant 32 bits of the SHA1 hash */
4491 bcopy(sha_digest, token, sizeof (u_int32_t));
490019cf 4492 return;
39236c6e
A
4493}
4494
5ba3f43e 4495static void
39236c6e
A
4496mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4497 int idsn_len)
4498{
4499 VERIFY(idsn_len == sizeof (u_int64_t));
4500 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4501
4502 /*
4503 * Least significant 64 bits of the SHA1 hash
4504 */
4505
4506 idsn[7] = sha_digest[12];
4507 idsn[6] = sha_digest[13];
4508 idsn[5] = sha_digest[14];
4509 idsn[4] = sha_digest[15];
4510 idsn[3] = sha_digest[16];
4511 idsn[2] = sha_digest[17];
4512 idsn[1] = sha_digest[18];
4513 idsn[0] = sha_digest[19];
490019cf 4514 return;
39236c6e
A
4515}
4516
490019cf
A
/*
 * Initialize per-connection MPTCP properties: protocol version, DSS
 * checksum policy (from the mptcp_dss_csum sysctl), receive window and
 * the garbage-collection grace period.
 */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* There is only Version 0 at this time */
	mp_tp->mpt_version = MPTCP_STD_VERSION_0;

	/* Set DSS checksum flag */
	if (mptcp_dss_csum)
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;

	/* Set up receive window */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
4533
/*
 * Generate the local side of the MPTCP handshake state: a random local
 * key, and from its SHA-1 digest the local token and initial DSN.
 * Also seeds the send sequence space and common connection properties.
 */
static void
mptcp_init_local_parms(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[SHA1_RESULTLEN];

	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);

	mptcp_generate_token(key_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));

	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
4554
/*
 * Derive the remote side of the MPTCP handshake state from the peer's
 * key: remote token, remote initial DSN, and the initial receive
 * sequence number.  Returns 0 on success, -1 for unsupported versions.
 */
int
mptcp_init_remote_parms(struct mptcb *mp_tp)
{
	char remote_digest[SHA1_RESULTLEN];
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/* Only Version 0 is supported for auth purposes */
	if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
		return (-1);

	/* Setup local and remote tokens and Initial DSNs */
	mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
	mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
	mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
	/* peer's SYN consumes the first data-sequence slot */
	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;

	return (0);
}
4575
5ba3f43e 4576static void
39236c6e
A
4577mptcp_send_dfin(struct socket *so)
4578{
4579 struct tcpcb *tp = NULL;
4580 struct inpcb *inp = NULL;
4581
4582 inp = sotoinpcb(so);
4583 if (!inp)
4584 return;
4585
4586 tp = intotcpcb(inp);
4587 if (!tp)
4588 return;
4589
4590 if (!(tp->t_mpflags & TMPF_RESET))
4591 tp->t_mpflags |= TMPF_SEND_DFIN;
4592}
4593
4594/*
4595 * Data Sequence Mapping routines
4596 */
/*
 * Stamp every packet in the mbuf chain with the next data sequence
 * number (DSN) mapping, advancing mpt_sndmax by each packet's length.
 * Used on the MPTCP send path before data is handed to a subflow.
 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL)
		return;

	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		m->m_pkthdr.mp_rlen = m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
}
4617
/*
 * On a fallback (plain-TCP) subflow, infer the MPTCP-level data ack from
 * the subflow-level bytes being dropped from the send buffer, and feed it
 * to mptcp_data_ack_rcvd().  'len' is the number of subflow bytes acked;
 * 'm' is the head of the send buffer's mbuf chain.
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	if (!m || len == 0)
		return;

	/* walk the acked prefix; optimistically assume full mappings ack'ed */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
	mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
}
4667
/*
 * Before 'len' bytes are dropped from a subflow send buffer, advance the
 * DSN/subflow-sequence mappings stored in the mbuf headers so that the
 * remaining data keeps a consistent mapping.  TFO rewinds are handled
 * specially: the sequence fields are not advanced while rewinding.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* this mapping is fully consumed: zero it out */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0)
				m->m_pkthdr.mp_dsn += len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0)
					m->m_pkthdr.mp_rseq += len;
			}
			mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
			    __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
			    m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
4722
4723/* Obtain the DSN mapping stored in the mbuf */
4724void
5ba3f43e
A
4725mptcp_output_getm_dsnmap32(struct socket *so, int off,
4726 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
39236c6e
A
4727{
4728 u_int64_t dsn64;
4729
5ba3f43e 4730 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
39236c6e 4731 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
39236c6e
A
4732}
4733
/*
 * Locate the DSN mapping stored in the mbuf that covers send-buffer
 * offset 'off', and return its DSN, subflow-relative sequence number,
 * mapping length and DSS checksum.  Panics (via VERIFY) if the offset
 * is not covered by the send buffer.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;
	int off_orig = off;

	VERIFY(off >= 0);

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	VERIFY(m);
	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;

	mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
	    __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
}
4776
4777/*
3e170ce0
A
4778 * Note that this is called only from tcp_input() via mptcp_input_preproc()
4779 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4780 * When it trims data tcp_input calls m_adj() which does not remove the
4781 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4782 * The dsn map insertion cannot be delayed after trim, because data can be in
4783 * the reassembly queue for a while and the DSN option info in tp will be
4784 * overwritten for every new packet received.
39236c6e
A
4785 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4786 * with mptcp_adj_rmap()
4787 */
/*
 * Copy the DSN mapping received in the DSS option (stashed in the tcpcb
 * by option parsing) into the mbuf's packet header, marking the mbuf as
 * MPTCP-mapped.  On a fallback connection, only the DATA_FIN marker is
 * propagated from the TCP FIN flag.
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		/* transfer the pending mapping from the tcpcb to the mbuf */
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin)
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		if (th->th_flags & TH_FIN)
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
	}
}
4811
5c9f4661
A
/*
 * Adjust the mbuf's DSN mapping by 'off' just before appending to the
 * subflow socket buffer, validating it against the expected (dsn, rseq,
 * dlen) from the caller.  A mismatching second mapping returns -1; data
 * arriving without any DSS mapping on an unconfirmed subflow triggers
 * fallback notification.  Returns 0 otherwise.
 */
int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	if (m_pktlen(m) == 0)
		return (0);

	if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen)) {
			mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
			    __func__, dsn, m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen),
			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
			return (-1);
		}
		m->m_pkthdr.mp_dsn += off;
		m->m_pkthdr.mp_rseq += off;
		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
	} else {
		if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
			/* data arrived without an DSS option mapping */

			/* initial subflow can fallback right after SYN handshake */
			mptcp_notify_mpfail(so);
		}
	}

	/* a valid data exchange confirms the subflow as MPTCP-capable */
	mpts->mpts_flags |= MPTSF_CONFIRMED;

	return (0);
}
4848
4849/*
4850 * Following routines help with failure detection and failover of data
4851 * transfer from one subflow to another.
4852 */
4853void
4854mptcp_act_on_txfail(struct socket *so)
4855{
4856 struct tcpcb *tp = NULL;
4857 struct inpcb *inp = sotoinpcb(so);
4858
4859 if (inp == NULL)
4860 return;
4861
4862 tp = intotcpcb(inp);
4863 if (tp == NULL)
4864 return;
4865
5ba3f43e 4866 if (so->so_flags & SOF_MP_TRYFAILOVER)
39236c6e 4867 return;
39236c6e
A
4868
4869 so->so_flags |= SOF_MP_TRYFAILOVER;
4870 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4871}
4872
/*
 * Support for MP_FAIL option
 *
 * Walk the subflow send buffer looking for the mapping that contains the
 * failing data-sequence number 'dsn_fail'; on success store the matching
 * subflow TCP sequence number in *tcp_seq and return 0, else return -1.
 */
int
mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	u_int64_t dsn;
	int off = 0;
	u_int32_t datalen;

	if (m == NULL)
		return (-1);

	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		/* Does [dsn, dsn + datalen] cover dsn_fail? */
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			off = dsn_fail - dsn;
			*tcp_seq = m->m_pkthdr.mp_rseq + off;

			mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
			    dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);

			return (0);
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */

	mptcplog((LOG_ERR, "MPTCP Sender: "
	    "%s: %llu not found \n", __func__, dsn_fail),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
	return (-1);
}
4914
/*
 * Support for sending contiguous MPTCP bytes in subflow
 * Also for preventing sending data with ACK in 3-way handshake
 *
 * Given an offset 'off' into the subflow send buffer, return how many bytes
 * of the current DSS mapping remain from that offset.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* Look up the mapping covering send-buffer offset 'off'. */
	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST)
		mdss_subflow_off--;

	/* Diagnostic print before the VERIFY fires, to aid debugging. */
	if (off < mdss_subflow_off)
		printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
		    off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
	VERIFY(off >= mdss_subflow_off);

	mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
	    __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
	    mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	return (mdss_data_len - (off - mdss_subflow_off));
}
4956
4957static uint32_t
4958mptcp_get_maxseg(struct mptses *mpte)
4959{
4960 struct mptsub *mpts;
4961 uint32_t maxseg = 0;
4962
4963 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4964 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4965
4966 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4967 TCPS_HAVERCVDFIN2(tp->t_state))
4968 continue;
4969
4970 if (tp->t_maxseg > maxseg)
4971 maxseg = tp->t_maxseg;
4972 }
4973
4974 return (maxseg);
4975}
4976
4977static uint8_t
4978mptcp_get_rcvscale(struct mptses *mpte)
4979{
4980 struct mptsub *mpts;
4981 uint8_t rcvscale = UINT8_MAX;
4982
4983 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4984 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4985
4986 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4987 TCPS_HAVERCVDFIN2(tp->t_state))
4988 continue;
4989
4990 if (tp->rcv_scale < rcvscale)
4991 rcvscale = tp->rcv_scale;
4992 }
4993
4994 return (rcvscale);
4995}
4996
/* Similar to tcp_sbrcv_reserve */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		/* Grow the ideal size only; it is trimmed elsewhere. */
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5026
/*
 * Auto-grow the MPTCP-level receive socket buffer based on the aggregate
 * size of the subflows' receive buffers.
 */
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
5076
5077/*
5ba3f43e
A
5078 * Determine if we can grow the recieve socket buffer to avoid sending
5079 * a zero window update to the peer. We allow even socket buffers that
5080 * have fixed size (set by the application) to grow if the resource
5081 * constraints are met. They will also be trimmed after the application
5082 * reads data.
5083 *
5084 * Similar to tcp_sbrcv_grow_rwin
39236c6e 5085 */
5ba3f43e
A
5086static void
5087mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
39236c6e 5088{
5ba3f43e
A
5089 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5090 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5091 u_int32_t rcvbuf = sb->sb_hiwat;
39236c6e 5092
5ba3f43e
A
5093 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
5094 return;
39236c6e 5095
5ba3f43e
A
5096 if (tcp_do_autorcvbuf == 1 &&
5097 tcp_cansbgrow(sb) &&
5098 /* Diff to tcp_sbrcv_grow_rwin */
5099 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5100 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5101 rcvbuf < tcp_autorcvbuf_max &&
5102 (sb->sb_idealsize > 0 &&
5103 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5104 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
490019cf 5105 }
39236c6e
A
5106}
5107
5ba3f43e 5108/* Similar to tcp_sbspace */
39236c6e 5109int32_t
5ba3f43e 5110mptcp_sbspace(struct mptcb *mp_tp)
39236c6e 5111{
5ba3f43e 5112 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
39236c6e
A
5113 uint32_t rcvbuf;
5114 int32_t space;
5ba3f43e
A
5115 int32_t pending = 0;
5116
5117 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 5118
5ba3f43e 5119 mptcp_sbrcv_grow_rwin(mp_tp, sb);
39236c6e 5120
5ba3f43e 5121 /* hiwat might have changed */
39236c6e 5122 rcvbuf = sb->sb_hiwat;
5ba3f43e
A
5123
5124 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5125 (sb->sb_mbmax - sb->sb_mbcnt)));
39236c6e
A
5126 if (space < 0)
5127 space = 0;
5ba3f43e
A
5128
5129#if CONTENT_FILTER
5130 /* Compensate for data being processed by content filters */
5131 pending = cfil_sock_data_space(sb);
5132#endif /* CONTENT_FILTER */
5133 if (pending > space)
5134 space = 0;
5135 else
5136 space -= pending;
39236c6e
A
5137
5138 return (space);
5139}
5140
5141/*
5142 * Support Fallback to Regular TCP
5143 */
5144void
5145mptcp_notify_mpready(struct socket *so)
5146{
5147 struct tcpcb *tp = NULL;
5148
5149 if (so == NULL)
5150 return;
5151
5152 tp = intotcpcb(sotoinpcb(so));
5153
5154 if (tp == NULL)
5155 return;
5156
5157 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5158 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5159 struct tcpcb *, tp);
5160
5161 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5162 return;
5163
5164 if (tp->t_mpflags & TMPF_MPTCP_READY)
5165 return;
5166
5167 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5168 tp->t_mpflags |= TMPF_MPTCP_READY;
5169
5170 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5171}
5172
5173void
5174mptcp_notify_mpfail(struct socket *so)
5175{
5176 struct tcpcb *tp = NULL;
5177
5178 if (so == NULL)
5179 return;
5180
5181 tp = intotcpcb(sotoinpcb(so));
5182
5183 if (tp == NULL)
5184 return;
5185
5186 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5187 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5188 struct tcpcb *, tp);
5189
5190 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5191 return;
5192
5193 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5194 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5195
5196 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5197}
5198
5199/*
5200 * Keepalive helper function
5201 */
5202boolean_t
5203mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5204{
5205 boolean_t ret = 1;
5ba3f43e
A
5206 mpte_lock_assert_held(mp_tp->mpt_mpte);
5207
39236c6e
A
5208 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5209 ret = 0;
5210 }
39236c6e
A
5211 return (ret);
5212}
5213
/*
 * MPTCP t_maxseg adjustment function
 *
 * Returns the number of bytes to subtract from t_maxseg to leave room for
 * the most common MPTCP option (DSS+DATA_ACK).  'mtudisc' selects the rules
 * used during MTU discovery versus initial option processing.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/*
 * Both branches add 2: either 2 bytes of DSS checksum, or 2 bytes of
 * padding to keep the option 32-bit aligned (plus EOL).
 */
#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
		mss_lower += 2; \
	else \
		/* adjust to 32-bit boundary + EOL */ \
		mss_lower += 2; \
}
	if (mp_tp == NULL)
		return (0);

	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return (mss_lower);
}
5259
5260/*
5261 * Update the pid, upid, uuid of the subflow so, based on parent so
5262 */
5263void
5ba3f43e 5264mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
39236c6e 5265{
5ba3f43e
A
5266 if (so->last_pid != mp_so->last_pid ||
5267 so->last_upid != mp_so->last_upid) {
5268 so->last_upid = mp_so->last_upid;
5269 so->last_pid = mp_so->last_pid;
5270 uuid_copy(so->last_uuid, mp_so->last_uuid);
39236c6e 5271 }
5ba3f43e 5272 so_update_policy(so);
39236c6e
A
5273}
5274
/*
 * Fill one mptcp_flow_t record (for the pcblist sysctl) from a subflow
 * socket and its mptsub state.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else
#endif
	if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
5312
5313static int
5314mptcp_pcblist SYSCTL_HANDLER_ARGS
5315{
5316#pragma unused(oidp, arg1, arg2)
5317 int error = 0, f;
5ba3f43e 5318 size_t len;
39236c6e
A
5319 struct mppcb *mpp;
5320 struct mptses *mpte;
5321 struct mptcb *mp_tp;
5322 struct mptsub *mpts;
5323 struct socket *so;
5324 conninfo_mptcp_t mptcpci;
fe8ab488 5325 mptcp_flow_t *flows = NULL;
39236c6e
A
5326
5327 if (req->newptr != USER_ADDR_NULL)
5328 return (EPERM);
5329
5330 lck_mtx_lock(&mtcbinfo.mppi_lock);
39236c6e 5331 if (req->oldptr == USER_ADDR_NULL) {
5ba3f43e 5332 size_t n = mtcbinfo.mppi_count;
39236c6e 5333 lck_mtx_unlock(&mtcbinfo.mppi_lock);
39037602 5334 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
39236c6e
A
5335 4 * (n + n/8) * sizeof(mptcp_flow_t);
5336 return (0);
5337 }
5338 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 5339 flows = NULL;
5ba3f43e 5340 mpp_lock(mpp);
39236c6e
A
5341 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5342 mpte = mptompte(mpp);
5343 VERIFY(mpte != NULL);
5ba3f43e 5344 mpte_lock_assert_held(mpte);
39236c6e
A
5345 mp_tp = mpte->mpte_mptcb;
5346 VERIFY(mp_tp != NULL);
3e170ce0
A
5347
5348 bzero(&mptcpci, sizeof(mptcpci));
39236c6e 5349 mptcpci.mptcpci_state = mp_tp->mpt_state;
3e170ce0
A
5350 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5351 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5352 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5353 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5354 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5355 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5356 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5357 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5358 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5359 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5ba3f43e 5360 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
3e170ce0
A
5361 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5362 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
3e170ce0 5363
39236c6e 5364 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
3e170ce0
A
5365 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5366 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5367 mptcpci.mptcpci_flow_offset =
5368 offsetof(conninfo_mptcp_t, mptcpci_flows);
5369
fe8ab488
A
5370 len = sizeof(*flows) * mpte->mpte_numflows;
5371 if (mpte->mpte_numflows != 0) {
5372 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5373 if (flows == NULL) {
5ba3f43e 5374 mpp_unlock(mpp);
fe8ab488
A
5375 break;
5376 }
5377 mptcpci.mptcpci_len = sizeof(mptcpci) +
5378 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5379 error = SYSCTL_OUT(req, &mptcpci,
5380 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5381 } else {
5382 mptcpci.mptcpci_len = sizeof(mptcpci);
3e170ce0 5383 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
39037602 5384 }
39236c6e 5385 if (error) {
5ba3f43e 5386 mpp_unlock(mpp);
39236c6e
A
5387 FREE(flows, M_TEMP);
5388 break;
5389 }
5390 f = 0;
5391 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
39236c6e 5392 so = mpts->mpts_socket;
39236c6e 5393 fill_mptcp_subflow(so, &flows[f], mpts);
39236c6e
A
5394 f++;
5395 }
5ba3f43e 5396 mpp_unlock(mpp);
fe8ab488
A
5397 if (flows) {
5398 error = SYSCTL_OUT(req, flows, len);
5399 FREE(flows, M_TEMP);
5400 if (error)
5401 break;
5402 }
39236c6e
A
5403 }
5404 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5405
5406 return (error);
5407}
5408
5409SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
39037602 5410 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
39236c6e 5411 "List of active MPTCP connections");
fe8ab488 5412
fe8ab488
A
5413/*
5414 * Set notsent lowat mark on the MPTCB
5415 */
5416int
5417mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5418{
5419 struct mptcb *mp_tp = NULL;
5420 int error = 0;
5421
5422 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5423 mp_tp = mpte->mpte_mptcb;
5424
5425 if (mp_tp)
5426 mp_tp->mpt_notsent_lowat = optval;
5427 else
5428 error = EINVAL;
5429
5ba3f43e 5430 return (error);
fe8ab488
A
5431}
5432
5433u_int32_t
5434mptcp_get_notsent_lowat(struct mptses *mpte)
5435{
5436 struct mptcb *mp_tp = NULL;
5437
5438 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5439 mp_tp = mpte->mpte_mptcb;
5440
5441 if (mp_tp)
5ba3f43e 5442 return (mp_tp->mpt_notsent_lowat);
fe8ab488 5443 else
5ba3f43e 5444 return (0);
fe8ab488
A
5445}
5446
/*
 * Return 1 when the amount of unsent data is at or below the configured
 * notsent-lowat mark (i.e., the socket should be marked writable), 0
 * otherwise.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return (0);
	}

	mpte = mptompte(mpp);
	mpte_lock_assert_held(mpte);
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/* Writable if nothing is pending or unsent bytes <= lowat. */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		mptcplog((LOG_DEBUG, "MPTCP Sender: "
		    "lowat %d notsent %d actual %d \n",
		    mp_tp->mpt_notsent_lowat, notsent,
		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
		    MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
		return (1);
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		/* Decision is made on the first active subflow only. */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
			    " nodelay false \n",
			    mp_tp->mpt_notsent_lowat, notsent),
			    MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
			return (retval);
		}
	}
	return (0);
}
5505
/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;	/* handle from ctl_register() */
static uint32_t mptcp_kern_skt_inuse = 0;	/* open control-socket count */
static uint32_t mptcp_kern_skt_unit;		/* unit of the connected symptomsd socket */
symptoms_advisory_t mptcp_advisory;		/* latest advisory state received */
5511
/*
 * kernel-control connect callback: record the unit of the (single expected)
 * symptomsd control socket so mptcp_ask_symptoms() can reach it.
 */
static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	/* More than one concurrent client is unexpected; log it. */
	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
		mptcplog((LOG_ERR, "%s MPTCP kernel-control socket already open!", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	mptcp_kern_skt_unit = sac->sc_unit;

	return (0);
}
5526
/*
 * Symptoms granted cell access to the app identified by 'uuid': revisit
 * subflow membership of every MPTCP connection owned by that app.
 */
static void
mptcp_allow_uuid(uuid_t uuid)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		mpp_lock(mpp);

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		/* Match against the effective UUID for delegated sockets. */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid))
			goto next;
		else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid))
			goto next;

		/* Temporarily mark access as granted while we re-evaluate. */
		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;

next:
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
5565
5566static void
5567mptcp_wifi_status_changed(void)
5568{
5569 struct mppcb *mpp;
5570
5571 /* Iterate over all MPTCP connections */
5572
5573 lck_mtx_lock(&mtcbinfo.mppi_lock);
5574
5575 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5576 struct mptses *mpte;
5577 struct socket *mp_so;
5578
5579 mpp_lock(mpp);
5580
5581 mpte = mpp->mpp_pcbe;
5582 mp_so = mpp->mpp_socket;
5583
5584 /* Only handover-mode is purely driven by Symptom's Wi-Fi status */
5585 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
5586 goto next;
5587
5588 mptcp_check_subflows_and_add(mpte);
5589 mptcp_check_subflows_and_remove(mpte);
5590
5591next:
5592 mpp_unlock(mpp);
5593 }
5594
5595 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5596}
5597
/*
 * Ask the symptomsd daemon (via the kernel-control socket) whether the app
 * owning this MPTCP connection may use cellular, passing its UUID and task
 * priority.  The answer arrives asynchronously via mptcp_symptoms_ctl_send().
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p;
	int pid, prio, err;

	/* No symptomsd client connected yet. */
	if (mptcp_kern_skt_unit == 0) {
		mptcplog((LOG_ERR, "%s skt_unit is still 0\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		return;
	}

	mp_so = mptetoso(mpte);

	/* Use the effective (delegated) identity when present. */
	if (mp_so->so_flags & SOF_DELEGATED)
		pid = mp_so->e_pid;
	else
		pid = mp_so->last_pid;

	p = proc_find(pid);
	if (p == PROC_NULL) {
		mptcplog((LOG_ERR, "%s Couldn't find proc for pid %u\n", __func__,
		    pid), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		return;
	}

	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	if (mp_so->so_flags & SOF_DELEGATED)
		uuid_copy(ask.uuid, mp_so->e_uuid);
	else
		uuid_copy(ask.uuid, mp_so->last_uuid);

	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	else if (prio == TASK_FOREGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	else
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;

	mptcplog((LOG_DEBUG, "%s ask symptoms about pid %u, prio %u\n", __func__,
	    pid, ask.priority), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);
	if (err)
		mptcplog((LOG_ERR, "%s ctl_enqueuedata failed %d\n", __func__, err),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	/* Balance the proc_find() reference. */
	proc_rele(p);
}
5653
/* kernel-control disconnect callback: drop the in-use count. */
static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return (0);
}
5664
5665static errno_t
5666mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5667 mbuf_t m, int flags)
5668{
5ba3f43e 5669#pragma unused(kctlref, unitinfo, flags)
3e170ce0
A
5670 symptoms_advisory_t *sa = NULL;
5671
5ba3f43e
A
5672 if (kcunit != mptcp_kern_skt_unit)
5673 mptcplog((LOG_ERR, "%s kcunit %u is different from expected one %u\n",
5674 __func__, kcunit, mptcp_kern_skt_unit),
5675 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5676
3e170ce0
A
5677 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5678 mbuf_freem(m);
5679 return (EINVAL);
5680 }
5681
5682 if (mbuf_len(m) >= sizeof(*sa))
5683 sa = mbuf_data(m);
5684 else
5685 return (EINVAL);
5686
5ba3f43e
A
5687 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
5688 sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
5689 uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
3e170ce0 5690
5ba3f43e
A
5691 mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
5692 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
5693 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3e170ce0
A
5694
5695 if ((sa->sa_wifi_status &
5696 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5ba3f43e 5697 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
3e170ce0 5698 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
3e170ce0 5699
5ba3f43e
A
5700 if (old_wifi_status != mptcp_advisory.sa_wifi_status)
5701 mptcp_wifi_status_changed();
5702 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
5703 mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
5704 mptcp_advisory.sa_wifi_status),
5705 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5706 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
5707 uuid_t uuid;
5708
5709 mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
5710 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5711
5712 uuid_copy(uuid, (unsigned char *)(sa + 1));
5713
5714 mptcp_allow_uuid(uuid);
3e170ce0 5715 }
5ba3f43e 5716
3e170ce0
A
5717 return (0);
5718}
5719
/* Register the symptomsd advisory kernel-control socket. */
void
mptcp_control_register(void)
{
	/* Set up the advisory control socket */
	struct kern_ctl_reg mptcp_kern_ctl;

	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
	    sizeof(mptcp_kern_ctl.ctl_name));
	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
	/* Root-only control socket. */
	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;

	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
}
5736
/*
 * Returns non-zero when Symptoms marked Wi-Fi as bad.
 */
int
mptcp_is_wifi_unusable(void)
{
	/* a false return val indicates there is no info or wifi is ok */
	return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
}
5743
490019cf
A
5744/* If TFO data is succesfully acked, it must be dropped from the mptcp so */
5745static void
5ba3f43e 5746mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
490019cf 5747{
5ba3f43e 5748 struct socket *mp_so = mptetoso(mpte);
490019cf
A
5749 struct socket *so = mpts->mpts_socket;
5750 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5751 struct mptcb *mp_tp = mpte->mpte_mptcb;
5752
5753 /* If data was sent with SYN, rewind state */
5754 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5ba3f43e 5755 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
490019cf 5756 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
5ba3f43e 5757
490019cf
A
5758 VERIFY(mp_droplen <= (UINT_MAX));
5759 VERIFY(mp_droplen >= tcp_droplen);
5760
5ba3f43e
A
5761 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5762 mpts->mpts_iss += tcp_droplen;
5763 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5764
490019cf
A
5765 if (mp_droplen > tcp_droplen) {
5766 /* handle partial TCP ack */
5767 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5768 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
490019cf
A
5769 mp_droplen = tcp_droplen;
5770 } else {
5771 /* all data on SYN was acked */
5772 mpts->mpts_rel_seq = 1;
5773 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
490019cf
A
5774 }
5775 mp_tp->mpt_sndmax -= tcp_droplen;
5776
490019cf
A
5777 if (mp_droplen != 0) {
5778 VERIFY(mp_so->so_snd.sb_mb != NULL);
5779 sbdrop(&mp_so->so_snd, (int)mp_droplen);
5780 }
5ba3f43e
A
5781 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
5782 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5783 mpts->mpts_connid, tcp_droplen, mp_droplen),
5784 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5785 }
5786}
5787
5788int
5789mptcp_freeq(struct mptcb *mp_tp)
5790{
5791 struct tseg_qent *q;
5792 int rv = 0;
5793
5794 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
5795 LIST_REMOVE(q, tqe_q);
5796 m_freem(q->tqe_m);
5797 zfree(tcp_reass_zone, q);
5798 rv = 1;
5799 }
5800 mp_tp->mpt_reassqlen = 0;
5801 return (rv);
5802}
5803
5804static int
5805mptcp_post_event(u_int32_t event_code, int value)
5806{
5807 struct kev_mptcp_data event_data;
5808 struct kev_msg ev_msg;
5809
5810 memset(&ev_msg, 0, sizeof(ev_msg));
5811
5812 ev_msg.vendor_code = KEV_VENDOR_APPLE;
5813 ev_msg.kev_class = KEV_NETWORK_CLASS;
5814 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
5815 ev_msg.event_code = event_code;
5816
5817 event_data.value = value;
5818
5819 ev_msg.dv[0].data_ptr = &event_data;
5820 ev_msg.dv[0].data_length = sizeof(event_data);
5821
5822 return kev_post_msg(&ev_msg);
5823}
5824
/*
 * Turn on the cell-usage indicator (once, system-wide) when an MPTCP
 * connection starts using cellular.
 */
void
mptcp_set_cellicon(struct mptses *mpte)
{
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY)
		return;

	/* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
	mptcp_last_cellicon_set = tcp_now;

	/* If cellicon is already set, get out of here! */
	if (OSTestAndSet(7, &mptcp_cellicon_is_set))
		return;

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error)
		mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
		    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	else
		mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}
5850
/*
 * Turn off the cell-usage indicator, rate-limited so a recent set is not
 * immediately undone.
 */
void
mptcp_unset_cellicon(void)
{
	int error;

	/* If cellicon is already unset, get out of here! */
	if (OSTestAndClear(7, &mptcp_cellicon_is_set))
		return;

	/*
	 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
	 * explicitly set the cellicon (see mptcp_set_cellicon()), then we unset
	 * it again.
	 */
	if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
	    tcp_now)) {
		/* Too soon: re-set the bit that OSTestAndClear just cleared. */
		OSTestAndSet(7, &mptcp_cellicon_is_set);
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);

	if (error)
		mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
		    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	else
		mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}
5880
5881void
5882mptcp_reset_rexmit_state(struct tcpcb *tp)
5883{
5884 struct mptsub *mpts;
5885 struct inpcb *inp;
5886 struct socket *so;
5887
5888 inp = tp->t_inpcb;
5889 if (inp == NULL)
5890 return;
5891
5892 so = inp->inp_socket;
5893 if (so == NULL)
5894 return;
5895
5896 if (!(so->so_flags & SOF_MP_SUBFLOW))
5897 return;
5898
5899 mpts = tp->t_mpsub;
5900
5901 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
5902 so->so_flags &= ~SOF_MP_TRYFAILOVER;
5903}
5904
/* Clear the subflow's read-stall flag when a keepalive probe is answered. */
void
mptcp_reset_keepalive(struct tcpcb *tp)
{
	struct mptsub *mpts = tp->t_mpsub;

	mpts->mpts_flags &= ~MPTSF_READ_STALL;
}
5ba3f43e 5912