]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/mptcp_subr.c
xnu-4570.1.46.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
CommitLineData
39236c6e 1/*
5ba3f43e 2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
39236c6e
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
5ba3f43e
A
29#include <kern/locks.h>
30#include <kern/policy_internal.h>
31#include <kern/zalloc.h>
32
33#include <mach/sdt.h>
34
35#include <sys/domain.h>
36#include <sys/kdebug.h>
37#include <sys/kern_control.h>
39236c6e
A
38#include <sys/kernel.h>
39#include <sys/mbuf.h>
40#include <sys/mcache.h>
5ba3f43e
A
41#include <sys/param.h>
42#include <sys/proc.h>
43#include <sys/protosw.h>
39236c6e
A
44#include <sys/resourcevar.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
39236c6e 47#include <sys/sysctl.h>
5ba3f43e
A
48#include <sys/syslog.h>
49#include <sys/systm.h>
39236c6e 50
5ba3f43e 51#include <net/content_filter.h>
39236c6e 52#include <net/if.h>
3e170ce0 53#include <net/if_var.h>
39236c6e
A
54#include <netinet/in.h>
55#include <netinet/in_pcb.h>
56#include <netinet/in_var.h>
57#include <netinet/tcp.h>
58#include <netinet/tcp_fsm.h>
59#include <netinet/tcp_seq.h>
60#include <netinet/tcp_var.h>
61#include <netinet/mptcp_var.h>
62#include <netinet/mptcp.h>
5ba3f43e 63#include <netinet/mptcp_opt.h>
39236c6e
A
64#include <netinet/mptcp_seq.h>
65#include <netinet/mptcp_timer.h>
66#include <libkern/crypto/sha1.h>
67#if INET6
68#include <netinet6/in6_pcb.h>
69#include <netinet6/ip6protosw.h>
70#endif /* INET6 */
71#include <dev/random/randomdev.h>
72
73/*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
39236c6e
A
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
5ba3f43e 96 * subflow. This gets decremented prior to the subflow's destruction.
39236c6e 97 *
5ba3f43e
A
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
39236c6e 100 *
5ba3f43e
A
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104 * of the MPTCP-socket.
39236c6e
A
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
5ba3f43e 109 * of the subflows have been destroyed.
39236c6e
A
110 */
111
fe8ab488 112static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
39236c6e 113static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
39236c6e
A
114
115static uint32_t mptcp_gc(struct mppcbinfo *);
39236c6e
A
116static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
5ba3f43e
A
118static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
39236c6e
A
120static void mptcp_subflow_rupcall(struct socket *, void *, int);
121static void mptcp_subflow_input(struct mptses *, struct mptsub *);
122static void mptcp_subflow_wupcall(struct socket *, void *, int);
5ba3f43e
A
123static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
124static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
125static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
126
127static void mptcp_subflow_abort(struct mptsub *, int);
128
129static void mptcp_send_dfin(struct socket *so);
39236c6e
A
130
/*
 * Return codes of the subflow event handlers.
 *
 * Success values must be greater than or equal to MPTS_EVRET_OK.  Any
 * value below that signals an error or an action which requires immediate
 * attention; it prevents the remaining handlers from processing their
 * respective events until the next round of event processing.
 */
typedef enum {
	MPTS_EVRET_DELETE		= 1,	/* delete this subflow */
	MPTS_EVRET_OK			= 2,	/* OK */
	MPTS_EVRET_CONNECT_PENDING	= 3,	/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,	/* abort all but preferred */
} ev_ret_t;
144
3e170ce0 145static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
5ba3f43e
A
146static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
147static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
148static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
149static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
150static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
151static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
152static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
153static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
154static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
155static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
156static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
fe8ab488 157
39236c6e
A
158static const char *mptcp_evret2str(ev_ret_t);
159
5ba3f43e
A
160static void mptcp_do_sha1(mptcp_key_t *, char *);
161static void mptcp_init_local_parms(struct mptses *);
39236c6e
A
162
163static unsigned int mptsub_zone_size; /* size of mptsub */
164static struct zone *mptsub_zone; /* zone for mptsub */
165
166static unsigned int mptopt_zone_size; /* size of mptopt */
167static struct zone *mptopt_zone; /* zone for mptopt */
168
169static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
170static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
171
172struct mppcbinfo mtcbinfo;
173
39236c6e
A
174#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
175#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
176
177SYSCTL_DECL(_net_inet);
178
179SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
180
5ba3f43e 181uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
3e170ce0
A
182SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
183 &mptcp_dbg_area, 0, "MPTCP debug area");
184
5ba3f43e 185uint32_t mptcp_dbg_level = 1;
3e170ce0
A
186SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
187 &mptcp_dbg_level, 0, "MPTCP debug level");
188
39236c6e
A
189SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
190 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
191
39236c6e
A
192static struct protosw mptcp_subflow_protosw;
193static struct pr_usrreqs mptcp_subflow_usrreqs;
194#if INET6
195static struct ip6protosw mptcp_subflow_protosw6;
196static struct pr_usrreqs mptcp_subflow_usrreqs6;
197#endif /* INET6 */
198
5ba3f43e
A
199static uint8_t mptcp_create_subflows_scheduled;
200
3e170ce0
A
201typedef struct mptcp_subflow_event_entry {
202 uint64_t sofilt_hint_mask;
203 ev_ret_t (*sofilt_hint_ev_hdlr)(
204 struct mptses *mpte,
205 struct mptsub *mpts,
5ba3f43e
A
206 uint64_t *p_mpsofilt_hint,
207 uint64_t event);
3e170ce0
A
208} mptsub_ev_entry_t;
209
5ba3f43e
A
210static uint8_t mptcp_cellicon_is_set;
211static uint32_t mptcp_last_cellicon_set;
212#define MPTCP_CELLICON_TOGGLE_RATE (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
213
490019cf
A
214/*
215 * XXX The order of the event handlers below is really
5ba3f43e 216 * really important. Think twice before changing it.
490019cf 217 */
3e170ce0
A
218static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
219 {
220 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
221 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
222 },
223 {
224 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
225 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
226 },
227 {
228 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
5ba3f43e 229 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
3e170ce0
A
230 },
231 {
232 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
233 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
234 },
235 {
236 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
5ba3f43e 237 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
3e170ce0
A
238 },
239 {
240 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
5ba3f43e 241 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
3e170ce0
A
242 },
243 {
244 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
245 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
246 },
247 {
248 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
249 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
250 },
3e170ce0
A
251 {
252 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
253 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
254 },
255 {
256 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
257 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
258 },
3e170ce0
A
259 {
260 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
261 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
262 },
263 {
5ba3f43e
A
264 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
265 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
266 },
267 {
268 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
269 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
270 },
3e170ce0
A
271};
272
39236c6e
A
273/*
274 * Protocol pr_init callback.
275 */
276void
277mptcp_init(struct protosw *pp, struct domain *dp)
278{
279#pragma unused(dp)
280 static int mptcp_initialized = 0;
281 struct protosw *prp;
282#if INET6
283 struct ip6protosw *prp6;
284#endif /* INET6 */
285
286 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
287
288 /* do this only once */
289 if (mptcp_initialized)
290 return;
291 mptcp_initialized = 1;
292
293 /*
294 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
295 * we must be able to find IPPROTO_TCP entries for both.
296 */
297 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
298 VERIFY(prp != NULL);
299 bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
300 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
301 sizeof (mptcp_subflow_usrreqs));
302 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
303 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
304 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
305 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
5ba3f43e 306 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
39236c6e
A
307 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
308 /*
309 * Socket filters shouldn't attach/detach to/from this protosw
310 * since pr_protosw is to be used instead, which points to the
311 * real protocol; if they do, it is a bug and we should panic.
312 */
313 mptcp_subflow_protosw.pr_filter_head.tqh_first =
314 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
315 mptcp_subflow_protosw.pr_filter_head.tqh_last =
316 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
317
318#if INET6
319 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
320 IPPROTO_TCP, SOCK_STREAM);
321 VERIFY(prp6 != NULL);
322 bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
323 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
324 sizeof (mptcp_subflow_usrreqs6));
325 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
326 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
327 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
328 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
5ba3f43e 329 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
39236c6e
A
330 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
331 /*
332 * Socket filters shouldn't attach/detach to/from this protosw
333 * since pr_protosw is to be used instead, which points to the
334 * real protocol; if they do, it is a bug and we should panic.
335 */
336 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
337 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
338 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
339 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
340#endif /* INET6 */
341
342 bzero(&mtcbinfo, sizeof (mtcbinfo));
343 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
344 mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
345 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
346 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
347 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
348 /* NOTREACHED */
349 }
350 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
351 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
352
353 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
354 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
355 mtcbinfo.mppi_lock_grp_attr);
356 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
357 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
358 mtcbinfo.mppi_lock_attr);
39236c6e 359
3e170ce0 360 mtcbinfo.mppi_gc = mptcp_gc;
39236c6e
A
361 mtcbinfo.mppi_timer = mptcp_timer;
362
363 /* attach to MP domain for garbage collection to take place */
364 mp_pcbinfo_attach(&mtcbinfo);
365
366 mptsub_zone_size = sizeof (struct mptsub);
367 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
368 8192, "mptsub")) == NULL) {
369 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
370 /* NOTREACHED */
371 }
372 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
373 zone_change(mptsub_zone, Z_EXPAND, TRUE);
374
375 mptopt_zone_size = sizeof (struct mptopt);
376 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
377 1024, "mptopt")) == NULL) {
378 panic("%s: unable to allocate MPTCP option zone\n", __func__);
379 /* NOTREACHED */
380 }
381 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
382 zone_change(mptopt_zone, Z_EXPAND, TRUE);
383
384 mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
385 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
386 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
387 panic("%s: unable to allocate MPTCP address auth zone \n",
388 __func__);
389 /* NOTREACHED */
390 }
391 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
392 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
393
5ba3f43e
A
394 mptcp_last_cellicon_set = tcp_now;
395}
396
397int
398mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
399{
400 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
401
402 int i, index = -1;
403
404 if (ifp == NULL) {
405 mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
406 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
407 return (-1);
408 }
409
410 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
411 if (stats[i].ifindex == IFSCOPE_NONE) {
412 if (index < 0)
413 index = i;
414 continue;
415 }
416
417 if (stats[i].ifindex == ifp->if_index) {
418 index = i;
419 return (index);
420 }
421 }
422
423 if (index != -1) {
424 stats[index].ifindex = ifp->if_index;
425 if (stats[index].is_expensive == 0)
426 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
427 }
428
429 return (index);
430}
431
432void
433mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
434{
435 int index;
436
437 tcpstat.tcps_mp_switches++;
438 mpte->mpte_subflow_switches++;
439
440 index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
441
442 if (index != -1)
443 mpte->mpte_itfstats[index].switches++;
444}
445
446/*
447 * Flushes all recorded socket options from an MP socket.
448 */
449static void
450mptcp_flush_sopts(struct mptses *mpte)
451{
452 struct mptopt *mpo, *tmpo;
453
454 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
455 mptcp_sopt_remove(mpte, mpo);
456 mptcp_sopt_free(mpo);
457 }
458 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
39236c6e
A
459}
460
461/*
462 * Create an MPTCP session, called as a result of opening a MPTCP socket.
463 */
5ba3f43e
A
464int
465mptcp_sescreate(struct mppcb *mpp)
39236c6e
A
466{
467 struct mppcbinfo *mppi;
468 struct mptses *mpte;
469 struct mptcb *mp_tp;
39236c6e
A
470
471 VERIFY(mpp != NULL);
472 mppi = mpp->mpp_pcbinfo;
473 VERIFY(mppi != NULL);
474
3e170ce0
A
475 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
476 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
39236c6e
A
477
478 /* MPTCP Multipath PCB Extension */
479 bzero(mpte, sizeof (*mpte));
480 VERIFY(mpp->mpp_pcbe == NULL);
481 mpp->mpp_pcbe = mpte;
482 mpte->mpte_mppcb = mpp;
483 mpte->mpte_mptcb = mp_tp;
484
485 TAILQ_INIT(&mpte->mpte_sopts);
486 TAILQ_INIT(&mpte->mpte_subflows);
3e170ce0
A
487 mpte->mpte_associd = SAE_ASSOCID_ANY;
488 mpte->mpte_connid_last = SAE_CONNID_ANY;
39236c6e 489
5ba3f43e
A
490 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
491 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
39236c6e
A
492
493 /* MPTCP Protocol Control Block */
494 bzero(mp_tp, sizeof (*mp_tp));
39236c6e 495 mp_tp->mpt_mpte = mpte;
3e170ce0 496 mp_tp->mpt_state = MPTCPS_CLOSED;
39236c6e 497
5ba3f43e
A
498 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
499
500 return (0);
501}
502
503static void
504mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
505 uint64_t *cellbytes, uint64_t *allbytes)
506{
507 int64_t mycellbytes = 0;
508 uint64_t myallbytes = 0;
509 int i;
510
511 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
512 if (mpte->mpte_itfstats[i].is_expensive) {
513 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
514 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
515 }
516
517 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
518 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
519 }
520
521 if (initial_cell) {
522 mycellbytes -= mpte->mpte_init_txbytes;
523 mycellbytes -= mpte->mpte_init_txbytes;
524 }
525
526 if (mycellbytes < 0) {
527 mptcplog((LOG_ERR, "%s cellbytes is %d\n", __func__, mycellbytes),
528 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
529 *cellbytes = 0;
530 *allbytes = 0;
531 } else {
532 *cellbytes = mycellbytes;
533 *allbytes = myallbytes;
534 }
535}
536
537static void
538mptcpstats_session_wrapup(struct mptses *mpte)
539{
540 boolean_t cell = mpte->mpte_initial_cell;
541
542 switch (mpte->mpte_svctype) {
543 case MPTCP_SVCTYPE_HANDOVER:
544 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
545 tcpstat.tcps_mptcp_fp_handover_attempt++;
546
547 if (cell && mpte->mpte_handshake_success) {
548 tcpstat.tcps_mptcp_fp_handover_success_cell++;
549
550 if (mpte->mpte_used_wifi)
551 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
552 } else if (mpte->mpte_handshake_success) {
553 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
554
555 if (mpte->mpte_used_cell)
556 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
557 }
558 } else {
559 tcpstat.tcps_mptcp_handover_attempt++;
560
561 if (cell && mpte->mpte_handshake_success) {
562 tcpstat.tcps_mptcp_handover_success_cell++;
563
564 if (mpte->mpte_used_wifi)
565 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
566 } else if (mpte->mpte_handshake_success) {
567 tcpstat.tcps_mptcp_handover_success_wifi++;
568
569 if (mpte->mpte_used_cell)
570 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
571 }
572 }
573
574 if (mpte->mpte_handshake_success) {
575 uint64_t cellbytes;
576 uint64_t allbytes;
577
578 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
579
580 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
581 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
582 }
583 break;
584 case MPTCP_SVCTYPE_INTERACTIVE:
585 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
586 tcpstat.tcps_mptcp_fp_interactive_attempt++;
587
588 if (mpte->mpte_handshake_success) {
589 tcpstat.tcps_mptcp_fp_interactive_success++;
590
591 if (!cell && mpte->mpte_used_cell)
592 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
593 }
594 } else {
595 tcpstat.tcps_mptcp_interactive_attempt++;
596
597 if (mpte->mpte_handshake_success) {
598 tcpstat.tcps_mptcp_interactive_success++;
599
600 if (!cell && mpte->mpte_used_cell)
601 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
602 }
603 }
604
605 if (mpte->mpte_handshake_success) {
606 uint64_t cellbytes;
607 uint64_t allbytes;
608
609 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
610
611 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
612 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
613 }
614 break;
615 case MPTCP_SVCTYPE_AGGREGATE:
616 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
617 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
618
619 if (mpte->mpte_handshake_success)
620 tcpstat.tcps_mptcp_fp_aggregate_success++;
621 } else {
622 tcpstat.tcps_mptcp_aggregate_attempt++;
623
624 if (mpte->mpte_handshake_success) {
625 tcpstat.tcps_mptcp_aggregate_success++;
626 }
627 }
628
629 if (mpte->mpte_handshake_success) {
630 uint64_t cellbytes;
631 uint64_t allbytes;
632
633 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
634
635 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
636 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
637 }
638 break;
639 }
640
641 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
642 tcpstat.tcps_mptcp_back_to_wifi++;
39236c6e
A
643}
644
645/*
646 * Destroy an MPTCP session.
647 */
648static void
5ba3f43e 649mptcp_session_destroy(struct mptses *mpte)
39236c6e
A
650{
651 struct mptcb *mp_tp;
652
5ba3f43e 653 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e
A
654
655 mp_tp = mpte->mpte_mptcb;
656 VERIFY(mp_tp != NULL);
657
5ba3f43e
A
658 mptcpstats_session_wrapup(mpte);
659
660 mptcp_unset_cellicon();
661
39236c6e
A
662 /*
663 * MPTCP Multipath PCB Extension section
664 */
665 mptcp_flush_sopts(mpte);
666 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
667
5ba3f43e
A
668 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
669 _FREE(mpte->mpte_itfinfo, M_TEMP);
670
671 mpte->mpte_itfinfo = NULL;
672
673 m_freem_list(mpte->mpte_reinjectq);
39236c6e
A
674
675 /*
676 * MPTCP Protocol Control Block section
677 */
39236c6e
A
678 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
679 struct mptcb *, mp_tp);
680}
681
5ba3f43e
A
682static boolean_t
683mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
39236c6e 684{
5ba3f43e
A
685 return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
686 mp_tp->mpt_state < MPTCPS_TIME_WAIT &&
687 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
688}
39236c6e 689
5ba3f43e
A
690static int
691mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
692{
693 static const struct in6_addr well_known_prefix = {
694 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
695 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
696 0x00, 0x00, 0x00, 0x00},
697 };
698 char buf[MAX_IPv6_STR_LEN];
699 char *ptrv4 = (char *)addrv4;
700 char *ptr = (char *)addr;
701
702 if (IN_ZERONET(addrv4->s_addr) || // 0.0.0.0/8 Source hosts on local network
703 IN_LOOPBACK(addrv4->s_addr) || // 127.0.0.0/8 Loopback
704 IN_LINKLOCAL(addrv4->s_addr) || // 169.254.0.0/16 Link Local
705 IN_DS_LITE(addrv4->s_addr) || // 192.0.0.0/29 DS-Lite
706 IN_6TO4_RELAY_ANYCAST(addrv4->s_addr) || // 192.88.99.0/24 6to4 Relay Anycast
707 IN_MULTICAST(addrv4->s_addr) || // 224.0.0.0/4 Multicast
708 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
709 return (-1);
39236c6e
A
710 }
711
5ba3f43e
A
712 /* Check for the well-known prefix */
713 if (len == NAT64_PREFIX_LEN_96 &&
714 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
715 if (IN_PRIVATE(addrv4->s_addr) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
716 IN_SHARED_ADDRESS_SPACE(addrv4->s_addr)) // 100.64.0.0/10 Shared Address Space
717 return (-1);
718 }
39236c6e 719
5ba3f43e
A
720 switch (len) {
721 case NAT64_PREFIX_LEN_96:
722 memcpy(ptr + 12, ptrv4, 4);
723 break;
724 case NAT64_PREFIX_LEN_64:
725 memcpy(ptr + 9, ptrv4, 4);
726 break;
727 case NAT64_PREFIX_LEN_56:
728 memcpy(ptr + 7, ptrv4, 1);
729 memcpy(ptr + 9, ptrv4 + 1, 3);
730 break;
731 case NAT64_PREFIX_LEN_48:
732 memcpy(ptr + 6, ptrv4, 2);
733 memcpy(ptr + 9, ptrv4 + 2, 2);
734 break;
735 case NAT64_PREFIX_LEN_40:
736 memcpy(ptr + 5, ptrv4, 3);
737 memcpy(ptr + 9, ptrv4 + 3, 1);
738 break;
739 case NAT64_PREFIX_LEN_32:
740 memcpy(ptr + 4, ptrv4, 4);
741 break;
742 default:
743 panic("NAT64-prefix len is wrong: %u\n", len);
744 }
39236c6e 745
5ba3f43e
A
746 mptcplog((LOG_DEBUG, "%s: nat64prefix-len %u synthesized %s\n", __func__,
747 len, inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf))),
748 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 749
5ba3f43e 750 return (0);
39236c6e
A
751}
752
39236c6e 753void
5ba3f43e 754mptcp_check_subflows_and_add(struct mptses *mpte)
39236c6e 755{
5ba3f43e
A
756 struct mptcb *mp_tp = mpte->mpte_mptcb;
757 uint32_t i;
39236c6e 758
5ba3f43e
A
759 if (!mptcp_ok_to_create_subflows(mp_tp))
760 return;
39236c6e 761
5ba3f43e
A
762 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
763 struct mpt_itf_info *info;
764 struct mptsub *mpts;
765 uint32_t ifindex;
766 int found = 0;
39236c6e 767
5ba3f43e 768 info = &mpte->mpte_itfinfo[i];
39236c6e 769
5ba3f43e
A
770 if (info->no_mptcp_support)
771 continue;
39236c6e 772
5ba3f43e
A
773 ifindex = info->ifindex;
774 if (ifindex == IFSCOPE_NONE)
775 continue;
39236c6e 776
5ba3f43e
A
777 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
778 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
39236c6e 779
5ba3f43e
A
780 if (ifp == NULL)
781 continue;
39236c6e 782
5ba3f43e
A
783 if (ifp->if_index == ifindex &&
784 !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED)) {
785 /*
786 * We found a subflow on this interface.
787 * No need to create a new one.
788 */
789 found = 1;
790 break;
791 }
792
793 /*
794 * In Handover mode, only create cell subflow if
795 * 1. Wi-Fi Assist is active
796 * 2. Symptoms marked WiFi as weak
797 * 3. We are experiencing RTOs or we are not sending data.
798 *
799 * This covers the scenario, where:
800 * 1. We send and get retransmission timeouts (thus,
801 * we confirmed that WiFi is indeed bad).
802 * 2. We are not sending and the server tries to send.
803 * Establshing a cell-subflow gives the server a
804 * chance to send us some data over cell if WiFi
805 * is dead. We establish the subflow with the
806 * backup-bit set, so the server is not allowed to
807 * send on this subflow as long as WiFi is providing
808 * good performance.
809 */
810 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
811 !IFNET_IS_CELLULAR(ifp) &&
812 !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
813 (!mptcp_is_wifi_unusable() ||
814 (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh &&
815 mptetoso(mpte)->so_snd.sb_cc))) {
816 mptcplog((LOG_DEBUG, "%s handover, wifi state %u rxt %u ifindex %u this %u\n",
817 __func__, mptcp_is_wifi_unusable(), sototcpcb(mpts->mpts_socket)->t_rxtshift, ifindex,
818 ifp->if_index),
819 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
820 found = 1;
821 break;
822 }
823 }
824
825 if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
826 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
827 mptcp_developer_mode == 0) {
828 mptcp_ask_symptoms(mpte);
829 return;
830 }
831
832 if (!found) {
833 struct sockaddr *dst = &mpte->mpte_dst;
834 struct sockaddr_in6 nat64pre;
835
836 if (mpte->mpte_dst.sa_family == AF_INET &&
837 !info->has_v4_conn && info->has_v6_conn) {
838 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
839 struct ifnet *ifp;
840 int error, j;
841
842 bzero(&nat64pre, sizeof(struct sockaddr_in6));
843
844 ifnet_head_lock_shared();
845 ifp = ifindex2ifnet[ifindex];
846 ifnet_head_done();
847
848 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
849 if (error) {
850 mptcplog((LOG_ERR, "%s: no NAT64-prefix on itf %s, error %d\n",
851 __func__, ifp->if_name, error),
852 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
853 continue;
854 }
855
856 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
857 if (nat64prefixes[j].prefix_len != 0)
858 break;
859 }
860
861 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
862
863 error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
864 nat64prefixes[j].prefix_len,
865 &mpte->__mpte_dst_v4.sin_addr);
866 if (error != 0) {
867 mptcplog((LOG_INFO, "%s: cannot synthesize this addr\n", __func__),
868 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
869 continue;
870 }
871
872 memcpy(&nat64pre.sin6_addr,
873 &nat64prefixes[j].ipv6_prefix,
874 sizeof(nat64pre.sin6_addr));
875 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
876 nat64pre.sin6_family = AF_INET6;
877 nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
878 nat64pre.sin6_flowinfo = 0;
879 nat64pre.sin6_scope_id = 0;
880
881 dst = (struct sockaddr *)&nat64pre;
882 }
883
884 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
885 }
886 }
887}
888
889/*
890 * Based on the MPTCP Service-type and the state of the subflows, we
891 * will destroy subflows here.
892 */
893static void
894mptcp_check_subflows_and_remove(struct mptses *mpte)
895{
896 struct mptsub *mpts, *tmpts;
897 int found_working_subflow = 0, removed_some = 0;
898 int wifi_unusable = mptcp_is_wifi_unusable();
899
900 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
901 return;
902
903 /*
904 * Look for a subflow that is on a non-cellular interface
905 * and actually works (aka, no retransmission timeout).
906 */
907 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
908 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
909 struct socket *so;
910 struct tcpcb *tp;
911
912 if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
913 continue;
914
915 so = mpts->mpts_socket;
916 tp = sototcpcb(so);
917
918 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
919 tp->t_state != TCPS_ESTABLISHED)
920 continue;
921
922 /* Either this subflow is in good condition while we try to send */
923 if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc)
924 found_working_subflow = 1;
39236c6e 925
5ba3f43e
A
926 /* Or WiFi is fine */
927 if (!wifi_unusable)
928 found_working_subflow = 1;
39236c6e
A
929 }
930
5ba3f43e
A
931 /*
932 * Couldn't find a working subflow, let's not remove those on a cellular
933 * interface.
934 */
935 if (!found_working_subflow)
936 return;
937
938 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
939 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
940
941 /* Only remove cellular subflows */
942 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp))
943 continue;
944
945 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
946 removed_some = 1;
947 }
948
949 if (removed_some)
950 mptcp_unset_cellicon();
951}
952
953static void
954mptcp_remove_subflows(struct mptses *mpte)
955{
956 struct mptsub *mpts, *tmpts;
957
958 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
959 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
960 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
961
962 soevent(mpts->mpts_socket,
963 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
964 }
965 }
966}
967
968static void
969mptcp_create_subflows(__unused void *arg)
970{
971 struct mppcb *mpp;
972
973 /*
974 * Start with clearing, because we might be processing connections
975 * while a new event comes in.
976 */
977 if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
978 mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
979 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
980
981 /* Iterate over all MPTCP connections */
982
983 lck_mtx_lock(&mtcbinfo.mppi_lock);
984
985 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
986 struct mptses *mpte;
987 struct socket *mp_so;
988
989 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
990 continue;
991
992 mpp_lock(mpp);
993
994 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
995
996 mpte = mpp->mpp_pcbe;
997 mp_so = mpp->mpp_socket;
998
999 VERIFY(mp_so->so_usecount > 0);
1000
1001 mptcp_check_subflows_and_add(mpte);
1002 mptcp_remove_subflows(mpte);
1003
1004 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1005 mpp_unlock(mpp);
1006 }
1007
1008 lck_mtx_unlock(&mtcbinfo.mppi_lock);
1009}
1010
1011/*
1012 * We need this because we are coming from an NECP-event. This event gets posted
1013 * while holding NECP-locks. The creation of the subflow however leads us back
1014 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1015 * So, we would deadlock there as we already hold the NECP-lock.
1016 *
1017 * So, let's schedule this separately. It also gives NECP the chance to make
1018 * progress, without having to wait for MPTCP to finish its subflow creation.
1019 */
1020void
1021mptcp_sched_create_subflows(struct mptses *mpte)
1022{
1023 struct mppcb *mpp = mpte->mpte_mppcb;
1024 struct mptcb *mp_tp = mpte->mpte_mptcb;
1025 struct socket *mp_so = mpp->mpp_socket;
1026
1027 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1028 mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
1029 __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
1030 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1031 return;
1032 }
1033
1034 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1035 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1036 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1037 }
1038
1039 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled))
1040 return;
1041
1042 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1043 timeout(mptcp_create_subflows, NULL, hz/10);
1044}
1045
1046/*
1047 * Allocate an MPTCP socket option structure.
1048 */
1049struct mptopt *
1050mptcp_sopt_alloc(int how)
1051{
1052 struct mptopt *mpo;
1053
1054 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
1055 zalloc_noblock(mptopt_zone);
1056 if (mpo != NULL) {
1057 bzero(mpo, mptopt_zone_size);
1058 }
1059
1060 return (mpo);
1061}
1062
1063/*
1064 * Free an MPTCP socket option structure.
1065 */
1066void
1067mptcp_sopt_free(struct mptopt *mpo)
1068{
1069 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1070
1071 zfree(mptopt_zone, mpo);
1072}
1073
1074/*
1075 * Add a socket option to the MPTCP socket option list.
1076 */
1077void
1078mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1079{
1080 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1081 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1082 mpo->mpo_flags |= MPOF_ATTACHED;
1083 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1084}
1085
1086/*
1087 * Remove a socket option from the MPTCP socket option list.
1088 */
1089void
1090mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1091{
1092 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1093 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1094 mpo->mpo_flags &= ~MPOF_ATTACHED;
1095 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1096}
1097
1098/*
1099 * Search for an existing <sopt_level,sopt_name> socket option.
1100 */
1101struct mptopt *
1102mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1103{
1104 struct mptopt *mpo;
1105
1106 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1107
1108 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1109 if (mpo->mpo_level == sopt->sopt_level &&
1110 mpo->mpo_name == sopt->sopt_name)
1111 break;
1112 }
1113 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
1114
1115 return (mpo);
1116}
1117
1118/*
1119 * Allocate a MPTCP subflow structure.
1120 */
1121static struct mptsub *
1122mptcp_subflow_alloc(void)
1123{
1124 struct mptsub *mpts = zalloc(mptsub_zone);
1125
1126 if (mpts == NULL)
1127 return (NULL);
1128
1129 bzero(mpts, mptsub_zone_size);
39236c6e
A
1130 return (mpts);
1131}
1132
1133/*
1134 * Deallocate a subflow structure, called when all of the references held
1135 * on it have been released. This implies that the subflow has been deleted.
1136 */
5ba3f43e 1137static void
39236c6e
A
1138mptcp_subflow_free(struct mptsub *mpts)
1139{
39236c6e
A
1140 VERIFY(mpts->mpts_refcnt == 0);
1141 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1142 VERIFY(mpts->mpts_mpte == NULL);
1143 VERIFY(mpts->mpts_socket == NULL);
1144
813fb2f6
A
1145 if (mpts->mpts_src != NULL) {
1146 FREE(mpts->mpts_src, M_SONAME);
1147 mpts->mpts_src = NULL;
39236c6e 1148 }
39236c6e
A
1149
1150 zfree(mptsub_zone, mpts);
1151}
1152
5ba3f43e
A
1153static void
1154mptcp_subflow_addref(struct mptsub *mpts)
1155{
1156 if (++mpts->mpts_refcnt == 0)
1157 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1158 /* NOTREACHED */
1159}
1160
1161static void
1162mptcp_subflow_remref(struct mptsub *mpts)
1163{
1164 if (mpts->mpts_refcnt == 0) {
1165 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
1166 /* NOTREACHED */
1167 }
1168 if (--mpts->mpts_refcnt > 0)
1169 return;
1170
1171 /* callee will unlock and destroy lock */
1172 mptcp_subflow_free(mpts);
1173}
1174
1175static void
1176mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
1177{
1178 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
1179 struct tcpcb *tp = sototcpcb(so);
1180
1181 /*
1182 * From this moment on, the subflow is linked to the MPTCP-connection.
1183 * Locking,... happens now at the MPTCP-layer
1184 */
1185 tp->t_mptcb = mpte->mpte_mptcb;
1186 so->so_flags |= SOF_MP_SUBFLOW;
1187 mp_so->so_usecount++;
1188
1189 /*
1190 * Insert the subflow into the list, and associate the MPTCP PCB
1191 * as well as the the subflow socket. From this point on, removing
1192 * the subflow needs to be done via mptcp_subflow_del().
1193 */
1194 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1195 mpte->mpte_numflows++;
1196
1197 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1198 mpts->mpts_mpte = mpte;
1199 mpts->mpts_socket = so;
1200 tp->t_mpsub = mpts;
1201 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1202 mptcp_subflow_addref(mpts); /* for subflow socket */
1203}
1204
1205static void
1206mptcp_subflow_necp_cb(void *handle, __unused int action,
1207 __unused struct necp_client_flow *flow)
1208{
1209 struct inpcb *inp = (struct inpcb *)handle;
1210 struct socket *so = inp->inp_socket;
1211 struct mptsub *mpts;
1212 struct mptses *mpte;
1213
1214 if (action != NECP_CLIENT_CBACTION_NONVIABLE)
1215 return;
1216
1217 /*
1218 * The socket is being garbage-collected. There is nothing to be done
1219 * here.
1220 */
1221 if (so->so_usecount == 0)
1222 return;
1223
1224 socket_lock(so, 1);
1225
1226 /* Check again after we acquired the lock. */
1227 if (so->so_usecount == 0)
1228 goto out;
1229
1230 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1231 mpts = sototcpcb(so)->t_mpsub;
1232
1233 mptcplog((LOG_DEBUG, "%s: Subflow became non-viable", __func__),
1234 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
1235
1236 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1237
1238 mptcp_sched_create_subflows(mpte);
1239
1240 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
1241 flow->viable = 1;
1242
1243out:
1244 socket_unlock(so, 1);
1245}
1246
39236c6e
A
1247/*
1248 * Create an MPTCP subflow socket.
1249 */
1250static int
1251mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
5ba3f43e 1252 struct socket **so)
39236c6e 1253{
5ba3f43e 1254 lck_mtx_t *subflow_mtx;
39236c6e 1255 struct mptopt smpo, *mpo, *tmpo;
5ba3f43e 1256 struct proc *p;
39236c6e
A
1257 struct socket *mp_so;
1258 int error;
1259
1260 *so = NULL;
5ba3f43e
A
1261 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1262 mp_so = mptetoso(mpte);
1263
1264 p = proc_find(mp_so->last_pid);
1265 if (p == PROC_NULL) {
1266 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1267 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1268
1269 return (ESRCH);
1270 }
39236c6e
A
1271
1272 /*
1273 * Create the subflow socket (multipath subflow, non-blocking.)
1274 *
1275 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1276 * socket; it will be cleared when the socket is peeled off or closed.
1277 * It also indicates to the underlying TCP to handle MPTCP options.
1278 * A multipath subflow socket implies SS_NOFDREF state.
1279 */
5ba3f43e
A
1280
1281 /*
1282 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1283 * the ipi-lock. We cannot hold the socket-lock at that point.
1284 */
1285 mpte_unlock(mpte);
1286 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1287 SOCF_ASYNC, PROC_NULL);
1288 mpte_lock(mpte);
1289 if (error) {
1290 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
1291 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
1292 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1293
1294 proc_rele(p);
1295
1296 mptcp_subflow_free(mpts);
39236c6e
A
1297 return (error);
1298 }
1299
5ba3f43e
A
1300 /*
1301 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1302 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1303 * Which is why we also need to get the lock with pr_getlock, as after
1304 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1305 */
1306 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1307 lck_mtx_lock(subflow_mtx);
1308
1309 /*
1310 * Must be the first thing we do, to make sure all pointers for this
1311 * subflow are set.
1312 */
1313 mptcp_subflow_attach(mpte, mpts, *so);
1314
1315 /*
1316 * A multipath subflow socket is used internally in the kernel,
1317 * therefore it does not have a file desciptor associated by
1318 * default.
1319 */
1320 (*so)->so_state |= SS_NOFDREF;
1321
1322 lck_mtx_unlock(subflow_mtx);
39236c6e
A
1323
1324 /* prevent the socket buffers from being compressed */
1325 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1326 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1327
490019cf
A
1328 /* Inherit preconnect and TFO data flags */
1329 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
1330 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
490019cf
A
1331 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
1332 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1333
5ba3f43e
A
1334 /* Inherit uuid and create the related flow. */
1335 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1336 struct mptcb *mp_tp = mpte->mpte_mptcb;
1337
1338 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1339
1340 /*
1341 * A note on the unlock: With MPTCP, we do multiple times a
1342 * necp_client_register_socket_flow. This is problematic,
1343 * because now the lock-ordering guarantee (first necp-locks,
1344 * then socket-locks) is no more respected. So, we need to
1345 * unlock here.
1346 */
1347 mpte_unlock(mpte);
1348 error = necp_client_register_socket_flow(mp_so->last_pid,
1349 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1350 mpte_lock(mpte);
1351
1352 if (error)
1353 goto out_err;
1354
1355 /* Possible state-change during the unlock above */
1356 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1357 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
1358 goto out_err;
1359
1360 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1361 } else {
1362 mptcplog((LOG_NOTICE, "%s: uuid is not set!\n"),
1363 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1364 }
1365
1366 /* inherit the other socket options */
39236c6e
A
1367 bzero(&smpo, sizeof (smpo));
1368 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1369 smpo.mpo_level = SOL_SOCKET;
1370 smpo.mpo_intval = 1;
1371
1372 /* disable SIGPIPE */
1373 smpo.mpo_name = SO_NOSIGPIPE;
5ba3f43e
A
1374 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1375 goto out_err;
39236c6e
A
1376
1377 /* find out if the subflow's source address goes away */
1378 smpo.mpo_name = SO_NOADDRERR;
5ba3f43e
A
1379 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1380 goto out_err;
39236c6e
A
1381
1382 /* enable keepalive */
1383 smpo.mpo_name = SO_KEEPALIVE;
5ba3f43e
A
1384 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1385 goto out_err;
39236c6e
A
1386
1387 smpo.mpo_level = IPPROTO_TCP;
1388 smpo.mpo_intval = mptcp_subflow_keeptime;
1389 smpo.mpo_name = TCP_KEEPALIVE;
5ba3f43e
A
1390 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1391 goto out_err;
1392
1393 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1394 /*
1395 * On secondary subflows we might need to set the cell-fallback
1396 * flag (see conditions in mptcp_subflow_sosetopt).
1397 */
1398 smpo.mpo_level = SOL_SOCKET;
1399 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1400 smpo.mpo_intval = 1;
1401 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1402 goto out_err;
1403 }
39236c6e
A
1404
1405 /* replay setsockopt(2) on the subflow sockets for eligible options */
1406 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1407 int interim;
1408
1409 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
1410 continue;
1411
1412 /*
1413 * Skip those that are handled internally; these options
1414 * should not have been recorded and marked with the
1415 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1416 */
1417 if (mpo->mpo_level == SOL_SOCKET &&
1418 (mpo->mpo_name == SO_NOSIGPIPE ||
1419 mpo->mpo_name == SO_NOADDRERR ||
1420 mpo->mpo_name == SO_KEEPALIVE))
1421 continue;
1422
1423 interim = (mpo->mpo_flags & MPOF_INTERIM);
5ba3f43e
A
1424 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1425 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
1426 " sopt %s val %d interim record removed\n", __func__,
39236c6e 1427 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
1428 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1429 mpo->mpo_intval),
3e170ce0 1430 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1431 mptcp_sopt_remove(mpte, mpo);
1432 mptcp_sopt_free(mpo);
1433 continue;
1434 }
1435 }
1436
1437 /*
1438 * We need to receive everything that the subflow socket has,
1439 * so use a customized socket receive function. We will undo
1440 * this when the socket is peeled off or closed.
1441 */
39236c6e
A
1442 switch (dom) {
1443 case PF_INET:
1444 (*so)->so_proto = &mptcp_subflow_protosw;
1445 break;
1446#if INET6
1447 case PF_INET6:
1448 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1449 break;
1450#endif /* INET6 */
1451 default:
1452 VERIFY(0);
1453 /* NOTREACHED */
1454 }
1455
5ba3f43e
A
1456 proc_rele(p);
1457
1458 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1459 int, dom, int, error);
1460
1461 return (0);
39236c6e 1462
5ba3f43e
A
1463out_err:
1464 mptcp_subflow_abort(mpts, error);
1465
1466 proc_rele(p);
1467
1468 mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
1469 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1470
1471 return (error);
1472}
1473
1474/*
1475 * Close an MPTCP subflow socket.
1476 *
1477 * Note that this may be called on an embryonic subflow, and the only
1478 * thing that is guaranteed valid is the protocol-user request.
1479 */
5ba3f43e
A
1480static void
1481mptcp_subflow_soclose(struct mptsub *mpts)
39236c6e 1482{
5ba3f43e
A
1483 struct socket *so = mpts->mpts_socket;
1484
1485 if (mpts->mpts_flags & MPTSF_CLOSED)
1486 return;
39236c6e 1487
5ba3f43e 1488 VERIFY(so != NULL);
39236c6e
A
1489 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1490 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1491
39236c6e
A
1492 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1493 struct socket *, so,
1494 struct sockbuf *, &so->so_rcv,
1495 struct sockbuf *, &so->so_snd,
1496 struct mptses *, mpts->mpts_mpte);
1497
5ba3f43e
A
1498 mpts->mpts_flags |= MPTSF_CLOSED;
1499
1500 if (so->so_retaincnt == 0) {
1501 soclose_locked(so);
1502
1503 return;
1504 } else {
1505 VERIFY(so->so_usecount > 0);
1506 so->so_usecount--;
1507 }
1508
1509 return;
39236c6e
A
1510}
1511
1512/*
1513 * Connect an MPTCP subflow socket.
1514 *
5ba3f43e
A
1515 * Note that in the pending connect case, the subflow socket may have been
1516 * bound to an interface and/or a source IP address which may no longer be
1517 * around by the time this routine is called; in that case the connect attempt
1518 * will most likely fail.
39236c6e
A
1519 */
1520static int
1521mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1522{
5ba3f43e
A
1523 char dbuf[MAX_IPv6_STR_LEN];
1524 struct socket *mp_so, *so;
1525 struct mptcb *mp_tp;
1526 struct sockaddr *dst;
1527 struct proc *p;
39236c6e
A
1528 int af, error;
1529
5ba3f43e 1530 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 1531
5ba3f43e
A
1532 mp_so = mptetoso(mpte);
1533 mp_tp = mpte->mpte_mptcb;
39236c6e 1534
5ba3f43e
A
1535 p = proc_find(mp_so->last_pid);
1536 if (p == PROC_NULL) {
1537 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1538 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e 1539
5ba3f43e 1540 return (ESRCH);
39236c6e
A
1541 }
1542
5ba3f43e
A
1543 so = mpts->mpts_socket;
1544 af = mpts->mpts_dst.sa_family;
1545
1546 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1547 VERIFY(mpts->mpts_socket != NULL);
1548 VERIFY(af == AF_INET || af == AF_INET6);
1549
1550 dst = &mpts->mpts_dst;
1551 mptcplog((LOG_DEBUG, "%s: connectx mp_so 0x%llx dst %s[%d] cid %d [pended %s]\n",
1552 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1553 inet_ntop(af, ((af == AF_INET) ? (void *)&SIN(dst)->sin_addr.s_addr :
1554 (void *)&SIN6(dst)->sin6_addr),
1555 dbuf, sizeof (dbuf)),
1556 ((af == AF_INET) ? ntohs(SIN(dst)->sin_port) : ntohs(SIN6(dst)->sin6_port)),
1557 mpts->mpts_connid,
1558 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ? "YES" : "NO")),
1559 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1560
39236c6e
A
1561 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1562
fe8ab488 1563 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
39037602 1564
39236c6e 1565 /* connect the subflow socket */
5ba3f43e
A
1566 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1567 p, mpts->mpts_ifscope,
1568 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1569
1570 mpts->mpts_iss = sototcpcb(so)->iss;
1571
1572 /* See tcp_connect_complete */
1573 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1574 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1575 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1576 }
39236c6e 1577
fe8ab488
A
1578 /* Allocate a unique address id per subflow */
1579 mpte->mpte_addrid_last++;
1580 if (mpte->mpte_addrid_last == 0)
1581 mpte->mpte_addrid_last++;
1582
5ba3f43e
A
1583 proc_rele(p);
1584
39236c6e
A
1585 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1586 struct mptsub *, mpts, int, error);
5ba3f43e
A
1587 if (error)
1588 mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
1589 __func__, error, mpts->mpts_ifscope),
1590 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
1591
1592 return (error);
1593}
1594
1595/*
1596 * MPTCP subflow socket receive routine, derived from soreceive().
1597 */
1598static int
1599mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1600 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1601{
1602#pragma unused(uio)
5ba3f43e 1603 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
39236c6e
A
1604 int flags, error = 0;
1605 struct proc *p = current_proc();
1606 struct mbuf *m, **mp = mp0;
5ba3f43e 1607 boolean_t proc_held = FALSE;
39236c6e 1608
5ba3f43e 1609 mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
39236c6e
A
1610 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1611
1612#ifdef MORE_LOCKING_DEBUG
1613 if (so->so_usecount == 1) {
1614 panic("%s: so=%x no other reference on socket\n", __func__, so);
1615 /* NOTREACHED */
1616 }
1617#endif
1618 /*
1619 * We return all that is there in the subflow's socket receive buffer
1620 * to the MPTCP layer, so we require that the caller passes in the
1621 * expected parameters.
1622 */
5ba3f43e 1623 if (mp == NULL || controlp != NULL)
39236c6e 1624 return (EINVAL);
5ba3f43e 1625
39236c6e
A
1626 *mp = NULL;
1627 if (psa != NULL)
1628 *psa = NULL;
1629 if (flagsp != NULL)
1630 flags = *flagsp &~ MSG_EOR;
1631 else
1632 flags = 0;
1633
5ba3f43e 1634 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
39236c6e 1635 return (EOPNOTSUPP);
5ba3f43e 1636
39236c6e
A
1637 flags |= (MSG_DONTWAIT|MSG_NBIO);
1638
1639 /*
1640 * If a recv attempt is made on a previously-accepted socket
1641 * that has been marked as inactive (disconnected), reject
1642 * the request.
1643 */
1644 if (so->so_flags & SOF_DEFUNCT) {
1645 struct sockbuf *sb = &so->so_rcv;
1646
1647 error = ENOTCONN;
39236c6e
A
1648 /*
1649 * This socket should have been disconnected and flushed
1650 * prior to being returned from sodefunct(); there should
1651 * be no data on its receive list, so panic otherwise.
1652 */
1653 if (so->so_state & SS_DEFUNCT)
1654 sb_empty_assert(sb, __func__);
39236c6e
A
1655 return (error);
1656 }
1657
1658 /*
1659 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1660 * and if so just return to the caller. This could happen when
1661 * soreceive() is called by a socket upcall function during the
1662 * time the socket is freed. The socket buffer would have been
1663 * locked across the upcall, therefore we cannot put this thread
1664 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1665 * we may livelock), because the lock on the socket buffer will
1666 * only be released when the upcall routine returns to its caller.
1667 * Because the socket has been officially closed, there can be
1668 * no further read on it.
1669 *
1670 * A multipath subflow socket would have its SS_NOFDREF set by
1671 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1672 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1673 */
1674 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
5ba3f43e 1675 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
39236c6e 1676 return (0);
39236c6e
A
1677
1678 /*
1679 * For consistency with soreceive() semantics, we need to obey
1680 * SB_LOCK in case some other code path has locked the buffer.
1681 */
1682 error = sblock(&so->so_rcv, 0);
5ba3f43e 1683 if (error != 0)
39236c6e 1684 return (error);
39236c6e
A
1685
1686 m = so->so_rcv.sb_mb;
1687 if (m == NULL) {
1688 /*
1689 * Panic if we notice inconsistencies in the socket's
1690 * receive list; both sb_mb and sb_cc should correctly
1691 * reflect the contents of the list, otherwise we may
1692 * end up with false positives during select() or poll()
1693 * which could put the application in a bad state.
1694 */
1695 SB_MB_CHECK(&so->so_rcv);
1696
1697 if (so->so_error != 0) {
1698 error = so->so_error;
1699 so->so_error = 0;
1700 goto release;
1701 }
1702
5ba3f43e
A
1703 if (so->so_state & SS_CANTRCVMORE) {
1704 goto release;
1705 }
1706
1707 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1708 error = ENOTCONN;
1709 goto release;
1710 }
1711
1712 /*
1713 * MSG_DONTWAIT is implicitly defined and this routine will
1714 * never block, so return EWOULDBLOCK when there is nothing.
1715 */
1716 error = EWOULDBLOCK;
1717 goto release;
1718 }
1719
1720 mptcp_update_last_owner(so, mp_so);
1721
1722 if (mp_so->last_pid != proc_pid(p)) {
1723 p = proc_find(mp_so->last_pid);
1724 if (p == PROC_NULL) {
1725 p = current_proc();
1726 } else {
1727 proc_held = TRUE;
1728 }
1729 }
1730
1731 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1732 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1733 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1734
1735 while (m != NULL) {
1736 int dlen = 0;
1737 struct mbuf *start = m;
1738 uint64_t dsn;
1739 uint32_t sseq;
1740 uint16_t orig_dlen;
1741 uint16_t csum;
1742
1743 VERIFY(m->m_nextpkt == NULL);
1744
1745 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1746 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
1747 dsn = m->m_pkthdr.mp_dsn;
1748 sseq = m->m_pkthdr.mp_rseq;
1749 csum = m->m_pkthdr.mp_csum;
1750 } else {
1751 /* We did fallback */
1752 mptcp_adj_rmap(so, m, 0);
1753
1754 sbfree(&so->so_rcv, m);
1755
1756 if (mp != NULL) {
1757 *mp = m;
1758 mp = &m->m_next;
1759 so->so_rcv.sb_mb = m = m->m_next;
1760 *mp = NULL;
1761
1762 }
1763
1764 if (m != NULL) {
1765 so->so_rcv.sb_lastrecord = m;
1766 } else {
1767 SB_EMPTY_FIXUP(&so->so_rcv);
1768 }
1769
1770 continue;
39236c6e
A
1771 }
1772
5ba3f43e
A
1773 /*
1774 * Check if the full mapping is now present
1775 */
1776 if ((int)so->so_rcv.sb_cc < dlen) {
1777 mptcplog((LOG_INFO, "%s not enough data (%u) need %u\n",
1778 __func__, so->so_rcv.sb_cc, dlen),
1779 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1780
1781 if (*mp0 == NULL)
1782 error = EWOULDBLOCK;
39236c6e
A
1783 goto release;
1784 }
1785
5ba3f43e
A
1786 /* Now, get the full mapping */
1787 while (dlen > 0) {
1788 mptcp_adj_rmap(so, m, orig_dlen - dlen);
39236c6e 1789
5ba3f43e
A
1790 dlen -= m->m_len;
1791 sbfree(&so->so_rcv, m);
39236c6e 1792
5ba3f43e
A
1793 if (mp != NULL) {
1794 *mp = m;
1795 mp = &m->m_next;
1796 so->so_rcv.sb_mb = m = m->m_next;
1797 *mp = NULL;
1798 }
1799
1800 VERIFY(dlen <= 0 || m);
39236c6e
A
1801 }
1802
5ba3f43e
A
1803 VERIFY(dlen == 0);
1804
39236c6e 1805 if (m != NULL) {
5ba3f43e 1806 so->so_rcv.sb_lastrecord = m;
39236c6e 1807 } else {
39236c6e
A
1808 SB_EMPTY_FIXUP(&so->so_rcv);
1809 }
5ba3f43e
A
1810
1811 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum)) {
1812 error = EIO;
1813 *mp0 = NULL;
1814 goto release;
1815 }
1816
39236c6e
A
1817 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1818 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1819 }
1820
1821 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1822 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
39236c6e
A
1823
1824 if (flagsp != NULL)
1825 *flagsp |= flags;
1826
1827release:
5ba3f43e
A
1828 sbunlock(&so->so_rcv, TRUE);
1829
1830 if (proc_held)
1831 proc_rele(p);
1832
39236c6e
A
1833 return (error);
1834
1835}
1836
39236c6e 1837/*
5ba3f43e 1838 * MPTCP subflow socket send routine, derived from sosend().
39236c6e 1839 */
5ba3f43e
A
1840static int
1841mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1842 struct mbuf *top, struct mbuf *control, int flags)
39236c6e 1843{
5ba3f43e
A
1844 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1845 struct proc *p = current_proc();
1846 boolean_t en_tracing = FALSE, proc_held = FALSE;
1847 int en_tracing_val;
1848 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
1849 int error;
39236c6e 1850
5ba3f43e
A
1851 VERIFY(control == NULL);
1852 VERIFY(addr == NULL);
1853 VERIFY(uio == NULL);
1854 VERIFY(flags == 0);
1855 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
39236c6e 1856
5ba3f43e
A
1857 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
1858 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e
A
1859
1860 /*
5ba3f43e
A
1861 * trace if tracing & network (vs. unix) sockets & and
1862 * non-loopback
39236c6e 1863 */
5ba3f43e
A
1864 if (ENTR_SHOULDTRACE &&
1865 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1866 struct inpcb *inp = sotoinpcb(so);
1867 if (inp->inp_last_outifp != NULL &&
1868 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1869 en_tracing = TRUE;
1870 en_tracing_val = top->m_pkthdr.len;
1871 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1872 VM_KERNEL_ADDRPERM(so),
1873 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1874 (int64_t)en_tracing_val);
1875 }
1876 }
39236c6e 1877
5ba3f43e 1878 mptcp_update_last_owner(so, mp_so);
39236c6e 1879
5ba3f43e
A
1880 if (mp_so->last_pid != proc_pid(p)) {
1881 p = proc_find(mp_so->last_pid);
1882 if (p == PROC_NULL) {
1883 p = current_proc();
1884 } else {
1885 proc_held = TRUE;
1886 }
1887 }
39236c6e 1888
5ba3f43e
A
1889#if NECP
1890 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
1891#endif /* NECP */
39236c6e 1892
5ba3f43e 1893 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
39236c6e 1894
5ba3f43e
A
1895 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
1896 if (error)
1897 goto out;
39236c6e 1898
5ba3f43e
A
1899 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
1900 top = NULL;
39236c6e 1901
5ba3f43e
A
1902out:
1903 if (top != NULL)
1904 m_freem(top);
39236c6e 1905
5ba3f43e
A
1906 if (proc_held)
1907 proc_rele(p);
1908
1909 soclearfastopen(so);
1910
1911 if (en_tracing) {
1912 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
1913 VM_KERNEL_ADDRPERM(so),
1914 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
1915 (int64_t)en_tracing_val);
1916 }
1917
1918 return (error);
39236c6e 1919
39236c6e
A
1920}
1921
1922/*
1923 * Establish an initial MPTCP connection (if first subflow and not yet
1924 * connected), or add a subflow to an existing MPTCP connection.
1925 */
1926int
5ba3f43e
A
1927mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
1928 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
39236c6e 1929{
39236c6e 1930 struct socket *mp_so, *so = NULL;
39236c6e 1931 struct mptcb *mp_tp;
5ba3f43e 1932 struct mptsub *mpts = NULL;
39236c6e
A
1933 int af, error = 0;
1934
5ba3f43e
A
1935 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1936 mp_so = mptetoso(mpte);
39236c6e
A
1937 mp_tp = mpte->mpte_mptcb;
1938
fe8ab488
A
1939 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1940 /* If the remote end sends Data FIN, refuse subflow adds */
5ba3f43e
A
1941 mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
1942 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1943 error = ENOTCONN;
5ba3f43e 1944 goto out_err;
fe8ab488 1945 }
39236c6e 1946
5ba3f43e
A
1947 mpts = mptcp_subflow_alloc();
1948 if (mpts == NULL) {
1949 mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
1950 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1951 error = ENOMEM;
1952 goto out_err;
1953 }
39236c6e 1954
5ba3f43e
A
1955 if (src != NULL) {
1956 int len = src->sa_len;
813fb2f6
A
1957
1958 MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
1959 M_WAITOK | M_ZERO);
1960 if (mpts->mpts_src == NULL) {
5ba3f43e
A
1961 mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
1962 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1963 error = ENOMEM;
1964 goto out_err;
39236c6e 1965 }
5ba3f43e 1966 bcopy(src, mpts->mpts_src, len);
39236c6e
A
1967 }
1968
5ba3f43e
A
1969 memcpy(&mpts->mpts_dst, dst, dst->sa_len);
1970
1971 af = mpts->mpts_dst.sa_family;
1972
1973 mpts->mpts_ifscope = ifscope;
1974
39236c6e 1975 /* create the subflow socket */
5ba3f43e
A
1976 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
1977 /*
1978 * Returning (error) and not cleaning up, because up to here
1979 * all we did is creating mpts.
1980 *
1981 * And the contract is that the call to mptcp_subflow_socreate,
1982 * moves ownership of mpts to mptcp_subflow_socreate.
1983 */
1984 return (error);
1985
1986 /*
1987 * We may be called from within the kernel. Still need to account this
1988 * one to the real app.
1989 */
1990 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
39236c6e
A
1991
1992 /*
3e170ce0
A
1993 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
1994 * -1 (SAE_CONNID_ALL).
39236c6e
A
1995 */
1996 mpte->mpte_connid_last++;
3e170ce0
A
1997 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
1998 mpte->mpte_connid_last == SAE_CONNID_ANY)
39236c6e
A
1999 mpte->mpte_connid_last++;
2000
2001 mpts->mpts_connid = mpte->mpte_connid_last;
490019cf
A
2002
2003 mpts->mpts_rel_seq = 1;
2004
fe8ab488
A
2005 /* Allocate a unique address id per subflow */
2006 mpte->mpte_addrid_last++;
2007 if (mpte->mpte_addrid_last == 0)
2008 mpte->mpte_addrid_last++;
39236c6e 2009
39236c6e 2010 /* register for subflow socket read/write events */
5ba3f43e 2011 sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);
39236c6e 2012
5ba3f43e
A
2013 /* Register for subflow socket control events */
2014 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
39236c6e 2015 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
5ba3f43e
A
2016 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2017 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2018 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2019 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2020 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2021 SO_FILT_HINT_ADAPTIVE_WTIMO);
39236c6e
A
2022
2023 /* sanity check */
2024 VERIFY(!(mpts->mpts_flags &
2025 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
2026
39236c6e
A
2027 /*
2028 * Indicate to the TCP subflow whether or not it should establish
2029 * the initial MPTCP connection, or join an existing one. Fill
2030 * in the connection request structure with additional info needed
2031 * by the underlying TCP (to be used in the TCP options, etc.)
2032 */
39236c6e 2033 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
5ba3f43e
A
2034 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2035
39236c6e 2036 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
5ba3f43e 2037 mptcp_init_local_parms(mpte);
39236c6e 2038 }
39236c6e 2039 soisconnecting(mp_so);
5ba3f43e
A
2040
2041 /* If fastopen is requested, set state in mpts */
2042 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2043 mpts->mpts_flags |= MPTSF_TFO_REQD;
39236c6e
A
2044 } else {
2045 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
2046 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
490019cf
A
2047 }
2048
39236c6e
A
2049 mpts->mpts_flags |= MPTSF_CONNECTING;
2050
2051 if (af == AF_INET || af == AF_INET6) {
2052 char dbuf[MAX_IPv6_STR_LEN];
2053
3e170ce0
A
2054 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
2055 "mp_so 0x%llx dst %s[%d] cid %d "
39236c6e
A
2056 "[pending %s]\n", __func__,
2057 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2058 inet_ntop(af, ((af == AF_INET) ?
5ba3f43e
A
2059 (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
2060 (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
39236c6e 2061 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
5ba3f43e
A
2062 ntohs(SIN(&mpts->mpts_dst)->sin_port) :
2063 ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
39236c6e
A
2064 mpts->mpts_connid,
2065 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
3e170ce0 2066 "YES" : "NO")),
5ba3f43e 2067 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
2068 }
2069
2070 /* connect right away if first attempt, or if join can be done now */
2071 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
2072 error = mptcp_subflow_soconnectx(mpte, mpts);
2073
5ba3f43e
A
2074 if (error)
2075 goto out_err_close;
2076
2077 if (pcid)
2078 *pcid = mpts->mpts_connid;
2079
2080 return (0);
2081
2082out_err_close:
2083 mptcp_subflow_abort(mpts, error);
2084
2085 return (error);
2086
2087out_err:
2088 if (mpts)
2089 mptcp_subflow_free(mpts);
2090
39236c6e
A
2091 return (error);
2092}
2093
5ba3f43e
A
2094void
2095mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
2096{
2097 int index = mptcp_get_statsindex(stats, mpts);
2098
2099 if (index != -1) {
2100 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2101
2102 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2103 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2104 }
2105}
2106
39236c6e
A
2107/*
2108 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
2109 * will no longer be accessible after a subflow is deleted, thus this
2110 * should occur only after the subflow socket has been disconnected.
39236c6e
A
2111 */
2112void
5ba3f43e 2113mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
39236c6e 2114{
5ba3f43e
A
2115 struct socket *mp_so = mptetoso(mpte);
2116 struct socket *so = mpts->mpts_socket;
2117 struct tcpcb *tp = sototcpcb(so);
39037602 2118
5ba3f43e
A
2119 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2120 VERIFY(mpts->mpts_mpte == mpte);
2121 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2122 VERIFY(mpte->mpte_numflows != 0);
2123 VERIFY(mp_so->so_usecount > 0);
39236c6e 2124
5ba3f43e
A
2125 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
2126 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2127 mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
2128 mpts->mpts_flags, mp_so->so_error),
2129 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2130
5ba3f43e
A
2131 mptcpstats_update(mpte->mpte_itfstats, mpts);
2132 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2133 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
39236c6e 2134
39236c6e
A
2135 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2136 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
39236c6e 2137 mpte->mpte_numflows--;
fe8ab488
A
2138 if (mpte->mpte_active_sub == mpts)
2139 mpte->mpte_active_sub = NULL;
39236c6e
A
2140
2141 /*
2142 * Drop references held by this subflow socket; there
2143 * will be no further upcalls made from this point.
2144 */
5ba3f43e
A
2145 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2146 sock_catchevents_locked(so, NULL, NULL, 0);
fe8ab488 2147
39236c6e 2148 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
39037602 2149
39236c6e
A
2150 mp_so->so_usecount--; /* for subflow socket */
2151 mpts->mpts_mpte = NULL;
2152 mpts->mpts_socket = NULL;
39236c6e 2153
5ba3f43e
A
2154 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2155 mptcp_subflow_remref(mpts); /* for subflow socket */
2156
2157 so->so_flags &= ~SOF_MP_SUBFLOW;
2158 tp->t_mptcb = NULL;
2159 tp->t_mpsub = NULL;
2160}
2161
2162void
2163mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2164{
2165 struct socket *so = mpts->mpts_socket;
2166 struct mptcb *mp_tp = mpte->mpte_mptcb;
2167 int send_dfin = 0;
2168
2169 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2170 send_dfin = 1;
2171
2172 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2173 (so->so_state & SS_ISCONNECTED)) {
2174 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2175 __func__, mpts->mpts_connid, send_dfin),
2176 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2177
2178 if (send_dfin)
2179 mptcp_send_dfin(so);
2180 soshutdownlock(so, SHUT_WR);
2181 }
2182
2183}
2184
2185static void
2186mptcp_subflow_abort(struct mptsub *mpts, int error)
2187{
2188 struct socket *so = mpts->mpts_socket;
2189 struct tcpcb *tp = sototcpcb(so);
2190
2191 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2192 return;
2193
2194 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2195 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2196
5ba3f43e
A
2197 if (tp->t_state != TCPS_CLOSED)
2198 tcp_drop(tp, error);
2199
2200 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
2201}
2202
2203/*
2204 * Disconnect a subflow socket.
2205 */
2206void
5ba3f43e 2207mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
39236c6e
A
2208{
2209 struct socket *so;
2210 struct mptcb *mp_tp;
2211 int send_dfin = 0;
2212
5ba3f43e 2213 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e
A
2214
2215 VERIFY(mpts->mpts_mpte == mpte);
2216 VERIFY(mpts->mpts_socket != NULL);
39236c6e
A
2217
2218 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
2219 return;
2220
2221 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2222
39236c6e
A
2223 so = mpts->mpts_socket;
2224 mp_tp = mpte->mpte_mptcb;
5ba3f43e 2225 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
39236c6e 2226 send_dfin = 1;
39236c6e 2227
39236c6e
A
2228 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2229 (so->so_state & SS_ISCONNECTED)) {
5ba3f43e
A
2230 mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d\n",
2231 __func__, mpts->mpts_connid, send_dfin),
2232 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
2233
2234 if (send_dfin)
2235 mptcp_send_dfin(so);
2236 (void) soshutdownlock(so, SHUT_RD);
2237 (void) soshutdownlock(so, SHUT_WR);
2238 (void) sodisconnectlocked(so);
2239 }
39236c6e
A
2240 /*
2241 * Generate a disconnect event for this subflow socket, in case
2242 * the lower layer doesn't do it; this is needed because the
5ba3f43e 2243 * subflow socket deletion relies on it.
39236c6e 2244 */
5ba3f43e 2245 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
2246}
2247
2248/*
5ba3f43e 2249 * Called when the associated subflow socket posted a read event.
39236c6e
A
2250 */
2251static void
2252mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
2253{
2254#pragma unused(so, waitf)
5ba3f43e 2255 struct mptsub *mpts = arg, *tmpts;
39236c6e
A
2256 struct mptses *mpte = mpts->mpts_mpte;
2257
5ba3f43e
A
2258 VERIFY(mpte != NULL);
2259
2260 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2261 if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
2262 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
fe8ab488 2263 return;
5ba3f43e
A
2264 }
2265
2266 mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
2267 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2268 if (mpts->mpts_socket->so_usecount == 0) {
2269 /* Will be removed soon by tcp_garbage_collect */
2270 continue;
2271 }
2272
2273 mptcp_subflow_addref(mpts);
2274 mpts->mpts_socket->so_usecount++;
39236c6e 2275
5ba3f43e
A
2276 mptcp_subflow_input(mpte, mpts);
2277
2278 mptcp_subflow_remref(mpts); /* ours */
2279
2280 VERIFY(mpts->mpts_socket->so_usecount != 0);
2281 mpts->mpts_socket->so_usecount--;
2282 }
2283
2284 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
39236c6e
A
2285}
2286
2287/*
2288 * Subflow socket input.
39236c6e
A
2289 */
2290static void
2291mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2292{
5ba3f43e 2293 struct socket *mp_so = mptetoso(mpte);
39236c6e
A
2294 struct mbuf *m = NULL;
2295 struct socket *so;
5ba3f43e 2296 int error, wakeup = 0;
39236c6e 2297
5ba3f43e
A
2298 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2299 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
39236c6e 2300
39037602 2301 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
39236c6e
A
2302 struct mptsub *, mpts);
2303
2304 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
5ba3f43e 2305 goto out;
39236c6e
A
2306
2307 so = mpts->mpts_socket;
2308
2309 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2310 if (error != 0 && error != EWOULDBLOCK) {
5ba3f43e 2311 mptcplog((LOG_ERR, "%s: cid %d error %d\n",
3e170ce0
A
2312 __func__, mpts->mpts_connid, error),
2313 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
5ba3f43e
A
2314 if (error == ENODATA) {
2315 /*
2316 * Don't ignore ENODATA so as to discover
2317 * nasty middleboxes.
2318 */
2319 mp_so->so_error = ENODATA;
2320
2321 wakeup = 1;
2322 goto out;
39236c6e 2323 }
39236c6e 2324 } else if (error == 0) {
5ba3f43e 2325 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
3e170ce0 2326 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
2327 }
2328
2329 /* In fallback, make sure to accept data on all but one subflow */
5ba3f43e
A
2330 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2331 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2332 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2333 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2334 m_freem(m);
5ba3f43e 2335 goto out;
39236c6e
A
2336 }
2337
2338 if (m != NULL) {
5ba3f43e
A
2339 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2340 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
3e170ce0 2341
5ba3f43e
A
2342 mpte->mpte_used_cell = 1;
2343 } else {
2344 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2345
2346 mpte->mpte_used_wifi = 1;
2347 }
3e170ce0 2348
39236c6e 2349 mptcp_input(mpte, m);
39236c6e 2350 }
5ba3f43e
A
2351
2352 /* notify protocol that we drained all the data */
2353 if (error == 0 && m != NULL &&
2354 (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
2355 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
2356
2357out:
2358 if (wakeup)
2359 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2360
2361 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
39236c6e
A
2362}
2363
2364/*
2365 * Subflow socket write upcall.
2366 *
5ba3f43e 2367 * Called when the associated subflow socket posted a read event.
39236c6e
A
2368 */
2369static void
2370mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2371{
2372#pragma unused(so, waitf)
2373 struct mptsub *mpts = arg;
2374 struct mptses *mpte = mpts->mpts_mpte;
2375
5ba3f43e
A
2376 VERIFY(mpte != NULL);
2377
2378 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2379 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
2380 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
fe8ab488 2381 return;
5ba3f43e 2382 }
39236c6e 2383
5ba3f43e 2384 mptcp_output(mpte);
39236c6e
A
2385}
2386
2387/*
2388 * Subflow socket output.
2389 *
2390 * Called for sending data from MPTCP to the underlying subflow socket.
2391 */
2392int
5ba3f43e 2393mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
39236c6e 2394{
39236c6e 2395 struct mptcb *mp_tp = mpte->mpte_mptcb;
5ba3f43e
A
2396 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2397 struct socket *mp_so, *so;
2398 struct tcpcb *tp;
2399 uint64_t mpt_dsn = 0, off = 0;
2400 int sb_cc = 0, error = 0, wakeup = 0;
2401 uint32_t dss_csum;
2402 uint16_t tot_sent = 0;
2403 boolean_t reinjected = FALSE;
2404
2405 mpte_lock_assert_held(mpte);
2406
2407 mp_so = mptetoso(mpte);
39236c6e 2408 so = mpts->mpts_socket;
5ba3f43e 2409 tp = sototcpcb(so);
39236c6e 2410
5ba3f43e
A
2411 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2412 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
39236c6e 2413
5ba3f43e
A
2414 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2415 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2416 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2417 (mpts->mpts_flags & MPTSF_TFO_REQD));
2418 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
39236c6e 2419
5ba3f43e
A
2420 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2421 __func__, mpts->mpts_flags, mpte->mpte_flags,
2422 mptcp_subflow_cwnd_space(so)),
2423 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2424 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2425 struct mptsub *, mpts);
39236c6e
A
2426
2427 /* Remove Addr Option is not sent reliably as per I-D */
2428 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
39236c6e 2429 tp->t_rem_aid = mpte->mpte_lost_aid;
5ba3f43e 2430 tp->t_mpflags |= TMPF_SND_REM_ADDR;
39236c6e
A
2431 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2432 }
2433
2434 /*
2435 * The mbuf chains containing the metadata (as well as pointing to
2436 * the user data sitting at the MPTCP output queue) would then be
2437 * sent down to the subflow socket.
2438 *
2439 * Some notes on data sequencing:
2440 *
2441 * a. Each mbuf must be a M_PKTHDR.
2442 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2443 * in the mbuf pkthdr structure.
2444 * c. Each mbuf containing the MPTCP metadata must have its
2445 * pkt_flags marked with the PKTF_MPTCP flag.
2446 */
2447
5ba3f43e
A
2448 if (mpte->mpte_reinjectq)
2449 sb_mb = mpte->mpte_reinjectq;
2450 else
2451 sb_mb = mp_so->so_snd.sb_mb;
2452
39236c6e 2453 if (sb_mb == NULL) {
5ba3f43e
A
2454 mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u\n",
2455 __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
2456 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2457 goto out;
2458 }
2459
2460 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
2461
5ba3f43e
A
2462 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
2463 !(so->so_state & SS_ISCONNECTED) &&
2464 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2465 tp->t_mpflags |= TMPF_TFO_REQUEST;
2466 goto zero_len_write;
39236c6e
A
2467 }
2468
5ba3f43e
A
2469 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2470
2471 /* First, drop acknowledged data */
39236c6e 2472 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
5ba3f43e
A
2473 mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
2474 "dsn %u suna %u reinject? %u\n",
2475 __func__, (uint32_t)mpt_dsn,
2476 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
2477 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2478 if (mpte->mpte_reinjectq) {
2479 mptcp_clean_reinjectq(mpte);
2480 } else {
2481 uint64_t len = 0;
2482 len = mp_tp->mpt_snduna - mpt_dsn;
2483 sbdrop(&mp_so->so_snd, (int)len);
2484 wakeup = 1;
2485 }
2486 }
2487
2488 /* Check again because of above sbdrop */
2489 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
2490 mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
2491 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2492 goto out;
39236c6e
A
2493 }
2494
2495 /*
2496 * In degraded mode, we don't receive data acks, so force free
2497 * mbufs less than snd_nxt
2498 */
39236c6e 2499 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 2500 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
5ba3f43e
A
2501 mp_so->so_snd.sb_mb) {
2502 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
2503 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2504 uint64_t len = 0;
2505 len = mp_tp->mpt_snduna - mpt_dsn;
2506 sbdrop(&mp_so->so_snd, (int)len);
2507 wakeup = 1;
2508
2509 mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
2510 __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
2511 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2512 }
39236c6e
A
2513 }
2514
fe8ab488
A
2515 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2516 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
2517 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
2518 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
39236c6e
A
2519 }
2520
2521 /*
2522 * Adjust the top level notion of next byte used for retransmissions
2523 * and sending FINs.
2524 */
5ba3f43e 2525 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
39236c6e 2526 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
39236c6e
A
2527
2528 /* Now determine the offset from which to start transmitting data */
5ba3f43e
A
2529 if (mpte->mpte_reinjectq)
2530 sb_mb = mpte->mpte_reinjectq;
2531 else
2532 sb_mb = mp_so->so_snd.sb_mb;
39236c6e 2533 if (sb_mb == NULL) {
5ba3f43e
A
2534 mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
2535 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2536 goto out;
2537 }
5ba3f43e
A
2538
2539 if (mpte->mpte_reinjectq) {
2540 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2541 } else if (flags & MPTCP_SUBOUT_PROBING) {
2542 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2543 off = 0;
39236c6e 2544 } else {
5ba3f43e
A
2545 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
2546
2547 /*
2548 * With TFO, there might be no data at all, thus still go into this
2549 * code-path here.
2550 */
2551 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
2552 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
2553 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
2554 sb_cc -= off;
2555 } else {
2556 mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
2557 __func__, (uint32_t)mp_tp->mpt_sndnxt,
2558 (uint32_t)mp_tp->mpt_sndmax),
2559 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2560
2561 goto out;
2562 }
39236c6e 2563 }
39236c6e 2564
5ba3f43e
A
2565 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
2566 if (sb_cc <= 0) {
2567 mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
2568 __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
2569 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
2570 mptcp_subflow_cwnd_space(so)),
2571 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2572 }
2573
2574 sb_cc = min(sb_cc, UINT16_MAX);
2575
2576 /*
2577 * Create a DSN mapping for the data we are about to send. It all
2578 * has the same mapping.
2579 */
2580 if (mpte->mpte_reinjectq)
2581 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2582 else
2583 mpt_dsn = mp_tp->mpt_snduna + off;
39236c6e 2584
5ba3f43e
A
2585 mpt_mbuf = sb_mb;
2586 while (mpt_mbuf && mpte->mpte_reinjectq == NULL &&
2587 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
2588 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
39236c6e
A
2589 off -= mpt_mbuf->m_pkthdr.mp_rlen;
2590 mpt_mbuf = mpt_mbuf->m_next;
39236c6e 2591 }
3e170ce0 2592 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
5ba3f43e
A
2593 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
2594 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3e170ce0 2595 mpts->mpts_probecnt),
5ba3f43e 2596 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2597
ecc0ceb4 2598 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 2599
fe8ab488
A
2600 head = tail = NULL;
2601
39236c6e 2602 while (tot_sent < sb_cc) {
5ba3f43e 2603 ssize_t mlen;
39236c6e 2604
5ba3f43e 2605 mlen = mpt_mbuf->m_len;
39236c6e 2606 mlen -= off;
5ba3f43e 2607 mlen = min(mlen, sb_cc - tot_sent);
39236c6e 2608
5ba3f43e
A
2609 if (mlen < 0) {
2610 mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
2611 __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
2612 (uint32_t)off, sb_cc, tot_sent),
2613 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2614 goto out;
39236c6e
A
2615 }
2616
5ba3f43e
A
2617 if (mlen == 0)
2618 goto next;
2619
fe8ab488
A
2620 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
2621 M_COPYM_MUST_COPY_HDR);
39236c6e 2622 if (m == NULL) {
5ba3f43e
A
2623 mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
2624 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2625 error = ENOBUFS;
2626 break;
2627 }
2628
2629 /* Create a DSN mapping for the data (m_copym does it) */
fe8ab488 2630 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e
A
2631 VERIFY(m->m_next == NULL);
2632
39236c6e
A
2633 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2634 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
5ba3f43e 2635 m->m_pkthdr.mp_dsn = mpt_dsn;
39236c6e 2636 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
39236c6e
A
2637 m->m_pkthdr.len = mlen;
2638
fe8ab488
A
2639 if (head == NULL) {
2640 head = tail = m;
2641 } else {
2642 tail->m_next = m;
2643 tail = m;
2644 }
2645
fe8ab488
A
2646 tot_sent += mlen;
2647 off = 0;
5ba3f43e 2648next:
fe8ab488
A
2649 mpt_mbuf = mpt_mbuf->m_next;
2650 }
2651
5ba3f43e
A
2652 if (mpte->mpte_reinjectq) {
2653 reinjected = TRUE;
fe8ab488 2654
5ba3f43e
A
2655 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
2656 struct mbuf *n = sb_mb;
2657
2658 while (n) {
2659 n->m_pkthdr.mp_dsn += sb_cc;
2660 n->m_pkthdr.mp_rlen -= sb_cc;
2661 n = n->m_next;
2662 }
2663 m_adj(sb_mb, sb_cc);
2664 } else {
2665 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
2666 m_freem(sb_mb);
2667 }
2668 }
2669
2670 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
2671 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
2672 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2673
2674 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
2675 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
2676 tot_sent);
2677 }
2678
2679 /* Now, let's update rel-seq and the data-level length */
2680 mpts->mpts_rel_seq += tot_sent;
2681 m = head;
2682 while (m) {
2683 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
2684 m->m_pkthdr.mp_csum = dss_csum;
2685 m->m_pkthdr.mp_rlen = tot_sent;
2686 m = m->m_next;
2687 }
2688
2689 if (head != NULL) {
490019cf 2690 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
5ba3f43e 2691 (tp->t_tfo_stats == 0))
39037602 2692 tp->t_mpflags |= TMPF_TFO_REQUEST;
fe8ab488
A
2693
2694 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2695
5ba3f43e 2696 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
39236c6e
A
2697 struct sockbuf *, &so->so_rcv,
2698 struct sockbuf *, &so->so_snd,
2699 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488
A
2700 size_t, tot_sent);
2701 }
2702
5ba3f43e
A
2703done_sending:
2704 if (error == 0 ||
2705 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
2706 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3e170ce0
A
2707
2708 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2709 tcpstat.tcps_mp_num_probes++;
5ba3f43e 2710 if ((uint32_t)tot_sent < mpts->mpts_maxseg)
3e170ce0
A
2711 mpts->mpts_probecnt += 1;
2712 else
2713 mpts->mpts_probecnt +=
2714 tot_sent/mpts->mpts_maxseg;
2715 }
2716
5ba3f43e
A
2717 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
2718 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
39236c6e
A
2719 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2720 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
5ba3f43e 2721 mp_tp->mpt_sndnxt = new_sndnxt;
39236c6e 2722 }
fe8ab488 2723
5ba3f43e 2724 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
490019cf 2725
5ba3f43e
A
2726 /* Must be here as mptcp_can_send_more() checks for this */
2727 soclearfastopen(mp_so);
39236c6e 2728
3e170ce0
A
2729 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2730 (mpts->mpts_probesoon != 0))
5ba3f43e
A
2731 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
2732 __func__, mpts->mpts_connid,
2733 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
2734 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3e170ce0 2735 (tcp_now - mpts->mpts_probesoon)),
5ba3f43e
A
2736 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2737
2738 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2739 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
2740
2741 mpte->mpte_used_cell = 1;
2742 } else {
2743 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2744
2745 mpte->mpte_used_wifi = 1;
2746 }
2747
2748 /*
2749 * Don't propagate EWOULDBLOCK - it's already taken care of
2750 * in mptcp_usr_send for TFO.
2751 */
2752 error = 0;
fe8ab488 2753 } else {
5ba3f43e
A
2754 mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
2755 __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
3e170ce0 2756 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2757 }
2758out:
5ba3f43e 2759
39037602 2760 if (wakeup)
5ba3f43e 2761 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
39037602 2762
5ba3f43e 2763 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
39236c6e 2764 return (error);
5ba3f43e
A
2765
2766zero_len_write:
2767 /* Opting to call pru_send as no mbuf at subflow level */
2768 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
2769 NULL, current_proc());
2770
2771 goto done_sending;
39236c6e
A
2772}
2773
39236c6e 2774static void
5ba3f43e 2775mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
39236c6e 2776{
5ba3f43e 2777 struct mbuf *n, *prev = NULL;
39236c6e 2778
5ba3f43e
A
2779 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
2780 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2781 m->m_pkthdr.mp_rseq),
2782 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2783
2784 n = mpte->mpte_reinjectq;
2785
2786 /* First, look for an mbuf n, whose data-sequence-number is bigger or
2787 * equal than m's sequence number.
2788 */
2789 while (n) {
2790 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
2791 break;
2792
2793 prev = n;
2794
2795 n = n->m_nextpkt;
2796 }
2797
2798 if (n) {
2799 /* m is already fully covered by the next mbuf in the queue */
2800 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
2801 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
2802 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
2803 __func__, n->m_pkthdr.mp_rlen),
2804 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2805 goto dont_queue;
2806 }
2807
2808 /* m is covering the next mbuf entirely, thus we remove this guy */
2809 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
2810 struct mbuf *tmp = n->m_nextpkt;
2811
2812 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
2813 __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2814 n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
2815 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2816
2817 m->m_nextpkt = NULL;
2818 if (prev == NULL)
2819 mpte->mpte_reinjectq = tmp;
2820 else
2821 prev->m_nextpkt = tmp;
2822
2823 m_freem(n);
2824 n = tmp;
2825 }
2826
2827 }
2828
2829 if (prev) {
2830 /* m is already fully covered by the previous mbuf in the queue */
2831 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
2832 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
2833 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
2834 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2835 goto dont_queue;
2836 }
2837 }
2838
2839 if (prev == NULL)
2840 mpte->mpte_reinjectq = m;
2841 else
2842 prev->m_nextpkt = m;
39236c6e 2843
5ba3f43e
A
2844 m->m_nextpkt = n;
2845
2846 return;
2847
2848dont_queue:
2849 m_freem(m);
2850 return;
39236c6e
A
2851}
2852
5ba3f43e
A
2853static struct mbuf *
2854mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
39236c6e 2855{
5ba3f43e
A
2856 struct socket *mp_so = mptetoso(mpte);
2857 struct mbuf *m;
39236c6e 2858
5ba3f43e 2859 m = mp_so->so_snd.sb_mb;
39236c6e 2860
5ba3f43e
A
2861 while (m) {
2862 /* If this segment covers what we are looking for, return it. */
2863 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
2864 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
2865 break;
2866
2867
2868 /* Segment is no more in the queue */
2869 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
2870 return NULL;
2871
2872 m = m->m_next;
39236c6e
A
2873 }
2874
5ba3f43e
A
2875 return m;
2876}
fe8ab488 2877
5ba3f43e
A
2878static struct mbuf *
2879mptcp_copy_mbuf_list(struct mbuf *m, int len)
2880{
2881 struct mbuf *top = NULL, *tail = NULL;
2882 uint64_t dsn;
2883 uint32_t dlen, rseq;
39236c6e 2884
5ba3f43e
A
2885 dsn = m->m_pkthdr.mp_dsn;
2886 dlen = m->m_pkthdr.mp_rlen;
2887 rseq = m->m_pkthdr.mp_rseq;
3e170ce0 2888
5ba3f43e
A
2889 while (len > 0) {
2890 struct mbuf *n;
2891
2892 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
2893
2894 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
2895 if (n == NULL) {
2896 mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
2897 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2898 goto err;
3e170ce0 2899 }
fe8ab488 2900
5ba3f43e
A
2901 VERIFY(n->m_flags & M_PKTHDR);
2902 VERIFY(n->m_next == NULL);
2903 VERIFY(n->m_pkthdr.mp_dsn == dsn);
2904 VERIFY(n->m_pkthdr.mp_rlen == dlen);
2905 VERIFY(n->m_pkthdr.mp_rseq == rseq);
2906 VERIFY(n->m_len == m->m_len);
2907
2908 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
2909
2910 if (top == NULL)
2911 top = n;
2912
2913 if (tail != NULL)
2914 tail->m_next = n;
2915
2916 tail = n;
2917
2918 len -= m->m_len;
2919 m = m->m_next;
39236c6e
A
2920 }
2921
5ba3f43e
A
2922 return top;
2923
2924err:
2925 if (top)
2926 m_freem(top);
2927
2928 return NULL;
39236c6e
A
2929}
2930
5ba3f43e
A
2931static void
2932mptcp_reinject_mbufs(struct socket *so)
39236c6e 2933{
5ba3f43e
A
2934 struct tcpcb *tp = sototcpcb(so);
2935 struct mptsub *mpts = tp->t_mpsub;
2936 struct mptcb *mp_tp = tptomptp(tp);
2937 struct mptses *mpte = mp_tp->mpt_mpte;;
2938 struct sockbuf *sb = &so->so_snd;
2939 struct mbuf *m;
39236c6e 2940
5ba3f43e
A
2941 m = sb->sb_mb;
2942 while (m) {
2943 struct mbuf *n = m->m_next, *orig = m;
39236c6e 2944
5ba3f43e
A
2945 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
2946 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
2947 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
2948 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2949
5ba3f43e 2950 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 2951
5ba3f43e
A
2952 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
2953 goto next;
39236c6e 2954
5ba3f43e
A
2955 /* Has it all already been acknowledged at the data-level? */
2956 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
2957 goto next;
2958
2959 /* Part of this has already been acknowledged - lookup in the
2960 * MPTCP-socket for the segment.
2961 */
2962 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
2963 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
2964 if (m == NULL)
2965 goto next;
2966 }
2967
2968 /* Copy the mbuf with headers (aka, DSN-numbers) */
2969 m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
2970 if (m == NULL)
2971 break;
2972
2973 VERIFY(m->m_nextpkt == NULL);
2974
2975 /* Now, add to the reinject-queue, eliminating overlapping
2976 * segments
2977 */
2978 mptcp_add_reinjectq(mpte, m);
2979
2980 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
2981
2982next:
2983 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
2984 while (n) {
2985 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
2986
2987 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
2988 break;
2989
2990 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
2991 n = n->m_next;
2992 }
2993
2994 m = n;
39236c6e 2995 }
5ba3f43e 2996}
39236c6e 2997
5ba3f43e
A
2998void
2999mptcp_clean_reinjectq(struct mptses *mpte)
3000{
3001 struct mptcb *mp_tp = mpte->mpte_mptcb;
3002
3003 mpte_lock_assert_held(mpte);
3004
3005 while (mpte->mpte_reinjectq) {
3006 struct mbuf *m = mpte->mpte_reinjectq;
3007
3008 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3009 MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
3010 break;
3011
3012 mpte->mpte_reinjectq = m->m_nextpkt;
3013 m->m_nextpkt = NULL;
3014 m_freem(m);
3015 }
39236c6e
A
3016}
3017
3018/*
5ba3f43e 3019 * Subflow socket control event upcall.
39236c6e 3020 */
5ba3f43e
A
3021static void
3022mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
39236c6e 3023{
5ba3f43e
A
3024#pragma unused(so)
3025 struct mptsub *mpts = arg;
3026 struct mptses *mpte = mpts->mpts_mpte;
39236c6e 3027
5ba3f43e
A
3028 VERIFY(mpte != NULL);
3029 mpte_lock_assert_held(mpte);
39236c6e 3030
5ba3f43e
A
3031 if ((mpts->mpts_evctl & events) == events)
3032 return;
39236c6e 3033
5ba3f43e
A
3034 mpts->mpts_evctl |= events;
3035
3036 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3037 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3038 return;
39037602 3039 }
39236c6e 3040
5ba3f43e 3041 mptcp_subflow_workloop(mpte);
39236c6e
A
3042}
3043
3044/*
5ba3f43e
A
3045 * Subflow socket control events.
3046 *
3047 * Called for handling events related to the underlying subflow socket.
39236c6e
A
3048 */
3049static ev_ret_t
5ba3f43e 3050mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3e170ce0 3051 uint64_t *p_mpsofilt_hint)
39236c6e 3052{
5ba3f43e
A
3053 ev_ret_t ret = MPTS_EVRET_OK;
3054 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3055 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e 3056
5ba3f43e 3057 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3058
5ba3f43e
A
3059 /* bail if there's nothing to process */
3060 if (!mpts->mpts_evctl)
3061 return (ret);
39236c6e 3062
5ba3f43e
A
3063 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
3064 SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
3065 SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
3066 SO_FILT_HINT_DISCONNECTED)) {
3067 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3068 }
3e170ce0 3069
5ba3f43e
A
3070 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3071 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3072
3073 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3074 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3075 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3076
3077 /*
3078 * Process all the socket filter hints and reset the hint
3079 * once it is handled
3080 */
3081 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3082 /*
3083 * Always execute the DISCONNECTED event, because it will wakeup
3084 * the app.
3085 */
3086 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3087 (ret >= MPTS_EVRET_OK ||
3088 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3089 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3090 ev_ret_t error =
3091 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3092 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3093 }
3094 }
3095
3096 /*
3097 * We should be getting only events specified via sock_catchevents(),
3098 * so loudly complain if we have any unprocessed one(s).
3099 */
3100 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
3101 mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
3102 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3103 mpts->mpts_connid,
3104 mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3105 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3106 else
3107 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3108 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3109 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3110
3111 return (ret);
39236c6e
A
3112}
3113
39236c6e 3114static ev_ret_t
5ba3f43e
A
3115mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3116 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e
A
3117{
3118 struct socket *mp_so, *so;
3119 struct mptcb *mp_tp;
39236c6e 3120
5ba3f43e 3121 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3122 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3123 mp_so = mptetoso(mpte);
39236c6e
A
3124 mp_tp = mpte->mpte_mptcb;
3125 so = mpts->mpts_socket;
3126
5ba3f43e
A
3127 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3128 mpts->mpts_connid, event),
3e170ce0 3129 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3130
39236c6e 3131 /*
5ba3f43e
A
3132 * We got an event for this subflow that might need to be propagated,
3133 * based on the state of the MPTCP connection.
39236c6e 3134 */
5ba3f43e
A
3135 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3136 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3137 mp_so->so_error = so->so_error;
3138 *p_mpsofilt_hint |= event;
39236c6e 3139 }
39236c6e 3140
5ba3f43e 3141 return (MPTS_EVRET_OK);
39236c6e
A
3142}
3143
3144/*
3145 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3146 */
3147static ev_ret_t
3e170ce0 3148mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3149 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3150{
5ba3f43e
A
3151#pragma unused(p_mpsofilt_hint, event)
3152 struct socket *mp_so;
3153 struct tcpcb *tp;
39236c6e 3154
5ba3f43e 3155 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e
A
3156
3157 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e
A
3158 mp_so = mptetoso(mpte);
3159 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
39236c6e 3160
39236c6e
A
3161 /*
3162 * This overwrites any previous mpte_lost_aid to avoid storing
3163 * too much state when the typical case has only two subflows.
3164 */
3165 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3166 mpte->mpte_lost_aid = tp->t_local_aid;
3167
5ba3f43e
A
3168 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3169 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3170
3171 /*
3172 * The subflow connection has lost its source address.
39236c6e 3173 */
5ba3f43e 3174 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
39236c6e 3175
5ba3f43e
A
3176 if (mp_so->so_flags & SOF_NOADDRAVAIL)
3177 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
39236c6e 3178
5ba3f43e 3179 return (MPTS_EVRET_DELETE);
39236c6e
A
3180}
3181
fe8ab488
A
3182/*
3183 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3184 * indicates that the remote side sent a Data FIN
3185 */
3186static ev_ret_t
3e170ce0 3187mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3188 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3189{
5ba3f43e 3190#pragma unused(event)
fe8ab488
A
3191 struct mptcb *mp_tp;
3192
5ba3f43e 3193 mpte_lock_assert_held(mpte); /* same as MP socket lock */
fe8ab488
A
3194 mp_tp = mpte->mpte_mptcb;
3195
5ba3f43e 3196 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3e170ce0 3197 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39037602 3198
fe8ab488 3199 /*
39037602 3200 * We got a Data FIN for the MPTCP connection.
fe8ab488
A
3201 * The FIN may arrive with data. The data is handed up to the
3202 * mptcp socket and the user is notified so that it may close
3203 * the socket if needed.
3204 */
39037602 3205 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
5ba3f43e 3206 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
39037602 3207
fe8ab488
A
3208 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3209}
3210
39236c6e
A
3211/*
3212 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3213 */
3214static ev_ret_t
3e170ce0 3215mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3216 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3217{
5ba3f43e 3218#pragma unused(event, p_mpsofilt_hint)
39236c6e 3219 struct mptsub *mpts_alt = NULL;
5ba3f43e 3220 struct socket *alt_so = NULL;
39236c6e
A
3221 struct socket *mp_so;
3222 int altpath_exists = 0;
3223
5ba3f43e
A
3224 mpte_lock_assert_held(mpte);
3225 mp_so = mptetoso(mpte);
3226 mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
3227 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
3228 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3229
5ba3f43e 3230 mptcp_reinject_mbufs(mpts->mpts_socket);
39236c6e 3231
5ba3f43e 3232 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e
A
3233 /*
3234 * If there is no alternate eligible subflow, ignore the
3235 * failover hint.
3236 */
3237 if (mpts_alt == NULL) {
5ba3f43e
A
3238 mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
3239 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3240
39236c6e
A
3241 goto done;
3242 }
5ba3f43e 3243
39236c6e 3244 altpath_exists = 1;
5ba3f43e 3245 alt_so = mpts_alt->mpts_socket;
39236c6e 3246 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
fe8ab488 3247 /* All data acknowledged and no RTT spike */
5ba3f43e 3248 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
39236c6e
A
3249 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3250 } else {
3251 /* no alternate path available */
3252 altpath_exists = 0;
3253 }
39236c6e 3254 }
39236c6e
A
3255
3256 if (altpath_exists) {
5ba3f43e 3257 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
39236c6e 3258
5ba3f43e 3259 mpte->mpte_active_sub = mpts_alt;
39236c6e
A
3260 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3261 mpts->mpts_flags &= ~MPTSF_ACTIVE;
5ba3f43e
A
3262
3263 mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
3264 __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
3265 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3266
3267 mptcpstats_inc_switch(mpte, mpts);
3268
3269 sowwakeup(alt_so);
39236c6e 3270 } else {
5ba3f43e
A
3271 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3272 mpts->mpts_connid),
3273 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3274done:
5ba3f43e 3275 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
39236c6e 3276 }
5ba3f43e 3277
39236c6e
A
3278 return (MPTS_EVRET_OK);
3279}
3280
3281/*
3282 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3283 */
3284static ev_ret_t
3e170ce0 3285mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3286 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3287{
5ba3f43e 3288 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3289 VERIFY(mpte->mpte_mppcb != NULL);
39236c6e 3290
5ba3f43e
A
3291 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3292 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3293
39236c6e 3294 /*
5ba3f43e
A
3295 * The subflow connection cannot use the outgoing interface, let's
3296 * close this subflow.
39236c6e 3297 */
5ba3f43e 3298 mptcp_subflow_abort(mpts, EPERM);
39236c6e 3299
5ba3f43e 3300 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
39236c6e 3301
5ba3f43e 3302 return (MPTS_EVRET_DELETE);
39236c6e
A
3303}
3304
3305/*
3306 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3307 */
3308static ev_ret_t
3e170ce0 3309mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3310 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3311{
5ba3f43e 3312#pragma unused(event, p_mpsofilt_hint)
39236c6e 3313 struct socket *mp_so, *so;
5ba3f43e
A
3314 struct inpcb *inp;
3315 struct tcpcb *tp;
39236c6e 3316 struct mptcb *mp_tp;
5ba3f43e 3317 int af;
39236c6e
A
3318 boolean_t mpok = FALSE;
3319
5ba3f43e 3320 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3321 VERIFY(mpte->mpte_mppcb != NULL);
39236c6e 3322
5ba3f43e
A
3323 mp_so = mptetoso(mpte);
3324 mp_tp = mpte->mpte_mptcb;
39236c6e 3325 so = mpts->mpts_socket;
5ba3f43e
A
3326 tp = sototcpcb(so);
3327 af = mpts->mpts_dst.sa_family;
39236c6e
A
3328
3329 if (mpts->mpts_flags & MPTSF_CONNECTED)
3330 return (MPTS_EVRET_OK);
3331
3332 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
3333 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
fe8ab488
A
3334 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
3335 (so->so_state & SS_ISCONNECTED)) {
5ba3f43e 3336 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
3e170ce0
A
3337 __func__, mpts->mpts_connid),
3338 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
3339 (void) soshutdownlock(so, SHUT_RD);
3340 (void) soshutdownlock(so, SHUT_WR);
3341 (void) sodisconnectlocked(so);
3342 }
39236c6e
A
3343 return (MPTS_EVRET_OK);
3344 }
3345
3346 /*
3347 * The subflow connection has been connected. Find out whether it
3348 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
3349 *
3350 * a. If MPTCP connection is not yet established, then this must be
3351 * the first subflow connection. If MPTCP failed to negotiate,
5ba3f43e 3352 * fallback to regular TCP by degrading this subflow.
39236c6e
A
3353 *
3354 * b. If MPTCP connection has been established, then this must be
3355 * one of the subsequent subflow connections. If MPTCP failed
5ba3f43e 3356 * to negotiate, disconnect the connection.
39236c6e
A
3357 *
3358 * Right now, we simply unblock any waiters at the MPTCP socket layer
3359 * if the MPTCP connection has not been established.
3360 */
39236c6e
A
3361
3362 if (so->so_state & SS_ISDISCONNECTED) {
3363 /*
3364 * With MPTCP joins, a connection is connected at the subflow
3365 * level, but the 4th ACK from the server elevates the MPTCP
490019cf
A
3366 * subflow to connected state. So there is a small window
3367 * where the subflow could get disconnected before the
39236c6e
A
3368 * connected event is processed.
3369 */
39236c6e
A
3370 return (MPTS_EVRET_OK);
3371 }
3372
5ba3f43e
A
3373 if (mpts->mpts_flags & MPTSF_TFO_REQD)
3374 mptcp_drop_tfo_data(mpte, mpts);
490019cf 3375
5ba3f43e
A
3376 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
3377 mpts->mpts_flags |= MPTSF_CONNECTED;
490019cf 3378
490019cf 3379 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
39236c6e
A
3380 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3381
490019cf
A
3382 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
3383
39236c6e 3384 /* get/verify the outbound interface */
5ba3f43e 3385 inp = sotoinpcb(so);
3e170ce0 3386
5ba3f43e 3387 mpts->mpts_maxseg = tp->t_maxseg;
3e170ce0 3388
5ba3f43e
A
3389 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
3390 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
3391 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
3e170ce0 3392 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
3393
3394 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
39236c6e 3395
39236c6e 3396 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
5ba3f43e
A
3397 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
3398 mpte->mpte_associd = mpts->mpts_connid;
3399 DTRACE_MPTCP2(state__change,
3400 struct mptcb *, mp_tp,
3401 uint32_t, 0 /* event */);
3402
3403 if (SOCK_DOM(so) == AF_INET) {
3404 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
3405 } else {
3406 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
3407 }
3408
39236c6e
A
3409 /* case (a) above */
3410 if (!mpok) {
5ba3f43e
A
3411 tcpstat.tcps_mpcap_fallback++;
3412
3413 tp->t_mpflags |= TMPF_INFIN_SENT;
3414 mptcp_notify_mpfail(so);
39236c6e 3415 } else {
5ba3f43e
A
3416 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3417 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3418 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
39037602
A
3419 } else {
3420 mpts->mpts_flags |= MPTSF_PREFERRED;
3421 }
813fb2f6 3422 mpts->mpts_flags |= MPTSF_ACTIVE;
5ba3f43e 3423
39236c6e
A
3424 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3425 mpte->mpte_nummpcapflows++;
5ba3f43e
A
3426
3427 mptcp_check_subflows_and_add(mpte);
3428
3429 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3430 mpte->mpte_initial_cell = 1;
3431
3432 mpte->mpte_handshake_success = 1;
39236c6e 3433 }
5ba3f43e
A
3434
3435 mp_tp->mpt_sndwnd = tp->snd_wnd;
3436 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
3437 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
3438 soisconnected(mp_so);
3439
3440 mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
3441 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
3442 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3443 } else if (mpok) {
39236c6e
A
3444 /*
3445 * case (b) above
3446 * In case of additional flows, the MPTCP socket is not
3447 * MPTSF_MP_CAPABLE until an ACK is received from server
3448 * for 3-way handshake. TCP would have guaranteed that this
3449 * is an MPTCP subflow.
3450 */
5ba3f43e
A
3451 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3452 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
3453 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3454 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3455 mpts->mpts_flags &= ~MPTSF_PREFERRED;
3456 } else {
3457 mpts->mpts_flags |= MPTSF_PREFERRED;
3458 }
3459
39236c6e
A
3460 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3461 mpte->mpte_nummpcapflows++;
5ba3f43e
A
3462
3463 mpts->mpts_rel_seq = 1;
3464
3465 mptcp_check_subflows_and_remove(mpte);
fe8ab488 3466 } else {
5ba3f43e
A
3467 unsigned int i;
3468
3469 /* Mark this interface as non-MPTCP */
3470 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3471 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3472
3473 if (inp->inp_last_outifp->if_index == info->ifindex) {
3474 info->no_mptcp_support = 1;
3475 break;
3476 }
3477 }
3478
3479 tcpstat.tcps_join_fallback++;
3480 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3481 tcpstat.tcps_mptcp_cell_proxy++;
3482 else
3483 tcpstat.tcps_mptcp_wifi_proxy++;
3484
3485 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3486
3487 return (MPTS_EVRET_OK);
39236c6e 3488 }
fe8ab488 3489
5ba3f43e
A
3490 /* This call, just to "book" an entry in the stats-table for this ifindex */
3491 mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
3492
3493 mptcp_output(mpte);
39236c6e
A
3494
3495 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3496}
3497
3498/*
3499 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
3500 */
3501static ev_ret_t
3e170ce0 3502mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3503 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3504{
5ba3f43e 3505#pragma unused(event, p_mpsofilt_hint)
39236c6e
A
3506 struct socket *mp_so, *so;
3507 struct mptcb *mp_tp;
39236c6e 3508
5ba3f43e 3509 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3510 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3511 mp_so = mptetoso(mpte);
39236c6e
A
3512 mp_tp = mpte->mpte_mptcb;
3513 so = mpts->mpts_socket;
3514
5ba3f43e
A
3515 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
3516 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
3517 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
3518 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
3e170ce0 3519 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3520
3521 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
5ba3f43e 3522 return (MPTS_EVRET_DELETE);
39236c6e 3523
39236c6e
A
3524 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3525
5ba3f43e 3526 /* The subflow connection has been disconnected. */
39236c6e
A
3527
3528 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3529 mpte->mpte_nummpcapflows--;
fe8ab488
A
3530 if (mpte->mpte_active_sub == mpts) {
3531 mpte->mpte_active_sub = NULL;
5ba3f43e 3532 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
3e170ce0 3533 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3534 }
39236c6e
A
3535 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3536 }
3537
5ba3f43e
A
3538 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3539 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
3540 (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
3541 mptcp_drop(mpte, mp_tp, so->so_error);
39236c6e
A
3542 }
3543
39236c6e 3544 /*
5ba3f43e
A
3545 * Clear flags that are used by getconninfo to return state.
3546 * Retain like MPTSF_DELETEOK for internal purposes.
39236c6e 3547 */
5ba3f43e
A
3548 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3549 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3550 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);
3551
3552 return (MPTS_EVRET_DELETE);
39236c6e
A
3553}
3554
3555/*
3556 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3557 */
3558static ev_ret_t
3e170ce0 3559mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3560 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3561{
5ba3f43e 3562#pragma unused(event, p_mpsofilt_hint)
39236c6e
A
3563 struct socket *mp_so, *so;
3564 struct mptcb *mp_tp;
3e170ce0 3565 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e 3566
5ba3f43e 3567 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3568 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3569 mp_so = mptetoso(mpte);
39236c6e 3570 mp_tp = mpte->mpte_mptcb;
39236c6e
A
3571 so = mpts->mpts_socket;
3572
39236c6e
A
3573 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3574 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3575 else
3576 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3577
3578 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3579 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3580 goto done;
3581 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3582 }
3583 else
3584 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3585
3586 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3587 mpts->mpts_flags |= MPTSF_MP_READY;
3588 else
3589 mpts->mpts_flags &= ~MPTSF_MP_READY;
3590
3591 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3592 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3593 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3594 }
3595
3596 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3597 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3598 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3599 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3600 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3601 ret = MPTS_EVRET_CONNECT_PENDING;
3602 }
3603
5ba3f43e
A
3604 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
3605 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3606 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3607 mpts->mpts_flags, MPTSF_BITS),
3608 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3e170ce0 3609
39236c6e 3610done:
39236c6e
A
3611 return (ret);
3612}
3613
3614/*
3615 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3616 */
3617static ev_ret_t
3e170ce0 3618mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3619 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3620{
5ba3f43e 3621#pragma unused(event)
39236c6e
A
3622 struct socket *mp_so, *so;
3623 struct mptcb *mp_tp;
5ba3f43e 3624 boolean_t is_fastclose;
39236c6e 3625
5ba3f43e 3626 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3627 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3628 mp_so = mptetoso(mpte);
39236c6e
A
3629 mp_tp = mpte->mpte_mptcb;
3630 so = mpts->mpts_socket;
3631
39236c6e 3632 /* We got an invalid option or a fast close */
39236c6e
A
3633 struct tcptemp *t_template;
3634 struct inpcb *inp = sotoinpcb(so);
3635 struct tcpcb *tp = NULL;
3636
3637 tp = intotcpcb(inp);
fe8ab488 3638 so->so_error = ECONNABORTED;
39236c6e 3639
39037602
A
3640 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3641
39236c6e
A
3642 t_template = tcp_maketemplate(tp);
3643 if (t_template) {
fe8ab488 3644 struct tcp_respond_args tra;
39236c6e 3645
fe8ab488 3646 bzero(&tra, sizeof(tra));
39236c6e 3647 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 3648 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 3649 else
fe8ab488
A
3650 tra.ifscope = IFSCOPE_NONE;
3651 tra.awdl_unrestricted = 1;
39236c6e
A
3652
3653 tcp_respond(tp, t_template->tt_ipgen,
3654 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 3655 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e 3656 (void) m_free(dtom(t_template));
3e170ce0
A
3657 mptcplog((LOG_DEBUG, "MPTCP Events: "
3658 "%s: mp_so 0x%llx cid %d \n",
39236c6e 3659 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3660 so, mpts->mpts_connid),
3661 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3662 }
5ba3f43e 3663 mptcp_subflow_abort(mpts, ECONNABORTED);
39037602
A
3664
3665 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3e170ce0 3666 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e 3667
39037602
A
3668 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3669 mp_so->so_error = ECONNABORTED;
3670 else
3671 mp_so->so_error = ECONNRESET;
3672
3673 /*
3674 * mptcp_drop is being called after processing the events, to fully
3675 * close the MPTCP connection
3676 */
39236c6e 3677 }
39037602 3678
3e170ce0
A
3679 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3680 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
39236c6e 3681
5ba3f43e 3682 return (MPTS_EVRET_DELETE);
39236c6e
A
3683}
3684
fe8ab488 3685static ev_ret_t
5ba3f43e
A
3686mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3687 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3688{
5ba3f43e
A
3689#pragma unused(event)
3690 bool found_active = false;
3691
3692 mpts->mpts_flags |= MPTSF_READ_STALL;
39037602 3693
5ba3f43e
A
3694 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3695 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3e170ce0 3696
5ba3f43e
A
3697 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3698 TCPS_HAVERCVDFIN2(tp->t_state))
3699 continue;
3700
3701 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
3702 found_active = true;
3703 break;
fe8ab488 3704 }
fe8ab488
A
3705 }
3706
5ba3f43e
A
3707 if (!found_active)
3708 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
3709
fe8ab488
A
3710 return (MPTS_EVRET_OK);
3711}
3712
3713static ev_ret_t
5ba3f43e
A
3714mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3715 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3716{
5ba3f43e
A
3717#pragma unused(event)
3718 bool found_active = false;
3e170ce0 3719
5ba3f43e 3720 mpts->mpts_flags |= MPTSF_WRITE_STALL;
fe8ab488 3721
5ba3f43e
A
3722 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3723 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3724
3725 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3726 tp->t_state > TCPS_CLOSE_WAIT)
3727 continue;
3728
3729 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
3730 found_active = true;
3731 break;
3732 }
3733 }
3734
3735 if (!found_active)
3736 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
3737
3738 return (MPTS_EVRET_OK);
fe8ab488
A
3739}
3740
39236c6e
A
3741static const char *
3742mptcp_evret2str(ev_ret_t ret)
3743{
3744 const char *c = "UNKNOWN";
3745
3746 switch (ret) {
3747 case MPTS_EVRET_DELETE:
3748 c = "MPTS_EVRET_DELETE";
3749 break;
3750 case MPTS_EVRET_CONNECT_PENDING:
3751 c = "MPTS_EVRET_CONNECT_PENDING";
3752 break;
3753 case MPTS_EVRET_DISCONNECT_FALLBACK:
3754 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3755 break;
3756 case MPTS_EVRET_OK:
3757 c = "MPTS_EVRET_OK";
3758 break;
3e170ce0 3759 default:
39236c6e
A
3760 break;
3761 }
3762 return (c);
3763}
3764
39236c6e
A
3765/*
3766 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3767 * caller must ensure that the option can be issued on subflow sockets, via
3768 * MPOF_SUBFLOW_OK flag.
3769 */
3770int
5ba3f43e 3771mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
39236c6e 3772{
5ba3f43e 3773 struct socket *mp_so, *so;
39236c6e 3774 struct sockopt sopt;
39236c6e
A
3775 int error;
3776
3777 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
5ba3f43e
A
3778 mpte_lock_assert_held(mpte);
3779
3780 mp_so = mptetoso(mpte);
3781 so = mpts->mpts_socket;
3782
3783 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
3784 mpo->mpo_level == SOL_SOCKET &&
3785 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
3786 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %u lastcell? %d boundcell? %d\n",
3787 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(),
3788 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
3789 mpts->mpts_ifscope != IFSCOPE_NONE ? IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]) : -1),
3790 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3791
3792 /*
3793 * When we open a new subflow, mark it as cell fallback, if
3794 * this subflow goes over cell.
3795 *
3796 * (except for first-party apps)
3797 */
3798
3799 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
3800 return (0);
39236c6e 3801
5ba3f43e
A
3802 if (sotoinpcb(so)->inp_last_outifp &&
3803 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
3804 return (0);
3805
3806 /*
3807 * This here is an OR, because if the app is not binding to the
3808 * interface, then it definitely is not a cell-fallback
3809 * connection.
3810 */
3811 if (mpts->mpts_ifscope == IFSCOPE_NONE ||
3812 !IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]))
3813 return (0);
3814 }
3815
3816 mpo->mpo_flags &= ~MPOF_INTERIM;
39236c6e
A
3817
3818 bzero(&sopt, sizeof (sopt));
3819 sopt.sopt_dir = SOPT_SET;
3820 sopt.sopt_level = mpo->mpo_level;
3821 sopt.sopt_name = mpo->mpo_name;
3822 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3823 sopt.sopt_valsize = sizeof (int);
3824 sopt.sopt_p = kernproc;
3825
5ba3f43e 3826 error = sosetoptlock(so, &sopt, 0);
39236c6e 3827 if (error == 0) {
5ba3f43e 3828 mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3829 "val %d set successful\n", __func__,
3830 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
3831 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
3832 mpo->mpo_intval),
3833 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3834 } else {
5ba3f43e 3835 mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
39236c6e
A
3836 "val %d set error %d\n", __func__,
3837 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
3838 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
3839 mpo->mpo_intval, error),
3840 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
3841 }
3842 return (error);
3843}
3844
3845/*
3846 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
3847 * caller must ensure that the option can be issued on subflow sockets, via
3848 * MPOF_SUBFLOW_OK flag.
3849 */
3850int
3851mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
3852 struct mptopt *mpo)
3853{
3854 struct socket *mp_so;
3855 struct sockopt sopt;
39236c6e
A
3856 int error;
3857
3858 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
5ba3f43e
A
3859 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3860 mp_so = mptetoso(mpte);
39236c6e
A
3861
3862 bzero(&sopt, sizeof (sopt));
3863 sopt.sopt_dir = SOPT_GET;
3864 sopt.sopt_level = mpo->mpo_level;
3865 sopt.sopt_name = mpo->mpo_name;
3866 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3867 sopt.sopt_valsize = sizeof (int);
3868 sopt.sopt_p = kernproc;
3869
3870 error = sogetoptlock(so, &sopt, 0); /* already locked */
3871 if (error == 0) {
3e170ce0
A
3872 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3873 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
3874 "val %d get successful\n", __func__,
3875 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
3876 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
3877 mpo->mpo_intval),
3e170ce0 3878 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3879 } else {
3e170ce0
A
3880 mptcplog((LOG_ERR, "MPTCP Socket: "
3881 "%s: mp_so 0x%llx sopt %s get error %d\n",
39236c6e 3882 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e 3883 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
3e170ce0 3884 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
3885 }
3886 return (error);
3887}
3888
3889
3890/*
3891 * MPTCP garbage collector.
3892 *
3893 * This routine is called by the MP domain on-demand, periodic callout,
3894 * which is triggered when a MPTCP socket is closed. The callout will
3895 * repeat as long as this routine returns a non-zero value.
3896 */
3897static uint32_t
3898mptcp_gc(struct mppcbinfo *mppi)
3899{
3900 struct mppcb *mpp, *tmpp;
3901 uint32_t active = 0;
3902
5ba3f43e 3903 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
39236c6e 3904
39236c6e
A
3905 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
3906 struct socket *mp_so;
3907 struct mptses *mpte;
3908 struct mptcb *mp_tp;
3909
3910 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3911 mp_so = mpp->mpp_socket;
3912 VERIFY(mp_so != NULL);
3913 mpte = mptompte(mpp);
3914 VERIFY(mpte != NULL);
3915 mp_tp = mpte->mpte_mptcb;
3916 VERIFY(mp_tp != NULL);
3917
3e170ce0
A
3918 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3919 "%s: mp_so 0x%llx found "
39236c6e
A
3920 "(u=%d,r=%d,s=%d)\n", __func__,
3921 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3e170ce0
A
3922 mp_so->so_retaincnt, mpp->mpp_state),
3923 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3924
5ba3f43e 3925 if (!mpte_try_lock(mpte)) {
3e170ce0 3926 mptcplog((LOG_DEBUG, "MPTCP Socket: "
5ba3f43e 3927 "%s: mp_so 0x%llx skipped lock "
39236c6e
A
3928 "(u=%d,r=%d)\n", __func__,
3929 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3930 mp_so->so_usecount, mp_so->so_retaincnt),
3931 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
3932 active++;
3933 continue;
3934 }
3935
3936 /* check again under the lock */
5ba3f43e 3937 if (mp_so->so_usecount > 0) {
39236c6e
A
3938 boolean_t wakeup = FALSE;
3939 struct mptsub *mpts, *tmpts;
3940
3e170ce0 3941 mptcplog((LOG_DEBUG, "MPTCP Socket: "
5ba3f43e 3942 "%s: mp_so 0x%llx skipped usecount "
39236c6e
A
3943 "[u=%d,r=%d] %d %d\n", __func__,
3944 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3945 mp_so->so_usecount, mp_so->so_retaincnt,
3946 mp_tp->mpt_gc_ticks,
3e170ce0
A
3947 mp_tp->mpt_state),
3948 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3949
39236c6e
A
3950 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3951 if (mp_tp->mpt_gc_ticks > 0)
3952 mp_tp->mpt_gc_ticks--;
3953 if (mp_tp->mpt_gc_ticks == 0) {
3954 wakeup = TRUE;
39236c6e
A
3955 }
3956 }
39236c6e
A
3957 if (wakeup) {
3958 TAILQ_FOREACH_SAFE(mpts,
3959 &mpte->mpte_subflows, mpts_entry, tmpts) {
5ba3f43e 3960 mptcp_subflow_eupcall1(mpts->mpts_socket,
39236c6e 3961 mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
3962 }
3963 }
5ba3f43e 3964 mpte_unlock(mpte);
39236c6e
A
3965 active++;
3966 continue;
3967 }
3968
3969 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
5ba3f43e
A
3970 panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
3971 "[u=%d,r=%d,s=%d]\n", __func__,
3972 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3973 mp_so->so_usecount, mp_so->so_retaincnt,
3974 mpp->mpp_state);
39236c6e
A
3975 }
3976
5ba3f43e
A
3977 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
3978 mptcp_close(mpte, mp_tp);
3e170ce0 3979
5ba3f43e 3980 mptcp_session_destroy(mpte);
39236c6e 3981
3e170ce0
A
3982 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3983 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
39236c6e 3984 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
3985 mp_so->so_usecount, mp_so->so_retaincnt),
3986 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3987
39037602 3988 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
39236c6e
A
3989 struct sockbuf *, &mp_so->so_rcv,
3990 struct sockbuf *, &mp_so->so_snd,
3991 struct mppcb *, mpp);
3992
3993 mp_pcbdispose(mpp);
39037602 3994 sodealloc(mp_so);
39236c6e
A
3995 }
3996
3997 return (active);
3998}
3999
4000/*
4001 * Drop a MPTCP connection, reporting the specified error.
4002 */
4003struct mptses *
4004mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4005{
4006 struct socket *mp_so;
4007
5ba3f43e 4008 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 4009 VERIFY(mpte->mpte_mptcb == mp_tp);
5ba3f43e 4010 mp_so = mptetoso(mpte);
39236c6e 4011
39037602 4012 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
39236c6e
A
4013 uint32_t, 0 /* event */);
4014
4015 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
4016 errno = mp_tp->mpt_softerror;
4017 mp_so->so_error = errno;
4018
4019 return (mptcp_close(mpte, mp_tp));
4020}
4021
4022/*
4023 * Close a MPTCP control block.
4024 */
4025struct mptses *
4026mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4027{
3e170ce0
A
4028 struct socket *mp_so = NULL;
4029 struct mptsub *mpts = NULL, *tmpts = NULL;
39236c6e 4030
5ba3f43e 4031 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 4032 VERIFY(mpte->mpte_mptcb == mp_tp);
5ba3f43e 4033 mp_so = mptetoso(mpte);
39236c6e 4034
5ba3f43e 4035 mp_tp->mpt_state = MPTCPS_TERMINATE;
39236c6e 4036
5ba3f43e
A
4037 mptcp_freeq(mp_tp);
4038
4039 soisdisconnected(mp_so);
39236c6e
A
4040
4041 /* Clean up all subflows */
4042 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
5ba3f43e 4043 mptcp_subflow_disconnect(mpte, mpts);
39236c6e 4044 }
39236c6e
A
4045
4046 return (NULL);
4047}
4048
4049void
4050mptcp_notify_close(struct socket *so)
4051{
4052 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4053}
4054
4055/*
5ba3f43e 4056 * MPTCP workloop.
39236c6e
A
4057 */
4058void
5ba3f43e 4059mptcp_subflow_workloop(struct mptses *mpte)
39236c6e
A
4060{
4061 struct socket *mp_so;
4062 struct mptsub *mpts, *tmpts;
4063 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
5ba3f43e 4064 uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
39236c6e 4065
5ba3f43e 4066 mpte_lock_assert_held(mpte);
39236c6e 4067 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 4068 mp_so = mptetoso(mpte);
39236c6e
A
4069 VERIFY(mp_so != NULL);
4070
4071 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4072 ev_ret_t ret;
4073
5ba3f43e
A
4074 if (mpts->mpts_socket->so_usecount == 0) {
4075 /* Will be removed soon by tcp_garbage_collect */
4076 continue;
4077 }
3e170ce0 4078
5ba3f43e
A
4079 mptcp_subflow_addref(mpts);
4080 mpts->mpts_socket->so_usecount++;
3e170ce0
A
4081
4082 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
39236c6e 4083
39236c6e
A
4084 /*
4085 * If MPTCP socket is closed, disconnect all subflows.
4086 * This will generate a disconnect event which will
4087 * be handled during the next iteration, causing a
4088 * non-zero error to be returned above.
4089 */
4090 if (mp_so->so_flags & SOF_PCBCLEARING)
5ba3f43e 4091 mptcp_subflow_disconnect(mpte, mpts);
39236c6e
A
4092
4093 switch (ret) {
39236c6e
A
4094 case MPTS_EVRET_OK:
4095 /* nothing to do */
4096 break;
4097 case MPTS_EVRET_DELETE:
5ba3f43e 4098 mptcp_subflow_soclose(mpts);
39236c6e
A
4099 break;
4100 case MPTS_EVRET_CONNECT_PENDING:
4101 connect_pending = TRUE;
4102 break;
4103 case MPTS_EVRET_DISCONNECT_FALLBACK:
4104 disconnect_fallback = TRUE;
4105 break;
3e170ce0
A
4106 default:
4107 mptcplog((LOG_DEBUG,
4108 "MPTCP Socket: %s: mptcp_subflow_events "
4109 "returned invalid value: %d\n", __func__,
4110 ret),
4111 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4112 break;
39236c6e 4113 }
5ba3f43e
A
4114 mptcp_subflow_remref(mpts); /* ours */
4115
4116 VERIFY(mpts->mpts_socket->so_usecount != 0);
4117 mpts->mpts_socket->so_usecount--;
39236c6e
A
4118 }
4119
5ba3f43e
A
4120 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4121 struct mptcb *mp_tp = mpte->mpte_mptcb;
4122
4123 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4124
39037602 4125 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
5ba3f43e 4126 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
39037602
A
4127 socantrcvmore(mp_so);
4128 mpsofilt_hint_mask &= ~SO_FILT_HINT_CANTRCVMORE;
4129 }
4130
3e170ce0 4131 soevent(mp_so, mpsofilt_hint_mask);
39236c6e
A
4132 }
4133
5ba3f43e 4134 if (!connect_pending && !disconnect_fallback)
39236c6e 4135 return;
39236c6e
A
4136
4137 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
39236c6e
A
4138 if (disconnect_fallback) {
4139 struct socket *so = NULL;
4140 struct inpcb *inp = NULL;
4141 struct tcpcb *tp = NULL;
4142
5ba3f43e 4143 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
39236c6e 4144 continue;
39236c6e
A
4145
4146 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4147
4148 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
5ba3f43e 4149 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
39236c6e 4150 continue;
490019cf 4151
39236c6e
A
4152 so = mpts->mpts_socket;
4153
4154 /*
4155 * The MPTCP connection has degraded to a fallback
4156 * mode, so there is no point in keeping this subflow
4157 * regardless of its MPTCP-readiness state, unless it
4158 * is the primary one which we use for fallback. This
4159 * assumes that the subflow used for fallback is the
4160 * ACTIVE one.
4161 */
4162
39236c6e
A
4163 inp = sotoinpcb(so);
4164 tp = intotcpcb(inp);
4165 tp->t_mpflags &=
4166 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4167 tp->t_mpflags |= TMPF_TCP_FALLBACK;
490019cf 4168
39236c6e 4169 if (mpts->mpts_flags & MPTSF_ACTIVE) {
39236c6e
A
4170 continue;
4171 }
4172 tp->t_mpflags |= TMPF_RESET;
5ba3f43e 4173 soevent(so, SO_FILT_HINT_MUSTRST);
39236c6e
A
4174 } else if (connect_pending) {
4175 /*
4176 * The MPTCP connection has progressed to a state
4177 * where it supports full multipath semantics; allow
4178 * additional joins to be attempted for all subflows
4179 * that are in the PENDING state.
4180 */
4181 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
5ba3f43e 4182 int error = mptcp_subflow_soconnectx(mpte, mpts);
39236c6e 4183
5ba3f43e
A
4184 if (error)
4185 mptcp_subflow_abort(mpts, error);
4186 }
39236c6e 4187 }
39236c6e
A
4188 }
4189}
4190
39236c6e
A
4191/*
4192 * Protocol pr_lock callback.
4193 */
4194int
4195mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4196{
5ba3f43e 4197 struct mppcb *mpp = mpsotomppcb(mp_so);
39236c6e
A
4198 void *lr_saved;
4199
4200 if (lr == NULL)
4201 lr_saved = __builtin_return_address(0);
4202 else
4203 lr_saved = lr;
4204
4205 if (mpp == NULL) {
4206 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4207 mp_so, lr_saved, solockhistory_nr(mp_so));
4208 /* NOTREACHED */
4209 }
5ba3f43e 4210 mpp_lock(mpp);
39236c6e
A
4211
4212 if (mp_so->so_usecount < 0) {
4213 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4214 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4215 solockhistory_nr(mp_so));
4216 /* NOTREACHED */
4217 }
4218 if (refcount != 0)
4219 mp_so->so_usecount++;
4220 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4221 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4222
4223 return (0);
4224}
4225
4226/*
4227 * Protocol pr_unlock callback.
4228 */
4229int
5ba3f43e 4230mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
39236c6e 4231{
5ba3f43e
A
4232 struct mppcb *mpp = mpsotomppcb(mp_so);
4233 void *lr_saved;
39236c6e 4234
5ba3f43e
A
4235 if (lr == NULL)
4236 lr_saved = __builtin_return_address(0);
4237 else
4238 lr_saved = lr;
39236c6e 4239
5ba3f43e
A
4240 if (mpp == NULL) {
4241 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4242 mp_so, mp_so->so_usecount, lr_saved,
4243 solockhistory_nr(mp_so));
4244 /* NOTREACHED */
4245 }
4246 mpp_lock_assert_held(mpp);
39236c6e 4247
5ba3f43e
A
4248 if (refcount != 0)
4249 mp_so->so_usecount--;
39236c6e 4250
5ba3f43e
A
4251 if (mp_so->so_usecount < 0) {
4252 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4253 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4254 /* NOTREACHED */
39236c6e 4255 }
5ba3f43e
A
4256 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4257 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4258 mpp_unlock(mpp);
4259
4260 return (0);
39236c6e
A
4261}
4262
5ba3f43e
A
4263/*
4264 * Protocol pr_getlock callback.
4265 */
4266lck_mtx_t *
4267mptcp_getlock(struct socket *mp_so, int flags)
39236c6e 4268{
5ba3f43e
A
4269 struct mppcb *mpp = mpsotomppcb(mp_so);
4270
4271 if (mpp == NULL) {
4272 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4273 solockhistory_nr(mp_so));
39236c6e
A
4274 /* NOTREACHED */
4275 }
5ba3f43e
A
4276 if (mp_so->so_usecount < 0) {
4277 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4278 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4279 /* NOTREACHED */
39236c6e 4280 }
5ba3f43e 4281 return (mpp_getlock(mpp, flags));
39236c6e
A
4282}
4283
4284/*
4285 * MPTCP Join support
4286 */
4287
4288static void
4289mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
fe8ab488 4290 uint8_t addr_id)
39236c6e
A
4291{
4292 struct tcpcb *tp = sototcpcb(so);
4293 struct mptcp_subf_auth_entry *sauth_entry;
5ba3f43e 4294 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 4295
39236c6e 4296 /*
39236c6e
A
4297 * The address ID of the first flow is implicitly 0.
4298 */
4299 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4300 tp->t_local_aid = 0;
4301 } else {
fe8ab488 4302 tp->t_local_aid = addr_id;
39236c6e
A
4303 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4304 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4305 }
4306 sauth_entry = zalloc(mpt_subauth_zone);
4307 sauth_entry->msae_laddr_id = tp->t_local_aid;
4308 sauth_entry->msae_raddr_id = 0;
4309 sauth_entry->msae_raddr_rand = 0;
4310try_again:
4311 sauth_entry->msae_laddr_rand = RandomULong();
4312 if (sauth_entry->msae_laddr_rand == 0)
4313 goto try_again;
4314 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4315}
4316
4317static void
4318mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4319{
4320 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 4321 struct tcpcb *tp = NULL;
39236c6e
A
4322 int found = 0;
4323
fe8ab488 4324 tp = sototcpcb(so);
5ba3f43e 4325 if (tp == NULL)
39236c6e
A
4326 return;
4327
39236c6e
A
4328 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4329 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4330 found = 1;
4331 break;
4332 }
4333 }
4334 if (found) {
4335 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 4336 }
fe8ab488 4337
3e170ce0
A
4338 if (found)
4339 zfree(mpt_subauth_zone, sauth_entry);
39236c6e
A
4340}
4341
4342void
4343mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4344 u_int32_t *rrand)
4345{
4346 struct mptcp_subf_auth_entry *sauth_entry;
5ba3f43e 4347 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 4348
39236c6e
A
4349 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4350 if (sauth_entry->msae_laddr_id == addr_id) {
4351 if (lrand)
4352 *lrand = sauth_entry->msae_laddr_rand;
4353 if (rrand)
4354 *rrand = sauth_entry->msae_raddr_rand;
4355 break;
4356 }
4357 }
39236c6e
A
4358}
4359
4360void
4361mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4362 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4363{
4364 struct mptcp_subf_auth_entry *sauth_entry;
5ba3f43e 4365 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 4366
39236c6e
A
4367 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4368 if (sauth_entry->msae_laddr_id == laddr_id) {
4369 if ((sauth_entry->msae_raddr_id != 0) &&
4370 (sauth_entry->msae_raddr_id != raddr_id)) {
3e170ce0 4371 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
39236c6e 4372 " address ids %d %d \n", __func__, raddr_id,
3e170ce0
A
4373 sauth_entry->msae_raddr_id),
4374 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4375 return;
4376 }
4377 sauth_entry->msae_raddr_id = raddr_id;
4378 if ((sauth_entry->msae_raddr_rand != 0) &&
4379 (sauth_entry->msae_raddr_rand != raddr_rand)) {
3e170ce0
A
4380 mptcplog((LOG_ERR, "MPTCP Socket: "
4381 "%s: dup SYN_ACK %d %d \n",
39236c6e 4382 __func__, raddr_rand,
3e170ce0
A
4383 sauth_entry->msae_raddr_rand),
4384 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4385 return;
4386 }
4387 sauth_entry->msae_raddr_rand = raddr_rand;
39236c6e
A
4388 return;
4389 }
4390 }
39236c6e
A
4391}
4392
4393/*
4394 * SHA1 support for MPTCP
4395 */
5ba3f43e
A
4396static void
4397mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
39236c6e
A
4398{
4399 SHA1_CTX sha1ctxt;
4400 const unsigned char *sha1_base;
4401 int sha1_size;
4402
39236c6e
A
4403 sha1_base = (const unsigned char *) key;
4404 sha1_size = sizeof (mptcp_key_t);
4405 SHA1Init(&sha1ctxt);
4406 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4407 SHA1Final(sha_digest, &sha1ctxt);
39236c6e
A
4408}
4409
4410void
4411mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5ba3f43e 4412 u_int32_t rand1, u_int32_t rand2, u_char *digest)
39236c6e
A
4413{
4414 SHA1_CTX sha1ctxt;
4415 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4416 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4417 u_int32_t data[2];
4418 int i;
4419
5ba3f43e 4420 bzero(digest, SHA1_RESULTLEN);
39236c6e
A
4421
4422 /* Set up the Key for HMAC */
4423 key_ipad[0] = key1;
4424 key_ipad[1] = key2;
4425
4426 key_opad[0] = key1;
4427 key_opad[1] = key2;
4428
4429 /* Set up the message for HMAC */
4430 data[0] = rand1;
4431 data[1] = rand2;
4432
4433 /* Key is 512 block length, so no need to compute hash */
4434
4435 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4436
4437 for (i = 0; i < 8; i++) {
4438 key_ipad[i] ^= 0x3636363636363636;
4439 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4440 }
4441
4442 /* Perform inner SHA1 */
4443 SHA1Init(&sha1ctxt);
4444 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4445 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4446 SHA1Final(digest, &sha1ctxt);
4447
4448 /* Perform outer SHA1 */
4449 SHA1Init(&sha1ctxt);
4450 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4451 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4452 SHA1Final(digest, &sha1ctxt);
4453}
4454
4455/*
4456 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4457 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4458 */
4459void
5ba3f43e 4460mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
39236c6e
A
4461{
4462 uint32_t lrand, rrand;
39236c6e 4463
5ba3f43e 4464 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e
A
4465
4466 lrand = rrand = 0;
4467 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5ba3f43e
A
4468 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
4469 digest);
39236c6e
A
4470}
4471
4472/*
4473 * Authentication data generation
4474 */
5ba3f43e 4475static void
39236c6e
A
4476mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4477 int token_len)
4478{
4479 VERIFY(token_len == sizeof (u_int32_t));
4480 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4481
4482 /* Most significant 32 bits of the SHA1 hash */
4483 bcopy(sha_digest, token, sizeof (u_int32_t));
490019cf 4484 return;
39236c6e
A
4485}
4486
5ba3f43e 4487static void
39236c6e
A
4488mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4489 int idsn_len)
4490{
4491 VERIFY(idsn_len == sizeof (u_int64_t));
4492 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4493
4494 /*
4495 * Least significant 64 bits of the SHA1 hash
4496 */
4497
4498 idsn[7] = sha_digest[12];
4499 idsn[6] = sha_digest[13];
4500 idsn[5] = sha_digest[14];
4501 idsn[4] = sha_digest[15];
4502 idsn[3] = sha_digest[16];
4503 idsn[2] = sha_digest[17];
4504 idsn[1] = sha_digest[18];
4505 idsn[0] = sha_digest[19];
490019cf 4506 return;
39236c6e
A
4507}
4508
490019cf
A
4509static void
4510mptcp_conn_properties(struct mptcb *mp_tp)
4511{
4512 /* There is only Version 0 at this time */
4513 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4514
4515 /* Set DSS checksum flag */
4516 if (mptcp_dss_csum)
4517 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4518
4519 /* Set up receive window */
4520 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4521
4522 /* Set up gc ticks */
4523 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4524}
4525
4526static void
5ba3f43e 4527mptcp_init_local_parms(struct mptses *mpte)
39236c6e 4528{
5ba3f43e
A
4529 struct mptcb *mp_tp = mpte->mpte_mptcb;
4530 char key_digest[SHA1_RESULTLEN];
490019cf 4531
5ba3f43e
A
4532 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
4533 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
4534
4535 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
490019cf 4536 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
5ba3f43e 4537 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
490019cf
A
4538 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4539
4540 /* The subflow SYN is also first MPTCP byte */
4541 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4542 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4543
4544 mptcp_conn_properties(mp_tp);
4545}
4546
4547int
4548mptcp_init_remote_parms(struct mptcb *mp_tp)
4549{
5ba3f43e
A
4550 char remote_digest[SHA1_RESULTLEN];
4551 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e
A
4552
4553 /* Only Version 0 is supported for auth purposes */
3e170ce0 4554 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
39236c6e
A
4555 return (-1);
4556
4557 /* Setup local and remote tokens and Initial DSNs */
5ba3f43e 4558 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
39236c6e 4559 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
490019cf 4560 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
39236c6e
A
4561 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4562 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
5ba3f43e 4563 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
39236c6e 4564
490019cf 4565 return (0);
39236c6e
A
4566}
4567
5ba3f43e 4568static void
39236c6e
A
4569mptcp_send_dfin(struct socket *so)
4570{
4571 struct tcpcb *tp = NULL;
4572 struct inpcb *inp = NULL;
4573
4574 inp = sotoinpcb(so);
4575 if (!inp)
4576 return;
4577
4578 tp = intotcpcb(inp);
4579 if (!tp)
4580 return;
4581
4582 if (!(tp->t_mpflags & TMPF_RESET))
4583 tp->t_mpflags |= TMPF_SEND_DFIN;
4584}
4585
4586/*
4587 * Data Sequence Mapping routines
4588 */
4589void
4590mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4591{
4592 struct mptcb *mp_tp;
4593
4594 if (m == NULL)
4595 return;
4596
3e170ce0 4597 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5ba3f43e
A
4598 mpte_lock_assert_held(mp_tp->mpt_mpte);
4599
39236c6e
A
4600 while (m) {
4601 VERIFY(m->m_flags & M_PKTHDR);
4602 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4603 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4604 m->m_pkthdr.mp_rlen = m_pktlen(m);
4605 mp_tp->mpt_sndmax += m_pktlen(m);
4606 m = m->m_next;
4607 }
5ba3f43e
A
4608}
4609
4610void
4611mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
4612{
4613 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
4614 uint64_t data_ack;
4615 uint64_t dsn;
4616
4617 if (!m || len == 0)
4618 return;
4619
4620 while (m && len > 0) {
4621 VERIFY(m->m_flags & M_PKTHDR);
4622 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4623
4624 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4625 dsn = m->m_pkthdr.mp_dsn;
4626
4627 len -= m->m_len;
4628 m = m->m_next;
4629 }
4630
4631 if (m && len == 0) {
4632 /*
4633 * If there is one more mbuf in the chain, it automatically means
4634 * that up to m->mp_dsn has been ack'ed.
4635 *
4636 * This means, we actually correct data_ack back down (compared
4637 * to what we set inside the loop - dsn + data_len). Because in
4638 * the loop we are "optimistic" and assume that the full mapping
4639 * will be acked. If that's not the case and we get out of the
4640 * loop with m != NULL, it means only up to m->mp_dsn has been
4641 * really acked.
4642 */
4643 data_ack = m->m_pkthdr.mp_dsn;
4644 }
4645
4646 if (len < 0) {
4647 /*
4648 * If len is negative, meaning we acked in the middle of an mbuf,
4649 * only up to this mbuf's data-sequence number has been acked
4650 * at the MPTCP-level.
4651 */
4652 data_ack = dsn;
4653 }
4654
4655 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
4656 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4657 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
39236c6e
A
4658}
4659
4660void
490019cf 4661mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
39236c6e 4662{
490019cf
A
4663 int rewinding = 0;
4664
5ba3f43e
A
4665 /* TFO makes things complicated. */
4666 if (so->so_flags1 & SOF1_TFO_REWIND) {
4667 rewinding = 1;
4668 so->so_flags1 &= ~SOF1_TFO_REWIND;
490019cf 4669 }
39236c6e 4670
5ba3f43e
A
4671 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
4672 u_int32_t sub_len;
39236c6e 4673 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e 4674 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e 4675
5ba3f43e 4676 sub_len = m->m_pkthdr.mp_rlen;
39236c6e 4677
5ba3f43e
A
4678 if (sub_len < len) {
4679 m->m_pkthdr.mp_dsn += sub_len;
4680 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4681 m->m_pkthdr.mp_rseq += sub_len;
39236c6e 4682 }
5ba3f43e
A
4683 m->m_pkthdr.mp_rlen = 0;
4684 len -= sub_len;
39236c6e 4685 } else {
5ba3f43e
A
4686 /* sub_len >= len */
4687 if (rewinding == 0)
4688 m->m_pkthdr.mp_dsn += len;
4689 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4690 if (rewinding == 0)
4691 m->m_pkthdr.mp_rseq += len;
4692 }
4693 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
4694 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
4695 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
4696 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4697 m->m_pkthdr.mp_rlen -= len;
4698 break;
39236c6e
A
4699 }
4700 m = m->m_next;
4701 }
39037602
A
4702
4703 if (so->so_flags & SOF_MP_SUBFLOW &&
4704 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
4705 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
4706 /*
4707 * Received an ack without receiving a DATA_ACK.
4708 * Need to fallback to regular TCP (or destroy this subflow).
4709 */
5ba3f43e 4710 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
39037602
A
4711 mptcp_notify_mpfail(so);
4712 }
39236c6e
A
4713}
4714
4715/* Obtain the DSN mapping stored in the mbuf */
4716void
5ba3f43e
A
4717mptcp_output_getm_dsnmap32(struct socket *so, int off,
4718 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
39236c6e
A
4719{
4720 u_int64_t dsn64;
4721
5ba3f43e 4722 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
39236c6e 4723 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
39236c6e
A
4724}
4725
4726void
5ba3f43e
A
4727mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
4728 uint32_t *relseq, uint16_t *data_len,
4729 uint16_t *dss_csum)
39236c6e
A
4730{
4731 struct mbuf *m = so->so_snd.sb_mb;
5ba3f43e 4732 int off_orig = off;
39236c6e 4733
5ba3f43e 4734 VERIFY(off >= 0);
39236c6e 4735
39236c6e
A
4736 /*
4737 * In the subflow socket, the DSN sequencing can be discontiguous,
4738 * but the subflow sequence mapping is contiguous. Use the subflow
4739 * sequence property to find the right mbuf and corresponding dsn
4740 * mapping.
4741 */
4742
4743 while (m) {
39236c6e 4744 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e 4745 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e 4746
5ba3f43e
A
4747 if (off >= m->m_len) {
4748 off -= m->m_len;
39236c6e
A
4749 m = m->m_next;
4750 } else {
4751 break;
4752 }
4753 }
4754
5ba3f43e
A
4755 VERIFY(m);
4756 VERIFY(off >= 0);
4757 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
39236c6e 4758
5ba3f43e
A
4759 *dsn = m->m_pkthdr.mp_dsn;
4760 *relseq = m->m_pkthdr.mp_rseq;
4761 *data_len = m->m_pkthdr.mp_rlen;
4762 *dss_csum = m->m_pkthdr.mp_csum;
39236c6e 4763
5ba3f43e
A
4764 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
4765 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
4766 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4767}
4768
4769/*
3e170ce0
A
4770 * Note that this is called only from tcp_input() via mptcp_input_preproc()
4771 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4772 * When it trims data tcp_input calls m_adj() which does not remove the
4773 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4774 * The dsn map insertion cannot be delayed after trim, because data can be in
4775 * the reassembly queue for a while and the DSN option info in tp will be
4776 * overwritten for every new packet received.
39236c6e
A
4777 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4778 * with mptcp_adj_rmap()
4779 */
4780void
4781mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4782{
4783 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4784
4785 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4786 VERIFY(m->m_flags & M_PKTHDR);
4787 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4788 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4789 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5ba3f43e 4790 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
39236c6e
A
4791 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4792 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4793 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4794 }
4795}
4796
5ba3f43e
A
4797void
4798mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off)
39236c6e 4799{
5ba3f43e 4800 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
39236c6e
A
4801
4802 if (m_pktlen(m) == 0)
5ba3f43e 4803 return;
39236c6e 4804
5ba3f43e 4805 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
39236c6e
A
4806 m->m_pkthdr.mp_dsn += off;
4807 m->m_pkthdr.mp_rseq += off;
fe8ab488 4808 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
39236c6e 4809 } else {
5ba3f43e
A
4810 if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
4811 /* data arrived without an DSS option mapping */
4812
4813 /* initial subflow can fallback right after SYN handshake */
4814 mptcp_notify_mpfail(so);
4815 }
39236c6e 4816 }
5ba3f43e
A
4817
4818 mpts->mpts_flags |= MPTSF_CONFIRMED;
4819
4820 return;
39236c6e
A
4821}
4822
4823/*
4824 * Following routines help with failure detection and failover of data
4825 * transfer from one subflow to another.
4826 */
4827void
4828mptcp_act_on_txfail(struct socket *so)
4829{
4830 struct tcpcb *tp = NULL;
4831 struct inpcb *inp = sotoinpcb(so);
4832
4833 if (inp == NULL)
4834 return;
4835
4836 tp = intotcpcb(inp);
4837 if (tp == NULL)
4838 return;
4839
5ba3f43e 4840 if (so->so_flags & SOF_MP_TRYFAILOVER)
39236c6e 4841 return;
39236c6e
A
4842
4843 so->so_flags |= SOF_MP_TRYFAILOVER;
4844 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4845}
4846
4847/*
4848 * Support for MP_FAIL option
4849 */
4850int
4851mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4852{
4853 struct mbuf *m = so->so_snd.sb_mb;
4854 u_int64_t dsn;
4855 int off = 0;
4856 u_int32_t datalen;
4857
4858 if (m == NULL)
4859 return (-1);
4860
4861 while (m != NULL) {
4862 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4863 VERIFY(m->m_flags & M_PKTHDR);
4864 dsn = m->m_pkthdr.mp_dsn;
4865 datalen = m->m_pkthdr.mp_rlen;
4866 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4867 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4868 off = dsn_fail - dsn;
4869 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5ba3f43e
A
4870 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
4871 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4872 return (0);
4873 }
4874
4875 m = m->m_next;
4876 }
4877
4878 /*
4879 * If there was no mbuf data and a fallback to TCP occurred, there's
4880 * not much else to do.
4881 */
4882
5ba3f43e
A
4883 mptcplog((LOG_ERR, "MPTCP Sender: "
4884 "%s: %llu not found \n", __func__, dsn_fail),
4885 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
4886 return (-1);
4887}
4888
4889/*
4890 * Support for sending contiguous MPTCP bytes in subflow
4891 * Also for preventing sending data with ACK in 3-way handshake
4892 */
4893int32_t
4894mptcp_adj_sendlen(struct socket *so, int32_t off)
4895{
4896 struct tcpcb *tp = sototcpcb(so);
4897 struct mptsub *mpts = tp->t_mpsub;
4898 uint64_t mdss_dsn;
4899 uint32_t mdss_subflow_seq;
4900 int mdss_subflow_off;
4901 uint16_t mdss_data_len;
4902 uint16_t dss_csum;
4903
4904 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
4905 &mdss_data_len, &dss_csum);
4906
4907 /*
4908 * We need to compute how much of the mapping still remains.
4909 * So, we compute the offset in the send-buffer of the dss-sub-seq.
4910 */
4911 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
4912
4913 /*
4914 * When TFO is used, we are sending the mpts->mpts_iss although the relative
4915 * seq has been set to 1 (while it should be 0).
4916 */
4917 if (tp->t_mpflags & TMPF_TFO_REQUEST)
4918 mdss_subflow_off--;
4919
4920 if (off < mdss_subflow_off)
4921 printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
4922 off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
4923 VERIFY(off >= mdss_subflow_off);
4924
4925 mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
4926 __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
4927 mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4928 return (mdss_data_len - (off - mdss_subflow_off));
4929}
4930
4931static uint32_t
4932mptcp_get_maxseg(struct mptses *mpte)
4933{
4934 struct mptsub *mpts;
4935 uint32_t maxseg = 0;
4936
4937 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4938 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4939
4940 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4941 TCPS_HAVERCVDFIN2(tp->t_state))
4942 continue;
4943
4944 if (tp->t_maxseg > maxseg)
4945 maxseg = tp->t_maxseg;
4946 }
4947
4948 return (maxseg);
4949}
4950
4951static uint8_t
4952mptcp_get_rcvscale(struct mptses *mpte)
4953{
4954 struct mptsub *mpts;
4955 uint8_t rcvscale = UINT8_MAX;
4956
4957 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4958 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4959
4960 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4961 TCPS_HAVERCVDFIN2(tp->t_state))
4962 continue;
4963
4964 if (tp->rcv_scale < rcvscale)
4965 rcvscale = tp->rcv_scale;
4966 }
4967
4968 return (rcvscale);
4969}
4970
4971/* Similar to tcp_sbrcv_reserve */
4972static void
4973mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
4974 u_int32_t newsize, u_int32_t idealsize)
4975{
4976 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
4977
4978 /* newsize should not exceed max */
4979 newsize = min(newsize, tcp_autorcvbuf_max);
4980
4981 /* The receive window scale negotiated at the
4982 * beginning of the connection will also set a
4983 * limit on the socket buffer size
4984 */
4985 newsize = min(newsize, TCP_MAXWIN << rcvscale);
4986
4987 /* Set new socket buffer size */
4988 if (newsize > sbrcv->sb_hiwat &&
4989 (sbreserve(sbrcv, newsize) == 1)) {
4990 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
4991 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
4992
4993 /* Again check the limit set by the advertised
4994 * window scale
4995 */
4996 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
4997 TCP_MAXWIN << rcvscale);
4998 }
4999}
5000
5001void
5002mptcp_sbrcv_grow(struct mptcb *mp_tp)
5003{
5004 struct mptses *mpte = mp_tp->mpt_mpte;
5005 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5006 struct sockbuf *sbrcv = &mp_so->so_rcv;
5007 uint32_t hiwat_sum = 0;
5008 uint32_t ideal_sum = 0;
5009 struct mptsub *mpts;
5010
5011 /*
5012 * Do not grow the receive socket buffer if
5013 * - auto resizing is disabled, globally or on this socket
5014 * - the high water mark already reached the maximum
5015 * - the stream is in background and receive side is being
5016 * throttled
5017 * - if there are segments in reassembly queue indicating loss,
5018 * do not need to increase recv window during recovery as more
5019 * data is not going to be sent. A duplicate ack sent during
5020 * recovery should not change the receive window
5021 */
5022 if (tcp_do_autorcvbuf == 0 ||
5023 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5024 tcp_cansbgrow(sbrcv) == 0 ||
5025 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5026 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5027 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5028 /* Can not resize the socket buffer, just return */
5029 return;
5030 }
5031
5032 /*
5033 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5034 *
5035 * But, for this we first need accurate receiver-RTT estimations, which
5036 * we currently don't have.
5037 *
5038 * Let's use a dummy algorithm for now, just taking the sum of all
5039 * subflow's receive-buffers. It's too low, but that's all we can get
5040 * for now.
5041 */
5042
5043 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5044 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5045 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5046 }
5047
5048 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
39236c6e
A
5049}
5050
5051/*
5ba3f43e
A
5052 * Determine if we can grow the recieve socket buffer to avoid sending
5053 * a zero window update to the peer. We allow even socket buffers that
5054 * have fixed size (set by the application) to grow if the resource
5055 * constraints are met. They will also be trimmed after the application
5056 * reads data.
5057 *
5058 * Similar to tcp_sbrcv_grow_rwin
39236c6e 5059 */
5ba3f43e
A
5060static void
5061mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
39236c6e 5062{
5ba3f43e
A
5063 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5064 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5065 u_int32_t rcvbuf = sb->sb_hiwat;
39236c6e 5066
5ba3f43e
A
5067 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
5068 return;
39236c6e 5069
5ba3f43e
A
5070 if (tcp_do_autorcvbuf == 1 &&
5071 tcp_cansbgrow(sb) &&
5072 /* Diff to tcp_sbrcv_grow_rwin */
5073 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5074 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5075 rcvbuf < tcp_autorcvbuf_max &&
5076 (sb->sb_idealsize > 0 &&
5077 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5078 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
490019cf 5079 }
39236c6e
A
5080}
5081
5ba3f43e 5082/* Similar to tcp_sbspace */
39236c6e 5083int32_t
5ba3f43e 5084mptcp_sbspace(struct mptcb *mp_tp)
39236c6e 5085{
5ba3f43e 5086 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
39236c6e
A
5087 uint32_t rcvbuf;
5088 int32_t space;
5ba3f43e
A
5089 int32_t pending = 0;
5090
5091 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 5092
5ba3f43e 5093 mptcp_sbrcv_grow_rwin(mp_tp, sb);
39236c6e 5094
5ba3f43e 5095 /* hiwat might have changed */
39236c6e 5096 rcvbuf = sb->sb_hiwat;
5ba3f43e
A
5097
5098 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5099 (sb->sb_mbmax - sb->sb_mbcnt)));
39236c6e
A
5100 if (space < 0)
5101 space = 0;
5ba3f43e
A
5102
5103#if CONTENT_FILTER
5104 /* Compensate for data being processed by content filters */
5105 pending = cfil_sock_data_space(sb);
5106#endif /* CONTENT_FILTER */
5107 if (pending > space)
5108 space = 0;
5109 else
5110 space -= pending;
39236c6e
A
5111
5112 return (space);
5113}
5114
5115/*
5116 * Support Fallback to Regular TCP
5117 */
5118void
5119mptcp_notify_mpready(struct socket *so)
5120{
5121 struct tcpcb *tp = NULL;
5122
5123 if (so == NULL)
5124 return;
5125
5126 tp = intotcpcb(sotoinpcb(so));
5127
5128 if (tp == NULL)
5129 return;
5130
5131 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5132 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5133 struct tcpcb *, tp);
5134
5135 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5136 return;
5137
5138 if (tp->t_mpflags & TMPF_MPTCP_READY)
5139 return;
5140
5141 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5142 tp->t_mpflags |= TMPF_MPTCP_READY;
5143
5144 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5145}
5146
5147void
5148mptcp_notify_mpfail(struct socket *so)
5149{
5150 struct tcpcb *tp = NULL;
5151
5152 if (so == NULL)
5153 return;
5154
5155 tp = intotcpcb(sotoinpcb(so));
5156
5157 if (tp == NULL)
5158 return;
5159
5160 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5161 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5162 struct tcpcb *, tp);
5163
5164 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5165 return;
5166
5167 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5168 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5169
5170 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5171}
5172
5173/*
5174 * Keepalive helper function
5175 */
5176boolean_t
5177mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5178{
5179 boolean_t ret = 1;
5ba3f43e
A
5180 mpte_lock_assert_held(mp_tp->mpt_mpte);
5181
39236c6e
A
5182 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5183 ret = 0;
5184 }
39236c6e
A
5185 return (ret);
5186}
5187
5188/*
5189 * MPTCP t_maxseg adjustment function
5190 */
5191int
5192mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5193{
5194 int mss_lower = 0;
5195 struct mptcb *mp_tp = tptomptp(tp);
5196
5197#define MPTCP_COMPUTE_LEN { \
5198 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
39236c6e
A
5199 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5200 mss_lower += 2; \
5201 else \
5202 /* adjust to 32-bit boundary + EOL */ \
5203 mss_lower += 2; \
39236c6e
A
5204}
5205 if (mp_tp == NULL)
5206 return (0);
5207
5ba3f43e
A
5208 mpte_lock_assert_held(mp_tp->mpt_mpte);
5209
39236c6e
A
5210 /*
5211 * For the first subflow and subsequent subflows, adjust mss for
5212 * most common MPTCP option size, for case where tcp_mss is called
5213 * during option processing and MTU discovery.
5214 */
5ba3f43e
A
5215 if (!mtudisc) {
5216 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5217 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5218 MPTCP_COMPUTE_LEN;
5219 }
39236c6e 5220
5ba3f43e
A
5221 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5222 tp->t_mpflags & TMPF_SENT_JOIN) {
5223 MPTCP_COMPUTE_LEN;
5224 }
5225 } else {
5226 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5227 MPTCP_COMPUTE_LEN;
5228 }
39236c6e
A
5229 }
5230
5231 return (mss_lower);
5232}
5233
5234/*
5235 * Update the pid, upid, uuid of the subflow so, based on parent so
5236 */
5237void
5ba3f43e 5238mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
39236c6e 5239{
5ba3f43e
A
5240 if (so->last_pid != mp_so->last_pid ||
5241 so->last_upid != mp_so->last_upid) {
5242 so->last_upid = mp_so->last_upid;
5243 so->last_pid = mp_so->last_pid;
5244 uuid_copy(so->last_uuid, mp_so->last_uuid);
39236c6e 5245 }
5ba3f43e 5246 so_update_policy(so);
39236c6e
A
5247}
5248
5249static void
5250fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5251{
5252 struct inpcb *inp;
5253
5254 tcp_getconninfo(so, &flow->flow_ci);
5255 inp = sotoinpcb(so);
5256#if INET6
5257 if ((inp->inp_vflag & INP_IPV6) != 0) {
5258 flow->flow_src.ss_family = AF_INET6;
5259 flow->flow_dst.ss_family = AF_INET6;
5260 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5261 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5262 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5263 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5264 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5265 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
39037602 5266 } else
39236c6e 5267#endif
3e170ce0 5268 if ((inp->inp_vflag & INP_IPV4) != 0) {
39236c6e
A
5269 flow->flow_src.ss_family = AF_INET;
5270 flow->flow_dst.ss_family = AF_INET;
5271 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5272 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5273 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5274 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5275 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5276 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5277 }
3e170ce0
A
5278 flow->flow_len = sizeof(*flow);
5279 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
39236c6e
A
5280 flow->flow_flags = mpts->mpts_flags;
5281 flow->flow_cid = mpts->mpts_connid;
3e170ce0 5282 flow->flow_relseq = mpts->mpts_rel_seq;
5ba3f43e 5283 flow->flow_soerror = mpts->mpts_socket->so_error;
3e170ce0 5284 flow->flow_probecnt = mpts->mpts_probecnt;
39236c6e
A
5285}
5286
5287static int
5288mptcp_pcblist SYSCTL_HANDLER_ARGS
5289{
5290#pragma unused(oidp, arg1, arg2)
5291 int error = 0, f;
5ba3f43e 5292 size_t len;
39236c6e
A
5293 struct mppcb *mpp;
5294 struct mptses *mpte;
5295 struct mptcb *mp_tp;
5296 struct mptsub *mpts;
5297 struct socket *so;
5298 conninfo_mptcp_t mptcpci;
fe8ab488 5299 mptcp_flow_t *flows = NULL;
39236c6e
A
5300
5301 if (req->newptr != USER_ADDR_NULL)
5302 return (EPERM);
5303
5304 lck_mtx_lock(&mtcbinfo.mppi_lock);
39236c6e 5305 if (req->oldptr == USER_ADDR_NULL) {
5ba3f43e 5306 size_t n = mtcbinfo.mppi_count;
39236c6e 5307 lck_mtx_unlock(&mtcbinfo.mppi_lock);
39037602 5308 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
39236c6e
A
5309 4 * (n + n/8) * sizeof(mptcp_flow_t);
5310 return (0);
5311 }
5312 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 5313 flows = NULL;
5ba3f43e 5314 mpp_lock(mpp);
39236c6e
A
5315 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5316 mpte = mptompte(mpp);
5317 VERIFY(mpte != NULL);
5ba3f43e 5318 mpte_lock_assert_held(mpte);
39236c6e
A
5319 mp_tp = mpte->mpte_mptcb;
5320 VERIFY(mp_tp != NULL);
3e170ce0
A
5321
5322 bzero(&mptcpci, sizeof(mptcpci));
39236c6e 5323 mptcpci.mptcpci_state = mp_tp->mpt_state;
3e170ce0
A
5324 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5325 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5326 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5327 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5328 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5329 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5330 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5331 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5332 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5333 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5ba3f43e 5334 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
3e170ce0
A
5335 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5336 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
3e170ce0 5337
39236c6e 5338 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
3e170ce0
A
5339 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5340 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5341 mptcpci.mptcpci_flow_offset =
5342 offsetof(conninfo_mptcp_t, mptcpci_flows);
5343
fe8ab488
A
5344 len = sizeof(*flows) * mpte->mpte_numflows;
5345 if (mpte->mpte_numflows != 0) {
5346 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5347 if (flows == NULL) {
5ba3f43e 5348 mpp_unlock(mpp);
fe8ab488
A
5349 break;
5350 }
5351 mptcpci.mptcpci_len = sizeof(mptcpci) +
5352 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5353 error = SYSCTL_OUT(req, &mptcpci,
5354 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5355 } else {
5356 mptcpci.mptcpci_len = sizeof(mptcpci);
3e170ce0 5357 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
39037602 5358 }
39236c6e 5359 if (error) {
5ba3f43e 5360 mpp_unlock(mpp);
39236c6e
A
5361 FREE(flows, M_TEMP);
5362 break;
5363 }
5364 f = 0;
5365 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
39236c6e 5366 so = mpts->mpts_socket;
39236c6e 5367 fill_mptcp_subflow(so, &flows[f], mpts);
39236c6e
A
5368 f++;
5369 }
5ba3f43e 5370 mpp_unlock(mpp);
fe8ab488
A
5371 if (flows) {
5372 error = SYSCTL_OUT(req, flows, len);
5373 FREE(flows, M_TEMP);
5374 if (error)
5375 break;
5376 }
39236c6e
A
5377 }
5378 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5379
5380 return (error);
5381}
5382
5383SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
39037602 5384 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
39236c6e 5385 "List of active MPTCP connections");
fe8ab488 5386
fe8ab488
A
5387/*
5388 * Set notsent lowat mark on the MPTCB
5389 */
5390int
5391mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5392{
5393 struct mptcb *mp_tp = NULL;
5394 int error = 0;
5395
5396 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5397 mp_tp = mpte->mpte_mptcb;
5398
5399 if (mp_tp)
5400 mp_tp->mpt_notsent_lowat = optval;
5401 else
5402 error = EINVAL;
5403
5ba3f43e 5404 return (error);
fe8ab488
A
5405}
5406
5407u_int32_t
5408mptcp_get_notsent_lowat(struct mptses *mpte)
5409{
5410 struct mptcb *mp_tp = NULL;
5411
5412 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5413 mp_tp = mpte->mpte_mptcb;
5414
5415 if (mp_tp)
5ba3f43e 5416 return (mp_tp->mpt_notsent_lowat);
fe8ab488 5417 else
5ba3f43e 5418 return (0);
fe8ab488
A
5419}
5420
39037602 5421int
5ba3f43e
A
5422mptcp_notsent_lowat_check(struct socket *so)
5423{
fe8ab488
A
5424 struct mptses *mpte;
5425 struct mppcb *mpp;
5426 struct mptcb *mp_tp;
5427 struct mptsub *mpts;
5428
5429 int notsent = 0;
5430
5ba3f43e 5431 mpp = mpsotomppcb(so);
fe8ab488
A
5432 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5433 return (0);
5434 }
5435
5436 mpte = mptompte(mpp);
5ba3f43e 5437 mpte_lock_assert_held(mpte);
fe8ab488
A
5438 mp_tp = mpte->mpte_mptcb;
5439
fe8ab488
A
5440 notsent = so->so_snd.sb_cc;
5441
5442 if ((notsent == 0) ||
5443 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5444 mp_tp->mpt_notsent_lowat)) {
3e170ce0
A
5445 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5446 "lowat %d notsent %d actual %d \n",
5447 mp_tp->mpt_notsent_lowat, notsent,
5448 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5449 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
fe8ab488
A
5450 return (1);
5451 }
fe8ab488
A
5452
5453 /* When Nagle's algorithm is not disabled, it is better
5454 * to wakeup the client even before there is atleast one
5455 * maxseg of data to write.
5456 */
5457 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5458 int retval = 0;
fe8ab488
A
5459 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5460 struct socket *subf_so = mpts->mpts_socket;
fe8ab488 5461 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
39037602 5462
fe8ab488
A
5463 notsent = so->so_snd.sb_cc -
5464 (tp->snd_nxt - tp->snd_una);
39037602 5465
fe8ab488
A
5466 if ((tp->t_flags & TF_NODELAY) == 0 &&
5467 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5468 retval = 1;
5469 }
3e170ce0 5470 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
fe8ab488 5471 " nodelay false \n",
3e170ce0
A
5472 mp_tp->mpt_notsent_lowat, notsent),
5473 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
fe8ab488
A
5474 return (retval);
5475 }
fe8ab488
A
5476 }
5477 return (0);
5478}
5479
3e170ce0
A
5480/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5481static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5482static uint32_t mptcp_kern_skt_inuse = 0;
5ba3f43e 5483static uint32_t mptcp_kern_skt_unit;
3e170ce0
A
5484symptoms_advisory_t mptcp_advisory;
5485
5486static errno_t
5487mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5488 void **unitinfo)
5489{
5490#pragma unused(kctlref, sac, unitinfo)
5ba3f43e
A
5491
5492 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
5493 mptcplog((LOG_ERR, "%s MPTCP kernel-control socket already open!", __func__),
5494 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5495
5496 mptcp_kern_skt_unit = sac->sc_unit;
5497
5498 return (0);
5499}
5500
5501static void
5502mptcp_allow_uuid(uuid_t uuid)
5503{
5504 struct mppcb *mpp;
5505
5506 /* Iterate over all MPTCP connections */
5507
5508 lck_mtx_lock(&mtcbinfo.mppi_lock);
5509
5510 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5511 struct mptses *mpte;
5512 struct socket *mp_so;
5513
5514 mpp_lock(mpp);
5515
5516 mpte = mpp->mpp_pcbe;
5517 mp_so = mpp->mpp_socket;
5518
5519 if (mp_so->so_flags & SOF_DELEGATED &&
5520 uuid_compare(uuid, mp_so->e_uuid))
5521 goto next;
5522 else if (!(mp_so->so_flags & SOF_DELEGATED) &&
5523 uuid_compare(uuid, mp_so->last_uuid))
5524 goto next;
5525
5526 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
5527
5528 mptcp_check_subflows_and_add(mpte);
5529 mptcp_remove_subflows(mpte);
5530
5531 mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;
5532
5533next:
5534 mpp_unlock(mpp);
5535 }
5536
5537 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5538}
5539
5540static void
5541mptcp_wifi_status_changed(void)
5542{
5543 struct mppcb *mpp;
5544
5545 /* Iterate over all MPTCP connections */
5546
5547 lck_mtx_lock(&mtcbinfo.mppi_lock);
5548
5549 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5550 struct mptses *mpte;
5551 struct socket *mp_so;
5552
5553 mpp_lock(mpp);
5554
5555 mpte = mpp->mpp_pcbe;
5556 mp_so = mpp->mpp_socket;
5557
5558 /* Only handover-mode is purely driven by Symptom's Wi-Fi status */
5559 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
5560 goto next;
5561
5562 mptcp_check_subflows_and_add(mpte);
5563 mptcp_check_subflows_and_remove(mpte);
5564
5565next:
5566 mpp_unlock(mpp);
5567 }
5568
5569 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5570}
5571
5572void
5573mptcp_ask_symptoms(struct mptses *mpte)
5574{
5575 struct mptcp_symptoms_ask_uuid ask;
5576 struct socket *mp_so;
5577 struct proc *p;
5578 int pid, prio, err;
5579
5580 if (mptcp_kern_skt_unit == 0) {
5581 mptcplog((LOG_ERR, "%s skt_unit is still 0\n", __func__),
5582 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5583 return;
5584 }
5585
5586 mp_so = mptetoso(mpte);
5587
5588 if (mp_so->so_flags & SOF_DELEGATED)
5589 pid = mp_so->e_pid;
5590 else
5591 pid = mp_so->last_pid;
5592
5593 p = proc_find(pid);
5594 if (p == PROC_NULL) {
5595 mptcplog((LOG_ERR, "%s Couldn't find proc for pid %u\n", __func__,
5596 pid), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5597 return;
5598 }
5599
5600 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
5601
5602 if (mp_so->so_flags & SOF_DELEGATED)
5603 uuid_copy(ask.uuid, mp_so->e_uuid);
5604 else
5605 uuid_copy(ask.uuid, mp_so->last_uuid);
5606
5607 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
5608
5609 if (prio == TASK_BACKGROUND_APPLICATION)
5610 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
5611 else if (prio == TASK_FOREGROUND_APPLICATION)
5612 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
3e170ce0 5613 else
5ba3f43e
A
5614 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
5615
5616 mptcplog((LOG_DEBUG, "%s ask symptoms about pid %u, prio %u\n", __func__,
5617 pid, ask.priority), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5618
5619 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
5620 &ask, sizeof(ask), CTL_DATA_EOR);
5621 if (err)
5622 mptcplog((LOG_ERR, "%s ctl_enqueuedata failed %d\n", __func__, err),
5623 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5624
5625 proc_rele(p);
3e170ce0
A
5626}
5627
5628static errno_t
5629mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5630 void *unitinfo)
5631{
5632#pragma unused(kctlref, kcunit, unitinfo)
5ba3f43e
A
5633
5634 OSDecrementAtomic(&mptcp_kern_skt_inuse);
5635
5636 return (0);
3e170ce0
A
5637}
5638
5639static errno_t
5640mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5641 mbuf_t m, int flags)
5642{
5ba3f43e 5643#pragma unused(kctlref, unitinfo, flags)
3e170ce0
A
5644 symptoms_advisory_t *sa = NULL;
5645
5ba3f43e
A
5646 if (kcunit != mptcp_kern_skt_unit)
5647 mptcplog((LOG_ERR, "%s kcunit %u is different from expected one %u\n",
5648 __func__, kcunit, mptcp_kern_skt_unit),
5649 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5650
3e170ce0
A
5651 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5652 mbuf_freem(m);
5653 return (EINVAL);
5654 }
5655
5656 if (mbuf_len(m) >= sizeof(*sa))
5657 sa = mbuf_data(m);
5658 else
5659 return (EINVAL);
5660
5ba3f43e
A
5661 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
5662 sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
5663 uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
3e170ce0 5664
5ba3f43e
A
5665 mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
5666 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
5667 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3e170ce0
A
5668
5669 if ((sa->sa_wifi_status &
5670 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5ba3f43e 5671 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
3e170ce0 5672 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
3e170ce0 5673
5ba3f43e
A
5674 if (old_wifi_status != mptcp_advisory.sa_wifi_status)
5675 mptcp_wifi_status_changed();
5676 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
5677 mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
5678 mptcp_advisory.sa_wifi_status),
5679 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5680 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
5681 uuid_t uuid;
5682
5683 mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
5684 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5685
5686 uuid_copy(uuid, (unsigned char *)(sa + 1));
5687
5688 mptcp_allow_uuid(uuid);
3e170ce0 5689 }
5ba3f43e 5690
3e170ce0
A
5691 return (0);
5692}
5693
5694void
5695mptcp_control_register(void)
5696{
5697 /* Set up the advisory control socket */
5698 struct kern_ctl_reg mptcp_kern_ctl;
5699
5700 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5701 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5702 sizeof(mptcp_kern_ctl.ctl_name));
5703 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5704 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5705 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5706 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5707
5708 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5709}
5710
5711int
5712mptcp_is_wifi_unusable(void)
5713{
5714 /* a false return val indicates there is no info or wifi is ok */
5715 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5716}
5717
490019cf
A
5718/* If TFO data is succesfully acked, it must be dropped from the mptcp so */
5719static void
5ba3f43e 5720mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
490019cf 5721{
5ba3f43e 5722 struct socket *mp_so = mptetoso(mpte);
490019cf
A
5723 struct socket *so = mpts->mpts_socket;
5724 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5725 struct mptcb *mp_tp = mpte->mpte_mptcb;
5726
5727 /* If data was sent with SYN, rewind state */
5728 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5ba3f43e 5729 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
490019cf 5730 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
5ba3f43e 5731
490019cf
A
5732 VERIFY(mp_droplen <= (UINT_MAX));
5733 VERIFY(mp_droplen >= tcp_droplen);
5734
5ba3f43e
A
5735 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5736 mpts->mpts_iss += tcp_droplen;
5737 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5738
490019cf
A
5739 if (mp_droplen > tcp_droplen) {
5740 /* handle partial TCP ack */
5741 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5742 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
490019cf
A
5743 mp_droplen = tcp_droplen;
5744 } else {
5745 /* all data on SYN was acked */
5746 mpts->mpts_rel_seq = 1;
5747 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
490019cf
A
5748 }
5749 mp_tp->mpt_sndmax -= tcp_droplen;
5750
490019cf
A
5751 if (mp_droplen != 0) {
5752 VERIFY(mp_so->so_snd.sb_mb != NULL);
5753 sbdrop(&mp_so->so_snd, (int)mp_droplen);
5754 }
5ba3f43e
A
5755 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
5756 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5757 mpts->mpts_connid, tcp_droplen, mp_droplen),
5758 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5759 }
5760}
5761
5762int
5763mptcp_freeq(struct mptcb *mp_tp)
5764{
5765 struct tseg_qent *q;
5766 int rv = 0;
5767
5768 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
5769 LIST_REMOVE(q, tqe_q);
5770 m_freem(q->tqe_m);
5771 zfree(tcp_reass_zone, q);
5772 rv = 1;
5773 }
5774 mp_tp->mpt_reassqlen = 0;
5775 return (rv);
5776}
5777
5778static int
5779mptcp_post_event(u_int32_t event_code, int value)
5780{
5781 struct kev_mptcp_data event_data;
5782 struct kev_msg ev_msg;
5783
5784 memset(&ev_msg, 0, sizeof(ev_msg));
5785
5786 ev_msg.vendor_code = KEV_VENDOR_APPLE;
5787 ev_msg.kev_class = KEV_NETWORK_CLASS;
5788 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
5789 ev_msg.event_code = event_code;
5790
5791 event_data.value = value;
5792
5793 ev_msg.dv[0].data_ptr = &event_data;
5794 ev_msg.dv[0].data_length = sizeof(event_data);
5795
5796 return kev_post_msg(&ev_msg);
5797}
5798
5799void
5800mptcp_set_cellicon(struct mptses *mpte)
5801{
5802 int error;
5803
5804 /* First-party apps (Siri) don't flip the cellicon */
5805 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
5806 return;
5807
5808 /* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
5809 mptcp_last_cellicon_set = tcp_now;
5810
5811 /* If cellicon is already set, get out of here! */
5812 if (OSTestAndSet(7, &mptcp_cellicon_is_set))
5813 return;
5814
5815 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
5816
5817 if (error)
5818 mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
5819 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5820 else
5821 mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
5822 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5823}
5824
5825void
5826mptcp_unset_cellicon(void)
5827{
5828 int error;
5829
5830 /* If cellicon is already unset, get out of here! */
5831 if (OSTestAndClear(7, &mptcp_cellicon_is_set))
5832 return;
5833
5834 /*
5835 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
5836 * explicitly set the cellicon (see mptcp_set_cellicon()), then we unset
5837 * it again.
5838 */
5839 if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
5840 tcp_now)) {
5841 OSTestAndSet(7, &mptcp_cellicon_is_set);
5842 return;
490019cf 5843 }
5ba3f43e
A
5844
5845 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
5846
5847 if (error)
5848 mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
5849 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5850 else
5851 mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
5852 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5853}
5854
5855void
5856mptcp_reset_rexmit_state(struct tcpcb *tp)
5857{
5858 struct mptsub *mpts;
5859 struct inpcb *inp;
5860 struct socket *so;
5861
5862 inp = tp->t_inpcb;
5863 if (inp == NULL)
5864 return;
5865
5866 so = inp->inp_socket;
5867 if (so == NULL)
5868 return;
5869
5870 if (!(so->so_flags & SOF_MP_SUBFLOW))
5871 return;
5872
5873 mpts = tp->t_mpsub;
5874
5875 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
5876 so->so_flags &= ~SOF_MP_TRYFAILOVER;
5877}
5878
5879void
5880mptcp_reset_keepalive(struct tcpcb *tp)
5881{
5882 struct mptsub *mpts = tp->t_mpsub;
5883
5884 mpts->mpts_flags &= ~MPTSF_READ_STALL;
490019cf 5885}
5ba3f43e 5886