[apple/xnu.git] / bsd / netinet / mptcp_subr.c (xnu-4570.51.1)
39236c6e 1/*
5ba3f43e 2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <kern/locks.h>
30#include <kern/policy_internal.h>
31#include <kern/zalloc.h>
32
33#include <mach/sdt.h>
34
35#include <sys/domain.h>
36#include <sys/kdebug.h>
37#include <sys/kern_control.h>
38#include <sys/kernel.h>
39#include <sys/mbuf.h>
40#include <sys/mcache.h>
41#include <sys/param.h>
42#include <sys/proc.h>
43#include <sys/protosw.h>
44#include <sys/resourcevar.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
39236c6e 47#include <sys/sysctl.h>
48#include <sys/syslog.h>
49#include <sys/systm.h>
39236c6e 50
5ba3f43e 51#include <net/content_filter.h>
39236c6e 52#include <net/if.h>
3e170ce0 53#include <net/if_var.h>
54#include <netinet/in.h>
55#include <netinet/in_pcb.h>
56#include <netinet/in_var.h>
57#include <netinet/tcp.h>
58#include <netinet/tcp_fsm.h>
59#include <netinet/tcp_seq.h>
60#include <netinet/tcp_var.h>
61#include <netinet/mptcp_var.h>
62#include <netinet/mptcp.h>
5ba3f43e 63#include <netinet/mptcp_opt.h>
64#include <netinet/mptcp_seq.h>
65#include <netinet/mptcp_timer.h>
66#include <libkern/crypto/sha1.h>
67#if INET6
68#include <netinet6/in6_pcb.h>
69#include <netinet6/ip6protosw.h>
70#endif /* INET6 */
71#include <dev/random/randomdev.h>
72
73/*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
5ba3f43e 96 * subflow. This gets decremented prior to the subflow's destruction.
39236c6e 97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
39236c6e 100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to use the lock of
104 * the MPTCP socket instead.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
5ba3f43e 109 * of the subflows have been destroyed.
110 */
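/*
 * Illustrative sketch (not part of the original file): from user space these
 * code paths are reached through the PF_MULTIPATH domain and connectx(2),
 * roughly as follows; the address values are examples only.
 *
 *	struct sockaddr_in dst = { .sin_len = sizeof (dst), .sin_family = AF_INET };
 *	sa_endpoints_t sae = {
 *		.sae_dstaddr = (struct sockaddr *)&dst,
 *		.sae_dstaddrlen = sizeof (dst),
 *	};
 *	sae_connid_t cid;
 *	int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	connectx(fd, &sae, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid);
 *
 * Every subflow the kernel then brings up appears below as an mptsub hanging
 * off the session's mptses.
 */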
111
fe8ab488 112static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
39236c6e 113static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
114
115static uint32_t mptcp_gc(struct mppcbinfo *);
116static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
118static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
120static void mptcp_subflow_rupcall(struct socket *, void *, int);
121static void mptcp_subflow_input(struct mptses *, struct mptsub *);
122static void mptcp_subflow_wupcall(struct socket *, void *, int);
123static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
124static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
125static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
126
127static void mptcp_subflow_abort(struct mptsub *, int);
128
129static void mptcp_send_dfin(struct socket *so);
130
131/*
132 * Possible return values for subflow event handlers. Note that success
133 * values must be greater than or equal to MPTS_EVRET_OK. Values less than that
134 * indicate errors or actions which require immediate attention; they will
135 * prevent the rest of the handlers from processing their respective events
136 * until the next round of events processing.
137 */
138typedef enum {
139 MPTS_EVRET_DELETE = 1, /* delete this subflow */
140 MPTS_EVRET_OK = 2, /* OK */
141 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
142 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
143} ev_ret_t;
144
3e170ce0 145static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
146static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
147static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
148static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
149static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
150static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
151static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
152static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
153static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
154static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
155static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
156static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
fe8ab488 157
158static const char *mptcp_evret2str(ev_ret_t);
159
160static void mptcp_do_sha1(mptcp_key_t *, char *);
161static void mptcp_init_local_parms(struct mptses *);
162
163static unsigned int mptsub_zone_size; /* size of mptsub */
164static struct zone *mptsub_zone; /* zone for mptsub */
165
166static unsigned int mptopt_zone_size; /* size of mptopt */
167static struct zone *mptopt_zone; /* zone for mptopt */
168
169static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
170static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
171
172struct mppcbinfo mtcbinfo;
173
174#define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
175#define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
176
177SYSCTL_DECL(_net_inet);
178
179SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
180
5ba3f43e 181uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
182SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
183 &mptcp_dbg_area, 0, "MPTCP debug area");
184
5ba3f43e 185uint32_t mptcp_dbg_level = 1;
186SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
187 &mptcp_dbg_level, 0, "MPTCP debug level");
188
189SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
190 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
191
192
193static int mptcp_alternate_port = 0;
194SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
195 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
196
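/*
 * Usage note (illustration, not from the original sources): the knob above is
 * exposed as net.inet.mptcp.alternate_port, e.g.
 *
 *	sysctl net.inet.mptcp.alternate_port=8443
 *
 * Its value is picked up in mptcp_sescreate() further down and stored in
 * mpte_alternate_port in network byte order.
 */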
197static struct protosw mptcp_subflow_protosw;
198static struct pr_usrreqs mptcp_subflow_usrreqs;
199#if INET6
200static struct ip6protosw mptcp_subflow_protosw6;
201static struct pr_usrreqs mptcp_subflow_usrreqs6;
202#endif /* INET6 */
203
204static uint8_t mptcp_create_subflows_scheduled;
205
206typedef struct mptcp_subflow_event_entry {
207 uint64_t sofilt_hint_mask;
208 ev_ret_t (*sofilt_hint_ev_hdlr)(
209 struct mptses *mpte,
210 struct mptsub *mpts,
211 uint64_t *p_mpsofilt_hint,
212 uint64_t event);
213} mptsub_ev_entry_t;
214
215static uint8_t mptcp_cellicon_is_set;
216static uint32_t mptcp_last_cellicon_set;
217#define MPTCP_CELLICON_TOGGLE_RATE (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
218
219/*
220 * XXX The order of the event handlers below is really
5ba3f43e 221 * really important. Think twice before changing it.
490019cf 222 */
223static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
224 {
225 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
226 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
227 },
228 {
229 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
230 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
231 },
232 {
233 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
5ba3f43e 234 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
235 },
236 {
237 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
238 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
239 },
240 {
241 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
5ba3f43e 242 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
243 },
244 {
245 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
5ba3f43e 246 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
247 },
248 {
249 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
250 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
251 },
252 {
253 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
254 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
255 },
256 {
257 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
258 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
259 },
260 {
261 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
262 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
263 },
264 {
265 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
266 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
267 },
268 {
269 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
270 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
271 },
272 {
273 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
274 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
275 },
276};
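/*
 * Rough sketch of how the table above is walked (see mptcp_subflow_events(),
 * defined later in this file); this is a simplified illustration, not a copy
 * of that function:
 *
 *	for (i = 0; i < sizeof (mpsub_ev_entry_tbl) / sizeof (mpsub_ev_entry_tbl[0]); i++) {
 *		if (events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) {
 *			ret = mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts,
 *			    p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
 *			...
 *		}
 *	}
 *
 * Handlers therefore run in table order, which is why the comment above asks
 * you to think twice before reordering the entries.
 */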
277
278os_log_t mptcp_log_handle;
279
280/*
281 * Protocol pr_init callback.
282 */
283void
284mptcp_init(struct protosw *pp, struct domain *dp)
285{
286#pragma unused(dp)
287 static int mptcp_initialized = 0;
288 struct protosw *prp;
289#if INET6
290 struct ip6protosw *prp6;
291#endif /* INET6 */
292
293 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
294
295 /* do this only once */
296 if (mptcp_initialized)
297 return;
298 mptcp_initialized = 1;
299
300 /*
301 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
302 * we must be able to find IPPROTO_TCP entries for both.
303 */
304 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
305 VERIFY(prp != NULL);
306 bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
307 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
308 sizeof (mptcp_subflow_usrreqs));
309 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
310 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
311 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
312 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
5ba3f43e 313 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
314 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
315 /*
316 * Socket filters shouldn't attach/detach to/from this protosw
317 * since pr_protosw is to be used instead, which points to the
318 * real protocol; if they do, it is a bug and we should panic.
319 */
320 mptcp_subflow_protosw.pr_filter_head.tqh_first =
321 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
322 mptcp_subflow_protosw.pr_filter_head.tqh_last =
323 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
324
325#if INET6
326 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
327 IPPROTO_TCP, SOCK_STREAM);
328 VERIFY(prp6 != NULL);
329 bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
330 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
331 sizeof (mptcp_subflow_usrreqs6));
332 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
333 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
334 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
335 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
5ba3f43e 336 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
337 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
338 /*
339 * Socket filters shouldn't attach/detach to/from this protosw
340 * since pr_protosw is to be used instead, which points to the
341 * real protocol; if they do, it is a bug and we should panic.
342 */
343 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
344 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
345 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
346 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
347#endif /* INET6 */
348
349 bzero(&mtcbinfo, sizeof (mtcbinfo));
350 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
351 mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
352 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
353 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
354 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
355 /* NOTREACHED */
356 }
357 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
358 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
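/*
 * For readers unfamiliar with the zone allocator: zinit(size, max, alloc, name)
 * creates a zone of fixed-size elements of "size" bytes, with "max" bytes as
 * the nominal upper bound and "alloc" bytes grabbed at a time when the zone
 * grows; Z_EXPAND permits that growth and Z_CALLERACCT controls whether the
 * allocations are accounted to the caller.  The same pattern repeats below for
 * the mptsub, mptopt and subflow-auth zones.
 */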
359
360 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
361 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
362 mtcbinfo.mppi_lock_grp_attr);
363 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
364 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
365 mtcbinfo.mppi_lock_attr);
39236c6e 366
3e170ce0 367 mtcbinfo.mppi_gc = mptcp_gc;
368 mtcbinfo.mppi_timer = mptcp_timer;
369
370 /* attach to MP domain for garbage collection to take place */
371 mp_pcbinfo_attach(&mtcbinfo);
372
373 mptsub_zone_size = sizeof (struct mptsub);
374 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
375 8192, "mptsub")) == NULL) {
376 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
377 /* NOTREACHED */
378 }
379 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
380 zone_change(mptsub_zone, Z_EXPAND, TRUE);
381
382 mptopt_zone_size = sizeof (struct mptopt);
383 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
384 1024, "mptopt")) == NULL) {
385 panic("%s: unable to allocate MPTCP option zone\n", __func__);
386 /* NOTREACHED */
387 }
388 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
389 zone_change(mptopt_zone, Z_EXPAND, TRUE);
390
391 mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
392 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
393 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
394 panic("%s: unable to allocate MPTCP address auth zone\n",
395 __func__);
396 /* NOTREACHED */
397 }
398 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
399 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
400
5ba3f43e 401 mptcp_last_cellicon_set = tcp_now;
402
403 mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
404}
405
406int
407mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
408{
409 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
410
411 int i, index = -1;
412
413 if (ifp == NULL) {
414 mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
415 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
416 return (-1);
417 }
418
419 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
420 if (stats[i].ifindex == IFSCOPE_NONE) {
421 if (index < 0)
422 index = i;
423 continue;
424 }
425
426 if (stats[i].ifindex == ifp->if_index) {
427 index = i;
428 return (index);
429 }
430 }
431
432 if (index != -1) {
433 stats[index].ifindex = ifp->if_index;
434 if (stats[index].is_expensive == 0)
435 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
436 }
437
438 return (index);
439}
440
441void
442mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
443{
444 int index;
445
446 tcpstat.tcps_mp_switches++;
447 mpte->mpte_subflow_switches++;
448
449 index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
450
451 if (index != -1)
452 mpte->mpte_itfstats[index].switches++;
453}
454
455/*
456 * Flushes all recorded socket options from an MP socket.
457 */
458static void
459mptcp_flush_sopts(struct mptses *mpte)
460{
461 struct mptopt *mpo, *tmpo;
462
463 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
464 mptcp_sopt_remove(mpte, mpo);
465 mptcp_sopt_free(mpo);
466 }
467 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
468}
469
470/*
471 * Create an MPTCP session, called as a result of opening an MPTCP socket.
472 */
473int
474mptcp_sescreate(struct mppcb *mpp)
475{
476 struct mppcbinfo *mppi;
477 struct mptses *mpte;
478 struct mptcb *mp_tp;
479
480 VERIFY(mpp != NULL);
481 mppi = mpp->mpp_pcbinfo;
482 VERIFY(mppi != NULL);
483
484 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
485 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
486
487 /* MPTCP Multipath PCB Extension */
488 bzero(mpte, sizeof (*mpte));
489 VERIFY(mpp->mpp_pcbe == NULL);
490 mpp->mpp_pcbe = mpte;
491 mpte->mpte_mppcb = mpp;
492 mpte->mpte_mptcb = mp_tp;
493
494 TAILQ_INIT(&mpte->mpte_sopts);
495 TAILQ_INIT(&mpte->mpte_subflows);
496 mpte->mpte_associd = SAE_ASSOCID_ANY;
497 mpte->mpte_connid_last = SAE_CONNID_ANY;
39236c6e 498
499 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
500 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
39236c6e 501
502 if (mptcp_alternate_port)
503 mpte->mpte_alternate_port = htons(mptcp_alternate_port);
504
505 /* MPTCP Protocol Control Block */
506 bzero(mp_tp, sizeof (*mp_tp));
39236c6e 507 mp_tp->mpt_mpte = mpte;
3e170ce0 508 mp_tp->mpt_state = MPTCPS_CLOSED;
39236c6e 509
510 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
511
512 return (0);
513}
514
515static void
516mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
517 uint64_t *cellbytes, uint64_t *allbytes)
518{
519 int64_t mycellbytes = 0;
520 uint64_t myallbytes = 0;
521 int i;
522
523 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
524 if (mpte->mpte_itfstats[i].is_expensive) {
525 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
526 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
527 }
528
529 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
530 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
531 }
532
533 if (initial_cell) {
534 mycellbytes -= mpte->mpte_init_txbytes;
535 mycellbytes -= mpte->mpte_init_rxbytes;
536 }
537
538 if (mycellbytes < 0) {
539 mptcplog((LOG_ERR, "%s cellbytes is %lld\n", __func__, mycellbytes),
540 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
541 *cellbytes = 0;
542 *allbytes = 0;
543 } else {
544 *cellbytes = mycellbytes;
545 *allbytes = myallbytes;
546 }
547}
548
549static void
550mptcpstats_session_wrapup(struct mptses *mpte)
551{
552 boolean_t cell = mpte->mpte_initial_cell;
553
554 switch (mpte->mpte_svctype) {
555 case MPTCP_SVCTYPE_HANDOVER:
556 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
557 tcpstat.tcps_mptcp_fp_handover_attempt++;
558
559 if (cell && mpte->mpte_handshake_success) {
560 tcpstat.tcps_mptcp_fp_handover_success_cell++;
561
562 if (mpte->mpte_used_wifi)
563 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
564 } else if (mpte->mpte_handshake_success) {
565 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
566
567 if (mpte->mpte_used_cell)
568 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
569 }
570 } else {
571 tcpstat.tcps_mptcp_handover_attempt++;
572
573 if (cell && mpte->mpte_handshake_success) {
574 tcpstat.tcps_mptcp_handover_success_cell++;
575
576 if (mpte->mpte_used_wifi)
577 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
578 } else if (mpte->mpte_handshake_success) {
579 tcpstat.tcps_mptcp_handover_success_wifi++;
580
581 if (mpte->mpte_used_cell)
582 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
583 }
584 }
585
586 if (mpte->mpte_handshake_success) {
587 uint64_t cellbytes;
588 uint64_t allbytes;
589
590 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
591
592 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
593 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
594 }
595 break;
596 case MPTCP_SVCTYPE_INTERACTIVE:
597 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
598 tcpstat.tcps_mptcp_fp_interactive_attempt++;
599
600 if (mpte->mpte_handshake_success) {
601 tcpstat.tcps_mptcp_fp_interactive_success++;
602
603 if (!cell && mpte->mpte_used_cell)
604 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
605 }
606 } else {
607 tcpstat.tcps_mptcp_interactive_attempt++;
608
609 if (mpte->mpte_handshake_success) {
610 tcpstat.tcps_mptcp_interactive_success++;
611
612 if (!cell && mpte->mpte_used_cell)
613 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
614 }
615 }
616
617 if (mpte->mpte_handshake_success) {
618 uint64_t cellbytes;
619 uint64_t allbytes;
620
621 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
622
623 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
624 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
625 }
626 break;
627 case MPTCP_SVCTYPE_AGGREGATE:
628 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
629 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
630
631 if (mpte->mpte_handshake_success)
632 tcpstat.tcps_mptcp_fp_aggregate_success++;
633 } else {
634 tcpstat.tcps_mptcp_aggregate_attempt++;
635
636 if (mpte->mpte_handshake_success) {
637 tcpstat.tcps_mptcp_aggregate_success++;
638 }
639 }
640
641 if (mpte->mpte_handshake_success) {
642 uint64_t cellbytes;
643 uint64_t allbytes;
644
645 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
646
647 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
648 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
649 }
650 break;
651 }
652
653 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
654 tcpstat.tcps_mptcp_back_to_wifi++;
655}
656
657/*
658 * Destroy an MPTCP session.
659 */
660static void
5ba3f43e 661mptcp_session_destroy(struct mptses *mpte)
662{
663 struct mptcb *mp_tp;
664
5ba3f43e 665 mpte_lock_assert_held(mpte); /* same as MP socket lock */
666
667 mp_tp = mpte->mpte_mptcb;
668 VERIFY(mp_tp != NULL);
669
670 mptcpstats_session_wrapup(mpte);
671
672 mptcp_unset_cellicon();
673
674 /*
675 * MPTCP Multipath PCB Extension section
676 */
677 mptcp_flush_sopts(mpte);
678 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
679
680 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
681 _FREE(mpte->mpte_itfinfo, M_TEMP);
682
683 mpte->mpte_itfinfo = NULL;
684
685 m_freem_list(mpte->mpte_reinjectq);
686
687 /*
688 * MPTCP Protocol Control Block section
689 */
690 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
691 struct mptcb *, mp_tp);
692}
693
694static boolean_t
695mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
39236c6e 696{
697 return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
698 mp_tp->mpt_state < MPTCPS_TIME_WAIT &&
699 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
700}
39236c6e 701
702static int
703mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
704{
705 static const struct in6_addr well_known_prefix = {
706 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
707 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
708 0x00, 0x00, 0x00, 0x00},
709 };
710 char buf[MAX_IPv6_STR_LEN];
711 char *ptrv4 = (char *)addrv4;
712 char *ptr = (char *)addr;
713
714 if (IN_ZERONET(addrv4->s_addr) || // 0.0.0.0/8 Source hosts on local network
715 IN_LOOPBACK(addrv4->s_addr) || // 127.0.0.0/8 Loopback
716 IN_LINKLOCAL(addrv4->s_addr) || // 169.254.0.0/16 Link Local
717 IN_DS_LITE(addrv4->s_addr) || // 192.0.0.0/29 DS-Lite
718 IN_6TO4_RELAY_ANYCAST(addrv4->s_addr) || // 192.88.99.0/24 6to4 Relay Anycast
719 IN_MULTICAST(addrv4->s_addr) || // 224.0.0.0/4 Multicast
720 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
721 return (-1);
722 }
723
724 /* Check for the well-known prefix */
725 if (len == NAT64_PREFIX_LEN_96 &&
726 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
727 if (IN_PRIVATE(addrv4->s_addr) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
728 IN_SHARED_ADDRESS_SPACE(addrv4->s_addr)) // 100.64.0.0/10 Shared Address Space
729 return (-1);
730 }
39236c6e 731
732 switch (len) {
733 case NAT64_PREFIX_LEN_96:
734 memcpy(ptr + 12, ptrv4, 4);
735 break;
736 case NAT64_PREFIX_LEN_64:
737 memcpy(ptr + 9, ptrv4, 4);
738 break;
739 case NAT64_PREFIX_LEN_56:
740 memcpy(ptr + 7, ptrv4, 1);
741 memcpy(ptr + 9, ptrv4 + 1, 3);
742 break;
743 case NAT64_PREFIX_LEN_48:
744 memcpy(ptr + 6, ptrv4, 2);
745 memcpy(ptr + 9, ptrv4 + 2, 2);
746 break;
747 case NAT64_PREFIX_LEN_40:
748 memcpy(ptr + 5, ptrv4, 3);
749 memcpy(ptr + 9, ptrv4 + 3, 1);
750 break;
751 case NAT64_PREFIX_LEN_32:
752 memcpy(ptr + 4, ptrv4, 4);
753 break;
754 default:
755 panic("NAT64-prefix len is wrong: %u\n", len);
756 }
39236c6e 757
758 os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
759 __func__, len,
760 inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));
39236c6e 761
5ba3f43e 762 return (0);
763}
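/*
 * Worked example (illustration only): with the well-known 96-bit prefix
 * 64:ff9b::/96 and the IPv4 address 203.0.113.5 (bytes cb 00 71 05), the
 * synthesized address is 64:ff9b::cb00:7105 -- the four IPv4 bytes land in the
 * last 32 bits, matching the NAT64_PREFIX_LEN_96 case above.  The shorter
 * prefix lengths split the IPv4 bytes around byte 8 (bits 64-71), which
 * RFC 6052 requires to stay zero.
 */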
764
39236c6e 765void
5ba3f43e 766mptcp_check_subflows_and_add(struct mptses *mpte)
39236c6e 767{
768 struct mptcb *mp_tp = mpte->mpte_mptcb;
769 uint32_t i;
39236c6e 770
771 if (!mptcp_ok_to_create_subflows(mp_tp))
772 return;
39236c6e 773
774 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
775 struct mpt_itf_info *info;
776 struct mptsub *mpts;
777 uint32_t ifindex;
778 int found = 0;
39236c6e 779
5ba3f43e 780 info = &mpte->mpte_itfinfo[i];
39236c6e 781
782 if (info->no_mptcp_support)
783 continue;
39236c6e 784
785 ifindex = info->ifindex;
786 if (ifindex == IFSCOPE_NONE)
787 continue;
39236c6e 788
789 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
790 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
39236c6e 791
792 if (ifp == NULL)
793 continue;
39236c6e 794
5ba3f43e 795 if (ifp->if_index == ifindex &&
796 !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
797 sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
798 /*
799 * We found a subflow on this interface.
800 * No need to create a new one.
801 */
802 found = 1;
803 break;
804 }
805
806 /*
807 * In Handover mode, only create cell subflow if
808 * 1. Wi-Fi Assist is active
809 * 2. Symptoms marked WiFi as weak
810 * 3. We are experiencing RTOs or we are not sending data.
811 *
812 * This covers the scenario, where:
813 * 1. We send and get retransmission timeouts (thus,
814 * we confirmed that WiFi is indeed bad).
815 * 2. We are not sending and the server tries to send.
816 * Establishing a cell-subflow gives the server a
817 * chance to send us some data over cell if WiFi
818 * is dead. We establish the subflow with the
819 * backup-bit set, so the server is not allowed to
820 * send on this subflow as long as WiFi is providing
821 * good performance.
822 */
823 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
824 !IFNET_IS_CELLULAR(ifp) &&
825 !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
826 (!mptcp_is_wifi_unusable() ||
827 (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh &&
828 mptetoso(mpte)->so_snd.sb_cc))) {
829 mptcplog((LOG_DEBUG, "%s handover, wifi state %u rxt %u ifindex %u this %u\n",
830 __func__, mptcp_is_wifi_unusable(), sototcpcb(mpts->mpts_socket)->t_rxtshift, ifindex,
831 ifp->if_index),
832 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
833 found = 1;
834 break;
835 }
836 }
837
838 if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
839 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
840 mptcp_developer_mode == 0) {
841 mptcp_ask_symptoms(mpte);
842 return;
843 }
844
845 if (!found) {
846 struct sockaddr *dst = &mpte->mpte_dst;
847 struct sockaddr_in6 nat64pre;
848
849 if (mpte->mpte_dst.sa_family == AF_INET &&
850 !info->has_v4_conn && info->has_v6_conn) {
851 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
852 struct ifnet *ifp;
853 int error, j;
854
855 bzero(&nat64pre, sizeof(struct sockaddr_in6));
856
857 ifnet_head_lock_shared();
858 ifp = ifindex2ifnet[ifindex];
859 ifnet_head_done();
860
861 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
862 if (error) {
863 mptcplog((LOG_ERR, "%s: no NAT64-prefix on itf %s, error %d\n",
864 __func__, ifp->if_name, error),
865 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
866 continue;
867 }
868
869 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
870 if (nat64prefixes[j].prefix_len != 0)
871 break;
872 }
873
874 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
875
876 error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
877 nat64prefixes[j].prefix_len,
878 &mpte->__mpte_dst_v4.sin_addr);
879 if (error != 0) {
880 mptcplog((LOG_INFO, "%s: cannot synthesize this addr\n", __func__),
881 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
882 continue;
883 }
884
885 memcpy(&nat64pre.sin6_addr,
886 &nat64prefixes[j].ipv6_prefix,
887 sizeof(nat64pre.sin6_addr));
888 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
889 nat64pre.sin6_family = AF_INET6;
890 nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
891 nat64pre.sin6_flowinfo = 0;
892 nat64pre.sin6_scope_id = 0;
893
894 dst = (struct sockaddr *)&nat64pre;
895 }
896
897 /* Initial subflow started on a NAT64'd address? */
898 if (mpte->mpte_dst.sa_family == AF_INET6 &&
899 mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
900 dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
901 }
902
903 if (dst->sa_family == AF_INET && !info->has_v4_conn)
904 continue;
905 if (dst->sa_family == AF_INET6 && !info->has_v6_conn)
906 continue;
907
908 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
909 }
910 }
911}
912
913/*
914 * Based on the MPTCP Service-type and the state of the subflows, we
915 * will destroy subflows here.
916 */
917static void
918mptcp_check_subflows_and_remove(struct mptses *mpte)
919{
920 struct mptsub *mpts, *tmpts;
921 int found_working_subflow = 0, removed_some = 0;
922 int wifi_unusable = mptcp_is_wifi_unusable();
923
924 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
925 return;
926
927 /*
928 * Look for a subflow that is on a non-cellular interface
929 * and actually works (aka, no retransmission timeout).
930 */
931 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
932 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
933 struct socket *so;
934 struct tcpcb *tp;
935
936 if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
937 continue;
938
939 so = mpts->mpts_socket;
940 tp = sototcpcb(so);
941
942 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
943 tp->t_state != TCPS_ESTABLISHED)
944 continue;
945
946 /* Either this subflow is in good condition while we try to send */
947 if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc)
948 found_working_subflow = 1;
39236c6e 949
950 /* Or WiFi is fine */
951 if (!wifi_unusable)
952 found_working_subflow = 1;
953 }
954
955 /*
956 * Couldn't find a working subflow, let's not remove those on a cellular
957 * interface.
958 */
959 if (!found_working_subflow)
960 return;
961
962 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
963 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
964
965 /* Only remove cellular subflows */
966 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp))
967 continue;
968
969 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
970 removed_some = 1;
971 }
972
973 if (removed_some)
974 mptcp_unset_cellicon();
975}
976
977static void
978mptcp_remove_subflows(struct mptses *mpte)
979{
980 struct mptsub *mpts, *tmpts;
981
982 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
983 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
984 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
985
986 soevent(mpts->mpts_socket,
987 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
988 }
989 }
990}
991
992static void
993mptcp_create_subflows(__unused void *arg)
994{
995 struct mppcb *mpp;
996
997 /*
998 * Start with clearing, because we might be processing connections
999 * while a new event comes in.
1000 */
1001 if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
1002 mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
1003 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1004
1005 /* Iterate over all MPTCP connections */
1006
1007 lck_mtx_lock(&mtcbinfo.mppi_lock);
1008
1009 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
1010 struct mptses *mpte;
1011 struct socket *mp_so;
1012
1013 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
1014 continue;
1015
1016 mpp_lock(mpp);
1017
1018 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
1019
1020 mpte = mpp->mpp_pcbe;
1021 mp_so = mpp->mpp_socket;
1022
1023 VERIFY(mp_so->so_usecount > 0);
1024
1025 mptcp_check_subflows_and_add(mpte);
1026 mptcp_remove_subflows(mpte);
1027
1028 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1029 mpp_unlock(mpp);
1030 }
1031
1032 lck_mtx_unlock(&mtcbinfo.mppi_lock);
1033}
1034
1035/*
1036 * We need this because we are coming from an NECP-event. This event gets posted
1037 * while holding NECP-locks. The creation of the subflow however leads us back
1038 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1039 * So, we would deadlock there as we already hold the NECP-lock.
1040 *
1041 * So, let's schedule this separately. It also gives NECP the chance to make
1042 * progress, without having to wait for MPTCP to finish its subflow creation.
1043 */
1044void
1045mptcp_sched_create_subflows(struct mptses *mpte)
1046{
1047 struct mppcb *mpp = mpte->mpte_mppcb;
1048 struct mptcb *mp_tp = mpte->mpte_mptcb;
1049 struct socket *mp_so = mpp->mpp_socket;
1050
1051 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1052 mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
1053 __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
1054 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1055 return;
1056 }
1057
1058 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1059 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1060 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1061 }
1062
1063 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled))
1064 return;
1065
1066 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1067 timeout(mptcp_create_subflows, NULL, hz/10);
1068}
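/*
 * Putting the functions above together, the deferred subflow-creation path is
 * roughly:
 *
 *	NECP/Symptoms event
 *	    -> mptcp_sched_create_subflows(mpte)	marks MPP_CREATE_SUBFLOWS and
 *							takes a so_usecount reference
 *	    -> timeout(mptcp_create_subflows, NULL, hz/10)
 *	    -> mptcp_create_subflows()			walks mtcbinfo.mppi_pcbs
 *	    -> mptcp_check_subflows_and_add() and mptcp_remove_subflows()
 *
 * This is only a summary of the code above, kept here for orientation.
 */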
1069
1070/*
1071 * Allocate an MPTCP socket option structure.
1072 */
1073struct mptopt *
1074mptcp_sopt_alloc(int how)
1075{
1076 struct mptopt *mpo;
1077
1078 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
1079 zalloc_noblock(mptopt_zone);
1080 if (mpo != NULL) {
1081 bzero(mpo, mptopt_zone_size);
1082 }
1083
1084 return (mpo);
1085}
1086
1087/*
1088 * Free an MPTCP socket option structure.
1089 */
1090void
1091mptcp_sopt_free(struct mptopt *mpo)
1092{
1093 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1094
1095 zfree(mptopt_zone, mpo);
1096}
1097
1098/*
1099 * Add a socket option to the MPTCP socket option list.
1100 */
1101void
1102mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1103{
1104 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1105 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1106 mpo->mpo_flags |= MPOF_ATTACHED;
1107 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1108}
1109
1110/*
1111 * Remove a socket option from the MPTCP socket option list.
1112 */
1113void
1114mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1115{
1116 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1117 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1118 mpo->mpo_flags &= ~MPOF_ATTACHED;
1119 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1120}
1121
1122/*
1123 * Search for an existing <sopt_level,sopt_name> socket option.
1124 */
1125struct mptopt *
1126mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1127{
1128 struct mptopt *mpo;
1129
1130 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1131
1132 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1133 if (mpo->mpo_level == sopt->sopt_level &&
1134 mpo->mpo_name == sopt->sopt_name)
1135 break;
1136 }
1137 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
1138
1139 return (mpo);
1140}
1141
1142/*
1143 * Allocate a MPTCP subflow structure.
1144 */
1145static struct mptsub *
1146mptcp_subflow_alloc(void)
1147{
1148 struct mptsub *mpts = zalloc(mptsub_zone);
1149
1150 if (mpts == NULL)
1151 return (NULL);
1152
1153 bzero(mpts, mptsub_zone_size);
1154 return (mpts);
1155}
1156
1157/*
1158 * Deallocate a subflow structure, called when all of the references held
1159 * on it have been released. This implies that the subflow has been deleted.
1160 */
5ba3f43e 1161static void
1162mptcp_subflow_free(struct mptsub *mpts)
1163{
1164 VERIFY(mpts->mpts_refcnt == 0);
1165 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1166 VERIFY(mpts->mpts_mpte == NULL);
1167 VERIFY(mpts->mpts_socket == NULL);
1168
1169 if (mpts->mpts_src != NULL) {
1170 FREE(mpts->mpts_src, M_SONAME);
1171 mpts->mpts_src = NULL;
39236c6e 1172 }
1173
1174 zfree(mptsub_zone, mpts);
1175}
1176
1177static void
1178mptcp_subflow_addref(struct mptsub *mpts)
1179{
1180 if (++mpts->mpts_refcnt == 0)
1181 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1182 /* NOTREACHED */
1183}
1184
1185static void
1186mptcp_subflow_remref(struct mptsub *mpts)
1187{
1188 if (mpts->mpts_refcnt == 0) {
1189 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
1190 /* NOTREACHED */
1191 }
1192 if (--mpts->mpts_refcnt > 0)
1193 return;
1194
1195 /* callee will unlock and destroy lock */
1196 mptcp_subflow_free(mpts);
1197}
1198
1199static void
1200mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
1201{
1202 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
1203 struct tcpcb *tp = sototcpcb(so);
1204
1205 /*
1206 * From this moment on, the subflow is linked to the MPTCP-connection.
1207 * Locking,... happens now at the MPTCP-layer
1208 */
1209 tp->t_mptcb = mpte->mpte_mptcb;
1210 so->so_flags |= SOF_MP_SUBFLOW;
1211 mp_so->so_usecount++;
1212
1213 /*
1214 * Insert the subflow into the list, and associate the MPTCP PCB
1215 * as well as the subflow socket. From this point on, removing
1216 * the subflow needs to be done via mptcp_subflow_del().
1217 */
1218 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1219 mpte->mpte_numflows++;
1220
1221 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1222 mpts->mpts_mpte = mpte;
1223 mpts->mpts_socket = so;
1224 tp->t_mpsub = mpts;
1225 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1226 mptcp_subflow_addref(mpts); /* for subflow socket */
1227}
1228
1229static void
1230mptcp_subflow_necp_cb(void *handle, __unused int action,
1231 __unused struct necp_client_flow *flow)
1232{
1233 struct inpcb *inp = (struct inpcb *)handle;
1234 struct socket *so = inp->inp_socket;
1235 struct mptsub *mpts;
1236 struct mptses *mpte;
1237
1238 if (action != NECP_CLIENT_CBACTION_NONVIABLE)
1239 return;
1240
1241 /*
1242 * The socket is being garbage-collected. There is nothing to be done
1243 * here.
1244 */
1245 if (so->so_usecount == 0)
1246 return;
1247
1248 socket_lock(so, 1);
1249
1250 /* Check again after we acquired the lock. */
1251 if (so->so_usecount == 0)
1252 goto out;
1253
1254 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1255 mpts = sototcpcb(so)->t_mpsub;
1256
1257 mptcplog((LOG_DEBUG, "%s: Subflow became non-viable", __func__),
1258 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
1259
1260 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1261
1262 mptcp_sched_create_subflows(mpte);
1263
1264 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
1265 flow->viable = 1;
1266
1267out:
1268 socket_unlock(so, 1);
1269}
1270
1271/*
1272 * Create an MPTCP subflow socket.
1273 */
1274static int
1275mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
5ba3f43e 1276 struct socket **so)
39236c6e 1277{
5ba3f43e 1278 lck_mtx_t *subflow_mtx;
39236c6e 1279 struct mptopt smpo, *mpo, *tmpo;
5ba3f43e 1280 struct proc *p;
1281 struct socket *mp_so;
1282 int error;
1283
1284 *so = NULL;
1285 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1286 mp_so = mptetoso(mpte);
1287
1288 p = proc_find(mp_so->last_pid);
1289 if (p == PROC_NULL) {
1290 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1291 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1292
1293 return (ESRCH);
1294 }
1295
1296 /*
1297 * Create the subflow socket (multipath subflow, non-blocking.)
1298 *
1299 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1300 * socket; it will be cleared when the socket is peeled off or closed.
1301 * It also indicates to the underlying TCP to handle MPTCP options.
1302 * A multipath subflow socket implies SS_NOFDREF state.
1303 */
1304
1305 /*
1306 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1307 * the ipi-lock. We cannot hold the socket-lock at that point.
1308 */
1309 mpte_unlock(mpte);
1310 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1311 SOCF_ASYNC, PROC_NULL);
1312 mpte_lock(mpte);
1313 if (error) {
1314 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
1315 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
1316 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1317
1318 proc_rele(p);
1319
1320 mptcp_subflow_free(mpts);
1321 return (error);
1322 }
1323
1324 /*
1325 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1326 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1327 * Which is why we also need to get the lock with pr_getlock, as after
1328 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1329 */
1330 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1331 lck_mtx_lock(subflow_mtx);
1332
1333 /*
1334 * Must be the first thing we do, to make sure all pointers for this
1335 * subflow are set.
1336 */
1337 mptcp_subflow_attach(mpte, mpts, *so);
1338
1339 /*
1340 * A multipath subflow socket is used internally in the kernel,
1341 * therefore it does not have a file desciptor associated by
1342 * default.
1343 */
1344 (*so)->so_state |= SS_NOFDREF;
1345
1346 lck_mtx_unlock(subflow_mtx);
1347
1348 /* prevent the socket buffers from being compressed */
1349 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1350 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1351
1352 /* Inherit preconnect and TFO data flags */
1353 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
1354 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1355 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
1356 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1357
1358 /* Inherit uuid and create the related flow. */
1359 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1360 struct mptcb *mp_tp = mpte->mpte_mptcb;
1361
1362 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1363
1364 /*
1365 * A note on the unlock: With MPTCP, we do multiple times a
1366 * necp_client_register_socket_flow. This is problematic,
1367 * because now the lock-ordering guarantee (first necp-locks,
1368 * then socket-locks) is no more respected. So, we need to
1369 * unlock here.
1370 */
1371 mpte_unlock(mpte);
1372 error = necp_client_register_socket_flow(mp_so->last_pid,
1373 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1374 mpte_lock(mpte);
1375
1376 if (error)
1377 goto out_err;
1378
1379 /* Possible state-change during the unlock above */
1380 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1381 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
1382 goto out_err;
1383
1384 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1385 } else {
1386 mptcplog((LOG_NOTICE, "%s: uuid is not set!\n", __func__),
1387 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1388 }
1389
1390 /* inherit the other socket options */
1391 bzero(&smpo, sizeof (smpo));
1392 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1393 smpo.mpo_level = SOL_SOCKET;
1394 smpo.mpo_intval = 1;
1395
1396 /* disable SIGPIPE */
1397 smpo.mpo_name = SO_NOSIGPIPE;
1398 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1399 goto out_err;
1400
1401 /* find out if the subflow's source address goes away */
1402 smpo.mpo_name = SO_NOADDRERR;
1403 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1404 goto out_err;
1405
1406 /* enable keepalive */
1407 smpo.mpo_name = SO_KEEPALIVE;
1408 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1409 goto out_err;
1410
1411 smpo.mpo_level = IPPROTO_TCP;
1412 smpo.mpo_intval = mptcp_subflow_keeptime;
1413 smpo.mpo_name = TCP_KEEPALIVE;
1414 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1415 goto out_err;
1416
1417 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1418 /*
1419 * On secondary subflows we might need to set the cell-fallback
1420 * flag (see conditions in mptcp_subflow_sosetopt).
1421 */
1422 smpo.mpo_level = SOL_SOCKET;
1423 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1424 smpo.mpo_intval = 1;
1425 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1426 goto out_err;
1427 }
1428
1429 /* replay setsockopt(2) on the subflow sockets for eligible options */
1430 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1431 int interim;
1432
1433 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
1434 continue;
1435
1436 /*
1437 * Skip those that are handled internally; these options
1438 * should not have been recorded and marked with the
1439 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1440 */
1441 if (mpo->mpo_level == SOL_SOCKET &&
1442 (mpo->mpo_name == SO_NOSIGPIPE ||
1443 mpo->mpo_name == SO_NOADDRERR ||
1444 mpo->mpo_name == SO_KEEPALIVE))
1445 continue;
1446
1447 interim = (mpo->mpo_flags & MPOF_INTERIM);
1448 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1449 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
1450 " sopt %s val %d interim record removed\n", __func__,
39236c6e 1451 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1452 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1453 mpo->mpo_intval),
3e170ce0 1454 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1455 mptcp_sopt_remove(mpte, mpo);
1456 mptcp_sopt_free(mpo);
1457 continue;
1458 }
1459 }
1460
1461 /*
1462 * We need to receive everything that the subflow socket has,
1463 * so use a customized socket receive function. We will undo
1464 * this when the socket is peeled off or closed.
1465 */
1466 switch (dom) {
1467 case PF_INET:
1468 (*so)->so_proto = &mptcp_subflow_protosw;
1469 break;
1470#if INET6
1471 case PF_INET6:
1472 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1473 break;
1474#endif /* INET6 */
1475 default:
1476 VERIFY(0);
1477 /* NOTREACHED */
1478 }
1479
1480 proc_rele(p);
1481
1482 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1483 int, dom, int, error);
1484
1485 return (0);
39236c6e 1486
1487out_err:
1488 mptcp_subflow_abort(mpts, error);
1489
1490 proc_rele(p);
1491
1492 mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
1493 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1494
1495 return (error);
1496}
1497
1498/*
1499 * Close an MPTCP subflow socket.
1500 *
1501 * Note that this may be called on an embryonic subflow, and the only
1502 * thing that is guaranteed valid is the protocol-user request.
1503 */
1504static void
1505mptcp_subflow_soclose(struct mptsub *mpts)
39236c6e 1506{
1507 struct socket *so = mpts->mpts_socket;
1508
1509 if (mpts->mpts_flags & MPTSF_CLOSED)
1510 return;
39236c6e 1511
5ba3f43e 1512 VERIFY(so != NULL);
1513 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1514 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1515
1516 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1517 struct socket *, so,
1518 struct sockbuf *, &so->so_rcv,
1519 struct sockbuf *, &so->so_snd,
1520 struct mptses *, mpts->mpts_mpte);
1521
1522 mpts->mpts_flags |= MPTSF_CLOSED;
1523
1524 if (so->so_retaincnt == 0) {
1525 soclose_locked(so);
1526
1527 return;
1528 } else {
1529 VERIFY(so->so_usecount > 0);
1530 so->so_usecount--;
1531 }
1532
1533 return;
1534}
1535
1536/*
1537 * Connect an MPTCP subflow socket.
1538 *
1539 * Note that in the pending connect case, the subflow socket may have been
1540 * bound to an interface and/or a source IP address which may no longer be
1541 * around by the time this routine is called; in that case the connect attempt
1542 * will most likely fail.
1543 */
1544static int
1545mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1546{
1547 char dbuf[MAX_IPv6_STR_LEN];
1548 struct socket *mp_so, *so;
1549 struct mptcb *mp_tp;
1550 struct sockaddr *dst;
1551 struct proc *p;
a39ff7e2 1552 int af, error, dport;
39236c6e 1553
1554 mp_so = mptetoso(mpte);
1555 mp_tp = mpte->mpte_mptcb;
1556 so = mpts->mpts_socket;
1557 af = mpts->mpts_dst.sa_family;
1558 dst = &mpts->mpts_dst;
1559
1560 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1561 VERIFY(mpts->mpts_socket != NULL);
1562 VERIFY(af == AF_INET || af == AF_INET6);
1563
1564 if (af == AF_INET) {
1565 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof (dbuf));
1566 dport = ntohs(SIN(dst)->sin_port);
1567 } else {
1568 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof (dbuf));
1569 dport = ntohs(SIN6(dst)->sin6_port);
1570 }
1571
1572 os_log_info(mptcp_log_handle,
1573 "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
1574 dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
39236c6e 1575
1576 p = proc_find(mp_so->last_pid);
1577 if (p == PROC_NULL) {
1578 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1579 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e 1580
5ba3f43e 1581 return (ESRCH);
1582 }
1583
1584 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1585
fe8ab488 1586 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
39037602 1587
39236c6e 1588 /* connect the subflow socket */
1589 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1590 p, mpts->mpts_ifscope,
1591 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1592
1593 mpts->mpts_iss = sototcpcb(so)->iss;
1594
1595 /* See tcp_connect_complete */
1596 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1597 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1598 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1599 }
39236c6e 1600
1601 /* Allocate a unique address id per subflow */
1602 mpte->mpte_addrid_last++;
1603 if (mpte->mpte_addrid_last == 0)
1604 mpte->mpte_addrid_last++;
1605
1606 proc_rele(p);
1607
1608 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1609 struct mptsub *, mpts, int, error);
1610 if (error)
1611 mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
1612 __func__, error, mpts->mpts_ifscope),
1613 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1614
1615 return (error);
1616}
1617
1618/*
1619 * MPTCP subflow socket receive routine, derived from soreceive().
1620 */
1621static int
1622mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1623 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1624{
1625#pragma unused(uio)
5ba3f43e 1626 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1627 int flags, error = 0;
1628 struct proc *p = current_proc();
1629 struct mbuf *m, **mp = mp0;
5ba3f43e 1630 boolean_t proc_held = FALSE;
39236c6e 1631
5ba3f43e 1632 mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
1633 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1634
1635#ifdef MORE_LOCKING_DEBUG
1636 if (so->so_usecount == 1) {
1637 panic("%s: so=%x no other reference on socket\n", __func__, so);
1638 /* NOTREACHED */
1639 }
1640#endif
1641 /*
1642 * We return all that is there in the subflow's socket receive buffer
1643 * to the MPTCP layer, so we require that the caller passes in the
1644 * expected parameters.
1645 */
5ba3f43e 1646 if (mp == NULL || controlp != NULL)
39236c6e 1647 return (EINVAL);
5ba3f43e 1648
1649 *mp = NULL;
1650 if (psa != NULL)
1651 *psa = NULL;
1652 if (flagsp != NULL)
1653 flags = *flagsp &~ MSG_EOR;
1654 else
1655 flags = 0;
1656
5ba3f43e 1657 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
39236c6e 1658 return (EOPNOTSUPP);
5ba3f43e 1659
1660 flags |= (MSG_DONTWAIT|MSG_NBIO);
1661
1662 /*
1663 * If a recv attempt is made on a previously-accepted socket
1664 * that has been marked as inactive (disconnected), reject
1665 * the request.
1666 */
1667 if (so->so_flags & SOF_DEFUNCT) {
1668 struct sockbuf *sb = &so->so_rcv;
1669
1670 error = ENOTCONN;
1671 /*
1672 * This socket should have been disconnected and flushed
1673 * prior to being returned from sodefunct(); there should
1674 * be no data on its receive list, so panic otherwise.
1675 */
1676 if (so->so_state & SS_DEFUNCT)
1677 sb_empty_assert(sb, __func__);
1678 return (error);
1679 }
1680
1681 /*
1682 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1683 * and if so just return to the caller. This could happen when
1684 * soreceive() is called by a socket upcall function during the
1685 * time the socket is freed. The socket buffer would have been
1686 * locked across the upcall, therefore we cannot put this thread
1687 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1688 * we may livelock), because the lock on the socket buffer will
1689 * only be released when the upcall routine returns to its caller.
1690 * Because the socket has been officially closed, there can be
1691 * no further read on it.
1692 *
1693 * A multipath subflow socket would have its SS_NOFDREF set by
1694 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1695 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1696 */
1697 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
5ba3f43e 1698 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
39236c6e 1699 return (0);
39236c6e
A
1700
1701 /*
1702 * For consistency with soreceive() semantics, we need to obey
1703 * SB_LOCK in case some other code path has locked the buffer.
1704 */
1705 error = sblock(&so->so_rcv, 0);
5ba3f43e 1706 if (error != 0)
39236c6e 1707 return (error);
39236c6e
A
1708
1709 m = so->so_rcv.sb_mb;
1710 if (m == NULL) {
1711 /*
1712 * Panic if we notice inconsistencies in the socket's
1713 * receive list; both sb_mb and sb_cc should correctly
1714 * reflect the contents of the list, otherwise we may
1715 * end up with false positives during select() or poll()
1716 * which could put the application in a bad state.
1717 */
1718 SB_MB_CHECK(&so->so_rcv);
1719
1720 if (so->so_error != 0) {
1721 error = so->so_error;
1722 so->so_error = 0;
1723 goto release;
1724 }
1725
5ba3f43e
A
1726 if (so->so_state & SS_CANTRCVMORE) {
1727 goto release;
1728 }
1729
1730 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1731 error = ENOTCONN;
1732 goto release;
1733 }
1734
1735 /*
 1736	 * MSG_DONTWAIT is implicitly set and this routine will
1737 * never block, so return EWOULDBLOCK when there is nothing.
1738 */
1739 error = EWOULDBLOCK;
1740 goto release;
1741 }
1742
1743 mptcp_update_last_owner(so, mp_so);
1744
1745 if (mp_so->last_pid != proc_pid(p)) {
1746 p = proc_find(mp_so->last_pid);
1747 if (p == PROC_NULL) {
1748 p = current_proc();
1749 } else {
1750 proc_held = TRUE;
1751 }
1752 }
1753
1754 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1755 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1756 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1757
1758 while (m != NULL) {
5c9f4661 1759 int dlen = 0, dfin = 0, error_out = 0;
5ba3f43e
A
1760 struct mbuf *start = m;
1761 uint64_t dsn;
1762 uint32_t sseq;
1763 uint16_t orig_dlen;
1764 uint16_t csum;
1765
1766 VERIFY(m->m_nextpkt == NULL);
1767
1768 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1769 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
1770 dsn = m->m_pkthdr.mp_dsn;
1771 sseq = m->m_pkthdr.mp_rseq;
1772 csum = m->m_pkthdr.mp_csum;
1773 } else {
1774 /* We did fallback */
5c9f4661 1775 mptcp_adj_rmap(so, m, 0, 0, 0, 0);
5ba3f43e
A
1776
1777 sbfree(&so->so_rcv, m);
1778
1779 if (mp != NULL) {
1780 *mp = m;
1781 mp = &m->m_next;
1782 so->so_rcv.sb_mb = m = m->m_next;
1783 *mp = NULL;
1784
1785 }
1786
1787 if (m != NULL) {
1788 so->so_rcv.sb_lastrecord = m;
1789 } else {
1790 SB_EMPTY_FIXUP(&so->so_rcv);
1791 }
1792
1793 continue;
39236c6e
A
1794 }
1795
5c9f4661
A
1796 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)
1797 dfin = 1;
1798
5ba3f43e
A
1799 /*
1800 * Check if the full mapping is now present
1801 */
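/*
 * Worked example (annotation, not in the original source): for a
 * mapping with mp_rlen == 1001 that carries a DATA_FIN, dfin == 1;
 * assuming the DATA_FIN consumes one data-sequence octet that never
 * appears as a byte in sb_cc (which is what the subtraction below
 * suggests), at most 1000 bytes can ever sit in the receive buffer,
 * so the test compares sb_cc against dlen - dfin == 1000 rather than
 * against dlen itself.
 */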
5c9f4661 1802 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
5ba3f43e
A
1803 mptcplog((LOG_INFO, "%s not enough data (%u) need %u\n",
1804 __func__, so->so_rcv.sb_cc, dlen),
1805 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1806
1807 if (*mp0 == NULL)
1808 error = EWOULDBLOCK;
39236c6e
A
1809 goto release;
1810 }
1811
5ba3f43e
A
1812 /* Now, get the full mapping */
1813 while (dlen > 0) {
5c9f4661
A
1814 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
1815 error_out = 1;
1816 error = EIO;
1817 dlen = 0;
1818 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1819 break;
1820 }
39236c6e 1821
5ba3f43e
A
1822 dlen -= m->m_len;
1823 sbfree(&so->so_rcv, m);
39236c6e 1824
5ba3f43e
A
1825 if (mp != NULL) {
1826 *mp = m;
1827 mp = &m->m_next;
1828 so->so_rcv.sb_mb = m = m->m_next;
1829 *mp = NULL;
1830 }
1831
5c9f4661
A
1832 if (dlen - dfin == 0)
1833 dlen = 0;
1834
5ba3f43e 1835 VERIFY(dlen <= 0 || m);
39236c6e
A
1836 }
1837
5ba3f43e
A
1838 VERIFY(dlen == 0);
1839
39236c6e 1840 if (m != NULL) {
5ba3f43e 1841 so->so_rcv.sb_lastrecord = m;
39236c6e 1842 } else {
39236c6e
A
1843 SB_EMPTY_FIXUP(&so->so_rcv);
1844 }
5ba3f43e 1845
5c9f4661
A
1846 if (error_out)
1847 goto release;
1848
1849
1850 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
5ba3f43e
A
1851 error = EIO;
1852 *mp0 = NULL;
1853 goto release;
1854 }
1855
39236c6e
A
1856 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1857 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1858 }
1859
1860 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1861 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
39236c6e
A
1862
1863 if (flagsp != NULL)
1864 *flagsp |= flags;
1865
1866release:
5ba3f43e
A
1867 sbunlock(&so->so_rcv, TRUE);
1868
1869 if (proc_held)
1870 proc_rele(p);
1871
39236c6e
A
1872 return (error);
1873
1874}
1875
39236c6e 1876/*
5ba3f43e 1877 * MPTCP subflow socket send routine, derived from sosend().
39236c6e 1878 */
5ba3f43e
A
1879static int
1880mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1881 struct mbuf *top, struct mbuf *control, int flags)
39236c6e 1882{
5ba3f43e
A
1883 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1884 struct proc *p = current_proc();
1885 boolean_t en_tracing = FALSE, proc_held = FALSE;
1886 int en_tracing_val;
1887 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
1888 int error;
39236c6e 1889
5ba3f43e
A
1890 VERIFY(control == NULL);
1891 VERIFY(addr == NULL);
1892 VERIFY(uio == NULL);
1893 VERIFY(flags == 0);
1894 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
39236c6e 1895
5ba3f43e
A
1896 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
1897 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e
A
1898
1899 /*
5ba3f43e
A
 1900	 * trace if tracing is enabled, this is a network (vs. unix)
 1901	 * socket, and it is non-loopback
39236c6e 1902 */
5ba3f43e
A
1903 if (ENTR_SHOULDTRACE &&
1904 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1905 struct inpcb *inp = sotoinpcb(so);
1906 if (inp->inp_last_outifp != NULL &&
1907 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1908 en_tracing = TRUE;
1909 en_tracing_val = top->m_pkthdr.len;
1910 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1911 VM_KERNEL_ADDRPERM(so),
1912 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1913 (int64_t)en_tracing_val);
1914 }
1915 }
39236c6e 1916
5ba3f43e 1917 mptcp_update_last_owner(so, mp_so);
39236c6e 1918
5ba3f43e
A
1919 if (mp_so->last_pid != proc_pid(p)) {
1920 p = proc_find(mp_so->last_pid);
1921 if (p == PROC_NULL) {
1922 p = current_proc();
1923 } else {
1924 proc_held = TRUE;
1925 }
1926 }
39236c6e 1927
5ba3f43e
A
1928#if NECP
1929 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
1930#endif /* NECP */
39236c6e 1931
5ba3f43e 1932 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
39236c6e 1933
5ba3f43e
A
1934 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
1935 if (error)
1936 goto out;
39236c6e 1937
5ba3f43e
A
1938 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
1939 top = NULL;
39236c6e 1940
5ba3f43e
A
1941out:
1942 if (top != NULL)
1943 m_freem(top);
39236c6e 1944
5ba3f43e
A
1945 if (proc_held)
1946 proc_rele(p);
1947
1948 soclearfastopen(so);
1949
1950 if (en_tracing) {
1951 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
1952 VM_KERNEL_ADDRPERM(so),
1953 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
1954 (int64_t)en_tracing_val);
1955 }
1956
1957 return (error);
39236c6e 1958
39236c6e
A
1959}
1960
1961/*
1962 * Establish an initial MPTCP connection (if first subflow and not yet
1963 * connected), or add a subflow to an existing MPTCP connection.
1964 */
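/*
 * Usage sketch, not part of the original source and based only on the
 * signature below: a caller adding a subflow over a given interface
 * passes a destination address, the interface scope and an optional
 * connection-id out-parameter; src may be NULL to let the stack pick
 * the source address. dst_sa and ifscope here are hypothetical
 * placeholders.
 *
 *	sae_connid_t cid;
 *	int error;
 *
 *	error = mptcp_subflow_add(mpte, NULL,
 *	    (struct sockaddr *)&dst_sa, ifscope, &cid);
 */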
1965int
5ba3f43e
A
1966mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
1967 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
39236c6e 1968{
39236c6e 1969 struct socket *mp_so, *so = NULL;
39236c6e 1970 struct mptcb *mp_tp;
5ba3f43e 1971 struct mptsub *mpts = NULL;
39236c6e
A
1972 int af, error = 0;
1973
5ba3f43e
A
1974 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1975 mp_so = mptetoso(mpte);
39236c6e
A
1976 mp_tp = mpte->mpte_mptcb;
1977
fe8ab488
A
1978 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1979 /* If the remote end sends Data FIN, refuse subflow adds */
5ba3f43e
A
1980 mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
1981 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
fe8ab488 1982 error = ENOTCONN;
5ba3f43e 1983 goto out_err;
fe8ab488 1984 }
39236c6e 1985
5ba3f43e
A
1986 mpts = mptcp_subflow_alloc();
1987 if (mpts == NULL) {
1988 mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
1989 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1990 error = ENOMEM;
1991 goto out_err;
1992 }
39236c6e 1993
5ba3f43e
A
1994 if (src != NULL) {
1995 int len = src->sa_len;
813fb2f6
A
1996
1997 MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
1998 M_WAITOK | M_ZERO);
1999 if (mpts->mpts_src == NULL) {
5ba3f43e
A
2000 mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
2001 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2002 error = ENOMEM;
2003 goto out_err;
39236c6e 2004 }
5ba3f43e 2005 bcopy(src, mpts->mpts_src, len);
39236c6e
A
2006 }
2007
5ba3f43e
A
2008 memcpy(&mpts->mpts_dst, dst, dst->sa_len);
2009
2010 af = mpts->mpts_dst.sa_family;
2011
2012 mpts->mpts_ifscope = ifscope;
2013
39236c6e 2014 /* create the subflow socket */
5ba3f43e
A
2015 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
2016 /*
 2017		 * Return (error) without cleaning up, because up to here
 2018		 * all we did was create mpts.
 2019		 *
 2020		 * The contract is that the call to mptcp_subflow_socreate
 2021		 * moves ownership of mpts to mptcp_subflow_socreate.
2022 */
2023 return (error);
2024
2025 /*
2026 * We may be called from within the kernel. Still need to account this
2027 * one to the real app.
2028 */
2029 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
39236c6e
A
2030
2031 /*
3e170ce0
A
2032 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2033 * -1 (SAE_CONNID_ALL).
39236c6e
A
2034 */
2035 mpte->mpte_connid_last++;
3e170ce0
A
2036 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2037 mpte->mpte_connid_last == SAE_CONNID_ANY)
39236c6e
A
2038 mpte->mpte_connid_last++;
2039
2040 mpts->mpts_connid = mpte->mpte_connid_last;
490019cf
A
2041
2042 mpts->mpts_rel_seq = 1;
2043
fe8ab488
A
2044 /* Allocate a unique address id per subflow */
2045 mpte->mpte_addrid_last++;
2046 if (mpte->mpte_addrid_last == 0)
2047 mpte->mpte_addrid_last++;
39236c6e 2048
39236c6e 2049 /* register for subflow socket read/write events */
5ba3f43e 2050 sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);
39236c6e 2051
5ba3f43e
A
2052 /* Register for subflow socket control events */
2053 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
39236c6e 2054 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
5ba3f43e
A
2055 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2056 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2057 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2058 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2059 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2060 SO_FILT_HINT_ADAPTIVE_WTIMO);
39236c6e
A
2061
2062 /* sanity check */
2063 VERIFY(!(mpts->mpts_flags &
2064 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
2065
39236c6e
A
2066 /*
2067 * Indicate to the TCP subflow whether or not it should establish
2068 * the initial MPTCP connection, or join an existing one. Fill
2069 * in the connection request structure with additional info needed
2070 * by the underlying TCP (to be used in the TCP options, etc.)
2071 */
39236c6e 2072 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
5ba3f43e
A
2073 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2074
39236c6e 2075 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
5ba3f43e 2076 mptcp_init_local_parms(mpte);
39236c6e 2077 }
39236c6e 2078 soisconnecting(mp_so);
5ba3f43e
A
2079
2080 /* If fastopen is requested, set state in mpts */
2081 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2082 mpts->mpts_flags |= MPTSF_TFO_REQD;
39236c6e
A
2083 } else {
2084 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
2085 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
490019cf
A
2086 }
2087
39236c6e
A
2088 mpts->mpts_flags |= MPTSF_CONNECTING;
2089
2090 if (af == AF_INET || af == AF_INET6) {
2091 char dbuf[MAX_IPv6_STR_LEN];
2092
3e170ce0
A
2093 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
2094 "mp_so 0x%llx dst %s[%d] cid %d "
39236c6e
A
2095 "[pending %s]\n", __func__,
2096 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2097 inet_ntop(af, ((af == AF_INET) ?
5ba3f43e
A
2098 (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
2099 (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
39236c6e 2100 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
5ba3f43e
A
2101 ntohs(SIN(&mpts->mpts_dst)->sin_port) :
2102 ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
39236c6e
A
2103 mpts->mpts_connid,
2104 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
3e170ce0 2105 "YES" : "NO")),
5ba3f43e 2106 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
2107 }
2108
2109 /* connect right away if first attempt, or if join can be done now */
2110 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
2111 error = mptcp_subflow_soconnectx(mpte, mpts);
2112
5ba3f43e
A
2113 if (error)
2114 goto out_err_close;
2115
2116 if (pcid)
2117 *pcid = mpts->mpts_connid;
2118
2119 return (0);
2120
2121out_err_close:
2122 mptcp_subflow_abort(mpts, error);
2123
2124 return (error);
2125
2126out_err:
2127 if (mpts)
2128 mptcp_subflow_free(mpts);
2129
39236c6e
A
2130 return (error);
2131}
2132
5ba3f43e
A
2133void
2134mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
2135{
2136 int index = mptcp_get_statsindex(stats, mpts);
2137
2138 if (index != -1) {
2139 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2140
2141 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2142 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2143 }
2144}
2145
39236c6e
A
2146/*
 2147 * Delete/remove a subflow from an MPTCP session. The underlying subflow socket
2148 * will no longer be accessible after a subflow is deleted, thus this
2149 * should occur only after the subflow socket has been disconnected.
39236c6e
A
2150 */
2151void
5ba3f43e 2152mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
39236c6e 2153{
5ba3f43e
A
2154 struct socket *mp_so = mptetoso(mpte);
2155 struct socket *so = mpts->mpts_socket;
2156 struct tcpcb *tp = sototcpcb(so);
39037602 2157
5ba3f43e
A
2158 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2159 VERIFY(mpts->mpts_mpte == mpte);
2160 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2161 VERIFY(mpte->mpte_numflows != 0);
2162 VERIFY(mp_so->so_usecount > 0);
39236c6e 2163
5ba3f43e
A
2164 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
2165 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2166 mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
2167 mpts->mpts_flags, mp_so->so_error),
2168 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2169
5ba3f43e
A
2170 mptcpstats_update(mpte->mpte_itfstats, mpts);
2171 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2172 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
39236c6e 2173
39236c6e
A
2174 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2175 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
39236c6e 2176 mpte->mpte_numflows--;
fe8ab488
A
2177 if (mpte->mpte_active_sub == mpts)
2178 mpte->mpte_active_sub = NULL;
39236c6e
A
2179
2180 /*
2181 * Drop references held by this subflow socket; there
2182 * will be no further upcalls made from this point.
2183 */
5ba3f43e
A
2184 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2185 sock_catchevents_locked(so, NULL, NULL, 0);
fe8ab488 2186
39236c6e 2187 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
39037602 2188
39236c6e
A
2189 mp_so->so_usecount--; /* for subflow socket */
2190 mpts->mpts_mpte = NULL;
2191 mpts->mpts_socket = NULL;
39236c6e 2192
5ba3f43e
A
2193 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2194 mptcp_subflow_remref(mpts); /* for subflow socket */
2195
2196 so->so_flags &= ~SOF_MP_SUBFLOW;
2197 tp->t_mptcb = NULL;
2198 tp->t_mpsub = NULL;
2199}
2200
2201void
2202mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2203{
2204 struct socket *so = mpts->mpts_socket;
2205 struct mptcb *mp_tp = mpte->mpte_mptcb;
2206 int send_dfin = 0;
2207
2208 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2209 send_dfin = 1;
2210
2211 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2212 (so->so_state & SS_ISCONNECTED)) {
2213 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2214 __func__, mpts->mpts_connid, send_dfin),
2215 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2216
2217 if (send_dfin)
2218 mptcp_send_dfin(so);
2219 soshutdownlock(so, SHUT_WR);
2220 }
2221
2222}
2223
2224static void
2225mptcp_subflow_abort(struct mptsub *mpts, int error)
2226{
2227 struct socket *so = mpts->mpts_socket;
2228 struct tcpcb *tp = sototcpcb(so);
2229
2230 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2231 return;
2232
2233 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2234 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2235
5ba3f43e
A
2236 if (tp->t_state != TCPS_CLOSED)
2237 tcp_drop(tp, error);
2238
2239 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
2240}
2241
2242/*
2243 * Disconnect a subflow socket.
2244 */
2245void
5ba3f43e 2246mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
39236c6e
A
2247{
2248 struct socket *so;
2249 struct mptcb *mp_tp;
2250 int send_dfin = 0;
2251
5ba3f43e 2252 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e
A
2253
2254 VERIFY(mpts->mpts_mpte == mpte);
2255 VERIFY(mpts->mpts_socket != NULL);
39236c6e
A
2256
2257 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
2258 return;
2259
2260 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2261
39236c6e
A
2262 so = mpts->mpts_socket;
2263 mp_tp = mpte->mpte_mptcb;
5ba3f43e 2264 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
39236c6e 2265 send_dfin = 1;
39236c6e 2266
39236c6e
A
2267 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2268 (so->so_state & SS_ISCONNECTED)) {
a39ff7e2 2269 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
5ba3f43e
A
2270 __func__, mpts->mpts_connid, send_dfin),
2271 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
2272
2273 if (send_dfin)
2274 mptcp_send_dfin(so);
2275 (void) soshutdownlock(so, SHUT_RD);
2276 (void) soshutdownlock(so, SHUT_WR);
2277 (void) sodisconnectlocked(so);
2278 }
39236c6e
A
2279 /*
2280 * Generate a disconnect event for this subflow socket, in case
2281 * the lower layer doesn't do it; this is needed because the
5ba3f43e 2282 * subflow socket deletion relies on it.
39236c6e 2283 */
5ba3f43e 2284 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
2285}
2286
2287/*
5ba3f43e 2288 * Called when the associated subflow socket posted a read event.
39236c6e
A
2289 */
2290static void
2291mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
2292{
2293#pragma unused(so, waitf)
5ba3f43e 2294 struct mptsub *mpts = arg, *tmpts;
39236c6e
A
2295 struct mptses *mpte = mpts->mpts_mpte;
2296
5ba3f43e
A
2297 VERIFY(mpte != NULL);
2298
2299 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2300 if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
2301 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
fe8ab488 2302 return;
5ba3f43e
A
2303 }
2304
2305 mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
2306 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2307 if (mpts->mpts_socket->so_usecount == 0) {
2308 /* Will be removed soon by tcp_garbage_collect */
2309 continue;
2310 }
2311
2312 mptcp_subflow_addref(mpts);
2313 mpts->mpts_socket->so_usecount++;
39236c6e 2314
5ba3f43e
A
2315 mptcp_subflow_input(mpte, mpts);
2316
2317 mptcp_subflow_remref(mpts); /* ours */
2318
2319 VERIFY(mpts->mpts_socket->so_usecount != 0);
2320 mpts->mpts_socket->so_usecount--;
2321 }
2322
2323 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
39236c6e
A
2324}
2325
2326/*
2327 * Subflow socket input.
39236c6e
A
2328 */
2329static void
2330mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2331{
5ba3f43e 2332 struct socket *mp_so = mptetoso(mpte);
39236c6e
A
2333 struct mbuf *m = NULL;
2334 struct socket *so;
5ba3f43e 2335 int error, wakeup = 0;
39236c6e 2336
5ba3f43e
A
2337 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2338 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
39236c6e 2339
39037602 2340 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
39236c6e
A
2341 struct mptsub *, mpts);
2342
2343 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
5ba3f43e 2344 goto out;
39236c6e
A
2345
2346 so = mpts->mpts_socket;
2347
2348 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2349 if (error != 0 && error != EWOULDBLOCK) {
5ba3f43e 2350 mptcplog((LOG_ERR, "%s: cid %d error %d\n",
3e170ce0
A
2351 __func__, mpts->mpts_connid, error),
2352 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
5ba3f43e
A
2353 if (error == ENODATA) {
2354 /*
2355 * Don't ignore ENODATA so as to discover
2356 * nasty middleboxes.
2357 */
2358 mp_so->so_error = ENODATA;
2359
2360 wakeup = 1;
2361 goto out;
39236c6e 2362 }
39236c6e 2363 } else if (error == 0) {
5ba3f43e 2364 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
3e170ce0 2365 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
2366 }
2367
 2368	/* In fallback, accept data only on the active subflow; drop it on the others */
5ba3f43e
A
2369 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2370 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2371 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2372 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2373 m_freem(m);
5ba3f43e 2374 goto out;
39236c6e
A
2375 }
2376
2377 if (m != NULL) {
5ba3f43e
A
2378 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2379 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
3e170ce0 2380
5ba3f43e
A
2381 mpte->mpte_used_cell = 1;
2382 } else {
2383 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2384
2385 mpte->mpte_used_wifi = 1;
2386 }
3e170ce0 2387
39236c6e 2388 mptcp_input(mpte, m);
39236c6e 2389 }
5ba3f43e
A
2390
2391 /* notify protocol that we drained all the data */
2392 if (error == 0 && m != NULL &&
2393 (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
2394 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
2395
2396out:
2397 if (wakeup)
2398 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2399
2400 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
39236c6e
A
2401}
2402
2403/*
2404 * Subflow socket write upcall.
2405 *
5ba3f43e 2406 * Called when the associated subflow socket posted a read event.
39236c6e
A
2407 */
2408static void
2409mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2410{
2411#pragma unused(so, waitf)
2412 struct mptsub *mpts = arg;
2413 struct mptses *mpte = mpts->mpts_mpte;
2414
5ba3f43e
A
2415 VERIFY(mpte != NULL);
2416
2417 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2418 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
2419 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
fe8ab488 2420 return;
5ba3f43e 2421 }
39236c6e 2422
5ba3f43e 2423 mptcp_output(mpte);
39236c6e
A
2424}
2425
a39ff7e2
A
2426static boolean_t
2427mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2428{
2429 struct mbuf *so_m = so->so_snd.sb_mb;
2430 uint64_t dsn = m->m_pkthdr.mp_dsn;
2431
2432 while (so_m) {
2433 VERIFY(so_m->m_flags & M_PKTHDR);
2434 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2435
2436 /* Part of the segment is covered, don't reinject here */
2437 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2438 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn)
2439 return TRUE;
2440
2441 so_m = so_m->m_next;
2442 }
2443
2444 return FALSE;
2445}
2446
39236c6e
A
2447/*
2448 * Subflow socket output.
2449 *
2450 * Called for sending data from MPTCP to the underlying subflow socket.
2451 */
2452int
5ba3f43e 2453mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
39236c6e 2454{
39236c6e 2455 struct mptcb *mp_tp = mpte->mpte_mptcb;
5ba3f43e
A
2456 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2457 struct socket *mp_so, *so;
2458 struct tcpcb *tp;
2459 uint64_t mpt_dsn = 0, off = 0;
2460 int sb_cc = 0, error = 0, wakeup = 0;
2461 uint32_t dss_csum;
2462 uint16_t tot_sent = 0;
2463 boolean_t reinjected = FALSE;
2464
2465 mpte_lock_assert_held(mpte);
2466
2467 mp_so = mptetoso(mpte);
39236c6e 2468 so = mpts->mpts_socket;
5ba3f43e 2469 tp = sototcpcb(so);
39236c6e 2470
5ba3f43e
A
2471 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2472 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
39236c6e 2473
5ba3f43e
A
2474 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2475 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2476 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2477 (mpts->mpts_flags & MPTSF_TFO_REQD));
2478 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
39236c6e 2479
5ba3f43e
A
2480 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2481 __func__, mpts->mpts_flags, mpte->mpte_flags,
2482 mptcp_subflow_cwnd_space(so)),
2483 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2484 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2485 struct mptsub *, mpts);
39236c6e
A
2486
2487 /* Remove Addr Option is not sent reliably as per I-D */
2488 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
39236c6e 2489 tp->t_rem_aid = mpte->mpte_lost_aid;
5ba3f43e 2490 tp->t_mpflags |= TMPF_SND_REM_ADDR;
39236c6e
A
2491 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2492 }
2493
2494 /*
2495 * The mbuf chains containing the metadata (as well as pointing to
2496 * the user data sitting at the MPTCP output queue) would then be
2497 * sent down to the subflow socket.
2498 *
2499 * Some notes on data sequencing:
2500 *
 2501	 *   a. Each mbuf must be an M_PKTHDR mbuf.
2502 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2503 * in the mbuf pkthdr structure.
2504 * c. Each mbuf containing the MPTCP metadata must have its
2505 * pkt_flags marked with the PKTF_MPTCP flag.
2506 */
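/*
 * Minimal sketch (annotation, not in the original source) of what
 * (a)-(c) above amount to, using the same pkthdr fields this routine
 * fills in further below; dsn, rel_seq and len stand for the values
 * computed later in this function:
 *
 *	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;	(c)
 *	m->m_pkthdr.mp_dsn  = dsn;		64-bit data sequence number
 *	m->m_pkthdr.mp_rseq = rel_seq;		subflow-relative sequence number
 *	m->m_pkthdr.mp_rlen = len;		data-level length of the mapping
 */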
2507
5ba3f43e
A
2508 if (mpte->mpte_reinjectq)
2509 sb_mb = mpte->mpte_reinjectq;
2510 else
2511 sb_mb = mp_so->so_snd.sb_mb;
2512
39236c6e 2513 if (sb_mb == NULL) {
a39ff7e2
A
2514 mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
2515 __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
2516 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
5ba3f43e 2517 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
a39ff7e2
A
2518
2519 /* Fix it to prevent looping */
2520 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
2521 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
39236c6e
A
2522 goto out;
2523 }
2524
2525 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
2526
5ba3f43e
A
2527 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
2528 !(so->so_state & SS_ISCONNECTED) &&
2529 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2530 tp->t_mpflags |= TMPF_TFO_REQUEST;
2531 goto zero_len_write;
39236c6e
A
2532 }
2533
5ba3f43e
A
2534 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2535
2536 /* First, drop acknowledged data */
39236c6e 2537 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
5ba3f43e
A
2538 mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
2539 "dsn %u suna %u reinject? %u\n",
2540 __func__, (uint32_t)mpt_dsn,
2541 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
2542 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2543 if (mpte->mpte_reinjectq) {
2544 mptcp_clean_reinjectq(mpte);
2545 } else {
2546 uint64_t len = 0;
2547 len = mp_tp->mpt_snduna - mpt_dsn;
2548 sbdrop(&mp_so->so_snd, (int)len);
2549 wakeup = 1;
2550 }
2551 }
2552
2553 /* Check again because of above sbdrop */
2554 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
2555 mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
2556 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2557 goto out;
39236c6e
A
2558 }
2559
2560 /*
2561 * In degraded mode, we don't receive data acks, so force free
2562 * mbufs less than snd_nxt
2563 */
39236c6e 2564 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 2565 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
5ba3f43e
A
2566 mp_so->so_snd.sb_mb) {
2567 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
2568 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2569 uint64_t len = 0;
2570 len = mp_tp->mpt_snduna - mpt_dsn;
2571 sbdrop(&mp_so->so_snd, (int)len);
2572 wakeup = 1;
2573
2574 mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
2575 __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
2576 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2577 }
39236c6e
A
2578 }
2579
fe8ab488
A
2580 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2581 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
2582 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
2583 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
39236c6e
A
2584 }
2585
2586 /*
2587 * Adjust the top level notion of next byte used for retransmissions
2588 * and sending FINs.
2589 */
5ba3f43e 2590 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
39236c6e 2591 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
39236c6e
A
2592
2593 /* Now determine the offset from which to start transmitting data */
5ba3f43e
A
2594 if (mpte->mpte_reinjectq)
2595 sb_mb = mpte->mpte_reinjectq;
2596 else
a39ff7e2 2597dont_reinject:
5ba3f43e 2598 sb_mb = mp_so->so_snd.sb_mb;
39236c6e 2599 if (sb_mb == NULL) {
5ba3f43e
A
2600 mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
2601 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2602 goto out;
2603 }
5ba3f43e 2604
a39ff7e2 2605 if (sb_mb == mpte->mpte_reinjectq) {
5ba3f43e 2606 sb_cc = sb_mb->m_pkthdr.mp_rlen;
a39ff7e2
A
2607 off = 0;
2608
2609 if (mptcp_search_seq_in_sub(sb_mb, so)) {
2610 if (mptcp_can_send_more(mp_tp, TRUE)) {
2611 goto dont_reinject;
2612 }
2613
2614 error = ECANCELED;
2615 goto out;
2616 }
2617
2618 reinjected = TRUE;
5ba3f43e
A
2619 } else if (flags & MPTCP_SUBOUT_PROBING) {
2620 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2621 off = 0;
39236c6e 2622 } else {
5ba3f43e
A
2623 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
2624
2625 /*
 2626		 * With TFO, there might be no data at all, so we may still end up
 2627		 * in this code path.
2628 */
2629 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
2630 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
2631 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
2632 sb_cc -= off;
2633 } else {
2634 mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
2635 __func__, (uint32_t)mp_tp->mpt_sndnxt,
2636 (uint32_t)mp_tp->mpt_sndmax),
2637 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2638
2639 goto out;
2640 }
39236c6e 2641 }
39236c6e 2642
5ba3f43e
A
2643 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
2644 if (sb_cc <= 0) {
2645 mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
2646 __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
2647 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
2648 mptcp_subflow_cwnd_space(so)),
2649 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2650 }
2651
2652 sb_cc = min(sb_cc, UINT16_MAX);
2653
2654 /*
2655 * Create a DSN mapping for the data we are about to send. It all
2656 * has the same mapping.
2657 */
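/*
 * Worked example (annotation, not in the original source): in the
 * non-reinject case below, with mpt_snduna == 1000 and off == 200,
 * mpt_dsn becomes 1200; every mbuf copied into this chain then gets
 * mp_dsn == 1200 and, once the loop after the copy runs, mp_rlen ==
 * tot_sent -- one mapping shared by the whole burst, which is what
 * "it all has the same mapping" means.
 */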
a39ff7e2 2658 if (reinjected)
5ba3f43e
A
2659 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2660 else
2661 mpt_dsn = mp_tp->mpt_snduna + off;
39236c6e 2662
5ba3f43e 2663 mpt_mbuf = sb_mb;
a39ff7e2 2664 while (mpt_mbuf && reinjected == FALSE &&
5ba3f43e
A
2665 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
2666 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
39236c6e
A
2667 off -= mpt_mbuf->m_pkthdr.mp_rlen;
2668 mpt_mbuf = mpt_mbuf->m_next;
39236c6e 2669 }
3e170ce0 2670 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
5ba3f43e
A
2671 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
2672 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3e170ce0 2673 mpts->mpts_probecnt),
5ba3f43e 2674 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2675
ecc0ceb4 2676 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 2677
fe8ab488
A
2678 head = tail = NULL;
2679
39236c6e 2680 while (tot_sent < sb_cc) {
5ba3f43e 2681 ssize_t mlen;
39236c6e 2682
5ba3f43e 2683 mlen = mpt_mbuf->m_len;
39236c6e 2684 mlen -= off;
5ba3f43e 2685 mlen = min(mlen, sb_cc - tot_sent);
39236c6e 2686
5ba3f43e
A
2687 if (mlen < 0) {
2688 mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
2689 __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
2690 (uint32_t)off, sb_cc, tot_sent),
2691 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2692 goto out;
39236c6e
A
2693 }
2694
5ba3f43e
A
2695 if (mlen == 0)
2696 goto next;
2697
fe8ab488
A
2698 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
2699 M_COPYM_MUST_COPY_HDR);
39236c6e 2700 if (m == NULL) {
5ba3f43e
A
2701 mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
2702 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2703 error = ENOBUFS;
2704 break;
2705 }
2706
2707 /* Create a DSN mapping for the data (m_copym does it) */
fe8ab488 2708 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e
A
2709 VERIFY(m->m_next == NULL);
2710
39236c6e
A
2711 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2712 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
5ba3f43e 2713 m->m_pkthdr.mp_dsn = mpt_dsn;
39236c6e 2714 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
39236c6e
A
2715 m->m_pkthdr.len = mlen;
2716
fe8ab488
A
2717 if (head == NULL) {
2718 head = tail = m;
2719 } else {
2720 tail->m_next = m;
2721 tail = m;
2722 }
2723
fe8ab488
A
2724 tot_sent += mlen;
2725 off = 0;
5ba3f43e 2726next:
fe8ab488
A
2727 mpt_mbuf = mpt_mbuf->m_next;
2728 }
2729
a39ff7e2 2730 if (reinjected) {
5ba3f43e
A
2731 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
2732 struct mbuf *n = sb_mb;
2733
2734 while (n) {
2735 n->m_pkthdr.mp_dsn += sb_cc;
2736 n->m_pkthdr.mp_rlen -= sb_cc;
2737 n = n->m_next;
2738 }
2739 m_adj(sb_mb, sb_cc);
2740 } else {
2741 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
2742 m_freem(sb_mb);
2743 }
2744 }
2745
2746 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
2747 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
2748 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2749
2750 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
2751 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
2752 tot_sent);
2753 }
2754
2755 /* Now, let's update rel-seq and the data-level length */
2756 mpts->mpts_rel_seq += tot_sent;
2757 m = head;
2758 while (m) {
2759 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
2760 m->m_pkthdr.mp_csum = dss_csum;
2761 m->m_pkthdr.mp_rlen = tot_sent;
2762 m = m->m_next;
2763 }
2764
2765 if (head != NULL) {
490019cf 2766 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
5ba3f43e 2767 (tp->t_tfo_stats == 0))
39037602 2768 tp->t_mpflags |= TMPF_TFO_REQUEST;
fe8ab488
A
2769
2770 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2771
5ba3f43e 2772 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
39236c6e
A
2773 struct sockbuf *, &so->so_rcv,
2774 struct sockbuf *, &so->so_snd,
2775 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488
A
2776 size_t, tot_sent);
2777 }
2778
5ba3f43e
A
2779done_sending:
2780 if (error == 0 ||
2781 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
2782 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3e170ce0
A
2783
2784 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2785 tcpstat.tcps_mp_num_probes++;
5ba3f43e 2786 if ((uint32_t)tot_sent < mpts->mpts_maxseg)
3e170ce0
A
2787 mpts->mpts_probecnt += 1;
2788 else
2789 mpts->mpts_probecnt +=
2790 tot_sent/mpts->mpts_maxseg;
2791 }
2792
5ba3f43e
A
2793 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
2794 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
39236c6e
A
2795 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2796 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
5ba3f43e 2797 mp_tp->mpt_sndnxt = new_sndnxt;
39236c6e 2798 }
fe8ab488 2799
5ba3f43e 2800 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
490019cf 2801
5ba3f43e
A
2802 /* Must be here as mptcp_can_send_more() checks for this */
2803 soclearfastopen(mp_so);
39236c6e 2804
3e170ce0
A
2805 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2806 (mpts->mpts_probesoon != 0))
5ba3f43e
A
2807 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
2808 __func__, mpts->mpts_connid,
2809 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
2810 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3e170ce0 2811 (tcp_now - mpts->mpts_probesoon)),
5ba3f43e
A
2812 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2813
2814 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2815 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
2816
2817 mpte->mpte_used_cell = 1;
2818 } else {
2819 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2820
2821 mpte->mpte_used_wifi = 1;
2822 }
2823
2824 /*
2825 * Don't propagate EWOULDBLOCK - it's already taken care of
2826 * in mptcp_usr_send for TFO.
2827 */
2828 error = 0;
fe8ab488 2829 } else {
5ba3f43e
A
2830 mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
2831 __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
3e170ce0 2832 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
2833 }
2834out:
5ba3f43e 2835
39037602 2836 if (wakeup)
5ba3f43e 2837 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
39037602 2838
5ba3f43e 2839 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
39236c6e 2840 return (error);
5ba3f43e
A
2841
2842zero_len_write:
2843 /* Opting to call pru_send as no mbuf at subflow level */
2844 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
2845 NULL, current_proc());
2846
2847 goto done_sending;
39236c6e
A
2848}
2849
39236c6e 2850static void
5ba3f43e 2851mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
39236c6e 2852{
5ba3f43e 2853 struct mbuf *n, *prev = NULL;
39236c6e 2854
5ba3f43e
A
2855 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
2856 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2857 m->m_pkthdr.mp_rseq),
2858 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2859
2860 n = mpte->mpte_reinjectq;
2861
 2862	/* First, look for an mbuf n whose data sequence number is greater
 2863	 * than or equal to m's sequence number.
2864 */
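/*
 * Worked example (annotation, not in the original source): with a
 * queue holding mappings {dsn 1000, len 500} and {dsn 2000, len 500},
 * inserting m = {dsn 1500, len 100} stops the scan at the second
 * entry (2000 >= 1500); neither neighbour fully covers m, so m is
 * linked in between, giving 1000 -> 1500 -> 2000. Only complete
 * overlap is eliminated here; partially overlapping mappings stay.
 */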
2865 while (n) {
2866 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
2867 break;
2868
2869 prev = n;
2870
2871 n = n->m_nextpkt;
2872 }
2873
2874 if (n) {
2875 /* m is already fully covered by the next mbuf in the queue */
2876 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
2877 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
2878 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
2879 __func__, n->m_pkthdr.mp_rlen),
2880 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2881 goto dont_queue;
2882 }
2883
2884 /* m is covering the next mbuf entirely, thus we remove this guy */
2885 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
2886 struct mbuf *tmp = n->m_nextpkt;
2887
2888 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
2889 __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2890 n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
2891 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2892
2893 m->m_nextpkt = NULL;
2894 if (prev == NULL)
2895 mpte->mpte_reinjectq = tmp;
2896 else
2897 prev->m_nextpkt = tmp;
2898
2899 m_freem(n);
2900 n = tmp;
2901 }
2902
2903 }
2904
2905 if (prev) {
2906 /* m is already fully covered by the previous mbuf in the queue */
2907 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
2908 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
2909 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
2910 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2911 goto dont_queue;
2912 }
2913 }
2914
2915 if (prev == NULL)
2916 mpte->mpte_reinjectq = m;
2917 else
2918 prev->m_nextpkt = m;
39236c6e 2919
5ba3f43e
A
2920 m->m_nextpkt = n;
2921
2922 return;
2923
2924dont_queue:
2925 m_freem(m);
2926 return;
39236c6e
A
2927}
2928
5ba3f43e
A
2929static struct mbuf *
2930mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
39236c6e 2931{
5ba3f43e
A
2932 struct socket *mp_so = mptetoso(mpte);
2933 struct mbuf *m;
39236c6e 2934
5ba3f43e 2935 m = mp_so->so_snd.sb_mb;
39236c6e 2936
5ba3f43e
A
2937 while (m) {
2938 /* If this segment covers what we are looking for, return it. */
2939 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
2940 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
2941 break;
2942
2943
 2944		/* Segment is no longer in the queue */
2945 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
2946 return NULL;
2947
2948 m = m->m_next;
39236c6e
A
2949 }
2950
5ba3f43e
A
2951 return m;
2952}
fe8ab488 2953
5ba3f43e
A
2954static struct mbuf *
2955mptcp_copy_mbuf_list(struct mbuf *m, int len)
2956{
2957 struct mbuf *top = NULL, *tail = NULL;
2958 uint64_t dsn;
2959 uint32_t dlen, rseq;
39236c6e 2960
5ba3f43e
A
2961 dsn = m->m_pkthdr.mp_dsn;
2962 dlen = m->m_pkthdr.mp_rlen;
2963 rseq = m->m_pkthdr.mp_rseq;
3e170ce0 2964
5ba3f43e
A
2965 while (len > 0) {
2966 struct mbuf *n;
2967
2968 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
2969
2970 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
2971 if (n == NULL) {
2972 mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
2973 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2974 goto err;
3e170ce0 2975 }
fe8ab488 2976
5ba3f43e
A
2977 VERIFY(n->m_flags & M_PKTHDR);
2978 VERIFY(n->m_next == NULL);
2979 VERIFY(n->m_pkthdr.mp_dsn == dsn);
2980 VERIFY(n->m_pkthdr.mp_rlen == dlen);
2981 VERIFY(n->m_pkthdr.mp_rseq == rseq);
2982 VERIFY(n->m_len == m->m_len);
2983
2984 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
2985
2986 if (top == NULL)
2987 top = n;
2988
2989 if (tail != NULL)
2990 tail->m_next = n;
2991
2992 tail = n;
2993
2994 len -= m->m_len;
2995 m = m->m_next;
39236c6e
A
2996 }
2997
5ba3f43e
A
2998 return top;
2999
3000err:
3001 if (top)
3002 m_freem(top);
3003
3004 return NULL;
39236c6e
A
3005}
3006
5ba3f43e
A
3007static void
3008mptcp_reinject_mbufs(struct socket *so)
39236c6e 3009{
5ba3f43e
A
3010 struct tcpcb *tp = sototcpcb(so);
3011 struct mptsub *mpts = tp->t_mpsub;
3012 struct mptcb *mp_tp = tptomptp(tp);
 3013	struct mptses *mpte = mp_tp->mpt_mpte;
3014 struct sockbuf *sb = &so->so_snd;
3015 struct mbuf *m;
39236c6e 3016
5ba3f43e
A
3017 m = sb->sb_mb;
3018 while (m) {
3019 struct mbuf *n = m->m_next, *orig = m;
39236c6e 3020
5ba3f43e
A
3021 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3022 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3023 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3024 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3025
5ba3f43e 3026 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 3027
5ba3f43e
A
3028 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
3029 goto next;
39236c6e 3030
5ba3f43e
A
3031 /* Has it all already been acknowledged at the data-level? */
3032 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
3033 goto next;
3034
3035 /* Part of this has already been acknowledged - lookup in the
3036 * MPTCP-socket for the segment.
3037 */
3038 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3039 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3040 if (m == NULL)
3041 goto next;
3042 }
3043
3044 /* Copy the mbuf with headers (aka, DSN-numbers) */
3045 m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
3046 if (m == NULL)
3047 break;
3048
3049 VERIFY(m->m_nextpkt == NULL);
3050
3051 /* Now, add to the reinject-queue, eliminating overlapping
3052 * segments
3053 */
3054 mptcp_add_reinjectq(mpte, m);
3055
3056 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3057
3058next:
3059 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3060 while (n) {
3061 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3062
3063 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
3064 break;
3065
3066 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3067 n = n->m_next;
3068 }
3069
3070 m = n;
39236c6e 3071 }
5ba3f43e 3072}
39236c6e 3073
5ba3f43e
A
3074void
3075mptcp_clean_reinjectq(struct mptses *mpte)
3076{
3077 struct mptcb *mp_tp = mpte->mpte_mptcb;
3078
3079 mpte_lock_assert_held(mpte);
3080
3081 while (mpte->mpte_reinjectq) {
3082 struct mbuf *m = mpte->mpte_reinjectq;
3083
3084 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
a39ff7e2 3085 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
5ba3f43e
A
3086 break;
3087
3088 mpte->mpte_reinjectq = m->m_nextpkt;
3089 m->m_nextpkt = NULL;
3090 m_freem(m);
3091 }
39236c6e
A
3092}
3093
3094/*
5ba3f43e 3095 * Subflow socket control event upcall.
39236c6e 3096 */
5ba3f43e
A
3097static void
3098mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
39236c6e 3099{
5ba3f43e
A
3100#pragma unused(so)
3101 struct mptsub *mpts = arg;
3102 struct mptses *mpte = mpts->mpts_mpte;
39236c6e 3103
5ba3f43e
A
3104 VERIFY(mpte != NULL);
3105 mpte_lock_assert_held(mpte);
39236c6e 3106
5ba3f43e
A
3107 if ((mpts->mpts_evctl & events) == events)
3108 return;
39236c6e 3109
5ba3f43e
A
3110 mpts->mpts_evctl |= events;
3111
3112 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3113 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3114 return;
39037602 3115 }
39236c6e 3116
5ba3f43e 3117 mptcp_subflow_workloop(mpte);
39236c6e
A
3118}
3119
3120/*
5ba3f43e
A
3121 * Subflow socket control events.
3122 *
3123 * Called for handling events related to the underlying subflow socket.
39236c6e
A
3124 */
3125static ev_ret_t
5ba3f43e 3126mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3e170ce0 3127 uint64_t *p_mpsofilt_hint)
39236c6e 3128{
5ba3f43e
A
3129 ev_ret_t ret = MPTS_EVRET_OK;
3130 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3131 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e 3132
5ba3f43e 3133 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3134
5ba3f43e
A
3135 /* bail if there's nothing to process */
3136 if (!mpts->mpts_evctl)
3137 return (ret);
39236c6e 3138
5ba3f43e
A
3139 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
3140 SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
3141 SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
3142 SO_FILT_HINT_DISCONNECTED)) {
3143 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3144 }
3e170ce0 3145
5ba3f43e
A
3146 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3147 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3148
3149 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3150 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3151 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3152
3153 /*
3154 * Process all the socket filter hints and reset the hint
3155 * once it is handled
3156 */
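/*
 * Annotation, not part of the original source: mpsub_ev_entry_tbl is
 * defined elsewhere in this file. Judging only from the two members
 * dereferenced below and the handler signatures in this file, each
 * entry presumably pairs an event mask with its handler, roughly:
 *
 *	struct mptsub_ev_entry {
 *		uint64_t sofilt_hint_mask;
 *		ev_ret_t (*sofilt_hint_ev_hdlr)(struct mptses *mpte,
 *		    struct mptsub *mpts, uint64_t *p_mpsofilt_hint,
 *		    uint64_t event);
 *	};
 *
 * The struct name and member types are assumptions; only the member
 * names appear in the loop below.
 */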
3157 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3158 /*
 3159		 * Always execute the DISCONNECTED event, because it will wake up
3160 * the app.
3161 */
3162 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3163 (ret >= MPTS_EVRET_OK ||
3164 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3165 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3166 ev_ret_t error =
3167 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3168 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3169 }
3170 }
3171
3172 /*
3173 * We should be getting only events specified via sock_catchevents(),
3174 * so loudly complain if we have any unprocessed one(s).
3175 */
3176 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
3177 mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
3178 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3179 mpts->mpts_connid,
3180 mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3181 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3182 else
3183 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3184 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3185 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3186
3187 return (ret);
39236c6e
A
3188}
3189
39236c6e 3190static ev_ret_t
5ba3f43e
A
3191mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3192 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e
A
3193{
3194 struct socket *mp_so, *so;
3195 struct mptcb *mp_tp;
39236c6e 3196
5ba3f43e 3197 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3198 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3199 mp_so = mptetoso(mpte);
39236c6e
A
3200 mp_tp = mpte->mpte_mptcb;
3201 so = mpts->mpts_socket;
3202
5ba3f43e
A
3203 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3204 mpts->mpts_connid, event),
3e170ce0 3205 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3206
39236c6e 3207 /*
5ba3f43e
A
3208 * We got an event for this subflow that might need to be propagated,
3209 * based on the state of the MPTCP connection.
39236c6e 3210 */
5ba3f43e
A
3211 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3212 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3213 mp_so->so_error = so->so_error;
3214 *p_mpsofilt_hint |= event;
39236c6e 3215 }
39236c6e 3216
5ba3f43e 3217 return (MPTS_EVRET_OK);
39236c6e
A
3218}
3219
3220/*
3221 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3222 */
3223static ev_ret_t
3e170ce0 3224mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3225 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3226{
5ba3f43e
A
3227#pragma unused(p_mpsofilt_hint, event)
3228 struct socket *mp_so;
3229 struct tcpcb *tp;
39236c6e 3230
5ba3f43e 3231 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e
A
3232
3233 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e
A
3234 mp_so = mptetoso(mpte);
3235 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
39236c6e 3236
39236c6e
A
3237 /*
3238 * This overwrites any previous mpte_lost_aid to avoid storing
3239 * too much state when the typical case has only two subflows.
3240 */
3241 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3242 mpte->mpte_lost_aid = tp->t_local_aid;
3243
5ba3f43e
A
3244 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3245 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3246
3247 /*
3248 * The subflow connection has lost its source address.
39236c6e 3249 */
5ba3f43e 3250 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
39236c6e 3251
5ba3f43e
A
3252 if (mp_so->so_flags & SOF_NOADDRAVAIL)
3253 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
39236c6e 3254
5ba3f43e 3255 return (MPTS_EVRET_DELETE);
39236c6e
A
3256}
3257
fe8ab488
A
3258/*
3259 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3260 * indicates that the remote side sent a Data FIN
3261 */
3262static ev_ret_t
3e170ce0 3263mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3264 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3265{
5ba3f43e 3266#pragma unused(event)
fe8ab488
A
3267 struct mptcb *mp_tp;
3268
5ba3f43e 3269 mpte_lock_assert_held(mpte); /* same as MP socket lock */
fe8ab488
A
3270 mp_tp = mpte->mpte_mptcb;
3271
5ba3f43e 3272 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3e170ce0 3273 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39037602 3274
fe8ab488 3275 /*
39037602 3276 * We got a Data FIN for the MPTCP connection.
fe8ab488
A
3277 * The FIN may arrive with data. The data is handed up to the
3278 * mptcp socket and the user is notified so that it may close
3279 * the socket if needed.
3280 */
39037602 3281 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
5ba3f43e 3282 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
39037602 3283
fe8ab488
A
3284 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3285}
3286
39236c6e
A
3287/*
3288 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3289 */
3290static ev_ret_t
3e170ce0 3291mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3292 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3293{
5ba3f43e 3294#pragma unused(event, p_mpsofilt_hint)
39236c6e 3295 struct mptsub *mpts_alt = NULL;
5ba3f43e 3296 struct socket *alt_so = NULL;
39236c6e
A
3297 struct socket *mp_so;
3298 int altpath_exists = 0;
3299
5ba3f43e
A
3300 mpte_lock_assert_held(mpte);
3301 mp_so = mptetoso(mpte);
3302 mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
3303 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
3304 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3305
5ba3f43e 3306 mptcp_reinject_mbufs(mpts->mpts_socket);
39236c6e 3307
5ba3f43e 3308 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
39236c6e
A
3309 /*
3310 * If there is no alternate eligible subflow, ignore the
3311 * failover hint.
3312 */
3313 if (mpts_alt == NULL) {
5ba3f43e
A
3314 mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
3315 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3316
39236c6e
A
3317 goto done;
3318 }
5ba3f43e 3319
39236c6e 3320 altpath_exists = 1;
5ba3f43e 3321 alt_so = mpts_alt->mpts_socket;
39236c6e 3322 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
fe8ab488 3323 /* All data acknowledged and no RTT spike */
5ba3f43e 3324 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
39236c6e
A
3325 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3326 } else {
3327 /* no alternate path available */
3328 altpath_exists = 0;
3329 }
39236c6e 3330 }
39236c6e
A
3331
3332 if (altpath_exists) {
5ba3f43e 3333 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
39236c6e 3334
5ba3f43e 3335 mpte->mpte_active_sub = mpts_alt;
39236c6e
A
3336 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3337 mpts->mpts_flags &= ~MPTSF_ACTIVE;
5ba3f43e
A
3338
3339 mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
3340 __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
3341 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3342
3343 mptcpstats_inc_switch(mpte, mpts);
3344
3345 sowwakeup(alt_so);
39236c6e 3346 } else {
5ba3f43e
A
3347 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3348 mpts->mpts_connid),
3349 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3350done:
5ba3f43e 3351 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
39236c6e 3352 }
5ba3f43e 3353
39236c6e
A
3354 return (MPTS_EVRET_OK);
3355}
3356
3357/*
3358 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3359 */
3360static ev_ret_t
3e170ce0 3361mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3362 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3363{
5ba3f43e 3364 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3365 VERIFY(mpte->mpte_mppcb != NULL);
39236c6e 3366
5ba3f43e
A
3367 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3368 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3369
39236c6e 3370 /*
5ba3f43e
A
3371 * The subflow connection cannot use the outgoing interface, let's
3372 * close this subflow.
39236c6e 3373 */
5ba3f43e 3374 mptcp_subflow_abort(mpts, EPERM);
39236c6e 3375
5ba3f43e 3376 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
39236c6e 3377
5ba3f43e 3378 return (MPTS_EVRET_DELETE);
39236c6e
A
3379}
3380
a39ff7e2
A
3381/*
3382 * https://tools.ietf.org/html/rfc6052#section-2
3383 * https://tools.ietf.org/html/rfc6147#section-5.2
3384 */
3385static boolean_t
3386mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3387 const struct ipv6_prefix *prefix,
3388 struct in_addr *addrv4)
3389{
3390 char buf[MAX_IPv4_STR_LEN];
3391 char *ptrv4 = (char *)addrv4;
3392 const char *ptr = (const char *)addr;
3393
3394 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0)
3395 return false;
3396
3397 switch (prefix->prefix_len) {
3398 case NAT64_PREFIX_LEN_96:
3399 memcpy(ptrv4, ptr + 12, 4);
3400 break;
3401 case NAT64_PREFIX_LEN_64:
3402 memcpy(ptrv4, ptr + 9, 4);
3403 break;
3404 case NAT64_PREFIX_LEN_56:
3405 memcpy(ptrv4, ptr + 7, 1);
3406 memcpy(ptrv4 + 1, ptr + 9, 3);
3407 break;
3408 case NAT64_PREFIX_LEN_48:
3409 memcpy(ptrv4, ptr + 6, 2);
3410 memcpy(ptrv4 + 2, ptr + 9, 2);
3411 break;
3412 case NAT64_PREFIX_LEN_40:
3413 memcpy(ptrv4, ptr + 5, 3);
3414 memcpy(ptrv4 + 3, ptr + 9, 1);
3415 break;
3416 case NAT64_PREFIX_LEN_32:
3417 memcpy(ptrv4, ptr + 4, 4);
3418 break;
3419 default:
3420 panic("NAT64-prefix len is wrong: %u\n",
3421 prefix->prefix_len);
3422 }
3423
3424 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3425 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3426
3427 return true;
3428}
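/*
 * Worked example (annotation, not part of the original source), per
 * RFC 6052: with the well-known 96-bit prefix 64:ff9b::/96, the IPv4
 * address occupies the last four bytes of the IPv6 address, i.e. the
 * memcpy(ptrv4, ptr + 12, 4) case above. So 64:ff9b::c000:221
 * (64:ff9b::192.0.2.33) desynthesizes back to 192.0.2.33.
 */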
3429
3430static void
3431mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3432{
3433 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3434 struct socket *so = mpts->mpts_socket;
3435 struct ifnet *ifp;
3436 int j;
3437
3438 ifp = sotoinpcb(so)->inp_last_outifp;
3439
3440 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3441 mptcp_ask_for_nat64(ifp);
3442 return;
3443 }
3444
3445
3446 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3447 int success;
3448
3449 if (nat64prefixes[j].prefix_len == 0)
3450 continue;
3451
3452 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3453 &nat64prefixes[j],
3454 &mpte->mpte_dst_v4_nat64.sin_addr);
3455 if (success) {
3456 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3457 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3458 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3459 break;
3460 }
3461 }
3462}
3463
39236c6e
A
3464/*
3465 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3466 */
3467static ev_ret_t
3e170ce0 3468mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3469 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3470{
5ba3f43e 3471#pragma unused(event, p_mpsofilt_hint)
39236c6e 3472 struct socket *mp_so, *so;
5ba3f43e
A
3473 struct inpcb *inp;
3474 struct tcpcb *tp;
39236c6e 3475 struct mptcb *mp_tp;
5ba3f43e 3476 int af;
39236c6e
A
3477 boolean_t mpok = FALSE;
3478
5ba3f43e 3479 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3480 VERIFY(mpte->mpte_mppcb != NULL);
39236c6e 3481
5ba3f43e
A
3482 mp_so = mptetoso(mpte);
3483 mp_tp = mpte->mpte_mptcb;
39236c6e 3484 so = mpts->mpts_socket;
5ba3f43e
A
3485 tp = sototcpcb(so);
3486 af = mpts->mpts_dst.sa_family;
39236c6e
A
3487
3488 if (mpts->mpts_flags & MPTSF_CONNECTED)
3489 return (MPTS_EVRET_OK);
3490
3491 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
3492 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
fe8ab488
A
3493 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
3494 (so->so_state & SS_ISCONNECTED)) {
5ba3f43e 3495 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
3e170ce0
A
3496 __func__, mpts->mpts_connid),
3497 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
3498 (void) soshutdownlock(so, SHUT_RD);
3499 (void) soshutdownlock(so, SHUT_WR);
3500 (void) sodisconnectlocked(so);
3501 }
39236c6e
A
3502 return (MPTS_EVRET_OK);
3503 }
3504
3505 /*
3506 * The subflow connection has been connected. Find out whether it
3507 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
3508 *
3509 * a. If MPTCP connection is not yet established, then this must be
3510 * the first subflow connection. If MPTCP failed to negotiate,
5ba3f43e 3511 * fall back to regular TCP by degrading this subflow.
39236c6e
A
3512 *
3513 * b. If MPTCP connection has been established, then this must be
3514 * one of the subsequent subflow connections. If MPTCP failed
5ba3f43e 3515 * to negotiate, disconnect the connection.
39236c6e
A
3516 *
3517 * Right now, we simply unblock any waiters at the MPTCP socket layer
3518 * if the MPTCP connection has not been established.
3519 */
39236c6e
A
3520
3521 if (so->so_state & SS_ISDISCONNECTED) {
3522 /*
3523 * With MPTCP joins, a connection is connected at the subflow
3524 * level, but the 4th ACK from the server elevates the MPTCP
490019cf
A
3525 * subflow to connected state. So there is a small window
3526 * where the subflow could get disconnected before the
39236c6e
A
3527 * connected event is processed.
3528 */
39236c6e
A
3529 return (MPTS_EVRET_OK);
3530 }
3531
5ba3f43e
A
3532 if (mpts->mpts_flags & MPTSF_TFO_REQD)
3533 mptcp_drop_tfo_data(mpte, mpts);
490019cf 3534
5ba3f43e
A
3535 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
3536 mpts->mpts_flags |= MPTSF_CONNECTED;
490019cf 3537
490019cf 3538 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
39236c6e
A
3539 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3540
490019cf
A
3541 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
3542
39236c6e 3543 /* get/verify the outbound interface */
5ba3f43e 3544 inp = sotoinpcb(so);
3e170ce0 3545
5ba3f43e 3546 mpts->mpts_maxseg = tp->t_maxseg;
3e170ce0 3547
5ba3f43e
A
3548 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
3549 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
3550 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
3e170ce0 3551 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
3552
3553 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
39236c6e 3554
39236c6e 3555 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
5ba3f43e
A
3556 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
3557 mpte->mpte_associd = mpts->mpts_connid;
3558 DTRACE_MPTCP2(state__change,
3559 struct mptcb *, mp_tp,
3560 uint32_t, 0 /* event */);
3561
3562 if (SOCK_DOM(so) == AF_INET) {
3563 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
3564 } else {
3565 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
3566 }
3567
a39ff7e2
A
3568 mpts->mpts_flags |= MPTSF_ACTIVE;
3569
39236c6e
A
3570 /* case (a) above */
3571 if (!mpok) {
5ba3f43e
A
3572 tcpstat.tcps_mpcap_fallback++;
3573
3574 tp->t_mpflags |= TMPF_INFIN_SENT;
3575 mptcp_notify_mpfail(so);
39236c6e 3576 } else {
5ba3f43e
A
3577 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3578 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3579 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
39037602
A
3580 } else {
3581 mpts->mpts_flags |= MPTSF_PREFERRED;
3582 }
39236c6e
A
3583 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3584 mpte->mpte_nummpcapflows++;
5ba3f43e 3585
a39ff7e2
A
3586 if (SOCK_DOM(so) == AF_INET6)
3587 mptcp_handle_ipv6_connection(mpte, mpts);
3588
5ba3f43e
A
3589 mptcp_check_subflows_and_add(mpte);
3590
3591 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3592 mpte->mpte_initial_cell = 1;
3593
3594 mpte->mpte_handshake_success = 1;
39236c6e 3595 }
5ba3f43e
A
3596
3597 mp_tp->mpt_sndwnd = tp->snd_wnd;
3598 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
3599 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
3600 soisconnected(mp_so);
3601
3602 mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
3603 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
3604 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3605 } else if (mpok) {
39236c6e
A
3606 /*
3607 * case (b) above
3608 * In case of additional flows, the MPTCP socket is not
 3609 * MPTSF_MP_CAPABLE until an ACK is received from the server
 3610 * for the 3-way handshake. TCP would have guaranteed that this
3611 * is an MPTCP subflow.
3612 */
5ba3f43e
A
3613 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3614 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
3615 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3616 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3617 mpts->mpts_flags &= ~MPTSF_PREFERRED;
3618 } else {
3619 mpts->mpts_flags |= MPTSF_PREFERRED;
3620 }
3621
39236c6e
A
3622 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3623 mpte->mpte_nummpcapflows++;
5ba3f43e
A
3624
3625 mpts->mpts_rel_seq = 1;
3626
3627 mptcp_check_subflows_and_remove(mpte);
fe8ab488 3628 } else {
5ba3f43e
A
3629 unsigned int i;
3630
a39ff7e2
A
3631 /* Should we try the alternate port? */
3632 if (mpte->mpte_alternate_port &&
3633 inp->inp_fport != mpte->mpte_alternate_port) {
3634 union sockaddr_in_4_6 dst;
3635 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
5ba3f43e 3636
a39ff7e2
A
3637 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3638
3639 dst_in->sin_port = mpte->mpte_alternate_port;
3640
3641 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
 3642 mpts->mpts_ifscope, NULL);
3643 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3644 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3645 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3646
3647 if (inp->inp_last_outifp->if_index == info->ifindex) {
3648 info->no_mptcp_support = 1;
3649 break;
3650 }
5ba3f43e
A
3651 }
3652 }
3653
3654 tcpstat.tcps_join_fallback++;
3655 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3656 tcpstat.tcps_mptcp_cell_proxy++;
3657 else
3658 tcpstat.tcps_mptcp_wifi_proxy++;
3659
3660 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3661
3662 return (MPTS_EVRET_OK);
39236c6e 3663 }
fe8ab488 3664
5ba3f43e
A
 3665 /* This call just "books" an entry in the stats-table for this ifindex */
3666 mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
3667
3668 mptcp_output(mpte);
39236c6e
A
3669
3670 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3671}
3672
3673/*
3674 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
3675 */
3676static ev_ret_t
3e170ce0 3677mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3678 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3679{
5ba3f43e 3680#pragma unused(event, p_mpsofilt_hint)
39236c6e
A
3681 struct socket *mp_so, *so;
3682 struct mptcb *mp_tp;
39236c6e 3683
5ba3f43e 3684 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3685 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3686 mp_so = mptetoso(mpte);
39236c6e
A
3687 mp_tp = mpte->mpte_mptcb;
3688 so = mpts->mpts_socket;
3689
5ba3f43e
A
3690 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
3691 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
3692 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
3693 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
3e170ce0 3694 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3695
3696 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
5ba3f43e 3697 return (MPTS_EVRET_DELETE);
39236c6e 3698
39236c6e
A
3699 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3700
5ba3f43e 3701 /* The subflow connection has been disconnected. */
39236c6e
A
3702
3703 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3704 mpte->mpte_nummpcapflows--;
fe8ab488
A
3705 if (mpte->mpte_active_sub == mpts) {
3706 mpte->mpte_active_sub = NULL;
5ba3f43e 3707 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
3e170ce0 3708 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3709 }
39236c6e
A
3710 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3711 }
3712
5ba3f43e
A
3713 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3714 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
3715 (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
3716 mptcp_drop(mpte, mp_tp, so->so_error);
39236c6e
A
3717 }
3718
39236c6e 3719 /*
5ba3f43e
A
3720 * Clear flags that are used by getconninfo to return state.
 3721 * Retain flags such as MPTSF_DELETEOK for internal purposes.
39236c6e 3722 */
5ba3f43e
A
3723 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3724 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3725 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);
3726
3727 return (MPTS_EVRET_DELETE);
39236c6e
A
3728}
3729
3730/*
3731 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3732 */
3733static ev_ret_t
3e170ce0 3734mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3735 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3736{
5ba3f43e 3737#pragma unused(event, p_mpsofilt_hint)
39236c6e
A
3738 struct socket *mp_so, *so;
3739 struct mptcb *mp_tp;
3e170ce0 3740 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e 3741
5ba3f43e 3742 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3743 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3744 mp_so = mptetoso(mpte);
39236c6e 3745 mp_tp = mpte->mpte_mptcb;
39236c6e
A
3746 so = mpts->mpts_socket;
3747
39236c6e
A
3748 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3749 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3750 else
3751 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3752
3753 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3754 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3755 goto done;
3756 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3757 }
3758 else
3759 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3760
3761 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3762 mpts->mpts_flags |= MPTSF_MP_READY;
3763 else
3764 mpts->mpts_flags &= ~MPTSF_MP_READY;
3765
3766 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3767 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3768 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3769 }
3770
3771 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3772 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3773 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3774 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3775 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3776 ret = MPTS_EVRET_CONNECT_PENDING;
3777 }
3778
5ba3f43e
A
3779 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
3780 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3781 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3782 mpts->mpts_flags, MPTSF_BITS),
3783 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3e170ce0 3784
39236c6e 3785done:
39236c6e
A
3786 return (ret);
3787}
3788
3789/*
3790 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3791 */
3792static ev_ret_t
3e170ce0 3793mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
5ba3f43e 3794 uint64_t *p_mpsofilt_hint, uint64_t event)
39236c6e 3795{
5ba3f43e 3796#pragma unused(event)
39236c6e
A
3797 struct socket *mp_so, *so;
3798 struct mptcb *mp_tp;
5ba3f43e 3799 boolean_t is_fastclose;
39236c6e 3800
5ba3f43e 3801 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 3802 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 3803 mp_so = mptetoso(mpte);
39236c6e
A
3804 mp_tp = mpte->mpte_mptcb;
3805 so = mpts->mpts_socket;
3806
39236c6e 3807 /* We got an invalid option or a fast close */
39236c6e
A
3808 struct tcptemp *t_template;
3809 struct inpcb *inp = sotoinpcb(so);
3810 struct tcpcb *tp = NULL;
3811
3812 tp = intotcpcb(inp);
fe8ab488 3813 so->so_error = ECONNABORTED;
39236c6e 3814
39037602
A
3815 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3816
39236c6e
A
3817 t_template = tcp_maketemplate(tp);
3818 if (t_template) {
fe8ab488 3819 struct tcp_respond_args tra;
39236c6e 3820
fe8ab488 3821 bzero(&tra, sizeof(tra));
39236c6e 3822 if (inp->inp_flags & INP_BOUND_IF)
fe8ab488 3823 tra.ifscope = inp->inp_boundifp->if_index;
39236c6e 3824 else
fe8ab488
A
3825 tra.ifscope = IFSCOPE_NONE;
3826 tra.awdl_unrestricted = 1;
39236c6e
A
3827
3828 tcp_respond(tp, t_template->tt_ipgen,
3829 &t_template->tt_t, (struct mbuf *)NULL,
fe8ab488 3830 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
39236c6e 3831 (void) m_free(dtom(t_template));
3e170ce0
A
3832 mptcplog((LOG_DEBUG, "MPTCP Events: "
3833 "%s: mp_so 0x%llx cid %d \n",
39236c6e 3834 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
 3835 mpts->mpts_connid),
3836 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3837 }
5ba3f43e 3838 mptcp_subflow_abort(mpts, ECONNABORTED);
39037602
A
3839
3840 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3e170ce0 3841 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e 3842
39037602
A
3843 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3844 mp_so->so_error = ECONNABORTED;
3845 else
3846 mp_so->so_error = ECONNRESET;
3847
3848 /*
3849 * mptcp_drop is being called after processing the events, to fully
3850 * close the MPTCP connection
3851 */
39236c6e 3852 }
39037602 3853
3e170ce0
A
3854 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3855 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
39236c6e 3856
5ba3f43e 3857 return (MPTS_EVRET_DELETE);
39236c6e
A
3858}
3859
fe8ab488 3860static ev_ret_t
5ba3f43e
A
3861mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3862 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3863{
5ba3f43e
A
3864#pragma unused(event)
3865 bool found_active = false;
3866
3867 mpts->mpts_flags |= MPTSF_READ_STALL;
39037602 3868
5ba3f43e
A
3869 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3870 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3e170ce0 3871
5ba3f43e
A
3872 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3873 TCPS_HAVERCVDFIN2(tp->t_state))
3874 continue;
3875
3876 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
3877 found_active = true;
3878 break;
fe8ab488 3879 }
fe8ab488
A
3880 }
3881
5ba3f43e
A
3882 if (!found_active)
3883 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
3884
fe8ab488
A
3885 return (MPTS_EVRET_OK);
3886}
3887
3888static ev_ret_t
5ba3f43e
A
3889mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3890 uint64_t *p_mpsofilt_hint, uint64_t event)
fe8ab488 3891{
5ba3f43e
A
3892#pragma unused(event)
3893 bool found_active = false;
3e170ce0 3894
5ba3f43e 3895 mpts->mpts_flags |= MPTSF_WRITE_STALL;
fe8ab488 3896
5ba3f43e
A
3897 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3898 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3899
3900 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3901 tp->t_state > TCPS_CLOSE_WAIT)
3902 continue;
3903
3904 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
3905 found_active = true;
3906 break;
3907 }
3908 }
3909
3910 if (!found_active)
3911 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
3912
3913 return (MPTS_EVRET_OK);
fe8ab488
A
3914}
3915
39236c6e
A
3916static const char *
3917mptcp_evret2str(ev_ret_t ret)
3918{
3919 const char *c = "UNKNOWN";
3920
3921 switch (ret) {
3922 case MPTS_EVRET_DELETE:
3923 c = "MPTS_EVRET_DELETE";
3924 break;
3925 case MPTS_EVRET_CONNECT_PENDING:
3926 c = "MPTS_EVRET_CONNECT_PENDING";
3927 break;
3928 case MPTS_EVRET_DISCONNECT_FALLBACK:
3929 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3930 break;
3931 case MPTS_EVRET_OK:
3932 c = "MPTS_EVRET_OK";
3933 break;
3e170ce0 3934 default:
39236c6e
A
3935 break;
3936 }
3937 return (c);
3938}
3939
39236c6e
A
3940/*
3941 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3942 * caller must ensure that the option can be issued on subflow sockets, via
3943 * MPOF_SUBFLOW_OK flag.
3944 */
3945int
5ba3f43e 3946mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
39236c6e 3947{
5ba3f43e 3948 struct socket *mp_so, *so;
39236c6e 3949 struct sockopt sopt;
39236c6e
A
3950 int error;
3951
3952 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
5ba3f43e
A
3953 mpte_lock_assert_held(mpte);
3954
3955 mp_so = mptetoso(mpte);
3956 so = mpts->mpts_socket;
3957
3958 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
3959 mpo->mpo_level == SOL_SOCKET &&
3960 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
3961 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %u lastcell? %d boundcell? %d\n",
3962 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(),
3963 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
3964 mpts->mpts_ifscope != IFSCOPE_NONE ? IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]) : -1),
3965 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3966
3967 /*
 3968 * When we open a new subflow, mark it as cell fallback if
3969 * this subflow goes over cell.
3970 *
3971 * (except for first-party apps)
3972 */
3973
3974 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
3975 return (0);
39236c6e 3976
5ba3f43e
A
3977 if (sotoinpcb(so)->inp_last_outifp &&
3978 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
3979 return (0);
3980
3981 /*
 3982 * These conditions are OR'd, because if the app is not binding to the
3983 * interface, then it definitely is not a cell-fallback
3984 * connection.
3985 */
3986 if (mpts->mpts_ifscope == IFSCOPE_NONE ||
3987 !IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]))
3988 return (0);
3989 }
3990
3991 mpo->mpo_flags &= ~MPOF_INTERIM;
39236c6e
A
3992
3993 bzero(&sopt, sizeof (sopt));
3994 sopt.sopt_dir = SOPT_SET;
3995 sopt.sopt_level = mpo->mpo_level;
3996 sopt.sopt_name = mpo->mpo_name;
3997 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3998 sopt.sopt_valsize = sizeof (int);
3999 sopt.sopt_p = kernproc;
4000
5ba3f43e 4001 error = sosetoptlock(so, &sopt, 0);
39236c6e 4002 if (error == 0) {
5ba3f43e 4003 mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
39236c6e
A
4004 "val %d set successful\n", __func__,
4005 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
4006 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4007 mpo->mpo_intval),
4008 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e 4009 } else {
5ba3f43e 4010 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s "
39236c6e
A
4011 "val %d set error %d\n", __func__,
4012 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
4013 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4014 mpo->mpo_intval, error),
4015 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
4016 }
4017 return (error);
4018}
4019
4020/*
4021 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4022 * caller must ensure that the option can be issued on subflow sockets, via
4023 * MPOF_SUBFLOW_OK flag.
4024 */
4025int
4026mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4027 struct mptopt *mpo)
4028{
4029 struct socket *mp_so;
4030 struct sockopt sopt;
39236c6e
A
4031 int error;
4032
4033 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
5ba3f43e
A
4034 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4035 mp_so = mptetoso(mpte);
39236c6e
A
4036
4037 bzero(&sopt, sizeof (sopt));
4038 sopt.sopt_dir = SOPT_GET;
4039 sopt.sopt_level = mpo->mpo_level;
4040 sopt.sopt_name = mpo->mpo_name;
4041 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4042 sopt.sopt_valsize = sizeof (int);
4043 sopt.sopt_p = kernproc;
4044
4045 error = sogetoptlock(so, &sopt, 0); /* already locked */
4046 if (error == 0) {
3e170ce0
A
4047 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4048 "%s: mp_so 0x%llx sopt %s "
39236c6e
A
4049 "val %d get successful\n", __func__,
4050 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e
A
4051 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4052 mpo->mpo_intval),
3e170ce0 4053 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 4054 } else {
3e170ce0
A
4055 mptcplog((LOG_ERR, "MPTCP Socket: "
4056 "%s: mp_so 0x%llx sopt %s get error %d\n",
39236c6e 4057 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5ba3f43e 4058 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
3e170ce0 4059 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
39236c6e
A
4060 }
4061 return (error);
4062}
4063
4064
4065/*
4066 * MPTCP garbage collector.
4067 *
 4068 * This routine is called by the MP domain's on-demand periodic callout,
 4069 * which is triggered when an MPTCP socket is closed. The callout will
4070 * repeat as long as this routine returns a non-zero value.
4071 */
4072static uint32_t
4073mptcp_gc(struct mppcbinfo *mppi)
4074{
4075 struct mppcb *mpp, *tmpp;
4076 uint32_t active = 0;
4077
5ba3f43e 4078 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
39236c6e 4079
39236c6e
A
4080 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4081 struct socket *mp_so;
4082 struct mptses *mpte;
4083 struct mptcb *mp_tp;
4084
4085 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4086 mp_so = mpp->mpp_socket;
4087 VERIFY(mp_so != NULL);
4088 mpte = mptompte(mpp);
4089 VERIFY(mpte != NULL);
4090 mp_tp = mpte->mpte_mptcb;
4091 VERIFY(mp_tp != NULL);
4092
3e170ce0
A
4093 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4094 "%s: mp_so 0x%llx found "
39236c6e
A
4095 "(u=%d,r=%d,s=%d)\n", __func__,
4096 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3e170ce0
A
4097 mp_so->so_retaincnt, mpp->mpp_state),
4098 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 4099
5ba3f43e 4100 if (!mpte_try_lock(mpte)) {
3e170ce0 4101 mptcplog((LOG_DEBUG, "MPTCP Socket: "
5ba3f43e 4102 "%s: mp_so 0x%llx skipped lock "
39236c6e
A
4103 "(u=%d,r=%d)\n", __func__,
4104 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
4105 mp_so->so_usecount, mp_so->so_retaincnt),
4106 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4107 active++;
4108 continue;
4109 }
4110
4111 /* check again under the lock */
5ba3f43e 4112 if (mp_so->so_usecount > 0) {
39236c6e
A
4113 boolean_t wakeup = FALSE;
4114 struct mptsub *mpts, *tmpts;
4115
3e170ce0 4116 mptcplog((LOG_DEBUG, "MPTCP Socket: "
5ba3f43e 4117 "%s: mp_so 0x%llx skipped usecount "
39236c6e
A
4118 "[u=%d,r=%d] %d %d\n", __func__,
4119 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4120 mp_so->so_usecount, mp_so->so_retaincnt,
4121 mp_tp->mpt_gc_ticks,
3e170ce0
A
4122 mp_tp->mpt_state),
4123 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4124
39236c6e
A
4125 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4126 if (mp_tp->mpt_gc_ticks > 0)
4127 mp_tp->mpt_gc_ticks--;
4128 if (mp_tp->mpt_gc_ticks == 0) {
4129 wakeup = TRUE;
39236c6e
A
4130 }
4131 }
39236c6e
A
4132 if (wakeup) {
4133 TAILQ_FOREACH_SAFE(mpts,
4134 &mpte->mpte_subflows, mpts_entry, tmpts) {
5ba3f43e 4135 mptcp_subflow_eupcall1(mpts->mpts_socket,
39236c6e 4136 mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
4137 }
4138 }
5ba3f43e 4139 mpte_unlock(mpte);
39236c6e
A
4140 active++;
4141 continue;
4142 }
4143
4144 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
5ba3f43e
A
4145 panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
4146 "[u=%d,r=%d,s=%d]\n", __func__,
4147 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4148 mp_so->so_usecount, mp_so->so_retaincnt,
4149 mpp->mpp_state);
39236c6e
A
4150 }
4151
5ba3f43e
A
4152 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
4153 mptcp_close(mpte, mp_tp);
3e170ce0 4154
5ba3f43e 4155 mptcp_session_destroy(mpte);
39236c6e 4156
3e170ce0
A
4157 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4158 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
39236c6e 4159 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3e170ce0
A
4160 mp_so->so_usecount, mp_so->so_retaincnt),
4161 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4162
39037602 4163 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
39236c6e
A
4164 struct sockbuf *, &mp_so->so_rcv,
4165 struct sockbuf *, &mp_so->so_snd,
4166 struct mppcb *, mpp);
4167
4168 mp_pcbdispose(mpp);
39037602 4169 sodealloc(mp_so);
39236c6e
A
4170 }
4171
4172 return (active);
4173}
4174
4175/*
 4176 * Drop an MPTCP connection, reporting the specified error.
4177 */
4178struct mptses *
4179mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4180{
4181 struct socket *mp_so;
4182
5ba3f43e 4183 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 4184 VERIFY(mpte->mpte_mptcb == mp_tp);
5ba3f43e 4185 mp_so = mptetoso(mpte);
39236c6e 4186
39037602 4187 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
39236c6e
A
4188 uint32_t, 0 /* event */);
4189
4190 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
4191 errno = mp_tp->mpt_softerror;
4192 mp_so->so_error = errno;
4193
4194 return (mptcp_close(mpte, mp_tp));
4195}
4196
4197/*
 4198 * Close an MPTCP control block.
4199 */
4200struct mptses *
4201mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4202{
3e170ce0
A
4203 struct socket *mp_so = NULL;
4204 struct mptsub *mpts = NULL, *tmpts = NULL;
39236c6e 4205
5ba3f43e 4206 mpte_lock_assert_held(mpte); /* same as MP socket lock */
39236c6e 4207 VERIFY(mpte->mpte_mptcb == mp_tp);
5ba3f43e 4208 mp_so = mptetoso(mpte);
39236c6e 4209
5ba3f43e 4210 mp_tp->mpt_state = MPTCPS_TERMINATE;
39236c6e 4211
5ba3f43e
A
4212 mptcp_freeq(mp_tp);
4213
4214 soisdisconnected(mp_so);
39236c6e
A
4215
4216 /* Clean up all subflows */
4217 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
5ba3f43e 4218 mptcp_subflow_disconnect(mpte, mpts);
39236c6e 4219 }
39236c6e
A
4220
4221 return (NULL);
4222}
4223
4224void
4225mptcp_notify_close(struct socket *so)
4226{
4227 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4228}
4229
4230/*
5ba3f43e 4231 * MPTCP workloop.
39236c6e
A
4232 */
4233void
5ba3f43e 4234mptcp_subflow_workloop(struct mptses *mpte)
39236c6e
A
4235{
4236 struct socket *mp_so;
4237 struct mptsub *mpts, *tmpts;
4238 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
5ba3f43e 4239 uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
39236c6e 4240
5ba3f43e 4241 mpte_lock_assert_held(mpte);
39236c6e 4242 VERIFY(mpte->mpte_mppcb != NULL);
5ba3f43e 4243 mp_so = mptetoso(mpte);
39236c6e
A
4244 VERIFY(mp_so != NULL);
4245
4246 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4247 ev_ret_t ret;
4248
5ba3f43e
A
4249 if (mpts->mpts_socket->so_usecount == 0) {
4250 /* Will be removed soon by tcp_garbage_collect */
4251 continue;
4252 }
3e170ce0 4253
5ba3f43e
A
4254 mptcp_subflow_addref(mpts);
4255 mpts->mpts_socket->so_usecount++;
3e170ce0
A
4256
4257 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
39236c6e 4258
39236c6e
A
4259 /*
4260 * If MPTCP socket is closed, disconnect all subflows.
4261 * This will generate a disconnect event which will
4262 * be handled during the next iteration, causing a
4263 * non-zero error to be returned above.
4264 */
4265 if (mp_so->so_flags & SOF_PCBCLEARING)
5ba3f43e 4266 mptcp_subflow_disconnect(mpte, mpts);
39236c6e
A
4267
4268 switch (ret) {
39236c6e
A
4269 case MPTS_EVRET_OK:
4270 /* nothing to do */
4271 break;
4272 case MPTS_EVRET_DELETE:
5ba3f43e 4273 mptcp_subflow_soclose(mpts);
39236c6e
A
4274 break;
4275 case MPTS_EVRET_CONNECT_PENDING:
4276 connect_pending = TRUE;
4277 break;
4278 case MPTS_EVRET_DISCONNECT_FALLBACK:
4279 disconnect_fallback = TRUE;
4280 break;
3e170ce0
A
4281 default:
4282 mptcplog((LOG_DEBUG,
4283 "MPTCP Socket: %s: mptcp_subflow_events "
4284 "returned invalid value: %d\n", __func__,
4285 ret),
4286 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4287 break;
39236c6e 4288 }
5ba3f43e
A
4289 mptcp_subflow_remref(mpts); /* ours */
4290
4291 VERIFY(mpts->mpts_socket->so_usecount != 0);
4292 mpts->mpts_socket->so_usecount--;
39236c6e
A
4293 }
4294
5ba3f43e 4295 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
5ba3f43e
A
4296 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4297
3e170ce0 4298 soevent(mp_so, mpsofilt_hint_mask);
39236c6e
A
4299 }
4300
5ba3f43e 4301 if (!connect_pending && !disconnect_fallback)
39236c6e 4302 return;
39236c6e
A
4303
4304 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
39236c6e
A
4305 if (disconnect_fallback) {
4306 struct socket *so = NULL;
4307 struct inpcb *inp = NULL;
4308 struct tcpcb *tp = NULL;
4309
5ba3f43e 4310 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
39236c6e 4311 continue;
39236c6e
A
4312
4313 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4314
4315 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
5ba3f43e 4316 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
39236c6e 4317 continue;
490019cf 4318
39236c6e
A
4319 so = mpts->mpts_socket;
4320
4321 /*
4322 * The MPTCP connection has degraded to a fallback
4323 * mode, so there is no point in keeping this subflow
4324 * regardless of its MPTCP-readiness state, unless it
4325 * is the primary one which we use for fallback. This
4326 * assumes that the subflow used for fallback is the
4327 * ACTIVE one.
4328 */
4329
39236c6e
A
4330 inp = sotoinpcb(so);
4331 tp = intotcpcb(inp);
4332 tp->t_mpflags &=
4333 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4334 tp->t_mpflags |= TMPF_TCP_FALLBACK;
490019cf 4335
39236c6e 4336 if (mpts->mpts_flags & MPTSF_ACTIVE) {
39236c6e
A
4337 continue;
4338 }
4339 tp->t_mpflags |= TMPF_RESET;
5ba3f43e 4340 soevent(so, SO_FILT_HINT_MUSTRST);
39236c6e
A
4341 } else if (connect_pending) {
4342 /*
4343 * The MPTCP connection has progressed to a state
4344 * where it supports full multipath semantics; allow
4345 * additional joins to be attempted for all subflows
4346 * that are in the PENDING state.
4347 */
4348 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
5ba3f43e 4349 int error = mptcp_subflow_soconnectx(mpte, mpts);
39236c6e 4350
5ba3f43e
A
4351 if (error)
4352 mptcp_subflow_abort(mpts, error);
4353 }
39236c6e 4354 }
39236c6e
A
4355 }
4356}
4357
39236c6e
A
4358/*
4359 * Protocol pr_lock callback.
4360 */
4361int
4362mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4363{
5ba3f43e 4364 struct mppcb *mpp = mpsotomppcb(mp_so);
39236c6e
A
4365 void *lr_saved;
4366
4367 if (lr == NULL)
4368 lr_saved = __builtin_return_address(0);
4369 else
4370 lr_saved = lr;
4371
4372 if (mpp == NULL) {
4373 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4374 mp_so, lr_saved, solockhistory_nr(mp_so));
4375 /* NOTREACHED */
4376 }
5ba3f43e 4377 mpp_lock(mpp);
39236c6e
A
4378
4379 if (mp_so->so_usecount < 0) {
4380 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4381 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4382 solockhistory_nr(mp_so));
4383 /* NOTREACHED */
4384 }
4385 if (refcount != 0)
4386 mp_so->so_usecount++;
4387 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4388 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4389
4390 return (0);
4391}
4392
4393/*
4394 * Protocol pr_unlock callback.
4395 */
4396int
5ba3f43e 4397mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
39236c6e 4398{
5ba3f43e
A
4399 struct mppcb *mpp = mpsotomppcb(mp_so);
4400 void *lr_saved;
39236c6e 4401
5ba3f43e
A
4402 if (lr == NULL)
4403 lr_saved = __builtin_return_address(0);
4404 else
4405 lr_saved = lr;
39236c6e 4406
5ba3f43e
A
4407 if (mpp == NULL) {
4408 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4409 mp_so, mp_so->so_usecount, lr_saved,
4410 solockhistory_nr(mp_so));
4411 /* NOTREACHED */
4412 }
4413 mpp_lock_assert_held(mpp);
39236c6e 4414
5ba3f43e
A
4415 if (refcount != 0)
4416 mp_so->so_usecount--;
39236c6e 4417
5ba3f43e
A
4418 if (mp_so->so_usecount < 0) {
4419 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4420 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4421 /* NOTREACHED */
39236c6e 4422 }
5ba3f43e
A
4423 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4424 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4425 mpp_unlock(mpp);
4426
4427 return (0);
39236c6e
A
4428}
4429
5ba3f43e
A
4430/*
4431 * Protocol pr_getlock callback.
4432 */
4433lck_mtx_t *
4434mptcp_getlock(struct socket *mp_so, int flags)
39236c6e 4435{
5ba3f43e
A
4436 struct mppcb *mpp = mpsotomppcb(mp_so);
4437
4438 if (mpp == NULL) {
4439 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4440 solockhistory_nr(mp_so));
39236c6e
A
4441 /* NOTREACHED */
4442 }
5ba3f43e
A
4443 if (mp_so->so_usecount < 0) {
4444 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4445 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4446 /* NOTREACHED */
39236c6e 4447 }
5ba3f43e 4448 return (mpp_getlock(mpp, flags));
39236c6e
A
4449}
4450
4451/*
4452 * MPTCP Join support
4453 */
4454
4455static void
4456mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
fe8ab488 4457 uint8_t addr_id)
39236c6e
A
4458{
4459 struct tcpcb *tp = sototcpcb(so);
4460 struct mptcp_subf_auth_entry *sauth_entry;
5ba3f43e 4461 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 4462
39236c6e 4463 /*
39236c6e
A
4464 * The address ID of the first flow is implicitly 0.
4465 */
4466 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4467 tp->t_local_aid = 0;
4468 } else {
fe8ab488 4469 tp->t_local_aid = addr_id;
39236c6e
A
4470 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4471 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4472 }
4473 sauth_entry = zalloc(mpt_subauth_zone);
4474 sauth_entry->msae_laddr_id = tp->t_local_aid;
4475 sauth_entry->msae_raddr_id = 0;
4476 sauth_entry->msae_raddr_rand = 0;
4477try_again:
4478 sauth_entry->msae_laddr_rand = RandomULong();
4479 if (sauth_entry->msae_laddr_rand == 0)
4480 goto try_again;
4481 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4482}
4483
4484static void
4485mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4486{
4487 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 4488 struct tcpcb *tp = NULL;
39236c6e
A
4489 int found = 0;
4490
fe8ab488 4491 tp = sototcpcb(so);
5ba3f43e 4492 if (tp == NULL)
39236c6e
A
4493 return;
4494
39236c6e
A
4495 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4496 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4497 found = 1;
4498 break;
4499 }
4500 }
4501 if (found) {
4502 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 4503 }
fe8ab488 4504
3e170ce0
A
4505 if (found)
4506 zfree(mpt_subauth_zone, sauth_entry);
39236c6e
A
4507}
4508
4509void
4510mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4511 u_int32_t *rrand)
4512{
4513 struct mptcp_subf_auth_entry *sauth_entry;
5ba3f43e 4514 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 4515
39236c6e
A
4516 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4517 if (sauth_entry->msae_laddr_id == addr_id) {
4518 if (lrand)
4519 *lrand = sauth_entry->msae_laddr_rand;
4520 if (rrand)
4521 *rrand = sauth_entry->msae_raddr_rand;
4522 break;
4523 }
4524 }
39236c6e
A
4525}
4526
4527void
4528mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4529 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4530{
4531 struct mptcp_subf_auth_entry *sauth_entry;
5ba3f43e 4532 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 4533
39236c6e
A
4534 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4535 if (sauth_entry->msae_laddr_id == laddr_id) {
4536 if ((sauth_entry->msae_raddr_id != 0) &&
4537 (sauth_entry->msae_raddr_id != raddr_id)) {
3e170ce0 4538 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
39236c6e 4539 " address ids %d %d \n", __func__, raddr_id,
3e170ce0
A
4540 sauth_entry->msae_raddr_id),
4541 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4542 return;
4543 }
4544 sauth_entry->msae_raddr_id = raddr_id;
4545 if ((sauth_entry->msae_raddr_rand != 0) &&
4546 (sauth_entry->msae_raddr_rand != raddr_rand)) {
3e170ce0
A
4547 mptcplog((LOG_ERR, "MPTCP Socket: "
4548 "%s: dup SYN_ACK %d %d \n",
39236c6e 4549 __func__, raddr_rand,
3e170ce0
A
4550 sauth_entry->msae_raddr_rand),
4551 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
4552 return;
4553 }
4554 sauth_entry->msae_raddr_rand = raddr_rand;
39236c6e
A
4555 return;
4556 }
4557 }
39236c6e
A
4558}
4559
4560/*
4561 * SHA1 support for MPTCP
4562 */
5ba3f43e
A
4563static void
4564mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
39236c6e
A
4565{
4566 SHA1_CTX sha1ctxt;
4567 const unsigned char *sha1_base;
4568 int sha1_size;
4569
39236c6e
A
4570 sha1_base = (const unsigned char *) key;
4571 sha1_size = sizeof (mptcp_key_t);
4572 SHA1Init(&sha1ctxt);
4573 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4574 SHA1Final(sha_digest, &sha1ctxt);
39236c6e
A
4575}
4576
4577void
4578mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5ba3f43e 4579 u_int32_t rand1, u_int32_t rand2, u_char *digest)
39236c6e
A
4580{
4581 SHA1_CTX sha1ctxt;
4582 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4583 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4584 u_int32_t data[2];
4585 int i;
4586
5ba3f43e 4587 bzero(digest, SHA1_RESULTLEN);
39236c6e
A
4588
4589 /* Set up the Key for HMAC */
4590 key_ipad[0] = key1;
4591 key_ipad[1] = key2;
4592
4593 key_opad[0] = key1;
4594 key_opad[1] = key2;
4595
4596 /* Set up the message for HMAC */
4597 data[0] = rand1;
4598 data[1] = rand2;
4599
 4600 /* Key fits within one 512-bit block, so no need to hash it first */
4601
4602 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4603
4604 for (i = 0; i < 8; i++) {
4605 key_ipad[i] ^= 0x3636363636363636;
4606 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4607 }
4608
4609 /* Perform inner SHA1 */
4610 SHA1Init(&sha1ctxt);
4611 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4612 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4613 SHA1Final(digest, &sha1ctxt);
4614
4615 /* Perform outer SHA1 */
4616 SHA1Init(&sha1ctxt);
4617 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4618 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4619 SHA1Final(digest, &sha1ctxt);
4620}
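/*
 * Illustrative sketch (not part of xnu): the routine above is standard
 * HMAC-SHA1 (RFC 2104) keyed with key1 || key2 (16 bytes, zero-padded to
 * the 64-byte SHA-1 block) over rand1 || rand2. In user space the same
 * digest can presumably be obtained from CommonCrypto for identical
 * in-memory key and message bytes; the sample values below are made up.
 */
#include <CommonCrypto/CommonDigest.h>
#include <CommonCrypto/CommonHMAC.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t key[2] = { 0x0102030405060708ULL, 0x090a0b0c0d0e0f10ULL };	/* key1, key2 */
	uint32_t data[2] = { 0x11111111, 0x22222222 };				/* rand1, rand2 */
	unsigned char mac[CC_SHA1_DIGEST_LENGTH];
	size_t i;

	CCHmac(kCCHmacAlgSHA1, key, sizeof(key), data, sizeof(data), mac);

	for (i = 0; i < sizeof(mac); i++)
		printf("%02x", mac[i]);
	printf("\n");
	return (0);
}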
4621
4622/*
4623 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4624 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4625 */
4626void
5ba3f43e 4627mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
39236c6e
A
4628{
4629 uint32_t lrand, rrand;
39236c6e 4630
5ba3f43e 4631 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e
A
4632
4633 lrand = rrand = 0;
4634 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5ba3f43e
A
4635 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
4636 digest);
39236c6e
A
4637}
4638
4639/*
4640 * Authentication data generation
4641 */
5ba3f43e 4642static void
39236c6e
A
4643mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4644 int token_len)
4645{
4646 VERIFY(token_len == sizeof (u_int32_t));
4647 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4648
4649 /* Most significant 32 bits of the SHA1 hash */
4650 bcopy(sha_digest, token, sizeof (u_int32_t));
490019cf 4651 return;
39236c6e
A
4652}
4653
5ba3f43e 4654static void
39236c6e
A
4655mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4656 int idsn_len)
4657{
4658 VERIFY(idsn_len == sizeof (u_int64_t));
4659 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4660
4661 /*
4662 * Least significant 64 bits of the SHA1 hash
4663 */
4664
4665 idsn[7] = sha_digest[12];
4666 idsn[6] = sha_digest[13];
4667 idsn[5] = sha_digest[14];
4668 idsn[4] = sha_digest[15];
4669 idsn[3] = sha_digest[16];
4670 idsn[2] = sha_digest[17];
4671 idsn[1] = sha_digest[18];
4672 idsn[0] = sha_digest[19];
490019cf 4673 return;
39236c6e
A
4674}
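/*
 * Illustrative sketch (not part of xnu): given a 20-byte SHA-1 digest of the
 * key, the token is the most-significant 32 bits (a raw copy of the first
 * four bytes, as in mptcp_generate_token() above) and the IDSN is the
 * least-significant 64 bits (bytes 12..19, reversed into the integer, as in
 * mptcp_generate_idsn()). The placeholder digest below is made up.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	uint8_t digest[20];
	uint32_t token;
	uint64_t idsn = 0;
	int i;

	for (i = 0; i < 20; i++)		/* placeholder digest bytes */
		digest[i] = (uint8_t)i;

	memcpy(&token, digest, sizeof(token));	/* most-significant 32 bits */

	for (i = 0; i < 8; i++)			/* least-significant 64 bits */
		((uint8_t *)&idsn)[7 - i] = digest[12 + i];

	printf("token=0x%08x idsn=0x%016llx\n", token, (unsigned long long)idsn);
	return (0);
}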
4675
490019cf
A
4676static void
4677mptcp_conn_properties(struct mptcb *mp_tp)
4678{
4679 /* There is only Version 0 at this time */
4680 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4681
4682 /* Set DSS checksum flag */
4683 if (mptcp_dss_csum)
4684 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4685
4686 /* Set up receive window */
4687 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4688
4689 /* Set up gc ticks */
4690 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4691}
4692
4693static void
5ba3f43e 4694mptcp_init_local_parms(struct mptses *mpte)
39236c6e 4695{
5ba3f43e
A
4696 struct mptcb *mp_tp = mpte->mpte_mptcb;
4697 char key_digest[SHA1_RESULTLEN];
490019cf 4698
5ba3f43e
A
4699 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
4700 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
4701
4702 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
490019cf 4703 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
5ba3f43e 4704 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
490019cf
A
4705 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4706
 4707 /* The subflow SYN is also the first MPTCP byte */
4708 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4709 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4710
4711 mptcp_conn_properties(mp_tp);
4712}
4713
4714int
4715mptcp_init_remote_parms(struct mptcb *mp_tp)
4716{
5ba3f43e
A
4717 char remote_digest[SHA1_RESULTLEN];
4718 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e
A
4719
4720 /* Only Version 0 is supported for auth purposes */
3e170ce0 4721 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
39236c6e
A
4722 return (-1);
4723
4724 /* Setup local and remote tokens and Initial DSNs */
5ba3f43e 4725 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
39236c6e 4726 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
490019cf 4727 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
39236c6e
A
4728 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4729 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
5ba3f43e 4730 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
39236c6e 4731
490019cf 4732 return (0);
39236c6e
A
4733}
4734
5ba3f43e 4735static void
39236c6e
A
4736mptcp_send_dfin(struct socket *so)
4737{
4738 struct tcpcb *tp = NULL;
4739 struct inpcb *inp = NULL;
4740
4741 inp = sotoinpcb(so);
4742 if (!inp)
4743 return;
4744
4745 tp = intotcpcb(inp);
4746 if (!tp)
4747 return;
4748
4749 if (!(tp->t_mpflags & TMPF_RESET))
4750 tp->t_mpflags |= TMPF_SEND_DFIN;
4751}
4752
4753/*
4754 * Data Sequence Mapping routines
4755 */
4756void
4757mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4758{
4759 struct mptcb *mp_tp;
4760
4761 if (m == NULL)
4762 return;
4763
3e170ce0 4764 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5ba3f43e
A
4765 mpte_lock_assert_held(mp_tp->mpt_mpte);
4766
39236c6e
A
4767 while (m) {
4768 VERIFY(m->m_flags & M_PKTHDR);
4769 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4770 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4771 m->m_pkthdr.mp_rlen = m_pktlen(m);
4772 mp_tp->mpt_sndmax += m_pktlen(m);
4773 m = m->m_next;
4774 }
5ba3f43e
A
4775}
4776
4777void
4778mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
4779{
4780 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
4781 uint64_t data_ack;
4782 uint64_t dsn;
4783
4784 if (!m || len == 0)
4785 return;
4786
4787 while (m && len > 0) {
4788 VERIFY(m->m_flags & M_PKTHDR);
4789 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4790
4791 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4792 dsn = m->m_pkthdr.mp_dsn;
4793
4794 len -= m->m_len;
4795 m = m->m_next;
4796 }
4797
4798 if (m && len == 0) {
4799 /*
4800 * If there is one more mbuf in the chain, it automatically means
4801 * that up to m->mp_dsn has been ack'ed.
4802 *
4803 * This means, we actually correct data_ack back down (compared
4804 * to what we set inside the loop - dsn + data_len). Because in
4805 * the loop we are "optimistic" and assume that the full mapping
4806 * will be acked. If that's not the case and we get out of the
4807 * loop with m != NULL, it means only up to m->mp_dsn has been
4808 * really acked.
4809 */
4810 data_ack = m->m_pkthdr.mp_dsn;
4811 }
4812
4813 if (len < 0) {
4814 /*
4815 * If len is negative, meaning we acked in the middle of an mbuf,
4816 * only up to this mbuf's data-sequence number has been acked
4817 * at the MPTCP-level.
4818 */
4819 data_ack = dsn;
4820 }
4821
4822 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
4823 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4824 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
39236c6e
A
4825}
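/*
 * Illustrative sketch (not part of xnu): the DATA_ACK inference above,
 * replayed over a plain array instead of an mbuf chain. With mappings
 * [dsn 1000, len 100] and [dsn 1100, len 100], dropping 150 bytes lands in
 * the middle of the second mapping, so only dsn 1100 can be inferred as
 * acked at the MPTCP level. All names here are local to the sketch.
 */
#include <stdint.h>
#include <stdio.h>

struct map { uint64_t dsn; int len; };

int
main(void)
{
	struct map chain[] = { { 1000, 100 }, { 1100, 100 } };
	int i = 0, n = 2, len = 150;
	uint64_t data_ack = 0, dsn = 0;

	while (i < n && len > 0) {
		data_ack = chain[i].dsn + chain[i].len;	/* optimistic: whole mapping acked */
		dsn = chain[i].dsn;
		len -= chain[i].len;
		i++;
	}
	if (i < n && len == 0)		/* stopped exactly on a mapping boundary */
		data_ack = chain[i].dsn;
	if (len < 0)			/* stopped mid-mapping: only up to its dsn */
		data_ack = dsn;

	printf("inferred DATA_ACK up to %llu\n", (unsigned long long)data_ack);	/* 1100 */
	return (0);
}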
4826
4827void
490019cf 4828mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
39236c6e 4829{
490019cf
A
4830 int rewinding = 0;
4831
5ba3f43e
A
4832 /* TFO makes things complicated. */
4833 if (so->so_flags1 & SOF1_TFO_REWIND) {
4834 rewinding = 1;
4835 so->so_flags1 &= ~SOF1_TFO_REWIND;
490019cf 4836 }
39236c6e 4837
5ba3f43e
A
4838 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
4839 u_int32_t sub_len;
39236c6e 4840 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e 4841 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e 4842
5ba3f43e 4843 sub_len = m->m_pkthdr.mp_rlen;
39236c6e 4844
5ba3f43e
A
4845 if (sub_len < len) {
4846 m->m_pkthdr.mp_dsn += sub_len;
4847 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4848 m->m_pkthdr.mp_rseq += sub_len;
39236c6e 4849 }
5ba3f43e
A
4850 m->m_pkthdr.mp_rlen = 0;
4851 len -= sub_len;
39236c6e 4852 } else {
5ba3f43e
A
4853 /* sub_len >= len */
4854 if (rewinding == 0)
4855 m->m_pkthdr.mp_dsn += len;
4856 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4857 if (rewinding == 0)
4858 m->m_pkthdr.mp_rseq += len;
4859 }
4860 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
4861 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
4862 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
4863 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4864 m->m_pkthdr.mp_rlen -= len;
4865 break;
39236c6e
A
4866 }
4867 m = m->m_next;
4868 }
39037602
A
4869
4870 if (so->so_flags & SOF_MP_SUBFLOW &&
4871 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
4872 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
4873 /*
4874 * Received an ack without receiving a DATA_ACK.
4875 * Need to fallback to regular TCP (or destroy this subflow).
4876 */
5ba3f43e 4877 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
39037602
A
4878 mptcp_notify_mpfail(so);
4879 }
39236c6e
A
4880}
4881
4882/* Obtain the DSN mapping stored in the mbuf */
4883void
5ba3f43e
A
4884mptcp_output_getm_dsnmap32(struct socket *so, int off,
4885 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
39236c6e
A
4886{
4887 u_int64_t dsn64;
4888
5ba3f43e 4889 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
39236c6e 4890 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
39236c6e
A
4891}
4892
4893void
5ba3f43e
A
4894mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
4895 uint32_t *relseq, uint16_t *data_len,
4896 uint16_t *dss_csum)
39236c6e
A
4897{
4898 struct mbuf *m = so->so_snd.sb_mb;
5ba3f43e 4899 int off_orig = off;
39236c6e 4900
5ba3f43e 4901 VERIFY(off >= 0);
39236c6e 4902
39236c6e
A
4903 /*
4904 * In the subflow socket, the DSN sequencing can be discontiguous,
4905 * but the subflow sequence mapping is contiguous. Use the subflow
4906 * sequence property to find the right mbuf and corresponding dsn
4907 * mapping.
4908 */
4909
4910 while (m) {
39236c6e 4911 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e 4912 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e 4913
5ba3f43e
A
4914 if (off >= m->m_len) {
4915 off -= m->m_len;
39236c6e
A
4916 m = m->m_next;
4917 } else {
4918 break;
4919 }
4920 }
4921
5ba3f43e
A
4922 VERIFY(m);
4923 VERIFY(off >= 0);
4924 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
39236c6e 4925
5ba3f43e
A
4926 *dsn = m->m_pkthdr.mp_dsn;
4927 *relseq = m->m_pkthdr.mp_rseq;
4928 *data_len = m->m_pkthdr.mp_rlen;
4929 *dss_csum = m->m_pkthdr.mp_csum;
39236c6e 4930
5ba3f43e
A
4931 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
4932 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
4933 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
4934}
4935
4936/*
3e170ce0
A
 4937 * Note that this is called only from tcp_input() via mptcp_input_preproc().
4938 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
 4939 * When it trims data, tcp_input calls m_adj(), which does not remove the
4940 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4941 * The dsn map insertion cannot be delayed after trim, because data can be in
4942 * the reassembly queue for a while and the DSN option info in tp will be
4943 * overwritten for every new packet received.
39236c6e
A
4944 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4945 * with mptcp_adj_rmap()
4946 */
4947void
5c9f4661 4948mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
39236c6e 4949{
5c9f4661 4950 VERIFY(m->m_flags & M_PKTHDR);
39236c6e
A
4951 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4952
4953 if (tp->t_mpflags & TMPF_EMBED_DSN) {
39236c6e
A
4954 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4955 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4956 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5ba3f43e 4957 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5c9f4661
A
4958 if (tp->t_rcv_map.mpt_dfin)
4959 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
4960
39236c6e 4961 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5c9f4661 4962
39236c6e
A
4963 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4964 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5c9f4661
A
4965 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
4966 if (th->th_flags & TH_FIN)
4967 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
39236c6e
A
4968 }
4969}
4970
5c9f4661
A
4971int
4972mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
4973 uint32_t rseq, uint16_t dlen)
39236c6e 4974{
5ba3f43e 4975 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
39236c6e
A
4976
4977 if (m_pktlen(m) == 0)
5c9f4661 4978 return (0);
39236c6e 4979
5ba3f43e 4980 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
5c9f4661
A
4981 if (off && (dsn != m->m_pkthdr.mp_dsn ||
4982 rseq != m->m_pkthdr.mp_rseq ||
4983 dlen != m->m_pkthdr.mp_rlen)) {
4984 mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
4985 __func__, dsn, m->m_pkthdr.mp_dsn,
4986 rseq, m->m_pkthdr.mp_rseq,
4987 dlen, m->m_pkthdr.mp_rlen),
4988 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
4989 return (-1);
4990 }
39236c6e
A
4991 m->m_pkthdr.mp_dsn += off;
4992 m->m_pkthdr.mp_rseq += off;
fe8ab488 4993 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
39236c6e 4994 } else {
5ba3f43e
A
4995 if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
 4996 /* data arrived without a DSS option mapping */
4997
4998 /* initial subflow can fallback right after SYN handshake */
4999 mptcp_notify_mpfail(so);
5000 }
39236c6e 5001 }
5ba3f43e
A
5002
5003 mpts->mpts_flags |= MPTSF_CONFIRMED;
5004
5c9f4661 5005 return (0);
39236c6e
A
5006}
5007
5008/*
5009 * Following routines help with failure detection and failover of data
5010 * transfer from one subflow to another.
5011 */
5012void
5013mptcp_act_on_txfail(struct socket *so)
5014{
5015 struct tcpcb *tp = NULL;
5016 struct inpcb *inp = sotoinpcb(so);
5017
5018 if (inp == NULL)
5019 return;
5020
5021 tp = intotcpcb(inp);
5022 if (tp == NULL)
5023 return;
5024
5ba3f43e 5025 if (so->so_flags & SOF_MP_TRYFAILOVER)
39236c6e 5026 return;
39236c6e
A
5027
5028 so->so_flags |= SOF_MP_TRYFAILOVER;
5029 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5030}
5031
5032/*
5033 * Support for MP_FAIL option
5034 */
5035int
5036mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
5037{
5038 struct mbuf *m = so->so_snd.sb_mb;
5039 u_int64_t dsn;
5040 int off = 0;
5041 u_int32_t datalen;
5042
5043 if (m == NULL)
5044 return (-1);
5045
5046 while (m != NULL) {
5047 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5048 VERIFY(m->m_flags & M_PKTHDR);
5049 dsn = m->m_pkthdr.mp_dsn;
5050 datalen = m->m_pkthdr.mp_rlen;
5051 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5052 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5053 off = dsn_fail - dsn;
5054 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5ba3f43e
A
5055 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
5056 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
5057 return (0);
5058 }
5059
5060 m = m->m_next;
5061 }
5062
5063 /*
5064 * If there was no mbuf data and a fallback to TCP occurred, there's
5065 * not much else to do.
5066 */
5067
5ba3f43e
A
5068 mptcplog((LOG_ERR, "MPTCP Sender: "
5069 "%s: %llu not found \n", __func__, dsn_fail),
5070 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5071 return (-1);
5072}
5073
5074/*
5075 * Support for sending contiguous MPTCP bytes in subflow
5076 * Also for preventing sending data with ACK in 3-way handshake
5077 */
5078int32_t
5079mptcp_adj_sendlen(struct socket *so, int32_t off)
5080{
5081 struct tcpcb *tp = sototcpcb(so);
5082 struct mptsub *mpts = tp->t_mpsub;
5083 uint64_t mdss_dsn;
5084 uint32_t mdss_subflow_seq;
5085 int mdss_subflow_off;
5086 uint16_t mdss_data_len;
5087 uint16_t dss_csum;
5088
5089 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5090 &mdss_data_len, &dss_csum);
5091
5092 /*
5093 * We need to compute how much of the mapping still remains.
5094 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5095 */
5096 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5097
5098 /*
5099 * When TFO is used, we are sending the mpts->mpts_iss although the relative
5100 * seq has been set to 1 (while it should be 0).
5101 */
5102 if (tp->t_mpflags & TMPF_TFO_REQUEST)
5103 mdss_subflow_off--;
5104
5105 if (off < mdss_subflow_off)
5106 printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
5107 off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
5108 VERIFY(off >= mdss_subflow_off);
5109
5110 mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
5111 __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
5112 mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5113 return (mdss_data_len - (off - mdss_subflow_off));
5114}
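/*
 * Illustrative sketch (not part of xnu): the remaining-mapping arithmetic
 * used by mptcp_adj_sendlen() above, with made-up numbers. The mapping
 * starts at subflow-relative sequence 5 over an ISS of 1000, snd_una is
 * 1003, and we ask about send-buffer offset 10 of a 100-byte mapping.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int mdss_subflow_seq = 5, iss = 1000, snd_una = 1003;
	int off = 10, mdss_data_len = 100;

	/* offset in the send buffer where this mapping starts: 2 */
	int mdss_subflow_off = (int)((mdss_subflow_seq + iss) - snd_una);

	/* bytes of the mapping still left at offset 'off': 92 */
	printf("%d\n", mdss_data_len - (off - mdss_subflow_off));
	return (0);
}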
5115
5116static uint32_t
5117mptcp_get_maxseg(struct mptses *mpte)
5118{
5119 struct mptsub *mpts;
5120 uint32_t maxseg = 0;
5121
5122 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5123 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5124
5125 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5126 TCPS_HAVERCVDFIN2(tp->t_state))
5127 continue;
5128
5129 if (tp->t_maxseg > maxseg)
5130 maxseg = tp->t_maxseg;
5131 }
5132
5133 return (maxseg);
5134}
5135
5136static uint8_t
5137mptcp_get_rcvscale(struct mptses *mpte)
5138{
5139 struct mptsub *mpts;
5140 uint8_t rcvscale = UINT8_MAX;
5141
5142 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5143 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5144
5145 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5146 TCPS_HAVERCVDFIN2(tp->t_state))
5147 continue;
5148
5149 if (tp->rcv_scale < rcvscale)
5150 rcvscale = tp->rcv_scale;
5151 }
5152
5153 return (rcvscale);
5154}
5155
5156/* Similar to tcp_sbrcv_reserve */
5157static void
5158mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5159 u_int32_t newsize, u_int32_t idealsize)
5160{
5161 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5162
5163 /* newsize should not exceed max */
5164 newsize = min(newsize, tcp_autorcvbuf_max);
5165
5166 /* The receive window scale negotiated at the
5167 * beginning of the connection will also set a
5168 * limit on the socket buffer size
5169 */
5170 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5171
5172 /* Set new socket buffer size */
5173 if (newsize > sbrcv->sb_hiwat &&
5174 (sbreserve(sbrcv, newsize) == 1)) {
5175 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5176 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5177
5178 /* Again check the limit set by the advertised
5179 * window scale
5180 */
5181 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5182 TCP_MAXWIN << rcvscale);
5183 }
5184}
5185
5186void
5187mptcp_sbrcv_grow(struct mptcb *mp_tp)
5188{
5189 struct mptses *mpte = mp_tp->mpt_mpte;
5190 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5191 struct sockbuf *sbrcv = &mp_so->so_rcv;
5192 uint32_t hiwat_sum = 0;
5193 uint32_t ideal_sum = 0;
5194 struct mptsub *mpts;
5195
5196 /*
5197 * Do not grow the receive socket buffer if
5198 * - auto resizing is disabled, globally or on this socket
5199 * - the high water mark already reached the maximum
5200 * - the stream is in background and receive side is being
5201 * throttled
5202 * - there are segments in the reassembly queue indicating loss;
5203 * there is no need to increase the receive window during recovery,
5204 * since the peer is not going to send more data. A duplicate ack
5205 * sent during recovery should not change the receive window
5206 */
5207 if (tcp_do_autorcvbuf == 0 ||
5208 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5209 tcp_cansbgrow(sbrcv) == 0 ||
5210 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5211 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5212 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5213 /* Can not resize the socket buffer, just return */
5214 return;
5215 }
5216
5217 /*
5218 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5219 *
5220 * But, for this we first need accurate receiver-RTT estimations, which
5221 * we currently don't have.
5222 *
5223 * Let's use a dummy algorithm for now, just taking the sum of all
5224 * subflows' receive-buffers. It's too low, but that's all we can get
5225 * for now.
5226 */
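	/*
	 * Illustrative example (hypothetical numbers): with two subflows whose
	 * receive buffers are 128 kB and 256 kB, hiwat_sum below becomes 384 kB
	 * and the connection-level buffer is grown toward that value, still
	 * capped by tcp_autorcvbuf_max and TCP_MAXWIN << rcvscale inside
	 * mptcp_sbrcv_reserve().
	 */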
5227
5228 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5229 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5230 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5231 }
5232
5233 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5234}
5235
5236/*
5237 * Determine if we can grow the receive socket buffer to avoid sending
5238 * a zero window update to the peer. We allow even socket buffers that
5239 * have fixed size (set by the application) to grow if the resource
5240 * constraints are met. They will also be trimmed after the application
5241 * reads data.
5242 *
5243 * Similar to tcp_sbrcv_grow_rwin
39236c6e 5244 */
5245static void
5246mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
39236c6e 5247{
5248 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
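	/* Grow the buffer in chunks of 16 (<< 4) times the largest subflow MSS. */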
5249 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5250 u_int32_t rcvbuf = sb->sb_hiwat;
39236c6e 5251
5252 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
5253 return;
39236c6e 5254
5255 if (tcp_do_autorcvbuf == 1 &&
5256 tcp_cansbgrow(sb) &&
5257 /* Diff to tcp_sbrcv_grow_rwin */
5258 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5259 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5260 rcvbuf < tcp_autorcvbuf_max &&
5261 (sb->sb_idealsize > 0 &&
5262 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5263 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
490019cf 5264 }
5265}
5266
5ba3f43e 5267/* Similar to tcp_sbspace */
39236c6e 5268int32_t
5ba3f43e 5269mptcp_sbspace(struct mptcb *mp_tp)
39236c6e 5270{
5ba3f43e 5271 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5272 uint32_t rcvbuf;
5273 int32_t space;
5274 int32_t pending = 0;
5275
5276 mpte_lock_assert_held(mp_tp->mpt_mpte);
39236c6e 5277
5ba3f43e 5278 mptcp_sbrcv_grow_rwin(mp_tp, sb);
39236c6e 5279
5ba3f43e 5280 /* hiwat might have changed */
39236c6e 5281 rcvbuf = sb->sb_hiwat;
5282
5283 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5284 (sb->sb_mbmax - sb->sb_mbcnt)));
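	/*
	 * Illustrative example (hypothetical numbers): with a 512 kB high-water
	 * mark, 128 kB already queued and ample mbuf headroom, space is 384 kB;
	 * bytes still held by content filters are subtracted below.
	 */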
5285 if (space < 0)
5286 space = 0;
5287
5288#if CONTENT_FILTER
5289 /* Compensate for data being processed by content filters */
5290 pending = cfil_sock_data_space(sb);
5291#endif /* CONTENT_FILTER */
5292 if (pending > space)
5293 space = 0;
5294 else
5295 space -= pending;
5296
5297 return (space);
5298}
5299
5300/*
5301 * Support Fallback to Regular TCP
5302 */
5303void
5304mptcp_notify_mpready(struct socket *so)
5305{
5306 struct tcpcb *tp = NULL;
5307
5308 if (so == NULL)
5309 return;
5310
5311 tp = intotcpcb(sotoinpcb(so));
5312
5313 if (tp == NULL)
5314 return;
5315
5316 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5317 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5318 struct tcpcb *, tp);
5319
5320 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5321 return;
5322
5323 if (tp->t_mpflags & TMPF_MPTCP_READY)
5324 return;
5325
5326 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5327 tp->t_mpflags |= TMPF_MPTCP_READY;
5328
5329 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5330}
5331
5332void
5333mptcp_notify_mpfail(struct socket *so)
5334{
5335 struct tcpcb *tp = NULL;
5336
5337 if (so == NULL)
5338 return;
5339
5340 tp = intotcpcb(sotoinpcb(so));
5341
5342 if (tp == NULL)
5343 return;
5344
5345 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5346 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5347 struct tcpcb *, tp);
5348
5349 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5350 return;
5351
5352 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5353 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5354
5355 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5356}
5357
5358/*
5359 * Keepalive helper function
5360 */
5361boolean_t
5362mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5363{
5364 boolean_t ret = 1;
5365 mpte_lock_assert_held(mp_tp->mpt_mpte);
5366
5367 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5368 ret = 0;
5369 }
5370 return (ret);
5371}
5372
5373/*
5374 * MPTCP t_maxseg adjustment function
5375 */
5376int
5377mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5378{
5379 int mss_lower = 0;
5380 struct mptcb *mp_tp = tptomptp(tp);
5381
5382#define MPTCP_COMPUTE_LEN { \
5383 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5384 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5385 mss_lower += 2; \
5386 else \
5387 /* adjust to 32-bit boundary + EOL */ \
5388 mss_lower += 2; \
5389}
5390 if (mp_tp == NULL)
5391 return (0);
5392
5393 mpte_lock_assert_held(mp_tp->mpt_mpte);
5394
5395 /*
5396 * For the first subflow and subsequent subflows, adjust mss for
5397 * most common MPTCP option size, for case where tcp_mss is called
5398 * during option processing and MTU discovery.
5399 */
5400 if (!mtudisc) {
5401 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5402 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5403 MPTCP_COMPUTE_LEN;
5404 }
39236c6e 5405
5406 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5407 tp->t_mpflags & TMPF_SENT_JOIN) {
5408 MPTCP_COMPUTE_LEN;
5409 }
5410 } else {
5411 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5412 MPTCP_COMPUTE_LEN;
5413 }
5414 }
5415
5416 return (mss_lower);
5417}
5418
5419/*
5420 * Update the pid, upid, uuid of the subflow so, based on parent so
5421 */
5422void
5ba3f43e 5423mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
39236c6e 5424{
5425 if (so->last_pid != mp_so->last_pid ||
5426 so->last_upid != mp_so->last_upid) {
5427 so->last_upid = mp_so->last_upid;
5428 so->last_pid = mp_so->last_pid;
5429 uuid_copy(so->last_uuid, mp_so->last_uuid);
39236c6e 5430 }
5ba3f43e 5431 so_update_policy(so);
5432}
5433
5434static void
5435fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5436{
5437 struct inpcb *inp;
5438
5439 tcp_getconninfo(so, &flow->flow_ci);
5440 inp = sotoinpcb(so);
5441#if INET6
5442 if ((inp->inp_vflag & INP_IPV6) != 0) {
5443 flow->flow_src.ss_family = AF_INET6;
5444 flow->flow_dst.ss_family = AF_INET6;
5445 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5446 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5447 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5448 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5449 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5450 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
39037602 5451 } else
39236c6e 5452#endif
3e170ce0 5453 if ((inp->inp_vflag & INP_IPV4) != 0) {
5454 flow->flow_src.ss_family = AF_INET;
5455 flow->flow_dst.ss_family = AF_INET;
5456 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5457 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5458 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5459 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5460 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5461 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5462 }
5463 flow->flow_len = sizeof(*flow);
5464 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5465 flow->flow_flags = mpts->mpts_flags;
5466 flow->flow_cid = mpts->mpts_connid;
3e170ce0 5467 flow->flow_relseq = mpts->mpts_rel_seq;
5ba3f43e 5468 flow->flow_soerror = mpts->mpts_socket->so_error;
3e170ce0 5469 flow->flow_probecnt = mpts->mpts_probecnt;
5470}
5471
5472static int
5473mptcp_pcblist SYSCTL_HANDLER_ARGS
5474{
5475#pragma unused(oidp, arg1, arg2)
5476 int error = 0, f;
5ba3f43e 5477 size_t len;
5478 struct mppcb *mpp;
5479 struct mptses *mpte;
5480 struct mptcb *mp_tp;
5481 struct mptsub *mpts;
5482 struct socket *so;
5483 conninfo_mptcp_t mptcpci;
fe8ab488 5484 mptcp_flow_t *flows = NULL;
5485
5486 if (req->newptr != USER_ADDR_NULL)
5487 return (EPERM);
5488
5489 lck_mtx_lock(&mtcbinfo.mppi_lock);
39236c6e 5490 if (req->oldptr == USER_ADDR_NULL) {
5ba3f43e 5491 size_t n = mtcbinfo.mppi_count;
39236c6e 5492 lck_mtx_unlock(&mtcbinfo.mppi_lock);
39037602 5493 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
5494 4 * (n + n/8) * sizeof(mptcp_flow_t);
5495 return (0);
5496 }
5497 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 5498 flows = NULL;
5ba3f43e 5499 mpp_lock(mpp);
5500 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5501 mpte = mptompte(mpp);
5502 VERIFY(mpte != NULL);
5ba3f43e 5503 mpte_lock_assert_held(mpte);
5504 mp_tp = mpte->mpte_mptcb;
5505 VERIFY(mp_tp != NULL);
5506
5507 bzero(&mptcpci, sizeof(mptcpci));
39236c6e 5508 mptcpci.mptcpci_state = mp_tp->mpt_state;
5509 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5510 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5511 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5512 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5513 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5514 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5515 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5516 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5517 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5518 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5ba3f43e 5519 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
5520 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5521 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
3e170ce0 5522
39236c6e 5523 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
5524 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5525 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5526 mptcpci.mptcpci_flow_offset =
5527 offsetof(conninfo_mptcp_t, mptcpci_flows);
5528
5529 len = sizeof(*flows) * mpte->mpte_numflows;
5530 if (mpte->mpte_numflows != 0) {
5531 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5532 if (flows == NULL) {
5ba3f43e 5533 mpp_unlock(mpp);
5534 break;
5535 }
5536 mptcpci.mptcpci_len = sizeof(mptcpci) +
5537 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5538 error = SYSCTL_OUT(req, &mptcpci,
5539 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5540 } else {
5541 mptcpci.mptcpci_len = sizeof(mptcpci);
3e170ce0 5542 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
39037602 5543 }
39236c6e 5544 if (error) {
5ba3f43e 5545 mpp_unlock(mpp);
5546 FREE(flows, M_TEMP);
5547 break;
5548 }
5549 f = 0;
5550 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
39236c6e 5551 so = mpts->mpts_socket;
39236c6e 5552 fill_mptcp_subflow(so, &flows[f], mpts);
5553 f++;
5554 }
5ba3f43e 5555 mpp_unlock(mpp);
5556 if (flows) {
5557 error = SYSCTL_OUT(req, flows, len);
5558 FREE(flows, M_TEMP);
5559 if (error)
5560 break;
5561 }
5562 }
5563 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5564
5565 return (error);
5566}
5567
5568SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
39037602 5569 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
39236c6e 5570 "List of active MPTCP connections");
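/*
 * Minimal user-space sketch (illustrative only, not part of this file) of how
 * a tool might size and fetch the net.inet.mptcp.pcblist sysctl exported
 * above.  Parsing the result into conninfo_mptcp_t / mptcp_flow_t records
 * needs the private netinet/mptcp_var.h definitions, so the sketch treats the
 * output as an opaque byte blob.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t len = 0;
 *		void *buf;
 *
 *		// A NULL buffer makes the handler report an estimated size (req->oldidx).
 *		if (sysctlbyname("net.inet.mptcp.pcblist", NULL, &len, NULL, 0) == -1) {
 *			perror("sysctlbyname");
 *			return (1);
 *		}
 *
 *		buf = malloc(len);
 *		if (buf == NULL)
 *			return (1);
 *
 *		// The second call copies out one conninfo_mptcp_t plus its flows per PCB.
 *		if (sysctlbyname("net.inet.mptcp.pcblist", buf, &len, NULL, 0) == -1) {
 *			perror("sysctlbyname");
 *			free(buf);
 *			return (1);
 *		}
 *
 *		printf("pcblist returned %zu bytes\n", len);
 *		free(buf);
 *		return (0);
 *	}
 */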
fe8ab488 5571
5572/*
5573 * Set notsent lowat mark on the MPTCB
5574 */
5575int
5576mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5577{
5578 struct mptcb *mp_tp = NULL;
5579 int error = 0;
5580
5581 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5582 mp_tp = mpte->mpte_mptcb;
5583
5584 if (mp_tp)
5585 mp_tp->mpt_notsent_lowat = optval;
5586 else
5587 error = EINVAL;
5588
5ba3f43e 5589 return (error);
5590}
5591
5592u_int32_t
5593mptcp_get_notsent_lowat(struct mptses *mpte)
5594{
5595 struct mptcb *mp_tp = NULL;
5596
5597 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5598 mp_tp = mpte->mpte_mptcb;
5599
5600 if (mp_tp)
5ba3f43e 5601 return (mp_tp->mpt_notsent_lowat);
fe8ab488 5602 else
5ba3f43e 5603 return (0);
5604}
5605
39037602 5606int
5607mptcp_notsent_lowat_check(struct socket *so)
5608{
5609 struct mptses *mpte;
5610 struct mppcb *mpp;
5611 struct mptcb *mp_tp;
5612 struct mptsub *mpts;
5613
5614 int notsent = 0;
5615
5ba3f43e 5616 mpp = mpsotomppcb(so);
5617 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5618 return (0);
5619 }
5620
5621 mpte = mptompte(mpp);
5ba3f43e 5622 mpte_lock_assert_held(mpte);
5623 mp_tp = mpte->mpte_mptcb;
5624
5625 notsent = so->so_snd.sb_cc;
5626
5627 if ((notsent == 0) ||
5628 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5629 mp_tp->mpt_notsent_lowat)) {
5630 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5631 "lowat %d notsent %d actual %d \n",
5632 mp_tp->mpt_notsent_lowat, notsent,
5633 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5634 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5635 return (1);
5636 }
5637
5638 /* When Nagle's algorithm is not disabled, it is better
5639 * to wake up the client even before there is at least one
5640 * maxseg of data to write.
5641 */
5642 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5643 int retval = 0;
5644 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5645 struct socket *subf_so = mpts->mpts_socket;
fe8ab488 5646 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
39037602 5647
5648 notsent = so->so_snd.sb_cc -
5649 (tp->snd_nxt - tp->snd_una);
39037602 5650
5651 if ((tp->t_flags & TF_NODELAY) == 0 &&
5652 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5653 retval = 1;
5654 }
3e170ce0 5655 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
fe8ab488 5656 " nodelay false \n",
5657 mp_tp->mpt_notsent_lowat, notsent),
5658 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5659 return (retval);
5660 }
5661 }
5662 return (0);
5663}
5664
5665/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5666static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5667static uint32_t mptcp_kern_skt_inuse = 0;
5ba3f43e 5668static uint32_t mptcp_kern_skt_unit;
5669symptoms_advisory_t mptcp_advisory;
5670
5671static errno_t
5672mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5673 void **unitinfo)
5674{
5675#pragma unused(kctlref, sac, unitinfo)
5676
5677 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
5678 mptcplog((LOG_ERR, "%s MPTCP kernel-control socket already open!", __func__),
5679 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5680
5681 mptcp_kern_skt_unit = sac->sc_unit;
5682
5683 return (0);
5684}
5685
5686static void
5687mptcp_allow_uuid(uuid_t uuid)
5688{
5689 struct mppcb *mpp;
5690
5691 /* Iterate over all MPTCP connections */
5692
5693 lck_mtx_lock(&mtcbinfo.mppi_lock);
5694
5695 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5696 struct mptses *mpte;
5697 struct socket *mp_so;
5698
5699 mpp_lock(mpp);
5700
5701 mpte = mpp->mpp_pcbe;
5702 mp_so = mpp->mpp_socket;
5703
5704 if (mp_so->so_flags & SOF_DELEGATED &&
5705 uuid_compare(uuid, mp_so->e_uuid))
5706 goto next;
5707 else if (!(mp_so->so_flags & SOF_DELEGATED) &&
5708 uuid_compare(uuid, mp_so->last_uuid))
5709 goto next;
5710
5711 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
5712
5713 mptcp_check_subflows_and_add(mpte);
5714 mptcp_remove_subflows(mpte);
5715
5716 mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;
5717
5718next:
5719 mpp_unlock(mpp);
5720 }
5721
5722 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5723}
5724
5725static void
5726mptcp_wifi_status_changed(void)
5727{
5728 struct mppcb *mpp;
5729
5730 /* Iterate over all MPTCP connections */
5731
5732 lck_mtx_lock(&mtcbinfo.mppi_lock);
5733
5734 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5735 struct mptses *mpte;
5736 struct socket *mp_so;
5737
5738 mpp_lock(mpp);
5739
5740 mpte = mpp->mpp_pcbe;
5741 mp_so = mpp->mpp_socket;
5742
5743 /* Only handover-mode is purely driven by Symptoms' Wi-Fi status */
5744 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
5745 goto next;
5746
5747 mptcp_check_subflows_and_add(mpte);
5748 mptcp_check_subflows_and_remove(mpte);
5749
5750next:
5751 mpp_unlock(mpp);
5752 }
5753
5754 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5755}
5756
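/*
 * mptcp_ask_symptoms() asks the symptoms agent, attached through the
 * kernel-control socket registered in mptcp_control_register() below, about
 * this connection's application: the request is a struct
 * mptcp_symptoms_ask_uuid carrying the app's UUID and a coarse
 * foreground/background priority.  The agent's SYMPTOMS_ADVISORY_USEAPP reply
 * arrives in mptcp_symptoms_ctl_send() and is handed to mptcp_allow_uuid(),
 * which re-evaluates the subflows of matching connections with
 * MPTE_ACCESS_GRANTED set.
 */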
5757void
5758mptcp_ask_symptoms(struct mptses *mpte)
5759{
5760 struct mptcp_symptoms_ask_uuid ask;
5761 struct socket *mp_so;
5762 struct proc *p;
5763 int pid, prio, err;
5764
5765 if (mptcp_kern_skt_unit == 0) {
5766 mptcplog((LOG_ERR, "%s skt_unit is still 0\n", __func__),
5767 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5768 return;
5769 }
5770
5771 mp_so = mptetoso(mpte);
5772
5773 if (mp_so->so_flags & SOF_DELEGATED)
5774 pid = mp_so->e_pid;
5775 else
5776 pid = mp_so->last_pid;
5777
5778 p = proc_find(pid);
5779 if (p == PROC_NULL) {
5780 mptcplog((LOG_ERR, "%s Couldn't find proc for pid %u\n", __func__,
5781 pid), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5782 return;
5783 }
5784
5785 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
5786
5787 if (mp_so->so_flags & SOF_DELEGATED)
5788 uuid_copy(ask.uuid, mp_so->e_uuid);
5789 else
5790 uuid_copy(ask.uuid, mp_so->last_uuid);
5791
5792 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
5793
5794 if (prio == TASK_BACKGROUND_APPLICATION)
5795 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
5796 else if (prio == TASK_FOREGROUND_APPLICATION)
5797 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
3e170ce0 5798 else
5799 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
5800
5801 mptcplog((LOG_DEBUG, "%s ask symptoms about pid %u, prio %u\n", __func__,
5802 pid, ask.priority), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5803
5804 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
5805 &ask, sizeof(ask), CTL_DATA_EOR);
5806 if (err)
5807 mptcplog((LOG_ERR, "%s ctl_enqueuedata failed %d\n", __func__, err),
5808 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5809
5810 proc_rele(p);
5811}
5812
5813static errno_t
5814mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5815 void *unitinfo)
5816{
5817#pragma unused(kctlref, kcunit, unitinfo)
5818
5819 OSDecrementAtomic(&mptcp_kern_skt_inuse);
5820
5821 return (0);
5822}
5823
5824static errno_t
5825mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5826 mbuf_t m, int flags)
5827{
5ba3f43e 5828#pragma unused(kctlref, unitinfo, flags)
5829 symptoms_advisory_t *sa = NULL;
5830
5831 if (kcunit != mptcp_kern_skt_unit)
5832 mptcplog((LOG_ERR, "%s kcunit %u is different from expected one %u\n",
5833 __func__, kcunit, mptcp_kern_skt_unit),
5834 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5835
5836 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5837 mbuf_freem(m);
5838 return (EINVAL);
5839 }
5840
5841 if (mbuf_len(m) >= sizeof(*sa))
5842 sa = mbuf_data(m);
5843 else
5844 return (EINVAL);
5845
5846 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
5847 sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
5848 uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
3e170ce0 5849
5850 mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
5851 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
5852 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5853
5854 if ((sa->sa_wifi_status &
5855 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5ba3f43e 5856 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
3e170ce0 5857 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
3e170ce0 5858
5859 if (old_wifi_status != mptcp_advisory.sa_wifi_status)
5860 mptcp_wifi_status_changed();
5861 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
5862 mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
5863 mptcp_advisory.sa_wifi_status),
5864 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5865 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
5866 uuid_t uuid;
5867
5868 mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
5869 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5870
5871 uuid_copy(uuid, (unsigned char *)(sa + 1));
5872
5873 mptcp_allow_uuid(uuid);
3e170ce0 5874 }
5ba3f43e 5875
5876 return (0);
5877}
5878
5879void
5880mptcp_control_register(void)
5881{
5882 /* Set up the advisory control socket */
5883 struct kern_ctl_reg mptcp_kern_ctl;
5884
5885 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5886 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5887 sizeof(mptcp_kern_ctl.ctl_name));
5888 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5889 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5890 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5891 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5892
5893 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5894}
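/*
 * Illustrative user-space sketch (not part of this file): how a privileged
 * agent could attach to the control registered above.  MPTCP_KERN_CTL_NAME is
 * only available from private kernel headers, and because the control is
 * CTL_FLAG_PRIVILEGED the connect() below is assumed to require root or an
 * equivalent entitlement.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/sys_domain.h>
 *	#include <sys/kern_control.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int
 *	symptoms_ctl_attach(const char *ctl_name)
 *	{
 *		struct sockaddr_ctl addr;
 *		struct ctl_info info;
 *		int fd;
 *
 *		fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
 *		if (fd < 0)
 *			return (-1);
 *
 *		// Resolve the control name to its dynamically assigned id.
 *		memset(&info, 0, sizeof(info));
 *		strlcpy(info.ctl_name, ctl_name, sizeof(info.ctl_name));
 *		if (ioctl(fd, CTLIOCGINFO, &info) == -1) {
 *			close(fd);
 *			return (-1);
 *		}
 *
 *		memset(&addr, 0, sizeof(addr));
 *		addr.sc_len = sizeof(addr);
 *		addr.sc_family = AF_SYSTEM;
 *		addr.ss_sysaddr = AF_SYS_CONTROL;
 *		addr.sc_id = info.ctl_id;
 *		addr.sc_unit = 0;	// let the kernel pick the unit
 *
 *		// This is what ends up in mptcp_symptoms_ctl_connect() above.
 *		if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
 *			close(fd);
 *			return (-1);
 *		}
 *
 *		return (fd);
 *	}
 */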
5895
5896int
5897mptcp_is_wifi_unusable(void)
5898{
5899 /* a zero return value indicates that there is no info or that Wi-Fi is OK */
5900 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5901}
5902
5903/* If TFO data is successfully acked, it must be dropped from the MPTCP socket */
5904static void
5ba3f43e 5905mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
490019cf 5906{
5ba3f43e 5907 struct socket *mp_so = mptetoso(mpte);
5908 struct socket *so = mpts->mpts_socket;
5909 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5910 struct mptcb *mp_tp = mpte->mpte_mptcb;
5911
5912 /* If data was sent with SYN, rewind state */
5913 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5ba3f43e 5914 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
490019cf 5915 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
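		/*
		 * Illustrative example (hypothetical numbers): if 100 bytes sit
		 * between mpt_snduna and mpt_sndnxt (mp_droplen == 100) but the
		 * SYN carried and got acknowledged only 60 of them
		 * (tcp_droplen == 60), the remaining 40 bytes are rewound for
		 * retransmission below and only 60 bytes are dropped from the
		 * MPTCP send buffer.
		 */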
5ba3f43e 5916
5917 VERIFY(mp_droplen <= (UINT_MAX));
5918 VERIFY(mp_droplen >= tcp_droplen);
5919
5920 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5921 mpts->mpts_iss += tcp_droplen;
5922 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5923
5924 if (mp_droplen > tcp_droplen) {
5925 /* handle partial TCP ack */
5926 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5927 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
5928 mp_droplen = tcp_droplen;
5929 } else {
5930 /* all data on SYN was acked */
5931 mpts->mpts_rel_seq = 1;
5932 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5933 }
5934 mp_tp->mpt_sndmax -= tcp_droplen;
5935
5936 if (mp_droplen != 0) {
5937 VERIFY(mp_so->so_snd.sb_mb != NULL);
5938 sbdrop(&mp_so->so_snd, (int)mp_droplen);
5939 }
5940 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
5941 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5942 mpts->mpts_connid, tcp_droplen, mp_droplen),
5943 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5944 }
5945}
5946
5947int
5948mptcp_freeq(struct mptcb *mp_tp)
5949{
5950 struct tseg_qent *q;
5951 int rv = 0;
5952
5953 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
5954 LIST_REMOVE(q, tqe_q);
5955 m_freem(q->tqe_m);
5956 zfree(tcp_reass_zone, q);
5957 rv = 1;
5958 }
5959 mp_tp->mpt_reassqlen = 0;
5960 return (rv);
5961}
5962
5963static int
5964mptcp_post_event(u_int32_t event_code, int value)
5965{
5966 struct kev_mptcp_data event_data;
5967 struct kev_msg ev_msg;
5968
5969 memset(&ev_msg, 0, sizeof(ev_msg));
5970
5971 ev_msg.vendor_code = KEV_VENDOR_APPLE;
5972 ev_msg.kev_class = KEV_NETWORK_CLASS;
5973 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
5974 ev_msg.event_code = event_code;
5975
5976 event_data.value = value;
5977
5978 ev_msg.dv[0].data_ptr = &event_data;
5979 ev_msg.dv[0].data_length = sizeof(event_data);
5980
5981 return kev_post_msg(&ev_msg);
5982}
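/*
 * Illustrative user-space sketch (not part of this file): watching for the
 * KEV_MPTCP_CELLUSE events posted above on a PF_SYSTEM/SYSPROTO_EVENT socket.
 * KEV_MPTCP_SUBCLASS and KEV_MPTCP_CELLUSE live in kernel (possibly private)
 * headers, so their availability to third-party code is an assumption.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/sys_domain.h>
 *	#include <sys/kern_event.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static void
 *	watch_celluse(void)
 *	{
 *		struct kev_request req;
 *		char buf[1024];
 *		int fd;
 *
 *		fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *		if (fd < 0)
 *			return;
 *
 *		// Only listen to KEV_NETWORK_CLASS / KEV_MPTCP_SUBCLASS events.
 *		memset(&req, 0, sizeof(req));
 *		req.vendor_code = KEV_VENDOR_APPLE;
 *		req.kev_class = KEV_NETWORK_CLASS;
 *		req.kev_subclass = KEV_MPTCP_SUBCLASS;
 *		if (ioctl(fd, SIOCSKEVFILT, &req) == -1) {
 *			close(fd);
 *			return;
 *		}
 *
 *		while (recv(fd, buf, sizeof(buf), 0) > 0) {
 *			struct kern_event_msg *msg = (struct kern_event_msg *)buf;
 *
 *			// event_data[0] carries the 0/1 value passed to mptcp_post_event().
 *			printf("event %u value %u\n", msg->event_code, msg->event_data[0]);
 *		}
 *		close(fd);
 *	}
 */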
5983
5984void
5985mptcp_set_cellicon(struct mptses *mpte)
5986{
5987 int error;
5988
5989 /* First-party apps (Siri) don't flip the cellicon */
5990 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
5991 return;
5992
5993 /* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
5994 mptcp_last_cellicon_set = tcp_now;
5995
5996 /* If cellicon is already set, get out of here! */
5997 if (OSTestAndSet(7, &mptcp_cellicon_is_set))
5998 return;
5999
6000 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6001
6002 if (error)
6003 mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
6004 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
6005 else
6006 mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
6007 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
6008}
6009
6010void
6011mptcp_unset_cellicon(void)
6012{
6013 int error;
6014
6015 /* If cellicon is already unset, get out of here! */
6016 if (OSTestAndClear(7, &mptcp_cellicon_is_set))
6017 return;
6018
6019 /*
6020 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
6021 * explicitly set the cellicon (see mptcp_set_cellicon()), then we unset
6022 * it again.
6023 */
6024 if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
6025 tcp_now)) {
6026 OSTestAndSet(7, &mptcp_cellicon_is_set);
6027 return;
490019cf 6028 }
6029
6030 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6031
6032 if (error)
6033 mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
6034 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
6035 else
6036 mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
6037 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
6038}
6039
6040void
6041mptcp_reset_rexmit_state(struct tcpcb *tp)
6042{
6043 struct mptsub *mpts;
6044 struct inpcb *inp;
6045 struct socket *so;
6046
6047 inp = tp->t_inpcb;
6048 if (inp == NULL)
6049 return;
6050
6051 so = inp->inp_socket;
6052 if (so == NULL)
6053 return;
6054
6055 if (!(so->so_flags & SOF_MP_SUBFLOW))
6056 return;
6057
6058 mpts = tp->t_mpsub;
6059
6060 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6061 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6062}
6063
6064void
6065mptcp_reset_keepalive(struct tcpcb *tp)
6066{
6067 struct mptsub *mpts = tp->t_mpsub;
6068
6069 mpts->mpts_flags &= ~MPTSF_READ_STALL;
490019cf 6070}
5ba3f43e 6071