/*
 * Copyright (c) 2012-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/locks.h>
#include <kern/policy_internal.h>
#include <kern/zalloc.h>

#include <mach/sdt.h>

#include <sys/domain.h>
#include <sys/kdebug.h>
#include <sys/kern_control.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <net/content_filter.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_timer.h>
#include <libkern/crypto/sha1.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#include <dev/random/randomdev.h>

/*
 * Notes on MPTCP implementation.
 *
 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
 * communication domain. The structure mtcbinfo describes the MPTCP instance
 * of a Multipath protocol in that domain. It is used to keep track of all
 * MPTCP PCB instances in the system, and is protected by the global lock
 * mppi_lock.
 *
 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
 * allocated from the same memory block, and each structure has a pointer
 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
 * PCB (mppcb) as well as the MPTCP Session (mptses).
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
 *
 * A functioning MPTCP Session consists of one or more subflow sockets. Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure. Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow. This gets decremented prior to the subflow's destruction.
 *
 * To handle events (read, write, control) from the subflows, we do direct
 * upcalls into the specific function.
 *
 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
 * lock. Incoming data on a subflow also ends up taking this single lock. To
 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
 * of the MPTCP-socket.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector. This process will take place once all
 * of the subflows have been destroyed.
 */
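
/*
 * Illustrative sketch (editorial addition, not compiled): user space reaches
 * this code roughly as follows; "res" is a hypothetical getaddrinfo(3)
 * result and error handling is omitted.
 *
 *    int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *    sa_endpoints_t eps = {
 *        .sae_dstaddr = res->ai_addr,
 *        .sae_dstaddrlen = res->ai_addrlen,
 *    };
 *    // connectx(2) establishes the initial subflow; additional subflows
 *    // are then created in-kernel by mptcp_check_subflows_and_add().
 *    connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, NULL);
 */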

static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);

static uint32_t mptcp_gc(struct mppcbinfo *);
static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
    struct uio *, struct mbuf **, struct mbuf **, int *);
static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
    struct uio *, struct mbuf *, struct mbuf *, int);
static void mptcp_subflow_wupcall(struct socket *, void *, int);
static void mptcp_subflow_eupcall1(struct socket *so, void *arg, long events);
static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);

static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);
static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
static int mptcp_freeq(struct mptcb *mp_tp);

/*
 * Possible return values for subflow event handlers. Note that success
 * values must be greater than or equal to MPTS_EVRET_OK. Values less than
 * that indicate errors or actions which require immediate attention; they
 * will prevent the rest of the handlers from processing their respective
 * events until the next round of event processing.
 */
typedef enum {
    MPTS_EVRET_DELETE = 1,                  /* delete this subflow */
    MPTS_EVRET_OK = 2,                      /* OK */
    MPTS_EVRET_CONNECT_PENDING = 3,         /* resume pended connects */
    MPTS_EVRET_DISCONNECT_FALLBACK = 4,     /* abort all but preferred */
} ev_ret_t;

static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, long *, long);
static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, long *, long);

static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_init_local_parms(struct mptses *);

static ZONE_DECLARE(mptsub_zone, "mptsub", sizeof(struct mptsub), ZC_ZFREE_CLEARMEM);
static ZONE_DECLARE(mptopt_zone, "mptopt", sizeof(struct mptopt), ZC_ZFREE_CLEARMEM);
static ZONE_DECLARE(mpt_subauth_zone, "mptauth",
    sizeof(struct mptcp_subf_auth_entry), ZC_NONE);

struct mppcbinfo mtcbinfo;

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_dbg_area = 31;           /* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_area, 0, "MPTCP debug area");

uint32_t mptcp_dbg_level = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_level, 0, "MPTCP debug level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");


static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;

static uint8_t mptcp_create_subflows_scheduled;

typedef struct mptcp_subflow_event_entry {
    long sofilt_hint_mask;
    ev_ret_t (*sofilt_hint_ev_hdlr)(
        struct mptses *mpte,
        struct mptsub *mpts,
        long *p_mpsofilt_hint,
        long event);
} mptsub_ev_entry_t;

/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
static symptoms_advisory_t mptcp_advisory;

uint32_t mptcp_cellicon_refcount = 0;

/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
    {
        .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
        .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
        .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
        .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
        .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
        .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
        .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
        .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
        .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
        .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
        .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
        .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
        .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
        .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
    },
    {
        .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
        .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
    },
};

os_log_t mptcp_log_handle;

/*
 * Protocol pr_init callback.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
    static int mptcp_initialized = 0;
    struct protosw *prp;
    struct ip6protosw *prp6;

    VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

    /* do this only once */
    if (mptcp_initialized) {
        return;
    }
    mptcp_initialized = 1;

    mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

    /*
     * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
     * we must be able to find IPPROTO_TCP entries for both.
     */
    prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
    VERIFY(prp != NULL);
    bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
    bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
        sizeof(mptcp_subflow_usrreqs));
    mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
    mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
    mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
    mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
    mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
    mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
    /*
     * Socket filters shouldn't attach/detach to/from this protosw
     * since pr_protosw is to be used instead, which points to the
     * real protocol; if they do, it is a bug and we should panic.
     */
    mptcp_subflow_protosw.pr_filter_head.tqh_first =
        (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
    mptcp_subflow_protosw.pr_filter_head.tqh_last =
        (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

    prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
        IPPROTO_TCP, SOCK_STREAM);
    VERIFY(prp6 != NULL);
    bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
    bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
        sizeof(mptcp_subflow_usrreqs6));
    mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
    mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
    mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
    mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
    mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
    mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
    /*
     * Socket filters shouldn't attach/detach to/from this protosw
     * since pr_protosw is to be used instead, which points to the
     * real protocol; if they do, it is a bug and we should panic.
     */
    mptcp_subflow_protosw6.pr_filter_head.tqh_first =
        (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
    mptcp_subflow_protosw6.pr_filter_head.tqh_last =
        (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

    bzero(&mtcbinfo, sizeof(mtcbinfo));
    TAILQ_INIT(&mtcbinfo.mppi_pcbs);
    mtcbinfo.mppi_size = sizeof(struct mpp_mtp);
    mtcbinfo.mppi_zone = zone_create("mptc", mtcbinfo.mppi_size,
        ZC_NONE);

    mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
    mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
        mtcbinfo.mppi_lock_grp_attr);
    mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
    lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
        mtcbinfo.mppi_lock_attr);

    mtcbinfo.mppi_gc = mptcp_gc;
    mtcbinfo.mppi_timer = mptcp_timer;

    /* attach to MP domain for garbage collection to take place */
    mp_pcbinfo_attach(&mtcbinfo);

    mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}

int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
{
    int i, index = -1;

    for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
        if (create && stats[i].ifindex == IFSCOPE_NONE) {
            if (index < 0) {
                index = i;
            }
            continue;
        }

        if (stats[i].ifindex == ifindex) {
            index = i;
            return index;
        }
    }

    if (index != -1) {
        stats[index].ifindex = ifindex;
    }

    return index;
}

static int
mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
{
    const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
    int index;

    if (ifp == NULL) {
        os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
            sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
        return -1;
    }

    index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);

    if (index != -1) {
        if (stats[index].is_expensive == 0) {
            stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
        }
    }

    return index;
}

void
mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
{
    int index;

    tcpstat.tcps_mp_switches++;
    mpte->mpte_subflow_switches++;

    index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);

    if (index != -1) {
        mpte->mpte_itfstats[index].switches++;
    }
}

/*
 * Flushes all recorded socket options from an MP socket.
 */
static void
mptcp_flush_sopts(struct mptses *mpte)
{
    struct mptopt *mpo, *tmpo;

    TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
        mptcp_sopt_remove(mpte, mpo);
        mptcp_sopt_free(mpo);
    }
    VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}

/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 */
int
mptcp_session_create(struct mppcb *mpp)
{
    struct mppcbinfo *mppi;
    struct mptses *mpte;
    struct mptcb *mp_tp;

    VERIFY(mpp != NULL);
    mppi = mpp->mpp_pcbinfo;
    VERIFY(mppi != NULL);

    __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
    __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

    /* MPTCP Multipath PCB Extension */
    bzero(mpte, sizeof(*mpte));
    VERIFY(mpp->mpp_pcbe == NULL);
    mpp->mpp_pcbe = mpte;
    mpte->mpte_mppcb = mpp;
    mpte->mpte_mptcb = mp_tp;

    TAILQ_INIT(&mpte->mpte_sopts);
    TAILQ_INIT(&mpte->mpte_subflows);
    mpte->mpte_associd = SAE_ASSOCID_ANY;
    mpte->mpte_connid_last = SAE_CONNID_ANY;

    mptcp_init_urgency_timer(mpte);

    mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
    mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

    if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
        mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
    }

    mpte->mpte_last_cellicon_set = tcp_now;

    /* MPTCP Protocol Control Block */
    bzero(mp_tp, sizeof(*mp_tp));
    mp_tp->mpt_mpte = mpte;
    mp_tp->mpt_state = MPTCPS_CLOSED;

    DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

    return 0;
}
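
/*
 * For orientation (editorial sketch): the Multipath PCB, the MPTCP session
 * and the MPTCP PCB come from a single mppi_zone allocation, laid out
 * roughly as the mpp_mtp container declared in mptcp_var.h:
 *
 *    struct mpp_mtp {
 *        struct mppcb  mpp;      // Multipath PCB; mpp_pcbe points to...
 *        struct mptses mpp_ses;  // ...this MPTCP session, which uses...
 *        struct mptcb  mtcb;     // ...this MPTCP protocol control block
 *    };
 *
 * which is why mptcp_session_create() above can reach mpp_ses and mtcb by
 * simply casting the mppcb pointer.
 */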

struct sockaddr *
mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
{
    if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
        return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
    }

    if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
        return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
    }

    /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
     * meaning we prefer IPv6 over IPv4.
     */
    if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
        return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
    }

    if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
        return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
    }

    /* We don't yet have a unicast IP */
    return NULL;
}

static void
mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
    uint64_t *cellbytes, uint64_t *allbytes)
{
    int64_t mycellbytes = 0;
    uint64_t myallbytes = 0;
    int i;

    for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
        if (mpte->mpte_itfstats[i].is_expensive) {
            mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
            mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
        }

        myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
        myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
    }

    if (initial_cell) {
        mycellbytes -= mpte->mpte_init_txbytes;
        mycellbytes -= mpte->mpte_init_rxbytes;
    }

    if (mycellbytes < 0) {
        os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
        *cellbytes = 0;
        *allbytes = 0;
    } else {
        *cellbytes = mycellbytes;
        *allbytes = myallbytes;
    }
}

static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
    boolean_t cell = mpte->mpte_initial_cell;

    switch (mpte->mpte_svctype) {
    case MPTCP_SVCTYPE_HANDOVER:
        if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
            tcpstat.tcps_mptcp_fp_handover_attempt++;

            if (cell && mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_fp_handover_success_cell++;

                if (mpte->mpte_used_wifi) {
                    tcpstat.tcps_mptcp_handover_wifi_from_cell++;
                }
            } else if (mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_fp_handover_success_wifi++;

                if (mpte->mpte_used_cell) {
                    tcpstat.tcps_mptcp_handover_cell_from_wifi++;
                }
            }
        } else {
            tcpstat.tcps_mptcp_handover_attempt++;

            if (cell && mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_handover_success_cell++;

                if (mpte->mpte_used_wifi) {
                    tcpstat.tcps_mptcp_handover_wifi_from_cell++;
                }
            } else if (mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_handover_success_wifi++;

                if (mpte->mpte_used_cell) {
                    tcpstat.tcps_mptcp_handover_cell_from_wifi++;
                }
            }
        }

        if (mpte->mpte_handshake_success) {
            uint64_t cellbytes;
            uint64_t allbytes;

            mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

            tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
            tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
        }
        break;
    case MPTCP_SVCTYPE_INTERACTIVE:
        if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
            tcpstat.tcps_mptcp_fp_interactive_attempt++;

            if (mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_fp_interactive_success++;

                if (!cell && mpte->mpte_used_cell) {
                    tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
                }
            }
        } else {
            tcpstat.tcps_mptcp_interactive_attempt++;

            if (mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_interactive_success++;

                if (!cell && mpte->mpte_used_cell) {
                    tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
                }
            }
        }

        if (mpte->mpte_handshake_success) {
            uint64_t cellbytes;
            uint64_t allbytes;

            mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

            tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
            tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
        }
        break;
    case MPTCP_SVCTYPE_AGGREGATE:
        if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
            tcpstat.tcps_mptcp_fp_aggregate_attempt++;

            if (mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_fp_aggregate_success++;
            }
        } else {
            tcpstat.tcps_mptcp_aggregate_attempt++;

            if (mpte->mpte_handshake_success) {
                tcpstat.tcps_mptcp_aggregate_success++;
            }
        }

        if (mpte->mpte_handshake_success) {
            uint64_t cellbytes;
            uint64_t allbytes;

            mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

            tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
            tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
        }
        break;
    }

    if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
        tcpstat.tcps_mptcp_back_to_wifi++;
    }

    if (mpte->mpte_triggered_cell) {
        tcpstat.tcps_mptcp_triggered_cell++;
    }
}

/*
 * Destroy an MPTCP session.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
    struct mptcb *mp_tp = mpte->mpte_mptcb;

    VERIFY(mp_tp != NULL);
    VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

    mptcpstats_session_wrapup(mpte);
    mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
    mptcp_flush_sopts(mpte);

    if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
        _FREE(mpte->mpte_itfinfo, M_TEMP);
    }
    mpte->mpte_itfinfo = NULL;

    m_freem_list(mpte->mpte_reinjectq);

    os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
        __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}

boolean_t
mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
{
    return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
           mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
           !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
}

static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
    const struct in_addr *addrv4)
{
    static const struct in6_addr well_known_prefix = {
        .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00},
    };
    const char *ptrv4 = (const char *)addrv4;
    char *ptr = (char *)addr;

    if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
        IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
        IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
        IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
        IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
        IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
        INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
        return -1;
    }

    /* Check for the well-known prefix */
    if (len == NAT64_PREFIX_LEN_96 &&
        IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
        if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
            IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
            return -1;
        }
    }

    switch (len) {
    case NAT64_PREFIX_LEN_96:
        memcpy(ptr + 12, ptrv4, 4);
        break;
    case NAT64_PREFIX_LEN_64:
        memcpy(ptr + 9, ptrv4, 4);
        break;
    case NAT64_PREFIX_LEN_56:
        memcpy(ptr + 7, ptrv4, 1);
        memcpy(ptr + 9, ptrv4 + 1, 3);
        break;
    case NAT64_PREFIX_LEN_48:
        memcpy(ptr + 6, ptrv4, 2);
        memcpy(ptr + 9, ptrv4 + 2, 2);
        break;
    case NAT64_PREFIX_LEN_40:
        memcpy(ptr + 5, ptrv4, 3);
        memcpy(ptr + 9, ptrv4 + 3, 1);
        break;
    case NAT64_PREFIX_LEN_32:
        memcpy(ptr + 4, ptrv4, 4);
        break;
    default:
        panic("NAT64-prefix len is wrong: %u\n", len);
    }

    return 0;
}
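
/*
 * Worked example (editorial, per RFC 6052): with the 96-bit well-known
 * prefix 64:ff9b::/96, the IPv4 destination 192.0.2.33 is embedded in the
 * last four bytes (ptr + 12 above), yielding 64:ff9b::192.0.2.33, i.e.
 * 64:ff9b::c000:221. Shorter prefix lengths split the four IPv4 bytes
 * around byte 8 (bits 64-71), which RFC 6052 reserves as zero.
 */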

static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
    struct socket *mp_so = mptetoso(mpte);

    if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
        uuid_string_t uuidstr;
        int err;

        socket_unlock(mp_so, 0);
        err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
            TRUE);
        socket_lock(mp_so, 0);

        if (err == 0) {
            mpte->mpte_triggered_cell = 1;
        }

        uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
        os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
    } else {
        os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
    }
}

static boolean_t
mptcp_subflow_disconnecting(struct mptsub *mpts)
{
    if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
        return true;
    }

    if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
        return true;
    }

    if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
        return true;
    }

    return false;
}

/*
 * In Handover mode, only create cell subflow if
 * - Symptoms marked WiFi as weak:
 *   Here, if we are sending data, then we can check the RTO-state. That is a
 *   stronger signal of WiFi quality than the Symptoms indicator.
 *   If however we are not sending any data, the only thing we can do is guess
 *   and thus bring up Cell.
 *
 * - Symptoms marked WiFi as unknown:
 *   In this state we don't know what the situation is and thus remain
 *   conservative, only bringing up cell if there are retransmissions going on.
 */
static boolean_t
mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
{
    int unusable_state = mptcp_is_wifi_unusable_for_session(mpte);

    if (unusable_state == 0) {
        /* WiFi is good - don't use cell */
        return false;
    }

    if (unusable_state == -1) {
        /*
         * We are in unknown state, only use Cell if we have confirmed
         * that WiFi is bad.
         */
        if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
            return true;
        } else {
            return false;
        }
    }

    if (unusable_state == 1) {
        /*
         * WiFi is confirmed to be bad from Symptoms-Framework.
         * If we are sending data, check the RTOs.
         * Otherwise, be pessimistic and use Cell.
         */
        if (mptetoso(mpte)->so_snd.sb_cc != 0) {
            if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
                return true;
            } else {
                return false;
            }
        } else {
            return true;
        }
    }

    return false;
}
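
/*
 * Summary of the above (editorial): with data queued (so_snd.sb_cc != 0),
 * cell is used only once the retransmission shift reaches
 * 2 * mptcp_fail_thresh; with no data queued, an unknown WiFi state keeps
 * us on WiFi, while a confirmed-bad WiFi state falls back to cell right away.
 */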

void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
    struct mptcb *mp_tp = mpte->mpte_mptcb;
    boolean_t cellular_viable = FALSE;
    boolean_t want_cellular = TRUE;
    uint32_t i;

    if (!mptcp_ok_to_create_subflows(mp_tp)) {
        os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
        return;
    }

    /* Just to see if we have an IP-address available */
    if (mptcp_get_session_dst(mpte, false, false) == NULL) {
        return;
    }

    for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
        boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
        struct mpt_itf_info *info;
        struct sockaddr_in6 nat64pre;
        struct sockaddr *dst;
        struct mptsub *mpts;
        struct ifnet *ifp;
        uint32_t ifindex;

        info = &mpte->mpte_itfinfo[i];

        ifindex = info->ifindex;
        if (ifindex == IFSCOPE_NONE) {
            continue;
        }

        os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
            info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

        if (info->no_mptcp_support) {
            continue;
        }

        ifnet_head_lock_shared();
        ifp = ifindex2ifnet[ifindex];
        ifnet_head_done();

        if (ifp == NULL) {
            continue;
        }

        if (IFNET_IS_CELLULAR(ifp)) {
            cellular_viable = TRUE;

            if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
                mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
                if (!mptcp_is_wifi_unusable_for_session(mpte)) {
                    continue;
                }
            }
        }

        TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
            const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
            struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

            if (subifp == NULL) {
                continue;
            }

            /*
             * If there is at least one functioning subflow on WiFi
             * and we are checking for the cell interface, then
             * we always need to ask symptoms for permission as
             * cell is triggered even if WiFi is available.
             */
            if (!IFNET_IS_CELLULAR(subifp) &&
                !mptcp_subflow_disconnecting(mpts) &&
                IFNET_IS_CELLULAR(ifp)) {
                need_to_ask_symptoms = TRUE;
            }

            if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
                os_log(mptcp_log_handle,
                    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
                    IFNET_IS_CELLULAR(subifp),
                    mptcp_is_wifi_unusable_for_session(mpte),
                    mpts->mpts_flags,
                    tp->t_rxtshift,
                    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
                    mptetoso(mpte)->so_snd.sb_cc,
                    ifindex, subifp->if_index,
                    tp->t_srtt >> TCP_RTT_SHIFT,
                    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
                    tp->t_rxtcur);

                if (!IFNET_IS_CELLULAR(subifp) &&
                    !mptcp_subflow_disconnecting(mpts) &&
                    (mpts->mpts_flags & MPTSF_CONNECTED) &&
                    !mptcp_handover_use_cellular(mpte, tp)) {
                    found = TRUE;

                    /* We found a proper subflow on WiFi - no need for cell */
                    want_cellular = FALSE;
                    break;
                }
            } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
                uint64_t time_now = mach_continuous_time();

                os_log(mptcp_log_handle,
                    "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
                    time_now, mptcp_is_wifi_unusable_for_session(mpte),
                    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
                    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

                if (!IFNET_IS_CELLULAR(subifp) &&
                    !mptcp_subflow_disconnecting(mpts) &&
                    (mpte->mpte_time_target == 0 ||
                    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
                    !mptcp_is_wifi_unusable_for_session(mpte))) {
                    found = TRUE;

                    want_cellular = FALSE;
                    break;
                }
            }

            if (subifp->if_index == ifindex &&
                !mptcp_subflow_disconnecting(mpts)) {
                /*
                 * We found a subflow on this interface.
                 * No need to create a new one.
                 */
                found = TRUE;
                break;
            }
        }

        if (found) {
            continue;
        }

        if (need_to_ask_symptoms &&
            !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
            !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
            mptcp_developer_mode == 0) {
            mptcp_ask_symptoms(mpte);
            return;
        }

        dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

        if (dst->sa_family == AF_INET &&
            !info->has_v4_conn && info->has_nat64_conn) {
            struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
            int error, j;

            bzero(&nat64pre, sizeof(struct sockaddr_in6));

            error = ifnet_get_nat64prefix(ifp, nat64prefixes);
            if (error) {
                os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
                continue;
            }

            for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
                if (nat64prefixes[j].prefix_len != 0) {
                    break;
                }
            }

            VERIFY(j < NAT64_MAX_NUM_PREFIXES);

            error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
                nat64prefixes[j].prefix_len,
                &((struct sockaddr_in *)(void *)dst)->sin_addr);
            if (error != 0) {
                os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
                continue;
            }

            memcpy(&nat64pre.sin6_addr,
                &nat64prefixes[j].ipv6_prefix,
                sizeof(nat64pre.sin6_addr));
            nat64pre.sin6_len = sizeof(struct sockaddr_in6);
            nat64pre.sin6_family = AF_INET6;
            nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
            nat64pre.sin6_flowinfo = 0;
            nat64pre.sin6_scope_id = 0;

            dst = (struct sockaddr *)&nat64pre;
        }

        if (dst->sa_family == AF_INET && !info->has_v4_conn) {
            continue;
        }
        if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
            continue;
        }

        mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
    }

    if (!cellular_viable && want_cellular) {
        /* Trigger Cell Bringup */
        mptcp_trigger_cell_bringup(mpte);
    }
}

static void
mptcp_remove_cell_subflows(struct mptses *mpte)
{
    struct mptsub *mpts, *tmpts;

    TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
        const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

        if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
            continue;
        }

        os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

        soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
    }

    return;
}

static void
mptcp_remove_wifi_subflows(struct mptses *mpte)
{
    struct mptsub *mpts, *tmpts;

    TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
        const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

        if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
            continue;
        }

        os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

        soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
    }

    return;
}

static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
    int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
    boolean_t found_working_wifi_subflow = false;
    boolean_t found_working_cell_subflow = false;

    struct mptsub *mpts;

    /*
     * Look for a subflow that is on a non-cellular interface in connected
     * state.
     *
     * In that case, remove all cellular subflows.
     *
     * If however there is no working subflow on WiFi and WiFi is unusable,
     * remove the WiFi subflows instead, provided a working cellular subflow
     * exists.
     */
    TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
        const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
        struct socket *so;
        struct tcpcb *tp;

        if (ifp == NULL) {
            continue;
        }

        so = mpts->mpts_socket;
        tp = sototcpcb(so);

        if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
            tp->t_state != TCPS_ESTABLISHED ||
            mptcp_subflow_disconnecting(mpts)) {
            continue;
        }

        if (IFNET_IS_CELLULAR(ifp)) {
            found_working_cell_subflow = true;
        } else {
            os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
                __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
            if (!mptcp_handover_use_cellular(mpte, tp)) {
                found_working_wifi_subflow = true;
            }
        }
    }

    /*
     * Couldn't find a working subflow, let's not remove those on a cellular
     * interface.
     */
    os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
        __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
        found_working_wifi_subflow, found_working_cell_subflow);
    if (!found_working_wifi_subflow && wifi_unusable) {
        if (found_working_cell_subflow) {
            mptcp_remove_wifi_subflows(mpte);
        }
        return;
    }

    mptcp_remove_cell_subflows(mpte);
}

static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
    int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
    boolean_t found_working_subflow = false;
    struct mptsub *mpts;

    /*
     * Look for a subflow that is on a non-cellular interface
     * and actually works (aka, no retransmission timeout).
     */
    TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
        const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
        struct socket *so;
        struct tcpcb *tp;

        if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
            continue;
        }

        so = mpts->mpts_socket;
        tp = sototcpcb(so);

        if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
            tp->t_state != TCPS_ESTABLISHED) {
            continue;
        }

        os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);

        if (!mptcp_handover_use_cellular(mpte, tp)) {
            found_working_subflow = true;
            break;
        }
    }

    /*
     * Couldn't find a working subflow, let's not remove those on a cellular
     * interface.
     */
    if (!found_working_subflow) {
        return;
    }

    mptcp_remove_cell_subflows(mpte);
}

static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
    uint64_t time_now = mach_continuous_time();
    struct mptsub *mpts;

    if (mpte->mpte_time_target != 0 &&
        (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
        mptcp_is_wifi_unusable_for_session(mpte)) {
        /* WiFi is bad and we are below the target - don't remove any subflows */
        return;
    }

    TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
        const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

        if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
            continue;
        }

        /* We have a functioning subflow on WiFi. No need for cell! */
        if (mpts->mpts_flags & MPTSF_CONNECTED &&
            !mptcp_subflow_disconnecting(mpts)) {
            mptcp_remove_cell_subflows(mpte);
            break;
        }
    }
}

/*
 * Based on the MPTCP Service-type and the state of the subflows, we
 * will destroy subflows here.
 */
void
mptcp_check_subflows_and_remove(struct mptses *mpte)
{
    if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
        return;
    }

    socket_lock_assert_owned(mptetoso(mpte));

    if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
        mptcp_pure_handover_subflows_remove(mpte);
    }

    if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
        mptcp_handover_subflows_remove(mpte);
    }

    if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
        mptcp_targetbased_subflows_remove(mpte);
    }
}

static void
mptcp_remove_subflows(struct mptses *mpte)
{
    struct mptsub *mpts, *tmpts;

    if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
        return;
    }

    TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
        const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
        boolean_t found = false;
        uint32_t ifindex;
        uint32_t i;

        if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
            mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

            os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
                __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
                ifp ? ifp->if_index : -1);
            soevent(mpts->mpts_socket,
                SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

            continue;
        }

        if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
            continue;
        }

        if (ifp) {
            ifindex = ifp->if_index;
        } else {
            ifindex = mpts->mpts_ifscope;
        }

        for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
            if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
                continue;
            }

            if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
                if (mpts->mpts_dst.sa_family == AF_INET6 &&
                    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
                    found = true;
                    break;
                }

                if (mpts->mpts_dst.sa_family == AF_INET &&
                    mpte->mpte_itfinfo[i].has_v4_conn) {
                    found = true;
                    break;
                }
            }
        }

        if (!found) {
            os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
                __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                ifindex, mpts->mpts_flags);

            soevent(mpts->mpts_socket,
                SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
        }
    }
}

static void
mptcp_create_subflows(__unused void *arg)
{
    struct mppcb *mpp;

    /*
     * Start with clearing, because we might be processing connections
     * while a new event comes in.
     */
    if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
        os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
    }

    /* Iterate over all MPTCP connections */

    lck_mtx_lock(&mtcbinfo.mppi_lock);

    TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
        struct socket *mp_so = mpp->mpp_socket;
        struct mptses *mpte = mpp->mpp_pcbe;

        if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
            continue;
        }

        socket_lock(mp_so, 1);
        VERIFY(mp_so->so_usecount > 0);

        mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

        mptcp_check_subflows_and_add(mpte);
        mptcp_remove_subflows(mpte);

        mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
        socket_unlock(mp_so, 1);
    }

    lck_mtx_unlock(&mtcbinfo.mppi_lock);
}

/*
 * We need this because we are coming from an NECP-event. This event gets posted
 * while holding NECP-locks. The creation of the subflow however leads us back
 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
 * So, we would deadlock there as we already hold the NECP-lock.
 *
 * So, let's schedule this separately. It also gives NECP the chance to make
 * progress, without having to wait for MPTCP to finish its subflow creation.
 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
    struct mppcb *mpp = mpte->mpte_mppcb;
    struct mptcb *mp_tp = mpte->mpte_mptcb;
    struct socket *mp_so = mpp->mpp_socket;

    if (!mptcp_ok_to_create_subflows(mp_tp)) {
        os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
        return;
    }

    if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
        mp_so->so_usecount++; /* To prevent it from being free'd in-between */
        mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
    }

    if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
        return;
    }

    /* Do the call in 100ms to allow NECP to schedule it on all sockets */
    timeout(mptcp_create_subflows, NULL, hz / 10);
}

/*
 * Allocate an MPTCP socket option structure.
 */
struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)
{
    return zalloc_flags(mptopt_zone, how | Z_ZERO);
}

/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
    VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

    zfree(mptopt_zone, mpo);
}

/*
 * Add a socket option to the MPTCP socket option list.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
    socket_lock_assert_owned(mptetoso(mpte));
    mpo->mpo_flags |= MPOF_ATTACHED;
    TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Remove a socket option from the MPTCP socket option list.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
    socket_lock_assert_owned(mptetoso(mpte));
    VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
    mpo->mpo_flags &= ~MPOF_ATTACHED;
    TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Search for an existing <sopt_level,sopt_name> socket option.
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
    struct mptopt *mpo;

    socket_lock_assert_owned(mptetoso(mpte));

    TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
        if (mpo->mpo_level == sopt->sopt_level &&
            mpo->mpo_name == sopt->sopt_name) {
            break;
        }
    }
    return mpo;
}

/*
 * Allocate a MPTCP subflow structure.
 */
static struct mptsub *
mptcp_subflow_alloc(void)
{
    return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
}

/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released. This implies that the subflow has been deleted.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
    VERIFY(mpts->mpts_refcnt == 0);
    VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
    VERIFY(mpts->mpts_mpte == NULL);
    VERIFY(mpts->mpts_socket == NULL);

    if (mpts->mpts_src != NULL) {
        FREE(mpts->mpts_src, M_SONAME);
        mpts->mpts_src = NULL;
    }

    zfree(mptsub_zone, mpts);
}

static void
mptcp_subflow_addref(struct mptsub *mpts)
{
    if (++mpts->mpts_refcnt == 0) {
        panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
    }
    /* NOTREACHED */
}

static void
mptcp_subflow_remref(struct mptsub *mpts)
{
    if (mpts->mpts_refcnt == 0) {
        panic("%s: mpts %p negative refcnt\n", __func__, mpts);
        /* NOTREACHED */
    }
    if (--mpts->mpts_refcnt > 0) {
        return;
    }

    /* callee will unlock and destroy lock */
    mptcp_subflow_free(mpts);
}

static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
    struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
    struct tcpcb *tp = sototcpcb(so);

    /*
     * From this moment on, the subflow is linked to the MPTCP-connection.
     * Locking,... happens now at the MPTCP-layer
     */
    tp->t_mptcb = mpte->mpte_mptcb;
    so->so_flags |= SOF_MP_SUBFLOW;
    mp_so->so_usecount++;

    /*
     * Insert the subflow into the list, and associate the MPTCP PCB
     * as well as the subflow socket. From this point on, removing
     * the subflow needs to be done via mptcp_subflow_del().
     */
1567 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1568 mpte->mpte_numflows++;
1569
1570 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1571 mpts->mpts_mpte = mpte;
1572 mpts->mpts_socket = so;
1573 tp->t_mpsub = mpts;
0a7de745
A
1574 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1575 mptcp_subflow_addref(mpts); /* for subflow socket */
5ba3f43e
A
1576}
1577
1578static void
1579mptcp_subflow_necp_cb(void *handle, __unused int action,
0a7de745
A
1580 __unused uint32_t interface_index,
1581 uint32_t necp_flags, bool *viable)
5ba3f43e 1582{
d9a64523 1583 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
5ba3f43e
A
1584 struct inpcb *inp = (struct inpcb *)handle;
1585 struct socket *so = inp->inp_socket;
1586 struct mptsub *mpts;
1587 struct mptses *mpte;
1588
0a7de745 1589 if (low_power) {
d9a64523 1590 action = NECP_CLIENT_CBACTION_NONVIABLE;
0a7de745 1591 }
d9a64523 1592
0a7de745 1593 if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
5ba3f43e 1594 return;
0a7de745 1595 }
5ba3f43e
A
1596
1597 /*
1598 * The socket is being garbage-collected. There is nothing to be done
1599 * here.
1600 */
cb323159 1601 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
5ba3f43e 1602 return;
0a7de745 1603 }
5ba3f43e
A
1604
1605 socket_lock(so, 1);
1606
1607 /* Check again after we acquired the lock. */
cb323159 1608 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
5ba3f43e 1609 goto out;
0a7de745 1610 }
5ba3f43e
A
1611
1612 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1613 mpts = sototcpcb(so)->t_mpsub;
1614
cb323159
A
1615 os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
1616 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);
5ba3f43e
A
1617
1618 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1619
1620 mptcp_sched_create_subflows(mpte);
1621
cb323159 1622 if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
c3c9b80d 1623 mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
cb323159
A
1624 mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
1625 viable != NULL) {
d9a64523 1626 *viable = 1;
0a7de745 1627 }
5ba3f43e
A
1628
1629out:
1630 socket_unlock(so, 1);
1631}
1632
39236c6e
A
1633/*
1634 * Create an MPTCP subflow socket.
1635 */
1636static int
1637mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
5ba3f43e 1638 struct socket **so)
39236c6e 1639{
5ba3f43e 1640 lck_mtx_t *subflow_mtx;
39236c6e 1641 struct mptopt smpo, *mpo, *tmpo;
5ba3f43e 1642 struct proc *p;
39236c6e
A
1643 struct socket *mp_so;
1644 int error;
1645
1646 *so = NULL;
cb323159 1647
5ba3f43e
A
1648 mp_so = mptetoso(mpte);
1649
1650 p = proc_find(mp_so->last_pid);
1651 if (p == PROC_NULL) {
cb323159
A
1652 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1653 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
5ba3f43e 1654
f427ee49 1655 mptcp_subflow_free(mpts);
0a7de745 1656 return ESRCH;
5ba3f43e 1657 }
39236c6e
A
1658
1659 /*
1660 * Create the subflow socket (multipath subflow, non-blocking.)
1661 *
1662 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1663 * socket; it will be cleared when the socket is peeled off or closed.
1664 * It also indicates to the underlying TCP to handle MPTCP options.
1665 * A multipath subflow socket implies SS_NOFDREF state.
1666 */
5ba3f43e
A
1667
1668 /*
1669 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1670 * the ipi-lock. We cannot hold the socket-lock at that point.
1671 */
cb323159 1672 socket_unlock(mp_so, 0);
5ba3f43e 1673 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
cb323159
A
1674 SOCF_MPTCP, PROC_NULL);
1675 socket_lock(mp_so, 0);
5ba3f43e 1676 if (error) {
cb323159
A
1677 os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1678 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
5ba3f43e
A
1679
1680 proc_rele(p);
1681
1682 mptcp_subflow_free(mpts);
0a7de745 1683 return error;
39236c6e
A
1684 }
1685
5ba3f43e
A
1686 /*
1687 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1688 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1689 * That is also why we need to take the lock via pr_getlock: after
1690 * setting the flag, socket_unlock will operate on the MPTCP-level lock.
1691 */
1692 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1693 lck_mtx_lock(subflow_mtx);
1694
1695 /*
1696 * Must be the first thing we do, to make sure all pointers for this
1697 * subflow are set.
1698 */
1699 mptcp_subflow_attach(mpte, mpts, *so);
1700
1701 /*
1702 * A multipath subflow socket is used internally in the kernel,
1703 * therefore it does not have a file descriptor associated by
1704 * default.
1705 */
1706 (*so)->so_state |= SS_NOFDREF;
1707
1708 lck_mtx_unlock(subflow_mtx);
39236c6e
A
1709
1710 /* prevent the socket buffers from being compressed */
1711 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1712 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1713
490019cf 1714 /* Inherit preconnect and TFO data flags */
0a7de745 1715 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
490019cf 1716 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
0a7de745
A
1717 }
1718 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
490019cf 1719 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
0a7de745 1720 }
c3c9b80d
A
1721 if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
1722 (*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1723 }
490019cf 1724
5ba3f43e
A
1725 /* Inherit uuid and create the related flow. */
1726 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1727 struct mptcb *mp_tp = mpte->mpte_mptcb;
1728
1729 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1730
1731 /*
1732 * A note on the unlock: with MPTCP, necp_client_register_socket_flow
1733 * gets called multiple times. This is problematic, because the
1734 * lock-ordering guarantee (first necp locks, then socket locks) is
1735 * no longer respected. So, we need to
1736 * unlock here.
1737 */
cb323159 1738 socket_unlock(mp_so, 0);
5ba3f43e
A
1739 error = necp_client_register_socket_flow(mp_so->last_pid,
1740 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
cb323159 1741 socket_lock(mp_so, 0);
5ba3f43e 1742
0a7de745 1743 if (error) {
cb323159
A
1744 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1745 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1746
5ba3f43e 1747 goto out_err;
0a7de745 1748 }
5ba3f43e
A
1749
1750 /* Possible state-change during the unlock above */
1751 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
0a7de745 1752 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
cb323159
A
1753 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1754 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1755 mp_tp->mpt_state, mp_tp->mpt_flags);
1756
1757 error = EINVAL;
5ba3f43e 1758 goto out_err;
0a7de745 1759 }
5ba3f43e
A
1760
1761 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
cb323159
A
1762 }
1763
1764 /* Needs to happen prior to the delegation! */
1765 (*so)->last_pid = mp_so->last_pid;
1766
1767 if (mp_so->so_flags & SOF_DELEGATED) {
1768 if (mpte->mpte_epid) {
1769 error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1770 if (error) {
1771 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1772 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1773 goto out_err;
1774 }
1775 }
1776 if (!uuid_is_null(mpte->mpte_euuid)) {
1777 error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1778 if (error) {
1779 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1780 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1781 goto out_err;
1782 }
1783 }
5ba3f43e
A
1784 }
1785
1786 /* inherit the other socket options */
0a7de745 1787 bzero(&smpo, sizeof(smpo));
39236c6e
A
1788 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1789 smpo.mpo_level = SOL_SOCKET;
1790 smpo.mpo_intval = 1;
1791
1792 /* disable SIGPIPE */
1793 smpo.mpo_name = SO_NOSIGPIPE;
0a7de745 1794 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
5ba3f43e 1795 goto out_err;
0a7de745 1796 }
39236c6e
A
1797
1798 /* find out if the subflow's source address goes away */
1799 smpo.mpo_name = SO_NOADDRERR;
0a7de745 1800 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
5ba3f43e 1801 goto out_err;
0a7de745 1802 }
39236c6e 1803
5ba3f43e
A
1804 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1805 /*
1806 * On secondary subflows we might need to set the cell-fallback
1807 * flag (see conditions in mptcp_subflow_sosetopt).
1808 */
1809 smpo.mpo_level = SOL_SOCKET;
1810 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1811 smpo.mpo_intval = 1;
0a7de745 1812 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
5ba3f43e 1813 goto out_err;
0a7de745 1814 }
5ba3f43e 1815 }
39236c6e
A
1816
1817 /* replay setsockopt(2) on the subflow sockets for eligible options */
1818 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1819 int interim;
1820
0a7de745 1821 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
39236c6e 1822 continue;
0a7de745 1823 }
39236c6e
A
1824
1825 /*
1826 * Skip those that are handled internally; these options
1827 * should not have been recorded and marked with
1828 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1829 */
1830 if (mpo->mpo_level == SOL_SOCKET &&
1831 (mpo->mpo_name == SO_NOSIGPIPE ||
1832 mpo->mpo_name == SO_NOADDRERR ||
0a7de745 1833 mpo->mpo_name == SO_KEEPALIVE)) {
39236c6e 1834 continue;
0a7de745 1835 }
39236c6e
A
1836
1837 interim = (mpo->mpo_flags & MPOF_INTERIM);
5ba3f43e 1838 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
cb323159
A
1839 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1840 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
5ba3f43e 1841 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
cb323159 1842 mpo->mpo_intval);
39236c6e
A
1843 mptcp_sopt_remove(mpte, mpo);
1844 mptcp_sopt_free(mpo);
1845 continue;
1846 }
1847 }
1848
1849 /*
1850 * We need to receive everything that the subflow socket has,
1851 * so use a customized socket receive function. We will undo
1852 * this when the socket is peeled off or closed.
1853 */
39236c6e
A
1854 switch (dom) {
1855 case PF_INET:
1856 (*so)->so_proto = &mptcp_subflow_protosw;
1857 break;
39236c6e
A
1858 case PF_INET6:
1859 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1860 break;
39236c6e
A
1861 default:
1862 VERIFY(0);
1863 /* NOTREACHED */
1864 }
1865
5ba3f43e
A
1866 proc_rele(p);
1867
1868 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1869 int, dom, int, error);
1870
0a7de745 1871 return 0;
39236c6e 1872
5ba3f43e
A
1873out_err:
1874 mptcp_subflow_abort(mpts, error);
1875
1876 proc_rele(p);
1877
0a7de745 1878 return error;
39236c6e
A
1879}
1880
1881/*
1882 * Close an MPTCP subflow socket.
1883 *
1884 * Note that this may be called on an embryonic subflow, and the only
1885 * thing that is guaranteed valid is the protocol-user request.
1886 */
5ba3f43e
A
1887static void
1888mptcp_subflow_soclose(struct mptsub *mpts)
39236c6e 1889{
5ba3f43e
A
1890 struct socket *so = mpts->mpts_socket;
1891
0a7de745 1892 if (mpts->mpts_flags & MPTSF_CLOSED) {
5ba3f43e 1893 return;
0a7de745 1894 }
39236c6e 1895
5ba3f43e 1896 VERIFY(so != NULL);
39236c6e 1897 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
0a7de745 1898 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
39236c6e 1899
39236c6e
A
1900 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1901 struct socket *, so,
1902 struct sockbuf *, &so->so_rcv,
1903 struct sockbuf *, &so->so_snd,
1904 struct mptses *, mpts->mpts_mpte);
1905
5ba3f43e
A
1906 mpts->mpts_flags |= MPTSF_CLOSED;
1907
1908 if (so->so_retaincnt == 0) {
1909 soclose_locked(so);
1910
1911 return;
1912 } else {
1913 VERIFY(so->so_usecount > 0);
1914 so->so_usecount--;
1915 }
1916
1917 return;
39236c6e
A
1918}
1919
1920/*
1921 * Connect an MPTCP subflow socket.
1922 *
5ba3f43e
A
1923 * Note that in the pending connect case, the subflow socket may have been
1924 * bound to an interface and/or a source IP address which may no longer be
1925 * around by the time this routine is called; in that case the connect attempt
1926 * will most likely fail.
39236c6e
A
1927 */
1928static int
1929mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1930{
5ba3f43e
A
1931 char dbuf[MAX_IPv6_STR_LEN];
1932 struct socket *mp_so, *so;
1933 struct mptcb *mp_tp;
1934 struct sockaddr *dst;
1935 struct proc *p;
a39ff7e2 1936 int af, error, dport;
39236c6e 1937
5ba3f43e
A
1938 mp_so = mptetoso(mpte);
1939 mp_tp = mpte->mpte_mptcb;
a39ff7e2
A
1940 so = mpts->mpts_socket;
1941 af = mpts->mpts_dst.sa_family;
1942 dst = &mpts->mpts_dst;
1943
0a7de745 1944 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
a39ff7e2
A
1945 VERIFY(mpts->mpts_socket != NULL);
1946 VERIFY(af == AF_INET || af == AF_INET6);
1947
1948 if (af == AF_INET) {
0a7de745 1949 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
a39ff7e2
A
1950 dport = ntohs(SIN(dst)->sin_port);
1951 } else {
0a7de745 1952 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
a39ff7e2
A
1953 dport = ntohs(SIN6(dst)->sin6_port);
1954 }
1955
f427ee49 1956 os_log(mptcp_log_handle,
cb323159
A
1957 "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1958 mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
39236c6e 1959
5ba3f43e
A
1960 p = proc_find(mp_so->last_pid);
1961 if (p == PROC_NULL) {
cb323159
A
1962 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1963 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
39236c6e 1964
0a7de745 1965 return ESRCH;
39236c6e
A
1966 }
1967
1968 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1969
fe8ab488 1970 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
39037602 1971
39236c6e 1972 /* connect the subflow socket */
5ba3f43e
A
1973 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1974 p, mpts->mpts_ifscope,
1975 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1976
1977 mpts->mpts_iss = sototcpcb(so)->iss;
1978
1979 /* See tcp_connect_complete */
1980 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1981 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1982 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1983 }
39236c6e 1984
fe8ab488
A
1985 /* Allocate a unique address id per subflow */
1986 mpte->mpte_addrid_last++;
0a7de745 1987 if (mpte->mpte_addrid_last == 0) {
fe8ab488 1988 mpte->mpte_addrid_last++;
0a7de745 1989 }
fe8ab488 1990
5ba3f43e
A
1991 proc_rele(p);
1992
39236c6e
A
1993 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1994 struct mptsub *, mpts, int, error);
0a7de745 1995 if (error) {
cb323159
A
1996 os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1997 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
0a7de745 1998 }
39236c6e 1999
0a7de745 2000 return error;
39236c6e
A
2001}
2002
cb323159
A
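/*
 * Stamp the DSS mapping (dsn/rseq/dlen/dfin) onto an mbuf that starts at
 * offset 'off' within that mapping. If the mbuf extends past the right edge
 * of the mapping it is split first; an inconsistent second mapping on an
 * already-stamped mbuf forces a reset of the subflow and returns -1.
 */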
2003static int
2004mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
c3c9b80d 2005 uint32_t rseq, uint16_t dlen, uint8_t dfin)
cb323159
A
2006{
2007 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
2008
2009 if (m_pktlen(m) == 0) {
2010 return 0;
2011 }
2012
2a1bd2d3
A
2013 if (!(m->m_flags & M_PKTHDR)) {
2014 return 0;
2015 }
2016
2017 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
cb323159
A
2018 if (off && (dsn != m->m_pkthdr.mp_dsn ||
2019 rseq != m->m_pkthdr.mp_rseq ||
c3c9b80d
A
2020 dlen != m->m_pkthdr.mp_rlen ||
2021 dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
2022 os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
cb323159
A
2023 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
2024 (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
2025 rseq, m->m_pkthdr.mp_rseq,
c3c9b80d
A
2026 dlen, m->m_pkthdr.mp_rlen,
2027 dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));
cb323159
A
2028
2029 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2030 return -1;
2031 }
2a1bd2d3 2032 }
f427ee49 2033
2a1bd2d3 2034 /* If mbuf is beyond right edge of the mapping, we need to split */
c3c9b80d
A
2035 if (m_pktlen(m) > dlen - dfin - off) {
2036 struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
2a1bd2d3 2037 if (new == NULL) {
c3c9b80d 2038 os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
2a1bd2d3 2039 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
c3c9b80d 2040 dlen, dfin, off, m_pktlen(m),
2a1bd2d3 2041 mpts->mpts_connid);
cb323159 2042
2a1bd2d3
A
2043 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2044 return -1;
2045 }
cb323159 2046
2a1bd2d3
A
2047 m->m_next = new;
2048 sballoc(&so->so_rcv, new);
2049 /* Undo, as sballoc will add to it as well */
2050 so->so_rcv.sb_cc -= new->m_len;
2051
2052 if (so->so_rcv.sb_mbtail == m) {
2053 so->so_rcv.sb_mbtail = new;
cb323159
A
2054 }
2055 }
2056
2a1bd2d3
A
2057 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2058 m->m_pkthdr.mp_dsn = dsn + off;
2059 m->m_pkthdr.mp_rseq = rseq + off;
2a1bd2d3
A
2060 VERIFY(m_pktlen(m) < UINT16_MAX);
2061 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
2062
c3c9b80d
A
2063 /* Only put the DATA_FIN-flag on the last mbuf of this mapping */
2064 if (dfin) {
2065 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
2066 m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
2067 } else {
2068 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
2069 }
2070 }
2071
2072
cb323159
A
2073 mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
2074
2075 return 0;
2076}
2077
39236c6e
A
2078/*
2079 * MPTCP subflow socket receive routine, derived from soreceive().
2080 */
2081static int
2082mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
2083 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2084{
2085#pragma unused(uio)
2a1bd2d3
A
2086 struct socket *mp_so;
2087 struct mptses *mpte;
2088 struct mptcb *mp_tp;
39236c6e 2089 int flags, error = 0;
39236c6e 2090 struct mbuf *m, **mp = mp0;
2a1bd2d3
A
2091
2092 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
2093 mp_so = mptetoso(mpte);
2094 mp_tp = mpte->mpte_mptcb;
39236c6e 2095
39236c6e
A
2096 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2097
2098#ifdef MORE_LOCKING_DEBUG
2099 if (so->so_usecount == 1) {
2100 panic("%s: so=%x no other reference on socket\n", __func__, so);
2101 /* NOTREACHED */
2102 }
2103#endif
2104 /*
2105 * We return everything in the subflow's socket receive buffer
2106 * to the MPTCP layer, so we require that the caller passes in the
2107 * expected parameters.
2108 */
0a7de745
A
2109 if (mp == NULL || controlp != NULL) {
2110 return EINVAL;
2111 }
5ba3f43e 2112
39236c6e 2113 *mp = NULL;
0a7de745 2114 if (psa != NULL) {
39236c6e 2115 *psa = NULL;
0a7de745
A
2116 }
2117 if (flagsp != NULL) {
2118 flags = *flagsp & ~MSG_EOR;
2119 } else {
39236c6e 2120 flags = 0;
0a7de745 2121 }
39236c6e 2122
0a7de745
A
2123 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2124 return EOPNOTSUPP;
2125 }
5ba3f43e 2126
0a7de745 2127 flags |= (MSG_DONTWAIT | MSG_NBIO);
39236c6e
A
2128
2129 /*
2130 * If a recv attempt is made on a previously-accepted socket
2131 * that has been marked as inactive (disconnected), reject
2132 * the request.
2133 */
2134 if (so->so_flags & SOF_DEFUNCT) {
2135 struct sockbuf *sb = &so->so_rcv;
2136
2137 error = ENOTCONN;
39236c6e
A
2138 /*
2139 * This socket should have been disconnected and flushed
2140 * prior to being returned from sodefunct(); there should
2141 * be no data on its receive list, so panic otherwise.
2142 */
0a7de745 2143 if (so->so_state & SS_DEFUNCT) {
39236c6e 2144 sb_empty_assert(sb, __func__);
0a7de745
A
2145 }
2146 return error;
39236c6e
A
2147 }
2148
2149 /*
2150 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2151 * and if so just return to the caller. This could happen when
2152 * soreceive() is called by a socket upcall function during the
2153 * time the socket is freed. The socket buffer would have been
2154 * locked across the upcall, therefore we cannot put this thread
2155 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2156 * we may livelock), because the lock on the socket buffer will
2157 * only be released when the upcall routine returns to its caller.
2158 * Because the socket has been officially closed, there can be
2159 * no further read on it.
2160 *
2161 * A multipath subflow socket would have its SS_NOFDREF set by
2162 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2163 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2164 */
2165 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
0a7de745
A
2166 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2167 return 0;
2168 }
39236c6e
A
2169
2170 /*
2171 * For consistency with soreceive() semantics, we need to obey
2172 * SB_LOCK in case some other code path has locked the buffer.
2173 */
2174 error = sblock(&so->so_rcv, 0);
0a7de745
A
2175 if (error != 0) {
2176 return error;
2177 }
39236c6e
A
2178
2179 m = so->so_rcv.sb_mb;
2180 if (m == NULL) {
2181 /*
2182 * Panic if we notice inconsistencies in the socket's
2183 * receive list; both sb_mb and sb_cc should correctly
2184 * reflect the contents of the list, otherwise we may
2185 * end up with false positives during select() or poll()
2186 * which could put the application in a bad state.
2187 */
2188 SB_MB_CHECK(&so->so_rcv);
2189
2190 if (so->so_error != 0) {
2191 error = so->so_error;
2192 so->so_error = 0;
2193 goto release;
2194 }
2195
5ba3f43e
A
2196 if (so->so_state & SS_CANTRCVMORE) {
2197 goto release;
2198 }
2199
0a7de745 2200 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
5ba3f43e
A
2201 error = ENOTCONN;
2202 goto release;
2203 }
2204
2205 /*
2206 * MSG_DONTWAIT is implicitly defined and this routine will
2207 * never block, so return EWOULDBLOCK when there is nothing.
2208 */
2209 error = EWOULDBLOCK;
2210 goto release;
2211 }
2212
2213 mptcp_update_last_owner(so, mp_so);
2214
5ba3f43e
A
2215 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2216 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2217
2218 while (m != NULL) {
c3c9b80d
A
2219 int dlen = 0, error_out = 0, off = 0;
2220 uint8_t dfin = 0;
5ba3f43e
A
2221 struct mbuf *start = m;
2222 uint64_t dsn;
2223 uint32_t sseq;
2224 uint16_t orig_dlen;
2225 uint16_t csum;
2226
2227 VERIFY(m->m_nextpkt == NULL);
2228
2a1bd2d3
A
2229 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
2230fallback:
2231 /* Just move mbuf to MPTCP-level */
5ba3f43e
A
2232
2233 sbfree(&so->so_rcv, m);
2234
2235 if (mp != NULL) {
2236 *mp = m;
2237 mp = &m->m_next;
2238 so->so_rcv.sb_mb = m = m->m_next;
2239 *mp = NULL;
5ba3f43e
A
2240 }
2241
2242 if (m != NULL) {
2243 so->so_rcv.sb_lastrecord = m;
2244 } else {
2245 SB_EMPTY_FIXUP(&so->so_rcv);
2246 }
2247
2248 continue;
2a1bd2d3
A
2249 } else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2250 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
2251 boolean_t found_mapping = false;
2252 int parsed_length = 0;
2253 struct mbuf *m_iter;
2254
2255 /*
2256 * No MPTCP-option in the header. Either fallback or
2257 * wait for additional mappings.
2258 */
2259 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
2260 /* data arrived without a DSS option mapping */
2261
2262 /* initial subflow can fallback right after SYN handshake */
2263 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
2264 mptcp_notify_mpfail(so);
2265
2266 goto fallback;
2267 } else {
2268 os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
2269 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2270 mpts->mpts_connid);
2271 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2272
2273 error = EIO;
2274 *mp0 = NULL;
2275 goto release;
2276 }
2277 }
2278
2279 /* Thus, let's look for an mbuf with the mapping */
2280 m_iter = m->m_next;
2281 parsed_length = m->m_len;
2282 while (m_iter != NULL && parsed_length < UINT16_MAX) {
2283 if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2284 parsed_length += m_iter->m_len;
2285 m_iter = m_iter->m_next;
2286 continue;
2287 }
2288
2289 found_mapping = true;
2290
2291 /* Found an mbuf with a DSS-mapping */
2292 orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
2293 dsn = m_iter->m_pkthdr.mp_dsn;
2294 sseq = m_iter->m_pkthdr.mp_rseq;
2295 csum = m_iter->m_pkthdr.mp_csum;
2296
2297 if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2298 dfin = 1;
c3c9b80d 2299 dlen--;
2a1bd2d3
A
2300 }
2301
2302 break;
2303 }
2304
2305 if (!found_mapping && parsed_length < UINT16_MAX) {
2306 /* Mapping not yet present, we can wait! */
2307 if (*mp0 == NULL) {
2308 error = EWOULDBLOCK;
2309 }
2310 goto release;
2311 } else if (!found_mapping && parsed_length >= UINT16_MAX) {
2312 os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
2313 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2314 mpts->mpts_connid);
2315 /* Received 64KB without DSS-mapping. We should kill the subflow */
2316 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2317
2318 error = EIO;
2319 *mp0 = NULL;
2320 goto release;
2321 }
2322 } else {
2323 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2324 dsn = m->m_pkthdr.mp_dsn;
2325 sseq = m->m_pkthdr.mp_rseq;
2326 csum = m->m_pkthdr.mp_csum;
39236c6e 2327
2a1bd2d3
A
2328 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2329 dfin = 1;
c3c9b80d 2330 dlen--;
2a1bd2d3 2331 }
0a7de745 2332 }
5c9f4661 2333
5ba3f43e
A
2334 /*
2335 * Check if the full mapping is now present
2336 */
c3c9b80d 2337 if ((int)so->so_rcv.sb_cc < dlen) {
0a7de745 2338 if (*mp0 == NULL) {
5ba3f43e 2339 error = EWOULDBLOCK;
0a7de745 2340 }
39236c6e
A
2341 goto release;
2342 }
2343
5ba3f43e 2344 /* Now, get the full mapping */
c3c9b80d 2345 off = 0;
5ba3f43e 2346 while (dlen > 0) {
c3c9b80d 2347 if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
5c9f4661
A
2348 error_out = 1;
2349 error = EIO;
2350 dlen = 0;
0a7de745 2351 *mp0 = NULL;
5c9f4661
A
2352 break;
2353 }
39236c6e 2354
5ba3f43e 2355 dlen -= m->m_len;
c3c9b80d 2356 off += m->m_len;
5ba3f43e 2357 sbfree(&so->so_rcv, m);
39236c6e 2358
5ba3f43e
A
2359 if (mp != NULL) {
2360 *mp = m;
2361 mp = &m->m_next;
2362 so->so_rcv.sb_mb = m = m->m_next;
2363 *mp = NULL;
2364 }
2365
c3c9b80d 2366 VERIFY(dlen == 0 || m);
39236c6e
A
2367 }
2368
5ba3f43e
A
2369 VERIFY(dlen == 0);
2370
39236c6e 2371 if (m != NULL) {
5ba3f43e 2372 so->so_rcv.sb_lastrecord = m;
39236c6e 2373 } else {
39236c6e
A
2374 SB_EMPTY_FIXUP(&so->so_rcv);
2375 }
5ba3f43e 2376
0a7de745 2377 if (error_out) {
5c9f4661 2378 goto release;
0a7de745 2379 }
5c9f4661
A
2380
2381 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
5ba3f43e
A
2382 error = EIO;
2383 *mp0 = NULL;
2384 goto release;
2385 }
2386
39236c6e
A
2387 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2388 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2389 }
2390
2391 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2392 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
39236c6e 2393
0a7de745 2394 if (flagsp != NULL) {
39236c6e 2395 *flagsp |= flags;
0a7de745 2396 }
39236c6e
A
2397
2398release:
5ba3f43e
A
2399 sbunlock(&so->so_rcv, TRUE);
2400
0a7de745 2401 return error;
39236c6e
A
2402}
2403
39236c6e 2404/*
5ba3f43e 2405 * MPTCP subflow socket send routine, derived from sosend().
39236c6e 2406 */
5ba3f43e
A
2407static int
2408mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2409 struct mbuf *top, struct mbuf *control, int flags)
39236c6e 2410{
5ba3f43e 2411 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
5ba3f43e 2412 boolean_t en_tracing = FALSE, proc_held = FALSE;
2a1bd2d3 2413 struct proc *p = current_proc();
5ba3f43e
A
2414 int en_tracing_val;
2415 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2416 int error;
39236c6e 2417
5ba3f43e
A
2418 VERIFY(control == NULL);
2419 VERIFY(addr == NULL);
2420 VERIFY(uio == NULL);
2421 VERIFY(flags == 0);
2422 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
39236c6e 2423
5ba3f43e
A
2424 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2425 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e
A
2426
2427 /*
5ba3f43e
A
2428 * trace if tracing is enabled, this is a network (vs. unix) socket,
2429 * and the outgoing interface is non-loopback
39236c6e 2430 */
5ba3f43e
A
2431 if (ENTR_SHOULDTRACE &&
2432 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2433 struct inpcb *inp = sotoinpcb(so);
2434 if (inp->inp_last_outifp != NULL &&
2435 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2436 en_tracing = TRUE;
2437 en_tracing_val = top->m_pkthdr.len;
2438 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
cb323159 2439 (unsigned long)VM_KERNEL_ADDRPERM(so),
5ba3f43e
A
2440 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2441 (int64_t)en_tracing_val);
2442 }
2443 }
39236c6e 2444
5ba3f43e 2445 mptcp_update_last_owner(so, mp_so);
39236c6e 2446
5ba3f43e
A
2447 if (mp_so->last_pid != proc_pid(p)) {
2448 p = proc_find(mp_so->last_pid);
2449 if (p == PROC_NULL) {
2450 p = current_proc();
2451 } else {
2452 proc_held = TRUE;
2453 }
2454 }
39236c6e 2455
5ba3f43e
A
2456#if NECP
2457 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2458#endif /* NECP */
39236c6e 2459
f427ee49 2460 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
0a7de745 2461 if (error) {
5ba3f43e 2462 goto out;
0a7de745 2463 }
39236c6e 2464
5ba3f43e
A
2465 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2466 top = NULL;
39236c6e 2467
5ba3f43e 2468out:
0a7de745 2469 if (top != NULL) {
5ba3f43e 2470 m_freem(top);
0a7de745 2471 }
39236c6e 2472
0a7de745 2473 if (proc_held) {
5ba3f43e 2474 proc_rele(p);
0a7de745 2475 }
5ba3f43e
A
2476
2477 soclearfastopen(so);
2478
2479 if (en_tracing) {
2480 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
cb323159 2481 (unsigned long)VM_KERNEL_ADDRPERM(so),
5ba3f43e
A
2482 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2483 (int64_t)en_tracing_val);
2484 }
2485
0a7de745 2486 return error;
39236c6e
A
2487}
2488
2489/*
2490 * Establish an initial MPTCP connection (if first subflow and not yet
2491 * connected), or add a subflow to an existing MPTCP connection.
2492 */
2493int
5ba3f43e
A
2494mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2495 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
39236c6e 2496{
39236c6e 2497 struct socket *mp_so, *so = NULL;
39236c6e 2498 struct mptcb *mp_tp;
5ba3f43e 2499 struct mptsub *mpts = NULL;
39236c6e
A
2500 int af, error = 0;
2501
5ba3f43e 2502 mp_so = mptetoso(mpte);
39236c6e
A
2503 mp_tp = mpte->mpte_mptcb;
2504
cb323159
A
2505 socket_lock_assert_owned(mp_so);
2506
fe8ab488
A
2507 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2508 /* If the remote end sends Data FIN, refuse subflow adds */
cb323159
A
2509 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2510 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
fe8ab488 2511 error = ENOTCONN;
5ba3f43e 2512 goto out_err;
fe8ab488 2513 }
39236c6e 2514
bca245ac
A
2515 if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
2516 error = EOVERFLOW;
2517 goto out_err;
2518 }
2519
5ba3f43e
A
2520 mpts = mptcp_subflow_alloc();
2521 if (mpts == NULL) {
cb323159
A
2522 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2523 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
5ba3f43e
A
2524 error = ENOMEM;
2525 goto out_err;
2526 }
39236c6e 2527
0a7de745
A
2528 if (src) {
2529 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2530 error = EAFNOSUPPORT;
2531 goto out_err;
2532 }
813fb2f6 2533
0a7de745
A
2534 if (src->sa_family == AF_INET &&
2535 src->sa_len != sizeof(struct sockaddr_in)) {
2536 error = EINVAL;
2537 goto out_err;
2538 }
2539
2540 if (src->sa_family == AF_INET6 &&
2541 src->sa_len != sizeof(struct sockaddr_in6)) {
2542 error = EINVAL;
2543 goto out_err;
2544 }
2545
2546 MALLOC(mpts->mpts_src, struct sockaddr *, src->sa_len, M_SONAME,
813fb2f6
A
2547 M_WAITOK | M_ZERO);
2548 if (mpts->mpts_src == NULL) {
5ba3f43e
A
2549 error = ENOMEM;
2550 goto out_err;
39236c6e 2551 }
0a7de745
A
2552 bcopy(src, mpts->mpts_src, src->sa_len);
2553 }
2554
2555 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2556 error = EAFNOSUPPORT;
2557 goto out_err;
2558 }
2559
2560 if (dst->sa_family == AF_INET &&
2561 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2562 error = EINVAL;
2563 goto out_err;
2564 }
2565
2566 if (dst->sa_family == AF_INET6 &&
2567 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2568 error = EINVAL;
2569 goto out_err;
39236c6e
A
2570 }
2571
cb323159 2572 memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
5ba3f43e
A
2573
2574 af = mpts->mpts_dst.sa_family;
2575
0a7de745
A
2576 ifnet_head_lock_shared();
2577 if ((ifscope > (unsigned)if_index)) {
2578 ifnet_head_done();
2579 error = ENXIO;
2580 goto out_err;
2581 }
2582 ifnet_head_done();
2583
5ba3f43e
A
2584 mpts->mpts_ifscope = ifscope;
2585
39236c6e 2586 /* create the subflow socket */
0a7de745 2587 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
5ba3f43e
A
2588 /*
2589 * Returning (error) and not cleaning up, because up to here
2590 * all we did was create mpts.
2591 *
2592 * And the contract is that the call to mptcp_subflow_socreate
2593 * moves ownership of mpts to mptcp_subflow_socreate.
2594 */
0a7de745
A
2595 return error;
2596 }
5ba3f43e
A
2597
2598 /*
2599 * We may be called from within the kernel. Still need to account this
2600 * one to the real app.
2601 */
2602 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
39236c6e
A
2603
2604 /*
3e170ce0
A
2605 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2606 * -1 (SAE_CONNID_ALL).
39236c6e
A
2607 */
2608 mpte->mpte_connid_last++;
3e170ce0 2609 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
0a7de745 2610 mpte->mpte_connid_last == SAE_CONNID_ANY) {
39236c6e 2611 mpte->mpte_connid_last++;
0a7de745 2612 }
39236c6e
A
2613
2614 mpts->mpts_connid = mpte->mpte_connid_last;
490019cf
A
2615
2616 mpts->mpts_rel_seq = 1;
2617
fe8ab488
A
2618 /* Allocate a unique address id per subflow */
2619 mpte->mpte_addrid_last++;
0a7de745 2620 if (mpte->mpte_addrid_last == 0) {
fe8ab488 2621 mpte->mpte_addrid_last++;
0a7de745 2622 }
39236c6e 2623
39236c6e 2624 /* register for subflow socket read/write events */
cb323159 2625 sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
39236c6e 2626
5ba3f43e
A
2627 /* Register for subflow socket control events */
2628 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
39236c6e 2629 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
5ba3f43e
A
2630 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2631 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2632 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2633 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2634 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
cb323159 2635 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
39236c6e
A
2636
2637 /* sanity check */
2638 VERIFY(!(mpts->mpts_flags &
0a7de745 2639 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
39236c6e 2640
39236c6e
A
2641 /*
2642 * Indicate to the TCP subflow whether or not it should establish
2643 * the initial MPTCP connection, or join an existing one. Fill
2644 * in the connection request structure with additional info needed
2645 * by the underlying TCP (to be used in the TCP options, etc.)
2646 */
39236c6e 2647 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
5ba3f43e
A
2648 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2649
39236c6e 2650 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
5ba3f43e 2651 mptcp_init_local_parms(mpte);
39236c6e 2652 }
39236c6e 2653 soisconnecting(mp_so);
5ba3f43e
A
2654
2655 /* If fastopen is requested, set state in mpts */
0a7de745 2656 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
5ba3f43e 2657 mpts->mpts_flags |= MPTSF_TFO_REQD;
0a7de745 2658 }
39236c6e 2659 } else {
0a7de745 2660 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
39236c6e 2661 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
0a7de745 2662 }
490019cf
A
2663 }
2664
39236c6e
A
2665 mpts->mpts_flags |= MPTSF_CONNECTING;
2666
39236c6e 2667 /* connect right away if first attempt, or if join can be done now */
0a7de745 2668 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
39236c6e 2669 error = mptcp_subflow_soconnectx(mpte, mpts);
0a7de745 2670 }
39236c6e 2671
0a7de745 2672 if (error) {
5ba3f43e 2673 goto out_err_close;
0a7de745 2674 }
5ba3f43e 2675
0a7de745 2676 if (pcid) {
5ba3f43e 2677 *pcid = mpts->mpts_connid;
0a7de745 2678 }
5ba3f43e 2679
0a7de745 2680 return 0;
5ba3f43e
A
2681
2682out_err_close:
2683 mptcp_subflow_abort(mpts, error);
2684
0a7de745 2685 return error;
5ba3f43e
A
2686
2687out_err:
0a7de745 2688 if (mpts) {
5ba3f43e 2689 mptcp_subflow_free(mpts);
0a7de745 2690 }
5ba3f43e 2691
0a7de745 2692 return error;
39236c6e
A
2693}
2694
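/*
 * Fold the subflow's per-interface byte counters (total, wifi, wired and
 * cellular) into the MPTCP session's interface statistics array.
 */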
5ba3f43e 2695void
cb323159 2696mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
5ba3f43e 2697{
cb323159 2698 int index = mptcpstats_get_index(stats, mpts);
5ba3f43e
A
2699
2700 if (index != -1) {
2701 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2702
2703 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2704 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
cb323159
A
2705
2706 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2707 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2708
2709 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2710 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2711
2712 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2713 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
5ba3f43e
A
2714 }
2715}
2716
39236c6e
A
2717/*
2718 * Delete/remove a subflow from an MPTCP session. The underlying subflow socket
2719 * will no longer be accessible after a subflow is deleted, thus this
2720 * should occur only after the subflow socket has been disconnected.
39236c6e
A
2721 */
2722void
5ba3f43e 2723mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
39236c6e 2724{
5ba3f43e
A
2725 struct socket *mp_so = mptetoso(mpte);
2726 struct socket *so = mpts->mpts_socket;
2727 struct tcpcb *tp = sototcpcb(so);
39037602 2728
cb323159 2729 socket_lock_assert_owned(mp_so);
5ba3f43e
A
2730 VERIFY(mpts->mpts_mpte == mpte);
2731 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2732 VERIFY(mpte->mpte_numflows != 0);
2733 VERIFY(mp_so->so_usecount > 0);
39236c6e 2734
5ba3f43e 2735 mptcpstats_update(mpte->mpte_itfstats, mpts);
cb323159
A
2736
2737 mptcp_unset_cellicon(mpte, mpts, 1);
2738
5ba3f43e
A
2739 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2740 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
39236c6e 2741
39236c6e
A
2742 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2743 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
39236c6e 2744 mpte->mpte_numflows--;
0a7de745 2745 if (mpte->mpte_active_sub == mpts) {
fe8ab488 2746 mpte->mpte_active_sub = NULL;
0a7de745 2747 }
39236c6e
A
2748
2749 /*
2750 * Drop references held by this subflow socket; there
2751 * will be no further upcalls made from this point.
2752 */
5ba3f43e
A
2753 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2754 sock_catchevents_locked(so, NULL, NULL, 0);
fe8ab488 2755
39236c6e 2756 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
39037602 2757
0a7de745 2758 mp_so->so_usecount--; /* for subflow socket */
39236c6e
A
2759 mpts->mpts_mpte = NULL;
2760 mpts->mpts_socket = NULL;
39236c6e 2761
0a7de745
A
2762 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2763 mptcp_subflow_remref(mpts); /* for subflow socket */
5ba3f43e
A
2764
2765 so->so_flags &= ~SOF_MP_SUBFLOW;
2766 tp->t_mptcb = NULL;
2767 tp->t_mpsub = NULL;
2768}
2769
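/*
 * Shut down the write side of a connected subflow, preceding it with a
 * DATA_FIN when the MPTCP connection is already past CLOSE_WAIT.
 */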
2770void
2771mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2772{
2773 struct socket *so = mpts->mpts_socket;
2774 struct mptcb *mp_tp = mpte->mpte_mptcb;
2775 int send_dfin = 0;
2776
0a7de745 2777 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
5ba3f43e 2778 send_dfin = 1;
0a7de745 2779 }
5ba3f43e
A
2780
2781 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2782 (so->so_state & SS_ISCONNECTED)) {
2783 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2784 __func__, mpts->mpts_connid, send_dfin),
2785 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2786
0a7de745 2787 if (send_dfin) {
5ba3f43e 2788 mptcp_send_dfin(so);
0a7de745 2789 }
5ba3f43e
A
2790 soshutdownlock(so, SHUT_WR);
2791 }
5ba3f43e
A
2792}
2793
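/*
 * Abort a subflow: drop the underlying TCP connection with the given error
 * and deliver a DISCONNECTED event to the MPTCP layer.
 */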
2794static void
2795mptcp_subflow_abort(struct mptsub *mpts, int error)
2796{
2797 struct socket *so = mpts->mpts_socket;
2798 struct tcpcb *tp = sototcpcb(so);
2799
0a7de745 2800 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
5ba3f43e 2801 return;
0a7de745 2802 }
5ba3f43e
A
2803
2804 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
0a7de745 2805 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2806
0a7de745 2807 if (tp->t_state != TCPS_CLOSED) {
5ba3f43e 2808 tcp_drop(tp, error);
0a7de745 2809 }
5ba3f43e
A
2810
2811 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
2812}
2813
2814/*
2815 * Disconnect a subflow socket.
2816 */
2817void
5ba3f43e 2818mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
39236c6e 2819{
94ff46dc 2820 struct socket *so, *mp_so;
39236c6e
A
2821 struct mptcb *mp_tp;
2822 int send_dfin = 0;
2823
94ff46dc
A
2824 so = mpts->mpts_socket;
2825 mp_tp = mpte->mpte_mptcb;
2826 mp_so = mptetoso(mpte);
2827
2828 socket_lock_assert_owned(mp_so);
39236c6e 2829
0a7de745 2830 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
39236c6e 2831 return;
0a7de745 2832 }
39236c6e 2833
cb323159
A
2834 mptcp_unset_cellicon(mpte, mpts, 1);
2835
39236c6e
A
2836 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2837
0a7de745 2838 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
39236c6e 2839 send_dfin = 1;
0a7de745 2840 }
39236c6e 2841
c3c9b80d
A
2842 if (mp_so->so_flags & SOF_DEFUNCT) {
2843 errno_t ret;
2844
2845 ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2846 if (ret == 0) {
2847 ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2848
2849 if (ret != 0) {
2850 os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2851 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2852 }
2853 } else {
2854 os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2855 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2856 }
2857 }
2858
39236c6e
A
2859 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2860 (so->so_state & SS_ISCONNECTED)) {
a39ff7e2 2861 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
5ba3f43e
A
2862 __func__, mpts->mpts_connid, send_dfin),
2863 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2864
0a7de745 2865 if (send_dfin) {
39236c6e 2866 mptcp_send_dfin(so);
0a7de745 2867 }
94ff46dc 2868
c3c9b80d
A
2869 (void) soshutdownlock(so, SHUT_RD);
2870 (void) soshutdownlock(so, SHUT_WR);
2871 (void) sodisconnectlocked(so);
39236c6e 2872 }
94ff46dc 2873
39236c6e
A
2874 /*
2875 * Generate a disconnect event for this subflow socket, in case
2876 * the lower layer doesn't do it; this is needed because the
5ba3f43e 2877 * subflow socket deletion relies on it.
39236c6e 2878 */
5ba3f43e 2879 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
2880}
2881
39236c6e
A
2882/*
2883 * Subflow socket input.
39236c6e
A
2884 */
2885static void
2886mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2887{
5ba3f43e 2888 struct socket *mp_so = mptetoso(mpte);
39236c6e
A
2889 struct mbuf *m = NULL;
2890 struct socket *so;
5ba3f43e 2891 int error, wakeup = 0;
39236c6e 2892
5ba3f43e
A
2893 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2894 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
39236c6e 2895
39037602 2896 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
39236c6e
A
2897 struct mptsub *, mpts);
2898
0a7de745 2899 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
5ba3f43e 2900 goto out;
0a7de745 2901 }
39236c6e
A
2902
2903 so = mpts->mpts_socket;
2904
2905 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2906 if (error != 0 && error != EWOULDBLOCK) {
cb323159
A
2907 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2908 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
5ba3f43e
A
2909 if (error == ENODATA) {
2910 /*
2911 * Don't ignore ENODATA so as to discover
2912 * nasty middleboxes.
2913 */
2914 mp_so->so_error = ENODATA;
2915
2916 wakeup = 1;
2917 goto out;
39236c6e 2918 }
39236c6e 2919 } else if (error == 0) {
5ba3f43e 2920 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
3e170ce0 2921 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
2922 }
2923
2924 /* In fallback, accept data only on the active subflow and drop it on all others */
5ba3f43e
A
2925 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2926 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2927 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2928 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 2929 m_freem(m);
5ba3f43e 2930 goto out;
39236c6e
A
2931 }
2932
2933 if (m != NULL) {
5ba3f43e 2934 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
cb323159 2935 mptcp_set_cellicon(mpte, mpts);
3e170ce0 2936
5ba3f43e
A
2937 mpte->mpte_used_cell = 1;
2938 } else {
cb323159
A
2939 /*
2940 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2941 * explicitly set the cellicon, then we unset it again.
2942 */
2943 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2944 mptcp_unset_cellicon(mpte, NULL, 1);
2945 }
5ba3f43e
A
2946
2947 mpte->mpte_used_wifi = 1;
2948 }
3e170ce0 2949
39236c6e 2950 mptcp_input(mpte, m);
39236c6e 2951 }
5ba3f43e 2952
5ba3f43e 2953out:
0a7de745 2954 if (wakeup) {
5ba3f43e 2955 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
0a7de745 2956 }
5ba3f43e 2957
cb323159
A
2958 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2959}
2960
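/*
 * Input upcall handler for a subflow socket. Unless upcalls are currently
 * being deferred, walk all subflows of the MPTCP session and drain their
 * receive buffers into the MPTCP socket via mptcp_subflow_input().
 */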
2961void
2962mptcp_handle_input(struct socket *so)
2963{
2964 struct mptsub *mpts, *tmpts;
2965 struct mptses *mpte;
2966
2967 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2968 return;
2969 }
2970
2971 mpts = sototcpcb(so)->t_mpsub;
2972 mpte = mpts->mpts_mpte;
2973
2974 socket_lock_assert_owned(mptetoso(mpte));
2975
2976 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2977 if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
2978 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2979 }
2980 return;
2981 }
2982
2983 mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
2984 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2985 if (mpts->mpts_socket->so_usecount == 0) {
2986 /* Will be removed soon by tcp_garbage_collect */
2987 continue;
2988 }
2989
2990 mptcp_subflow_addref(mpts);
2991 mpts->mpts_socket->so_usecount++;
2992
2993 mptcp_subflow_input(mpte, mpts);
2994
2995 mptcp_subflow_remref(mpts); /* ours */
2996
2997 VERIFY(mpts->mpts_socket->so_usecount != 0);
2998 mpts->mpts_socket->so_usecount--;
2999 }
3000
3001 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
39236c6e
A
3002}
3003
3004/*
3005 * Subflow socket write upcall.
3006 *
5ba3f43e 3007 * Called when the associated subflow socket posted a write event.
39236c6e
A
3008 */
3009static void
3010mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
3011{
3012#pragma unused(so, waitf)
3013 struct mptsub *mpts = arg;
3014 struct mptses *mpte = mpts->mpts_mpte;
3015
5ba3f43e
A
3016 VERIFY(mpte != NULL);
3017
3018 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
0a7de745 3019 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
5ba3f43e 3020 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
0a7de745 3021 }
fe8ab488 3022 return;
5ba3f43e 3023 }
39236c6e 3024
5ba3f43e 3025 mptcp_output(mpte);
39236c6e
A
3026}
3027
a39ff7e2
A
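/*
 * Check whether the data-level sequence number at the head of 'm' is still
 * (partially) covered by an mbuf in the subflow's send buffer; if so, the
 * segment must not be reinjected on this subflow.
 */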
3028static boolean_t
3029mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
3030{
3031 struct mbuf *so_m = so->so_snd.sb_mb;
3032 uint64_t dsn = m->m_pkthdr.mp_dsn;
3033
3034 while (so_m) {
3035 VERIFY(so_m->m_flags & M_PKTHDR);
3036 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
3037
3038 /* Part of the segment is covered, don't reinject here */
3039 if (so_m->m_pkthdr.mp_dsn <= dsn &&
0a7de745 3040 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
a39ff7e2 3041 return TRUE;
0a7de745 3042 }
a39ff7e2
A
3043
3044 so_m = so_m->m_next;
3045 }
3046
3047 return FALSE;
3048}
3049
39236c6e
A
3050/*
3051 * Subflow socket output.
3052 *
3053 * Called for sending data from MPTCP to the underlying subflow socket.
3054 */
3055int
5ba3f43e 3056mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
39236c6e 3057{
39236c6e 3058 struct mptcb *mp_tp = mpte->mpte_mptcb;
5ba3f43e
A
3059 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
3060 struct socket *mp_so, *so;
3061 struct tcpcb *tp;
3062 uint64_t mpt_dsn = 0, off = 0;
3063 int sb_cc = 0, error = 0, wakeup = 0;
f427ee49 3064 uint16_t dss_csum;
5ba3f43e
A
3065 uint16_t tot_sent = 0;
3066 boolean_t reinjected = FALSE;
3067
5ba3f43e 3068 mp_so = mptetoso(mpte);
39236c6e 3069 so = mpts->mpts_socket;
5ba3f43e 3070 tp = sototcpcb(so);
39236c6e 3071
cb323159
A
3072 socket_lock_assert_owned(mp_so);
3073
5ba3f43e
A
3074 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3075 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
39236c6e 3076
5ba3f43e
A
3077 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3078 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
0a7de745
A
3079 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3080 (mpts->mpts_flags & MPTSF_TFO_REQD));
5ba3f43e 3081 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
39236c6e 3082
5ba3f43e 3083 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
0a7de745
A
3084 __func__, mpts->mpts_flags, mpte->mpte_flags,
3085 mptcp_subflow_cwnd_space(so)),
3086 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5ba3f43e
A
3087 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3088 struct mptsub *, mpts);
39236c6e
A
3089
3090 /* Remove Addr Option is not sent reliably as per I-D */
3091 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
39236c6e 3092 tp->t_rem_aid = mpte->mpte_lost_aid;
5ba3f43e 3093 tp->t_mpflags |= TMPF_SND_REM_ADDR;
39236c6e
A
3094 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3095 }
3096
3097 /*
3098 * The mbuf chains containing the metadata (as well as pointing to
3099 * the user data sitting at the MPTCP output queue) would then be
3100 * sent down to the subflow socket.
3101 *
3102 * Some notes on data sequencing:
3103 *
3104 * a. Each mbuf must be a M_PKTHDR.
3105 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
3106 * in the mbuf pkthdr structure.
3107 * c. Each mbuf containing the MPTCP metadata must have its
3108 * pkt_flags marked with the PKTF_MPTCP flag.
3109 */
3110
0a7de745 3111 if (mpte->mpte_reinjectq) {
5ba3f43e 3112 sb_mb = mpte->mpte_reinjectq;
0a7de745 3113 } else {
5ba3f43e 3114 sb_mb = mp_so->so_snd.sb_mb;
0a7de745 3115 }
5ba3f43e 3116
39236c6e 3117 if (sb_mb == NULL) {
cb323159
A
3118 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3119 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3120 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3121 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
a39ff7e2
A
3122
3123 /* Fix it to prevent looping */
0a7de745 3124 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
a39ff7e2 3125 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
0a7de745 3126 }
39236c6e
A
3127 goto out;
3128 }
3129
3130 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3131
5ba3f43e
A
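	/*
	 * TFO requested but no data queued yet: flag the TFO request and jump
	 * to a zero-length write so the subflow can send its SYN.
	 */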
3132 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3133 !(so->so_state & SS_ISCONNECTED) &&
3134 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3135 tp->t_mpflags |= TMPF_TFO_REQUEST;
3136 goto zero_len_write;
39236c6e
A
3137 }
3138
5ba3f43e
A
3139 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3140
3141 /* First, drop acknowledged data */
39236c6e 3142 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
cb323159 3143 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
0a7de745 3144 "dsn %u suna %u reinject? %u\n",
cb323159
A
3145 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3146 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
5ba3f43e
A
3147 if (mpte->mpte_reinjectq) {
3148 mptcp_clean_reinjectq(mpte);
3149 } else {
3150 uint64_t len = 0;
3151 len = mp_tp->mpt_snduna - mpt_dsn;
3152 sbdrop(&mp_so->so_snd, (int)len);
3153 wakeup = 1;
3154 }
3155 }
3156
3157 /* Check again because of above sbdrop */
3158 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
cb323159
A
3159 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3160 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
5ba3f43e 3161 goto out;
39236c6e
A
3162 }
3163
3164 /*
3165 * In degraded mode, we don't receive data acks, so force free
3166 * mbufs less than snd_nxt
3167 */
39236c6e 3168 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
fe8ab488 3169 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
5ba3f43e
A
3170 mp_so->so_snd.sb_mb) {
3171 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3172 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3173 uint64_t len = 0;
3174 len = mp_tp->mpt_snduna - mpt_dsn;
3175 sbdrop(&mp_so->so_snd, (int)len);
3176 wakeup = 1;
3177
cb323159
A
3178 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3179 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3180 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
5ba3f43e 3181 }
39236c6e
A
3182 }
3183
fe8ab488
A
3184 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3185 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3186 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3187 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
39236c6e
A
3188 }
3189
3190 /*
3191 * Adjust the top level notion of next byte used for retransmissions
3192 * and sending FINs.
3193 */
0a7de745 3194 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
39236c6e 3195 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
0a7de745 3196 }
39236c6e
A
3197
3198 /* Now determine the offset from which to start transmitting data */
0a7de745 3199 if (mpte->mpte_reinjectq) {
5ba3f43e 3200 sb_mb = mpte->mpte_reinjectq;
0a7de745 3201 } else {
a39ff7e2 3202dont_reinject:
5ba3f43e 3203 sb_mb = mp_so->so_snd.sb_mb;
0a7de745 3204 }
39236c6e 3205 if (sb_mb == NULL) {
cb323159
A
3206 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3207 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
39236c6e
A
3208 goto out;
3209 }
5ba3f43e 3210
a39ff7e2 3211 if (sb_mb == mpte->mpte_reinjectq) {
5ba3f43e 3212 sb_cc = sb_mb->m_pkthdr.mp_rlen;
a39ff7e2
A
3213 off = 0;
3214
3215 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3216 if (mptcp_can_send_more(mp_tp, TRUE)) {
3217 goto dont_reinject;
3218 }
3219
3220 error = ECANCELED;
3221 goto out;
3222 }
3223
3224 reinjected = TRUE;
5ba3f43e
A
3225 } else if (flags & MPTCP_SUBOUT_PROBING) {
3226 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3227 off = 0;
39236c6e 3228 } else {
5ba3f43e
A
3229 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3230
3231 /*
3232 * With TFO, there might be no data at all, thus still go into this
3233 * code-path here.
3234 */
3235 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3236 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3237 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3238 sb_cc -= off;
3239 } else {
cb323159
A
3240 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3241 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3242 (uint32_t)mp_tp->mpt_sndmax);
5ba3f43e
A
3243
3244 goto out;
3245 }
39236c6e 3246 }
39236c6e 3247
5ba3f43e
A
3248 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3249 if (sb_cc <= 0) {
cb323159
A
3250 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3251 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
0a7de745 3252 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
cb323159 3253 mptcp_subflow_cwnd_space(so));
5ba3f43e
A
3254 }
3255
3256 sb_cc = min(sb_cc, UINT16_MAX);
3257
3258 /*
3259 * Create a DSN mapping for the data we are about to send. It all
3260 * has the same mapping.
3261 */
0a7de745 3262 if (reinjected) {
5ba3f43e 3263 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
0a7de745 3264 } else {
5ba3f43e 3265 mpt_dsn = mp_tp->mpt_snduna + off;
0a7de745 3266 }
39236c6e 3267
5ba3f43e 3268 mpt_mbuf = sb_mb;
a39ff7e2 3269 while (mpt_mbuf && reinjected == FALSE &&
0a7de745
A
3270 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3271 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
39236c6e
A
3272 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3273 mpt_mbuf = mpt_mbuf->m_next;
39236c6e 3274 }
0a7de745 3275 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
5ba3f43e
A
3276 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
3277 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3e170ce0 3278 mpts->mpts_probecnt),
5ba3f43e 3279 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
0a7de745 3280 }
39236c6e 3281
ecc0ceb4 3282 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 3283
fe8ab488
A
3284 head = tail = NULL;
3285
39236c6e 3286 while (tot_sent < sb_cc) {
f427ee49 3287 int32_t mlen;
39236c6e 3288
5ba3f43e 3289 mlen = mpt_mbuf->m_len;
39236c6e 3290 mlen -= off;
f427ee49 3291 mlen = MIN(mlen, sb_cc - tot_sent);
39236c6e 3292
5ba3f43e 3293 if (mlen < 0) {
cb323159 3294 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
f427ee49 3295 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
cb323159 3296 (uint32_t)off, sb_cc, tot_sent);
5ba3f43e 3297 goto out;
39236c6e
A
3298 }
3299
0a7de745 3300 if (mlen == 0) {
5ba3f43e 3301 goto next;
0a7de745 3302 }
5ba3f43e 3303
fe8ab488
A
3304 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3305 M_COPYM_MUST_COPY_HDR);
39236c6e 3306 if (m == NULL) {
cb323159
A
3307 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3308 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
39236c6e
A
3309 error = ENOBUFS;
3310 break;
3311 }
3312
3313 /* Create a DSN mapping for the data (m_copym does it) */
fe8ab488 3314 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e
A
3315 VERIFY(m->m_next == NULL);
3316
39236c6e
A
3317 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3318 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
5ba3f43e 3319 m->m_pkthdr.mp_dsn = mpt_dsn;
39236c6e 3320 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
39236c6e
A
3321 m->m_pkthdr.len = mlen;
3322
fe8ab488 3323 if (head == NULL) {
0a7de745 3324 head = tail = m;
fe8ab488
A
3325 } else {
3326 tail->m_next = m;
3327 tail = m;
3328 }
3329
fe8ab488
A
3330 tot_sent += mlen;
3331 off = 0;
5ba3f43e 3332next:
fe8ab488
A
3333 mpt_mbuf = mpt_mbuf->m_next;
3334 }
3335
a39ff7e2 3336 if (reinjected) {
5ba3f43e
A
3337 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3338 struct mbuf *n = sb_mb;
3339
3340 while (n) {
3341 n->m_pkthdr.mp_dsn += sb_cc;
3342 n->m_pkthdr.mp_rlen -= sb_cc;
3343 n = n->m_next;
3344 }
3345 m_adj(sb_mb, sb_cc);
3346 } else {
3347 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3348 m_freem(sb_mb);
3349 }
3350 }
3351
3352 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
0a7de745
A
3353 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
3354 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5ba3f43e
A
3355
3356 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3357 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
0a7de745 3358 tot_sent);
5ba3f43e
A
3359 }
3360
3361 /* Now, let's update rel-seq and the data-level length */
3362 mpts->mpts_rel_seq += tot_sent;
3363 m = head;
3364 while (m) {
0a7de745 3365 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
5ba3f43e 3366 m->m_pkthdr.mp_csum = dss_csum;
0a7de745 3367 }
5ba3f43e
A
3368 m->m_pkthdr.mp_rlen = tot_sent;
3369 m = m->m_next;
3370 }
3371
3372 if (head != NULL) {
490019cf 3373 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
0a7de745 3374 (tp->t_tfo_stats == 0)) {
39037602 3375 tp->t_mpflags |= TMPF_TFO_REQUEST;
0a7de745 3376 }
fe8ab488
A
3377
3378 error = sock_sendmbuf(so, NULL, head, 0, NULL);
3379
5ba3f43e 3380 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
39236c6e
A
3381 struct sockbuf *, &so->so_rcv,
3382 struct sockbuf *, &so->so_snd,
3383 struct mptses *, mpte, struct mptsub *, mpts,
fe8ab488
A
3384 size_t, tot_sent);
3385 }
3386
5ba3f43e
A
3387done_sending:
3388 if (error == 0 ||
3389 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3390 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3e170ce0
A
3391
3392 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3393 tcpstat.tcps_mp_num_probes++;
0a7de745 3394 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3e170ce0 3395 mpts->mpts_probecnt += 1;
0a7de745 3396 } else {
3e170ce0 3397 mpts->mpts_probecnt +=
0a7de745
A
3398 tot_sent / mpts->mpts_maxseg;
3399 }
3e170ce0
A
3400 }
3401
5ba3f43e
A
3402 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3403 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
0a7de745 3404 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
39236c6e 3405 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
0a7de745 3406 }
5ba3f43e 3407 mp_tp->mpt_sndnxt = new_sndnxt;
39236c6e 3408 }
fe8ab488 3409
5ba3f43e 3410 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
490019cf 3411
5ba3f43e
A
3412 /* Must be here as mptcp_can_send_more() checks for this */
3413 soclearfastopen(mp_so);
39236c6e 3414
3e170ce0 3415 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
0a7de745 3416 (mpts->mpts_probesoon != 0)) {
5ba3f43e
A
3417 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
3418 __func__, mpts->mpts_connid,
3419 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
3420 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3e170ce0 3421 (tcp_now - mpts->mpts_probesoon)),
5ba3f43e 3422 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
0a7de745 3423 }
5ba3f43e
A
3424
3425 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
cb323159 3426 mptcp_set_cellicon(mpte, mpts);
5ba3f43e
A
3427
3428 mpte->mpte_used_cell = 1;
3429 } else {
cb323159
A
3430 /*
3431 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3432 * explicitly set the cellicon, then we unset it again.
3433 */
3434 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3435 mptcp_unset_cellicon(mpte, NULL, 1);
3436 }
5ba3f43e
A
3437
3438 mpte->mpte_used_wifi = 1;
3439 }
3440
3441 /*
3442 * Don't propagate EWOULDBLOCK - it's already taken care of
3443 * in mptcp_usr_send for TFO.
3444 */
3445 error = 0;
fe8ab488 3446 } else {
c3c9b80d
A
3447 /* We need to revert our change to mpts_rel_seq */
3448 mpts->mpts_rel_seq -= tot_sent;
3449
cb323159
A
3450 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3451 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
39236c6e
A
3452 }
3453out:
5ba3f43e 3454
0a7de745 3455 if (wakeup) {
5ba3f43e 3456 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
0a7de745 3457 }
39037602 3458
5ba3f43e 3459 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
0a7de745 3460 return error;
5ba3f43e
A
3461
3462zero_len_write:
3463 /* Call pru_send directly, since there is no mbuf at the subflow level */
3464 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
0a7de745 3465 NULL, current_proc());
5ba3f43e
A
3466
3467 goto done_sending;
39236c6e
A
3468}
3469
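/*
 * Insert mbuf m into the session's reinject queue, which is kept sorted
 * by data-sequence number.  Drop m if an already-queued segment fully
 * covers it, and free any queued segment that m fully covers.
 */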
39236c6e 3470static void
5ba3f43e 3471mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
39236c6e 3472{
5ba3f43e 3473 struct mbuf *n, *prev = NULL;
39236c6e 3474
5ba3f43e 3475 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
0a7de745
A
3476 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3477 m->m_pkthdr.mp_rseq),
3478 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5ba3f43e
A
3479
3480 n = mpte->mpte_reinjectq;
3481
3482 /* First, look for an mbuf n whose data-sequence-number is greater
3483 * than or equal to m's sequence number.
3484 */
3485 while (n) {
0a7de745 3486 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
5ba3f43e 3487 break;
0a7de745 3488 }
5ba3f43e
A
3489
3490 prev = n;
3491
3492 n = n->m_nextpkt;
3493 }
3494
3495 if (n) {
3496 /* m is already fully covered by the next mbuf in the queue */
3497 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3498 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
c3c9b80d
A
3499 os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
3500 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3501 (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3502 m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
5ba3f43e
A
3503 goto dont_queue;
3504 }
3505
3506 /* m covers the next mbuf entirely, so remove that entry */
3507 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3508 struct mbuf *tmp = n->m_nextpkt;
3509
c3c9b80d
A
3510 os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
3511 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3512 (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3513 (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);
5ba3f43e
A
3514
3515 m->m_nextpkt = NULL;
0a7de745 3516 if (prev == NULL) {
5ba3f43e 3517 mpte->mpte_reinjectq = tmp;
0a7de745 3518 } else {
5ba3f43e 3519 prev->m_nextpkt = tmp;
0a7de745 3520 }
5ba3f43e
A
3521
3522 m_freem(n);
3523 n = tmp;
3524 }
5ba3f43e
A
3525 }
3526
3527 if (prev) {
3528 /* m is already fully covered by the previous mbuf in the queue */
3529 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
c3c9b80d
A
3530 os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
3531 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3532 (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
3533 (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
5ba3f43e
A
3534 goto dont_queue;
3535 }
3536 }
3537
0a7de745 3538 if (prev == NULL) {
5ba3f43e 3539 mpte->mpte_reinjectq = m;
0a7de745 3540 } else {
5ba3f43e 3541 prev->m_nextpkt = m;
0a7de745 3542 }
39236c6e 3543
5ba3f43e
A
3544 m->m_nextpkt = n;
3545
3546 return;
3547
3548dont_queue:
3549 m_freem(m);
3550 return;
39236c6e
A
3551}
3552
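/*
 * Walk the MPTCP socket's send buffer and return the mbuf whose DSN
 * mapping covers the given data-sequence number, or NULL if that data
 * is no longer queued.
 */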
5ba3f43e
A
3553static struct mbuf *
3554mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
39236c6e 3555{
5ba3f43e
A
3556 struct socket *mp_so = mptetoso(mpte);
3557 struct mbuf *m;
39236c6e 3558
5ba3f43e 3559 m = mp_so->so_snd.sb_mb;
39236c6e 3560
5ba3f43e
A
3561 while (m) {
3562 /* If this segment covers what we are looking for, return it. */
3563 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
0a7de745 3564 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
5ba3f43e 3565 break;
0a7de745 3566 }
5ba3f43e
A
3567
3568
3569 /* Segment is no longer in the queue */
0a7de745 3570 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
5ba3f43e 3571 return NULL;
0a7de745 3572 }
5ba3f43e
A
3573
3574 m = m->m_next;
39236c6e
A
3575 }
3576
5ba3f43e
A
3577 return m;
3578}
fe8ab488 3579
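/*
 * Copy the leading mbufs of a chain (up to len bytes), preserving the
 * MPTCP DSN mapping (mp_dsn/mp_rlen/mp_rseq) on every copy so the data
 * can be reinjected on another subflow.
 */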
5ba3f43e 3580static struct mbuf *
cb323159 3581mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
5ba3f43e
A
3582{
3583 struct mbuf *top = NULL, *tail = NULL;
3584 uint64_t dsn;
3585 uint32_t dlen, rseq;
39236c6e 3586
5ba3f43e
A
3587 dsn = m->m_pkthdr.mp_dsn;
3588 dlen = m->m_pkthdr.mp_rlen;
3589 rseq = m->m_pkthdr.mp_rseq;
3e170ce0 3590
5ba3f43e
A
3591 while (len > 0) {
3592 struct mbuf *n;
3593
3594 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3595
3596 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3597 if (n == NULL) {
cb323159
A
3598 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3599 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
5ba3f43e 3600 goto err;
3e170ce0 3601 }
fe8ab488 3602
5ba3f43e
A
3603 VERIFY(n->m_flags & M_PKTHDR);
3604 VERIFY(n->m_next == NULL);
3605 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3606 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3607 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3608 VERIFY(n->m_len == m->m_len);
3609
3610 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3611
0a7de745 3612 if (top == NULL) {
5ba3f43e 3613 top = n;
0a7de745 3614 }
5ba3f43e 3615
0a7de745 3616 if (tail != NULL) {
5ba3f43e 3617 tail->m_next = n;
0a7de745 3618 }
5ba3f43e
A
3619
3620 tail = n;
3621
3622 len -= m->m_len;
3623 m = m->m_next;
39236c6e
A
3624 }
3625
5ba3f43e
A
3626 return top;
3627
3628err:
0a7de745 3629 if (top) {
5ba3f43e 3630 m_freem(top);
0a7de745 3631 }
5ba3f43e
A
3632
3633 return NULL;
39236c6e
A
3634}
3635
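/*
 * Scan the subflow's send buffer and copy every segment that has not yet
 * been acknowledged at the MPTCP data level onto the session's reinject
 * queue, marking the originals so they are not copied twice.
 */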
5ba3f43e
A
3636static void
3637mptcp_reinject_mbufs(struct socket *so)
39236c6e 3638{
5ba3f43e
A
3639 struct tcpcb *tp = sototcpcb(so);
3640 struct mptsub *mpts = tp->t_mpsub;
3641 struct mptcb *mp_tp = tptomptp(tp);
3642 struct mptses *mpte = mp_tp->mpt_mpte;
3643 struct sockbuf *sb = &so->so_snd;
3644 struct mbuf *m;
39236c6e 3645
5ba3f43e
A
3646 m = sb->sb_mb;
3647 while (m) {
3648 struct mbuf *n = m->m_next, *orig = m;
c3c9b80d 3649 bool set_reinject_flag = false;
39236c6e 3650
5ba3f43e 3651 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
0a7de745
A
3652 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3653 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3654 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e 3655
5ba3f43e 3656 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
39236c6e 3657
0a7de745 3658 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
5ba3f43e 3659 goto next;
0a7de745 3660 }
39236c6e 3661
5ba3f43e 3662 /* Has it all already been acknowledged at the data-level? */
0a7de745 3663 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
5ba3f43e 3664 goto next;
0a7de745 3665 }
5ba3f43e
A
3666
3667 /* Part of this has already been acknowledged at the subflow level -
3668 * look up the corresponding segment in the MPTCP socket's send buffer.
3669 */
3670 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3671 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
0a7de745 3672 if (m == NULL) {
5ba3f43e 3673 goto next;
0a7de745 3674 }
5ba3f43e
A
3675 }
3676
3677 /* Copy the mbuf with headers (aka, DSN-numbers) */
cb323159 3678 m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
0a7de745 3679 if (m == NULL) {
5ba3f43e 3680 break;
0a7de745 3681 }
5ba3f43e
A
3682
3683 VERIFY(m->m_nextpkt == NULL);
3684
3685 /* Now, add to the reinject-queue, eliminating overlapping
3686 * segments
3687 */
3688 mptcp_add_reinjectq(mpte, m);
3689
c3c9b80d 3690 set_reinject_flag = true;
5ba3f43e
A
3691 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3692
3693next:
3694 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3695 while (n) {
3696 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3697
0a7de745 3698 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
5ba3f43e 3699 break;
0a7de745 3700 }
5ba3f43e 3701
c3c9b80d
A
3702 if (set_reinject_flag) {
3703 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3704 }
5ba3f43e
A
3705 n = n->m_next;
3706 }
3707
3708 m = n;
39236c6e 3709 }
5ba3f43e 3710}
39236c6e 3711
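/*
 * Free reinject-queue entries whose data lies entirely below mpt_snduna,
 * i.e. data that has been fully acknowledged at the MPTCP level.
 */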
5ba3f43e
A
3712void
3713mptcp_clean_reinjectq(struct mptses *mpte)
3714{
3715 struct mptcb *mp_tp = mpte->mpte_mptcb;
3716
cb323159 3717 socket_lock_assert_owned(mptetoso(mpte));
5ba3f43e
A
3718
3719 while (mpte->mpte_reinjectq) {
3720 struct mbuf *m = mpte->mpte_reinjectq;
3721
3722 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
0a7de745 3723 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
5ba3f43e 3724 break;
0a7de745 3725 }
5ba3f43e
A
3726
3727 mpte->mpte_reinjectq = m->m_nextpkt;
3728 m->m_nextpkt = NULL;
3729 m_freem(m);
3730 }
39236c6e
A
3731}
3732
3733/*
5ba3f43e 3734 * Subflow socket control event upcall.
39236c6e 3735 */
5ba3f43e 3736static void
f427ee49 3737mptcp_subflow_eupcall1(struct socket *so, void *arg, long events)
39236c6e 3738{
5ba3f43e
A
3739#pragma unused(so)
3740 struct mptsub *mpts = arg;
3741 struct mptses *mpte = mpts->mpts_mpte;
39236c6e 3742
cb323159 3743 socket_lock_assert_owned(mptetoso(mpte));
39236c6e 3744
0a7de745 3745 if ((mpts->mpts_evctl & events) == events) {
5ba3f43e 3746 return;
0a7de745 3747 }
39236c6e 3748
5ba3f43e
A
3749 mpts->mpts_evctl |= events;
3750
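/*
 * If upcalls are currently being deferred, just note that the workloop
 * needs to run; it is expected to be kicked off once the deferral ends.
 */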
3751 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3752 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3753 return;
39037602 3754 }
39236c6e 3755
5ba3f43e 3756 mptcp_subflow_workloop(mpte);
39236c6e
A
3757}
3758
3759/*
5ba3f43e
A
3760 * Subflow socket control events.
3761 *
3762 * Called for handling events related to the underlying subflow socket.
39236c6e
A
3763 */
3764static ev_ret_t
5ba3f43e 3765mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
f427ee49 3766 long *p_mpsofilt_hint)
39236c6e 3767{
5ba3f43e
A
3768 ev_ret_t ret = MPTS_EVRET_OK;
3769 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
0a7de745 3770 sizeof(mpsub_ev_entry_tbl[0]);
39236c6e 3771
5ba3f43e 3772 /* bail if there's nothing to process */
0a7de745
A
3773 if (!mpts->mpts_evctl) {
3774 return ret;
3775 }
39236c6e 3776
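/*
 * Any of these events can render the subflow unusable, so also consider
 * failing over to another subflow.
 */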
0a7de745
A
3777 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
3778 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
3779 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
5ba3f43e
A
3780 SO_FILT_HINT_DISCONNECTED)) {
3781 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3782 }
3e170ce0 3783
5ba3f43e
A
3784 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3785 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3786
5ba3f43e
A
3787 /*
3788 * Process all the socket filter hints and reset the hint
3789 * once it is handled
3790 */
3791 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3792 /*
3793 * Always execute the DISCONNECTED event, because it will wake up
3794 * the app.
3795 */
3796 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3797 (ret >= MPTS_EVRET_OK ||
0a7de745 3798 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
5ba3f43e
A
3799 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3800 ev_ret_t error =
0a7de745 3801 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
5ba3f43e
A
3802 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3803 }
3804 }
3805
0a7de745 3806 return ret;
39236c6e
A
3807}
3808
39236c6e 3809static ev_ret_t
5ba3f43e 3810mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 3811 long *p_mpsofilt_hint, long event)
39236c6e
A
3812{
3813 struct socket *mp_so, *so;
3814 struct mptcb *mp_tp;
39236c6e 3815
5ba3f43e 3816 mp_so = mptetoso(mpte);
39236c6e
A
3817 mp_tp = mpte->mpte_mptcb;
3818 so = mpts->mpts_socket;
3819
39236c6e 3820 /*
5ba3f43e
A
3821 * We got an event for this subflow that might need to be propagated,
3822 * based on the state of the MPTCP connection.
39236c6e 3823 */
5ba3f43e 3824 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
cb323159 3825 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
5ba3f43e
A
3826 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3827 mp_so->so_error = so->so_error;
3828 *p_mpsofilt_hint |= event;
39236c6e 3829 }
39236c6e 3830
0a7de745 3831 return MPTS_EVRET_OK;
39236c6e
A
3832}
3833
3834/*
3835 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3836 */
3837static ev_ret_t
3e170ce0 3838mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 3839 long *p_mpsofilt_hint, long event)
39236c6e 3840{
5ba3f43e
A
3841#pragma unused(p_mpsofilt_hint, event)
3842 struct socket *mp_so;
3843 struct tcpcb *tp;
39236c6e 3844
5ba3f43e
A
3845 mp_so = mptetoso(mpte);
3846 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
39236c6e 3847
39236c6e
A
3848 /*
3849 * This overwrites any previous mpte_lost_aid to avoid storing
3850 * too much state, since the typical case has only two subflows.
3851 */
3852 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3853 mpte->mpte_lost_aid = tp->t_local_aid;
3854
5ba3f43e 3855 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
0a7de745 3856 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e
A
3857
3858 /*
3859 * The subflow connection has lost its source address.
39236c6e 3860 */
5ba3f43e 3861 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
39236c6e 3862
0a7de745 3863 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
5ba3f43e 3864 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
0a7de745 3865 }
39236c6e 3866
0a7de745 3867 return MPTS_EVRET_DELETE;
39236c6e
A
3868}
3869
cb323159
A
3870static ev_ret_t
3871mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 3872 long *p_mpsofilt_hint, long event)
cb323159
A
3873{
3874#pragma unused(event, p_mpsofilt_hint)
3875 struct socket *so, *mp_so;
3876
3877 so = mpts->mpts_socket;
3878
3879 if (so->so_error != ENODATA) {
3880 return MPTS_EVRET_OK;
3881 }
3882
3883
3884 mp_so = mptetoso(mpte);
3885
3886 mp_so->so_error = ENODATA;
3887
3888 sorwakeup(mp_so);
3889 sowwakeup(mp_so);
3890
3891 return MPTS_EVRET_OK;
3892}
3893
3894
fe8ab488
A
3895/*
3896 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3897 * indicates that the remote side sent a Data FIN
3898 */
3899static ev_ret_t
3e170ce0 3900mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 3901 long *p_mpsofilt_hint, long event)
fe8ab488 3902{
5ba3f43e 3903#pragma unused(event)
cb323159 3904 struct mptcb *mp_tp = mpte->mpte_mptcb;
fe8ab488 3905
5ba3f43e 3906 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3e170ce0 3907 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39037602 3908
fe8ab488 3909 /*
0a7de745
A
3910 * We got a Data FIN for the MPTCP connection.
3911 * The FIN may arrive with data. The data is handed up to the
3912 * mptcp socket and the user is notified so that it may close
3913 * the socket if needed.
3914 */
3915 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
5ba3f43e 3916 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
0a7de745 3917 }
39037602 3918
0a7de745 3919 return MPTS_EVRET_OK; /* keep the subflow socket around */
fe8ab488
A
3920}
3921
39236c6e
A
3922/*
3923 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3924 */
3925static ev_ret_t
3e170ce0 3926mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 3927 long *p_mpsofilt_hint, long event)
39236c6e 3928{
5ba3f43e 3929#pragma unused(event, p_mpsofilt_hint)
39236c6e 3930 struct mptsub *mpts_alt = NULL;
5ba3f43e 3931 struct socket *alt_so = NULL;
39236c6e
A
3932 struct socket *mp_so;
3933 int altpath_exists = 0;
3934
5ba3f43e 3935 mp_so = mptetoso(mpte);
cb323159 3936 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
39236c6e 3937
5ba3f43e 3938 mptcp_reinject_mbufs(mpts->mpts_socket);
39236c6e 3939
cb323159
A
3940 mpts_alt = mptcp_get_subflow(mpte, NULL);
3941
3942 /* If there is no alternate eligible subflow, ignore the failover hint. */
3943 if (mpts_alt == NULL || mpts_alt == mpts) {
3944 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3945 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
5ba3f43e 3946
39236c6e
A
3947 goto done;
3948 }
5ba3f43e 3949
39236c6e 3950 altpath_exists = 1;
5ba3f43e 3951 alt_so = mpts_alt->mpts_socket;
39236c6e 3952 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
fe8ab488 3953 /* All data acknowledged and no RTT spike */
5ba3f43e 3954 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
39236c6e
A
3955 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3956 } else {
3957 /* no alternate path available */
3958 altpath_exists = 0;
3959 }
39236c6e 3960 }
39236c6e
A
3961
3962 if (altpath_exists) {
5ba3f43e 3963 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
39236c6e 3964
5ba3f43e 3965 mpte->mpte_active_sub = mpts_alt;
39236c6e
A
3966 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3967 mpts->mpts_flags &= ~MPTSF_ACTIVE;
5ba3f43e 3968
cb323159
A
3969 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3970 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
5ba3f43e
A
3971
3972 mptcpstats_inc_switch(mpte, mpts);
3973
3974 sowwakeup(alt_so);
39236c6e 3975 } else {
5ba3f43e 3976 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
0a7de745
A
3977 mpts->mpts_connid),
3978 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 3979done:
5ba3f43e 3980 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
39236c6e 3981 }
5ba3f43e 3982
0a7de745 3983 return MPTS_EVRET_OK;
39236c6e
A
3984}
3985
3986/*
3987 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3988 */
3989static ev_ret_t
3e170ce0 3990mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 3991 long *p_mpsofilt_hint, long event)
39236c6e 3992{
5ba3f43e
A
3993 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3994 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 3995
39236c6e 3996 /*
5ba3f43e
A
3997 * The subflow connection cannot use the outgoing interface, let's
3998 * close this subflow.
39236c6e 3999 */
5ba3f43e 4000 mptcp_subflow_abort(mpts, EPERM);
39236c6e 4001
5ba3f43e 4002 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
39236c6e 4003
0a7de745 4004 return MPTS_EVRET_DELETE;
39236c6e
A
4005}
4006
a39ff7e2
A
4007/*
4008 * https://tools.ietf.org/html/rfc6052#section-2
4009 * https://tools.ietf.org/html/rfc6147#section-5.2
4010 */
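/*
 * Example: with the well-known /96 prefix 64:ff9b::/96, the synthesized
 * address 64:ff9b::c000:221 carries the IPv4 address 192.0.2.33 in its
 * last four bytes.
 */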
4011static boolean_t
4012mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
0a7de745
A
4013 const struct ipv6_prefix *prefix,
4014 struct in_addr *addrv4)
a39ff7e2
A
4015{
4016 char buf[MAX_IPv4_STR_LEN];
4017 char *ptrv4 = (char *)addrv4;
4018 const char *ptr = (const char *)addr;
4019
0a7de745 4020 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
a39ff7e2 4021 return false;
0a7de745 4022 }
a39ff7e2
A
4023
4024 switch (prefix->prefix_len) {
0a7de745
A
4025 case NAT64_PREFIX_LEN_96:
4026 memcpy(ptrv4, ptr + 12, 4);
4027 break;
4028 case NAT64_PREFIX_LEN_64:
4029 memcpy(ptrv4, ptr + 9, 4);
4030 break;
4031 case NAT64_PREFIX_LEN_56:
4032 memcpy(ptrv4, ptr + 7, 1);
4033 memcpy(ptrv4 + 1, ptr + 9, 3);
4034 break;
4035 case NAT64_PREFIX_LEN_48:
4036 memcpy(ptrv4, ptr + 6, 2);
4037 memcpy(ptrv4 + 2, ptr + 9, 2);
4038 break;
4039 case NAT64_PREFIX_LEN_40:
4040 memcpy(ptrv4, ptr + 5, 3);
4041 memcpy(ptrv4 + 3, ptr + 9, 1);
4042 break;
4043 case NAT64_PREFIX_LEN_32:
4044 memcpy(ptrv4, ptr + 4, 4);
4045 break;
4046 default:
4047 panic("NAT64-prefix len is wrong: %u\n",
4048 prefix->prefix_len);
a39ff7e2
A
4049 }
4050
4051 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
0a7de745 4052 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
a39ff7e2
A
4053
4054 return true;
4055}
4056
4057static void
4058mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
4059{
4060 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
4061 struct socket *so = mpts->mpts_socket;
4062 struct ifnet *ifp;
4063 int j;
4064
cb323159
A
4065 /* Subflow IPs will be steered directly by the server - no need to
4066 * desynthesize.
4067 */
4068 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
4069 return;
4070 }
4071
a39ff7e2
A
4072 ifp = sotoinpcb(so)->inp_last_outifp;
4073
4074 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
a39ff7e2
A
4075 return;
4076 }
4077
a39ff7e2
A
4078 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
4079 int success;
4080
0a7de745 4081 if (nat64prefixes[j].prefix_len == 0) {
a39ff7e2 4082 continue;
0a7de745 4083 }
a39ff7e2
A
4084
4085 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
0a7de745 4086 &nat64prefixes[j],
c3c9b80d 4087 &mpte->mpte_sub_dst_v4.sin_addr);
a39ff7e2 4088 if (success) {
c3c9b80d
A
4089 mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
4090 mpte->mpte_sub_dst_v4.sin_family = AF_INET;
4091 mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
a39ff7e2
A
4092 break;
4093 }
4094 }
4095}
4096
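/*
 * If this join attempt failed, retry it on the configured alternate
 * port; if that has already been tried (or none is configured), mark
 * the outgoing interface as not supporting MPTCP.
 */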
f427ee49
A
4097static void
4098mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
4099{
4100 struct inpcb *inp;
4101
4102 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
4103 return;
4104 }
4105
4106 inp = sotoinpcb(mpts->mpts_socket);
4107 if (inp == NULL) {
4108 return;
4109 }
4110
4111 /* Should we try the alternate port? */
4112 if (mpte->mpte_alternate_port &&
4113 inp->inp_fport != mpte->mpte_alternate_port) {
4114 union sockaddr_in_4_6 dst;
4115 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
4116
4117 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
4118
4119 dst_in->sin_port = mpte->mpte_alternate_port;
4120
4121 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
4122 mpts->mpts_ifscope, NULL);
4123 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
4124 unsigned int i;
4125
4126 if (inp->inp_last_outifp == NULL) {
4127 return;
4128 }
4129
4130 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
4131 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
4132
4133 if (inp->inp_last_outifp->if_index == info->ifindex) {
4134 info->no_mptcp_support = 1;
4135 break;
4136 }
4137 }
4138 }
4139}
4140
39236c6e
A
4141/*
4142 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4143 */
4144static ev_ret_t
3e170ce0 4145mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 4146 long *p_mpsofilt_hint, long event)
39236c6e 4147{
5ba3f43e 4148#pragma unused(event, p_mpsofilt_hint)
39236c6e 4149 struct socket *mp_so, *so;
5ba3f43e
A
4150 struct inpcb *inp;
4151 struct tcpcb *tp;
39236c6e 4152 struct mptcb *mp_tp;
5ba3f43e 4153 int af;
39236c6e
A
4154 boolean_t mpok = FALSE;
4155
5ba3f43e
A
4156 mp_so = mptetoso(mpte);
4157 mp_tp = mpte->mpte_mptcb;
39236c6e 4158 so = mpts->mpts_socket;
5ba3f43e
A
4159 tp = sototcpcb(so);
4160 af = mpts->mpts_dst.sa_family;
39236c6e 4161
0a7de745
A
4162 if (mpts->mpts_flags & MPTSF_CONNECTED) {
4163 return MPTS_EVRET_OK;
4164 }
39236c6e
A
4165
4166 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4167 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
fe8ab488
A
4168 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
4169 (so->so_state & SS_ISCONNECTED)) {
0a7de745
A
4170 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
4171 __func__, mpts->mpts_connid),
4172 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488
A
4173 (void) soshutdownlock(so, SHUT_RD);
4174 (void) soshutdownlock(so, SHUT_WR);
4175 (void) sodisconnectlocked(so);
4176 }
0a7de745 4177 return MPTS_EVRET_OK;
39236c6e
A
4178 }
4179
4180 /*
4181 * The subflow connection has been connected. Find out whether it
4182 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
4183 *
4184 * a. If MPTCP connection is not yet established, then this must be
4185 * the first subflow connection. If MPTCP failed to negotiate,
5ba3f43e 4186 * fallback to regular TCP by degrading this subflow.
39236c6e
A
4187 *
4188 * b. If MPTCP connection has been established, then this must be
4189 * one of the subsequent subflow connections. If MPTCP failed
5ba3f43e 4190 * to negotiate, disconnect the connection.
39236c6e
A
4191 *
4192 * Right now, we simply unblock any waiters at the MPTCP socket layer
4193 * if the MPTCP connection has not been established.
4194 */
39236c6e
A
4195
4196 if (so->so_state & SS_ISDISCONNECTED) {
4197 /*
4198 * With MPTCP joins, a connection is connected at the subflow
4199 * level, but the 4th ACK from the server elevates the MPTCP
490019cf
A
4200 * subflow to connected state. So there is a small window
4201 * where the subflow could get disconnected before the
39236c6e
A
4202 * connected event is processed.
4203 */
0a7de745 4204 return MPTS_EVRET_OK;
39236c6e
A
4205 }
4206
0a7de745 4207 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
5ba3f43e 4208 mptcp_drop_tfo_data(mpte, mpts);
0a7de745 4209 }
490019cf 4210
5ba3f43e
A
4211 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4212 mpts->mpts_flags |= MPTSF_CONNECTED;
490019cf 4213
0a7de745 4214 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
39236c6e 4215 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
0a7de745 4216 }
39236c6e 4217
490019cf
A
4218 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4219
39236c6e 4220 /* get/verify the outbound interface */
5ba3f43e 4221 inp = sotoinpcb(so);
3e170ce0 4222
5ba3f43e 4223 mpts->mpts_maxseg = tp->t_maxseg;
3e170ce0 4224
5ba3f43e
A
4225 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
4226 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
4227 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
3e170ce0 4228 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
39236c6e
A
4229
4230 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
39236c6e 4231
39236c6e 4232 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
5ba3f43e
A
4233 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4234 mpte->mpte_associd = mpts->mpts_connid;
4235 DTRACE_MPTCP2(state__change,
4236 struct mptcb *, mp_tp,
4237 uint32_t, 0 /* event */);
4238
4239 if (SOCK_DOM(so) == AF_INET) {
4240 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4241 } else {
4242 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4243 }
4244
a39ff7e2
A
4245 mpts->mpts_flags |= MPTSF_ACTIVE;
4246
39236c6e
A
4247 /* case (a) above */
4248 if (!mpok) {
5ba3f43e
A
4249 tcpstat.tcps_mpcap_fallback++;
4250
4251 tp->t_mpflags |= TMPF_INFIN_SENT;
4252 mptcp_notify_mpfail(so);
39236c6e 4253 } else {
5ba3f43e 4254 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
c3c9b80d 4255 mptcp_subflows_need_backup_flag(mpte)) {
5ba3f43e 4256 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
39037602
A
4257 } else {
4258 mpts->mpts_flags |= MPTSF_PREFERRED;
4259 }
39236c6e
A
4260 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4261 mpte->mpte_nummpcapflows++;
5ba3f43e 4262
0a7de745 4263 if (SOCK_DOM(so) == AF_INET6) {
a39ff7e2 4264 mptcp_handle_ipv6_connection(mpte, mpts);
0a7de745 4265 }
a39ff7e2 4266
5ba3f43e
A
4267 mptcp_check_subflows_and_add(mpte);
4268
0a7de745 4269 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
5ba3f43e 4270 mpte->mpte_initial_cell = 1;
0a7de745 4271 }
5ba3f43e
A
4272
4273 mpte->mpte_handshake_success = 1;
39236c6e 4274 }
5ba3f43e
A
4275
4276 mp_tp->mpt_sndwnd = tp->snd_wnd;
4277 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4278 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4279 soisconnected(mp_so);
39236c6e 4280 } else if (mpok) {
39236c6e
A
4281 /*
4282 * case (b) above
4283 * In case of additional flows, the MPTCP socket is not
4284 * MPTSF_MP_CAPABLE until an ACK is received from server
4285 * for 3-way handshake. TCP would have guaranteed that this
4286 * is an MPTCP subflow.
4287 */
5ba3f43e
A
4288 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4289 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
c3c9b80d 4290 mptcp_subflows_need_backup_flag(mpte)) {
5ba3f43e
A
4291 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4292 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4293 } else {
4294 mpts->mpts_flags |= MPTSF_PREFERRED;
4295 }
4296
39236c6e
A
4297 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4298 mpte->mpte_nummpcapflows++;
5ba3f43e
A
4299
4300 mpts->mpts_rel_seq = 1;
4301
4302 mptcp_check_subflows_and_remove(mpte);
fe8ab488 4303 } else {
f427ee49 4304 mptcp_try_alternate_port(mpte, mpts);
5ba3f43e
A
4305
4306 tcpstat.tcps_join_fallback++;
0a7de745 4307 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
5ba3f43e 4308 tcpstat.tcps_mptcp_cell_proxy++;
0a7de745 4309 } else {
5ba3f43e 4310 tcpstat.tcps_mptcp_wifi_proxy++;
0a7de745 4311 }
5ba3f43e
A
4312
4313 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4314
0a7de745 4315 return MPTS_EVRET_OK;
39236c6e 4316 }
fe8ab488 4317
5ba3f43e 4318 /* This call just reserves ("books") an entry in the stats-table for this ifindex */
cb323159 4319 mptcpstats_get_index(mpte->mpte_itfstats, mpts);
5ba3f43e
A
4320
4321 mptcp_output(mpte);
39236c6e 4322
0a7de745 4323 return MPTS_EVRET_OK; /* keep the subflow socket around */
39236c6e
A
4324}
4325
4326/*
4327 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4328 */
4329static ev_ret_t
3e170ce0 4330mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 4331 long *p_mpsofilt_hint, long event)
39236c6e 4332{
5ba3f43e 4333#pragma unused(event, p_mpsofilt_hint)
39236c6e
A
4334 struct socket *mp_so, *so;
4335 struct mptcb *mp_tp;
39236c6e 4336
5ba3f43e 4337 mp_so = mptetoso(mpte);
39236c6e
A
4338 mp_tp = mpte->mpte_mptcb;
4339 so = mpts->mpts_socket;
4340
5ba3f43e
A
4341 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
4342 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
4343 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
4344 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
3e170ce0 4345 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
39236c6e 4346
0a7de745
A
4347 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4348 return MPTS_EVRET_DELETE;
4349 }
39236c6e 4350
39236c6e
A
4351 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4352
5ba3f43e 4353 /* The subflow connection has been disconnected. */
39236c6e
A
4354
4355 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4356 mpte->mpte_nummpcapflows--;
fe8ab488
A
4357 if (mpte->mpte_active_sub == mpts) {
4358 mpte->mpte_active_sub = NULL;
5ba3f43e 4359 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
3e170ce0 4360 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
fe8ab488 4361 }
39236c6e 4362 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
f427ee49
A
4363 } else {
4364 if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
4365 !(mpts->mpts_flags & MPTSF_CONNECTED)) {
4366 mptcp_try_alternate_port(mpte, mpts);
4367 }
39236c6e
A
4368 }
4369
5ba3f43e 4370 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
0a7de745 4371 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
5ba3f43e 4372 mptcp_drop(mpte, mp_tp, so->so_error);
39236c6e
A
4373 }
4374
39236c6e 4375 /*
5ba3f43e
A
4376 * Clear flags that are used by getconninfo to return state.
4377 * Retain flags like MPTSF_DELETEOK for internal purposes.
39236c6e 4378 */
0a7de745
A
4379 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4380 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4381 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
5ba3f43e 4382
0a7de745 4383 return MPTS_EVRET_DELETE;
39236c6e
A
4384}
4385
4386/*
4387 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4388 */
4389static ev_ret_t
3e170ce0 4390mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 4391 long *p_mpsofilt_hint, long event)
39236c6e 4392{
5ba3f43e 4393#pragma unused(event, p_mpsofilt_hint)
cb323159 4394 ev_ret_t ret = MPTS_EVRET_OK;
39236c6e
A
4395 struct socket *mp_so, *so;
4396 struct mptcb *mp_tp;
39236c6e 4397
5ba3f43e 4398 mp_so = mptetoso(mpte);
39236c6e 4399 mp_tp = mpte->mpte_mptcb;
39236c6e
A
4400 so = mpts->mpts_socket;
4401
0a7de745 4402 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
39236c6e 4403 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
0a7de745 4404 } else {
39236c6e 4405 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
0a7de745 4406 }
39236c6e
A
4407
4408 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
0a7de745 4409 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
39236c6e 4410 goto done;
0a7de745 4411 }
39236c6e 4412 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
d9a64523 4413 } else {
39236c6e 4414 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
d9a64523 4415 }
39236c6e 4416
0a7de745 4417 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
39236c6e 4418 mpts->mpts_flags |= MPTSF_MP_READY;
0a7de745 4419 } else {
39236c6e 4420 mpts->mpts_flags &= ~MPTSF_MP_READY;
0a7de745 4421 }
39236c6e
A
4422
4423 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4424 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4425 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4426 }
4427
4428 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
39236c6e 4429 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
d9a64523
A
4430
4431 m_freem_list(mpte->mpte_reinjectq);
4432 mpte->mpte_reinjectq = NULL;
39236c6e
A
4433 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4434 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4435 ret = MPTS_EVRET_CONNECT_PENDING;
4436 }
4437
39236c6e 4438done:
0a7de745 4439 return ret;
39236c6e
A
4440}
4441
4442/*
4443 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4444 */
4445static ev_ret_t
3e170ce0 4446mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 4447 long *p_mpsofilt_hint, long event)
39236c6e 4448{
5ba3f43e 4449#pragma unused(event)
39236c6e
A
4450 struct socket *mp_so, *so;
4451 struct mptcb *mp_tp;
5ba3f43e 4452 boolean_t is_fastclose;
39236c6e 4453
5ba3f43e 4454 mp_so = mptetoso(mpte);
39236c6e
A
4455 mp_tp = mpte->mpte_mptcb;
4456 so = mpts->mpts_socket;
4457
39236c6e 4458 /* We got an invalid option or a fast close */
39236c6e
A
4459 struct inpcb *inp = sotoinpcb(so);
4460 struct tcpcb *tp = NULL;
4461
4462 tp = intotcpcb(inp);
fe8ab488 4463 so->so_error = ECONNABORTED;
39236c6e 4464
39037602
A
4465 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4466
cb323159
A
4467 tp->t_mpflags |= TMPF_RESET;
4468
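/* Build a template segment and send an immediate RST to the peer. */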
f427ee49
A
4469 if (tp->t_state != TCPS_CLOSED) {
4470 struct tcptemp *t_template = tcp_maketemplate(tp);
39236c6e 4471
f427ee49
A
4472 if (t_template) {
4473 struct tcp_respond_args tra;
39236c6e 4474
f427ee49
A
4475 bzero(&tra, sizeof(tra));
4476 if (inp->inp_flags & INP_BOUND_IF) {
4477 tra.ifscope = inp->inp_boundifp->if_index;
4478 } else {
4479 tra.ifscope = IFSCOPE_NONE;
4480 }
4481 tra.awdl_unrestricted = 1;
4482
4483 tcp_respond(tp, t_template->tt_ipgen,
4484 &t_template->tt_t, (struct mbuf *)NULL,
4485 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4486 (void) m_free(dtom(t_template));
4487 }
39236c6e 4488 }
39037602
A
4489
4490 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
cb323159
A
4491 struct mptsub *iter, *tmp;
4492
3e170ce0 4493 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
39236c6e 4494
cb323159
A
4495 mp_so->so_error = ECONNRESET;
4496
4497 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4498 if (iter == mpts) {
4499 continue;
4500 }
4501 mptcp_subflow_abort(iter, ECONNABORTED);
0a7de745 4502 }
39037602
A
4503
4504 /*
4505 * mptcp_drop is being called after processing the events, to fully
4506 * close the MPTCP connection
4507 */
cb323159 4508 mptcp_drop(mpte, mp_tp, mp_so->so_error);
39236c6e 4509 }
39037602 4510
cb323159
A
4511 mptcp_subflow_abort(mpts, ECONNABORTED);
4512
0a7de745 4513 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
3e170ce0 4514 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
0a7de745 4515 }
39236c6e 4516
0a7de745 4517 return MPTS_EVRET_DELETE;
39236c6e
A
4518}
4519
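/*
 * Adaptive read-timeout on a subflow: mark it read-stalled and raise the
 * hint to the application only if no other established subflow is still
 * reading normally.
 */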
fe8ab488 4520static ev_ret_t
5ba3f43e 4521mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 4522 long *p_mpsofilt_hint, long event)
fe8ab488 4523{
5ba3f43e
A
4524#pragma unused(event)
4525 bool found_active = false;
4526
4527 mpts->mpts_flags |= MPTSF_READ_STALL;
39037602 4528
5ba3f43e
A
4529 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4530 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3e170ce0 4531
5ba3f43e 4532 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
0a7de745 4533 TCPS_HAVERCVDFIN2(tp->t_state)) {
5ba3f43e 4534 continue;
0a7de745 4535 }
5ba3f43e
A
4536
4537 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4538 found_active = true;
4539 break;
fe8ab488 4540 }
fe8ab488
A
4541 }
4542
0a7de745 4543 if (!found_active) {
5ba3f43e 4544 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
0a7de745 4545 }
5ba3f43e 4546
0a7de745 4547 return MPTS_EVRET_OK;
fe8ab488
A
4548}
4549
4550static ev_ret_t
5ba3f43e 4551mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
f427ee49 4552 long *p_mpsofilt_hint, long event)
fe8ab488 4553{
5ba3f43e
A
4554#pragma unused(event)
4555 bool found_active = false;
3e170ce0 4556
5ba3f43e 4557 mpts->mpts_flags |= MPTSF_WRITE_STALL;
fe8ab488 4558
5ba3f43e
A
4559 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4560 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4561
4562 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
0a7de745 4563 tp->t_state > TCPS_CLOSE_WAIT) {
5ba3f43e 4564 continue;
0a7de745 4565 }
5ba3f43e
A
4566
4567 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4568 found_active = true;
4569 break;
4570 }
4571 }
4572
0a7de745 4573 if (!found_active) {
5ba3f43e 4574 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
0a7de745 4575 }
5ba3f43e 4576
0a7de745 4577 return MPTS_EVRET_OK;
fe8ab488
A
4578}
4579
39236c6e
A
4580/*
4581 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4582 * caller must ensure that the option can be issued on subflow sockets, via
4583 * MPOF_SUBFLOW_OK flag.
4584 */
4585int
5ba3f43e 4586mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
39236c6e 4587{
5ba3f43e 4588 struct socket *mp_so, *so;
39236c6e 4589 struct sockopt sopt;
39236c6e
A
4590 int error;
4591
4592 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
5ba3f43e
A
4593
4594 mp_so = mptetoso(mpte);
4595 so = mpts->mpts_socket;
4596
cb323159
A
4597 socket_lock_assert_owned(mp_so);
4598
5ba3f43e
A
4599 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4600 mpo->mpo_level == SOL_SOCKET &&
4601 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
d9a64523
A
4602 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4603
4604 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
cb323159 4605 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
0a7de745
A
4606 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
4607 mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
4608 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5ba3f43e
A
4609
4610 /*
4611 * When we open a new subflow, mark it as cell fallback, if
4612 * this subflow goes over cell.
4613 *
4614 * (except for first-party apps)
4615 */
4616
0a7de745
A
4617 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4618 return 0;
4619 }
39236c6e 4620
5ba3f43e 4621 if (sotoinpcb(so)->inp_last_outifp &&
0a7de745
A
4622 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4623 return 0;
4624 }
5ba3f43e
A
4625
4626 /*
4627 * This here is an OR, because if the app is not binding to the
4628 * interface, then it definitely is not a cell-fallback
4629 * connection.
4630 */
d9a64523 4631 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
0a7de745
A
4632 !IFNET_IS_CELLULAR(ifp)) {
4633 return 0;
4634 }
5ba3f43e
A
4635 }
4636
4637 mpo->mpo_flags &= ~MPOF_INTERIM;
39236c6e 4638
0a7de745 4639 bzero(&sopt, sizeof(sopt));
39236c6e
A
4640 sopt.sopt_dir = SOPT_SET;
4641 sopt.sopt_level = mpo->mpo_level;
4642 sopt.sopt_name = mpo->mpo_name;
4643 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
0a7de745 4644 sopt.sopt_valsize = sizeof(int);
39236c6e
A
4645 sopt.sopt_p = kernproc;
4646
5ba3f43e 4647 error = sosetoptlock(so, &sopt, 0);
cb323159
A
4648 if (error) {
4649 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
39236c6e 4650 "val %d set error %d\n", __func__,
cb323159 4651 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
5ba3f43e 4652 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
cb323159 4653 mpo->mpo_intval, error);
39236c6e 4654 }
0a7de745 4655 return error;
39236c6e
A
4656}
4657
4658/*
4659 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4660 * caller must ensure that the option can be issued on subflow sockets, via
4661 * MPOF_SUBFLOW_OK flag.
4662 */
4663int
4664mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4665 struct mptopt *mpo)
4666{
4667 struct socket *mp_so;
4668 struct sockopt sopt;
39236c6e
A
4669 int error;
4670
4671 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
5ba3f43e 4672 mp_so = mptetoso(mpte);
39236c6e 4673
cb323159
A
4674 socket_lock_assert_owned(mp_so);
4675
0a7de745 4676 bzero(&sopt, sizeof(sopt));
39236c6e
A
4677 sopt.sopt_dir = SOPT_GET;
4678 sopt.sopt_level = mpo->mpo_level;
4679 sopt.sopt_name = mpo->mpo_name;
4680 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
0a7de745 4681 sopt.sopt_valsize = sizeof(int);
39236c6e
A
4682 sopt.sopt_p = kernproc;
4683
0a7de745 4684 error = sogetoptlock(so, &sopt, 0); /* already locked */
cb323159
A
4685 if (error) {
4686 os_log_error(mptcp_log_handle,
4687 "%s - %lx: sopt %s get error %d\n",
4688 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4689 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
39236c6e 4690 }
0a7de745 4691 return error;
39236c6e
A
4692}
4693
4694
4695/*
4696 * MPTCP garbage collector.
4697 *
4698 * This routine is called by the MP domain on-demand, periodic callout,
4699 * which is triggered when a MPTCP socket is closed. The callout will
4700 * repeat as long as this routine returns a non-zero value.
4701 */
4702static uint32_t
4703mptcp_gc(struct mppcbinfo *mppi)
4704{
4705 struct mppcb *mpp, *tmpp;
4706 uint32_t active = 0;
4707
5ba3f43e 4708 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
39236c6e 4709
39236c6e
A
4710 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4711 struct socket *mp_so;
4712 struct mptses *mpte;
4713 struct mptcb *mp_tp;
4714
39236c6e 4715 mp_so = mpp->mpp_socket;
39236c6e 4716 mpte = mptompte(mpp);
39236c6e 4717 mp_tp = mpte->mpte_mptcb;
39236c6e 4718
cb323159 4719 if (!mpp_try_lock(mpp)) {
39236c6e
A
4720 active++;
4721 continue;
4722 }
4723
cb323159
A
4724 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4725
39236c6e 4726 /* check again under the lock */
5ba3f43e 4727 if (mp_so->so_usecount > 0) {
39236c6e
A
4728 boolean_t wakeup = FALSE;
4729 struct mptsub *mpts, *tmpts;
4730
39236c6e 4731 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
0a7de745 4732 if (mp_tp->mpt_gc_ticks > 0) {
39236c6e 4733 mp_tp->mpt_gc_ticks--;
0a7de745 4734 }
39236c6e
A
4735 if (mp_tp->mpt_gc_ticks == 0) {
4736 wakeup = TRUE;
39236c6e
A
4737 }
4738 }
39236c6e
A
4739 if (wakeup) {
4740 TAILQ_FOREACH_SAFE(mpts,
4741 &mpte->mpte_subflows, mpts_entry, tmpts) {
5ba3f43e 4742 mptcp_subflow_eupcall1(mpts->mpts_socket,
39236c6e 4743 mpts, SO_FILT_HINT_DISCONNECTED);
39236c6e
A
4744 }
4745 }
cb323159 4746 socket_unlock(mp_so, 0);
39236c6e
A
4747 active++;
4748 continue;
4749 }
4750
4751 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
cb323159 4752 panic("%s - %lx: skipped state "
0a7de745 4753 "[u=%d,r=%d,s=%d]\n", __func__,
cb323159 4754 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
0a7de745
A
4755 mp_so->so_usecount, mp_so->so_retaincnt,
4756 mpp->mpp_state);
39236c6e
A
4757 }
4758
0a7de745 4759 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
5ba3f43e 4760 mptcp_close(mpte, mp_tp);
0a7de745 4761 }
3e170ce0 4762
5ba3f43e 4763 mptcp_session_destroy(mpte);
39236c6e 4764
39037602 4765 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
39236c6e
A
4766 struct sockbuf *, &mp_so->so_rcv,
4767 struct sockbuf *, &mp_so->so_snd,
4768 struct mppcb *, mpp);
4769
4770 mp_pcbdispose(mpp);
39037602 4771 sodealloc(mp_so);
39236c6e
A
4772 }
4773
0a7de745 4774 return active;
39236c6e
A
4775}
4776
4777/*
4778 * Drop a MPTCP connection, reporting the specified error.
4779 */
4780struct mptses *
f427ee49 4781mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
39236c6e 4782{
cb323159 4783 struct socket *mp_so = mptetoso(mpte);
39236c6e 4784
39236c6e 4785 VERIFY(mpte->mpte_mptcb == mp_tp);
cb323159
A
4786
4787 socket_lock_assert_owned(mp_so);
39236c6e 4788
39037602 4789 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
39236c6e
A
4790 uint32_t, 0 /* event */);
4791
0a7de745 4792 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
39236c6e 4793 errno = mp_tp->mpt_softerror;
0a7de745 4794 }
39236c6e
A
4795 mp_so->so_error = errno;
4796
0a7de745 4797 return mptcp_close(mpte, mp_tp);
39236c6e
A
4798}
4799
4800/*
4801 * Close a MPTCP control block.
4802 */
4803struct mptses *
4804mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4805{
3e170ce0 4806 struct mptsub *mpts = NULL, *tmpts = NULL;
cb323159 4807 struct socket *mp_so = mptetoso(mpte);
39236c6e 4808
cb323159 4809 socket_lock_assert_owned(mp_so);
39236c6e 4810 VERIFY(mpte->mpte_mptcb == mp_tp);
39236c6e 4811
5ba3f43e 4812 mp_tp->mpt_state = MPTCPS_TERMINATE;
39236c6e 4813
5ba3f43e
A
4814 mptcp_freeq(mp_tp);
4815
4816 soisdisconnected(mp_so);
39236c6e
A
4817
4818 /* Clean up all subflows */
4819 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
5ba3f43e 4820 mptcp_subflow_disconnect(mpte, mpts);
39236c6e 4821 }
39236c6e 4822
0a7de745 4823 return NULL;
39236c6e
A
4824}
4825
4826void
4827mptcp_notify_close(struct socket *so)
4828{
4829 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4830}
4831
4832/*
5ba3f43e 4833 * MPTCP workloop.
39236c6e
A
4834 */
4835void
5ba3f43e 4836mptcp_subflow_workloop(struct mptses *mpte)
39236c6e 4837{
39236c6e 4838 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
f427ee49 4839 long mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
0a7de745
A
4840 struct mptsub *mpts, *tmpts;
4841 struct socket *mp_so;
39236c6e 4842
cb323159
A
4843 mp_so = mptetoso(mpte);
4844
4845 socket_lock_assert_owned(mp_so);
0a7de745
A
4846
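/*
 * Guard against re-entry: if the workloop is already running, just
 * request one more pass before it exits.
 */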
4847 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4848 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4849 return;
4850 }
4851 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4852
0a7de745 4853relaunch:
0a7de745 4854 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
39236c6e
A
4855
4856 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4857 ev_ret_t ret;
4858
5ba3f43e
A
4859 if (mpts->mpts_socket->so_usecount == 0) {
4860 /* Will be removed soon by tcp_garbage_collect */
4861 continue;
4862 }
3e170ce0 4863
5ba3f43e
A
4864 mptcp_subflow_addref(mpts);
4865 mpts->mpts_socket->so_usecount++;
3e170ce0
A
4866
4867 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
39236c6e 4868
39236c6e
A
4869 /*
4870 * If MPTCP socket is closed, disconnect all subflows.
4871 * This will generate a disconnect event which will
4872 * be handled during the next iteration, causing a
4873 * non-zero error to be returned above.
4874 */
0a7de745 4875 if (mp_so->so_flags & SOF_PCBCLEARING) {
5ba3f43e 4876 mptcp_subflow_disconnect(mpte, mpts);
0a7de745 4877 }
39236c6e
A
4878
4879 switch (ret) {
39236c6e
A
4880 case MPTS_EVRET_OK:
4881 /* nothing to do */
4882 break;
4883 case MPTS_EVRET_DELETE:
5ba3f43e 4884 mptcp_subflow_soclose(mpts);
39236c6e
A
4885 break;
4886 case MPTS_EVRET_CONNECT_PENDING:
4887 connect_pending = TRUE;
4888 break;
4889 case MPTS_EVRET_DISCONNECT_FALLBACK:
4890 disconnect_fallback = TRUE;
4891 break;
3e170ce0
A
4892 default:
4893 mptcplog((LOG_DEBUG,
4894 "MPTCP Socket: %s: mptcp_subflow_events "
0a7de745 4895 "returned invalid value: %d\n", __func__,
3e170ce0
A
4896 ret),
4897 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4898 break;
39236c6e 4899 }
0a7de745 4900 mptcp_subflow_remref(mpts); /* ours */
5ba3f43e
A
4901
4902 VERIFY(mpts->mpts_socket->so_usecount != 0);
4903 mpts->mpts_socket->so_usecount--;
39236c6e
A
4904 }
4905
5ba3f43e 4906 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
5ba3f43e
A
4907 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4908
cb323159
A
4909 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4910 mp_so->so_state |= SS_CANTRCVMORE;
4911 sorwakeup(mp_so);
4912 }
4913
3e170ce0 4914 soevent(mp_so, mpsofilt_hint_mask);
39236c6e
A
4915 }
4916
0a7de745
A
4917 if (!connect_pending && !disconnect_fallback) {
4918 goto exit;
4919 }
39236c6e
A
4920
4921 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
39236c6e
A
4922 if (disconnect_fallback) {
4923 struct socket *so = NULL;
4924 struct inpcb *inp = NULL;
4925 struct tcpcb *tp = NULL;
4926
0a7de745 4927 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
39236c6e 4928 continue;
0a7de745 4929 }
39236c6e
A
4930
4931 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4932
0a7de745 4933 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
f427ee49 4934 MPTSF_DISCONNECTED)) {
39236c6e 4935 continue;
0a7de745 4936 }
490019cf 4937
39236c6e
A
4938 so = mpts->mpts_socket;
4939
4940 /*
4941 * The MPTCP connection has degraded to a fallback
4942 * mode, so there is no point in keeping this subflow
4943 * regardless of its MPTCP-readiness state, unless it
4944 * is the primary one which we use for fallback. This
4945 * assumes that the subflow used for fallback is the
4946 * ACTIVE one.
4947 */
4948
39236c6e
A
4949 inp = sotoinpcb(so);
4950 tp = intotcpcb(inp);
4951 tp->t_mpflags &=
0a7de745 4952 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
39236c6e 4953 tp->t_mpflags |= TMPF_TCP_FALLBACK;
490019cf 4954
5ba3f43e 4955 soevent(so, SO_FILT_HINT_MUSTRST);
39236c6e
A
4956 } else if (connect_pending) {
4957 /*
4958 * The MPTCP connection has progressed to a state
4959 * where it supports full multipath semantics; allow
4960 * additional joins to be attempted for all subflows
4961 * that are in the PENDING state.
4962 */
4963 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
5ba3f43e 4964 int error = mptcp_subflow_soconnectx(mpte, mpts);
39236c6e 4965
0a7de745 4966 if (error) {
5ba3f43e 4967 mptcp_subflow_abort(mpts, error);
0a7de745 4968 }
5ba3f43e 4969 }
39236c6e 4970 }
39236c6e 4971 }
0a7de745
A
4972
4973exit:
4974 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4975 goto relaunch;
4976 }
4977
4978 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
39236c6e
A
4979}
4980
39236c6e
A
4981/*
4982 * Protocol pr_lock callback.
4983 */
4984int
4985mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4986{
5ba3f43e 4987 struct mppcb *mpp = mpsotomppcb(mp_so);
39236c6e
A
4988 void *lr_saved;
4989
0a7de745 4990 if (lr == NULL) {
39236c6e 4991 lr_saved = __builtin_return_address(0);
0a7de745 4992 } else {
39236c6e 4993 lr_saved = lr;
0a7de745 4994 }
39236c6e
A
4995
4996 if (mpp == NULL) {
4997 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4998 mp_so, lr_saved, solockhistory_nr(mp_so));
4999 /* NOTREACHED */
5000 }
5ba3f43e 5001 mpp_lock(mpp);
39236c6e
A
5002
5003 if (mp_so->so_usecount < 0) {
5004 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
5005 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
5006 solockhistory_nr(mp_so));
5007 /* NOTREACHED */
5008 }
0a7de745 5009 if (refcount != 0) {
39236c6e 5010 mp_so->so_usecount++;
cb323159 5011 mpp->mpp_inside++;
0a7de745 5012 }
39236c6e
A
5013 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
5014 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
5015
0a7de745 5016 return 0;
39236c6e
A
5017}
5018
5019/*
5020 * Protocol pr_unlock callback.
5021 */
5022int
5ba3f43e 5023mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
39236c6e 5024{
5ba3f43e
A
5025 struct mppcb *mpp = mpsotomppcb(mp_so);
5026 void *lr_saved;
39236c6e 5027
0a7de745 5028 if (lr == NULL) {
5ba3f43e 5029 lr_saved = __builtin_return_address(0);
0a7de745 5030 } else {
5ba3f43e 5031 lr_saved = lr;
0a7de745 5032 }
39236c6e 5033
5ba3f43e
A
5034 if (mpp == NULL) {
5035 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
5036 mp_so, mp_so->so_usecount, lr_saved,
5037 solockhistory_nr(mp_so));
5038 /* NOTREACHED */
5039 }
cb323159 5040 socket_lock_assert_owned(mp_so);
39236c6e 5041
0a7de745 5042 if (refcount != 0) {
5ba3f43e 5043 mp_so->so_usecount--;
cb323159 5044 mpp->mpp_inside--;
0a7de745 5045 }
39236c6e 5046
5ba3f43e
A
5047 if (mp_so->so_usecount < 0) {
5048 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
5049 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5050 /* NOTREACHED */
39236c6e 5051 }
cb323159
A
5052 if (mpp->mpp_inside < 0) {
5053 panic("%s: mpp=%p inside=%x lrh= %s\n", __func__,
5054 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5055 /* NOTREACHED */
5056 }
5ba3f43e
A
5057 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5058 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5059 mpp_unlock(mpp);
5060
0a7de745 5061 return 0;
39236c6e
A
5062}
5063
5ba3f43e
A
5064/*
5065 * Protocol pr_getlock callback.
5066 */
5067lck_mtx_t *
5068mptcp_getlock(struct socket *mp_so, int flags)
39236c6e 5069{
5ba3f43e
A
5070 struct mppcb *mpp = mpsotomppcb(mp_so);
5071
5072 if (mpp == NULL) {
5073 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
5074 solockhistory_nr(mp_so));
39236c6e
A
5075 /* NOTREACHED */
5076 }
5ba3f43e
A
5077 if (mp_so->so_usecount < 0) {
5078 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
5079 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5080 /* NOTREACHED */
39236c6e 5081 }
0a7de745 5082 return mpp_getlock(mpp, flags);
39236c6e
A
5083}
5084
5085/*
5086 * MPTCP Join support
5087 */
5088
5089static void
cb323159 5090mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
39236c6e
A
5091{
5092 struct tcpcb *tp = sototcpcb(so);
5093 struct mptcp_subf_auth_entry *sauth_entry;
39236c6e 5094
39236c6e 5095 /*
39236c6e
A
5096 * The address ID of the first flow is implicitly 0.
5097 */
5098 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
5099 tp->t_local_aid = 0;
5100 } else {
fe8ab488 5101 tp->t_local_aid = addr_id;
39236c6e
A
5102 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
5103 so->so_flags |= SOF_MP_SEC_SUBFLOW;
5104 }
5105 sauth_entry = zalloc(mpt_subauth_zone);
5106 sauth_entry->msae_laddr_id = tp->t_local_aid;
5107 sauth_entry->msae_raddr_id = 0;
5108 sauth_entry->msae_raddr_rand = 0;
5109try_again:
5110 sauth_entry->msae_laddr_rand = RandomULong();
0a7de745 5111 if (sauth_entry->msae_laddr_rand == 0) {
39236c6e 5112 goto try_again;
0a7de745 5113 }
39236c6e
A
5114 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
5115}
5116
5117static void
5118mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
5119{
5120 struct mptcp_subf_auth_entry *sauth_entry;
fe8ab488 5121 struct tcpcb *tp = NULL;
39236c6e
A
5122 int found = 0;
5123
fe8ab488 5124 tp = sototcpcb(so);
0a7de745 5125 if (tp == NULL) {
39236c6e 5126 return;
0a7de745 5127 }
39236c6e 5128
39236c6e
A
5129 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5130 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
5131 found = 1;
5132 break;
5133 }
5134 }
5135 if (found) {
5136 LIST_REMOVE(sauth_entry, msae_next);
39236c6e 5137 }
fe8ab488 5138
0a7de745 5139 if (found) {
3e170ce0 5140 zfree(mpt_subauth_zone, sauth_entry);
0a7de745 5141 }
39236c6e
A
5142}
5143
5144void
5145mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5146 u_int32_t *rrand)
5147{
5148 struct mptcp_subf_auth_entry *sauth_entry;
39236c6e 5149
39236c6e
A
5150 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5151 if (sauth_entry->msae_laddr_id == addr_id) {
0a7de745 5152 if (lrand) {
39236c6e 5153 *lrand = sauth_entry->msae_laddr_rand;
0a7de745
A
5154 }
5155 if (rrand) {
39236c6e 5156 *rrand = sauth_entry->msae_raddr_rand;
0a7de745 5157 }
39236c6e
A
5158 break;
5159 }
5160 }
39236c6e
A
5161}
5162
5163void
5164mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5165 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5166{
5167 struct mptcp_subf_auth_entry *sauth_entry;
39236c6e 5168
39236c6e
A
5169 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5170 if (sauth_entry->msae_laddr_id == laddr_id) {
5171 if ((sauth_entry->msae_raddr_id != 0) &&
5172 (sauth_entry->msae_raddr_id != raddr_id)) {
cb323159
A
5173 os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5174 " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5175 raddr_id, sauth_entry->msae_raddr_id);
39236c6e
A
5176 return;
5177 }
5178 sauth_entry->msae_raddr_id = raddr_id;
5179 if ((sauth_entry->msae_raddr_rand != 0) &&
5180 (sauth_entry->msae_raddr_rand != raddr_rand)) {
cb323159
A
5181 os_log_error(mptcp_log_handle, "%s - %lx: "
5182 "dup SYN_ACK %d %d \n",
5183 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5184 raddr_rand, sauth_entry->msae_raddr_rand);
39236c6e
A
5185 return;
5186 }
5187 sauth_entry->msae_raddr_rand = raddr_rand;
39236c6e
A
5188 return;
5189 }
5190 }
39236c6e
A
5191}
5192
5193/*
5194 * SHA1 support for MPTCP
5195 */
5ba3f43e
A
5196static void
5197mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
39236c6e
A
5198{
5199 SHA1_CTX sha1ctxt;
5200 const unsigned char *sha1_base;
5201 int sha1_size;
5202
39236c6e 5203 sha1_base = (const unsigned char *) key;
0a7de745 5204 sha1_size = sizeof(mptcp_key_t);
39236c6e
A
5205 SHA1Init(&sha1ctxt);
5206 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5207 SHA1Final(sha_digest, &sha1ctxt);
39236c6e
A
5208}
5209
5210void
5211mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
0a7de745 5212 u_int32_t rand1, u_int32_t rand2, u_char *digest)
39236c6e
A
5213{
5214 SHA1_CTX sha1ctxt;
5215 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5216 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5217 u_int32_t data[2];
5218 int i;
5219
5ba3f43e 5220 bzero(digest, SHA1_RESULTLEN);
39236c6e
A
5221
5222 /* Set up the Key for HMAC */
5223 key_ipad[0] = key1;
5224 key_ipad[1] = key2;
5225
5226 key_opad[0] = key1;
5227 key_opad[1] = key2;
5228
5229 /* Set up the message for HMAC */
5230 data[0] = rand1;
5231 data[1] = rand2;
5232
5233	/* Key fits within the 512-bit block, so there is no need to hash it first */
5234
5235 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5236
5237 for (i = 0; i < 8; i++) {
5238 key_ipad[i] ^= 0x3636363636363636;
5239 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5240 }
5241
5242 /* Perform inner SHA1 */
5243 SHA1Init(&sha1ctxt);
0a7de745
A
5244 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5245 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
39236c6e
A
5246 SHA1Final(digest, &sha1ctxt);
5247
5248 /* Perform outer SHA1 */
5249 SHA1Init(&sha1ctxt);
0a7de745 5250 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
39236c6e
A
5251 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5252 SHA1Final(digest, &sha1ctxt);
5253}
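/*
 * For reference, the routine above is standard HMAC-SHA1 (RFC 2104) with
 * the 16-byte key implicitly zero-padded to the 64-byte SHA1 block:
 *
 *   HMAC(K, msg) = SHA1((K ^ opad) || SHA1((K ^ ipad) || msg))
 *
 * where K = key1 || key2 || 0^48, ipad is 0x36 repeated, opad is 0x5c
 * repeated, and msg = rand1 || rand2. The zero padding is supplied by the
 * unused key_ipad[2..7] / key_opad[2..7] entries initialized above.
 */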
5254
5255/*
5256 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5257 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5258 */
5259void
5ba3f43e 5260mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
39236c6e
A
5261{
5262 uint32_t lrand, rrand;
39236c6e 5263
39236c6e
A
5264 lrand = rrand = 0;
5265 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5ba3f43e
A
5266 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
5267 digest);
39236c6e
A
5268}
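/*
 * Minimal sketch: per the MAC-A/MAC-B definition above, the peer's MAC is
 * obtained by concatenating the keys and random values in the opposite
 * order. mptcp_get_peer_hmac_sketch() is a hypothetical helper, shown only
 * for illustration.
 */
#if 0
static void
mptcp_get_peer_hmac_sketch(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
{
	uint32_t lrand = 0, rrand = 0;

	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
	/* Swap the key and rand order relative to mptcp_get_hmac() above */
	mptcp_hmac_sha1(mp_tp->mpt_remotekey, mp_tp->mpt_localkey,
	    rrand, lrand, digest);
}
#endif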
5269
5270/*
5271 * Authentication data generation
5272 */
5ba3f43e 5273static void
39236c6e
A
5274mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5275 int token_len)
5276{
0a7de745 5277 VERIFY(token_len == sizeof(u_int32_t));
39236c6e
A
5278 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5279
5280 /* Most significant 32 bits of the SHA1 hash */
0a7de745 5281 bcopy(sha_digest, token, sizeof(u_int32_t));
490019cf 5282 return;
39236c6e
A
5283}
5284
5ba3f43e 5285static void
39236c6e
A
5286mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5287 int idsn_len)
5288{
0a7de745 5289 VERIFY(idsn_len == sizeof(u_int64_t));
39236c6e
A
5290 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5291
5292 /*
5293 * Least significant 64 bits of the SHA1 hash
5294 */
5295
5296 idsn[7] = sha_digest[12];
5297 idsn[6] = sha_digest[13];
5298 idsn[5] = sha_digest[14];
5299 idsn[4] = sha_digest[15];
5300 idsn[3] = sha_digest[16];
5301 idsn[2] = sha_digest[17];
5302 idsn[1] = sha_digest[18];
5303 idsn[0] = sha_digest[19];
490019cf 5304 return;
39236c6e
A
5305}
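/*
 * Worked example (illustrative only): with a 20-byte digest d[0..19] from
 * SHA1(key), mptcp_generate_token() copies d[0..3] verbatim (the most
 * significant 32 bits of the hash), while mptcp_generate_idsn() stores
 * d[12..19] in reverse byte order (idsn[0] = d[19], ..., idsn[7] = d[12]),
 * i.e. the least significant 64 bits of the hash. Callers pass
 * SHA1_RESULTLEN (20) as sha_digest_len in both cases.
 */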
5306
490019cf
A
5307static void
5308mptcp_conn_properties(struct mptcb *mp_tp)
5309{
5310 /* There is only Version 0 at this time */
5311 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
5312
5313 /* Set DSS checksum flag */
0a7de745 5314 if (mptcp_dss_csum) {
490019cf 5315 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
0a7de745 5316 }
490019cf
A
5317
5318 /* Set up receive window */
5319 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5320
5321 /* Set up gc ticks */
5322 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5323}
5324
5325static void
5ba3f43e 5326mptcp_init_local_parms(struct mptses *mpte)
39236c6e 5327{
5ba3f43e
A
5328 struct mptcb *mp_tp = mpte->mpte_mptcb;
5329 char key_digest[SHA1_RESULTLEN];
490019cf 5330
5ba3f43e
A
5331 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5332 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5333
5334 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
0a7de745 5335 (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5ba3f43e 5336 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
0a7de745 5337 (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t));
490019cf
A
5338
5339 /* The subflow SYN is also first MPTCP byte */
5340 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5341 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5342
5343 mptcp_conn_properties(mp_tp);
5344}
5345
5346int
5347mptcp_init_remote_parms(struct mptcb *mp_tp)
5348{
5ba3f43e 5349 char remote_digest[SHA1_RESULTLEN];
39236c6e
A
5350
5351 /* Only Version 0 is supported for auth purposes */
0a7de745
A
5352 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) {
5353 return -1;
5354 }
39236c6e
A
5355
5356 /* Setup local and remote tokens and Initial DSNs */
5ba3f43e 5357 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
39236c6e 5358 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
0a7de745 5359 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
39236c6e 5360 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
0a7de745 5361 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t));
5ba3f43e 5362 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
cb323159 5363 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
39236c6e 5364
0a7de745 5365 return 0;
39236c6e
A
5366}
5367
5ba3f43e 5368static void
39236c6e
A
5369mptcp_send_dfin(struct socket *so)
5370{
5371 struct tcpcb *tp = NULL;
5372 struct inpcb *inp = NULL;
5373
5374 inp = sotoinpcb(so);
0a7de745 5375 if (!inp) {
39236c6e 5376 return;
0a7de745 5377 }
39236c6e
A
5378
5379 tp = intotcpcb(inp);
0a7de745 5380 if (!tp) {
39236c6e 5381 return;
0a7de745 5382 }
39236c6e 5383
0a7de745 5384 if (!(tp->t_mpflags & TMPF_RESET)) {
39236c6e 5385 tp->t_mpflags |= TMPF_SEND_DFIN;
0a7de745 5386 }
39236c6e
A
5387}
5388
5389/*
5390 * Data Sequence Mapping routines
5391 */
5392void
5393mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5394{
5395 struct mptcb *mp_tp;
5396
0a7de745 5397 if (m == NULL) {
39236c6e 5398 return;
0a7de745 5399 }
39236c6e 5400
3e170ce0 5401 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5ba3f43e 5402
39236c6e
A
5403 while (m) {
5404 VERIFY(m->m_flags & M_PKTHDR);
5405 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5406 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
f427ee49
A
5407 VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5408 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
39236c6e
A
5409 mp_tp->mpt_sndmax += m_pktlen(m);
5410 m = m->m_next;
5411 }
5ba3f43e
A
5412}
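/*
 * Illustrative example: with mpt_sndmax at 1000 and a chain of two
 * packet-header mbufs holding 300 and 200 bytes, the first record gets
 * mp_dsn = 1000 / mp_rlen = 300, the second mp_dsn = 1300 / mp_rlen = 200,
 * and mpt_sndmax ends up at 1500.
 */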
5413
5414void
5415mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
5416{
5417 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
5418 uint64_t data_ack;
5419 uint64_t dsn;
5420
f427ee49
A
5421 VERIFY(len >= 0);
5422
0a7de745 5423 if (!m || len == 0) {
5ba3f43e 5424 return;
0a7de745 5425 }
5ba3f43e
A
5426
5427 while (m && len > 0) {
5428 VERIFY(m->m_flags & M_PKTHDR);
5429 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5430
5431 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
5432 dsn = m->m_pkthdr.mp_dsn;
5433
5434 len -= m->m_len;
5435 m = m->m_next;
5436 }
5437
5438 if (m && len == 0) {
5439 /*
5440 * If there is one more mbuf in the chain, it automatically means
5441 * that up to m->mp_dsn has been ack'ed.
5442 *
5443 * This means, we actually correct data_ack back down (compared
5444 * to what we set inside the loop - dsn + data_len). Because in
5445 * the loop we are "optimistic" and assume that the full mapping
5446 * will be acked. If that's not the case and we get out of the
5447 * loop with m != NULL, it means only up to m->mp_dsn has been
5448 * really acked.
5449 */
5450 data_ack = m->m_pkthdr.mp_dsn;
5451 }
5452
5453 if (len < 0) {
5454 /*
5455 * If len is negative, meaning we acked in the middle of an mbuf,
5456 * only up to this mbuf's data-sequence number has been acked
5457 * at the MPTCP-level.
5458 */
5459 data_ack = dsn;
5460 }
5461
5462 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
0a7de745 5463 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
cb323159
A
5464
5465 /* We can have data in the subflow's send-queue that is being acked,
5466 * while the DATA_ACK has already advanced. Thus, we should check whether
5467 * or not the DATA_ACK is actually new here.
5468 */
5469 if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
5470 MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
5471 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
5472 }
39236c6e
A
5473}
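/*
 * Worked example (illustrative only): assume two queued mappings,
 * {mp_dsn 100, mp_rlen 100} and {mp_dsn 200, mp_rlen 100}, and len = 150
 * bytes are being dropped from the subflow send buffer. The loop
 * optimistically raises data_ack to 300 while walking both mbufs, but
 * exits with len == -50, so the len < 0 branch corrects data_ack back
 * down to dsn = 200: only the first mapping is known to be fully acked
 * at the MPTCP level.
 */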
5474
5475void
490019cf 5476mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
39236c6e 5477{
490019cf
A
5478 int rewinding = 0;
5479
5ba3f43e
A
5480 /* TFO makes things complicated. */
5481 if (so->so_flags1 & SOF1_TFO_REWIND) {
5482 rewinding = 1;
5483 so->so_flags1 &= ~SOF1_TFO_REWIND;
490019cf 5484 }
39236c6e 5485
5ba3f43e
A
5486 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
5487 u_int32_t sub_len;
39236c6e 5488 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e 5489 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e 5490
5ba3f43e 5491 sub_len = m->m_pkthdr.mp_rlen;
39236c6e 5492
5ba3f43e
A
5493 if (sub_len < len) {
5494 m->m_pkthdr.mp_dsn += sub_len;
5495 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5496 m->m_pkthdr.mp_rseq += sub_len;
39236c6e 5497 }
5ba3f43e
A
5498 m->m_pkthdr.mp_rlen = 0;
5499 len -= sub_len;
39236c6e 5500 } else {
5ba3f43e 5501 /* sub_len >= len */
0a7de745 5502 if (rewinding == 0) {
5ba3f43e 5503 m->m_pkthdr.mp_dsn += len;
0a7de745 5504 }
5ba3f43e 5505 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
0a7de745 5506 if (rewinding == 0) {
5ba3f43e 5507 m->m_pkthdr.mp_rseq += len;
0a7de745 5508 }
5ba3f43e
A
5509 }
5510 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
5511 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
5512 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
5513 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5514 m->m_pkthdr.mp_rlen -= len;
5515 break;
39236c6e
A
5516 }
5517 m = m->m_next;
5518 }
39037602
A
5519
5520 if (so->so_flags & SOF_MP_SUBFLOW &&
5521 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
5522 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
5523 /*
5524 * Received an ack without receiving a DATA_ACK.
5526	 * Need to fall back to regular TCP (or destroy this subflow).
5526 */
5ba3f43e 5527 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
39037602
A
5528 mptcp_notify_mpfail(so);
5529 }
39236c6e
A
5530}
5531
5532/* Obtain the DSN mapping stored in the mbuf */
5533void
5ba3f43e
A
5534mptcp_output_getm_dsnmap32(struct socket *so, int off,
5535 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
39236c6e
A
5536{
5537 u_int64_t dsn64;
5538
5ba3f43e 5539 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
39236c6e 5540 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
39236c6e
A
5541}
5542
5543void
5ba3f43e 5544mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
0a7de745
A
5545 uint32_t *relseq, uint16_t *data_len,
5546 uint16_t *dss_csum)
39236c6e
A
5547{
5548 struct mbuf *m = so->so_snd.sb_mb;
5ba3f43e 5549 int off_orig = off;
39236c6e 5550
5ba3f43e 5551 VERIFY(off >= 0);
39236c6e 5552
4ba76501
A
5553 if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
5554 *dsn = 0;
5555 *relseq = 0;
5556 *data_len = 0;
5557 *dss_csum = 0;
5558 return;
5559 }
5560
39236c6e
A
5561 /*
5562 * In the subflow socket, the DSN sequencing can be discontiguous,
5563 * but the subflow sequence mapping is contiguous. Use the subflow
5564 * sequence property to find the right mbuf and corresponding dsn
5565 * mapping.
5566 */
5567
5568 while (m) {
39236c6e 5569 VERIFY(m->m_flags & M_PKTHDR);
5ba3f43e 5570 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
39236c6e 5571
5ba3f43e
A
5572 if (off >= m->m_len) {
5573 off -= m->m_len;
39236c6e
A
5574 m = m->m_next;
5575 } else {
5576 break;
5577 }
5578 }
5579
5ba3f43e
A
5580 VERIFY(off >= 0);
5581 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
39236c6e 5582
5ba3f43e
A
5583 *dsn = m->m_pkthdr.mp_dsn;
5584 *relseq = m->m_pkthdr.mp_rseq;
5585 *data_len = m->m_pkthdr.mp_rlen;
5586 *dss_csum = m->m_pkthdr.mp_csum;
39236c6e 5587
5ba3f43e 5588 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
0a7de745
A
5589 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
5590 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
39236c6e
A
5591}
5592
5593/*
3e170ce0
A
5594 * Note that this is called only from tcp_input() via mptcp_input_preproc().
5595 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5596 * When it trims data, tcp_input() calls m_adj(), which does not remove the
5597 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5598 * The dsn map insertion cannot be delayed after trim, because data can be in
5599 * the reassembly queue for a while and the DSN option info in tp will be
5600 * overwritten for every new packet received.
39236c6e
A
5601 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5602 * with mptcp_adj_rmap()
5603 */
5604void
5c9f4661 5605mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
39236c6e 5606{
5c9f4661 5607 VERIFY(m->m_flags & M_PKTHDR);
39236c6e
A
5608 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5609
5610 if (tp->t_mpflags & TMPF_EMBED_DSN) {
39236c6e
A
5611 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5612 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5613 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5ba3f43e 5614 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
0a7de745 5615 if (tp->t_rcv_map.mpt_dfin) {
5c9f4661 5616 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
0a7de745 5617 }
5c9f4661 5618
39236c6e 5619 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5c9f4661 5620
39236c6e
A
5621 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5622 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5c9f4661 5623 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
0a7de745 5624 if (th->th_flags & TH_FIN) {
5c9f4661 5625 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
0a7de745 5626 }
39236c6e
A
5627 }
5628}
5629
39236c6e
A
5630/*
5631 * Following routines help with failure detection and failover of data
5632 * transfer from one subflow to another.
5633 */
5634void
5635mptcp_act_on_txfail(struct socket *so)
5636{
5637 struct tcpcb *tp = NULL;
5638 struct inpcb *inp = sotoinpcb(so);
5639
0a7de745 5640 if (inp == NULL) {
39236c6e 5641 return;
0a7de745 5642 }
39236c6e
A
5643
5644 tp = intotcpcb(inp);
0a7de745 5645 if (tp == NULL) {
39236c6e 5646 return;
0a7de745 5647 }
39236c6e 5648
0a7de745 5649 if (so->so_flags & SOF_MP_TRYFAILOVER) {
39236c6e 5650 return;
0a7de745 5651 }
39236c6e
A
5652
5653 so->so_flags |= SOF_MP_TRYFAILOVER;
5654 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5655}
5656
5657/*
5658 * Support for MP_FAIL option
5659 */
5660int
f427ee49 5661mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
39236c6e
A
5662{
5663 struct mbuf *m = so->so_snd.sb_mb;
f427ee49
A
5664 uint16_t datalen;
5665 uint64_t dsn;
39236c6e 5666 int off = 0;
39236c6e 5667
0a7de745
A
5668 if (m == NULL) {
5669 return -1;
5670 }
39236c6e
A
5671
5672 while (m != NULL) {
5673 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5674 VERIFY(m->m_flags & M_PKTHDR);
5675 dsn = m->m_pkthdr.mp_dsn;
5676 datalen = m->m_pkthdr.mp_rlen;
5677 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5678 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
f427ee49 5679 off = (int)(dsn_fail - dsn);
39236c6e 5680 *tcp_seq = m->m_pkthdr.mp_rseq + off;
0a7de745 5681 return 0;
39236c6e
A
5682 }
5683
5684 m = m->m_next;
5685 }
5686
5687 /*
5688 * If there was no mbuf data and a fallback to TCP occurred, there's
5689 * not much else to do.
5690 */
5691
cb323159 5692 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
0a7de745 5693 return -1;
5ba3f43e
A
5694}
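/*
 * Illustrative example: if a queued mapping has mp_dsn = 1000,
 * mp_rlen = 500 and mp_rseq = 20, then a failing DSN of 1200 falls inside
 * it, so off = 200 and *tcp_seq is reported as 220.
 */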
5695
5696/*
5697 * Support for sending contiguous MPTCP bytes in subflow
5698 * Also for preventing sending data with ACK in 3-way handshake
5699 */
5700int32_t
5701mptcp_adj_sendlen(struct socket *so, int32_t off)
5702{
5703 struct tcpcb *tp = sototcpcb(so);
5704 struct mptsub *mpts = tp->t_mpsub;
5705 uint64_t mdss_dsn;
5706 uint32_t mdss_subflow_seq;
5707 int mdss_subflow_off;
5708 uint16_t mdss_data_len;
5709 uint16_t dss_csum;
5710
4ba76501
A
5711 if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
5712 return 0;
5713 }
5714
5ba3f43e 5715 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
0a7de745 5716 &mdss_data_len, &dss_csum);
5ba3f43e
A
5717
5718 /*
5719 * We need to compute how much of the mapping still remains.
5720 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5721 */
5722 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5723
5724 /*
5725 * When TFO is used, we are sending the mpts->mpts_iss although the relative
5726 * seq has been set to 1 (while it should be 0).
5727 */
0a7de745 5728 if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5ba3f43e 5729 mdss_subflow_off--;
0a7de745 5730 }
5ba3f43e 5731
5ba3f43e
A
5732 VERIFY(off >= mdss_subflow_off);
5733
0a7de745 5734 return mdss_data_len - (off - mdss_subflow_off);
5ba3f43e
A
5735}
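/*
 * Illustrative example: if the mapping covering offset "off" has
 * mdss_data_len = 1000 and its start sits at send-buffer offset
 * mdss_subflow_off = 200 (i.e. mdss_subflow_seq + mpts_iss - snd_una),
 * then for off = 500 the function returns 1000 - (500 - 200) = 700, the
 * number of bytes of that mapping which may still be sent contiguously.
 */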
5736
5737static uint32_t
5738mptcp_get_maxseg(struct mptses *mpte)
5739{
5740 struct mptsub *mpts;
5741 uint32_t maxseg = 0;
5742
5743 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5744 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5745
5746 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
0a7de745 5747 TCPS_HAVERCVDFIN2(tp->t_state)) {
5ba3f43e 5748 continue;
0a7de745 5749 }
5ba3f43e 5750
0a7de745 5751 if (tp->t_maxseg > maxseg) {
5ba3f43e 5752 maxseg = tp->t_maxseg;
0a7de745 5753 }
5ba3f43e
A
5754 }
5755
0a7de745 5756 return maxseg;
5ba3f43e
A
5757}
5758
5759static uint8_t
5760mptcp_get_rcvscale(struct mptses *mpte)
5761{
5762 struct mptsub *mpts;
5763 uint8_t rcvscale = UINT8_MAX;
5764
5765 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5766 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5767
5768 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
0a7de745 5769 TCPS_HAVERCVDFIN2(tp->t_state)) {
5ba3f43e 5770 continue;
0a7de745 5771 }
5ba3f43e 5772
0a7de745 5773 if (tp->rcv_scale < rcvscale) {
5ba3f43e 5774 rcvscale = tp->rcv_scale;
0a7de745 5775 }
5ba3f43e
A
5776 }
5777
0a7de745 5778 return rcvscale;
5ba3f43e
A
5779}
5780
5781/* Similar to tcp_sbrcv_reserve */
5782static void
5783mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
0a7de745 5784 u_int32_t newsize, u_int32_t idealsize)
5ba3f43e
A
5785{
5786 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5787
5788 /* newsize should not exceed max */
5789 newsize = min(newsize, tcp_autorcvbuf_max);
5790
5791 /* The receive window scale negotiated at the
5792 * beginning of the connection will also set a
5793 * limit on the socket buffer size
5794 */
5795 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5796
5797 /* Set new socket buffer size */
5798 if (newsize > sbrcv->sb_hiwat &&
0a7de745 5799 (sbreserve(sbrcv, newsize) == 1)) {
5ba3f43e
A
5800 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5801 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5802
5803 /* Again check the limit set by the advertised
5804 * window scale
5805 */
5806 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
0a7de745 5807 TCP_MAXWIN << rcvscale);
5ba3f43e
A
5808 }
5809}
5810
5811void
5812mptcp_sbrcv_grow(struct mptcb *mp_tp)
5813{
5814 struct mptses *mpte = mp_tp->mpt_mpte;
5815 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5816 struct sockbuf *sbrcv = &mp_so->so_rcv;
5817 uint32_t hiwat_sum = 0;
5818 uint32_t ideal_sum = 0;
5819 struct mptsub *mpts;
5820
5821 /*
5822 * Do not grow the receive socket buffer if
5823 * - auto resizing is disabled, globally or on this socket
5824 * - the high water mark already reached the maximum
5825 * - the stream is in background and receive side is being
5826 * throttled
5827 * - if there are segments in reassembly queue indicating loss,
5828 * there is no need to increase the recv window during recovery, as more
5829 * data is not going to be sent. A duplicate ack sent during
5830 * recovery should not change the receive window
5831 */
5832 if (tcp_do_autorcvbuf == 0 ||
5833 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5834 tcp_cansbgrow(sbrcv) == 0 ||
5835 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5836 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5837 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5838 /* Can not resize the socket buffer, just return */
5839 return;
5840 }
5841
5842 /*
5843 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5844 *
5845 * But, for this we first need accurate receiver-RTT estimations, which
5846 * we currently don't have.
5847 *
5848 * Let's use a dummy algorithm for now, just taking the sum of all
5849 * subflow's receive-buffers. It's too low, but that's all we can get
5850 * for now.
5851 */
5852
5853 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5854 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5855 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5856 }
5857
5858 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
39236c6e
A
5859}
5860
5861/*
5ba3f43e
A
5862 * Determine if we can grow the receive socket buffer to avoid sending
5863 * a zero window update to the peer. We allow even socket buffers that
5864 * have fixed size (set by the application) to grow if the resource
5865 * constraints are met. They will also be trimmed after the application
5866 * reads data.
5867 *
5868 * Similar to tcp_sbrcv_grow_rwin
39236c6e 5869 */
5ba3f43e
A
5870static void
5871mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
39236c6e 5872{
5ba3f43e
A
5873 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5874 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5875 u_int32_t rcvbuf = sb->sb_hiwat;
39236c6e 5876
0a7de745 5877 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5ba3f43e 5878 return;
0a7de745 5879 }
39236c6e 5880
5ba3f43e
A
5881 if (tcp_do_autorcvbuf == 1 &&
5882 tcp_cansbgrow(sb) &&
5883 /* Diff to tcp_sbrcv_grow_rwin */
5884 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5885 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5886 rcvbuf < tcp_autorcvbuf_max &&
5887 (sb->sb_idealsize > 0 &&
5888 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5889 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
490019cf 5890 }
39236c6e
A
5891}
5892
5ba3f43e 5893/* Similar to tcp_sbspace */
39236c6e 5894int32_t
5ba3f43e 5895mptcp_sbspace(struct mptcb *mp_tp)
39236c6e 5896{
5ba3f43e 5897 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
39236c6e
A
5898 uint32_t rcvbuf;
5899 int32_t space;
5ba3f43e
A
5900 int32_t pending = 0;
5901
cb323159 5902 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
39236c6e 5903
5ba3f43e 5904 mptcp_sbrcv_grow_rwin(mp_tp, sb);
39236c6e 5905
5ba3f43e 5906 /* hiwat might have changed */
39236c6e 5907 rcvbuf = sb->sb_hiwat;
5ba3f43e
A
5908
5909 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
0a7de745
A
5910 (sb->sb_mbmax - sb->sb_mbcnt)));
5911 if (space < 0) {
39236c6e 5912 space = 0;
0a7de745 5913 }
5ba3f43e
A
5914
5915#if CONTENT_FILTER
5916 /* Compensate for data being processed by content filters */
5917 pending = cfil_sock_data_space(sb);
5918#endif /* CONTENT_FILTER */
0a7de745 5919 if (pending > space) {
5ba3f43e 5920 space = 0;
0a7de745 5921 } else {
5ba3f43e 5922 space -= pending;
0a7de745 5923 }
39236c6e 5924
0a7de745 5925 return space;
39236c6e
A
5926}
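/*
 * Illustrative example: with sb_hiwat = 64KB, sb_cc = 16KB and
 * sb_mbmax - sb_mbcnt = 100KB, the space computed above is 48KB; if
 * content filters still hold 8KB pending, the reported space drops
 * to 40KB.
 */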
5927
5928/*
5929 * Support Fallback to Regular TCP
5930 */
5931void
5932mptcp_notify_mpready(struct socket *so)
5933{
5934 struct tcpcb *tp = NULL;
5935
0a7de745 5936 if (so == NULL) {
39236c6e 5937 return;
0a7de745 5938 }
39236c6e
A
5939
5940 tp = intotcpcb(sotoinpcb(so));
5941
0a7de745 5942 if (tp == NULL) {
39236c6e 5943 return;
0a7de745 5944 }
39236c6e
A
5945
5946 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5947 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5948 struct tcpcb *, tp);
5949
0a7de745 5950 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
39236c6e 5951 return;
0a7de745 5952 }
39236c6e 5953
0a7de745 5954 if (tp->t_mpflags & TMPF_MPTCP_READY) {
39236c6e 5955 return;
0a7de745 5956 }
39236c6e
A
5957
5958 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5959 tp->t_mpflags |= TMPF_MPTCP_READY;
5960
5961 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5962}
5963
5964void
5965mptcp_notify_mpfail(struct socket *so)
5966{
5967 struct tcpcb *tp = NULL;
5968
0a7de745 5969 if (so == NULL) {
39236c6e 5970 return;
0a7de745 5971 }
39236c6e
A
5972
5973 tp = intotcpcb(sotoinpcb(so));
5974
0a7de745 5975 if (tp == NULL) {
39236c6e 5976 return;
0a7de745 5977 }
39236c6e
A
5978
5979 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5980 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5981 struct tcpcb *, tp);
5982
0a7de745 5983 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
39236c6e 5984 return;
0a7de745 5985 }
39236c6e 5986
0a7de745 5987 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
39236c6e
A
5988 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5989
5990 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5991}
5992
5993/*
5994 * Keepalive helper function
5995 */
5996boolean_t
5997mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5998{
5999 boolean_t ret = 1;
cb323159
A
6000
6001 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5ba3f43e 6002
39236c6e
A
6003 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6004 ret = 0;
6005 }
0a7de745 6006 return ret;
39236c6e
A
6007}
6008
6009/*
6010 * MPTCP t_maxseg adjustment function
6011 */
6012int
6013mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
6014{
6015 int mss_lower = 0;
6016 struct mptcb *mp_tp = tptomptp(tp);
6017
0a7de745
A
6018#define MPTCP_COMPUTE_LEN { \
6019 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
6020 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
6021 mss_lower += 2; \
6022 else \
6023 /* adjust to 32-bit boundary + EOL */ \
6024 mss_lower += 2; \
39236c6e 6025}
0a7de745
A
6026 if (mp_tp == NULL) {
6027 return 0;
6028 }
39236c6e 6029
cb323159 6030 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5ba3f43e 6031
39236c6e
A
6032 /*
6033 * For the first subflow and subsequent subflows, adjust mss for
6034 * most common MPTCP option size, for case where tcp_mss is called
6035 * during option processing and MTU discovery.
6036 */
5ba3f43e
A
6037 if (!mtudisc) {
6038 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
6039 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
6040 MPTCP_COMPUTE_LEN;
6041 }
39236c6e 6042
5ba3f43e
A
6043 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
6044 tp->t_mpflags & TMPF_SENT_JOIN) {
6045 MPTCP_COMPUTE_LEN;
6046 }
6047 } else {
6048 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
6049 MPTCP_COMPUTE_LEN;
6050 }
39236c6e
A
6051 }
6052
0a7de745 6053 return mss_lower;
39236c6e
A
6054}
6055
6056/*
6057 * Update the pid, upid, uuid of the subflow so, based on parent so
6058 */
6059void
5ba3f43e 6060mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
39236c6e 6061{
5ba3f43e
A
6062 if (so->last_pid != mp_so->last_pid ||
6063 so->last_upid != mp_so->last_upid) {
6064 so->last_upid = mp_so->last_upid;
6065 so->last_pid = mp_so->last_pid;
6066 uuid_copy(so->last_uuid, mp_so->last_uuid);
39236c6e 6067 }
5ba3f43e 6068 so_update_policy(so);
39236c6e
A
6069}
6070
6071static void
6072fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
6073{
6074 struct inpcb *inp;
6075
6076 tcp_getconninfo(so, &flow->flow_ci);
6077 inp = sotoinpcb(so);
39236c6e
A
6078 if ((inp->inp_vflag & INP_IPV6) != 0) {
6079 flow->flow_src.ss_family = AF_INET6;
6080 flow->flow_dst.ss_family = AF_INET6;
6081 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
6082 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
6083 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
6084 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
6085 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
6086 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
f427ee49 6087 } else if ((inp->inp_vflag & INP_IPV4) != 0) {
39236c6e
A
6088 flow->flow_src.ss_family = AF_INET;
6089 flow->flow_dst.ss_family = AF_INET;
6090 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
6091 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
6092 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
6093 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
6094 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
6095 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
6096 }
3e170ce0
A
6097 flow->flow_len = sizeof(*flow);
6098 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
39236c6e
A
6099 flow->flow_flags = mpts->mpts_flags;
6100 flow->flow_cid = mpts->mpts_connid;
3e170ce0 6101 flow->flow_relseq = mpts->mpts_rel_seq;
5ba3f43e 6102 flow->flow_soerror = mpts->mpts_socket->so_error;
3e170ce0 6103 flow->flow_probecnt = mpts->mpts_probecnt;
39236c6e
A
6104}
6105
6106static int
6107mptcp_pcblist SYSCTL_HANDLER_ARGS
6108{
6109#pragma unused(oidp, arg1, arg2)
6110 int error = 0, f;
5ba3f43e 6111 size_t len;
39236c6e
A
6112 struct mppcb *mpp;
6113 struct mptses *mpte;
6114 struct mptcb *mp_tp;
6115 struct mptsub *mpts;
6116 struct socket *so;
6117 conninfo_mptcp_t mptcpci;
fe8ab488 6118 mptcp_flow_t *flows = NULL;
39236c6e 6119
0a7de745
A
6120 if (req->newptr != USER_ADDR_NULL) {
6121 return EPERM;
6122 }
39236c6e
A
6123
6124 lck_mtx_lock(&mtcbinfo.mppi_lock);
39236c6e 6125 if (req->oldptr == USER_ADDR_NULL) {
5ba3f43e 6126 size_t n = mtcbinfo.mppi_count;
39236c6e 6127 lck_mtx_unlock(&mtcbinfo.mppi_lock);
0a7de745
A
6128 req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
6129 4 * (n + n / 8) * sizeof(mptcp_flow_t);
6130 return 0;
39236c6e
A
6131 }
6132 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
fe8ab488 6133 flows = NULL;
cb323159 6134 socket_lock(mpp->mpp_socket, 1);
39236c6e
A
6135 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
6136 mpte = mptompte(mpp);
cb323159
A
6137
6138 socket_lock_assert_owned(mptetoso(mpte));
39236c6e 6139 mp_tp = mpte->mpte_mptcb;
3e170ce0
A
6140
6141 bzero(&mptcpci, sizeof(mptcpci));
39236c6e 6142 mptcpci.mptcpci_state = mp_tp->mpt_state;
3e170ce0
A
6143 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
6144 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
6145 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
6146 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
6147 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
6148 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
6149 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
6150 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
6151 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
6152 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5ba3f43e 6153 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
3e170ce0
A
6154 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
6155 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
3e170ce0 6156
39236c6e 6157 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
3e170ce0
A
6158 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
6159 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
6160 mptcpci.mptcpci_flow_offset =
6161 offsetof(conninfo_mptcp_t, mptcpci_flows);
6162
fe8ab488
A
6163 len = sizeof(*flows) * mpte->mpte_numflows;
6164 if (mpte->mpte_numflows != 0) {
6165 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
6166 if (flows == NULL) {
cb323159 6167 socket_unlock(mpp->mpp_socket, 1);
fe8ab488
A
6168 break;
6169 }
6170 mptcpci.mptcpci_len = sizeof(mptcpci) +
6171 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
6172 error = SYSCTL_OUT(req, &mptcpci,
6173 sizeof(mptcpci) - sizeof(mptcp_flow_t));
6174 } else {
6175 mptcpci.mptcpci_len = sizeof(mptcpci);
3e170ce0 6176 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
39037602 6177 }
39236c6e 6178 if (error) {
cb323159 6179 socket_unlock(mpp->mpp_socket, 1);
39236c6e
A
6180 FREE(flows, M_TEMP);
6181 break;
6182 }
6183 f = 0;
6184 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
39236c6e 6185 so = mpts->mpts_socket;
39236c6e 6186 fill_mptcp_subflow(so, &flows[f], mpts);
39236c6e
A
6187 f++;
6188 }
cb323159 6189 socket_unlock(mpp->mpp_socket, 1);
fe8ab488
A
6190 if (flows) {
6191 error = SYSCTL_OUT(req, flows, len);
6192 FREE(flows, M_TEMP);
0a7de745 6193 if (error) {
fe8ab488 6194 break;
0a7de745 6195 }
fe8ab488 6196 }
39236c6e
A
6197 }
6198 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6199
0a7de745 6200 return error;
39236c6e
A
6201}
6202
6203SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
39037602 6204 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
39236c6e 6205 "List of active MPTCP connections");
fe8ab488 6206
fe8ab488
A
6207/*
6208 * Set notsent lowat mark on the MPTCB
6209 */
6210int
6211mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6212{
6213 struct mptcb *mp_tp = NULL;
6214 int error = 0;
6215
0a7de745 6216 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
fe8ab488 6217 mp_tp = mpte->mpte_mptcb;
0a7de745 6218 }
fe8ab488 6219
0a7de745 6220 if (mp_tp) {
fe8ab488 6221 mp_tp->mpt_notsent_lowat = optval;
0a7de745 6222 } else {
fe8ab488 6223 error = EINVAL;
0a7de745 6224 }
fe8ab488 6225
0a7de745 6226 return error;
fe8ab488
A
6227}
6228
6229u_int32_t
6230mptcp_get_notsent_lowat(struct mptses *mpte)
6231{
6232 struct mptcb *mp_tp = NULL;
6233
0a7de745 6234 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
fe8ab488 6235 mp_tp = mpte->mpte_mptcb;
0a7de745 6236 }
fe8ab488 6237
0a7de745
A
6238 if (mp_tp) {
6239 return mp_tp->mpt_notsent_lowat;
6240 } else {
6241 return 0;
6242 }
fe8ab488
A
6243}
6244
39037602 6245int
5ba3f43e
A
6246mptcp_notsent_lowat_check(struct socket *so)
6247{
fe8ab488
A
6248 struct mptses *mpte;
6249 struct mppcb *mpp;
6250 struct mptcb *mp_tp;
6251 struct mptsub *mpts;
6252
6253 int notsent = 0;
6254
5ba3f43e 6255 mpp = mpsotomppcb(so);
fe8ab488 6256 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
0a7de745 6257 return 0;
fe8ab488
A
6258 }
6259
6260 mpte = mptompte(mpp);
cb323159 6261 socket_lock_assert_owned(mptetoso(mpte));
fe8ab488
A
6262 mp_tp = mpte->mpte_mptcb;
6263
fe8ab488
A
6264 notsent = so->so_snd.sb_cc;
6265
6266 if ((notsent == 0) ||
6267 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
6268 mp_tp->mpt_notsent_lowat)) {
3e170ce0 6269 mptcplog((LOG_DEBUG, "MPTCP Sender: "
f427ee49 6270 "lowat %d notsent %d actual %llu \n",
3e170ce0
A
6271 mp_tp->mpt_notsent_lowat, notsent,
6272 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
0a7de745
A
6273 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6274 return 1;
fe8ab488 6275 }
fe8ab488
A
6276
6277 /* When Nagle's algorithm is not disabled, it is better
6278 * to wake up the client even before there is at least one
6279 * maxseg of data to write.
6280 */
6281 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6282 int retval = 0;
fe8ab488
A
6283 if (mpts->mpts_flags & MPTSF_ACTIVE) {
6284 struct socket *subf_so = mpts->mpts_socket;
fe8ab488 6285 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
39037602 6286
fe8ab488 6287 notsent = so->so_snd.sb_cc -
0a7de745 6288 (tp->snd_nxt - tp->snd_una);
39037602 6289
fe8ab488
A
6290 if ((tp->t_flags & TF_NODELAY) == 0 &&
6291 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
6292 retval = 1;
6293 }
3e170ce0 6294 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
fe8ab488 6295 " nodelay false \n",
3e170ce0 6296 mp_tp->mpt_notsent_lowat, notsent),
0a7de745
A
6297 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6298 return retval;
fe8ab488 6299 }
fe8ab488 6300 }
0a7de745 6301 return 0;
fe8ab488
A
6302}
6303
3e170ce0
A
6304static errno_t
6305mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
0a7de745 6306 void **unitinfo)
3e170ce0
A
6307{
6308#pragma unused(kctlref, sac, unitinfo)
5ba3f43e 6309
0a7de745 6310 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
cb323159 6311 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
0a7de745 6312 }
5ba3f43e
A
6313
6314 mptcp_kern_skt_unit = sac->sc_unit;
6315
0a7de745 6316 return 0;
5ba3f43e
A
6317}
6318
6319static void
cb323159 6320mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
5ba3f43e
A
6321{
6322 struct mppcb *mpp;
6323
6324 /* Iterate over all MPTCP connections */
6325
6326 lck_mtx_lock(&mtcbinfo.mppi_lock);
6327
6328 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
cb323159
A
6329 struct socket *mp_so = mpp->mpp_socket;
6330 struct mptses *mpte = mpp->mpp_pcbe;
5ba3f43e 6331
cb323159 6332 socket_lock(mp_so, 1);
5ba3f43e
A
6333
6334 if (mp_so->so_flags & SOF_DELEGATED &&
0a7de745 6335 uuid_compare(uuid, mp_so->e_uuid)) {
5ba3f43e 6336 goto next;
0a7de745
A
6337 } else if (!(mp_so->so_flags & SOF_DELEGATED) &&
6338 uuid_compare(uuid, mp_so->last_uuid)) {
5ba3f43e 6339 goto next;
0a7de745
A
6340 }
6341
cb323159
A
6342 os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
6343 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);
5ba3f43e
A
6344
6345 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
6346
cb323159
A
6347 if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
6348 mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
6349 }
6350
5ba3f43e
A
6351 mptcp_check_subflows_and_add(mpte);
6352 mptcp_remove_subflows(mpte);
6353
cb323159 6354 mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);
5ba3f43e
A
6355
6356next:
cb323159 6357 socket_unlock(mp_so, 1);
5ba3f43e
A
6358 }
6359
6360 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6361}
6362
6363static void
6364mptcp_wifi_status_changed(void)
6365{
6366 struct mppcb *mpp;
6367
6368 /* Iterate over all MPTCP connections */
6369
6370 lck_mtx_lock(&mtcbinfo.mppi_lock);
6371
6372 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
cb323159
A
6373 struct socket *mp_so = mpp->mpp_socket;
6374 struct mptses *mpte = mpp->mpp_pcbe;
5ba3f43e 6375
cb323159 6376 socket_lock(mp_so, 1);
5ba3f43e 6377
cb323159
A
6378 /* Only handover- and urgency-mode are purely driven by Symptoms' Wi-Fi status */
6379 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
c3c9b80d 6380 mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
cb323159 6381 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
5ba3f43e 6382 goto next;
0a7de745 6383 }
5ba3f43e
A
6384
6385 mptcp_check_subflows_and_add(mpte);
6386 mptcp_check_subflows_and_remove(mpte);
6387
6388next:
cb323159 6389 socket_unlock(mp_so, 1);
5ba3f43e
A
6390 }
6391
6392 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6393}
6394
c3c9b80d
A
6395struct mptcp_uuid_search_info {
6396 uuid_t target_uuid;
6397 proc_t found_proc;
6398 boolean_t is_proc_found;
6399};
6400
6401static int
6402mptcp_find_proc_filter(proc_t p, void *arg)
6403{
6404 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6405 int found;
6406
6407 if (info->is_proc_found) {
6408 return 0;
6409 }
6410
6411 /*
6412 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6413 * expects != 0 for a matching filter.
6414 */
6415 found = uuid_compare(p->p_uuid, info->target_uuid) == 0;
6416 if (found) {
6417 info->is_proc_found = true;
6418 }
6419
6420 return found;
6421}
6422
6423static int
6424mptcp_find_proc_callout(proc_t p, void * arg)
6425{
6426 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6427
6428 if (uuid_compare(p->p_uuid, info->target_uuid) == 0) {
6429 info->found_proc = p;
6430 return PROC_CLAIMED_DONE;
6431 }
6432
6433 return PROC_RETURNED;
6434}
6435
6436static proc_t
6437mptcp_find_proc(const uuid_t uuid)
6438{
6439 struct mptcp_uuid_search_info info;
6440
6441 uuid_copy(info.target_uuid, uuid);
6442 info.found_proc = PROC_NULL;
6443 info.is_proc_found = false;
6444
6445 proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6446 mptcp_find_proc_filter, &info);
6447
6448 return info.found_proc;
6449}
6450
5ba3f43e
A
6451void
6452mptcp_ask_symptoms(struct mptses *mpte)
6453{
6454 struct mptcp_symptoms_ask_uuid ask;
6455 struct socket *mp_so;
c3c9b80d 6456 struct proc *p = PROC_NULL;
5ba3f43e
A
6457 int pid, prio, err;
6458
6459 if (mptcp_kern_skt_unit == 0) {
cb323159
A
6460 os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
6461 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
5ba3f43e
A
6462 return;
6463 }
6464
6465 mp_so = mptetoso(mpte);
6466
0a7de745 6467 if (mp_so->so_flags & SOF_DELEGATED) {
c3c9b80d
A
6468 if (mpte->mpte_epid != 0) {
6469 p = proc_find(mpte->mpte_epid);
6470 if (p != PROC_NULL) {
6471 /* We found a pid, check its UUID */
6472 if (uuid_compare(mp_so->e_uuid, p->p_uuid)) {
6473 /* It's not the same - we need to look for the real proc */
6474 proc_rele(p);
6475 p = PROC_NULL;
6476 }
6477 }
6478 }
5ba3f43e 6479
c3c9b80d
A
6480 if (p == PROC_NULL) {
6481 p = mptcp_find_proc(mp_so->e_uuid);
6482 if (p == PROC_NULL) {
6483 uuid_string_t uuid_string;
6484 uuid_unparse(mp_so->e_uuid, uuid_string);
5ba3f43e 6485
c3c9b80d
A
6486 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
6487 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);
5ba3f43e 6488
c3c9b80d
A
6489 return;
6490 }
6491 mpte->mpte_epid = proc_pid(p);
6492 }
6493
6494 pid = mpte->mpte_epid;
5ba3f43e 6495 uuid_copy(ask.uuid, mp_so->e_uuid);
0a7de745 6496 } else {
c3c9b80d
A
6497 pid = mp_so->last_pid;
6498
6499 p = proc_find(pid);
6500 if (p == PROC_NULL) {
6501 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
6502 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
6503 return;
6504 }
6505
5ba3f43e 6506 uuid_copy(ask.uuid, mp_so->last_uuid);
0a7de745 6507 }
5ba3f43e 6508
c3c9b80d
A
6509
6510 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6511
5ba3f43e
A
6512 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
6513
cb323159
A
6514 if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
6515 prio == TASK_DARWINBG_APPLICATION) {
5ba3f43e 6516 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
0a7de745 6517 } else if (prio == TASK_FOREGROUND_APPLICATION) {
5ba3f43e 6518 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
0a7de745 6519 } else {
5ba3f43e 6520 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
0a7de745 6521 }
5ba3f43e 6522
5ba3f43e 6523 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
0a7de745 6524 &ask, sizeof(ask), CTL_DATA_EOR);
d9a64523 6525
cb323159
A
6526 os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
6527 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
d9a64523 6528
5ba3f43e
A
6529
6530 proc_rele(p);
3e170ce0
A
6531}
6532
6533static errno_t
6534mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
0a7de745 6535 void *unitinfo)
3e170ce0
A
6536{
6537#pragma unused(kctlref, kcunit, unitinfo)
5ba3f43e
A
6538
6539 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6540
0a7de745 6541 return 0;
3e170ce0
A
6542}
6543
6544static errno_t
6545mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
0a7de745 6546 mbuf_t m, int flags)
3e170ce0 6547{
5ba3f43e 6548#pragma unused(kctlref, unitinfo, flags)
0a7de745 6549 symptoms_advisory_t *sa = NULL;
3e170ce0 6550
0a7de745 6551 if (kcunit != mptcp_kern_skt_unit) {
cb323159 6552 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
0a7de745
A
6553 __func__, kcunit, mptcp_kern_skt_unit);
6554 }
5ba3f43e 6555
3e170ce0
A
6556 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6557 mbuf_freem(m);
0a7de745 6558 return EINVAL;
3e170ce0
A
6559 }
6560
d9a64523 6561 if (mbuf_len(m) < sizeof(*sa)) {
0a7de745
A
6562 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6563 __func__, mbuf_len(m), sizeof(*sa));
d9a64523 6564 mbuf_freem(m);
0a7de745 6565 return EINVAL;
d9a64523
A
6566 }
6567
6568 sa = mbuf_data(m);
3e170ce0 6569
cb323159
A
6570 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6571 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6572 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6573 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
3e170ce0 6574
cb323159 6575 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
3e170ce0 6576 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5ba3f43e 6577 mptcp_wifi_status_changed();
0a7de745 6578 }
cb323159
A
6579 } else {
6580 struct mptcp_symptoms_answer answer;
0a7de745 6581 errno_t err;
5ba3f43e 6582
cb323159
A
6583 /* We temporarily allow different sizes for ease of submission */
6584 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6585 mbuf_len(m) != sizeof(answer)) {
6586 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6587 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6588 sizeof(answer));
0a7de745
A
6589 mbuf_free(m);
6590 return EINVAL;
6591 }
5ba3f43e 6592
cb323159
A
6593 memset(&answer, 0, sizeof(answer));
6594
6595 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
0a7de745
A
6596 if (err) {
6597 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6598 mbuf_free(m);
6599 return err;
6600 }
5ba3f43e 6601
cb323159 6602 mptcp_allow_uuid(answer.uuid, answer.rssi);
3e170ce0 6603 }
5ba3f43e 6604
d9a64523 6605 mbuf_freem(m);
0a7de745 6606 return 0;
3e170ce0
A
6607}
6608
6609void
6610mptcp_control_register(void)
6611{
6612 /* Set up the advisory control socket */
6613 struct kern_ctl_reg mptcp_kern_ctl;
6614
6615 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6616 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6617 sizeof(mptcp_kern_ctl.ctl_name));
6618 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6619 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6620 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6621 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6622
6623 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6624}
6625
d9a64523
A
6626/*
6627 * Three return-values:
6628 * 1 : WiFi is bad
6629 * 0 : WiFi is good
cb323159 6630 * -1 : WiFi-state is unknown
d9a64523 6631 */
3e170ce0 6632int
cb323159 6633mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
3e170ce0 6634{
d9a64523 6635 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
f427ee49
A
6636 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6637 mptcp_advisory.sa_wifi_status) {
cb323159 6638 return symptoms_is_wifi_lossy() ? 1 : 0;
0a7de745 6639 }
d9a64523
A
6640
6641 /*
6642 * If it's a first-party app and we don't have any info
6643 * about the Wi-Fi state, let's be pessimistic.
6644 */
0a7de745 6645 return -1;
cb323159
A
6646 } else {
6647 if (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) {
6648 return 1;
6649 }
d9a64523 6650
cb323159
A
6651 /*
6652 * If we are target-based, we allow ourselves to be more lax on
6653 * the "unusable" target. We only *know* about the state once
6654 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6655 *
6656 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6657 * be set.
6658 *
6659 * In any other case (while in target-mode), consider WiFi bad
6660 * and we are going to ask for allowance from Symptoms anyway.
6661 */
6662 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6663 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6664 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6665 return 0;
6666 }
d9a64523 6667
cb323159
A
6668 return 1;
6669 }
d9a64523 6670
cb323159 6671 return 0;
0a7de745 6672 }
cb323159 6673}
d9a64523 6674
cb323159
A
6675boolean_t
6676symptoms_is_wifi_lossy(void)
6677{
6678 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
3e170ce0
A
6679}
6680
490019cf
A
6681 /* If TFO data is successfully acked, it must be dropped from the mptcp so */
6682static void
5ba3f43e 6683mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
490019cf 6684{
5ba3f43e 6685 struct socket *mp_so = mptetoso(mpte);
490019cf
A
6686 struct socket *so = mpts->mpts_socket;
6687 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
6688 struct mptcb *mp_tp = mpte->mpte_mptcb;
6689
6690 /* If data was sent with SYN, rewind state */
6691 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5ba3f43e 6692 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
490019cf 6693 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
5ba3f43e 6694
490019cf
A
6695 VERIFY(mp_droplen <= (UINT_MAX));
6696 VERIFY(mp_droplen >= tcp_droplen);
6697
5ba3f43e
A
6698 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
6699 mpts->mpts_iss += tcp_droplen;
6700 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
6701
490019cf
A
6702 if (mp_droplen > tcp_droplen) {
6703 /* handle partial TCP ack */
6704 mp_so->so_flags1 |= SOF1_TFO_REWIND;
6705 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
490019cf
A
6706 mp_droplen = tcp_droplen;
6707 } else {
6708 /* all data on SYN was acked */
6709 mpts->mpts_rel_seq = 1;
6710 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
490019cf
A
6711 }
6712 mp_tp->mpt_sndmax -= tcp_droplen;
6713
490019cf
A
6714 if (mp_droplen != 0) {
6715 VERIFY(mp_so->so_snd.sb_mb != NULL);
6716 sbdrop(&mp_so->so_snd, (int)mp_droplen);
6717 }
5ba3f43e
A
6718 }
6719}
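/*
 * Worked example (illustrative only): if 100 bytes are outstanding at the
 * MPTCP level (mp_droplen = 100) but the peer acked only 40 bytes of SYN
 * data on the subflow (tcp_droplen = 40), the partial-ack branch above
 * sets SOF1_TFO_REWIND, rewinds mpt_sndnxt to mpt_snduna + 60, reduces
 * mpt_sndmax by 40 and drops the 40 acked bytes from the MPTCP send buffer.
 */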
6720
6721int
6722mptcp_freeq(struct mptcb *mp_tp)
6723{
6724 struct tseg_qent *q;
6725 int rv = 0;
6726
6727 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6728 LIST_REMOVE(q, tqe_q);
6729 m_freem(q->tqe_m);
6730 zfree(tcp_reass_zone, q);
6731 rv = 1;
6732 }
6733 mp_tp->mpt_reassqlen = 0;
0a7de745 6734 return rv;
5ba3f43e
A
6735}
6736
6737static int
6738mptcp_post_event(u_int32_t event_code, int value)
6739{
6740 struct kev_mptcp_data event_data;
6741 struct kev_msg ev_msg;
6742
6743 memset(&ev_msg, 0, sizeof(ev_msg));
6744
0a7de745
A
6745 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6746 ev_msg.kev_class = KEV_NETWORK_CLASS;
6747 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6748 ev_msg.event_code = event_code;
5ba3f43e
A
6749
6750 event_data.value = value;
6751
0a7de745 6752 ev_msg.dv[0].data_ptr = &event_data;
5ba3f43e
A
6753 ev_msg.dv[0].data_length = sizeof(event_data);
6754
6755 return kev_post_msg(&ev_msg);
6756}
6757
cb323159
A
6758static void
6759mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
5ba3f43e 6760{
94ff46dc 6761 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5ba3f43e
A
6762 int error;
6763
6764 /* First-party apps (Siri) don't flip the cellicon */
0a7de745 6765 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
5ba3f43e 6766 return;
0a7de745 6767 }
5ba3f43e 6768
cb323159
A
6769 /* Subflow is disappearing - don't set it on this one */
6770 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
6771 return;
6772 }
6773
94ff46dc
A
6774 /* Fallen back connections are not triggering the cellicon */
6775 if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
6776 return;
6777 }
6778
cb323159
A
6779 /* Remember the last time we set the cellicon. Needed for debouncing */
6780 mpte->mpte_last_cellicon_set = tcp_now;
6781
94ff46dc
A
6782 tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
6783 tcp_sched_timers(tp);
6784
cb323159
A
6785 if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
6786 mpte->mpte_cellicon_increments != 0) {
6787 if (mptcp_cellicon_refcount == 0) {
6788 os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
6789 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6790
6791 /* Continue, so that the icon gets set... */
6792 } else {
6793 /*
6794 * In this case, the cellicon is already set. No need to bump it
6795 * even higher
6796 */
6797
6798 return;
6799 }
6800 }
6801
6802 /* When tearing down this subflow, we need to decrement the
6803 * reference counter
6804 */
6805 mpts->mpts_flags |= MPTSF_CELLICON_SET;
6806
6807 /* Bump this counter so that, when a session gets destroyed, we decrement
6808 * the reference counter by whatever is left
6809 */
6810 mpte->mpte_cellicon_increments++;
5ba3f43e 6811
cb323159
A
6812 if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
6813 /* If cellicon is already set, get out of here! */
5ba3f43e 6814 return;
0a7de745 6815 }
5ba3f43e
A
6816
6817 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6818
0a7de745 6819 if (error) {
cb323159
A
6820 os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
6821 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
0a7de745 6822 } else {
cb323159
A
6823 os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
6824 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
0a7de745 6825 }
5ba3f43e
A
6826}
6827
6828void
cb323159 6829mptcp_clear_cellicon(void)
5ba3f43e 6830{
cb323159
A
6831 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6832
6833 if (error) {
6834 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6835 __func__, error);
6836 } else {
6837 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6838 __func__);
6839 }
6840}
6841
6842/*
6843 * Returns true if the icon has been flipped to WiFi.
6844 */
6845static boolean_t
f427ee49 6846__mptcp_unset_cellicon(uint32_t val)
cb323159 6847{
f427ee49
A
6848 VERIFY(val < INT32_MAX);
6849 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
cb323159
A
6850 return false;
6851 }
6852
6853 mptcp_clear_cellicon();
6854
6855 return true;
6856}
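/*
 * Note on the atomics used here: OSIncrementAtomic() and OSAddAtomic()
 * return the value the counter held before the update. The
 * KEV_MPTCP_CELLUSE event is therefore only posted on the 0 -> 1
 * transition in mptcp_set_cellicon() and when the count drops back from 1
 * in __mptcp_unset_cellicon(); intermediate reference-count changes stay
 * silent.
 */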
5ba3f43e 6857
94ff46dc
A
6858void
6859mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
cb323159
A
6860{
6861 /* First-party apps (Siri) don't flip the cellicon */
6862 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
5ba3f43e 6863 return;
0a7de745 6864 }
5ba3f43e 6865
cb323159
A
6866 if (mpte->mpte_cellicon_increments == 0) {
6867 /* This flow never used cell - get out of here! */
5ba3f43e 6868 return;
490019cf 6869 }
5ba3f43e 6870
cb323159
A
6871 if (mptcp_cellicon_refcount == 0) {
6872 os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
6873 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
5ba3f43e 6874
cb323159
A
6875 return;
6876 }
6877
6878 if (mpts) {
6879 if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
6880 return;
6881 }
6882
6883 mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
6884 }
6885
94ff46dc
A
6886 if (mpte->mpte_cellicon_increments < val) {
6887 os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
6888 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
6889 val = mpte->mpte_cellicon_increments;
6890 }
6891
6892 mpte->mpte_cellicon_increments -= val;
cb323159
A
6893
6894 if (__mptcp_unset_cellicon(val) == false) {
6895 return;
6896 }
6897
6898 /* All flows are gone - our counter should be at zero too! */
6899 if (mpte->mpte_cellicon_increments != 0) {
6900 os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
6901 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
0a7de745 6902 }
5ba3f43e
A
6903}
6904
6905void
6906mptcp_reset_rexmit_state(struct tcpcb *tp)
6907{
6908 struct mptsub *mpts;
6909 struct inpcb *inp;
6910 struct socket *so;
6911
6912 inp = tp->t_inpcb;
0a7de745 6913 if (inp == NULL) {
5ba3f43e 6914 return;
0a7de745 6915 }
5ba3f43e
A
6916
6917 so = inp->inp_socket;
0a7de745 6918 if (so == NULL) {
5ba3f43e 6919 return;
0a7de745 6920 }
5ba3f43e 6921
0a7de745 6922 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
5ba3f43e 6923 return;
0a7de745 6924 }
5ba3f43e
A
6925
6926 mpts = tp->t_mpsub;
6927
6928 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6929 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6930}
6931
6932void
6933mptcp_reset_keepalive(struct tcpcb *tp)
6934{
6935 struct mptsub *mpts = tp->t_mpsub;
6936
6937 mpts->mpts_flags &= ~MPTSF_READ_STALL;
490019cf 6938}