bsd/netinet/mptcp_subr.c (apple/xnu, xnu-6153.121.1)
1 /*
2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_fsm.h>
59 #include <netinet/tcp_seq.h>
60 #include <netinet/tcp_var.h>
61 #include <netinet/mptcp_var.h>
62 #include <netinet/mptcp.h>
63 #include <netinet/mptcp_opt.h>
64 #include <netinet/mptcp_seq.h>
65 #include <netinet/mptcp_timer.h>
66 #include <libkern/crypto/sha1.h>
67 #if INET6
68 #include <netinet6/in6_pcb.h>
69 #include <netinet6/ip6protosw.h>
70 #endif /* INET6 */
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to use the
104 * lock of the MPTCP socket instead.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
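/*
 * Illustrative userspace sketch (not part of this file, compiled out): how
 * a client would open an MPTCP socket as described above.  A PF_MULTIPATH
 * socket carries no address at socket(2) time; the first subflow and the
 * MP_CAPABLE handshake are started via connectx(2).  Error paths elided.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static int
mptcp_client_sketch(const struct sockaddr *dst, socklen_t dstlen)
{
	sa_endpoints_t eps = {
		.sae_dstaddr = dst,
		.sae_dstaddrlen = dstlen,
	};
	sae_connid_t cid = SAE_CONNID_ANY;
	int fd;

	/* Triggers mptcp_session_create() through the PF_MULTIPATH domain */
	fd = socket(AF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
	if (fd < 0) {
		return -1;
	}

	if (connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif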
111
112 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
113 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
114
115 static uint32_t mptcp_gc(struct mppcbinfo *);
116 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
118 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
120 static void mptcp_subflow_wupcall(struct socket *, void *, int);
121 static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
122 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
123 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
124
125 static void mptcp_subflow_abort(struct mptsub *, int);
126
127 static void mptcp_send_dfin(struct socket *so);
128 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
129 static int mptcp_freeq(struct mptcb *mp_tp);
130
131 /*
132 * Possible return values for subflow event handlers. Note that success
133 * values must be greater than or equal to MPTS_EVRET_OK. Values less than
134 * that indicate errors or actions which require immediate attention; they
135 * will prevent the rest of the handlers from processing their respective
136 * events until the next round of event processing.
137 */
138 typedef enum {
139 MPTS_EVRET_DELETE = 1, /* delete this subflow */
140 MPTS_EVRET_OK = 2, /* OK */
141 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pending connects */
142 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
143 } ev_ret_t;
144
145 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
146 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
147 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
148 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
149 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
150 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
151 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
152 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
153 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
154 static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
155 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
156 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
157
158 static void mptcp_do_sha1(mptcp_key_t *, char *);
159 static void mptcp_init_local_parms(struct mptses *);
160
161 static unsigned int mptsub_zone_size; /* size of mptsub */
162 static struct zone *mptsub_zone; /* zone for mptsub */
163
164 static unsigned int mptopt_zone_size; /* size of mptopt */
165 static struct zone *mptopt_zone; /* zone for mptopt */
166
167 static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
168 static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
169
170 struct mppcbinfo mtcbinfo;
171
172 SYSCTL_DECL(_net_inet);
173
174 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
175
176 uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
177 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
178 &mptcp_dbg_area, 0, "MPTCP debug area");
179
180 uint32_t mptcp_dbg_level = 1;
181 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
182 &mptcp_dbg_level, 0, "MPTCP debug level");
183
184 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
185 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
186
187
188 static int mptcp_alternate_port = 0;
189 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
190 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
191
192 static struct protosw mptcp_subflow_protosw;
193 static struct pr_usrreqs mptcp_subflow_usrreqs;
194 #if INET6
195 static struct ip6protosw mptcp_subflow_protosw6;
196 static struct pr_usrreqs mptcp_subflow_usrreqs6;
197 #endif /* INET6 */
198
199 static uint8_t mptcp_create_subflows_scheduled;
200
201 typedef struct mptcp_subflow_event_entry {
202 uint64_t sofilt_hint_mask;
203 ev_ret_t (*sofilt_hint_ev_hdlr)(
204 struct mptses *mpte,
205 struct mptsub *mpts,
206 uint64_t *p_mpsofilt_hint,
207 uint64_t event);
208 } mptsub_ev_entry_t;
209
210 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
211 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
212 static uint32_t mptcp_kern_skt_inuse = 0;
213 static uint32_t mptcp_kern_skt_unit;
214 static symptoms_advisory_t mptcp_advisory;
215
216 uint32_t mptcp_cellicon_refcount = 0;
217
218 /*
219 * XXX The order of the event handlers below is really
220 * really important. Think twice before changing it.
221 */
222 static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
223 {
224 .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
225 .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
226 },
227 {
228 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
229 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
230 },
231 {
232 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
233 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
234 },
235 {
236 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
237 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
238 },
239 {
240 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
241 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
242 },
243 {
244 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
245 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
246 },
247 {
248 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
249 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
250 },
251 {
252 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
253 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
254 },
255 {
256 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
257 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
258 },
259 {
260 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
261 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
262 },
263 {
264 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
265 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
266 },
267 {
268 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
269 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
270 },
271 {
272 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
273 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
274 },
275 {
276 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
277 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
278 },
279 };
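/*
 * Minimal sketch (compiled out) of how the table above is meant to be
 * consumed; the real dispatch lives in mptcp_subflow_events() later in this
 * file and handles more corner cases.  Handlers run in table order, and a
 * return value below MPTS_EVRET_OK stops processing for this round, per the
 * comment above the ev_ret_t definition.
 */
#if 0
static ev_ret_t
mptcp_dispatch_sketch(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *events, uint64_t *p_mpsofilt_hint)
{
	size_t i;

	for (i = 0; i < sizeof(mpsub_ev_entry_tbl) / sizeof(mpsub_ev_entry_tbl[0]); i++) {
		const mptsub_ev_entry_t *e = &mpsub_ev_entry_tbl[i];
		ev_ret_t ret;

		if (!(*events & e->sofilt_hint_mask)) {
			continue;
		}
		ret = e->sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint,
		    e->sofilt_hint_mask);
		*events &= ~e->sofilt_hint_mask;
		if (ret < MPTS_EVRET_OK) {
			return ret;	/* needs immediate attention */
		}
	}
	return MPTS_EVRET_OK;
}
#endif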
280
281 os_log_t mptcp_log_handle;
282
283 /*
284 * Protocol pr_init callback.
285 */
286 void
287 mptcp_init(struct protosw *pp, struct domain *dp)
288 {
289 #pragma unused(dp)
290 static int mptcp_initialized = 0;
291 struct protosw *prp;
292 #if INET6
293 struct ip6protosw *prp6;
294 #endif /* INET6 */
295
296 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
297
298 /* do this only once */
299 if (mptcp_initialized) {
300 return;
301 }
302 mptcp_initialized = 1;
303
304 mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;
305
306 /*
307 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
308 * we must be able to find IPPROTO_TCP entries for both.
309 */
310 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
311 VERIFY(prp != NULL);
312 bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
313 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
314 sizeof(mptcp_subflow_usrreqs));
315 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
316 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
317 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
318 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
319 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
320 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
321 /*
322 * Socket filters shouldn't attach/detach to/from this protosw
323 * since pr_protosw is to be used instead, which points to the
324 * real protocol; if they do, it is a bug and we should panic.
325 */
326 mptcp_subflow_protosw.pr_filter_head.tqh_first =
327 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
328 mptcp_subflow_protosw.pr_filter_head.tqh_last =
329 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
330
331 #if INET6
332 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
333 IPPROTO_TCP, SOCK_STREAM);
334 VERIFY(prp6 != NULL);
335 bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
336 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
337 sizeof(mptcp_subflow_usrreqs6));
338 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
339 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
340 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
341 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
342 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
343 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
344 /*
345 * Socket filters shouldn't attach/detach to/from this protosw
346 * since pr_protosw is to be used instead, which points to the
347 * real protocol; if they do, it is a bug and we should panic.
348 */
349 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
350 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
351 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
352 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
353 #endif /* INET6 */
354
355 bzero(&mtcbinfo, sizeof(mtcbinfo));
356 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
357 mtcbinfo.mppi_size = sizeof(struct mpp_mtp);
358 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
359 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
360 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
361 /* NOTREACHED */
362 }
363 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
364 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
365
366 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
367 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
368 mtcbinfo.mppi_lock_grp_attr);
369 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
370 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
371 mtcbinfo.mppi_lock_attr);
372
373 mtcbinfo.mppi_gc = mptcp_gc;
374 mtcbinfo.mppi_timer = mptcp_timer;
375
376 /* attach to MP domain for garbage collection to take place */
377 mp_pcbinfo_attach(&mtcbinfo);
378
379 mptsub_zone_size = sizeof(struct mptsub);
380 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
381 8192, "mptsub")) == NULL) {
382 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
383 /* NOTREACHED */
384 }
385 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
386 zone_change(mptsub_zone, Z_EXPAND, TRUE);
387
388 mptopt_zone_size = sizeof(struct mptopt);
389 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
390 1024, "mptopt")) == NULL) {
391 panic("%s: unable to allocate MPTCP option zone\n", __func__);
392 /* NOTREACHED */
393 }
394 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
395 zone_change(mptopt_zone, Z_EXPAND, TRUE);
396
397 mpt_subauth_entry_size = sizeof(struct mptcp_subf_auth_entry);
398 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
399 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
400 panic("%s: unable to allocate MPTCP address auth zone \n",
401 __func__);
402 /* NOTREACHED */
403 }
404 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
405 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
406
407 mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
408 }
409
410 int
411 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, int ifindex, boolean_t create)
412 {
413 int i, index = -1;
414
415 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
416 if (create && stats[i].ifindex == IFSCOPE_NONE) {
417 if (index < 0) {
418 index = i;
419 }
420 continue;
421 }
422
423 if (stats[i].ifindex == ifindex) {
424 index = i;
425 return index;
426 }
427 }
428
429 if (index != -1) {
430 stats[index].ifindex = ifindex;
431 }
432
433 return index;
434 }
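/*
 * Usage sketch (compiled out) for the lookup above: with create == true the
 * first free slot (ifindex == IFSCOPE_NONE) is claimed when the interface
 * has no entry yet; with create == false the call is a pure lookup that
 * returns -1 on a miss.  A hypothetical caller accounting received bytes:
 */
#if 0
static void
mptcpstats_add_rxbytes_sketch(struct mptcp_itf_stats *stats,
    const struct ifnet *ifp, uint64_t len)
{
	int index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);

	if (index != -1) {
		stats[index].mpis_rxbytes += len;
	}
}
#endif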
435
436 static int
437 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
438 {
439 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
440 int index;
441
442 if (ifp == NULL) {
443 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
444 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
445 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
446 return -1;
447 }
448
449 index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
450
451 if (index != -1) {
452 if (stats[index].is_expensive == 0) {
453 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
454 }
455 }
456
457 return index;
458 }
459
460 void
461 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
462 {
463 int index;
464
465 tcpstat.tcps_mp_switches++;
466 mpte->mpte_subflow_switches++;
467
468 index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
469
470 if (index != -1) {
471 mpte->mpte_itfstats[index].switches++;
472 }
473 }
474
475 /*
476 * Flushes all recorded socket options from an MP socket.
477 */
478 static void
479 mptcp_flush_sopts(struct mptses *mpte)
480 {
481 struct mptopt *mpo, *tmpo;
482
483 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
484 mptcp_sopt_remove(mpte, mpo);
485 mptcp_sopt_free(mpo);
486 }
487 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
488 }
489
490 /*
491 * Create an MPTCP session, called as a result of opening an MPTCP socket.
492 */
493 int
494 mptcp_session_create(struct mppcb *mpp)
495 {
496 struct mppcbinfo *mppi;
497 struct mptses *mpte;
498 struct mptcb *mp_tp;
499
500 VERIFY(mpp != NULL);
501 mppi = mpp->mpp_pcbinfo;
502 VERIFY(mppi != NULL);
503
504 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
505 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
506
507 /* MPTCP Multipath PCB Extension */
508 bzero(mpte, sizeof(*mpte));
509 VERIFY(mpp->mpp_pcbe == NULL);
510 mpp->mpp_pcbe = mpte;
511 mpte->mpte_mppcb = mpp;
512 mpte->mpte_mptcb = mp_tp;
513
514 TAILQ_INIT(&mpte->mpte_sopts);
515 TAILQ_INIT(&mpte->mpte_subflows);
516 mpte->mpte_associd = SAE_ASSOCID_ANY;
517 mpte->mpte_connid_last = SAE_CONNID_ANY;
518
519 mptcp_init_urgency_timer(mpte);
520
521 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
522 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
523
524 if (mptcp_alternate_port) {
525 mpte->mpte_alternate_port = htons(mptcp_alternate_port);
526 }
527
528 mpte->mpte_last_cellicon_set = tcp_now;
529
530 /* MPTCP Protocol Control Block */
531 bzero(mp_tp, sizeof(*mp_tp));
532 mp_tp->mpt_mpte = mpte;
533 mp_tp->mpt_state = MPTCPS_CLOSED;
534
535 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
536
537 return 0;
538 }
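/*
 * For orientation, the single allocation referred to above (mppi_size ==
 * sizeof(struct mpp_mtp), see the zinit() call in mptcp_init()) lays out the
 * three structures back to back, roughly as follows (declared in
 * mptcp_var.h; shown here only as a sketch):
 *
 *	struct mpp_mtp {
 *		struct mppcb	mpp;		// Multipath PCB
 *		struct mptses	mpp_ses;	// MPTCP Session (mpte)
 *		struct mptcb	mtcb;		// MPTCP PCB (mp_tp)
 *	};
 *
 * mptcp_session_create() wires up the mutual pointers: mpp->mpp_pcbe,
 * mpte->mpte_mppcb/mpte_mptcb and mp_tp->mpt_mpte.
 */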
539
540 struct sockaddr *
541 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
542 {
543 if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) {
544 return &mpte->mpte_dst;
545 }
546
547 if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
548 return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
549 }
550
551 if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
552 return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
553 }
554
555 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
556 * meaning we prefer IPv6 over IPv4.
557 */
558 if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
559 return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
560 }
561
562 if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
563 return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
564 }
565
566 /* We don't yet have a unicast IP */
567 return NULL;
568 }
569
570 static void
571 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
572 uint64_t *cellbytes, uint64_t *allbytes)
573 {
574 int64_t mycellbytes = 0;
575 uint64_t myallbytes = 0;
576 int i;
577
578 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
579 if (mpte->mpte_itfstats[i].is_expensive) {
580 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
581 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
582 }
583
584 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
585 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
586 }
587
588 if (initial_cell) {
589 mycellbytes -= mpte->mpte_init_txbytes;
590 mycellbytes -= mpte->mpte_init_rxbytes;
591 }
592
593 if (mycellbytes < 0) {
594 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
595 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
596 *cellbytes = 0;
597 *allbytes = 0;
598 } else {
599 *cellbytes = mycellbytes;
600 *allbytes = myallbytes;
601 }
602 }
603
604 static void
605 mptcpstats_session_wrapup(struct mptses *mpte)
606 {
607 boolean_t cell = mpte->mpte_initial_cell;
608
609 switch (mpte->mpte_svctype) {
610 case MPTCP_SVCTYPE_HANDOVER:
611 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
612 tcpstat.tcps_mptcp_fp_handover_attempt++;
613
614 if (cell && mpte->mpte_handshake_success) {
615 tcpstat.tcps_mptcp_fp_handover_success_cell++;
616
617 if (mpte->mpte_used_wifi) {
618 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
619 }
620 } else if (mpte->mpte_handshake_success) {
621 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
622
623 if (mpte->mpte_used_cell) {
624 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
625 }
626 }
627 } else {
628 tcpstat.tcps_mptcp_handover_attempt++;
629
630 if (cell && mpte->mpte_handshake_success) {
631 tcpstat.tcps_mptcp_handover_success_cell++;
632
633 if (mpte->mpte_used_wifi) {
634 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
635 }
636 } else if (mpte->mpte_handshake_success) {
637 tcpstat.tcps_mptcp_handover_success_wifi++;
638
639 if (mpte->mpte_used_cell) {
640 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
641 }
642 }
643 }
644
645 if (mpte->mpte_handshake_success) {
646 uint64_t cellbytes;
647 uint64_t allbytes;
648
649 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
650
651 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
652 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
653 }
654 break;
655 case MPTCP_SVCTYPE_INTERACTIVE:
656 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
657 tcpstat.tcps_mptcp_fp_interactive_attempt++;
658
659 if (mpte->mpte_handshake_success) {
660 tcpstat.tcps_mptcp_fp_interactive_success++;
661
662 if (!cell && mpte->mpte_used_cell) {
663 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
664 }
665 }
666 } else {
667 tcpstat.tcps_mptcp_interactive_attempt++;
668
669 if (mpte->mpte_handshake_success) {
670 tcpstat.tcps_mptcp_interactive_success++;
671
672 if (!cell && mpte->mpte_used_cell) {
673 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
674 }
675 }
676 }
677
678 if (mpte->mpte_handshake_success) {
679 uint64_t cellbytes;
680 uint64_t allbytes;
681
682 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
683
684 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
685 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
686 }
687 break;
688 case MPTCP_SVCTYPE_AGGREGATE:
689 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
690 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
691
692 if (mpte->mpte_handshake_success) {
693 tcpstat.tcps_mptcp_fp_aggregate_success++;
694 }
695 } else {
696 tcpstat.tcps_mptcp_aggregate_attempt++;
697
698 if (mpte->mpte_handshake_success) {
699 tcpstat.tcps_mptcp_aggregate_success++;
700 }
701 }
702
703 if (mpte->mpte_handshake_success) {
704 uint64_t cellbytes;
705 uint64_t allbytes;
706
707 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
708
709 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
710 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
711 }
712 break;
713 }
714
715 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
716 tcpstat.tcps_mptcp_back_to_wifi++;
717 }
718
719 if (mpte->mpte_triggered_cell) {
720 tcpstat.tcps_mptcp_triggered_cell++;
721 }
722 }
723
724 /*
725 * Destroy an MPTCP session.
726 */
727 static void
728 mptcp_session_destroy(struct mptses *mpte)
729 {
730 struct mptcb *mp_tp = mpte->mpte_mptcb;
731
732 VERIFY(mp_tp != NULL);
733 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
734
735 mptcpstats_session_wrapup(mpte);
736 mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
737 mptcp_flush_sopts(mpte);
738
739 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
740 _FREE(mpte->mpte_itfinfo, M_TEMP);
741 }
742 mpte->mpte_itfinfo = NULL;
743
744 m_freem_list(mpte->mpte_reinjectq);
745
746 os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
747 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
748 }
749
750 boolean_t
751 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
752 {
753 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
754 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
755 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
756 }
757
758 static int
759 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
760 const struct in_addr *addrv4)
761 {
762 static const struct in6_addr well_known_prefix = {
763 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
764 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
765 0x00, 0x00, 0x00, 0x00},
766 };
767 const char *ptrv4 = (const char *)addrv4;
768 char buf[MAX_IPv6_STR_LEN];
769 char *ptr = (char *)addr;
770
771 if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
772 IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
773 IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
774 IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
775 IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
776 IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
777 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
778 return -1;
779 }
780
781 /* Check for the well-known prefix */
782 if (len == NAT64_PREFIX_LEN_96 &&
783 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
784 if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
785 IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
786 return -1;
787 }
788 }
789
790 switch (len) {
791 case NAT64_PREFIX_LEN_96:
792 memcpy(ptr + 12, ptrv4, 4);
793 break;
794 case NAT64_PREFIX_LEN_64:
795 memcpy(ptr + 9, ptrv4, 4);
796 break;
797 case NAT64_PREFIX_LEN_56:
798 memcpy(ptr + 7, ptrv4, 1);
799 memcpy(ptr + 9, ptrv4 + 1, 3);
800 break;
801 case NAT64_PREFIX_LEN_48:
802 memcpy(ptr + 6, ptrv4, 2);
803 memcpy(ptr + 9, ptrv4 + 2, 2);
804 break;
805 case NAT64_PREFIX_LEN_40:
806 memcpy(ptr + 5, ptrv4, 3);
807 memcpy(ptr + 9, ptrv4 + 3, 1);
808 break;
809 case NAT64_PREFIX_LEN_32:
810 memcpy(ptr + 4, ptrv4, 4);
811 break;
812 default:
813 panic("NAT64-prefix len is wrong: %u\n", len);
814 }
815
816 os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
817 __func__, len,
818 inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));
819
820 return 0;
821 }
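/*
 * Worked example for the /96 case above (RFC 6052 address synthesis):
 * embedding 192.0.2.33 (bytes 0xc0 0x00 0x02 0x21) into the well-known
 * prefix 64:ff9b::/96 copies the four IPv4 bytes to offset 12, yielding
 * 64:ff9b::c000:221.  For the shorter prefix lengths the copies resume at
 * ptr + 9 because RFC 6052 reserves the octet at bits 64-71 (offset 8),
 * which must remain zero.
 */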
822
823 static void
824 mptcp_trigger_cell_bringup(struct mptses *mpte)
825 {
826 struct socket *mp_so = mptetoso(mpte);
827
828 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
829 uuid_string_t uuidstr;
830 int err;
831
832 socket_unlock(mp_so, 0);
833 err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
834 TRUE);
835 socket_lock(mp_so, 0);
836
837 if (err == 0) {
838 mpte->mpte_triggered_cell = 1;
839 }
840
841 uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
842 os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
843 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
844 } else {
845 os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
846 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
847 }
848 }
849
850 static boolean_t
851 mptcp_subflow_disconnecting(struct mptsub *mpts)
852 {
853 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
854 return true;
855 }
856
857 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
858 return true;
859 }
860
861 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
862 return true;
863 }
864
865 return false;
866 }
867
868 void
869 mptcp_check_subflows_and_add(struct mptses *mpte)
870 {
871 struct mptcb *mp_tp = mpte->mpte_mptcb;
872 boolean_t cellular_viable = FALSE;
873 boolean_t want_cellular = TRUE;
874 uint32_t i;
875
876 if (!mptcp_ok_to_create_subflows(mp_tp)) {
877 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
878 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
879 return;
880 }
881
882 if (mptcp_get_session_dst(mpte, false, false) == NULL) {
883 return;
884 }
885
886 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
887 boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
888 struct mpt_itf_info *info;
889 struct sockaddr_in6 nat64pre;
890 struct sockaddr *dst;
891 struct mptsub *mpts;
892 struct ifnet *ifp;
893 uint32_t ifindex;
894
895 info = &mpte->mpte_itfinfo[i];
896
897 ifindex = info->ifindex;
898 if (ifindex == IFSCOPE_NONE) {
899 continue;
900 }
901
902 os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
903 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
904 info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);
905
906 if (info->no_mptcp_support) {
907 continue;
908 }
909
910 ifnet_head_lock_shared();
911 ifp = ifindex2ifnet[ifindex];
912 ifnet_head_done();
913
914 if (ifp == NULL) {
915 continue;
916 }
917
918 if (IFNET_IS_CELLULAR(ifp)) {
919 cellular_viable = TRUE;
920 }
921
922 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
923 const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
924 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
925
926 if (subifp == NULL) {
927 continue;
928 }
929
930 /*
931 * If there is at least one functioning subflow on WiFi
932 * and we are checking for the cell interface, then
933 * we always need to ask symptoms for permission as
934 * cell is triggered even if WiFi is available.
935 */
936 if (!IFNET_IS_CELLULAR(subifp) &&
937 !mptcp_subflow_disconnecting(mpts) &&
938 IFNET_IS_CELLULAR(ifp)) {
939 need_to_ask_symptoms = TRUE;
940 }
941
942 /*
943 * In Handover mode, only create cell subflow if
944 * 1. Wi-Fi Assist is active
945 * 2. Symptoms marked WiFi as weak
946 * 3. We are experiencing RTOs or we are not sending data.
947 *
948 * This covers the scenario, where:
949 * 1. We send and get retransmission timeouts (thus,
950 * we confirmed that WiFi is indeed bad).
951 * 2. We are not sending and the server tries to send.
952 * Establishing a cell-subflow gives the server a
953 * chance to send us some data over cell if WiFi
954 * is dead. We establish the subflow with the
955 * backup-bit set, so the server is not allowed to
956 * send on this subflow as long as WiFi is providing
957 * good performance.
958 */
959 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
960 !IFNET_IS_CELLULAR(subifp) &&
961 !mptcp_subflow_disconnecting(mpts) &&
962 (mptcp_is_wifi_unusable_for_session(mpte) == 0 ||
963 (tp->t_rxtshift < mptcp_fail_thresh * 2 && mptetoso(mpte)->so_snd.sb_cc))) {
964 os_log_debug(mptcp_log_handle,
965 "%s - %lx: handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
966 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
967 mptcp_is_wifi_unusable_for_session(mpte),
968 tp->t_rxtshift,
969 !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
970 mptetoso(mpte)->so_snd.sb_cc,
971 ifindex, subifp->if_index,
972 tp->t_srtt >> TCP_RTT_SHIFT,
973 tp->t_rttvar >> TCP_RTTVAR_SHIFT,
974 tp->t_rxtcur);
975 found = TRUE;
976
977 /* We found a proper subflow on WiFi - no need for cell */
978 want_cellular = FALSE;
979 break;
980 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
981 uint64_t time_now = mach_continuous_time();
982
983 os_log(mptcp_log_handle,
984 "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
985 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
986 time_now, mptcp_is_wifi_unusable_for_session(mpte),
987 IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
988 mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);
989
990 if (!IFNET_IS_CELLULAR(subifp) &&
991 !mptcp_subflow_disconnecting(mpts) &&
992 (mpte->mpte_time_target == 0 ||
993 (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
994 !mptcp_is_wifi_unusable_for_session(mpte))) {
995 found = TRUE;
996
997 want_cellular = FALSE;
998 break;
999 }
1000 } else {
1001 os_log_debug(mptcp_log_handle,
1002 "%s - %lx: svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u rtt %u rttvar %u rto %u\n",
1003 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1004 mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
1005 mptcp_is_wifi_unusable_for_session(mpte), tp->t_rxtshift,
1006 !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc,
1007 tp->t_srtt >> TCP_RTT_SHIFT,
1008 tp->t_rttvar >> TCP_RTTVAR_SHIFT,
1009 tp->t_rxtcur);
1010 }
1011
1012 if (subifp->if_index == ifindex &&
1013 !mptcp_subflow_disconnecting(mpts)) {
1014 /*
1015 * We found a subflow on this interface.
1016 * No need to create a new one.
1017 */
1018 found = TRUE;
1019 break;
1020 }
1021 }
1022
1023 if (found) {
1024 continue;
1025 }
1026
1027 if (need_to_ask_symptoms &&
1028 !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
1029 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
1030 mptcp_developer_mode == 0) {
1031 mptcp_ask_symptoms(mpte);
1032 return;
1033 }
1034
1035 dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);
1036
1037 if (dst->sa_family == AF_INET &&
1038 !info->has_v4_conn && info->has_nat64_conn) {
1039 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
1040 int error, j;
1041
1042 bzero(&nat64pre, sizeof(struct sockaddr_in6));
1043
1044 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
1045 if (error) {
1046 os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
1047 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
1048 continue;
1049 }
1050
1051 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
1052 if (nat64prefixes[j].prefix_len != 0) {
1053 break;
1054 }
1055 }
1056
1057 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
1058
1059 error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
1060 nat64prefixes[j].prefix_len,
1061 &((struct sockaddr_in *)(void *)dst)->sin_addr);
1062 if (error != 0) {
1063 os_log_info(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
1064 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1065 continue;
1066 }
1067
1068 memcpy(&nat64pre.sin6_addr,
1069 &nat64prefixes[j].ipv6_prefix,
1070 sizeof(nat64pre.sin6_addr));
1071 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
1072 nat64pre.sin6_family = AF_INET6;
1073 nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
1074 nat64pre.sin6_flowinfo = 0;
1075 nat64pre.sin6_scope_id = 0;
1076
1077 dst = (struct sockaddr *)&nat64pre;
1078 }
1079
1080 /* Initial subflow started on a NAT64'd address? */
1081 if (!(mpte->mpte_flags & MPTE_UNICAST_IP) &&
1082 mpte->mpte_dst.sa_family == AF_INET6 &&
1083 mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
1084 dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
1085 }
1086
1087 if (dst->sa_family == AF_INET && !info->has_v4_conn) {
1088 continue;
1089 }
1090 if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
1091 continue;
1092 }
1093
1094 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
1095 }
1096
1097 if (!cellular_viable && want_cellular) {
1098 /* Trigger Cell Bringup */
1099 mptcp_trigger_cell_bringup(mpte);
1100 }
1101 }
1102
1103 static void
1104 mptcp_remove_cell_subflows(struct mptses *mpte)
1105 {
1106 struct mptsub *mpts, *tmpts;
1107 boolean_t found = false;
1108
1109 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1110 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1111
1112 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1113 continue;
1114 }
1115
1116 /* We have a functioning subflow on WiFi. No need for cell! */
1117 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1118 !mptcp_subflow_disconnecting(mpts)) {
1119 found = true;
1120 }
1121 }
1122
1123 /* Didn't find a functional subflow on WiFi - stay on cell */
1124 if (!found) {
1125 return;
1126 }
1127
1128 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1129 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1130
1131 /* Only remove cellular subflows */
1132 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
1133 continue;
1134 }
1135
1136 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
1137 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1138
1139 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1140 }
1141
1142 return;
1143 }
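/*
 * Note the teardown idiom above: a subflow is not destroyed inline.
 * Instead, SO_FILT_HINT_MUSTRST is posted through soevent(), and the
 * matching mptcp_subflow_mustrst_ev() handler from mpsub_ev_entry_tbl does
 * the actual reset/disconnect on the regular event path.
 */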
1144
1145 /* Removes cellular subflows when a working WiFi subflow exists */
1146 static void
1147 mptcp_handover_subflows_remove(struct mptses *mpte)
1148 {
1149 int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
1150 boolean_t found_working_subflow = false;
1151 struct mptsub *mpts;
1152
1153 /*
1154 * Look for a subflow that is on a non-cellular interface
1155 * and actually works (aka, no retransmission timeout).
1156 */
1157 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1158 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1159 struct socket *so;
1160 struct tcpcb *tp;
1161
1162 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1163 continue;
1164 }
1165
1166 so = mpts->mpts_socket;
1167 tp = sototcpcb(so);
1168
1169 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1170 tp->t_state != TCPS_ESTABLISHED) {
1171 continue;
1172 }
1173
1174 os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
1175 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
1176
1177 /* Is this subflow in good condition? */
1178 if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc) {
1179 found_working_subflow = true;
1180 }
1181
1182 /* Or WiFi is fine */
1183 if (!wifi_unusable) {
1184 found_working_subflow = true;
1185 }
1186 }
1187
1188 /*
1189 * Couldn't find a working subflow, let's not remove those on a cellular
1190 * interface.
1191 */
1192 if (!found_working_subflow) {
1193 return;
1194 }
1195
1196 mptcp_remove_cell_subflows(mpte);
1197 }
1198
1199 static void
1200 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1201 {
1202 uint64_t time_now = mach_continuous_time();
1203
1204 if (mpte->mpte_time_target != 0 &&
1205 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1206 mptcp_is_wifi_unusable_for_session(mpte)) {
1207 /* WiFi is bad and we are past the target time - don't remove any subflows */
1208 return;
1209 }
1210
1211 mptcp_remove_cell_subflows(mpte);
1212 }
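/*
 * The signed-difference test above is the usual wraparound-safe deadline
 * comparison for unsigned mach_continuous_time() stamps: the u64 subtraction
 * is cast to int64_t, which goes non-positive exactly when 'now' has reached
 * the target.  Minimal sketch (compiled out) of the same idiom:
 */
#if 0
static boolean_t
deadline_reached_sketch(uint64_t target, uint64_t now)
{
	return target != 0 && (int64_t)(target - now) <= 0;
}
#endif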
1213
1214 /*
1215 * Based on the MPTCP Service-type and the state of the subflows, we
1216 * will destroy subflows here.
1217 */
1218 void
1219 mptcp_check_subflows_and_remove(struct mptses *mpte)
1220 {
1221 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1222 return;
1223 }
1224
1225 socket_lock_assert_owned(mptetoso(mpte));
1226
1227 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1228 mptcp_handover_subflows_remove(mpte);
1229 }
1230
1231 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1232 mptcp_targetbased_subflows_remove(mpte);
1233 }
1234 }
1235
1236 static void
1237 mptcp_remove_subflows(struct mptses *mpte)
1238 {
1239 struct mptsub *mpts, *tmpts;
1240
1241 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1242 return;
1243 }
1244
1245 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1246 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1247 boolean_t found = false;
1248 uint32_t ifindex;
1249 uint32_t i;
1250
1251 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1252 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1253
1254 os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
1255 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
1256 ifp ? ifp->if_index : -1);
1257 soevent(mpts->mpts_socket,
1258 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1259
1260 continue;
1261 }
1262
1263 if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
1264 continue;
1265 }
1266
1267 if (ifp) {
1268 ifindex = ifp->if_index;
1269 } else {
1270 ifindex = mpts->mpts_ifscope;
1271 }
1272
1273 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1274 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1275 continue;
1276 }
1277
1278 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1279 if (mpts->mpts_dst.sa_family == AF_INET6 &&
1280 (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
1281 found = true;
1282 break;
1283 }
1284
1285 if (mpts->mpts_dst.sa_family == AF_INET &&
1286 mpte->mpte_itfinfo[i].has_v4_conn) {
1287 found = true;
1288 break;
1289 }
1290 }
1291 }
1292
1293 if (!found) {
1294 os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
1295 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1296 ifindex, mpts->mpts_flags);
1297
1298 soevent(mpts->mpts_socket,
1299 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1300 }
1301 }
1302 }
1303
1304 static void
1305 mptcp_create_subflows(__unused void *arg)
1306 {
1307 struct mppcb *mpp;
1308
1309 /*
1310 * Start with clearing, because we might be processing connections
1311 * while a new event comes in.
1312 */
1313 if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
1314 os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
1315 }
1316
1317 /* Iterate over all MPTCP connections */
1318
1319 lck_mtx_lock(&mtcbinfo.mppi_lock);
1320
1321 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
1322 struct socket *mp_so = mpp->mpp_socket;
1323 struct mptses *mpte = mpp->mpp_pcbe;
1324
1325 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1326 continue;
1327 }
1328
1329 socket_lock(mp_so, 1);
1330 VERIFY(mp_so->so_usecount > 0);
1331
1332 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
1333
1334 mptcp_check_subflows_and_add(mpte);
1335 mptcp_remove_subflows(mpte);
1336
1337 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1338 socket_unlock(mp_so, 1);
1339 }
1340
1341 lck_mtx_unlock(&mtcbinfo.mppi_lock);
1342 }
1343
1344 /*
1345 * We need this because we are coming from an NECP-event. This event gets posted
1346 * while holding NECP-locks. The creation of the subflow, however, leads us back
1347 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1348 * So, we would deadlock there as we already hold the NECP-lock.
1349 *
1350 * So, let's schedule this separately. It also gives NECP the chance to make
1351 * progress, without having to wait for MPTCP to finish its subflow creation.
1352 */
1353 void
1354 mptcp_sched_create_subflows(struct mptses *mpte)
1355 {
1356 struct mppcb *mpp = mpte->mpte_mppcb;
1357 struct mptcb *mp_tp = mpte->mpte_mptcb;
1358 struct socket *mp_so = mpp->mpp_socket;
1359
1360 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1361 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1362 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1363 return;
1364 }
1365
1366 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1367 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1368 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1369 }
1370
1371 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1372 return;
1373 }
1374
1375 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1376 timeout(mptcp_create_subflows, NULL, hz / 10);
1377 }
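/*
 * The scheduling above is a test-and-set debounce: the first caller arms a
 * single timeout(9) callback, later callers only mark their PCB with
 * MPP_CREATE_SUBFLOWS (plus a use-count hold) and return.  The same pattern,
 * reduced to a sketch (compiled out):
 */
#if 0
static uint8_t work_scheduled;

static void
schedule_once_sketch(void (*fn)(void *))
{
	if (OSTestAndSet(0x01, &work_scheduled)) {
		return;			/* already armed; coalesce */
	}
	timeout(fn, NULL, hz / 10);	/* fn must OSTestAndClear() first */
}
#endif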
1378
1379 /*
1380 * Allocate an MPTCP socket option structure.
1381 */
1382 struct mptopt *
1383 mptcp_sopt_alloc(int how)
1384 {
1385 struct mptopt *mpo;
1386
1387 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
1388 zalloc_noblock(mptopt_zone);
1389 if (mpo != NULL) {
1390 bzero(mpo, mptopt_zone_size);
1391 }
1392
1393 return mpo;
1394 }
1395
1396 /*
1397 * Free an MPTCP socket option structure.
1398 */
1399 void
1400 mptcp_sopt_free(struct mptopt *mpo)
1401 {
1402 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1403
1404 zfree(mptopt_zone, mpo);
1405 }
1406
1407 /*
1408 * Add a socket option to the MPTCP socket option list.
1409 */
1410 void
1411 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1412 {
1413 socket_lock_assert_owned(mptetoso(mpte));
1414 mpo->mpo_flags |= MPOF_ATTACHED;
1415 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1416 }
1417
1418 /*
1419 * Remove a socket option from the MPTCP socket option list.
1420 */
1421 void
1422 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1423 {
1424 socket_lock_assert_owned(mptetoso(mpte));
1425 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1426 mpo->mpo_flags &= ~MPOF_ATTACHED;
1427 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1428 }
1429
1430 /*
1431 * Search for an existing <sopt_level,sopt_name> socket option.
1432 */
1433 struct mptopt *
1434 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1435 {
1436 struct mptopt *mpo;
1437
1438 socket_lock_assert_owned(mptetoso(mpte));
1439
1440 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1441 if (mpo->mpo_level == sopt->sopt_level &&
1442 mpo->mpo_name == sopt->sopt_name) {
1443 break;
1444 }
1445 }
1446 return mpo;
1447 }
1448
1449 /*
1450 * Allocate a MPTCP subflow structure.
1451 */
1452 static struct mptsub *
1453 mptcp_subflow_alloc(void)
1454 {
1455 struct mptsub *mpts = zalloc(mptsub_zone);
1456
1457 if (mpts == NULL) {
1458 return NULL;
1459 }
1460
1461 bzero(mpts, mptsub_zone_size);
1462 return mpts;
1463 }
1464
1465 /*
1466 * Deallocate a subflow structure, called when all of the references held
1467 * on it have been released. This implies that the subflow has been deleted.
1468 */
1469 static void
1470 mptcp_subflow_free(struct mptsub *mpts)
1471 {
1472 VERIFY(mpts->mpts_refcnt == 0);
1473 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1474 VERIFY(mpts->mpts_mpte == NULL);
1475 VERIFY(mpts->mpts_socket == NULL);
1476
1477 if (mpts->mpts_src != NULL) {
1478 FREE(mpts->mpts_src, M_SONAME);
1479 mpts->mpts_src = NULL;
1480 }
1481
1482 zfree(mptsub_zone, mpts);
1483 }
1484
1485 static void
1486 mptcp_subflow_addref(struct mptsub *mpts)
1487 {
1488 if (++mpts->mpts_refcnt == 0) {
1489 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1490 /* NOTREACHED */
1491 }
1492 }
1493
1494 static void
1495 mptcp_subflow_remref(struct mptsub *mpts)
1496 {
1497 if (mpts->mpts_refcnt == 0) {
1498 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
1499 /* NOTREACHED */
1500 }
1501 if (--mpts->mpts_refcnt > 0) {
1502 return;
1503 }
1504
1505 /* callee will unlock and destroy lock */
1506 mptcp_subflow_free(mpts);
1507 }
1508
1509 static void
1510 mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
1511 {
1512 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
1513 struct tcpcb *tp = sototcpcb(so);
1514
1515 /*
1516 * From this moment on, the subflow is linked to the MPTCP-connection.
1517 * Locking,... happens now at the MPTCP-layer
1518 */
1519 tp->t_mptcb = mpte->mpte_mptcb;
1520 so->so_flags |= SOF_MP_SUBFLOW;
1521 mp_so->so_usecount++;
1522
1523 /*
1524 * Insert the subflow into the list, and associate the MPTCP PCB
1525 * as well as the subflow socket. From this point on, removing
1526 * the subflow needs to be done via mptcp_subflow_del().
1527 */
1528 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1529 mpte->mpte_numflows++;
1530
1531 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1532 mpts->mpts_mpte = mpte;
1533 mpts->mpts_socket = so;
1534 tp->t_mpsub = mpts;
1535 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1536 mptcp_subflow_addref(mpts); /* for subflow socket */
1537 }
1538
1539 static void
1540 mptcp_subflow_necp_cb(void *handle, __unused int action,
1541 __unused uint32_t interface_index,
1542 uint32_t necp_flags, bool *viable)
1543 {
1544 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1545 struct inpcb *inp = (struct inpcb *)handle;
1546 struct socket *so = inp->inp_socket;
1547 struct mptsub *mpts;
1548 struct mptses *mpte;
1549
1550 if (low_power) {
1551 action = NECP_CLIENT_CBACTION_NONVIABLE;
1552 }
1553
1554 if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
1555 return;
1556 }
1557
1558 /*
1559 * The socket is being garbage-collected. There is nothing to be done
1560 * here.
1561 */
1562 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
1563 return;
1564 }
1565
1566 socket_lock(so, 1);
1567
1568 /* Check again after we acquired the lock. */
1569 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1570 goto out;
1571 }
1572
1573 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1574 mpts = sototcpcb(so)->t_mpsub;
1575
1576 os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
1577 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);
1578
1579 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1580
1581 mptcp_sched_create_subflows(mpte);
1582
1583 if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
1584 mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
1585 viable != NULL) {
1586 *viable = 1;
1587 }
1588
1589 out:
1590 socket_unlock(so, 1);
1591 }
1592
1593 /*
1594 * Create an MPTCP subflow socket.
1595 */
1596 static int
1597 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1598 struct socket **so)
1599 {
1600 lck_mtx_t *subflow_mtx;
1601 struct mptopt smpo, *mpo, *tmpo;
1602 struct proc *p;
1603 struct socket *mp_so;
1604 int error;
1605
1606 *so = NULL;
1607
1608 mp_so = mptetoso(mpte);
1609
1610 p = proc_find(mp_so->last_pid);
1611 if (p == PROC_NULL) {
1612 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1613 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1614
1615 return ESRCH;
1616 }
1617
1618 /*
1619 * Create the subflow socket (multipath subflow, non-blocking.)
1620 *
1621 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1622 * socket; it will be cleared when the socket is peeled off or closed.
1623 * It also indicates to the underlying TCP to handle MPTCP options.
1624 * A multipath subflow socket implies SS_NOFDREF state.
1625 */
1626
1627 /*
1628 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1629 * the ipi-lock. We cannot hold the socket-lock at that point.
1630 */
1631 socket_unlock(mp_so, 0);
1632 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1633 SOCF_MPTCP, PROC_NULL);
1634 socket_lock(mp_so, 0);
1635 if (error) {
1636 os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1637 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1638
1639 proc_rele(p);
1640
1641 mptcp_subflow_free(mpts);
1642 return error;
1643 }
1644
1645 /*
1646 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1647 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1648 * Which is why we also need to get the lock with pr_getlock, as after
1649 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1650 */
1651 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1652 lck_mtx_lock(subflow_mtx);
1653
1654 /*
1655 * Must be the first thing we do, to make sure all pointers for this
1656 * subflow are set.
1657 */
1658 mptcp_subflow_attach(mpte, mpts, *so);
1659
1660 /*
1661 * A multipath subflow socket is used internally in the kernel,
1662 * therefore it does not have a file descriptor associated by
1663 * default.
1664 */
1665 (*so)->so_state |= SS_NOFDREF;
1666
1667 lck_mtx_unlock(subflow_mtx);
1668
1669 /* prevent the socket buffers from being compressed */
1670 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1671 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1672
1673 /* Inherit preconnect and TFO data flags */
1674 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1675 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1676 }
1677 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1678 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1679 }
1680
1681 /* Inherit uuid and create the related flow. */
1682 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1683 struct mptcb *mp_tp = mpte->mpte_mptcb;
1684
1685 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1686
1687 /*
1688 * A note on the unlock: with MPTCP, we end up calling
1689 * necp_client_register_socket_flow multiple times. This is
1690 * problematic, because the lock-ordering guarantee (first NECP
1691 * locks, then socket locks) is no longer respected. So, we need
1692 * to unlock here.
1693 */
1694 socket_unlock(mp_so, 0);
1695 error = necp_client_register_socket_flow(mp_so->last_pid,
1696 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1697 socket_lock(mp_so, 0);
1698
1699 if (error) {
1700 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1701 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1702
1703 goto out_err;
1704 }
1705
1706 /* Possible state-change during the unlock above */
1707 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1708 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1709 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1710 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1711 mp_tp->mpt_state, mp_tp->mpt_flags);
1712
1713 error = EINVAL;
1714 goto out_err;
1715 }
1716
1717 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1718 }
1719
1720 /* Needs to happen prior to the delegation! */
1721 (*so)->last_pid = mp_so->last_pid;
1722
1723 if (mp_so->so_flags & SOF_DELEGATED) {
1724 if (mpte->mpte_epid) {
1725 error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1726 if (error) {
1727 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1728 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1729 goto out_err;
1730 }
1731 }
1732 if (!uuid_is_null(mpte->mpte_euuid)) {
1733 error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1734 if (error) {
1735 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1736 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1737 goto out_err;
1738 }
1739 }
1740 }
1741
1742 /* inherit the other socket options */
1743 bzero(&smpo, sizeof(smpo));
1744 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1745 smpo.mpo_level = SOL_SOCKET;
1746 smpo.mpo_intval = 1;
1747
1748 /* disable SIGPIPE */
1749 smpo.mpo_name = SO_NOSIGPIPE;
1750 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1751 goto out_err;
1752 }
1753
1754 /* find out if the subflow's source address goes away */
1755 smpo.mpo_name = SO_NOADDRERR;
1756 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1757 goto out_err;
1758 }
1759
1760 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1761 /*
1762 * On secondary subflows we might need to set the cell-fallback
1763 * flag (see conditions in mptcp_subflow_sosetopt).
1764 */
1765 smpo.mpo_level = SOL_SOCKET;
1766 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1767 smpo.mpo_intval = 1;
1768 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1769 goto out_err;
1770 }
1771 }
1772
1773 /* replay setsockopt(2) on the subflow sockets for eligible options */
1774 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1775 int interim;
1776
1777 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1778 continue;
1779 }
1780
1781 /*
1782 * Skip those that are handled internally; these options
1783 * should not have been recorded and marked with
1784 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1785 */
1786 if (mpo->mpo_level == SOL_SOCKET &&
1787 (mpo->mpo_name == SO_NOSIGPIPE ||
1788 mpo->mpo_name == SO_NOADDRERR ||
1789 mpo->mpo_name == SO_KEEPALIVE)) {
1790 continue;
1791 }
1792
1793 interim = (mpo->mpo_flags & MPOF_INTERIM);
1794 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1795 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1796 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1797 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1798 mpo->mpo_intval);
1799 mptcp_sopt_remove(mpte, mpo);
1800 mptcp_sopt_free(mpo);
1801 continue;
1802 }
1803 }
1804
1805 /*
1806 * We need to receive everything that the subflow socket has,
1807 * so use a customized socket receive function. We will undo
1808 * this when the socket is peeled off or closed.
1809 */
1810 switch (dom) {
1811 case PF_INET:
1812 (*so)->so_proto = &mptcp_subflow_protosw;
1813 break;
1814 #if INET6
1815 case PF_INET6:
1816 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1817 break;
1818 #endif /* INET6 */
1819 default:
1820 VERIFY(0);
1821 /* NOTREACHED */
1822 }
1823
1824 proc_rele(p);
1825
1826 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1827 int, dom, int, error);
1828
1829 return 0;
1830
1831 out_err:
1832 mptcp_subflow_abort(mpts, error);
1833
1834 proc_rele(p);
1835
1836 return error;
1837 }
1838
1839 /*
1840 * Close an MPTCP subflow socket.
1841 *
1842 * Note that this may be called on an embryonic subflow, and the only
1843 * thing that is guaranteed valid is the protocol-user request.
1844 */
1845 static void
1846 mptcp_subflow_soclose(struct mptsub *mpts)
1847 {
1848 struct socket *so = mpts->mpts_socket;
1849
1850 if (mpts->mpts_flags & MPTSF_CLOSED) {
1851 return;
1852 }
1853
1854 VERIFY(so != NULL);
1855 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1856 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1857
1858 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1859 struct socket *, so,
1860 struct sockbuf *, &so->so_rcv,
1861 struct sockbuf *, &so->so_snd,
1862 struct mptses *, mpts->mpts_mpte);
1863
1864 mpts->mpts_flags |= MPTSF_CLOSED;
1865
1866 if (so->so_retaincnt == 0) {
1867 soclose_locked(so);
1868
1869 return;
1870 } else {
1871 VERIFY(so->so_usecount > 0);
1872 so->so_usecount--;
1873 }
1874
1875 return;
1876 }
1877
1878 /*
1879 * Connect an MPTCP subflow socket.
1880 *
1881 * Note that in the pending connect case, the subflow socket may have been
1882 * bound to an interface and/or a source IP address which may no longer be
1883 * around by the time this routine is called; in that case the connect attempt
1884 * will most likely fail.
1885 */
1886 static int
1887 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1888 {
1889 char dbuf[MAX_IPv6_STR_LEN];
1890 struct socket *mp_so, *so;
1891 struct mptcb *mp_tp;
1892 struct sockaddr *dst;
1893 struct proc *p;
1894 int af, error, dport;
1895
1896 mp_so = mptetoso(mpte);
1897 mp_tp = mpte->mpte_mptcb;
1898 so = mpts->mpts_socket;
1899 af = mpts->mpts_dst.sa_family;
1900 dst = &mpts->mpts_dst;
1901
1902 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1903 VERIFY(mpts->mpts_socket != NULL);
1904 VERIFY(af == AF_INET || af == AF_INET6);
1905
1906 if (af == AF_INET) {
1907 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1908 dport = ntohs(SIN(dst)->sin_port);
1909 } else {
1910 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1911 dport = ntohs(SIN6(dst)->sin6_port);
1912 }
1913
1914 os_log_info(mptcp_log_handle,
1915 "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1916 mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1917
1918 p = proc_find(mp_so->last_pid);
1919 if (p == PROC_NULL) {
1920 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1921 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1922
1923 return ESRCH;
1924 }
1925
1926 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1927
1928 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1929
1930 /* connect the subflow socket */
1931 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1932 p, mpts->mpts_ifscope,
1933 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1934
1935 mpts->mpts_iss = sototcpcb(so)->iss;
1936
1937 /* See tcp_connect_complete */
1938 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1939 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1940 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1941 }
1942
1943 /* Allocate a unique address id per subflow */
1944 mpte->mpte_addrid_last++;
1945 if (mpte->mpte_addrid_last == 0) {
1946 mpte->mpte_addrid_last++;
1947 }
1948
1949 proc_rele(p);
1950
1951 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1952 struct mptsub *, mpts, int, error);
1953 if (error) {
1954 os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1955 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
1956 }
1957
1958 return error;
1959 }
1960
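/*
 * Validate and adjust the MPTCP DSS mapping on mbufs received from a
 * subflow: advance an existing mapping by the given offset, reset the
 * subflow on a contradictory second mapping, and synthesize a mapping
 * after a fallback to plain TCP.
 */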
1961 static int
1962 mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
1963 uint32_t rseq, uint16_t dlen)
1964 {
1965 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
1966
1967 if (m_pktlen(m) == 0) {
1968 return 0;
1969 }
1970
1971 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1972 if (off && (dsn != m->m_pkthdr.mp_dsn ||
1973 rseq != m->m_pkthdr.mp_rseq ||
1974 dlen != m->m_pkthdr.mp_rlen)) {
1975 os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u, %u - %u, %u - %u\n",
1976 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
1977 (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
1978 rseq, m->m_pkthdr.mp_rseq,
1979 dlen, m->m_pkthdr.mp_rlen);
1980
1981 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1982 return -1;
1983 }
1984 m->m_pkthdr.mp_dsn += off;
1985 m->m_pkthdr.mp_rseq += off;
1986 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
1987 } else {
1988 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
1989 /* data arrived without a DSS option mapping */
1990
1991 /* initial subflow can fallback right after SYN handshake */
1992 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
1993 mptcp_notify_mpfail(so);
1994 } else {
1995 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1996
1997 return -1;
1998 }
1999 } else if (m->m_flags & M_PKTHDR) {
2000 /* We need to fake the DATA-mapping */
2001 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2002 m->m_pkthdr.mp_dsn = dsn + off;
2003 m->m_pkthdr.mp_rseq = rseq + off;
2004 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
2005 }
2006 }
2007
2008 mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
2009
2010 return 0;
2011 }
2012
2013 /*
2014 * MPTCP subflow socket receive routine, derived from soreceive().
2015 */
2016 static int
2017 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
2018 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2019 {
2020 #pragma unused(uio)
2021 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2022 int flags, error = 0;
2023 struct proc *p = current_proc();
2024 struct mbuf *m, **mp = mp0;
2025 boolean_t proc_held = FALSE;
2026
2027 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2028
2029 #ifdef MORE_LOCKING_DEBUG
2030 if (so->so_usecount == 1) {
2031 panic("%s: so=%x no other reference on socket\n", __func__, so);
2032 /* NOTREACHED */
2033 }
2034 #endif
2035 /*
2036 * We return all that is there in the subflow's socket receive buffer
2037 * to the MPTCP layer, so we require that the caller passes in the
2038 * expected parameters.
2039 */
2040 if (mp == NULL || controlp != NULL) {
2041 return EINVAL;
2042 }
2043
2044 *mp = NULL;
2045 if (psa != NULL) {
2046 *psa = NULL;
2047 }
2048 if (flagsp != NULL) {
2049 flags = *flagsp & ~MSG_EOR;
2050 } else {
2051 flags = 0;
2052 }
2053
2054 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2055 return EOPNOTSUPP;
2056 }
2057
2058 flags |= (MSG_DONTWAIT | MSG_NBIO);
2059
2060 /*
2061 * If a recv attempt is made on a previously-accepted socket
2062 * that has been marked as inactive (disconnected), reject
2063 * the request.
2064 */
2065 if (so->so_flags & SOF_DEFUNCT) {
2066 struct sockbuf *sb = &so->so_rcv;
2067
2068 error = ENOTCONN;
2069 /*
2070 * This socket should have been disconnected and flushed
2071 * prior to being returned from sodefunct(); there should
2072 * be no data on its receive list, so panic otherwise.
2073 */
2074 if (so->so_state & SS_DEFUNCT) {
2075 sb_empty_assert(sb, __func__);
2076 }
2077 return error;
2078 }
2079
2080 /*
2081 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2082 * and if so just return to the caller. This could happen when
2083 * soreceive() is called by a socket upcall function during the
2084 * time the socket is freed. The socket buffer would have been
2085 * locked across the upcall, therefore we cannot put this thread
2086 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2087 * we may livelock), because the lock on the socket buffer will
2088 * only be released when the upcall routine returns to its caller.
2089 * Because the socket has been officially closed, there can be
2090 * no further read on it.
2091 *
2092 * A multipath subflow socket would have its SS_NOFDREF set by
2093 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2094 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2095 */
2096 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2097 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2098 return 0;
2099 }
2100
2101 /*
2102 * For consistency with soreceive() semantics, we need to obey
2103 * SB_LOCK in case some other code path has locked the buffer.
2104 */
2105 error = sblock(&so->so_rcv, 0);
2106 if (error != 0) {
2107 return error;
2108 }
2109
2110 m = so->so_rcv.sb_mb;
2111 if (m == NULL) {
2112 /*
2113 * Panic if we notice inconsistencies in the socket's
2114 * receive list; both sb_mb and sb_cc should correctly
2115 * reflect the contents of the list, otherwise we may
2116 * end up with false positives during select() or poll()
2117 * which could put the application in a bad state.
2118 */
2119 SB_MB_CHECK(&so->so_rcv);
2120
2121 if (so->so_error != 0) {
2122 error = so->so_error;
2123 so->so_error = 0;
2124 goto release;
2125 }
2126
2127 if (so->so_state & SS_CANTRCVMORE) {
2128 goto release;
2129 }
2130
2131 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2132 error = ENOTCONN;
2133 goto release;
2134 }
2135
2136 /*
2137 * MSG_DONTWAIT is implicitly set and this routine will
2138 * never block, so return EWOULDBLOCK when there is nothing.
2139 */
2140 error = EWOULDBLOCK;
2141 goto release;
2142 }
2143
2144 mptcp_update_last_owner(so, mp_so);
2145
2146 if (mp_so->last_pid != proc_pid(p)) {
2147 p = proc_find(mp_so->last_pid);
2148 if (p == PROC_NULL) {
2149 p = current_proc();
2150 } else {
2151 proc_held = TRUE;
2152 }
2153 }
2154
2155 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2156 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2157 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2158
2159 while (m != NULL) {
2160 int dlen = 0, dfin = 0, error_out = 0;
2161 struct mbuf *start = m;
2162 uint64_t dsn;
2163 uint32_t sseq;
2164 uint16_t orig_dlen;
2165 uint16_t csum;
2166
2167 VERIFY(m->m_nextpkt == NULL);
2168
2169 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2170 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2171 dsn = m->m_pkthdr.mp_dsn;
2172 sseq = m->m_pkthdr.mp_rseq;
2173 csum = m->m_pkthdr.mp_csum;
2174 } else {
2175 /* We did fallback */
2176 if (mptcp_adj_rmap(so, m, 0, 0, 0, 0)) {
2177 error = EIO;
2178 *mp0 = NULL;
2179 goto release;
2180 }
2181
2182 sbfree(&so->so_rcv, m);
2183
2184 if (mp != NULL) {
2185 *mp = m;
2186 mp = &m->m_next;
2187 so->so_rcv.sb_mb = m = m->m_next;
2188 *mp = NULL;
2189 }
2190
2191 if (m != NULL) {
2192 so->so_rcv.sb_lastrecord = m;
2193 } else {
2194 SB_EMPTY_FIXUP(&so->so_rcv);
2195 }
2196
2197 continue;
2198 }
2199
2200 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2201 dfin = 1;
2202 }
2203
2204 /*
2205 * Check if the full mapping is now present
2206 */
2207 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
2208 mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
2209 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
2210 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
2211
2212 if (*mp0 == NULL) {
2213 error = EWOULDBLOCK;
2214 }
2215 goto release;
2216 }
2217
2218 /* Now, get the full mapping */
2219 while (dlen > 0) {
2220 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
2221 error_out = 1;
2222 error = EIO;
2223 dlen = 0;
2224 *mp0 = NULL;
2225 break;
2226 }
2227
2228 dlen -= m->m_len;
2229 sbfree(&so->so_rcv, m);
2230
2231 if (mp != NULL) {
2232 *mp = m;
2233 mp = &m->m_next;
2234 so->so_rcv.sb_mb = m = m->m_next;
2235 *mp = NULL;
2236 }
2237
2238 if (dlen - dfin == 0) {
2239 dlen = 0;
2240 }
2241
2242 VERIFY(dlen <= 0 || m);
2243 }
2244
2245 VERIFY(dlen == 0);
2246
2247 if (m != NULL) {
2248 so->so_rcv.sb_lastrecord = m;
2249 } else {
2250 SB_EMPTY_FIXUP(&so->so_rcv);
2251 }
2252
2253 if (error_out) {
2254 goto release;
2255 }
2256
2257 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
2258 error = EIO;
2259 *mp0 = NULL;
2260 goto release;
2261 }
2262
2263 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2264 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2265 }
2266
2267 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2268 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2269
2270 if (flagsp != NULL) {
2271 *flagsp |= flags;
2272 }
2273
2274 release:
2275 sbunlock(&so->so_rcv, TRUE);
2276
2277 if (proc_held) {
2278 proc_rele(p);
2279 }
2280
2281 return error;
2282 }
2283
2284 /*
2285 * MPTCP subflow socket send routine, derived from sosend().
2286 */
2287 static int
2288 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2289 struct mbuf *top, struct mbuf *control, int flags)
2290 {
2291 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2292 struct proc *p = current_proc();
2293 boolean_t en_tracing = FALSE, proc_held = FALSE;
2294 int en_tracing_val;
2295 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2296 int error;
2297
2298 VERIFY(control == NULL);
2299 VERIFY(addr == NULL);
2300 VERIFY(uio == NULL);
2301 VERIFY(flags == 0);
2302 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2303
2304 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2305 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2306
2307 /*
2308 * trace if tracing enabled & network (vs. unix) sockets &
2309 * non-loopback
2310 */
2311 if (ENTR_SHOULDTRACE &&
2312 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2313 struct inpcb *inp = sotoinpcb(so);
2314 if (inp->inp_last_outifp != NULL &&
2315 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2316 en_tracing = TRUE;
2317 en_tracing_val = top->m_pkthdr.len;
2318 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2319 (unsigned long)VM_KERNEL_ADDRPERM(so),
2320 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2321 (int64_t)en_tracing_val);
2322 }
2323 }
2324
2325 mptcp_update_last_owner(so, mp_so);
2326
2327 if (mp_so->last_pid != proc_pid(p)) {
2328 p = proc_find(mp_so->last_pid);
2329 if (p == PROC_NULL) {
2330 p = current_proc();
2331 } else {
2332 proc_held = TRUE;
2333 }
2334 }
2335
2336 #if NECP
2337 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2338 #endif /* NECP */
2339
2340 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2341
2342 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
2343 if (error) {
2344 goto out;
2345 }
2346
2347 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2348 top = NULL;
2349
2350 out:
2351 if (top != NULL) {
2352 m_freem(top);
2353 }
2354
2355 if (proc_held) {
2356 proc_rele(p);
2357 }
2358
2359 soclearfastopen(so);
2360
2361 if (en_tracing) {
2362 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2363 (unsigned long)VM_KERNEL_ADDRPERM(so),
2364 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2365 (int64_t)en_tracing_val);
2366 }
2367
2368 return error;
2369 }
2370
2371 /*
2372 * Establish an initial MPTCP connection (if first subflow and not yet
2373 * connected), or add a subflow to an existing MPTCP connection.
2374 */
2375 int
2376 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2377 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2378 {
2379 struct socket *mp_so, *so = NULL;
2380 struct mptcb *mp_tp;
2381 struct mptsub *mpts = NULL;
2382 int af, error = 0;
2383
2384 mp_so = mptetoso(mpte);
2385 mp_tp = mpte->mpte_mptcb;
2386
2387 socket_lock_assert_owned(mp_so);
2388
2389 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2390 /* If the remote end sends Data FIN, refuse subflow adds */
2391 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2392 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2393 error = ENOTCONN;
2394 goto out_err;
2395 }
2396
2397 if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
2398 error = EOVERFLOW;
2399 goto out_err;
2400 }
2401
2402 mpts = mptcp_subflow_alloc();
2403 if (mpts == NULL) {
2404 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2405 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2406 error = ENOMEM;
2407 goto out_err;
2408 }
2409
2410 if (src) {
2411 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2412 error = EAFNOSUPPORT;
2413 goto out_err;
2414 }
2415
2416 if (src->sa_family == AF_INET &&
2417 src->sa_len != sizeof(struct sockaddr_in)) {
2418 error = EINVAL;
2419 goto out_err;
2420 }
2421
2422 if (src->sa_family == AF_INET6 &&
2423 src->sa_len != sizeof(struct sockaddr_in6)) {
2424 error = EINVAL;
2425 goto out_err;
2426 }
2427
2428 MALLOC(mpts->mpts_src, struct sockaddr *, src->sa_len, M_SONAME,
2429 M_WAITOK | M_ZERO);
2430 if (mpts->mpts_src == NULL) {
2431 error = ENOMEM;
2432 goto out_err;
2433 }
2434 bcopy(src, mpts->mpts_src, src->sa_len);
2435 }
2436
2437 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2438 error = EAFNOSUPPORT;
2439 goto out_err;
2440 }
2441
2442 if (dst->sa_family == AF_INET &&
2443 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2444 error = EINVAL;
2445 goto out_err;
2446 }
2447
2448 if (dst->sa_family == AF_INET6 &&
2449 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2450 error = EINVAL;
2451 goto out_err;
2452 }
2453
2454 memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
2455
2456 af = mpts->mpts_dst.sa_family;
2457
2458 ifnet_head_lock_shared();
2459 if (ifscope > (unsigned)if_index) {
2460 ifnet_head_done();
2461 error = ENXIO;
2462 goto out_err;
2463 }
2464 ifnet_head_done();
2465
2466 mpts->mpts_ifscope = ifscope;
2467
2468 /* create the subflow socket */
2469 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2470 /*
2471 * Returning (error) and not cleaning up, because up to here
2472 * all we did was create mpts.
2473 *
2474 * And the contract is that the call to mptcp_subflow_socreate
2475 * moves ownership of mpts to mptcp_subflow_socreate.
2476 */
2477 return error;
2478 }
2479
2480 /*
2481 * We may be called from within the kernel. We still need to account this
2482 * one to the real app.
2483 */
2484 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2485
2486 /*
2487 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2488 * -1 (SAE_CONNID_ALL).
2489 */
2490 mpte->mpte_connid_last++;
2491 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2492 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2493 mpte->mpte_connid_last++;
2494 }
2495
2496 mpts->mpts_connid = mpte->mpte_connid_last;
2497
2498 mpts->mpts_rel_seq = 1;
2499
2500 /* Allocate a unique address id per subflow */
2501 mpte->mpte_addrid_last++;
2502 if (mpte->mpte_addrid_last == 0) {
2503 mpte->mpte_addrid_last++;
2504 }
2505
2506 /* register for subflow socket read/write events */
2507 sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
2508
2509 /* Register for subflow socket control events */
2510 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2511 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2512 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2513 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2514 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2515 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2516 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2517 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2518
2519 /* sanity check */
2520 VERIFY(!(mpts->mpts_flags &
2521 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2522
2523 /*
2524 * Indicate to the TCP subflow whether or not it should establish
2525 * the initial MPTCP connection, or join an existing one. Fill
2526 * in the connection request structure with additional info needed
2527 * by the underlying TCP (to be used in the TCP options, etc.)
2528 */
2529 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2530 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2531
2532 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2533 mptcp_init_local_parms(mpte);
2534 }
2535 soisconnecting(mp_so);
2536
2537 /* If fastopen is requested, set state in mpts */
2538 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2539 mpts->mpts_flags |= MPTSF_TFO_REQD;
2540 }
2541 } else {
2542 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2543 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2544 }
2545 }
2546
2547 mpts->mpts_flags |= MPTSF_CONNECTING;
2548
2549 /* connect right away if first attempt, or if join can be done now */
2550 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2551 error = mptcp_subflow_soconnectx(mpte, mpts);
2552 }
2553
2554 if (error) {
2555 goto out_err_close;
2556 }
2557
2558 if (pcid) {
2559 *pcid = mpts->mpts_connid;
2560 }
2561
2562 return 0;
2563
2564 out_err_close:
2565 mptcp_subflow_abort(mpts, error);
2566
2567 return error;
2568
2569 out_err:
2570 if (mpts) {
2571 mptcp_subflow_free(mpts);
2572 }
2573
2574 return error;
2575 }
2576
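/*
 * Fold the subflow's interface counters (total, wifi, wired and cell
 * tx/rx bytes) into the MPTCP session's per-interface stats array.
 */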
2577 void
2578 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2579 {
2580 int index = mptcpstats_get_index(stats, mpts);
2581
2582 if (index != -1) {
2583 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2584
2585 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2586 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2587
2588 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2589 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2590
2591 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2592 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2593
2594 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2595 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2596 }
2597 }
2598
2599 /*
2600 * Delete/remove a subflow from an MPTCP session. The underlying subflow socket
2601 * will no longer be accessible after a subflow is deleted, thus this
2602 * should occur only after the subflow socket has been disconnected.
2603 */
2604 void
2605 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2606 {
2607 struct socket *mp_so = mptetoso(mpte);
2608 struct socket *so = mpts->mpts_socket;
2609 struct tcpcb *tp = sototcpcb(so);
2610
2611 socket_lock_assert_owned(mp_so);
2612 VERIFY(mpts->mpts_mpte == mpte);
2613 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2614 VERIFY(mpte->mpte_numflows != 0);
2615 VERIFY(mp_so->so_usecount > 0);
2616
2617 mptcpstats_update(mpte->mpte_itfstats, mpts);
2618
2619 mptcp_unset_cellicon(mpte, mpts, 1);
2620
2621 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2622 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2623
2624 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2625 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2626 mpte->mpte_numflows--;
2627 if (mpte->mpte_active_sub == mpts) {
2628 mpte->mpte_active_sub = NULL;
2629 }
2630
2631 /*
2632 * Drop references held by this subflow socket; there
2633 * will be no further upcalls made from this point.
2634 */
2635 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2636 sock_catchevents_locked(so, NULL, NULL, 0);
2637
2638 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2639
2640 mp_so->so_usecount--; /* for subflow socket */
2641 mpts->mpts_mpte = NULL;
2642 mpts->mpts_socket = NULL;
2643
2644 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2645 mptcp_subflow_remref(mpts); /* for subflow socket */
2646
2647 so->so_flags &= ~SOF_MP_SUBFLOW;
2648 tp->t_mptcb = NULL;
2649 tp->t_mpsub = NULL;
2650 }
2651
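/*
 * Shut down the write-side of a subflow, preceded by a DATA_FIN if the
 * MPTCP-level connection has moved past CLOSE_WAIT.
 */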
2652 void
2653 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2654 {
2655 struct socket *so = mpts->mpts_socket;
2656 struct mptcb *mp_tp = mpte->mpte_mptcb;
2657 int send_dfin = 0;
2658
2659 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2660 send_dfin = 1;
2661 }
2662
2663 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2664 (so->so_state & SS_ISCONNECTED)) {
2665 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2666 __func__, mpts->mpts_connid, send_dfin),
2667 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2668
2669 if (send_dfin) {
2670 mptcp_send_dfin(so);
2671 }
2672 soshutdownlock(so, SHUT_WR);
2673 }
2674 }
2675
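/*
 * Abort a subflow: drop its TCP connection with the given error and
 * post a DISCONNECTED event, which the subflow deletion relies on.
 */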
2676 static void
2677 mptcp_subflow_abort(struct mptsub *mpts, int error)
2678 {
2679 struct socket *so = mpts->mpts_socket;
2680 struct tcpcb *tp = sototcpcb(so);
2681
2682 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2683 return;
2684 }
2685
2686 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2687 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2688
2689 if (tp->t_state != TCPS_CLOSED) {
2690 tcp_drop(tp, error);
2691 }
2692
2693 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2694 }
2695
2696 /*
2697 * Disconnect a subflow socket.
2698 */
2699 void
2700 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2701 {
2702 struct socket *so, *mp_so;
2703 struct mptcb *mp_tp;
2704 int send_dfin = 0;
2705
2706 so = mpts->mpts_socket;
2707 mp_tp = mpte->mpte_mptcb;
2708 mp_so = mptetoso(mpte);
2709
2710 socket_lock_assert_owned(mp_so);
2711
2712 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2713 return;
2714 }
2715
2716 mptcp_unset_cellicon(mpte, mpts, 1);
2717
2718 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2719
2720 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2721 send_dfin = 1;
2722 }
2723
2724 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2725 (so->so_state & SS_ISCONNECTED)) {
2726 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2727 __func__, mpts->mpts_connid, send_dfin),
2728 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2729
2730 if (send_dfin) {
2731 mptcp_send_dfin(so);
2732 }
2733
2734 if (mp_so->so_flags & SOF_DEFUNCT) {
2735 errno_t ret;
2736
2737 ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2738 if (ret == 0) {
2739 ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2740
2741 if (ret != 0) {
2742 os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2743 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2744 }
2745 } else {
2746 os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2747 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2748 }
2749 } else {
2750 (void) soshutdownlock(so, SHUT_RD);
2751 (void) soshutdownlock(so, SHUT_WR);
2752 (void) sodisconnectlocked(so);
2753 }
2754 }
2755
2756 /*
2757 * Generate a disconnect event for this subflow socket, in case
2758 * the lower layer doesn't do it; this is needed because the
2759 * subflow socket deletion relies on it.
2760 */
2761 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2762 }
2763
2764 /*
2765 * Subflow socket input.
2766 */
2767 static void
2768 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2769 {
2770 struct socket *mp_so = mptetoso(mpte);
2771 struct mbuf *m = NULL;
2772 struct socket *so;
2773 int error, wakeup = 0;
2774
2775 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2776 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2777
2778 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2779 struct mptsub *, mpts);
2780
2781 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2782 goto out;
2783 }
2784
2785 so = mpts->mpts_socket;
2786
2787 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2788 if (error != 0 && error != EWOULDBLOCK) {
2789 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2790 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2791 if (error == ENODATA) {
2792 /*
2793 * Don't ignore ENODATA so as to discover
2794 * nasty middleboxes.
2795 */
2796 mp_so->so_error = ENODATA;
2797
2798 wakeup = 1;
2799 goto out;
2800 }
2801 } else if (error == 0) {
2802 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
2803 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2804 }
2805
2806 /* In fallback, make sure to accept data on all but one subflow */
2807 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2808 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2809 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2810 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2811 m_freem(m);
2812 goto out;
2813 }
2814
2815 if (m != NULL) {
2816 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2817 mptcp_set_cellicon(mpte, mpts);
2818
2819 mpte->mpte_used_cell = 1;
2820 } else {
2821 /*
2822 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2823 * explicitly set the cellicon, then we unset it again.
2824 */
2825 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2826 mptcp_unset_cellicon(mpte, NULL, 1);
2827 }
2828
2829 mpte->mpte_used_wifi = 1;
2830 }
2831
2832 mptcp_input(mpte, m);
2833 }
2834
2835 out:
2836 if (wakeup) {
2837 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2838 }
2839
2840 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2841 }
2842
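/*
 * Entry point from the subflow's input path: drain every subflow of
 * the MPTCP session, deferring the work if an upcall is in progress.
 */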
2843 void
2844 mptcp_handle_input(struct socket *so)
2845 {
2846 struct mptsub *mpts, *tmpts;
2847 struct mptses *mpte;
2848
2849 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2850 return;
2851 }
2852
2853 mpts = sototcpcb(so)->t_mpsub;
2854 mpte = mpts->mpts_mpte;
2855
2856 socket_lock_assert_owned(mptetoso(mpte));
2857
2858 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2859 if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
2860 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2861 }
2862 return;
2863 }
2864
2865 mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
2866 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2867 if (mpts->mpts_socket->so_usecount == 0) {
2868 /* Will be removed soon by tcp_garbage_collect */
2869 continue;
2870 }
2871
2872 mptcp_subflow_addref(mpts);
2873 mpts->mpts_socket->so_usecount++;
2874
2875 mptcp_subflow_input(mpte, mpts);
2876
2877 mptcp_subflow_remref(mpts); /* ours */
2878
2879 VERIFY(mpts->mpts_socket->so_usecount != 0);
2880 mpts->mpts_socket->so_usecount--;
2881 }
2882
2883 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
2884 }
2885
2886 /*
2887 * Subflow socket write upcall.
2888 *
2889 * Called when the associated subflow socket posted a write event.
2890 */
2891 static void
2892 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2893 {
2894 #pragma unused(so, waitf)
2895 struct mptsub *mpts = arg;
2896 struct mptses *mpte = mpts->mpts_mpte;
2897
2898 VERIFY(mpte != NULL);
2899
2900 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2901 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2902 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2903 }
2904 return;
2905 }
2906
2907 mptcp_output(mpte);
2908 }
2909
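/*
 * Check whether the data-sequence number of mbuf m is still (partly)
 * covered by data queued in the subflow's send buffer, in which case
 * it should not be reinjected on this subflow.
 */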
2910 static boolean_t
2911 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2912 {
2913 struct mbuf *so_m = so->so_snd.sb_mb;
2914 uint64_t dsn = m->m_pkthdr.mp_dsn;
2915
2916 while (so_m) {
2917 VERIFY(so_m->m_flags & M_PKTHDR);
2918 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2919
2920 /* Part of the segment is covered, don't reinject here */
2921 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2922 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2923 return TRUE;
2924 }
2925
2926 so_m = so_m->m_next;
2927 }
2928
2929 return FALSE;
2930 }
2931
2932 /*
2933 * Subflow socket output.
2934 *
2935 * Called for sending data from MPTCP to the underlying subflow socket.
2936 */
2937 int
2938 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2939 {
2940 struct mptcb *mp_tp = mpte->mpte_mptcb;
2941 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2942 struct socket *mp_so, *so;
2943 struct tcpcb *tp;
2944 uint64_t mpt_dsn = 0, off = 0;
2945 int sb_cc = 0, error = 0, wakeup = 0;
2946 uint32_t dss_csum;
2947 uint16_t tot_sent = 0;
2948 boolean_t reinjected = FALSE;
2949
2950 mp_so = mptetoso(mpte);
2951 so = mpts->mpts_socket;
2952 tp = sototcpcb(so);
2953
2954 socket_lock_assert_owned(mp_so);
2955
2956 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2957 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
2958
2959 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2960 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2961 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2962 (mpts->mpts_flags & MPTSF_TFO_REQD));
2963 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
2964
2965 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2966 __func__, mpts->mpts_flags, mpte->mpte_flags,
2967 mptcp_subflow_cwnd_space(so)),
2968 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2969 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2970 struct mptsub *, mpts);
2971
2972 /* Remove Addr Option is not sent reliably as per I-D */
2973 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
2974 tp->t_rem_aid = mpte->mpte_lost_aid;
2975 tp->t_mpflags |= TMPF_SND_REM_ADDR;
2976 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2977 }
2978
2979 /*
2980 * The mbuf chains containing the metadata (as well as pointing to
2981 * the user data sitting at the MPTCP output queue) would then be
2982 * sent down to the subflow socket.
2983 *
2984 * Some notes on data sequencing:
2985 *
2986 * a. Each mbuf must be a M_PKTHDR.
2987 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2988 * in the mbuf pkthdr structure.
2989 * c. Each mbuf containing the MPTCP metadata must have its
2990 * pkt_flags marked with the PKTF_MPTCP flag.
2991 */
2992
2993 if (mpte->mpte_reinjectq) {
2994 sb_mb = mpte->mpte_reinjectq;
2995 } else {
2996 sb_mb = mp_so->so_snd.sb_mb;
2997 }
2998
2999 if (sb_mb == NULL) {
3000 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3001 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3002 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3003 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3004
3005 /* Fix it to prevent looping */
3006 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3007 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3008 }
3009 goto out;
3010 }
3011
3012 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3013
3014 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3015 !(so->so_state & SS_ISCONNECTED) &&
3016 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3017 tp->t_mpflags |= TMPF_TFO_REQUEST;
3018 goto zero_len_write;
3019 }
3020
3021 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3022
3023 /* First, drop acknowledged data */
3024 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3025 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3026 "dsn %u suna %u reinject? %u\n",
3027 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3028 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3029 if (mpte->mpte_reinjectq) {
3030 mptcp_clean_reinjectq(mpte);
3031 } else {
3032 uint64_t len = 0;
3033 len = mp_tp->mpt_snduna - mpt_dsn;
3034 sbdrop(&mp_so->so_snd, (int)len);
3035 wakeup = 1;
3036 }
3037 }
3038
3039 /* Check again because of above sbdrop */
3040 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3041 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is empty\n",
3042 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3043 goto out;
3044 }
3045
3046 /*
3047 * In degraded mode, we don't receive data acks, so forcibly free
3048 * mbufs below snd_nxt
3049 */
3050 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3051 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3052 mp_so->so_snd.sb_mb) {
3053 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3054 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3055 uint64_t len = 0;
3056 len = mp_tp->mpt_snduna - mpt_dsn;
3057 sbdrop(&mp_so->so_snd, (int)len);
3058 wakeup = 1;
3059
3060 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3061 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3062 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3063 }
3064 }
3065
3066 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3067 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3068 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3069 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3070 }
3071
3072 /*
3073 * Adjust the top level notion of next byte used for retransmissions
3074 * and sending FINs.
3075 */
3076 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3077 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3078 }
3079
3080 /* Now determine the offset from which to start transmitting data */
3081 if (mpte->mpte_reinjectq) {
3082 sb_mb = mpte->mpte_reinjectq;
3083 } else {
3084 dont_reinject:
3085 sb_mb = mp_so->so_snd.sb_mb;
3086 }
3087 if (sb_mb == NULL) {
3088 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3089 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3090 goto out;
3091 }
3092
3093 if (sb_mb == mpte->mpte_reinjectq) {
3094 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3095 off = 0;
3096
3097 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3098 if (mptcp_can_send_more(mp_tp, TRUE)) {
3099 goto dont_reinject;
3100 }
3101
3102 error = ECANCELED;
3103 goto out;
3104 }
3105
3106 reinjected = TRUE;
3107 } else if (flags & MPTCP_SUBOUT_PROBING) {
3108 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3109 off = 0;
3110 } else {
3111 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3112
3113 /*
3114 * With TFO, there might be no data at all, so we still go
3115 * through this code-path here.
3116 */
3117 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3118 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3119 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3120 sb_cc -= off;
3121 } else {
3122 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3123 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3124 (uint32_t)mp_tp->mpt_sndmax);
3125
3126 goto out;
3127 }
3128 }
3129
3130 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3131 if (sb_cc <= 0) {
3132 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u, sndnxt %u sndmax %u cwnd %u\n",
3133 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3134 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3135 mptcp_subflow_cwnd_space(so));
3136 }
3137
3138 sb_cc = min(sb_cc, UINT16_MAX);
3139
3140 /*
3141 * Create a DSN mapping for the data we are about to send. It all
3142 * has the same mapping.
3143 */
3144 if (reinjected) {
3145 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3146 } else {
3147 mpt_dsn = mp_tp->mpt_snduna + off;
3148 }
3149
3150 mpt_mbuf = sb_mb;
3151 while (mpt_mbuf && reinjected == FALSE &&
3152 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3153 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3154 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3155 mpt_mbuf = mpt_mbuf->m_next;
3156 }
3157 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3158 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
3159 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3160 mpts->mpts_probecnt),
3161 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3162 }
3163
3164 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3165
3166 head = tail = NULL;
3167
3168 while (tot_sent < sb_cc) {
3169 ssize_t mlen;
3170
3171 mlen = mpt_mbuf->m_len;
3172 mlen -= off;
3173 mlen = min(mlen, sb_cc - tot_sent);
3174
3175 if (mlen < 0) {
3176 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3177 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3178 (uint32_t)off, sb_cc, tot_sent);
3179 goto out;
3180 }
3181
3182 if (mlen == 0) {
3183 goto next;
3184 }
3185
3186 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3187 M_COPYM_MUST_COPY_HDR);
3188 if (m == NULL) {
3189 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3190 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3191 error = ENOBUFS;
3192 break;
3193 }
3194
3195 /* Create a DSN mapping for the data (m_copym does it) */
3196 VERIFY(m->m_flags & M_PKTHDR);
3197 VERIFY(m->m_next == NULL);
3198
3199 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3200 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3201 m->m_pkthdr.mp_dsn = mpt_dsn;
3202 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3203 m->m_pkthdr.len = mlen;
3204
3205 if (head == NULL) {
3206 head = tail = m;
3207 } else {
3208 tail->m_next = m;
3209 tail = m;
3210 }
3211
3212 tot_sent += mlen;
3213 off = 0;
3214 next:
3215 mpt_mbuf = mpt_mbuf->m_next;
3216 }
3217
3218 if (reinjected) {
3219 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3220 struct mbuf *n = sb_mb;
3221
3222 while (n) {
3223 n->m_pkthdr.mp_dsn += sb_cc;
3224 n->m_pkthdr.mp_rlen -= sb_cc;
3225 n = n->m_next;
3226 }
3227 m_adj(sb_mb, sb_cc);
3228 } else {
3229 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3230 m_freem(sb_mb);
3231 }
3232 }
3233
3234 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
3235 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
3236 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3237
3238 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3239 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3240 tot_sent);
3241 }
3242
3243 /* Now, let's update rel-seq and the data-level length */
3244 mpts->mpts_rel_seq += tot_sent;
3245 m = head;
3246 while (m) {
3247 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3248 m->m_pkthdr.mp_csum = dss_csum;
3249 }
3250 m->m_pkthdr.mp_rlen = tot_sent;
3251 m = m->m_next;
3252 }
3253
3254 if (head != NULL) {
3255 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3256 (tp->t_tfo_stats == 0)) {
3257 tp->t_mpflags |= TMPF_TFO_REQUEST;
3258 }
3259
3260 error = sock_sendmbuf(so, NULL, head, 0, NULL);
3261
3262 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
3263 struct sockbuf *, &so->so_rcv,
3264 struct sockbuf *, &so->so_snd,
3265 struct mptses *, mpte, struct mptsub *, mpts,
3266 size_t, tot_sent);
3267 }
3268
3269 done_sending:
3270 if (error == 0 ||
3271 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3272 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3273
3274 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3275 tcpstat.tcps_mp_num_probes++;
3276 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3277 mpts->mpts_probecnt += 1;
3278 } else {
3279 mpts->mpts_probecnt +=
3280 tot_sent / mpts->mpts_maxseg;
3281 }
3282 }
3283
3284 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3285 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3286 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3287 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3288 }
3289 mp_tp->mpt_sndnxt = new_sndnxt;
3290 }
3291
3292 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3293
3294 /* Must be here as mptcp_can_send_more() checks for this */
3295 soclearfastopen(mp_so);
3296
3297 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3298 (mpts->mpts_probesoon != 0)) {
3299 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
3300 __func__, mpts->mpts_connid,
3301 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
3302 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3303 (tcp_now - mpts->mpts_probesoon)),
3304 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3305 }
3306
3307 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3308 mptcp_set_cellicon(mpte, mpts);
3309
3310 mpte->mpte_used_cell = 1;
3311 } else {
3312 /*
3313 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3314 * explicitly set the cellicon, then we unset it again.
3315 */
3316 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3317 mptcp_unset_cellicon(mpte, NULL, 1);
3318 }
3319
3320 mpte->mpte_used_wifi = 1;
3321 }
3322
3323 /*
3324 * Don't propagate EWOULDBLOCK - it's already taken care of
3325 * in mptcp_usr_send for TFO.
3326 */
3327 error = 0;
3328 } else {
3329 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3330 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3331 }
3332 out:
3333
3334 if (wakeup) {
3335 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3336 }
3337
3338 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3339 return error;
3340
3341 zero_len_write:
3342 /* Opting to call pru_send as no mbuf at subflow level */
3343 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3344 NULL, current_proc());
3345
3346 goto done_sending;
3347 }
3348
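/*
 * Insert mbuf m into the reinject queue, which is kept ordered by
 * data-sequence number; segments that are entirely covered by what is
 * already queued get dropped.
 */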
3349 static void
3350 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3351 {
3352 struct mbuf *n, *prev = NULL;
3353
3354 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
3355 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3356 m->m_pkthdr.mp_rseq),
3357 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3358
3359 n = mpte->mpte_reinjectq;
3360
3361 /* First, look for an mbuf n whose data-sequence-number is greater
3362 * than or equal to m's sequence number.
3363 */
3364 while (n) {
3365 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3366 break;
3367 }
3368
3369 prev = n;
3370
3371 n = n->m_nextpkt;
3372 }
3373
3374 if (n) {
3375 /* m is already fully covered by the next mbuf in the queue */
3376 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3377 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3378 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
3379 __func__, n->m_pkthdr.mp_rlen),
3380 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3381 goto dont_queue;
3382 }
3383
3384 /* m is covering the next mbuf entirely, thus we remove this guy */
3385 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3386 struct mbuf *tmp = n->m_nextpkt;
3387
3388 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
3389 __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3390 n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
3391 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3392
3393 m->m_nextpkt = NULL;
3394 if (prev == NULL) {
3395 mpte->mpte_reinjectq = tmp;
3396 } else {
3397 prev->m_nextpkt = tmp;
3398 }
3399
3400 m_freem(n);
3401 n = tmp;
3402 }
3403 }
3404
3405 if (prev) {
3406 /* m is already fully covered by the previous mbuf in the queue */
3407 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3408 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
3409 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
3410 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3411 goto dont_queue;
3412 }
3413 }
3414
3415 if (prev == NULL) {
3416 mpte->mpte_reinjectq = m;
3417 } else {
3418 prev->m_nextpkt = m;
3419 }
3420
3421 m->m_nextpkt = n;
3422
3423 return;
3424
3425 dont_queue:
3426 m_freem(m);
3427 return;
3428 }
3429
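/*
 * Find the mbuf in the MPTCP-level send buffer whose mapping covers
 * the given data-sequence number; returns NULL if it is already gone.
 */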
3430 static struct mbuf *
3431 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3432 {
3433 struct socket *mp_so = mptetoso(mpte);
3434 struct mbuf *m;
3435
3436 m = mp_so->so_snd.sb_mb;
3437
3438 while (m) {
3439 /* If this segment covers what we are looking for, return it. */
3440 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3441 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3442 break;
3443 }
3444
3445
3446 /* Segment is no longer in the queue */
3447 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3448 return NULL;
3449 }
3450
3451 m = m->m_next;
3452 }
3453
3454 return m;
3455 }
3456
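/*
 * Copy len bytes worth of mbufs while preserving the MPTCP metadata
 * (DSN, relative sequence number and data-level length) in the packet
 * headers.
 */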
3457 static struct mbuf *
3458 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3459 {
3460 struct mbuf *top = NULL, *tail = NULL;
3461 uint64_t dsn;
3462 uint32_t dlen, rseq;
3463
3464 dsn = m->m_pkthdr.mp_dsn;
3465 dlen = m->m_pkthdr.mp_rlen;
3466 rseq = m->m_pkthdr.mp_rseq;
3467
3468 while (len > 0) {
3469 struct mbuf *n;
3470
3471 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3472
3473 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3474 if (n == NULL) {
3475 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3476 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3477 goto err;
3478 }
3479
3480 VERIFY(n->m_flags & M_PKTHDR);
3481 VERIFY(n->m_next == NULL);
3482 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3483 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3484 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3485 VERIFY(n->m_len == m->m_len);
3486
3487 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3488
3489 if (top == NULL) {
3490 top = n;
3491 }
3492
3493 if (tail != NULL) {
3494 tail->m_next = n;
3495 }
3496
3497 tail = n;
3498
3499 len -= m->m_len;
3500 m = m->m_next;
3501 }
3502
3503 return top;
3504
3505 err:
3506 if (top) {
3507 m_freem(top);
3508 }
3509
3510 return NULL;
3511 }
3512
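/*
 * Walk the subflow's send buffer and queue everything that has not yet
 * been acknowledged at the data-level on the reinject queue, so it can
 * be retransmitted over another subflow.
 */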
3513 static void
3514 mptcp_reinject_mbufs(struct socket *so)
3515 {
3516 struct tcpcb *tp = sototcpcb(so);
3517 struct mptsub *mpts = tp->t_mpsub;
3518 struct mptcb *mp_tp = tptomptp(tp);
3519 struct mptses *mpte = mp_tp->mpt_mpte;
3520 struct sockbuf *sb = &so->so_snd;
3521 struct mbuf *m;
3522
3523 m = sb->sb_mb;
3524 while (m) {
3525 struct mbuf *n = m->m_next, *orig = m;
3526
3527 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3528 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3529 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3530 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3531
3532 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3533
3534 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
3535 goto next;
3536 }
3537
3538 /* Has it all already been acknowledged at the data-level? */
3539 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
3540 goto next;
3541 }
3542
3543 /* Part of this has already been acknowledged - lookup in the
3544 * MPTCP-socket for the segment.
3545 */
3546 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3547 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3548 if (m == NULL) {
3549 goto next;
3550 }
3551 }
3552
3553 /* Copy the mbuf with headers (aka, DSN-numbers) */
3554 m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
3555 if (m == NULL) {
3556 break;
3557 }
3558
3559 VERIFY(m->m_nextpkt == NULL);
3560
3561 /* Now, add to the reinject-queue, eliminating overlapping
3562 * segments
3563 */
3564 mptcp_add_reinjectq(mpte, m);
3565
3566 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3567
3568 next:
3569 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3570 while (n) {
3571 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3572
3573 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
3574 break;
3575 }
3576
3577 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3578 n = n->m_next;
3579 }
3580
3581 m = n;
3582 }
3583 }
3584
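/*
 * Drop segments at the front of the reinject queue that have been
 * fully acknowledged at the data-level.
 */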
3585 void
3586 mptcp_clean_reinjectq(struct mptses *mpte)
3587 {
3588 struct mptcb *mp_tp = mpte->mpte_mptcb;
3589
3590 socket_lock_assert_owned(mptetoso(mpte));
3591
3592 while (mpte->mpte_reinjectq) {
3593 struct mbuf *m = mpte->mpte_reinjectq;
3594
3595 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3596 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3597 break;
3598 }
3599
3600 mpte->mpte_reinjectq = m->m_nextpkt;
3601 m->m_nextpkt = NULL;
3602 m_freem(m);
3603 }
3604 }
3605
3606 /*
3607 * Subflow socket control event upcall.
3608 */
3609 static void
3610 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
3611 {
3612 #pragma unused(so)
3613 struct mptsub *mpts = arg;
3614 struct mptses *mpte = mpts->mpts_mpte;
3615
3616 socket_lock_assert_owned(mptetoso(mpte));
3617
3618 if ((mpts->mpts_evctl & events) == events) {
3619 return;
3620 }
3621
3622 mpts->mpts_evctl |= events;
3623
3624 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3625 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3626 return;
3627 }
3628
3629 mptcp_subflow_workloop(mpte);
3630 }
3631
3632 /*
3633 * Subflow socket control events.
3634 *
3635 * Called for handling events related to the underlying subflow socket.
3636 */
3637 static ev_ret_t
3638 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3639 uint64_t *p_mpsofilt_hint)
3640 {
3641 ev_ret_t ret = MPTS_EVRET_OK;
3642 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3643 sizeof(mpsub_ev_entry_tbl[0]);
3644
3645 /* bail if there's nothing to process */
3646 if (!mpts->mpts_evctl) {
3647 return ret;
3648 }
3649
3650 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
3651 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
3652 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
3653 SO_FILT_HINT_DISCONNECTED)) {
3654 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3655 }
3656
3657 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3658 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3659
3660 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3661 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3662 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3663
3664 /*
3665 * Process all the socket filter hints and reset the hint
3666 * once it is handled
3667 */
3668 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3669 /*
3670 * Always execute the DISCONNECTED event, because it will wakeup
3671 * the app.
3672 */
3673 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3674 (ret >= MPTS_EVRET_OK ||
3675 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3676 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3677 ev_ret_t error =
3678 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3679 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3680 }
3681 }
3682
3683 /*
3684 * We should be getting only events specified via sock_catchevents(),
3685 * so loudly complain if we have any unprocessed one(s).
3686 */
3687 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK) {
3688 mptcplog((LOG_WARNING, "%s%s: cid %d evret %d unhandled events=%b\n", __func__,
3689 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3690 mpts->mpts_connid,
3691 ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3692 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3693 } else {
3694 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3695 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3696 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3697 }
3698
3699 return ret;
3700 }
3701
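/*
 * Propagate a subflow event up to the MPTCP socket, if the state of
 * the MPTCP connection makes the subflow's fate relevant to the app.
 */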
3702 static ev_ret_t
3703 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3704 uint64_t *p_mpsofilt_hint, uint64_t event)
3705 {
3706 struct socket *mp_so, *so;
3707 struct mptcb *mp_tp;
3708
3709 mp_so = mptetoso(mpte);
3710 mp_tp = mpte->mpte_mptcb;
3711 so = mpts->mpts_socket;
3712
3713 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3714 mpts->mpts_connid, event),
3715 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3716
3717 /*
3718 * We got an event for this subflow that might need to be propagated,
3719 * based on the state of the MPTCP connection.
3720 */
3721 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3722 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3723 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3724 mp_so->so_error = so->so_error;
3725 *p_mpsofilt_hint |= event;
3726 }
3727
3728 return MPTS_EVRET_OK;
3729 }
3730
3731 /*
3732 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3733 */
3734 static ev_ret_t
3735 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3736 uint64_t *p_mpsofilt_hint, uint64_t event)
3737 {
3738 #pragma unused(p_mpsofilt_hint, event)
3739 struct socket *mp_so;
3740 struct tcpcb *tp;
3741
3742 mp_so = mptetoso(mpte);
3743 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3744
3745 /*
3746 * This overwrites any previous mpte_lost_aid, to avoid storing
3747 * too much state; the typical case has only two subflows.
3748 */
3749 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3750 mpte->mpte_lost_aid = tp->t_local_aid;
3751
3752 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3753 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3754
3755 /*
3756 * The subflow connection has lost its source address.
3757 */
3758 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3759
3760 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3761 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3762 }
3763
3764 return MPTS_EVRET_DELETE;
3765 }
3766
3767 static ev_ret_t
3768 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3769 uint64_t *p_mpsofilt_hint, uint64_t event)
3770 {
3771 #pragma unused(event, p_mpsofilt_hint)
3772 struct socket *so, *mp_so;
3773
3774 so = mpts->mpts_socket;
3775
3776 if (so->so_error != ENODATA) {
3777 return MPTS_EVRET_OK;
3778 }
3779
3780
3781 mp_so = mptetoso(mpte);
3782
3783 mp_so->so_error = ENODATA;
3784
3785 sorwakeup(mp_so);
3786 sowwakeup(mp_so);
3787
3788 return MPTS_EVRET_OK;
3789 }
3790
3791
3792 /*
3793 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3794 * indicates that the remote side sent a Data FIN
3795 */
3796 static ev_ret_t
3797 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3798 uint64_t *p_mpsofilt_hint, uint64_t event)
3799 {
3800 #pragma unused(event)
3801 struct mptcb *mp_tp = mpte->mpte_mptcb;
3802
3803 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3804 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3805
3806 /*
3807 * We got a Data FIN for the MPTCP connection.
3808 * The FIN may arrive with data. The data is handed up to the
3809 * mptcp socket and the user is notified so that it may close
3810 * the socket if needed.
3811 */
3812 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3813 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3814 }
3815
3816 return MPTS_EVRET_OK; /* keep the subflow socket around */
3817 }
3818
3819 /*
3820 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3821 */
3822 static ev_ret_t
3823 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3824 uint64_t *p_mpsofilt_hint, uint64_t event)
3825 {
3826 #pragma unused(event, p_mpsofilt_hint)
3827 struct mptsub *mpts_alt = NULL;
3828 struct socket *alt_so = NULL;
3829 struct socket *mp_so;
3830 int altpath_exists = 0;
3831
3832 mp_so = mptetoso(mpte);
3833 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3834
3835 mptcp_reinject_mbufs(mpts->mpts_socket);
3836
3837 mpts_alt = mptcp_get_subflow(mpte, NULL);
3838
3839 /* If there is no alternate eligible subflow, ignore the failover hint. */
3840 if (mpts_alt == NULL || mpts_alt == mpts) {
3841 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3842 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3843
3844 goto done;
3845 }
3846
3847 altpath_exists = 1;
3848 alt_so = mpts_alt->mpts_socket;
3849 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3850 /* All data acknowledged and no RTT spike */
3851 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3852 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3853 } else {
3854 /* no alternate path available */
3855 altpath_exists = 0;
3856 }
3857 }
3858
3859 if (altpath_exists) {
3860 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3861
3862 mpte->mpte_active_sub = mpts_alt;
3863 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3864 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3865
3866 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3867 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3868
3869 mptcpstats_inc_switch(mpte, mpts);
3870
3871 sowwakeup(alt_so);
3872 } else {
3873 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3874 mpts->mpts_connid),
3875 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3876 done:
3877 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3878 }
3879
3880 return MPTS_EVRET_OK;
3881 }
3882
3883 /*
3884 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3885 */
3886 static ev_ret_t
3887 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3888 uint64_t *p_mpsofilt_hint, uint64_t event)
3889 {
3890 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3891 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3892
3893 /*
3894 * The subflow connection cannot use the outgoing interface, so
3895 * let's close this subflow.
3896 */
3897 mptcp_subflow_abort(mpts, EPERM);
3898
3899 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3900
3901 return MPTS_EVRET_DELETE;
3902 }
3903
3904 /*
3905 * https://tools.ietf.org/html/rfc6052#section-2
3906 * https://tools.ietf.org/html/rfc6147#section-5.2
3907 */
3908 static boolean_t
3909 mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3910 const struct ipv6_prefix *prefix,
3911 struct in_addr *addrv4)
3912 {
3913 char buf[MAX_IPv4_STR_LEN];
3914 char *ptrv4 = (char *)addrv4;
3915 const char *ptr = (const char *)addr;
3916
3917 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3918 return false;
3919 }
3920
3921 switch (prefix->prefix_len) {
3922 case NAT64_PREFIX_LEN_96:
3923 memcpy(ptrv4, ptr + 12, 4);
3924 break;
3925 case NAT64_PREFIX_LEN_64:
3926 memcpy(ptrv4, ptr + 9, 4);
3927 break;
3928 case NAT64_PREFIX_LEN_56:
3929 memcpy(ptrv4, ptr + 7, 1);
3930 memcpy(ptrv4 + 1, ptr + 9, 3);
3931 break;
3932 case NAT64_PREFIX_LEN_48:
3933 memcpy(ptrv4, ptr + 6, 2);
3934 memcpy(ptrv4 + 2, ptr + 9, 2);
3935 break;
3936 case NAT64_PREFIX_LEN_40:
3937 memcpy(ptrv4, ptr + 5, 3);
3938 memcpy(ptrv4 + 3, ptr + 9, 1);
3939 break;
3940 case NAT64_PREFIX_LEN_32:
3941 memcpy(ptrv4, ptr + 4, 4);
3942 break;
3943 default:
3944 panic("NAT64-prefix len is wrong: %u\n",
3945 prefix->prefix_len);
3946 }
3947
3948 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3949 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3950
3951 return true;
3952 }
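/*
 * The memcpy offsets above follow the RFC 6052 section 2.2 address
 * format: the embedded IPv4 address starts right after the NAT64
 * prefix, and octet 8 (bits 64-71, the reserved "u" octet) is always
 * skipped, which is why the copies resume at ptr + 9:
 *
 *	/32: v4 in octets 4..7		/40: octets 5..7 and 9
 *	/48: octets 6..7 and 9..10	/56: octet 7 and 9..11
 *	/64: octets 9..12		/96: octets 12..15
 */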
3953
3954 static void
3955 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3956 {
3957 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3958 struct socket *so = mpts->mpts_socket;
3959 struct ifnet *ifp;
3960 int j;
3961
3962 /* Subflow IPs will be steered directly by the server - no need to
3963 * desynthesize.
3964 */
3965 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3966 return;
3967 }
3968
3969 ifp = sotoinpcb(so)->inp_last_outifp;
3970
3971 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3972 mptcp_ask_for_nat64(ifp);
3973 return;
3974 }
3975
3976
3977 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3978 int success;
3979
3980 if (nat64prefixes[j].prefix_len == 0) {
3981 continue;
3982 }
3983
3984 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3985 &nat64prefixes[j],
3986 &mpte->mpte_dst_v4_nat64.sin_addr);
3987 if (success) {
3988 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3989 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3990 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3991 break;
3992 }
3993 }
3994 }
3995
3996 /*
3997 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3998 */
3999 static ev_ret_t
4000 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
4001 uint64_t *p_mpsofilt_hint, uint64_t event)
4002 {
4003 #pragma unused(event, p_mpsofilt_hint)
4004 struct socket *mp_so, *so;
4005 struct inpcb *inp;
4006 struct tcpcb *tp;
4007 struct mptcb *mp_tp;
4008 int af;
4009 boolean_t mpok = FALSE;
4010
4011 mp_so = mptetoso(mpte);
4012 mp_tp = mpte->mpte_mptcb;
4013 so = mpts->mpts_socket;
4014 tp = sototcpcb(so);
4015 af = mpts->mpts_dst.sa_family;
4016
4017 if (mpts->mpts_flags & MPTSF_CONNECTED) {
4018 return MPTS_EVRET_OK;
4019 }
4020
4021 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4022 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4023 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
4024 (so->so_state & SS_ISCONNECTED)) {
4025 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
4026 __func__, mpts->mpts_connid),
4027 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4028 (void) soshutdownlock(so, SHUT_RD);
4029 (void) soshutdownlock(so, SHUT_WR);
4030 (void) sodisconnectlocked(so);
4031 }
4032 return MPTS_EVRET_OK;
4033 }
4034
4035 /*
4036 * The subflow connection has been connected. Find out whether it
4037 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
4038 *
4039 * a. If MPTCP connection is not yet established, then this must be
4040 * the first subflow connection. If MPTCP failed to negotiate,
4041 * fallback to regular TCP by degrading this subflow.
4042 *
4043 * b. If MPTCP connection has been established, then this must be
4044 * one of the subsequent subflow connections. If MPTCP failed
4045 * to negotiate, disconnect the connection.
4046 *
4047 * Right now, we simply unblock any waiters at the MPTCP socket layer
4048 * if the MPTCP connection has not been established.
4049 */
4050
4051 if (so->so_state & SS_ISDISCONNECTED) {
4052 /*
4053 * With MPTCP joins, a connection is connected at the subflow
4054 * level, but the 4th ACK from the server elevates the MPTCP
4055 * subflow to connected state. So there is a small window
4056 * where the subflow could get disconnected before the
4057 * connected event is processed.
4058 */
4059 return MPTS_EVRET_OK;
4060 }
4061
4062 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4063 mptcp_drop_tfo_data(mpte, mpts);
4064 }
4065
4066 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4067 mpts->mpts_flags |= MPTSF_CONNECTED;
4068
4069 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4070 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4071 }
4072
4073 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4074
4075 /* get/verify the outbound interface */
4076 inp = sotoinpcb(so);
4077
4078 mpts->mpts_maxseg = tp->t_maxseg;
4079
4080 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
4081 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
4082 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
4083 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
4084
4085 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4086
4087 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4088 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4089 mpte->mpte_associd = mpts->mpts_connid;
4090 DTRACE_MPTCP2(state__change,
4091 struct mptcb *, mp_tp,
4092 uint32_t, 0 /* event */);
4093
4094 if (SOCK_DOM(so) == AF_INET) {
4095 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4096 } else {
4097 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4098 }
4099
4100 mpts->mpts_flags |= MPTSF_ACTIVE;
4101
4102 /* case (a) above */
4103 if (!mpok) {
4104 tcpstat.tcps_mpcap_fallback++;
4105
4106 tp->t_mpflags |= TMPF_INFIN_SENT;
4107 mptcp_notify_mpfail(so);
4108 } else {
4109 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4110 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4111 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4112 } else {
4113 mpts->mpts_flags |= MPTSF_PREFERRED;
4114 }
4115 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4116 mpte->mpte_nummpcapflows++;
4117
4118 if (SOCK_DOM(so) == AF_INET6) {
4119 mptcp_handle_ipv6_connection(mpte, mpts);
4120 }
4121
4122 mptcp_check_subflows_and_add(mpte);
4123
4124 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4125 mpte->mpte_initial_cell = 1;
4126 }
4127
4128 mpte->mpte_handshake_success = 1;
4129 }
4130
4131 mp_tp->mpt_sndwnd = tp->snd_wnd;
4132 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4133 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4134 soisconnected(mp_so);
4135 } else if (mpok) {
4136 /*
4137 * case (b) above
4138 * In case of additional flows, the MPTCP socket is not
4139 * marked MPTSF_MP_CAPABLE until the server's ACK completes
4140 * the 3-way handshake. TCP has already guaranteed that this
4141 * is an MPTCP subflow.
4142 */
4143 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4144 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4145 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4146 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4147 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4148 } else {
4149 mpts->mpts_flags |= MPTSF_PREFERRED;
4150 }
4151
4152 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4153 mpte->mpte_nummpcapflows++;
4154
4155 mpts->mpts_rel_seq = 1;
4156
4157 mptcp_check_subflows_and_remove(mpte);
4158 } else {
4159 unsigned int i;
4160
4161 /* Should we try the alternate port? */
4162 if (mpte->mpte_alternate_port &&
4163 inp->inp_fport != mpte->mpte_alternate_port) {
4164 union sockaddr_in_4_6 dst;
4165 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
4166
4167 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
4168
4169 dst_in->sin_port = mpte->mpte_alternate_port;
4170
4171 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
4172 mpts->mpts_ifscope, NULL);
4173 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
4174 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
4175 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
4176
4177 if (inp->inp_last_outifp->if_index == info->ifindex) {
4178 info->no_mptcp_support = 1;
4179 break;
4180 }
4181 }
4182 }
4183
4184 tcpstat.tcps_join_fallback++;
4185 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4186 tcpstat.tcps_mptcp_cell_proxy++;
4187 } else {
4188 tcpstat.tcps_mptcp_wifi_proxy++;
4189 }
4190
4191 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4192
4193 return MPTS_EVRET_OK;
4194 }
4195
4196 /* This call just "books" an entry in the stats-table for this ifindex */
4197 mptcpstats_get_index(mpte->mpte_itfstats, mpts);
4198
4199 mptcp_output(mpte);
4200
4201 return MPTS_EVRET_OK; /* keep the subflow socket around */
4202 }
4203
4204 /*
4205 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4206 */
4207 static ev_ret_t
4208 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4209 uint64_t *p_mpsofilt_hint, uint64_t event)
4210 {
4211 #pragma unused(event, p_mpsofilt_hint)
4212 struct socket *mp_so, *so;
4213 struct mptcb *mp_tp;
4214
4215 mp_so = mptetoso(mpte);
4216 mp_tp = mpte->mpte_mptcb;
4217 so = mpts->mpts_socket;
4218
4219 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
4220 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
4221 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
4222 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
4223 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4224
4225 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4226 return MPTS_EVRET_DELETE;
4227 }
4228
4229 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4230
4231 /* The subflow connection has been disconnected. */
4232
4233 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4234 mpte->mpte_nummpcapflows--;
4235 if (mpte->mpte_active_sub == mpts) {
4236 mpte->mpte_active_sub = NULL;
4237 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
4238 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4239 }
4240 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4241 }
4242
4243 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4244 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4245 mptcp_drop(mpte, mp_tp, so->so_error);
4246 }
4247
4248 /*
4249 * Clear flags that are used by getconninfo to return state.
4250 * Retain like MPTSF_DELETEOK for internal purposes.
4251 */
4252 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4253 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4254 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4255
4256 return MPTS_EVRET_DELETE;
4257 }
4258
4259 /*
4260 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4261 */
4262 static ev_ret_t
4263 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4264 uint64_t *p_mpsofilt_hint, uint64_t event)
4265 {
4266 #pragma unused(event, p_mpsofilt_hint)
4267 ev_ret_t ret = MPTS_EVRET_OK;
4268 struct socket *mp_so, *so;
4269 struct mptcb *mp_tp;
4270
4271 mp_so = mptetoso(mpte);
4272 mp_tp = mpte->mpte_mptcb;
4273 so = mpts->mpts_socket;
4274
4275 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4276 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4277 } else {
4278 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4279 }
4280
4281 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4282 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4283 goto done;
4284 }
4285 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4286 } else {
4287 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4288 }
4289
4290 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4291 mpts->mpts_flags |= MPTSF_MP_READY;
4292 } else {
4293 mpts->mpts_flags &= ~MPTSF_MP_READY;
4294 }
4295
4296 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4297 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4298 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4299 }
4300
4301 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4302 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4303
4304 m_freem_list(mpte->mpte_reinjectq);
4305 mpte->mpte_reinjectq = NULL;
4306 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4307 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4308 ret = MPTS_EVRET_CONNECT_PENDING;
4309 }
4310
4311 done:
4312 return ret;
4313 }
4314
4315 /*
4316 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4317 */
4318 static ev_ret_t
4319 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4320 uint64_t *p_mpsofilt_hint, uint64_t event)
4321 {
4322 #pragma unused(event)
4323 struct socket *mp_so, *so;
4324 struct mptcb *mp_tp;
4325 boolean_t is_fastclose;
4326
4327 mp_so = mptetoso(mpte);
4328 mp_tp = mpte->mpte_mptcb;
4329 so = mpts->mpts_socket;
4330
4331 /* We got an invalid option or a fast close */
4332 struct tcptemp *t_template;
4333 struct inpcb *inp = sotoinpcb(so);
4334 struct tcpcb *tp = NULL;
4335
4336 tp = intotcpcb(inp);
4337 so->so_error = ECONNABORTED;
4338
4339 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4340
4341 tp->t_mpflags |= TMPF_RESET;
4342
4343 t_template = tcp_maketemplate(tp);
4344 if (t_template) {
4345 struct tcp_respond_args tra;
4346
4347 bzero(&tra, sizeof(tra));
4348 if (inp->inp_flags & INP_BOUND_IF) {
4349 tra.ifscope = inp->inp_boundifp->if_index;
4350 } else {
4351 tra.ifscope = IFSCOPE_NONE;
4352 }
4353 tra.awdl_unrestricted = 1;
4354
4355 tcp_respond(tp, t_template->tt_ipgen,
4356 &t_template->tt_t, (struct mbuf *)NULL,
4357 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4358 (void) m_free(dtom(t_template));
4359 }
4360
4361 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4362 struct mptsub *iter, *tmp;
4363
4364 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4365
4366 mp_so->so_error = ECONNRESET;
4367
4368 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4369 if (iter == mpts) {
4370 continue;
4371 }
4372 mptcp_subflow_abort(iter, ECONNABORTED);
4373 }
4374
4375 /*
4376 * mptcp_drop is being called after processing the events, to fully
4377 * close the MPTCP connection
4378 */
4379 mptcp_drop(mpte, mp_tp, mp_so->so_error);
4380 }
4381
4382 mptcp_subflow_abort(mpts, ECONNABORTED);
4383
4384
4385 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4386 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4387 }
4388
4389 return MPTS_EVRET_DELETE;
4390 }
4391
4392 static ev_ret_t
4393 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4394 uint64_t *p_mpsofilt_hint, uint64_t event)
4395 {
4396 #pragma unused(event)
4397 bool found_active = false;
4398
4399 mpts->mpts_flags |= MPTSF_READ_STALL;
4400
4401 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4402 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4403
4404 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4405 TCPS_HAVERCVDFIN2(tp->t_state)) {
4406 continue;
4407 }
4408
4409 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4410 found_active = true;
4411 break;
4412 }
4413 }
4414
4415 if (!found_active) {
4416 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4417 }
4418
4419 return MPTS_EVRET_OK;
4420 }
4421
4422 static ev_ret_t
4423 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4424 uint64_t *p_mpsofilt_hint, uint64_t event)
4425 {
4426 #pragma unused(event)
4427 bool found_active = false;
4428
4429 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4430
4431 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4432 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4433
4434 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4435 tp->t_state > TCPS_CLOSE_WAIT) {
4436 continue;
4437 }
4438
4439 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4440 found_active = true;
4441 break;
4442 }
4443 }
4444
4445 if (!found_active) {
4446 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4447 }
4448
4449 return MPTS_EVRET_OK;
4450 }
4451
4452 /*
4453 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4454 * caller must ensure that the option can be issued on subflow sockets, via
4455 * MPOF_SUBFLOW_OK flag.
4456 */
4457 int
4458 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4459 {
4460 struct socket *mp_so, *so;
4461 struct sockopt sopt;
4462 int error;
4463
4464 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4465
4466 mp_so = mptetoso(mpte);
4467 so = mpts->mpts_socket;
4468
4469 socket_lock_assert_owned(mp_so);
4470
4471 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4472 mpo->mpo_level == SOL_SOCKET &&
4473 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4474 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4475
4476 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
4477 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
4478 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
4479 mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
4480 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4481
4482 /*
4483 * When we open a new subflow, mark it as cell fallback if
4484 * this subflow goes over cell.
4485 *
4486 * (except for first-party apps)
4487 */
4488
4489 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4490 return 0;
4491 }
4492
4493 if (sotoinpcb(so)->inp_last_outifp &&
4494 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4495 return 0;
4496 }
4497
4498 /*
4499 * These conditions are OR'd together, because if the app is not
4500 * binding to the interface, then this definitely is not a
4501 * cell-fallback connection.
4502 */
4503 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4504 !IFNET_IS_CELLULAR(ifp)) {
4505 return 0;
4506 }
4507 }
4508
4509 mpo->mpo_flags &= ~MPOF_INTERIM;
4510
4511 bzero(&sopt, sizeof(sopt));
4512 sopt.sopt_dir = SOPT_SET;
4513 sopt.sopt_level = mpo->mpo_level;
4514 sopt.sopt_name = mpo->mpo_name;
4515 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4516 sopt.sopt_valsize = sizeof(int);
4517 sopt.sopt_p = kernproc;
4518
4519 error = sosetoptlock(so, &sopt, 0);
4520 if (error) {
4521 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4522 "val %d set error %d\n", __func__,
4523 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4524 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4525 mpo->mpo_intval, error);
4526 }
4527 return error;
4528 }
4529
4530 /*
4531 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4532 * caller must ensure that the option can be issued on subflow sockets, via
4533 * MPOF_SUBFLOW_OK flag.
4534 */
4535 int
4536 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4537 struct mptopt *mpo)
4538 {
4539 struct socket *mp_so;
4540 struct sockopt sopt;
4541 int error;
4542
4543 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4544 mp_so = mptetoso(mpte);
4545
4546 socket_lock_assert_owned(mp_so);
4547
4548 bzero(&sopt, sizeof(sopt));
4549 sopt.sopt_dir = SOPT_GET;
4550 sopt.sopt_level = mpo->mpo_level;
4551 sopt.sopt_name = mpo->mpo_name;
4552 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4553 sopt.sopt_valsize = sizeof(int);
4554 sopt.sopt_p = kernproc;
4555
4556 error = sogetoptlock(so, &sopt, 0); /* already locked */
4557 if (error) {
4558 os_log_error(mptcp_log_handle,
4559 "%s - %lx: sopt %s get error %d\n",
4560 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4561 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4562 }
4563 return error;
4564 }
4565
4566
4567 /*
4568 * MPTCP garbage collector.
4569 *
4570 * This routine is called by the MP domain's on-demand periodic callout,
4571 * which is triggered when an MPTCP socket is closed. The callout will
4572 * repeat as long as this routine returns a non-zero value.
4573 */
4574 static uint32_t
4575 mptcp_gc(struct mppcbinfo *mppi)
4576 {
4577 struct mppcb *mpp, *tmpp;
4578 uint32_t active = 0;
4579
4580 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4581
4582 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4583 struct socket *mp_so;
4584 struct mptses *mpte;
4585 struct mptcb *mp_tp;
4586
4587 mp_so = mpp->mpp_socket;
4588 mpte = mptompte(mpp);
4589 mp_tp = mpte->mpte_mptcb;
4590
4591 if (!mpp_try_lock(mpp)) {
4592 active++;
4593 continue;
4594 }
4595
4596 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4597
4598 /* check again under the lock */
4599 if (mp_so->so_usecount > 0) {
4600 boolean_t wakeup = FALSE;
4601 struct mptsub *mpts, *tmpts;
4602
4603 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4604 if (mp_tp->mpt_gc_ticks > 0) {
4605 mp_tp->mpt_gc_ticks--;
4606 }
4607 if (mp_tp->mpt_gc_ticks == 0) {
4608 wakeup = TRUE;
4609 }
4610 }
4611 if (wakeup) {
4612 TAILQ_FOREACH_SAFE(mpts,
4613 &mpte->mpte_subflows, mpts_entry, tmpts) {
4614 mptcp_subflow_eupcall1(mpts->mpts_socket,
4615 mpts, SO_FILT_HINT_DISCONNECTED);
4616 }
4617 }
4618 socket_unlock(mp_so, 0);
4619 active++;
4620 continue;
4621 }
4622
4623 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4624 panic("%s - %lx: skipped state "
4625 "[u=%d,r=%d,s=%d]\n", __func__,
4626 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4627 mp_so->so_usecount, mp_so->so_retaincnt,
4628 mpp->mpp_state);
4629 }
4630
4631 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4632 mptcp_close(mpte, mp_tp);
4633 }
4634
4635 mptcp_session_destroy(mpte);
4636
4637 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4638 struct sockbuf *, &mp_so->so_rcv,
4639 struct sockbuf *, &mp_so->so_snd,
4640 struct mppcb *, mpp);
4641
4642 mp_pcbdispose(mpp);
4643 sodealloc(mp_so);
4644 }
4645
4646 return active;
4647 }
4648
4649 /*
4650 * Drop an MPTCP connection, reporting the specified error.
4651 */
4652 struct mptses *
4653 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4654 {
4655 struct socket *mp_so = mptetoso(mpte);
4656
4657 VERIFY(mpte->mpte_mptcb == mp_tp);
4658
4659 socket_lock_assert_owned(mp_so);
4660
4661 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4662 uint32_t, 0 /* event */);
4663
4664 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4665 errno = mp_tp->mpt_softerror;
4666 }
4667 mp_so->so_error = errno;
4668
4669 return mptcp_close(mpte, mp_tp);
4670 }
4671
4672 /*
4673 * Close an MPTCP control block.
4674 */
4675 struct mptses *
4676 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4677 {
4678 struct mptsub *mpts = NULL, *tmpts = NULL;
4679 struct socket *mp_so = mptetoso(mpte);
4680
4681 socket_lock_assert_owned(mp_so);
4682 VERIFY(mpte->mpte_mptcb == mp_tp);
4683
4684 mp_tp->mpt_state = MPTCPS_TERMINATE;
4685
4686 mptcp_freeq(mp_tp);
4687
4688 soisdisconnected(mp_so);
4689
4690 /* Clean up all subflows */
4691 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4692 mptcp_subflow_disconnect(mpte, mpts);
4693 }
4694
4695 return NULL;
4696 }
4697
4698 void
4699 mptcp_notify_close(struct socket *so)
4700 {
4701 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4702 }
4703
4704 /*
4705 * MPTCP workloop.
4706 */
4707 void
4708 mptcp_subflow_workloop(struct mptses *mpte)
4709 {
4710 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4711 uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4712 struct mptsub *mpts, *tmpts;
4713 struct socket *mp_so;
4714
4715 mp_so = mptetoso(mpte);
4716
4717 socket_lock_assert_owned(mp_so);
4718
4719 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4720 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4721 return;
4722 }
4723 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4724
4725 relaunch:
4726 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4727
4728 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4729 ev_ret_t ret;
4730
4731 if (mpts->mpts_socket->so_usecount == 0) {
4732 /* Will be removed soon by tcp_garbage_collect */
4733 continue;
4734 }
4735
4736 mptcp_subflow_addref(mpts);
4737 mpts->mpts_socket->so_usecount++;
4738
4739 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4740
4741 /*
4742 * If MPTCP socket is closed, disconnect all subflows.
4743 * This will generate a disconnect event which will
4744 * be handled during the next iteration, causing a
4745 * non-zero error to be returned above.
4746 */
4747 if (mp_so->so_flags & SOF_PCBCLEARING) {
4748 mptcp_subflow_disconnect(mpte, mpts);
4749 }
4750
4751 switch (ret) {
4752 case MPTS_EVRET_OK:
4753 /* nothing to do */
4754 break;
4755 case MPTS_EVRET_DELETE:
4756 mptcp_subflow_soclose(mpts);
4757 break;
4758 case MPTS_EVRET_CONNECT_PENDING:
4759 connect_pending = TRUE;
4760 break;
4761 case MPTS_EVRET_DISCONNECT_FALLBACK:
4762 disconnect_fallback = TRUE;
4763 break;
4764 default:
4765 mptcplog((LOG_DEBUG,
4766 "MPTCP Socket: %s: mptcp_subflow_events "
4767 "returned invalid value: %d\n", __func__,
4768 ret),
4769 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4770 break;
4771 }
4772 mptcp_subflow_remref(mpts); /* ours */
4773
4774 VERIFY(mpts->mpts_socket->so_usecount != 0);
4775 mpts->mpts_socket->so_usecount--;
4776 }
4777
4778 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4779 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4780
4781 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4782 mp_so->so_state |= SS_CANTRCVMORE;
4783 sorwakeup(mp_so);
4784 }
4785
4786 soevent(mp_so, mpsofilt_hint_mask);
4787 }
4788
4789 if (!connect_pending && !disconnect_fallback) {
4790 goto exit;
4791 }
4792
4793 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4794 if (disconnect_fallback) {
4795 struct socket *so = NULL;
4796 struct inpcb *inp = NULL;
4797 struct tcpcb *tp = NULL;
4798
4799 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4800 continue;
4801 }
4802
4803 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4804
4805 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4806 MPTSF_DISCONNECTED | MPTSF_CONNECT_PENDING)) {
4807 continue;
4808 }
4809
4810 so = mpts->mpts_socket;
4811
4812 /*
4813 * The MPTCP connection has degraded to a fallback
4814 * mode, so there is no point in keeping this subflow
4815 * regardless of its MPTCP-readiness state, unless it
4816 * is the primary one which we use for fallback. This
4817 * assumes that the subflow used for fallback is the
4818 * ACTIVE one.
4819 */
4820
4821 inp = sotoinpcb(so);
4822 tp = intotcpcb(inp);
4823 tp->t_mpflags &=
4824 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4825 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4826
4827 soevent(so, SO_FILT_HINT_MUSTRST);
4828 } else if (connect_pending) {
4829 /*
4830 * The MPTCP connection has progressed to a state
4831 * where it supports full multipath semantics; allow
4832 * additional joins to be attempted for all subflows
4833 * that are in the PENDING state.
4834 */
4835 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4836 int error = mptcp_subflow_soconnectx(mpte, mpts);
4837
4838 if (error) {
4839 mptcp_subflow_abort(mpts, error);
4840 }
4841 }
4842 }
4843 }
4844
4845 exit:
4846 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4847 goto relaunch;
4848 }
4849
4850 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4851 }
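/*
 * The MPTE_IN_WORKLOOP/MPTE_WORKLOOP_RELAUNCH pair above makes the
 * workloop non-reentrant: a nested invocation (say, from an event
 * handler that raises a further event) only sets the relaunch flag
 * and returns, and the outer invocation then jumps back to relaunch:
 * until no new work has arrived. This bounds stack depth where naive
 * recursion would not.
 */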
4852
4853 /*
4854 * Protocol pr_lock callback.
4855 */
4856 int
4857 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4858 {
4859 struct mppcb *mpp = mpsotomppcb(mp_so);
4860 void *lr_saved;
4861
4862 if (lr == NULL) {
4863 lr_saved = __builtin_return_address(0);
4864 } else {
4865 lr_saved = lr;
4866 }
4867
4868 if (mpp == NULL) {
4869 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4870 mp_so, lr_saved, solockhistory_nr(mp_so));
4871 /* NOTREACHED */
4872 }
4873 mpp_lock(mpp);
4874
4875 if (mp_so->so_usecount < 0) {
4876 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4877 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4878 solockhistory_nr(mp_so));
4879 /* NOTREACHED */
4880 }
4881 if (refcount != 0) {
4882 mp_so->so_usecount++;
4883 mpp->mpp_inside++;
4884 }
4885 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4886 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4887
4888 return 0;
4889 }
4890
4891 /*
4892 * Protocol pr_unlock callback.
4893 */
4894 int
4895 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4896 {
4897 struct mppcb *mpp = mpsotomppcb(mp_so);
4898 void *lr_saved;
4899
4900 if (lr == NULL) {
4901 lr_saved = __builtin_return_address(0);
4902 } else {
4903 lr_saved = lr;
4904 }
4905
4906 if (mpp == NULL) {
4907 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4908 mp_so, mp_so->so_usecount, lr_saved,
4909 solockhistory_nr(mp_so));
4910 /* NOTREACHED */
4911 }
4912 socket_lock_assert_owned(mp_so);
4913
4914 if (refcount != 0) {
4915 mp_so->so_usecount--;
4916 mpp->mpp_inside--;
4917 }
4918
4919 if (mp_so->so_usecount < 0) {
4920 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4921 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4922 /* NOTREACHED */
4923 }
4924 if (mpp->mpp_inside < 0) {
4925 panic("%s: mpp=%p inside=%x lrh= %s\n", __func__,
4926 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
4927 /* NOTREACHED */
4928 }
4929 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4930 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4931 mpp_unlock(mpp);
4932
4933 return 0;
4934 }
4935
4936 /*
4937 * Protocol pr_getlock callback.
4938 */
4939 lck_mtx_t *
4940 mptcp_getlock(struct socket *mp_so, int flags)
4941 {
4942 struct mppcb *mpp = mpsotomppcb(mp_so);
4943
4944 if (mpp == NULL) {
4945 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4946 solockhistory_nr(mp_so));
4947 /* NOTREACHED */
4948 }
4949 if (mp_so->so_usecount < 0) {
4950 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4951 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4952 /* NOTREACHED */
4953 }
4954 return mpp_getlock(mpp, flags);
4955 }
4956
4957 /*
4958 * MPTCP Join support
4959 */
4960
4961 static void
4962 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
4963 {
4964 struct tcpcb *tp = sototcpcb(so);
4965 struct mptcp_subf_auth_entry *sauth_entry;
4966
4967 /*
4968 * The address ID of the first flow is implicitly 0.
4969 */
4970 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4971 tp->t_local_aid = 0;
4972 } else {
4973 tp->t_local_aid = addr_id;
4974 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4975 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4976 }
4977 sauth_entry = zalloc(mpt_subauth_zone);
4978 sauth_entry->msae_laddr_id = tp->t_local_aid;
4979 sauth_entry->msae_raddr_id = 0;
4980 sauth_entry->msae_raddr_rand = 0;
4981 try_again:
4982 sauth_entry->msae_laddr_rand = RandomULong();
4983 if (sauth_entry->msae_laddr_rand == 0) {
4984 goto try_again;
4985 }
4986 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4987 }
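/*
 * Per RFC 6824, the address ID of the initial subflow is implicitly 0,
 * which is why the MPTCPS_CLOSED case above forces t_local_aid to 0.
 * The try_again loop guarantees a nonzero msae_laddr_rand, presumably
 * because 0 would be indistinguishable from the "not yet set" state
 * that the random-number fields start out in (cf. the
 * msae_raddr_rand checks in mptcp_set_raddr_rand()).
 */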
4988
4989 static void
4990 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4991 {
4992 struct mptcp_subf_auth_entry *sauth_entry;
4993 struct tcpcb *tp = NULL;
4994 int found = 0;
4995
4996 tp = sototcpcb(so);
4997 if (tp == NULL) {
4998 return;
4999 }
5000
5001 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5002 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
5003 found = 1;
5004 break;
5005 }
5006 }
5007 if (found) {
5008 LIST_REMOVE(sauth_entry, msae_next);
5009 }
5010
5011 if (found) {
5012 zfree(mpt_subauth_zone, sauth_entry);
5013 }
5014 }
5015
5016 void
5017 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5018 u_int32_t *rrand)
5019 {
5020 struct mptcp_subf_auth_entry *sauth_entry;
5021
5022 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5023 if (sauth_entry->msae_laddr_id == addr_id) {
5024 if (lrand) {
5025 *lrand = sauth_entry->msae_laddr_rand;
5026 }
5027 if (rrand) {
5028 *rrand = sauth_entry->msae_raddr_rand;
5029 }
5030 break;
5031 }
5032 }
5033 }
5034
5035 void
5036 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5037 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5038 {
5039 struct mptcp_subf_auth_entry *sauth_entry;
5040
5041 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5042 if (sauth_entry->msae_laddr_id == laddr_id) {
5043 if ((sauth_entry->msae_raddr_id != 0) &&
5044 (sauth_entry->msae_raddr_id != raddr_id)) {
5045 os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5046 " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5047 raddr_id, sauth_entry->msae_raddr_id);
5048 return;
5049 }
5050 sauth_entry->msae_raddr_id = raddr_id;
5051 if ((sauth_entry->msae_raddr_rand != 0) &&
5052 (sauth_entry->msae_raddr_rand != raddr_rand)) {
5053 os_log_error(mptcp_log_handle, "%s - %lx: "
5054 "dup SYN_ACK %d %d \n",
5055 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5056 raddr_rand, sauth_entry->msae_raddr_rand);
5057 return;
5058 }
5059 sauth_entry->msae_raddr_rand = raddr_rand;
5060 return;
5061 }
5062 }
5063 }
5064
5065 /*
5066 * SHA1 support for MPTCP
5067 */
5068 static void
5069 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5070 {
5071 SHA1_CTX sha1ctxt;
5072 const unsigned char *sha1_base;
5073 int sha1_size;
5074
5075 sha1_base = (const unsigned char *) key;
5076 sha1_size = sizeof(mptcp_key_t);
5077 SHA1Init(&sha1ctxt);
5078 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5079 SHA1Final(sha_digest, &sha1ctxt);
5080 }
5081
5082 void
5083 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5084 u_int32_t rand1, u_int32_t rand2, u_char *digest)
5085 {
5086 SHA1_CTX sha1ctxt;
5087 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5088 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5089 u_int32_t data[2];
5090 int i;
5091
5092 bzero(digest, SHA1_RESULTLEN);
5093
5094 /* Set up the Key for HMAC */
5095 key_ipad[0] = key1;
5096 key_ipad[1] = key2;
5097
5098 key_opad[0] = key1;
5099 key_opad[1] = key2;
5100
5101 /* Set up the message for HMAC */
5102 data[0] = rand1;
5103 data[1] = rand2;
5104
5105 /* Key fits in one 512-bit block, so no need to pre-hash it */
5106
5107 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5108
5109 for (i = 0; i < 8; i++) {
5110 key_ipad[i] ^= 0x3636363636363636;
5111 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5112 }
5113
5114 /* Perform inner SHA1 */
5115 SHA1Init(&sha1ctxt);
5116 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5117 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5118 SHA1Final(digest, &sha1ctxt);
5119
5120 /* Perform outer SHA1 */
5121 SHA1Init(&sha1ctxt);
5122 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5123 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5124 SHA1Final(digest, &sha1ctxt);
5125 }
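/*
 * The routine above computes the RFC 2104 construction
 *
 *	HMAC(K, m) = SHA1((K ^ opad) || SHA1((K ^ ipad) || m))
 *
 * with K = key1 || key2 (16 bytes, zero-padded to the 64-byte SHA1
 * block by the key_ipad/key_opad initializers) and m = rand1 || rand2.
 * Because the padded key is exactly one block long, the "hash long
 * keys first" step of HMAC is unnecessary.
 */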
5126
5127 /*
5128 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5129 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5130 */
5131 void
5132 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
5133 {
5134 uint32_t lrand, rrand;
5135
5136 lrand = rrand = 0;
5137 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5138 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
5139 digest);
5140 }
5141
5142 /*
5143 * Authentication data generation
5144 */
5145 static void
5146 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5147 int token_len)
5148 {
5149 VERIFY(token_len == sizeof(u_int32_t));
5150 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5151
5152 /* Most significant 32 bits of the SHA1 hash */
5153 bcopy(sha_digest, token, sizeof(u_int32_t));
5154 return;
5155 }
5156
5157 static void
5158 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5159 int idsn_len)
5160 {
5161 VERIFY(idsn_len == sizeof(u_int64_t));
5162 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5163
5164 /*
5165 * Least significant 64 bits of the SHA1 hash
5166 */
5167
5168 idsn[7] = sha_digest[12];
5169 idsn[6] = sha_digest[13];
5170 idsn[5] = sha_digest[14];
5171 idsn[4] = sha_digest[15];
5172 idsn[3] = sha_digest[16];
5173 idsn[2] = sha_digest[17];
5174 idsn[1] = sha_digest[18];
5175 idsn[0] = sha_digest[19];
5176 return;
5177 }
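/*
 * The explicit byte reversal above stores digest octets 12..19 (the
 * least significant 64 bits of the big-endian SHA1 output) into
 * idsn[7]..idsn[0], so that reading the buffer back as a u_int64_t on
 * the little-endian hosts XNU targets yields the numeric value that
 * RFC 6824 specifies for the initial data sequence number.
 */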
5178
5179 static void
5180 mptcp_conn_properties(struct mptcb *mp_tp)
5181 {
5182 /* There is only Version 0 at this time */
5183 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
5184
5185 /* Set DSS checksum flag */
5186 if (mptcp_dss_csum) {
5187 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5188 }
5189
5190 /* Set up receive window */
5191 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5192
5193 /* Set up gc ticks */
5194 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5195 }
5196
5197 static void
5198 mptcp_init_local_parms(struct mptses *mpte)
5199 {
5200 struct mptcb *mp_tp = mpte->mpte_mptcb;
5201 char key_digest[SHA1_RESULTLEN];
5202
5203 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5204 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5205
5206 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
5207 (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5208 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
5209 (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t));
5210
5211 /* The subflow SYN is also the first MPTCP byte */
5212 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5213 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5214
5215 mptcp_conn_properties(mp_tp);
5216 }
5217
5218 int
5219 mptcp_init_remote_parms(struct mptcb *mp_tp)
5220 {
5221 char remote_digest[SHA1_RESULTLEN];
5222
5223 /* Only Version 0 is supported for auth purposes */
5224 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) {
5225 return -1;
5226 }
5227
5228 /* Setup local and remote tokens and Initial DSNs */
5229 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5230 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
5231 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5232 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
5233 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t));
5234 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5235 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5236
5237 return 0;
5238 }
5239
5240 static void
5241 mptcp_send_dfin(struct socket *so)
5242 {
5243 struct tcpcb *tp = NULL;
5244 struct inpcb *inp = NULL;
5245
5246 inp = sotoinpcb(so);
5247 if (!inp) {
5248 return;
5249 }
5250
5251 tp = intotcpcb(inp);
5252 if (!tp) {
5253 return;
5254 }
5255
5256 if (!(tp->t_mpflags & TMPF_RESET)) {
5257 tp->t_mpflags |= TMPF_SEND_DFIN;
5258 }
5259 }
5260
5261 /*
5262 * Data Sequence Mapping routines
5263 */
5264 void
5265 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5266 {
5267 struct mptcb *mp_tp;
5268
5269 if (m == NULL) {
5270 return;
5271 }
5272
5273 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5274
5275 while (m) {
5276 VERIFY(m->m_flags & M_PKTHDR);
5277 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5278 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5279 m->m_pkthdr.mp_rlen = m_pktlen(m);
5280 mp_tp->mpt_sndmax += m_pktlen(m);
5281 m = m->m_next;
5282 }
5283 }
5284
5285 void
5286 mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
5287 {
5288 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
5289 uint64_t data_ack;
5290 uint64_t dsn;
5291
5292 if (!m || len == 0) {
5293 return;
5294 }
5295
5296 while (m && len > 0) {
5297 VERIFY(m->m_flags & M_PKTHDR);
5298 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5299
5300 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
5301 dsn = m->m_pkthdr.mp_dsn;
5302
5303 len -= m->m_len;
5304 m = m->m_next;
5305 }
5306
5307 if (m && len == 0) {
5308 /*
5309 * If there is one more mbuf in the chain, it automatically means
5310 * that only up to m->mp_dsn has been ack'ed.
5311 *
5312 * We therefore correct data_ack back down, compared to what we
5313 * set inside the loop (dsn + data_len), because in the loop we
5314 * are "optimistic" and assume that the full mapping will be
5315 * acked. If that's not the case and we get out of the loop with
5316 * m != NULL, it means only up to m->mp_dsn has really been
5317 * acked.
5318 */
5319 data_ack = m->m_pkthdr.mp_dsn;
5320 }
5321
5322 if (len < 0) {
5323 /*
5324 * If len is negative, meaning we acked in the middle of an mbuf,
5325 * only up to this mbuf's data-sequence number has been acked
5326 * at the MPTCP-level.
5327 */
5328 data_ack = dsn;
5329 }
5330
5331 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
5332 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5333
5334 /* We can have data in the subflow's send-queue that is being acked,
5335 * while the DATA_ACK has already advanced. Thus, we should check whether
5336 * or not the DATA_ACK is actually new here.
5337 */
5338 if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
5339 MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
5340 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
5341 }
5342 }
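/*
 * Worked example of the inference above (a sketch assuming one mapping
 * per mbuf and m_len == mp_rlen): take a chain A{dsn 100, rlen 50},
 * B{dsn 150, rlen 50} with len = 70 bytes acked. The loop
 * optimistically sets data_ack = 200 while stepping into B, but exits
 * with len == -30, so data_ack is corrected down to B's dsn of 150:
 * only the first mapping is known to be fully acked at the MPTCP
 * level.
 */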
5343
5344 void
5345 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
5346 {
5347 int rewinding = 0;
5348
5349 /* TFO makes things complicated. */
5350 if (so->so_flags1 & SOF1_TFO_REWIND) {
5351 rewinding = 1;
5352 so->so_flags1 &= ~SOF1_TFO_REWIND;
5353 }
5354
5355 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
5356 u_int32_t sub_len;
5357 VERIFY(m->m_flags & M_PKTHDR);
5358 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5359
5360 sub_len = m->m_pkthdr.mp_rlen;
5361
5362 if (sub_len < len) {
5363 m->m_pkthdr.mp_dsn += sub_len;
5364 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5365 m->m_pkthdr.mp_rseq += sub_len;
5366 }
5367 m->m_pkthdr.mp_rlen = 0;
5368 len -= sub_len;
5369 } else {
5370 /* sub_len >= len */
5371 if (rewinding == 0) {
5372 m->m_pkthdr.mp_dsn += len;
5373 }
5374 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5375 if (rewinding == 0) {
5376 m->m_pkthdr.mp_rseq += len;
5377 }
5378 }
5379 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
5380 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
5381 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
5382 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5383 m->m_pkthdr.mp_rlen -= len;
5384 break;
5385 }
5386 m = m->m_next;
5387 }
5388
5389 if (so->so_flags & SOF_MP_SUBFLOW &&
5390 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
5391 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
5392 /*
5393 * Received an ack without receiving a DATA_ACK.
5394 * Need to fall back to regular TCP (or destroy this subflow).
5395 */
5396 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
5397 mptcp_notify_mpfail(so);
5398 }
5399 }
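/*
 * Example of the adjustment above (a sketch assuming the loop runs,
 * rewinding == 0 and !PKTF_MPSO, with one mapping per mbuf): dropping
 * len = 80 from a chain A{dsn 100, rseq 10, rlen 50}, B{dsn 150,
 * rseq 60, rlen 50} consumes A entirely (its mapping becomes empty,
 * dsn advanced by 50) and then takes the sub_len >= len branch in B,
 * leaving B's mapping as {dsn 180, rseq 90, rlen 20}.
 */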
5400
5401 /* Obtain the DSN mapping stored in the mbuf */
5402 void
5403 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5404 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5405 {
5406 u_int64_t dsn64;
5407
5408 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5409 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5410 }
5411
5412 void
5413 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5414 uint32_t *relseq, uint16_t *data_len,
5415 uint16_t *dss_csum)
5416 {
5417 struct mbuf *m = so->so_snd.sb_mb;
5418 int off_orig = off;
5419
5420 VERIFY(off >= 0);
5421
5422 if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
5423 *dsn = 0;
5424 *relseq = 0;
5425 *data_len = 0;
5426 *dss_csum = 0;
5427 return;
5428 }
5429
5430 /*
5431 * In the subflow socket, the DSN sequencing can be discontiguous,
5432 * but the subflow sequence mapping is contiguous. Use the subflow
5433 * sequence property to find the right mbuf and corresponding dsn
5434 * mapping.
5435 */
5436
5437 while (m) {
5438 VERIFY(m->m_flags & M_PKTHDR);
5439 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5440
5441 if (off >= m->m_len) {
5442 off -= m->m_len;
5443 m = m->m_next;
5444 } else {
5445 break;
5446 }
5447 }
5448
5449 VERIFY(off >= 0);
5450 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5451
5452 *dsn = m->m_pkthdr.mp_dsn;
5453 *relseq = m->m_pkthdr.mp_rseq;
5454 *data_len = m->m_pkthdr.mp_rlen;
5455 *dss_csum = m->m_pkthdr.mp_csum;
5456
5457 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
5458 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
5459 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5460 }
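/*
 * Example of the walk above: with off = 2000 into a send buffer whose
 * mbufs have m_len 1500 and 1500, the loop steps past the first mbuf
 * (off becomes 500) and returns the second mbuf's base mapping
 * (mp_dsn/mp_rseq/mp_rlen). The residual offset within that mapping
 * is accounted for by the caller, cf. mptcp_adj_sendlen() below.
 */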
5461
5462 /*
5463 * Note that this is called only from tcp_input() via mptcp_input_preproc().
5464 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5465 * When it trims data tcp_input calls m_adj() which does not remove the
5466 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5467 * The dsn map insertion cannot be delayed after trim, because data can be in
5468 * the reassembly queue for a while and the DSN option info in tp will be
5469 * overwritten for every new packet received.
5470 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5471 * with mptcp_adj_rmap().
5472 */
5473 void
5474 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5475 {
5476 VERIFY(m->m_flags & M_PKTHDR);
5477 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5478
5479 if (tp->t_mpflags & TMPF_EMBED_DSN) {
5480 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5481 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5482 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5483 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5484 if (tp->t_rcv_map.mpt_dfin) {
5485 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5486 }
5487
5488 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5489
5490 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5491 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5492 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5493 if (th->th_flags & TH_FIN) {
5494 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5495 }
5496 }
5497 }
5498
5499 /*
5500 * Following routines help with failure detection and failover of data
5501 * transfer from one subflow to another.
5502 */
5503 void
5504 mptcp_act_on_txfail(struct socket *so)
5505 {
5506 struct tcpcb *tp = NULL;
5507 struct inpcb *inp = sotoinpcb(so);
5508
5509 if (inp == NULL) {
5510 return;
5511 }
5512
5513 tp = intotcpcb(inp);
5514 if (tp == NULL) {
5515 return;
5516 }
5517
5518 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5519 return;
5520 }
5521
5522 so->so_flags |= SOF_MP_TRYFAILOVER;
5523 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5524 }
5525
5526 /*
5527 * Support for MP_FAIL option
5528 */
5529 int
5530 mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
5531 {
5532 struct mbuf *m = so->so_snd.sb_mb;
5533 u_int64_t dsn;
5534 int off = 0;
5535 u_int32_t datalen;
5536
5537 if (m == NULL) {
5538 return -1;
5539 }
5540
5541 while (m != NULL) {
5542 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5543 VERIFY(m->m_flags & M_PKTHDR);
5544 dsn = m->m_pkthdr.mp_dsn;
5545 datalen = m->m_pkthdr.mp_rlen;
5546 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5547 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5548 off = dsn_fail - dsn;
5549 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5550 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
5551 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5552 return 0;
5553 }
5554
5555 m = m->m_next;
5556 }
5557
5558 /*
5559 * If there was no mbuf data and a fallback to TCP occurred, there's
5560 * not much else to do.
5561 */
5562
5563 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5564 return -1;
5565 }
5566
5567 /*
5568 * Support for sending contiguous MPTCP bytes in subflow
5569 * Also for preventing sending data with ACK in 3-way handshake
5570 */
5571 int32_t
5572 mptcp_adj_sendlen(struct socket *so, int32_t off)
5573 {
5574 struct tcpcb *tp = sototcpcb(so);
5575 struct mptsub *mpts = tp->t_mpsub;
5576 uint64_t mdss_dsn;
5577 uint32_t mdss_subflow_seq;
5578 int mdss_subflow_off;
5579 uint16_t mdss_data_len;
5580 uint16_t dss_csum;
5581
5582 if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
5583 return 0;
5584 }
5585
5586 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5587 &mdss_data_len, &dss_csum);
5588
5589 /*
5590 * We need to compute how much of the mapping still remains.
5591 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5592 */
5593 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5594
5595 /*
5596 * When TFO is used, we are sending mpts->mpts_iss itself, although the
5597 * relative seq has been set to 1 (while it should be 0).
5598 */
5599 if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5600 mdss_subflow_off--;
5601 }
5602
5603 VERIFY(off >= mdss_subflow_off);
5604
5605 return mdss_data_len - (off - mdss_subflow_off);
5606 }
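/*
 * Worked example (a sketch, absent TFO): with mpts_iss = 1000,
 * snd_una = 1005, a mapping {subflow seq 10, data_len 100} and
 * off = 20, the mapping starts at send-buffer offset
 * (10 + 1000) - 1005 = 5, so 100 - (20 - 5) = 85 bytes of it remain
 * from offset 20 onward. That bounds the contiguous MPTCP bytes this
 * subflow may send.
 */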
5607
5608 static uint32_t
5609 mptcp_get_maxseg(struct mptses *mpte)
5610 {
5611 struct mptsub *mpts;
5612 uint32_t maxseg = 0;
5613
5614 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5615 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5616
5617 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5618 TCPS_HAVERCVDFIN2(tp->t_state)) {
5619 continue;
5620 }
5621
5622 if (tp->t_maxseg > maxseg) {
5623 maxseg = tp->t_maxseg;
5624 }
5625 }
5626
5627 return maxseg;
5628 }
5629
5630 static uint8_t
5631 mptcp_get_rcvscale(struct mptses *mpte)
5632 {
5633 struct mptsub *mpts;
5634 uint8_t rcvscale = UINT8_MAX;
5635
5636 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5637 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5638
5639 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5640 TCPS_HAVERCVDFIN2(tp->t_state)) {
5641 continue;
5642 }
5643
5644 if (tp->rcv_scale < rcvscale) {
5645 rcvscale = tp->rcv_scale;
5646 }
5647 }
5648
5649 return rcvscale;
5650 }
5651
5652 /* Similar to tcp_sbrcv_reserve */
5653 static void
5654 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5655 u_int32_t newsize, u_int32_t idealsize)
5656 {
5657 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5658
5659 /* newsize should not exceed max */
5660 newsize = min(newsize, tcp_autorcvbuf_max);
5661
5662 /* The receive window scale negotiated at the
5663 * beginning of the connection will also set a
5664 * limit on the socket buffer size
5665 */
5666 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5667
5668 /* Set new socket buffer size */
5669 if (newsize > sbrcv->sb_hiwat &&
5670 (sbreserve(sbrcv, newsize) == 1)) {
5671 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5672 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5673
5674 /* Again check the limit set by the advertised
5675 * window scale
5676 */
5677 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5678 TCP_MAXWIN << rcvscale);
5679 }
5680 }
5681
5682 void
5683 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5684 {
5685 struct mptses *mpte = mp_tp->mpt_mpte;
5686 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5687 struct sockbuf *sbrcv = &mp_so->so_rcv;
5688 uint32_t hiwat_sum = 0;
5689 uint32_t ideal_sum = 0;
5690 struct mptsub *mpts;
5691
5692 /*
5693 * Do not grow the receive socket buffer if
5694 * - auto resizing is disabled, globally or on this socket
5695 * - the high water mark already reached the maximum
5696 * - the stream is in background and receive side is being
5697 * throttled
5698 * - there are segments in the reassembly queue indicating loss;
5699 * there is no need to increase the recv window during recovery,
5700 * as more data is not going to be sent. A duplicate ack sent
5701 * during recovery should not change the receive window.
5702 */
5703 if (tcp_do_autorcvbuf == 0 ||
5704 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5705 tcp_cansbgrow(sbrcv) == 0 ||
5706 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5707 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5708 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5709 /* Can not resize the socket buffer, just return */
5710 return;
5711 }
5712
5713 /*
5714 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5715 *
5716 * But, for this we first need accurate receiver-RTT estimations, which
5717 * we currently don't have.
5718 *
5719 * Let's use a dummy algorithm for now, just taking the sum of all
5720 * subflows' receive-buffers. It's too low, but that's all we can get
5721 * for now.
5722 */
5723
5724 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5725 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5726 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5727 }
5728
5729 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5730 }
5731
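/*
 * Editor's sketch of the target formula in the comment above
 * (illustrative only, not compiled): with subflows at 12 MB/s and
 * 3 MB/s and rtt_max = 100 ms, the wanted buffer would be
 * (12 + 3) MB/s * 0.1 s * 2 = 3 MB. The helper name and parameters
 * below are hypothetical; xnu currently lacks the receiver-side
 * bandwidth/RTT estimates to feed them.
 */
#if 0
static uint32_t
mptcp_ideal_rcvbuf(uint64_t bw_sum_bytes, uint32_t rtt_max_ms)
{
	uint64_t bdp = (bw_sum_bytes * rtt_max_ms) / 1000;

	/* twice the bandwidth-delay product, capped at the global max */
	return (uint32_t)MIN(2 * bdp, tcp_autorcvbuf_max);
}
#endif
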
5732 /*
5733 * Determine if we can grow the receive socket buffer to avoid sending
5734 * a zero window update to the peer. We allow even socket buffers that
5735 * have fixed size (set by the application) to grow if the resource
5736 * constraints are met. They will also be trimmed after the application
5737 * reads data.
5738 *
5739 * Similar to tcp_sbrcv_grow_rwin
5740 */
5741 static void
5742 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5743 {
5744 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5745 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5746 u_int32_t rcvbuf = sb->sb_hiwat;
5747
5748 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5749 return;
5750 }
5751
5752 if (tcp_do_autorcvbuf == 1 &&
5753 tcp_cansbgrow(sb) &&
5754 /* Diff to tcp_sbrcv_grow_rwin */
5755 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5756 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5757 rcvbuf < tcp_autorcvbuf_max &&
5758 (sb->sb_idealsize > 0 &&
5759 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5760 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5761 }
5762 }
5763
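/*
 * Editor's worked example for the function above (illustrative):
 * rcvbufinc is 16 full-sized segments (maxseg << 4). With a 1448-byte
 * maxseg that is 23168 bytes, so once fewer than ~23 KB of room remain
 * in the buffer, sb_hiwat is bumped by that amount, capped at
 * tcp_autorcvbuf_max, rather than letting the window close to zero.
 */
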
5764 /* Similar to tcp_sbspace */
5765 int32_t
5766 mptcp_sbspace(struct mptcb *mp_tp)
5767 {
5768 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5769 uint32_t rcvbuf;
5770 int32_t space;
5771 int32_t pending = 0;
5772
5773 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5774
5775 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5776
5777 /* hiwat might have changed */
5778 rcvbuf = sb->sb_hiwat;
5779
5780 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5781 (sb->sb_mbmax - sb->sb_mbcnt)));
5782 if (space < 0) {
5783 space = 0;
5784 }
5785
5786 #if CONTENT_FILTER
5787 /* Compensate for data being processed by content filters */
5788 pending = cfil_sock_data_space(sb);
5789 #endif /* CONTENT_FILTER */
5790 if (pending > space) {
5791 space = 0;
5792 } else {
5793 space -= pending;
5794 }
5795
5796 return space;
5797 }
5798
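/*
 * Editor's worked example for mptcp_sbspace() (illustrative numbers):
 * with sb_hiwat = 512 KB, sb_cc = 100 KB, ample mbuf headroom and
 * 12 KB still held by a content filter, the advertisable space is
 * 512 - 100 - 12 = 400 KB. If the filter held more than the free
 * space, the result would be clamped to zero instead of going
 * negative.
 */
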
5799 /*
5800 * Support Fallback to Regular TCP
5801 */
5802 void
5803 mptcp_notify_mpready(struct socket *so)
5804 {
5805 struct tcpcb *tp = NULL;
5806
5807 if (so == NULL) {
5808 return;
5809 }
5810
5811 tp = intotcpcb(sotoinpcb(so));
5812
5813 if (tp == NULL) {
5814 return;
5815 }
5816
5817 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5818 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5819 struct tcpcb *, tp);
5820
5821 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5822 return;
5823 }
5824
5825 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5826 return;
5827 }
5828
5829 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5830 tp->t_mpflags |= TMPF_MPTCP_READY;
5831
5832 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5833 }
5834
5835 void
5836 mptcp_notify_mpfail(struct socket *so)
5837 {
5838 struct tcpcb *tp = NULL;
5839
5840 if (so == NULL) {
5841 return;
5842 }
5843
5844 tp = intotcpcb(sotoinpcb(so));
5845
5846 if (tp == NULL) {
5847 return;
5848 }
5849
5850 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5851 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5852 struct tcpcb *, tp);
5853
5854 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5855 return;
5856 }
5857
5858 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5859 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5860
5861 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5862 }
5863
5864 /*
5865 * Keepalive helper function
5866 */
5867 boolean_t
5868 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5869 {
5870 boolean_t ret = 1;
5871
5872 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5873
5874 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5875 ret = 0;
5876 }
5877 return ret;
5878 }
5879
5880 /*
5881 * MPTCP t_maxseg adjustment function
5882 */
5883 int
5884 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5885 {
5886 int mss_lower = 0;
5887 struct mptcb *mp_tp = tptomptp(tp);
5888
5889 #define MPTCP_COMPUTE_LEN { \
5890 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5891 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5892 mss_lower += 2; \
5893 else \
5894 /* adjust to 32-bit boundary + EOL */ \
5895 mss_lower += 2; \
5896 }
5897 if (mp_tp == NULL) {
5898 return 0;
5899 }
5900
5901 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5902
5903 /*
5904 * For the first subflow and subsequent subflows, adjust mss for
5905 * most common MPTCP option size, for case where tcp_mss is called
5906 * during option processing and MTU discovery.
5907 */
5908 if (!mtudisc) {
5909 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5910 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5911 MPTCP_COMPUTE_LEN;
5912 }
5913
5914 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5915 tp->t_mpflags & TMPF_SENT_JOIN) {
5916 MPTCP_COMPUTE_LEN;
5917 }
5918 } else {
5919 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5920 MPTCP_COMPUTE_LEN;
5921 }
5922 }
5923
5924 return mss_lower;
5925 }
5926
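/*
 * Editor's note (illustrative): callers shrink t_maxseg by the value
 * returned above so that a DSS+ACK option, plus its checksum field or
 * the padding to a 32-bit boundary, still fits in every segment; a
 * subflow that would otherwise carry 1448 bytes of payload carries
 * that much less MPTCP data per segment.
 */
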
5927 /*
5928 * Update the pid, upid, uuid of the subflow so, based on parent so
5929 */
5930 void
5931 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5932 {
5933 if (so->last_pid != mp_so->last_pid ||
5934 so->last_upid != mp_so->last_upid) {
5935 so->last_upid = mp_so->last_upid;
5936 so->last_pid = mp_so->last_pid;
5937 uuid_copy(so->last_uuid, mp_so->last_uuid);
5938 }
5939 so_update_policy(so);
5940 }
5941
5942 static void
5943 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5944 {
5945 struct inpcb *inp;
5946
5947 tcp_getconninfo(so, &flow->flow_ci);
5948 inp = sotoinpcb(so);
5949 #if INET6
5950 if ((inp->inp_vflag & INP_IPV6) != 0) {
5951 flow->flow_src.ss_family = AF_INET6;
5952 flow->flow_dst.ss_family = AF_INET6;
5953 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5954 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5955 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5956 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5957 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5958 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5959 } else
5960 #endif
5961 if ((inp->inp_vflag & INP_IPV4) != 0) {
5962 flow->flow_src.ss_family = AF_INET;
5963 flow->flow_dst.ss_family = AF_INET;
5964 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5965 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5966 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5967 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5968 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5969 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5970 }
5971 flow->flow_len = sizeof(*flow);
5972 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5973 flow->flow_flags = mpts->mpts_flags;
5974 flow->flow_cid = mpts->mpts_connid;
5975 flow->flow_relseq = mpts->mpts_rel_seq;
5976 flow->flow_soerror = mpts->mpts_socket->so_error;
5977 flow->flow_probecnt = mpts->mpts_probecnt;
5978 }
5979
5980 static int
5981 mptcp_pcblist SYSCTL_HANDLER_ARGS
5982 {
5983 #pragma unused(oidp, arg1, arg2)
5984 int error = 0, f;
5985 size_t len;
5986 struct mppcb *mpp;
5987 struct mptses *mpte;
5988 struct mptcb *mp_tp;
5989 struct mptsub *mpts;
5990 struct socket *so;
5991 conninfo_mptcp_t mptcpci;
5992 mptcp_flow_t *flows = NULL;
5993
5994 if (req->newptr != USER_ADDR_NULL) {
5995 return EPERM;
5996 }
5997
5998 lck_mtx_lock(&mtcbinfo.mppi_lock);
5999 if (req->oldptr == USER_ADDR_NULL) {
6000 size_t n = mtcbinfo.mppi_count;
6001 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6002 req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
6003 4 * (n + n / 8) * sizeof(mptcp_flow_t);
6004 return 0;
6005 }
6006 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6007 flows = NULL;
6008 socket_lock(mpp->mpp_socket, 1);
6009 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
6010 mpte = mptompte(mpp);
6011
6012 socket_lock_assert_owned(mptetoso(mpte));
6013 mp_tp = mpte->mpte_mptcb;
6014
6015 bzero(&mptcpci, sizeof(mptcpci));
6016 mptcpci.mptcpci_state = mp_tp->mpt_state;
6017 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
6018 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
6019 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
6020 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
6021 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
6022 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
6023 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
6024 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
6025 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
6026 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
6027 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
6028 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
6029 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
6030
6031 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
6032 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
6033 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
6034 mptcpci.mptcpci_flow_offset =
6035 offsetof(conninfo_mptcp_t, mptcpci_flows);
6036
6037 len = sizeof(*flows) * mpte->mpte_numflows;
6038 if (mpte->mpte_numflows != 0) {
6039 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
6040 if (flows == NULL) {
6041 socket_unlock(mpp->mpp_socket, 1);
6042 break;
6043 }
6044 mptcpci.mptcpci_len = sizeof(mptcpci) +
6045 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
6046 error = SYSCTL_OUT(req, &mptcpci,
6047 sizeof(mptcpci) - sizeof(mptcp_flow_t));
6048 } else {
6049 mptcpci.mptcpci_len = sizeof(mptcpci);
6050 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
6051 }
6052 if (error) {
6053 socket_unlock(mpp->mpp_socket, 1);
6054 FREE(flows, M_TEMP);
6055 break;
6056 }
6057 f = 0;
6058 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6059 so = mpts->mpts_socket;
6060 fill_mptcp_subflow(so, &flows[f], mpts);
6061 f++;
6062 }
6063 socket_unlock(mpp->mpp_socket, 1);
6064 if (flows) {
6065 error = SYSCTL_OUT(req, flows, len);
6066 FREE(flows, M_TEMP);
6067 if (error) {
6068 break;
6069 }
6070 }
6071 }
6072 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6073
6074 return error;
6075 }
6076
6077 SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
6078 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
6079 "List of active MPTCP connections");
6080
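/*
 * Editor's sketch of a userland consumer of this sysctl (illustrative
 * fragment, not part of this file): the oldptr == NULL probe above
 * reports a size estimate, then a second call fills the buffer with
 * conninfo_mptcp_t records that are walked via mptcpci_len.
 */
#if 0
	size_t len = 0;

	/* first call: only asks the kernel for the estimated length */
	if (sysctlbyname("net.inet.mptcp.pcblist", NULL, &len, NULL, 0) == 0) {
		void *buf = malloc(len);

		if (buf != NULL &&
		    sysctlbyname("net.inet.mptcp.pcblist", buf, &len, NULL, 0) == 0) {
			/* parse conninfo_mptcp_t + trailing mptcp_flow_t array */
		}
		free(buf);
	}
#endif
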
6081 /*
6082 * Set notsent lowat mark on the MPTCB
6083 */
6084 int
6085 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6086 {
6087 struct mptcb *mp_tp = NULL;
6088 int error = 0;
6089
6090 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6091 mp_tp = mpte->mpte_mptcb;
6092 }
6093
6094 if (mp_tp) {
6095 mp_tp->mpt_notsent_lowat = optval;
6096 } else {
6097 error = EINVAL;
6098 }
6099
6100 return error;
6101 }
6102
6103 u_int32_t
6104 mptcp_get_notsent_lowat(struct mptses *mpte)
6105 {
6106 struct mptcb *mp_tp = NULL;
6107
6108 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6109 mp_tp = mpte->mpte_mptcb;
6110 }
6111
6112 if (mp_tp) {
6113 return mp_tp->mpt_notsent_lowat;
6114 } else {
6115 return 0;
6116 }
6117 }
6118
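/*
 * Editor's sketch (illustrative, not compiled): assuming the regular
 * TCP_NOTSENT_LOWAT socket option is what lands in
 * mptcp_set_notsent_lowat() for MPTCP sockets, a userland writer would
 * opt in like this and then only see writability once fewer than lowat
 * bytes remain unsent. mp_fd is a hypothetical MPTCP socket.
 */
#if 0
	int lowat = 16 * 1024;	/* bytes allowed to sit unsent */

	setsockopt(mp_fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
	    &lowat, sizeof(lowat));
#endif
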
6119 int
6120 mptcp_notsent_lowat_check(struct socket *so)
6121 {
6122 struct mptses *mpte;
6123 struct mppcb *mpp;
6124 struct mptcb *mp_tp;
6125 struct mptsub *mpts;
6126
6127 int notsent = 0;
6128
6129 mpp = mpsotomppcb(so);
6130 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
6131 return 0;
6132 }
6133
6134 mpte = mptompte(mpp);
6135 socket_lock_assert_owned(mptetoso(mpte));
6136 mp_tp = mpte->mpte_mptcb;
6137
6138 notsent = so->so_snd.sb_cc;
6139
6140 if ((notsent == 0) ||
6141 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
6142 mp_tp->mpt_notsent_lowat)) {
6143 mptcplog((LOG_DEBUG, "MPTCP Sender: "
6144 "lowat %d notsent %d actual %d \n",
6145 mp_tp->mpt_notsent_lowat, notsent,
6146 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
6147 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6148 return 1;
6149 }
6150
6151 /* When Nagle's algorithm is not disabled, it is better
6152 * to wake up the client even before there is at least one
6153 * maxseg of data to write.
6154 */
6155 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6156 int retval = 0;
6157 if (mpts->mpts_flags & MPTSF_ACTIVE) {
6158 struct socket *subf_so = mpts->mpts_socket;
6159 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
6160
6161 notsent = so->so_snd.sb_cc -
6162 (tp->snd_nxt - tp->snd_una);
6163
6164 if ((tp->t_flags & TF_NODELAY) == 0 &&
6165 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
6166 retval = 1;
6167 }
6168 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
6169 " nodelay false \n",
6170 mp_tp->mpt_notsent_lowat, notsent),
6171 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6172 return retval;
6173 }
6174 }
6175 return 0;
6176 }
6177
6178 static errno_t
6179 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6180 void **unitinfo)
6181 {
6182 #pragma unused(kctlref, sac, unitinfo)
6183
6184 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6185 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6186 }
6187
6188 mptcp_kern_skt_unit = sac->sc_unit;
6189
6190 return 0;
6191 }
6192
6193 static void
6194 mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
6195 {
6196 struct mppcb *mpp;
6197
6198 /* Iterate over all MPTCP connections */
6199
6200 lck_mtx_lock(&mtcbinfo.mppi_lock);
6201
6202 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6203 struct socket *mp_so = mpp->mpp_socket;
6204 struct mptses *mpte = mpp->mpp_pcbe;
6205
6206 socket_lock(mp_so, 1);
6207
6208 if (mp_so->so_flags & SOF_DELEGATED &&
6209 uuid_compare(uuid, mp_so->e_uuid)) {
6210 goto next;
6211 } else if (!(mp_so->so_flags & SOF_DELEGATED) &&
6212 uuid_compare(uuid, mp_so->last_uuid)) {
6213 goto next;
6214 }
6215
6216 os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
6217 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);
6218
6219 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
6220
6221 if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
6222 mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
6223 }
6224
6225 mptcp_check_subflows_and_add(mpte);
6226 mptcp_remove_subflows(mpte);
6227
6228 mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);
6229
6230 next:
6231 socket_unlock(mp_so, 1);
6232 }
6233
6234 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6235 }
6236
6237 static void
6238 mptcp_wifi_status_changed(void)
6239 {
6240 struct mppcb *mpp;
6241
6242 /* Iterate over all MPTCP connections */
6243
6244 lck_mtx_lock(&mtcbinfo.mppi_lock);
6245
6246 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6247 struct socket *mp_so = mpp->mpp_socket;
6248 struct mptses *mpte = mpp->mpp_pcbe;
6249
6250 socket_lock(mp_so, 1);
6251
6252 /* Only handover- and target-based mode are purely driven by Symptoms' Wi-Fi status */
6253 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6254 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6255 goto next;
6256 }
6257
6258 mptcp_check_subflows_and_add(mpte);
6259 mptcp_check_subflows_and_remove(mpte);
6260
6261 next:
6262 socket_unlock(mp_so, 1);
6263 }
6264
6265 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6266 }
6267
6268 void
6269 mptcp_ask_symptoms(struct mptses *mpte)
6270 {
6271 struct mptcp_symptoms_ask_uuid ask;
6272 struct socket *mp_so;
6273 struct proc *p;
6274 int pid, prio, err;
6275
6276 if (mptcp_kern_skt_unit == 0) {
6277 os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
6278 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6279 return;
6280 }
6281
6282 mp_so = mptetoso(mpte);
6283
6284 if (mp_so->so_flags & SOF_DELEGATED) {
6285 pid = mp_so->e_pid;
6286 } else {
6287 pid = mp_so->last_pid;
6288 }
6289
6290 p = proc_find(pid);
6291 if (p == PROC_NULL) {
6292 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
6293 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
6294 return;
6295 }
6296
6297 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6298
6299 if (mp_so->so_flags & SOF_DELEGATED) {
6300 uuid_copy(ask.uuid, mp_so->e_uuid);
6301 } else {
6302 uuid_copy(ask.uuid, mp_so->last_uuid);
6303 }
6304
6305 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
6306
6307 if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
6308 prio == TASK_DARWINBG_APPLICATION) {
6309 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
6310 } else if (prio == TASK_FOREGROUND_APPLICATION) {
6311 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
6312 } else {
6313 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
6314 }
6315
6316 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
6317 &ask, sizeof(ask), CTL_DATA_EOR);
6318
6319 os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
6320 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
6321
6322
6323 proc_rele(p);
6324 }
6325
6326 static errno_t
6327 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6328 void *unitinfo)
6329 {
6330 #pragma unused(kctlref, kcunit, unitinfo)
6331
6332 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6333
6334 return 0;
6335 }
6336
6337 static errno_t
6338 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6339 mbuf_t m, int flags)
6340 {
6341 #pragma unused(kctlref, unitinfo, flags)
6342 symptoms_advisory_t *sa = NULL;
6343
6344 if (kcunit != mptcp_kern_skt_unit) {
6345 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6346 __func__, kcunit, mptcp_kern_skt_unit);
6347 }
6348
6349 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6350 mbuf_freem(m);
6351 return EINVAL;
6352 }
6353
6354 if (mbuf_len(m) < sizeof(*sa)) {
6355 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6356 __func__, mbuf_len(m), sizeof(*sa));
6357 mbuf_freem(m);
6358 return EINVAL;
6359 }
6360
6361 sa = mbuf_data(m);
6362
6363 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6364 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6365 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6366 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6367
6368 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6369 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6370 mptcp_wifi_status_changed();
6371 }
6372 } else {
6373 struct mptcp_symptoms_answer answer;
6374 errno_t err;
6375
6376 /* We temporarily allow different sizes for ease of submission */
6377 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6378 mbuf_len(m) != sizeof(answer)) {
6379 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6380 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6381 sizeof(answer));
6382 mbuf_freem(m);
6383 return EINVAL;
6384 }
6385
6386 memset(&answer, 0, sizeof(answer));
6387
6388 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6389 if (err) {
6390 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6391 mbuf_freem(m);
6392 return err;
6393 }
6394
6395 mptcp_allow_uuid(answer.uuid, answer.rssi);
6396 }
6397
6398 mbuf_freem(m);
6399 return 0;
6400 }
6401
6402 void
6403 mptcp_control_register(void)
6404 {
6405 /* Set up the advisory control socket */
6406 struct kern_ctl_reg mptcp_kern_ctl;
6407
6408 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6409 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6410 sizeof(mptcp_kern_ctl.ctl_name));
6411 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6412 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6413 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6414 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6415
6416 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6417 }
6418
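/*
 * Editor's sketch of the Symptoms-agent side (illustrative fragment,
 * not part of this file): connecting to the control socket registered
 * above uses the standard kernel-control dance; CTL_FLAG_PRIVILEGED
 * means only a privileged process may connect. Variable names are
 * hypothetical.
 */
#if 0
	struct ctl_info info;
	struct sockaddr_ctl addr;
	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);

	memset(&info, 0, sizeof(info));
	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(info.ctl_name));
	ioctl(fd, CTLIOCGINFO, &info);	/* resolve name to ctl_id */

	memset(&addr, 0, sizeof(addr));
	addr.sc_len = sizeof(addr);
	addr.sc_family = AF_SYSTEM;
	addr.ss_sysaddr = AF_SYS_CONTROL;
	addr.sc_id = info.ctl_id;
	addr.sc_unit = 0;		/* kernel assigns the unit */
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
#endif
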
6419 /*
6420 * Three return-values:
6421 * 1 : WiFi is bad
6422 * 0 : WiFi is good
6423 * -1 : WiFi-state is unknown
6424 */
6425 int
6426 mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
6427 {
6428 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6429 if (mptcp_advisory.sa_wifi_status) {
6430 return symptoms_is_wifi_lossy() ? 1 : 0;
6431 }
6432
6433 /*
6434 * If it's a first-party app and we don't have any info
6435 * about the Wi-Fi state, let's be pessimistic.
6436 */
6437 return -1;
6438 } else {
6439 if (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) {
6440 return 1;
6441 }
6442
6443 /*
6444 * If we are target-based, we allow ourselves to be more lax about
6445 * declaring Wi-Fi "unusable": we only *know* the state once we
6446 * got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6447 *
6448 * If the RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6449 * be set.
6450 *
6451 * In any other case (while in target-mode), consider Wi-Fi bad,
6452 * and we are going to ask Symptoms for allowance anyway.
6453 */
6454 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6455 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6456 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6457 return 0;
6458 }
6459
6460 return 1;
6461 }
6462
6463 return 0;
6464 }
6465 }
6466
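/*
 * Editor's summary of the decision above (illustrative):
 *
 *   first-party, Wi-Fi status known    -> symptoms_is_wifi_lossy()
 *   first-party, status unknown        -> -1 (pessimistic)
 *   third-party, WIFI_BAD advertised   ->  1
 *   third-party, target-based          ->  0 only with ACCESS_GRANTED
 *                                          and CELL_PROHIBITED, else 1
 *   third-party, otherwise             ->  0
 */
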
6467 boolean_t
6468 symptoms_is_wifi_lossy(void)
6469 {
6470 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6471 }
6472
6473 /* If TFO data is successfully acked, it must be dropped from the mptcp so */
6474 static void
6475 mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
6476 {
6477 struct socket *mp_so = mptetoso(mpte);
6478 struct socket *so = mpts->mpts_socket;
6479 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
6480 struct mptcb *mp_tp = mpte->mpte_mptcb;
6481
6482 /* If data was sent with SYN, rewind state */
6483 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
6484 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
6485 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
6486
6487 VERIFY(mp_droplen <= (UINT_MAX));
6488 VERIFY(mp_droplen >= tcp_droplen);
6489
6490 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
6491 mpts->mpts_iss += tcp_droplen;
6492 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
6493
6494 if (mp_droplen > tcp_droplen) {
6495 /* handle partial TCP ack */
6496 mp_so->so_flags1 |= SOF1_TFO_REWIND;
6497 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
6498 mp_droplen = tcp_droplen;
6499 } else {
6500 /* all data on SYN was acked */
6501 mpts->mpts_rel_seq = 1;
6502 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
6503 }
6504 mp_tp->mpt_sndmax -= tcp_droplen;
6505
6506 if (mp_droplen != 0) {
6507 VERIFY(mp_so->so_snd.sb_mb != NULL);
6508 sbdrop(&mp_so->so_snd, (int)mp_droplen);
6509 }
6510 }
6511 }
6512
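/*
 * Editor's worked example for the rewind above (illustrative): if 100
 * bytes were sent on the TFO SYN but the peer's SYN-ACK acks only 60,
 * then tcp_droplen = 60 while mp_droplen = 100. mpt_sndnxt is pulled
 * back by the 40 unacked bytes so they get retransmitted, and only the
 * 60 acked bytes are dropped from the MPTCP send buffer.
 */
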
6513 int
6514 mptcp_freeq(struct mptcb *mp_tp)
6515 {
6516 struct tseg_qent *q;
6517 int rv = 0;
6518
6519 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6520 LIST_REMOVE(q, tqe_q);
6521 m_freem(q->tqe_m);
6522 zfree(tcp_reass_zone, q);
6523 rv = 1;
6524 }
6525 mp_tp->mpt_reassqlen = 0;
6526 return rv;
6527 }
6528
6529 static int
6530 mptcp_post_event(u_int32_t event_code, int value)
6531 {
6532 struct kev_mptcp_data event_data;
6533 struct kev_msg ev_msg;
6534
6535 memset(&ev_msg, 0, sizeof(ev_msg));
6536
6537 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6538 ev_msg.kev_class = KEV_NETWORK_CLASS;
6539 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6540 ev_msg.event_code = event_code;
6541
6542 event_data.value = value;
6543
6544 ev_msg.dv[0].data_ptr = &event_data;
6545 ev_msg.dv[0].data_length = sizeof(event_data);
6546
6547 return kev_post_msg(&ev_msg);
6548 }
6549
6550 static void
6551 mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
6552 {
6553 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
6554 int error;
6555
6556 /* First-party apps (Siri) don't flip the cellicon */
6557 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6558 return;
6559 }
6560
6561 /* Subflow is disappearing - don't set it on this one */
6562 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
6563 return;
6564 }
6565
6566 /* Fallen-back connections do not trigger the cellicon */
6567 if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
6568 return;
6569 }
6570
6571 /* Remember the last time we set the cellicon. Needed for debouncing */
6572 mpte->mpte_last_cellicon_set = tcp_now;
6573
6574 tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
6575 tcp_sched_timers(tp);
6576
6577 if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
6578 mpte->mpte_cellicon_increments != 0) {
6579 if (mptcp_cellicon_refcount == 0) {
6580 os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
6581 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6582
6583 /* Continue, so that the icon gets set... */
6584 } else {
6585 /*
6586 * In this case, the cellicon is already set. No need to bump it
6587 * even higher
6588 */
6589
6590 return;
6591 }
6592 }
6593
6594 /* When tearing down this subflow, we need to decrement the
6595 * reference counter
6596 */
6597 mpts->mpts_flags |= MPTSF_CELLICON_SET;
6598
6599 /* Count the increments per session, so that when a session gets
6600 * destroyed we can decrement the reference counter by whatever is left
6601 */
6602 mpte->mpte_cellicon_increments++;
6603
6604 if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
6605 /* If cellicon is already set, get out of here! */
6606 return;
6607 }
6608
6609 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6610
6611 if (error) {
6612 os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
6613 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
6614 } else {
6615 os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
6616 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6617 }
6618 }
6619
6620 void
6621 mptcp_clear_cellicon(void)
6622 {
6623 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6624
6625 if (error) {
6626 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6627 __func__, error);
6628 } else {
6629 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6630 __func__);
6631 }
6632 }
6633
6634 /*
6635 * Returns true if the icon has been flipped to WiFi.
6636 */
6637 static boolean_t
6638 __mptcp_unset_cellicon(long val)
6639 {
6640 if (OSAddAtomic(-val, &mptcp_cellicon_refcount) != 1) {
6641 return false;
6642 }
6643
6644 mptcp_clear_cellicon();
6645
6646 return true;
6647 }
6648
6649 void
6650 mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
6651 {
6652 /* First-party apps (Siri) don't flip the cellicon */
6653 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6654 return;
6655 }
6656
6657 if (mpte->mpte_cellicon_increments == 0) {
6658 /* This flow never used cell - get out of here! */
6659 return;
6660 }
6661
6662 if (mptcp_cellicon_refcount == 0) {
6663 os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
6664 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6665
6666 return;
6667 }
6668
6669 if (mpts) {
6670 if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
6671 return;
6672 }
6673
6674 mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
6675 }
6676
6677 if (mpte->mpte_cellicon_increments < val) {
6678 os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
6679 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
6680 val = mpte->mpte_cellicon_increments;
6681 }
6682
6683 mpte->mpte_cellicon_increments -= val;
6684
6685 if (__mptcp_unset_cellicon(val) == false) {
6686 return;
6687 }
6688
6689 /* All flows are gone - our counter should be at zero too! */
6690 if (mpte->mpte_cellicon_increments != 0) {
6691 os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
6692 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6693 }
6694 }
6695
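/*
 * Editor's note on the cellicon accounting (illustrative): there are
 * three levels of state. MPTSF_CELLICON_SET marks a subflow's single
 * contribution, mpte_cellicon_increments accumulates those per
 * session, and the global mptcp_cellicon_refcount is what actually
 * drives the icon via KEV_MPTCP_CELLUSE. Teardown unwinds the same
 * chain in reverse, which is why the consistency checks above compare
 * adjacent levels.
 */
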
6696 void
6697 mptcp_reset_rexmit_state(struct tcpcb *tp)
6698 {
6699 struct mptsub *mpts;
6700 struct inpcb *inp;
6701 struct socket *so;
6702
6703 inp = tp->t_inpcb;
6704 if (inp == NULL) {
6705 return;
6706 }
6707
6708 so = inp->inp_socket;
6709 if (so == NULL) {
6710 return;
6711 }
6712
6713 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6714 return;
6715 }
6716
6717 mpts = tp->t_mpsub;
6718
6719 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6720 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6721 }
6722
6723 void
6724 mptcp_reset_keepalive(struct tcpcb *tp)
6725 {
6726 struct mptsub *mpts = tp->t_mpsub;
6727
6728 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6729 }