[apple/xnu.git] / bsd / netinet / mptcp_subr.c (blob 9e8637a928dffa7e5f7a0255ec6639d40457db4d)
1 /*
2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_fsm.h>
59 #include <netinet/tcp_seq.h>
60 #include <netinet/tcp_var.h>
61 #include <netinet/mptcp_var.h>
62 #include <netinet/mptcp.h>
63 #include <netinet/mptcp_opt.h>
64 #include <netinet/mptcp_seq.h>
65 #include <netinet/mptcp_timer.h>
66 #include <libkern/crypto/sha1.h>
67 #if INET6
68 #include <netinet6/in6_pcb.h>
69 #include <netinet6/ip6protosw.h>
70 #endif /* INET6 */
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to use the lock of
104 * the MPTCP socket instead.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
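/*
 * Illustrative sketch (not part of this file): a minimal userspace client
 * for the PF_MULTIPATH socket described above, assuming the public Darwin
 * connectx(2) API and the sa_endpoints_t layout from <sys/socket.h>.
 *
 *	#include <sys/socket.h>
 *
 *	int
 *	mptcp_client_open(const struct sockaddr *dst, socklen_t dstlen)
 *	{
 *		sa_endpoints_t eps = { 0 };
 *		sae_connid_t cid;
 *		int s;
 *
 *		s = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *		if (s == -1)
 *			return -1;
 *
 *		eps.sae_dstaddr = dst;
 *		eps.sae_dstaddrlen = dstlen;
 *
 *		// Triggers the MPTCP handshake on the initial subflow.
 *		if (connectx(s, &eps, SAE_ASSOCID_ANY, 0,
 *		    NULL, 0, NULL, &cid) == -1)
 *			return -1;
 *		return s;
 *	}
 */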
111
112 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
113 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
114
115 static uint32_t mptcp_gc(struct mppcbinfo *);
116 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
118 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
120 static void mptcp_subflow_wupcall(struct socket *, void *, int);
121 static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
122 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
123 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
124
125 static void mptcp_subflow_abort(struct mptsub *, int);
126
127 static void mptcp_send_dfin(struct socket *so);
128 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
129 static int mptcp_freeq(struct mptcb *mp_tp);
130
131 /*
132 * Possible return values for subflow event handlers. Note that success
133 * values must be greater than or equal to MPTS_EVRET_OK. Values less than
134 * that indicate errors or actions which require immediate attention; they
135 * will prevent the rest of the handlers from processing their respective
136 * events until the next round of event processing.
137 */
138 typedef enum {
139 MPTS_EVRET_DELETE = 1, /* delete this subflow */
140 MPTS_EVRET_OK = 2, /* OK */
141 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pending connects */
142 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
143 } ev_ret_t;
144
145 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
146 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
147 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
148 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
149 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
150 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
151 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
152 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
153 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
154 static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
155 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
156 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
157
158 static void mptcp_do_sha1(mptcp_key_t *, char *);
159 static void mptcp_init_local_parms(struct mptses *);
160
161 static unsigned int mptsub_zone_size; /* size of mptsub */
162 static struct zone *mptsub_zone; /* zone for mptsub */
163
164 static unsigned int mptopt_zone_size; /* size of mptopt */
165 static struct zone *mptopt_zone; /* zone for mptopt */
166
167 static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
168 static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
169
170 struct mppcbinfo mtcbinfo;
171
172 SYSCTL_DECL(_net_inet);
173
174 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
175
176 uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
177 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
178 &mptcp_dbg_area, 0, "MPTCP debug area");
179
180 uint32_t mptcp_dbg_level = 1;
181 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
182 &mptcp_dbg_level, 0, "MPTCP debug level");
183
184 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
185 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
186
187
188 static int mptcp_alternate_port = 0;
189 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
190 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
191
192 static struct protosw mptcp_subflow_protosw;
193 static struct pr_usrreqs mptcp_subflow_usrreqs;
194 #if INET6
195 static struct ip6protosw mptcp_subflow_protosw6;
196 static struct pr_usrreqs mptcp_subflow_usrreqs6;
197 #endif /* INET6 */
198
199 static uint8_t mptcp_create_subflows_scheduled;
200
201 typedef struct mptcp_subflow_event_entry {
202 uint64_t sofilt_hint_mask;
203 ev_ret_t (*sofilt_hint_ev_hdlr)(
204 struct mptses *mpte,
205 struct mptsub *mpts,
206 uint64_t *p_mpsofilt_hint,
207 uint64_t event);
208 } mptsub_ev_entry_t;
209
210 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
211 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
212 static uint32_t mptcp_kern_skt_inuse = 0;
213 static uint32_t mptcp_kern_skt_unit;
214 static symptoms_advisory_t mptcp_advisory;
215
216 uint32_t mptcp_cellicon_refcount = 0;
217
218 /*
219 * XXX The order of the event handlers below is really
220 * really important. Think twice before changing it.
221 */
222 static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
223 {
224 .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
225 .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
226 },
227 {
228 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
229 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
230 },
231 {
232 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
233 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
234 },
235 {
236 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
237 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
238 },
239 {
240 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
241 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
242 },
243 {
244 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
245 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
246 },
247 {
248 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
249 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
250 },
251 {
252 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
253 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
254 },
255 {
256 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
257 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
258 },
259 {
260 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
261 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
262 },
263 {
264 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
265 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
266 },
267 {
268 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
269 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
270 },
271 {
272 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
273 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
274 },
275 {
276 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
277 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
278 },
279 };
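/*
 * Minimal sketch (illustrative only, not the actual dispatcher) of how
 * the table above is meant to be consumed: walk the entries strictly in
 * order, clear each hint bit as it is handled, and stop early when a
 * handler returns a value below MPTS_EVRET_OK, deferring the remaining
 * events to the next round.
 *
 *	static ev_ret_t
 *	dispatch_subflow_events(struct mptses *mpte, struct mptsub *mpts,
 *	    uint64_t *events, uint64_t *p_mpsofilt_hint)
 *	{
 *		ev_ret_t ret = MPTS_EVRET_OK;
 *		size_t i;
 *
 *		for (i = 0; i < sizeof(mpsub_ev_entry_tbl) /
 *		    sizeof(mpsub_ev_entry_tbl[0]); i++) {
 *			const mptsub_ev_entry_t *e = &mpsub_ev_entry_tbl[i];
 *
 *			if (!(*events & e->sofilt_hint_mask))
 *				continue;
 *			*events &= ~e->sofilt_hint_mask;
 *			ret = e->sofilt_hint_ev_hdlr(mpte, mpts,
 *			    p_mpsofilt_hint, e->sofilt_hint_mask);
 *			if (ret < MPTS_EVRET_OK)
 *				break;
 *		}
 *		return ret;
 *	}
 */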
280
281 os_log_t mptcp_log_handle;
282
283 /*
284 * Protocol pr_init callback.
285 */
286 void
287 mptcp_init(struct protosw *pp, struct domain *dp)
288 {
289 #pragma unused(dp)
290 static int mptcp_initialized = 0;
291 struct protosw *prp;
292 #if INET6
293 struct ip6protosw *prp6;
294 #endif /* INET6 */
295
296 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
297
298 /* do this only once */
299 if (mptcp_initialized) {
300 return;
301 }
302 mptcp_initialized = 1;
303
304 mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;
305
306 /*
307 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
308 * we must be able to find IPPROTO_TCP entries for both.
309 */
310 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
311 VERIFY(prp != NULL);
312 bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
313 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
314 sizeof(mptcp_subflow_usrreqs));
315 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
316 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
317 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
318 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
319 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
320 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
321 /*
322 * Socket filters shouldn't attach/detach to/from this protosw
323 * since pr_protosw is to be used instead, which points to the
324 * real protocol; if they do, it is a bug and we should panic.
325 */
326 mptcp_subflow_protosw.pr_filter_head.tqh_first =
327 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
328 mptcp_subflow_protosw.pr_filter_head.tqh_last =
329 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
330
331 #if INET6
332 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
333 IPPROTO_TCP, SOCK_STREAM);
334 VERIFY(prp6 != NULL);
335 bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
336 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
337 sizeof(mptcp_subflow_usrreqs6));
338 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
339 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
340 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
341 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
342 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
343 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
344 /*
345 * Socket filters shouldn't attach/detach to/from this protosw
346 * since pr_protosw is to be used instead, which points to the
347 * real protocol; if they do, it is a bug and we should panic.
348 */
349 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
350 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
351 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
352 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
353 #endif /* INET6 */
354
355 bzero(&mtcbinfo, sizeof(mtcbinfo));
356 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
357 mtcbinfo.mppi_size = sizeof(struct mpp_mtp);
358 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
359 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
360 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
361 /* NOTREACHED */
362 }
363 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
364 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
365
366 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
367 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
368 mtcbinfo.mppi_lock_grp_attr);
369 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
370 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
371 mtcbinfo.mppi_lock_attr);
372
373 mtcbinfo.mppi_gc = mptcp_gc;
374 mtcbinfo.mppi_timer = mptcp_timer;
375
376 /* attach to MP domain for garbage collection to take place */
377 mp_pcbinfo_attach(&mtcbinfo);
378
379 mptsub_zone_size = sizeof(struct mptsub);
380 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
381 8192, "mptsub")) == NULL) {
382 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
383 /* NOTREACHED */
384 }
385 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
386 zone_change(mptsub_zone, Z_EXPAND, TRUE);
387
388 mptopt_zone_size = sizeof(struct mptopt);
389 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
390 1024, "mptopt")) == NULL) {
391 panic("%s: unable to allocate MPTCP option zone\n", __func__);
392 /* NOTREACHED */
393 }
394 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
395 zone_change(mptopt_zone, Z_EXPAND, TRUE);
396
397 mpt_subauth_entry_size = sizeof(struct mptcp_subf_auth_entry);
398 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
399 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
400 panic("%s: unable to allocate MPTCP address auth zone\n",
401 __func__);
402 /* NOTREACHED */
403 }
404 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
405 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
406
407 mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
408 }
409
410 int
411 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, int ifindex, boolean_t create)
412 {
413 int i, index = -1;
414
415 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
416 if (create && stats[i].ifindex == IFSCOPE_NONE) {
417 if (index < 0) {
418 index = i;
419 }
420 continue;
421 }
422
423 if (stats[i].ifindex == ifindex) {
424 index = i;
425 return index;
426 }
427 }
428
429 if (index != -1) {
430 stats[index].ifindex = ifindex;
431 }
432
433 return index;
434 }
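/*
 * Usage sketch (illustrative; len is a hypothetical byte count): look up,
 * and with create == true lazily claim, the per-interface stats slot
 * before accounting bytes against it.
 *
 *	int idx = mptcpstats_get_index_by_ifindex(mpte->mpte_itfstats,
 *	    ifp->if_index, true);
 *	if (idx != -1)
 *		mpte->mpte_itfstats[idx].mpis_txbytes += len;
 */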
435
436 static int
437 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
438 {
439 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
440 int index;
441
442 if (ifp == NULL) {
443 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
444 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
445 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
446 return -1;
447 }
448
449 index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
450
451 if (index != -1) {
452 if (stats[index].is_expensive == 0) {
453 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
454 }
455 }
456
457 return index;
458 }
459
460 void
461 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
462 {
463 int index;
464
465 tcpstat.tcps_mp_switches++;
466 mpte->mpte_subflow_switches++;
467
468 index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
469
470 if (index != -1) {
471 mpte->mpte_itfstats[index].switches++;
472 }
473 }
474
475 /*
476 * Flushes all recorded socket options from an MP socket.
477 */
478 static void
479 mptcp_flush_sopts(struct mptses *mpte)
480 {
481 struct mptopt *mpo, *tmpo;
482
483 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
484 mptcp_sopt_remove(mpte, mpo);
485 mptcp_sopt_free(mpo);
486 }
487 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
488 }
489
490 /*
491 * Create an MPTCP session, called as a result of opening an MPTCP socket.
492 */
493 int
494 mptcp_session_create(struct mppcb *mpp)
495 {
496 struct mppcbinfo *mppi;
497 struct mptses *mpte;
498 struct mptcb *mp_tp;
499
500 VERIFY(mpp != NULL);
501 mppi = mpp->mpp_pcbinfo;
502 VERIFY(mppi != NULL);
503
504 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
505 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
506
507 /* MPTCP Multipath PCB Extension */
508 bzero(mpte, sizeof(*mpte));
509 VERIFY(mpp->mpp_pcbe == NULL);
510 mpp->mpp_pcbe = mpte;
511 mpte->mpte_mppcb = mpp;
512 mpte->mpte_mptcb = mp_tp;
513
514 TAILQ_INIT(&mpte->mpte_sopts);
515 TAILQ_INIT(&mpte->mpte_subflows);
516 mpte->mpte_associd = SAE_ASSOCID_ANY;
517 mpte->mpte_connid_last = SAE_CONNID_ANY;
518
519 mptcp_init_urgency_timer(mpte);
520
521 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
522 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
523
524 if (mptcp_alternate_port) {
525 mpte->mpte_alternate_port = htons(mptcp_alternate_port);
526 }
527
528 mpte->mpte_last_cellicon_set = tcp_now;
529
530 /* MPTCP Protocol Control Block */
531 bzero(mp_tp, sizeof(*mp_tp));
532 mp_tp->mpt_mpte = mpte;
533 mp_tp->mpt_state = MPTCPS_CLOSED;
534
535 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
536
537 return 0;
538 }
539
540 struct sockaddr *
541 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
542 {
543 if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) {
544 return &mpte->mpte_dst;
545 }
546
547 if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
548 return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
549 }
550
551 if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
552 return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
553 }
554
555 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
556 * meaning we prefer IPv6 over IPv4.
557 */
558 if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
559 return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
560 }
561
562 if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
563 return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
564 }
565
566 /* We don't yet have a unicast IP */
567 return NULL;
568 }
569
570 static void
571 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
572 uint64_t *cellbytes, uint64_t *allbytes)
573 {
574 int64_t mycellbytes = 0;
575 uint64_t myallbytes = 0;
576 int i;
577
578 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
579 if (mpte->mpte_itfstats[i].is_expensive) {
580 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
581 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
582 }
583
584 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
585 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
586 }
587
588 if (initial_cell) {
589 mycellbytes -= mpte->mpte_init_txbytes;
590 mycellbytes -= mpte->mpte_init_rxbytes;
591 }
592
593 if (mycellbytes < 0) {
594 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
595 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
596 *cellbytes = 0;
597 *allbytes = 0;
598 } else {
599 *cellbytes = mycellbytes;
600 *allbytes = myallbytes;
601 }
602 }
603
604 static void
605 mptcpstats_session_wrapup(struct mptses *mpte)
606 {
607 boolean_t cell = mpte->mpte_initial_cell;
608
609 switch (mpte->mpte_svctype) {
610 case MPTCP_SVCTYPE_HANDOVER:
611 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
612 tcpstat.tcps_mptcp_fp_handover_attempt++;
613
614 if (cell && mpte->mpte_handshake_success) {
615 tcpstat.tcps_mptcp_fp_handover_success_cell++;
616
617 if (mpte->mpte_used_wifi) {
618 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
619 }
620 } else if (mpte->mpte_handshake_success) {
621 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
622
623 if (mpte->mpte_used_cell) {
624 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
625 }
626 }
627 } else {
628 tcpstat.tcps_mptcp_handover_attempt++;
629
630 if (cell && mpte->mpte_handshake_success) {
631 tcpstat.tcps_mptcp_handover_success_cell++;
632
633 if (mpte->mpte_used_wifi) {
634 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
635 }
636 } else if (mpte->mpte_handshake_success) {
637 tcpstat.tcps_mptcp_handover_success_wifi++;
638
639 if (mpte->mpte_used_cell) {
640 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
641 }
642 }
643 }
644
645 if (mpte->mpte_handshake_success) {
646 uint64_t cellbytes;
647 uint64_t allbytes;
648
649 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
650
651 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
652 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
653 }
654 break;
655 case MPTCP_SVCTYPE_INTERACTIVE:
656 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
657 tcpstat.tcps_mptcp_fp_interactive_attempt++;
658
659 if (mpte->mpte_handshake_success) {
660 tcpstat.tcps_mptcp_fp_interactive_success++;
661
662 if (!cell && mpte->mpte_used_cell) {
663 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
664 }
665 }
666 } else {
667 tcpstat.tcps_mptcp_interactive_attempt++;
668
669 if (mpte->mpte_handshake_success) {
670 tcpstat.tcps_mptcp_interactive_success++;
671
672 if (!cell && mpte->mpte_used_cell) {
673 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
674 }
675 }
676 }
677
678 if (mpte->mpte_handshake_success) {
679 uint64_t cellbytes;
680 uint64_t allbytes;
681
682 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
683
684 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
685 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
686 }
687 break;
688 case MPTCP_SVCTYPE_AGGREGATE:
689 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
690 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
691
692 if (mpte->mpte_handshake_success) {
693 tcpstat.tcps_mptcp_fp_aggregate_success++;
694 }
695 } else {
696 tcpstat.tcps_mptcp_aggregate_attempt++;
697
698 if (mpte->mpte_handshake_success) {
699 tcpstat.tcps_mptcp_aggregate_success++;
700 }
701 }
702
703 if (mpte->mpte_handshake_success) {
704 uint64_t cellbytes;
705 uint64_t allbytes;
706
707 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
708
709 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
710 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
711 }
712 break;
713 }
714
715 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
716 tcpstat.tcps_mptcp_back_to_wifi++;
717 }
718
719 if (mpte->mpte_triggered_cell) {
720 tcpstat.tcps_mptcp_triggered_cell++;
721 }
722 }
723
724 /*
725 * Destroy an MPTCP session.
726 */
727 static void
728 mptcp_session_destroy(struct mptses *mpte)
729 {
730 struct mptcb *mp_tp = mpte->mpte_mptcb;
731
732 VERIFY(mp_tp != NULL);
733 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
734
735 mptcpstats_session_wrapup(mpte);
736 mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
737 mptcp_flush_sopts(mpte);
738
739 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
740 _FREE(mpte->mpte_itfinfo, M_TEMP);
741 }
742 mpte->mpte_itfinfo = NULL;
743
744 m_freem_list(mpte->mpte_reinjectq);
745
746 os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
747 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
748 }
749
750 boolean_t
751 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
752 {
753 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
754 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
755 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
756 }
757
758 static int
759 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
760 const struct in_addr *addrv4)
761 {
762 static const struct in6_addr well_known_prefix = {
763 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
764 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
765 0x00, 0x00, 0x00, 0x00},
766 };
767 const char *ptrv4 = (const char *)addrv4;
768 char buf[MAX_IPv6_STR_LEN];
769 char *ptr = (char *)addr;
770
771 if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
772 IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
773 IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
774 IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
775 IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
776 IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
777 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
778 return -1;
779 }
780
781 /* Check for the well-known prefix */
782 if (len == NAT64_PREFIX_LEN_96 &&
783 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
784 if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
785 IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
786 return -1;
787 }
788 }
789
790 switch (len) {
791 case NAT64_PREFIX_LEN_96:
792 memcpy(ptr + 12, ptrv4, 4);
793 break;
794 case NAT64_PREFIX_LEN_64:
795 memcpy(ptr + 9, ptrv4, 4);
796 break;
797 case NAT64_PREFIX_LEN_56:
798 memcpy(ptr + 7, ptrv4, 1);
799 memcpy(ptr + 9, ptrv4 + 1, 3);
800 break;
801 case NAT64_PREFIX_LEN_48:
802 memcpy(ptr + 6, ptrv4, 2);
803 memcpy(ptr + 9, ptrv4 + 2, 2);
804 break;
805 case NAT64_PREFIX_LEN_40:
806 memcpy(ptr + 5, ptrv4, 3);
807 memcpy(ptr + 9, ptrv4 + 3, 1);
808 break;
809 case NAT64_PREFIX_LEN_32:
810 memcpy(ptr + 4, ptrv4, 4);
811 break;
812 default:
813 panic("NAT64-prefix len is wrong: %u\n", len);
814 }
815
816 os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
817 __func__, len,
818 inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));
819
820 return 0;
821 }
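/*
 * Worked example (RFC 6052): with the well-known 96-bit prefix
 * 64:ff9b::/96, embedding the IPv4 address 192.0.2.33 (0xc0000221) in
 * the last four bytes yields 64:ff9b::c000:221. For the shorter prefix
 * lengths, byte 8 of the IPv6 address (bits 64-71, the "u" octet) must
 * stay zero per RFC 6052, which is why the memcpy()s above skip over
 * ptr + 8 and split the IPv4 bytes around it.
 */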
822
823 static void
824 mptcp_trigger_cell_bringup(struct mptses *mpte)
825 {
826 struct socket *mp_so = mptetoso(mpte);
827
828 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
829 uuid_string_t uuidstr;
830 int err;
831
832 socket_unlock(mp_so, 0);
833 err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
834 TRUE);
835 socket_lock(mp_so, 0);
836
837 if (err == 0) {
838 mpte->mpte_triggered_cell = 1;
839 }
840
841 uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
842 os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
843 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
844 } else {
845 os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
846 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
847 }
848 }
849
850 static boolean_t
851 mptcp_subflow_disconnecting(struct mptsub *mpts)
852 {
853 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
854 return true;
855 }
856
857 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
858 return true;
859 }
860
861 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
862 return true;
863 }
864
865 return false;
866 }
867
868 void
869 mptcp_check_subflows_and_add(struct mptses *mpte)
870 {
871 struct mptcb *mp_tp = mpte->mpte_mptcb;
872 boolean_t cellular_viable = FALSE;
873 boolean_t want_cellular = TRUE;
874 uint32_t i;
875
876 if (!mptcp_ok_to_create_subflows(mp_tp)) {
877 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
878 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
879 return;
880 }
881
882 if (mptcp_get_session_dst(mpte, false, false) == NULL) {
883 return;
884 }
885
886 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
887 boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
888 struct mpt_itf_info *info;
889 struct sockaddr_in6 nat64pre;
890 struct sockaddr *dst;
891 struct mptsub *mpts;
892 struct ifnet *ifp;
893 uint32_t ifindex;
894
895 info = &mpte->mpte_itfinfo[i];
896
897 ifindex = info->ifindex;
898 if (ifindex == IFSCOPE_NONE) {
899 continue;
900 }
901
902 os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
903 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
904 info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);
905
906 if (info->no_mptcp_support) {
907 continue;
908 }
909
910 ifnet_head_lock_shared();
911 ifp = ifindex2ifnet[ifindex];
912 ifnet_head_done();
913
914 if (ifp == NULL) {
915 continue;
916 }
917
918 if (IFNET_IS_CELLULAR(ifp)) {
919 cellular_viable = TRUE;
920 }
921
922 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
923 const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
924 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
925
926 if (subifp == NULL) {
927 continue;
928 }
929
930 /*
931 * If there is at least one functioning subflow on WiFi
932 * and we are checking for the cell interface, then
933 * we always need to ask symptoms for permission as
934 * cell is triggered even if WiFi is available.
935 */
936 if (!IFNET_IS_CELLULAR(subifp) &&
937 !mptcp_subflow_disconnecting(mpts) &&
938 IFNET_IS_CELLULAR(ifp)) {
939 need_to_ask_symptoms = TRUE;
940 }
941
942 /*
943 * In Handover mode, only create cell subflow if
944 * 1. Wi-Fi Assist is active
945 * 2. Symptoms marked WiFi as weak
946 * 3. We are experiencing RTOs or we are not sending data.
947 *
948 * This covers the scenarios where:
949 * 1. We send and get retransmission timeouts (thus,
950 * we confirmed that WiFi is indeed bad).
951 * 2. We are not sending and the server tries to send.
952 * Establishing a cell-subflow gives the server a
953 * chance to send us some data over cell if WiFi
954 * is dead. We establish the subflow with the
955 * backup-bit set, so the server is not allowed to
956 * send on this subflow as long as WiFi is providing
957 * good performance.
958 */
959 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
960 !IFNET_IS_CELLULAR(subifp) &&
961 !mptcp_subflow_disconnecting(mpts) &&
962 (mptcp_is_wifi_unusable_for_session(mpte) == 0 ||
963 (tp->t_rxtshift < mptcp_fail_thresh * 2 && mptetoso(mpte)->so_snd.sb_cc))) {
964 os_log_debug(mptcp_log_handle,
965 "%s - %lx: handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
966 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
967 mptcp_is_wifi_unusable_for_session(mpte),
968 tp->t_rxtshift,
969 !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
970 mptetoso(mpte)->so_snd.sb_cc,
971 ifindex, subifp->if_index,
972 tp->t_srtt >> TCP_RTT_SHIFT,
973 tp->t_rttvar >> TCP_RTTVAR_SHIFT,
974 tp->t_rxtcur);
975 found = TRUE;
976
977 /* We found a proper subflow on WiFi - no need for cell */
978 want_cellular = FALSE;
979 break;
980 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
981 uint64_t time_now = mach_continuous_time();
982
983 os_log(mptcp_log_handle,
984 "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
985 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
986 time_now, mptcp_is_wifi_unusable_for_session(mpte),
987 IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
988 mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);
989
990 if (!IFNET_IS_CELLULAR(subifp) &&
991 !mptcp_subflow_disconnecting(mpts) &&
992 (mpte->mpte_time_target == 0 ||
993 (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
994 !mptcp_is_wifi_unusable_for_session(mpte))) {
995 found = TRUE;
996
997 want_cellular = FALSE;
998 break;
999 }
1000 } else {
1001 os_log_debug(mptcp_log_handle,
1002 "%s - %lx: svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u rtt %u rttvar %u rto %u\n",
1003 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1004 mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
1005 mptcp_is_wifi_unusable_for_session(mpte), tp->t_rxtshift,
1006 !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc,
1007 tp->t_srtt >> TCP_RTT_SHIFT,
1008 tp->t_rttvar >> TCP_RTTVAR_SHIFT,
1009 tp->t_rxtcur);
1010 }
1011
1012 if (subifp->if_index == ifindex &&
1013 !mptcp_subflow_disconnecting(mpts)) {
1014 /*
1015 * We found a subflow on this interface.
1016 * No need to create a new one.
1017 */
1018 found = TRUE;
1019 break;
1020 }
1021 }
1022
1023 if (found) {
1024 continue;
1025 }
1026
1027 if (need_to_ask_symptoms &&
1028 !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
1029 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
1030 mptcp_developer_mode == 0) {
1031 mptcp_ask_symptoms(mpte);
1032 return;
1033 }
1034
1035 dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);
1036
1037 if (dst->sa_family == AF_INET &&
1038 !info->has_v4_conn && info->has_nat64_conn) {
1039 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
1040 int error, j;
1041
1042 bzero(&nat64pre, sizeof(struct sockaddr_in6));
1043
1044 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
1045 if (error) {
1046 os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
1047 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
1048 continue;
1049 }
1050
1051 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
1052 if (nat64prefixes[j].prefix_len != 0) {
1053 break;
1054 }
1055 }
1056
1057 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
1058
1059 error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
1060 nat64prefixes[j].prefix_len,
1061 &((struct sockaddr_in *)(void *)dst)->sin_addr);
1062 if (error != 0) {
1063 os_log_info(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
1064 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1065 continue;
1066 }
1067
1068 memcpy(&nat64pre.sin6_addr,
1069 &nat64prefixes[j].ipv6_prefix,
1070 sizeof(nat64pre.sin6_addr));
1071 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
1072 nat64pre.sin6_family = AF_INET6;
1073 nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
1074 nat64pre.sin6_flowinfo = 0;
1075 nat64pre.sin6_scope_id = 0;
1076
1077 dst = (struct sockaddr *)&nat64pre;
1078 }
1079
1080 /* Initial subflow started on a NAT64'd address? */
1081 if (!(mpte->mpte_flags & MPTE_UNICAST_IP) &&
1082 mpte->mpte_dst.sa_family == AF_INET6 &&
1083 mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
1084 dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
1085 }
1086
1087 if (dst->sa_family == AF_INET && !info->has_v4_conn) {
1088 continue;
1089 }
1090 if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
1091 continue;
1092 }
1093
1094 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
1095 }
1096
1097 if (!cellular_viable && want_cellular) {
1098 /* Trigger Cell Bringup */
1099 mptcp_trigger_cell_bringup(mpte);
1100 }
1101 }
1102
1103 static void
1104 mptcp_remove_cell_subflows(struct mptses *mpte)
1105 {
1106 struct mptsub *mpts, *tmpts;
1107 boolean_t found = false;
1108
1109 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1110 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1111
1112 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1113 continue;
1114 }
1115
1116 /* We have a functioning subflow on WiFi. No need for cell! */
1117 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1118 !mptcp_subflow_disconnecting(mpts)) {
1119 found = true;
1120 }
1121 }
1122
1123 /* Didn't find a functional subflow on WiFi - stay on cell */
1124 if (!found) {
1125 return;
1126 }
1127
1128 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1129 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1130
1131 /* Only remove cellular subflows */
1132 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
1133 continue;
1134 }
1135
1136 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
1137 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1138
1139 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1140 }
1141
1142 return;
1143 }
1144
1145 /* Remove the cell subflows if there is a working non-cellular subflow */
1146 static void
1147 mptcp_handover_subflows_remove(struct mptses *mpte)
1148 {
1149 int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
1150 boolean_t found_working_subflow = false;
1151 struct mptsub *mpts;
1152
1153 /*
1154 * Look for a subflow that is on a non-cellular interface
1155 * and actually works (aka, no retransmission timeout).
1156 */
1157 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1158 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1159 struct socket *so;
1160 struct tcpcb *tp;
1161
1162 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1163 continue;
1164 }
1165
1166 so = mpts->mpts_socket;
1167 tp = sototcpcb(so);
1168
1169 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1170 tp->t_state != TCPS_ESTABLISHED) {
1171 continue;
1172 }
1173
1174 os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
1175 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
1176
1177 /* Is this subflow in good condition? */
1178 if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc) {
1179 found_working_subflow = true;
1180 }
1181
1182 /* Or WiFi is fine */
1183 if (!wifi_unusable) {
1184 found_working_subflow = true;
1185 }
1186 }
1187
1188 /*
1189 * Couldn't find a working subflow, let's not remove those on a cellular
1190 * interface.
1191 */
1192 if (!found_working_subflow) {
1193 return;
1194 }
1195
1196 mptcp_remove_cell_subflows(mpte);
1197 }
1198
1199 static void
1200 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1201 {
1202 uint64_t time_now = mach_continuous_time();
1203
1204 if (mpte->mpte_time_target != 0 &&
1205 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1206 mptcp_is_wifi_unusable_for_session(mpte)) {
1207 /* WiFi is bad and we are below the target - don't remove any subflows */
1208 return;
1209 }
1210
1211 mptcp_remove_cell_subflows(mpte);
1212 }
1213
1214 /*
1215 * Based on the MPTCP Service-type and the state of the subflows, we
1216 * will destroy subflows here.
1217 */
1218 void
1219 mptcp_check_subflows_and_remove(struct mptses *mpte)
1220 {
1221 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1222 return;
1223 }
1224
1225 socket_lock_assert_owned(mptetoso(mpte));
1226
1227 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1228 mptcp_handover_subflows_remove(mpte);
1229 }
1230
1231 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1232 mptcp_targetbased_subflows_remove(mpte);
1233 }
1234 }
1235
1236 static void
1237 mptcp_remove_subflows(struct mptses *mpte)
1238 {
1239 struct mptsub *mpts, *tmpts;
1240
1241 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1242 return;
1243 }
1244
1245 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1246 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1247 boolean_t found = false;
1248 uint32_t ifindex;
1249 uint32_t i;
1250
1251 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1252 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1253
1254 os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
1255 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
1256 ifp ? ifp->if_index : -1);
1257 soevent(mpts->mpts_socket,
1258 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1259
1260 continue;
1261 }
1262
1263 if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
1264 continue;
1265 }
1266
1267 if (ifp) {
1268 ifindex = ifp->if_index;
1269 } else {
1270 ifindex = mpts->mpts_ifscope;
1271 }
1272
1273 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1274 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1275 continue;
1276 }
1277
1278 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1279 if (mpts->mpts_dst.sa_family == AF_INET6 &&
1280 (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
1281 found = true;
1282 break;
1283 }
1284
1285 if (mpts->mpts_dst.sa_family == AF_INET &&
1286 mpte->mpte_itfinfo[i].has_v4_conn) {
1287 found = true;
1288 break;
1289 }
1290 }
1291 }
1292
1293 if (!found) {
1294 os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
1295 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1296 ifindex, mpts->mpts_flags);
1297
1298 soevent(mpts->mpts_socket,
1299 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1300 }
1301 }
1302 }
1303
1304 static void
1305 mptcp_create_subflows(__unused void *arg)
1306 {
1307 struct mppcb *mpp;
1308
1309 /*
1310 * Start with clearing, because we might be processing connections
1311 * while a new event comes in.
1312 */
1313 if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
1314 os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
1315 }
1316
1317 /* Iterate over all MPTCP connections */
1318
1319 lck_mtx_lock(&mtcbinfo.mppi_lock);
1320
1321 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
1322 struct socket *mp_so = mpp->mpp_socket;
1323 struct mptses *mpte = mpp->mpp_pcbe;
1324
1325 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1326 continue;
1327 }
1328
1329 socket_lock(mp_so, 1);
1330 VERIFY(mp_so->so_usecount > 0);
1331
1332 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
1333
1334 mptcp_check_subflows_and_add(mpte);
1335 mptcp_remove_subflows(mpte);
1336
1337 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1338 socket_unlock(mp_so, 1);
1339 }
1340
1341 lck_mtx_unlock(&mtcbinfo.mppi_lock);
1342 }
1343
1344 /*
1345 * We need this because we are coming from an NECP-event. This event gets posted
1346 * while holding NECP-locks. The creation of the subflow, however, leads us
1347 * back into NECP (e.g., to add the necp_cb and also from tcp_connect).
1348 * So, we would deadlock there as we already hold the NECP-lock.
1349 *
1350 * So, let's schedule this separately. It also gives NECP the chance to make
1351 * progress, without having to wait for MPTCP to finish its subflow creation.
1352 */
1353 void
1354 mptcp_sched_create_subflows(struct mptses *mpte)
1355 {
1356 struct mppcb *mpp = mpte->mpte_mppcb;
1357 struct mptcb *mp_tp = mpte->mpte_mptcb;
1358 struct socket *mp_so = mpp->mpp_socket;
1359
1360 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1361 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1362 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1363 return;
1364 }
1365
1366 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1367 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1368 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1369 }
1370
1371 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1372 return;
1373 }
1374
1375 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1376 timeout(mptcp_create_subflows, NULL, hz / 10);
1377 }
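/*
 * Usage sketch (illustrative): a caller that may be running under NECP
 * locks, such as mptcp_subflow_necp_cb() below, defers the work through
 * this scheduler instead of calling mptcp_check_subflows_and_add()
 * directly:
 *
 *	mpts->mpts_flags |= MPTSF_CLOSE_REQD;
 *	mptcp_sched_create_subflows(mpte);	// runs later via timeout()
 */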
1378
1379 /*
1380 * Allocate an MPTCP socket option structure.
1381 */
1382 struct mptopt *
1383 mptcp_sopt_alloc(int how)
1384 {
1385 struct mptopt *mpo;
1386
1387 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
1388 zalloc_noblock(mptopt_zone);
1389 if (mpo != NULL) {
1390 bzero(mpo, mptopt_zone_size);
1391 }
1392
1393 return mpo;
1394 }
1395
1396 /*
1397 * Free an MPTCP socket option structure.
1398 */
1399 void
1400 mptcp_sopt_free(struct mptopt *mpo)
1401 {
1402 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1403
1404 zfree(mptopt_zone, mpo);
1405 }
1406
1407 /*
1408 * Add a socket option to the MPTCP socket option list.
1409 */
1410 void
1411 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1412 {
1413 socket_lock_assert_owned(mptetoso(mpte));
1414 mpo->mpo_flags |= MPOF_ATTACHED;
1415 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1416 }
1417
1418 /*
1419 * Remove a socket option from the MPTCP socket option list.
1420 */
1421 void
1422 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1423 {
1424 socket_lock_assert_owned(mptetoso(mpte));
1425 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1426 mpo->mpo_flags &= ~MPOF_ATTACHED;
1427 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1428 }
1429
1430 /*
1431 * Search for an existing <sopt_level,sopt_name> socket option.
1432 */
1433 struct mptopt *
1434 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1435 {
1436 struct mptopt *mpo;
1437
1438 socket_lock_assert_owned(mptetoso(mpte));
1439
1440 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1441 if (mpo->mpo_level == sopt->sopt_level &&
1442 mpo->mpo_name == sopt->sopt_name) {
1443 break;
1444 }
1445 }
1446 return mpo;
1447 }
1448
1449 /*
1450 * Allocate an MPTCP subflow structure.
1451 */
1452 static struct mptsub *
1453 mptcp_subflow_alloc(void)
1454 {
1455 struct mptsub *mpts = zalloc(mptsub_zone);
1456
1457 if (mpts == NULL) {
1458 return NULL;
1459 }
1460
1461 bzero(mpts, mptsub_zone_size);
1462 return mpts;
1463 }
1464
1465 /*
1466 * Deallocate a subflow structure, called when all of the references held
1467 * on it have been released. This implies that the subflow has been deleted.
1468 */
1469 static void
1470 mptcp_subflow_free(struct mptsub *mpts)
1471 {
1472 VERIFY(mpts->mpts_refcnt == 0);
1473 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1474 VERIFY(mpts->mpts_mpte == NULL);
1475 VERIFY(mpts->mpts_socket == NULL);
1476
1477 if (mpts->mpts_src != NULL) {
1478 FREE(mpts->mpts_src, M_SONAME);
1479 mpts->mpts_src = NULL;
1480 }
1481
1482 zfree(mptsub_zone, mpts);
1483 }
1484
1485 static void
1486 mptcp_subflow_addref(struct mptsub *mpts)
1487 {
1488 if (++mpts->mpts_refcnt == 0) {
1489 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1490 /* NOTREACHED */
1491 }
1492 }
1493
1494 static void
1495 mptcp_subflow_remref(struct mptsub *mpts)
1496 {
1497 if (mpts->mpts_refcnt == 0) {
1498 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
1499 /* NOTREACHED */
1500 }
1501 if (--mpts->mpts_refcnt > 0) {
1502 return;
1503 }
1504
1505 /* callee will unlock and destroy lock */
1506 mptcp_subflow_free(mpts);
1507 }
1508
1509 static void
1510 mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
1511 {
1512 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
1513 struct tcpcb *tp = sototcpcb(so);
1514
1515 /*
1516 * From this moment on, the subflow is linked to the MPTCP-connection.
1517 * Locking, etc. now happens at the MPTCP layer.
1518 */
1519 tp->t_mptcb = mpte->mpte_mptcb;
1520 so->so_flags |= SOF_MP_SUBFLOW;
1521 mp_so->so_usecount++;
1522
1523 /*
1524 * Insert the subflow into the list, and associate the MPTCP PCB
1525 * as well as the subflow socket. From this point on, removing
1526 * the subflow needs to be done via mptcp_subflow_del().
1527 */
1528 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1529 mpte->mpte_numflows++;
1530
1531 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1532 mpts->mpts_mpte = mpte;
1533 mpts->mpts_socket = so;
1534 tp->t_mpsub = mpts;
1535 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1536 mptcp_subflow_addref(mpts); /* for subflow socket */
1537 }
1538
1539 static void
1540 mptcp_subflow_necp_cb(void *handle, __unused int action,
1541 __unused uint32_t interface_index,
1542 uint32_t necp_flags, bool *viable)
1543 {
1544 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1545 struct inpcb *inp = (struct inpcb *)handle;
1546 struct socket *so = inp->inp_socket;
1547 struct mptsub *mpts;
1548 struct mptses *mpte;
1549
1550 if (low_power) {
1551 action = NECP_CLIENT_CBACTION_NONVIABLE;
1552 }
1553
1554 if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
1555 return;
1556 }
1557
1558 /*
1559 * The socket is being garbage-collected. There is nothing to be done
1560 * here.
1561 */
1562 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
1563 return;
1564 }
1565
1566 socket_lock(so, 1);
1567
1568 /* Check again after we acquired the lock. */
1569 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1570 goto out;
1571 }
1572
1573 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1574 mpts = sototcpcb(so)->t_mpsub;
1575
1576 os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
1577 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);
1578
1579 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1580
1581 mptcp_sched_create_subflows(mpte);
1582
1583 if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
1584 mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
1585 viable != NULL) {
1586 *viable = 1;
1587 }
1588
1589 out:
1590 socket_unlock(so, 1);
1591 }
1592
1593 /*
1594 * Create an MPTCP subflow socket.
1595 */
1596 static int
1597 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1598 struct socket **so)
1599 {
1600 lck_mtx_t *subflow_mtx;
1601 struct mptopt smpo, *mpo, *tmpo;
1602 struct proc *p;
1603 struct socket *mp_so;
1604 int error;
1605
1606 *so = NULL;
1607
1608 mp_so = mptetoso(mpte);
1609
1610 p = proc_find(mp_so->last_pid);
1611 if (p == PROC_NULL) {
1612 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1613 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1614
1615 return ESRCH;
1616 }
1617
1618 /*
1619 * Create the subflow socket (multipath subflow, non-blocking.)
1620 *
1621 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1622 * socket; it will be cleared when the socket is peeled off or closed.
1623 * It also indicates to the underlying TCP to handle MPTCP options.
1624 * A multipath subflow socket implies SS_NOFDREF state.
1625 */
1626
1627 /*
1628 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1629 * the ipi-lock. We cannot hold the socket-lock at that point.
1630 */
1631 socket_unlock(mp_so, 0);
1632 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1633 SOCF_MPTCP, PROC_NULL);
1634 socket_lock(mp_so, 0);
1635 if (error) {
1636 os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1637 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1638
1639 proc_rele(p);
1640
1641 mptcp_subflow_free(mpts);
1642 return error;
1643 }
1644
1645 /*
1646 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1647 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1648 * Which is why we also need to get the lock with pr_getlock, as after
1649 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1650 */
1651 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1652 lck_mtx_lock(subflow_mtx);
1653
1654 /*
1655 * Must be the first thing we do, to make sure all pointers for this
1656 * subflow are set.
1657 */
1658 mptcp_subflow_attach(mpte, mpts, *so);
1659
1660 /*
1661 * A multipath subflow socket is used internally in the kernel,
1662 * therefore it does not have a file descriptor associated by
1663 * default.
1664 */
1665 (*so)->so_state |= SS_NOFDREF;
1666
1667 lck_mtx_unlock(subflow_mtx);
1668
1669 /* prevent the socket buffers from being compressed */
1670 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1671 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1672
1673 /* Inherit preconnect and TFO data flags */
1674 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1675 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1676 }
1677 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1678 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1679 }
1680
1681 /* Inherit uuid and create the related flow. */
1682 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1683 struct mptcb *mp_tp = mpte->mpte_mptcb;
1684
1685 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1686
1687 /*
1688 * A note on the unlock: With MPTCP, we call
1689 * necp_client_register_socket_flow multiple times. This is problematic,
1690 * because now the lock-ordering guarantee (first necp-locks,
1691 * then socket-locks) is no more respected. So, we need to
1692 * unlock here.
1693 */
1694 socket_unlock(mp_so, 0);
1695 error = necp_client_register_socket_flow(mp_so->last_pid,
1696 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1697 socket_lock(mp_so, 0);
1698
1699 if (error) {
1700 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1701 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1702
1703 goto out_err;
1704 }
1705
1706 /* Possible state-change during the unlock above */
1707 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1708 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1709 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1710 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1711 mp_tp->mpt_state, mp_tp->mpt_flags);
1712
1713 error = EINVAL;
1714 goto out_err;
1715 }
1716
1717 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1718 }
1719
1720 /* Needs to happen prior to the delegation! */
1721 (*so)->last_pid = mp_so->last_pid;
1722
1723 if (mp_so->so_flags & SOF_DELEGATED) {
1724 if (mpte->mpte_epid) {
1725 error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1726 if (error) {
1727 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1728 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1729 goto out_err;
1730 }
1731 }
1732 if (!uuid_is_null(mpte->mpte_euuid)) {
1733 error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1734 if (error) {
1735 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1736 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1737 goto out_err;
1738 }
1739 }
1740 }
1741
1742 /* inherit the other socket options */
1743 bzero(&smpo, sizeof(smpo));
1744 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1745 smpo.mpo_level = SOL_SOCKET;
1746 smpo.mpo_intval = 1;
1747
1748 /* disable SIGPIPE */
1749 smpo.mpo_name = SO_NOSIGPIPE;
1750 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1751 goto out_err;
1752 }
1753
1754 /* find out if the subflow's source address goes away */
1755 smpo.mpo_name = SO_NOADDRERR;
1756 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1757 goto out_err;
1758 }
1759
1760 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1761 /*
1762 * On secondary subflows we might need to set the cell-fallback
1763 * flag (see conditions in mptcp_subflow_sosetopt).
1764 */
1765 smpo.mpo_level = SOL_SOCKET;
1766 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1767 smpo.mpo_intval = 1;
1768 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1769 goto out_err;
1770 }
1771 }
1772
1773 /* replay setsockopt(2) on the subflow sockets for eligible options */
1774 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1775 int interim;
1776
1777 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1778 continue;
1779 }
1780
1781 /*
1782 * Skip those that are handled internally; these options
1783 * should not have been recorded nor marked with
1784 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1785 */
1786 if (mpo->mpo_level == SOL_SOCKET &&
1787 (mpo->mpo_name == SO_NOSIGPIPE ||
1788 mpo->mpo_name == SO_NOADDRERR ||
1789 mpo->mpo_name == SO_KEEPALIVE)) {
1790 continue;
1791 }
1792
1793 interim = (mpo->mpo_flags & MPOF_INTERIM);
1794 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1795 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1796 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1797 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1798 mpo->mpo_intval);
1799 mptcp_sopt_remove(mpte, mpo);
1800 mptcp_sopt_free(mpo);
1801 continue;
1802 }
1803 }
1804
1805 /*
1806 * We need to receive everything that the subflow socket has,
1807 * so use a customized socket receive function. We will undo
1808 * this when the socket is peeled off or closed.
1809 */
1810 switch (dom) {
1811 case PF_INET:
1812 (*so)->so_proto = &mptcp_subflow_protosw;
1813 break;
1814 #if INET6
1815 case PF_INET6:
1816 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1817 break;
1818 #endif /* INET6 */
1819 default:
1820 VERIFY(0);
1821 /* NOTREACHED */
1822 }
1823
1824 proc_rele(p);
1825
1826 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1827 int, dom, int, error);
1828
1829 return 0;
1830
1831 out_err:
1832 mptcp_subflow_abort(mpts, error);
1833
1834 proc_rele(p);
1835
1836 return error;
1837 }
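
/*
 * Usage sketch (illustrative, not an additional call-site in this
 * file): the create/connect pair as mptcp_subflow_add() below drives
 * it. Error handling is elided; ownership of mpts moves to
 * mptcp_subflow_socreate() on the call.
 *
 *	struct socket *so = NULL;
 *
 *	error = mptcp_subflow_socreate(mpte, mpts, AF_INET, &so);
 *	if (error == 0)
 *		error = mptcp_subflow_soconnectx(mpte, mpts);
 */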
1838
1839 /*
1840 * Close an MPTCP subflow socket.
1841 *
1842 * Note that this may be called on an embryonic subflow, and the only
1843 * thing that is guaranteed valid is the protocol-user request.
1844 */
1845 static void
1846 mptcp_subflow_soclose(struct mptsub *mpts)
1847 {
1848 struct socket *so = mpts->mpts_socket;
1849
1850 if (mpts->mpts_flags & MPTSF_CLOSED) {
1851 return;
1852 }
1853
1854 VERIFY(so != NULL);
1855 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1856 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1857
1858 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1859 struct socket *, so,
1860 struct sockbuf *, &so->so_rcv,
1861 struct sockbuf *, &so->so_snd,
1862 struct mptses *, mpts->mpts_mpte);
1863
1864 mpts->mpts_flags |= MPTSF_CLOSED;
1865
1866 if (so->so_retaincnt == 0) {
1867 soclose_locked(so);
1868
1869 return;
1870 } else {
1871 VERIFY(so->so_usecount > 0);
1872 so->so_usecount--;
1873 }
1874
1875 return;
1876 }
1877
1878 /*
1879 * Connect an MPTCP subflow socket.
1880 *
1881 * Note that in the pending connect case, the subflow socket may have been
1882 * bound to an interface and/or a source IP address which may no longer be
1883 * around by the time this routine is called; in that case the connect attempt
1884 * will most likely fail.
1885 */
1886 static int
1887 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1888 {
1889 char dbuf[MAX_IPv6_STR_LEN];
1890 struct socket *mp_so, *so;
1891 struct mptcb *mp_tp;
1892 struct sockaddr *dst;
1893 struct proc *p;
1894 int af, error, dport;
1895
1896 mp_so = mptetoso(mpte);
1897 mp_tp = mpte->mpte_mptcb;
1898 so = mpts->mpts_socket;
1899 af = mpts->mpts_dst.sa_family;
1900 dst = &mpts->mpts_dst;
1901
1902 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1903 VERIFY(mpts->mpts_socket != NULL);
1904 VERIFY(af == AF_INET || af == AF_INET6);
1905
1906 if (af == AF_INET) {
1907 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1908 dport = ntohs(SIN(dst)->sin_port);
1909 } else {
1910 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1911 dport = ntohs(SIN6(dst)->sin6_port);
1912 }
1913
1914 os_log_info(mptcp_log_handle,
1915 "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1916 mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1917
1918 p = proc_find(mp_so->last_pid);
1919 if (p == PROC_NULL) {
1920 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1921 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1922
1923 return ESRCH;
1924 }
1925
1926 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1927
1928 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1929
1930 /* connect the subflow socket */
1931 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1932 p, mpts->mpts_ifscope,
1933 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1934
1935 mpts->mpts_iss = sototcpcb(so)->iss;
1936
1937 /* See tcp_connect_complete */
1938 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1939 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1940 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1941 }
1942
1943 /* Allocate a unique address id per subflow */
1944 mpte->mpte_addrid_last++;
1945 if (mpte->mpte_addrid_last == 0) {
1946 mpte->mpte_addrid_last++;
1947 }
1948
1949 proc_rele(p);
1950
1951 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1952 struct mptsub *, mpts, int, error);
1953 if (error) {
1954 os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1955 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
1956 }
1957
1958 return error;
1959 }
1960
1961 static int
1962 mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
1963 uint32_t rseq, uint16_t dlen)
1964 {
1965 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
1966
1967 if (m_pktlen(m) == 0) {
1968 return 0;
1969 }
1970
1971 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1972 if (off && (dsn != m->m_pkthdr.mp_dsn ||
1973 rseq != m->m_pkthdr.mp_rseq ||
1974 dlen != m->m_pkthdr.mp_rlen)) {
1975 os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u , %u - %u, %u - %u\n",
1976 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
1977 (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
1978 rseq, m->m_pkthdr.mp_rseq,
1979 dlen, m->m_pkthdr.mp_rlen);
1980
1981 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1982 return -1;
1983 }
1984 m->m_pkthdr.mp_dsn += off;
1985 m->m_pkthdr.mp_rseq += off;
1986 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
1987 } else {
1988 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
1989 /* data arrived without a DSS option mapping */
1990
1991 /* initial subflow can fallback right after SYN handshake */
1992 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
1993 mptcp_notify_mpfail(so);
1994 } else {
1995 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1996
1997 return -1;
1998 }
1999 } else if (m->m_flags & M_PKTHDR) {
2000 /* We need to fake the DATA-mapping */
2001 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2002 m->m_pkthdr.mp_dsn = dsn + off;
2003 m->m_pkthdr.mp_rseq = rseq + off;
2004 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
2005 }
2006 }
2007
2008 mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
2009
2010 return 0;
2011 }
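
/*
 * Worked example of the mapping contract enforced above (made-up
 * numbers): if the first mbuf of a mapping carried { mp_dsn = 1000,
 * mp_rseq = 50, mp_rlen = 100 }, a continuation at off = 40 must
 * carry the same triple; mptcp_adj_rmap() then advances it to
 * { mp_dsn = 1040, mp_rseq = 90 } with mp_rlen reset to the mbuf's
 * own length. Any mismatch is treated as middlebox interference and
 * the subflow is reset via SO_FILT_HINT_MUSTRST.
 */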
2012
2013 /*
2014 * MPTCP subflow socket receive routine, derived from soreceive().
2015 */
2016 static int
2017 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
2018 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2019 {
2020 #pragma unused(uio)
2021 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2022 int flags, error = 0;
2023 struct proc *p = current_proc();
2024 struct mbuf *m, **mp = mp0;
2025 boolean_t proc_held = FALSE;
2026
2027 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2028
2029 #ifdef MORE_LOCKING_DEBUG
2030 if (so->so_usecount == 1) {
2031 panic("%s: so=%x no other reference on socket\n", __func__, so);
2032 /* NOTREACHED */
2033 }
2034 #endif
2035 /*
2036 * We return everything in the subflow's socket receive buffer
2037 * to the MPTCP layer, so we require that the caller passes in the
2038 * expected parameters.
2039 */
2040 if (mp == NULL || controlp != NULL) {
2041 return EINVAL;
2042 }
2043
2044 *mp = NULL;
2045 if (psa != NULL) {
2046 *psa = NULL;
2047 }
2048 if (flagsp != NULL) {
2049 flags = *flagsp & ~MSG_EOR;
2050 } else {
2051 flags = 0;
2052 }
2053
2054 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2055 return EOPNOTSUPP;
2056 }
2057
2058 flags |= (MSG_DONTWAIT | MSG_NBIO);
2059
2060 /*
2061 * If a recv attempt is made on a previously-accepted socket
2062 * that has been marked as inactive (disconnected), reject
2063 * the request.
2064 */
2065 if (so->so_flags & SOF_DEFUNCT) {
2066 struct sockbuf *sb = &so->so_rcv;
2067
2068 error = ENOTCONN;
2069 /*
2070 * This socket should have been disconnected and flushed
2071 * prior to being returned from sodefunct(); there should
2072 * be no data on its receive list, so panic otherwise.
2073 */
2074 if (so->so_state & SS_DEFUNCT) {
2075 sb_empty_assert(sb, __func__);
2076 }
2077 return error;
2078 }
2079
2080 /*
2081 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2082 * and if so just return to the caller. This could happen when
2083 * soreceive() is called by a socket upcall function during the
2084 * time the socket is freed. The socket buffer would have been
2085 * locked across the upcall, therefore we cannot put this thread
2086 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2087 * we may livelock), because the lock on the socket buffer will
2088 * only be released when the upcall routine returns to its caller.
2089 * Because the socket has been officially closed, there can be
2090 * no further read on it.
2091 *
2092 * A multipath subflow socket would have its SS_NOFDREF set by
2093 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2094 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2095 */
2096 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2097 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2098 return 0;
2099 }
2100
2101 /*
2102 * For consistency with soreceive() semantics, we need to obey
2103 * SB_LOCK in case some other code path has locked the buffer.
2104 */
2105 error = sblock(&so->so_rcv, 0);
2106 if (error != 0) {
2107 return error;
2108 }
2109
2110 m = so->so_rcv.sb_mb;
2111 if (m == NULL) {
2112 /*
2113 * Panic if we notice inconsistencies in the socket's
2114 * receive list; both sb_mb and sb_cc should correctly
2115 * reflect the contents of the list, otherwise we may
2116 * end up with false positives during select() or poll()
2117 * which could put the application in a bad state.
2118 */
2119 SB_MB_CHECK(&so->so_rcv);
2120
2121 if (so->so_error != 0) {
2122 error = so->so_error;
2123 so->so_error = 0;
2124 goto release;
2125 }
2126
2127 if (so->so_state & SS_CANTRCVMORE) {
2128 goto release;
2129 }
2130
2131 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2132 error = ENOTCONN;
2133 goto release;
2134 }
2135
2136 /*
2137 * MSG_DONTWAIT is implicitly set and this routine will
2138 * never block, so return EWOULDBLOCK when there is nothing.
2139 */
2140 error = EWOULDBLOCK;
2141 goto release;
2142 }
2143
2144 mptcp_update_last_owner(so, mp_so);
2145
2146 if (mp_so->last_pid != proc_pid(p)) {
2147 p = proc_find(mp_so->last_pid);
2148 if (p == PROC_NULL) {
2149 p = current_proc();
2150 } else {
2151 proc_held = TRUE;
2152 }
2153 }
2154
2155 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2156 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2157 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2158
2159 while (m != NULL) {
2160 int dlen = 0, dfin = 0, error_out = 0;
2161 struct mbuf *start = m;
2162 uint64_t dsn;
2163 uint32_t sseq;
2164 uint16_t orig_dlen;
2165 uint16_t csum;
2166
2167 VERIFY(m->m_nextpkt == NULL);
2168
2169 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2170 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2171 dsn = m->m_pkthdr.mp_dsn;
2172 sseq = m->m_pkthdr.mp_rseq;
2173 csum = m->m_pkthdr.mp_csum;
2174 } else {
2175 /* We fell back to plain TCP */
2176 if (mptcp_adj_rmap(so, m, 0, 0, 0, 0)) {
2177 error = EIO;
2178 *mp0 = NULL;
2179 goto release;
2180 }
2181
2182 sbfree(&so->so_rcv, m);
2183
2184 if (mp != NULL) {
2185 *mp = m;
2186 mp = &m->m_next;
2187 so->so_rcv.sb_mb = m = m->m_next;
2188 *mp = NULL;
2189 }
2190
2191 if (m != NULL) {
2192 so->so_rcv.sb_lastrecord = m;
2193 } else {
2194 SB_EMPTY_FIXUP(&so->so_rcv);
2195 }
2196
2197 continue;
2198 }
2199
2200 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2201 dfin = 1;
2202 }
2203
2204 /*
2205 * Check if the full mapping is now present
2206 */
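/*
 * Note (a hedged reading of the check below): a DATA_FIN occupies one
 * data-sequence number but carries no payload byte, so a mapping of
 * length dlen with dfin set only needs dlen - 1 bytes in the buffer.
 */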
2207 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
2208 mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
2209 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
2210 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
2211
2212 if (*mp0 == NULL) {
2213 error = EWOULDBLOCK;
2214 }
2215 goto release;
2216 }
2217
2218 /* Now, get the full mapping */
2219 while (dlen > 0) {
2220 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
2221 error_out = 1;
2222 error = EIO;
2223 dlen = 0;
2224 *mp0 = NULL;
2225 break;
2226 }
2227
2228 dlen -= m->m_len;
2229 sbfree(&so->so_rcv, m);
2230
2231 if (mp != NULL) {
2232 *mp = m;
2233 mp = &m->m_next;
2234 so->so_rcv.sb_mb = m = m->m_next;
2235 *mp = NULL;
2236 }
2237
2238 if (dlen - dfin == 0) {
2239 dlen = 0;
2240 }
2241
2242 VERIFY(dlen <= 0 || m);
2243 }
2244
2245 VERIFY(dlen == 0);
2246
2247 if (m != NULL) {
2248 so->so_rcv.sb_lastrecord = m;
2249 } else {
2250 SB_EMPTY_FIXUP(&so->so_rcv);
2251 }
2252
2253 if (error_out) {
2254 goto release;
2255 }
2256
2257 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
2258 error = EIO;
2259 *mp0 = NULL;
2260 goto release;
2261 }
2262
2263 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2264 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2265 }
2266
2267 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2268 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2269
2270 if (flagsp != NULL) {
2271 *flagsp |= flags;
2272 }
2273
2274 release:
2275 sbunlock(&so->so_rcv, TRUE);
2276
2277 if (proc_held) {
2278 proc_rele(p);
2279 }
2280
2281 return error;
2282 }
2283
2284 /*
2285 * MPTCP subflow socket send routine, derived from sosend().
2286 */
2287 static int
2288 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2289 struct mbuf *top, struct mbuf *control, int flags)
2290 {
2291 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2292 struct proc *p = current_proc();
2293 boolean_t en_tracing = FALSE, proc_held = FALSE;
2294 int en_tracing_val;
2295 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2296 int error;
2297
2298 VERIFY(control == NULL);
2299 VERIFY(addr == NULL);
2300 VERIFY(uio == NULL);
2301 VERIFY(flags == 0);
2302 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2303
2304 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2305 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2306
2307 /*
2308 * trace if tracing & network (vs. unix) sockets & and
2309 * non-loopback
2310 */
2311 if (ENTR_SHOULDTRACE &&
2312 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2313 struct inpcb *inp = sotoinpcb(so);
2314 if (inp->inp_last_outifp != NULL &&
2315 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2316 en_tracing = TRUE;
2317 en_tracing_val = top->m_pkthdr.len;
2318 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2319 (unsigned long)VM_KERNEL_ADDRPERM(so),
2320 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2321 (int64_t)en_tracing_val);
2322 }
2323 }
2324
2325 mptcp_update_last_owner(so, mp_so);
2326
2327 if (mp_so->last_pid != proc_pid(p)) {
2328 p = proc_find(mp_so->last_pid);
2329 if (p == PROC_NULL) {
2330 p = current_proc();
2331 } else {
2332 proc_held = TRUE;
2333 }
2334 }
2335
2336 #if NECP
2337 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2338 #endif /* NECP */
2339
2340 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2341
2342 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
2343 if (error) {
2344 goto out;
2345 }
2346
2347 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2348 top = NULL;
2349
2350 out:
2351 if (top != NULL) {
2352 m_freem(top);
2353 }
2354
2355 if (proc_held) {
2356 proc_rele(p);
2357 }
2358
2359 soclearfastopen(so);
2360
2361 if (en_tracing) {
2362 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2363 (unsigned long)VM_KERNEL_ADDRPERM(so),
2364 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2365 (int64_t)en_tracing_val);
2366 }
2367
2368 return error;
2369 }
2370
2371 /*
2372 * Establish an initial MPTCP connection (if first subflow and not yet
2373 * connected), or add a subflow to an existing MPTCP connection.
2374 */
2375 int
2376 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2377 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2378 {
2379 struct socket *mp_so, *so = NULL;
2380 struct mptcb *mp_tp;
2381 struct mptsub *mpts = NULL;
2382 int af, error = 0;
2383
2384 mp_so = mptetoso(mpte);
2385 mp_tp = mpte->mpte_mptcb;
2386
2387 socket_lock_assert_owned(mp_so);
2388
2389 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2390 /* If the remote end sends Data FIN, refuse subflow adds */
2391 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2392 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2393 error = ENOTCONN;
2394 goto out_err;
2395 }
2396
2397 mpts = mptcp_subflow_alloc();
2398 if (mpts == NULL) {
2399 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2400 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2401 error = ENOMEM;
2402 goto out_err;
2403 }
2404
2405 if (src) {
2406 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2407 error = EAFNOSUPPORT;
2408 goto out_err;
2409 }
2410
2411 if (src->sa_family == AF_INET &&
2412 src->sa_len != sizeof(struct sockaddr_in)) {
2413 error = EINVAL;
2414 goto out_err;
2415 }
2416
2417 if (src->sa_family == AF_INET6 &&
2418 src->sa_len != sizeof(struct sockaddr_in6)) {
2419 error = EINVAL;
2420 goto out_err;
2421 }
2422
2423 MALLOC(mpts->mpts_src, struct sockaddr *, src->sa_len, M_SONAME,
2424 M_WAITOK | M_ZERO);
2425 if (mpts->mpts_src == NULL) {
2426 error = ENOMEM;
2427 goto out_err;
2428 }
2429 bcopy(src, mpts->mpts_src, src->sa_len);
2430 }
2431
2432 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2433 error = EAFNOSUPPORT;
2434 goto out_err;
2435 }
2436
2437 if (dst->sa_family == AF_INET &&
2438 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2439 error = EINVAL;
2440 goto out_err;
2441 }
2442
2443 if (dst->sa_family == AF_INET6 &&
2444 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2445 error = EINVAL;
2446 goto out_err;
2447 }
2448
2449 memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
2450
2451 af = mpts->mpts_dst.sa_family;
2452
2453 ifnet_head_lock_shared();
2454 if ((ifscope > (unsigned)if_index)) {
2455 ifnet_head_done();
2456 error = ENXIO;
2457 goto out_err;
2458 }
2459 ifnet_head_done();
2460
2461 mpts->mpts_ifscope = ifscope;
2462
2463 /* create the subflow socket */
2464 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2465 /*
2466 * Return the error without cleaning up, because up to here
2467 * all we did was create mpts.
2468 *
2469 * The contract is that the call to mptcp_subflow_socreate
2470 * moves ownership of mpts to mptcp_subflow_socreate.
2471 */
2472 return error;
2473 }
2474
2475 /*
2476 * We may be called from within the kernel. Still need to account this
2477 * one to the real app.
2478 */
2479 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2480
2481 /*
2482 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2483 * -1 (SAE_CONNID_ALL).
2484 */
2485 mpte->mpte_connid_last++;
2486 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2487 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2488 mpte->mpte_connid_last++;
2489 }
2490
2491 mpts->mpts_connid = mpte->mpte_connid_last;
2492
2493 mpts->mpts_rel_seq = 1;
2494
2495 /* Allocate a unique address id per subflow */
2496 mpte->mpte_addrid_last++;
2497 if (mpte->mpte_addrid_last == 0) {
2498 mpte->mpte_addrid_last++;
2499 }
2500
2501 /* register for subflow socket read/write events */
2502 sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
2503
2504 /* Register for subflow socket control events */
2505 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2506 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2507 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2508 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2509 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2510 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2511 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2512 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2513
2514 /* sanity check */
2515 VERIFY(!(mpts->mpts_flags &
2516 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2517
2518 /*
2519 * Indicate to the TCP subflow whether or not it should establish
2520 * the initial MPTCP connection, or join an existing one. Fill
2521 * in the connection request structure with additional info needed
2522 * by the underlying TCP (to be used in the TCP options, etc.)
2523 */
2524 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2525 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2526
2527 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2528 mptcp_init_local_parms(mpte);
2529 }
2530 soisconnecting(mp_so);
2531
2532 /* If fastopen is requested, set state in mpts */
2533 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2534 mpts->mpts_flags |= MPTSF_TFO_REQD;
2535 }
2536 } else {
2537 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2538 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2539 }
2540 }
2541
2542 mpts->mpts_flags |= MPTSF_CONNECTING;
2543
2544 /* connect right away if first attempt, or if join can be done now */
2545 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2546 error = mptcp_subflow_soconnectx(mpte, mpts);
2547 }
2548
2549 if (error) {
2550 goto out_err_close;
2551 }
2552
2553 if (pcid) {
2554 *pcid = mpts->mpts_connid;
2555 }
2556
2557 return 0;
2558
2559 out_err_close:
2560 mptcp_subflow_abort(mpts, error);
2561
2562 return error;
2563
2564 out_err:
2565 if (mpts) {
2566 mptcp_subflow_free(mpts);
2567 }
2568
2569 return error;
2570 }
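
/*
 * Usage sketch (hypothetical caller, for illustration only): adding
 * an IPv4 subflow with an unspecified source address and no ifscope
 * pinning. "dst" here is a hypothetical, fully filled-in sockaddr_in
 * and the MPTCP socket is assumed to be locked.
 *
 *	sae_connid_t cid;
 *	int error;
 *
 *	error = mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
 *	    IFSCOPE_NONE, &cid);
 *	if (error == 0)
 *		os_log(mptcp_log_handle, "new subflow cid %u\n", cid);
 */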
2571
2572 void
2573 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2574 {
2575 int index = mptcpstats_get_index(stats, mpts);
2576
2577 if (index != -1) {
2578 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2579
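/*
 * Field-naming note (as used below): inp_stat holds the totals,
 * inp_wstat is Wi-Fi, inp_Wstat (capital W) is wired, and
 * inp_cstat is cellular.
 */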
2580 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2581 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2582
2583 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2584 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2585
2586 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2587 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2588
2589 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2590 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2591 }
2592 }
2593
2594 /*
2595 * Delete/remove a subflow from an MPTCP session. The underlying subflow
2596 * socket is no longer accessible after the subflow is deleted, thus this
2597 * should occur only after the subflow socket has been disconnected.
2598 */
2599 void
2600 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2601 {
2602 struct socket *mp_so = mptetoso(mpte);
2603 struct socket *so = mpts->mpts_socket;
2604 struct tcpcb *tp = sototcpcb(so);
2605
2606 socket_lock_assert_owned(mp_so);
2607 VERIFY(mpts->mpts_mpte == mpte);
2608 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2609 VERIFY(mpte->mpte_numflows != 0);
2610 VERIFY(mp_so->so_usecount > 0);
2611
2612 mptcpstats_update(mpte->mpte_itfstats, mpts);
2613
2614 mptcp_unset_cellicon(mpte, mpts, 1);
2615
2616 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2617 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2618
2619 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2620 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2621 mpte->mpte_numflows--;
2622 if (mpte->mpte_active_sub == mpts) {
2623 mpte->mpte_active_sub = NULL;
2624 }
2625
2626 /*
2627 * Drop references held by this subflow socket; there
2628 * will be no further upcalls made from this point.
2629 */
2630 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2631 sock_catchevents_locked(so, NULL, NULL, 0);
2632
2633 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2634
2635 mp_so->so_usecount--; /* for subflow socket */
2636 mpts->mpts_mpte = NULL;
2637 mpts->mpts_socket = NULL;
2638
2639 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2640 mptcp_subflow_remref(mpts); /* for subflow socket */
2641
2642 so->so_flags &= ~SOF_MP_SUBFLOW;
2643 tp->t_mptcb = NULL;
2644 tp->t_mpsub = NULL;
2645 }
2646
2647 void
2648 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2649 {
2650 struct socket *so = mpts->mpts_socket;
2651 struct mptcb *mp_tp = mpte->mpte_mptcb;
2652 int send_dfin = 0;
2653
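/*
 * Sketch of intent (hedged reading of the check below): once the
 * MPTCP state has moved past CLOSE_WAIT, a connection-level close is
 * in progress, so the subflow shutdown also carries the DATA_FIN.
 */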
2654 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2655 send_dfin = 1;
2656 }
2657
2658 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2659 (so->so_state & SS_ISCONNECTED)) {
2660 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2661 __func__, mpts->mpts_connid, send_dfin),
2662 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2663
2664 if (send_dfin) {
2665 mptcp_send_dfin(so);
2666 }
2667 soshutdownlock(so, SHUT_WR);
2668 }
2669 }
2670
2671 static void
2672 mptcp_subflow_abort(struct mptsub *mpts, int error)
2673 {
2674 struct socket *so = mpts->mpts_socket;
2675 struct tcpcb *tp = sototcpcb(so);
2676
2677 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2678 return;
2679 }
2680
2681 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2682 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2683
2684 if (tp->t_state != TCPS_CLOSED) {
2685 tcp_drop(tp, error);
2686 }
2687
2688 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2689 }
2690
2691 /*
2692 * Disconnect a subflow socket.
2693 */
2694 void
2695 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2696 {
2697 struct socket *so, *mp_so;
2698 struct mptcb *mp_tp;
2699 int send_dfin = 0;
2700
2701 so = mpts->mpts_socket;
2702 mp_tp = mpte->mpte_mptcb;
2703 mp_so = mptetoso(mpte);
2704
2705 socket_lock_assert_owned(mp_so);
2706
2707 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2708 return;
2709 }
2710
2711 mptcp_unset_cellicon(mpte, mpts, 1);
2712
2713 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2714
2715 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2716 send_dfin = 1;
2717 }
2718
2719 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2720 (so->so_state & SS_ISCONNECTED)) {
2721 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2722 __func__, mpts->mpts_connid, send_dfin),
2723 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2724
2725 if (send_dfin) {
2726 mptcp_send_dfin(so);
2727 }
2728
2729 if (mp_so->so_flags & SOF_DEFUNCT) {
2730 errno_t ret;
2731
2732 ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2733 if (ret == 0) {
2734 ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2735
2736 if (ret != 0) {
2737 os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2738 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2739 }
2740 } else {
2741 os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2742 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2743 }
2744 } else {
2745 (void) soshutdownlock(so, SHUT_RD);
2746 (void) soshutdownlock(so, SHUT_WR);
2747 (void) sodisconnectlocked(so);
2748 }
2749 }
2750
2751 /*
2752 * Generate a disconnect event for this subflow socket, in case
2753 * the lower layer doesn't do it; this is needed because the
2754 * subflow socket deletion relies on it.
2755 */
2756 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2757 }
2758
2759 /*
2760 * Subflow socket input.
2761 */
2762 static void
2763 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2764 {
2765 struct socket *mp_so = mptetoso(mpte);
2766 struct mbuf *m = NULL;
2767 struct socket *so;
2768 int error, wakeup = 0;
2769
2770 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2771 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2772
2773 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2774 struct mptsub *, mpts);
2775
2776 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2777 goto out;
2778 }
2779
2780 so = mpts->mpts_socket;
2781
2782 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2783 if (error != 0 && error != EWOULDBLOCK) {
2784 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2785 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2786 if (error == ENODATA) {
2787 /*
2788 * Don't ignore ENODATA: it lets us discover
2789 * nasty middleboxes and surface the error to the app.
2790 */
2791 mp_so->so_error = ENODATA;
2792
2793 wakeup = 1;
2794 goto out;
2795 }
2796 } else if (error == 0) {
2797 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
2798 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2799 }
2800
2801 /* In fallback, make sure to accept data on all but one subflow */
2802 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2803 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2804 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2805 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2806 m_freem(m);
2807 goto out;
2808 }
2809
2810 if (m != NULL) {
2811 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2812 mptcp_set_cellicon(mpte, mpts);
2813
2814 mpte->mpte_used_cell = 1;
2815 } else {
2816 /*
2817 * If we did not explicitly set the cellicon within the past
2818 * MPTCP_CELLICON_TOGGLE_RATE seconds, then unset it again.
2819 */
2820 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2821 mptcp_unset_cellicon(mpte, NULL, 1);
2822 }
2823
2824 mpte->mpte_used_wifi = 1;
2825 }
2826
2827 mptcp_input(mpte, m);
2828 }
2829
2830 out:
2831 if (wakeup) {
2832 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2833 }
2834
2835 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2836 }
2837
2838 void
2839 mptcp_handle_input(struct socket *so)
2840 {
2841 struct mptsub *mpts, *tmpts;
2842 struct mptses *mpte;
2843
2844 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2845 return;
2846 }
2847
2848 mpts = sototcpcb(so)->t_mpsub;
2849 mpte = mpts->mpts_mpte;
2850
2851 socket_lock_assert_owned(mptetoso(mpte));
2852
2853 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2854 if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
2855 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2856 }
2857 return;
2858 }
2859
2860 mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
2861 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2862 if (mpts->mpts_socket->so_usecount == 0) {
2863 /* Will be removed soon by tcp_garbage_collect */
2864 continue;
2865 }
2866
2867 mptcp_subflow_addref(mpts);
2868 mpts->mpts_socket->so_usecount++;
2869
2870 mptcp_subflow_input(mpte, mpts);
2871
2872 mptcp_subflow_remref(mpts); /* ours */
2873
2874 VERIFY(mpts->mpts_socket->so_usecount != 0);
2875 mpts->mpts_socket->so_usecount--;
2876 }
2877
2878 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
2879 }
2880
2881 /*
2882 * Subflow socket write upcall.
2883 *
2884 * Called when the associated subflow socket posted a write event.
2885 */
2886 static void
2887 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2888 {
2889 #pragma unused(so, waitf)
2890 struct mptsub *mpts = arg;
2891 struct mptses *mpte = mpts->mpts_mpte;
2892
2893 VERIFY(mpte != NULL);
2894
2895 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2896 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2897 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2898 }
2899 return;
2900 }
2901
2902 mptcp_output(mpte);
2903 }
2904
2905 static boolean_t
2906 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2907 {
2908 struct mbuf *so_m = so->so_snd.sb_mb;
2909 uint64_t dsn = m->m_pkthdr.mp_dsn;
2910
2911 while (so_m) {
2912 VERIFY(so_m->m_flags & M_PKTHDR);
2913 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2914
2915 /* Part of the segment is covered, don't reinject here */
2916 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2917 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2918 return TRUE;
2919 }
2920
2921 so_m = so_m->m_next;
2922 }
2923
2924 return FALSE;
2925 }
2926
2927 /*
2928 * Subflow socket output.
2929 *
2930 * Called for sending data from MPTCP to the underlying subflow socket.
2931 */
2932 int
2933 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2934 {
2935 struct mptcb *mp_tp = mpte->mpte_mptcb;
2936 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2937 struct socket *mp_so, *so;
2938 struct tcpcb *tp;
2939 uint64_t mpt_dsn = 0, off = 0;
2940 int sb_cc = 0, error = 0, wakeup = 0;
2941 uint32_t dss_csum;
2942 uint16_t tot_sent = 0;
2943 boolean_t reinjected = FALSE;
2944
2945 mp_so = mptetoso(mpte);
2946 so = mpts->mpts_socket;
2947 tp = sototcpcb(so);
2948
2949 socket_lock_assert_owned(mp_so);
2950
2951 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2952 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
2953
2954 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2955 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2956 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2957 (mpts->mpts_flags & MPTSF_TFO_REQD));
2958 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
2959
2960 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2961 __func__, mpts->mpts_flags, mpte->mpte_flags,
2962 mptcp_subflow_cwnd_space(so)),
2963 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2964 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2965 struct mptsub *, mpts);
2966
2967 /* The REMOVE_ADDR option is not sent reliably, as per the MPTCP I-D */
2968 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
2969 tp->t_rem_aid = mpte->mpte_lost_aid;
2970 tp->t_mpflags |= TMPF_SND_REM_ADDR;
2971 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2972 }
2973
2974 /*
2975 * The mbuf chains containing the metadata (as well as pointing to
2976 * the user data sitting at the MPTCP output queue) would then be
2977 * sent down to the subflow socket.
2978 *
2979 * Some notes on data sequencing:
2980 *
2981 * a. Each mbuf must be a M_PKTHDR.
2982 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2983 * in the mbuf pkthdr structure.
2984 * c. Each mbuf containing the MPTCP metadata must have its
2985 * pkt_flags marked with the PKTF_MPTCP flag.
2986 */
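/*
 * Illustrative sketch of rules (a)-(c) above, mirroring what the
 * copy loop further down actually does (names from this file):
 *
 *	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
 *	m->m_pkthdr.mp_dsn = mpt_dsn;              (64-bit data seq)
 *	m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;  (subflow-rel. seq)
 *	m->m_pkthdr.mp_rlen = mlen;                (data-level length)
 */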
2987
2988 if (mpte->mpte_reinjectq) {
2989 sb_mb = mpte->mpte_reinjectq;
2990 } else {
2991 sb_mb = mp_so->so_snd.sb_mb;
2992 }
2993
2994 if (sb_mb == NULL) {
2995 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
2996 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2997 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
2998 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
2999
3000 /* Fix it to prevent looping */
3001 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3002 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3003 }
3004 goto out;
3005 }
3006
3007 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3008
3009 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3010 !(so->so_state & SS_ISCONNECTED) &&
3011 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3012 tp->t_mpflags |= TMPF_TFO_REQUEST;
3013 goto zero_len_write;
3014 }
3015
3016 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3017
3018 /* First, drop acknowledged data */
3019 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3020 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3021 "dsn %u suna %u reinject? %u\n",
3022 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3023 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3024 if (mpte->mpte_reinjectq) {
3025 mptcp_clean_reinjectq(mpte);
3026 } else {
3027 uint64_t len = 0;
3028 len = mp_tp->mpt_snduna - mpt_dsn;
3029 sbdrop(&mp_so->so_snd, (int)len);
3030 wakeup = 1;
3031 }
3032 }
3033
3034 /* Check again because of above sbdrop */
3035 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3036 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3037 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3038 goto out;
3039 }
3040
3041 /*
3042 * In degraded mode, we don't receive data acks, so forcibly
3043 * free mbufs below snd_nxt
3044 */
3045 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3046 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3047 mp_so->so_snd.sb_mb) {
3048 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3049 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3050 uint64_t len = 0;
3051 len = mp_tp->mpt_snduna - mpt_dsn;
3052 sbdrop(&mp_so->so_snd, (int)len);
3053 wakeup = 1;
3054
3055 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3056 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3057 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3058 }
3059 }
3060
3061 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3062 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3063 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3064 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3065 }
3066
3067 /*
3068 * Adjust the top level notion of next byte used for retransmissions
3069 * and sending FINs.
3070 */
3071 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3072 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3073 }
3074
3075 /* Now determine the offset from which to start transmitting data */
3076 if (mpte->mpte_reinjectq) {
3077 sb_mb = mpte->mpte_reinjectq;
3078 } else {
3079 dont_reinject:
3080 sb_mb = mp_so->so_snd.sb_mb;
3081 }
3082 if (sb_mb == NULL) {
3083 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3084 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3085 goto out;
3086 }
3087
3088 if (sb_mb == mpte->mpte_reinjectq) {
3089 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3090 off = 0;
3091
3092 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3093 if (mptcp_can_send_more(mp_tp, TRUE)) {
3094 goto dont_reinject;
3095 }
3096
3097 error = ECANCELED;
3098 goto out;
3099 }
3100
3101 reinjected = TRUE;
3102 } else if (flags & MPTCP_SUBOUT_PROBING) {
3103 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3104 off = 0;
3105 } else {
3106 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3107
3108 /*
3109 * With TFO, there might be no data at all, so we can still
3110 * end up in this code-path here.
3111 */
3112 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3113 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3114 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3115 sb_cc -= off;
3116 } else {
3117 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3118 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3119 (uint32_t)mp_tp->mpt_sndmax);
3120
3121 goto out;
3122 }
3123 }
3124
3125 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3126 if (sb_cc <= 0) {
3127 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3128 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3129 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3130 mptcp_subflow_cwnd_space(so));
3131 }
3132
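/*
 * A DSS mapping's data-level length field is 16 bits wide, so a
 * single burst can map at most UINT16_MAX bytes.
 */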
3133 sb_cc = min(sb_cc, UINT16_MAX);
3134
3135 /*
3136 * Create a DSN mapping for the data we are about to send. It all
3137 * has the same mapping.
3138 */
3139 if (reinjected) {
3140 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3141 } else {
3142 mpt_dsn = mp_tp->mpt_snduna + off;
3143 }
3144
3145 mpt_mbuf = sb_mb;
3146 while (mpt_mbuf && reinjected == FALSE &&
3147 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3148 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3149 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3150 mpt_mbuf = mpt_mbuf->m_next;
3151 }
3152 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3153 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
3154 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3155 mpts->mpts_probecnt),
3156 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3157 }
3158
3159 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3160
3161 head = tail = NULL;
3162
3163 while (tot_sent < sb_cc) {
3164 ssize_t mlen;
3165
3166 mlen = mpt_mbuf->m_len;
3167 mlen -= off;
3168 mlen = min(mlen, sb_cc - tot_sent);
3169
3170 if (mlen < 0) {
3171 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3172 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3173 (uint32_t)off, sb_cc, tot_sent);
3174 goto out;
3175 }
3176
3177 if (mlen == 0) {
3178 goto next;
3179 }
3180
3181 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3182 M_COPYM_MUST_COPY_HDR);
3183 if (m == NULL) {
3184 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3185 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3186 error = ENOBUFS;
3187 break;
3188 }
3189
3190 /* Create a DSN mapping for the data (m_copym does it) */
3191 VERIFY(m->m_flags & M_PKTHDR);
3192 VERIFY(m->m_next == NULL);
3193
3194 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3195 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3196 m->m_pkthdr.mp_dsn = mpt_dsn;
3197 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3198 m->m_pkthdr.len = mlen;
3199
3200 if (head == NULL) {
3201 head = tail = m;
3202 } else {
3203 tail->m_next = m;
3204 tail = m;
3205 }
3206
3207 tot_sent += mlen;
3208 off = 0;
3209 next:
3210 mpt_mbuf = mpt_mbuf->m_next;
3211 }
3212
3213 if (reinjected) {
3214 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3215 struct mbuf *n = sb_mb;
3216
3217 while (n) {
3218 n->m_pkthdr.mp_dsn += sb_cc;
3219 n->m_pkthdr.mp_rlen -= sb_cc;
3220 n = n->m_next;
3221 }
3222 m_adj(sb_mb, sb_cc);
3223 } else {
3224 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3225 m_freem(sb_mb);
3226 }
3227 }
3228
3229 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
3230 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
3231 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3232
3233 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3234 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3235 tot_sent);
3236 }
3237
3238 /* Now, let's update rel-seq and the data-level length */
3239 mpts->mpts_rel_seq += tot_sent;
3240 m = head;
3241 while (m) {
3242 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3243 m->m_pkthdr.mp_csum = dss_csum;
3244 }
3245 m->m_pkthdr.mp_rlen = tot_sent;
3246 m = m->m_next;
3247 }
3248
3249 if (head != NULL) {
3250 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3251 (tp->t_tfo_stats == 0)) {
3252 tp->t_mpflags |= TMPF_TFO_REQUEST;
3253 }
3254
3255 error = sock_sendmbuf(so, NULL, head, 0, NULL);
3256
3257 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
3258 struct sockbuf *, &so->so_rcv,
3259 struct sockbuf *, &so->so_snd,
3260 struct mptses *, mpte, struct mptsub *, mpts,
3261 size_t, tot_sent);
3262 }
3263
3264 done_sending:
3265 if (error == 0 ||
3266 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3267 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3268
3269 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3270 tcpstat.tcps_mp_num_probes++;
3271 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3272 mpts->mpts_probecnt += 1;
3273 } else {
3274 mpts->mpts_probecnt +=
3275 tot_sent / mpts->mpts_maxseg;
3276 }
3277 }
3278
3279 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3280 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3281 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3282 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3283 }
3284 mp_tp->mpt_sndnxt = new_sndnxt;
3285 }
3286
3287 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3288
3289 /* Must be here as mptcp_can_send_more() checks for this */
3290 soclearfastopen(mp_so);
3291
3292 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3293 (mpts->mpts_probesoon != 0)) {
3294 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
3295 __func__, mpts->mpts_connid,
3296 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
3297 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3298 (tcp_now - mpts->mpts_probesoon)),
3299 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3300 }
3301
3302 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3303 mptcp_set_cellicon(mpte, mpts);
3304
3305 mpte->mpte_used_cell = 1;
3306 } else {
3307 /*
3308 * If we did not explicitly set the cellicon within the past
3309 * MPTCP_CELLICON_TOGGLE_RATE seconds, then unset it again.
3310 */
3311 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3312 mptcp_unset_cellicon(mpte, NULL, 1);
3313 }
3314
3315 mpte->mpte_used_wifi = 1;
3316 }
3317
3318 /*
3319 * Don't propagate EWOULDBLOCK - it's already taken care of
3320 * in mptcp_usr_send for TFO.
3321 */
3322 error = 0;
3323 } else {
3324 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3325 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3326 }
3327 out:
3328
3329 if (wakeup) {
3330 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3331 }
3332
3333 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3334 return error;
3335
3336 zero_len_write:
3337 /* Opting to call pru_send as no mbuf at subflow level */
3338 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3339 NULL, current_proc());
3340
3341 goto done_sending;
3342 }
3343
3344 static void
3345 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3346 {
3347 struct mbuf *n, *prev = NULL;
3348
3349 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
3350 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3351 m->m_pkthdr.mp_rseq),
3352 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3353
3354 n = mpte->mpte_reinjectq;
3355
3356 /* First, look for an mbuf n whose data-sequence-number is greater
3357 * than or equal to m's sequence number.
3358 */
3359 while (n) {
3360 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3361 break;
3362 }
3363
3364 prev = n;
3365
3366 n = n->m_nextpkt;
3367 }
3368
3369 if (n) {
3370 /* m is already fully covered by the next mbuf in the queue */
3371 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3372 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3373 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
3374 __func__, n->m_pkthdr.mp_rlen),
3375 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3376 goto dont_queue;
3377 }
3378
3379 /* m covers the next mbuf entirely, thus we remove that one */
3380 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3381 struct mbuf *tmp = n->m_nextpkt;
3382
3383 mptcplog((LOG_DEBUG, "%s m covers the next segment: dsn %u len %u dsn %u len %u\n",
3384 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3385 (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
3386 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3387
3388 m->m_nextpkt = NULL;
3389 if (prev == NULL) {
3390 mpte->mpte_reinjectq = tmp;
3391 } else {
3392 prev->m_nextpkt = tmp;
3393 }
3394
3395 m_freem(n);
3396 n = tmp;
3397 }
3398 }
3399
3400 if (prev) {
3401 /* m is already fully covered by the previous mbuf in the queue */
3402 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3403 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
3404 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
3405 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3406 goto dont_queue;
3407 }
3408 }
3409
3410 if (prev == NULL) {
3411 mpte->mpte_reinjectq = m;
3412 } else {
3413 prev->m_nextpkt = m;
3414 }
3415
3416 m->m_nextpkt = n;
3417
3418 return;
3419
3420 dont_queue:
3421 m_freem(m);
3422 return;
3423 }
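
/*
 * Worked example of the queue invariant kept above (made-up numbers):
 * inserting { dsn 2000, len 500 } into a queue holding
 * { dsn 1000, len 500 } -> { dsn 2200, len 100 } frees the second
 * entry (the new segment covers it entirely) and yields
 * { dsn 1000, len 500 } -> { dsn 2000, len 500 }, i.e. ascending
 * mp_dsn order with no fully-overlapped segments.
 */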
3424
3425 static struct mbuf *
3426 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3427 {
3428 struct socket *mp_so = mptetoso(mpte);
3429 struct mbuf *m;
3430
3431 m = mp_so->so_snd.sb_mb;
3432
3433 while (m) {
3434 /* If this segment covers what we are looking for, return it. */
3435 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3436 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3437 break;
3438 }
3439
3440
3441 /* Segment is no longer in the queue */
3442 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3443 return NULL;
3444 }
3445
3446 m = m->m_next;
3447 }
3448
3449 return m;
3450 }
3451
3452 static struct mbuf *
3453 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3454 {
3455 struct mbuf *top = NULL, *tail = NULL;
3456 uint64_t dsn;
3457 uint32_t dlen, rseq;
3458
3459 dsn = m->m_pkthdr.mp_dsn;
3460 dlen = m->m_pkthdr.mp_rlen;
3461 rseq = m->m_pkthdr.mp_rseq;
3462
3463 while (len > 0) {
3464 struct mbuf *n;
3465
3466 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3467
3468 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3469 if (n == NULL) {
3470 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3471 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3472 goto err;
3473 }
3474
3475 VERIFY(n->m_flags & M_PKTHDR);
3476 VERIFY(n->m_next == NULL);
3477 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3478 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3479 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3480 VERIFY(n->m_len == m->m_len);
3481
3482 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3483
3484 if (top == NULL) {
3485 top = n;
3486 }
3487
3488 if (tail != NULL) {
3489 tail->m_next = n;
3490 }
3491
3492 tail = n;
3493
3494 len -= m->m_len;
3495 m = m->m_next;
3496 }
3497
3498 return top;
3499
3500 err:
3501 if (top) {
3502 m_freem(top);
3503 }
3504
3505 return NULL;
3506 }
3507
3508 static void
3509 mptcp_reinject_mbufs(struct socket *so)
3510 {
3511 struct tcpcb *tp = sototcpcb(so);
3512 struct mptsub *mpts = tp->t_mpsub;
3513 struct mptcb *mp_tp = tptomptp(tp);
3514 struct mptses *mpte = mp_tp->mpt_mpte;;
3515 struct sockbuf *sb = &so->so_snd;
3516 struct mbuf *m;
3517
3518 m = sb->sb_mb;
3519 while (m) {
3520 struct mbuf *n = m->m_next, *orig = m;
3521
3522 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3523 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3524 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3525 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3526
3527 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3528
3529 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
3530 goto next;
3531 }
3532
3533 /* Has it all already been acknowledged at the data-level? */
3534 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
3535 goto next;
3536 }
3537
3538 /* Part of this has already been acknowledged - look up the
3539 * segment in the MPTCP socket.
3540 */
3541 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3542 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3543 if (m == NULL) {
3544 goto next;
3545 }
3546 }
3547
3548 /* Copy the mbuf with headers (aka, DSN-numbers) */
3549 m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
3550 if (m == NULL) {
3551 break;
3552 }
3553
3554 VERIFY(m->m_nextpkt == NULL);
3555
3556 /* Now, add to the reinject-queue, eliminating overlapping
3557 * segments
3558 */
3559 mptcp_add_reinjectq(mpte, m);
3560
3561 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3562
3563 next:
3564 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3565 while (n) {
3566 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3567
3568 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
3569 break;
3570 }
3571
3572 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3573 n = n->m_next;
3574 }
3575
3576 m = n;
3577 }
3578 }
3579
3580 void
3581 mptcp_clean_reinjectq(struct mptses *mpte)
3582 {
3583 struct mptcb *mp_tp = mpte->mpte_mptcb;
3584
3585 socket_lock_assert_owned(mptetoso(mpte));
3586
3587 while (mpte->mpte_reinjectq) {
3588 struct mbuf *m = mpte->mpte_reinjectq;
3589
3590 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3591 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3592 break;
3593 }
3594
3595 mpte->mpte_reinjectq = m->m_nextpkt;
3596 m->m_nextpkt = NULL;
3597 m_freem(m);
3598 }
3599 }
3600
3601 /*
3602 * Subflow socket control event upcall.
3603 */
3604 static void
3605 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
3606 {
3607 #pragma unused(so)
3608 struct mptsub *mpts = arg;
3609 struct mptses *mpte = mpts->mpts_mpte;
3610
3611 socket_lock_assert_owned(mptetoso(mpte));
3612
3613 if ((mpts->mpts_evctl & events) == events) {
3614 return;
3615 }
3616
3617 mpts->mpts_evctl |= events;
3618
3619 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3620 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3621 return;
3622 }
3623
3624 mptcp_subflow_workloop(mpte);
3625 }
3626
3627 /*
3628 * Subflow socket control events.
3629 *
3630 * Called for handling events related to the underlying subflow socket.
3631 */
3632 static ev_ret_t
3633 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3634 uint64_t *p_mpsofilt_hint)
3635 {
3636 ev_ret_t ret = MPTS_EVRET_OK;
3637 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3638 sizeof(mpsub_ev_entry_tbl[0]);
3639
3640 /* bail if there's nothing to process */
3641 if (!mpts->mpts_evctl) {
3642 return ret;
3643 }
3644
3645 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
3646 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
3647 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
3648 SO_FILT_HINT_DISCONNECTED)) {
3649 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3650 }
3651
3652 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3653 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3654
3655 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3656 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3657 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3658
3659 /*
3660 * Process all the socket filter hints and reset the hint
3661 * once it is handled
3662 */
3663 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3664 /*
3665 * Always execute the DISCONNECTED event, because it will wake up
3666 * the app.
3667 */
3668 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3669 (ret >= MPTS_EVRET_OK ||
3670 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3671 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3672 ev_ret_t error =
3673 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3674 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3675 }
3676 }
3677
3678 /*
3679 * We should be getting only events specified via sock_catchevents(),
3680 * so loudly complain if we have any unprocessed one(s).
3681 */
3682 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK) {
3683 mptcplog((LOG_WARNING, "%s%s: cid %d evret %d unhandled events=%b\n", __func__,
3684 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3685 mpts->mpts_connid,
3686 ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3687 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3688 } else {
3689 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3690 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3691 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3692 }
3693
3694 return ret;
3695 }
3696
3697 static ev_ret_t
3698 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3699 uint64_t *p_mpsofilt_hint, uint64_t event)
3700 {
3701 struct socket *mp_so, *so;
3702 struct mptcb *mp_tp;
3703
3704 mp_so = mptetoso(mpte);
3705 mp_tp = mpte->mpte_mptcb;
3706 so = mpts->mpts_socket;
3707
3708 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3709 mpts->mpts_connid, event),
3710 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3711
3712 /*
3713 * We got an event for this subflow that might need to be propagated,
3714 * based on the state of the MPTCP connection.
3715 */
3716 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3717 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3718 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3719 mp_so->so_error = so->so_error;
3720 *p_mpsofilt_hint |= event;
3721 }
3722
3723 return MPTS_EVRET_OK;
3724 }
3725
3726 /*
3727 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3728 */
3729 static ev_ret_t
3730 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3731 uint64_t *p_mpsofilt_hint, uint64_t event)
3732 {
3734 struct socket *mp_so;
3735 struct tcpcb *tp;
3736
3737 mp_so = mptetoso(mpte);
3738 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3739
3740 /*
3741 	 * This overwrites any previous mpte_lost_aid, to avoid storing
3742 	 * too much state; the typical case has only two subflows.
3743 */
3744 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3745 mpte->mpte_lost_aid = tp->t_local_aid;
3746
3747 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3748 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3749
3750 /*
3751 * The subflow connection has lost its source address.
3752 */
3753 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3754
3755 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3756 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3757 }
3758
3759 return MPTS_EVRET_DELETE;
3760 }
3761
3762 static ev_ret_t
3763 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3764 uint64_t *p_mpsofilt_hint, uint64_t event)
3765 {
3766 #pragma unused(event, p_mpsofilt_hint)
3767 struct socket *so, *mp_so;
3768
3769 so = mpts->mpts_socket;
3770
3771 if (so->so_error != ENODATA) {
3772 return MPTS_EVRET_OK;
3773 }
3774
3775
3776 mp_so = mptetoso(mpte);
3777
3778 mp_so->so_error = ENODATA;
3779
3780 sorwakeup(mp_so);
3781 sowwakeup(mp_so);
3782
3783 return MPTS_EVRET_OK;
3784 }
3785
3786
3787 /*
3788 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3789 * indicates that the remote side sent a Data FIN
3790 */
3791 static ev_ret_t
3792 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3793 uint64_t *p_mpsofilt_hint, uint64_t event)
3794 {
3795 #pragma unused(event)
3796 struct mptcb *mp_tp = mpte->mpte_mptcb;
3797
3798 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3799 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3800
3801 /*
3802 * We got a Data FIN for the MPTCP connection.
3803 * The FIN may arrive with data. The data is handed up to the
3804 * mptcp socket and the user is notified so that it may close
3805 * the socket if needed.
3806 */
3807 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3808 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3809 }
3810
3811 return MPTS_EVRET_OK; /* keep the subflow socket around */
3812 }
3813
3814 /*
3815 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3816 */
3817 static ev_ret_t
3818 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3819 uint64_t *p_mpsofilt_hint, uint64_t event)
3820 {
3821 #pragma unused(event, p_mpsofilt_hint)
3822 struct mptsub *mpts_alt = NULL;
3823 struct socket *alt_so = NULL;
3824 struct socket *mp_so;
3825 int altpath_exists = 0;
3826
3827 mp_so = mptetoso(mpte);
3828 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3829
3830 mptcp_reinject_mbufs(mpts->mpts_socket);
3831
3832 mpts_alt = mptcp_get_subflow(mpte, NULL);
3833
3834 /* If there is no alternate eligible subflow, ignore the failover hint. */
3835 if (mpts_alt == NULL || mpts_alt == mpts) {
3836 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3837 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3838
3839 goto done;
3840 }
3841
3842 altpath_exists = 1;
3843 alt_so = mpts_alt->mpts_socket;
3844 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3845 /* All data acknowledged and no RTT spike */
3846 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3847 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3848 } else {
3849 /* no alternate path available */
3850 altpath_exists = 0;
3851 }
3852 }
3853
3854 if (altpath_exists) {
3855 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3856
3857 mpte->mpte_active_sub = mpts_alt;
3858 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3859 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3860
3861 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3862 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3863
3864 mptcpstats_inc_switch(mpte, mpts);
3865
3866 sowwakeup(alt_so);
3867 } else {
3868 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3869 mpts->mpts_connid),
3870 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3871 done:
3872 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3873 }
3874
3875 return MPTS_EVRET_OK;
3876 }
3877
3878 /*
3879 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3880 */
3881 static ev_ret_t
3882 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3883 uint64_t *p_mpsofilt_hint, uint64_t event)
3884 {
3885 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3886 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3887
3888 /*
3889 * The subflow connection cannot use the outgoing interface, let's
3890 * close this subflow.
3891 */
3892 mptcp_subflow_abort(mpts, EPERM);
3893
3894 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3895
3896 return MPTS_EVRET_DELETE;
3897 }
3898
3899 /*
3900 * https://tools.ietf.org/html/rfc6052#section-2
3901 * https://tools.ietf.org/html/rfc6147#section-5.2
3902 */
3903 static boolean_t
3904 mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3905 const struct ipv6_prefix *prefix,
3906 struct in_addr *addrv4)
3907 {
3908 char buf[MAX_IPv4_STR_LEN];
3909 char *ptrv4 = (char *)addrv4;
3910 const char *ptr = (const char *)addr;
3911
3912 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3913 return false;
3914 }
3915
3916 switch (prefix->prefix_len) {
3917 case NAT64_PREFIX_LEN_96:
3918 memcpy(ptrv4, ptr + 12, 4);
3919 break;
3920 case NAT64_PREFIX_LEN_64:
3921 memcpy(ptrv4, ptr + 9, 4);
3922 break;
3923 case NAT64_PREFIX_LEN_56:
3924 memcpy(ptrv4, ptr + 7, 1);
3925 memcpy(ptrv4 + 1, ptr + 9, 3);
3926 break;
3927 case NAT64_PREFIX_LEN_48:
3928 memcpy(ptrv4, ptr + 6, 2);
3929 memcpy(ptrv4 + 2, ptr + 9, 2);
3930 break;
3931 case NAT64_PREFIX_LEN_40:
3932 memcpy(ptrv4, ptr + 5, 3);
3933 memcpy(ptrv4 + 3, ptr + 9, 1);
3934 break;
3935 case NAT64_PREFIX_LEN_32:
3936 memcpy(ptrv4, ptr + 4, 4);
3937 break;
3938 default:
3939 panic("NAT64-prefix len is wrong: %u\n",
3940 prefix->prefix_len);
3941 }
3942
3943 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3944 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3945
3946 return true;
3947 }
3948
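/*
 * For reference, the RFC 6052 (section 2.2) address layouts decoded by
 * the switch above; octet 8 (bits 64..71) is the reserved "u" octet,
 * which is why the copies skip it:
 *
 *	PL = 32:  v4 = octets  4..7
 *	PL = 40:  v4 = octets  5..7 and  9
 *	PL = 48:  v4 = octets  6..7 and  9..10
 *	PL = 56:  v4 = octet   7    and  9..11
 *	PL = 64:  v4 = octets  9..12
 *	PL = 96:  v4 = octets 12..15
 */
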
3949 static void
3950 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3951 {
3952 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3953 struct socket *so = mpts->mpts_socket;
3954 struct ifnet *ifp;
3955 int j;
3956
3957 /* Subflow IPs will be steered directly by the server - no need to
3958 * desynthesize.
3959 */
3960 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3961 return;
3962 }
3963
3964 ifp = sotoinpcb(so)->inp_last_outifp;
3965
3966 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3967 mptcp_ask_for_nat64(ifp);
3968 return;
3969 }
3970
3971
3972 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3973 int success;
3974
3975 if (nat64prefixes[j].prefix_len == 0) {
3976 continue;
3977 }
3978
3979 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3980 &nat64prefixes[j],
3981 &mpte->mpte_dst_v4_nat64.sin_addr);
3982 if (success) {
3983 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3984 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3985 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3986 break;
3987 }
3988 }
3989 }
3990
3991 /*
3992 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3993 */
3994 static ev_ret_t
3995 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
3996 uint64_t *p_mpsofilt_hint, uint64_t event)
3997 {
3998 #pragma unused(event, p_mpsofilt_hint)
3999 struct socket *mp_so, *so;
4000 struct inpcb *inp;
4001 struct tcpcb *tp;
4002 struct mptcb *mp_tp;
4003 int af;
4004 boolean_t mpok = FALSE;
4005
4006 mp_so = mptetoso(mpte);
4007 mp_tp = mpte->mpte_mptcb;
4008 so = mpts->mpts_socket;
4009 tp = sototcpcb(so);
4010 af = mpts->mpts_dst.sa_family;
4011
4012 if (mpts->mpts_flags & MPTSF_CONNECTED) {
4013 return MPTS_EVRET_OK;
4014 }
4015
4016 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4017 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4018 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
4019 (so->so_state & SS_ISCONNECTED)) {
4020 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
4021 __func__, mpts->mpts_connid),
4022 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4023 (void) soshutdownlock(so, SHUT_RD);
4024 (void) soshutdownlock(so, SHUT_WR);
4025 (void) sodisconnectlocked(so);
4026 }
4027 return MPTS_EVRET_OK;
4028 }
4029
4030 /*
4031 	 * The subflow connection has been connected. Find out whether it
4032 	 * is connected as a regular TCP or as an MPTCP subflow. The idea is:
4033 	 *
4034 	 * a. If the MPTCP connection is not yet established, then this must
4035 	 * be the first subflow connection. If MPTCP failed to negotiate,
4036 	 * fall back to regular TCP by degrading this subflow.
4037 	 *
4038 	 * b. If the MPTCP connection has been established, then this must be
4039 	 * one of the subsequent subflow connections. If MPTCP failed
4040 	 * to negotiate, disconnect the connection.
4041 *
4042 * Right now, we simply unblock any waiters at the MPTCP socket layer
4043 * if the MPTCP connection has not been established.
4044 */
4045
4046 if (so->so_state & SS_ISDISCONNECTED) {
4047 /*
4048 * With MPTCP joins, a connection is connected at the subflow
4049 * level, but the 4th ACK from the server elevates the MPTCP
4050 * subflow to connected state. So there is a small window
4051 * where the subflow could get disconnected before the
4052 * connected event is processed.
4053 */
4054 return MPTS_EVRET_OK;
4055 }
4056
4057 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4058 mptcp_drop_tfo_data(mpte, mpts);
4059 }
4060
4061 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4062 mpts->mpts_flags |= MPTSF_CONNECTED;
4063
4064 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4065 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4066 }
4067
4068 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4069
4070 /* get/verify the outbound interface */
4071 inp = sotoinpcb(so);
4072
4073 mpts->mpts_maxseg = tp->t_maxseg;
4074
4075 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
4076 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
4077 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
4078 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
4079
4080 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4081
4082 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4083 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4084 mpte->mpte_associd = mpts->mpts_connid;
4085 DTRACE_MPTCP2(state__change,
4086 struct mptcb *, mp_tp,
4087 uint32_t, 0 /* event */);
4088
4089 if (SOCK_DOM(so) == AF_INET) {
4090 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4091 } else {
4092 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4093 }
4094
4095 mpts->mpts_flags |= MPTSF_ACTIVE;
4096
4097 /* case (a) above */
4098 if (!mpok) {
4099 tcpstat.tcps_mpcap_fallback++;
4100
4101 tp->t_mpflags |= TMPF_INFIN_SENT;
4102 mptcp_notify_mpfail(so);
4103 } else {
4104 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4105 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4106 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4107 } else {
4108 mpts->mpts_flags |= MPTSF_PREFERRED;
4109 }
4110 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4111 mpte->mpte_nummpcapflows++;
4112
4113 if (SOCK_DOM(so) == AF_INET6) {
4114 mptcp_handle_ipv6_connection(mpte, mpts);
4115 }
4116
4117 mptcp_check_subflows_and_add(mpte);
4118
4119 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4120 mpte->mpte_initial_cell = 1;
4121 }
4122
4123 mpte->mpte_handshake_success = 1;
4124 }
4125
4126 mp_tp->mpt_sndwnd = tp->snd_wnd;
4127 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4128 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4129 soisconnected(mp_so);
4130 } else if (mpok) {
4131 /*
4132 * case (b) above
4133 * In case of additional flows, the MPTCP socket is not
4134 * MPTSF_MP_CAPABLE until an ACK is received from server
4135 * for 3-way handshake. TCP would have guaranteed that this
4136 * is an MPTCP subflow.
4137 */
4138 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4139 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4140 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4141 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4142 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4143 } else {
4144 mpts->mpts_flags |= MPTSF_PREFERRED;
4145 }
4146
4147 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4148 mpte->mpte_nummpcapflows++;
4149
4150 mpts->mpts_rel_seq = 1;
4151
4152 mptcp_check_subflows_and_remove(mpte);
4153 } else {
4154 unsigned int i;
4155
4156 /* Should we try the alternate port? */
4157 if (mpte->mpte_alternate_port &&
4158 inp->inp_fport != mpte->mpte_alternate_port) {
4159 union sockaddr_in_4_6 dst;
4160 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
4161
4162 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
4163
4164 dst_in->sin_port = mpte->mpte_alternate_port;
4165
4166 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
4167 mpts->mpts_ifscope, NULL);
4168 		} else { /* We tried all we could; mark this interface as non-MPTCP */
4169 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
4170 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
4171
4172 if (inp->inp_last_outifp->if_index == info->ifindex) {
4173 info->no_mptcp_support = 1;
4174 break;
4175 }
4176 }
4177 }
4178
4179 tcpstat.tcps_join_fallback++;
4180 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4181 tcpstat.tcps_mptcp_cell_proxy++;
4182 } else {
4183 tcpstat.tcps_mptcp_wifi_proxy++;
4184 }
4185
4186 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4187
4188 return MPTS_EVRET_OK;
4189 }
4190
4191 	/* This call just reserves ("books") an entry in the stats table for this ifindex */
4192 mptcpstats_get_index(mpte->mpte_itfstats, mpts);
4193
4194 mptcp_output(mpte);
4195
4196 return MPTS_EVRET_OK; /* keep the subflow socket around */
4197 }
4198
4199 /*
4200 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4201 */
4202 static ev_ret_t
4203 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4204 uint64_t *p_mpsofilt_hint, uint64_t event)
4205 {
4206 #pragma unused(event, p_mpsofilt_hint)
4207 struct socket *mp_so, *so;
4208 struct mptcb *mp_tp;
4209
4210 mp_so = mptetoso(mpte);
4211 mp_tp = mpte->mpte_mptcb;
4212 so = mpts->mpts_socket;
4213
4214 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
4215 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
4216 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
4217 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
4218 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4219
4220 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4221 return MPTS_EVRET_DELETE;
4222 }
4223
4224 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4225
4226 /* The subflow connection has been disconnected. */
4227
4228 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4229 mpte->mpte_nummpcapflows--;
4230 if (mpte->mpte_active_sub == mpts) {
4231 mpte->mpte_active_sub = NULL;
4232 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
4233 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4234 }
4235 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4236 }
4237
4238 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4239 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4240 mptcp_drop(mpte, mp_tp, so->so_error);
4241 }
4242
4243 /*
4244 * Clear flags that are used by getconninfo to return state.
4245 	 * Retain flags like MPTSF_DELETEOK that serve internal purposes.
4246 */
4247 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4248 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4249 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4250
4251 return MPTS_EVRET_DELETE;
4252 }
4253
4254 /*
4255 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4256 */
4257 static ev_ret_t
4258 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4259 uint64_t *p_mpsofilt_hint, uint64_t event)
4260 {
4261 #pragma unused(event, p_mpsofilt_hint)
4262 ev_ret_t ret = MPTS_EVRET_OK;
4263 struct socket *mp_so, *so;
4264 struct mptcb *mp_tp;
4265
4266 mp_so = mptetoso(mpte);
4267 mp_tp = mpte->mpte_mptcb;
4268 so = mpts->mpts_socket;
4269
4270 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4271 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4272 } else {
4273 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4274 }
4275
4276 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4277 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4278 goto done;
4279 }
4280 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4281 } else {
4282 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4283 }
4284
4285 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4286 mpts->mpts_flags |= MPTSF_MP_READY;
4287 } else {
4288 mpts->mpts_flags &= ~MPTSF_MP_READY;
4289 }
4290
4291 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4292 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4293 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4294 }
4295
4296 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4297 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4298
4299 m_freem_list(mpte->mpte_reinjectq);
4300 mpte->mpte_reinjectq = NULL;
4301 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4302 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4303 ret = MPTS_EVRET_CONNECT_PENDING;
4304 }
4305
4306 done:
4307 return ret;
4308 }
4309
4310 /*
4311 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4312 */
4313 static ev_ret_t
4314 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4315 uint64_t *p_mpsofilt_hint, uint64_t event)
4316 {
4317 #pragma unused(event)
4318 struct socket *mp_so, *so;
4319 struct mptcb *mp_tp;
4320 boolean_t is_fastclose;
4321
4322 mp_so = mptetoso(mpte);
4323 mp_tp = mpte->mpte_mptcb;
4324 so = mpts->mpts_socket;
4325
4326 /* We got an invalid option or a fast close */
4327 struct tcptemp *t_template;
4328 struct inpcb *inp = sotoinpcb(so);
4329 struct tcpcb *tp = NULL;
4330
4331 tp = intotcpcb(inp);
4332 so->so_error = ECONNABORTED;
4333
4334 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4335
4336 tp->t_mpflags |= TMPF_RESET;
4337
4338 t_template = tcp_maketemplate(tp);
4339 if (t_template) {
4340 struct tcp_respond_args tra;
4341
4342 bzero(&tra, sizeof(tra));
4343 if (inp->inp_flags & INP_BOUND_IF) {
4344 tra.ifscope = inp->inp_boundifp->if_index;
4345 } else {
4346 tra.ifscope = IFSCOPE_NONE;
4347 }
4348 tra.awdl_unrestricted = 1;
4349
4350 tcp_respond(tp, t_template->tt_ipgen,
4351 &t_template->tt_t, (struct mbuf *)NULL,
4352 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4353 (void) m_free(dtom(t_template));
4354 }
4355
4356 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4357 struct mptsub *iter, *tmp;
4358
4359 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4360
4361 mp_so->so_error = ECONNRESET;
4362
4363 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4364 if (iter == mpts) {
4365 continue;
4366 }
4367 mptcp_subflow_abort(iter, ECONNABORTED);
4368 }
4369
4370 /*
4371 		 * mptcp_drop() is called after processing the events, to fully
4372 		 * close the MPTCP connection.
4373 */
4374 mptcp_drop(mpte, mp_tp, mp_so->so_error);
4375 }
4376
4377 mptcp_subflow_abort(mpts, ECONNABORTED);
4378
4379
4380 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4381 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4382 }
4383
4384 return MPTS_EVRET_DELETE;
4385 }
4386
4387 static ev_ret_t
4388 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4389 uint64_t *p_mpsofilt_hint, uint64_t event)
4390 {
4391 #pragma unused(event)
4392 bool found_active = false;
4393
4394 mpts->mpts_flags |= MPTSF_READ_STALL;
4395
4396 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4397 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4398
4399 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4400 TCPS_HAVERCVDFIN2(tp->t_state)) {
4401 continue;
4402 }
4403
4404 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4405 found_active = true;
4406 break;
4407 }
4408 }
4409
4410 if (!found_active) {
4411 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4412 }
4413
4414 return MPTS_EVRET_OK;
4415 }
4416
4417 static ev_ret_t
4418 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4419 uint64_t *p_mpsofilt_hint, uint64_t event)
4420 {
4421 #pragma unused(event)
4422 bool found_active = false;
4423
4424 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4425
4426 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4427 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4428
4429 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4430 tp->t_state > TCPS_CLOSE_WAIT) {
4431 continue;
4432 }
4433
4434 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4435 found_active = true;
4436 break;
4437 }
4438 }
4439
4440 if (!found_active) {
4441 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4442 }
4443
4444 return MPTS_EVRET_OK;
4445 }
4446
4447 /*
4448 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4449 * caller must ensure that the option can be issued on subflow sockets, via
4450 * MPOF_SUBFLOW_OK flag.
4451 */
4452 int
4453 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4454 {
4455 struct socket *mp_so, *so;
4456 struct sockopt sopt;
4457 int error;
4458
4459 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4460
4461 mp_so = mptetoso(mpte);
4462 so = mpts->mpts_socket;
4463
4464 socket_lock_assert_owned(mp_so);
4465
4466 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4467 mpo->mpo_level == SOL_SOCKET &&
4468 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4469 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4470
4471 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
4472 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
4473 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
4474 mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
4475 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4476
4477 /*
4478 		 * When we open a new subflow, mark it as a cell fallback if
4479 		 * it goes over cellular (except for first-party apps).
4482 */
4483
4484 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4485 return 0;
4486 }
4487
4488 if (sotoinpcb(so)->inp_last_outifp &&
4489 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4490 return 0;
4491 }
4492
4493 /*
4494 		 * These conditions are OR'd: if the app is not binding to an
4495 		 * interface, then this definitely is not a cell-fallback
4496 		 * connection.
4497 */
4498 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4499 !IFNET_IS_CELLULAR(ifp)) {
4500 return 0;
4501 }
4502 }
4503
4504 mpo->mpo_flags &= ~MPOF_INTERIM;
4505
4506 bzero(&sopt, sizeof(sopt));
4507 sopt.sopt_dir = SOPT_SET;
4508 sopt.sopt_level = mpo->mpo_level;
4509 sopt.sopt_name = mpo->mpo_name;
4510 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4511 sopt.sopt_valsize = sizeof(int);
4512 sopt.sopt_p = kernproc;
4513
4514 error = sosetoptlock(so, &sopt, 0);
4515 if (error) {
4516 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4517 "val %d set error %d\n", __func__,
4518 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4519 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4520 mpo->mpo_intval, error);
4521 }
4522 return error;
4523 }
4524
4525 /*
4526 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4527 * caller must ensure that the option can be issued on subflow sockets, via
4528 * MPOF_SUBFLOW_OK flag.
4529 */
4530 int
4531 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4532 struct mptopt *mpo)
4533 {
4534 struct socket *mp_so;
4535 struct sockopt sopt;
4536 int error;
4537
4538 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4539 mp_so = mptetoso(mpte);
4540
4541 socket_lock_assert_owned(mp_so);
4542
4543 bzero(&sopt, sizeof(sopt));
4544 sopt.sopt_dir = SOPT_GET;
4545 sopt.sopt_level = mpo->mpo_level;
4546 sopt.sopt_name = mpo->mpo_name;
4547 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4548 sopt.sopt_valsize = sizeof(int);
4549 sopt.sopt_p = kernproc;
4550
4551 error = sogetoptlock(so, &sopt, 0); /* already locked */
4552 if (error) {
4553 os_log_error(mptcp_log_handle,
4554 "%s - %lx: sopt %s get error %d\n",
4555 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4556 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4557 }
4558 return error;
4559 }
4560
4561
4562 /*
4563 * MPTCP garbage collector.
4564 *
4565 * This routine is called by the MP domain's on-demand periodic callout,
4566 * which is triggered when an MPTCP socket is closed. The callout will
4567 * repeat as long as this routine returns a non-zero value.
4568 */
4569 static uint32_t
4570 mptcp_gc(struct mppcbinfo *mppi)
4571 {
4572 struct mppcb *mpp, *tmpp;
4573 uint32_t active = 0;
4574
4575 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4576
4577 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4578 struct socket *mp_so;
4579 struct mptses *mpte;
4580 struct mptcb *mp_tp;
4581
4582 mp_so = mpp->mpp_socket;
4583 mpte = mptompte(mpp);
4584 mp_tp = mpte->mpte_mptcb;
4585
4586 if (!mpp_try_lock(mpp)) {
4587 active++;
4588 continue;
4589 }
4590
4591 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4592
4593 /* check again under the lock */
4594 if (mp_so->so_usecount > 0) {
4595 boolean_t wakeup = FALSE;
4596 struct mptsub *mpts, *tmpts;
4597
4598 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4599 if (mp_tp->mpt_gc_ticks > 0) {
4600 mp_tp->mpt_gc_ticks--;
4601 }
4602 if (mp_tp->mpt_gc_ticks == 0) {
4603 wakeup = TRUE;
4604 }
4605 }
4606 if (wakeup) {
4607 TAILQ_FOREACH_SAFE(mpts,
4608 &mpte->mpte_subflows, mpts_entry, tmpts) {
4609 mptcp_subflow_eupcall1(mpts->mpts_socket,
4610 mpts, SO_FILT_HINT_DISCONNECTED);
4611 }
4612 }
4613 socket_unlock(mp_so, 0);
4614 active++;
4615 continue;
4616 }
4617
4618 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4619 panic("%s - %lx: skipped state "
4620 "[u=%d,r=%d,s=%d]\n", __func__,
4621 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4622 mp_so->so_usecount, mp_so->so_retaincnt,
4623 mpp->mpp_state);
4624 }
4625
4626 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4627 mptcp_close(mpte, mp_tp);
4628 }
4629
4630 mptcp_session_destroy(mpte);
4631
4632 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4633 struct sockbuf *, &mp_so->so_rcv,
4634 struct sockbuf *, &mp_so->so_snd,
4635 struct mppcb *, mpp);
4636
4637 mp_pcbdispose(mpp);
4638 sodealloc(mp_so);
4639 }
4640
4641 return active;
4642 }
4643
4644 /*
4645 * Drop an MPTCP connection, reporting the specified error.
4646 */
4647 struct mptses *
4648 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4649 {
4650 struct socket *mp_so = mptetoso(mpte);
4651
4652 VERIFY(mpte->mpte_mptcb == mp_tp);
4653
4654 socket_lock_assert_owned(mp_so);
4655
4656 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4657 uint32_t, 0 /* event */);
4658
4659 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4660 errno = mp_tp->mpt_softerror;
4661 }
4662 mp_so->so_error = errno;
4663
4664 return mptcp_close(mpte, mp_tp);
4665 }
4666
4667 /*
4668 * Close an MPTCP control block.
4669 */
4670 struct mptses *
4671 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4672 {
4673 struct mptsub *mpts = NULL, *tmpts = NULL;
4674 struct socket *mp_so = mptetoso(mpte);
4675
4676 socket_lock_assert_owned(mp_so);
4677 VERIFY(mpte->mpte_mptcb == mp_tp);
4678
4679 mp_tp->mpt_state = MPTCPS_TERMINATE;
4680
4681 mptcp_freeq(mp_tp);
4682
4683 soisdisconnected(mp_so);
4684
4685 /* Clean up all subflows */
4686 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4687 mptcp_subflow_disconnect(mpte, mpts);
4688 }
4689
4690 return NULL;
4691 }
4692
4693 void
4694 mptcp_notify_close(struct socket *so)
4695 {
4696 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4697 }
4698
4699 /*
4700 * MPTCP workloop.
4701 */
4702 void
4703 mptcp_subflow_workloop(struct mptses *mpte)
4704 {
4705 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4706 uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4707 struct mptsub *mpts, *tmpts;
4708 struct socket *mp_so;
4709
4710 mp_so = mptetoso(mpte);
4711
4712 socket_lock_assert_owned(mp_so);
4713
4714 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4715 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4716 return;
4717 }
4718 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4719
4720 relaunch:
4721 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4722
4723 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4724 ev_ret_t ret;
4725
4726 if (mpts->mpts_socket->so_usecount == 0) {
4727 /* Will be removed soon by tcp_garbage_collect */
4728 continue;
4729 }
4730
4731 mptcp_subflow_addref(mpts);
4732 mpts->mpts_socket->so_usecount++;
4733
4734 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4735
4736 /*
4737 * If MPTCP socket is closed, disconnect all subflows.
4738 * This will generate a disconnect event which will
4739 * be handled during the next iteration, causing a
4740 * non-zero error to be returned above.
4741 */
4742 if (mp_so->so_flags & SOF_PCBCLEARING) {
4743 mptcp_subflow_disconnect(mpte, mpts);
4744 }
4745
4746 switch (ret) {
4747 case MPTS_EVRET_OK:
4748 /* nothing to do */
4749 break;
4750 case MPTS_EVRET_DELETE:
4751 mptcp_subflow_soclose(mpts);
4752 break;
4753 case MPTS_EVRET_CONNECT_PENDING:
4754 connect_pending = TRUE;
4755 break;
4756 case MPTS_EVRET_DISCONNECT_FALLBACK:
4757 disconnect_fallback = TRUE;
4758 break;
4759 default:
4760 mptcplog((LOG_DEBUG,
4761 "MPTCP Socket: %s: mptcp_subflow_events "
4762 "returned invalid value: %d\n", __func__,
4763 ret),
4764 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4765 break;
4766 }
4767 mptcp_subflow_remref(mpts); /* ours */
4768
4769 VERIFY(mpts->mpts_socket->so_usecount != 0);
4770 mpts->mpts_socket->so_usecount--;
4771 }
4772
4773 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4774 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4775
4776 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4777 mp_so->so_state |= SS_CANTRCVMORE;
4778 sorwakeup(mp_so);
4779 }
4780
4781 soevent(mp_so, mpsofilt_hint_mask);
4782 }
4783
4784 if (!connect_pending && !disconnect_fallback) {
4785 goto exit;
4786 }
4787
4788 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4789 if (disconnect_fallback) {
4790 struct socket *so = NULL;
4791 struct inpcb *inp = NULL;
4792 struct tcpcb *tp = NULL;
4793
4794 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4795 continue;
4796 }
4797
4798 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4799
4800 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4801 MPTSF_DISCONNECTED | MPTSF_CONNECT_PENDING)) {
4802 continue;
4803 }
4804
4805 so = mpts->mpts_socket;
4806
4807 /*
4808 * The MPTCP connection has degraded to a fallback
4809 * mode, so there is no point in keeping this subflow
4810 * regardless of its MPTCP-readiness state, unless it
4811 * is the primary one which we use for fallback. This
4812 * assumes that the subflow used for fallback is the
4813 * ACTIVE one.
4814 */
4815
4816 inp = sotoinpcb(so);
4817 tp = intotcpcb(inp);
4818 tp->t_mpflags &=
4819 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4820 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4821
4822 soevent(so, SO_FILT_HINT_MUSTRST);
4823 } else if (connect_pending) {
4824 /*
4825 * The MPTCP connection has progressed to a state
4826 * where it supports full multipath semantics; allow
4827 * additional joins to be attempted for all subflows
4828 * that are in the PENDING state.
4829 */
4830 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4831 int error = mptcp_subflow_soconnectx(mpte, mpts);
4832
4833 if (error) {
4834 mptcp_subflow_abort(mpts, error);
4835 }
4836 }
4837 }
4838 }
4839
4840 exit:
4841 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4842 goto relaunch;
4843 }
4844
4845 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4846 }
4847
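/*
 * Informative sketch of the MPTE_IN_WORKLOOP/MPTE_WORKLOOP_RELAUNCH
 * protocol used above; the two flags flatten re-entrant invocations
 * into a single loop:
 *
 *	if (flags & IN_WORKLOOP) { flags |= RELAUNCH; return; }
 *	flags |= IN_WORKLOOP;
 *	do {
 *		flags &= ~RELAUNCH;
 *		... process all subflow events ...
 *	} while (flags & RELAUNCH);
 *	flags &= ~IN_WORKLOOP;
 *
 * An event handler that re-triggers the workloop while it is running
 * merely sets RELAUNCH, and the outermost invocation loops once more.
 */
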
4848 /*
4849 * Protocol pr_lock callback.
4850 */
4851 int
4852 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4853 {
4854 struct mppcb *mpp = mpsotomppcb(mp_so);
4855 void *lr_saved;
4856
4857 if (lr == NULL) {
4858 lr_saved = __builtin_return_address(0);
4859 } else {
4860 lr_saved = lr;
4861 }
4862
4863 if (mpp == NULL) {
4864 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4865 mp_so, lr_saved, solockhistory_nr(mp_so));
4866 /* NOTREACHED */
4867 }
4868 mpp_lock(mpp);
4869
4870 if (mp_so->so_usecount < 0) {
4871 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4872 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4873 solockhistory_nr(mp_so));
4874 /* NOTREACHED */
4875 }
4876 if (refcount != 0) {
4877 mp_so->so_usecount++;
4878 mpp->mpp_inside++;
4879 }
4880 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4881 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4882
4883 return 0;
4884 }
4885
4886 /*
4887 * Protocol pr_unlock callback.
4888 */
4889 int
4890 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4891 {
4892 struct mppcb *mpp = mpsotomppcb(mp_so);
4893 void *lr_saved;
4894
4895 if (lr == NULL) {
4896 lr_saved = __builtin_return_address(0);
4897 } else {
4898 lr_saved = lr;
4899 }
4900
4901 if (mpp == NULL) {
4902 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4903 mp_so, mp_so->so_usecount, lr_saved,
4904 solockhistory_nr(mp_so));
4905 /* NOTREACHED */
4906 }
4907 socket_lock_assert_owned(mp_so);
4908
4909 if (refcount != 0) {
4910 mp_so->so_usecount--;
4911 mpp->mpp_inside--;
4912 }
4913
4914 if (mp_so->so_usecount < 0) {
4915 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4916 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4917 /* NOTREACHED */
4918 }
4919 if (mpp->mpp_inside < 0) {
4920 panic("%s: mpp=%p inside=%x lrh= %s\n", __func__,
4921 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
4922 /* NOTREACHED */
4923 }
4924 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4925 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4926 mpp_unlock(mpp);
4927
4928 return 0;
4929 }
4930
4931 /*
4932 * Protocol pr_getlock callback.
4933 */
4934 lck_mtx_t *
4935 mptcp_getlock(struct socket *mp_so, int flags)
4936 {
4937 struct mppcb *mpp = mpsotomppcb(mp_so);
4938
4939 if (mpp == NULL) {
4940 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4941 solockhistory_nr(mp_so));
4942 /* NOTREACHED */
4943 }
4944 if (mp_so->so_usecount < 0) {
4945 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4946 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4947 /* NOTREACHED */
4948 }
4949 return mpp_getlock(mpp, flags);
4950 }
4951
4952 /*
4953 * MPTCP Join support
4954 */
4955
4956 static void
4957 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
4958 {
4959 struct tcpcb *tp = sototcpcb(so);
4960 struct mptcp_subf_auth_entry *sauth_entry;
4961
4962 /*
4963 * The address ID of the first flow is implicitly 0.
4964 */
4965 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4966 tp->t_local_aid = 0;
4967 } else {
4968 tp->t_local_aid = addr_id;
4969 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4970 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4971 }
4972 sauth_entry = zalloc(mpt_subauth_zone);
4973 sauth_entry->msae_laddr_id = tp->t_local_aid;
4974 sauth_entry->msae_raddr_id = 0;
4975 sauth_entry->msae_raddr_rand = 0;
4976 try_again:
4977 sauth_entry->msae_laddr_rand = RandomULong();
4978 if (sauth_entry->msae_laddr_rand == 0) {
4979 goto try_again;
4980 }
4981 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4982 }
4983
4984 static void
4985 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4986 {
4987 struct mptcp_subf_auth_entry *sauth_entry;
4988 struct tcpcb *tp = NULL;
4989 int found = 0;
4990
4991 tp = sototcpcb(so);
4992 if (tp == NULL) {
4993 return;
4994 }
4995
4996 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4997 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4998 found = 1;
4999 break;
5000 }
5001 }
5002 	if (found) {
5003 		LIST_REMOVE(sauth_entry, msae_next);
5004 		zfree(mpt_subauth_zone, sauth_entry);
5005 	}
5009 }
5010
5011 void
5012 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5013 u_int32_t *rrand)
5014 {
5015 struct mptcp_subf_auth_entry *sauth_entry;
5016
5017 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5018 if (sauth_entry->msae_laddr_id == addr_id) {
5019 if (lrand) {
5020 *lrand = sauth_entry->msae_laddr_rand;
5021 }
5022 if (rrand) {
5023 *rrand = sauth_entry->msae_raddr_rand;
5024 }
5025 break;
5026 }
5027 }
5028 }
5029
5030 void
5031 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5032 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5033 {
5034 struct mptcp_subf_auth_entry *sauth_entry;
5035
5036 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5037 if (sauth_entry->msae_laddr_id == laddr_id) {
5038 if ((sauth_entry->msae_raddr_id != 0) &&
5039 (sauth_entry->msae_raddr_id != raddr_id)) {
5040 os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5041 " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5042 raddr_id, sauth_entry->msae_raddr_id);
5043 return;
5044 }
5045 sauth_entry->msae_raddr_id = raddr_id;
5046 if ((sauth_entry->msae_raddr_rand != 0) &&
5047 (sauth_entry->msae_raddr_rand != raddr_rand)) {
5048 os_log_error(mptcp_log_handle, "%s - %lx: "
5049 "dup SYN_ACK %d %d \n",
5050 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5051 raddr_rand, sauth_entry->msae_raddr_rand);
5052 return;
5053 }
5054 sauth_entry->msae_raddr_rand = raddr_rand;
5055 return;
5056 }
5057 }
5058 }
5059
5060 /*
5061 * SHA1 support for MPTCP
5062 */
5063 static void
5064 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5065 {
5066 SHA1_CTX sha1ctxt;
5067 const unsigned char *sha1_base;
5068 int sha1_size;
5069
5070 sha1_base = (const unsigned char *) key;
5071 sha1_size = sizeof(mptcp_key_t);
5072 SHA1Init(&sha1ctxt);
5073 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5074 SHA1Final(sha_digest, &sha1ctxt);
5075 }
5076
5077 void
5078 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5079 u_int32_t rand1, u_int32_t rand2, u_char *digest)
5080 {
5081 SHA1_CTX sha1ctxt;
5082 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5083 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5084 u_int32_t data[2];
5085 int i;
5086
5087 bzero(digest, SHA1_RESULTLEN);
5088
5089 /* Set up the Key for HMAC */
5090 key_ipad[0] = key1;
5091 key_ipad[1] = key2;
5092
5093 key_opad[0] = key1;
5094 key_opad[1] = key2;
5095
5096 /* Set up the message for HMAC */
5097 data[0] = rand1;
5098 data[1] = rand2;
5099
5100 	/* Key fits within one 512-bit SHA1 block, so no need to hash it first */
5101
5102 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5103
5104 for (i = 0; i < 8; i++) {
5105 key_ipad[i] ^= 0x3636363636363636;
5106 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5107 }
5108
5109 /* Perform inner SHA1 */
5110 SHA1Init(&sha1ctxt);
5111 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5112 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5113 SHA1Final(digest, &sha1ctxt);
5114
5115 /* Perform outer SHA1 */
5116 SHA1Init(&sha1ctxt);
5117 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5118 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5119 SHA1Final(digest, &sha1ctxt);
5120 }
5121
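/*
 * The above is HMAC per RFC 2104, specialized to a 16-byte key and an
 * 8-byte message, with B = 64 bytes (the SHA1 block size):
 *
 *	HMAC(K, m) = SHA1((K ^ opad) || SHA1((K ^ ipad) || m))
 *
 * where ipad is 0x36 repeated B times and opad is 0x5c repeated B
 * times. The key is shorter than B, so it is implicitly zero-padded:
 * key_ipad[]/key_opad[] start out zeroed and only the first two
 * entries carry key material.
 */
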
5122 /*
5123 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5124 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5125 */
5126 void
5127 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
5128 {
5129 uint32_t lrand, rrand;
5130
5131 lrand = rrand = 0;
5132 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5133 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
5134 digest);
5135 }
5136
5137 /*
5138 * Authentication data generation
5139 */
5140 static void
5141 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5142 int token_len)
5143 {
5144 VERIFY(token_len == sizeof(u_int32_t));
5145 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5146
5147 /* Most significant 32 bits of the SHA1 hash */
5148 bcopy(sha_digest, token, sizeof(u_int32_t));
5149 return;
5150 }
5151
5152 static void
5153 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5154 int idsn_len)
5155 {
5156 VERIFY(idsn_len == sizeof(u_int64_t));
5157 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5158
5159 /*
5160 * Least significant 64 bits of the SHA1 hash
5161 */
5162
5163 idsn[7] = sha_digest[12];
5164 idsn[6] = sha_digest[13];
5165 idsn[5] = sha_digest[14];
5166 idsn[4] = sha_digest[15];
5167 idsn[3] = sha_digest[16];
5168 idsn[2] = sha_digest[17];
5169 idsn[1] = sha_digest[18];
5170 idsn[0] = sha_digest[19];
5171 return;
5172 }
5173
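/*
 * Summary of the derivation (RFC 6824, section 3.1): with
 * sha = SHA1(key),
 *
 *	token = sha[0..3]    (most significant 32 bits)
 *	IDSN  = sha[12..19]  (least significant 64 bits)
 *
 * The byte reversal above stores the hash's big-endian tail into a
 * host-order u_int64_t (little-endian on the platforms this code
 * targets), so arithmetic such as idsn + 1 operates on the same value
 * the peer computes.
 */
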
5174 static void
5175 mptcp_conn_properties(struct mptcb *mp_tp)
5176 {
5177 /* There is only Version 0 at this time */
5178 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
5179
5180 /* Set DSS checksum flag */
5181 if (mptcp_dss_csum) {
5182 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5183 }
5184
5185 /* Set up receive window */
5186 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5187
5188 /* Set up gc ticks */
5189 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5190 }
5191
5192 static void
5193 mptcp_init_local_parms(struct mptses *mpte)
5194 {
5195 struct mptcb *mp_tp = mpte->mpte_mptcb;
5196 char key_digest[SHA1_RESULTLEN];
5197
5198 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5199 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5200
5201 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
5202 (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5203 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
5204 (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t));
5205
5206 	/* The subflow SYN also counts as the first MPTCP byte */
5207 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5208 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5209
5210 mptcp_conn_properties(mp_tp);
5211 }
5212
5213 int
5214 mptcp_init_remote_parms(struct mptcb *mp_tp)
5215 {
5216 char remote_digest[SHA1_RESULTLEN];
5217
5218 /* Only Version 0 is supported for auth purposes */
5219 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) {
5220 return -1;
5221 }
5222
5223 /* Setup local and remote tokens and Initial DSNs */
5224 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5225 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
5226 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5227 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
5228 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t));
5229 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5230 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5231
5232 return 0;
5233 }
5234
5235 static void
5236 mptcp_send_dfin(struct socket *so)
5237 {
5238 struct tcpcb *tp = NULL;
5239 struct inpcb *inp = NULL;
5240
5241 inp = sotoinpcb(so);
5242 if (!inp) {
5243 return;
5244 }
5245
5246 tp = intotcpcb(inp);
5247 if (!tp) {
5248 return;
5249 }
5250
5251 if (!(tp->t_mpflags & TMPF_RESET)) {
5252 tp->t_mpflags |= TMPF_SEND_DFIN;
5253 }
5254 }
5255
5256 /*
5257 * Data Sequence Mapping routines
5258 */
5259 void
5260 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5261 {
5262 struct mptcb *mp_tp;
5263
5264 if (m == NULL) {
5265 return;
5266 }
5267
5268 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5269
5270 while (m) {
5271 VERIFY(m->m_flags & M_PKTHDR);
5272 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5273 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5274 m->m_pkthdr.mp_rlen = m_pktlen(m);
5275 mp_tp->mpt_sndmax += m_pktlen(m);
5276 m = m->m_next;
5277 }
5278 }
5279
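/*
 * Example (informative), assuming a chain of three packet mbufs with
 * pkthdr lengths 100, 200 and 300 and mpt_sndmax currently at 5000:
 *
 *	m1: mp_dsn = 5000, mp_rlen = 100
 *	m2: mp_dsn = 5100, mp_rlen = 200
 *	m3: mp_dsn = 5300, mp_rlen = 300
 *
 * leaving mpt_sndmax at 5600, i.e. each mbuf carries a contiguous DSN
 * mapping for exactly its own payload.
 */
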
5280 void
5281 mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
5282 {
5283 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
5284 uint64_t data_ack;
5285 uint64_t dsn;
5286
5287 if (!m || len == 0) {
5288 return;
5289 }
5290
5291 while (m && len > 0) {
5292 VERIFY(m->m_flags & M_PKTHDR);
5293 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5294
5295 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
5296 dsn = m->m_pkthdr.mp_dsn;
5297
5298 len -= m->m_len;
5299 m = m->m_next;
5300 }
5301
5302 if (m && len == 0) {
5303 /*
5304 		 * If there is one more mbuf in the chain, it means that only
5305 		 * up to m->mp_dsn has been acked.
5306 		 *
5307 		 * So we correct data_ack back down, compared to what we set
5308 		 * inside the loop (dsn + data_len): in the loop we are
5309 		 * "optimistic" and assume that the full mapping will be acked.
5310 		 * If that is not the case and we leave the loop with m != NULL,
5311 		 * only up to m->mp_dsn has really been acked.
5313 */
5314 data_ack = m->m_pkthdr.mp_dsn;
5315 }
5316
5317 if (len < 0) {
5318 /*
5319 * If len is negative, meaning we acked in the middle of an mbuf,
5320 * only up to this mbuf's data-sequence number has been acked
5321 * at the MPTCP-level.
5322 */
5323 data_ack = dsn;
5324 }
5325
5326 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
5327 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5328
5329 /* We can have data in the subflow's send-queue that is being acked,
5330 * while the DATA_ACK has already advanced. Thus, we should check whether
5331 * or not the DATA_ACK is actually new here.
5332 */
5333 if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
5334 MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
5335 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
5336 }
5337 }
5338
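/*
 * Example (informative), assuming one mbuf per mapping (m_len ==
 * mp_rlen): with m1 = {dsn 100, rlen 10}, m2 = {dsn 110, rlen 10} and
 * len = 15 acked bytes, the loop runs:
 *
 *	m1: data_ack = 110, dsn = 100, len = 5
 *	m2: data_ack = 120, dsn = 110, len = -5, m = NULL
 *
 * len went negative inside m2, so data_ack is corrected back down to
 * dsn = 110: only up to the start of m2's mapping is counted as acked
 * at the MPTCP level.
 */
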
5339 void
5340 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
5341 {
5342 int rewinding = 0;
5343
5344 /* TFO makes things complicated. */
5345 if (so->so_flags1 & SOF1_TFO_REWIND) {
5346 rewinding = 1;
5347 so->so_flags1 &= ~SOF1_TFO_REWIND;
5348 }
5349
5350 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
5351 u_int32_t sub_len;
5352 VERIFY(m->m_flags & M_PKTHDR);
5353 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5354
5355 sub_len = m->m_pkthdr.mp_rlen;
5356
5357 if (sub_len < len) {
5358 m->m_pkthdr.mp_dsn += sub_len;
5359 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5360 m->m_pkthdr.mp_rseq += sub_len;
5361 }
5362 m->m_pkthdr.mp_rlen = 0;
5363 len -= sub_len;
5364 } else {
5365 /* sub_len >= len */
5366 if (rewinding == 0) {
5367 m->m_pkthdr.mp_dsn += len;
5368 }
5369 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5370 if (rewinding == 0) {
5371 m->m_pkthdr.mp_rseq += len;
5372 }
5373 }
5374 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
5375 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
5376 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
5377 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5378 m->m_pkthdr.mp_rlen -= len;
5379 break;
5380 }
5381 m = m->m_next;
5382 }
5383
5384 if (so->so_flags & SOF_MP_SUBFLOW &&
5385 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
5386 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
5387 /*
5388 * Received an ack without receiving a DATA_ACK.
5389 * Need to fallback to regular TCP (or destroy this subflow).
5390 */
5391 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
5392 mptcp_notify_mpfail(so);
5393 }
5394 }
5395
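/*
 * Example (informative): dropping len = 250 bytes across mappings of
 * rlen 100 and 400 advances them in place:
 *
 *	m1: mp_dsn += 100, mp_rseq += 100, mp_rlen = 0		(len -> 150)
 *	m2: mp_dsn += 150, mp_rseq += 150, mp_rlen = 250	(done)
 *
 * Partially acked mappings are thus shifted forward rather than
 * re-split (the mp_rseq updates are skipped for PKTF_MPSO mbufs).
 */
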
5396 /* Obtain the DSN mapping stored in the mbuf */
5397 void
5398 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5399 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5400 {
5401 u_int64_t dsn64;
5402
5403 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5404 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5405 }
5406
5407 void
5408 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5409 uint32_t *relseq, uint16_t *data_len,
5410 uint16_t *dss_csum)
5411 {
5412 struct mbuf *m = so->so_snd.sb_mb;
5413 int off_orig = off;
5414
5415 VERIFY(off >= 0);
5416
5417 if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
5418 *dsn = 0;
5419 *relseq = 0;
5420 *data_len = 0;
5421 *dss_csum = 0;
5422 return;
5423 }
5424
5425 /*
5426 * In the subflow socket, the DSN sequencing can be discontiguous,
5427 * but the subflow sequence mapping is contiguous. Use the subflow
5428 * sequence property to find the right mbuf and corresponding dsn
5429 * mapping.
5430 */
5431
5432 while (m) {
5433 VERIFY(m->m_flags & M_PKTHDR);
5434 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5435
5436 if (off >= m->m_len) {
5437 off -= m->m_len;
5438 m = m->m_next;
5439 } else {
5440 break;
5441 }
5442 }
5443
5444 VERIFY(off >= 0);
5445 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5446
5447 *dsn = m->m_pkthdr.mp_dsn;
5448 *relseq = m->m_pkthdr.mp_rseq;
5449 *data_len = m->m_pkthdr.mp_rlen;
5450 *dss_csum = m->m_pkthdr.mp_csum;
5451
5452 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
5453 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
5454 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5455 }
5456
5457 /*
5458 * Note that this is called only from tcp_input() via mptcp_input_preproc().
5459 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf;
5460 * when it trims data, tcp_input() calls m_adj(), which does not remove the
5461 * m_pkthdr even if m_len becomes 0 as a result of trimming the mbuf.
5462 * The dsn map insertion cannot be delayed until after the trim, because data
5463 * can sit in the reassembly queue for a while and the DSN option info in tp
5464 * is overwritten for every new packet received.
5465 * The dsn map is adjusted just prior to appending to the subflow sockbuf,
5466 * in mptcp_adj_rmap().
5467 */
5468 void
5469 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5470 {
5471 VERIFY(m->m_flags & M_PKTHDR);
5472 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5473
5474 if (tp->t_mpflags & TMPF_EMBED_DSN) {
5475 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5476 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5477 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5478 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5479 if (tp->t_rcv_map.mpt_dfin) {
5480 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5481 }
5482
5483 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5484
5485 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5486 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5487 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5488 if (th->th_flags & TH_FIN) {
5489 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5490 }
5491 }
5492 }
5493
5494 /*
5495 * Following routines help with failure detection and failover of data
5496 * transfer from one subflow to another.
5497 */
5498 void
5499 mptcp_act_on_txfail(struct socket *so)
5500 {
5501 struct tcpcb *tp = NULL;
5502 struct inpcb *inp = sotoinpcb(so);
5503
5504 if (inp == NULL) {
5505 return;
5506 }
5507
5508 tp = intotcpcb(inp);
5509 if (tp == NULL) {
5510 return;
5511 }
5512
5513 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5514 return;
5515 }
5516
5517 so->so_flags |= SOF_MP_TRYFAILOVER;
5518 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5519 }
5520
5521 /*
5522 * Support for MP_FAIL option
5523 */
5524 int
5525 mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
5526 {
5527 struct mbuf *m = so->so_snd.sb_mb;
5528 u_int64_t dsn;
5529 int off = 0;
5530 u_int32_t datalen;
5531
5532 if (m == NULL) {
5533 return -1;
5534 }
5535
5536 while (m != NULL) {
5537 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5538 VERIFY(m->m_flags & M_PKTHDR);
5539 dsn = m->m_pkthdr.mp_dsn;
5540 datalen = m->m_pkthdr.mp_rlen;
5541 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5542 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5543 off = dsn_fail - dsn;
5544 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5545 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
5546 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5547 return 0;
5548 }
5549
5550 m = m->m_next;
5551 }
5552
5553 /*
5554 * If there was no mbuf data and a fallback to TCP occurred, there's
5555 * not much else to do.
5556 */
5557
5558 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5559 return -1;
5560 }
5561
5562 /*
5563 * Support for sending contiguous MPTCP bytes in subflow
5564 * Also for preventing sending data with ACK in 3-way handshake
5565 */
5566 int32_t
5567 mptcp_adj_sendlen(struct socket *so, int32_t off)
5568 {
5569 struct tcpcb *tp = sototcpcb(so);
5570 struct mptsub *mpts = tp->t_mpsub;
5571 uint64_t mdss_dsn;
5572 uint32_t mdss_subflow_seq;
5573 int mdss_subflow_off;
5574 uint16_t mdss_data_len;
5575 uint16_t dss_csum;
5576
5577 if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
5578 return 0;
5579 }
5580
5581 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5582 &mdss_data_len, &dss_csum);
5583
5584 /*
5585 * We need to compute how much of the mapping still remains.
5586 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5587 */
5588 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5589
5590 /*
5591 	 * When TFO is used, we are sending mpts->mpts_iss even though the
5592 	 * relative seq has been set to 1 (while it should be 0).
5593 */
5594 if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5595 mdss_subflow_off--;
5596 }
5597
5598 VERIFY(off >= mdss_subflow_off);
5599
5600 return mdss_data_len - (off - mdss_subflow_off);
5601 }
5602
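/*
 * Worked example (informative): suppose the mapping found at offset
 * "off" is {relative seq 1000, data_len 1400}, mpts_iss = I and
 * snd_una = I + 900. The mapping then starts at send-buffer offset
 *
 *	mdss_subflow_off = (1000 + I) - (I + 900) = 100
 *
 * and for off = 300 the subflow may send
 *
 *	1400 - (300 - 100) = 1200
 *
 * contiguous bytes, i.e. exactly the remainder of that DSS mapping.
 */
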
5603 static uint32_t
5604 mptcp_get_maxseg(struct mptses *mpte)
5605 {
5606 struct mptsub *mpts;
5607 uint32_t maxseg = 0;
5608
5609 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5610 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5611
5612 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5613 TCPS_HAVERCVDFIN2(tp->t_state)) {
5614 continue;
5615 }
5616
5617 if (tp->t_maxseg > maxseg) {
5618 maxseg = tp->t_maxseg;
5619 }
5620 }
5621
5622 return maxseg;
5623 }
5624
5625 static uint8_t
5626 mptcp_get_rcvscale(struct mptses *mpte)
5627 {
5628 struct mptsub *mpts;
5629 uint8_t rcvscale = UINT8_MAX;
5630
5631 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5632 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5633
5634 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5635 TCPS_HAVERCVDFIN2(tp->t_state)) {
5636 continue;
5637 }
5638
5639 if (tp->rcv_scale < rcvscale) {
5640 rcvscale = tp->rcv_scale;
5641 }
5642 }
5643
5644 return rcvscale;
5645 }
5646
5647 /* Similar to tcp_sbrcv_reserve */
5648 static void
5649 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5650 u_int32_t newsize, u_int32_t idealsize)
5651 {
5652 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5653
5654 /* newsize should not exceed max */
5655 newsize = min(newsize, tcp_autorcvbuf_max);
5656
5657 /* The receive window scale negotiated at the
5658 * beginning of the connection will also set a
5659 * limit on the socket buffer size
5660 */
5661 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5662
5663 /* Set new socket buffer size */
5664 if (newsize > sbrcv->sb_hiwat &&
5665 (sbreserve(sbrcv, newsize) == 1)) {
5666 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5667 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5668
5669 /* Again check the limit set by the advertised
5670 * window scale
5671 */
5672 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5673 TCP_MAXWIN << rcvscale);
5674 }
5675 }
5676
5677 void
5678 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5679 {
5680 struct mptses *mpte = mp_tp->mpt_mpte;
5681 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5682 struct sockbuf *sbrcv = &mp_so->so_rcv;
5683 uint32_t hiwat_sum = 0;
5684 uint32_t ideal_sum = 0;
5685 struct mptsub *mpts;
5686
5687 /*
5688 * Do not grow the receive socket buffer if
5689 * - auto resizing is disabled, globally or on this socket
5690 * - the high water mark already reached the maximum
5691 * - the stream is in background and receive side is being
5692 * throttled
5693 * - if there are segments in reassembly queue indicating loss,
5694 * do not need to increase recv window during recovery as more
5695 * data is not going to be sent. A duplicate ack sent during
5696 * recovery should not change the receive window
5697 */
5698 if (tcp_do_autorcvbuf == 0 ||
5699 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5700 tcp_cansbgrow(sbrcv) == 0 ||
5701 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5702 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5703 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5704 /* Can not resize the socket buffer, just return */
5705 return;
5706 }
5707
5708 /*
5709 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5710 *
5711 * But, for this we first need accurate receiver-RTT estimations, which
5712 * we currently don't have.
5713 *
5714 * Let's use a dummy algorithm for now, just taking the sum of all
5715 	 * subflows' receive buffers. It's too low, but that's all we can get
5716 * for now.
5717 */
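
/*
 * For example (illustrative numbers): a Wi-Fi subflow with an sb_hiwat
 * of 512 KB and a cell subflow with 256 KB yield a requested high-water
 * mark of 768 KB, which mptcp_sbrcv_reserve() then clamps against
 * tcp_autorcvbuf_max and the advertised window scale.
 */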
5718
5719 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5720 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5721 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5722 }
5723
5724 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5725 }
5726
5727 /*
5728 * Determine if we can grow the receive socket buffer to avoid sending
5729 * a zero window update to the peer. We allow even socket buffers that
5730 * have fixed size (set by the application) to grow if the resource
5731 * constraints are met. They will also be trimmed after the application
5732 * reads data.
5733 *
5734 * Similar to tcp_sbrcv_grow_rwin
5735 */
5736 static void
5737 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5738 {
5739 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5740 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5741 u_int32_t rcvbuf = sb->sb_hiwat;
5742
5743 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5744 return;
5745 }
5746
5747 if (tcp_do_autorcvbuf == 1 &&
5748 tcp_cansbgrow(sb) &&
5749 /* Difference from tcp_sbrcv_grow_rwin */
5750 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5751 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5752 rcvbuf < tcp_autorcvbuf_max &&
5753 (sb->sb_idealsize > 0 &&
5754 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5755 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5756 }
5757 }
5758
5759 /* Similar to tcp_sbspace */
5760 int32_t
5761 mptcp_sbspace(struct mptcb *mp_tp)
5762 {
5763 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5764 uint32_t rcvbuf;
5765 int32_t space;
5766 int32_t pending = 0;
5767
5768 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5769
5770 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5771
5772 /* hiwat might have changed */
5773 rcvbuf = sb->sb_hiwat;
5774
5775 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5776 (sb->sb_mbmax - sb->sb_mbcnt)));
5777 if (space < 0) {
5778 space = 0;
5779 }
5780
5781 #if CONTENT_FILTER
5782 /* Compensate for data being processed by content filters */
5783 pending = cfil_sock_data_space(sb);
5784 #endif /* CONTENT_FILTER */
5785 if (pending > space) {
5786 space = 0;
5787 } else {
5788 space -= pending;
5789 }
5790
5791 return space;
5792 }
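
/*
 * A hypothetical caller (sketch only; the variable names are
 * illustrative) would derive the window to advertise from the space
 * reported above, along the lines of:
 *
 *	int32_t space = mptcp_sbspace(mp_tp);
 *	uint32_t win = (space > 0) ? (uint32_t)space : 0;
 *	win = min(win, (uint32_t)(TCP_MAXWIN << tp->rcv_scale));
 */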
5793
5794 /*
5795 * Support Fallback to Regular TCP
5796 */
5797 void
5798 mptcp_notify_mpready(struct socket *so)
5799 {
5800 struct tcpcb *tp = NULL;
5801
5802 if (so == NULL) {
5803 return;
5804 }
5805
5806 tp = intotcpcb(sotoinpcb(so));
5807
5808 if (tp == NULL) {
5809 return;
5810 }
5811
5812 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5813 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5814 struct tcpcb *, tp);
5815
5816 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5817 return;
5818 }
5819
5820 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5821 return;
5822 }
5823
5824 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5825 tp->t_mpflags |= TMPF_MPTCP_READY;
5826
5827 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5828 }
5829
5830 void
5831 mptcp_notify_mpfail(struct socket *so)
5832 {
5833 struct tcpcb *tp = NULL;
5834
5835 if (so == NULL) {
5836 return;
5837 }
5838
5839 tp = intotcpcb(sotoinpcb(so));
5840
5841 if (tp == NULL) {
5842 return;
5843 }
5844
5845 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5846 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5847 struct tcpcb *, tp);
5848
5849 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5850 return;
5851 }
5852
5853 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5854 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5855
5856 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5857 }
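
/*
 * Taken together, the two notifiers above form a simple two-state toggle
 * on the subflow's tcpcb (a summary, not additional logic):
 *
 *	mptcp_notify_mpready(): clears TMPF_TCP_FALLBACK, sets TMPF_MPTCP_READY
 *	mptcp_notify_mpfail():  clears TMPF_MPTCP_READY and TMPF_MPTCP_TRUE,
 *	                        sets TMPF_TCP_FALLBACK
 *
 * Each transition is published to watchers via
 * soevent(SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS).
 */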
5858
5859 /*
5860 * Keepalive helper function
5861 */
5862 boolean_t
5863 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5864 {
5865 boolean_t ret = 1;
5866
5867 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5868
5869 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5870 ret = 0;
5871 }
5872 return ret;
5873 }
5874
5875 /*
5876 * MPTCP t_maxseg adjustment function
5877 */
5878 int
5879 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5880 {
5881 int mss_lower = 0;
5882 struct mptcb *mp_tp = tptomptp(tp);
5883
5884 #define MPTCP_COMPUTE_LEN { \
5885 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5886 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5887 mss_lower += 2; \
5888 else \
5889 /* adjust to 32-bit boundary + EOL */ \
5890 mss_lower += 2; \
5891 }
5892 if (mp_tp == NULL) {
5893 return 0;
5894 }
5895
5896 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5897
5898 /*
5899 * For the first subflow and subsequent subflows, adjust the mss for
5900 * the most common MPTCP option size, for the case where tcp_mss is
5901 * called during option processing and MTU discovery.
5902 */
5903 if (!mtudisc) {
5904 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5905 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5906 MPTCP_COMPUTE_LEN;
5907 }
5908
5909 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5910 tp->t_mpflags & TMPF_SENT_JOIN) {
5911 MPTCP_COMPUTE_LEN;
5912 }
5913 } else {
5914 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5915 MPTCP_COMPUTE_LEN;
5916 }
5917 }
5918
5919 return mss_lower;
5920 }
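
/*
 * Worked example (the numbers are illustrative, not quoted from the
 * headers): suppose sizeof(struct mptcp_dss_ack_opt) is 8. Whether or
 * not MPTCPF_CHECKSUM is set, MPTCP_COMPUTE_LEN adds 2 (the checksum
 * field, or padding to a 32-bit boundary plus EOL), so mss_lower is 10
 * and a subflow t_maxseg of 1448 effectively leaves 1438 bytes of
 * payload per segment.
 */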
5921
5922 /*
5923 * Update the pid, upid, uuid of the subflow so, based on parent so
5924 */
5925 void
5926 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5927 {
5928 if (so->last_pid != mp_so->last_pid ||
5929 so->last_upid != mp_so->last_upid) {
5930 so->last_upid = mp_so->last_upid;
5931 so->last_pid = mp_so->last_pid;
5932 uuid_copy(so->last_uuid, mp_so->last_uuid);
5933 }
5934 so_update_policy(so);
5935 }
5936
5937 static void
5938 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5939 {
5940 struct inpcb *inp;
5941
5942 tcp_getconninfo(so, &flow->flow_ci);
5943 inp = sotoinpcb(so);
5944 #if INET6
5945 if ((inp->inp_vflag & INP_IPV6) != 0) {
5946 flow->flow_src.ss_family = AF_INET6;
5947 flow->flow_dst.ss_family = AF_INET6;
5948 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5949 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5950 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5951 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5952 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5953 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5954 } else
5955 #endif
5956 if ((inp->inp_vflag & INP_IPV4) != 0) {
5957 flow->flow_src.ss_family = AF_INET;
5958 flow->flow_dst.ss_family = AF_INET;
5959 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5960 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5961 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5962 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5963 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5964 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5965 }
5966 flow->flow_len = sizeof(*flow);
5967 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5968 flow->flow_flags = mpts->mpts_flags;
5969 flow->flow_cid = mpts->mpts_connid;
5970 flow->flow_relseq = mpts->mpts_rel_seq;
5971 flow->flow_soerror = mpts->mpts_socket->so_error;
5972 flow->flow_probecnt = mpts->mpts_probecnt;
5973 }
5974
5975 static int
5976 mptcp_pcblist SYSCTL_HANDLER_ARGS
5977 {
5978 #pragma unused(oidp, arg1, arg2)
5979 int error = 0, f;
5980 size_t len;
5981 struct mppcb *mpp;
5982 struct mptses *mpte;
5983 struct mptcb *mp_tp;
5984 struct mptsub *mpts;
5985 struct socket *so;
5986 conninfo_mptcp_t mptcpci;
5987 mptcp_flow_t *flows = NULL;
5988
5989 if (req->newptr != USER_ADDR_NULL) {
5990 return EPERM;
5991 }
5992
5993 lck_mtx_lock(&mtcbinfo.mppi_lock);
5994 if (req->oldptr == USER_ADDR_NULL) {
5995 size_t n = mtcbinfo.mppi_count;
5996 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5997 req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
5998 4 * (n + n / 8) * sizeof(mptcp_flow_t);
5999 return 0;
6000 }
6001 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6002 flows = NULL;
6003 socket_lock(mpp->mpp_socket, 1);
6004 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
6005 mpte = mptompte(mpp);
6006
6007 socket_lock_assert_owned(mptetoso(mpte));
6008 mp_tp = mpte->mpte_mptcb;
6009
6010 bzero(&mptcpci, sizeof(mptcpci));
6011 mptcpci.mptcpci_state = mp_tp->mpt_state;
6012 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
6013 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
6014 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
6015 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
6016 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
6017 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
6018 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
6019 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
6020 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
6021 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
6022 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
6023 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
6024 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
6025
6026 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
6027 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
6028 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
6029 mptcpci.mptcpci_flow_offset =
6030 offsetof(conninfo_mptcp_t, mptcpci_flows);
6031
6032 len = sizeof(*flows) * mpte->mpte_numflows;
6033 if (mpte->mpte_numflows != 0) {
6034 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
6035 if (flows == NULL) {
6036 socket_unlock(mpp->mpp_socket, 1);
6037 break;
6038 }
6039 mptcpci.mptcpci_len = sizeof(mptcpci) +
6040 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
6041 error = SYSCTL_OUT(req, &mptcpci,
6042 sizeof(mptcpci) - sizeof(mptcp_flow_t));
6043 } else {
6044 mptcpci.mptcpci_len = sizeof(mptcpci);
6045 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
6046 }
6047 if (error) {
6048 socket_unlock(mpp->mpp_socket, 1);
6049 FREE(flows, M_TEMP);
6050 break;
6051 }
6052 f = 0;
6053 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6054 so = mpts->mpts_socket;
6055 fill_mptcp_subflow(so, &flows[f], mpts);
6056 f++;
6057 }
6058 socket_unlock(mpp->mpp_socket, 1);
6059 if (flows) {
6060 error = SYSCTL_OUT(req, flows, len);
6061 FREE(flows, M_TEMP);
6062 if (error) {
6063 break;
6064 }
6065 }
6066 }
6067 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6068
6069 return error;
6070 }
6071
6072 SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
6073 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
6074 "List of active MPTCP connections");
6075
6076 /*
6077 * Set notsent lowat mark on the MPTCB
6078 */
6079 int
6080 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6081 {
6082 struct mptcb *mp_tp = NULL;
6083 int error = 0;
6084
6085 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6086 mp_tp = mpte->mpte_mptcb;
6087 }
6088
6089 if (mp_tp) {
6090 mp_tp->mpt_notsent_lowat = optval;
6091 } else {
6092 error = EINVAL;
6093 }
6094
6095 return error;
6096 }
6097
6098 u_int32_t
6099 mptcp_get_notsent_lowat(struct mptses *mpte)
6100 {
6101 struct mptcb *mp_tp = NULL;
6102
6103 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6104 mp_tp = mpte->mpte_mptcb;
6105 }
6106
6107 if (mp_tp) {
6108 return mp_tp->mpt_notsent_lowat;
6109 } else {
6110 return 0;
6111 }
6112 }
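
/*
 * A hedged userland sketch (not part of this file): assuming the mark
 * is surfaced to applications through the SO_NOTSENT_LOWAT socket
 * option, waking the writer once less than 16 KB remains unsent would
 * look like:
 *
 *	int lowat = 16 * 1024;
 *	if (setsockopt(fd, SOL_SOCKET, SO_NOTSENT_LOWAT,
 *	    &lowat, sizeof(lowat)) == -1)
 *		perror("setsockopt(SO_NOTSENT_LOWAT)");
 */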
6113
6114 int
6115 mptcp_notsent_lowat_check(struct socket *so)
6116 {
6117 struct mptses *mpte;
6118 struct mppcb *mpp;
6119 struct mptcb *mp_tp;
6120 struct mptsub *mpts;
6121
6122 int notsent = 0;
6123
6124 mpp = mpsotomppcb(so);
6125 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
6126 return 0;
6127 }
6128
6129 mpte = mptompte(mpp);
6130 socket_lock_assert_owned(mptetoso(mpte));
6131 mp_tp = mpte->mpte_mptcb;
6132
6133 notsent = so->so_snd.sb_cc;
6134
6135 if ((notsent == 0) ||
6136 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
6137 mp_tp->mpt_notsent_lowat)) {
6138 mptcplog((LOG_DEBUG, "MPTCP Sender: "
6139 "lowat %d notsent %d actual %d \n",
6140 mp_tp->mpt_notsent_lowat, notsent,
6141 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
6142 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6143 return 1;
6144 }
6145
6146 /* When Nagle's algorithm is not disabled, it is better
6147 * to wake up the client even when less than one maxseg of
6148 * data remains to be written.
6149 */
6150 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6151 int retval = 0;
6152 if (mpts->mpts_flags & MPTSF_ACTIVE) {
6153 struct socket *subf_so = mpts->mpts_socket;
6154 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
6155
6156 notsent = so->so_snd.sb_cc -
6157 (tp->snd_nxt - tp->snd_una);
6158
6159 if ((tp->t_flags & TF_NODELAY) == 0 &&
6160 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
6161 retval = 1;
6162 }
6163 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
6164 " nodelay false \n",
6165 mp_tp->mpt_notsent_lowat, notsent),
6166 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6167 return retval;
6168 }
6169 }
6170 return 0;
6171 }
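
/*
 * Worked example for the check above (illustrative numbers): with 40 KB
 * in the send buffer (sb_cc), 30 KB in flight (mpt_sndnxt - mpt_snduna)
 * and a lowat of 16 KB, the unsent backlog is 10 KB <= 16 KB, so the
 * socket is reported writable.
 */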
6172
6173 static errno_t
6174 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6175 void **unitinfo)
6176 {
6177 #pragma unused(kctlref, sac, unitinfo)
6178
6179 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6180 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6181 }
6182
6183 mptcp_kern_skt_unit = sac->sc_unit;
6184
6185 return 0;
6186 }
6187
6188 static void
6189 mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
6190 {
6191 struct mppcb *mpp;
6192
6193 /* Iterate over all MPTCP connections */
6194
6195 lck_mtx_lock(&mtcbinfo.mppi_lock);
6196
6197 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6198 struct socket *mp_so = mpp->mpp_socket;
6199 struct mptses *mpte = mpp->mpp_pcbe;
6200
6201 socket_lock(mp_so, 1);
6202
6203 if (mp_so->so_flags & SOF_DELEGATED &&
6204 uuid_compare(uuid, mp_so->e_uuid)) {
6205 goto next;
6206 } else if (!(mp_so->so_flags & SOF_DELEGATED) &&
6207 uuid_compare(uuid, mp_so->last_uuid)) {
6208 goto next;
6209 }
6210
6211 os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
6212 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);
6213
6214 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
6215
6216 if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
6217 mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
6218 }
6219
6220 mptcp_check_subflows_and_add(mpte);
6221 mptcp_remove_subflows(mpte);
6222
6223 mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);
6224
6225 next:
6226 socket_unlock(mp_so, 1);
6227 }
6228
6229 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6230 }
6231
6232 static void
6233 mptcp_wifi_status_changed(void)
6234 {
6235 struct mppcb *mpp;
6236
6237 /* Iterate over all MPTCP connections */
6238
6239 lck_mtx_lock(&mtcbinfo.mppi_lock);
6240
6241 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6242 struct socket *mp_so = mpp->mpp_socket;
6243 struct mptses *mpte = mpp->mpp_pcbe;
6244
6245 socket_lock(mp_so, 1);
6246
6247 /* Only handover- and target-based mode are purely driven by Symptoms' Wi-Fi status */
6248 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6249 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6250 goto next;
6251 }
6252
6253 mptcp_check_subflows_and_add(mpte);
6254 mptcp_check_subflows_and_remove(mpte);
6255
6256 next:
6257 socket_unlock(mp_so, 1);
6258 }
6259
6260 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6261 }
6262
6263 void
6264 mptcp_ask_symptoms(struct mptses *mpte)
6265 {
6266 struct mptcp_symptoms_ask_uuid ask;
6267 struct socket *mp_so;
6268 struct proc *p;
6269 int pid, prio, err;
6270
6271 if (mptcp_kern_skt_unit == 0) {
6272 os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
6273 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6274 return;
6275 }
6276
6277 mp_so = mptetoso(mpte);
6278
6279 if (mp_so->so_flags & SOF_DELEGATED) {
6280 pid = mp_so->e_pid;
6281 } else {
6282 pid = mp_so->last_pid;
6283 }
6284
6285 p = proc_find(pid);
6286 if (p == PROC_NULL) {
6287 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
6288 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
6289 return;
6290 }
6291
6292 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6293
6294 if (mp_so->so_flags & SOF_DELEGATED) {
6295 uuid_copy(ask.uuid, mp_so->e_uuid);
6296 } else {
6297 uuid_copy(ask.uuid, mp_so->last_uuid);
6298 }
6299
6300 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
6301
6302 if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
6303 prio == TASK_DARWINBG_APPLICATION) {
6304 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
6305 } else if (prio == TASK_FOREGROUND_APPLICATION) {
6306 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
6307 } else {
6308 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
6309 }
6310
6311 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
6312 &ask, sizeof(ask), CTL_DATA_EOR);
6313
6314 os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
6315 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
6316
6317
6318 proc_rele(p);
6319 }
6320
6321 static errno_t
6322 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6323 void *unitinfo)
6324 {
6325 #pragma unused(kctlref, kcunit, unitinfo)
6326
6327 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6328
6329 return 0;
6330 }
6331
6332 static errno_t
6333 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6334 mbuf_t m, int flags)
6335 {
6336 #pragma unused(kctlref, unitinfo, flags)
6337 symptoms_advisory_t *sa = NULL;
6338
6339 if (kcunit != mptcp_kern_skt_unit) {
6340 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6341 __func__, kcunit, mptcp_kern_skt_unit);
6342 }
6343
6344 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6345 mbuf_freem(m);
6346 return EINVAL;
6347 }
6348
6349 if (mbuf_len(m) < sizeof(*sa)) {
6350 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6351 __func__, mbuf_len(m), sizeof(*sa));
6352 mbuf_freem(m);
6353 return EINVAL;
6354 }
6355
6356 sa = mbuf_data(m);
6357
6358 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6359 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new,old: %d,%d\n", __func__,
6360 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6361 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6362
6363 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6364 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6365 mptcp_wifi_status_changed();
6366 }
6367 } else {
6368 struct mptcp_symptoms_answer answer;
6369 errno_t err;
6370
6371 /* We temporarily allow different sizes for ease of submission */
6372 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6373 mbuf_len(m) != sizeof(answer)) {
6374 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6375 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6376 sizeof(answer));
6377 mbuf_freem(m);
6378 return EINVAL;
6379 }
6380
6381 memset(&answer, 0, sizeof(answer));
6382
6383 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6384 if (err) {
6385 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6386 mbuf_freem(m);
6387 return err;
6388 }
6389
6390 mptcp_allow_uuid(answer.uuid, answer.rssi);
6391 }
6392
6393 mbuf_freem(m);
6394 return 0;
6395 }
6396
6397 void
6398 mptcp_control_register(void)
6399 {
6400 /* Set up the advisory control socket */
6401 struct kern_ctl_reg mptcp_kern_ctl;
6402
6403 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6404 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6405 sizeof(mptcp_kern_ctl.ctl_name));
6406 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6407 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6408 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6409 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6410
6411 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6412 }
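
/*
 * A minimal userland sketch (not part of this file) of the peer side of
 * this control socket. CTL_FLAG_PRIVILEGED means the connect succeeds
 * only for a privileged process:
 *
 *	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
 *	struct ctl_info info;
 *	bzero(&info, sizeof(info));
 *	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(info.ctl_name));
 *	ioctl(fd, CTLIOCGINFO, &info);
 *
 *	struct sockaddr_ctl addr;
 *	bzero(&addr, sizeof(addr));
 *	addr.sc_len = sizeof(addr);
 *	addr.sc_family = AF_SYSTEM;
 *	addr.ss_sysaddr = AF_SYS_CONTROL;
 *	addr.sc_id = info.ctl_id;
 *	addr.sc_unit = 0;
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 * Messages written on fd arrive in mptcp_symptoms_ctl_send() above.
 */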
6413
6414 /*
6415 * Three return-values:
6416 * 1 : WiFi is bad
6417 * 0 : WiFi is good
6418 * -1 : WiFi-state is unknown
6419 */
6420 int
6421 mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
6422 {
6423 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6424 if (mptcp_advisory.sa_wifi_status) {
6425 return symptoms_is_wifi_lossy() ? 1 : 0;
6426 }
6427
6428 /*
6429 * If it's a first-party app and we don't have any info about
6430 * the Wi-Fi state, be pessimistic and report it as unknown.
6431 */
6432 return -1;
6433 } else {
6434 if (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) {
6435 return 1;
6436 }
6437
6438 /*
6439 * If we are target-based, we allow ourselves to be more lax
6440 * about what counts as "unusable". We only *know* about the
6441 * state once we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6442 *
6443 * If the RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6444 * be set.
6445 *
6446 * In any other case (while in target-mode), consider WiFi bad
6447 * and we are going to ask Symptoms for allowance anyway.
6448 */
6449 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6450 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6451 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6452 return 0;
6453 }
6454
6455 return 1;
6456 }
6457
6458 return 0;
6459 }
6460 }
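
/*
 * A hypothetical caller (sketch only, with placeholder actions) would
 * map the tri-state result as:
 *
 *	switch (mptcp_is_wifi_unusable_for_session(mpte)) {
 *	case 1:		... bring up or keep a cell subflow ...
 *	case 0:		... prefer Wi-Fi ...
 *	default:	... state unknown, ask Symptoms before acting ...
 *	}
 */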
6461
6462 boolean_t
6463 symptoms_is_wifi_lossy(void)
6464 {
6465 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6466 }
6467
6468 /* If TFO data is successfully acked, it must be dropped from the mptcp so */
6469 static void
6470 mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
6471 {
6472 struct socket *mp_so = mptetoso(mpte);
6473 struct socket *so = mpts->mpts_socket;
6474 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
6475 struct mptcb *mp_tp = mpte->mpte_mptcb;
6476
6477 /* If data was sent with SYN, rewind state */
6478 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
6479 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
6480 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
6481
6482 VERIFY(mp_droplen <= (UINT_MAX));
6483 VERIFY(mp_droplen >= tcp_droplen);
6484
6485 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
6486 mpts->mpts_iss += tcp_droplen;
6487 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
6488
6489 if (mp_droplen > tcp_droplen) {
6490 /* handle partial TCP ack */
6491 mp_so->so_flags1 |= SOF1_TFO_REWIND;
6492 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
6493 mp_droplen = tcp_droplen;
6494 } else {
6495 /* all data on SYN was acked */
6496 mpts->mpts_rel_seq = 1;
6497 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
6498 }
6499 mp_tp->mpt_sndmax -= tcp_droplen;
6500
6501 if (mp_droplen != 0) {
6502 VERIFY(mp_so->so_snd.sb_mb != NULL);
6503 sbdrop(&mp_so->so_snd, (int)mp_droplen);
6504 }
6505 }
6506 }
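
/*
 * Worked example for the rewind above (illustrative numbers): suppose
 * 100 bytes were queued and sent on the SYN, so mp_droplen is 100, and
 * the peer's SYN/ACK acknowledges only 40 of them, so tcp_droplen is 40.
 * The partial-ack branch rewinds mpt_sndnxt to mpt_snduna + 60, marks
 * SOF1_TFO_REWIND, and sbdrop() then releases the 40 acked bytes from
 * the MPTCP-level send buffer.
 */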
6507
6508 int
6509 mptcp_freeq(struct mptcb *mp_tp)
6510 {
6511 struct tseg_qent *q;
6512 int rv = 0;
6513
6514 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6515 LIST_REMOVE(q, tqe_q);
6516 m_freem(q->tqe_m);
6517 zfree(tcp_reass_zone, q);
6518 rv = 1;
6519 }
6520 mp_tp->mpt_reassqlen = 0;
6521 return rv;
6522 }
6523
6524 static int
6525 mptcp_post_event(u_int32_t event_code, int value)
6526 {
6527 struct kev_mptcp_data event_data;
6528 struct kev_msg ev_msg;
6529
6530 memset(&ev_msg, 0, sizeof(ev_msg));
6531
6532 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6533 ev_msg.kev_class = KEV_NETWORK_CLASS;
6534 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6535 ev_msg.event_code = event_code;
6536
6537 event_data.value = value;
6538
6539 ev_msg.dv[0].data_ptr = &event_data;
6540 ev_msg.dv[0].data_length = sizeof(event_data);
6541
6542 return kev_post_msg(&ev_msg);
6543 }
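
/*
 * A hedged userland sketch (not part of this file) of a listener for
 * these events, using the standard kernel-event socket API:
 *
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req;
 *	bzero(&req, sizeof(req));
 *	req.vendor_code = KEV_VENDOR_APPLE;
 *	req.kev_class = KEV_NETWORK_CLASS;
 *	req.kev_subclass = KEV_MPTCP_SUBCLASS;
 *	ioctl(fd, SIOCSKEVFILT, &req);
 *
 * recv() on fd then yields struct kern_event_msg records whose
 * event_code distinguishes, e.g., KEV_MPTCP_CELLUSE.
 */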
6544
6545 static void
6546 mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
6547 {
6548 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
6549 int error;
6550
6551 /* First-party apps (Siri) don't flip the cellicon */
6552 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6553 return;
6554 }
6555
6556 /* Subflow is disappearing - don't set it on this one */
6557 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
6558 return;
6559 }
6560
6561 /* Fallen-back connections do not trigger the cellicon */
6562 if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
6563 return;
6564 }
6565
6566 /* Remember the last time we set the cellicon. Needed for debouncing */
6567 mpte->mpte_last_cellicon_set = tcp_now;
6568
6569 tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
6570 tcp_sched_timers(tp);
6571
6572 if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
6573 mpte->mpte_cellicon_increments != 0) {
6574 if (mptcp_cellicon_refcount == 0) {
6575 os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
6576 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6577
6578 /* Continue, so that the icon gets set... */
6579 } else {
6580 /*
6581 * In this case, the cellicon is already set. No need to bump it
6582 * even higher
6583 */
6584
6585 return;
6586 }
6587 }
6588
6589 /* When tearing down this subflow, we need to decrement the
6590 * reference counter
6591 */
6592 mpts->mpts_flags |= MPTSF_CELLICON_SET;
6593
6594 /* Bump this counter, so that when a session gets destroyed we can
6595 * decrement the reference counter by whatever is left
6596 */
6597 mpte->mpte_cellicon_increments++;
6598
6599 if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
6600 /* If cellicon is already set, get out of here! */
6601 return;
6602 }
6603
6604 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6605
6606 if (error) {
6607 os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
6608 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
6609 } else {
6610 os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
6611 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6612 }
6613 }
6614
6615 void
6616 mptcp_clear_cellicon(void)
6617 {
6618 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6619
6620 if (error) {
6621 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6622 __func__, error);
6623 } else {
6624 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6625 __func__);
6626 }
6627 }
6628
6629 /*
6630 * Returns true if the icon has been flipped to WiFi.
6631 */
6632 static boolean_t
6633 __mptcp_unset_cellicon(long val)
6634 {
6635 if (OSAddAtomic(-val, &mptcp_cellicon_refcount) != val) {
6636 return false;
6637 }
6638
6639 mptcp_clear_cellicon();
6640
6641 return true;
6642 }
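
/*
 * Refcount walkthrough (illustrative): OSAddAtomic() returns the
 * counter's value *before* the subtraction. Two sessions each holding
 * two increments put mptcp_cellicon_refcount at 4. Tearing down the
 * first session decrements by 2; the pre-subtraction value is 4, which
 * differs from val, so the icon stays on. Tearing down the second
 * session decrements by the remaining 2; the pre-subtraction value now
 * equals val, the counter hits zero, and the icon is cleared.
 */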
6643
6644 void
6645 mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
6646 {
6647 /* First-party apps (Siri) don't flip the cellicon */
6648 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6649 return;
6650 }
6651
6652 if (mpte->mpte_cellicon_increments == 0) {
6653 /* This flow never used cell - get out of here! */
6654 return;
6655 }
6656
6657 if (mptcp_cellicon_refcount == 0) {
6658 os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
6659 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6660
6661 return;
6662 }
6663
6664 if (mpts) {
6665 if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
6666 return;
6667 }
6668
6669 mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
6670 }
6671
6672 if (mpte->mpte_cellicon_increments < val) {
6673 os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
6674 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
6675 val = mpte->mpte_cellicon_increments;
6676 }
6677
6678 mpte->mpte_cellicon_increments -= val;
6679
6680 if (__mptcp_unset_cellicon(val) == false) {
6681 return;
6682 }
6683
6684 /* All flows are gone - our counter should be at zero too! */
6685 if (mpte->mpte_cellicon_increments != 0) {
6686 os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
6687 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6688 }
6689 }
6690
6691 void
6692 mptcp_reset_rexmit_state(struct tcpcb *tp)
6693 {
6694 struct mptsub *mpts;
6695 struct inpcb *inp;
6696 struct socket *so;
6697
6698 inp = tp->t_inpcb;
6699 if (inp == NULL) {
6700 return;
6701 }
6702
6703 so = inp->inp_socket;
6704 if (so == NULL) {
6705 return;
6706 }
6707
6708 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6709 return;
6710 }
6711
6712 mpts = tp->t_mpsub;
6713
6714 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6715 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6716 }
6717
6718 void
6719 mptcp_reset_keepalive(struct tcpcb *tp)
6720 {
6721 struct mptsub *mpts = tp->t_mpsub;
6722
6723 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6724 }