1 /*
2 * Copyright (c) 2012-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_fsm.h>
59 #include <netinet/tcp_seq.h>
60 #include <netinet/tcp_var.h>
61 #include <netinet/mptcp_var.h>
62 #include <netinet/mptcp.h>
63 #include <netinet/mptcp_opt.h>
64 #include <netinet/mptcp_seq.h>
65 #include <netinet/mptcp_timer.h>
66 #include <libkern/crypto/sha1.h>
67 #include <netinet6/in6_pcb.h>
68 #include <netinet6/ip6protosw.h>
69 #include <dev/random/randomdev.h>
70
71 /*
72 * Notes on MPTCP implementation.
73 *
74 * MPTCP is implemented as a <SOCK_STREAM,IPPROTO_TCP> protocol in the PF_MULTIPATH
75 * communication domain. The structure mtcbinfo describes the MPTCP instance
76 * of a Multipath protocol in that domain. It is used to keep track of all
77 * MPTCP PCB instances in the system, and is protected by the global lock
78 * mppi_lock.
79 *
80 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
81 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
82 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
83 * allocated from the same memory block, and each structure has a pointer
84 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
85 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
86 * PCB (mppcb) as well as the MPTCP Session (mptses).
87 *
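 * As a sketch, the combined block looks like this (field names inferred
 * from the casts in mptcp_session_create() below; see mptcp_var.h for
 * the authoritative definition):
 *
 *	struct mpp_mtp {
 *		struct mppcb	mpp;		- Multipath PCB
 *		struct mptses	mpp_ses;	- MPTCP Session
 *		struct mptcb	mtcb;		- MPTCP PCB
 *	};
 *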
 88 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
89 *
90 * A functioning MPTCP Session consists of one or more subflow sockets. Each
91 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
92 * represented by the mptsub structure. Because each subflow requires access
93 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
94 * subflow. This gets decremented prior to the subflow's destruction.
95 *
96 * To handle events (read, write, control) from the subflows, we do direct
 97 * upcalls into the handler specific to each event.
98 *
99 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
100 * lock. Incoming data on a subflow also ends up taking this single lock. To
 101 * achieve the latter, tcp_lock/unlock has been changed to use the lock
 102 * of the MPTCP-socket instead.
103 *
104 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
105 * work is done by the MPTCP garbage collector which is invoked on demand by
106 * the PF_MULTIPATH garbage collector. This process will take place once all
107 * of the subflows have been destroyed.
108 */
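/*
 * Illustrative sketch (not part of this file): opening an MPTCP socket
 * from userspace as described above and connecting it via connectx(2).
 * AF_MULTIPATH and connectx(2) are private Darwin interfaces; the
 * destination variable "dst" below is hypothetical.
 *
 *	int fd = socket(AF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	sa_endpoints_t sae = {
 *		.sae_dstaddr = (struct sockaddr *)&dst,
 *		.sae_dstaddrlen = sizeof(dst),
 *	};
 *	sae_connid_t cid;
 *	connectx(fd, &sae, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid);
 */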
109
110 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
111 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
112
113 static uint32_t mptcp_gc(struct mppcbinfo *);
114 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
115 struct uio *, struct mbuf **, struct mbuf **, int *);
116 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
117 struct uio *, struct mbuf *, struct mbuf *, int);
118 static void mptcp_subflow_wupcall(struct socket *, void *, int);
119 static void mptcp_subflow_eupcall1(struct socket *so, void *arg, long events);
120 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
121 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
122
123 static void mptcp_subflow_abort(struct mptsub *, int);
124
125 static void mptcp_send_dfin(struct socket *so);
126 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
127 static int mptcp_freeq(struct mptcb *mp_tp);
128
129 /*
130 * Possible return values for subflow event handlers. Note that success
 131 * values must be greater than or equal to MPTS_EVRET_OK. Values less than that
132 * indicate errors or actions which require immediate attention; they will
133 * prevent the rest of the handlers from processing their respective events
134 * until the next round of events processing.
135 */
136 typedef enum {
137 MPTS_EVRET_DELETE = 1, /* delete this subflow */
138 MPTS_EVRET_OK = 2, /* OK */
139 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
140 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
141 } ev_ret_t;
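/*
 * Minimal sketch of the contract described above (the actual dispatch
 * loop lives in the subflow event handling code, not shown in this
 * section): a return value below MPTS_EVRET_OK tells the caller to act
 * on the subflow before any further events are processed.
 *
 *	ev_ret_t ret = entry->sofilt_hint_ev_hdlr(mpte, mpts, p_hint, event);
 *	if (ret < MPTS_EVRET_OK) {
 *		- e.g. MPTS_EVRET_DELETE: dispose of this subflow and
 *		  defer the remaining events to the next round
 *	}
 */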
142
143 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, long *, long);
144 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, long *, long);
145 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, long *, long);
146 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, long *, long);
147 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, long *, long);
148 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, long *, long);
149 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, long *, long);
150 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, long *, long);
151 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, long *, long);
152 static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, long *, long);
153 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, long *, long);
154 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, long *, long);
155
156 static void mptcp_do_sha1(mptcp_key_t *, char *);
157 static void mptcp_init_local_parms(struct mptses *);
158
159 static ZONE_DECLARE(mptsub_zone, "mptsub", sizeof(struct mptsub), ZC_ZFREE_CLEARMEM);
160 static ZONE_DECLARE(mptopt_zone, "mptopt", sizeof(struct mptopt), ZC_ZFREE_CLEARMEM);
161 static ZONE_DECLARE(mpt_subauth_zone, "mptauth",
162 sizeof(struct mptcp_subf_auth_entry), ZC_NONE);
163
164 struct mppcbinfo mtcbinfo;
165
166 SYSCTL_DECL(_net_inet);
167
168 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
169
170 uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
171 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
172 &mptcp_dbg_area, 0, "MPTCP debug area");
173
174 uint32_t mptcp_dbg_level = 1;
175 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
176 &mptcp_dbg_level, 0, "MPTCP debug level");
177
178 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
179 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
180
181
182 static int mptcp_alternate_port = 0;
183 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
184 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
185
186 static struct protosw mptcp_subflow_protosw;
187 static struct pr_usrreqs mptcp_subflow_usrreqs;
188 static struct ip6protosw mptcp_subflow_protosw6;
189 static struct pr_usrreqs mptcp_subflow_usrreqs6;
190
191 static uint8_t mptcp_create_subflows_scheduled;
192
193 typedef struct mptcp_subflow_event_entry {
194 long sofilt_hint_mask;
195 ev_ret_t (*sofilt_hint_ev_hdlr)(
196 struct mptses *mpte,
197 struct mptsub *mpts,
198 long *p_mpsofilt_hint,
199 long event);
200 } mptsub_ev_entry_t;
201
202 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
203 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
204 static uint32_t mptcp_kern_skt_inuse = 0;
205 static uint32_t mptcp_kern_skt_unit;
206 static symptoms_advisory_t mptcp_advisory;
207
208 uint32_t mptcp_cellicon_refcount = 0;
209
210 /*
211 * XXX The order of the event handlers below is really
212 * really important. Think twice before changing it.
213 */
214 static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
215 {
216 .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
217 .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
218 },
219 {
220 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
221 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
222 },
223 {
224 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
225 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
226 },
227 {
228 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
229 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
230 },
231 {
232 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
233 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
234 },
235 {
236 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
237 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
238 },
239 {
240 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
241 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
242 },
243 {
244 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
245 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
246 },
247 {
248 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
249 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
250 },
251 {
252 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
253 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
254 },
255 {
256 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
257 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
258 },
259 {
260 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
261 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
262 },
263 {
264 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
265 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
266 },
267 {
268 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
269 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
270 },
271 };
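/*
 * A sketch of why the order matters: the dispatcher walks this table
 * front to back, so error- and reset-style events (MP_SUB_ERROR,
 * MPFAILOVER, MUSTRST, ...) get a chance to act before the
 * connection-state events such as CONNECTED, MPSTATUS and DISCONNECTED
 * are processed for the same round.
 */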
272
273 os_log_t mptcp_log_handle;
274
275 /*
276 * Protocol pr_init callback.
277 */
278 void
279 mptcp_init(struct protosw *pp, struct domain *dp)
280 {
281 #pragma unused(dp)
282 static int mptcp_initialized = 0;
283 struct protosw *prp;
284 struct ip6protosw *prp6;
285
286 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
287
288 /* do this only once */
289 if (mptcp_initialized) {
290 return;
291 }
292 mptcp_initialized = 1;
293
294 mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;
295
296 /*
297 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
298 * we must be able to find IPPROTO_TCP entries for both.
299 */
300 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
301 VERIFY(prp != NULL);
302 bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
303 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
304 sizeof(mptcp_subflow_usrreqs));
305 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
306 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
307 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
308 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
309 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
310 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
311 /*
312 * Socket filters shouldn't attach/detach to/from this protosw
313 * since pr_protosw is to be used instead, which points to the
314 * real protocol; if they do, it is a bug and we should panic.
315 */
316 mptcp_subflow_protosw.pr_filter_head.tqh_first =
317 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
318 mptcp_subflow_protosw.pr_filter_head.tqh_last =
319 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
320
321 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
322 IPPROTO_TCP, SOCK_STREAM);
323 VERIFY(prp6 != NULL);
324 bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
325 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
326 sizeof(mptcp_subflow_usrreqs6));
327 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
328 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
329 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
330 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
331 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
332 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
333 /*
334 * Socket filters shouldn't attach/detach to/from this protosw
335 * since pr_protosw is to be used instead, which points to the
336 * real protocol; if they do, it is a bug and we should panic.
337 */
338 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
339 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
340 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
341 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
342
343 bzero(&mtcbinfo, sizeof(mtcbinfo));
344 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
345 mtcbinfo.mppi_size = sizeof(struct mpp_mtp);
346 mtcbinfo.mppi_zone = zone_create("mptc", mtcbinfo.mppi_size,
347 ZC_NONE);
348
349 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
350 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
351 mtcbinfo.mppi_lock_grp_attr);
352 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
353 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
354 mtcbinfo.mppi_lock_attr);
355
356 mtcbinfo.mppi_gc = mptcp_gc;
357 mtcbinfo.mppi_timer = mptcp_timer;
358
359 /* attach to MP domain for garbage collection to take place */
360 mp_pcbinfo_attach(&mtcbinfo);
361
362 mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
363 }
364
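/*
 * Find the per-interface stats slot for ifindex. If 'create' is set and
 * no slot matches, claim the first free slot (ifindex == IFSCOPE_NONE)
 * for this interface. Returns the slot index, or -1 if none is found.
 */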
365 int
366 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
367 {
368 int i, index = -1;
369
370 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
371 if (create && stats[i].ifindex == IFSCOPE_NONE) {
372 if (index < 0) {
373 index = i;
374 }
375 continue;
376 }
377
378 if (stats[i].ifindex == ifindex) {
379 index = i;
380 return index;
381 }
382 }
383
384 if (index != -1) {
385 stats[index].ifindex = ifindex;
386 }
387
388 return index;
389 }
390
391 static int
392 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
393 {
394 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
395 int index;
396
397 if (ifp == NULL) {
398 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
399 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
400 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
401 return -1;
402 }
403
404 index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
405
406 if (index != -1) {
407 if (stats[index].is_expensive == 0) {
408 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
409 }
410 }
411
412 return index;
413 }
414
415 void
416 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
417 {
418 int index;
419
420 tcpstat.tcps_mp_switches++;
421 mpte->mpte_subflow_switches++;
422
423 index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
424
425 if (index != -1) {
426 mpte->mpte_itfstats[index].switches++;
427 }
428 }
429
430 /*
431 * Flushes all recorded socket options from an MP socket.
432 */
433 static void
434 mptcp_flush_sopts(struct mptses *mpte)
435 {
436 struct mptopt *mpo, *tmpo;
437
438 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
439 mptcp_sopt_remove(mpte, mpo);
440 mptcp_sopt_free(mpo);
441 }
442 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
443 }
444
445 /*
 446 * Create an MPTCP session, called as a result of opening an MPTCP socket.
447 */
448 int
449 mptcp_session_create(struct mppcb *mpp)
450 {
451 struct mppcbinfo *mppi;
452 struct mptses *mpte;
453 struct mptcb *mp_tp;
454
455 VERIFY(mpp != NULL);
456 mppi = mpp->mpp_pcbinfo;
457 VERIFY(mppi != NULL);
458
459 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
460 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
461
462 /* MPTCP Multipath PCB Extension */
463 bzero(mpte, sizeof(*mpte));
464 VERIFY(mpp->mpp_pcbe == NULL);
465 mpp->mpp_pcbe = mpte;
466 mpte->mpte_mppcb = mpp;
467 mpte->mpte_mptcb = mp_tp;
468
469 TAILQ_INIT(&mpte->mpte_sopts);
470 TAILQ_INIT(&mpte->mpte_subflows);
471 mpte->mpte_associd = SAE_ASSOCID_ANY;
472 mpte->mpte_connid_last = SAE_CONNID_ANY;
473
474 mptcp_init_urgency_timer(mpte);
475
476 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
477 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
478
479 if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
480 mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
481 }
482
483 mpte->mpte_last_cellicon_set = tcp_now;
484
485 /* MPTCP Protocol Control Block */
486 bzero(mp_tp, sizeof(*mp_tp));
487 mp_tp->mpt_mpte = mpte;
488 mp_tp->mpt_state = MPTCPS_CLOSED;
489
490 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
491
492 return 0;
493 }
494
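/*
 * Pick the destination address to use for a new subflow: the session
 * destination by default; with MPTE_UNICAST_IP, prefer the address
 * family the interface can carry (the ipv6/ipv4 arguments), falling
 * back to IPv6 over IPv4 when neither is known to work.
 */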
495 struct sockaddr *
496 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
497 {
498 if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) {
499 return &mpte->mpte_dst;
500 }
501
502 if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
503 return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
504 }
505
506 if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
507 return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
508 }
509
510 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
511 * meaning we prefer IPv6 over IPv4.
512 */
513 if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
514 return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
515 }
516
517 if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
518 return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
519 }
520
521 /* We don't yet have a unicast IP */
522 return NULL;
523 }
524
525 static void
526 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
527 uint64_t *cellbytes, uint64_t *allbytes)
528 {
529 int64_t mycellbytes = 0;
530 uint64_t myallbytes = 0;
531 int i;
532
533 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
534 if (mpte->mpte_itfstats[i].is_expensive) {
535 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
536 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
537 }
538
539 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
540 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
541 }
542
543 if (initial_cell) {
544 mycellbytes -= mpte->mpte_init_txbytes;
545 mycellbytes -= mpte->mpte_init_rxbytes;
546 }
547
548 if (mycellbytes < 0) {
549 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
550 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
551 *cellbytes = 0;
552 *allbytes = 0;
553 } else {
554 *cellbytes = mycellbytes;
555 *allbytes = myallbytes;
556 }
557 }
558
559 static void
560 mptcpstats_session_wrapup(struct mptses *mpte)
561 {
562 boolean_t cell = mpte->mpte_initial_cell;
563
564 switch (mpte->mpte_svctype) {
565 case MPTCP_SVCTYPE_HANDOVER:
566 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
567 tcpstat.tcps_mptcp_fp_handover_attempt++;
568
569 if (cell && mpte->mpte_handshake_success) {
570 tcpstat.tcps_mptcp_fp_handover_success_cell++;
571
572 if (mpte->mpte_used_wifi) {
573 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
574 }
575 } else if (mpte->mpte_handshake_success) {
576 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
577
578 if (mpte->mpte_used_cell) {
579 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
580 }
581 }
582 } else {
583 tcpstat.tcps_mptcp_handover_attempt++;
584
585 if (cell && mpte->mpte_handshake_success) {
586 tcpstat.tcps_mptcp_handover_success_cell++;
587
588 if (mpte->mpte_used_wifi) {
589 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
590 }
591 } else if (mpte->mpte_handshake_success) {
592 tcpstat.tcps_mptcp_handover_success_wifi++;
593
594 if (mpte->mpte_used_cell) {
595 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
596 }
597 }
598 }
599
600 if (mpte->mpte_handshake_success) {
601 uint64_t cellbytes;
602 uint64_t allbytes;
603
604 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
605
606 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
607 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
608 }
609 break;
610 case MPTCP_SVCTYPE_INTERACTIVE:
611 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
612 tcpstat.tcps_mptcp_fp_interactive_attempt++;
613
614 if (mpte->mpte_handshake_success) {
615 tcpstat.tcps_mptcp_fp_interactive_success++;
616
617 if (!cell && mpte->mpte_used_cell) {
618 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
619 }
620 }
621 } else {
622 tcpstat.tcps_mptcp_interactive_attempt++;
623
624 if (mpte->mpte_handshake_success) {
625 tcpstat.tcps_mptcp_interactive_success++;
626
627 if (!cell && mpte->mpte_used_cell) {
628 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
629 }
630 }
631 }
632
633 if (mpte->mpte_handshake_success) {
634 uint64_t cellbytes;
635 uint64_t allbytes;
636
637 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
638
639 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
640 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
641 }
642 break;
643 case MPTCP_SVCTYPE_AGGREGATE:
644 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
645 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
646
647 if (mpte->mpte_handshake_success) {
648 tcpstat.tcps_mptcp_fp_aggregate_success++;
649 }
650 } else {
651 tcpstat.tcps_mptcp_aggregate_attempt++;
652
653 if (mpte->mpte_handshake_success) {
654 tcpstat.tcps_mptcp_aggregate_success++;
655 }
656 }
657
658 if (mpte->mpte_handshake_success) {
659 uint64_t cellbytes;
660 uint64_t allbytes;
661
662 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
663
664 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
665 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
666 }
667 break;
668 }
669
670 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
671 tcpstat.tcps_mptcp_back_to_wifi++;
672 }
673
674 if (mpte->mpte_triggered_cell) {
675 tcpstat.tcps_mptcp_triggered_cell++;
676 }
677 }
678
679 /*
680 * Destroy an MPTCP session.
681 */
682 static void
683 mptcp_session_destroy(struct mptses *mpte)
684 {
685 struct mptcb *mp_tp = mpte->mpte_mptcb;
686
687 VERIFY(mp_tp != NULL);
688 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
689
690 mptcpstats_session_wrapup(mpte);
691 mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
692 mptcp_flush_sopts(mpte);
693
694 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
695 _FREE(mpte->mpte_itfinfo, M_TEMP);
696 }
697 mpte->mpte_itfinfo = NULL;
698
699 m_freem_list(mpte->mpte_reinjectq);
700
701 os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
702 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
703 }
704
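/*
 * Subflows may only be created while the MPTCP connection is
 * established but not yet closing (state in [ESTABLISHED, FIN_WAIT_1))
 * and only as long as the connection has not fallen back to plain TCP.
 */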
705 boolean_t
706 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
707 {
708 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
709 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
710 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
711 }
712
713 static int
714 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
715 const struct in_addr *addrv4)
716 {
717 static const struct in6_addr well_known_prefix = {
718 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
719 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
720 0x00, 0x00, 0x00, 0x00},
721 };
722 const char *ptrv4 = (const char *)addrv4;
723 char *ptr = (char *)addr;
724
725 if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
726 IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
727 IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
728 IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
729 IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
730 IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
731 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
732 return -1;
733 }
734
735 /* Check for the well-known prefix */
736 if (len == NAT64_PREFIX_LEN_96 &&
737 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
738 if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
739 IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
740 return -1;
741 }
742 }
743
744 switch (len) {
745 case NAT64_PREFIX_LEN_96:
746 memcpy(ptr + 12, ptrv4, 4);
747 break;
748 case NAT64_PREFIX_LEN_64:
749 memcpy(ptr + 9, ptrv4, 4);
750 break;
751 case NAT64_PREFIX_LEN_56:
752 memcpy(ptr + 7, ptrv4, 1);
753 memcpy(ptr + 9, ptrv4 + 1, 3);
754 break;
755 case NAT64_PREFIX_LEN_48:
756 memcpy(ptr + 6, ptrv4, 2);
757 memcpy(ptr + 9, ptrv4 + 2, 2);
758 break;
759 case NAT64_PREFIX_LEN_40:
760 memcpy(ptr + 5, ptrv4, 3);
761 memcpy(ptr + 9, ptrv4 + 3, 1);
762 break;
763 case NAT64_PREFIX_LEN_32:
764 memcpy(ptr + 4, ptrv4, 4);
765 break;
766 default:
767 panic("NAT64-prefix len is wrong: %u\n", len);
768 }
769
770 return 0;
771 }
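/*
 * Worked example for the synthesis above, following the RFC 6052
 * mapping: with the well-known 96-bit prefix 64:ff9b::/96, the IPv4
 * address 192.0.2.33 (bytes c0 00 02 21) is copied into bytes 12-15,
 * yielding 64:ff9b::c000:221. For shorter prefixes the IPv4 bytes
 * straddle byte 8, which RFC 6052 reserves as zero - hence the copies
 * resuming at offset 9 above.
 */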
772
773 static void
774 mptcp_trigger_cell_bringup(struct mptses *mpte)
775 {
776 struct socket *mp_so = mptetoso(mpte);
777
778 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
779 uuid_string_t uuidstr;
780 int err;
781
782 socket_unlock(mp_so, 0);
783 err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
784 TRUE);
785 socket_lock(mp_so, 0);
786
787 if (err == 0) {
788 mpte->mpte_triggered_cell = 1;
789 }
790
791 uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
792 os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
793 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
794 } else {
795 os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
796 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
797 }
798 }
799
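/*
 * A subflow counts as disconnecting if its socket is already
 * disconnected, a disconnect or close has been requested, or its TCP
 * state machine has reached CLOSED.
 */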
800 static boolean_t
801 mptcp_subflow_disconnecting(struct mptsub *mpts)
802 {
803 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
804 return true;
805 }
806
807 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
808 return true;
809 }
810
811 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
812 return true;
813 }
814
815 return false;
816 }
817
818 /*
819 * In Handover mode, only create cell subflow if
820 * - Symptoms marked WiFi as weak:
821 * Here, if we are sending data, then we can check the RTO-state. That is a
822 * stronger signal of WiFi quality than the Symptoms indicator.
823 * If however we are not sending any data, the only thing we can do is guess
824 * and thus bring up Cell.
825 *
826 * - Symptoms marked WiFi as unknown:
827 * In this state we don't know what the situation is and thus remain
828 * conservative, only bringing up cell if there are retransmissions going on.
829 */
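/*
 * Summary of the resulting decision, given unusable_state from
 * mptcp_is_wifi_unusable_for_session() (see the checks below):
 *
 *	 0 (WiFi good):  never use cell
 *	-1 (unknown):    use cell only while sending with
 *	                 t_rxtshift >= 2 * mptcp_fail_thresh
 *	 1 (WiFi bad):   idle: use cell; sending: use cell once
 *	                 t_rxtshift >= 2 * mptcp_fail_thresh
 */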
830 static boolean_t
831 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
832 {
833 int unusable_state = mptcp_is_wifi_unusable_for_session(mpte);
834
835 if (unusable_state == 0) {
836 /* WiFi is good - don't use cell */
837 return false;
838 }
839
840 if (unusable_state == -1) {
841 /*
842 * We are in unknown state, only use Cell if we have confirmed
843 * that WiFi is bad.
844 */
845 if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
846 return true;
847 } else {
848 return false;
849 }
850 }
851
852 if (unusable_state == 1) {
853 /*
854 * WiFi is confirmed to be bad from Symptoms-Framework.
855 * If we are sending data, check the RTOs.
856 * Otherwise, be pessimistic and use Cell.
857 */
858 if (mptetoso(mpte)->so_snd.sb_cc != 0) {
859 if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
860 return true;
861 } else {
862 return false;
863 }
864 } else {
865 return true;
866 }
867 }
868
869 return false;
870 }
871
872 void
873 mptcp_check_subflows_and_add(struct mptses *mpte)
874 {
875 struct mptcb *mp_tp = mpte->mpte_mptcb;
876 boolean_t cellular_viable = FALSE;
877 boolean_t want_cellular = TRUE;
878 uint32_t i;
879
880 if (!mptcp_ok_to_create_subflows(mp_tp)) {
881 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
882 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
883 return;
884 }
885
886 if (mptcp_get_session_dst(mpte, false, false) == NULL) {
887 return;
888 }
889
890 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
891 boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
892 struct mpt_itf_info *info;
893 struct sockaddr_in6 nat64pre;
894 struct sockaddr *dst;
895 struct mptsub *mpts;
896 struct ifnet *ifp;
897 uint32_t ifindex;
898
899 info = &mpte->mpte_itfinfo[i];
900
901 ifindex = info->ifindex;
902 if (ifindex == IFSCOPE_NONE) {
903 continue;
904 }
905
906 os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
907 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
908 info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);
909
910 if (info->no_mptcp_support) {
911 continue;
912 }
913
914 ifnet_head_lock_shared();
915 ifp = ifindex2ifnet[ifindex];
916 ifnet_head_done();
917
918 if (ifp == NULL) {
919 continue;
920 }
921
922 if (IFNET_IS_CELLULAR(ifp)) {
923 cellular_viable = TRUE;
924 }
925
926 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
927 const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
928 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
929
930 if (subifp == NULL) {
931 continue;
932 }
933
934 /*
935 * If there is at least one functioning subflow on WiFi
936 * and we are checking for the cell interface, then
937 * we always need to ask symptoms for permission as
938 * cell is triggered even if WiFi is available.
939 */
940 if (!IFNET_IS_CELLULAR(subifp) &&
941 !mptcp_subflow_disconnecting(mpts) &&
942 IFNET_IS_CELLULAR(ifp)) {
943 need_to_ask_symptoms = TRUE;
944 }
945
946 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
947 os_log(mptcp_log_handle,
948 "%s - %lx: handover: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
949 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
950 IFNET_IS_CELLULAR(subifp),
951 mptcp_is_wifi_unusable_for_session(mpte),
952 mpts->mpts_flags,
953 tp->t_rxtshift,
954 !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
955 mptetoso(mpte)->so_snd.sb_cc,
956 ifindex, subifp->if_index,
957 tp->t_srtt >> TCP_RTT_SHIFT,
958 tp->t_rttvar >> TCP_RTTVAR_SHIFT,
959 tp->t_rxtcur);
960
961 if (!IFNET_IS_CELLULAR(subifp) &&
962 !mptcp_subflow_disconnecting(mpts) &&
963 (mpts->mpts_flags & MPTSF_CONNECTED) &&
964 !mptcp_handover_use_cellular(mpte, tp)) {
965 found = TRUE;
966
967 /* We found a proper subflow on WiFi - no need for cell */
968 want_cellular = FALSE;
969 break;
970 }
971 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
972 uint64_t time_now = mach_continuous_time();
973
974 os_log(mptcp_log_handle,
975 "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
976 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
977 time_now, mptcp_is_wifi_unusable_for_session(mpte),
978 IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
979 mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);
980
981 if (!IFNET_IS_CELLULAR(subifp) &&
982 !mptcp_subflow_disconnecting(mpts) &&
983 (mpte->mpte_time_target == 0 ||
984 (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
985 !mptcp_is_wifi_unusable_for_session(mpte))) {
986 found = TRUE;
987
988 want_cellular = FALSE;
989 break;
990 }
991 }
992
993 if (subifp->if_index == ifindex &&
994 !mptcp_subflow_disconnecting(mpts)) {
995 /*
996 * We found a subflow on this interface.
997 * No need to create a new one.
998 */
999 found = TRUE;
1000 break;
1001 }
1002 }
1003
1004 if (found) {
1005 continue;
1006 }
1007
1008 if (need_to_ask_symptoms &&
1009 !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
1010 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
1011 mptcp_developer_mode == 0) {
1012 mptcp_ask_symptoms(mpte);
1013 return;
1014 }
1015
1016 dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);
1017
1018 if (dst->sa_family == AF_INET &&
1019 !info->has_v4_conn && info->has_nat64_conn) {
1020 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
1021 int error, j;
1022
1023 bzero(&nat64pre, sizeof(struct sockaddr_in6));
1024
1025 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
1026 if (error) {
1027 os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
1028 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
1029 continue;
1030 }
1031
1032 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
1033 if (nat64prefixes[j].prefix_len != 0) {
1034 break;
1035 }
1036 }
1037
1038 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
1039
1040 error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
1041 nat64prefixes[j].prefix_len,
1042 &((struct sockaddr_in *)(void *)dst)->sin_addr);
1043 if (error != 0) {
1044 os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
1045 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1046 continue;
1047 }
1048
1049 memcpy(&nat64pre.sin6_addr,
1050 &nat64prefixes[j].ipv6_prefix,
1051 sizeof(nat64pre.sin6_addr));
1052 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
1053 nat64pre.sin6_family = AF_INET6;
1054 nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
1055 nat64pre.sin6_flowinfo = 0;
1056 nat64pre.sin6_scope_id = 0;
1057
1058 dst = (struct sockaddr *)&nat64pre;
1059 }
1060
1061 /* Initial subflow started on a NAT64'd address? */
1062 if (!(mpte->mpte_flags & MPTE_UNICAST_IP) &&
1063 mpte->mpte_dst.sa_family == AF_INET6 &&
1064 mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
1065 dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
1066 }
1067
1068 if (dst->sa_family == AF_INET && !info->has_v4_conn) {
1069 continue;
1070 }
1071 if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
1072 continue;
1073 }
1074
1075 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
1076 }
1077
1078 if (!cellular_viable && want_cellular) {
1079 /* Trigger Cell Bringup */
1080 mptcp_trigger_cell_bringup(mpte);
1081 }
1082 }
1083
1084 static void
1085 mptcp_remove_cell_subflows(struct mptses *mpte)
1086 {
1087 struct mptsub *mpts, *tmpts;
1088 boolean_t found = false;
1089
1090 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1091 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1092
1093 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1094 continue;
1095 }
1096
1097 /* We have a functioning subflow on WiFi. No need for cell! */
1098 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1099 !mptcp_subflow_disconnecting(mpts)) {
1100 found = true;
1101 }
1102 }
1103
1104 /* Didn't find a functional subflow on WiFi - stay on cell */
1105 if (!found) {
1106 return;
1107 }
1108
1109 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1110 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1111
1112 /* Only remove cellular subflows */
1113 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
1114 continue;
1115 }
1116
1117 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
1118 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1119
1120 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1121 }
1122
1123 return;
1124 }
1125
1126 /* Remove the cellular subflows if a working non-cellular subflow exists */
1127 static void
1128 mptcp_handover_subflows_remove(struct mptses *mpte)
1129 {
1130 int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
1131 boolean_t found_working_subflow = false;
1132 struct mptsub *mpts;
1133
1134 /*
1135 * Look for a subflow that is on a non-cellular interface
1136 * and actually works (aka, no retransmission timeout).
1137 */
1138 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1139 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1140 struct socket *so;
1141 struct tcpcb *tp;
1142
1143 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1144 continue;
1145 }
1146
1147 so = mpts->mpts_socket;
1148 tp = sototcpcb(so);
1149
1150 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1151 tp->t_state != TCPS_ESTABLISHED) {
1152 continue;
1153 }
1154
1155 os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
1156 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
1157
1158 if (!mptcp_handover_use_cellular(mpte, tp)) {
1159 found_working_subflow = true;
1160 break;
1161 }
1162 }
1163
1164 /*
1165 * Couldn't find a working subflow; let's not remove those on a cellular
1166 * interface.
1167 */
1168 if (!found_working_subflow) {
1169 return;
1170 }
1171
1172 mptcp_remove_cell_subflows(mpte);
1173 }
1174
1175 static void
1176 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1177 {
1178 uint64_t time_now = mach_continuous_time();
1179
1180 if (mpte->mpte_time_target != 0 &&
1181 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1182 mptcp_is_wifi_unusable_for_session(mpte)) {
1183 /* WiFi is bad and we are below the target - don't remove any subflows */
1184 return;
1185 }
1186
1187 mptcp_remove_cell_subflows(mpte);
1188 }
1189
1190 /*
1191 * Based on the MPTCP Service-type and the state of the subflows, we
1192 * will destroy subflows here.
1193 */
1194 void
1195 mptcp_check_subflows_and_remove(struct mptses *mpte)
1196 {
1197 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1198 return;
1199 }
1200
1201 socket_lock_assert_owned(mptetoso(mpte));
1202
1203 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1204 mptcp_handover_subflows_remove(mpte);
1205 }
1206
1207 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1208 mptcp_targetbased_subflows_remove(mpte);
1209 }
1210 }
1211
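/*
 * Kill subflows that NECP flagged with MPTSF_CLOSE_REQD or whose
 * interface no longer appears usable in mpte_itfinfo for their address
 * family, by injecting a NOSRCADDR event on the subflow socket.
 */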
1212 static void
1213 mptcp_remove_subflows(struct mptses *mpte)
1214 {
1215 struct mptsub *mpts, *tmpts;
1216
1217 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1218 return;
1219 }
1220
1221 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1222 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1223 boolean_t found = false;
1224 uint32_t ifindex;
1225 uint32_t i;
1226
1227 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1228 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1229
1230 os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
1231 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
1232 ifp ? ifp->if_index : -1);
1233 soevent(mpts->mpts_socket,
1234 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1235
1236 continue;
1237 }
1238
1239 if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
1240 continue;
1241 }
1242
1243 if (ifp) {
1244 ifindex = ifp->if_index;
1245 } else {
1246 ifindex = mpts->mpts_ifscope;
1247 }
1248
1249 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1250 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1251 continue;
1252 }
1253
1254 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1255 if (mpts->mpts_dst.sa_family == AF_INET6 &&
1256 (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
1257 found = true;
1258 break;
1259 }
1260
1261 if (mpts->mpts_dst.sa_family == AF_INET &&
1262 mpte->mpte_itfinfo[i].has_v4_conn) {
1263 found = true;
1264 break;
1265 }
1266 }
1267 }
1268
1269 if (!found) {
1270 os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
1271 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1272 ifindex, mpts->mpts_flags);
1273
1274 soevent(mpts->mpts_socket,
1275 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1276 }
1277 }
1278 }
1279
1280 static void
1281 mptcp_create_subflows(__unused void *arg)
1282 {
1283 struct mppcb *mpp;
1284
1285 /*
1286 * Start with clearing, because we might be processing connections
1287 * while a new event comes in.
1288 */
1289 if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
1290 os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
1291 }
1292
1293 /* Iterate over all MPTCP connections */
1294
1295 lck_mtx_lock(&mtcbinfo.mppi_lock);
1296
1297 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
1298 struct socket *mp_so = mpp->mpp_socket;
1299 struct mptses *mpte = mpp->mpp_pcbe;
1300
1301 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1302 continue;
1303 }
1304
1305 socket_lock(mp_so, 1);
1306 VERIFY(mp_so->so_usecount > 0);
1307
1308 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
1309
1310 mptcp_check_subflows_and_add(mpte);
1311 mptcp_remove_subflows(mpte);
1312
1313 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1314 socket_unlock(mp_so, 1);
1315 }
1316
1317 lck_mtx_unlock(&mtcbinfo.mppi_lock);
1318 }
1319
1320 /*
1321 * We need this because we are coming from an NECP-event. This event gets posted
1322 * while holding NECP-locks. The creation of the subflow however leads us back
1323 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1324 * So, we would deadlock there as we already hold the NECP-lock.
1325 *
1326 * So, let's schedule this separately. It also gives NECP the chance to make
1327 * progress, without having to wait for MPTCP to finish its subflow creation.
1328 */
1329 void
1330 mptcp_sched_create_subflows(struct mptses *mpte)
1331 {
1332 struct mppcb *mpp = mpte->mpte_mppcb;
1333 struct mptcb *mp_tp = mpte->mpte_mptcb;
1334 struct socket *mp_so = mpp->mpp_socket;
1335
1336 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1337 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1338 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1339 return;
1340 }
1341
1342 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1343 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1344 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1345 }
1346
1347 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1348 return;
1349 }
1350
1351 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1352 timeout(mptcp_create_subflows, NULL, hz / 10);
1353 }
1354
1355 /*
1356 * Allocate an MPTCP socket option structure.
1357 */
1358 struct mptopt *
1359 mptcp_sopt_alloc(zalloc_flags_t how)
1360 {
1361 return zalloc_flags(mptopt_zone, how | Z_ZERO);
1362 }
1363
1364 /*
1365 * Free an MPTCP socket option structure.
1366 */
1367 void
1368 mptcp_sopt_free(struct mptopt *mpo)
1369 {
1370 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1371
1372 zfree(mptopt_zone, mpo);
1373 }
1374
1375 /*
1376 * Add a socket option to the MPTCP socket option list.
1377 */
1378 void
1379 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1380 {
1381 socket_lock_assert_owned(mptetoso(mpte));
1382 mpo->mpo_flags |= MPOF_ATTACHED;
1383 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1384 }
1385
1386 /*
1387 * Remove a socket option from the MPTCP socket option list.
1388 */
1389 void
1390 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1391 {
1392 socket_lock_assert_owned(mptetoso(mpte));
1393 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1394 mpo->mpo_flags &= ~MPOF_ATTACHED;
1395 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1396 }
1397
1398 /*
1399 * Search for an existing <sopt_level,sopt_name> socket option.
1400 */
1401 struct mptopt *
1402 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1403 {
1404 struct mptopt *mpo;
1405
1406 socket_lock_assert_owned(mptetoso(mpte));
1407
1408 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1409 if (mpo->mpo_level == sopt->sopt_level &&
1410 mpo->mpo_name == sopt->sopt_name) {
1411 break;
1412 }
1413 }
1414 return mpo;
1415 }
1416
1417 /*
1418 * Allocate an MPTCP subflow structure.
1419 */
1420 static struct mptsub *
1421 mptcp_subflow_alloc(void)
1422 {
1423 return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1424 }
1425
1426 /*
1427 * Deallocate a subflow structure, called when all of the references held
1428 * on it have been released. This implies that the subflow has been deleted.
1429 */
1430 static void
1431 mptcp_subflow_free(struct mptsub *mpts)
1432 {
1433 VERIFY(mpts->mpts_refcnt == 0);
1434 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1435 VERIFY(mpts->mpts_mpte == NULL);
1436 VERIFY(mpts->mpts_socket == NULL);
1437
1438 if (mpts->mpts_src != NULL) {
1439 FREE(mpts->mpts_src, M_SONAME);
1440 mpts->mpts_src = NULL;
1441 }
1442
1443 zfree(mptsub_zone, mpts);
1444 }
1445
1446 static void
1447 mptcp_subflow_addref(struct mptsub *mpts)
1448 {
1449 if (++mpts->mpts_refcnt == 0) {
1450 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1451 /* NOTREACHED */
1452 }
1453 }
1454
1455 static void
1456 mptcp_subflow_remref(struct mptsub *mpts)
1457 {
1458 if (mpts->mpts_refcnt == 0) {
1459 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
1460 /* NOTREACHED */
1461 }
1462 if (--mpts->mpts_refcnt > 0) {
1463 return;
1464 }
1465
1466 /* callee will unlock and destroy lock */
1467 mptcp_subflow_free(mpts);
1468 }
1469
1470 static void
1471 mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
1472 {
1473 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
1474 struct tcpcb *tp = sototcpcb(so);
1475
1476 /*
1477 * From this moment on, the subflow is linked to the MPTCP-connection.
1478 * Locking,... happens now at the MPTCP-layer
1479 */
1480 tp->t_mptcb = mpte->mpte_mptcb;
1481 so->so_flags |= SOF_MP_SUBFLOW;
1482 mp_so->so_usecount++;
1483
1484 /*
1485 * Insert the subflow into the list, and associate the MPTCP PCB
1486 * as well as the subflow socket. From this point on, removing
1487 * the subflow needs to be done via mptcp_subflow_del().
1488 */
1489 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1490 mpte->mpte_numflows++;
1491
1492 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1493 mpts->mpts_mpte = mpte;
1494 mpts->mpts_socket = so;
1495 tp->t_mpsub = mpts;
1496 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1497 mptcp_subflow_addref(mpts); /* for subflow socket */
1498 }
1499
1500 static void
1501 mptcp_subflow_necp_cb(void *handle, __unused int action,
1502 __unused uint32_t interface_index,
1503 uint32_t necp_flags, bool *viable)
1504 {
1505 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1506 struct inpcb *inp = (struct inpcb *)handle;
1507 struct socket *so = inp->inp_socket;
1508 struct mptsub *mpts;
1509 struct mptses *mpte;
1510
1511 if (low_power) {
1512 action = NECP_CLIENT_CBACTION_NONVIABLE;
1513 }
1514
1515 if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
1516 return;
1517 }
1518
1519 /*
1520 * The socket is being garbage-collected. There is nothing to be done
1521 * here.
1522 */
1523 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
1524 return;
1525 }
1526
1527 socket_lock(so, 1);
1528
1529 /* Check again after we acquired the lock. */
1530 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1531 goto out;
1532 }
1533
1534 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1535 mpts = sototcpcb(so)->t_mpsub;
1536
1537 os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
1538 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);
1539
1540 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1541
1542 mptcp_sched_create_subflows(mpte);
1543
1544 if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
1545 mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
1546 viable != NULL) {
1547 *viable = 1;
1548 }
1549
1550 out:
1551 socket_unlock(so, 1);
1552 }
1553
1554 /*
1555 * Create an MPTCP subflow socket.
1556 */
1557 static int
1558 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1559 struct socket **so)
1560 {
1561 lck_mtx_t *subflow_mtx;
1562 struct mptopt smpo, *mpo, *tmpo;
1563 struct proc *p;
1564 struct socket *mp_so;
1565 int error;
1566
1567 *so = NULL;
1568
1569 mp_so = mptetoso(mpte);
1570
1571 p = proc_find(mp_so->last_pid);
1572 if (p == PROC_NULL) {
1573 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1574 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1575
1576 mptcp_subflow_free(mpts);
1577 return ESRCH;
1578 }
1579
1580 /*
1581 * Create the subflow socket (multipath subflow, non-blocking.)
1582 *
1583 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1584 * socket; it will be cleared when the socket is peeled off or closed.
1585 * It also indicates to the underlying TCP to handle MPTCP options.
1586 * A multipath subflow socket implies SS_NOFDREF state.
1587 */
1588
1589 /*
1590 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1591 * the ipi-lock. We cannot hold the socket-lock at that point.
1592 */
1593 socket_unlock(mp_so, 0);
1594 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1595 SOCF_MPTCP, PROC_NULL);
1596 socket_lock(mp_so, 0);
1597 if (error) {
1598 os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1599 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1600
1601 proc_rele(p);
1602
1603 mptcp_subflow_free(mpts);
1604 return error;
1605 }
1606
1607 /*
1608 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1609 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1610 * That is also why we need to get the lock with pr_getlock, as after
1611 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1612 */
1613 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1614 lck_mtx_lock(subflow_mtx);
1615
1616 /*
1617 * Must be the first thing we do, to make sure all pointers for this
1618 * subflow are set.
1619 */
1620 mptcp_subflow_attach(mpte, mpts, *so);
1621
1622 /*
1623 * A multipath subflow socket is used internally in the kernel,
1624 * therefore it does not have a file descriptor associated by
1625 * default.
1626 */
1627 (*so)->so_state |= SS_NOFDREF;
1628
1629 lck_mtx_unlock(subflow_mtx);
1630
1631 /* prevent the socket buffers from being compressed */
1632 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1633 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1634
1635 /* Inherit preconnect and TFO data flags */
1636 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1637 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1638 }
1639 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1640 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1641 }
1642
1643 /* Inherit uuid and create the related flow. */
1644 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1645 struct mptcb *mp_tp = mpte->mpte_mptcb;
1646
1647 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1648
1649 /*
1650 * A note on the unlock: With MPTCP, we call
1651 * necp_client_register_socket_flow multiple times. This is problematic,
1652 * because now the lock-ordering guarantee (first necp-locks,
1653 * then socket-locks) is no more respected. So, we need to
1654 * unlock here.
1655 */
1656 socket_unlock(mp_so, 0);
1657 error = necp_client_register_socket_flow(mp_so->last_pid,
1658 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1659 socket_lock(mp_so, 0);
1660
1661 if (error) {
1662 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1663 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1664
1665 goto out_err;
1666 }
1667
1668 /* Possible state-change during the unlock above */
1669 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1670 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1671 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1672 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1673 mp_tp->mpt_state, mp_tp->mpt_flags);
1674
1675 error = EINVAL;
1676 goto out_err;
1677 }
1678
1679 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1680 }
1681
1682 /* Needs to happen prior to the delegation! */
1683 (*so)->last_pid = mp_so->last_pid;
1684
1685 if (mp_so->so_flags & SOF_DELEGATED) {
1686 if (mpte->mpte_epid) {
1687 error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1688 if (error) {
1689 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1690 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1691 goto out_err;
1692 }
1693 }
1694 if (!uuid_is_null(mpte->mpte_euuid)) {
1695 error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1696 if (error) {
1697 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1698 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1699 goto out_err;
1700 }
1701 }
1702 }
1703
1704 /* inherit the other socket options */
1705 bzero(&smpo, sizeof(smpo));
1706 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1707 smpo.mpo_level = SOL_SOCKET;
1708 smpo.mpo_intval = 1;
1709
1710 /* disable SIGPIPE */
1711 smpo.mpo_name = SO_NOSIGPIPE;
1712 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1713 goto out_err;
1714 }
1715
1716 /* find out if the subflow's source address goes away */
1717 smpo.mpo_name = SO_NOADDRERR;
1718 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1719 goto out_err;
1720 }
1721
1722 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1723 /*
1724 * On secondary subflows we might need to set the cell-fallback
1725 * flag (see conditions in mptcp_subflow_sosetopt).
1726 */
1727 smpo.mpo_level = SOL_SOCKET;
1728 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1729 smpo.mpo_intval = 1;
1730 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1731 goto out_err;
1732 }
1733 }
1734
1735 /* replay setsockopt(2) on the subflow sockets for eligible options */
1736 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1737 int interim;
1738
1739 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1740 continue;
1741 }
1742
1743 /*
1744 * Skip those that are handled internally; these options
1745 * should not have been recorded and marked with the
1746 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1747 */
1748 if (mpo->mpo_level == SOL_SOCKET &&
1749 (mpo->mpo_name == SO_NOSIGPIPE ||
1750 mpo->mpo_name == SO_NOADDRERR ||
1751 mpo->mpo_name == SO_KEEPALIVE)) {
1752 continue;
1753 }
1754
1755 interim = (mpo->mpo_flags & MPOF_INTERIM);
1756 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1757 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1758 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1759 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1760 mpo->mpo_intval);
1761 mptcp_sopt_remove(mpte, mpo);
1762 mptcp_sopt_free(mpo);
1763 continue;
1764 }
1765 }
1766
1767 /*
1768 * We need to receive everything that the subflow socket has,
1769 * so use a customized socket receive function. We will undo
1770 * this when the socket is peeled off or closed.
1771 */
1772 switch (dom) {
1773 case PF_INET:
1774 (*so)->so_proto = &mptcp_subflow_protosw;
1775 break;
1776 case PF_INET6:
1777 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1778 break;
1779 default:
1780 VERIFY(0);
1781 /* NOTREACHED */
1782 }
1783
1784 proc_rele(p);
1785
1786 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1787 int, dom, int, error);
1788
1789 return 0;
1790
1791 out_err:
1792 mptcp_subflow_abort(mpts, error);
1793
1794 proc_rele(p);
1795
1796 return error;
1797 }
1798
1799 /*
1800 * Close an MPTCP subflow socket.
1801 *
1802 * Note that this may be called on an embryonic subflow, and the only
1803 * thing that is guaranteed valid is the protocol-user request.
1804 */
1805 static void
1806 mptcp_subflow_soclose(struct mptsub *mpts)
1807 {
1808 struct socket *so = mpts->mpts_socket;
1809
1810 if (mpts->mpts_flags & MPTSF_CLOSED) {
1811 return;
1812 }
1813
1814 VERIFY(so != NULL);
1815 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1816 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1817
1818 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1819 struct socket *, so,
1820 struct sockbuf *, &so->so_rcv,
1821 struct sockbuf *, &so->so_snd,
1822 struct mptses *, mpts->mpts_mpte);
1823
1824 mpts->mpts_flags |= MPTSF_CLOSED;
1825
1826 if (so->so_retaincnt == 0) {
1827 soclose_locked(so);
1828
1829 return;
1830 } else {
1831 VERIFY(so->so_usecount > 0);
1832 so->so_usecount--;
1833 }
1834
1835 return;
1836 }
1837
1838 /*
1839 * Connect an MPTCP subflow socket.
1840 *
1841 * Note that in the pending connect case, the subflow socket may have been
1842 * bound to an interface and/or a source IP address which may no longer be
1843 * around by the time this routine is called; in that case the connect attempt
1844 * will most likely fail.
1845 */
1846 static int
1847 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1848 {
1849 char dbuf[MAX_IPv6_STR_LEN];
1850 struct socket *mp_so, *so;
1851 struct mptcb *mp_tp;
1852 struct sockaddr *dst;
1853 struct proc *p;
1854 int af, error, dport;
1855
1856 mp_so = mptetoso(mpte);
1857 mp_tp = mpte->mpte_mptcb;
1858 so = mpts->mpts_socket;
1859 af = mpts->mpts_dst.sa_family;
1860 dst = &mpts->mpts_dst;
1861
1862 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1863 VERIFY(mpts->mpts_socket != NULL);
1864 VERIFY(af == AF_INET || af == AF_INET6);
1865
1866 if (af == AF_INET) {
1867 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1868 dport = ntohs(SIN(dst)->sin_port);
1869 } else {
1870 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1871 dport = ntohs(SIN6(dst)->sin6_port);
1872 }
1873
1874 os_log(mptcp_log_handle,
1875 "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1876 mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1877
1878 p = proc_find(mp_so->last_pid);
1879 if (p == PROC_NULL) {
1880 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1881 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1882
1883 return ESRCH;
1884 }
1885
1886 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1887
1888 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1889
1890 /* connect the subflow socket */
1891 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1892 p, mpts->mpts_ifscope,
1893 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1894
1895 mpts->mpts_iss = sototcpcb(so)->iss;
1896
1897 /* See tcp_connect_complete */
1898 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1899 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1900 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1901 }
1902
1903 /* Allocate a unique address id per subflow */
1904 mpte->mpte_addrid_last++;
1905 if (mpte->mpte_addrid_last == 0) {
1906 mpte->mpte_addrid_last++;
1907 }
1908
1909 proc_rele(p);
1910
1911 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1912 struct mptsub *, mpts, int, error);
1913 if (error) {
1914 os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1915 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
1916 }
1917
1918 return error;
1919 }
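/*
 * Illustrative userland sketch (not part of the build): the routine
 * above is what services connectx(2) on a PF_MULTIPATH socket.
 * Assuming `dst` is a filled-in struct sockaddr_in (<sys/socket.h> and
 * <err.h> included, error handling trimmed), a first subflow can be
 * requested roughly like this:
 *
 *	sa_endpoints_t sae = {
 *		.sae_dstaddr = (struct sockaddr *)&dst,
 *		.sae_dstaddrlen = sizeof(dst),
 *	};
 *	sae_connid_t cid = SAE_CONNID_ANY;
 *	int mp_fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	if (connectx(mp_fd, &sae, SAE_ASSOCID_ANY, 0,
 *	    NULL, 0, NULL, &cid) < 0)
 *		err(1, "connectx");
 */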
1920
1921 static int
1922 mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
1923 uint32_t rseq, uint16_t dlen)
1924 {
1925 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
1926
1927 if (m_pktlen(m) == 0) {
1928 return 0;
1929 }
1930
1931 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1932 if (off && (dsn != m->m_pkthdr.mp_dsn ||
1933 rseq != m->m_pkthdr.mp_rseq ||
1934 dlen != m->m_pkthdr.mp_rlen)) {
1935 os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u, %u - %u, %u - %u\n",
1936 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
1937 (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
1938 rseq, m->m_pkthdr.mp_rseq,
1939 dlen, m->m_pkthdr.mp_rlen);
1940
1941 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1942 return -1;
1943 }
1944 m->m_pkthdr.mp_dsn += off;
1945 m->m_pkthdr.mp_rseq += off;
1946
1947 VERIFY(m_pktlen(m) < UINT16_MAX);
1948 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
1949 } else {
1950 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
1951 /* data arrived without a DSS option mapping */
1952
1953 /* initial subflow can fallback right after SYN handshake */
1954 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
1955 mptcp_notify_mpfail(so);
1956 } else {
1957 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1958
1959 return -1;
1960 }
1961 } else if (m->m_flags & M_PKTHDR) {
1962 /* We need to fake the DATA-mapping */
1963 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
1964 m->m_pkthdr.mp_dsn = dsn + off;
1965 m->m_pkthdr.mp_rseq = rseq + off;
1966
1967 VERIFY(m_pktlen(m) < UINT16_MAX);
1968 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
1969 }
1970 }
1971
1972 mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
1973
1974 return 0;
1975 }
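/*
 * Worked example (illustrative) for the re-mapping above: if a DSS
 * mapping covered dsn = 1000, rseq = 50, dlen = 100 and TCP already
 * consumed the first 40 bytes (off = 40), the remaining mbuf is
 * re-stamped as
 *
 *	mp_dsn  = 1000 + 40 = 1040
 *	mp_rseq =   50 + 40 =   90
 *	mp_rlen = m_pktlen(m)	(only the bytes actually left)
 */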
1976
1977 /*
1978 * MPTCP subflow socket receive routine, derived from soreceive().
1979 */
1980 static int
1981 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1982 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1983 {
1984 #pragma unused(uio)
1985 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1986 int flags, error = 0;
1987 struct proc *p = current_proc();
1988 struct mbuf *m, **mp = mp0;
1989 boolean_t proc_held = FALSE;
1990
1991 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1992
1993 #ifdef MORE_LOCKING_DEBUG
1994 if (so->so_usecount == 1) {
1995 panic("%s: so=%x no other reference on socket\n", __func__, so);
1996 /* NOTREACHED */
1997 }
1998 #endif
1999 /*
2000 * We return all that is there in the subflow's socket receive buffer
2001 * to the MPTCP layer, so we require that the caller passes in the
2002 * expected parameters.
2003 */
2004 if (mp == NULL || controlp != NULL) {
2005 return EINVAL;
2006 }
2007
2008 *mp = NULL;
2009 if (psa != NULL) {
2010 *psa = NULL;
2011 }
2012 if (flagsp != NULL) {
2013 flags = *flagsp & ~MSG_EOR;
2014 } else {
2015 flags = 0;
2016 }
2017
2018 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2019 return EOPNOTSUPP;
2020 }
2021
2022 flags |= (MSG_DONTWAIT | MSG_NBIO);
2023
2024 /*
2025 * If a recv attempt is made on a previously-accepted socket
2026 * that has been marked as inactive (disconnected), reject
2027 * the request.
2028 */
2029 if (so->so_flags & SOF_DEFUNCT) {
2030 struct sockbuf *sb = &so->so_rcv;
2031
2032 error = ENOTCONN;
2033 /*
2034 * This socket should have been disconnected and flushed
2035 * prior to being returned from sodefunct(); there should
2036 * be no data on its receive list, so panic otherwise.
2037 */
2038 if (so->so_state & SS_DEFUNCT) {
2039 sb_empty_assert(sb, __func__);
2040 }
2041 return error;
2042 }
2043
2044 /*
2045 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2046 * and if so just return to the caller. This could happen when
2047 * soreceive() is called by a socket upcall function during the
2048 * time the socket is freed. The socket buffer would have been
2049 * locked across the upcall, therefore we cannot put this thread
2050 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2051 * we may livelock), because the lock on the socket buffer will
2052 * only be released when the upcall routine returns to its caller.
2053 * Because the socket has been officially closed, there can be
2054 * no further read on it.
2055 *
2056 * A multipath subflow socket would have its SS_NOFDREF set by
2057 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2058 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2059 */
2060 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2061 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2062 return 0;
2063 }
2064
2065 /*
2066 * For consistency with soreceive() semantics, we need to obey
2067 * SB_LOCK in case some other code path has locked the buffer.
2068 */
2069 error = sblock(&so->so_rcv, 0);
2070 if (error != 0) {
2071 return error;
2072 }
2073
2074 m = so->so_rcv.sb_mb;
2075 if (m == NULL) {
2076 /*
2077 * Panic if we notice inconsistencies in the socket's
2078 * receive list; both sb_mb and sb_cc should correctly
2079 * reflect the contents of the list, otherwise we may
2080 * end up with false positives during select() or poll()
2081 * which could put the application in a bad state.
2082 */
2083 SB_MB_CHECK(&so->so_rcv);
2084
2085 if (so->so_error != 0) {
2086 error = so->so_error;
2087 so->so_error = 0;
2088 goto release;
2089 }
2090
2091 if (so->so_state & SS_CANTRCVMORE) {
2092 goto release;
2093 }
2094
2095 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2096 error = ENOTCONN;
2097 goto release;
2098 }
2099
2100 /*
2101 * MSG_DONTWAIT is implicitly defined and this routine will
2102 * never block, so return EWOULDBLOCK when there is nothing.
2103 */
2104 error = EWOULDBLOCK;
2105 goto release;
2106 }
2107
2108 mptcp_update_last_owner(so, mp_so);
2109
2110 if (mp_so->last_pid != proc_pid(p)) {
2111 p = proc_find(mp_so->last_pid);
2112 if (p == PROC_NULL) {
2113 p = current_proc();
2114 } else {
2115 proc_held = TRUE;
2116 }
2117 }
2118
2119 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2120 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2121 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2122
2123 while (m != NULL) {
2124 int dlen = 0, dfin = 0, error_out = 0;
2125 struct mbuf *start = m;
2126 uint64_t dsn;
2127 uint32_t sseq;
2128 uint16_t orig_dlen;
2129 uint16_t csum;
2130
2131 VERIFY(m->m_nextpkt == NULL);
2132
2133 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2134 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2135 dsn = m->m_pkthdr.mp_dsn;
2136 sseq = m->m_pkthdr.mp_rseq;
2137 csum = m->m_pkthdr.mp_csum;
2138 } else {
2139 /* We did fallback */
2140 if (mptcp_adj_rmap(so, m, 0, 0, 0, 0)) {
2141 error = EIO;
2142 *mp0 = NULL;
2143 goto release;
2144 }
2145
2146 sbfree(&so->so_rcv, m);
2147
2148 if (mp != NULL) {
2149 *mp = m;
2150 mp = &m->m_next;
2151 so->so_rcv.sb_mb = m = m->m_next;
2152 *mp = NULL;
2153 }
2154
2155 if (m != NULL) {
2156 so->so_rcv.sb_lastrecord = m;
2157 } else {
2158 SB_EMPTY_FIXUP(&so->so_rcv);
2159 }
2160
2161 continue;
2162 }
2163
2164 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2165 dfin = 1;
2166 }
2167
2168 /*
2169 * Check if the full mapping is now present
2170 */
2171 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
2172 mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
2173 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
2174 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
2175
2176 if (*mp0 == NULL) {
2177 error = EWOULDBLOCK;
2178 }
2179 goto release;
2180 }
2181
2182 /* Now, get the full mapping */
2183 while (dlen > 0) {
2184 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
2185 error_out = 1;
2186 error = EIO;
2187 dlen = 0;
2188 *mp0 = NULL;
2189 break;
2190 }
2191
2192 dlen -= m->m_len;
2193 sbfree(&so->so_rcv, m);
2194
2195 if (mp != NULL) {
2196 *mp = m;
2197 mp = &m->m_next;
2198 so->so_rcv.sb_mb = m = m->m_next;
2199 *mp = NULL;
2200 }
2201
2202 if (dlen - dfin == 0) {
2203 dlen = 0;
2204 }
2205
2206 VERIFY(dlen <= 0 || m);
2207 }
2208
2209 VERIFY(dlen == 0);
2210
2211 if (m != NULL) {
2212 so->so_rcv.sb_lastrecord = m;
2213 } else {
2214 SB_EMPTY_FIXUP(&so->so_rcv);
2215 }
2216
2217 if (error_out) {
2218 goto release;
2219 }
2220
2221 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
2222 error = EIO;
2223 *mp0 = NULL;
2224 goto release;
2225 }
2226
2227 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2228 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2229 }
2230
2231 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2232 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2233
2234 if (flagsp != NULL) {
2235 *flagsp |= flags;
2236 }
2237
2238 release:
2239 sbunlock(&so->so_rcv, TRUE);
2240
2241 if (proc_held) {
2242 proc_rele(p);
2243 }
2244
2245 return error;
2246 }
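/*
 * Illustrative note: the loop above only hands complete DSS mappings up
 * to the MPTCP layer.  With a mapping of dlen = 1000 but only
 * sb_cc = 700 bytes in the receive buffer, the
 * "(int)so->so_rcv.sb_cc < dlen - dfin" check fails the read with
 * EWOULDBLOCK (if nothing was read yet) until the remaining 300 bytes
 * arrive, so reassembly never sees a torn mapping.
 */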
2247
2248 /*
2249 * MPTCP subflow socket send routine, derived from sosend().
2250 */
2251 static int
2252 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2253 struct mbuf *top, struct mbuf *control, int flags)
2254 {
2255 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2256 struct proc *p = current_proc();
2257 boolean_t en_tracing = FALSE, proc_held = FALSE;
2258 int en_tracing_val;
2259 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2260 int error;
2261
2262 VERIFY(control == NULL);
2263 VERIFY(addr == NULL);
2264 VERIFY(uio == NULL);
2265 VERIFY(flags == 0);
2266 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2267
2268 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2269 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2270
2271 /*
2272 * trace if tracing is enabled & this is a network (vs. unix)
2273 * socket & it is non-loopback
2274 */
2275 if (ENTR_SHOULDTRACE &&
2276 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2277 struct inpcb *inp = sotoinpcb(so);
2278 if (inp->inp_last_outifp != NULL &&
2279 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2280 en_tracing = TRUE;
2281 en_tracing_val = top->m_pkthdr.len;
2282 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2283 (unsigned long)VM_KERNEL_ADDRPERM(so),
2284 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2285 (int64_t)en_tracing_val);
2286 }
2287 }
2288
2289 mptcp_update_last_owner(so, mp_so);
2290
2291 if (mp_so->last_pid != proc_pid(p)) {
2292 p = proc_find(mp_so->last_pid);
2293 if (p == PROC_NULL) {
2294 p = current_proc();
2295 } else {
2296 proc_held = TRUE;
2297 }
2298 }
2299
2300 #if NECP
2301 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2302 #endif /* NECP */
2303
2304 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2305
2306 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
2307 if (error) {
2308 goto out;
2309 }
2310
2311 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2312 top = NULL;
2313
2314 out:
2315 if (top != NULL) {
2316 m_freem(top);
2317 }
2318
2319 if (proc_held) {
2320 proc_rele(p);
2321 }
2322
2323 soclearfastopen(so);
2324
2325 if (en_tracing) {
2326 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2327 (unsigned long)VM_KERNEL_ADDRPERM(so),
2328 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2329 (int64_t)en_tracing_val);
2330 }
2331
2332 return error;
2333 }
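/*
 * Illustrative note: unlike the generic sosend(), this routine accepts
 * exactly one pre-built mbuf chain per call and requires it to carry a
 * single DSS mapping (1 <= len <= UINT16_MAX, PKTF_MPTCP set).  It is
 * reached through the subflow protosw installed in
 * mptcp_subflow_socreate() when mptcp_subflow_output() calls
 * sock_sendmbuf().
 */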
2334
2335 /*
2336 * Establish an initial MPTCP connection (if first subflow and not yet
2337 * connected), or add a subflow to an existing MPTCP connection.
2338 */
2339 int
2340 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2341 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2342 {
2343 struct socket *mp_so, *so = NULL;
2344 struct mptcb *mp_tp;
2345 struct mptsub *mpts = NULL;
2346 int af, error = 0;
2347
2348 mp_so = mptetoso(mpte);
2349 mp_tp = mpte->mpte_mptcb;
2350
2351 socket_lock_assert_owned(mp_so);
2352
2353 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2354 /* If the remote end sends Data FIN, refuse subflow adds */
2355 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2356 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2357 error = ENOTCONN;
2358 goto out_err;
2359 }
2360
2361 if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
2362 error = EOVERFLOW;
2363 goto out_err;
2364 }
2365
2366 mpts = mptcp_subflow_alloc();
2367 if (mpts == NULL) {
2368 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2369 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2370 error = ENOMEM;
2371 goto out_err;
2372 }
2373
2374 if (src) {
2375 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2376 error = EAFNOSUPPORT;
2377 goto out_err;
2378 }
2379
2380 if (src->sa_family == AF_INET &&
2381 src->sa_len != sizeof(struct sockaddr_in)) {
2382 error = EINVAL;
2383 goto out_err;
2384 }
2385
2386 if (src->sa_family == AF_INET6 &&
2387 src->sa_len != sizeof(struct sockaddr_in6)) {
2388 error = EINVAL;
2389 goto out_err;
2390 }
2391
2392 MALLOC(mpts->mpts_src, struct sockaddr *, src->sa_len, M_SONAME,
2393 M_WAITOK | M_ZERO);
2394 if (mpts->mpts_src == NULL) {
2395 error = ENOMEM;
2396 goto out_err;
2397 }
2398 bcopy(src, mpts->mpts_src, src->sa_len);
2399 }
2400
2401 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2402 error = EAFNOSUPPORT;
2403 goto out_err;
2404 }
2405
2406 if (dst->sa_family == AF_INET &&
2407 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2408 error = EINVAL;
2409 goto out_err;
2410 }
2411
2412 if (dst->sa_family == AF_INET6 &&
2413 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2414 error = EINVAL;
2415 goto out_err;
2416 }
2417
2418 memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
2419
2420 af = mpts->mpts_dst.sa_family;
2421
2422 ifnet_head_lock_shared();
2423 if (ifscope > (unsigned)if_index) {
2424 ifnet_head_done();
2425 error = ENXIO;
2426 goto out_err;
2427 }
2428 ifnet_head_done();
2429
2430 mpts->mpts_ifscope = ifscope;
2431
2432 /* create the subflow socket */
2433 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2434 /*
2435 * Return (error) without cleaning up, because up to here
2436 * all we did was create mpts.
2437 *
2438 * The contract is that the call to mptcp_subflow_socreate()
2439 * moves ownership of mpts to mptcp_subflow_socreate().
2440 */
2441 return error;
2442 }
2443
2444 /*
2445 * We may be called from within the kernel. Still need to attribute
2446 * this socket to the real app.
2447 */
2448 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2449
2450 /*
2451 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2452 * -1 (SAE_CONNID_ALL).
2453 */
2454 mpte->mpte_connid_last++;
2455 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2456 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2457 mpte->mpte_connid_last++;
2458 }
2459
2460 mpts->mpts_connid = mpte->mpte_connid_last;
2461
2462 mpts->mpts_rel_seq = 1;
2463
2464 /* Allocate a unique address id per subflow */
2465 mpte->mpte_addrid_last++;
2466 if (mpte->mpte_addrid_last == 0) {
2467 mpte->mpte_addrid_last++;
2468 }
2469
2470 /* register for subflow socket read/write events */
2471 sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
2472
2473 /* Register for subflow socket control events */
2474 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2475 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2476 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2477 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2478 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2479 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2480 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2481 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2482
2483 /* sanity check */
2484 VERIFY(!(mpts->mpts_flags &
2485 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2486
2487 /*
2488 * Indicate to the TCP subflow whether or not it should establish
2489 * the initial MPTCP connection, or join an existing one. Fill
2490 * in the connection request structure with additional info needed
2491 * by the underlying TCP (to be used in the TCP options, etc.)
2492 */
2493 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2494 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2495
2496 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2497 mptcp_init_local_parms(mpte);
2498 }
2499 soisconnecting(mp_so);
2500
2501 /* If fastopen is requested, set state in mpts */
2502 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2503 mpts->mpts_flags |= MPTSF_TFO_REQD;
2504 }
2505 } else {
2506 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2507 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2508 }
2509 }
2510
2511 mpts->mpts_flags |= MPTSF_CONNECTING;
2512
2513 /* connect right away if first attempt, or if join can be done now */
2514 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2515 error = mptcp_subflow_soconnectx(mpte, mpts);
2516 }
2517
2518 if (error) {
2519 goto out_err_close;
2520 }
2521
2522 if (pcid) {
2523 *pcid = mpts->mpts_connid;
2524 }
2525
2526 return 0;
2527
2528 out_err_close:
2529 mptcp_subflow_abort(mpts, error);
2530
2531 return error;
2532
2533 out_err:
2534 if (mpts) {
2535 mptcp_subflow_free(mpts);
2536 }
2537
2538 return error;
2539 }
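/*
 * Illustrative sketch of the connection-ID allocation above, written as
 * a hypothetical helper; SAE_CONNID_ANY (0) and SAE_CONNID_ALL (-1) are
 * reserved and must never be handed out:
 *
 *	static sae_connid_t
 *	next_connid(sae_connid_t last)
 *	{
 *		last++;
 *		while (last == SAE_CONNID_ANY || last == SAE_CONNID_ALL)
 *			last++;
 *		return last;
 *	}
 */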
2540
2541 void
2542 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2543 {
2544 int index = mptcpstats_get_index(stats, mpts);
2545
2546 if (index != -1) {
2547 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2548
2549 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2550 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2551
2552 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2553 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2554
2555 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2556 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2557
2558 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2559 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2560 }
2561 }
2562
2563 /*
2564 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
2565 * will no longer be accessible after a subflow is deleted, thus this
2566 * should occur only after the subflow socket has been disconnected.
2567 */
2568 void
2569 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2570 {
2571 struct socket *mp_so = mptetoso(mpte);
2572 struct socket *so = mpts->mpts_socket;
2573 struct tcpcb *tp = sototcpcb(so);
2574
2575 socket_lock_assert_owned(mp_so);
2576 VERIFY(mpts->mpts_mpte == mpte);
2577 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2578 VERIFY(mpte->mpte_numflows != 0);
2579 VERIFY(mp_so->so_usecount > 0);
2580
2581 mptcpstats_update(mpte->mpte_itfstats, mpts);
2582
2583 mptcp_unset_cellicon(mpte, mpts, 1);
2584
2585 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2586 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2587
2588 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2589 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2590 mpte->mpte_numflows--;
2591 if (mpte->mpte_active_sub == mpts) {
2592 mpte->mpte_active_sub = NULL;
2593 }
2594
2595 /*
2596 * Drop references held by this subflow socket; there
2597 * will be no further upcalls made from this point.
2598 */
2599 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2600 sock_catchevents_locked(so, NULL, NULL, 0);
2601
2602 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2603
2604 mp_so->so_usecount--; /* for subflow socket */
2605 mpts->mpts_mpte = NULL;
2606 mpts->mpts_socket = NULL;
2607
2608 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2609 mptcp_subflow_remref(mpts); /* for subflow socket */
2610
2611 so->so_flags &= ~SOF_MP_SUBFLOW;
2612 tp->t_mptcb = NULL;
2613 tp->t_mpsub = NULL;
2614 }
2615
2616 void
2617 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2618 {
2619 struct socket *so = mpts->mpts_socket;
2620 struct mptcb *mp_tp = mpte->mpte_mptcb;
2621 int send_dfin = 0;
2622
2623 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2624 send_dfin = 1;
2625 }
2626
2627 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2628 (so->so_state & SS_ISCONNECTED)) {
2629 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2630 __func__, mpts->mpts_connid, send_dfin),
2631 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2632
2633 if (send_dfin) {
2634 mptcp_send_dfin(so);
2635 }
2636 soshutdownlock(so, SHUT_WR);
2637 }
2638 }
2639
2640 static void
2641 mptcp_subflow_abort(struct mptsub *mpts, int error)
2642 {
2643 struct socket *so = mpts->mpts_socket;
2644 struct tcpcb *tp = sototcpcb(so);
2645
2646 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2647 return;
2648 }
2649
2650 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2651 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2652
2653 if (tp->t_state != TCPS_CLOSED) {
2654 tcp_drop(tp, error);
2655 }
2656
2657 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2658 }
2659
2660 /*
2661 * Disconnect a subflow socket.
2662 */
2663 void
2664 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2665 {
2666 struct socket *so, *mp_so;
2667 struct mptcb *mp_tp;
2668 int send_dfin = 0;
2669
2670 so = mpts->mpts_socket;
2671 mp_tp = mpte->mpte_mptcb;
2672 mp_so = mptetoso(mpte);
2673
2674 socket_lock_assert_owned(mp_so);
2675
2676 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2677 return;
2678 }
2679
2680 mptcp_unset_cellicon(mpte, mpts, 1);
2681
2682 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2683
2684 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2685 send_dfin = 1;
2686 }
2687
2688 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2689 (so->so_state & SS_ISCONNECTED)) {
2690 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2691 __func__, mpts->mpts_connid, send_dfin),
2692 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2693
2694 if (send_dfin) {
2695 mptcp_send_dfin(so);
2696 }
2697
2698 if (mp_so->so_flags & SOF_DEFUNCT) {
2699 errno_t ret;
2700
2701 ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2702 if (ret == 0) {
2703 ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2704
2705 if (ret != 0) {
2706 os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2707 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2708 }
2709 } else {
2710 os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2711 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2712 }
2713 } else {
2714 (void) soshutdownlock(so, SHUT_RD);
2715 (void) soshutdownlock(so, SHUT_WR);
2716 (void) sodisconnectlocked(so);
2717 }
2718 }
2719
2720 /*
2721 * Generate a disconnect event for this subflow socket, in case
2722 * the lower layer doesn't do it; this is needed because the
2723 * subflow socket deletion relies on it.
2724 */
2725 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2726 }
2727
2728 /*
2729 * Subflow socket input.
2730 */
2731 static void
2732 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2733 {
2734 struct socket *mp_so = mptetoso(mpte);
2735 struct mbuf *m = NULL;
2736 struct socket *so;
2737 int error, wakeup = 0;
2738
2739 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2740 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2741
2742 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2743 struct mptsub *, mpts);
2744
2745 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2746 goto out;
2747 }
2748
2749 so = mpts->mpts_socket;
2750
2751 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2752 if (error != 0 && error != EWOULDBLOCK) {
2753 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2754 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2755 if (error == ENODATA) {
2756 /*
2757 * Don't ignore ENODATA; propagating it is how
2758 * we discover nasty middleboxes.
2759 */
2760 mp_so->so_error = ENODATA;
2761
2762 wakeup = 1;
2763 goto out;
2764 }
2765 } else if (error == 0) {
2766 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
2767 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2768 }
2769
2770 /* In fallback, make sure to accept data on all but one subflow */
2771 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2772 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2773 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2774 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2775 m_freem(m);
2776 goto out;
2777 }
2778
2779 if (m != NULL) {
2780 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2781 mptcp_set_cellicon(mpte, mpts);
2782
2783 mpte->mpte_used_cell = 1;
2784 } else {
2785 /*
2786 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2787 * explicitly set the cellicon, then we unset it again.
2788 */
2789 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2790 mptcp_unset_cellicon(mpte, NULL, 1);
2791 }
2792
2793 mpte->mpte_used_wifi = 1;
2794 }
2795
2796 mptcp_input(mpte, m);
2797 }
2798
2799 out:
2800 if (wakeup) {
2801 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2802 }
2803
2804 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2805 }
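/*
 * Illustrative note on the cell-icon handling above:
 * mpte_last_cellicon_set is a tcp_now timestamp.  Once
 * MPTCP_CELLICON_TOGGLE_RATE ticks pass without cellular traffic,
 * TSTMP_LT(last_set + rate, tcp_now) becomes true and the icon is
 * cleared; any data received over cellular re-arms it via
 * mptcp_set_cellicon().
 */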
2806
2807 void
2808 mptcp_handle_input(struct socket *so)
2809 {
2810 struct mptsub *mpts, *tmpts;
2811 struct mptses *mpte;
2812
2813 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2814 return;
2815 }
2816
2817 mpts = sototcpcb(so)->t_mpsub;
2818 mpte = mpts->mpts_mpte;
2819
2820 socket_lock_assert_owned(mptetoso(mpte));
2821
2822 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2823 if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
2824 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2825 }
2826 return;
2827 }
2828
2829 mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
2830 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2831 if (mpts->mpts_socket->so_usecount == 0) {
2832 /* Will be removed soon by tcp_garbage_collect */
2833 continue;
2834 }
2835
2836 mptcp_subflow_addref(mpts);
2837 mpts->mpts_socket->so_usecount++;
2838
2839 mptcp_subflow_input(mpte, mpts);
2840
2841 mptcp_subflow_remref(mpts); /* ours */
2842
2843 VERIFY(mpts->mpts_socket->so_usecount != 0);
2844 mpts->mpts_socket->so_usecount--;
2845 }
2846
2847 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
2848 }
2849
2850 /*
2851 * Subflow socket write upcall.
2852 *
2853 * Called when the associated subflow socket posted a write event.
2854 */
2855 static void
2856 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2857 {
2858 #pragma unused(so, waitf)
2859 struct mptsub *mpts = arg;
2860 struct mptses *mpte = mpts->mpts_mpte;
2861
2862 VERIFY(mpte != NULL);
2863
2864 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2865 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2866 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2867 }
2868 return;
2869 }
2870
2871 mptcp_output(mpte);
2872 }
2873
2874 static boolean_t
2875 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2876 {
2877 struct mbuf *so_m = so->so_snd.sb_mb;
2878 uint64_t dsn = m->m_pkthdr.mp_dsn;
2879
2880 while (so_m) {
2881 VERIFY(so_m->m_flags & M_PKTHDR);
2882 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2883
2884 /* Part of the segment is covered, don't reinject here */
2885 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2886 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2887 return TRUE;
2888 }
2889
2890 so_m = so_m->m_next;
2891 }
2892
2893 return FALSE;
2894 }
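/*
 * Worked example (illustrative): the check above treats each queued
 * mapping as the half-open interval [mp_dsn, mp_dsn + mp_rlen).  For a
 * mapping with mp_dsn = 100 and mp_rlen = 50, a candidate dsn of 100 or
 * 149 returns TRUE (still covered by this subflow's send buffer), while
 * dsn = 150 returns FALSE and the segment may be reinjected elsewhere.
 */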
2895
2896 /*
2897 * Subflow socket output.
2898 *
2899 * Called for sending data from MPTCP to the underlying subflow socket.
2900 */
2901 int
2902 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2903 {
2904 struct mptcb *mp_tp = mpte->mpte_mptcb;
2905 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2906 struct socket *mp_so, *so;
2907 struct tcpcb *tp;
2908 uint64_t mpt_dsn = 0, off = 0;
2909 int sb_cc = 0, error = 0, wakeup = 0;
2910 uint16_t dss_csum;
2911 uint16_t tot_sent = 0;
2912 boolean_t reinjected = FALSE;
2913
2914 mp_so = mptetoso(mpte);
2915 so = mpts->mpts_socket;
2916 tp = sototcpcb(so);
2917
2918 socket_lock_assert_owned(mp_so);
2919
2920 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2921 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
2922
2923 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2924 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2925 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2926 (mpts->mpts_flags & MPTSF_TFO_REQD));
2927 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
2928
2929 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2930 __func__, mpts->mpts_flags, mpte->mpte_flags,
2931 mptcp_subflow_cwnd_space(so)),
2932 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2933 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2934 struct mptsub *, mpts);
2935
2936 /* The Remove Addr option is not sent reliably, as per the I-D */
2937 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
2938 tp->t_rem_aid = mpte->mpte_lost_aid;
2939 tp->t_mpflags |= TMPF_SND_REM_ADDR;
2940 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2941 }
2942
2943 /*
2944 * The mbuf chains containing the metadata (as well as pointing to
2945 * the user data sitting at the MPTCP output queue) would then be
2946 * sent down to the subflow socket.
2947 *
2948 * Some notes on data sequencing:
2949 *
2950 * a. Each mbuf must be a M_PKTHDR.
2951 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2952 * in the mbuf pkthdr structure.
2953 * c. Each mbuf containing the MPTCP metadata must have its
2954 * pkt_flags marked with the PKTF_MPTCP flag.
2955 */
2956
2957 if (mpte->mpte_reinjectq) {
2958 sb_mb = mpte->mpte_reinjectq;
2959 } else {
2960 sb_mb = mp_so->so_snd.sb_mb;
2961 }
2962
2963 if (sb_mb == NULL) {
2964 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
2965 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2966 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
2967 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
2968
2969 /* Fix it to prevent looping */
2970 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
2971 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2972 }
2973 goto out;
2974 }
2975
2976 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
2977
2978 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
2979 !(so->so_state & SS_ISCONNECTED) &&
2980 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2981 tp->t_mpflags |= TMPF_TFO_REQUEST;
2982 goto zero_len_write;
2983 }
2984
2985 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2986
2987 /* First, drop acknowledged data */
2988 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2989 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
2990 "dsn %u suna %u reinject? %u\n",
2991 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
2992 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
2993 if (mpte->mpte_reinjectq) {
2994 mptcp_clean_reinjectq(mpte);
2995 } else {
2996 uint64_t len = 0;
2997 len = mp_tp->mpt_snduna - mpt_dsn;
2998 sbdrop(&mp_so->so_snd, (int)len);
2999 wakeup = 1;
3000 }
3001 }
3002
3003 /* Check again because of above sbdrop */
3004 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3005 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is empty\n",
3006 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3007 goto out;
3008 }
3009
3010 /*
3011 * In degraded mode, we don't receive data acks, so forcibly free
3012 * mbufs below snd_nxt
3013 */
3014 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3015 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3016 mp_so->so_snd.sb_mb) {
3017 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3018 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3019 uint64_t len = 0;
3020 len = mp_tp->mpt_snduna - mpt_dsn;
3021 sbdrop(&mp_so->so_snd, (int)len);
3022 wakeup = 1;
3023
3024 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3025 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3026 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3027 }
3028 }
3029
3030 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3031 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3032 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3033 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3034 }
3035
3036 /*
3037 * Adjust the top level notion of next byte used for retransmissions
3038 * and sending FINs.
3039 */
3040 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3041 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3042 }
3043
3044 /* Now determine the offset from which to start transmitting data */
3045 if (mpte->mpte_reinjectq) {
3046 sb_mb = mpte->mpte_reinjectq;
3047 } else {
3048 dont_reinject:
3049 sb_mb = mp_so->so_snd.sb_mb;
3050 }
3051 if (sb_mb == NULL) {
3052 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3053 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3054 goto out;
3055 }
3056
3057 if (sb_mb == mpte->mpte_reinjectq) {
3058 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3059 off = 0;
3060
3061 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3062 if (mptcp_can_send_more(mp_tp, TRUE)) {
3063 goto dont_reinject;
3064 }
3065
3066 error = ECANCELED;
3067 goto out;
3068 }
3069
3070 reinjected = TRUE;
3071 } else if (flags & MPTCP_SUBOUT_PROBING) {
3072 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3073 off = 0;
3074 } else {
3075 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3076
3077 /*
3078 * With TFO, there might be no data at all, so we may still enter
3079 * this code-path here.
3080 */
3081 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3082 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3083 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3084 sb_cc -= off;
3085 } else {
3086 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3087 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3088 (uint32_t)mp_tp->mpt_sndmax);
3089
3090 goto out;
3091 }
3092 }
3093
3094 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3095 if (sb_cc <= 0) {
3096 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u, sndnxt %u sndmax %u cwnd %u\n",
3097 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3098 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3099 mptcp_subflow_cwnd_space(so));
3100 }
3101
3102 sb_cc = min(sb_cc, UINT16_MAX);
3103
3104 /*
3105 * Create a DSN mapping for the data we are about to send. It all
3106 * has the same mapping.
3107 */
3108 if (reinjected) {
3109 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3110 } else {
3111 mpt_dsn = mp_tp->mpt_snduna + off;
3112 }
3113
3114 mpt_mbuf = sb_mb;
3115 while (mpt_mbuf && reinjected == FALSE &&
3116 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3117 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3118 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3119 mpt_mbuf = mpt_mbuf->m_next;
3120 }
3121 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3122 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
3123 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3124 mpts->mpts_probecnt),
3125 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3126 }
3127
3128 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3129
3130 head = tail = NULL;
3131
3132 while (tot_sent < sb_cc) {
3133 int32_t mlen;
3134
3135 mlen = mpt_mbuf->m_len;
3136 mlen -= off;
3137 mlen = MIN(mlen, sb_cc - tot_sent);
3138
3139 if (mlen < 0) {
3140 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3141 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3142 (uint32_t)off, sb_cc, tot_sent);
3143 goto out;
3144 }
3145
3146 if (mlen == 0) {
3147 goto next;
3148 }
3149
3150 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3151 M_COPYM_MUST_COPY_HDR);
3152 if (m == NULL) {
3153 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3154 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3155 error = ENOBUFS;
3156 break;
3157 }
3158
3159 /* Create a DSN mapping for the data (m_copym does it) */
3160 VERIFY(m->m_flags & M_PKTHDR);
3161 VERIFY(m->m_next == NULL);
3162
3163 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3164 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3165 m->m_pkthdr.mp_dsn = mpt_dsn;
3166 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3167 m->m_pkthdr.len = mlen;
3168
3169 if (head == NULL) {
3170 head = tail = m;
3171 } else {
3172 tail->m_next = m;
3173 tail = m;
3174 }
3175
3176 tot_sent += mlen;
3177 off = 0;
3178 next:
3179 mpt_mbuf = mpt_mbuf->m_next;
3180 }
3181
3182 if (reinjected) {
3183 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3184 struct mbuf *n = sb_mb;
3185
3186 while (n) {
3187 n->m_pkthdr.mp_dsn += sb_cc;
3188 n->m_pkthdr.mp_rlen -= sb_cc;
3189 n = n->m_next;
3190 }
3191 m_adj(sb_mb, sb_cc);
3192 } else {
3193 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3194 m_freem(sb_mb);
3195 }
3196 }
3197
3198 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
3199 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
3200 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3201
3202 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3203 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3204 tot_sent);
3205 }
3206
3207 /* Now, let's update rel-seq and the data-level length */
3208 mpts->mpts_rel_seq += tot_sent;
3209 m = head;
3210 while (m) {
3211 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3212 m->m_pkthdr.mp_csum = dss_csum;
3213 }
3214 m->m_pkthdr.mp_rlen = tot_sent;
3215 m = m->m_next;
3216 }
3217
3218 if (head != NULL) {
3219 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3220 (tp->t_tfo_stats == 0)) {
3221 tp->t_mpflags |= TMPF_TFO_REQUEST;
3222 }
3223
3224 error = sock_sendmbuf(so, NULL, head, 0, NULL);
3225
3226 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
3227 struct sockbuf *, &so->so_rcv,
3228 struct sockbuf *, &so->so_snd,
3229 struct mptses *, mpte, struct mptsub *, mpts,
3230 size_t, tot_sent);
3231 }
3232
3233 done_sending:
3234 if (error == 0 ||
3235 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3236 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3237
3238 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3239 tcpstat.tcps_mp_num_probes++;
3240 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3241 mpts->mpts_probecnt += 1;
3242 } else {
3243 mpts->mpts_probecnt +=
3244 tot_sent / mpts->mpts_maxseg;
3245 }
3246 }
3247
3248 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3249 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3250 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3251 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3252 }
3253 mp_tp->mpt_sndnxt = new_sndnxt;
3254 }
3255
3256 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3257
3258 /* Must be here as mptcp_can_send_more() checks for this */
3259 soclearfastopen(mp_so);
3260
3261 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3262 (mpts->mpts_probesoon != 0)) {
3263 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
3264 __func__, mpts->mpts_connid,
3265 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
3266 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3267 (tcp_now - mpts->mpts_probesoon)),
3268 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3269 }
3270
3271 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3272 mptcp_set_cellicon(mpte, mpts);
3273
3274 mpte->mpte_used_cell = 1;
3275 } else {
3276 /*
3277 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3278 * explicitly set the cellicon, then we unset it again.
3279 */
3280 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3281 mptcp_unset_cellicon(mpte, NULL, 1);
3282 }
3283
3284 mpte->mpte_used_wifi = 1;
3285 }
3286
3287 /*
3288 * Don't propagate EWOULDBLOCK - it's already taken care of
3289 * in mptcp_usr_send for TFO.
3290 */
3291 error = 0;
3292 } else {
3293 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3294 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3295 }
3296 out:
3297
3298 if (wakeup) {
3299 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3300 }
3301
3302 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3303 return error;
3304
3305 zero_len_write:
3306 /* Opting to call pru_send as no mbuf at subflow level */
3307 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3308 NULL, current_proc());
3309
3310 goto done_sending;
3311 }
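/*
 * Illustrative note: all mbufs queued in one pass above share a single
 * DSS mapping.  If, say, three 500-byte mbufs are sent with
 * mpt_dsn = 7000 and mpts_rel_seq = 42, each copy is stamped with
 * mp_dsn = 7000, mp_rseq = 42 and mp_rlen = 1500, and (when
 * MPTCPF_CHECKSUM is set) the same dss_csum is replicated into every
 * mp_csum field.
 */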
3312
3313 static void
3314 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3315 {
3316 struct mbuf *n, *prev = NULL;
3317
3318 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
3319 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3320 m->m_pkthdr.mp_rseq),
3321 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3322
3323 n = mpte->mpte_reinjectq;
3324
3325 /* First, look for an mbuf n whose data-sequence-number is greater
3326 * than or equal to m's sequence number.
3327 */
3328 while (n) {
3329 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3330 break;
3331 }
3332
3333 prev = n;
3334
3335 n = n->m_nextpkt;
3336 }
3337
3338 if (n) {
3339 /* m is already fully covered by the next mbuf in the queue */
3340 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3341 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3342 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
3343 __func__, n->m_pkthdr.mp_rlen),
3344 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3345 goto dont_queue;
3346 }
3347
3348 /* m is covering the next mbuf entirely, thus we remove this guy */
3349 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3350 struct mbuf *tmp = n->m_nextpkt;
3351
3352 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
3353 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3354 (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
3355 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3356
3357 m->m_nextpkt = NULL;
3358 if (prev == NULL) {
3359 mpte->mpte_reinjectq = tmp;
3360 } else {
3361 prev->m_nextpkt = tmp;
3362 }
3363
3364 m_freem(n);
3365 n = tmp;
3366 }
3367 }
3368
3369 if (prev) {
3370 /* m is already fully covered by the previous mbuf in the queue */
3371 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3372 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
3373 __func__, (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
3374 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3375 goto dont_queue;
3376 }
3377 }
3378
3379 if (prev == NULL) {
3380 mpte->mpte_reinjectq = m;
3381 } else {
3382 prev->m_nextpkt = m;
3383 }
3384
3385 m->m_nextpkt = n;
3386
3387 return;
3388
3389 dont_queue:
3390 m_freem(m);
3391 return;
3392 }
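/*
 * Worked example (illustrative) of the coverage cases above, mappings
 * written as dsn/len with the queue initially 100/50 -> 200/50:
 *
 *	insert 100/40: same dsn, shorter - fully covered by 100/50,
 *	               freed via dont_queue.
 *	insert 190/70: 190 + 70 >= 200 + 50, so 200/50 is unlinked and
 *	               freed; 190/70 takes its place.
 *	insert 150/20: covered by neither neighbor, linked between the
 *	               two entries.
 */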
3393
3394 static struct mbuf *
3395 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3396 {
3397 struct socket *mp_so = mptetoso(mpte);
3398 struct mbuf *m;
3399
3400 m = mp_so->so_snd.sb_mb;
3401
3402 while (m) {
3403 /* If this segment covers what we are looking for, return it. */
3404 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3405 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3406 break;
3407 }
3408
3409
3410 /* Segment is no longer in the queue */
3411 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3412 return NULL;
3413 }
3414
3415 m = m->m_next;
3416 }
3417
3418 return m;
3419 }
3420
3421 static struct mbuf *
3422 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3423 {
3424 struct mbuf *top = NULL, *tail = NULL;
3425 uint64_t dsn;
3426 uint32_t dlen, rseq;
3427
3428 dsn = m->m_pkthdr.mp_dsn;
3429 dlen = m->m_pkthdr.mp_rlen;
3430 rseq = m->m_pkthdr.mp_rseq;
3431
3432 while (len > 0) {
3433 struct mbuf *n;
3434
3435 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3436
3437 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3438 if (n == NULL) {
3439 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3440 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3441 goto err;
3442 }
3443
3444 VERIFY(n->m_flags & M_PKTHDR);
3445 VERIFY(n->m_next == NULL);
3446 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3447 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3448 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3449 VERIFY(n->m_len == m->m_len);
3450
3451 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3452
3453 if (top == NULL) {
3454 top = n;
3455 }
3456
3457 if (tail != NULL) {
3458 tail->m_next = n;
3459 }
3460
3461 tail = n;
3462
3463 len -= m->m_len;
3464 m = m->m_next;
3465 }
3466
3467 return top;
3468
3469 err:
3470 if (top) {
3471 m_freem(top);
3472 }
3473
3474 return NULL;
3475 }
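/*
 * Illustrative note: m_copym_mode(..., M_COPYM_MUST_COPY_HDR) is used
 * above so every copy carries its own pkthdr; the VERIFYs then assert
 * that the DSS metadata (mp_dsn/mp_rlen/mp_rseq) survived the copy,
 * and PKTF_MPSO is set alongside PKTF_MPTCP before the chain is handed
 * to the reinject queue.
 */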
3476
3477 static void
3478 mptcp_reinject_mbufs(struct socket *so)
3479 {
3480 struct tcpcb *tp = sototcpcb(so);
3481 struct mptsub *mpts = tp->t_mpsub;
3482 struct mptcb *mp_tp = tptomptp(tp);
3483 struct mptses *mpte = mp_tp->mpt_mpte;
3484 struct sockbuf *sb = &so->so_snd;
3485 struct mbuf *m;
3486
3487 m = sb->sb_mb;
3488 while (m) {
3489 struct mbuf *n = m->m_next, *orig = m;
3490
3491 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3492 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3493 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3494 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3495
3496 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3497
3498 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
3499 goto next;
3500 }
3501
3502 /* Has it all already been acknowledged at the data-level? */
3503 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
3504 goto next;
3505 }
3506
3507 /* Part of this has already been acknowledged - look up the
3508 * segment in the MPTCP socket.
3509 */
3510 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3511 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3512 if (m == NULL) {
3513 goto next;
3514 }
3515 }
3516
3517 /* Copy the mbuf with headers (aka, DSN-numbers) */
3518 m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
3519 if (m == NULL) {
3520 break;
3521 }
3522
3523 VERIFY(m->m_nextpkt == NULL);
3524
3525 /* Now, add to the reinject-queue, eliminating overlapping
3526 * segments
3527 */
3528 mptcp_add_reinjectq(mpte, m);
3529
3530 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3531
3532 next:
3533 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3534 while (n) {
3535 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3536
3537 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
3538 break;
3539 }
3540
3541 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3542 n = n->m_next;
3543 }
3544
3545 m = n;
3546 }
3547 }
3548
3549 void
3550 mptcp_clean_reinjectq(struct mptses *mpte)
3551 {
3552 struct mptcb *mp_tp = mpte->mpte_mptcb;
3553
3554 socket_lock_assert_owned(mptetoso(mpte));
3555
3556 while (mpte->mpte_reinjectq) {
3557 struct mbuf *m = mpte->mpte_reinjectq;
3558
3559 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3560 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3561 break;
3562 }
3563
3564 mpte->mpte_reinjectq = m->m_nextpkt;
3565 m->m_nextpkt = NULL;
3566 m_freem(m);
3567 }
3568 }
3569
3570 /*
3571 * Subflow socket control event upcall.
3572 */
3573 static void
3574 mptcp_subflow_eupcall1(struct socket *so, void *arg, long events)
3575 {
3576 #pragma unused(so)
3577 struct mptsub *mpts = arg;
3578 struct mptses *mpte = mpts->mpts_mpte;
3579
3580 socket_lock_assert_owned(mptetoso(mpte));
3581
3582 if ((mpts->mpts_evctl & events) == events) {
3583 return;
3584 }
3585
3586 mpts->mpts_evctl |= events;
3587
3588 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3589 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3590 return;
3591 }
3592
3593 mptcp_subflow_workloop(mpte);
3594 }
3595
3596 /*
3597 * Subflow socket control events.
3598 *
3599 * Called for handling events related to the underlying subflow socket.
3600 */
3601 static ev_ret_t
3602 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3603 long *p_mpsofilt_hint)
3604 {
3605 ev_ret_t ret = MPTS_EVRET_OK;
3606 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3607 sizeof(mpsub_ev_entry_tbl[0]);
3608
3609 /* bail if there's nothing to process */
3610 if (!mpts->mpts_evctl) {
3611 return ret;
3612 }
3613
3614 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
3615 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
3616 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
3617 SO_FILT_HINT_DISCONNECTED)) {
3618 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3619 }
3620
3621 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3622 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3623
3624 /*
3625 * Process all the socket filter hints and reset the hint
3626 * once it is handled
3627 */
3628 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3629 /*
3630 * Always execute the DISCONNECTED event, because it will wakeup
3631 * the app.
3632 */
3633 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3634 (ret >= MPTS_EVRET_OK ||
3635 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3636 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3637 ev_ret_t error =
3638 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3639 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3640 }
3641 }
3642
3643 return ret;
3644 }
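/*
 * Illustrative sketch (hypothetical shape) of the dispatch table
 * consumed above; the real mpsub_ev_entry_tbl is defined earlier in
 * this file, and the field names mirror the accesses in
 * mptcp_subflow_events():
 *
 *	static const struct mptsub_ev_entry {
 *		long	sofilt_hint_mask;
 *		ev_ret_t (*sofilt_hint_ev_hdlr)(struct mptses *,
 *		    struct mptsub *, long *, long);
 *	} mpsub_ev_entry_tbl[] = {
 *		{ SO_FILT_HINT_NOSRCADDR,  mptcp_subflow_nosrcaddr_ev },
 *		{ SO_FILT_HINT_MPFAILOVER, mptcp_subflow_failover_ev },
 *		...	one entry per SO_FILT_HINT_* bit
 *	};
 */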
3645
3646 static ev_ret_t
3647 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3648 long *p_mpsofilt_hint, long event)
3649 {
3650 struct socket *mp_so, *so;
3651 struct mptcb *mp_tp;
3652
3653 mp_so = mptetoso(mpte);
3654 mp_tp = mpte->mpte_mptcb;
3655 so = mpts->mpts_socket;
3656
3657 /*
3658 * We got an event for this subflow that might need to be propagated,
3659 * based on the state of the MPTCP connection.
3660 */
3661 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3662 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3663 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3664 mp_so->so_error = so->so_error;
3665 *p_mpsofilt_hint |= event;
3666 }
3667
3668 return MPTS_EVRET_OK;
3669 }
3670
3671 /*
3672 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3673 */
3674 static ev_ret_t
3675 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3676 long *p_mpsofilt_hint, long event)
3677 {
3678 #pragma unused(p_mpsofilt_hint, event)
3679 struct socket *mp_so;
3680 struct tcpcb *tp;
3681
3682 mp_so = mptetoso(mpte);
3683 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3684
3685 /*
3686 * This overwrites any previous mpte_lost_aid to avoid storing
3687 * too much state when the typical case has only two subflows.
3688 */
3689 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3690 mpte->mpte_lost_aid = tp->t_local_aid;
3691
3692 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3693 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3694
3695 /*
3696 * The subflow connection has lost its source address.
3697 */
3698 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3699
3700 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3701 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3702 }
3703
3704 return MPTS_EVRET_DELETE;
3705 }
3706
3707 static ev_ret_t
3708 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3709 long *p_mpsofilt_hint, long event)
3710 {
3711 #pragma unused(event, p_mpsofilt_hint)
3712 struct socket *so, *mp_so;
3713
3714 so = mpts->mpts_socket;
3715
3716 if (so->so_error != ENODATA) {
3717 return MPTS_EVRET_OK;
3718 }
3719
3720
3721 mp_so = mptetoso(mpte);
3722
3723 mp_so->so_error = ENODATA;
3724
3725 sorwakeup(mp_so);
3726 sowwakeup(mp_so);
3727
3728 return MPTS_EVRET_OK;
3729 }
3730
3731
3732 /*
3733 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3734 * indicates that the remote side sent a Data FIN
3735 */
3736 static ev_ret_t
3737 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3738 long *p_mpsofilt_hint, long event)
3739 {
3740 #pragma unused(event)
3741 struct mptcb *mp_tp = mpte->mpte_mptcb;
3742
3743 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3744 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3745
3746 /*
3747 * We got a Data FIN for the MPTCP connection.
3748 * The FIN may arrive with data. The data is handed up to the
3749 * mptcp socket and the user is notified so that it may close
3750 * the socket if needed.
3751 */
3752 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3753 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3754 }
3755
3756 return MPTS_EVRET_OK; /* keep the subflow socket around */
3757 }
3758
3759 /*
3760 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3761 */
3762 static ev_ret_t
3763 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3764 long *p_mpsofilt_hint, long event)
3765 {
3766 #pragma unused(event, p_mpsofilt_hint)
3767 struct mptsub *mpts_alt = NULL;
3768 struct socket *alt_so = NULL;
3769 struct socket *mp_so;
3770 int altpath_exists = 0;
3771
3772 mp_so = mptetoso(mpte);
3773 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3774
3775 mptcp_reinject_mbufs(mpts->mpts_socket);
3776
3777 mpts_alt = mptcp_get_subflow(mpte, NULL);
3778
3779 /* If there is no eligible alternate subflow, ignore the failover hint. */
3780 if (mpts_alt == NULL || mpts_alt == mpts) {
3781 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3782 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3783
3784 goto done;
3785 }
3786
3787 altpath_exists = 1;
3788 alt_so = mpts_alt->mpts_socket;
3789 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3790 /* All data acknowledged and no RTT spike */
3791 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3792 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3793 } else {
3794 /* no alternate path available */
3795 altpath_exists = 0;
3796 }
3797 }
3798
3799 if (altpath_exists) {
3800 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3801
3802 mpte->mpte_active_sub = mpts_alt;
3803 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3804 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3805
3806 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3807 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3808
3809 mptcpstats_inc_switch(mpte, mpts);
3810
3811 sowwakeup(alt_so);
3812 } else {
3813 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3814 mpts->mpts_connid),
3815 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3816 done:
3817 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3818 }
3819
3820 return MPTS_EVRET_OK;
3821 }
3822
3823 /*
3824 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3825 */
3826 static ev_ret_t
3827 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3828 long *p_mpsofilt_hint, long event)
3829 {
3830 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3831 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3832
3833 /*
3834 * The subflow connection cannot use the outgoing interface, so
3835 * close this subflow.
3836 */
3837 mptcp_subflow_abort(mpts, EPERM);
3838
3839 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3840
3841 return MPTS_EVRET_DELETE;
3842 }
3843
3844 /*
3845 * https://tools.ietf.org/html/rfc6052#section-2
3846 * https://tools.ietf.org/html/rfc6147#section-5.2
3847 */
3848 static boolean_t
3849 mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3850 const struct ipv6_prefix *prefix,
3851 struct in_addr *addrv4)
3852 {
3853 char buf[MAX_IPv4_STR_LEN];
3854 char *ptrv4 = (char *)addrv4;
3855 const char *ptr = (const char *)addr;
3856
3857 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3858 return false;
3859 }
3860
3861 switch (prefix->prefix_len) {
3862 case NAT64_PREFIX_LEN_96:
3863 memcpy(ptrv4, ptr + 12, 4);
3864 break;
3865 case NAT64_PREFIX_LEN_64:
3866 memcpy(ptrv4, ptr + 9, 4);
3867 break;
3868 case NAT64_PREFIX_LEN_56:
3869 memcpy(ptrv4, ptr + 7, 1);
3870 memcpy(ptrv4 + 1, ptr + 9, 3);
3871 break;
3872 case NAT64_PREFIX_LEN_48:
3873 memcpy(ptrv4, ptr + 6, 2);
3874 memcpy(ptrv4 + 2, ptr + 9, 2);
3875 break;
3876 case NAT64_PREFIX_LEN_40:
3877 memcpy(ptrv4, ptr + 5, 3);
3878 memcpy(ptrv4 + 3, ptr + 9, 1);
3879 break;
3880 case NAT64_PREFIX_LEN_32:
3881 memcpy(ptrv4, ptr + 4, 4);
3882 break;
3883 default:
3884 panic("NAT64-prefix len is wrong: %u\n",
3885 prefix->prefix_len);
3886 }
3887
3888 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3889 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3890
3891 return true;
3892 }
3893
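/*
 * Worked example (a sketch; the concrete values are RFC 6052's, not
 * this code's): with the well-known 96-bit prefix 64:ff9b::/96, the
 * IPv4 address sits in the last four bytes of the IPv6 address, so
 * 64:ff9b::c000:221 desynthesizes to 192.0.2.33 (0xc0000221). For the
 * shorter prefixes the copies above skip octet 8 because RFC 6052
 * reserves bits 64..71 (the "u" octet) as zero; e.g. with a 64-bit
 * prefix the embedded IPv4 bytes live at octets 9..12.
 */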
3894 static void
3895 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3896 {
3897 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3898 struct socket *so = mpts->mpts_socket;
3899 struct ifnet *ifp;
3900 int j;
3901
3902 /* Subflow IPs will be steered directly by the server - no need to
3903 * desynthesize.
3904 */
3905 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3906 return;
3907 }
3908
3909 ifp = sotoinpcb(so)->inp_last_outifp;
3910
3911 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3912 mptcp_ask_for_nat64(ifp);
3913 return;
3914 }
3915
3916
3917 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3918 int success;
3919
3920 if (nat64prefixes[j].prefix_len == 0) {
3921 continue;
3922 }
3923
3924 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3925 &nat64prefixes[j],
3926 &mpte->mpte_dst_v4_nat64.sin_addr);
3927 if (success) {
3928 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3929 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3930 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3931 break;
3932 }
3933 }
3934 }
3935
3936 static void
3937 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3938 {
3939 struct inpcb *inp;
3940
3941 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
3942 return;
3943 }
3944
3945 inp = sotoinpcb(mpts->mpts_socket);
3946 if (inp == NULL) {
3947 return;
3948 }
3949
3950 /* Should we try the alternate port? */
3951 if (mpte->mpte_alternate_port &&
3952 inp->inp_fport != mpte->mpte_alternate_port) {
3953 union sockaddr_in_4_6 dst;
3954 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3955
3956 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3957
3958 dst_in->sin_port = mpte->mpte_alternate_port;
3959
3960 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
3961 mpts->mpts_ifscope, NULL);
3962 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3963 unsigned int i;
3964
3965 if (inp->inp_last_outifp == NULL) {
3966 return;
3967 }
3968
3969 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3970 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3971
3972 if (inp->inp_last_outifp->if_index == info->ifindex) {
3973 info->no_mptcp_support = 1;
3974 break;
3975 }
3976 }
3977 }
3978 }
3979
3980 /*
3981 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3982 */
3983 static ev_ret_t
3984 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
3985 long *p_mpsofilt_hint, long event)
3986 {
3987 #pragma unused(event, p_mpsofilt_hint)
3988 struct socket *mp_so, *so;
3989 struct inpcb *inp;
3990 struct tcpcb *tp;
3991 struct mptcb *mp_tp;
3992 int af;
3993 boolean_t mpok = FALSE;
3994
3995 mp_so = mptetoso(mpte);
3996 mp_tp = mpte->mpte_mptcb;
3997 so = mpts->mpts_socket;
3998 tp = sototcpcb(so);
3999 af = mpts->mpts_dst.sa_family;
4000
4001 if (mpts->mpts_flags & MPTSF_CONNECTED) {
4002 return MPTS_EVRET_OK;
4003 }
4004
4005 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4006 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4007 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
4008 (so->so_state & SS_ISCONNECTED)) {
4009 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
4010 __func__, mpts->mpts_connid),
4011 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4012 (void) soshutdownlock(so, SHUT_RD);
4013 (void) soshutdownlock(so, SHUT_WR);
4014 (void) sodisconnectlocked(so);
4015 }
4016 return MPTS_EVRET_OK;
4017 }
4018
4019 /*
4020 * The subflow connection has been connected. Find out whether it
4021 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
4022 *
4023 * a. If MPTCP connection is not yet established, then this must be
4024 * the first subflow connection. If MPTCP failed to negotiate,
4025 * fall back to regular TCP by degrading this subflow.
4026 *
4027 * b. If MPTCP connection has been established, then this must be
4028 * one of the subsequent subflow connections. If MPTCP failed
4029 * to negotiate, disconnect the connection.
4030 *
4031 * Right now, we simply unblock any waiters at the MPTCP socket layer
4032 * if the MPTCP connection has not been established.
4033 */
4034
4035 if (so->so_state & SS_ISDISCONNECTED) {
4036 /*
4037 * With MPTCP joins, a connection is connected at the subflow
4038 * level, but the 4th ACK from the server elevates the MPTCP
4039 * subflow to connected state. So there is a small window
4040 * where the subflow could get disconnected before the
4041 * connected event is processed.
4042 */
4043 return MPTS_EVRET_OK;
4044 }
4045
4046 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4047 mptcp_drop_tfo_data(mpte, mpts);
4048 }
4049
4050 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4051 mpts->mpts_flags |= MPTSF_CONNECTED;
4052
4053 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4054 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4055 }
4056
4057 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4058
4059 /* get/verify the outbound interface */
4060 inp = sotoinpcb(so);
4061
4062 mpts->mpts_maxseg = tp->t_maxseg;
4063
4064 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
4065 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
4066 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
4067 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
4068
4069 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4070
4071 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4072 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4073 mpte->mpte_associd = mpts->mpts_connid;
4074 DTRACE_MPTCP2(state__change,
4075 struct mptcb *, mp_tp,
4076 uint32_t, 0 /* event */);
4077
4078 if (SOCK_DOM(so) == AF_INET) {
4079 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4080 } else {
4081 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4082 }
4083
4084 mpts->mpts_flags |= MPTSF_ACTIVE;
4085
4086 /* case (a) above */
4087 if (!mpok) {
4088 tcpstat.tcps_mpcap_fallback++;
4089
4090 tp->t_mpflags |= TMPF_INFIN_SENT;
4091 mptcp_notify_mpfail(so);
4092 } else {
4093 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4094 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4095 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4096 } else {
4097 mpts->mpts_flags |= MPTSF_PREFERRED;
4098 }
4099 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4100 mpte->mpte_nummpcapflows++;
4101
4102 if (SOCK_DOM(so) == AF_INET6) {
4103 mptcp_handle_ipv6_connection(mpte, mpts);
4104 }
4105
4106 mptcp_check_subflows_and_add(mpte);
4107
4108 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4109 mpte->mpte_initial_cell = 1;
4110 }
4111
4112 mpte->mpte_handshake_success = 1;
4113 }
4114
4115 mp_tp->mpt_sndwnd = tp->snd_wnd;
4116 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4117 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4118 soisconnected(mp_so);
4119 } else if (mpok) {
4120 /*
4121 * case (b) above
4122 * In case of additional flows, the MPTCP socket is not
4123 * MPTSF_MP_CAPABLE until an ACK is received from the server
4124 * for 3-way handshake. TCP would have guaranteed that this
4125 * is an MPTCP subflow.
4126 */
4127 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4128 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4129 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4130 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4131 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4132 } else {
4133 mpts->mpts_flags |= MPTSF_PREFERRED;
4134 }
4135
4136 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4137 mpte->mpte_nummpcapflows++;
4138
4139 mpts->mpts_rel_seq = 1;
4140
4141 mptcp_check_subflows_and_remove(mpte);
4142 } else {
4143 mptcp_try_alternate_port(mpte, mpts);
4144
4145 tcpstat.tcps_join_fallback++;
4146 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4147 tcpstat.tcps_mptcp_cell_proxy++;
4148 } else {
4149 tcpstat.tcps_mptcp_wifi_proxy++;
4150 }
4151
4152 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4153
4154 return MPTS_EVRET_OK;
4155 }
4156
4157 /* This call just "books" an entry in the stats table for this ifindex */
4158 mptcpstats_get_index(mpte->mpte_itfstats, mpts);
4159
4160 mptcp_output(mpte);
4161
4162 return MPTS_EVRET_OK; /* keep the subflow socket around */
4163 }
4164
4165 /*
4166 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4167 */
4168 static ev_ret_t
4169 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4170 long *p_mpsofilt_hint, long event)
4171 {
4172 #pragma unused(event, p_mpsofilt_hint)
4173 struct socket *mp_so, *so;
4174 struct mptcb *mp_tp;
4175
4176 mp_so = mptetoso(mpte);
4177 mp_tp = mpte->mpte_mptcb;
4178 so = mpts->mpts_socket;
4179
4180 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
4181 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
4182 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
4183 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
4184 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4185
4186 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4187 return MPTS_EVRET_DELETE;
4188 }
4189
4190 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4191
4192 /* The subflow connection has been disconnected. */
4193
4194 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4195 mpte->mpte_nummpcapflows--;
4196 if (mpte->mpte_active_sub == mpts) {
4197 mpte->mpte_active_sub = NULL;
4198 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
4199 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4200 }
4201 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4202 } else {
4203 if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
4204 !(mpts->mpts_flags & MPTSF_CONNECTED)) {
4205 mptcp_try_alternate_port(mpte, mpts);
4206 }
4207 }
4208
4209 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4210 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4211 mptcp_drop(mpte, mp_tp, so->so_error);
4212 }
4213
4214 /*
4215 * Clear flags that are used by getconninfo to return state.
4216 * Retain flags like MPTSF_DELETEOK for internal purposes.
4217 */
4218 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4219 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4220 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4221
4222 return MPTS_EVRET_DELETE;
4223 }
4224
4225 /*
4226 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4227 */
4228 static ev_ret_t
4229 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4230 long *p_mpsofilt_hint, long event)
4231 {
4232 #pragma unused(event, p_mpsofilt_hint)
4233 ev_ret_t ret = MPTS_EVRET_OK;
4234 struct socket *mp_so, *so;
4235 struct mptcb *mp_tp;
4236
4237 mp_so = mptetoso(mpte);
4238 mp_tp = mpte->mpte_mptcb;
4239 so = mpts->mpts_socket;
4240
4241 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4242 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4243 } else {
4244 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4245 }
4246
4247 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4248 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4249 goto done;
4250 }
4251 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4252 } else {
4253 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4254 }
4255
4256 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4257 mpts->mpts_flags |= MPTSF_MP_READY;
4258 } else {
4259 mpts->mpts_flags &= ~MPTSF_MP_READY;
4260 }
4261
4262 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4263 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4264 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4265 }
4266
4267 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4268 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4269
4270 m_freem_list(mpte->mpte_reinjectq);
4271 mpte->mpte_reinjectq = NULL;
4272 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4273 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4274 ret = MPTS_EVRET_CONNECT_PENDING;
4275 }
4276
4277 done:
4278 return ret;
4279 }
4280
4281 /*
4282 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4283 */
4284 static ev_ret_t
4285 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4286 long *p_mpsofilt_hint, long event)
4287 {
4288 #pragma unused(event)
4289 struct socket *mp_so, *so;
4290 struct mptcb *mp_tp;
4291 boolean_t is_fastclose;
4292
4293 mp_so = mptetoso(mpte);
4294 mp_tp = mpte->mpte_mptcb;
4295 so = mpts->mpts_socket;
4296
4297 /* We got an invalid option or a fast close */
4298 struct inpcb *inp = sotoinpcb(so);
4299 struct tcpcb *tp = NULL;
4300
4301 tp = intotcpcb(inp);
4302 so->so_error = ECONNABORTED;
4303
4304 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4305
4306 tp->t_mpflags |= TMPF_RESET;
4307
4308 if (tp->t_state != TCPS_CLOSED) {
4309 struct tcptemp *t_template = tcp_maketemplate(tp);
4310
4311 if (t_template) {
4312 struct tcp_respond_args tra;
4313
4314 bzero(&tra, sizeof(tra));
4315 if (inp->inp_flags & INP_BOUND_IF) {
4316 tra.ifscope = inp->inp_boundifp->if_index;
4317 } else {
4318 tra.ifscope = IFSCOPE_NONE;
4319 }
4320 tra.awdl_unrestricted = 1;
4321
4322 tcp_respond(tp, t_template->tt_ipgen,
4323 &t_template->tt_t, (struct mbuf *)NULL,
4324 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4325 (void) m_free(dtom(t_template));
4326 }
4327 }
4328
4329 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4330 struct mptsub *iter, *tmp;
4331
4332 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4333
4334 mp_so->so_error = ECONNRESET;
4335
4336 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4337 if (iter == mpts) {
4338 continue;
4339 }
4340 mptcp_subflow_abort(iter, ECONNABORTED);
4341 }
4342
4343 /*
4344 * mptcp_drop is called after processing the events, to fully
4345 * close the MPTCP connection.
4346 */
4347 mptcp_drop(mpte, mp_tp, mp_so->so_error);
4348 }
4349
4350 mptcp_subflow_abort(mpts, ECONNABORTED);
4351
4352 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4353 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4354 }
4355
4356 return MPTS_EVRET_DELETE;
4357 }
4358
4359 static ev_ret_t
4360 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4361 long *p_mpsofilt_hint, long event)
4362 {
4363 #pragma unused(event)
4364 bool found_active = false;
4365
4366 mpts->mpts_flags |= MPTSF_READ_STALL;
4367
4368 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4369 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4370
4371 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4372 TCPS_HAVERCVDFIN2(tp->t_state)) {
4373 continue;
4374 }
4375
4376 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4377 found_active = true;
4378 break;
4379 }
4380 }
4381
4382 if (!found_active) {
4383 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4384 }
4385
4386 return MPTS_EVRET_OK;
4387 }
4388
4389 static ev_ret_t
4390 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4391 long *p_mpsofilt_hint, long event)
4392 {
4393 #pragma unused(event)
4394 bool found_active = false;
4395
4396 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4397
4398 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4399 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4400
4401 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4402 tp->t_state > TCPS_CLOSE_WAIT) {
4403 continue;
4404 }
4405
4406 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4407 found_active = true;
4408 break;
4409 }
4410 }
4411
4412 if (!found_active) {
4413 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4414 }
4415
4416 return MPTS_EVRET_OK;
4417 }
4418
4419 /*
4420 * Issues SOPT_SET on an MPTCP subflow socket; the socket must already be
4421 * locked, and the caller must ensure that the option can be issued on
4422 * subflow sockets, via the MPOF_SUBFLOW_OK flag.
4423 */
4424 int
4425 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4426 {
4427 struct socket *mp_so, *so;
4428 struct sockopt sopt;
4429 int error;
4430
4431 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4432
4433 mp_so = mptetoso(mpte);
4434 so = mpts->mpts_socket;
4435
4436 socket_lock_assert_owned(mp_so);
4437
4438 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4439 mpo->mpo_level == SOL_SOCKET &&
4440 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4441 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4442
4443 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
4444 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
4445 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
4446 mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
4447 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4448
4449 /*
4450 * When we open a new subflow, mark it as cell fallback if it
4451 * goes over cell.
4452 *
4453 * (This does not apply to first-party apps.)
4454 */
4455
4456 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4457 return 0;
4458 }
4459
4460 if (sotoinpcb(so)->inp_last_outifp &&
4461 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4462 return 0;
4463 }
4464
4465 /*
4466 * These checks are OR'd because, if the app is not binding to the
4467 * interface, then it definitely is not a cell-fallback
4468 * connection.
4469 */
4470 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4471 !IFNET_IS_CELLULAR(ifp)) {
4472 return 0;
4473 }
4474 }
4475
4476 mpo->mpo_flags &= ~MPOF_INTERIM;
4477
4478 bzero(&sopt, sizeof(sopt));
4479 sopt.sopt_dir = SOPT_SET;
4480 sopt.sopt_level = mpo->mpo_level;
4481 sopt.sopt_name = mpo->mpo_name;
4482 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4483 sopt.sopt_valsize = sizeof(int);
4484 sopt.sopt_p = kernproc;
4485
4486 error = sosetoptlock(so, &sopt, 0);
4487 if (error) {
4488 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4489 "val %d set error %d\n", __func__,
4490 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4491 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4492 mpo->mpo_intval, error);
4493 }
4494 return error;
4495 }
4496
4497 /*
4498 * Issues SOPT_GET on an MPTCP subflow socket; the socket must already be
4499 * locked, and the caller must ensure that the option can be issued on
4500 * subflow sockets, via the MPOF_SUBFLOW_OK flag.
4501 */
4502 int
4503 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4504 struct mptopt *mpo)
4505 {
4506 struct socket *mp_so;
4507 struct sockopt sopt;
4508 int error;
4509
4510 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4511 mp_so = mptetoso(mpte);
4512
4513 socket_lock_assert_owned(mp_so);
4514
4515 bzero(&sopt, sizeof(sopt));
4516 sopt.sopt_dir = SOPT_GET;
4517 sopt.sopt_level = mpo->mpo_level;
4518 sopt.sopt_name = mpo->mpo_name;
4519 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4520 sopt.sopt_valsize = sizeof(int);
4521 sopt.sopt_p = kernproc;
4522
4523 error = sogetoptlock(so, &sopt, 0); /* already locked */
4524 if (error) {
4525 os_log_error(mptcp_log_handle,
4526 "%s - %lx: sopt %s get error %d\n",
4527 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4528 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4529 }
4530 return error;
4531 }
4532
4533
4534 /*
4535 * MPTCP garbage collector.
4536 *
4537 * This routine is called by the MP domain's on-demand periodic callout,
4538 * which is triggered when an MPTCP socket is closed. The callout will
4539 * repeat as long as this routine returns a non-zero value.
4540 */
4541 static uint32_t
4542 mptcp_gc(struct mppcbinfo *mppi)
4543 {
4544 struct mppcb *mpp, *tmpp;
4545 uint32_t active = 0;
4546
4547 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4548
4549 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4550 struct socket *mp_so;
4551 struct mptses *mpte;
4552 struct mptcb *mp_tp;
4553
4554 mp_so = mpp->mpp_socket;
4555 mpte = mptompte(mpp);
4556 mp_tp = mpte->mpte_mptcb;
4557
4558 if (!mpp_try_lock(mpp)) {
4559 active++;
4560 continue;
4561 }
4562
4563 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4564
4565 /* check again under the lock */
4566 if (mp_so->so_usecount > 0) {
4567 boolean_t wakeup = FALSE;
4568 struct mptsub *mpts, *tmpts;
4569
4570 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4571 if (mp_tp->mpt_gc_ticks > 0) {
4572 mp_tp->mpt_gc_ticks--;
4573 }
4574 if (mp_tp->mpt_gc_ticks == 0) {
4575 wakeup = TRUE;
4576 }
4577 }
4578 if (wakeup) {
4579 TAILQ_FOREACH_SAFE(mpts,
4580 &mpte->mpte_subflows, mpts_entry, tmpts) {
4581 mptcp_subflow_eupcall1(mpts->mpts_socket,
4582 mpts, SO_FILT_HINT_DISCONNECTED);
4583 }
4584 }
4585 socket_unlock(mp_so, 0);
4586 active++;
4587 continue;
4588 }
4589
4590 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4591 panic("%s - %lx: skipped state "
4592 "[u=%d,r=%d,s=%d]\n", __func__,
4593 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4594 mp_so->so_usecount, mp_so->so_retaincnt,
4595 mpp->mpp_state);
4596 }
4597
4598 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4599 mptcp_close(mpte, mp_tp);
4600 }
4601
4602 mptcp_session_destroy(mpte);
4603
4604 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4605 struct sockbuf *, &mp_so->so_rcv,
4606 struct sockbuf *, &mp_so->so_snd,
4607 struct mppcb *, mpp);
4608
4609 mp_pcbdispose(mpp);
4610 sodealloc(mp_so);
4611 }
4612
4613 return active;
4614 }
4615
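/*
 * Caller-side sketch (the shape of the MP-domain callout; the names
 * other than mptcp_gc() and mppi_lock are illustrative only):
 *
 *	lck_mtx_lock(&mppi->mppi_lock);
 *	if (mptcp_gc(mppi) > 0)
 *		mp_gc_resched(mppi);	-- PCBs still pending, fire again
 *	lck_mtx_unlock(&mppi->mppi_lock);
 *
 * The lock must be held across the call, per the assertion above.
 */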
4616 /*
4617 * Drop a MPTCP connection, reporting the specified error.
4618 */
4619 struct mptses *
4620 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4621 {
4622 struct socket *mp_so = mptetoso(mpte);
4623
4624 VERIFY(mpte->mpte_mptcb == mp_tp);
4625
4626 socket_lock_assert_owned(mp_so);
4627
4628 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4629 uint32_t, 0 /* event */);
4630
4631 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4632 errno = mp_tp->mpt_softerror;
4633 }
4634 mp_so->so_error = errno;
4635
4636 return mptcp_close(mpte, mp_tp);
4637 }
4638
4639 /*
4640 * Close a MPTCP control block.
4641 */
4642 struct mptses *
4643 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4644 {
4645 struct mptsub *mpts = NULL, *tmpts = NULL;
4646 struct socket *mp_so = mptetoso(mpte);
4647
4648 socket_lock_assert_owned(mp_so);
4649 VERIFY(mpte->mpte_mptcb == mp_tp);
4650
4651 mp_tp->mpt_state = MPTCPS_TERMINATE;
4652
4653 mptcp_freeq(mp_tp);
4654
4655 soisdisconnected(mp_so);
4656
4657 /* Clean up all subflows */
4658 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4659 mptcp_subflow_disconnect(mpte, mpts);
4660 }
4661
4662 return NULL;
4663 }
4664
4665 void
4666 mptcp_notify_close(struct socket *so)
4667 {
4668 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4669 }
4670
4671 /*
4672 * MPTCP workloop.
4673 */
4674 void
4675 mptcp_subflow_workloop(struct mptses *mpte)
4676 {
4677 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4678 long mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4679 struct mptsub *mpts, *tmpts;
4680 struct socket *mp_so;
4681
4682 mp_so = mptetoso(mpte);
4683
4684 socket_lock_assert_owned(mp_so);
4685
4686 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4687 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4688 return;
4689 }
4690 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4691
4692 relaunch:
4693 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4694
4695 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4696 ev_ret_t ret;
4697
4698 if (mpts->mpts_socket->so_usecount == 0) {
4699 /* Will be removed soon by tcp_garbage_collect */
4700 continue;
4701 }
4702
4703 mptcp_subflow_addref(mpts);
4704 mpts->mpts_socket->so_usecount++;
4705
4706 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4707
4708 /*
4709 * If the MPTCP socket is closed, disconnect all subflows.
4710 * This will generate a disconnect event which will
4711 * be handled during the next iteration, causing a
4712 * non-zero error to be returned above.
4713 */
4714 if (mp_so->so_flags & SOF_PCBCLEARING) {
4715 mptcp_subflow_disconnect(mpte, mpts);
4716 }
4717
4718 switch (ret) {
4719 case MPTS_EVRET_OK:
4720 /* nothing to do */
4721 break;
4722 case MPTS_EVRET_DELETE:
4723 mptcp_subflow_soclose(mpts);
4724 break;
4725 case MPTS_EVRET_CONNECT_PENDING:
4726 connect_pending = TRUE;
4727 break;
4728 case MPTS_EVRET_DISCONNECT_FALLBACK:
4729 disconnect_fallback = TRUE;
4730 break;
4731 default:
4732 mptcplog((LOG_DEBUG,
4733 "MPTCP Socket: %s: mptcp_subflow_events "
4734 "returned invalid value: %d\n", __func__,
4735 ret),
4736 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4737 break;
4738 }
4739 mptcp_subflow_remref(mpts); /* ours */
4740
4741 VERIFY(mpts->mpts_socket->so_usecount != 0);
4742 mpts->mpts_socket->so_usecount--;
4743 }
4744
4745 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4746 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4747
4748 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4749 mp_so->so_state |= SS_CANTRCVMORE;
4750 sorwakeup(mp_so);
4751 }
4752
4753 soevent(mp_so, mpsofilt_hint_mask);
4754 }
4755
4756 if (!connect_pending && !disconnect_fallback) {
4757 goto exit;
4758 }
4759
4760 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4761 if (disconnect_fallback) {
4762 struct socket *so = NULL;
4763 struct inpcb *inp = NULL;
4764 struct tcpcb *tp = NULL;
4765
4766 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4767 continue;
4768 }
4769
4770 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4771
4772 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4773 MPTSF_DISCONNECTED)) {
4774 continue;
4775 }
4776
4777 so = mpts->mpts_socket;
4778
4779 /*
4780 * The MPTCP connection has degraded to a fallback
4781 * mode, so there is no point in keeping this subflow
4782 * regardless of its MPTCP-readiness state, unless it
4783 * is the primary one which we use for fallback. This
4784 * assumes that the subflow used for fallback is the
4785 * ACTIVE one.
4786 */
4787
4788 inp = sotoinpcb(so);
4789 tp = intotcpcb(inp);
4790 tp->t_mpflags &=
4791 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4792 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4793
4794 soevent(so, SO_FILT_HINT_MUSTRST);
4795 } else if (connect_pending) {
4796 /*
4797 * The MPTCP connection has progressed to a state
4798 * where it supports full multipath semantics; allow
4799 * additional joins to be attempted for all subflows
4800 * that are in the PENDING state.
4801 */
4802 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4803 int error = mptcp_subflow_soconnectx(mpte, mpts);
4804
4805 if (error) {
4806 mptcp_subflow_abort(mpts, error);
4807 }
4808 }
4809 }
4810 }
4811
4812 exit:
4813 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4814 goto relaunch;
4815 }
4816
4817 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4818 }
4819
4820 /*
4821 * Protocol pr_lock callback.
4822 */
4823 int
4824 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4825 {
4826 struct mppcb *mpp = mpsotomppcb(mp_so);
4827 void *lr_saved;
4828
4829 if (lr == NULL) {
4830 lr_saved = __builtin_return_address(0);
4831 } else {
4832 lr_saved = lr;
4833 }
4834
4835 if (mpp == NULL) {
4836 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4837 mp_so, lr_saved, solockhistory_nr(mp_so));
4838 /* NOTREACHED */
4839 }
4840 mpp_lock(mpp);
4841
4842 if (mp_so->so_usecount < 0) {
4843 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4844 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4845 solockhistory_nr(mp_so));
4846 /* NOTREACHED */
4847 }
4848 if (refcount != 0) {
4849 mp_so->so_usecount++;
4850 mpp->mpp_inside++;
4851 }
4852 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4853 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4854
4855 return 0;
4856 }
4857
4858 /*
4859 * Protocol pr_unlock callback.
4860 */
4861 int
4862 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4863 {
4864 struct mppcb *mpp = mpsotomppcb(mp_so);
4865 void *lr_saved;
4866
4867 if (lr == NULL) {
4868 lr_saved = __builtin_return_address(0);
4869 } else {
4870 lr_saved = lr;
4871 }
4872
4873 if (mpp == NULL) {
4874 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4875 mp_so, mp_so->so_usecount, lr_saved,
4876 solockhistory_nr(mp_so));
4877 /* NOTREACHED */
4878 }
4879 socket_lock_assert_owned(mp_so);
4880
4881 if (refcount != 0) {
4882 mp_so->so_usecount--;
4883 mpp->mpp_inside--;
4884 }
4885
4886 if (mp_so->so_usecount < 0) {
4887 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4888 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4889 /* NOTREACHED */
4890 }
4891 if (mpp->mpp_inside < 0) {
4892 panic("%s: mpp=%p inside=%x lrh= %s\n", __func__,
4893 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
4894 /* NOTREACHED */
4895 }
4896 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4897 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4898 mpp_unlock(mpp);
4899
4900 return 0;
4901 }
4902
4903 /*
4904 * Protocol pr_getlock callback.
4905 */
4906 lck_mtx_t *
4907 mptcp_getlock(struct socket *mp_so, int flags)
4908 {
4909 struct mppcb *mpp = mpsotomppcb(mp_so);
4910
4911 if (mpp == NULL) {
4912 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4913 solockhistory_nr(mp_so));
4914 /* NOTREACHED */
4915 }
4916 if (mp_so->so_usecount < 0) {
4917 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4918 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4919 /* NOTREACHED */
4920 }
4921 return mpp_getlock(mpp, flags);
4922 }
4923
4924 /*
4925 * MPTCP Join support
4926 */
4927
4928 static void
4929 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
4930 {
4931 struct tcpcb *tp = sototcpcb(so);
4932 struct mptcp_subf_auth_entry *sauth_entry;
4933
4934 /*
4935 * The address ID of the first flow is implicitly 0.
4936 */
4937 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4938 tp->t_local_aid = 0;
4939 } else {
4940 tp->t_local_aid = addr_id;
4941 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4942 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4943 }
4944 sauth_entry = zalloc(mpt_subauth_zone);
4945 sauth_entry->msae_laddr_id = tp->t_local_aid;
4946 sauth_entry->msae_raddr_id = 0;
4947 sauth_entry->msae_raddr_rand = 0;
4948 try_again:
4949 sauth_entry->msae_laddr_rand = RandomULong();
4950 if (sauth_entry->msae_laddr_rand == 0) {
4951 goto try_again;
4952 }
4953 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4954 }
4955
4956 static void
4957 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4958 {
4959 struct mptcp_subf_auth_entry *sauth_entry;
4960 struct tcpcb *tp = NULL;
4961 int found = 0;
4962
4963 tp = sototcpcb(so);
4964 if (tp == NULL) {
4965 return;
4966 }
4967
4968 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4969 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4970 found = 1;
4971 break;
4972 }
4973 }
4974 if (found) {
4975 LIST_REMOVE(sauth_entry, msae_next);
4976 zfree(mpt_subauth_zone, sauth_entry);
4977 }
4981 }
4982
4983 void
4984 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4985 u_int32_t *rrand)
4986 {
4987 struct mptcp_subf_auth_entry *sauth_entry;
4988
4989 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4990 if (sauth_entry->msae_laddr_id == addr_id) {
4991 if (lrand) {
4992 *lrand = sauth_entry->msae_laddr_rand;
4993 }
4994 if (rrand) {
4995 *rrand = sauth_entry->msae_raddr_rand;
4996 }
4997 break;
4998 }
4999 }
5000 }
5001
5002 void
5003 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5004 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5005 {
5006 struct mptcp_subf_auth_entry *sauth_entry;
5007
5008 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5009 if (sauth_entry->msae_laddr_id == laddr_id) {
5010 if ((sauth_entry->msae_raddr_id != 0) &&
5011 (sauth_entry->msae_raddr_id != raddr_id)) {
5012 os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5013 " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5014 raddr_id, sauth_entry->msae_raddr_id);
5015 return;
5016 }
5017 sauth_entry->msae_raddr_id = raddr_id;
5018 if ((sauth_entry->msae_raddr_rand != 0) &&
5019 (sauth_entry->msae_raddr_rand != raddr_rand)) {
5020 os_log_error(mptcp_log_handle, "%s - %lx: "
5021 "dup SYN_ACK %d %d \n",
5022 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5023 raddr_rand, sauth_entry->msae_raddr_rand);
5024 return;
5025 }
5026 sauth_entry->msae_raddr_rand = raddr_rand;
5027 return;
5028 }
5029 }
5030 }
5031
5032 /*
5033 * SHA1 support for MPTCP
5034 */
5035 static void
5036 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5037 {
5038 SHA1_CTX sha1ctxt;
5039 const unsigned char *sha1_base;
5040 int sha1_size;
5041
5042 sha1_base = (const unsigned char *) key;
5043 sha1_size = sizeof(mptcp_key_t);
5044 SHA1Init(&sha1ctxt);
5045 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5046 SHA1Final(sha_digest, &sha1ctxt);
5047 }
5048
5049 void
5050 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5051 u_int32_t rand1, u_int32_t rand2, u_char *digest)
5052 {
5053 SHA1_CTX sha1ctxt;
5054 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5055 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5056 u_int32_t data[2];
5057 int i;
5058
5059 bzero(digest, SHA1_RESULTLEN);
5060
5061 /* Set up the Key for HMAC */
5062 key_ipad[0] = key1;
5063 key_ipad[1] = key2;
5064
5065 key_opad[0] = key1;
5066 key_opad[1] = key2;
5067
5068 /* Set up the message for HMAC */
5069 data[0] = rand1;
5070 data[1] = rand2;
5071
5072 /* The padded key already fills one 512-bit SHA1 block, so no need to hash it first */
5073
5074 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5075
5076 for (i = 0; i < 8; i++) {
5077 key_ipad[i] ^= 0x3636363636363636;
5078 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5079 }
5080
5081 /* Perform inner SHA1 */
5082 SHA1Init(&sha1ctxt);
5083 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5084 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5085 SHA1Final(digest, &sha1ctxt);
5086
5087 /* Perform outer SHA1 */
5088 SHA1Init(&sha1ctxt);
5089 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5090 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5091 SHA1Final(digest, &sha1ctxt);
5092 }
5093
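/*
 * The routine above is standard HMAC (RFC 2104) with SHA1 as H and a
 * 64-byte block:
 *
 *	HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m))
 *
 * where K is key1 || key2 zero-padded to the block size, m is
 * rand1 || rand2, ipad is 0x36 repeated and opad is 0x5c repeated.
 * Usage sketch (mirroring mptcp_get_hmac() below):
 *
 *	u_char mac[SHA1_RESULTLEN];
 *	mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey,
 *	    lrand, rrand, mac);
 *
 * Per RFC 6824, an MP_JOIN SYN/ACK carries only the leftmost 64 bits
 * of the 160-bit digest.
 */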
5094 /*
5095 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5096 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5097 */
5098 void
5099 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
5100 {
5101 uint32_t lrand, rrand;
5102
5103 lrand = rrand = 0;
5104 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5105 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
5106 digest);
5107 }
5108
5109 /*
5110 * Authentication data generation
5111 */
5112 static void
5113 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5114 int token_len)
5115 {
5116 VERIFY(token_len == sizeof(u_int32_t));
5117 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5118
5119 /* Most significant 32 bits of the SHA1 hash */
5120 bcopy(sha_digest, token, sizeof(u_int32_t));
5121 return;
5122 }
5123
5124 static void
5125 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5126 int idsn_len)
5127 {
5128 VERIFY(idsn_len == sizeof(u_int64_t));
5129 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5130
5131 /*
5132 * Least significant 64 bits of the SHA1 hash
5133 */
5134
5135 idsn[7] = sha_digest[12];
5136 idsn[6] = sha_digest[13];
5137 idsn[5] = sha_digest[14];
5138 idsn[4] = sha_digest[15];
5139 idsn[3] = sha_digest[16];
5140 idsn[2] = sha_digest[17];
5141 idsn[1] = sha_digest[18];
5142 idsn[0] = sha_digest[19];
5143 return;
5144 }
5145
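/*
 * Worked example (sketch, with hypothetical digest bytes): if
 * SHA1(key) = d[0..19] and d[12..19] = 11 22 33 44 55 66 77 88, the
 * token is d[0..3] copied as-is (the most significant 32 bits), while
 * the byte reversal above stores 88 77 66 55 44 33 22 11 into
 * idsn[0..7], so a little-endian host reads the u_int64_t
 * 0x1122334455667788 - the same numeric value the peer derives from
 * the big-endian digest tail.
 */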
5146 static void
5147 mptcp_conn_properties(struct mptcb *mp_tp)
5148 {
5149 /* There is only Version 0 at this time */
5150 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
5151
5152 /* Set DSS checksum flag */
5153 if (mptcp_dss_csum) {
5154 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5155 }
5156
5157 /* Set up receive window */
5158 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5159
5160 /* Set up gc ticks */
5161 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5162 }
5163
5164 static void
5165 mptcp_init_local_parms(struct mptses *mpte)
5166 {
5167 struct mptcb *mp_tp = mpte->mpte_mptcb;
5168 char key_digest[SHA1_RESULTLEN];
5169
5170 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5171 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5172
5173 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
5174 (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5175 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
5176 (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t));
5177
5178 /* The subflow SYN is also the first MPTCP byte */
5179 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5180 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5181
5182 mptcp_conn_properties(mp_tp);
5183 }
5184
5185 int
5186 mptcp_init_remote_parms(struct mptcb *mp_tp)
5187 {
5188 char remote_digest[SHA1_RESULTLEN];
5189
5190 /* Only Version 0 is supported for auth purposes */
5191 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) {
5192 return -1;
5193 }
5194
5195 /* Setup local and remote tokens and Initial DSNs */
5196 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5197 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
5198 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5199 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
5200 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t));
5201 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5202 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5203
5204 return 0;
5205 }
5206
5207 static void
5208 mptcp_send_dfin(struct socket *so)
5209 {
5210 struct tcpcb *tp = NULL;
5211 struct inpcb *inp = NULL;
5212
5213 inp = sotoinpcb(so);
5214 if (!inp) {
5215 return;
5216 }
5217
5218 tp = intotcpcb(inp);
5219 if (!tp) {
5220 return;
5221 }
5222
5223 if (!(tp->t_mpflags & TMPF_RESET)) {
5224 tp->t_mpflags |= TMPF_SEND_DFIN;
5225 }
5226 }
5227
5228 /*
5229 * Data Sequence Mapping routines
5230 */
5231 void
5232 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5233 {
5234 struct mptcb *mp_tp;
5235
5236 if (m == NULL) {
5237 return;
5238 }
5239
5240 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5241
5242 while (m) {
5243 VERIFY(m->m_flags & M_PKTHDR);
5244 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5245 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5246 VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5247 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5248 mp_tp->mpt_sndmax += m_pktlen(m);
5249 m = m->m_next;
5250 }
5251 }
5252
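/*
 * Worked example (sketch): with mpt_sndmax at 1000 and a chain of two
 * packet mbufs of 100 and 50 bytes, the first is stamped mp_dsn 1000 /
 * mp_rlen 100, the second mp_dsn 1100 / mp_rlen 50, and mpt_sndmax
 * advances to 1150.
 */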
5253 void
5254 mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
5255 {
5256 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
5257 uint64_t data_ack;
5258 uint64_t dsn;
5259
5260 VERIFY(len >= 0);
5261
5262 if (!m || len == 0) {
5263 return;
5264 }
5265
5266 while (m && len > 0) {
5267 VERIFY(m->m_flags & M_PKTHDR);
5268 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5269
5270 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
5271 dsn = m->m_pkthdr.mp_dsn;
5272
5273 len -= m->m_len;
5274 m = m->m_next;
5275 }
5276
5277 if (m && len == 0) {
5278 /*
5279 * If there is one more mbuf in the chain, it automatically means
5280 * that up to m->mp_dsn has been ack'ed.
5281 *
5282 * This means, we actually correct data_ack back down (compared
5283 * to what we set inside the loop - dsn + data_len). Because in
5284 * the loop we are "optimistic" and assume that the full mapping
5285 * will be acked. If that's not the case and we get out of the
5286 * loop with m != NULL, it means only up to m->mp_dsn has been
5287 * really acked.
5288 */
5289 data_ack = m->m_pkthdr.mp_dsn;
5290 }
5291
5292 if (len < 0) {
5293 /*
5294 * If len is negative, meaning we acked in the middle of an mbuf,
5295 * only up to this mbuf's data-sequence number has been acked
5296 * at the MPTCP-level.
5297 */
5298 data_ack = dsn;
5299 }
5300
5301 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
5302 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5303
5304 /* We can have data in the subflow's send-queue that is being acked,
5305 * while the DATA_ACK has already advanced. Thus, we should check whether
5306 * or not the DATA_ACK is actually new here.
5307 */
5308 if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
5309 MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
5310 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
5311 }
5312 }
5313
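/*
 * Worked example (sketch): with two mappings in the send buffer,
 * m1 (mp_dsn 1000, m_len = mp_rlen = 100) and m2 (mp_dsn 1100,
 * m_len = mp_rlen = 100), and len = 150, the loop consumes m1 and
 * exits inside m2 with len = -50; data_ack therefore falls back to
 * m2's dsn, 1100, because only the start of m2's mapping is known to
 * be covered at the MPTCP level.
 */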
5314 void
5315 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
5316 {
5317 int rewinding = 0;
5318
5319 /* TFO makes things complicated. */
5320 if (so->so_flags1 & SOF1_TFO_REWIND) {
5321 rewinding = 1;
5322 so->so_flags1 &= ~SOF1_TFO_REWIND;
5323 }
5324
5325 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
5326 u_int32_t sub_len;
5327 VERIFY(m->m_flags & M_PKTHDR);
5328 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5329
5330 sub_len = m->m_pkthdr.mp_rlen;
5331
5332 if (sub_len < len) {
5333 m->m_pkthdr.mp_dsn += sub_len;
5334 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5335 m->m_pkthdr.mp_rseq += sub_len;
5336 }
5337 m->m_pkthdr.mp_rlen = 0;
5338 len -= sub_len;
5339 } else {
5340 /* sub_len >= len */
5341 if (rewinding == 0) {
5342 m->m_pkthdr.mp_dsn += len;
5343 }
5344 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5345 if (rewinding == 0) {
5346 m->m_pkthdr.mp_rseq += len;
5347 }
5348 }
5349 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
5350 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
5351 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
5352 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5353 m->m_pkthdr.mp_rlen -= len;
5354 break;
5355 }
5356 m = m->m_next;
5357 }
5358
5359 if (so->so_flags & SOF_MP_SUBFLOW &&
5360 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
5361 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
5362 /*
5363 * Received an ack without receiving a DATA_ACK.
5364 * Need to fallback to regular TCP (or destroy this subflow).
5365 */
5366 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
5367 mptcp_notify_mpfail(so);
5368 }
5369 }
5370
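/*
 * Worked example (sketch): dropping len = 200 from a mapping with
 * mp_dsn 1000, mp_rseq 100 and mp_rlen 500 takes the else-branch above
 * (sub_len >= len) and leaves the mbuf describing the remainder:
 * mp_dsn 1200, mp_rseq 300, mp_rlen 300.
 */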
5371 /* Obtain the DSN mapping stored in the mbuf */
5372 void
5373 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5374 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5375 {
5376 u_int64_t dsn64;
5377
5378 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5379 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5380 }
5381
5382 void
5383 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5384 uint32_t *relseq, uint16_t *data_len,
5385 uint16_t *dss_csum)
5386 {
5387 struct mbuf *m = so->so_snd.sb_mb;
5388 int off_orig = off;
5389
5390 VERIFY(off >= 0);
5391
5392 if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
5393 *dsn = 0;
5394 *relseq = 0;
5395 *data_len = 0;
5396 *dss_csum = 0;
5397 return;
5398 }
5399
5400 /*
5401 * In the subflow socket, the DSN sequencing can be discontiguous,
5402 * but the subflow sequence mapping is contiguous. Use the subflow
5403 * sequence property to find the right mbuf and corresponding dsn
5404 * mapping.
5405 */
5406
5407 while (m) {
5408 VERIFY(m->m_flags & M_PKTHDR);
5409 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5410
5411 if (off >= m->m_len) {
5412 off -= m->m_len;
5413 m = m->m_next;
5414 } else {
5415 break;
5416 }
5417 }
5418
5419 VERIFY(off >= 0);
5420 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5421
5422 *dsn = m->m_pkthdr.mp_dsn;
5423 *relseq = m->m_pkthdr.mp_rseq;
5424 *data_len = m->m_pkthdr.mp_rlen;
5425 *dss_csum = m->m_pkthdr.mp_csum;
5426
5427 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
5428 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
5429 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5430 }
5431
5432 /*
5433 * Note that this is called only from tcp_input() via mptcp_input_preproc().
5434 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5435 * When it trims data, tcp_input() calls m_adj(), which does not remove the
5436 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5437 * The dsn map insertion cannot be delayed after trim, because data can be in
5438 * the reassembly queue for a while and the DSN option info in tp will be
5439 * overwritten for every new packet received.
5440 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5441 * with mptcp_adj_rmap()
5442 */
5443 void
5444 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5445 {
5446 VERIFY(m->m_flags & M_PKTHDR);
5447 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5448
5449 if (tp->t_mpflags & TMPF_EMBED_DSN) {
5450 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5451 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5452 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5453 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5454 if (tp->t_rcv_map.mpt_dfin) {
5455 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5456 }
5457
5458 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5459
5460 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5461 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5462 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5463 if (th->th_flags & TH_FIN) {
5464 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5465 }
5466 }
5467 }
5468
5469 /*
5470 * Following routines help with failure detection and failover of data
5471 * transfer from one subflow to another.
5472 */
5473 void
5474 mptcp_act_on_txfail(struct socket *so)
5475 {
5476 struct tcpcb *tp = NULL;
5477 struct inpcb *inp = sotoinpcb(so);
5478
5479 if (inp == NULL) {
5480 return;
5481 }
5482
5483 tp = intotcpcb(inp);
5484 if (tp == NULL) {
5485 return;
5486 }
5487
5488 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5489 return;
5490 }
5491
5492 so->so_flags |= SOF_MP_TRYFAILOVER;
5493 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5494 }
5495
5496 /*
5497 * Support for MP_FAIL option
5498 */
5499 int
5500 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5501 {
5502 struct mbuf *m = so->so_snd.sb_mb;
5503 uint16_t datalen;
5504 uint64_t dsn;
5505 int off = 0;
5506
5507 if (m == NULL) {
5508 return -1;
5509 }
5510
5511 while (m != NULL) {
5512 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5513 VERIFY(m->m_flags & M_PKTHDR);
5514 dsn = m->m_pkthdr.mp_dsn;
5515 datalen = m->m_pkthdr.mp_rlen;
5516 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5517 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5518 off = (int)(dsn_fail - dsn);
5519 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5520 return 0;
5521 }
5522
5523 m = m->m_next;
5524 }
5525
5526 /*
5527 * If there was no mbuf data and a fallback to TCP occurred, there's
5528 * not much else to do.
5529 */
5530
5531 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5532 return -1;
5533 }
5534
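/*
 * Worked example (sketch): for a mapping with mp_dsn 1000, mp_rlen 100
 * and mp_rseq 5000, an MP_FAIL carrying dsn_fail 1042 lands inside the
 * mapping, so off = 42 and *tcp_seq = 5042.
 */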
5535 /*
5536 * Support for sending contiguous MPTCP bytes in subflow
5537 * Also for preventing sending data with ACK in 3-way handshake
5538 */
5539 int32_t
5540 mptcp_adj_sendlen(struct socket *so, int32_t off)
5541 {
5542 struct tcpcb *tp = sototcpcb(so);
5543 struct mptsub *mpts = tp->t_mpsub;
5544 uint64_t mdss_dsn;
5545 uint32_t mdss_subflow_seq;
5546 int mdss_subflow_off;
5547 uint16_t mdss_data_len;
5548 uint16_t dss_csum;
5549
5550 if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
5551 return 0;
5552 }
5553
5554 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5555 &mdss_data_len, &dss_csum);
5556
5557 /*
5558 * We need to compute how much of the mapping still remains.
5559 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5560 */
5561 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5562
5563 /*
5564 * When TFO is used, we are sending the mpts->mpts_iss although the relative
5565 * seq has been set to 1 (while it should be 0).
5566 */
5567 if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5568 mdss_subflow_off--;
5569 }
5570
5571 VERIFY(off >= mdss_subflow_off);
5572
5573 return mdss_data_len - (off - mdss_subflow_off);
5574 }
5575
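/*
 * Worked example (sketch): with a mapping of mdss_data_len 500
 * starting at subflow-relative seq 1, mpts_iss 4000 and snd_una 4001
 * (no TFO), mdss_subflow_off is 0; for off = 200 the function returns
 * 500 - (200 - 0) = 300 bytes still to be sent from this mapping.
 */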
5576 static uint32_t
5577 mptcp_get_maxseg(struct mptses *mpte)
5578 {
5579 struct mptsub *mpts;
5580 uint32_t maxseg = 0;
5581
5582 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5583 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5584
5585 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5586 TCPS_HAVERCVDFIN2(tp->t_state)) {
5587 continue;
5588 }
5589
5590 if (tp->t_maxseg > maxseg) {
5591 maxseg = tp->t_maxseg;
5592 }
5593 }
5594
5595 return maxseg;
5596 }
5597
5598 static uint8_t
5599 mptcp_get_rcvscale(struct mptses *mpte)
5600 {
5601 struct mptsub *mpts;
5602 uint8_t rcvscale = UINT8_MAX;
5603
5604 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5605 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5606
5607 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5608 TCPS_HAVERCVDFIN2(tp->t_state)) {
5609 continue;
5610 }
5611
5612 if (tp->rcv_scale < rcvscale) {
5613 rcvscale = tp->rcv_scale;
5614 }
5615 }
5616
5617 return rcvscale;
5618 }
5619
5620 /* Similar to tcp_sbrcv_reserve */
5621 static void
5622 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5623 u_int32_t newsize, u_int32_t idealsize)
5624 {
5625 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5626
5627 /* newsize should not exceed max */
5628 newsize = min(newsize, tcp_autorcvbuf_max);
5629
5630 /* The receive window scale negotiated at the
5631 * beginning of the connection will also set a
5632 * limit on the socket buffer size
5633 */
5634 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5635
5636 /* Set new socket buffer size */
5637 if (newsize > sbrcv->sb_hiwat &&
5638 (sbreserve(sbrcv, newsize) == 1)) {
5639 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5640 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5641
5642 /* Again check the limit set by the advertised
5643 * window scale
5644 */
5645 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5646 TCP_MAXWIN << rcvscale);
5647 }
5648 }
5649
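/*
 * Worked example (sketch): with a negotiated receive scale of 6, the
 * advertised window can never exceed TCP_MAXWIN << 6 = 65535 * 64
 * (about 4 MB), so both newsize and sb_idealsize end up clamped to
 * min(requested, tcp_autorcvbuf_max, 65535 << 6).
 */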
5650 void
5651 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5652 {
5653 struct mptses *mpte = mp_tp->mpt_mpte;
5654 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5655 struct sockbuf *sbrcv = &mp_so->so_rcv;
5656 uint32_t hiwat_sum = 0;
5657 uint32_t ideal_sum = 0;
5658 struct mptsub *mpts;
5659
5660 /*
5661 * Do not grow the receive socket buffer if
5662 * - auto resizing is disabled, globally or on this socket
5663 * - the high water mark already reached the maximum
5664 * - the stream is in background and receive side is being
5665 * throttled
5666 * - if there are segments in reassembly queue indicating loss,
5667 * do not need to increase recv window during recovery as more
5668 * data is not going to be sent. A duplicate ack sent during
5669 * recovery should not change the receive window
5670 */
5671 if (tcp_do_autorcvbuf == 0 ||
5672 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5673 tcp_cansbgrow(sbrcv) == 0 ||
5674 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5675 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5676 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5677 /* Can not resize the socket buffer, just return */
5678 return;
5679 }
5680
5681 /*
5682 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5683 *
5684 * But, for this we first need accurate receiver-RTT estimations, which
5685 * we currently don't have.
5686 *
5687 * Let's use a dummy algorithm for now, just taking the sum of all
5688 * subflows' receive-buffers. It's too low, but that's all we can get
5689 * for now.
5690 */
5691
5692 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5693 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5694 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5695 }
5696
5697 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5698 }
5699
5700 /*
5701 * Determine if we can grow the receive socket buffer to avoid sending
5702 * a zero window update to the peer. We allow even socket buffers that
5703 * have fixed size (set by the application) to grow if the resource
5704 * constraints are met. They will also be trimmed after the application
5705 * reads data.
5706 *
5707 * Similar to tcp_sbrcv_grow_rwin
5708 */
5709 static void
5710 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5711 {
5712 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5713 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5714 u_int32_t rcvbuf = sb->sb_hiwat;
5715
5716 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5717 return;
5718 }
5719
5720 if (tcp_do_autorcvbuf == 1 &&
5721 tcp_cansbgrow(sb) &&
5722 /* Diff to tcp_sbrcv_grow_rwin */
5723 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5724 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5725 rcvbuf < tcp_autorcvbuf_max &&
5726 (sb->sb_idealsize > 0 &&
5727 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5728 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5729 }
5730 }
5731
5732 /* Similar to tcp_sbspace */
5733 int32_t
5734 mptcp_sbspace(struct mptcb *mp_tp)
5735 {
5736 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5737 uint32_t rcvbuf;
5738 int32_t space;
5739 int32_t pending = 0;
5740
5741 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5742
5743 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5744
5745 /* hiwat might have changed */
5746 rcvbuf = sb->sb_hiwat;
5747
5748 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5749 (sb->sb_mbmax - sb->sb_mbcnt)));
5750 if (space < 0) {
5751 space = 0;
5752 }
5753
5754 #if CONTENT_FILTER
5755 /* Compensate for data being processed by content filters */
5756 pending = cfil_sock_data_space(sb);
5757 #endif /* CONTENT_FILTER */
5758 if (pending > space) {
5759 space = 0;
5760 } else {
5761 space -= pending;
5762 }
5763
5764 return space;
5765 }
5766
5767 /*
5768 * Support Fallback to Regular TCP
5769 */
5770 void
5771 mptcp_notify_mpready(struct socket *so)
5772 {
5773 struct tcpcb *tp = NULL;
5774
5775 if (so == NULL) {
5776 return;
5777 }
5778
5779 tp = intotcpcb(sotoinpcb(so));
5780
5781 if (tp == NULL) {
5782 return;
5783 }
5784
5785 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5786 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5787 struct tcpcb *, tp);
5788
5789 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5790 return;
5791 }
5792
5793 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5794 return;
5795 }
5796
5797 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5798 tp->t_mpflags |= TMPF_MPTCP_READY;
5799
5800 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5801 }
5802
5803 void
5804 mptcp_notify_mpfail(struct socket *so)
5805 {
5806 struct tcpcb *tp = NULL;
5807
5808 if (so == NULL) {
5809 return;
5810 }
5811
5812 tp = intotcpcb(sotoinpcb(so));
5813
5814 if (tp == NULL) {
5815 return;
5816 }
5817
5818 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5819 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5820 struct tcpcb *, tp);
5821
5822 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5823 return;
5824 }
5825
5826 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5827 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5828
5829 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5830 }
5831
5832 /*
5833 * Keepalive helper function
5834 */
5835 boolean_t
5836 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5837 {
5838 boolean_t ret = 1;
5839
5840 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5841
5842 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5843 ret = 0;
5844 }
5845 return ret;
5846 }
5847
5848 /*
5849 * MPTCP t_maxseg adjustment function
5850 */
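/*
* Hedged usage sketch (the call sites live in the TCP code, not in this
* file): callers are expected to subtract the returned value from the
* subflow mss so that a DSS option still fits into every segment,
* roughly along the lines of
*
*	mss -= mptcp_adj_mss(tp, FALSE);	// during option processing
*	mss -= mptcp_adj_mss(tp, TRUE);		// during MTU discovery
*/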
5851 int
5852 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5853 {
5854 int mss_lower = 0;
5855 struct mptcb *mp_tp = tptomptp(tp);
5856
5857 #define MPTCP_COMPUTE_LEN { \
5858 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5859 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5860 mss_lower += 2; \
5861 else \
5862 /* adjust to 32-bit boundary + EOL */ \
5863 mss_lower += 2; \
5864 }
5865 if (mp_tp == NULL) {
5866 return 0;
5867 }
5868
5869 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5870
5871 /*
5872 * For the first and all subsequent subflows, adjust the mss by the
5873 * most common MPTCP option size, for the cases where tcp_mss is
5874 * called during option processing and during MTU discovery.
5875 */
5876 if (!mtudisc) {
5877 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5878 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5879 MPTCP_COMPUTE_LEN;
5880 }
5881
5882 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5883 tp->t_mpflags & TMPF_SENT_JOIN) {
5884 MPTCP_COMPUTE_LEN;
5885 }
5886 } else {
5887 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5888 MPTCP_COMPUTE_LEN;
5889 }
5890 }
5891
5892 return mss_lower;
5893 }
5894
5895 /*
5896 * Update the pid, upid, uuid of the subflow so, based on parent so
5897 */
5898 void
5899 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5900 {
5901 if (so->last_pid != mp_so->last_pid ||
5902 so->last_upid != mp_so->last_upid) {
5903 so->last_upid = mp_so->last_upid;
5904 so->last_pid = mp_so->last_pid;
5905 uuid_copy(so->last_uuid, mp_so->last_uuid);
5906 }
5907 so_update_policy(so);
5908 }
5909
5910 static void
5911 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5912 {
5913 struct inpcb *inp;
5914
5915 tcp_getconninfo(so, &flow->flow_ci);
5916 inp = sotoinpcb(so);
5917 if ((inp->inp_vflag & INP_IPV6) != 0) {
5918 flow->flow_src.ss_family = AF_INET6;
5919 flow->flow_dst.ss_family = AF_INET6;
5920 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5921 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5922 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5923 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5924 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5925 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5926 } else if ((inp->inp_vflag & INP_IPV4) != 0) {
5927 flow->flow_src.ss_family = AF_INET;
5928 flow->flow_dst.ss_family = AF_INET;
5929 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5930 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5931 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5932 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5933 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5934 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5935 }
5936 flow->flow_len = sizeof(*flow);
5937 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5938 flow->flow_flags = mpts->mpts_flags;
5939 flow->flow_cid = mpts->mpts_connid;
5940 flow->flow_relseq = mpts->mpts_rel_seq;
5941 flow->flow_soerror = mpts->mpts_socket->so_error;
5942 flow->flow_probecnt = mpts->mpts_probecnt;
5943 }
5944
5945 static int
5946 mptcp_pcblist SYSCTL_HANDLER_ARGS
5947 {
5948 #pragma unused(oidp, arg1, arg2)
5949 int error = 0, f;
5950 size_t len;
5951 struct mppcb *mpp;
5952 struct mptses *mpte;
5953 struct mptcb *mp_tp;
5954 struct mptsub *mpts;
5955 struct socket *so;
5956 conninfo_mptcp_t mptcpci;
5957 mptcp_flow_t *flows = NULL;
5958
5959 if (req->newptr != USER_ADDR_NULL) {
5960 return EPERM;
5961 }
5962
5963 lck_mtx_lock(&mtcbinfo.mppi_lock);
5964 if (req->oldptr == USER_ADDR_NULL) {
5965 size_t n = mtcbinfo.mppi_count;
5966 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5967 req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
5968 4 * (n + n / 8) * sizeof(mptcp_flow_t);
5969 return 0;
5970 }
5971 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5972 flows = NULL;
5973 socket_lock(mpp->mpp_socket, 1);
5974 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5975 mpte = mptompte(mpp);
5976
5977 socket_lock_assert_owned(mptetoso(mpte));
5978 mp_tp = mpte->mpte_mptcb;
5979
5980 bzero(&mptcpci, sizeof(mptcpci));
5981 mptcpci.mptcpci_state = mp_tp->mpt_state;
5982 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5983 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5984 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5985 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5986 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5987 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5988 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5989 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5990 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5991 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5992 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
5993 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5994 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5995
5996 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
5997 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5998 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5999 mptcpci.mptcpci_flow_offset =
6000 offsetof(conninfo_mptcp_t, mptcpci_flows);
6001
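/*
* conninfo_mptcp_t embeds one mptcp_flow_t record, which is why
* mptcpci_len below accounts for nflows - 1 extra records and the
* fixed part is emitted without the embedded record.
*/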
6002 len = sizeof(*flows) * mpte->mpte_numflows;
6003 if (mpte->mpte_numflows != 0) {
6004 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
6005 if (flows == NULL) {
6006 socket_unlock(mpp->mpp_socket, 1);
6007 break;
6008 }
6009 mptcpci.mptcpci_len = sizeof(mptcpci) +
6010 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
6011 error = SYSCTL_OUT(req, &mptcpci,
6012 sizeof(mptcpci) - sizeof(mptcp_flow_t));
6013 } else {
6014 mptcpci.mptcpci_len = sizeof(mptcpci);
6015 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
6016 }
6017 if (error) {
6018 socket_unlock(mpp->mpp_socket, 1);
6019 FREE(flows, M_TEMP);
6020 break;
6021 }
6022 f = 0;
6023 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6024 so = mpts->mpts_socket;
6025 fill_mptcp_subflow(so, &flows[f], mpts);
6026 f++;
6027 }
6028 socket_unlock(mpp->mpp_socket, 1);
6029 if (flows) {
6030 error = SYSCTL_OUT(req, flows, len);
6031 FREE(flows, M_TEMP);
6032 if (error) {
6033 break;
6034 }
6035 }
6036 }
6037 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6038
6039 return error;
6040 }
6041
6042 SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
6043 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
6044 "List of active MPTCP connections");
6045
6046 /*
6047 * Set notsent lowat mark on the MPTCB
6048 */
6049 int
6050 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6051 {
6052 struct mptcb *mp_tp = NULL;
6053 int error = 0;
6054
6055 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6056 mp_tp = mpte->mpte_mptcb;
6057 }
6058
6059 if (mp_tp) {
6060 mp_tp->mpt_notsent_lowat = optval;
6061 } else {
6062 error = EINVAL;
6063 }
6064
6065 return error;
6066 }
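/*
* Illustrative userland sketch, assuming the usual socket-option plumbing
* (mptcp_usrreq.c) routes TCP_NOTSENT_LOWAT to this function:
*
*	int lowat = 16 * 1024;	// wake the writer once unsent data < 16 KB
*	setsockopt(mp_fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
*/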
6067
6068 u_int32_t
6069 mptcp_get_notsent_lowat(struct mptses *mpte)
6070 {
6071 struct mptcb *mp_tp = NULL;
6072
6073 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6074 mp_tp = mpte->mpte_mptcb;
6075 }
6076
6077 if (mp_tp) {
6078 return mp_tp->mpt_notsent_lowat;
6079 } else {
6080 return 0;
6081 }
6082 }
6083
6084 int
6085 mptcp_notsent_lowat_check(struct socket *so)
6086 {
6087 struct mptses *mpte;
6088 struct mppcb *mpp;
6089 struct mptcb *mp_tp;
6090 struct mptsub *mpts;
6091
6092 int notsent = 0;
6093
6094 mpp = mpsotomppcb(so);
6095 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
6096 return 0;
6097 }
6098
6099 mpte = mptompte(mpp);
6100 socket_lock_assert_owned(mptetoso(mpte));
6101 mp_tp = mpte->mpte_mptcb;
6102
6103 notsent = so->so_snd.sb_cc;
6104
6105 if ((notsent == 0) ||
6106 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
6107 mp_tp->mpt_notsent_lowat)) {
6108 mptcplog((LOG_DEBUG, "MPTCP Sender: "
6109 "lowat %d notsent %d actual %llu \n",
6110 mp_tp->mpt_notsent_lowat, notsent,
6111 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
6112 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6113 return 1;
6114 }
6115
6116 /* When Nagle's algorithm is not disabled, it is better
6117 * to wake up the client even before there is at least one
6118 * maxseg of data to write.
6119 */
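/*
* Worked example (illustrative numbers only): with 1000 bytes in the
* send buffer, 400 of them already in flight on the active subflow and
* t_maxseg = 1448, notsent = 600 is a sub-MSS residue that Nagle would
* hold back, so the writer is woken up (retval = 1) instead of waiting.
*/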
6120 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6121 int retval = 0;
6122 if (mpts->mpts_flags & MPTSF_ACTIVE) {
6123 struct socket *subf_so = mpts->mpts_socket;
6124 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
6125
6126 notsent = so->so_snd.sb_cc -
6127 (tp->snd_nxt - tp->snd_una);
6128
6129 if ((tp->t_flags & TF_NODELAY) == 0 &&
6130 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
6131 retval = 1;
6132 }
6133 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
6134 " nodelay false \n",
6135 mp_tp->mpt_notsent_lowat, notsent),
6136 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6137 return retval;
6138 }
6139 }
6140 return 0;
6141 }
6142
6143 static errno_t
6144 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6145 void **unitinfo)
6146 {
6147 #pragma unused(kctlref, sac, unitinfo)
6148
6149 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6150 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6151 }
6152
6153 mptcp_kern_skt_unit = sac->sc_unit;
6154
6155 return 0;
6156 }
6157
6158 static void
6159 mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
6160 {
6161 struct mppcb *mpp;
6162
6163 /* Iterate over all MPTCP connections */
6164
6165 lck_mtx_lock(&mtcbinfo.mppi_lock);
6166
6167 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6168 struct socket *mp_so = mpp->mpp_socket;
6169 struct mptses *mpte = mpp->mpp_pcbe;
6170
6171 socket_lock(mp_so, 1);
6172
6173 if (mp_so->so_flags & SOF_DELEGATED &&
6174 uuid_compare(uuid, mp_so->e_uuid)) {
6175 goto next;
6176 } else if (!(mp_so->so_flags & SOF_DELEGATED) &&
6177 uuid_compare(uuid, mp_so->last_uuid)) {
6178 goto next;
6179 }
6180
6181 os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
6182 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);
6183
6184 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
6185
6186 if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
6187 mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
6188 }
6189
6190 mptcp_check_subflows_and_add(mpte);
6191 mptcp_remove_subflows(mpte);
6192
6193 mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);
6194
6195 next:
6196 socket_unlock(mp_so, 1);
6197 }
6198
6199 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6200 }
6201
6202 static void
6203 mptcp_wifi_status_changed(void)
6204 {
6205 struct mppcb *mpp;
6206
6207 /* Iterate over all MPTCP connections */
6208
6209 lck_mtx_lock(&mtcbinfo.mppi_lock);
6210
6211 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6212 struct socket *mp_so = mpp->mpp_socket;
6213 struct mptses *mpte = mpp->mpp_pcbe;
6214
6215 socket_lock(mp_so, 1);
6216
6217 /* Only handover- and target-based mode are purely driven by Symptoms' Wi-Fi status */
6218 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6219 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6220 goto next;
6221 }
6222
6223 mptcp_check_subflows_and_add(mpte);
6224 mptcp_check_subflows_and_remove(mpte);
6225
6226 next:
6227 socket_unlock(mp_so, 1);
6228 }
6229
6230 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6231 }
6232
6233 void
6234 mptcp_ask_symptoms(struct mptses *mpte)
6235 {
6236 struct mptcp_symptoms_ask_uuid ask;
6237 struct socket *mp_so;
6238 struct proc *p;
6239 int pid, prio, err;
6240
6241 if (mptcp_kern_skt_unit == 0) {
6242 os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
6243 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6244 return;
6245 }
6246
6247 mp_so = mptetoso(mpte);
6248
6249 if (mp_so->so_flags & SOF_DELEGATED) {
6250 pid = mp_so->e_pid;
6251 } else {
6252 pid = mp_so->last_pid;
6253 }
6254
6255 p = proc_find(pid);
6256 if (p == PROC_NULL) {
6257 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
6258 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
6259 return;
6260 }
6261
6262 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6263
6264 if (mp_so->so_flags & SOF_DELEGATED) {
6265 uuid_copy(ask.uuid, mp_so->e_uuid);
6266 } else {
6267 uuid_copy(ask.uuid, mp_so->last_uuid);
6268 }
6269
6270 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
6271
6272 if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
6273 prio == TASK_DARWINBG_APPLICATION) {
6274 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
6275 } else if (prio == TASK_FOREGROUND_APPLICATION) {
6276 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
6277 } else {
6278 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
6279 }
6280
6281 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
6282 &ask, sizeof(ask), CTL_DATA_EOR);
6283
6284 os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
6285 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
6286
6287
6288 proc_rele(p);
6289 }
6290
6291 static errno_t
6292 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6293 void *unitinfo)
6294 {
6295 #pragma unused(kctlref, kcunit, unitinfo)
6296
6297 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6298
6299 return 0;
6300 }
6301
6302 static errno_t
6303 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6304 mbuf_t m, int flags)
6305 {
6306 #pragma unused(kctlref, unitinfo, flags)
6307 symptoms_advisory_t *sa = NULL;
6308
6309 if (kcunit != mptcp_kern_skt_unit) {
6310 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6311 __func__, kcunit, mptcp_kern_skt_unit);
6312 }
6313
6314 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6315 mbuf_freem(m);
6316 return EINVAL;
6317 }
6318
6319 if (mbuf_len(m) < sizeof(*sa)) {
6320 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6321 __func__, mbuf_len(m), sizeof(*sa));
6322 mbuf_freem(m);
6323 return EINVAL;
6324 }
6325
6326 sa = mbuf_data(m);
6327
6328 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6329 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6330 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6331 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6332
6333 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6334 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6335 mptcp_wifi_status_changed();
6336 }
6337 } else {
6338 struct mptcp_symptoms_answer answer;
6339 errno_t err;
6340
6341 /* We temporarily allow different sizes for ease of submission */
6342 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6343 mbuf_len(m) != sizeof(answer)) {
6344 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6345 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6346 sizeof(answer));
6347 mbuf_freem(m);
6348 return EINVAL;
6349 }
6350
6351 memset(&answer, 0, sizeof(answer));
6352
6353 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6354 if (err) {
6355 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6356 mbuf_freem(m);
6357 return err;
6358 }
6359
6360 mptcp_allow_uuid(answer.uuid, answer.rssi);
6361 }
6362
6363 mbuf_freem(m);
6364 return 0;
6365 }
6366
6367 void
6368 mptcp_control_register(void)
6369 {
6370 /* Set up the advisory control socket */
6371 struct kern_ctl_reg mptcp_kern_ctl;
6372
6373 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6374 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6375 sizeof(mptcp_kern_ctl.ctl_name));
6376 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6377 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6378 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6379 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6380
6381 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6382 }
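/*
* Illustrative privileged-userland sketch (not part of this file): how a
* client such as symptomsd would attach to the control socket registered
* above, using only the standard kernel-control API.
*
*	struct ctl_info info;
*	struct sockaddr_ctl addr;
*	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
*
*	bzero(&info, sizeof(info));
*	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(info.ctl_name));
*	ioctl(fd, CTLIOCGINFO, &info);
*
*	bzero(&addr, sizeof(addr));
*	addr.sc_len = sizeof(addr);
*	addr.sc_family = AF_SYSTEM;
*	addr.ss_sysaddr = AF_SYS_CONTROL;
*	addr.sc_id = info.ctl_id;
*	addr.sc_unit = 0;	// kernel picks a unit; ctl_connect records it
*	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
*
*	// then send()/recv() symptoms_advisory_t messages on fd
*/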
6383
6384 /*
6385 * Three return-values:
6386 * 1 : WiFi is bad
6387 * 0 : WiFi is good
6388 * -1 : WiFi-state is unknown
6389 */
6390 int
6391 mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
6392 {
6393 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6394 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6395 mptcp_advisory.sa_wifi_status) {
6396 return symptoms_is_wifi_lossy() ? 1 : 0;
6397 }
6398
6399 /*
6400 * If it's a first-party app and we don't have any info
6401 * about the Wi-Fi state, let's be pessimistic.
6402 */
6403 return -1;
6404 } else {
6405 if (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) {
6406 return 1;
6407 }
6408
6409 /*
6410 * If we are target-based (meaning we may be more lax about what
6411 * counts as "unusable"), we only *know* about the Wi-Fi state once
6412 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6413 *
6414 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6415 * be set.
6416 *
6417 * In any other case (while in target-mode), consider WiFi bad
6418 * and we are going to ask for allowance from Symptoms anyway.
6419 */
6420 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6421 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6422 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6423 return 0;
6424 }
6425
6426 return 1;
6427 }
6428
6429 return 0;
6430 }
6431 }
6432
6433 boolean_t
6434 symptoms_is_wifi_lossy(void)
6435 {
6436 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6437 }
6438
6439 /* If TFO data is successfully acked, it must be dropped from the MPTCP socket */
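/*
* Worked example (illustrative numbers only): if 100 bytes sit between
* mpt_snduna and mpt_sndnxt (mp_droplen = 100) but the SYN carried and
* got acked only 60 of them (tcp_droplen = snd_una - iss - 1 = 60),
* mpt_sndnxt is rewound so the 40 unacked bytes will be resent, and the
* 60 acked bytes are dropped from the MPTCP send buffer.
*/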
6440 static void
6441 mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
6442 {
6443 struct socket *mp_so = mptetoso(mpte);
6444 struct socket *so = mpts->mpts_socket;
6445 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
6446 struct mptcb *mp_tp = mpte->mpte_mptcb;
6447
6448 /* If data was sent with SYN, rewind state */
6449 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
6450 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
6451 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
6452
6453 VERIFY(mp_droplen <= (UINT_MAX));
6454 VERIFY(mp_droplen >= tcp_droplen);
6455
6456 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
6457 mpts->mpts_iss += tcp_droplen;
6458 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
6459
6460 if (mp_droplen > tcp_droplen) {
6461 /* handle partial TCP ack */
6462 mp_so->so_flags1 |= SOF1_TFO_REWIND;
6463 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
6464 mp_droplen = tcp_droplen;
6465 } else {
6466 /* all data on SYN was acked */
6467 mpts->mpts_rel_seq = 1;
6468 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
6469 }
6470 mp_tp->mpt_sndmax -= tcp_droplen;
6471
6472 if (mp_droplen != 0) {
6473 VERIFY(mp_so->so_snd.sb_mb != NULL);
6474 sbdrop(&mp_so->so_snd, (int)mp_droplen);
6475 }
6476 }
6477 }
6478
6479 int
6480 mptcp_freeq(struct mptcb *mp_tp)
6481 {
6482 struct tseg_qent *q;
6483 int rv = 0;
6484
6485 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6486 LIST_REMOVE(q, tqe_q);
6487 m_freem(q->tqe_m);
6488 zfree(tcp_reass_zone, q);
6489 rv = 1;
6490 }
6491 mp_tp->mpt_reassqlen = 0;
6492 return rv;
6493 }
6494
6495 static int
6496 mptcp_post_event(u_int32_t event_code, int value)
6497 {
6498 struct kev_mptcp_data event_data;
6499 struct kev_msg ev_msg;
6500
6501 memset(&ev_msg, 0, sizeof(ev_msg));
6502
6503 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6504 ev_msg.kev_class = KEV_NETWORK_CLASS;
6505 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6506 ev_msg.event_code = event_code;
6507
6508 event_data.value = value;
6509
6510 ev_msg.dv[0].data_ptr = &event_data;
6511 ev_msg.dv[0].data_length = sizeof(event_data);
6512
6513 return kev_post_msg(&ev_msg);
6514 }
6515
6516 static void
6517 mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
6518 {
6519 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
6520 int error;
6521
6522 /* First-party apps (Siri) don't flip the cellicon */
6523 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6524 return;
6525 }
6526
6527 /* Subflow is disappearing - don't set it on this one */
6528 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
6529 return;
6530 }
6531
6532 /* Fallen back connections are not triggering the cellicon */
6533 if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
6534 return;
6535 }
6536
6537 /* Remember the last time we set the cellicon. Needed for debouncing */
6538 mpte->mpte_last_cellicon_set = tcp_now;
6539
6540 tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
6541 tcp_sched_timers(tp);
6542
6543 if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
6544 mpte->mpte_cellicon_increments != 0) {
6545 if (mptcp_cellicon_refcount == 0) {
6546 os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
6547 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6548
6549 /* Continue, so that the icon gets set... */
6550 } else {
6551 /*
6552 * In this case, the cellicon is already set. No need to bump it
6553 * even higher
6554 */
6555
6556 return;
6557 }
6558 }
6559
6560 /* When tearing down this subflow, we need to decrement the
6561 * reference counter
6562 */
6563 mpts->mpts_flags |= MPTSF_CELLICON_SET;
6564
6565 /* Track our own increments, so that when the session gets destroyed
6566 * we can decrement the global reference counter by whatever is left
6567 */
6568 mpte->mpte_cellicon_increments++;
6569
6570 if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
6571 /* If cellicon is already set, get out of here! */
6572 return;
6573 }
6574
6575 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6576
6577 if (error) {
6578 os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
6579 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
6580 } else {
6581 os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
6582 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6583 }
6584 }
6585
6586 void
6587 mptcp_clear_cellicon(void)
6588 {
6589 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6590
6591 if (error) {
6592 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6593 __func__, error);
6594 } else {
6595 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6596 __func__);
6597 }
6598 }
6599
6600 /*
6601 * Returns true if this decrement flipped the icon back to WiFi.
6602 */
6603 static boolean_t
6604 __mptcp_unset_cellicon(uint32_t val)
6605 {
6606 VERIFY(val < INT32_MAX);
/* OSAddAtomic returns the previous value; it equals val iff we just hit zero */
6607 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != (int32_t)val) {
6608 return false;
6609 }
6610
6611 mptcp_clear_cellicon();
6612
6613 return true;
6614 }
6615
6616 void
6617 mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
6618 {
6619 /* First-party apps (Siri) don't flip the cellicon */
6620 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6621 return;
6622 }
6623
6624 if (mpte->mpte_cellicon_increments == 0) {
6625 /* This flow never used cell - get out of here! */
6626 return;
6627 }
6628
6629 if (mptcp_cellicon_refcount == 0) {
6630 os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
6631 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6632
6633 return;
6634 }
6635
6636 if (mpts) {
6637 if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
6638 return;
6639 }
6640
6641 mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
6642 }
6643
6644 if (mpte->mpte_cellicon_increments < val) {
6645 os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
6646 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
6647 val = mpte->mpte_cellicon_increments;
6648 }
6649
6650 mpte->mpte_cellicon_increments -= val;
6651
6652 if (__mptcp_unset_cellicon(val) == false) {
6653 return;
6654 }
6655
6656 /* All flows are gone - our counter should be at zero too! */
6657 if (mpte->mpte_cellicon_increments != 0) {
6658 os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
6659 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6660 }
6661 }
6662
6663 void
6664 mptcp_reset_rexmit_state(struct tcpcb *tp)
6665 {
6666 struct mptsub *mpts;
6667 struct inpcb *inp;
6668 struct socket *so;
6669
6670 inp = tp->t_inpcb;
6671 if (inp == NULL) {
6672 return;
6673 }
6674
6675 so = inp->inp_socket;
6676 if (so == NULL) {
6677 return;
6678 }
6679
6680 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6681 return;
6682 }
6683
6684 mpts = tp->t_mpsub;
6685
6686 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6687 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6688 }
6689
6690 void
6691 mptcp_reset_keepalive(struct tcpcb *tp)
6692 {
6693 struct mptsub *mpts = tp->t_mpsub;
6694
6695 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6696 }