bsd/netinet/mptcp_subr.c
/*
 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/locks.h>
#include <kern/policy_internal.h>
#include <kern/zalloc.h>

#include <mach/sdt.h>

#include <sys/domain.h>
#include <sys/kdebug.h>
#include <sys/kern_control.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <net/content_filter.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_timer.h>
#include <libkern/crypto/sha1.h>
#if INET6
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#endif /* INET6 */
#include <dev/random/randomdev.h>

/*
 * Notes on MPTCP implementation.
 *
 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
 * communication domain. The structure mtcbinfo describes the MPTCP instance
 * of a Multipath protocol in that domain. It is used to keep track of all
 * MPTCP PCB instances in the system, and is protected by the global lock
 * mppi_lock.
 *
 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
 * allocated from the same memory block, and each structure has a pointer
 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
 * PCB (mppcb) as well as the MPTCP Session (mptses).
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets. Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure. Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow. This gets decremented prior to the subflow's destruction.
 *
 * To handle events (read, write, control) from the subflows, we do direct
 * upcalls into the specific function.
 *
 * The whole MPTCP connection is protected by a single lock, the MPTCP
 * socket's lock. Incoming data on a subflow also ends up taking this single
 * lock. To achieve this, tcp_lock/unlock have been changed to use the lock
 * of the MPTCP socket instead.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector. This process will take place once all
 * of the subflows have been destroyed.
 */
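
/*
 * Illustrative sketch (not part of this file): how user space is expected
 * to reach this code. An MPTCP socket is created in the PF_MULTIPATH
 * domain and connected with connectx(2); the variable names and error
 * handling below are assumptions for illustration only.
 *
 *	int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	sa_endpoints_t eps = {
 *		.sae_dstaddr = (struct sockaddr *)&dst,	// destination sockaddr
 *		.sae_dstaddrlen = dst.sin_len,
 *	};
 *	sae_connid_t cid;
 *	// Kicks off mptcp_session_create() and, later, subflow creation.
 *	if (connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid) < 0)
 *		err(1, "connectx");
 */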

static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);

static uint32_t mptcp_gc(struct mppcbinfo *);
static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
    struct uio *, struct mbuf **, struct mbuf **, int *);
static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
    struct uio *, struct mbuf *, struct mbuf *, int);
static void mptcp_subflow_wupcall(struct socket *, void *, int);
static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);

static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);
static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
static void mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, long val);
static int mptcp_freeq(struct mptcb *mp_tp);

/*
 * Possible return values for subflow event handlers. Note that success
 * values must be greater than or equal to MPTS_EVRET_OK. Values less than
 * that indicate errors or actions which require immediate attention; they
 * will prevent the rest of the handlers from processing their respective
 * events until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE = 1,			/* delete this subflow */
	MPTS_EVRET_OK = 2,			/* OK */
	MPTS_EVRET_CONNECT_PENDING = 3,		/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK = 4,	/* abort all but preferred */
} ev_ret_t;

static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);

static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_init_local_parms(struct mptses *);

static unsigned int mptsub_zone_size;		/* size of mptsub */
static struct zone *mptsub_zone;		/* zone for mptsub */

static unsigned int mptopt_zone_size;		/* size of mptopt */
static struct zone *mptopt_zone;		/* zone for mptopt */

static unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
static struct zone *mpt_subauth_zone;		/* zone of subf auth entry */

struct mppcbinfo mtcbinfo;

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_dbg_area = 31;		/* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_area, 0, "MPTCP debug area");

uint32_t mptcp_dbg_level = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_level, 0, "MPTCP debug level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");


static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
#if INET6
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;
#endif /* INET6 */

static uint8_t mptcp_create_subflows_scheduled;

typedef struct mptcp_subflow_event_entry {
	uint64_t sofilt_hint_mask;
	ev_ret_t (*sofilt_hint_ev_hdlr)(
		struct mptses *mpte,
		struct mptsub *mpts,
		uint64_t *p_mpsofilt_hint,
		uint64_t event);
} mptsub_ev_entry_t;

/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
static symptoms_advisory_t mptcp_advisory;

uint32_t mptcp_cellicon_refcount = 0;
#define MPTCP_CELLICON_TOGGLE_RATE	(5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */

/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
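
/*
 * Sketch (an assumption, for illustration -- the real dispatcher lives
 * elsewhere in this file): the event-processing loop is expected to walk
 * mpsub_ev_entry_tbl in order, dispatch each pending hint, and fold the
 * handler results together, which is why the ordering above matters.
 *
 *	uint64_t events = mpts->mpts_evctl;	// pending hint bits (assumed)
 *	ev_ret_t ret = MPTS_EVRET_OK;
 *	int i;
 *
 *	for (i = 0; i < (int)(sizeof(mpsub_ev_entry_tbl) /
 *	    sizeof(mpsub_ev_entry_tbl[0])); i++) {
 *		if (events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) {
 *			ev_ret_t error =
 *			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte,
 *			    mpts, p_mpsofilt_hint,
 *			    mpsub_ev_entry_tbl[i].sofilt_hint_mask);
 *			events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
 *			// values below MPTS_EVRET_OK take precedence
 *			ret = MIN(ret, error);
 *		}
 *	}
 */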

os_log_t mptcp_log_handle;

/*
 * Protocol pr_init callback.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof(struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptsub_zone_size = sizeof(struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	mptopt_zone_size = sizeof(struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	mpt_subauth_entry_size = sizeof(struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}

int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, int ifindex, boolean_t create)
{
	int i, index = -1;

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (create && stats[i].ifindex == IFSCOPE_NONE) {
			if (index < 0) {
				index = i;
			}
			continue;
		}

		if (stats[i].ifindex == ifindex) {
			index = i;
			return index;
		}
	}

	if (index != -1) {
		stats[index].ifindex = ifindex;
	}

	return index;
}
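
/*
 * Usage sketch (an assumption, for illustration): with create == true the
 * function claims the first free slot when the ifindex is not yet tracked,
 * so callers can account per-interface statistics lazily:
 *
 *	int idx = mptcpstats_get_index_by_ifindex(mpte->mpte_itfstats,
 *	    ifp->if_index, true);
 *	if (idx != -1)
 *		mpte->mpte_itfstats[idx].mpis_txbytes += len;
 *
 * idx is -1 only when the table is full, or when the ifindex is absent
 * and create == false.
 */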

static int
mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
{
	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
	int index;

	if (ifp == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
		    sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
		return -1;
	}

	index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);

	if (index != -1) {
		if (stats[index].is_expensive == 0) {
			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
		}
	}

	return index;
}

void
mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
{
	int index;

	tcpstat.tcps_mp_switches++;
	mpte->mpte_subflow_switches++;

	index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);

	if (index != -1) {
		mpte->mpte_itfstats[index].switches++;
	}
}

/*
 * Flushes all recorded socket options from an MP socket.
 */
static void
mptcp_flush_sopts(struct mptses *mpte)
{
	struct mptopt *mpo, *tmpo;

	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		mptcp_sopt_remove(mpte, mpo);
		mptcp_sopt_free(mpo);
	}
	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}

/*
 * Create an MPTCP session, called as a result of opening an MPTCP socket.
 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	if (mptcp_alternate_port) {
		mpte->mpte_alternate_port = htons(mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}

struct sockaddr *
mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
{
	if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) {
		return &mpte->mpte_dst;
	}

	if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
		return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
	}

	if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
		return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
	}

	/* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
	 * meaning we prefer IPv6 over IPv4.
	 */
	if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) {
		return (struct sockaddr *)&mpte->mpte_dst_unicast_v6;
	}

	if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) {
		return (struct sockaddr *)&mpte->mpte_dst_unicast_v4;
	}

	/* We don't yet have a unicast IP */
	return NULL;
}

static void
mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
    uint64_t *cellbytes, uint64_t *allbytes)
{
	int64_t mycellbytes = 0;
	uint64_t myallbytes = 0;
	int i;

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (mpte->mpte_itfstats[i].is_expensive) {
			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
		}

		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
	}

	if (initial_cell) {
		mycellbytes -= mpte->mpte_init_txbytes;
		mycellbytes -= mpte->mpte_init_rxbytes;
	}

	if (mycellbytes < 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
		*cellbytes = 0;
		*allbytes = 0;
	} else {
		*cellbytes = mycellbytes;
		*allbytes = myallbytes;
	}
}

static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}

/*
 * Destroy an MPTCP session.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	mptcpstats_session_wrapup(mpte);
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		_FREE(mpte->mpte_itfinfo, M_TEMP);
	}
	mpte->mpte_itfinfo = NULL;

	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}

boolean_t
mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
{
	return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
	       mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
	       !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
}

static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
    const struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00},
	};
	const char *ptrv4 = (const char *)addrv4;
	char buf[MAX_IPv6_STR_LEN];
	char *ptr = (char *)addr;

	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return -1;
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
			return -1;
		}
	}

	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u\n", len);
	}

	os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
	    __func__, len,
	    inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));

	return 0;
}
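
/*
 * Worked example (per RFC 6052, for illustration): with the 96-bit
 * well-known prefix 64:ff9b::/96 and the IPv4 address 192.0.2.33
 * (hex c0.00.02.21), the four IPv4 bytes land at offset 12, yielding
 * 64:ff9b::c000:221. Prefixes shorter than 96 bits place the IPv4 bytes
 * around the reserved "u" octet at byte 8, which is why the 56/48/40-bit
 * cases above copy in two pieces.
 */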

static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		socket_unlock(mp_so, 0);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		socket_lock(mp_so, 0);

		if (err == 0) {
			mpte->mpte_triggered_cell = 1;
		}

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}

static boolean_t
mptcp_subflow_disconnecting(struct mptsub *mpts)
{
	/* Split out into if-statements for readability. The compiler should
	 * optimize this.
	 */
	if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
		return true;
	}

	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
		return true;
	}

	if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
		return true;
	}

	return false;
}

void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;
		}

		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			/*
			 * In Handover mode, only create cell subflow if
			 * 1. Wi-Fi Assist is active
			 * 2. Symptoms marked WiFi as weak
			 * 3. We are experiencing RTOs or we are not sending data.
			 *
			 * This covers the scenario, where:
			 * 1. We send and get retransmission timeouts (thus,
			 *    we confirmed that WiFi is indeed bad).
			 * 2. We are not sending and the server tries to send.
			 *    Establishing a cell-subflow gives the server a
			 *    chance to send us some data over cell if WiFi
			 *    is dead. We establish the subflow with the
			 *    backup-bit set, so the server is not allowed to
			 *    send on this subflow as long as WiFi is providing
			 *    good performance.
			 */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
			    !IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    (mptcp_is_wifi_unusable_for_session(mpte) == 0 ||
			    (tp->t_rxtshift < mptcp_fail_thresh * 2 && mptetoso(mpte)->so_snd.sb_cc))) {
				os_log_debug(mptcp_log_handle,
				    "%s - %lx: handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mptcp_is_wifi_unusable_for_session(mpte),
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);
				found = TRUE;

				/* We found a proper subflow on WiFi - no need for cell */
				want_cellular = FALSE;
				break;
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_is_wifi_unusable_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    !mptcp_is_wifi_unusable_for_session(mpte))) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			} else {
				os_log_debug(mptcp_log_handle,
				    "%s - %lx: svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
				    mptcp_is_wifi_unusable_for_session(mpte), tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			bzero(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &((struct sockaddr_in *)(void *)dst)->sin_addr);
			if (error != 0) {
				os_log_info(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = (struct sockaddr *)&nat64pre;
		}

		/* Initial subflow started on a NAT64'd address? */
		if (!(mpte->mpte_flags & MPTE_UNICAST_IP) &&
		    mpte->mpte_dst.sa_family == AF_INET6 &&
		    mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
			dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
		}

		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}

static void
mptcp_remove_cell_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;
	boolean_t found = false;

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		/* We have a functioning subflow on WiFi. No need for cell! */
		if (mpts->mpts_flags & MPTSF_CONNECTED &&
		    !mptcp_subflow_disconnecting(mpts)) {
			found = true;
		}
	}

	/* Didn't find a functional subflow on WiFi - stay on cell */
	if (!found) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Only remove cellular subflows */
		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
	}

	return;
}

/* Removes cell subflows if there is a working subflow on WiFi */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);

		/* Is this subflow in good condition? */
		if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc) {
			found_working_subflow = true;
		}

		/* Or WiFi is fine */
		if (!wifi_unusable) {
			found_working_subflow = true;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}

static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
	uint64_t time_now = mach_continuous_time();

	if (mpte->mpte_time_target != 0 &&
	    (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
	    mptcp_is_wifi_unusable_for_session(mpte)) {
		/* WiFi is bad and we are below the target - don't remove any subflows */
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
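
/*
 * Note (for illustration): the deadline check above relies on unsigned
 * wraparound. Casting the difference of two mach_continuous_time() values
 * to int64_t yields a signed "time remaining", so the comparison stays
 * correct even if the counter ever wrapped:
 *
 *	(int64_t)(target - now) > 0	// deadline still in the future
 *	(int64_t)(target - now) <= 0	// deadline reached or passed
 */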

/*
 * Based on the MPTCP Service-type and the state of the subflows, we
 * will destroy subflows here.
 */
void
mptcp_check_subflows_and_remove(struct mptses *mpte)
{
	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	socket_lock_assert_owned(mptetoso(mpte));

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
		mptcp_handover_subflows_remove(mpte);
	}

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
		mptcp_targetbased_subflows_remove(mpte);
	}
}

static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}

static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
			continue;
		}

		socket_lock(mp_so, 1);
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}

/*
 * We need this because we are coming from an NECP-event. This event gets posted
 * while holding NECP-locks. The creation of the subflow however leads us back
 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
 * So, we would deadlock there as we already hold the NECP-lock.
 *
 * So, let's schedule this separately. It also gives NECP the chance to make
 * progress, without having to wait for MPTCP to finish its subflow creation.
 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}

/*
 * Allocate an MPTCP socket option structure.
 */
struct mptopt *
mptcp_sopt_alloc(int how)
{
	struct mptopt *mpo;

	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
	    zalloc_noblock(mptopt_zone);
	if (mpo != NULL) {
		bzero(mpo, mptopt_zone_size);
	}

	return mpo;
}

/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}

/*
 * Add a socket option to the MPTCP socket option list.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	socket_lock_assert_owned(mptetoso(mpte));
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Remove a socket option from the MPTCP socket option list.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	socket_lock_assert_owned(mptetoso(mpte));
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Search for an existing <sopt_level,sopt_name> socket option.
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;

	socket_lock_assert_owned(mptetoso(mpte));

	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
		if (mpo->mpo_level == sopt->sopt_level &&
		    mpo->mpo_name == sopt->sopt_name) {
			break;
		}
	}
	return mpo;
}

/*
 * Allocate an MPTCP subflow structure.
 */
static struct mptsub *
mptcp_subflow_alloc(void)
{
	struct mptsub *mpts = zalloc(mptsub_zone);

	if (mpts == NULL) {
		return NULL;
	}

	bzero(mpts, mptsub_zone_size);
	return mpts;
}

/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released. This implies that the subflow has been deleted.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	if (mpts->mpts_src != NULL) {
		FREE(mpts->mpts_src, M_SONAME);
		mpts->mpts_src = NULL;
	}

	zfree(mptsub_zone, mpts);
}

static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
}

static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		return;
	}

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}

static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);	/* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);	/* for subflow socket */
}

static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}

/*
 * Create an MPTCP subflow socket.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	int error;

	*so = NULL;

	mp_so = mptetoso(mpte);

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	socket_unlock(mp_so, 0);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_MPTCP, PROC_NULL);
	socket_lock(mp_so, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return error;
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated by
1668 * default.
1669 */
1670 (*so)->so_state |= SS_NOFDREF;
1671
1672 lck_mtx_unlock(subflow_mtx);
1673
1674 /* prevent the socket buffers from being compressed */
1675 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1676 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1677
1678 /* Inherit preconnect and TFO data flags */
1679 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1680 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1681 }
1682 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1683 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1684 }
1685
1686 /* Inherit uuid and create the related flow. */
1687 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1688 struct mptcb *mp_tp = mpte->mpte_mptcb;
1689
1690 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1691
1692 /*
1693 * A note on the unlock: With MPTCP, we do multiple times a
1694 * necp_client_register_socket_flow. This is problematic,
1695 * because now the lock-ordering guarantee (first necp-locks,
1696 * then socket-locks) is no more respected. So, we need to
1697 * unlock here.
1698 */
1699 socket_unlock(mp_so, 0);
1700 error = necp_client_register_socket_flow(mp_so->last_pid,
1701 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1702 socket_lock(mp_so, 0);
1703
1704 if (error) {
1705 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1706 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1707
1708 goto out_err;
1709 }
1710
1711 /* Possible state-change during the unlock above */
1712 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1713 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1714 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1715 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1716 mp_tp->mpt_state, mp_tp->mpt_flags);
1717
1718 error = EINVAL;
1719 goto out_err;
1720 }
1721
1722 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1723 }
1724
1725 /* Needs to happen prior to the delegation! */
1726 (*so)->last_pid = mp_so->last_pid;
1727
1728 if (mp_so->so_flags & SOF_DELEGATED) {
1729 if (mpte->mpte_epid) {
1730 error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1731 if (error) {
1732 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1733 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1734 goto out_err;
1735 }
1736 }
1737 if (!uuid_is_null(mpte->mpte_euuid)) {
1738 error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1739 if (error) {
1740 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1741 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1742 goto out_err;
1743 }
1744 }
1745 }
1746
1747 /* inherit the other socket options */
1748 bzero(&smpo, sizeof(smpo));
1749 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1750 smpo.mpo_level = SOL_SOCKET;
1751 smpo.mpo_intval = 1;
1752
1753 /* disable SIGPIPE */
1754 smpo.mpo_name = SO_NOSIGPIPE;
1755 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1756 goto out_err;
1757 }
1758
1759 /* find out if the subflow's source address goes away */
1760 smpo.mpo_name = SO_NOADDRERR;
1761 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1762 goto out_err;
1763 }
1764
1765 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1766 /*
1767 * On secondary subflows we might need to set the cell-fallback
1768 * flag (see conditions in mptcp_subflow_sosetopt).
1769 */
1770 smpo.mpo_level = SOL_SOCKET;
1771 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1772 smpo.mpo_intval = 1;
1773 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1774 goto out_err;
1775 }
1776 }
1777
1778 /* replay setsockopt(2) on the subflow sockets for eligible options */
1779 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1780 int interim;
1781
1782 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1783 continue;
1784 }
1785
1786 /*
1787 * Skip those that are handled internally; these options
1788 * should not have been recorded and marked with
1789 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1790 */
1791 if (mpo->mpo_level == SOL_SOCKET &&
1792 (mpo->mpo_name == SO_NOSIGPIPE ||
1793 mpo->mpo_name == SO_NOADDRERR ||
1794 mpo->mpo_name == SO_KEEPALIVE)) {
1795 continue;
1796 }
1797
1798 interim = (mpo->mpo_flags & MPOF_INTERIM);
1799 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1800 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1801 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1802 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1803 mpo->mpo_intval);
1804 mptcp_sopt_remove(mpte, mpo);
1805 mptcp_sopt_free(mpo);
1806 continue;
1807 }
1808 }
1809
1810 /*
1811 * We need to receive everything that the subflow socket has,
1812 * so use a customized socket receive function. We will undo
1813 * this when the socket is peeled off or closed.
1814 */
1815 switch (dom) {
1816 case PF_INET:
1817 (*so)->so_proto = &mptcp_subflow_protosw;
1818 break;
1819 #if INET6
1820 case PF_INET6:
1821 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1822 break;
1823 #endif /* INET6 */
1824 default:
1825 VERIFY(0);
1826 /* NOTREACHED */
1827 }
1828
1829 proc_rele(p);
1830
1831 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1832 int, dom, int, error);
1833
1834 return 0;
1835
1836 out_err:
1837 mptcp_subflow_abort(mpts, error);
1838
1839 proc_rele(p);
1840
1841 return error;
1842 }
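
/*
 * Illustrative sketch (not part of the build): the protosw swap above is
 * what routes subflow reads through mptcp_subflow_soreceive(). Assuming a
 * hypothetical in-kernel caller that holds the subflow socket lock:
 */
#if 0
static int
subflow_read_sketch(struct socket *subflow_so)
{
	struct mbuf *m = NULL;

	/*
	 * sock_receive_internal() dispatches through the subflow's
	 * so_proto->pr_usrreqs->pru_soreceive, which now resolves to
	 * mptcp_subflow_soreceive() instead of soreceive().
	 */
	return sock_receive_internal(subflow_so, NULL, &m, 0, NULL);
}
#endif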
1843
1844 /*
1845 * Close an MPTCP subflow socket.
1846 *
1847 * Note that this may be called on an embryonic subflow, and the only
1848 * thing that is guaranteed valid is the protocol-user request.
1849 */
1850 static void
1851 mptcp_subflow_soclose(struct mptsub *mpts)
1852 {
1853 struct socket *so = mpts->mpts_socket;
1854
1855 if (mpts->mpts_flags & MPTSF_CLOSED) {
1856 return;
1857 }
1858
1859 VERIFY(so != NULL);
1860 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1861 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1862
1863 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1864 struct socket *, so,
1865 struct sockbuf *, &so->so_rcv,
1866 struct sockbuf *, &so->so_snd,
1867 struct mptses *, mpts->mpts_mpte);
1868
1869 mpts->mpts_flags |= MPTSF_CLOSED;
1870
1871 if (so->so_retaincnt == 0) {
1872 soclose_locked(so);
1873
1874 return;
1875 } else {
1876 VERIFY(so->so_usecount > 0);
1877 so->so_usecount--;
1878 }
1879
1880 return;
1881 }
1882
1883 /*
1884 * Connect an MPTCP subflow socket.
1885 *
1886 * Note that in the pending connect case, the subflow socket may have been
1887 * bound to an interface and/or a source IP address which may no longer be
1888 * around by the time this routine is called; in that case the connect attempt
1889 * will most likely fail.
1890 */
1891 static int
1892 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1893 {
1894 char dbuf[MAX_IPv6_STR_LEN];
1895 struct socket *mp_so, *so;
1896 struct mptcb *mp_tp;
1897 struct sockaddr *dst;
1898 struct proc *p;
1899 int af, error, dport;
1900
1901 mp_so = mptetoso(mpte);
1902 mp_tp = mpte->mpte_mptcb;
1903 so = mpts->mpts_socket;
1904 af = mpts->mpts_dst.sa_family;
1905 dst = &mpts->mpts_dst;
1906
1907 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1908 VERIFY(mpts->mpts_socket != NULL);
1909 VERIFY(af == AF_INET || af == AF_INET6);
1910
1911 if (af == AF_INET) {
1912 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1913 dport = ntohs(SIN(dst)->sin_port);
1914 } else {
1915 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1916 dport = ntohs(SIN6(dst)->sin6_port);
1917 }
1918
1919 os_log_info(mptcp_log_handle,
1920 "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1921 mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1922
1923 p = proc_find(mp_so->last_pid);
1924 if (p == PROC_NULL) {
1925 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1926 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1927
1928 return ESRCH;
1929 }
1930
1931 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1932
1933 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1934
1935 /* connect the subflow socket */
1936 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1937 p, mpts->mpts_ifscope,
1938 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1939
1940 mpts->mpts_iss = sototcpcb(so)->iss;
1941
1942 /* See tcp_connect_complete */
1943 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1944 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1945 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1946 }
1947
1948 /* Allocate a unique address id per subflow */
1949 mpte->mpte_addrid_last++;
1950 if (mpte->mpte_addrid_last == 0) {
1951 mpte->mpte_addrid_last++;
1952 }
1953
1954 proc_rele(p);
1955
1956 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1957 struct mptsub *, mpts, int, error);
1958 if (error) {
1959 os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1960 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
1961 }
1962
1963 return error;
1964 }
1965
1966 static int
1967 mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
1968 uint32_t rseq, uint16_t dlen)
1969 {
1970 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
1971
1972 if (m_pktlen(m) == 0) {
1973 return 0;
1974 }
1975
1976 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1977 if (off && (dsn != m->m_pkthdr.mp_dsn ||
1978 rseq != m->m_pkthdr.mp_rseq ||
1979 dlen != m->m_pkthdr.mp_rlen)) {
1980 os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u, %u - %u, %u - %u\n",
1981 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
1982 (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
1983 rseq, m->m_pkthdr.mp_rseq,
1984 dlen, m->m_pkthdr.mp_rlen);
1985
1986 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1987 return -1;
1988 }
1989 m->m_pkthdr.mp_dsn += off;
1990 m->m_pkthdr.mp_rseq += off;
1991 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
1992 } else {
1993 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
1994 /* data arrived without a DSS option mapping */
1995
1996 /* initial subflow can fallback right after SYN handshake */
1997 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
1998 mptcp_notify_mpfail(so);
1999 } else {
2000 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2001
2002 return -1;
2003 }
2004 } else if (m->m_flags & M_PKTHDR) {
2005 /* We need to fake the DATA-mapping */
2006 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2007 m->m_pkthdr.mp_dsn = dsn + off;
2008 m->m_pkthdr.mp_rseq = rseq + off;
2009 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
2010 }
2011 }
2012
2013 mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
2014
2015 return 0;
2016 }
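
/*
 * Worked example (illustrative): a single DSS mapping with dsn 1000,
 * rseq 50 and dlen 1448 arriving as two mbufs of 1000 and 448 bytes,
 * both stamped with the full mapping by the subflow. The first call
 * runs with off 0: the header is left in place and mp_rlen is trimmed
 * to the 1000 bytes actually carried. The second call runs with
 * off 1000: the consistency check confirms the mbuf still carries the
 * original mapping (dsn 1000, rseq 50, dlen 1448) and then advances it
 * to mp_dsn 2000, mp_rseq 1050 and mp_rlen 448.
 */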
2017
2018 /*
2019 * MPTCP subflow socket receive routine, derived from soreceive().
2020 */
2021 static int
2022 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
2023 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2024 {
2025 #pragma unused(uio)
2026 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2027 int flags, error = 0;
2028 struct proc *p = current_proc();
2029 struct mbuf *m, **mp = mp0;
2030 boolean_t proc_held = FALSE;
2031
2032 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2033
2034 #ifdef MORE_LOCKING_DEBUG
2035 if (so->so_usecount == 1) {
2036 panic("%s: so=%p no other reference on socket\n", __func__, so);
2037 /* NOTREACHED */
2038 }
2039 #endif
2040 /*
2041 * We return all that is there in the subflow's socket receive buffer
2042 * to the MPTCP layer, so we require that the caller passes in the
2043 * expected parameters.
2044 */
2045 if (mp == NULL || controlp != NULL) {
2046 return EINVAL;
2047 }
2048
2049 *mp = NULL;
2050 if (psa != NULL) {
2051 *psa = NULL;
2052 }
2053 if (flagsp != NULL) {
2054 flags = *flagsp & ~MSG_EOR;
2055 } else {
2056 flags = 0;
2057 }
2058
2059 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2060 return EOPNOTSUPP;
2061 }
2062
2063 flags |= (MSG_DONTWAIT | MSG_NBIO);
2064
2065 /*
2066 * If a recv attempt is made on a previously-accepted socket
2067 * that has been marked as inactive (disconnected), reject
2068 * the request.
2069 */
2070 if (so->so_flags & SOF_DEFUNCT) {
2071 struct sockbuf *sb = &so->so_rcv;
2072
2073 error = ENOTCONN;
2074 /*
2075 * This socket should have been disconnected and flushed
2076 * prior to being returned from sodefunct(); there should
2077 * be no data on its receive list, so panic otherwise.
2078 */
2079 if (so->so_state & SS_DEFUNCT) {
2080 sb_empty_assert(sb, __func__);
2081 }
2082 return error;
2083 }
2084
2085 /*
2086 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2087 * and if so just return to the caller. This could happen when
2088 * soreceive() is called by a socket upcall function during the
2089 * time the socket is freed. The socket buffer would have been
2090 * locked across the upcall, therefore we cannot put this thread
2091 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2092 * we may livelock), because the lock on the socket buffer will
2093 * only be released when the upcall routine returns to its caller.
2094 * Because the socket has been officially closed, there can be
2095 * no further read on it.
2096 *
2097 * A multipath subflow socket would have its SS_NOFDREF set by
2098 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2099 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2100 */
2101 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2102 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2103 return 0;
2104 }
2105
2106 /*
2107 * For consistency with soreceive() semantics, we need to obey
2108 * SB_LOCK in case some other code path has locked the buffer.
2109 */
2110 error = sblock(&so->so_rcv, 0);
2111 if (error != 0) {
2112 return error;
2113 }
2114
2115 m = so->so_rcv.sb_mb;
2116 if (m == NULL) {
2117 /*
2118 * Panic if we notice inconsistencies in the socket's
2119 * receive list; both sb_mb and sb_cc should correctly
2120 * reflect the contents of the list, otherwise we may
2121 * end up with false positives during select() or poll()
2122 * which could put the application in a bad state.
2123 */
2124 SB_MB_CHECK(&so->so_rcv);
2125
2126 if (so->so_error != 0) {
2127 error = so->so_error;
2128 so->so_error = 0;
2129 goto release;
2130 }
2131
2132 if (so->so_state & SS_CANTRCVMORE) {
2133 goto release;
2134 }
2135
2136 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2137 error = ENOTCONN;
2138 goto release;
2139 }
2140
2141 /*
2142 * MSG_DONTWAIT is implicitly defined and this routine will
2143 * never block, so return EWOULDBLOCK when there is nothing.
2144 */
2145 error = EWOULDBLOCK;
2146 goto release;
2147 }
2148
2149 mptcp_update_last_owner(so, mp_so);
2150
2151 if (mp_so->last_pid != proc_pid(p)) {
2152 p = proc_find(mp_so->last_pid);
2153 if (p == PROC_NULL) {
2154 p = current_proc();
2155 } else {
2156 proc_held = TRUE;
2157 }
2158 }
2159
2160 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2161 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2162 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2163
2164 while (m != NULL) {
2165 int dlen = 0, dfin = 0, error_out = 0;
2166 struct mbuf *start = m;
2167 uint64_t dsn;
2168 uint32_t sseq;
2169 uint16_t orig_dlen;
2170 uint16_t csum;
2171
2172 VERIFY(m->m_nextpkt == NULL);
2173
2174 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2175 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2176 dsn = m->m_pkthdr.mp_dsn;
2177 sseq = m->m_pkthdr.mp_rseq;
2178 csum = m->m_pkthdr.mp_csum;
2179 } else {
2180 /* We did fallback */
2181 if (mptcp_adj_rmap(so, m, 0, 0, 0, 0)) {
2182 error = EIO;
2183 *mp0 = NULL;
2184 goto release;
2185 }
2186
2187 sbfree(&so->so_rcv, m);
2188
2189 if (mp != NULL) {
2190 *mp = m;
2191 mp = &m->m_next;
2192 so->so_rcv.sb_mb = m = m->m_next;
2193 *mp = NULL;
2194 }
2195
2196 if (m != NULL) {
2197 so->so_rcv.sb_lastrecord = m;
2198 } else {
2199 SB_EMPTY_FIXUP(&so->so_rcv);
2200 }
2201
2202 continue;
2203 }
2204
2205 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2206 dfin = 1;
2207 }
2208
2209 /*
2210 * Check if the full mapping is now present
2211 */
2212 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
2213 mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
2214 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
2215 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
2216
2217 if (*mp0 == NULL) {
2218 error = EWOULDBLOCK;
2219 }
2220 goto release;
2221 }
2222
2223 /* Now, get the full mapping */
2224 while (dlen > 0) {
2225 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
2226 error_out = 1;
2227 error = EIO;
2228 dlen = 0;
2229 *mp0 = NULL;
2230 break;
2231 }
2232
2233 dlen -= m->m_len;
2234 sbfree(&so->so_rcv, m);
2235
2236 if (mp != NULL) {
2237 *mp = m;
2238 mp = &m->m_next;
2239 so->so_rcv.sb_mb = m = m->m_next;
2240 *mp = NULL;
2241 }
2242
2243 if (dlen - dfin == 0) {
2244 dlen = 0;
2245 }
2246
2247 VERIFY(dlen <= 0 || m);
2248 }
2249
2250 VERIFY(dlen == 0);
2251
2252 if (m != NULL) {
2253 so->so_rcv.sb_lastrecord = m;
2254 } else {
2255 SB_EMPTY_FIXUP(&so->so_rcv);
2256 }
2257
2258 if (error_out) {
2259 goto release;
2260 }
2261
2262 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
2263 error = EIO;
2264 *mp0 = NULL;
2265 goto release;
2266 }
2267
2268 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2269 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2270 }
2271
2272 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2273 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2274
2275 if (flagsp != NULL) {
2276 *flagsp |= flags;
2277 }
2278
2279 release:
2280 sbunlock(&so->so_rcv, TRUE);
2281
2282 if (proc_held) {
2283 proc_rele(p);
2284 }
2285
2286 return error;
2287 }
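
/*
 * Note on atomicity (illustrative): the loop above only ever hands
 * complete mappings up to MPTCP. For example, with a 2896-byte mapping
 * and only 2000 bytes so far in the receive buffer, the sb_cc check
 * fails and the routine returns EWOULDBLOCK; the partial data stays
 * queued on the subflow until the remaining 896 bytes arrive.
 */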
2288
2289 /*
2290 * MPTCP subflow socket send routine, derived from sosend().
2291 */
2292 static int
2293 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2294 struct mbuf *top, struct mbuf *control, int flags)
2295 {
2296 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2297 struct proc *p = current_proc();
2298 boolean_t en_tracing = FALSE, proc_held = FALSE;
2299 int en_tracing_val;
2300 int sblocked = 1; /* Pretend it is already locked, so we won't relock it */
2301 int error;
2302
2303 VERIFY(control == NULL);
2304 VERIFY(addr == NULL);
2305 VERIFY(uio == NULL);
2306 VERIFY(flags == 0);
2307 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2308
2309 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2310 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2311
2312 /*
2313 * trace if tracing & network (vs. unix) sockets &
2314 * non-loopback
2315 */
2316 if (ENTR_SHOULDTRACE &&
2317 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2318 struct inpcb *inp = sotoinpcb(so);
2319 if (inp->inp_last_outifp != NULL &&
2320 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2321 en_tracing = TRUE;
2322 en_tracing_val = top->m_pkthdr.len;
2323 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2324 (unsigned long)VM_KERNEL_ADDRPERM(so),
2325 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2326 (int64_t)en_tracing_val);
2327 }
2328 }
2329
2330 mptcp_update_last_owner(so, mp_so);
2331
2332 if (mp_so->last_pid != proc_pid(p)) {
2333 p = proc_find(mp_so->last_pid);
2334 if (p == PROC_NULL) {
2335 p = current_proc();
2336 } else {
2337 proc_held = TRUE;
2338 }
2339 }
2340
2341 #if NECP
2342 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2343 #endif /* NECP */
2344
2345 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2346
2347 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
2348 if (error) {
2349 goto out;
2350 }
2351
2352 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2353 top = NULL;
2354
2355 out:
2356 if (top != NULL) {
2357 m_freem(top);
2358 }
2359
2360 if (proc_held) {
2361 proc_rele(p);
2362 }
2363
2364 soclearfastopen(so);
2365
2366 if (en_tracing) {
2367 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2368 (unsigned long)VM_KERNEL_ADDRPERM(so),
2369 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2370 (int64_t)en_tracing_val);
2371 }
2372
2373 return error;
2374 }
2375
2376 /*
2377 * Establish an initial MPTCP connection (if first subflow and not yet
2378 * connected), or add a subflow to an existing MPTCP connection.
2379 */
2380 int
2381 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2382 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2383 {
2384 struct socket *mp_so, *so = NULL;
2385 struct mptcb *mp_tp;
2386 struct mptsub *mpts = NULL;
2387 int af, error = 0;
2388
2389 mp_so = mptetoso(mpte);
2390 mp_tp = mpte->mpte_mptcb;
2391
2392 socket_lock_assert_owned(mp_so);
2393
2394 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2395 /* If the remote end sends Data FIN, refuse subflow adds */
2396 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2397 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2398 error = ENOTCONN;
2399 goto out_err;
2400 }
2401
2402 mpts = mptcp_subflow_alloc();
2403 if (mpts == NULL) {
2404 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2405 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2406 error = ENOMEM;
2407 goto out_err;
2408 }
2409
2410 if (src) {
2411 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2412 error = EAFNOSUPPORT;
2413 goto out_err;
2414 }
2415
2416 if (src->sa_family == AF_INET &&
2417 src->sa_len != sizeof(struct sockaddr_in)) {
2418 error = EINVAL;
2419 goto out_err;
2420 }
2421
2422 if (src->sa_family == AF_INET6 &&
2423 src->sa_len != sizeof(struct sockaddr_in6)) {
2424 error = EINVAL;
2425 goto out_err;
2426 }
2427
2428 MALLOC(mpts->mpts_src, struct sockaddr *, src->sa_len, M_SONAME,
2429 M_WAITOK | M_ZERO);
2430 if (mpts->mpts_src == NULL) {
2431 error = ENOMEM;
2432 goto out_err;
2433 }
2434 bcopy(src, mpts->mpts_src, src->sa_len);
2435 }
2436
2437 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2438 error = EAFNOSUPPORT;
2439 goto out_err;
2440 }
2441
2442 if (dst->sa_family == AF_INET &&
2443 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2444 error = EINVAL;
2445 goto out_err;
2446 }
2447
2448 if (dst->sa_family == AF_INET6 &&
2449 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2450 error = EINVAL;
2451 goto out_err;
2452 }
2453
2454 memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);
2455
2456 af = mpts->mpts_dst.sa_family;
2457
2458 ifnet_head_lock_shared();
2459 if (ifscope > (unsigned)if_index) {
2460 ifnet_head_done();
2461 error = ENXIO;
2462 goto out_err;
2463 }
2464 ifnet_head_done();
2465
2466 mpts->mpts_ifscope = ifscope;
2467
2468 /* create the subflow socket */
2469 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2470 /*
2471 * Return (error) without cleaning up, because up to here
2472 * all we did is create mpts.
2473 *
2474 * The contract is that mptcp_subflow_socreate takes ownership
2475 * of mpts and handles the cleanup on failure.
2476 */
2477 return error;
2478 }
2479
2480 /*
2481 * We may be called from within the kernel. We still need to account
2482 * this one to the real app.
2483 */
2484 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2485
2486 /*
2487 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2488 * -1 (SAE_CONNID_ALL).
2489 */
2490 mpte->mpte_connid_last++;
2491 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2492 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2493 mpte->mpte_connid_last++;
2494 }
2495
2496 mpts->mpts_connid = mpte->mpte_connid_last;
2497
2498 mpts->mpts_rel_seq = 1;
2499
2500 /* Allocate a unique address id per subflow */
2501 mpte->mpte_addrid_last++;
2502 if (mpte->mpte_addrid_last == 0) {
2503 mpte->mpte_addrid_last++;
2504 }
2505
2506 /* register for subflow socket read/write events */
2507 sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
2508
2509 /* Register for subflow socket control events */
2510 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2511 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2512 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2513 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2514 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2515 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2516 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2517 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2518
2519 /* sanity check */
2520 VERIFY(!(mpts->mpts_flags &
2521 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2522
2523 /*
2524 * Indicate to the TCP subflow whether or not it should establish
2525 * the initial MPTCP connection, or join an existing one. Fill
2526 * in the connection request structure with additional info needed
2527 * by the underlying TCP (to be used in the TCP options, etc.)
2528 */
2529 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2530 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2531
2532 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2533 mptcp_init_local_parms(mpte);
2534 }
2535 soisconnecting(mp_so);
2536
2537 /* If fastopen is requested, set state in mpts */
2538 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2539 mpts->mpts_flags |= MPTSF_TFO_REQD;
2540 }
2541 } else {
2542 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2543 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2544 }
2545 }
2546
2547 mpts->mpts_flags |= MPTSF_CONNECTING;
2548
2549 /* connect right away if first attempt, or if join can be done now */
2550 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2551 error = mptcp_subflow_soconnectx(mpte, mpts);
2552 }
2553
2554 if (error) {
2555 goto out_err_close;
2556 }
2557
2558 if (pcid) {
2559 *pcid = mpts->mpts_connid;
2560 }
2561
2562 return 0;
2563
2564 out_err_close:
2565 mptcp_subflow_abort(mpts, error);
2566
2567 return error;
2568
2569 out_err:
2570 if (mpts) {
2571 mptcp_subflow_free(mpts);
2572 }
2573
2574 return error;
2575 }
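
/*
 * Usage sketch (illustrative, not part of the build; the caller name and
 * interface index are hypothetical): adding a subflow over a specific
 * interface from within the kernel, with the MPTCP socket lock held.
 */
#if 0
static int
add_subflow_sketch(struct mptses *mpte, struct sockaddr *dst,
    uint32_t ifindex)
{
	sae_connid_t cid;

	/*
	 * A NULL source address lets the stack pick a local address on
	 * the scoped interface; on success, cid holds the connection id
	 * assigned to the new subflow.
	 */
	return mptcp_subflow_add(mpte, NULL, dst, ifindex, &cid);
}
#endif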
2576
2577 void
2578 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2579 {
2580 int index = mptcpstats_get_index(stats, mpts);
2581
2582 if (index != -1) {
2583 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2584
2585 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2586 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2587
2588 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2589 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2590
2591 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2592 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2593
2594 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2595 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2596 }
2597 }
2598
2599 /*
2600 * Delete/remove a subflow from an MPTCP session. The underlying subflow
2601 * socket will no longer be accessible after a subflow is deleted, thus
2602 * this should occur only after the subflow socket has been disconnected.
2603 */
2604 void
2605 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2606 {
2607 struct socket *mp_so = mptetoso(mpte);
2608 struct socket *so = mpts->mpts_socket;
2609 struct tcpcb *tp = sototcpcb(so);
2610
2611 socket_lock_assert_owned(mp_so);
2612 VERIFY(mpts->mpts_mpte == mpte);
2613 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2614 VERIFY(mpte->mpte_numflows != 0);
2615 VERIFY(mp_so->so_usecount > 0);
2616
2617 mptcpstats_update(mpte->mpte_itfstats, mpts);
2618
2619 mptcp_unset_cellicon(mpte, mpts, 1);
2620
2621 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2622 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2623
2624 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2625 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2626 mpte->mpte_numflows--;
2627 if (mpte->mpte_active_sub == mpts) {
2628 mpte->mpte_active_sub = NULL;
2629 }
2630
2631 /*
2632 * Drop references held by this subflow socket; there
2633 * will be no further upcalls made from this point.
2634 */
2635 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2636 sock_catchevents_locked(so, NULL, NULL, 0);
2637
2638 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2639
2640 mp_so->so_usecount--; /* for subflow socket */
2641 mpts->mpts_mpte = NULL;
2642 mpts->mpts_socket = NULL;
2643
2644 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2645 mptcp_subflow_remref(mpts); /* for subflow socket */
2646
2647 so->so_flags &= ~SOF_MP_SUBFLOW;
2648 tp->t_mptcb = NULL;
2649 tp->t_mpsub = NULL;
2650 }
2651
2652 void
2653 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2654 {
2655 struct socket *so = mpts->mpts_socket;
2656 struct mptcb *mp_tp = mpte->mpte_mptcb;
2657 int send_dfin = 0;
2658
2659 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2660 send_dfin = 1;
2661 }
2662
2663 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2664 (so->so_state & SS_ISCONNECTED)) {
2665 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2666 __func__, mpts->mpts_connid, send_dfin),
2667 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2668
2669 if (send_dfin) {
2670 mptcp_send_dfin(so);
2671 }
2672 soshutdownlock(so, SHUT_WR);
2673 }
2674 }
2675
2676 static void
2677 mptcp_subflow_abort(struct mptsub *mpts, int error)
2678 {
2679 struct socket *so = mpts->mpts_socket;
2680 struct tcpcb *tp = sototcpcb(so);
2681
2682 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2683 return;
2684 }
2685
2686 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2687 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2688
2689 if (tp->t_state != TCPS_CLOSED) {
2690 tcp_drop(tp, error);
2691 }
2692
2693 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2694 }
2695
2696 /*
2697 * Disconnect a subflow socket.
2698 */
2699 void
2700 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2701 {
2702 struct socket *so;
2703 struct mptcb *mp_tp;
2704 int send_dfin = 0;
2705
2706 socket_lock_assert_owned(mptetoso(mpte));
2707
2708 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2709 return;
2710 }
2711
2712 mptcp_unset_cellicon(mpte, mpts, 1);
2713
2714 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2715
2716 so = mpts->mpts_socket;
2717 mp_tp = mpte->mpte_mptcb;
2718 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2719 send_dfin = 1;
2720 }
2721
2722 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2723 (so->so_state & SS_ISCONNECTED)) {
2724 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2725 __func__, mpts->mpts_connid, send_dfin),
2726 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2727
2728 if (send_dfin) {
2729 mptcp_send_dfin(so);
2730 }
2731 (void) soshutdownlock(so, SHUT_RD);
2732 (void) soshutdownlock(so, SHUT_WR);
2733 (void) sodisconnectlocked(so);
2734 }
2735 /*
2736 * Generate a disconnect event for this subflow socket, in case
2737 * the lower layer doesn't do it; this is needed because the
2738 * subflow socket deletion relies on it.
2739 */
2740 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2741 }
2742
2743 /*
2744 * Subflow socket input.
2745 */
2746 static void
2747 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2748 {
2749 struct socket *mp_so = mptetoso(mpte);
2750 struct mbuf *m = NULL;
2751 struct socket *so;
2752 int error, wakeup = 0;
2753
2754 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2755 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2756
2757 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2758 struct mptsub *, mpts);
2759
2760 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2761 goto out;
2762 }
2763
2764 so = mpts->mpts_socket;
2765
2766 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2767 if (error != 0 && error != EWOULDBLOCK) {
2768 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2769 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2770 if (error == ENODATA) {
2771 /*
2772 * Don't ignore ENODATA so as to discover
2773 * nasty middleboxes.
2774 */
2775 mp_so->so_error = ENODATA;
2776
2777 wakeup = 1;
2778 goto out;
2779 }
2780 } else if (error == 0) {
2781 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
2782 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2783 }
2784
2785 /* In fallback, accept data only on the active subflow and drop the rest */
2786 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2787 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2788 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2789 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2790 m_freem(m);
2791 goto out;
2792 }
2793
2794 if (m != NULL) {
2795 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2796 mptcp_set_cellicon(mpte, mpts);
2797
2798 mpte->mpte_used_cell = 1;
2799 } else {
2800 /*
2801 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2802 * explicitly set the cellicon, then we unset it again.
2803 */
2804 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2805 mptcp_unset_cellicon(mpte, NULL, 1);
2806 }
2807
2808 mpte->mpte_used_wifi = 1;
2809 }
2810
2811 mptcp_input(mpte, m);
2812 }
2813
2814 out:
2815 if (wakeup) {
2816 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2817 }
2818
2819 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2820 }
2821
2822 void
2823 mptcp_handle_input(struct socket *so)
2824 {
2825 struct mptsub *mpts, *tmpts;
2826 struct mptses *mpte;
2827
2828 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2829 return;
2830 }
2831
2832 mpts = sototcpcb(so)->t_mpsub;
2833 mpte = mpts->mpts_mpte;
2834
2835 socket_lock_assert_owned(mptetoso(mpte));
2836
2837 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2838 if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
2839 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2840 }
2841 return;
2842 }
2843
2844 mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
2845 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2846 if (mpts->mpts_socket->so_usecount == 0) {
2847 /* Will be removed soon by tcp_garbage_collect */
2848 continue;
2849 }
2850
2851 mptcp_subflow_addref(mpts);
2852 mpts->mpts_socket->so_usecount++;
2853
2854 mptcp_subflow_input(mpte, mpts);
2855
2856 mptcp_subflow_remref(mpts); /* ours */
2857
2858 VERIFY(mpts->mpts_socket->so_usecount != 0);
2859 mpts->mpts_socket->so_usecount--;
2860 }
2861
2862 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
2863 }
2864
2865 /*
2866 * Subflow socket write upcall.
2867 *
2868 * Called when the associated subflow socket posted a write event.
2869 */
2870 static void
2871 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2872 {
2873 #pragma unused(so, waitf)
2874 struct mptsub *mpts = arg;
2875 struct mptses *mpte = mpts->mpts_mpte;
2876
2877 VERIFY(mpte != NULL);
2878
2879 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2880 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2881 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2882 }
2883 return;
2884 }
2885
2886 mptcp_output(mpte);
2887 }
2888
2889 static boolean_t
2890 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2891 {
2892 struct mbuf *so_m = so->so_snd.sb_mb;
2893 uint64_t dsn = m->m_pkthdr.mp_dsn;
2894
2895 while (so_m) {
2896 VERIFY(so_m->m_flags & M_PKTHDR);
2897 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2898
2899 /* Part of the segment is covered, don't reinject here */
2900 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2901 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2902 return TRUE;
2903 }
2904
2905 so_m = so_m->m_next;
2906 }
2907
2908 return FALSE;
2909 }
2910
2911 /*
2912 * Subflow socket output.
2913 *
2914 * Called for sending data from MPTCP to the underlying subflow socket.
2915 */
2916 int
2917 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2918 {
2919 struct mptcb *mp_tp = mpte->mpte_mptcb;
2920 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2921 struct socket *mp_so, *so;
2922 struct tcpcb *tp;
2923 uint64_t mpt_dsn = 0, off = 0;
2924 int sb_cc = 0, error = 0, wakeup = 0;
2925 uint32_t dss_csum;
2926 uint16_t tot_sent = 0;
2927 boolean_t reinjected = FALSE;
2928
2929 mp_so = mptetoso(mpte);
2930 so = mpts->mpts_socket;
2931 tp = sototcpcb(so);
2932
2933 socket_lock_assert_owned(mp_so);
2934
2935 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2936 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
2937
2938 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2939 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2940 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2941 (mpts->mpts_flags & MPTSF_TFO_REQD));
2942 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
2943
2944 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2945 __func__, mpts->mpts_flags, mpte->mpte_flags,
2946 mptcp_subflow_cwnd_space(so)),
2947 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2948 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2949 struct mptsub *, mpts);
2950
2951 /* The Remove Addr option is not sent reliably, as per the I-D */
2952 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
2953 tp->t_rem_aid = mpte->mpte_lost_aid;
2954 tp->t_mpflags |= TMPF_SND_REM_ADDR;
2955 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2956 }
2957
2958 /*
2959 * The mbuf chains containing the metadata (as well as pointing to
2960 * the user data sitting in the MPTCP output queue) will then be
2961 * sent down to the subflow socket.
2962 *
2963 * Some notes on data sequencing:
2964 *
2965 * a. Each mbuf must be a M_PKTHDR.
2966 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2967 * in the mbuf pkthdr structure.
2968 * c. Each mbuf containing the MPTCP metadata must have its
2969 * pkt_flags marked with the PKTF_MPTCP flag.
2970 */
2971
2972 if (mpte->mpte_reinjectq) {
2973 sb_mb = mpte->mpte_reinjectq;
2974 } else {
2975 sb_mb = mp_so->so_snd.sb_mb;
2976 }
2977
2978 if (sb_mb == NULL) {
2979 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
2980 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2981 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
2982 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
2983
2984 /* Fix it to prevent looping */
2985 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
2986 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2987 }
2988 goto out;
2989 }
2990
2991 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
2992
2993 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
2994 !(so->so_state & SS_ISCONNECTED) &&
2995 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2996 tp->t_mpflags |= TMPF_TFO_REQUEST;
2997 goto zero_len_write;
2998 }
2999
3000 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3001
3002 /* First, drop acknowledged data */
3003 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3004 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3005 "dsn %u suna %u reinject? %u\n",
3006 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3007 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3008 if (mpte->mpte_reinjectq) {
3009 mptcp_clean_reinjectq(mpte);
3010 } else {
3011 uint64_t len = 0;
3012 len = mp_tp->mpt_snduna - mpt_dsn;
3013 sbdrop(&mp_so->so_snd, (int)len);
3014 wakeup = 1;
3015 }
3016 }
3017
3018 /* Check again because of above sbdrop */
3019 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3020 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3021 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3022 goto out;
3023 }
3024
3025 /*
3026 * In degraded mode, we don't receive data acks, so force-free
3027 * mbufs below snd_nxt.
3028 */
3029 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3030 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3031 mp_so->so_snd.sb_mb) {
3032 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3033 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3034 uint64_t len = 0;
3035 len = mp_tp->mpt_snduna - mpt_dsn;
3036 sbdrop(&mp_so->so_snd, (int)len);
3037 wakeup = 1;
3038
3039 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3040 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3041 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3042 }
3043 }
3044
3045 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3046 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3047 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3048 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3049 }
3050
3051 /*
3052 * Adjust the top-level notion of the next byte used for retransmissions
3053 * and sending FINs.
3054 */
3055 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3056 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3057 }
3058
3059 /* Now determine the offset from which to start transmitting data */
3060 if (mpte->mpte_reinjectq) {
3061 sb_mb = mpte->mpte_reinjectq;
3062 } else {
3063 dont_reinject:
3064 sb_mb = mp_so->so_snd.sb_mb;
3065 }
3066 if (sb_mb == NULL) {
3067 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3068 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3069 goto out;
3070 }
3071
3072 if (sb_mb == mpte->mpte_reinjectq) {
3073 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3074 off = 0;
3075
3076 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3077 if (mptcp_can_send_more(mp_tp, TRUE)) {
3078 goto dont_reinject;
3079 }
3080
3081 error = ECANCELED;
3082 goto out;
3083 }
3084
3085 reinjected = TRUE;
3086 } else if (flags & MPTCP_SUBOUT_PROBING) {
3087 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3088 off = 0;
3089 } else {
3090 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3091
3092 /*
3093 * With TFO, there might be no data at all, but we still go through
3094 * this code-path here.
3095 */
3096 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3097 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3098 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3099 sb_cc -= off;
3100 } else {
3101 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3102 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3103 (uint32_t)mp_tp->mpt_sndmax);
3104
3105 goto out;
3106 }
3107 }
3108
3109 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3110 if (sb_cc <= 0) {
3111 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3112 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3113 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3114 mptcp_subflow_cwnd_space(so));
3115 }
3116
3117 sb_cc = min(sb_cc, UINT16_MAX);
3118
3119 /*
3120 * Create a DSN mapping for the data we are about to send. It all
3121 * has the same mapping.
3122 */
3123 if (reinjected) {
3124 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3125 } else {
3126 mpt_dsn = mp_tp->mpt_snduna + off;
3127 }
3128
3129 mpt_mbuf = sb_mb;
3130 while (mpt_mbuf && reinjected == FALSE &&
3131 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3132 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3133 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3134 mpt_mbuf = mpt_mbuf->m_next;
3135 }
3136 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3137 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
3138 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
3139 mpts->mpts_probecnt),
3140 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3141 }
3142
3143 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3144
3145 head = tail = NULL;
3146
3147 while (tot_sent < sb_cc) {
3148 ssize_t mlen;
3149
3150 mlen = mpt_mbuf->m_len;
3151 mlen -= off;
3152 mlen = min(mlen, sb_cc - tot_sent);
3153
3154 if (mlen < 0) {
3155 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3156 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3157 (uint32_t)off, sb_cc, tot_sent);
3158 goto out;
3159 }
3160
3161 if (mlen == 0) {
3162 goto next;
3163 }
3164
3165 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3166 M_COPYM_MUST_COPY_HDR);
3167 if (m == NULL) {
3168 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3169 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3170 error = ENOBUFS;
3171 break;
3172 }
3173
3174 /* Create a DSN mapping for the data (m_copym does it) */
3175 VERIFY(m->m_flags & M_PKTHDR);
3176 VERIFY(m->m_next == NULL);
3177
3178 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3179 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3180 m->m_pkthdr.mp_dsn = mpt_dsn;
3181 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3182 m->m_pkthdr.len = mlen;
3183
3184 if (head == NULL) {
3185 head = tail = m;
3186 } else {
3187 tail->m_next = m;
3188 tail = m;
3189 }
3190
3191 tot_sent += mlen;
3192 off = 0;
3193 next:
3194 mpt_mbuf = mpt_mbuf->m_next;
3195 }
3196
3197 if (reinjected) {
3198 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3199 struct mbuf *n = sb_mb;
3200
3201 while (n) {
3202 n->m_pkthdr.mp_dsn += sb_cc;
3203 n->m_pkthdr.mp_rlen -= sb_cc;
3204 n = n->m_next;
3205 }
3206 m_adj(sb_mb, sb_cc);
3207 } else {
3208 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3209 m_freem(sb_mb);
3210 }
3211 }
3212
3213 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
3214 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
3215 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3216
3217 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3218 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3219 tot_sent);
3220 }
3221
3222 /* Now, let's update rel-seq and the data-level length */
3223 mpts->mpts_rel_seq += tot_sent;
3224 m = head;
3225 while (m) {
3226 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3227 m->m_pkthdr.mp_csum = dss_csum;
3228 }
3229 m->m_pkthdr.mp_rlen = tot_sent;
3230 m = m->m_next;
3231 }
3232
3233 if (head != NULL) {
3234 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3235 (tp->t_tfo_stats == 0)) {
3236 tp->t_mpflags |= TMPF_TFO_REQUEST;
3237 }
3238
3239 error = sock_sendmbuf(so, NULL, head, 0, NULL);
3240
3241 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
3242 struct sockbuf *, &so->so_rcv,
3243 struct sockbuf *, &so->so_snd,
3244 struct mptses *, mpte, struct mptsub *, mpts,
3245 size_t, tot_sent);
3246 }
3247
3248 done_sending:
3249 if (error == 0 ||
3250 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3251 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3252
3253 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3254 tcpstat.tcps_mp_num_probes++;
3255 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3256 mpts->mpts_probecnt += 1;
3257 } else {
3258 mpts->mpts_probecnt +=
3259 tot_sent / mpts->mpts_maxseg;
3260 }
3261 }
3262
3263 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3264 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3265 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3266 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3267 }
3268 mp_tp->mpt_sndnxt = new_sndnxt;
3269 }
3270
3271 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3272
3273 /* Must be here as mptcp_can_send_more() checks for this */
3274 soclearfastopen(mp_so);
3275
3276 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3277 (mpts->mpts_probesoon != 0)) {
3278 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
3279 __func__, mpts->mpts_connid,
3280 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
3281 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3282 (tcp_now - mpts->mpts_probesoon)),
3283 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3284 }
3285
3286 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3287 mptcp_set_cellicon(mpte, mpts);
3288
3289 mpte->mpte_used_cell = 1;
3290 } else {
3291 /*
3292 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3293 * explicitly set the cellicon, then we unset it again.
3294 */
3295 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3296 mptcp_unset_cellicon(mpte, NULL, 1);
3297 }
3298
3299 mpte->mpte_used_wifi = 1;
3300 }
3301
3302 /*
3303 * Don't propagate EWOULDBLOCK - it's already taken care of
3304 * in mptcp_usr_send for TFO.
3305 */
3306 error = 0;
3307 } else {
3308 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3309 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3310 }
3311 out:
3312
3313 if (wakeup) {
3314 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3315 }
3316
3317 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3318 return error;
3319
3320 zero_len_write:
3321 /* Opting to call pru_send directly, as there is no mbuf at the subflow level */
3322 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3323 NULL, current_proc());
3324
3325 goto done_sending;
3326 }
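
/*
 * Worked example (illustrative): with sb_cc 2896 and a send starting at
 * snduna + off, the copy loop above may emit two 1448-byte mbufs. Both
 * carry one and the same mapping: mp_dsn = snduna + off, mp_rseq = the
 * subflow's relative sequence number before the send, and
 * mp_rlen = tot_sent = 2896. If MPTCPF_CHECKSUM is set, the single
 * dss_csum computed over the whole mapping is stamped on every mbuf too.
 */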
3327
3328 static void
3329 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3330 {
3331 struct mbuf *n, *prev = NULL;
3332
3333 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
3334 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3335 m->m_pkthdr.mp_rseq),
3336 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3337
3338 n = mpte->mpte_reinjectq;
3339
3340 /* First, look for an mbuf n whose data-sequence-number is greater
3341 * than or equal to m's sequence number.
3342 */
3343 while (n) {
3344 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3345 break;
3346 }
3347
3348 prev = n;
3349
3350 n = n->m_nextpkt;
3351 }
3352
3353 if (n) {
3354 /* m is already fully covered by the next mbuf in the queue */
3355 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3356 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3357 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
3358 __func__, n->m_pkthdr.mp_rlen),
3359 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3360 goto dont_queue;
3361 }
3362
3363 /* m covers the next mbuf entirely, thus we remove it */
3364 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3365 struct mbuf *tmp = n->m_nextpkt;
3366
3367 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
3368 __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3369 n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
3370 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3371
3372 m->m_nextpkt = NULL;
3373 if (prev == NULL) {
3374 mpte->mpte_reinjectq = tmp;
3375 } else {
3376 prev->m_nextpkt = tmp;
3377 }
3378
3379 m_freem(n);
3380 n = tmp;
3381 }
3382 }
3383
3384 if (prev) {
3385 /* m is already fully covered by the previous mbuf in the queue */
3386 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3387 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
3388 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
3389 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3390 goto dont_queue;
3391 }
3392 }
3393
3394 if (prev == NULL) {
3395 mpte->mpte_reinjectq = m;
3396 } else {
3397 prev->m_nextpkt = m;
3398 }
3399
3400 m->m_nextpkt = n;
3401
3402 return;
3403
3404 dont_queue:
3405 m_freem(m);
3406 return;
3407 }
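
/*
 * Worked example (illustrative): with a reinject queue of
 * [dsn 100 len 100, dsn 300 len 100], enqueueing m = (dsn 200, len 150)
 * stops the scan at n = (300, 100). m neither duplicates n nor covers
 * it entirely (m ends at 350, n at 400), and prev = (100, 100) ends at
 * 200, short of m's end, so m is linked in between: [100, 200, 300].
 * Enqueueing (dsn 200, len 250) instead would free n, since it covers
 * 200..450 and thus all of n's 300..400.
 */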
3408
3409 static struct mbuf *
3410 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3411 {
3412 struct socket *mp_so = mptetoso(mpte);
3413 struct mbuf *m;
3414
3415 m = mp_so->so_snd.sb_mb;
3416
3417 while (m) {
3418 /* If this segment covers what we are looking for, return it. */
3419 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3420 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3421 break;
3422 }
3423
3424
3425 /* Segment is no longer in the queue */
3426 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3427 return NULL;
3428 }
3429
3430 m = m->m_next;
3431 }
3432
3433 return m;
3434 }
3435
3436 static struct mbuf *
3437 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3438 {
3439 struct mbuf *top = NULL, *tail = NULL;
3440 uint64_t dsn;
3441 uint32_t dlen, rseq;
3442
3443 dsn = m->m_pkthdr.mp_dsn;
3444 dlen = m->m_pkthdr.mp_rlen;
3445 rseq = m->m_pkthdr.mp_rseq;
3446
3447 while (len > 0) {
3448 struct mbuf *n;
3449
3450 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3451
3452 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3453 if (n == NULL) {
3454 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3455 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3456 goto err;
3457 }
3458
3459 VERIFY(n->m_flags & M_PKTHDR);
3460 VERIFY(n->m_next == NULL);
3461 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3462 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3463 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3464 VERIFY(n->m_len == m->m_len);
3465
3466 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3467
3468 if (top == NULL) {
3469 top = n;
3470 }
3471
3472 if (tail != NULL) {
3473 tail->m_next = n;
3474 }
3475
3476 tail = n;
3477
3478 len -= m->m_len;
3479 m = m->m_next;
3480 }
3481
3482 return top;
3483
3484 err:
3485 if (top) {
3486 m_freem(top);
3487 }
3488
3489 return NULL;
3490 }
3491
3492 static void
3493 mptcp_reinject_mbufs(struct socket *so)
3494 {
3495 struct tcpcb *tp = sototcpcb(so);
3496 struct mptsub *mpts = tp->t_mpsub;
3497 struct mptcb *mp_tp = tptomptp(tp);
3498 struct mptses *mpte = mp_tp->mpt_mpte;
3499 struct sockbuf *sb = &so->so_snd;
3500 struct mbuf *m;
3501
3502 m = sb->sb_mb;
3503 while (m) {
3504 struct mbuf *n = m->m_next, *orig = m;
3505
3506 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3507 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3508 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3509 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3510
3511 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3512
3513 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
3514 goto next;
3515 }
3516
3517 /* Has it all already been acknowledged at the data-level? */
3518 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
3519 goto next;
3520 }
3521
3522 /* Part of this has already been acknowledged - look the segment
3523 * up in the MPTCP socket.
3524 */
3525 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3526 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3527 if (m == NULL) {
3528 goto next;
3529 }
3530 }
3531
3532 /* Copy the mbuf with headers (aka, DSN-numbers) */
3533 m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
3534 if (m == NULL) {
3535 break;
3536 }
3537
3538 VERIFY(m->m_nextpkt == NULL);
3539
3540 /* Now, add to the reinject-queue, eliminating overlapping
3541 * segments
3542 */
3543 mptcp_add_reinjectq(mpte, m);
3544
3545 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3546
3547 next:
3548 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3549 while (n) {
3550 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3551
3552 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
3553 break;
3554 }
3555
3556 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3557 n = n->m_next;
3558 }
3559
3560 m = n;
3561 }
3562 }
3563
3564 void
3565 mptcp_clean_reinjectq(struct mptses *mpte)
3566 {
3567 struct mptcb *mp_tp = mpte->mpte_mptcb;
3568
3569 socket_lock_assert_owned(mptetoso(mpte));
3570
3571 while (mpte->mpte_reinjectq) {
3572 struct mbuf *m = mpte->mpte_reinjectq;
3573
3574 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3575 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3576 break;
3577 }
3578
3579 mpte->mpte_reinjectq = m->m_nextpkt;
3580 m->m_nextpkt = NULL;
3581 m_freem(m);
3582 }
3583 }
3584
3585 /*
3586 * Subflow socket control event upcall.
3587 */
3588 static void
3589 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
3590 {
3591 #pragma unused(so)
3592 struct mptsub *mpts = arg;
3593 struct mptses *mpte = mpts->mpts_mpte;
3594
3595 socket_lock_assert_owned(mptetoso(mpte));
3596
3597 if ((mpts->mpts_evctl & events) == events) {
3598 return;
3599 }
3600
3601 mpts->mpts_evctl |= events;
3602
3603 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3604 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3605 return;
3606 }
3607
3608 mptcp_subflow_workloop(mpte);
3609 }
3610
3611 /*
3612 * Subflow socket control events.
3613 *
3614 * Called for handling events related to the underlying subflow socket.
3615 */
3616 static ev_ret_t
3617 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3618 uint64_t *p_mpsofilt_hint)
3619 {
3620 ev_ret_t ret = MPTS_EVRET_OK;
3621 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3622 sizeof(mpsub_ev_entry_tbl[0]);
3623
3624 /* bail if there's nothing to process */
3625 if (!mpts->mpts_evctl) {
3626 return ret;
3627 }
3628
3629 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
3630 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
3631 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
3632 SO_FILT_HINT_DISCONNECTED)) {
3633 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3634 }
3635
3636 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3637 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3638
3639 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3640 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3641 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3642
3643 /*
3644 * Process all the socket filter hints and reset the hint
3645 * once it is handled
3646 */
3647 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3648 /*
3649 * Always execute the DISCONNECTED event, because it will wake up
3650 * the app.
3651 */
3652 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3653 (ret >= MPTS_EVRET_OK ||
3654 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3655 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3656 ev_ret_t error =
3657 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3658 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3659 }
3660 }
3661
3662 /*
3663 * We should be getting only events specified via sock_catchevents(),
3664 * so loudly complain if we have any unprocessed one(s).
3665 */
3666 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK) {
3667 mptcplog((LOG_WARNING, "%s%s: cid %d evret %d unhandled events=%b\n", __func__,
3668 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3669 mpts->mpts_connid,
3670 ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3671 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3672 } else {
3673 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3674 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3675 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3676 }
3677
3678 return ret;
3679 }
3680
3681 static ev_ret_t
3682 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3683 uint64_t *p_mpsofilt_hint, uint64_t event)
3684 {
3685 struct socket *mp_so, *so;
3686 struct mptcb *mp_tp;
3687
3688 mp_so = mptetoso(mpte);
3689 mp_tp = mpte->mpte_mptcb;
3690 so = mpts->mpts_socket;
3691
3692 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3693 mpts->mpts_connid, event),
3694 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3695
3696 /*
3697 * We got an event for this subflow that might need to be propagated,
3698 * based on the state of the MPTCP connection.
3699 */
3700 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3701 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3702 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3703 mp_so->so_error = so->so_error;
3704 *p_mpsofilt_hint |= event;
3705 }
3706
3707 return MPTS_EVRET_OK;
3708 }
3709
3710 /*
3711 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3712 */
3713 static ev_ret_t
3714 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3715 uint64_t *p_mpsofilt_hint, uint64_t event)
3716 {
3717 #pragma unused(p_mpsofilt_hint, event)
3718 struct socket *mp_so;
3719 struct tcpcb *tp;
3720
3721 mp_so = mptetoso(mpte);
3722 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3723
3724 /*
3725 * This overwrites any previously stored mpte_lost_aid; we avoid keeping
3726 * more state, as the typical case has only two subflows.
3727 */
3728 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3729 mpte->mpte_lost_aid = tp->t_local_aid;
3730
3731 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3732 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3733
3734 /*
3735 * The subflow connection has lost its source address.
3736 */
3737 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3738
3739 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3740 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3741 }
3742
3743 return MPTS_EVRET_DELETE;
3744 }
3745
3746 static ev_ret_t
3747 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3748 uint64_t *p_mpsofilt_hint, uint64_t event)
3749 {
3750 #pragma unused(event, p_mpsofilt_hint)
3751 struct socket *so, *mp_so;
3752
3753 so = mpts->mpts_socket;
3754
3755 if (so->so_error != ENODATA) {
3756 return MPTS_EVRET_OK;
3757 }
3758
3759
3760 mp_so = mptetoso(mpte);
3761
3762 mp_so->so_error = ENODATA;
3763
3764 sorwakeup(mp_so);
3765 sowwakeup(mp_so);
3766
3767 return MPTS_EVRET_OK;
3768 }
3769
3770
3771 /*
3772 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3773 * indicates that the remote side sent a Data FIN
3774 */
3775 static ev_ret_t
3776 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3777 uint64_t *p_mpsofilt_hint, uint64_t event)
3778 {
3779 #pragma unused(event)
3780 struct mptcb *mp_tp = mpte->mpte_mptcb;
3781
3782 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3783 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3784
3785 /*
3786 * We got a Data FIN for the MPTCP connection.
3787 * The FIN may arrive with data. The data is handed up to the
3788 * mptcp socket and the user is notified so that it may close
3789 * the socket if needed.
3790 */
3791 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3792 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3793 }
3794
3795 return MPTS_EVRET_OK; /* keep the subflow socket around */
3796 }
3797
3798 /*
3799 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3800 */
3801 static ev_ret_t
3802 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3803 uint64_t *p_mpsofilt_hint, uint64_t event)
3804 {
3805 #pragma unused(event, p_mpsofilt_hint)
3806 struct mptsub *mpts_alt = NULL;
3807 struct socket *alt_so = NULL;
3808 struct socket *mp_so;
3809 int altpath_exists = 0;
3810
3811 mp_so = mptetoso(mpte);
3812 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3813
3814 mptcp_reinject_mbufs(mpts->mpts_socket);
3815
3816 mpts_alt = mptcp_get_subflow(mpte, NULL);
3817
3818 /* If there is no alternate eligible subflow, ignore the failover hint. */
3819 if (mpts_alt == NULL || mpts_alt == mpts) {
3820 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3821 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3822
3823 goto done;
3824 }
3825
3826 altpath_exists = 1;
3827 alt_so = mpts_alt->mpts_socket;
3828 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3829 /* All data acknowledged and no RTT spike */
3830 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3831 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3832 } else {
3833 /* no alternate path available */
3834 altpath_exists = 0;
3835 }
3836 }
3837
3838 if (altpath_exists) {
3839 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3840
3841 mpte->mpte_active_sub = mpts_alt;
3842 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3843 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3844
3845 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3846 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3847
3848 mptcpstats_inc_switch(mpte, mpts);
3849
3850 sowwakeup(alt_so);
3851 } else {
3852 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3853 mpts->mpts_connid),
3854 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3855 done:
3856 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3857 }
3858
3859 return MPTS_EVRET_OK;
3860 }
3861
3862 /*
3863 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3864 */
3865 static ev_ret_t
3866 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3867 uint64_t *p_mpsofilt_hint, uint64_t event)
3868 {
3869 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3870 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3871
3872 /*
3873 * The subflow connection cannot use the outgoing interface; let's
3874 * close this subflow.
3875 */
3876 mptcp_subflow_abort(mpts, EPERM);
3877
3878 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3879
3880 return MPTS_EVRET_DELETE;
3881 }
3882
3883 /*
3884 * https://tools.ietf.org/html/rfc6052#section-2
3885 * https://tools.ietf.org/html/rfc6147#section-5.2
3886 */
3887 static boolean_t
3888 mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3889 const struct ipv6_prefix *prefix,
3890 struct in_addr *addrv4)
3891 {
3892 char buf[MAX_IPv4_STR_LEN];
3893 char *ptrv4 = (char *)addrv4;
3894 const char *ptr = (const char *)addr;
3895
3896 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3897 return false;
3898 }
3899
3900 switch (prefix->prefix_len) {
3901 case NAT64_PREFIX_LEN_96:
3902 memcpy(ptrv4, ptr + 12, 4);
3903 break;
3904 case NAT64_PREFIX_LEN_64:
3905 memcpy(ptrv4, ptr + 9, 4);
3906 break;
3907 case NAT64_PREFIX_LEN_56:
3908 memcpy(ptrv4, ptr + 7, 1);
3909 memcpy(ptrv4 + 1, ptr + 9, 3);
3910 break;
3911 case NAT64_PREFIX_LEN_48:
3912 memcpy(ptrv4, ptr + 6, 2);
3913 memcpy(ptrv4 + 2, ptr + 9, 2);
3914 break;
3915 case NAT64_PREFIX_LEN_40:
3916 memcpy(ptrv4, ptr + 5, 3);
3917 memcpy(ptrv4 + 3, ptr + 9, 1);
3918 break;
3919 case NAT64_PREFIX_LEN_32:
3920 memcpy(ptrv4, ptr + 4, 4);
3921 break;
3922 default:
3923 panic("NAT64-prefix len is wrong: %u\n",
3924 prefix->prefix_len);
3925 }
3926
3927 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3928 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3929
3930 return true;
3931 }
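
/*
 * For illustration, a minimal user-space sketch of the NAT64_PREFIX_LEN_96
 * case above, assuming the well-known prefix 64:ff9b::/96 (RFC 6052): the
 * embedded IPv4 address occupies the last four bytes of the synthesized
 * IPv6 address, so 64:ff9b::c000:201 desynthesizes to 192.0.2.1.
 *
 *	#include <arpa/inet.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct in6_addr v6;
 *		struct in_addr v4;
 *		char buf[INET_ADDRSTRLEN];
 *
 *		inet_pton(AF_INET6, "64:ff9b::c000:201", &v6);
 *		memcpy(&v4, (const char *)&v6 + 12, 4);	// bytes 12..15
 *		printf("%s\n", inet_ntop(AF_INET, &v4, buf, sizeof(buf)));
 *		return 0;
 *	}
 */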
3932
3933 static void
3934 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3935 {
3936 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3937 struct socket *so = mpts->mpts_socket;
3938 struct ifnet *ifp;
3939 int j;
3940
3941 /* Subflow IPs will be steered directly by the server - no need to
3942 * desynthesize.
3943 */
3944 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3945 return;
3946 }
3947
3948 ifp = sotoinpcb(so)->inp_last_outifp;
3949
3950 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3951 mptcp_ask_for_nat64(ifp);
3952 return;
3953 }
3954
3955
3956 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3957 int success;
3958
3959 if (nat64prefixes[j].prefix_len == 0) {
3960 continue;
3961 }
3962
3963 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3964 &nat64prefixes[j],
3965 &mpte->mpte_dst_v4_nat64.sin_addr);
3966 if (success) {
3967 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3968 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3969 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3970 break;
3971 }
3972 }
3973 }
3974
3975 /*
3976 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3977 */
3978 static ev_ret_t
3979 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
3980 uint64_t *p_mpsofilt_hint, uint64_t event)
3981 {
3982 #pragma unused(event, p_mpsofilt_hint)
3983 struct socket *mp_so, *so;
3984 struct inpcb *inp;
3985 struct tcpcb *tp;
3986 struct mptcb *mp_tp;
3987 int af;
3988 boolean_t mpok = FALSE;
3989
3990 mp_so = mptetoso(mpte);
3991 mp_tp = mpte->mpte_mptcb;
3992 so = mpts->mpts_socket;
3993 tp = sototcpcb(so);
3994 af = mpts->mpts_dst.sa_family;
3995
3996 if (mpts->mpts_flags & MPTSF_CONNECTED) {
3997 return MPTS_EVRET_OK;
3998 }
3999
4000 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4001 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4002 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
4003 (so->so_state & SS_ISCONNECTED)) {
4004 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
4005 __func__, mpts->mpts_connid),
4006 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4007 (void) soshutdownlock(so, SHUT_RD);
4008 (void) soshutdownlock(so, SHUT_WR);
4009 (void) sodisconnectlocked(so);
4010 }
4011 return MPTS_EVRET_OK;
4012 }
4013
4014 /*
4015 * The subflow connection has been connected. Find out whether it
4016 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
4017 *
4018 * a. If the MPTCP connection is not yet established, then this must be
4019 * the first subflow connection. If MPTCP failed to negotiate,
4020 * fall back to regular TCP by degrading this subflow.
4021 *
4022 * b. If the MPTCP connection has been established, then this must be
4023 * one of the subsequent subflow connections. If MPTCP failed
4024 * to negotiate, disconnect the connection.
4025 *
4026 * Right now, we simply unblock any waiters at the MPTCP socket layer
4027 * if the MPTCP connection has not been established.
4028 */
4029
4030 if (so->so_state & SS_ISDISCONNECTED) {
4031 /*
4032 * With MPTCP joins, a connection is connected at the subflow
4033 * level, but the 4th ACK from the server elevates the MPTCP
4034 * subflow to connected state. So there is a small window
4035 * where the subflow could get disconnected before the
4036 * connected event is processed.
4037 */
4038 return MPTS_EVRET_OK;
4039 }
4040
4041 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4042 mptcp_drop_tfo_data(mpte, mpts);
4043 }
4044
4045 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4046 mpts->mpts_flags |= MPTSF_CONNECTED;
4047
4048 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4049 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4050 }
4051
4052 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4053
4054 /* get/verify the outbound interface */
4055 inp = sotoinpcb(so);
4056
4057 mpts->mpts_maxseg = tp->t_maxseg;
4058
4059 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
4060 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
4061 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
4062 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
4063
4064 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4065
4066 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4067 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4068 mpte->mpte_associd = mpts->mpts_connid;
4069 DTRACE_MPTCP2(state__change,
4070 struct mptcb *, mp_tp,
4071 uint32_t, 0 /* event */);
4072
4073 if (SOCK_DOM(so) == AF_INET) {
4074 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4075 } else {
4076 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4077 }
4078
4079 mpts->mpts_flags |= MPTSF_ACTIVE;
4080
4081 /* case (a) above */
4082 if (!mpok) {
4083 tcpstat.tcps_mpcap_fallback++;
4084
4085 tp->t_mpflags |= TMPF_INFIN_SENT;
4086 mptcp_notify_mpfail(so);
4087 } else {
4088 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4089 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4090 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4091 } else {
4092 mpts->mpts_flags |= MPTSF_PREFERRED;
4093 }
4094 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4095 mpte->mpte_nummpcapflows++;
4096
4097 if (SOCK_DOM(so) == AF_INET6) {
4098 mptcp_handle_ipv6_connection(mpte, mpts);
4099 }
4100
4101 mptcp_check_subflows_and_add(mpte);
4102
4103 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4104 mpte->mpte_initial_cell = 1;
4105 }
4106
4107 mpte->mpte_handshake_success = 1;
4108 }
4109
4110 mp_tp->mpt_sndwnd = tp->snd_wnd;
4111 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4112 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4113 soisconnected(mp_so);
4114 } else if (mpok) {
4115 /*
4116 * case (b) above
4117 * In case of additional flows, the MPTCP socket is not
4118 * MPTSF_MP_CAPABLE until an ACK is received from the server
4119 * completing the 3-way handshake. TCP has already guaranteed
4120 * that this is an MPTCP subflow.
4121 */
4122 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4123 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4124 mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) {
4125 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4126 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4127 } else {
4128 mpts->mpts_flags |= MPTSF_PREFERRED;
4129 }
4130
4131 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4132 mpte->mpte_nummpcapflows++;
4133
4134 mpts->mpts_rel_seq = 1;
4135
4136 mptcp_check_subflows_and_remove(mpte);
4137 } else {
4138 unsigned int i;
4139
4140 /* Should we try the alternate port? */
4141 if (mpte->mpte_alternate_port &&
4142 inp->inp_fport != mpte->mpte_alternate_port) {
4143 union sockaddr_in_4_6 dst;
4144 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
4145
4146 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
4147
4148 dst_in->sin_port = mpte->mpte_alternate_port;
4149
4150 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
4151 mpts->mpts_ifscope, NULL);
4152 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
4153 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
4154 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
4155
4156 if (inp->inp_last_outifp->if_index == info->ifindex) {
4157 info->no_mptcp_support = 1;
4158 break;
4159 }
4160 }
4161 }
4162
4163 tcpstat.tcps_join_fallback++;
4164 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4165 tcpstat.tcps_mptcp_cell_proxy++;
4166 } else {
4167 tcpstat.tcps_mptcp_wifi_proxy++;
4168 }
4169
4170 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4171
4172 return MPTS_EVRET_OK;
4173 }
4174
4175 /* This call just reserves ("books") an entry in the stats-table for this ifindex */
4176 mptcpstats_get_index(mpte->mpte_itfstats, mpts);
4177
4178 mptcp_output(mpte);
4179
4180 return MPTS_EVRET_OK; /* keep the subflow socket around */
4181 }
4182
4183 /*
4184 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4185 */
4186 static ev_ret_t
4187 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4188 uint64_t *p_mpsofilt_hint, uint64_t event)
4189 {
4190 #pragma unused(event, p_mpsofilt_hint)
4191 struct socket *mp_so, *so;
4192 struct mptcb *mp_tp;
4193
4194 mp_so = mptetoso(mpte);
4195 mp_tp = mpte->mpte_mptcb;
4196 so = mpts->mpts_socket;
4197
4198 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
4199 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
4200 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
4201 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
4202 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4203
4204 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4205 return MPTS_EVRET_DELETE;
4206 }
4207
4208 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4209
4210 /* The subflow connection has been disconnected. */
4211
4212 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4213 mpte->mpte_nummpcapflows--;
4214 if (mpte->mpte_active_sub == mpts) {
4215 mpte->mpte_active_sub = NULL;
4216 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
4217 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4218 }
4219 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4220 }
4221
4222 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4223 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4224 mptcp_drop(mpte, mp_tp, so->so_error);
4225 }
4226
4227 /*
4228 * Clear flags that are used by getconninfo to return state.
4229 * Retain flags like MPTSF_DELETEOK for internal purposes.
4230 */
4231 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4232 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4233 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4234
4235 return MPTS_EVRET_DELETE;
4236 }
4237
4238 /*
4239 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4240 */
4241 static ev_ret_t
4242 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4243 uint64_t *p_mpsofilt_hint, uint64_t event)
4244 {
4245 #pragma unused(event, p_mpsofilt_hint)
4246 ev_ret_t ret = MPTS_EVRET_OK;
4247 struct socket *mp_so, *so;
4248 struct mptcb *mp_tp;
4249
4250 mp_so = mptetoso(mpte);
4251 mp_tp = mpte->mpte_mptcb;
4252 so = mpts->mpts_socket;
4253
4254 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4255 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4256 } else {
4257 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4258 }
4259
4260 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4261 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4262 goto done;
4263 }
4264 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4265 } else {
4266 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4267 }
4268
4269 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4270 mpts->mpts_flags |= MPTSF_MP_READY;
4271 } else {
4272 mpts->mpts_flags &= ~MPTSF_MP_READY;
4273 }
4274
4275 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4276 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4277 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4278 }
4279
4280 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4281 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4282
4283 m_freem_list(mpte->mpte_reinjectq);
4284 mpte->mpte_reinjectq = NULL;
4285 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4286 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4287 ret = MPTS_EVRET_CONNECT_PENDING;
4288 }
4289
4290 done:
4291 return ret;
4292 }
4293
4294 /*
4295 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4296 */
4297 static ev_ret_t
4298 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4299 uint64_t *p_mpsofilt_hint, uint64_t event)
4300 {
4301 #pragma unused(event)
4302 struct socket *mp_so, *so;
4303 struct mptcb *mp_tp;
4304 boolean_t is_fastclose;
4305
4306 mp_so = mptetoso(mpte);
4307 mp_tp = mpte->mpte_mptcb;
4308 so = mpts->mpts_socket;
4309
4310 /* We got an invalid option or a fast close */
4311 struct tcptemp *t_template;
4312 struct inpcb *inp = sotoinpcb(so);
4313 struct tcpcb *tp = NULL;
4314
4315 tp = intotcpcb(inp);
4316 so->so_error = ECONNABORTED;
4317
4318 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4319
4320 tp->t_mpflags |= TMPF_RESET;
4321
4322 t_template = tcp_maketemplate(tp);
4323 if (t_template) {
4324 struct tcp_respond_args tra;
4325
4326 bzero(&tra, sizeof(tra));
4327 if (inp->inp_flags & INP_BOUND_IF) {
4328 tra.ifscope = inp->inp_boundifp->if_index;
4329 } else {
4330 tra.ifscope = IFSCOPE_NONE;
4331 }
4332 tra.awdl_unrestricted = 1;
4333
4334 tcp_respond(tp, t_template->tt_ipgen,
4335 &t_template->tt_t, (struct mbuf *)NULL,
4336 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4337 (void) m_free(dtom(t_template));
4338 }
4339
4340 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4341 struct mptsub *iter, *tmp;
4342
4343 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4344
4345 mp_so->so_error = ECONNRESET;
4346
4347 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4348 if (iter == mpts) {
4349 continue;
4350 }
4351 mptcp_subflow_abort(iter, ECONNABORTED);
4352 }
4353
4354 /*
4355 * mptcp_drop is being called after processing the events, to fully
4356 * close the MPTCP connection
4357 */
4358 mptcp_drop(mpte, mp_tp, mp_so->so_error);
4359 }
4360
4361 mptcp_subflow_abort(mpts, ECONNABORTED);
4362
4363
4364 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4365 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4366 }
4367
4368 return MPTS_EVRET_DELETE;
4369 }
4370
4371 static ev_ret_t
4372 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4373 uint64_t *p_mpsofilt_hint, uint64_t event)
4374 {
4375 #pragma unused(event)
4376 bool found_active = false;
4377
4378 mpts->mpts_flags |= MPTSF_READ_STALL;
4379
4380 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4381 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4382
4383 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4384 TCPS_HAVERCVDFIN2(tp->t_state)) {
4385 continue;
4386 }
4387
4388 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4389 found_active = true;
4390 break;
4391 }
4392 }
4393
4394 if (!found_active) {
4395 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4396 }
4397
4398 return MPTS_EVRET_OK;
4399 }
4400
4401 static ev_ret_t
4402 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4403 uint64_t *p_mpsofilt_hint, uint64_t event)
4404 {
4405 #pragma unused(event)
4406 bool found_active = false;
4407
4408 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4409
4410 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4411 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4412
4413 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4414 tp->t_state > TCPS_CLOSE_WAIT) {
4415 continue;
4416 }
4417
4418 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4419 found_active = true;
4420 break;
4421 }
4422 }
4423
4424 if (!found_active) {
4425 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4426 }
4427
4428 return MPTS_EVRET_OK;
4429 }
4430
4431 /*
4432 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4433 * caller must ensure that the option can be issued on subflow sockets, via
4434 * MPOF_SUBFLOW_OK flag.
4435 */
4436 int
4437 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4438 {
4439 struct socket *mp_so, *so;
4440 struct sockopt sopt;
4441 int error;
4442
4443 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4444
4445 mp_so = mptetoso(mpte);
4446 so = mpts->mpts_socket;
4447
4448 socket_lock_assert_owned(mp_so);
4449
4450 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4451 mpo->mpo_level == SOL_SOCKET &&
4452 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4453 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4454
4455 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
4456 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
4457 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
4458 mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
4459 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4460
4461 /*
4462 * When we open a new subflow, mark it as a cell fallback if
4463 * this subflow goes over cellular
4464 *
4465 * (except for first-party apps).
4466 */
4467
4468 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4469 return 0;
4470 }
4471
4472 if (sotoinpcb(so)->inp_last_outifp &&
4473 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4474 return 0;
4475 }
4476
4477 /*
4478 * These conditions are OR'd: if the app is not binding to the
4479 * interface, then this is definitely not a cell-fallback
4480 * connection.
4481 */
4482 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4483 !IFNET_IS_CELLULAR(ifp)) {
4484 return 0;
4485 }
4486 }
4487
4488 mpo->mpo_flags &= ~MPOF_INTERIM;
4489
4490 bzero(&sopt, sizeof(sopt));
4491 sopt.sopt_dir = SOPT_SET;
4492 sopt.sopt_level = mpo->mpo_level;
4493 sopt.sopt_name = mpo->mpo_name;
4494 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4495 sopt.sopt_valsize = sizeof(int);
4496 sopt.sopt_p = kernproc;
4497
4498 error = sosetoptlock(so, &sopt, 0);
4499 if (error) {
4500 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4501 "val %d set error %d\n", __func__,
4502 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4503 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4504 mpo->mpo_intval, error);
4505 }
4506 return error;
4507 }
4508
4509 /*
4510 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4511 * caller must ensure that the option can be issued on subflow sockets, via
4512 * MPOF_SUBFLOW_OK flag.
4513 */
4514 int
4515 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4516 struct mptopt *mpo)
4517 {
4518 struct socket *mp_so;
4519 struct sockopt sopt;
4520 int error;
4521
4522 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4523 mp_so = mptetoso(mpte);
4524
4525 socket_lock_assert_owned(mp_so);
4526
4527 bzero(&sopt, sizeof(sopt));
4528 sopt.sopt_dir = SOPT_GET;
4529 sopt.sopt_level = mpo->mpo_level;
4530 sopt.sopt_name = mpo->mpo_name;
4531 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4532 sopt.sopt_valsize = sizeof(int);
4533 sopt.sopt_p = kernproc;
4534
4535 error = sogetoptlock(so, &sopt, 0); /* already locked */
4536 if (error) {
4537 os_log_error(mptcp_log_handle,
4538 "%s - %lx: sopt %s get error %d\n",
4539 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4540 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4541 }
4542 return error;
4543 }
4544
4545
4546 /*
4547 * MPTCP garbage collector.
4548 *
4549 * This routine is called by the MP domain's on-demand periodic callout,
4550 * which is triggered when an MPTCP socket is closed. The callout will
4551 * repeat as long as this routine returns a non-zero value.
4552 */
4553 static uint32_t
4554 mptcp_gc(struct mppcbinfo *mppi)
4555 {
4556 struct mppcb *mpp, *tmpp;
4557 uint32_t active = 0;
4558
4559 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4560
4561 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4562 struct socket *mp_so;
4563 struct mptses *mpte;
4564 struct mptcb *mp_tp;
4565
4566 mp_so = mpp->mpp_socket;
4567 mpte = mptompte(mpp);
4568 mp_tp = mpte->mpte_mptcb;
4569
4570 if (!mpp_try_lock(mpp)) {
4571 active++;
4572 continue;
4573 }
4574
4575 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4576
4577 /* check again under the lock */
4578 if (mp_so->so_usecount > 0) {
4579 boolean_t wakeup = FALSE;
4580 struct mptsub *mpts, *tmpts;
4581
4582 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4583 if (mp_tp->mpt_gc_ticks > 0) {
4584 mp_tp->mpt_gc_ticks--;
4585 }
4586 if (mp_tp->mpt_gc_ticks == 0) {
4587 wakeup = TRUE;
4588 }
4589 }
4590 if (wakeup) {
4591 TAILQ_FOREACH_SAFE(mpts,
4592 &mpte->mpte_subflows, mpts_entry, tmpts) {
4593 mptcp_subflow_eupcall1(mpts->mpts_socket,
4594 mpts, SO_FILT_HINT_DISCONNECTED);
4595 }
4596 }
4597 socket_unlock(mp_so, 0);
4598 active++;
4599 continue;
4600 }
4601
4602 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4603 panic("%s - %lx: skipped state "
4604 "[u=%d,r=%d,s=%d]\n", __func__,
4605 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4606 mp_so->so_usecount, mp_so->so_retaincnt,
4607 mpp->mpp_state);
4608 }
4609
4610 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4611 mptcp_close(mpte, mp_tp);
4612 }
4613
4614 mptcp_session_destroy(mpte);
4615
4616 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4617 struct sockbuf *, &mp_so->so_rcv,
4618 struct sockbuf *, &mp_so->so_snd,
4619 struct mppcb *, mpp);
4620
4621 mp_pcbdispose(mpp);
4622 sodealloc(mp_so);
4623 }
4624
4625 return active;
4626 }
4627
4628 /*
4629 * Drop an MPTCP connection, reporting the specified error.
4630 */
4631 struct mptses *
4632 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4633 {
4634 struct socket *mp_so = mptetoso(mpte);
4635
4636 VERIFY(mpte->mpte_mptcb == mp_tp);
4637
4638 socket_lock_assert_owned(mp_so);
4639
4640 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4641 uint32_t, 0 /* event */);
4642
4643 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4644 errno = mp_tp->mpt_softerror;
4645 }
4646 mp_so->so_error = errno;
4647
4648 return mptcp_close(mpte, mp_tp);
4649 }
4650
4651 /*
4652 * Close an MPTCP control block.
4653 */
4654 struct mptses *
4655 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4656 {
4657 struct mptsub *mpts = NULL, *tmpts = NULL;
4658 struct socket *mp_so = mptetoso(mpte);
4659
4660 socket_lock_assert_owned(mp_so);
4661 VERIFY(mpte->mpte_mptcb == mp_tp);
4662
4663 mp_tp->mpt_state = MPTCPS_TERMINATE;
4664
4665 mptcp_freeq(mp_tp);
4666
4667 soisdisconnected(mp_so);
4668
4669 /* Clean up all subflows */
4670 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4671 mptcp_subflow_disconnect(mpte, mpts);
4672 }
4673
4674 return NULL;
4675 }
4676
4677 void
4678 mptcp_notify_close(struct socket *so)
4679 {
4680 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4681 }
4682
4683 /*
4684 * MPTCP workloop.
4685 */
4686 void
4687 mptcp_subflow_workloop(struct mptses *mpte)
4688 {
4689 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4690 uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4691 struct mptsub *mpts, *tmpts;
4692 struct socket *mp_so;
4693
4694 mp_so = mptetoso(mpte);
4695
4696 socket_lock_assert_owned(mp_so);
4697
4698 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4699 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4700 return;
4701 }
4702 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4703
4704 relaunch:
4705 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4706
4707 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4708 ev_ret_t ret;
4709
4710 if (mpts->mpts_socket->so_usecount == 0) {
4711 /* Will be removed soon by tcp_garbage_collect */
4712 continue;
4713 }
4714
4715 mptcp_subflow_addref(mpts);
4716 mpts->mpts_socket->so_usecount++;
4717
4718 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4719
4720 /*
4721 * If MPTCP socket is closed, disconnect all subflows.
4722 * This will generate a disconnect event which will
4723 * be handled during the next iteration, causing a
4724 * non-zero error to be returned above.
4725 */
4726 if (mp_so->so_flags & SOF_PCBCLEARING) {
4727 mptcp_subflow_disconnect(mpte, mpts);
4728 }
4729
4730 switch (ret) {
4731 case MPTS_EVRET_OK:
4732 /* nothing to do */
4733 break;
4734 case MPTS_EVRET_DELETE:
4735 mptcp_subflow_soclose(mpts);
4736 break;
4737 case MPTS_EVRET_CONNECT_PENDING:
4738 connect_pending = TRUE;
4739 break;
4740 case MPTS_EVRET_DISCONNECT_FALLBACK:
4741 disconnect_fallback = TRUE;
4742 break;
4743 default:
4744 mptcplog((LOG_DEBUG,
4745 "MPTCP Socket: %s: mptcp_subflow_events "
4746 "returned invalid value: %d\n", __func__,
4747 ret),
4748 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4749 break;
4750 }
4751 mptcp_subflow_remref(mpts); /* ours */
4752
4753 VERIFY(mpts->mpts_socket->so_usecount != 0);
4754 mpts->mpts_socket->so_usecount--;
4755 }
4756
4757 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4758 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4759
4760 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4761 mp_so->so_state |= SS_CANTRCVMORE;
4762 sorwakeup(mp_so);
4763 }
4764
4765 soevent(mp_so, mpsofilt_hint_mask);
4766 }
4767
4768 if (!connect_pending && !disconnect_fallback) {
4769 goto exit;
4770 }
4771
4772 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4773 if (disconnect_fallback) {
4774 struct socket *so = NULL;
4775 struct inpcb *inp = NULL;
4776 struct tcpcb *tp = NULL;
4777
4778 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4779 continue;
4780 }
4781
4782 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4783
4784 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4785 MPTSF_DISCONNECTED | MPTSF_CONNECT_PENDING)) {
4786 continue;
4787 }
4788
4789 so = mpts->mpts_socket;
4790
4791 /*
4792 * The MPTCP connection has degraded to a fallback
4793 * mode, so there is no point in keeping this subflow
4794 * regardless of its MPTCP-readiness state, unless it
4795 * is the primary one which we use for fallback. This
4796 * assumes that the subflow used for fallback is the
4797 * ACTIVE one.
4798 */
4799
4800 inp = sotoinpcb(so);
4801 tp = intotcpcb(inp);
4802 tp->t_mpflags &=
4803 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4804 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4805
4806 soevent(so, SO_FILT_HINT_MUSTRST);
4807 } else if (connect_pending) {
4808 /*
4809 * The MPTCP connection has progressed to a state
4810 * where it supports full multipath semantics; allow
4811 * additional joins to be attempted for all subflows
4812 * that are in the PENDING state.
4813 */
4814 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4815 int error = mptcp_subflow_soconnectx(mpte, mpts);
4816
4817 if (error) {
4818 mptcp_subflow_abort(mpts, error);
4819 }
4820 }
4821 }
4822 }
4823
4824 exit:
4825 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4826 goto relaunch;
4827 }
4828
4829 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4830 }
4831
4832 /*
4833 * Protocol pr_lock callback.
4834 */
4835 int
4836 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4837 {
4838 struct mppcb *mpp = mpsotomppcb(mp_so);
4839 void *lr_saved;
4840
4841 if (lr == NULL) {
4842 lr_saved = __builtin_return_address(0);
4843 } else {
4844 lr_saved = lr;
4845 }
4846
4847 if (mpp == NULL) {
4848 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4849 mp_so, lr_saved, solockhistory_nr(mp_so));
4850 /* NOTREACHED */
4851 }
4852 mpp_lock(mpp);
4853
4854 if (mp_so->so_usecount < 0) {
4855 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4856 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4857 solockhistory_nr(mp_so));
4858 /* NOTREACHED */
4859 }
4860 if (refcount != 0) {
4861 mp_so->so_usecount++;
4862 mpp->mpp_inside++;
4863 }
4864 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4865 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4866
4867 return 0;
4868 }
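
/*
 * The lock_lr / next_lock_lr pair above is a small ring buffer recording
 * the return addresses of the last SO_LCKDBG_MAX lock operations, which
 * solockhistory_nr() can later dump in a panic message. A minimal sketch
 * of the same pattern (generic user-space code; DBG_MAX and the names are
 * illustrative, not part of this file):
 *
 *	#define DBG_MAX 4
 *
 *	struct lock_hist {
 *		void		*lr[DBG_MAX];	// most recent callers
 *		unsigned int	next;		// next slot to overwrite
 *	};
 *
 *	static void
 *	record_caller(struct lock_hist *h, void *caller)
 *	{
 *		h->lr[h->next] = caller;
 *		h->next = (h->next + 1) % DBG_MAX;	// wrap, oldest entry lost
 *	}
 */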
4869
4870 /*
4871 * Protocol pr_unlock callback.
4872 */
4873 int
4874 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4875 {
4876 struct mppcb *mpp = mpsotomppcb(mp_so);
4877 void *lr_saved;
4878
4879 if (lr == NULL) {
4880 lr_saved = __builtin_return_address(0);
4881 } else {
4882 lr_saved = lr;
4883 }
4884
4885 if (mpp == NULL) {
4886 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4887 mp_so, mp_so->so_usecount, lr_saved,
4888 solockhistory_nr(mp_so));
4889 /* NOTREACHED */
4890 }
4891 socket_lock_assert_owned(mp_so);
4892
4893 if (refcount != 0) {
4894 mp_so->so_usecount--;
4895 mpp->mpp_inside--;
4896 }
4897
4898 if (mp_so->so_usecount < 0) {
4899 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4900 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4901 /* NOTREACHED */
4902 }
4903 if (mpp->mpp_inside < 0) {
4904 panic("%s: mpp=%p inside=%x lrh= %s\n", __func__,
4905 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
4906 /* NOTREACHED */
4907 }
4908 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4909 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4910 mpp_unlock(mpp);
4911
4912 return 0;
4913 }
4914
4915 /*
4916 * Protocol pr_getlock callback.
4917 */
4918 lck_mtx_t *
4919 mptcp_getlock(struct socket *mp_so, int flags)
4920 {
4921 struct mppcb *mpp = mpsotomppcb(mp_so);
4922
4923 if (mpp == NULL) {
4924 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4925 solockhistory_nr(mp_so));
4926 /* NOTREACHED */
4927 }
4928 if (mp_so->so_usecount < 0) {
4929 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4930 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4931 /* NOTREACHED */
4932 }
4933 return mpp_getlock(mpp, flags);
4934 }
4935
4936 /*
4937 * MPTCP Join support
4938 */
4939
4940 static void
4941 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
4942 {
4943 struct tcpcb *tp = sototcpcb(so);
4944 struct mptcp_subf_auth_entry *sauth_entry;
4945
4946 /*
4947 * The address ID of the first flow is implicitly 0.
4948 */
4949 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4950 tp->t_local_aid = 0;
4951 } else {
4952 tp->t_local_aid = addr_id;
4953 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4954 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4955 }
4956 sauth_entry = zalloc(mpt_subauth_zone);
4957 sauth_entry->msae_laddr_id = tp->t_local_aid;
4958 sauth_entry->msae_raddr_id = 0;
4959 sauth_entry->msae_raddr_rand = 0;
4960 try_again:
4961 sauth_entry->msae_laddr_rand = RandomULong();
4962 if (sauth_entry->msae_laddr_rand == 0) {
4963 goto try_again;
4964 }
4965 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4966 }
4967
4968 static void
4969 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4970 {
4971 struct mptcp_subf_auth_entry *sauth_entry;
4972 struct tcpcb *tp = NULL;
4973 int found = 0;
4974
4975 tp = sototcpcb(so);
4976 if (tp == NULL) {
4977 return;
4978 }
4979
4980 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4981 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4982 found = 1;
4983 break;
4984 }
4985 }
4986 if (found) {
4987 LIST_REMOVE(sauth_entry, msae_next);
4988 zfree(mpt_subauth_zone, sauth_entry);
4989 }
4993 }
4994
4995 void
4996 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4997 u_int32_t *rrand)
4998 {
4999 struct mptcp_subf_auth_entry *sauth_entry;
5000
5001 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5002 if (sauth_entry->msae_laddr_id == addr_id) {
5003 if (lrand) {
5004 *lrand = sauth_entry->msae_laddr_rand;
5005 }
5006 if (rrand) {
5007 *rrand = sauth_entry->msae_raddr_rand;
5008 }
5009 break;
5010 }
5011 }
5012 }
5013
5014 void
5015 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5016 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5017 {
5018 struct mptcp_subf_auth_entry *sauth_entry;
5019
5020 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5021 if (sauth_entry->msae_laddr_id == laddr_id) {
5022 if ((sauth_entry->msae_raddr_id != 0) &&
5023 (sauth_entry->msae_raddr_id != raddr_id)) {
5024 os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5025 " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5026 raddr_id, sauth_entry->msae_raddr_id);
5027 return;
5028 }
5029 sauth_entry->msae_raddr_id = raddr_id;
5030 if ((sauth_entry->msae_raddr_rand != 0) &&
5031 (sauth_entry->msae_raddr_rand != raddr_rand)) {
5032 os_log_error(mptcp_log_handle, "%s - %lx: "
5033 "dup SYN_ACK %d %d \n",
5034 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5035 raddr_rand, sauth_entry->msae_raddr_rand);
5036 return;
5037 }
5038 sauth_entry->msae_raddr_rand = raddr_rand;
5039 return;
5040 }
5041 }
5042 }
5043
5044 /*
5045 * SHA1 support for MPTCP
5046 */
5047 static void
5048 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5049 {
5050 SHA1_CTX sha1ctxt;
5051 const unsigned char *sha1_base;
5052 int sha1_size;
5053
5054 sha1_base = (const unsigned char *) key;
5055 sha1_size = sizeof(mptcp_key_t);
5056 SHA1Init(&sha1ctxt);
5057 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5058 SHA1Final(sha_digest, &sha1ctxt);
5059 }
5060
5061 void
5062 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5063 u_int32_t rand1, u_int32_t rand2, u_char *digest)
5064 {
5065 SHA1_CTX sha1ctxt;
5066 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5067 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5068 u_int32_t data[2];
5069 int i;
5070
5071 bzero(digest, SHA1_RESULTLEN);
5072
5073 /* Set up the Key for HMAC */
5074 key_ipad[0] = key1;
5075 key_ipad[1] = key2;
5076
5077 key_opad[0] = key1;
5078 key_opad[1] = key2;
5079
5080 /* Set up the message for HMAC */
5081 data[0] = rand1;
5082 data[1] = rand2;
5083
5084 /* The key (128 bits) is shorter than the 512-bit block, so there is no need to pre-hash it */
5085
5086 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5087
5088 for (i = 0; i < 8; i++) {
5089 key_ipad[i] ^= 0x3636363636363636;
5090 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5091 }
5092
5093 /* Perform inner SHA1 */
5094 SHA1Init(&sha1ctxt);
5095 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5096 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5097 SHA1Final(digest, &sha1ctxt);
5098
5099 /* Perform outer SHA1 */
5100 SHA1Init(&sha1ctxt);
5101 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5102 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5103 SHA1Final(digest, &sha1ctxt);
5104 }
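
/*
 * The routine above is HMAC-SHA1 (RFC 2104) specialized for MPTCP's
 * fixed-size inputs: a 16-byte key (Key-A ++ Key-B) and an 8-byte message
 * (R-A ++ R-B), with the key zero-padded to the 64-byte block before the
 * ipad/opad XOR. As a sketch, an equivalent user-space computation with
 * CommonCrypto (illustrative name; it matches only if the caller passes
 * keys and rands in the same byte order as above):
 *
 *	#include <CommonCrypto/CommonHMAC.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	hmac_sha1_mptcp(uint64_t key1, uint64_t key2,
 *	    uint32_t rand1, uint32_t rand2, uint8_t digest[20])
 *	{
 *		uint8_t key[16], msg[8];
 *
 *		memcpy(key, &key1, 8);		// Key-A ++ Key-B
 *		memcpy(key + 8, &key2, 8);
 *		memcpy(msg, &rand1, 4);		// R-A ++ R-B
 *		memcpy(msg + 4, &rand2, 4);
 *		CCHmac(kCCHmacAlgSHA1, key, sizeof(key), msg, sizeof(msg), digest);
 *	}
 */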
5105
5106 /*
5107 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5108 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5109 */
5110 void
5111 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
5112 {
5113 uint32_t lrand, rrand;
5114
5115 lrand = rrand = 0;
5116 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5117 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
5118 digest);
5119 }
5120
5121 /*
5122 * Authentication data generation
5123 */
5124 static void
5125 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5126 int token_len)
5127 {
5128 VERIFY(token_len == sizeof(u_int32_t));
5129 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5130
5131 /* Most significant 32 bits of the SHA1 hash */
5132 bcopy(sha_digest, token, sizeof(u_int32_t));
5133 return;
5134 }
5135
5136 static void
5137 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5138 int idsn_len)
5139 {
5140 VERIFY(idsn_len == sizeof(u_int64_t));
5141 VERIFY(sha_digest_len == SHA1_RESULTLEN);
5142
5143 /*
5144 * Least significant 64 bits of the SHA1 hash
5145 */
5146
5147 idsn[7] = sha_digest[12];
5148 idsn[6] = sha_digest[13];
5149 idsn[5] = sha_digest[14];
5150 idsn[4] = sha_digest[15];
5151 idsn[3] = sha_digest[16];
5152 idsn[2] = sha_digest[17];
5153 idsn[1] = sha_digest[18];
5154 idsn[0] = sha_digest[19];
5155 return;
5156 }
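
/*
 * Taken together, token and IDSN derivation amount to: SHA1 the 64-bit
 * key, take the first four digest bytes as the token, and the last eight
 * digest bytes, byte-reversed as above, as the IDSN. A user-space sketch
 * with CommonCrypto (illustrative name, mirroring the byte handling of the
 * two routines above):
 *
 *	#include <CommonCrypto/CommonDigest.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	derive_token_idsn(uint64_t key, uint32_t *token, uint64_t *idsn)
 *	{
 *		uint8_t d[CC_SHA1_DIGEST_LENGTH], *p = (uint8_t *)idsn;
 *		int i;
 *
 *		CC_SHA1(&key, sizeof(key), d);
 *		memcpy(token, d, sizeof(*token));	// digest bytes 0..3
 *		for (i = 0; i < 8; i++)			// idsn byte 0 <- digest byte 19
 *			p[i] = d[19 - i];
 *	}
 */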
5157
5158 static void
5159 mptcp_conn_properties(struct mptcb *mp_tp)
5160 {
5161 /* There is only Version 0 at this time */
5162 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
5163
5164 /* Set DSS checksum flag */
5165 if (mptcp_dss_csum) {
5166 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5167 }
5168
5169 /* Set up receive window */
5170 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5171
5172 /* Set up gc ticks */
5173 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5174 }
5175
5176 static void
5177 mptcp_init_local_parms(struct mptses *mpte)
5178 {
5179 struct mptcb *mp_tp = mpte->mpte_mptcb;
5180 char key_digest[SHA1_RESULTLEN];
5181
5182 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5183 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5184
5185 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
5186 (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5187 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
5188 (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t));
5189
5190 /* The subflow SYN is also the first MPTCP byte */
5191 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5192 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5193
5194 mptcp_conn_properties(mp_tp);
5195 }
5196
5197 int
5198 mptcp_init_remote_parms(struct mptcb *mp_tp)
5199 {
5200 char remote_digest[SHA1_RESULTLEN];
5201
5202 /* Only Version 0 is supported for auth purposes */
5203 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) {
5204 return -1;
5205 }
5206
5207 /* Setup local and remote tokens and Initial DSNs */
5208 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5209 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
5210 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5211 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
5212 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t));
5213 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5214 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5215
5216 return 0;
5217 }
5218
5219 static void
5220 mptcp_send_dfin(struct socket *so)
5221 {
5222 struct tcpcb *tp = NULL;
5223 struct inpcb *inp = NULL;
5224
5225 inp = sotoinpcb(so);
5226 if (!inp) {
5227 return;
5228 }
5229
5230 tp = intotcpcb(inp);
5231 if (!tp) {
5232 return;
5233 }
5234
5235 if (!(tp->t_mpflags & TMPF_RESET)) {
5236 tp->t_mpflags |= TMPF_SEND_DFIN;
5237 }
5238 }
5239
5240 /*
5241 * Data Sequence Mapping routines
5242 */
5243 void
5244 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5245 {
5246 struct mptcb *mp_tp;
5247
5248 if (m == NULL) {
5249 return;
5250 }
5251
5252 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5253
5254 while (m) {
5255 VERIFY(m->m_flags & M_PKTHDR);
5256 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5257 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5258 m->m_pkthdr.mp_rlen = m_pktlen(m);
5259 mp_tp->mpt_sndmax += m_pktlen(m);
5260 m = m->m_next;
5261 }
5262 }
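
/*
 * For example, with mpt_sndmax at 1000 and a chain of two packet mbufs of
 * 100 and 200 bytes, the first is stamped mp_dsn 1000 / mp_rlen 100 and
 * the second mp_dsn 1100 / mp_rlen 200, leaving mpt_sndmax at 1300.
 */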
5263
5264 void
5265 mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
5266 {
5267 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
5268 uint64_t data_ack;
5269 uint64_t dsn;
5270
5271 if (!m || len == 0) {
5272 return;
5273 }
5274
5275 while (m && len > 0) {
5276 VERIFY(m->m_flags & M_PKTHDR);
5277 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5278
5279 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
5280 dsn = m->m_pkthdr.mp_dsn;
5281
5282 len -= m->m_len;
5283 m = m->m_next;
5284 }
5285
5286 if (m && len == 0) {
5287 /*
5288 * If there is one more mbuf in the chain, it automatically means
5289 * that up to m->mp_dsn has been ack'ed.
5290 *
5291 * This means, we actually correct data_ack back down (compared
5292 * to what we set inside the loop - dsn + data_len). Because in
5293 * the loop we are "optimistic" and assume that the full mapping
5294 * will be acked. If that's not the case and we get out of the
5295 * loop with m != NULL, it means only up to m->mp_dsn has been
5296 * really acked.
5297 */
5298 data_ack = m->m_pkthdr.mp_dsn;
5299 }
5300
5301 if (len < 0) {
5302 /*
5303 * If len is negative, meaning we acked in the middle of an mbuf,
5304 * only up to this mbuf's data-sequence number has been acked
5305 * at the MPTCP-level.
5306 */
5307 data_ack = dsn;
5308 }
5309
5310 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
5311 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5312
5313 /* We can have data in the subflow's send-queue that is being acked,
5314 * while the DATA_ACK has already advanced. Thus, we should check whether
5315 * or not the DATA_ACK is actually new here.
5316 */
5317 if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
5318 MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
5319 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
5320 }
5321 }
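
/*
 * A worked example of the inference above: suppose the send queue holds
 * two 50-byte mbufs carrying the mappings mp_dsn 100 / mp_rlen 50 and
 * mp_dsn 150 / mp_rlen 50, and TCP acks len = 70 bytes. The loop
 * optimistically raises data_ack to 200 while walking, but exits with
 * len == -30, so data_ack is corrected back down to dsn == 150: only the
 * first mapping is known to be fully acked at the MPTCP level.
 */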
5322
5323 void
5324 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
5325 {
5326 int rewinding = 0;
5327
5328 /* TFO makes things complicated. */
5329 if (so->so_flags1 & SOF1_TFO_REWIND) {
5330 rewinding = 1;
5331 so->so_flags1 &= ~SOF1_TFO_REWIND;
5332 }
5333
5334 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
5335 u_int32_t sub_len;
5336 VERIFY(m->m_flags & M_PKTHDR);
5337 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5338
5339 sub_len = m->m_pkthdr.mp_rlen;
5340
5341 if (sub_len < len) {
5342 m->m_pkthdr.mp_dsn += sub_len;
5343 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5344 m->m_pkthdr.mp_rseq += sub_len;
5345 }
5346 m->m_pkthdr.mp_rlen = 0;
5347 len -= sub_len;
5348 } else {
5349 /* sub_len >= len */
5350 if (rewinding == 0) {
5351 m->m_pkthdr.mp_dsn += len;
5352 }
5353 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5354 if (rewinding == 0) {
5355 m->m_pkthdr.mp_rseq += len;
5356 }
5357 }
5358 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
5359 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
5360 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
5361 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5362 m->m_pkthdr.mp_rlen -= len;
5363 break;
5364 }
5365 m = m->m_next;
5366 }
5367
5368 if (so->so_flags & SOF_MP_SUBFLOW &&
5369 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
5370 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
5371 /*
5372 * Received an ack without receiving a DATA_ACK.
5373 * Need to fallback to regular TCP (or destroy this subflow).
5374 */
5375 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
5376 mptcp_notify_mpfail(so);
5377 }
5378 }
5379
5380 /* Obtain the DSN mapping stored in the mbuf */
5381 void
5382 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5383 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5384 {
5385 u_int64_t dsn64;
5386
5387 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5388 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5389 }
5390
5391 void
5392 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5393 uint32_t *relseq, uint16_t *data_len,
5394 uint16_t *dss_csum)
5395 {
5396 struct mbuf *m = so->so_snd.sb_mb;
5397 int off_orig = off;
5398
5399 VERIFY(off >= 0);
5400
5401 /*
5402 * In the subflow socket, the DSN sequencing can be discontiguous,
5403 * but the subflow sequence mapping is contiguous. Use the subflow
5404 * sequence property to find the right mbuf and corresponding dsn
5405 * mapping.
5406 */
5407
5408 while (m) {
5409 VERIFY(m->m_flags & M_PKTHDR);
5410 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5411
5412 if (off >= m->m_len) {
5413 off -= m->m_len;
5414 m = m->m_next;
5415 } else {
5416 break;
5417 }
5418 }
5419
5420 VERIFY(m);
5421 VERIFY(off >= 0);
5422 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5423
5424 *dsn = m->m_pkthdr.mp_dsn;
5425 *relseq = m->m_pkthdr.mp_rseq;
5426 *data_len = m->m_pkthdr.mp_rlen;
5427 *dss_csum = m->m_pkthdr.mp_csum;
5428
5429 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
5430 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
5431 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5432 }
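
/*
 * For example, with a send buffer holding two mbufs of m_len 100 and 200
 * and off = 250, the walk above consumes the first mbuf (off becomes 150)
 * and stops in the second; the dsn / relseq / data_len / dss_csum handed
 * back are the mapping stamped on that second mbuf, and the caller derives
 * its position within that mapping from the offset on its own.
 */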
5433
5434 /*
5435 * Note that this is called only from tcp_input() via mptcp_input_preproc().
5436 * tcp_input() may trim data after the DSN mapping is inserted into the mbuf;
5437 * when it trims data, tcp_input() calls m_adj(), which does not remove the
5438 * m_pkthdr even if m_len becomes 0 as a result of trimming the mbuf.
5439 * The DSN map insertion cannot be delayed until after the trim, because data
5440 * can sit in the reassembly queue for a while and the DSN option info in tp
5441 * will be overwritten for every new packet received.
5442 * The DSN map will be adjusted just prior to appending to the subflow
5443 * sockbuf, with mptcp_adj_rmap().
5444 */
5445 void
5446 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5447 {
5448 VERIFY(m->m_flags & M_PKTHDR);
5449 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5450
5451 if (tp->t_mpflags & TMPF_EMBED_DSN) {
5452 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5453 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5454 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5455 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5456 if (tp->t_rcv_map.mpt_dfin) {
5457 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5458 }
5459
5460 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5461
5462 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5463 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5464 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5465 if (th->th_flags & TH_FIN) {
5466 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5467 }
5468 }
5469 }
5470
5471 /*
5472 * The following routines help with failure detection and failover of data
5473 * transfer from one subflow to another.
5474 */
5475 void
5476 mptcp_act_on_txfail(struct socket *so)
5477 {
5478 struct tcpcb *tp = NULL;
5479 struct inpcb *inp = sotoinpcb(so);
5480
5481 if (inp == NULL) {
5482 return;
5483 }
5484
5485 tp = intotcpcb(inp);
5486 if (tp == NULL) {
5487 return;
5488 }
5489
5490 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5491 return;
5492 }
5493
5494 so->so_flags |= SOF_MP_TRYFAILOVER;
5495 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5496 }
5497
5498 /*
5499 * Support for MP_FAIL option
5500 */
5501 int
5502 mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
5503 {
5504 struct mbuf *m = so->so_snd.sb_mb;
5505 u_int64_t dsn;
5506 int off = 0;
5507 u_int32_t datalen;
5508
5509 if (m == NULL) {
5510 return -1;
5511 }
5512
5513 while (m != NULL) {
5514 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5515 VERIFY(m->m_flags & M_PKTHDR);
5516 dsn = m->m_pkthdr.mp_dsn;
5517 datalen = m->m_pkthdr.mp_rlen;
5518 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5519 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5520 off = dsn_fail - dsn;
5521 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5522 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
5523 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5524 return 0;
5525 }
5526
5527 m = m->m_next;
5528 }
5529
5530 /*
5531 * If there was no mbuf data and a fallback to TCP occurred, there's
5532 * not much else to do.
5533 */
5534
5535 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5536 return -1;
5537 }
5538
5539 /*
5540 * Support for sending contiguous MPTCP bytes in a subflow.
5541 * Also prevents sending data with the ACK of the 3-way handshake.
5542 */
5543 int32_t
5544 mptcp_adj_sendlen(struct socket *so, int32_t off)
5545 {
5546 struct tcpcb *tp = sototcpcb(so);
5547 struct mptsub *mpts = tp->t_mpsub;
5548 uint64_t mdss_dsn;
5549 uint32_t mdss_subflow_seq;
5550 int mdss_subflow_off;
5551 uint16_t mdss_data_len;
5552 uint16_t dss_csum;
5553
5554 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5555 &mdss_data_len, &dss_csum);
5556
5557 /*
5558 * We need to compute how much of the mapping still remains.
5559 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5560 */
5561 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5562
5563 /*
5564 * When TFO is used, we are sending mpts->mpts_iss even though the relative
5565 * seq has been set to 1 (while it should be 0).
5566 */
5567 if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5568 mdss_subflow_off--;
5569 }
5570
5571 if (off < mdss_subflow_off) {
5572 printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
5573 off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
5574 }
5575 VERIFY(off >= mdss_subflow_off);
5576
5577 mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
5578 __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
5579 mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5580 return mdss_data_len - (off - mdss_subflow_off);
5581 }
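
/*
 * A worked example (no TFO): assume mpts_iss = 1000, snd_una = 1200, and
 * the mapping found for this offset has mdss_subflow_seq = 400 and
 * mdss_data_len = 600. Then mdss_subflow_off = (400 + 1000) - 1200 = 200,
 * i.e. the mapping starts 200 bytes into the send buffer. For off = 250
 * the function returns 600 - (250 - 200) = 550: that many bytes of the
 * mapping remain to be sent from this offset onward.
 */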
5582
5583 static uint32_t
5584 mptcp_get_maxseg(struct mptses *mpte)
5585 {
5586 struct mptsub *mpts;
5587 uint32_t maxseg = 0;
5588
5589 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5590 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5591
5592 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5593 TCPS_HAVERCVDFIN2(tp->t_state)) {
5594 continue;
5595 }
5596
5597 if (tp->t_maxseg > maxseg) {
5598 maxseg = tp->t_maxseg;
5599 }
5600 }
5601
5602 return maxseg;
5603 }
5604
5605 static uint8_t
5606 mptcp_get_rcvscale(struct mptses *mpte)
5607 {
5608 struct mptsub *mpts;
5609 uint8_t rcvscale = UINT8_MAX;
5610
5611 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5612 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5613
5614 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5615 TCPS_HAVERCVDFIN2(tp->t_state)) {
5616 continue;
5617 }
5618
5619 if (tp->rcv_scale < rcvscale) {
5620 rcvscale = tp->rcv_scale;
5621 }
5622 }
5623
5624 return rcvscale;
5625 }
5626
5627 /* Similar to tcp_sbrcv_reserve */
5628 static void
5629 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5630 u_int32_t newsize, u_int32_t idealsize)
5631 {
5632 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5633
5634 /* newsize should not exceed max */
5635 newsize = min(newsize, tcp_autorcvbuf_max);
5636
5637 /* The receive window scale negotiated at the
5638 * beginning of the connection will also set a
5639 * limit on the socket buffer size
5640 */
5641 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5642
5643 /* Set new socket buffer size */
5644 if (newsize > sbrcv->sb_hiwat &&
5645 (sbreserve(sbrcv, newsize) == 1)) {
5646 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5647 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5648
5649 /* Again check the limit set by the advertised
5650 * window scale
5651 */
5652 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5653 TCP_MAXWIN << rcvscale);
5654 }
5655 }
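
/*
 * Concretely, the advertised window scale bounds what a larger buffer can
 * buy: with the smallest subflow rcv_scale at 6, TCP_MAXWIN << 6 is
 * 65535 * 64 bytes, roughly 4 MB, so both newsize and sb_idealsize are
 * clamped there no matter how large tcp_autorcvbuf_max is.
 */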
5656
5657 void
5658 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5659 {
5660 struct mptses *mpte = mp_tp->mpt_mpte;
5661 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5662 struct sockbuf *sbrcv = &mp_so->so_rcv;
5663 uint32_t hiwat_sum = 0;
5664 uint32_t ideal_sum = 0;
5665 struct mptsub *mpts;
5666
5667 /*
5668 * Do not grow the receive socket buffer if
5669 * - auto resizing is disabled, globally or on this socket
5670 * - the high water mark already reached the maximum
5671 * - the stream is in background and receive side is being
5672 * throttled
5673 * - if there are segments in reassembly queue indicating loss,
5674 * do not need to increase recv window during recovery as more
5675 * data is not going to be sent. A duplicate ack sent during
5676 * recovery should not change the receive window
5677 */
5678 if (tcp_do_autorcvbuf == 0 ||
5679 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5680 tcp_cansbgrow(sbrcv) == 0 ||
5681 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5682 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5683 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5684 /* Can not resize the socket buffer, just return */
5685 return;
5686 }
5687
5688 /*
5689 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5690 *
5691 * But, for this we first need accurate receiver-RTT estimations, which
5692 * we currently don't have.
5693 *
5694 * Let's use a dummy algorithm for now, just taking the sum of all
5695 * subflows' receive-buffers. It's too low, but that's all we can get
5696 * for now.
5697 */
5698
5699 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5700 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5701 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5702 }
5703
5704 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5705 }
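/*
 * Worked example of the placeholder algorithm above (assumed values):
 * two established subflows with receive buffers at sb_hiwat = 262144
 * and 131072 bytes yield a requested newsize of 393216 bytes for the
 * MPTCP-level buffer, which mptcp_sbrcv_reserve() then clips against
 * tcp_autorcvbuf_max and the negotiated window scale.
 */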
5706
5707 /*
5708 * Determine if we can grow the receive socket buffer to avoid sending
5709 * a zero window update to the peer. We allow even socket buffers that
5710 * have fixed size (set by the application) to grow if the resource
5711 * constraints are met. They will also be trimmed after the application
5712 * reads data.
5713 *
5714 * Similar to tcp_sbrcv_grow_rwin
5715 */
5716 static void
5717 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5718 {
5719 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5720 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5721 u_int32_t rcvbuf = sb->sb_hiwat;
5722
5723 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5724 return;
5725 }
5726
5727 if (tcp_do_autorcvbuf == 1 &&
5728 tcp_cansbgrow(sb) &&
5729 /* Difference from tcp_sbrcv_grow_rwin: skip if extended bk-idle is wanted */
5730 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5731 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5732 rcvbuf < tcp_autorcvbuf_max &&
5733 (sb->sb_idealsize > 0 &&
5734 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5735 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5736 }
5737 }
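/*
 * For illustration (assumed MSS): rcvbufinc is sixteen times the
 * largest established subflow MSS. With a 1460-byte MSS that is
 * 23360 bytes, so the buffer grows in 23360-byte steps whenever less
 * than that much space remains, until it reaches sb_idealsize +
 * rcvbufinc or tcp_autorcvbuf_max.
 */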
5738
5739 /* Similar to tcp_sbspace */
5740 int32_t
5741 mptcp_sbspace(struct mptcb *mp_tp)
5742 {
5743 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5744 uint32_t rcvbuf;
5745 int32_t space;
5746 int32_t pending = 0;
5747
5748 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5749
5750 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5751
5752 /* hiwat might have changed */
5753 rcvbuf = sb->sb_hiwat;
5754
5755 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5756 (sb->sb_mbmax - sb->sb_mbcnt)));
5757 if (space < 0) {
5758 space = 0;
5759 }
5760
5761 #if CONTENT_FILTER
5762 /* Compensate for data being processed by content filters */
5763 pending = cfil_sock_data_space(sb);
5764 #endif /* CONTENT_FILTER */
5765 if (pending > space) {
5766 space = 0;
5767 } else {
5768 space -= pending;
5769 }
5770
5771 return space;
5772 }
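/*
 * Worked example (assumed values): with sb_hiwat = 131072, 8192 bytes
 * queued in sb_cc and 4096 bytes held by a content filter, the
 * advertised space is min(131072 - 8192, mbuf-space) - 4096 = 118784
 * bytes, assuming the mbuf limit is not the tighter bound.
 */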
5773
5774 /*
5775 * Support Fallback to Regular TCP
5776 */
5777 void
5778 mptcp_notify_mpready(struct socket *so)
5779 {
5780 struct tcpcb *tp = NULL;
5781
5782 if (so == NULL) {
5783 return;
5784 }
5785
5786 tp = intotcpcb(sotoinpcb(so));
5787
5788 if (tp == NULL) {
5789 return;
5790 }
5791
5792 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5793 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5794 struct tcpcb *, tp);
5795
5796 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5797 return;
5798 }
5799
5800 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5801 return;
5802 }
5803
5804 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5805 tp->t_mpflags |= TMPF_MPTCP_READY;
5806
5807 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5808 }
5809
5810 void
5811 mptcp_notify_mpfail(struct socket *so)
5812 {
5813 struct tcpcb *tp = NULL;
5814
5815 if (so == NULL) {
5816 return;
5817 }
5818
5819 tp = intotcpcb(sotoinpcb(so));
5820
5821 if (tp == NULL) {
5822 return;
5823 }
5824
5825 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5826 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5827 struct tcpcb *, tp);
5828
5829 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5830 return;
5831 }
5832
5833 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5834 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5835
5836 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5837 }
5838
5839 /*
5840 * Keepalive helper function
5841 */
5842 boolean_t
5843 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5844 {
5845 boolean_t ret = 1;
5846
5847 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5848
5849 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5850 ret = 0;
5851 }
5852 return ret;
5853 }
5854
5855 /*
5856 * MPTCP t_maxseg adjustment function
5857 */
5858 int
5859 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5860 {
5861 int mss_lower = 0;
5862 struct mptcb *mp_tp = tptomptp(tp);
5863
5864 #define MPTCP_COMPUTE_LEN { \
5865 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5866 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5867 mss_lower += 2; /* account for the 2-byte DSS checksum */ \
5868 else \
5869 /* adjust to 32-bit boundary + EOL */ \
5870 mss_lower += 2; \
5871 }
5872 if (mp_tp == NULL) {
5873 return 0;
5874 }
5875
5876 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5877
5878 /*
5879 * For the first subflow and subsequent subflows, adjust mss for
5880 * most common MPTCP option size, for case where tcp_mss is called
5881 * during option processing and MTU discovery.
5882 */
5883 if (!mtudisc) {
5884 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5885 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5886 MPTCP_COMPUTE_LEN;
5887 }
5888
5889 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5890 tp->t_mpflags & TMPF_SENT_JOIN) {
5891 MPTCP_COMPUTE_LEN;
5892 }
5893 } else {
5894 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5895 MPTCP_COMPUTE_LEN;
5896 }
5897 }
5898
5899 return mss_lower;
5900 }
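/*
 * Sketch of a caller's view (the exact call site is an assumption):
 * code in the tcp_mss() path would subtract the returned value, e.g.
 *
 *	mss -= mptcp_adj_mss(tp, FALSE);
 *
 * so that the DSS option (plus its 2 bytes of checksum or padding,
 * per MPTCP_COMPUTE_LEN above) still fits into every segment.
 */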
5901
5902 /*
5903 * Update the pid, upid, uuid of the subflow so, based on parent so
5904 */
5905 void
5906 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5907 {
5908 if (so->last_pid != mp_so->last_pid ||
5909 so->last_upid != mp_so->last_upid) {
5910 so->last_upid = mp_so->last_upid;
5911 so->last_pid = mp_so->last_pid;
5912 uuid_copy(so->last_uuid, mp_so->last_uuid);
5913 }
5914 so_update_policy(so);
5915 }
5916
5917 static void
5918 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5919 {
5920 struct inpcb *inp;
5921
5922 tcp_getconninfo(so, &flow->flow_ci);
5923 inp = sotoinpcb(so);
5924 #if INET6
5925 if ((inp->inp_vflag & INP_IPV6) != 0) {
5926 flow->flow_src.ss_family = AF_INET6;
5927 flow->flow_dst.ss_family = AF_INET6;
5928 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5929 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5930 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5931 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5932 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5933 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5934 } else
5935 #endif /* INET6 */
5936 if ((inp->inp_vflag & INP_IPV4) != 0) {
5937 flow->flow_src.ss_family = AF_INET;
5938 flow->flow_dst.ss_family = AF_INET;
5939 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5940 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5941 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5942 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5943 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5944 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5945 }
5946 flow->flow_len = sizeof(*flow);
5947 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5948 flow->flow_flags = mpts->mpts_flags;
5949 flow->flow_cid = mpts->mpts_connid;
5950 flow->flow_relseq = mpts->mpts_rel_seq;
5951 flow->flow_soerror = mpts->mpts_socket->so_error;
5952 flow->flow_probecnt = mpts->mpts_probecnt;
5953 }
5954
5955 static int
5956 mptcp_pcblist SYSCTL_HANDLER_ARGS
5957 {
5958 #pragma unused(oidp, arg1, arg2)
5959 int error = 0, f;
5960 size_t len;
5961 struct mppcb *mpp;
5962 struct mptses *mpte;
5963 struct mptcb *mp_tp;
5964 struct mptsub *mpts;
5965 struct socket *so;
5966 conninfo_mptcp_t mptcpci;
5967 mptcp_flow_t *flows = NULL;
5968
5969 if (req->newptr != USER_ADDR_NULL) {
5970 return EPERM;
5971 }
5972
5973 lck_mtx_lock(&mtcbinfo.mppi_lock);
5974 if (req->oldptr == USER_ADDR_NULL) {
5975 size_t n = mtcbinfo.mppi_count;
5976 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5977 req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
5978 4 * (n + n / 8) * sizeof(mptcp_flow_t);
5979 return 0;
5980 }
5981 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5982 flows = NULL;
5983 socket_lock(mpp->mpp_socket, 1);
5984 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5985 mpte = mptompte(mpp);
5986
5987 socket_lock_assert_owned(mptetoso(mpte));
5988 mp_tp = mpte->mpte_mptcb;
5989
5990 bzero(&mptcpci, sizeof(mptcpci));
5991 mptcpci.mptcpci_state = mp_tp->mpt_state;
5992 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5993 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5994 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5995 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5996 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5997 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5998 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5999 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
6000 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
6001 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
6002 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
6003 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
6004 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
6005
6006 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
6007 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
6008 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
6009 mptcpci.mptcpci_flow_offset =
6010 offsetof(conninfo_mptcp_t, mptcpci_flows);
6011
6012 len = sizeof(*flows) * mpte->mpte_numflows;
6013 if (mpte->mpte_numflows != 0) {
6014 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
6015 if (flows == NULL) {
6016 socket_unlock(mpp->mpp_socket, 1);
6017 break;
6018 }
6019 mptcpci.mptcpci_len = sizeof(mptcpci) +
6020 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
6021 error = SYSCTL_OUT(req, &mptcpci,
6022 sizeof(mptcpci) - sizeof(mptcp_flow_t));
6023 } else {
6024 mptcpci.mptcpci_len = sizeof(mptcpci);
6025 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
6026 }
6027 if (error) {
6028 socket_unlock(mpp->mpp_socket, 1);
6029 FREE(flows, M_TEMP);
6030 break;
6031 }
6032 f = 0;
6033 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6034 so = mpts->mpts_socket;
6035 fill_mptcp_subflow(so, &flows[f], mpts);
6036 f++;
6037 }
6038 socket_unlock(mpp->mpp_socket, 1);
6039 if (flows) {
6040 error = SYSCTL_OUT(req, flows, len);
6041 FREE(flows, M_TEMP);
6042 if (error) {
6043 break;
6044 }
6045 }
6046 }
6047 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6048
6049 return error;
6050 }
6051
6052 SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
6053 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
6054 "List of active MPTCP connections");
6055
6056 /*
6057 * Set notsent lowat mark on the MPTCB
6058 */
6059 int
6060 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6061 {
6062 struct mptcb *mp_tp = NULL;
6063 int error = 0;
6064
6065 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6066 mp_tp = mpte->mpte_mptcb;
6067 }
6068
6069 if (mp_tp) {
6070 mp_tp->mpt_notsent_lowat = optval;
6071 } else {
6072 error = EINVAL;
6073 }
6074
6075 return error;
6076 }
6077
6078 u_int32_t
6079 mptcp_get_notsent_lowat(struct mptses *mpte)
6080 {
6081 struct mptcb *mp_tp = NULL;
6082
6083 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6084 mp_tp = mpte->mpte_mptcb;
6085 }
6086
6087 if (mp_tp) {
6088 return mp_tp->mpt_notsent_lowat;
6089 } else {
6090 return 0;
6091 }
6092 }
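/*
 * Usage sketch, assuming the lowat mark above is installed through the
 * standard TCP_NOTSENT_LOWAT socket option on the MPTCP socket:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/tcp.h>
 *
 *	int lowat = 16384;	// wake the writer when unsent <= 16 KB
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *	    &lowat, sizeof(lowat));
 *
 * mptcp_notsent_lowat_check() below then gates writability on this
 * mark rather than on raw send-buffer space.
 */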
6093
6094 int
6095 mptcp_notsent_lowat_check(struct socket *so)
6096 {
6097 struct mptses *mpte;
6098 struct mppcb *mpp;
6099 struct mptcb *mp_tp;
6100 struct mptsub *mpts;
6101
6102 int notsent = 0;
6103
6104 mpp = mpsotomppcb(so);
6105 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
6106 return 0;
6107 }
6108
6109 mpte = mptompte(mpp);
6110 socket_lock_assert_owned(mptetoso(mpte));
6111 mp_tp = mpte->mpte_mptcb;
6112
6113 notsent = so->so_snd.sb_cc;
6114
6115 if ((notsent == 0) ||
6116 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
6117 mp_tp->mpt_notsent_lowat)) {
6118 mptcplog((LOG_DEBUG, "MPTCP Sender: "
6119 "lowat %d notsent %d actual %d \n",
6120 mp_tp->mpt_notsent_lowat, notsent,
6121 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
6122 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6123 return 1;
6124 }
6125
6126 /* When Nagle's algorithm is not disabled, it is better
6127 * to wake up the client even before there is at least one
6128 * maxseg of data to write.
6129 */
6130 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6131 int retval = 0;
6132 if (mpts->mpts_flags & MPTSF_ACTIVE) {
6133 struct socket *subf_so = mpts->mpts_socket;
6134 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
6135
6136 notsent = so->so_snd.sb_cc -
6137 (tp->snd_nxt - tp->snd_una);
6138
6139 if ((tp->t_flags & TF_NODELAY) == 0 &&
6140 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
6141 retval = 1;
6142 }
6143 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
6144 " nodelay false \n",
6145 mp_tp->mpt_notsent_lowat, notsent),
6146 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6147 return retval;
6148 }
6149 }
6150 return 0;
6151 }
6152
6153 static errno_t
6154 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6155 void **unitinfo)
6156 {
6157 #pragma unused(kctlref, sac, unitinfo)
6158
6159 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6160 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6161 }
6162
6163 mptcp_kern_skt_unit = sac->sc_unit;
6164
6165 return 0;
6166 }
6167
6168 static void
6169 mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
6170 {
6171 struct mppcb *mpp;
6172
6173 /* Iterate over all MPTCP connections */
6174
6175 lck_mtx_lock(&mtcbinfo.mppi_lock);
6176
6177 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6178 struct socket *mp_so = mpp->mpp_socket;
6179 struct mptses *mpte = mpp->mpp_pcbe;
6180
6181 socket_lock(mp_so, 1);
6182
6183 if (mp_so->so_flags & SOF_DELEGATED &&
6184 uuid_compare(uuid, mp_so->e_uuid)) {
6185 goto next;
6186 } else if (!(mp_so->so_flags & SOF_DELEGATED) &&
6187 uuid_compare(uuid, mp_so->last_uuid)) {
6188 goto next;
6189 }
6190
6191 os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
6192 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);
6193
6194 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
6195
6196 if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
6197 mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
6198 }
6199
6200 mptcp_check_subflows_and_add(mpte);
6201 mptcp_remove_subflows(mpte);
6202
6203 mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);
6204
6205 next:
6206 socket_unlock(mp_so, 1);
6207 }
6208
6209 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6210 }
6211
6212 static void
6213 mptcp_wifi_status_changed(void)
6214 {
6215 struct mppcb *mpp;
6216
6217 /* Iterate over all MPTCP connections */
6218
6219 lck_mtx_lock(&mtcbinfo.mppi_lock);
6220
6221 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6222 struct socket *mp_so = mpp->mpp_socket;
6223 struct mptses *mpte = mpp->mpp_pcbe;
6224
6225 socket_lock(mp_so, 1);
6226
6227 /* Only handover- and target-based mode are purely driven by Symptoms' Wi-Fi status */
6228 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6229 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6230 goto next;
6231 }
6232
6233 mptcp_check_subflows_and_add(mpte);
6234 mptcp_check_subflows_and_remove(mpte);
6235
6236 next:
6237 socket_unlock(mp_so, 1);
6238 }
6239
6240 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6241 }
6242
6243 void
6244 mptcp_ask_symptoms(struct mptses *mpte)
6245 {
6246 struct mptcp_symptoms_ask_uuid ask;
6247 struct socket *mp_so;
6248 struct proc *p;
6249 int pid, prio, err;
6250
6251 if (mptcp_kern_skt_unit == 0) {
6252 os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
6253 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6254 return;
6255 }
6256
6257 mp_so = mptetoso(mpte);
6258
6259 if (mp_so->so_flags & SOF_DELEGATED) {
6260 pid = mp_so->e_pid;
6261 } else {
6262 pid = mp_so->last_pid;
6263 }
6264
6265 p = proc_find(pid);
6266 if (p == PROC_NULL) {
6267 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
6268 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
6269 return;
6270 }
6271
6272 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6273
6274 if (mp_so->so_flags & SOF_DELEGATED) {
6275 uuid_copy(ask.uuid, mp_so->e_uuid);
6276 } else {
6277 uuid_copy(ask.uuid, mp_so->last_uuid);
6278 }
6279
6280 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
6281
6282 if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
6283 prio == TASK_DARWINBG_APPLICATION) {
6284 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
6285 } else if (prio == TASK_FOREGROUND_APPLICATION) {
6286 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
6287 } else {
6288 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
6289 }
6290
6291 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
6292 &ask, sizeof(ask), CTL_DATA_EOR);
6293
6294 os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
6295 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
6296
6297
6298 proc_rele(p);
6299 }
6300
6301 static errno_t
6302 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6303 void *unitinfo)
6304 {
6305 #pragma unused(kctlref, kcunit, unitinfo)
6306
6307 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6308
6309 return 0;
6310 }
6311
6312 static errno_t
6313 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6314 mbuf_t m, int flags)
6315 {
6316 #pragma unused(kctlref, unitinfo, flags)
6317 symptoms_advisory_t *sa = NULL;
6318
6319 if (kcunit != mptcp_kern_skt_unit) {
6320 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6321 __func__, kcunit, mptcp_kern_skt_unit);
6322 }
6323
6324 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6325 mbuf_freem(m);
6326 return EINVAL;
6327 }
6328
6329 if (mbuf_len(m) < sizeof(*sa)) {
6330 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6331 __func__, mbuf_len(m), sizeof(*sa));
6332 mbuf_freem(m);
6333 return EINVAL;
6334 }
6335
6336 sa = mbuf_data(m);
6337
6338 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6339 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6340 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6341 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6342
6343 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6344 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6345 mptcp_wifi_status_changed();
6346 }
6347 } else {
6348 struct mptcp_symptoms_answer answer;
6349 errno_t err;
6350
6351 /* We temporarily allow different sizes for ease of submission */
6352 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6353 mbuf_len(m) != sizeof(answer)) {
6354 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6355 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6356 sizeof(answer));
6357 mbuf_freem(m); /* free the whole chain */
6358 return EINVAL;
6359 }
6360
6361 memset(&answer, 0, sizeof(answer));
6362
6363 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6364 if (err) {
6365 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6366 mbuf_freem(m); /* free the whole chain */
6367 return err;
6368 }
6369
6370 mptcp_allow_uuid(answer.uuid, answer.rssi);
6371 }
6372
6373 mbuf_freem(m);
6374 return 0;
6375 }
6376
6377 void
6378 mptcp_control_register(void)
6379 {
6380 /* Set up the advisory control socket */
6381 struct kern_ctl_reg mptcp_kern_ctl;
6382
6383 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6384 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6385 sizeof(mptcp_kern_ctl.ctl_name));
6386 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6387 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6388 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6389 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6390
6391 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6392 }
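/*
 * Sketch of the (privileged) client side of this control socket; apart
 * from MPTCP_KERN_CTL_NAME, everything is the standard kernel-control
 * API from <sys/kern_control.h> and <sys/sys_domain.h>:
 *
 *	struct ctl_info info;
 *	struct sockaddr_ctl addr;
 *	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
 *
 *	memset(&info, 0, sizeof(info));
 *	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME,
 *	    sizeof(info.ctl_name));
 *	ioctl(fd, CTLIOCGINFO, &info);		// name -> ctl_id
 *
 *	memset(&addr, 0, sizeof(addr));
 *	addr.sc_len = sizeof(addr);
 *	addr.sc_family = AF_SYSTEM;
 *	addr.ss_sysaddr = AF_SYS_CONTROL;
 *	addr.sc_id = info.ctl_id;
 *	addr.sc_unit = 0;			// kernel picks the unit
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 * A subsequent send() of a symptoms_advisory_t (or the larger answer
 * layout accepted above) is delivered to mptcp_symptoms_ctl_send().
 */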
6393
6394 /*
6395 * Three return-values:
6396 * 1 : WiFi is bad
6397 * 0 : WiFi is good
6398 * -1 : WiFi-state is unknown
6399 */
6400 int
6401 mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
6402 {
6403 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6404 if (mptcp_advisory.sa_wifi_status) {
6405 return symptoms_is_wifi_lossy() ? 1 : 0;
6406 }
6407
6408 /*
6409 * If it's a first-party app and we don't have any info
6410 * about the Wi-Fi state, let's be pessimistic.
6411 */
6412 return -1;
6413 } else {
6414 if (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) {
6415 return 1;
6416 }
6417
6418 /*
6419 * If we are target-based, we allow ourselves to be more lax about
6420 * declaring Wi-Fi "unusable". We only *know* the state once we
6421 * got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6422 *
6423 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6424 * be set.
6425 *
6426 * In any other case (while in target-mode), consider WiFi bad
6427 * and we are going to ask for allowance from Symptoms anyway.
6428 */
6429 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6430 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6431 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6432 return 0;
6433 }
6434
6435 return 1;
6436 }
6437
6438 return 0;
6439 }
6440 }
6441
6442 boolean_t
6443 symptoms_is_wifi_lossy(void)
6444 {
6445 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6446 }
6447
6448 /* If TFO data is successfully acked, it must be dropped from the MPTCP socket */
6449 static void
6450 mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
6451 {
6452 struct socket *mp_so = mptetoso(mpte);
6453 struct socket *so = mpts->mpts_socket;
6454 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
6455 struct mptcb *mp_tp = mpte->mpte_mptcb;
6456
6457 /* If data was sent with SYN, rewind state */
6458 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
6459 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
6460 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
6461
6462 VERIFY(mp_droplen <= (UINT_MAX));
6463 VERIFY(mp_droplen >= tcp_droplen);
6464
6465 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
6466 mpts->mpts_iss += tcp_droplen;
6467 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
6468
6469 if (mp_droplen > tcp_droplen) {
6470 /* handle partial TCP ack */
6471 mp_so->so_flags1 |= SOF1_TFO_REWIND;
6472 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
6473 mp_droplen = tcp_droplen;
6474 } else {
6475 /* all data on SYN was acked */
6476 mpts->mpts_rel_seq = 1;
6477 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
6478 }
6479 mp_tp->mpt_sndmax -= tcp_droplen;
6480
6481 if (mp_droplen != 0) {
6482 VERIFY(mp_so->so_snd.sb_mb != NULL);
6483 sbdrop(&mp_so->so_snd, (int)mp_droplen);
6484 }
6485 }
6486 }
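/*
 * Worked example (assumed values): suppose 100 bytes sit unacked at
 * the MPTCP level (mp_droplen = 100) while the SYN's data was acked
 * for only 60 bytes at the TCP level (tcp_droplen = 60). The
 * partial-ack branch rewinds mpt_sndnxt by the remaining 40 bytes,
 * flags SOF1_TFO_REWIND, and drops only the 60 acked bytes from the
 * send buffer, leaving the other 40 to be retransmitted.
 */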
6487
6488 int
6489 mptcp_freeq(struct mptcb *mp_tp)
6490 {
6491 struct tseg_qent *q;
6492 int rv = 0;
6493
6494 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6495 LIST_REMOVE(q, tqe_q);
6496 m_freem(q->tqe_m);
6497 zfree(tcp_reass_zone, q);
6498 rv = 1;
6499 }
6500 mp_tp->mpt_reassqlen = 0;
6501 return rv;
6502 }
6503
6504 static int
6505 mptcp_post_event(u_int32_t event_code, int value)
6506 {
6507 struct kev_mptcp_data event_data;
6508 struct kev_msg ev_msg;
6509
6510 memset(&ev_msg, 0, sizeof(ev_msg));
6511
6512 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6513 ev_msg.kev_class = KEV_NETWORK_CLASS;
6514 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6515 ev_msg.event_code = event_code;
6516
6517 event_data.value = value;
6518
6519 ev_msg.dv[0].data_ptr = &event_data;
6520 ev_msg.dv[0].data_length = sizeof(event_data);
6521
6522 return kev_post_msg(&ev_msg);
6523 }
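/*
 * Sketch of a userland listener for these events, using the standard
 * kernel-event API from <sys/kern_event.h> (consumer code assumed):
 *
 *	struct kev_request req;
 *	char buf[1024];
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *
 *	req.vendor_code = KEV_VENDOR_APPLE;
 *	req.kev_class = KEV_NETWORK_CLASS;
 *	req.kev_subclass = KEV_MPTCP_SUBCLASS;
 *	ioctl(fd, SIOCSKEVFILT, &req);	// filter to MPTCP events
 *
 *	// Each read() returns a struct kern_event_msg whose event_code
 *	// is e.g. KEV_MPTCP_CELLUSE and whose payload is the
 *	// kev_mptcp_data value posted above.
 *	read(fd, buf, sizeof(buf));
 */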
6524
6525 static void
6526 mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
6527 {
6528 int error;
6529
6530 /* First-party apps (Siri) don't flip the cellicon */
6531 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6532 return;
6533 }
6534
6535 /* Subflow is disappearing - don't set it on this one */
6536 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
6537 return;
6538 }
6539
6540 /* Remember the last time we set the cellicon. Needed for debouncing */
6541 mpte->mpte_last_cellicon_set = tcp_now;
6542
6543 if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
6544 mpte->mpte_cellicon_increments != 0) {
6545 if (mptcp_cellicon_refcount == 0) {
6546 os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
6547 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6548
6549 /* Continue, so that the icon gets set... */
6550 } else {
6551 /*
6552 * In this case, the cellicon is already set. No need to bump it
6553 * even higher
6554 */
6555
6556 return;
6557 }
6558 }
6559
6560 /* When tearing down this subflow, we need to decrement the
6561 * reference counter
6562 */
6563 mpts->mpts_flags |= MPTSF_CELLICON_SET;
6564
6565 /* Bump this counter, so that when the session gets destroyed we
6566 * can decrement the global reference counter by whatever is left
6567 */
6568 mpte->mpte_cellicon_increments++;
6569
6570 if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
6571 /* If cellicon is already set, get out of here! */
6572 return;
6573 }
6574
6575 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6576
6577 if (error) {
6578 os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
6579 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
6580 } else {
6581 os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
6582 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6583 }
6584 }
6585
6586 void
6587 mptcp_clear_cellicon(void)
6588 {
6589 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6590
6591 if (error) {
6592 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6593 __func__, error);
6594 } else {
6595 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6596 __func__);
6597 }
6598 }
6599
6600 /*
6601 * Returns true if the icon has been flipped to WiFi.
6602 */
6603 static boolean_t
6604 __mptcp_unset_cellicon(long val)
6605 {
6606 if (OSAddAtomic(-val, &mptcp_cellicon_refcount) != 1) {
6607 return false;
6608 }
6609
6610 mptcp_clear_cellicon();
6611
6612 return true;
6613 }
6614
6615 static void
6616 mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, long val)
6617 {
6618 /* First-party apps (Siri) don't flip the cellicon */
6619 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6620 return;
6621 }
6622
6623 if (mpte->mpte_cellicon_increments == 0) {
6624 /* This flow never used cell - get out of here! */
6625 return;
6626 }
6627
6628 if (mptcp_cellicon_refcount == 0) {
6629 os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
6630 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6631
6632 return;
6633 }
6634
6635 if (mpts) {
6636 if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
6637 return;
6638 }
6639
6640 mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
6641 }
6642
6643 mpte->mpte_cellicon_increments--;
6644
6645 if (__mptcp_unset_cellicon(val) == false) {
6646 return;
6647 }
6648
6649 /* All flows are gone - our counter should be at zero too! */
6650 if (mpte->mpte_cellicon_increments != 0) {
6651 os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
6652 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6653 }
6654 }
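/*
 * Note on the bookkeeping above: mptcp_cellicon_refcount is global
 * across all MPTCP sessions, while mpte_cellicon_increments is per
 * session. Setting the icon bumps both; unsetting decrements both, and
 * only the global counter's transition to zero posts the "off" event,
 * so the icon stays lit while any session still drives cell traffic.
 */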
6655
6656 void
6657 mptcp_reset_rexmit_state(struct tcpcb *tp)
6658 {
6659 struct mptsub *mpts;
6660 struct inpcb *inp;
6661 struct socket *so;
6662
6663 inp = tp->t_inpcb;
6664 if (inp == NULL) {
6665 return;
6666 }
6667
6668 so = inp->inp_socket;
6669 if (so == NULL) {
6670 return;
6671 }
6672
6673 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6674 return;
6675 }
6676
6677 mpts = tp->t_mpsub;
6678
6679 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6680 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6681 }
6682
6683 void
6684 mptcp_reset_keepalive(struct tcpcb *tp)
6685 {
6686 struct mptsub *mpts = tp->t_mpsub;
6687
6688 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6689 }