[apple/xnu.git] / bsd / netinet / mptcp_subr.c
1 /*
2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_fsm.h>
59 #include <netinet/tcp_seq.h>
60 #include <netinet/tcp_var.h>
61 #include <netinet/mptcp_var.h>
62 #include <netinet/mptcp.h>
63 #include <netinet/mptcp_opt.h>
64 #include <netinet/mptcp_seq.h>
65 #include <netinet/mptcp_timer.h>
66 #include <libkern/crypto/sha1.h>
67 #if INET6
68 #include <netinet6/in6_pcb.h>
69 #include <netinet6/ip6protosw.h>
70 #endif /* INET6 */
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90  * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103  * achieve the latter, tcp_lock/unlock has been changed to use the lock of
104  * the MPTCP socket instead.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
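
/*
 * Illustrative userspace sketch (added for clarity; not part of the
 * original source): opening an MPTCP socket as described above. The
 * PF_MULTIPATH domain and connectx(2) are the public Darwin interfaces;
 * 'dst' and all error handling are hypothetical/elided.
 *
 *	int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	sa_endpoints_t eps = {
 *		.sae_dstaddr = (struct sockaddr *)&dst,	// hypothetical peer
 *		.sae_dstaddrlen = sizeof(dst),
 *	};
 *	sae_connid_t cid;
 *	connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid);
 */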
111
112 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
113 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
114
115 static uint32_t mptcp_gc(struct mppcbinfo *);
116 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
118 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
120 static void mptcp_subflow_rupcall(struct socket *, void *, int);
121 static void mptcp_subflow_input(struct mptses *, struct mptsub *);
122 static void mptcp_subflow_wupcall(struct socket *, void *, int);
123 static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
124 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
125 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
126
127 static void mptcp_subflow_abort(struct mptsub *, int);
128
129 static void mptcp_send_dfin(struct socket *so);
130
131 /*
132 * Possible return values for subflow event handlers. Note that success
133  * values must be greater than or equal to MPTS_EVRET_OK. Values less than that
134 * indicate errors or actions which require immediate attention; they will
135 * prevent the rest of the handlers from processing their respective events
136 * until the next round of events processing.
137 */
138 typedef enum {
139 MPTS_EVRET_DELETE = 1, /* delete this subflow */
140 MPTS_EVRET_OK = 2, /* OK */
141 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
142 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
143 } ev_ret_t;
144
145 static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
146 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
147 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
148 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
149 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
150 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
151 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
152 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
153 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
154 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
155 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
156 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
157
158 static const char *mptcp_evret2str(ev_ret_t);
159
160 static void mptcp_do_sha1(mptcp_key_t *, char *);
161 static void mptcp_init_local_parms(struct mptses *);
162
163 static unsigned int mptsub_zone_size; /* size of mptsub */
164 static struct zone *mptsub_zone; /* zone for mptsub */
165
166 static unsigned int mptopt_zone_size; /* size of mptopt */
167 static struct zone *mptopt_zone; /* zone for mptopt */
168
169 static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
170 static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
171
172 struct mppcbinfo mtcbinfo;
173
174 #define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
175 #define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
176
177 SYSCTL_DECL(_net_inet);
178
179 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
180
181 uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
182 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
183 &mptcp_dbg_area, 0, "MPTCP debug area");
184
185 uint32_t mptcp_dbg_level = 1;
186 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
187 &mptcp_dbg_level, 0, "MPTCP debug level");
188
189 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
190 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
191
192
193 static int mptcp_alternate_port = 0;
194 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
195 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
196
197 static struct protosw mptcp_subflow_protosw;
198 static struct pr_usrreqs mptcp_subflow_usrreqs;
199 #if INET6
200 static struct ip6protosw mptcp_subflow_protosw6;
201 static struct pr_usrreqs mptcp_subflow_usrreqs6;
202 #endif /* INET6 */
203
204 static uint8_t mptcp_create_subflows_scheduled;
205
206 typedef struct mptcp_subflow_event_entry {
207 uint64_t sofilt_hint_mask;
208 ev_ret_t (*sofilt_hint_ev_hdlr)(
209 struct mptses *mpte,
210 struct mptsub *mpts,
211 uint64_t *p_mpsofilt_hint,
212 uint64_t event);
213 } mptsub_ev_entry_t;
214
215 static uint8_t mptcp_cellicon_is_set;
216 static uint32_t mptcp_last_cellicon_set;
217 #define MPTCP_CELLICON_TOGGLE_RATE (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
218
219 /*
220 * XXX The order of the event handlers below is really
221 * really important. Think twice before changing it.
222 */
223 static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
224 {
225 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
226 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
227 },
228 {
229 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
230 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
231 },
232 {
233 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
234 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
235 },
236 {
237 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
238 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
239 },
240 {
241 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
242 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
243 },
244 {
245 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
246 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
247 },
248 {
249 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
250 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
251 },
252 {
253 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
254 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
255 },
256 {
257 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
258 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
259 },
260 {
261 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
262 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
263 },
264 {
265 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
266 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
267 },
268 {
269 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
270 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
271 },
272 {
273 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
274 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
275 },
276 };
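
/*
 * Sketch (added for clarity; not part of the original source) of how
 * mptcp_subflow_events() is expected to walk the table above: handlers
 * run strictly in table order, and a return value below MPTS_EVRET_OK
 * stops processing of the remaining pending events until the next round.
 * Local names are illustrative.
 *
 *	for (i = 0; i < sizeof(mpsub_ev_entry_tbl) / sizeof(mpsub_ev_entry_tbl[0]); i++) {
 *		if (!(events & mpsub_ev_entry_tbl[i].sofilt_hint_mask))
 *			continue;
 *		ret = mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts,
 *		    p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
 *		events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
 *		if (ret < MPTS_EVRET_OK)
 *			break;	// error/action takes precedence over later events
 *	}
 */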
277
278 os_log_t mptcp_log_handle;
279
280 /*
281 * Protocol pr_init callback.
282 */
283 void
284 mptcp_init(struct protosw *pp, struct domain *dp)
285 {
286 #pragma unused(dp)
287 static int mptcp_initialized = 0;
288 struct protosw *prp;
289 #if INET6
290 struct ip6protosw *prp6;
291 #endif /* INET6 */
292
293 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
294
295 /* do this only once */
296 if (mptcp_initialized) {
297 return;
298 }
299 mptcp_initialized = 1;
300
301 /*
302 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
303 * we must be able to find IPPROTO_TCP entries for both.
304 */
305 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
306 VERIFY(prp != NULL);
307 bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
308 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
309 sizeof(mptcp_subflow_usrreqs));
310 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
311 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
312 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
313 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
314 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
315 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
316 /*
317 * Socket filters shouldn't attach/detach to/from this protosw
318 * since pr_protosw is to be used instead, which points to the
319 * real protocol; if they do, it is a bug and we should panic.
320 */
321 mptcp_subflow_protosw.pr_filter_head.tqh_first =
322 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
323 mptcp_subflow_protosw.pr_filter_head.tqh_last =
324 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
325
326 #if INET6
327 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
328 IPPROTO_TCP, SOCK_STREAM);
329 VERIFY(prp6 != NULL);
330 bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
331 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
332 sizeof(mptcp_subflow_usrreqs6));
333 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
334 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
335 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
336 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
337 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
338 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
339 /*
340 * Socket filters shouldn't attach/detach to/from this protosw
341 * since pr_protosw is to be used instead, which points to the
342 * real protocol; if they do, it is a bug and we should panic.
343 */
344 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
345 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
346 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
347 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
348 #endif /* INET6 */
349
350 bzero(&mtcbinfo, sizeof(mtcbinfo));
351 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
352 mtcbinfo.mppi_size = sizeof(struct mpp_mtp);
353 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
354 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
355 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
356 /* NOTREACHED */
357 }
358 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
359 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
360
361 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
362 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
363 mtcbinfo.mppi_lock_grp_attr);
364 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
365 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
366 mtcbinfo.mppi_lock_attr);
367
368 mtcbinfo.mppi_gc = mptcp_gc;
369 mtcbinfo.mppi_timer = mptcp_timer;
370
371 /* attach to MP domain for garbage collection to take place */
372 mp_pcbinfo_attach(&mtcbinfo);
373
374 mptsub_zone_size = sizeof(struct mptsub);
375 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
376 8192, "mptsub")) == NULL) {
377 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
378 /* NOTREACHED */
379 }
380 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
381 zone_change(mptsub_zone, Z_EXPAND, TRUE);
382
383 mptopt_zone_size = sizeof(struct mptopt);
384 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
385 1024, "mptopt")) == NULL) {
386 panic("%s: unable to allocate MPTCP option zone\n", __func__);
387 /* NOTREACHED */
388 }
389 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
390 zone_change(mptopt_zone, Z_EXPAND, TRUE);
391
392 mpt_subauth_entry_size = sizeof(struct mptcp_subf_auth_entry);
393 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
394 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
395 panic("%s: unable to allocate MPTCP address auth zone \n",
396 __func__);
397 /* NOTREACHED */
398 }
399 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
400 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
401
402 mptcp_last_cellicon_set = tcp_now;
403
404 mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
405 }
406
407 int
408 mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
409 {
410 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
411
412 int i, index = -1;
413
414 if (ifp == NULL) {
415 mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
416 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
417 return -1;
418 }
419
420 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
421 if (stats[i].ifindex == IFSCOPE_NONE) {
422 if (index < 0) {
423 index = i;
424 }
425 continue;
426 }
427
428 if (stats[i].ifindex == ifp->if_index) {
429 index = i;
430 return index;
431 }
432 }
433
434 if (index != -1) {
435 stats[index].ifindex = ifp->if_index;
436 if (stats[index].is_expensive == 0) {
437 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
438 }
439 }
440
441 return index;
442 }
443
444 void
445 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
446 {
447 int index;
448
449 tcpstat.tcps_mp_switches++;
450 mpte->mpte_subflow_switches++;
451
452 index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
453
454 if (index != -1) {
455 mpte->mpte_itfstats[index].switches++;
456 }
457 }
458
459 /*
460 * Flushes all recorded socket options from an MP socket.
461 */
462 static void
463 mptcp_flush_sopts(struct mptses *mpte)
464 {
465 struct mptopt *mpo, *tmpo;
466
467 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
468 mptcp_sopt_remove(mpte, mpo);
469 mptcp_sopt_free(mpo);
470 }
471 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
472 }
473
474 /*
475  * Create an MPTCP session, called as a result of opening an MPTCP socket.
476 */
477 int
478 mptcp_sescreate(struct mppcb *mpp)
479 {
480 struct mppcbinfo *mppi;
481 struct mptses *mpte;
482 struct mptcb *mp_tp;
483
484 VERIFY(mpp != NULL);
485 mppi = mpp->mpp_pcbinfo;
486 VERIFY(mppi != NULL);
487
488 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
489 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
490
491 /* MPTCP Multipath PCB Extension */
492 bzero(mpte, sizeof(*mpte));
493 VERIFY(mpp->mpp_pcbe == NULL);
494 mpp->mpp_pcbe = mpte;
495 mpte->mpte_mppcb = mpp;
496 mpte->mpte_mptcb = mp_tp;
497
498 TAILQ_INIT(&mpte->mpte_sopts);
499 TAILQ_INIT(&mpte->mpte_subflows);
500 mpte->mpte_associd = SAE_ASSOCID_ANY;
501 mpte->mpte_connid_last = SAE_CONNID_ANY;
502
503 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
504 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
505
506 if (mptcp_alternate_port) {
507 mpte->mpte_alternate_port = htons(mptcp_alternate_port);
508 }
509
510 /* MPTCP Protocol Control Block */
511 bzero(mp_tp, sizeof(*mp_tp));
512 mp_tp->mpt_mpte = mpte;
513 mp_tp->mpt_state = MPTCPS_CLOSED;
514
515 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
516
517 return 0;
518 }
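
/*
 * For reference (sketch; the authoritative definition lives in
 * mptcp_var.h): the co-allocated block addressed by the casts above has
 * approximately this layout, which is why mpte and mp_tp can be derived
 * from mpp by simple member offsets:
 *
 *	struct mpp_mtp {
 *		struct mppcb	mpp;		// Multipath PCB
 *		struct mptses	mpp_ses;	// MPTCP Session
 *		struct mptcb	mtcb;		// MPTCP PCB
 *	};
 */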
519
520 static void
521 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
522 uint64_t *cellbytes, uint64_t *allbytes)
523 {
524 int64_t mycellbytes = 0;
525 uint64_t myallbytes = 0;
526 int i;
527
528 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
529 if (mpte->mpte_itfstats[i].is_expensive) {
530 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
531 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
532 }
533
534 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
535 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
536 }
537
538 if (initial_cell) {
539 mycellbytes -= mpte->mpte_init_txbytes;
540          mycellbytes -= mpte->mpte_init_rxbytes;
541 }
542
543 if (mycellbytes < 0) {
544 mptcplog((LOG_ERR, "%s cellbytes is %d\n", __func__, mycellbytes),
545 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
546 *cellbytes = 0;
547 *allbytes = 0;
548 } else {
549 *cellbytes = mycellbytes;
550 *allbytes = myallbytes;
551 }
552 }
553
554 static void
555 mptcpstats_session_wrapup(struct mptses *mpte)
556 {
557 boolean_t cell = mpte->mpte_initial_cell;
558
559 switch (mpte->mpte_svctype) {
560 case MPTCP_SVCTYPE_HANDOVER:
561 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
562 tcpstat.tcps_mptcp_fp_handover_attempt++;
563
564 if (cell && mpte->mpte_handshake_success) {
565 tcpstat.tcps_mptcp_fp_handover_success_cell++;
566
567 if (mpte->mpte_used_wifi) {
568 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
569 }
570 } else if (mpte->mpte_handshake_success) {
571 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
572
573 if (mpte->mpte_used_cell) {
574 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
575 }
576 }
577 } else {
578 tcpstat.tcps_mptcp_handover_attempt++;
579
580 if (cell && mpte->mpte_handshake_success) {
581 tcpstat.tcps_mptcp_handover_success_cell++;
582
583 if (mpte->mpte_used_wifi) {
584 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
585 }
586 } else if (mpte->mpte_handshake_success) {
587 tcpstat.tcps_mptcp_handover_success_wifi++;
588
589 if (mpte->mpte_used_cell) {
590 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
591 }
592 }
593 }
594
595 if (mpte->mpte_handshake_success) {
596 uint64_t cellbytes;
597 uint64_t allbytes;
598
599 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
600
601 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
602 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
603 }
604 break;
605 case MPTCP_SVCTYPE_INTERACTIVE:
606 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
607 tcpstat.tcps_mptcp_fp_interactive_attempt++;
608
609 if (mpte->mpte_handshake_success) {
610 tcpstat.tcps_mptcp_fp_interactive_success++;
611
612 if (!cell && mpte->mpte_used_cell) {
613 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
614 }
615 }
616 } else {
617 tcpstat.tcps_mptcp_interactive_attempt++;
618
619 if (mpte->mpte_handshake_success) {
620 tcpstat.tcps_mptcp_interactive_success++;
621
622 if (!cell && mpte->mpte_used_cell) {
623 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
624 }
625 }
626 }
627
628 if (mpte->mpte_handshake_success) {
629 uint64_t cellbytes;
630 uint64_t allbytes;
631
632 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
633
634 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
635 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
636 }
637 break;
638 case MPTCP_SVCTYPE_AGGREGATE:
639 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
640 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
641
642 if (mpte->mpte_handshake_success) {
643 tcpstat.tcps_mptcp_fp_aggregate_success++;
644 }
645 } else {
646 tcpstat.tcps_mptcp_aggregate_attempt++;
647
648 if (mpte->mpte_handshake_success) {
649 tcpstat.tcps_mptcp_aggregate_success++;
650 }
651 }
652
653 if (mpte->mpte_handshake_success) {
654 uint64_t cellbytes;
655 uint64_t allbytes;
656
657 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
658
659 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
660 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
661 }
662 break;
663 }
664
665 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
666 tcpstat.tcps_mptcp_back_to_wifi++;
667 }
668
669 if (mpte->mpte_triggered_cell) {
670 tcpstat.tcps_mptcp_triggered_cell++;
671 }
672 }
673
674 /*
675 * Destroy an MPTCP session.
676 */
677 static void
678 mptcp_session_destroy(struct mptses *mpte)
679 {
680 struct mptcb *mp_tp;
681
682 mpte_lock_assert_held(mpte); /* same as MP socket lock */
683
684 mp_tp = mpte->mpte_mptcb;
685 VERIFY(mp_tp != NULL);
686
687 mptcpstats_session_wrapup(mpte);
688
689 mptcp_unset_cellicon();
690
691 /*
692 * MPTCP Multipath PCB Extension section
693 */
694 mptcp_flush_sopts(mpte);
695 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
696
697 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
698 _FREE(mpte->mpte_itfinfo, M_TEMP);
699 }
700
701 mpte->mpte_itfinfo = NULL;
702
703 m_freem_list(mpte->mpte_reinjectq);
704
705 /*
706 * MPTCP Protocol Control Block section
707 */
708 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
709 struct mptcb *, mp_tp);
710 }
711
712 static boolean_t
713 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
714 {
715 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
716 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
717 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
718 }
719
720 static int
721 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
722 {
723 static const struct in6_addr well_known_prefix = {
724 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
725 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
726 0x00, 0x00, 0x00, 0x00},
727 };
728 char buf[MAX_IPv6_STR_LEN];
729 char *ptrv4 = (char *)addrv4;
730 char *ptr = (char *)addr;
731
732 if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
733 IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
734 IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
735 IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
736 IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
737 IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
738 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
739 return -1;
740 }
741
742 /* Check for the well-known prefix */
743 if (len == NAT64_PREFIX_LEN_96 &&
744 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
745 if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
746 IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
747 return -1;
748 }
749 }
750
751 switch (len) {
752 case NAT64_PREFIX_LEN_96:
753 memcpy(ptr + 12, ptrv4, 4);
754 break;
755 case NAT64_PREFIX_LEN_64:
756 memcpy(ptr + 9, ptrv4, 4);
757 break;
758 case NAT64_PREFIX_LEN_56:
759 memcpy(ptr + 7, ptrv4, 1);
760 memcpy(ptr + 9, ptrv4 + 1, 3);
761 break;
762 case NAT64_PREFIX_LEN_48:
763 memcpy(ptr + 6, ptrv4, 2);
764 memcpy(ptr + 9, ptrv4 + 2, 2);
765 break;
766 case NAT64_PREFIX_LEN_40:
767 memcpy(ptr + 5, ptrv4, 3);
768 memcpy(ptr + 9, ptrv4 + 3, 1);
769 break;
770 case NAT64_PREFIX_LEN_32:
771 memcpy(ptr + 4, ptrv4, 4);
772 break;
773 default:
774 panic("NAT64-prefix len is wrong: %u\n", len);
775 }
776
777 os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
778 __func__, len,
779 inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));
780
781 return 0;
782 }
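
/*
 * Worked example (added for clarity): with the well-known 96-bit prefix
 * 64:ff9b::/96 and an IPv4 destination of 192.0.2.33 (bytes c0 00 02 21),
 * the four v4 bytes are copied into bytes 12..15 of the v6 address,
 * yielding 64:ff9b::c000:221. For NAT64_PREFIX_LEN_64 the same bytes land
 * at offsets 9..12 instead, skipping byte 8 (the "u" octet of RFC 6052).
 */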
783
784 static void
785 mptcp_trigger_cell_bringup(struct mptses *mpte)
786 {
787 struct socket *mp_so = mptetoso(mpte);
788
789 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
790 uuid_string_t uuidstr;
791 int err;
792
793 mpte_unlock(mpte);
794 err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
795 TRUE);
796 mpte_lock(mpte);
797
798 if (err == 0) {
799 mpte->mpte_triggered_cell = 1;
800 }
801
802 uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
803 os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n",
804 __func__, uuidstr, err);
805 } else {
806 os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__);
807 }
808 }
809
810
811 void
812 mptcp_check_subflows_and_add(struct mptses *mpte)
813 {
814 struct mptcb *mp_tp = mpte->mpte_mptcb;
815 boolean_t cellular_viable = FALSE;
816 boolean_t want_cellular = TRUE;
817 uint32_t i;
818
819 if (!mptcp_ok_to_create_subflows(mp_tp)) {
820 return;
821 }
822
823 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
824 struct mpt_itf_info *info;
825 struct mptsub *mpts;
826 struct ifnet *ifp;
827 uint32_t ifindex;
828 int found = 0;
829
830 info = &mpte->mpte_itfinfo[i];
831
832 if (info->no_mptcp_support) {
833 continue;
834 }
835
836 ifindex = info->ifindex;
837 if (ifindex == IFSCOPE_NONE) {
838 continue;
839 }
840
841 ifnet_head_lock_shared();
842 ifp = ifindex2ifnet[ifindex];
843 ifnet_head_done();
844
845 if (ifp == NULL) {
846 continue;
847 }
848
849 if (IFNET_IS_CELLULAR(ifp)) {
850 cellular_viable = TRUE;
851 }
852
853 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
854 const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
855
856 if (subifp == NULL) {
857 continue;
858 }
859
860 /*
861                          * In Handover mode, only create a cell subflow if
862                          * 1. Wi-Fi Assist is active
863                          * 2. Symptoms marked WiFi as weak
864                          * 3. We are experiencing RTOs or we are not sending data.
865                          *
866                          * This covers the scenarios where:
867                          * 1. We send and get retransmission timeouts (thus,
868                          *    we confirmed that WiFi is indeed bad).
869                          * 2. We are not sending and the server tries to send.
870                          *    Establishing a cell-subflow gives the server a
871 * chance to send us some data over cell if WiFi
872 * is dead. We establish the subflow with the
873 * backup-bit set, so the server is not allowed to
874 * send on this subflow as long as WiFi is providing
875 * good performance.
876 */
877 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
878 !IFNET_IS_CELLULAR(subifp) &&
879 !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
880 (mptcp_is_wifi_unusable(mpte) == 0 ||
881 (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 &&
882 ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) {
883 os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n",
884 __func__, mptcp_is_wifi_unusable(mpte),
885 sototcpcb(mpts->mpts_socket)->t_rxtshift,
886 !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
887 mptetoso(mpte)->so_snd.sb_cc,
888 ifindex, subifp->if_index);
889 found = 1;
890
891 /* We found a proper subflow on WiFi - no need for cell */
892 want_cellular = FALSE;
893 break;
894 } else {
895 os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n",
896 __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
897 mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift,
898 !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc);
899 }
900
901 if (subifp->if_index == ifindex &&
902 !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
903 sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
904 /*
905 * We found a subflow on this interface.
906 * No need to create a new one.
907 */
908 found = 1;
909 break;
910 }
911 }
912
913 if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
914 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
915 mptcp_developer_mode == 0) {
916 mptcp_ask_symptoms(mpte);
917 return;
918 }
919
920 if (!found) {
921 struct sockaddr *dst = &mpte->mpte_dst;
922 struct sockaddr_in6 nat64pre;
923
924 if (mpte->mpte_dst.sa_family == AF_INET &&
925 !info->has_v4_conn && info->has_nat64_conn) {
926 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
927 int error, j;
928
929 bzero(&nat64pre, sizeof(struct sockaddr_in6));
930
931 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
932 if (error) {
933 os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n",
934 __func__, ifp->if_name, error);
935 continue;
936 }
937
938 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
939 if (nat64prefixes[j].prefix_len != 0) {
940 break;
941 }
942 }
943
944 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
945
946 error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
947 nat64prefixes[j].prefix_len,
948 &mpte->__mpte_dst_v4.sin_addr);
949 if (error != 0) {
950 os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n",
951 __func__);
952 continue;
953 }
954
955 memcpy(&nat64pre.sin6_addr,
956 &nat64prefixes[j].ipv6_prefix,
957 sizeof(nat64pre.sin6_addr));
958 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
959 nat64pre.sin6_family = AF_INET6;
960 nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
961 nat64pre.sin6_flowinfo = 0;
962 nat64pre.sin6_scope_id = 0;
963
964 dst = (struct sockaddr *)&nat64pre;
965 }
966
967 /* Initial subflow started on a NAT64'd address? */
968 if (mpte->mpte_dst.sa_family == AF_INET6 &&
969 mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
970 dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
971 }
972
973 if (dst->sa_family == AF_INET && !info->has_v4_conn) {
974 continue;
975 }
976 if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
977 continue;
978 }
979
980 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
981 }
982 }
983
984 if (!cellular_viable && want_cellular) {
985 /* Trigger Cell Bringup */
986 mptcp_trigger_cell_bringup(mpte);
987 }
988 }
989
990 /*
991 * Based on the MPTCP Service-type and the state of the subflows, we
992 * will destroy subflows here.
993 */
994 static void
995 mptcp_check_subflows_and_remove(struct mptses *mpte)
996 {
997 struct mptsub *mpts, *tmpts;
998 int found_working_subflow = 0, removed_some = 0;
999 int wifi_unusable = mptcp_is_wifi_unusable(mpte);
1000
1001 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER) {
1002 return;
1003 }
1004
1005 /*
1006 * Look for a subflow that is on a non-cellular interface
1007 * and actually works (aka, no retransmission timeout).
1008 */
1009 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1010 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1011 struct socket *so;
1012 struct tcpcb *tp;
1013
1014 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1015 continue;
1016 }
1017
1018 so = mpts->mpts_socket;
1019 tp = sototcpcb(so);
1020
1021 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1022 tp->t_state != TCPS_ESTABLISHED) {
1023 continue;
1024 }
1025
1026 /* Is this subflow in good condition? */
1027 if (tp->t_rxtshift == 0) {
1028 found_working_subflow = 1;
1029 }
1030
1031 /* Or WiFi is fine */
1032 if (!wifi_unusable) {
1033 found_working_subflow = 1;
1034 }
1035 }
1036
1037 /*
1038  * Couldn't find a working subflow; let's not remove those on a cellular
1039 * interface.
1040 */
1041 if (!found_working_subflow) {
1042 return;
1043 }
1044
1045 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1046 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1047
1048 /* Only remove cellular subflows */
1049 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
1050 continue;
1051 }
1052
1053 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1054 removed_some = 1;
1055 }
1056
1057 if (removed_some) {
1058 mptcp_unset_cellicon();
1059 }
1060 }
1061
1062 static void
1063 mptcp_remove_subflows(struct mptses *mpte)
1064 {
1065 struct mptsub *mpts, *tmpts;
1066
1067 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1068 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1069 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1070
1071 soevent(mpts->mpts_socket,
1072 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1073 }
1074 }
1075 }
1076
1077 static void
1078 mptcp_create_subflows(__unused void *arg)
1079 {
1080 struct mppcb *mpp;
1081
1082 /*
1083 * Start with clearing, because we might be processing connections
1084 * while a new event comes in.
1085 */
1086 if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
1087 mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
1088 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1089 }
1090
1091 /* Iterate over all MPTCP connections */
1092
1093 lck_mtx_lock(&mtcbinfo.mppi_lock);
1094
1095 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
1096 struct mptses *mpte;
1097 struct socket *mp_so;
1098
1099 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1100 continue;
1101 }
1102
1103 mpp_lock(mpp);
1104
1105 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
1106
1107 mpte = mpp->mpp_pcbe;
1108 mp_so = mpp->mpp_socket;
1109
1110 VERIFY(mp_so->so_usecount > 0);
1111
1112 mptcp_check_subflows_and_add(mpte);
1113 mptcp_remove_subflows(mpte);
1114
1115 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1116 mpp_unlock(mpp);
1117 }
1118
1119 lck_mtx_unlock(&mtcbinfo.mppi_lock);
1120 }
1121
1122 /*
1123 * We need this because we are coming from an NECP-event. This event gets posted
1124 * while holding NECP-locks. The creation of the subflow however leads us back
1125 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1126 * So, we would deadlock there as we already hold the NECP-lock.
1127 *
1128 * So, let's schedule this separately. It also gives NECP the chance to make
1129 * progress, without having to wait for MPTCP to finish its subflow creation.
1130 */
1131 void
1132 mptcp_sched_create_subflows(struct mptses *mpte)
1133 {
1134 struct mppcb *mpp = mpte->mpte_mppcb;
1135 struct mptcb *mp_tp = mpte->mpte_mptcb;
1136 struct socket *mp_so = mpp->mpp_socket;
1137
1138 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1139 mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
1140 __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
1141 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1142 return;
1143 }
1144
1145 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1146 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1147 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1148 }
1149
1150 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1151 return;
1152 }
1153
1154 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1155 timeout(mptcp_create_subflows, NULL, hz / 10);
1156 }
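
/*
 * Illustrative caller pattern (sketch; 'slot' and 'new_ifindex' are
 * hypothetical): an event handler that learns about a new usable
 * interface defers the actual subflow creation rather than calling
 * mptcp_check_subflows_and_add() inline, so NECP locks are never held
 * across the creation path:
 *
 *	mpte_lock_assert_held(mpte);
 *	mpte->mpte_itfinfo[slot].ifindex = new_ifindex;
 *	mptcp_sched_create_subflows(mpte);	// coalesced; runs ~100ms later
 */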
1157
1158 /*
1159 * Allocate an MPTCP socket option structure.
1160 */
1161 struct mptopt *
1162 mptcp_sopt_alloc(int how)
1163 {
1164 struct mptopt *mpo;
1165
1166 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
1167 zalloc_noblock(mptopt_zone);
1168 if (mpo != NULL) {
1169 bzero(mpo, mptopt_zone_size);
1170 }
1171
1172 return mpo;
1173 }
1174
1175 /*
1176 * Free an MPTCP socket option structure.
1177 */
1178 void
1179 mptcp_sopt_free(struct mptopt *mpo)
1180 {
1181 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1182
1183 zfree(mptopt_zone, mpo);
1184 }
1185
1186 /*
1187 * Add a socket option to the MPTCP socket option list.
1188 */
1189 void
1190 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1191 {
1192 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1193 mpo->mpo_flags |= MPOF_ATTACHED;
1194 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1195 }
1196
1197 /*
1198 * Remove a socket option from the MPTCP socket option list.
1199 */
1200 void
1201 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1202 {
1203 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1204 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1205 mpo->mpo_flags &= ~MPOF_ATTACHED;
1206 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1207 }
1208
1209 /*
1210 * Search for an existing <sopt_level,sopt_name> socket option.
1211 */
1212 struct mptopt *
1213 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1214 {
1215 struct mptopt *mpo;
1216
1217 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1218
1219 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1220 if (mpo->mpo_level == sopt->sopt_level &&
1221 mpo->mpo_name == sopt->sopt_name) {
1222 break;
1223 }
1224 }
1225 return mpo;
1226 }
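
/*
 * Usage sketch (added for clarity; 'sopt' and 'intval' are hypothetical):
 * how an eligible option would typically be recorded for later replay on
 * new subflows, matching the TAILQ_FOREACH_SAFE replay loop in
 * mptcp_subflow_socreate(). Error handling elided.
 *
 *	struct mptopt *mpo = mptcp_sopt_alloc(M_WAITOK);
 *	mpo->mpo_flags = MPOF_SUBFLOW_OK;
 *	mpo->mpo_level = sopt->sopt_level;
 *	mpo->mpo_name = sopt->sopt_name;
 *	mpo->mpo_intval = intval;
 *	mptcp_sopt_insert(mpte, mpo);
 */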
1227
1228 /*
1229  * Allocate an MPTCP subflow structure.
1230 */
1231 static struct mptsub *
1232 mptcp_subflow_alloc(void)
1233 {
1234 struct mptsub *mpts = zalloc(mptsub_zone);
1235
1236 if (mpts == NULL) {
1237 return NULL;
1238 }
1239
1240 bzero(mpts, mptsub_zone_size);
1241 return mpts;
1242 }
1243
1244 /*
1245 * Deallocate a subflow structure, called when all of the references held
1246 * on it have been released. This implies that the subflow has been deleted.
1247 */
1248 static void
1249 mptcp_subflow_free(struct mptsub *mpts)
1250 {
1251 VERIFY(mpts->mpts_refcnt == 0);
1252 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1253 VERIFY(mpts->mpts_mpte == NULL);
1254 VERIFY(mpts->mpts_socket == NULL);
1255
1256 if (mpts->mpts_src != NULL) {
1257 FREE(mpts->mpts_src, M_SONAME);
1258 mpts->mpts_src = NULL;
1259 }
1260
1261 zfree(mptsub_zone, mpts);
1262 }
1263
1264 static void
1265 mptcp_subflow_addref(struct mptsub *mpts)
1266 {
1267 if (++mpts->mpts_refcnt == 0) {
1268 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1269          /* NOTREACHED */
1270      }
1271 }
1272
1273 static void
1274 mptcp_subflow_remref(struct mptsub *mpts)
1275 {
1276 if (mpts->mpts_refcnt == 0) {
1277 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
1278 /* NOTREACHED */
1279 }
1280 if (--mpts->mpts_refcnt > 0) {
1281 return;
1282 }
1283
1284 /* callee will unlock and destroy lock */
1285 mptcp_subflow_free(mpts);
1286 }
1287
1288 static void
1289 mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
1290 {
1291 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
1292 struct tcpcb *tp = sototcpcb(so);
1293
1294 /*
1295 * From this moment on, the subflow is linked to the MPTCP-connection.
1296 * Locking,... happens now at the MPTCP-layer
1297 */
1298 tp->t_mptcb = mpte->mpte_mptcb;
1299 so->so_flags |= SOF_MP_SUBFLOW;
1300 mp_so->so_usecount++;
1301
1302 /*
1303 * Insert the subflow into the list, and associate the MPTCP PCB
1304      * as well as the subflow socket. From this point on, removing
1305 * the subflow needs to be done via mptcp_subflow_del().
1306 */
1307 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1308 mpte->mpte_numflows++;
1309
1310 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1311 mpts->mpts_mpte = mpte;
1312 mpts->mpts_socket = so;
1313 tp->t_mpsub = mpts;
1314 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1315 mptcp_subflow_addref(mpts); /* for subflow socket */
1316 }
1317
1318 static void
1319 mptcp_subflow_necp_cb(void *handle, __unused int action,
1320 __unused uint32_t interface_index,
1321 uint32_t necp_flags, bool *viable)
1322 {
1323 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1324 struct inpcb *inp = (struct inpcb *)handle;
1325 struct socket *so = inp->inp_socket;
1326 struct mptsub *mpts;
1327 struct mptses *mpte;
1328
1329 if (low_power) {
1330 action = NECP_CLIENT_CBACTION_NONVIABLE;
1331 }
1332
1333 if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
1334 return;
1335 }
1336
1337 /*
1338 * The socket is being garbage-collected. There is nothing to be done
1339 * here.
1340 */
1341 if (so->so_usecount == 0) {
1342 return;
1343 }
1344
1345 socket_lock(so, 1);
1346
1347 /* Check again after we acquired the lock. */
1348 if (so->so_usecount == 0) {
1349 goto out;
1350 }
1351
1352 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1353 mpts = sototcpcb(so)->t_mpsub;
1354
1355 os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u",
1356 __func__, mpts->mpts_ifscope, low_power);
1357
1358 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1359
1360 mptcp_sched_create_subflows(mpte);
1361
1362 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL) {
1363 *viable = 1;
1364 }
1365
1366 out:
1367 socket_unlock(so, 1);
1368 }
1369
1370 /*
1371 * Create an MPTCP subflow socket.
1372 */
1373 static int
1374 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1375 struct socket **so)
1376 {
1377 lck_mtx_t *subflow_mtx;
1378 struct mptopt smpo, *mpo, *tmpo;
1379 struct proc *p;
1380 struct socket *mp_so;
1381 int error;
1382
1383 *so = NULL;
1384 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1385 mp_so = mptetoso(mpte);
1386
1387 p = proc_find(mp_so->last_pid);
1388 if (p == PROC_NULL) {
1389 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1390 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1391
1392 return ESRCH;
1393 }
1394
1395 /*
1396 * Create the subflow socket (multipath subflow, non-blocking.)
1397 *
1398 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1399 * socket; it will be cleared when the socket is peeled off or closed.
1400 * It also indicates to the underlying TCP to handle MPTCP options.
1401 * A multipath subflow socket implies SS_NOFDREF state.
1402 */
1403
1404 /*
1405 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1406 * the ipi-lock. We cannot hold the socket-lock at that point.
1407 */
1408 mpte_unlock(mpte);
1409 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1410 SOCF_ASYNC, PROC_NULL);
1411 mpte_lock(mpte);
1412 if (error) {
1413 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
1414 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
1415 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1416
1417 proc_rele(p);
1418
1419 mptcp_subflow_free(mpts);
1420 return error;
1421 }
1422
1423 /*
1424 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1425 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1426 * Which is why we also need to get the lock with pr_getlock, as after
1427 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1428 */
1429 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1430 lck_mtx_lock(subflow_mtx);
1431
1432 /*
1433 * Must be the first thing we do, to make sure all pointers for this
1434 * subflow are set.
1435 */
1436 mptcp_subflow_attach(mpte, mpts, *so);
1437
1438 /*
1439 * A multipath subflow socket is used internally in the kernel,
1440      * therefore it does not have a file descriptor associated by
1441 * default.
1442 */
1443 (*so)->so_state |= SS_NOFDREF;
1444
1445 lck_mtx_unlock(subflow_mtx);
1446
1447 /* prevent the socket buffers from being compressed */
1448 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1449 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1450
1451 /* Inherit preconnect and TFO data flags */
1452 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1453 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1454 }
1455 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1456 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1457 }
1458
1459 /* Inherit uuid and create the related flow. */
1460 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1461 struct mptcb *mp_tp = mpte->mpte_mptcb;
1462
1463 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1464
1465 /*
1466          * A note on the unlock: With MPTCP, we call
1467          * necp_client_register_socket_flow multiple times. This is problematic,
1468 * because now the lock-ordering guarantee (first necp-locks,
1469 * then socket-locks) is no more respected. So, we need to
1470 * unlock here.
1471 */
1472 mpte_unlock(mpte);
1473 error = necp_client_register_socket_flow(mp_so->last_pid,
1474 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1475 mpte_lock(mpte);
1476
1477 if (error) {
1478 goto out_err;
1479 }
1480
1481 /* Possible state-change during the unlock above */
1482 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1483 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1484 goto out_err;
1485 }
1486
1487 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1488 } else {
1489          mptcplog((LOG_NOTICE, "%s: uuid is not set!\n", __func__),
1490 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1491 }
1492
1493 /* inherit the other socket options */
1494 bzero(&smpo, sizeof(smpo));
1495 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1496 smpo.mpo_level = SOL_SOCKET;
1497 smpo.mpo_intval = 1;
1498
1499 /* disable SIGPIPE */
1500 smpo.mpo_name = SO_NOSIGPIPE;
1501 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1502 goto out_err;
1503 }
1504
1505 /* find out if the subflow's source address goes away */
1506 smpo.mpo_name = SO_NOADDRERR;
1507 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1508 goto out_err;
1509 }
1510
1511 /* enable keepalive */
1512 smpo.mpo_name = SO_KEEPALIVE;
1513 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1514 goto out_err;
1515 }
1516
1517 smpo.mpo_level = IPPROTO_TCP;
1518 smpo.mpo_intval = mptcp_subflow_keeptime;
1519 smpo.mpo_name = TCP_KEEPALIVE;
1520 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1521 goto out_err;
1522 }
1523
1524 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1525 /*
1526 * On secondary subflows we might need to set the cell-fallback
1527 * flag (see conditions in mptcp_subflow_sosetopt).
1528 */
1529 smpo.mpo_level = SOL_SOCKET;
1530 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1531 smpo.mpo_intval = 1;
1532 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1533 goto out_err;
1534 }
1535 }
1536
1537 /* replay setsockopt(2) on the subflow sockets for eligible options */
1538 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1539 int interim;
1540
1541 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1542 continue;
1543 }
1544
1545 /*
1546 * Skip those that are handled internally; these options
1547 * should not have been recorded and marked with the
1548 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1549 */
1550 if (mpo->mpo_level == SOL_SOCKET &&
1551 (mpo->mpo_name == SO_NOSIGPIPE ||
1552 mpo->mpo_name == SO_NOADDRERR ||
1553 mpo->mpo_name == SO_KEEPALIVE)) {
1554 continue;
1555 }
1556
1557 interim = (mpo->mpo_flags & MPOF_INTERIM);
1558 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1559 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
1560 " sopt %s val %d interim record removed\n", __func__,
1561 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1562 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1563 mpo->mpo_intval),
1564 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1565 mptcp_sopt_remove(mpte, mpo);
1566 mptcp_sopt_free(mpo);
1567 continue;
1568 }
1569 }
1570
1571 /*
1572 * We need to receive everything that the subflow socket has,
1573 * so use a customized socket receive function. We will undo
1574 * this when the socket is peeled off or closed.
1575 */
1576 switch (dom) {
1577 case PF_INET:
1578 (*so)->so_proto = &mptcp_subflow_protosw;
1579 break;
1580 #if INET6
1581 case PF_INET6:
1582 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1583 break;
1584 #endif /* INET6 */
1585 default:
1586 VERIFY(0);
1587 /* NOTREACHED */
1588 }
1589
1590 proc_rele(p);
1591
1592 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1593 int, dom, int, error);
1594
1595 return 0;
1596
1597 out_err:
1598 mptcp_subflow_abort(mpts, error);
1599
1600 proc_rele(p);
1601
1602 mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
1603 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1604
1605 return error;
1606 }
1607
1608 /*
1609 * Close an MPTCP subflow socket.
1610 *
1611 * Note that this may be called on an embryonic subflow, and the only
1612 * thing that is guaranteed valid is the protocol-user request.
1613 */
1614 static void
1615 mptcp_subflow_soclose(struct mptsub *mpts)
1616 {
1617 struct socket *so = mpts->mpts_socket;
1618
1619 if (mpts->mpts_flags & MPTSF_CLOSED) {
1620 return;
1621 }
1622
1623 VERIFY(so != NULL);
1624 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1625 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1626
1627 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1628 struct socket *, so,
1629 struct sockbuf *, &so->so_rcv,
1630 struct sockbuf *, &so->so_snd,
1631 struct mptses *, mpts->mpts_mpte);
1632
1633 mpts->mpts_flags |= MPTSF_CLOSED;
1634
1635 if (so->so_retaincnt == 0) {
1636 soclose_locked(so);
1637
1638 return;
1639 } else {
1640 VERIFY(so->so_usecount > 0);
1641 so->so_usecount--;
1642 }
1643
1644 return;
1645 }
1646
1647 /*
1648 * Connect an MPTCP subflow socket.
1649 *
1650 * Note that in the pending connect case, the subflow socket may have been
1651 * bound to an interface and/or a source IP address which may no longer be
1652 * around by the time this routine is called; in that case the connect attempt
1653 * will most likely fail.
1654 */
1655 static int
1656 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1657 {
1658 char dbuf[MAX_IPv6_STR_LEN];
1659 struct socket *mp_so, *so;
1660 struct mptcb *mp_tp;
1661 struct sockaddr *dst;
1662 struct proc *p;
1663 int af, error, dport;
1664
1665 mp_so = mptetoso(mpte);
1666 mp_tp = mpte->mpte_mptcb;
1667 so = mpts->mpts_socket;
1668 af = mpts->mpts_dst.sa_family;
1669 dst = &mpts->mpts_dst;
1670
1671 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1672 VERIFY(mpts->mpts_socket != NULL);
1673 VERIFY(af == AF_INET || af == AF_INET6);
1674
1675 if (af == AF_INET) {
1676 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1677 dport = ntohs(SIN(dst)->sin_port);
1678 } else {
1679 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1680 dport = ntohs(SIN6(dst)->sin6_port);
1681 }
1682
1683 os_log_info(mptcp_log_handle,
1684 "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
1685 dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1686
1687 p = proc_find(mp_so->last_pid);
1688 if (p == PROC_NULL) {
1689 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1690 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1691
1692 return ESRCH;
1693 }
1694
1695 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1696
1697 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1698
1699 /* connect the subflow socket */
1700 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1701 p, mpts->mpts_ifscope,
1702 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1703
1704 mpts->mpts_iss = sototcpcb(so)->iss;
1705
1706 /* See tcp_connect_complete */
1707 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1708 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1709 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1710 }
1711
1712 /* Allocate a unique address id per subflow */
1713 mpte->mpte_addrid_last++;
1714 if (mpte->mpte_addrid_last == 0) {
1715 mpte->mpte_addrid_last++;
1716 }
1717
1718 proc_rele(p);
1719
1720 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1721 struct mptsub *, mpts, int, error);
1722 if (error) {
1723 mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
1724 __func__, error, mpts->mpts_ifscope),
1725 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1726 }
1727
1728 return error;
1729 }
1730
1731 /*
1732 * MPTCP subflow socket receive routine, derived from soreceive().
1733 */
1734 static int
1735 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1736 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1737 {
1738 #pragma unused(uio)
1739 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1740 int flags, error = 0;
1741 struct proc *p = current_proc();
1742 struct mbuf *m, **mp = mp0;
1743 boolean_t proc_held = FALSE;
1744
1745 mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
1746 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1747
1748 #ifdef MORE_LOCKING_DEBUG
1749 if (so->so_usecount == 1) {
1750 panic("%s: so=%x no other reference on socket\n", __func__, so);
1751 /* NOTREACHED */
1752 }
1753 #endif
1754 /*
1755 * We return all that is there in the subflow's socket receive buffer
1756 * to the MPTCP layer, so we require that the caller passes in the
1757 * expected parameters.
1758 */
1759 if (mp == NULL || controlp != NULL) {
1760 return EINVAL;
1761 }
1762
1763 *mp = NULL;
1764 if (psa != NULL) {
1765 *psa = NULL;
1766 }
1767 if (flagsp != NULL) {
1768 flags = *flagsp & ~MSG_EOR;
1769 } else {
1770 flags = 0;
1771 }
1772
1773 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
1774 return EOPNOTSUPP;
1775 }
1776
1777 flags |= (MSG_DONTWAIT | MSG_NBIO);
1778
1779 /*
1780 * If a recv attempt is made on a previously-accepted socket
1781 * that has been marked as inactive (disconnected), reject
1782 * the request.
1783 */
1784 if (so->so_flags & SOF_DEFUNCT) {
1785 struct sockbuf *sb = &so->so_rcv;
1786
1787 error = ENOTCONN;
1788 /*
1789 * This socket should have been disconnected and flushed
1790 * prior to being returned from sodefunct(); there should
1791 * be no data on its receive list, so panic otherwise.
1792 */
1793 if (so->so_state & SS_DEFUNCT) {
1794 sb_empty_assert(sb, __func__);
1795 }
1796 return error;
1797 }
1798
1799 /*
1800 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1801 * and if so just return to the caller. This could happen when
1802 * soreceive() is called by a socket upcall function during the
1803 * time the socket is freed. The socket buffer would have been
1804 * locked across the upcall, therefore we cannot put this thread
1805 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1806 * we may livelock), because the lock on the socket buffer will
1807 * only be released when the upcall routine returns to its caller.
1808 * Because the socket has been officially closed, there can be
1809 * no further read on it.
1810 *
1811 * A multipath subflow socket would have its SS_NOFDREF set by
1812 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1813 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1814 */
1815 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1816 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
1817 return 0;
1818 }
1819
1820 /*
1821 * For consistency with soreceive() semantics, we need to obey
1822 * SB_LOCK in case some other code path has locked the buffer.
1823 */
1824 error = sblock(&so->so_rcv, 0);
1825 if (error != 0) {
1826 return error;
1827 }
1828
1829 m = so->so_rcv.sb_mb;
1830 if (m == NULL) {
1831 /*
1832 * Panic if we notice inconsistencies in the socket's
1833 * receive list; both sb_mb and sb_cc should correctly
1834 * reflect the contents of the list, otherwise we may
1835 * end up with false positives during select() or poll()
1836 * which could put the application in a bad state.
1837 */
1838 SB_MB_CHECK(&so->so_rcv);
1839
1840 if (so->so_error != 0) {
1841 error = so->so_error;
1842 so->so_error = 0;
1843 goto release;
1844 }
1845
1846 if (so->so_state & SS_CANTRCVMORE) {
1847 goto release;
1848 }
1849
1850 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
1851 error = ENOTCONN;
1852 goto release;
1853 }
1854
1855 /*
1856 * MSG_DONTWAIT is implicitly set and this routine will
1857 * never block, so return EWOULDBLOCK when there is nothing.
1858 */
1859 error = EWOULDBLOCK;
1860 goto release;
1861 }
1862
1863 mptcp_update_last_owner(so, mp_so);
1864
1865 if (mp_so->last_pid != proc_pid(p)) {
1866 p = proc_find(mp_so->last_pid);
1867 if (p == PROC_NULL) {
1868 p = current_proc();
1869 } else {
1870 proc_held = TRUE;
1871 }
1872 }
1873
1874 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1875 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1876 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1877
1878 while (m != NULL) {
1879 int dlen = 0, dfin = 0, error_out = 0;
1880 struct mbuf *start = m;
1881 uint64_t dsn;
1882 uint32_t sseq;
1883 uint16_t orig_dlen;
1884 uint16_t csum;
1885
1886 VERIFY(m->m_nextpkt == NULL);
1887
1888 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1889 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
1890 dsn = m->m_pkthdr.mp_dsn;
1891 sseq = m->m_pkthdr.mp_rseq;
1892 csum = m->m_pkthdr.mp_csum;
1893 } else {
1894 /* We fell back to regular TCP */
1895 mptcp_adj_rmap(so, m, 0, 0, 0, 0);
1896
1897 sbfree(&so->so_rcv, m);
1898
1899 if (mp != NULL) {
1900 *mp = m;
1901 mp = &m->m_next;
1902 so->so_rcv.sb_mb = m = m->m_next;
1903 *mp = NULL;
1904 }
1905
1906 if (m != NULL) {
1907 so->so_rcv.sb_lastrecord = m;
1908 } else {
1909 SB_EMPTY_FIXUP(&so->so_rcv);
1910 }
1911
1912 continue;
1913 }
1914
1915 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
1916 dfin = 1;
1917 }
1918
1919 /*
1920 * Check if the full mapping is now present
1921 */
1922 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
1923 mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
1924 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
1925 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1926
1927 if (*mp0 == NULL) {
1928 error = EWOULDBLOCK;
1929 }
1930 goto release;
1931 }
1932
1933 /* Now, get the full mapping */
1934 while (dlen > 0) {
1935 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
1936 error_out = 1;
1937 error = EIO;
1938 dlen = 0;
1939 *mp0 = NULL;
1940 mptcp_subflow_abort(sototcpcb(so)->t_mpsub, ECONNABORTED);
1941 break;
1942 }
1943
1944 dlen -= m->m_len;
1945 sbfree(&so->so_rcv, m);
1946
1947 if (mp != NULL) {
1948 *mp = m;
1949 mp = &m->m_next;
1950 so->so_rcv.sb_mb = m = m->m_next;
1951 *mp = NULL;
1952 }
1953
1954 if (dlen - dfin == 0) {
1955 dlen = 0;
1956 }
1957
1958 VERIFY(dlen <= 0 || m);
1959 }
1960
1961 VERIFY(dlen == 0);
1962
1963 if (m != NULL) {
1964 so->so_rcv.sb_lastrecord = m;
1965 } else {
1966 SB_EMPTY_FIXUP(&so->so_rcv);
1967 }
1968
1969 if (error_out) {
1970 goto release;
1971 }
1972
1973 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
1974 error = EIO;
1975 *mp0 = NULL;
1976 goto release;
1977 }
1978
1979 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1980 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1981 }
1982
1983 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1984 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1985
1986 if (flagsp != NULL) {
1987 *flagsp |= flags;
1988 }
1989
1990 release:
1991 sbunlock(&so->so_rcv, TRUE);
1992
1993 if (proc_held) {
1994 proc_rele(p);
1995 }
1996
1997 return error;
1998 }
1999
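/*
 * Illustrative sketch (not compiled): the mapping-completeness test the
 * receive loop above applies before handing data to the MPTCP layer. A
 * DSS mapping of dlen bytes is passed up only once all of it sits in the
 * subflow receive buffer; a DATA_FIN (dfin) counts as one byte of the
 * mapping but never occupies buffer space. The helper name is
 * hypothetical.
 */
#if 0
static boolean_t
mptcp_mapping_is_complete(struct sockbuf *sb, int dlen, int dfin)
{
	return (int)sb->sb_cc >= dlen - dfin;
}
#endif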
2000 /*
2001 * MPTCP subflow socket send routine, derived from sosend().
2002 */
2003 static int
2004 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2005 struct mbuf *top, struct mbuf *control, int flags)
2006 {
2007 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2008 struct proc *p = current_proc();
2009 boolean_t en_tracing = FALSE, proc_held = FALSE;
2010 int en_tracing_val;
2011 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2012 int error;
2013
2014 VERIFY(control == NULL);
2015 VERIFY(addr == NULL);
2016 VERIFY(uio == NULL);
2017 VERIFY(flags == 0);
2018 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2019
2020 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2021 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2022
2023 /*
2024 * Trace if tracing is enabled, the socket is a network (vs. unix)
2025 * socket, and it is non-loopback.
2026 */
2027 if (ENTR_SHOULDTRACE &&
2028 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2029 struct inpcb *inp = sotoinpcb(so);
2030 if (inp->inp_last_outifp != NULL &&
2031 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2032 en_tracing = TRUE;
2033 en_tracing_val = top->m_pkthdr.len;
2034 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2035 VM_KERNEL_ADDRPERM(so),
2036 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2037 (int64_t)en_tracing_val);
2038 }
2039 }
2040
2041 mptcp_update_last_owner(so, mp_so);
2042
2043 if (mp_so->last_pid != proc_pid(p)) {
2044 p = proc_find(mp_so->last_pid);
2045 if (p == PROC_NULL) {
2046 p = current_proc();
2047 } else {
2048 proc_held = TRUE;
2049 }
2050 }
2051
2052 #if NECP
2053 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2054 #endif /* NECP */
2055
2056 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2057
2058 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
2059 if (error) {
2060 goto out;
2061 }
2062
2063 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2064 top = NULL;
2065
2066 out:
2067 if (top != NULL) {
2068 m_freem(top);
2069 }
2070
2071 if (proc_held) {
2072 proc_rele(p);
2073 }
2074
2075 soclearfastopen(so);
2076
2077 if (en_tracing) {
2078 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2079 VM_KERNEL_ADDRPERM(so),
2080 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2081 (int64_t)en_tracing_val);
2082 }
2083
2084 return error;
2085 }
2086
2087 /*
2088 * Establish an initial MPTCP connection (if first subflow and not yet
2089 * connected), or add a subflow to an existing MPTCP connection.
2090 */
2091 int
2092 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2093 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2094 {
2095 struct socket *mp_so, *so = NULL;
2096 struct mptcb *mp_tp;
2097 struct mptsub *mpts = NULL;
2098 int af, error = 0;
2099
2100 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2101 mp_so = mptetoso(mpte);
2102 mp_tp = mpte->mpte_mptcb;
2103
2104 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2105 /* If the remote end sends Data FIN, refuse subflow adds */
2106 mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
2107 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2108 error = ENOTCONN;
2109 goto out_err;
2110 }
2111
2112 mpts = mptcp_subflow_alloc();
2113 if (mpts == NULL) {
2114 mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
2115 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2116 error = ENOMEM;
2117 goto out_err;
2118 }
2119
2120 if (src) {
2121 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2122 error = EAFNOSUPPORT;
2123 goto out_err;
2124 }
2125
2126 if (src->sa_family == AF_INET &&
2127 src->sa_len != sizeof(struct sockaddr_in)) {
2128 error = EINVAL;
2129 goto out_err;
2130 }
2131
2132 if (src->sa_family == AF_INET6 &&
2133 src->sa_len != sizeof(struct sockaddr_in6)) {
2134 error = EINVAL;
2135 goto out_err;
2136 }
2137
2138 MALLOC(mpts->mpts_src, struct sockaddr *, src->sa_len, M_SONAME,
2139 M_WAITOK | M_ZERO);
2140 if (mpts->mpts_src == NULL) {
2141 error = ENOMEM;
2142 goto out_err;
2143 }
2144 bcopy(src, mpts->mpts_src, src->sa_len);
2145 }
2146
2147 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2148 error = EAFNOSUPPORT;
2149 goto out_err;
2150 }
2151
2152 if (dst->sa_family == AF_INET &&
2153 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2154 error = EINVAL;
2155 goto out_err;
2156 }
2157
2158 if (dst->sa_family == AF_INET6 &&
2159 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2160 error = EINVAL;
2161 goto out_err;
2162 }
2163
2164 memcpy(&mpts->mpts_dst, dst, dst->sa_len);
2165
2166 af = mpts->mpts_dst.sa_family;
2167
2168 ifnet_head_lock_shared();
2169 if (ifscope > (unsigned)if_index) {
2170 ifnet_head_done();
2171 error = ENXIO;
2172 goto out_err;
2173 }
2174 ifnet_head_done();
2175
2176 mpts->mpts_ifscope = ifscope;
2177
2178 /* create the subflow socket */
2179 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2180 /*
2181 * Return (error) without cleaning up, because up to here
2182 * all we did was create mpts.
2183 *
2184 * The contract is that the call to mptcp_subflow_socreate
2185 * moves ownership of mpts to mptcp_subflow_socreate.
2186 */
2187 return error;
2188 }
2189
2190 /*
2191 * We may be called from within the kernel. We still need to
2192 * account this one to the real app.
2193 */
2194 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2195
2196 /*
2197 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2198 * -1 (SAE_CONNID_ALL).
2199 */
2200 mpte->mpte_connid_last++;
2201 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2202 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2203 mpte->mpte_connid_last++;
2204 }
2205
2206 mpts->mpts_connid = mpte->mpte_connid_last;
2207
2208 mpts->mpts_rel_seq = 1;
2209
2210 /* Allocate a unique address id per subflow */
2211 mpte->mpte_addrid_last++;
2212 if (mpte->mpte_addrid_last == 0) {
2213 mpte->mpte_addrid_last++;
2214 }
2215
2216 /* register for subflow socket read/write events */
2217 sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);
2218
2219 /* Register for subflow socket control events */
2220 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2221 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2222 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2223 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2224 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2225 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2226 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2227 SO_FILT_HINT_ADAPTIVE_WTIMO);
2228
2229 /* sanity check */
2230 VERIFY(!(mpts->mpts_flags &
2231 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2232
2233 /*
2234 * Indicate to the TCP subflow whether or not it should establish
2235 * the initial MPTCP connection, or join an existing one. Fill
2236 * in the connection request structure with additional info needed
2237 * by the underlying TCP (to be used in the TCP options, etc.)
2238 */
2239 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2240 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2241
2242 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2243 mptcp_init_local_parms(mpte);
2244 }
2245 soisconnecting(mp_so);
2246
2247 /* If fastopen is requested, set state in mpts */
2248 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2249 mpts->mpts_flags |= MPTSF_TFO_REQD;
2250 }
2251 } else {
2252 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2253 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2254 }
2255 }
2256
2257 mpts->mpts_flags |= MPTSF_CONNECTING;
2258
2259 if (af == AF_INET || af == AF_INET6) {
2260 char dbuf[MAX_IPv6_STR_LEN];
2261
2262 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
2263 "mp_so 0x%llx dst %s[%d] cid %d "
2264 "[pending %s]\n", __func__,
2265 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2266 inet_ntop(af, ((af == AF_INET) ?
2267 (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
2268 (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
2269 dbuf, sizeof(dbuf)), ((af == AF_INET) ?
2270 ntohs(SIN(&mpts->mpts_dst)->sin_port) :
2271 ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
2272 mpts->mpts_connid,
2273 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
2274 "YES" : "NO")),
2275 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2276 }
2277
2278 /* connect right away if first attempt, or if join can be done now */
2279 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2280 error = mptcp_subflow_soconnectx(mpte, mpts);
2281 }
2282
2283 if (error) {
2284 goto out_err_close;
2285 }
2286
2287 if (pcid) {
2288 *pcid = mpts->mpts_connid;
2289 }
2290
2291 return 0;
2292
2293 out_err_close:
2294 mptcp_subflow_abort(mpts, error);
2295
2296 return error;
2297
2298 out_err:
2299 if (mpts) {
2300 mptcp_subflow_free(mpts);
2301 }
2302
2303 return error;
2304 }
2305
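/*
 * Illustrative usage (not compiled): joining an additional subflow to an
 * established MPTCP connection over a specific interface. The wrapper and
 * its arguments are hypothetical; error handling is elided, and the
 * caller is assumed to hold the MPTCP socket lock.
 */
#if 0
static void
example_join_subflow(struct mptses *mpte, struct sockaddr *dst,
    uint32_t ifindex)
{
	sae_connid_t cid;

	/* src == NULL lets the stack pick the source address */
	(void) mptcp_subflow_add(mpte, NULL, dst, ifindex, &cid);
}
#endif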
2306 void
2307 mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
2308 {
2309 int index = mptcp_get_statsindex(stats, mpts);
2310
2311 if (index != -1) {
2312 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2313
2314 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2315 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2316 }
2317 }
2318
2319 /*
2320 * Delete/remove a subflow from an MPTCP connection. The underlying subflow socket
2321 * will no longer be accessible after a subflow is deleted, thus this
2322 * should occur only after the subflow socket has been disconnected.
2323 */
2324 void
2325 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2326 {
2327 struct socket *mp_so = mptetoso(mpte);
2328 struct socket *so = mpts->mpts_socket;
2329 struct tcpcb *tp = sototcpcb(so);
2330
2331 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2332 VERIFY(mpts->mpts_mpte == mpte);
2333 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2334 VERIFY(mpte->mpte_numflows != 0);
2335 VERIFY(mp_so->so_usecount > 0);
2336
2337 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
2338 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2339 mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
2340 mpts->mpts_flags, mp_so->so_error),
2341 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2342
2343 mptcpstats_update(mpte->mpte_itfstats, mpts);
2344 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2345 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2346
2347 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2348 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2349 mpte->mpte_numflows--;
2350 if (mpte->mpte_active_sub == mpts) {
2351 mpte->mpte_active_sub = NULL;
2352 }
2353
2354 /*
2355 * Drop references held by this subflow socket; there
2356 * will be no further upcalls made from this point.
2357 */
2358 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2359 sock_catchevents_locked(so, NULL, NULL, 0);
2360
2361 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2362
2363 mp_so->so_usecount--; /* for subflow socket */
2364 mpts->mpts_mpte = NULL;
2365 mpts->mpts_socket = NULL;
2366
2367 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2368 mptcp_subflow_remref(mpts); /* for subflow socket */
2369
2370 so->so_flags &= ~SOF_MP_SUBFLOW;
2371 tp->t_mptcb = NULL;
2372 tp->t_mpsub = NULL;
2373 }
2374
2375 void
2376 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2377 {
2378 struct socket *so = mpts->mpts_socket;
2379 struct mptcb *mp_tp = mpte->mpte_mptcb;
2380 int send_dfin = 0;
2381
2382 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2383 send_dfin = 1;
2384 }
2385
2386 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2387 (so->so_state & SS_ISCONNECTED)) {
2388 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2389 __func__, mpts->mpts_connid, send_dfin),
2390 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2391
2392 if (send_dfin) {
2393 mptcp_send_dfin(so);
2394 }
2395 soshutdownlock(so, SHUT_WR);
2396 }
2397 }
2398
2399 static void
2400 mptcp_subflow_abort(struct mptsub *mpts, int error)
2401 {
2402 struct socket *so = mpts->mpts_socket;
2403 struct tcpcb *tp = sototcpcb(so);
2404
2405 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2406 return;
2407 }
2408
2409 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2410 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2411
2412 if (tp->t_state != TCPS_CLOSED) {
2413 tcp_drop(tp, error);
2414 }
2415
2416 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2417 }
2418
2419 /*
2420 * Disconnect a subflow socket.
2421 */
2422 void
2423 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2424 {
2425 struct socket *so;
2426 struct mptcb *mp_tp;
2427 int send_dfin = 0;
2428
2429 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2430
2431 VERIFY(mpts->mpts_mpte == mpte);
2432 VERIFY(mpts->mpts_socket != NULL);
2433
2434 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2435 return;
2436 }
2437
2438 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2439
2440 so = mpts->mpts_socket;
2441 mp_tp = mpte->mpte_mptcb;
2442 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2443 send_dfin = 1;
2444 }
2445
2446 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2447 (so->so_state & SS_ISCONNECTED)) {
2448 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2449 __func__, mpts->mpts_connid, send_dfin),
2450 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2451
2452 if (send_dfin) {
2453 mptcp_send_dfin(so);
2454 }
2455 (void) soshutdownlock(so, SHUT_RD);
2456 (void) soshutdownlock(so, SHUT_WR);
2457 (void) sodisconnectlocked(so);
2458 }
2459 /*
2460 * Generate a disconnect event for this subflow socket, in case
2461 * the lower layer doesn't do it; this is needed because the
2462 * subflow socket deletion relies on it.
2463 */
2464 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2465 }
2466
2467 /*
2468 * Called when the associated subflow socket posted a read event.
2469 */
2470 static void
2471 mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
2472 {
2473 #pragma unused(so, waitf)
2474 struct mptsub *mpts = arg, *tmpts;
2475 struct mptses *mpte = mpts->mpts_mpte;
2476
2477 VERIFY(mpte != NULL);
2478
2479 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2480 if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL)) {
2481 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2482 }
2483 return;
2484 }
2485
2486 mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
2487 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2488 if (mpts->mpts_socket->so_usecount == 0) {
2489 /* Will be removed soon by tcp_garbage_collect */
2490 continue;
2491 }
2492
2493 mptcp_subflow_addref(mpts);
2494 mpts->mpts_socket->so_usecount++;
2495
2496 mptcp_subflow_input(mpte, mpts);
2497
2498 mptcp_subflow_remref(mpts); /* ours */
2499
2500 VERIFY(mpts->mpts_socket->so_usecount != 0);
2501 mpts->mpts_socket->so_usecount--;
2502 }
2503
2504 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
2505 }
2506
2507 /*
2508 * Subflow socket input.
2509 */
2510 static void
2511 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2512 {
2513 struct socket *mp_so = mptetoso(mpte);
2514 struct mbuf *m = NULL;
2515 struct socket *so;
2516 int error, wakeup = 0;
2517
2518 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2519 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2520
2521 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2522 struct mptsub *, mpts);
2523
2524 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2525 goto out;
2526 }
2527
2528 so = mpts->mpts_socket;
2529
2530 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2531 if (error != 0 && error != EWOULDBLOCK) {
2532 mptcplog((LOG_ERR, "%s: cid %d error %d\n",
2533 __func__, mpts->mpts_connid, error),
2534 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
2535 if (error == ENODATA) {
2536 /*
2537 * Don't ignore ENODATA, so that we can
2538 * detect misbehaving middleboxes.
2539 */
2540 mp_so->so_error = ENODATA;
2541
2542 wakeup = 1;
2543 goto out;
2544 }
2545 } else if (error == 0) {
2546 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
2547 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2548 }
2549
2550 /* In fallback, still drain all subflows, but discard data from all but the active one */
2551 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2552 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2553 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2554 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2555 m_freem(m);
2556 goto out;
2557 }
2558
2559 if (m != NULL) {
2560 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2561 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
2562
2563 mpte->mpte_used_cell = 1;
2564 } else {
2565 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2566
2567 mpte->mpte_used_wifi = 1;
2568 }
2569
2570 mptcp_input(mpte, m);
2571 }
2572
2573 /* notify protocol that we drained all the data */
2574 if (error == 0 && m != NULL &&
2575 (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
2576 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
2577 }
2578
2579 out:
2580 if (wakeup) {
2581 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2582 }
2583
2584 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2585 }
2586
2587 /*
2588 * Subflow socket write upcall.
2589 *
2590 * Called when the associated subflow socket posted a write event.
2591 */
2592 static void
2593 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2594 {
2595 #pragma unused(so, waitf)
2596 struct mptsub *mpts = arg;
2597 struct mptses *mpte = mpts->mpts_mpte;
2598
2599 VERIFY(mpte != NULL);
2600
2601 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2602 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2603 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2604 }
2605 return;
2606 }
2607
2608 mptcp_output(mpte);
2609 }
2610
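/*
 * Illustrative sketch (not compiled): the deferral pattern shared by the
 * read/write upcalls above. While the MPTCP PCB is busy, an upcall only
 * records that a wakeup is wanted and mptcp_handle_deferred_upcalls()
 * replays the flagged work later. The helper and its flag parameters are
 * hypothetical; the mpp_flags manipulation mirrors the real upcalls.
 */
#if 0
static void
example_deferred_upcall(struct mptses *mpte, uint32_t inside_flag,
    uint32_t wakeup_flag)
{
	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & inside_flag)) {
			mpte->mpte_mppcb->mpp_flags |= wakeup_flag;
		}
		return;
	}
	/* ...perform the actual input/output work here... */
}
#endif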
2611 static boolean_t
2612 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2613 {
2614 struct mbuf *so_m = so->so_snd.sb_mb;
2615 uint64_t dsn = m->m_pkthdr.mp_dsn;
2616
2617 while (so_m) {
2618 VERIFY(so_m->m_flags & M_PKTHDR);
2619 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2620
2621 /* Part of the segment is covered, don't reinject here */
2622 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2623 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2624 return TRUE;
2625 }
2626
2627 so_m = so_m->m_next;
2628 }
2629
2630 return FALSE;
2631 }
2632
2633 /*
2634 * Subflow socket output.
2635 *
2636 * Called for sending data from MPTCP to the underlying subflow socket.
2637 */
2638 int
2639 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2640 {
2641 struct mptcb *mp_tp = mpte->mpte_mptcb;
2642 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2643 struct socket *mp_so, *so;
2644 struct tcpcb *tp;
2645 uint64_t mpt_dsn = 0, off = 0;
2646 int sb_cc = 0, error = 0, wakeup = 0;
2647 uint32_t dss_csum;
2648 uint16_t tot_sent = 0;
2649 boolean_t reinjected = FALSE;
2650
2651 mpte_lock_assert_held(mpte);
2652
2653 mp_so = mptetoso(mpte);
2654 so = mpts->mpts_socket;
2655 tp = sototcpcb(so);
2656
2657 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2658 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
2659
2660 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2661 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2662 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2663 (mpts->mpts_flags & MPTSF_TFO_REQD));
2664 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
2665
2666 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2667 __func__, mpts->mpts_flags, mpte->mpte_flags,
2668 mptcp_subflow_cwnd_space(so)),
2669 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2670 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2671 struct mptsub *, mpts);
2672
2673 /* The Remove Address option is not sent reliably, as per the MPTCP I-D */
2674 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
2675 tp->t_rem_aid = mpte->mpte_lost_aid;
2676 tp->t_mpflags |= TMPF_SND_REM_ADDR;
2677 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2678 }
2679
2680 /*
2681 * The mbuf chains containing the metadata (as well as pointing to
2682 * the user data sitting in the MPTCP output queue) are then sent
2683 * down to the subflow socket.
2684 *
2685 * Some notes on data sequencing:
2686 *
2687 * a. Each mbuf must be a M_PKTHDR.
2688 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2689 * in the mbuf pkthdr structure.
2690 * c. Each mbuf containing the MPTCP metadata must have its
2691 * pkt_flags marked with the PKTF_MPTCP flag.
2692 */
2693
2694 if (mpte->mpte_reinjectq) {
2695 sb_mb = mpte->mpte_reinjectq;
2696 } else {
2697 sb_mb = mp_so->so_snd.sb_mb;
2698 }
2699
2700 if (sb_mb == NULL) {
2701 mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
2702 __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
2703 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
2704 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2705
2706 /* Fix it to prevent looping */
2707 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
2708 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2709 }
2710 goto out;
2711 }
2712
2713 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
2714
2715 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
2716 !(so->so_state & SS_ISCONNECTED) &&
2717 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2718 tp->t_mpflags |= TMPF_TFO_REQUEST;
2719 goto zero_len_write;
2720 }
2721
2722 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2723
2724 /* First, drop acknowledged data */
2725 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2726 mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
2727 "dsn %u suna %u reinject? %u\n",
2728 __func__, (uint32_t)mpt_dsn,
2729 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
2730 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2731 if (mpte->mpte_reinjectq) {
2732 mptcp_clean_reinjectq(mpte);
2733 } else {
2734 uint64_t len = 0;
2735 len = mp_tp->mpt_snduna - mpt_dsn;
2736 sbdrop(&mp_so->so_snd, (int)len);
2737 wakeup = 1;
2738 }
2739 }
2740
2741 /* Check again because of above sbdrop */
2742 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
2743 mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
2744 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2745 goto out;
2746 }
2747
2748 /*
2749 * In degraded mode, we don't receive data acks, so force-free
2750 * mbufs below snd_nxt.
2751 */
2752 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2753 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
2754 mp_so->so_snd.sb_mb) {
2755 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
2756 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2757 uint64_t len = 0;
2758 len = mp_tp->mpt_snduna - mpt_dsn;
2759 sbdrop(&mp_so->so_snd, (int)len);
2760 wakeup = 1;
2761
2762 mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
2763 __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
2764 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2765 }
2766 }
2767
2768 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2769 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
2770 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
2771 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
2772 }
2773
2774 /*
2775 * Adjust the top level notion of next byte used for retransmissions
2776 * and sending FINs.
2777 */
2778 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
2779 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2780 }
2781
2782 /* Now determine the offset from which to start transmitting data */
2783 if (mpte->mpte_reinjectq) {
2784 sb_mb = mpte->mpte_reinjectq;
2785 } else {
2786 dont_reinject:
2787 sb_mb = mp_so->so_snd.sb_mb;
2788 }
2789 if (sb_mb == NULL) {
2790 mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
2791 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2792 goto out;
2793 }
2794
2795 if (sb_mb == mpte->mpte_reinjectq) {
2796 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2797 off = 0;
2798
2799 if (mptcp_search_seq_in_sub(sb_mb, so)) {
2800 if (mptcp_can_send_more(mp_tp, TRUE)) {
2801 goto dont_reinject;
2802 }
2803
2804 error = ECANCELED;
2805 goto out;
2806 }
2807
2808 reinjected = TRUE;
2809 } else if (flags & MPTCP_SUBOUT_PROBING) {
2810 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2811 off = 0;
2812 } else {
2813 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
2814
2815 /*
2816 * With TFO, there might be no data at all; we still go down this
2817 * code-path in that case.
2818 */
2819 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
2820 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
2821 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
2822 sb_cc -= off;
2823 } else {
2824 mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
2825 __func__, (uint32_t)mp_tp->mpt_sndnxt,
2826 (uint32_t)mp_tp->mpt_sndmax),
2827 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2828
2829 goto out;
2830 }
2831 }
2832
2833 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
2834 if (sb_cc <= 0) {
2835 mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
2836 __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
2837 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
2838 mptcp_subflow_cwnd_space(so)),
2839 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2840 }
2841
2842 sb_cc = min(sb_cc, UINT16_MAX);
2843
2844 /*
2845 * Create a DSN mapping for the data we are about to send. It all
2846 * has the same mapping.
2847 */
2848 if (reinjected) {
2849 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2850 } else {
2851 mpt_dsn = mp_tp->mpt_snduna + off;
2852 }
2853
2854 mpt_mbuf = sb_mb;
2855 while (mpt_mbuf && reinjected == FALSE &&
2856 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
2857 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
2858 off -= mpt_mbuf->m_pkthdr.mp_rlen;
2859 mpt_mbuf = mpt_mbuf->m_next;
2860 }
2861 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
2862 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
2863 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
2864 mpts->mpts_probecnt),
2865 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2866 }
2867
2868 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
2869
2870 head = tail = NULL;
2871
2872 while (tot_sent < sb_cc) {
2873 ssize_t mlen;
2874
2875 mlen = mpt_mbuf->m_len;
2876 mlen -= off;
2877 mlen = min(mlen, sb_cc - tot_sent);
2878
2879 if (mlen < 0) {
2880 mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
2881 __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
2882 (uint32_t)off, sb_cc, tot_sent),
2883 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2884 goto out;
2885 }
2886
2887 if (mlen == 0) {
2888 goto next;
2889 }
2890
2891 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
2892 M_COPYM_MUST_COPY_HDR);
2893 if (m == NULL) {
2894 mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
2895 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2896 error = ENOBUFS;
2897 break;
2898 }
2899
2900 /* Create a DSN mapping for the data (m_copym does it) */
2901 VERIFY(m->m_flags & M_PKTHDR);
2902 VERIFY(m->m_next == NULL);
2903
2904 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2905 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
2906 m->m_pkthdr.mp_dsn = mpt_dsn;
2907 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
2908 m->m_pkthdr.len = mlen;
2909
2910 if (head == NULL) {
2911 head = tail = m;
2912 } else {
2913 tail->m_next = m;
2914 tail = m;
2915 }
2916
2917 tot_sent += mlen;
2918 off = 0;
2919 next:
2920 mpt_mbuf = mpt_mbuf->m_next;
2921 }
2922
2923 if (reinjected) {
2924 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
2925 struct mbuf *n = sb_mb;
2926
2927 while (n) {
2928 n->m_pkthdr.mp_dsn += sb_cc;
2929 n->m_pkthdr.mp_rlen -= sb_cc;
2930 n = n->m_next;
2931 }
2932 m_adj(sb_mb, sb_cc);
2933 } else {
2934 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
2935 m_freem(sb_mb);
2936 }
2937 }
2938
2939 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
2940 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
2941 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2942
2943 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
2944 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
2945 tot_sent);
2946 }
2947
2948 /* Now, let's update rel-seq and the data-level length */
2949 mpts->mpts_rel_seq += tot_sent;
2950 m = head;
2951 while (m) {
2952 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
2953 m->m_pkthdr.mp_csum = dss_csum;
2954 }
2955 m->m_pkthdr.mp_rlen = tot_sent;
2956 m = m->m_next;
2957 }
2958
2959 if (head != NULL) {
2960 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
2961 (tp->t_tfo_stats == 0)) {
2962 tp->t_mpflags |= TMPF_TFO_REQUEST;
2963 }
2964
2965 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2966
2967 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
2968 struct sockbuf *, &so->so_rcv,
2969 struct sockbuf *, &so->so_snd,
2970 struct mptses *, mpte, struct mptsub *, mpts,
2971 size_t, tot_sent);
2972 }
2973
2974 done_sending:
2975 if (error == 0 ||
2976 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
2977 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
2978
2979 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2980 tcpstat.tcps_mp_num_probes++;
2981 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
2982 mpts->mpts_probecnt += 1;
2983 } else {
2984 mpts->mpts_probecnt +=
2985 tot_sent / mpts->mpts_maxseg;
2986 }
2987 }
2988
2989 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
2990 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
2991 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
2992 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2993 }
2994 mp_tp->mpt_sndnxt = new_sndnxt;
2995 }
2996
2997 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
2998
2999 /* Must be here as mptcp_can_send_more() checks for this */
3000 soclearfastopen(mp_so);
3001
3002 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3003 (mpts->mpts_probesoon != 0)) {
3004 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
3005 __func__, mpts->mpts_connid,
3006 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
3007 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
3008 (tcp_now - mpts->mpts_probesoon)),
3009 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3010 }
3011
3012 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3013 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
3014
3015 mpte->mpte_used_cell = 1;
3016 } else {
3017 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
3018
3019 mpte->mpte_used_wifi = 1;
3020 }
3021
3022 /*
3023 * Don't propagate EWOULDBLOCK - it's already taken care of
3024 * in mptcp_usr_send for TFO.
3025 */
3026 error = 0;
3027 } else {
3028 mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3029 __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
3030 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
3031 }
3032 out:
3033
3034 if (wakeup) {
3035 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3036 }
3037
3038 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3039 return error;
3040
3041 zero_len_write:
3042 /* Opting to call pru_send directly, as there is no mbuf at the subflow level */
3043 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3044 NULL, current_proc());
3045
3046 goto done_sending;
3047 }
3048
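/*
 * Worked example for the mapping created above (values made up): if
 * mpt_snduna is 1000 and off is 0 when 500 bytes are sent, every mbuf of
 * the burst carries mp_dsn 1000, the subflow-relative mp_rseq in effect
 * before the send, and mp_rlen 500; mpts_rel_seq then advances by 500 so
 * the next burst starts a fresh mapping.
 */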
3049 static void
3050 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3051 {
3052 struct mbuf *n, *prev = NULL;
3053
3054 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
3055 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3056 m->m_pkthdr.mp_rseq),
3057 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3058
3059 n = mpte->mpte_reinjectq;
3060
3061 /* First, look for an mbuf n whose data-sequence-number is greater
3062 * than or equal to m's sequence number.
3063 */
3064 while (n) {
3065 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3066 break;
3067 }
3068
3069 prev = n;
3070
3071 n = n->m_nextpkt;
3072 }
3073
3074 if (n) {
3075 /* m is already fully covered by the next mbuf in the queue */
3076 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3077 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3078 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
3079 __func__, n->m_pkthdr.mp_rlen),
3080 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3081 goto dont_queue;
3082 }
3083
3084 /* m is covering the next mbuf entirely, thus we remove this guy */
3085 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3086 struct mbuf *tmp = n->m_nextpkt;
3087
3088 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
3089 __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3090 n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
3091 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3092
3093 m->m_nextpkt = NULL;
3094 if (prev == NULL) {
3095 mpte->mpte_reinjectq = tmp;
3096 } else {
3097 prev->m_nextpkt = tmp;
3098 }
3099
3100 m_freem(n);
3101 n = tmp;
3102 }
3103 }
3104
3105 if (prev) {
3106 /* m is already fully covered by the previous mbuf in the queue */
3107 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3108 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
3109 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
3110 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3111 goto dont_queue;
3112 }
3113 }
3114
3115 if (prev == NULL) {
3116 mpte->mpte_reinjectq = m;
3117 } else {
3118 prev->m_nextpkt = m;
3119 }
3120
3121 m->m_nextpkt = n;
3122
3123 return;
3124
3125 dont_queue:
3126 m_freem(m);
3127 return;
3128 }
3129
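/*
 * Worked example (values made up): with a queue of {dsn 100, len 50} and
 * {dsn 200, len 50}, enqueueing m = {dsn 150, len 100} walks past the
 * first entry (prev), stops at the second (n), frees n because m covers
 * it entirely (150 + 100 >= 200 + 50), and links m after prev, leaving
 * {dsn 100, len 50} -> {dsn 150, len 100}.
 */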
3130 static struct mbuf *
3131 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3132 {
3133 struct socket *mp_so = mptetoso(mpte);
3134 struct mbuf *m;
3135
3136 m = mp_so->so_snd.sb_mb;
3137
3138 while (m) {
3139 /* If this segment covers what we are looking for, return it. */
3140 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3141 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3142 break;
3143 }
3144
3145
3146 /* Segment is no longer in the queue */
3147 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3148 return NULL;
3149 }
3150
3151 m = m->m_next;
3152 }
3153
3154 return m;
3155 }
3156
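/*
 * Example (values made up): with send-buffer segments {dsn 100, len 50}
 * and {dsn 150, len 50}, mptcp_lookup_dsn() above returns the second
 * segment for dsn 170, and NULL for dsn 210 because no segment covers it.
 */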
3157 static struct mbuf *
3158 mptcp_copy_mbuf_list(struct mbuf *m, int len)
3159 {
3160 struct mbuf *top = NULL, *tail = NULL;
3161 uint64_t dsn;
3162 uint32_t dlen, rseq;
3163
3164 dsn = m->m_pkthdr.mp_dsn;
3165 dlen = m->m_pkthdr.mp_rlen;
3166 rseq = m->m_pkthdr.mp_rseq;
3167
3168 while (len > 0) {
3169 struct mbuf *n;
3170
3171 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3172
3173 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3174 if (n == NULL) {
3175 mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
3176 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
3177 goto err;
3178 }
3179
3180 VERIFY(n->m_flags & M_PKTHDR);
3181 VERIFY(n->m_next == NULL);
3182 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3183 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3184 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3185 VERIFY(n->m_len == m->m_len);
3186
3187 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3188
3189 if (top == NULL) {
3190 top = n;
3191 }
3192
3193 if (tail != NULL) {
3194 tail->m_next = n;
3195 }
3196
3197 tail = n;
3198
3199 len -= m->m_len;
3200 m = m->m_next;
3201 }
3202
3203 return top;
3204
3205 err:
3206 if (top) {
3207 m_freem(top);
3208 }
3209
3210 return NULL;
3211 }
3212
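/*
 * Note on mptcp_copy_mbuf_list() above: M_COPYM_MUST_COPY_HDR is what
 * preserves the DSS metadata (mp_dsn, mp_rlen, mp_rseq) on each copy,
 * which the VERIFYs then re-assert before the copies are reinjected.
 */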
3213 static void
3214 mptcp_reinject_mbufs(struct socket *so)
3215 {
3216 struct tcpcb *tp = sototcpcb(so);
3217 struct mptsub *mpts = tp->t_mpsub;
3218 struct mptcb *mp_tp = tptomptp(tp);
3219 struct mptses *mpte = mp_tp->mpt_mpte;
3220 struct sockbuf *sb = &so->so_snd;
3221 struct mbuf *m;
3222
3223 m = sb->sb_mb;
3224 while (m) {
3225 struct mbuf *n = m->m_next, *orig = m;
3226
3227 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3228 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3229 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3230 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3231
3232 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3233
3234 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
3235 goto next;
3236 }
3237
3238 /* Has it all already been acknowledged at the data-level? */
3239 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
3240 goto next;
3241 }
3242
3243 /* Part of this has already been acknowledged - look up the
3244 * segment in the MPTCP socket.
3245 */
3246 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3247 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3248 if (m == NULL) {
3249 goto next;
3250 }
3251 }
3252
3253 /* Copy the mbuf with headers (aka, DSN-numbers) */
3254 m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
3255 if (m == NULL) {
3256 break;
3257 }
3258
3259 VERIFY(m->m_nextpkt == NULL);
3260
3261 /* Now, add to the reinject-queue, eliminating overlapping
3262 * segments
3263 */
3264 mptcp_add_reinjectq(mpte, m);
3265
3266 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3267
3268 next:
3269 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3270 while (n) {
3271 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3272
3273 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
3274 break;
3275 }
3276
3277 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3278 n = n->m_next;
3279 }
3280
3281 m = n;
3282 }
3283 }
3284
3285 void
3286 mptcp_clean_reinjectq(struct mptses *mpte)
3287 {
3288 struct mptcb *mp_tp = mpte->mpte_mptcb;
3289
3290 mpte_lock_assert_held(mpte);
3291
3292 while (mpte->mpte_reinjectq) {
3293 struct mbuf *m = mpte->mpte_reinjectq;
3294
3295 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3296 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3297 break;
3298 }
3299
3300 mpte->mpte_reinjectq = m->m_nextpkt;
3301 m->m_nextpkt = NULL;
3302 m_freem(m);
3303 }
3304 }
3305
3306 /*
3307 * Subflow socket control event upcall.
3308 */
3309 static void
3310 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
3311 {
3312 #pragma unused(so)
3313 struct mptsub *mpts = arg;
3314 struct mptses *mpte = mpts->mpts_mpte;
3315
3316 VERIFY(mpte != NULL);
3317 mpte_lock_assert_held(mpte);
3318
3319 if ((mpts->mpts_evctl & events) == events) {
3320 return;
3321 }
3322
3323 mpts->mpts_evctl |= events;
3324
3325 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3326 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3327 return;
3328 }
3329
3330 mptcp_subflow_workloop(mpte);
3331 }
3332
3333 /*
3334 * Subflow socket control events.
3335 *
3336 * Called for handling events related to the underlying subflow socket.
3337 */
3338 static ev_ret_t
3339 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3340 uint64_t *p_mpsofilt_hint)
3341 {
3342 ev_ret_t ret = MPTS_EVRET_OK;
3343 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3344 sizeof(mpsub_ev_entry_tbl[0]);
3345
3346 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3347
3348 /* bail if there's nothing to process */
3349 if (!mpts->mpts_evctl) {
3350 return ret;
3351 }
3352
3353 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
3354 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
3355 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
3356 SO_FILT_HINT_DISCONNECTED)) {
3357 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3358 }
3359
3360 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3361 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3362
3363 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3364 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3365 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3366
3367 /*
3368 * Process all the socket filter hints and reset the hint
3369 * once it is handled
3370 */
3371 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3372 /*
3373 * Always execute the DISCONNECTED event, because it will wakeup
3374 * the app.
3375 */
3376 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3377 (ret >= MPTS_EVRET_OK ||
3378 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3379 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3380 ev_ret_t error =
3381 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3382 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3383 }
3384 }
3385
3386 /*
3387 * We should be getting only events specified via sock_catchevents(),
3388 * so loudly complain if we have any unprocessed one(s).
3389 */
3390 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK) {
3391 mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
3392 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3393 mpts->mpts_connid,
3394 mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3395 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3396 } else {
3397 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3398 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3399 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3400 }
3401
3402 return ret;
3403 }
3404
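/*
 * Note on the dispatch loop above: handlers run in table order; once one
 * returns an error (ret < MPTS_EVRET_OK) the remaining handlers are
 * skipped, except that SO_FILT_HINT_DISCONNECTED always runs so the app
 * still gets woken up. Among successful handlers, the strongest verdict
 * (MAX) is the one returned.
 */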
3405 static ev_ret_t
3406 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3407 uint64_t *p_mpsofilt_hint, uint64_t event)
3408 {
3409 struct socket *mp_so, *so;
3410 struct mptcb *mp_tp;
3411
3412 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3413 VERIFY(mpte->mpte_mppcb != NULL);
3414 mp_so = mptetoso(mpte);
3415 mp_tp = mpte->mpte_mptcb;
3416 so = mpts->mpts_socket;
3417
3418 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3419 mpts->mpts_connid, event),
3420 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3421
3422 /*
3423 * We got an event for this subflow that might need to be propagated,
3424 * based on the state of the MPTCP connection.
3425 */
3426 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3427 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3428 mp_so->so_error = so->so_error;
3429 *p_mpsofilt_hint |= event;
3430 }
3431
3432 return MPTS_EVRET_OK;
3433 }
3434
3435 /*
3436 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3437 */
3438 static ev_ret_t
3439 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3440 uint64_t *p_mpsofilt_hint, uint64_t event)
3441 {
3442 #pragma unused(p_mpsofilt_hint, event)
3443 struct socket *mp_so;
3444 struct tcpcb *tp;
3445
3446 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3447
3448 VERIFY(mpte->mpte_mppcb != NULL);
3449 mp_so = mptetoso(mpte);
3450 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3451
3452 /*
3453 * This overwrites any previous mpte_lost_aid to avoid storing
3454 * too much state when the typical case has only two subflows.
3455 */
3456 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3457 mpte->mpte_lost_aid = tp->t_local_aid;
3458
3459 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3460 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3461
3462 /*
3463 * The subflow connection has lost its source address.
3464 */
3465 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3466
3467 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3468 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3469 }
3470
3471 return MPTS_EVRET_DELETE;
3472 }
3473
3474 /*
3475 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3476 * indicates that the remote side sent a Data FIN
3477 */
3478 static ev_ret_t
3479 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3480 uint64_t *p_mpsofilt_hint, uint64_t event)
3481 {
3482 #pragma unused(event)
3483 struct mptcb *mp_tp;
3484
3485 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3486 mp_tp = mpte->mpte_mptcb;
3487
3488 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3489 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3490
3491 /*
3492 * We got a Data FIN for the MPTCP connection.
3493 * The FIN may arrive with data. The data is handed up to the
3494 * mptcp socket and the user is notified so that it may close
3495 * the socket if needed.
3496 */
3497 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3498 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3499 }
3500
3501 return MPTS_EVRET_OK; /* keep the subflow socket around */
3502 }
3503
3504 /*
3505 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3506 */
3507 static ev_ret_t
3508 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3509 uint64_t *p_mpsofilt_hint, uint64_t event)
3510 {
3511 #pragma unused(event, p_mpsofilt_hint)
3512 struct mptsub *mpts_alt = NULL;
3513 struct socket *alt_so = NULL;
3514 struct socket *mp_so;
3515 int altpath_exists = 0;
3516
3517 mpte_lock_assert_held(mpte);
3518 mp_so = mptetoso(mpte);
3519 mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
3520 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
3521 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3522
3523 mptcp_reinject_mbufs(mpts->mpts_socket);
3524
3525 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
3526 /*
3527 * If there is no alternate eligible subflow, ignore the
3528 * failover hint.
3529 */
3530 if (mpts_alt == NULL) {
3531 mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
3532 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3533
3534 goto done;
3535 }
3536
3537 altpath_exists = 1;
3538 alt_so = mpts_alt->mpts_socket;
3539 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3540 /* All data acknowledged and no RTT spike */
3541 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3542 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3543 } else {
3544 /* no alternate path available */
3545 altpath_exists = 0;
3546 }
3547 }
3548
3549 if (altpath_exists) {
3550 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3551
3552 mpte->mpte_active_sub = mpts_alt;
3553 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3554 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3555
3556 mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
3557 __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
3558 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3559
3560 mptcpstats_inc_switch(mpte, mpts);
3561
3562 sowwakeup(alt_so);
3563 } else {
3564 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3565 mpts->mpts_connid),
3566 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3567 done:
3568 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3569 }
3570
3571 return MPTS_EVRET_OK;
3572 }
3573
3574 /*
3575 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3576 */
3577 static ev_ret_t
3578 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3579 uint64_t *p_mpsofilt_hint, uint64_t event)
3580 {
3581 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3582 VERIFY(mpte->mpte_mppcb != NULL);
3583
3584 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3585 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3586
3587 /*
3588 * The subflow connection cannot use the outgoing interface, let's
3589 * close this subflow.
3590 */
3591 mptcp_subflow_abort(mpts, EPERM);
3592
3593 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3594
3595 return MPTS_EVRET_DELETE;
3596 }
3597
3598 /*
3599 * https://tools.ietf.org/html/rfc6052#section-2
3600 * https://tools.ietf.org/html/rfc6147#section-5.2
3601 */
3602 static boolean_t
3603 mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3604 const struct ipv6_prefix *prefix,
3605 struct in_addr *addrv4)
3606 {
3607 char buf[MAX_IPv4_STR_LEN];
3608 char *ptrv4 = (char *)addrv4;
3609 const char *ptr = (const char *)addr;
3610
3611 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3612 return false;
3613 }
3614
3615 switch (prefix->prefix_len) {
3616 case NAT64_PREFIX_LEN_96:
3617 memcpy(ptrv4, ptr + 12, 4);
3618 break;
3619 case NAT64_PREFIX_LEN_64:
3620 memcpy(ptrv4, ptr + 9, 4);
3621 break;
3622 case NAT64_PREFIX_LEN_56:
3623 memcpy(ptrv4, ptr + 7, 1);
3624 memcpy(ptrv4 + 1, ptr + 9, 3);
3625 break;
3626 case NAT64_PREFIX_LEN_48:
3627 memcpy(ptrv4, ptr + 6, 2);
3628 memcpy(ptrv4 + 2, ptr + 9, 2);
3629 break;
3630 case NAT64_PREFIX_LEN_40:
3631 memcpy(ptrv4, ptr + 5, 3);
3632 memcpy(ptrv4 + 3, ptr + 9, 1);
3633 break;
3634 case NAT64_PREFIX_LEN_32:
3635 memcpy(ptrv4, ptr + 4, 4);
3636 break;
3637 default:
3638 panic("NAT64-prefix len is wrong: %u\n",
3639 prefix->prefix_len);
3640 }
3641
3642 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3643 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3644
3645 return true;
3646 }
3647
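/*
 * Worked example: with the RFC 6052 well-known prefix 64:ff9b::/96
 * (prefix_len == NAT64_PREFIX_LEN_96), the synthesized address
 * 64:ff9b::c633:6407 carries the IPv4 address in its last four bytes, so
 * the routine above recovers 198.51.100.7.
 */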
3648 static void
3649 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3650 {
3651 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3652 struct socket *so = mpts->mpts_socket;
3653 struct ifnet *ifp;
3654 int j;
3655
3656 ifp = sotoinpcb(so)->inp_last_outifp;
3657
3658 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3659 mptcp_ask_for_nat64(ifp);
3660 return;
3661 }
3662
3663
3664 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3665 int success;
3666
3667 if (nat64prefixes[j].prefix_len == 0) {
3668 continue;
3669 }
3670
3671 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3672 &nat64prefixes[j],
3673 &mpte->mpte_dst_v4_nat64.sin_addr);
3674 if (success) {
3675 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3676 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3677 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3678 break;
3679 }
3680 }
3681 }
3682
3683 /*
3684 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3685 */
3686 static ev_ret_t
3687 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
3688 uint64_t *p_mpsofilt_hint, uint64_t event)
3689 {
3690 #pragma unused(event, p_mpsofilt_hint)
3691 struct socket *mp_so, *so;
3692 struct inpcb *inp;
3693 struct tcpcb *tp;
3694 struct mptcb *mp_tp;
3695 int af;
3696 boolean_t mpok = FALSE;
3697
3698 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3699 VERIFY(mpte->mpte_mppcb != NULL);
3700
3701 mp_so = mptetoso(mpte);
3702 mp_tp = mpte->mpte_mptcb;
3703 so = mpts->mpts_socket;
3704 tp = sototcpcb(so);
3705 af = mpts->mpts_dst.sa_family;
3706
3707 if (mpts->mpts_flags & MPTSF_CONNECTED) {
3708 return MPTS_EVRET_OK;
3709 }
3710
3711 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
3712 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
3713 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
3714 (so->so_state & SS_ISCONNECTED)) {
3715 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
3716 __func__, mpts->mpts_connid),
3717 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3718 (void) soshutdownlock(so, SHUT_RD);
3719 (void) soshutdownlock(so, SHUT_WR);
3720 (void) sodisconnectlocked(so);
3721 }
3722 return MPTS_EVRET_OK;
3723 }
3724
3725 /*
3726 * The subflow connection has been connected. Find out whether it
3727 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
3728 *
3729 * a. If MPTCP connection is not yet established, then this must be
3730 * the first subflow connection. If MPTCP failed to negotiate,
3731 * fallback to regular TCP by degrading this subflow.
3732 *
3733 * b. If MPTCP connection has been established, then this must be
3734 * one of the subsequent subflow connections. If MPTCP failed
3735 * to negotiate, disconnect the connection.
3736 *
3737 * Right now, we simply unblock any waiters at the MPTCP socket layer
3738 * if the MPTCP connection has not been established.
3739 */
3740
3741 if (so->so_state & SS_ISDISCONNECTED) {
3742 /*
3743 * With MPTCP joins, a connection is connected at the subflow
3744 * level, but the 4th ACK from the server elevates the MPTCP
3745 * subflow to connected state. So there is a small window
3746 * where the subflow could get disconnected before the
3747 * connected event is processed.
3748 */
3749 return MPTS_EVRET_OK;
3750 }
3751
3752 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
3753 mptcp_drop_tfo_data(mpte, mpts);
3754 }
3755
3756 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
3757 mpts->mpts_flags |= MPTSF_CONNECTED;
3758
3759 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
3760 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3761 }
3762
3763 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
3764
3765 /* get/verify the outbound interface */
3766 inp = sotoinpcb(so);
3767
3768 mpts->mpts_maxseg = tp->t_maxseg;
3769
3770 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
3771 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
3772 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
3773 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
3774
3775 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
3776
3777 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
3778 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
3779 mpte->mpte_associd = mpts->mpts_connid;
3780 DTRACE_MPTCP2(state__change,
3781 struct mptcb *, mp_tp,
3782 uint32_t, 0 /* event */);
3783
3784 if (SOCK_DOM(so) == AF_INET) {
3785 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
3786 } else {
3787 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
3788 }
3789
3790 mpts->mpts_flags |= MPTSF_ACTIVE;
3791
3792 /* case (a) above */
3793 if (!mpok) {
3794 tcpstat.tcps_mpcap_fallback++;
3795
3796 tp->t_mpflags |= TMPF_INFIN_SENT;
3797 mptcp_notify_mpfail(so);
3798 } else {
3799 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3800 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3801 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3802 } else {
3803 mpts->mpts_flags |= MPTSF_PREFERRED;
3804 }
3805 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3806 mpte->mpte_nummpcapflows++;
3807
3808 if (SOCK_DOM(so) == AF_INET6) {
3809 mptcp_handle_ipv6_connection(mpte, mpts);
3810 }
3811
3812 mptcp_check_subflows_and_add(mpte);
3813
3814 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
3815 mpte->mpte_initial_cell = 1;
3816 }
3817
3818 mpte->mpte_handshake_success = 1;
3819 }
3820
3821 mp_tp->mpt_sndwnd = tp->snd_wnd;
3822 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
3823 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
3824 soisconnected(mp_so);
3825
3826 mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
3827 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
3828 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
3829 } else if (mpok) {
3830 /*
3831 * case (b) above
3832 * In the case of additional flows, the subflow is not marked
3833 * MPTSF_MP_CAPABLE until the server's ACK completing the
3834 * 3-way handshake is received. TCP will have guaranteed that
3835 * this is an MPTCP subflow.
3836 */
3837 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3838 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
3839 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3840 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3841 mpts->mpts_flags &= ~MPTSF_PREFERRED;
3842 } else {
3843 mpts->mpts_flags |= MPTSF_PREFERRED;
3844 }
3845
3846 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3847 mpte->mpte_nummpcapflows++;
3848
3849 mpts->mpts_rel_seq = 1;
3850
3851 mptcp_check_subflows_and_remove(mpte);
3852 } else {
3853 unsigned int i;
3854
3855 /* Should we try the alternate port? */
3856 if (mpte->mpte_alternate_port &&
3857 inp->inp_fport != mpte->mpte_alternate_port) {
3858 union sockaddr_in_4_6 dst;
3859 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3860
3861 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3862
3863 dst_in->sin_port = mpte->mpte_alternate_port;
3864
3865 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
3866 mpts->mpts_ifscope, NULL);
3867 } else { /* We tried all we could; mark this interface as non-MPTCP */
3868 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3869 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3870
3871 if (inp->inp_last_outifp->if_index == info->ifindex) {
3872 info->no_mptcp_support = 1;
3873 break;
3874 }
3875 }
3876 }
3877
3878 tcpstat.tcps_join_fallback++;
3879 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
3880 tcpstat.tcps_mptcp_cell_proxy++;
3881 } else {
3882 tcpstat.tcps_mptcp_wifi_proxy++;
3883 }
3884
3885 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3886
3887 return MPTS_EVRET_OK;
3888 }
3889
3890 /* This call merely reserves ("books") an entry in the stats-table for this ifindex */
3891 mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
3892
3893 mptcp_output(mpte);
3894
3895 return MPTS_EVRET_OK; /* keep the subflow socket around */
3896 }
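
/*
 * Illustrative sketch (not part of xnu): the decision structure of
 * mptcp_subflow_connected_ev() above, reduced to a pure function.
 * "established" is whether the MPTCP-level connection had already
 * reached MPTCPS_ESTABLISHED when this subflow connected; "mpok" is
 * whether MPTCP was successfully negotiated on the subflow.
 */
enum example_connected_action {
	EXAMPLE_FIRST_FLOW_OK,        /* case (a), negotiation succeeded */
	EXAMPLE_FIRST_FLOW_FALLBACK,  /* case (a), degrade to regular TCP */
	EXAMPLE_JOIN_OK,              /* case (b), join succeeded */
	EXAMPLE_JOIN_MUST_RST         /* join failed: retry/mark, then RST */
};

static enum example_connected_action
example_connected_action(int established, int mpok)
{
	if (!established)
		return mpok ? EXAMPLE_FIRST_FLOW_OK : EXAMPLE_FIRST_FLOW_FALLBACK;
	return mpok ? EXAMPLE_JOIN_OK : EXAMPLE_JOIN_MUST_RST;
}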
3897
3898 /*
3899 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
3900 */
3901 static ev_ret_t
3902 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
3903 uint64_t *p_mpsofilt_hint, uint64_t event)
3904 {
3905 #pragma unused(event, p_mpsofilt_hint)
3906 struct socket *mp_so, *so;
3907 struct mptcb *mp_tp;
3908
3909 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3910 VERIFY(mpte->mpte_mppcb != NULL);
3911 mp_so = mptetoso(mpte);
3912 mp_tp = mpte->mpte_mptcb;
3913 so = mpts->mpts_socket;
3914
3915 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
3916 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
3917 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
3918 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
3919 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3920
3921 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
3922 return MPTS_EVRET_DELETE;
3923 }
3924
3925 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3926
3927 /* The subflow connection has been disconnected. */
3928
3929 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3930 mpte->mpte_nummpcapflows--;
3931 if (mpte->mpte_active_sub == mpts) {
3932 mpte->mpte_active_sub = NULL;
3933 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
3934 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3935 }
3936 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3937 }
3938
3939 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3940 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3941 mptcp_drop(mpte, mp_tp, so->so_error);
3942 }
3943
3944 if (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV) {
3945 mptcp_drop(mpte, mp_tp, mp_so->so_error);
3946 }
3947
3948 /*
3949 * Clear flags that are used by getconninfo to return state.
3950 * Retain flags like MPTSF_DELETEOK for internal purposes.
3951 */
3952 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
3953 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
3954 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
3955
3956 return MPTS_EVRET_DELETE;
3957 }
3958
3959 /*
3960 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3961 */
3962 static ev_ret_t
3963 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3964 uint64_t *p_mpsofilt_hint, uint64_t event)
3965 {
3966 #pragma unused(event, p_mpsofilt_hint)
3967 struct socket *mp_so, *so;
3968 struct mptcb *mp_tp;
3969 ev_ret_t ret = MPTS_EVRET_OK;
3970
3971 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3972 VERIFY(mpte->mpte_mppcb != NULL);
3973 mp_so = mptetoso(mpte);
3974 mp_tp = mpte->mpte_mptcb;
3975 so = mpts->mpts_socket;
3976
3977 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
3978 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3979 } else {
3980 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3981 }
3982
3983 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3984 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3985 goto done;
3986 }
3987 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3988 } else {
3989 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3990 }
3991
3992 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
3993 mpts->mpts_flags |= MPTSF_MP_READY;
3994 } else {
3995 mpts->mpts_flags &= ~MPTSF_MP_READY;
3996 }
3997
3998 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3999 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4000 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4001 }
4002
4003 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4004 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
4005 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4006
4007 m_freem_list(mpte->mpte_reinjectq);
4008 mpte->mpte_reinjectq = NULL;
4009 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4010 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4011 ret = MPTS_EVRET_CONNECT_PENDING;
4012 }
4013
4014 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
4015 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4016 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
4017 mpts->mpts_flags, MPTSF_BITS),
4018 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4019
4020 done:
4021 return ret;
4022 }
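
/*
 * Illustrative sketch (not part of xnu): the event result chosen by
 * mptcp_subflow_mpstatus_ev() above, as a pure function of the two
 * MPTCP-level conditions it inspects.
 */
static ev_ret_t
example_mpstatus_result(int fell_back_to_tcp, int mp_ready)
{
	if (fell_back_to_tcp)
		return MPTS_EVRET_DISCONNECT_FALLBACK;
	if (mp_ready)
		return MPTS_EVRET_CONNECT_PENDING;
	return MPTS_EVRET_OK;
}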
4023
4024 /*
4025 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4026 */
4027 static ev_ret_t
4028 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4029 uint64_t *p_mpsofilt_hint, uint64_t event)
4030 {
4031 #pragma unused(event)
4032 struct socket *mp_so, *so;
4033 struct mptcb *mp_tp;
4034 boolean_t is_fastclose;
4035
4036 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4037 VERIFY(mpte->mpte_mppcb != NULL);
4038 mp_so = mptetoso(mpte);
4039 mp_tp = mpte->mpte_mptcb;
4040 so = mpts->mpts_socket;
4041
4042 /* We got an invalid option or a fast close */
4043 struct tcptemp *t_template;
4044 struct inpcb *inp = sotoinpcb(so);
4045 struct tcpcb *tp = NULL;
4046
4047 tp = intotcpcb(inp);
4048 so->so_error = ECONNABORTED;
4049
4050 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4051
4052 t_template = tcp_maketemplate(tp);
4053 if (t_template) {
4054 struct tcp_respond_args tra;
4055
4056 bzero(&tra, sizeof(tra));
4057 if (inp->inp_flags & INP_BOUND_IF) {
4058 tra.ifscope = inp->inp_boundifp->if_index;
4059 } else {
4060 tra.ifscope = IFSCOPE_NONE;
4061 }
4062 tra.awdl_unrestricted = 1;
4063
4064 tcp_respond(tp, t_template->tt_ipgen,
4065 &t_template->tt_t, (struct mbuf *)NULL,
4066 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4067 (void) m_free(dtom(t_template));
4068 mptcplog((LOG_DEBUG, "MPTCP Events: "
4069 "%s: mp_so 0x%llx cid %d \n",
4070 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4071 mpts->mpts_connid),
4072 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4073 }
4074 mptcp_subflow_abort(mpts, ECONNABORTED);
4075
4076 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4077 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4078
4079 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4080 mp_so->so_error = ECONNABORTED;
4081 } else {
4082 mp_so->so_error = ECONNRESET;
4083 }
4084
4085 /*
4086 * mptcp_drop is being called after processing the events, to fully
4087 * close the MPTCP connection
4088 */
4089 }
4090
4091 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4092 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4093 }
4094
4095 return MPTS_EVRET_DELETE;
4096 }
4097
4098 static ev_ret_t
4099 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4100 uint64_t *p_mpsofilt_hint, uint64_t event)
4101 {
4102 #pragma unused(event)
4103 bool found_active = false;
4104
4105 mpts->mpts_flags |= MPTSF_READ_STALL;
4106
4107 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4108 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4109
4110 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4111 TCPS_HAVERCVDFIN2(tp->t_state)) {
4112 continue;
4113 }
4114
4115 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4116 found_active = true;
4117 break;
4118 }
4119 }
4120
4121 if (!found_active) {
4122 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4123 }
4124
4125 return MPTS_EVRET_OK;
4126 }
4127
4128 static ev_ret_t
4129 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4130 uint64_t *p_mpsofilt_hint, uint64_t event)
4131 {
4132 #pragma unused(event)
4133 bool found_active = false;
4134
4135 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4136
4137 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4138 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4139
4140 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4141 tp->t_state > TCPS_CLOSE_WAIT) {
4142 continue;
4143 }
4144
4145 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4146 found_active = true;
4147 break;
4148 }
4149 }
4150
4151 if (!found_active) {
4152 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4153 }
4154
4155 return MPTS_EVRET_OK;
4156 }
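
/*
 * Illustrative sketch (not part of xnu): both adaptive-timeout handlers
 * above follow the same pattern -- mark this subflow as stalled, then
 * raise the hint to the MPTCP socket only when no usable (established,
 * still-open) subflow remains unstalled.
 */
struct example_flow {
	bool usable;   /* established and not yet closing */
	bool stalled;  /* analog of MPTSF_READ_STALL / MPTSF_WRITE_STALL */
};

static bool
example_all_usable_flows_stalled(const struct example_flow *flows, int n)
{
	for (int i = 0; i < n; i++) {
		if (flows[i].usable && !flows[i].stalled)
			return false;  /* an active flow remains; no hint */
	}
	return true;                   /* the stall hint should be raised */
}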
4157
4158 static const char *
4159 mptcp_evret2str(ev_ret_t ret)
4160 {
4161 const char *c = "UNKNOWN";
4162
4163 switch (ret) {
4164 case MPTS_EVRET_DELETE:
4165 c = "MPTS_EVRET_DELETE";
4166 break;
4167 case MPTS_EVRET_CONNECT_PENDING:
4168 c = "MPTS_EVRET_CONNECT_PENDING";
4169 break;
4170 case MPTS_EVRET_DISCONNECT_FALLBACK:
4171 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
4172 break;
4173 case MPTS_EVRET_OK:
4174 c = "MPTS_EVRET_OK";
4175 break;
4176 default:
4177 break;
4178 }
4179 return c;
4180 }
4181
4182 /*
4183 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4184 * caller must ensure that the option can be issued on subflow sockets, via
4185 * MPOF_SUBFLOW_OK flag.
4186 */
4187 int
4188 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4189 {
4190 struct socket *mp_so, *so;
4191 struct sockopt sopt;
4192 int error;
4193
4194 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4195 mpte_lock_assert_held(mpte);
4196
4197 mp_so = mptetoso(mpte);
4198 so = mpts->mpts_socket;
4199
4200 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4201 mpo->mpo_level == SOL_SOCKET &&
4202 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4203 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4204
4205 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
4206 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte),
4207 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
4208 mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
4209 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4210
4211 /*
4212 * When we open a new subflow, mark it as a cell fallback if
4213 * this subflow goes over cell.
4214 *
4215 * (except for first-party apps)
4216 */
4217
4218 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4219 return 0;
4220 }
4221
4222 if (sotoinpcb(so)->inp_last_outifp &&
4223 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4224 return 0;
4225 }
4226
4227 /*
4228 * This check is an OR, because if the app is not binding to the
4229 * interface, then it definitely is not a cell-fallback
4230 * connection.
4231 */
4232 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4233 !IFNET_IS_CELLULAR(ifp)) {
4234 return 0;
4235 }
4236 }
4237
4238 mpo->mpo_flags &= ~MPOF_INTERIM;
4239
4240 bzero(&sopt, sizeof(sopt));
4241 sopt.sopt_dir = SOPT_SET;
4242 sopt.sopt_level = mpo->mpo_level;
4243 sopt.sopt_name = mpo->mpo_name;
4244 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4245 sopt.sopt_valsize = sizeof(int);
4246 sopt.sopt_p = kernproc;
4247
4248 error = sosetoptlock(so, &sopt, 0);
4249 if (error == 0) {
4250 mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
4251 "val %d set successful\n", __func__,
4252 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4253 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4254 mpo->mpo_intval),
4255 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4256 } else {
4257 mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
4258 "val %d set error %d\n", __func__,
4259 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4260 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4261 mpo->mpo_intval, error),
4262 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
4263 }
4264 return error;
4265 }
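
/*
 * Illustrative usage sketch (hypothetical, not part of xnu): pushing an
 * int-valued option recorded against the MPTCP socket down to one
 * subflow. Assumes the MP socket lock is held; the option must carry
 * MPOF_SUBFLOW_OK or the VERIFY above will fire.
 */
static void
example_propagate_option(struct mptses *mpte, struct mptsub *mpts)
{
	struct mptopt mpo;

	bzero(&mpo, sizeof(mpo));
	mpo.mpo_flags = MPOF_SUBFLOW_OK;
	mpo.mpo_level = SOL_SOCKET;
	mpo.mpo_name = SO_KEEPALIVE;   /* any int-valued option works */
	mpo.mpo_intval = 1;

	(void) mptcp_subflow_sosetopt(mpte, mpts, &mpo);
}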
4266
4267 /*
4268 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4269 * caller must ensure that the option can be issued on subflow sockets, via
4270 * MPOF_SUBFLOW_OK flag.
4271 */
4272 int
4273 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4274 struct mptopt *mpo)
4275 {
4276 struct socket *mp_so;
4277 struct sockopt sopt;
4278 int error;
4279
4280 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4281 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4282 mp_so = mptetoso(mpte);
4283
4284 bzero(&sopt, sizeof(sopt));
4285 sopt.sopt_dir = SOPT_GET;
4286 sopt.sopt_level = mpo->mpo_level;
4287 sopt.sopt_name = mpo->mpo_name;
4288 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4289 sopt.sopt_valsize = sizeof(int);
4290 sopt.sopt_p = kernproc;
4291
4292 error = sogetoptlock(so, &sopt, 0); /* already locked */
4293 if (error == 0) {
4294 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4295 "%s: mp_so 0x%llx sopt %s "
4296 "val %d get successful\n", __func__,
4297 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4298 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4299 mpo->mpo_intval),
4300 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4301 } else {
4302 mptcplog((LOG_ERR, "MPTCP Socket: "
4303 "%s: mp_so 0x%llx sopt %s get error %d\n",
4304 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4305 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
4306 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
4307 }
4308 return error;
4309 }
4310
4311
4312 /*
4313 * MPTCP garbage collector.
4314 *
4315 * This routine is called by the MP domain's on-demand, periodic callout,
4316 * which is triggered when an MPTCP socket is closed. The callout will
4317 * repeat as long as this routine returns a non-zero value.
4318 */
4319 static uint32_t
4320 mptcp_gc(struct mppcbinfo *mppi)
4321 {
4322 struct mppcb *mpp, *tmpp;
4323 uint32_t active = 0;
4324
4325 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4326
4327 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4328 struct socket *mp_so;
4329 struct mptses *mpte;
4330 struct mptcb *mp_tp;
4331
4332 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4333 mp_so = mpp->mpp_socket;
4334 VERIFY(mp_so != NULL);
4335 mpte = mptompte(mpp);
4336 VERIFY(mpte != NULL);
4337 mp_tp = mpte->mpte_mptcb;
4338 VERIFY(mp_tp != NULL);
4339
4340 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4341 "%s: mp_so 0x%llx found "
4342 "(u=%d,r=%d,s=%d)\n", __func__,
4343 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
4344 mp_so->so_retaincnt, mpp->mpp_state),
4345 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4346
4347 if (!mpte_try_lock(mpte)) {
4348 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4349 "%s: mp_so 0x%llx skipped lock "
4350 "(u=%d,r=%d)\n", __func__,
4351 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4352 mp_so->so_usecount, mp_so->so_retaincnt),
4353 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4354 active++;
4355 continue;
4356 }
4357
4358 /* check again under the lock */
4359 if (mp_so->so_usecount > 0) {
4360 boolean_t wakeup = FALSE;
4361 struct mptsub *mpts, *tmpts;
4362
4363 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4364 "%s: mp_so 0x%llx skipped usecount "
4365 "[u=%d,r=%d] %d %d\n", __func__,
4366 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4367 mp_so->so_usecount, mp_so->so_retaincnt,
4368 mp_tp->mpt_gc_ticks,
4369 mp_tp->mpt_state),
4370 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4371
4372 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4373 if (mp_tp->mpt_gc_ticks > 0) {
4374 mp_tp->mpt_gc_ticks--;
4375 }
4376 if (mp_tp->mpt_gc_ticks == 0) {
4377 wakeup = TRUE;
4378 }
4379 }
4380 if (wakeup) {
4381 TAILQ_FOREACH_SAFE(mpts,
4382 &mpte->mpte_subflows, mpts_entry, tmpts) {
4383 mptcp_subflow_eupcall1(mpts->mpts_socket,
4384 mpts, SO_FILT_HINT_DISCONNECTED);
4385 }
4386 }
4387 mpte_unlock(mpte);
4388 active++;
4389 continue;
4390 }
4391
4392 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4393 panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
4394 "[u=%d,r=%d,s=%d]\n", __func__,
4395 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4396 mp_so->so_usecount, mp_so->so_retaincnt,
4397 mpp->mpp_state);
4398 }
4399
4400 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4401 mptcp_close(mpte, mp_tp);
4402 }
4403
4404 mptcp_session_destroy(mpte);
4405
4406 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4407 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
4408 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4409 mp_so->so_usecount, mp_so->so_retaincnt),
4410 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4411
4412 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4413 struct sockbuf *, &mp_so->so_rcv,
4414 struct sockbuf *, &mp_so->so_snd,
4415 struct mppcb *, mpp);
4416
4417 mp_pcbdispose(mpp);
4418 sodealloc(mp_so);
4419 }
4420
4421 return active;
4422 }
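
/*
 * Illustrative sketch (hypothetical, not part of xnu): the contract
 * between mptcp_gc() and the MP domain callout that drives it. The
 * callout re-arms itself for as long as a pass reports PCBs that it
 * could not yet dispose of.
 */
static int
example_gc_pass(struct mppcbinfo *mppi)
{
	uint32_t remaining;

	lck_mtx_lock(&mppi->mppi_lock);    /* mptcp_gc() asserts this lock */
	remaining = mptcp_gc(mppi);
	lck_mtx_unlock(&mppi->mppi_lock);

	return remaining > 0;              /* non-zero: keep the callout armed */
}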
4423
4424 /*
4425 * Drop an MPTCP connection, reporting the specified error.
4426 */
4427 struct mptses *
4428 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4429 {
4430 struct socket *mp_so;
4431
4432 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4433 VERIFY(mpte->mpte_mptcb == mp_tp);
4434 mp_so = mptetoso(mpte);
4435
4436 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4437 uint32_t, 0 /* event */);
4438
4439 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4440 errno = mp_tp->mpt_softerror;
4441 }
4442 mp_so->so_error = errno;
4443
4444 return mptcp_close(mpte, mp_tp);
4445 }
4446
4447 /*
4448 * Close an MPTCP control block.
4449 */
4450 struct mptses *
4451 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4452 {
4453 struct socket *mp_so = NULL;
4454 struct mptsub *mpts = NULL, *tmpts = NULL;
4455
4456 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4457 VERIFY(mpte->mpte_mptcb == mp_tp);
4458 mp_so = mptetoso(mpte);
4459
4460 mp_tp->mpt_state = MPTCPS_TERMINATE;
4461
4462 mptcp_freeq(mp_tp);
4463
4464 soisdisconnected(mp_so);
4465
4466 /* Clean up all subflows */
4467 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4468 mptcp_subflow_disconnect(mpte, mpts);
4469 }
4470
4471 return NULL;
4472 }
4473
4474 void
4475 mptcp_notify_close(struct socket *so)
4476 {
4477 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4478 }
4479
4480 /*
4481 * MPTCP workloop.
4482 */
4483 void
4484 mptcp_subflow_workloop(struct mptses *mpte)
4485 {
4486 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4487 uint64_t mpsofilt_hint_mask;
4488 struct mptsub *mpts, *tmpts;
4489 struct socket *mp_so;
4490
4491 mpte_lock_assert_held(mpte);
4492
4493 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4494 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4495 return;
4496 }
4497 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4498
4499 mp_so = mptetoso(mpte);
4500
4501 relaunch:
4502 mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4503 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4504
4505 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4506 ev_ret_t ret;
4507
4508 if (mpts->mpts_socket->so_usecount == 0) {
4509 /* Will be removed soon by tcp_garbage_collect */
4510 continue;
4511 }
4512
4513 mptcp_subflow_addref(mpts);
4514 mpts->mpts_socket->so_usecount++;
4515
4516 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4517
4518 /*
4519 * If MPTCP socket is closed, disconnect all subflows.
4520 * This will generate a disconnect event which will
4521 * be handled during the next iteration, causing a
4522 * non-zero error to be returned above.
4523 */
4524 if (mp_so->so_flags & SOF_PCBCLEARING) {
4525 mptcp_subflow_disconnect(mpte, mpts);
4526 }
4527
4528 switch (ret) {
4529 case MPTS_EVRET_OK:
4530 /* nothing to do */
4531 break;
4532 case MPTS_EVRET_DELETE:
4533 mptcp_subflow_soclose(mpts);
4534 break;
4535 case MPTS_EVRET_CONNECT_PENDING:
4536 connect_pending = TRUE;
4537 break;
4538 case MPTS_EVRET_DISCONNECT_FALLBACK:
4539 disconnect_fallback = TRUE;
4540 break;
4541 default:
4542 mptcplog((LOG_DEBUG,
4543 "MPTCP Socket: %s: mptcp_subflow_events "
4544 "returned invalid value: %d\n", __func__,
4545 ret),
4546 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4547 break;
4548 }
4549 mptcp_subflow_remref(mpts); /* ours */
4550
4551 VERIFY(mpts->mpts_socket->so_usecount != 0);
4552 mpts->mpts_socket->so_usecount--;
4553 }
4554
4555 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4556 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4557
4558 soevent(mp_so, mpsofilt_hint_mask);
4559 }
4560
4561 if (!connect_pending && !disconnect_fallback) {
4562 goto exit;
4563 }
4564
4565 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4566 if (disconnect_fallback) {
4567 struct socket *so = NULL;
4568 struct inpcb *inp = NULL;
4569 struct tcpcb *tp = NULL;
4570
4571 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4572 continue;
4573 }
4574
4575 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4576
4577 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4578 MPTSF_DISCONNECTED | MPTSF_CONNECT_PENDING)) {
4579 continue;
4580 }
4581
4582 so = mpts->mpts_socket;
4583
4584 /*
4585 * The MPTCP connection has degraded to a fallback
4586 * mode, so there is no point in keeping this subflow
4587 * regardless of its MPTCP-readiness state, unless it
4588 * is the primary one which we use for fallback. This
4589 * assumes that the subflow used for fallback is the
4590 * ACTIVE one.
4591 */
4592
4593 inp = sotoinpcb(so);
4594 tp = intotcpcb(inp);
4595 tp->t_mpflags &=
4596 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4597 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4598
4599 if (mpts->mpts_flags & MPTSF_ACTIVE) {
4600 continue;
4601 }
4602 tp->t_mpflags |= TMPF_RESET;
4603 soevent(so, SO_FILT_HINT_MUSTRST);
4604 } else if (connect_pending) {
4605 /*
4606 * The MPTCP connection has progressed to a state
4607 * where it supports full multipath semantics; allow
4608 * additional joins to be attempted for all subflows
4609 * that are in the PENDING state.
4610 */
4611 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4612 int error = mptcp_subflow_soconnectx(mpte, mpts);
4613
4614 if (error) {
4615 mptcp_subflow_abort(mpts, error);
4616 }
4617 }
4618 }
4619 }
4620
4621 exit:
4622 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4623 goto relaunch;
4624 }
4625
4626 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4627 }
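
/*
 * Illustrative sketch (not part of xnu): the re-entrancy guard used by
 * mptcp_subflow_workloop() above, reduced to its essentials. A nested
 * invocation only records a relaunch request; the outermost invocation
 * keeps iterating until no relaunch was requested while it ran.
 */
static struct {
	bool in_loop;
	bool relaunch;
} example_wl;

static void
example_workloop(void)
{
	if (example_wl.in_loop) {
		/* nested call: defer the work to the outer invocation */
		example_wl.relaunch = true;
		return;
	}
	example_wl.in_loop = true;
	do {
		example_wl.relaunch = false;
		/* ... process subflow events; may re-enter example_workloop() ... */
	} while (example_wl.relaunch);
	example_wl.in_loop = false;
}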
4628
4629 /*
4630 * Protocol pr_lock callback.
4631 */
4632 int
4633 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4634 {
4635 struct mppcb *mpp = mpsotomppcb(mp_so);
4636 void *lr_saved;
4637
4638 if (lr == NULL) {
4639 lr_saved = __builtin_return_address(0);
4640 } else {
4641 lr_saved = lr;
4642 }
4643
4644 if (mpp == NULL) {
4645 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4646 mp_so, lr_saved, solockhistory_nr(mp_so));
4647 /* NOTREACHED */
4648 }
4649 mpp_lock(mpp);
4650
4651 if (mp_so->so_usecount < 0) {
4652 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4653 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4654 solockhistory_nr(mp_so));
4655 /* NOTREACHED */
4656 }
4657 if (refcount != 0) {
4658 mp_so->so_usecount++;
4659 }
4660 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4661 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4662
4663 return 0;
4664 }
4665
4666 /*
4667 * Protocol pr_unlock callback.
4668 */
4669 int
4670 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4671 {
4672 struct mppcb *mpp = mpsotomppcb(mp_so);
4673 void *lr_saved;
4674
4675 if (lr == NULL) {
4676 lr_saved = __builtin_return_address(0);
4677 } else {
4678 lr_saved = lr;
4679 }
4680
4681 if (mpp == NULL) {
4682 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4683 mp_so, mp_so->so_usecount, lr_saved,
4684 solockhistory_nr(mp_so));
4685 /* NOTREACHED */
4686 }
4687 mpp_lock_assert_held(mpp);
4688
4689 if (refcount != 0) {
4690 mp_so->so_usecount--;
4691 }
4692
4693 if (mp_so->so_usecount < 0) {
4694 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4695 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4696 /* NOTREACHED */
4697 }
4698 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4699 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4700 mpp_unlock(mpp);
4701
4702 return 0;
4703 }
4704
4705 /*
4706 * Protocol pr_getlock callback.
4707 */
4708 lck_mtx_t *
4709 mptcp_getlock(struct socket *mp_so, int flags)
4710 {
4711 struct mppcb *mpp = mpsotomppcb(mp_so);
4712
4713 if (mpp == NULL) {
4714 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4715 solockhistory_nr(mp_so));
4716 /* NOTREACHED */
4717 }
4718 if (mp_so->so_usecount < 0) {
4719 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4720 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4721 /* NOTREACHED */
4722 }
4723 return mpp_getlock(mpp, flags);
4724 }
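
/*
 * Illustrative sketch (not part of xnu): the lock-history bookkeeping
 * performed by mptcp_lock()/mptcp_unlock() above. Caller return
 * addresses go into a small ring buffer so the last SO_LCKDBG_MAX
 * lock/unlock sites remain available to the panic messages.
 */
#define EXAMPLE_LCKDBG_MAX 4    /* stands in for SO_LCKDBG_MAX */

struct example_lock_history {
	void *lr[EXAMPLE_LCKDBG_MAX];
	unsigned int next;
};

static void
example_record_caller(struct example_lock_history *h, void *lr)
{
	h->lr[h->next] = lr;                           /* overwrite oldest */
	h->next = (h->next + 1) % EXAMPLE_LCKDBG_MAX;  /* advance the ring */
}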
4725
4726 /*
4727 * MPTCP Join support
4728 */
4729
4730 static void
4731 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
4732 uint8_t addr_id)
4733 {
4734 struct tcpcb *tp = sototcpcb(so);
4735 struct mptcp_subf_auth_entry *sauth_entry;
4736 mpte_lock_assert_held(mp_tp->mpt_mpte);
4737
4738 /*
4739 * The address ID of the first flow is implicitly 0.
4740 */
4741 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4742 tp->t_local_aid = 0;
4743 } else {
4744 tp->t_local_aid = addr_id;
4745 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4746 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4747 }
4748 sauth_entry = zalloc(mpt_subauth_zone);
4749 sauth_entry->msae_laddr_id = tp->t_local_aid;
4750 sauth_entry->msae_raddr_id = 0;
4751 sauth_entry->msae_raddr_rand = 0;
4752 try_again:
4753 sauth_entry->msae_laddr_rand = RandomULong();
4754 if (sauth_entry->msae_laddr_rand == 0) {
4755 goto try_again;
4756 }
4757 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4758 }
4759
4760 static void
4761 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4762 {
4763 struct mptcp_subf_auth_entry *sauth_entry;
4764 struct tcpcb *tp = NULL;
4765 int found = 0;
4766
4767 tp = sototcpcb(so);
4768 if (tp == NULL) {
4769 return;
4770 }
4771
4772 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4773 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4774 found = 1;
4775 break;
4776 }
4777 }
4778 if (found) {
4779 LIST_REMOVE(sauth_entry, msae_next);
4780 zfree(mpt_subauth_zone, sauth_entry);
4781 }
4785 }
4786
4787 void
4788 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4789 u_int32_t *rrand)
4790 {
4791 struct mptcp_subf_auth_entry *sauth_entry;
4792 mpte_lock_assert_held(mp_tp->mpt_mpte);
4793
4794 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4795 if (sauth_entry->msae_laddr_id == addr_id) {
4796 if (lrand) {
4797 *lrand = sauth_entry->msae_laddr_rand;
4798 }
4799 if (rrand) {
4800 *rrand = sauth_entry->msae_raddr_rand;
4801 }
4802 break;
4803 }
4804 }
4805 }
4806
4807 void
4808 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4809 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4810 {
4811 struct mptcp_subf_auth_entry *sauth_entry;
4812 mpte_lock_assert_held(mp_tp->mpt_mpte);
4813
4814 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4815 if (sauth_entry->msae_laddr_id == laddr_id) {
4816 if ((sauth_entry->msae_raddr_id != 0) &&
4817 (sauth_entry->msae_raddr_id != raddr_id)) {
4818 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
4819 " address ids %d %d \n", __func__, raddr_id,
4820 sauth_entry->msae_raddr_id),
4821 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4822 return;
4823 }
4824 sauth_entry->msae_raddr_id = raddr_id;
4825 if ((sauth_entry->msae_raddr_rand != 0) &&
4826 (sauth_entry->msae_raddr_rand != raddr_rand)) {
4827 mptcplog((LOG_ERR, "MPTCP Socket: "
4828 "%s: dup SYN_ACK %d %d \n",
4829 __func__, raddr_rand,
4830 sauth_entry->msae_raddr_rand),
4831 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4832 return;
4833 }
4834 sauth_entry->msae_raddr_rand = raddr_rand;
4835 return;
4836 }
4837 }
4838 }
4839
4840 /*
4841 * SHA1 support for MPTCP
4842 */
4843 static void
4844 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
4845 {
4846 SHA1_CTX sha1ctxt;
4847 const unsigned char *sha1_base;
4848 int sha1_size;
4849
4850 sha1_base = (const unsigned char *) key;
4851 sha1_size = sizeof(mptcp_key_t);
4852 SHA1Init(&sha1ctxt);
4853 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4854 SHA1Final(sha_digest, &sha1ctxt);
4855 }
4856
4857 void
4858 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4859 u_int32_t rand1, u_int32_t rand2, u_char *digest)
4860 {
4861 SHA1_CTX sha1ctxt;
4862 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4863 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4864 u_int32_t data[2];
4865 int i;
4866
4867 bzero(digest, SHA1_RESULTLEN);
4868
4869 /* Set up the Key for HMAC */
4870 key_ipad[0] = key1;
4871 key_ipad[1] = key2;
4872
4873 key_opad[0] = key1;
4874 key_opad[1] = key2;
4875
4876 /* Set up the message for HMAC */
4877 data[0] = rand1;
4878 data[1] = rand2;
4879
4880 /* Key fits within SHA1's 512-bit block, so no need to hash it first */
4881
4882 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4883
4884 for (i = 0; i < 8; i++) {
4885 key_ipad[i] ^= 0x3636363636363636;
4886 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4887 }
4888
4889 /* Perform inner SHA1 */
4890 SHA1Init(&sha1ctxt);
4891 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
4892 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
4893 SHA1Final(digest, &sha1ctxt);
4894
4895 /* Perform outer SHA1 */
4896 SHA1Init(&sha1ctxt);
4897 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
4898 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4899 SHA1Final(digest, &sha1ctxt);
4900 }
4901
4902 /*
4903 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4904 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4905 */
4906 void
4907 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
4908 {
4909 uint32_t lrand, rrand;
4910
4911 mpte_lock_assert_held(mp_tp->mpt_mpte);
4912
4913 lrand = rrand = 0;
4914 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4915 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
4916 digest);
4917 }
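
/*
 * Illustrative sketch (hypothetical, not part of xnu): the asymmetry
 * between MAC-A and MAC-B noted above lies solely in the ordering of
 * the keys and randoms handed to mptcp_hmac_sha1(); each side verifies
 * the peer's MAC by swapping the order it used for its own. Both
 * output buffers must be SHA1_RESULTLEN bytes.
 */
static void
example_both_macs(mptcp_key_t key_a, mptcp_key_t key_b,
    u_int32_t r_a, u_int32_t r_b, u_char *mac_a, u_char *mac_b)
{
	/* MAC-A = HMAC(Key=(Key-A+Key-B), Msg=(R-A+R-B)) */
	mptcp_hmac_sha1(key_a, key_b, r_a, r_b, mac_a);
	/* MAC-B = HMAC(Key=(Key-B+Key-A), Msg=(R-B+R-A)) */
	mptcp_hmac_sha1(key_b, key_a, r_b, r_a, mac_b);
}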
4918
4919 /*
4920 * Authentication data generation
4921 */
4922 static void
4923 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4924 int token_len)
4925 {
4926 VERIFY(token_len == sizeof(u_int32_t));
4927 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4928
4929 /* Most significant 32 bits of the SHA1 hash */
4930 bcopy(sha_digest, token, sizeof(u_int32_t));
4931 return;
4932 }
4933
4934 static void
4935 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4936 int idsn_len)
4937 {
4938 VERIFY(idsn_len == sizeof(u_int64_t));
4939 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4940
4941 /*
4942 * Least significant 64 bits of the SHA1 hash
4943 */
4944
4945 idsn[7] = sha_digest[12];
4946 idsn[6] = sha_digest[13];
4947 idsn[5] = sha_digest[14];
4948 idsn[4] = sha_digest[15];
4949 idsn[3] = sha_digest[16];
4950 idsn[2] = sha_digest[17];
4951 idsn[1] = sha_digest[18];
4952 idsn[0] = sha_digest[19];
4953 return;
4954 }
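
/*
 * Illustrative sketch (not part of xnu): the two helpers above slice a
 * 20-byte SHA1 digest of a key into the 32-bit token (the first four
 * digest bytes, copied raw) and the 64-bit initial DSN (the last eight
 * digest bytes, in reversed order). A standalone equivalent:
 */
static void
example_slice_digest(const unsigned char digest[20],
    u_int32_t *token, u_int64_t *idsn)
{
	unsigned char *p = (unsigned char *)idsn;
	int i;

	bcopy(digest, token, sizeof(*token));  /* most significant 32 bits */
	for (i = 0; i < 8; i++)
		p[i] = digest[19 - i];         /* least significant 64 bits */
}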
4955
4956 static void
4957 mptcp_conn_properties(struct mptcb *mp_tp)
4958 {
4959 /* There is only Version 0 at this time */
4960 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4961
4962 /* Set DSS checksum flag */
4963 if (mptcp_dss_csum) {
4964 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4965 }
4966
4967 /* Set up receive window */
4968 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4969
4970 /* Set up gc ticks */
4971 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4972 }
4973
4974 static void
4975 mptcp_init_local_parms(struct mptses *mpte)
4976 {
4977 struct mptcb *mp_tp = mpte->mpte_mptcb;
4978 char key_digest[SHA1_RESULTLEN];
4979
4980 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
4981 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
4982
4983 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
4984 (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
4985 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
4986 (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t));
4987
4988 /* The subflow SYN is also first MPTCP byte */
4989 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4990 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4991
4992 mptcp_conn_properties(mp_tp);
4993 }
4994
4995 int
4996 mptcp_init_remote_parms(struct mptcb *mp_tp)
4997 {
4998 char remote_digest[SHA1_RESULTLEN];
4999 mpte_lock_assert_held(mp_tp->mpt_mpte);
5000
5001 /* Only Version 0 is supported for auth purposes */
5002 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0) {
5003 return -1;
5004 }
5005
5006 /* Setup local and remote tokens and Initial DSNs */
5007 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5008 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
5009 (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5010 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
5011 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t));
5012 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5013
5014 return 0;
5015 }
5016
5017 static void
5018 mptcp_send_dfin(struct socket *so)
5019 {
5020 struct tcpcb *tp = NULL;
5021 struct inpcb *inp = NULL;
5022
5023 inp = sotoinpcb(so);
5024 if (!inp) {
5025 return;
5026 }
5027
5028 tp = intotcpcb(inp);
5029 if (!tp) {
5030 return;
5031 }
5032
5033 if (!(tp->t_mpflags & TMPF_RESET)) {
5034 tp->t_mpflags |= TMPF_SEND_DFIN;
5035 }
5036 }
5037
5038 /*
5039 * Data Sequence Mapping routines
5040 */
5041 void
5042 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5043 {
5044 struct mptcb *mp_tp;
5045
5046 if (m == NULL) {
5047 return;
5048 }
5049
5050 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5051 mpte_lock_assert_held(mp_tp->mpt_mpte);
5052
5053 while (m) {
5054 VERIFY(m->m_flags & M_PKTHDR);
5055 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5056 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5057 m->m_pkthdr.mp_rlen = m_pktlen(m);
5058 mp_tp->mpt_sndmax += m_pktlen(m);
5059 m = m->m_next;
5060 }
5061 }
5062
5063 void
5064 mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
5065 {
5066 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
5067 uint64_t data_ack;
5068 uint64_t dsn;
5069
5070 if (!m || len == 0) {
5071 return;
5072 }
5073
5074 while (m && len > 0) {
5075 VERIFY(m->m_flags & M_PKTHDR);
5076 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5077
5078 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
5079 dsn = m->m_pkthdr.mp_dsn;
5080
5081 len -= m->m_len;
5082 m = m->m_next;
5083 }
5084
5085 if (m && len == 0) {
5086 /*
5087 * If one more mbuf remains in the chain, it automatically means
5088 * that only the bytes up to that mbuf's mp_dsn have been acked.
5089 *
5090 * So we actually correct data_ack back down (compared to what we
5091 * set inside the loop - dsn + data_len), because in the loop we
5092 * are "optimistic" and assume that the full mapping will be
5093 * acked. If that's not the case and we get out of the loop with
5094 * m != NULL, only the bytes up to m's mp_dsn have really been
5095 * acked.
5096 */
5097 data_ack = m->m_pkthdr.mp_dsn;
5098 }
5099
5100 if (len < 0) {
5101 /*
5102 * If len is negative, meaning we acked in the middle of an mbuf,
5103 * only up to this mbuf's data-sequence number has been acked
5104 * at the MPTCP-level.
5105 */
5106 data_ack = dsn;
5107 }
5108
5109 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
5110 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5111 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
5112 }
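
/*
 * Worked example for the inference above (illustrative, not part of
 * xnu): suppose the send buffer holds two mbufs, A (mp_dsn 100,
 * mp_rlen 50, m_len 50) and B (mp_dsn 150, mp_rlen 50, m_len 50).
 *
 *   len == 50:  the loop visits A only and exits with m == B, len == 0;
 *               data_ack is corrected down to B's mp_dsn, 150.
 *   len == 70:  the loop visits A and B and exits with len == -30
 *               (mid-B); data_ack falls back to dsn, B's mp_dsn, 150.
 *   len == 100: the loop visits A and B and exits with m == NULL,
 *               len == 0; the optimistic data_ack of 200 stands.
 */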
5113
5114 void
5115 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
5116 {
5117 int rewinding = 0;
5118
5119 /* TFO makes things complicated. */
5120 if (so->so_flags1 & SOF1_TFO_REWIND) {
5121 rewinding = 1;
5122 so->so_flags1 &= ~SOF1_TFO_REWIND;
5123 }
5124
5125 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
5126 u_int32_t sub_len;
5127 VERIFY(m->m_flags & M_PKTHDR);
5128 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5129
5130 sub_len = m->m_pkthdr.mp_rlen;
5131
5132 if (sub_len < len) {
5133 m->m_pkthdr.mp_dsn += sub_len;
5134 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5135 m->m_pkthdr.mp_rseq += sub_len;
5136 }
5137 m->m_pkthdr.mp_rlen = 0;
5138 len -= sub_len;
5139 } else {
5140 /* sub_len >= len */
5141 if (rewinding == 0) {
5142 m->m_pkthdr.mp_dsn += len;
5143 }
5144 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5145 if (rewinding == 0) {
5146 m->m_pkthdr.mp_rseq += len;
5147 }
5148 }
5149 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
5150 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
5151 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
5152 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5153 m->m_pkthdr.mp_rlen -= len;
5154 break;
5155 }
5156 m = m->m_next;
5157 }
5158
5159 if (so->so_flags & SOF_MP_SUBFLOW &&
5160 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
5161 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
5162 /*
5163 * Received an ack without receiving a DATA_ACK.
5164 * Need to fallback to regular TCP (or destroy this subflow).
5165 */
5166 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
5167 mptcp_notify_mpfail(so);
5168 }
5169 }
5170
5171 /* Obtain the DSN mapping stored in the mbuf */
5172 void
5173 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5174 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5175 {
5176 u_int64_t dsn64;
5177
5178 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5179 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5180 }
5181
5182 void
5183 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5184 uint32_t *relseq, uint16_t *data_len,
5185 uint16_t *dss_csum)
5186 {
5187 struct mbuf *m = so->so_snd.sb_mb;
5188 int off_orig = off;
5189
5190 VERIFY(off >= 0);
5191
5192 /*
5193 * In the subflow socket, the DSN sequencing can be discontiguous,
5194 * but the subflow sequence mapping is contiguous. Use the subflow
5195 * sequence property to find the right mbuf and corresponding dsn
5196 * mapping.
5197 */
5198
5199 while (m) {
5200 VERIFY(m->m_flags & M_PKTHDR);
5201 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5202
5203 if (off >= m->m_len) {
5204 off -= m->m_len;
5205 m = m->m_next;
5206 } else {
5207 break;
5208 }
5209 }
5210
5211 VERIFY(m);
5212 VERIFY(off >= 0);
5213 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5214
5215 *dsn = m->m_pkthdr.mp_dsn;
5216 *relseq = m->m_pkthdr.mp_rseq;
5217 *data_len = m->m_pkthdr.mp_rlen;
5218 *dss_csum = m->m_pkthdr.mp_csum;
5219
5220 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
5221 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
5222 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5223 }
5224
5225 /*
5226 * Note that this is called only from tcp_input() via mptcp_input_preproc().
5227 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5228 * When it trims data, tcp_input() calls m_adj(), which does not remove the
5229 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5230 * The dsn map insertion cannot be delayed until after the trim, because data
5231 * can sit in the reassembly queue for a while and the DSN option info in tp
5232 * will be overwritten for every new packet received.
5233 * The dsn map will be adjusted just prior to appending to the subflow
5234 * sockbuf, with mptcp_adj_rmap().
5235 */
5236 void
5237 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5238 {
5239 VERIFY(m->m_flags & M_PKTHDR);
5240 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5241
5242 if (tp->t_mpflags & TMPF_EMBED_DSN) {
5243 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5244 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5245 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5246 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5247 if (tp->t_rcv_map.mpt_dfin) {
5248 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5249 }
5250
5251 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5252
5253 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5254 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5255 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5256 if (th->th_flags & TH_FIN) {
5257 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5258 }
5259 }
5260 }
5261
5262 int
5263 mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
5264 uint32_t rseq, uint16_t dlen)
5265 {
5266 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
5267
5268 if (m_pktlen(m) == 0) {
5269 return 0;
5270 }
5271
5272 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
5273 if (off && (dsn != m->m_pkthdr.mp_dsn ||
5274 rseq != m->m_pkthdr.mp_rseq ||
5275 dlen != m->m_pkthdr.mp_rlen)) {
5276 mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
5277 __func__, dsn, m->m_pkthdr.mp_dsn,
5278 rseq, m->m_pkthdr.mp_rseq,
5279 dlen, m->m_pkthdr.mp_rlen),
5280 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
5281 return -1;
5282 }
5283 m->m_pkthdr.mp_dsn += off;
5284 m->m_pkthdr.mp_rseq += off;
5285 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
5286 } else {
5287 if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
5288 /* data arrived without a DSS option mapping */
5289
5290 /* initial subflow can fallback right after SYN handshake */
5291 mptcp_notify_mpfail(so);
5292 }
5293 }
5294
5295 mpts->mpts_flags |= MPTSF_CONFIRMED;
5296
5297 return 0;
5298 }
5299
5300 /*
5301 * Following routines help with failure detection and failover of data
5302 * transfer from one subflow to another.
5303 */
5304 void
5305 mptcp_act_on_txfail(struct socket *so)
5306 {
5307 struct tcpcb *tp = NULL;
5308 struct inpcb *inp = sotoinpcb(so);
5309
5310 if (inp == NULL) {
5311 return;
5312 }
5313
5314 tp = intotcpcb(inp);
5315 if (tp == NULL) {
5316 return;
5317 }
5318
5319 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5320 return;
5321 }
5322
5323 so->so_flags |= SOF_MP_TRYFAILOVER;
5324 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5325 }
5326
5327 /*
5328 * Support for MP_FAIL option
5329 */
5330 int
5331 mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
5332 {
5333 struct mbuf *m = so->so_snd.sb_mb;
5334 u_int64_t dsn;
5335 int off = 0;
5336 u_int32_t datalen;
5337
5338 if (m == NULL) {
5339 return -1;
5340 }
5341
5342 while (m != NULL) {
5343 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5344 VERIFY(m->m_flags & M_PKTHDR);
5345 dsn = m->m_pkthdr.mp_dsn;
5346 datalen = m->m_pkthdr.mp_rlen;
5347 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5348 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5349 off = dsn_fail - dsn;
5350 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5351 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
5352 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5353 return 0;
5354 }
5355
5356 m = m->m_next;
5357 }
5358
5359 /*
5360 * If there was no mbuf data and a fallback to TCP occurred, there's
5361 * not much else to do.
5362 */
5363
5364 mptcplog((LOG_ERR, "MPTCP Sender: "
5365 "%s: %llu not found \n", __func__, dsn_fail),
5366 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5367 return -1;
5368 }
5369
5370 /*
5371 * Support for sending contiguous MPTCP bytes in subflow
5372 * Also for preventing sending data with ACK in 3-way handshake
5373 */
5374 int32_t
5375 mptcp_adj_sendlen(struct socket *so, int32_t off)
5376 {
5377 struct tcpcb *tp = sototcpcb(so);
5378 struct mptsub *mpts = tp->t_mpsub;
5379 uint64_t mdss_dsn;
5380 uint32_t mdss_subflow_seq;
5381 int mdss_subflow_off;
5382 uint16_t mdss_data_len;
5383 uint16_t dss_csum;
5384
5385 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5386 &mdss_data_len, &dss_csum);
5387
5388 /*
5389 * We need to compute how much of the mapping still remains.
5390 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5391 */
5392 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5393
5394 /*
5395 * When TFO is used, we send byte mpts->mpts_iss although the relative
5396 * sequence number has been set to 1 (while it should be 0).
5397 */
5398 if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5399 mdss_subflow_off--;
5400 }
5401
5402 if (off < mdss_subflow_off) {
5403 printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
5404 off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
5405 }
5406 VERIFY(off >= mdss_subflow_off);
5407
5408 mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
5409 __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
5410 mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5411 return mdss_data_len - (off - mdss_subflow_off);
5412 }
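
/*
 * Worked example (illustrative, not part of xnu): if the mapping found
 * above starts 10 bytes into the unacked send buffer
 * (mdss_subflow_off == 10) and covers 20 bytes (mdss_data_len == 20),
 * then for off == 15 the function returns 20 - (15 - 10) == 15:
 * fifteen bytes of that mapping remain to be sent contiguously from
 * offset 15 onward.
 */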
5413
5414 static uint32_t
5415 mptcp_get_maxseg(struct mptses *mpte)
5416 {
5417 struct mptsub *mpts;
5418 uint32_t maxseg = 0;
5419
5420 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5421 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5422
5423 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5424 TCPS_HAVERCVDFIN2(tp->t_state)) {
5425 continue;
5426 }
5427
5428 if (tp->t_maxseg > maxseg) {
5429 maxseg = tp->t_maxseg;
5430 }
5431 }
5432
5433 return maxseg;
5434 }
5435
5436 static uint8_t
5437 mptcp_get_rcvscale(struct mptses *mpte)
5438 {
5439 struct mptsub *mpts;
5440 uint8_t rcvscale = UINT8_MAX;
5441
5442 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5443 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5444
5445 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5446 TCPS_HAVERCVDFIN2(tp->t_state)) {
5447 continue;
5448 }
5449
5450 if (tp->rcv_scale < rcvscale) {
5451 rcvscale = tp->rcv_scale;
5452 }
5453 }
5454
5455 return rcvscale;
5456 }
5457
5458 /* Similar to tcp_sbrcv_reserve */
5459 static void
5460 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5461 u_int32_t newsize, u_int32_t idealsize)
5462 {
5463 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5464
5465 /* newsize should not exceed max */
5466 newsize = min(newsize, tcp_autorcvbuf_max);
5467
5468 /* The receive window scale negotiated at the
5469 * beginning of the connection will also set a
5470 * limit on the socket buffer size
5471 */
5472 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5473
5474 /* Set new socket buffer size */
5475 if (newsize > sbrcv->sb_hiwat &&
5476 (sbreserve(sbrcv, newsize) == 1)) {
5477 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5478 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5479
5480 /* Again check the limit set by the advertised
5481 * window scale
5482 */
5483 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5484 TCP_MAXWIN << rcvscale);
5485 }
5486 }
5487
5488 void
5489 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5490 {
5491 struct mptses *mpte = mp_tp->mpt_mpte;
5492 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5493 struct sockbuf *sbrcv = &mp_so->so_rcv;
5494 uint32_t hiwat_sum = 0;
5495 uint32_t ideal_sum = 0;
5496 struct mptsub *mpts;
5497
5498 /*
5499 * Do not grow the receive socket buffer if
5500 * - auto resizing is disabled, globally or on this socket
5501 * - the high water mark already reached the maximum
5502 * - the stream is in background and receive side is being
5503 * throttled
5504 * - there are segments in the reassembly queue indicating loss;
5505 * there is no need to increase the recv window during recovery,
5506 * as more data is not going to be sent. A duplicate ack sent
5507 * during recovery should not change the receive window.
5508 */
5509 if (tcp_do_autorcvbuf == 0 ||
5510 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5511 tcp_cansbgrow(sbrcv) == 0 ||
5512 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5513 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5514 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5515 /* Can not resize the socket buffer, just return */
5516 return;
5517 }
5518
5519 /*
5520 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5521 *
5522 * But, for this we first need accurate receiver-RTT estimations, which
5523 * we currently don't have.
5524 *
5525 * Let's use a dummy algorithm for now, just taking the sum of all
5526 * subflows' receive-buffers. It's too low, but that's all we can get
5527 * for now.
5528 */
5529
5530 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5531 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5532 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5533 }
5534
5535 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5536 }
5537
5538 /*
5539 * Determine if we can grow the receive socket buffer to avoid sending
5540 * a zero window update to the peer. We allow even socket buffers that
5541 * have fixed size (set by the application) to grow if the resource
5542 * constraints are met. They will also be trimmed after the application
5543 * reads data.
5544 *
5545 * Similar to tcp_sbrcv_grow_rwin
5546 */
5547 static void
5548 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5549 {
5550 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5551 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5552 u_int32_t rcvbuf = sb->sb_hiwat;
5553
5554 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5555 return;
5556 }
5557
5558 if (tcp_do_autorcvbuf == 1 &&
5559 tcp_cansbgrow(sb) &&
5560 /* Diff to tcp_sbrcv_grow_rwin */
5561 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5562 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5563 rcvbuf < tcp_autorcvbuf_max &&
5564 (sb->sb_idealsize > 0 &&
5565 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5566 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5567 }
5568 }
5569
5570 /* Similar to tcp_sbspace */
5571 int32_t
5572 mptcp_sbspace(struct mptcb *mp_tp)
5573 {
5574 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5575 uint32_t rcvbuf;
5576 int32_t space;
5577 int32_t pending = 0;
5578
5579 mpte_lock_assert_held(mp_tp->mpt_mpte);
5580
5581 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5582
5583 /* hiwat might have changed */
5584 rcvbuf = sb->sb_hiwat;
5585
5586 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5587 (sb->sb_mbmax - sb->sb_mbcnt)));
5588 if (space < 0) {
5589 space = 0;
5590 }
5591
5592 #if CONTENT_FILTER
5593 /* Compensate for data being processed by content filters */
5594 pending = cfil_sock_data_space(sb);
5595 #endif /* CONTENT_FILTER */
5596 if (pending > space) {
5597 space = 0;
5598 } else {
5599 space -= pending;
5600 }
5601
5602 return space;
5603 }
5604
5605 /*
5606 * Support Fallback to Regular TCP
5607 */
5608 void
5609 mptcp_notify_mpready(struct socket *so)
5610 {
5611 struct tcpcb *tp = NULL;
5612
5613 if (so == NULL) {
5614 return;
5615 }
5616
5617 tp = intotcpcb(sotoinpcb(so));
5618
5619 if (tp == NULL) {
5620 return;
5621 }
5622
5623 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5624 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5625 struct tcpcb *, tp);
5626
5627 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5628 return;
5629 }
5630
5631 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5632 return;
5633 }
5634
5635 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5636 tp->t_mpflags |= TMPF_MPTCP_READY;
5637
5638 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5639 }
5640
5641 void
5642 mptcp_notify_mpfail(struct socket *so)
5643 {
5644 struct tcpcb *tp = NULL;
5645
5646 if (so == NULL) {
5647 return;
5648 }
5649
5650 tp = intotcpcb(sotoinpcb(so));
5651
5652 if (tp == NULL) {
5653 return;
5654 }
5655
5656 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5657 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5658 struct tcpcb *, tp);
5659
5660 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5661 return;
5662 }
5663
5664 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5665 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5666
5667 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5668 }
5669
5670 /*
5671 * Keepalive helper function
5672 */
5673 boolean_t
5674 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5675 {
5676 boolean_t ret = 1;
5677 mpte_lock_assert_held(mp_tp->mpt_mpte);
5678
5679 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5680 ret = 0;
5681 }
5682 return ret;
5683 }
5684
5685 /*
5686 * MPTCP t_maxseg adjustment function
5687 */
5688 int
5689 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5690 {
5691 int mss_lower = 0;
5692 struct mptcb *mp_tp = tptomptp(tp);
5693
5694 #define MPTCP_COMPUTE_LEN { \
5695 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5696 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5697 mss_lower += 2; \
5698 else \
5699 /* adjust to 32-bit boundary + EOL */ \
5700 mss_lower += 2; \
5701 }
5702 if (mp_tp == NULL) {
5703 return 0;
5704 }
5705
5706 mpte_lock_assert_held(mp_tp->mpt_mpte);
5707
5708 /*
5709 * For the first subflow and subsequent subflows, adjust mss for
5710 * most common MPTCP option size, for case where tcp_mss is called
5711 * during option processing and MTU discovery.
5712 */
5713 if (!mtudisc) {
5714 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5715 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5716 MPTCP_COMPUTE_LEN;
5717 }
5718
5719 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5720 tp->t_mpflags & TMPF_SENT_JOIN) {
5721 MPTCP_COMPUTE_LEN;
5722 }
5723 } else {
5724 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5725 MPTCP_COMPUTE_LEN;
5726 }
5727 }
5728
5729 return mss_lower;
5730 }
5731
5732 /*
5733 * Update the pid, upid, uuid of the subflow so, based on parent so
5734 */
5735 void
5736 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5737 {
5738 if (so->last_pid != mp_so->last_pid ||
5739 so->last_upid != mp_so->last_upid) {
5740 so->last_upid = mp_so->last_upid;
5741 so->last_pid = mp_so->last_pid;
5742 uuid_copy(so->last_uuid, mp_so->last_uuid);
5743 }
5744 so_update_policy(so);
5745 }
5746
5747 static void
5748 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5749 {
5750 struct inpcb *inp;
5751
5752 tcp_getconninfo(so, &flow->flow_ci);
5753 inp = sotoinpcb(so);
5754 #if INET6
5755 if ((inp->inp_vflag & INP_IPV6) != 0) {
5756 flow->flow_src.ss_family = AF_INET6;
5757 flow->flow_dst.ss_family = AF_INET6;
5758 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5759 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5760 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5761 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5762 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5763 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5764 } else
5765 #endif
5766 if ((inp->inp_vflag & INP_IPV4) != 0) {
5767 flow->flow_src.ss_family = AF_INET;
5768 flow->flow_dst.ss_family = AF_INET;
5769 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5770 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5771 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5772 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5773 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5774 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5775 }
5776 flow->flow_len = sizeof(*flow);
5777 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5778 flow->flow_flags = mpts->mpts_flags;
5779 flow->flow_cid = mpts->mpts_connid;
5780 flow->flow_relseq = mpts->mpts_rel_seq;
5781 flow->flow_soerror = mpts->mpts_socket->so_error;
5782 flow->flow_probecnt = mpts->mpts_probecnt;
5783 }
5784
5785 static int
5786 mptcp_pcblist SYSCTL_HANDLER_ARGS
5787 {
5788 #pragma unused(oidp, arg1, arg2)
5789 int error = 0, f;
5790 size_t len;
5791 struct mppcb *mpp;
5792 struct mptses *mpte;
5793 struct mptcb *mp_tp;
5794 struct mptsub *mpts;
5795 struct socket *so;
5796 conninfo_mptcp_t mptcpci;
5797 mptcp_flow_t *flows = NULL;
5798
5799 if (req->newptr != USER_ADDR_NULL) {
5800 return EPERM;
5801 }
5802
5803 lck_mtx_lock(&mtcbinfo.mppi_lock);
5804 if (req->oldptr == USER_ADDR_NULL) {
5805 size_t n = mtcbinfo.mppi_count;
5806 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5807 req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
5808 4 * (n + n / 8) * sizeof(mptcp_flow_t);
5809 return 0;
5810 }
5811 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5812 flows = NULL;
5813 mpp_lock(mpp);
5814 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5815 mpte = mptompte(mpp);
5816 VERIFY(mpte != NULL);
5817 mpte_lock_assert_held(mpte);
5818 mp_tp = mpte->mpte_mptcb;
5819 VERIFY(mp_tp != NULL);
5820
5821 bzero(&mptcpci, sizeof(mptcpci));
5822 mptcpci.mptcpci_state = mp_tp->mpt_state;
5823 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5824 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5825 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5826 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5827 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5828 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5829 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5830 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5831 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5832 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5833 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
5834 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5835 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5836
5837 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
5838 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5839 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5840 mptcpci.mptcpci_flow_offset =
5841 offsetof(conninfo_mptcp_t, mptcpci_flows);
5842
5843 len = sizeof(*flows) * mpte->mpte_numflows;
5844 if (mpte->mpte_numflows != 0) {
5845 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5846 if (flows == NULL) {
5847 mpp_unlock(mpp);
5848 break;
5849 }
5850 mptcpci.mptcpci_len = sizeof(mptcpci) +
5851 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5852 error = SYSCTL_OUT(req, &mptcpci,
5853 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5854 } else {
5855 mptcpci.mptcpci_len = sizeof(mptcpci);
5856 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
5857 }
5858 if (error) {
5859 mpp_unlock(mpp);
5860 FREE(flows, M_TEMP);
5861 break;
5862 }
5863 f = 0;
5864 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5865 so = mpts->mpts_socket;
5866 fill_mptcp_subflow(so, &flows[f], mpts);
5867 f++;
5868 }
5869 mpp_unlock(mpp);
5870 if (flows) {
5871 error = SYSCTL_OUT(req, flows, len);
5872 FREE(flows, M_TEMP);
5873 if (error) {
5874 break;
5875 }
5876 }
5877 }
5878 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5879
5880 return error;
5881 }
5882
5883 SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
5884 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
5885 "List of active MPTCP connections");
5886
5887 /*
5888 * Set notsent lowat mark on the MPTCB
5889 */
5890 int
5891 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5892 {
5893 struct mptcb *mp_tp = NULL;
5894 int error = 0;
5895
5896 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
5897 mp_tp = mpte->mpte_mptcb;
5898 }
5899
5900 if (mp_tp) {
5901 mp_tp->mpt_notsent_lowat = optval;
5902 } else {
5903 error = EINVAL;
5904 }
5905
5906 return error;
5907 }
5908
5909 u_int32_t
5910 mptcp_get_notsent_lowat(struct mptses *mpte)
5911 {
5912 struct mptcb *mp_tp = NULL;
5913
5914 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
5915 mp_tp = mpte->mpte_mptcb;
5916 }
5917
5918 if (mp_tp) {
5919 return mp_tp->mpt_notsent_lowat;
5920 } else {
5921 return 0;
5922 }
5923 }
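/*
 * A minimal sketch of arming this mark from userspace, assuming the
 * setter above is reached through the SO_NOTSENT_LOWAT socket option
 * (as it is for regular Darwin sockets); "mp_fd" is a hypothetical
 * PF_MULTIPATH socket descriptor:
 *
 *	int lowat = 16 * 1024;  // wake the writer once < 16KB is unsent
 *	setsockopt(mp_fd, SOL_SOCKET, SO_NOTSENT_LOWAT,
 *	    &lowat, sizeof(lowat));
 *
 * mptcp_notsent_lowat_check() below then reports the socket writable
 * once the queued-but-unsent data drops to or below this mark.
 */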
5924
5925 int
5926 mptcp_notsent_lowat_check(struct socket *so)
5927 {
5928 struct mptses *mpte;
5929 struct mppcb *mpp;
5930 struct mptcb *mp_tp;
5931 struct mptsub *mpts;
5932
5933 int notsent = 0;
5934
5935 mpp = mpsotomppcb(so);
5936 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5937 return 0;
5938 }
5939
5940 mpte = mptompte(mpp);
5941 mpte_lock_assert_held(mpte);
5942 mp_tp = mpte->mpte_mptcb;
5943
5944 notsent = so->so_snd.sb_cc;
5945
5946 if ((notsent == 0) ||
5947 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5948 mp_tp->mpt_notsent_lowat)) {
5949 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5950 "lowat %d notsent %d actual %d \n",
5951 mp_tp->mpt_notsent_lowat, notsent,
5952 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5953 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5954 return 1;
5955 }
5956
5957 /* When Nagle's algorithm is not disabled, it is better
5958 * to wake up the client even before there is at least one
5959 * maxseg of data to write.
5960 */
5961 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5962 int retval = 0;
5963 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5964 struct socket *subf_so = mpts->mpts_socket;
5965 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
5966
5967 notsent = so->so_snd.sb_cc -
5968 (tp->snd_nxt - tp->snd_una);
5969
5970 if ((tp->t_flags & TF_NODELAY) == 0 &&
5971 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5972 retval = 1;
5973 }
5974 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
5975 " nodelay false \n",
5976 mp_tp->mpt_notsent_lowat, notsent),
5977 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5978 return retval;
5979 }
5980 }
5981 return 0;
5982 }
5983
5984 /* Using the Symptoms advisory to detect poor Wi-Fi or poor cellular links */
5985 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5986 static uint32_t mptcp_kern_skt_inuse = 0;
5987 static uint32_t mptcp_kern_skt_unit;
5988 symptoms_advisory_t mptcp_advisory;
5989
5990 static errno_t
5991 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5992 void **unitinfo)
5993 {
5994 #pragma unused(kctlref, sac, unitinfo)
5995
5996 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
5997 os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__);
5998 }
5999
6000 mptcp_kern_skt_unit = sac->sc_unit;
6001
6002 return 0;
6003 }
6004
6005 static void
6006 mptcp_allow_uuid(uuid_t uuid)
6007 {
6008 struct mppcb *mpp;
6009
6010 /* Iterate over all MPTCP connections */
6011
6012 lck_mtx_lock(&mtcbinfo.mppi_lock);
6013
6014 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6015 struct mptses *mpte;
6016 struct socket *mp_so;
6017
6018 mpp_lock(mpp);
6019
6020 mpte = mpp->mpp_pcbe;
6021 mp_so = mpp->mpp_socket;
6022
6023 if (mp_so->so_flags & SOF_DELEGATED &&
6024 uuid_compare(uuid, mp_so->e_uuid)) {
6025 goto next;
6026 } else if (!(mp_so->so_flags & SOF_DELEGATED) &&
6027 uuid_compare(uuid, mp_so->last_uuid)) {
6028 goto next;
6029 }
6030
6031 os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp\n",
6032 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6033
6034 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
6035
6036 mptcp_check_subflows_and_add(mpte);
6037 mptcp_remove_subflows(mpte);
6038
6039 mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;
6040
6041 next:
6042 mpp_unlock(mpp);
6043 }
6044
6045 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6046 }
6047
6048 static void
6049 mptcp_wifi_status_changed(void)
6050 {
6051 struct mppcb *mpp;
6052
6053 /* Iterate over all MPTCP connections */
6054
6055 lck_mtx_lock(&mtcbinfo.mppi_lock);
6056
6057 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6058 struct mptses *mpte;
6059 struct socket *mp_so;
6060
6061 mpp_lock(mpp);
6062
6063 mpte = mpp->mpp_pcbe;
6064 mp_so = mpp->mpp_socket;
6065
6066 /* Only handover mode is purely driven by Symptoms' Wi-Fi status */
6067 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER) {
6068 goto next;
6069 }
6070
6071 mptcp_check_subflows_and_add(mpte);
6072 mptcp_check_subflows_and_remove(mpte);
6073
6074 next:
6075 mpp_unlock(mpp);
6076 }
6077
6078 lck_mtx_unlock(&mtcbinfo.mppi_lock);
6079 }
6080
6081 void
6082 mptcp_ask_symptoms(struct mptses *mpte)
6083 {
6084 struct mptcp_symptoms_ask_uuid ask;
6085 struct socket *mp_so;
6086 struct proc *p;
6087 int pid, prio, err;
6088
6089 if (mptcp_kern_skt_unit == 0) {
6090 os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__);
6091 return;
6092 }
6093
6094 mp_so = mptetoso(mpte);
6095
6096 if (mp_so->so_flags & SOF_DELEGATED) {
6097 pid = mp_so->e_pid;
6098 } else {
6099 pid = mp_so->last_pid;
6100 }
6101
6102 p = proc_find(pid);
6103 if (p == PROC_NULL) {
6104 os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid);
6105 return;
6106 }
6107
6108 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6109
6110 if (mp_so->so_flags & SOF_DELEGATED) {
6111 uuid_copy(ask.uuid, mp_so->e_uuid);
6112 } else {
6113 uuid_copy(ask.uuid, mp_so->last_uuid);
6114 }
6115
6116 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
6117
6118 if (prio == TASK_BACKGROUND_APPLICATION) {
6119 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
6120 } else if (prio == TASK_FOREGROUND_APPLICATION) {
6121 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
6122 } else {
6123 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
6124 }
6125
6126 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
6127 &ask, sizeof(ask), CTL_DATA_EOR);
6128
6129 os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n",
6130 __func__, pid, ask.priority, err);
6131
6132
6133 proc_rele(p);
6134 }
6135
6136 static errno_t
6137 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6138 void *unitinfo)
6139 {
6140 #pragma unused(kctlref, kcunit, unitinfo)
6141
6142 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6143
6144 return 0;
6145 }
6146
6147 static errno_t
6148 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6149 mbuf_t m, int flags)
6150 {
6151 #pragma unused(kctlref, unitinfo, flags)
6152 symptoms_advisory_t *sa = NULL;
6153
6154 if (kcunit != mptcp_kern_skt_unit) {
6155 os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n",
6156 __func__, kcunit, mptcp_kern_skt_unit);
6157 }
6158
6159 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6160 mbuf_freem(m);
6161 return EINVAL;
6162 }
6163
6164 if (mbuf_len(m) < sizeof(*sa)) {
6165 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6166 __func__, mbuf_len(m), sizeof(*sa));
6167 mbuf_freem(m);
6168 return EINVAL;
6169 }
6170
6171 sa = mbuf_data(m);
6172
6173 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
6174 sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6175 uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
6176
6177 mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
6178 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
6179 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
6180
6181 if ((sa->sa_wifi_status &
6182 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
6183 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
6184 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6185 }
6186
6187 if (old_wifi_status != mptcp_advisory.sa_wifi_status) {
6188 mptcp_wifi_status_changed();
6189 }
6190 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
6191 mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
6192 mptcp_advisory.sa_wifi_status),
6193 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
6194 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
6195 uuid_t uuid;
6196 errno_t err;
6197
6198 if (mbuf_len(m) < sizeof(uuid_t) + sizeof(*sa)) {
6199 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6200 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa));
6201 mbuf_freem(m);
6202 return EINVAL;
6203 }
6204
6205 err = mbuf_copydata(m, sizeof(*sa), sizeof(uuid_t), uuid);
6206 if (err) {
6207 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6208 mbuf_freem(m);
6209 return err;
6210 }
6211
6212 mptcp_allow_uuid(uuid);
6213 }
6214
6215 mbuf_freem(m);
6216 return 0;
6217 }
6218
6219 void
6220 mptcp_control_register(void)
6221 {
6222 /* Set up the advisory control socket */
6223 struct kern_ctl_reg mptcp_kern_ctl;
6224
6225 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6226 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6227 sizeof(mptcp_kern_ctl.ctl_name));
6228 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6229 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6230 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6231 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6232
6233 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6234 }
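/*
 * A minimal userland sketch (assumed, not part of this file) of
 * attaching to the control registered above; CTL_FLAG_PRIVILEGED
 * means this requires root, and the advisory layout comes from
 * symptoms_advisory_t:
 *
 *	struct ctl_info info;
 *	struct sockaddr_ctl addr;
 *	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
 *
 *	bzero(&info, sizeof(info));
 *	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(info.ctl_name));
 *	ioctl(fd, CTLIOCGINFO, &info);        // resolve name -> ctl_id
 *
 *	bzero(&addr, sizeof(addr));
 *	addr.sc_len = sizeof(addr);
 *	addr.sc_family = AF_SYSTEM;
 *	addr.ss_sysaddr = AF_SYS_CONTROL;
 *	addr.sc_id = info.ctl_id;
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 *	symptoms_advisory_t sa;
 *	bzero(&sa, sizeof(sa));
 *	sa.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_BAD;
 *	send(fd, &sa, sizeof(sa), 0);         // handled by ..._ctl_send()
 */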
6235
6236 /*
6237 * Three return values:
6238 * 1 : Wi-Fi is bad
6239 * 0 : Wi-Fi is good
6240 * -1 : Wi-Fi state is unknown, use subflow-only heuristics
6241 */
6242 int
6243 mptcp_is_wifi_unusable(struct mptses *mpte)
6244 {
6245 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6246 if (mptcp_advisory.sa_wifi_status) {
6247 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0;
6248 }
6249
6250 /*
6251 * If it's a first-party app and we don't have any info
6252 * about the Wi-Fi state, let's be pessimistic.
6253 */
6254 return -1;
6255 }
6256
6257 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0;
6258 }
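/*
 * A hypothetical caller sketch of the tri-state above; the subflow
 * heuristics referred to are the RTO-based checks such as
 * mptcp_subflow_is_bad() below:
 *
 *	switch (mptcp_is_wifi_unusable(mpte)) {
 *	case 1:   // Wi-Fi flagged bad: bring up / prefer cell subflows
 *		break;
 *	case 0:   // Wi-Fi good: stay on Wi-Fi
 *		break;
 *	case -1:  // no advisory: fall back to per-subflow heuristics
 *		break;
 *	}
 */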
6259
6260 boolean_t
6261 mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts)
6262 {
6263 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
6264 int fail_thresh = mptcp_fail_thresh;
6265
6266 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
6267 fail_thresh *= 2;
6268 }
6269
6270 return tp->t_rxtshift >= fail_thresh &&
6271 (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
6272 }
6273
6274 /* If TFO data is successfully acked, it must be dropped from the MPTCP socket */
6275 static void
6276 mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
6277 {
6278 struct socket *mp_so = mptetoso(mpte);
6279 struct socket *so = mpts->mpts_socket;
6280 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
6281 struct mptcb *mp_tp = mpte->mpte_mptcb;
6282
6283 /* If data was sent with SYN, rewind state */
6284 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
6285 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
6286 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
6287
6288 VERIFY(mp_droplen <= (UINT_MAX));
6289 VERIFY(mp_droplen >= tcp_droplen);
6290
6291 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
6292 mpts->mpts_iss += tcp_droplen;
6293 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
6294
6295 if (mp_droplen > tcp_droplen) {
6296 /* handle partial TCP ack */
6297 mp_so->so_flags1 |= SOF1_TFO_REWIND;
6298 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
6299 mp_droplen = tcp_droplen;
6300 } else {
6301 /* all data on SYN was acked */
6302 mpts->mpts_rel_seq = 1;
6303 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
6304 }
6305 mp_tp->mpt_sndmax -= tcp_droplen;
6306
6307 if (mp_droplen != 0) {
6308 VERIFY(mp_so->so_snd.sb_mb != NULL);
6309 sbdrop(&mp_so->so_snd, (int)mp_droplen);
6310 }
6311 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
6312 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
6313 mpts->mpts_connid, tcp_droplen, mp_droplen),
6314 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
6315 }
6316 }
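/*
 * A worked example of the rewind above (numbers illustrative only):
 * say 100 bytes were queued at the MPTCP level and sent with the SYN,
 * so mp_droplen = mpt_sndnxt - mpt_snduna = 100, but TCP only had 60
 * bytes acknowledged, so tcp_droplen = snd_una - iss - 1 = 60. Since
 * mp_droplen > tcp_droplen, SOF1_TFO_REWIND is set, mpt_sndnxt is
 * rewound to mpt_snduna + 40, and only the 60 acknowledged bytes are
 * dropped from the MPTCP send buffer; the remaining 40 go out again
 * through the regular transmit path.
 */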
6317
6318 int
6319 mptcp_freeq(struct mptcb *mp_tp)
6320 {
6321 struct tseg_qent *q;
6322 int rv = 0;
6323
6324 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6325 LIST_REMOVE(q, tqe_q);
6326 m_freem(q->tqe_m);
6327 zfree(tcp_reass_zone, q);
6328 rv = 1;
6329 }
6330 mp_tp->mpt_reassqlen = 0;
6331 return rv;
6332 }
6333
6334 static int
6335 mptcp_post_event(u_int32_t event_code, int value)
6336 {
6337 struct kev_mptcp_data event_data;
6338 struct kev_msg ev_msg;
6339
6340 memset(&ev_msg, 0, sizeof(ev_msg));
6341
6342 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6343 ev_msg.kev_class = KEV_NETWORK_CLASS;
6344 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6345 ev_msg.event_code = event_code;
6346
6347 event_data.value = value;
6348
6349 ev_msg.dv[0].data_ptr = &event_data;
6350 ev_msg.dv[0].data_length = sizeof(event_data);
6351
6352 return kev_post_msg(&ev_msg);
6353 }
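/*
 * A minimal userland sketch (assumed, not part of this file) of
 * listening for the events posted above, via the standard kernel
 * event socket from <sys/kern_event.h>:
 *
 *	struct kev_request req;
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *
 *	bzero(&req, sizeof(req));
 *	req.vendor_code = KEV_VENDOR_APPLE;
 *	req.kev_class = KEV_NETWORK_CLASS;
 *	req.kev_subclass = KEV_MPTCP_SUBCLASS;
 *	ioctl(fd, SIOCSKEVFILT, &req);  // subscribe to MPTCP events
 *
 *	// read(fd, ...) now yields struct kern_event_msg records whose
 *	// event_code may be e.g. KEV_MPTCP_CELLUSE (see below).
 */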
6354
6355 void
6356 mptcp_set_cellicon(struct mptses *mpte)
6357 {
6358 int error;
6359
6360 /* First-party apps (Siri) don't flip the cellicon */
6361 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6362 return;
6363 }
6364
6365 /* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
6366 mptcp_last_cellicon_set = tcp_now;
6367
6368 /* If cellicon is already set, get out of here! */
6369 if (OSTestAndSet(7, &mptcp_cellicon_is_set)) {
6370 return;
6371 }
6372
6373 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6374
6375 if (error) {
6376 mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
6377 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
6378 } else {
6379 mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
6380 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
6381 }
6382 }
6383
6384 void
6385 mptcp_unset_cellicon(void)
6386 {
6387 int error;
6388
6389 /* If cellicon is already unset, get out of here! */
6390 if (OSTestAndClear(7, &mptcp_cellicon_is_set)) {
6391 return;
6392 }
6393
6394 /*
6395 * Only unset the cellicon if mptcp_set_cellicon() has not set it
6396 * within the past MPTCP_CELLICON_TOGGLE_RATE seconds; if it was set
6397 * that recently, restore the flag and leave the icon up.
6398 */
6399 if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
6400 tcp_now)) {
6401 OSTestAndSet(7, &mptcp_cellicon_is_set);
6402 return;
6403 }
6404
6405 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6406
6407 if (error) {
6408 mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
6409 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
6410 } else {
6411 mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
6412 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
6413 }
6414 }
6415
6416 void
6417 mptcp_reset_rexmit_state(struct tcpcb *tp)
6418 {
6419 struct mptsub *mpts;
6420 struct inpcb *inp;
6421 struct socket *so;
6422
6423 inp = tp->t_inpcb;
6424 if (inp == NULL) {
6425 return;
6426 }
6427
6428 so = inp->inp_socket;
6429 if (so == NULL) {
6430 return;
6431 }
6432
6433 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6434 return;
6435 }
6436
6437 mpts = tp->t_mpsub;
6438
6439 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6440 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6441 }
6442
6443 void
6444 mptcp_reset_keepalive(struct tcpcb *tp)
6445 {
6446 struct mptsub *mpts = tp->t_mpsub;
6447
6448 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6449 }