]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/mptcp_subr.c
xnu-4903.241.1.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
1 /*
2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_fsm.h>
59 #include <netinet/tcp_seq.h>
60 #include <netinet/tcp_var.h>
61 #include <netinet/mptcp_var.h>
62 #include <netinet/mptcp.h>
63 #include <netinet/mptcp_opt.h>
64 #include <netinet/mptcp_seq.h>
65 #include <netinet/mptcp_timer.h>
66 #include <libkern/crypto/sha1.h>
67 #if INET6
68 #include <netinet6/in6_pcb.h>
69 #include <netinet6/ip6protosw.h>
70 #endif /* INET6 */
71 #include <dev/random/randomdev.h>
72
73 /*
74 * Notes on MPTCP implementation.
75 *
76 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77 * communication domain. The structure mtcbinfo describes the MPTCP instance
78 * of a Multipath protocol in that domain. It is used to keep track of all
79 * MPTCP PCB instances in the system, and is protected by the global lock
80 * mppi_lock.
81 *
82 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
84 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
85 * allocated from the same memory block, and each structure has a pointer
86 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
87 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88 * PCB (mppcb) as well as the MPTCP Session (mptses).
89 *
90 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
91 *
92 * A functioning MPTCP Session consists of one or more subflow sockets. Each
93 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94 * represented by the mptsub structure. Because each subflow requires access
95 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96 * subflow. This gets decremented prior to the subflow's destruction.
97 *
98 * To handle events (read, write, control) from the subflows, we do direct
99 * upcalls into the specific function.
100 *
101 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102 * lock. Incoming data on a subflow also ends up taking this single lock. To
103 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104 * of the MPTCP-socket.
105 *
106 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107 * work is done by the MPTCP garbage collector which is invoked on demand by
108 * the PF_MULTIPATH garbage collector. This process will take place once all
109 * of the subflows have been destroyed.
110 */
111
112 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
113 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
114
115 static uint32_t mptcp_gc(struct mppcbinfo *);
116 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117 struct uio *, struct mbuf **, struct mbuf **, int *);
118 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119 struct uio *, struct mbuf *, struct mbuf *, int);
120 static void mptcp_subflow_rupcall(struct socket *, void *, int);
121 static void mptcp_subflow_input(struct mptses *, struct mptsub *);
122 static void mptcp_subflow_wupcall(struct socket *, void *, int);
123 static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
124 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
125 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
126
127 static void mptcp_subflow_abort(struct mptsub *, int);
128
129 static void mptcp_send_dfin(struct socket *so);
130
131 /*
132 * Possible return values for subflow event handlers. Note that success
133 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
134 * indicate errors or actions which require immediate attention; they will
135 * prevent the rest of the handlers from processing their respective events
136 * until the next round of events processing.
137 */
138 typedef enum {
139 MPTS_EVRET_DELETE = 1, /* delete this subflow */
140 MPTS_EVRET_OK = 2, /* OK */
141 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
142 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
143 } ev_ret_t;
144
145 static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
146 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
147 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
148 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
149 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
150 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
151 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
152 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
153 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
154 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
155 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
156 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
157
158 static const char *mptcp_evret2str(ev_ret_t);
159
160 static void mptcp_do_sha1(mptcp_key_t *, char *);
161 static void mptcp_init_local_parms(struct mptses *);
162
163 static unsigned int mptsub_zone_size; /* size of mptsub */
164 static struct zone *mptsub_zone; /* zone for mptsub */
165
166 static unsigned int mptopt_zone_size; /* size of mptopt */
167 static struct zone *mptopt_zone; /* zone for mptopt */
168
169 static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
170 static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
171
172 struct mppcbinfo mtcbinfo;
173
174 #define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
175 #define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
176
177 SYSCTL_DECL(_net_inet);
178
179 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
180
181 uint32_t mptcp_dbg_area = 31; /* more noise if greater than 1 */
182 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
183 &mptcp_dbg_area, 0, "MPTCP debug area");
184
185 uint32_t mptcp_dbg_level = 1;
186 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
187 &mptcp_dbg_level, 0, "MPTCP debug level");
188
189 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
190 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
191
192
193 static int mptcp_alternate_port = 0;
194 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
195 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
196
197 static struct protosw mptcp_subflow_protosw;
198 static struct pr_usrreqs mptcp_subflow_usrreqs;
199 #if INET6
200 static struct ip6protosw mptcp_subflow_protosw6;
201 static struct pr_usrreqs mptcp_subflow_usrreqs6;
202 #endif /* INET6 */
203
204 static uint8_t mptcp_create_subflows_scheduled;
205
206 typedef struct mptcp_subflow_event_entry {
207 uint64_t sofilt_hint_mask;
208 ev_ret_t (*sofilt_hint_ev_hdlr)(
209 struct mptses *mpte,
210 struct mptsub *mpts,
211 uint64_t *p_mpsofilt_hint,
212 uint64_t event);
213 } mptsub_ev_entry_t;
214
215 static uint8_t mptcp_cellicon_is_set;
216 static uint32_t mptcp_last_cellicon_set;
217 #define MPTCP_CELLICON_TOGGLE_RATE (5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
218
219 /*
220 * XXX The order of the event handlers below is really
221 * really important. Think twice before changing it.
222 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
	/*
	 * Table position == dispatch priority (see warning above):
	 * destructive/terminal events (can't-receive, failover, reset)
	 * are processed before connectivity/status updates.
	 */
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
277
278 os_log_t mptcp_log_handle;
279
280 /*
281 * Protocol pr_init callback.
282 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	/*
	 * Clone the IPv4 TCP protosw and usrreqs, then override only the
	 * entry points MPTCP must intercept for its subflows (soreceive,
	 * sosend, rcvoob); everything else is inherited from TCP proper.
	 */
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	/* Detach the clone from TCP's protosw list linkage. */
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	/* Same cloning dance for the IPv6 flavor of TCP. */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	/* Global MPTCP PCB info: zone, lock, and GC/timer callbacks. */
	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	/* Zones for subflows, recorded socket options, and auth entries. */
	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	/* Baseline for the cell-icon toggle rate-limiter. */
	mptcp_last_cellicon_set = tcp_now;

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
405
406 int
407 mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
408 {
409 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
410
411 int i, index = -1;
412
413 if (ifp == NULL) {
414 mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
415 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
416 return (-1);
417 }
418
419 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
420 if (stats[i].ifindex == IFSCOPE_NONE) {
421 if (index < 0)
422 index = i;
423 continue;
424 }
425
426 if (stats[i].ifindex == ifp->if_index) {
427 index = i;
428 return (index);
429 }
430 }
431
432 if (index != -1) {
433 stats[index].ifindex = ifp->if_index;
434 if (stats[index].is_expensive == 0)
435 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
436 }
437
438 return (index);
439 }
440
441 void
442 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
443 {
444 int index;
445
446 tcpstat.tcps_mp_switches++;
447 mpte->mpte_subflow_switches++;
448
449 index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
450
451 if (index != -1)
452 mpte->mpte_itfstats[index].switches++;
453 }
454
455 /*
456 * Flushes all recorded socket options from an MP socket.
457 */
458 static void
459 mptcp_flush_sopts(struct mptses *mpte)
460 {
461 struct mptopt *mpo, *tmpo;
462
463 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
464 mptcp_sopt_remove(mpte, mpo);
465 mptcp_sopt_free(mpo);
466 }
467 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
468 }
469
470 /*
471 * Create an MPTCP session, called as a result of opening a MPTCP socket.
472 */
int
mptcp_sescreate(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/*
	 * The session (mptses) and PCB (mptcb) live in the same allocation
	 * as the mppcb (struct mpp_mtp); the casts below merely locate the
	 * embedded sub-structures.
	 */
	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	/*
	 * Start with the statically embedded interface-info array; it is
	 * replaced by a heap allocation later if it needs to grow (see
	 * mptcp_session_destroy for the matching free).
	 */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* sysctl override: connect subflows to an alternate port */
	if (mptcp_alternate_port)
		mpte->mpte_alternate_port = htons(mptcp_alternate_port);

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return (0);
}
514
515 static void
516 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
517 uint64_t *cellbytes, uint64_t *allbytes)
518 {
519 int64_t mycellbytes = 0;
520 uint64_t myallbytes = 0;
521 int i;
522
523 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
524 if (mpte->mpte_itfstats[i].is_expensive) {
525 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
526 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
527 }
528
529 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
530 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
531 }
532
533 if (initial_cell) {
534 mycellbytes -= mpte->mpte_init_txbytes;
535 mycellbytes -= mpte->mpte_init_txbytes;
536 }
537
538 if (mycellbytes < 0) {
539 mptcplog((LOG_ERR, "%s cellbytes is %d\n", __func__, mycellbytes),
540 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
541 *cellbytes = 0;
542 *allbytes = 0;
543 } else {
544 *cellbytes = mycellbytes;
545 *allbytes = myallbytes;
546 }
547 }
548
549 static void
550 mptcpstats_session_wrapup(struct mptses *mpte)
551 {
552 boolean_t cell = mpte->mpte_initial_cell;
553
554 switch (mpte->mpte_svctype) {
555 case MPTCP_SVCTYPE_HANDOVER:
556 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
557 tcpstat.tcps_mptcp_fp_handover_attempt++;
558
559 if (cell && mpte->mpte_handshake_success) {
560 tcpstat.tcps_mptcp_fp_handover_success_cell++;
561
562 if (mpte->mpte_used_wifi)
563 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
564 } else if (mpte->mpte_handshake_success) {
565 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
566
567 if (mpte->mpte_used_cell)
568 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
569 }
570 } else {
571 tcpstat.tcps_mptcp_handover_attempt++;
572
573 if (cell && mpte->mpte_handshake_success) {
574 tcpstat.tcps_mptcp_handover_success_cell++;
575
576 if (mpte->mpte_used_wifi)
577 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
578 } else if (mpte->mpte_handshake_success) {
579 tcpstat.tcps_mptcp_handover_success_wifi++;
580
581 if (mpte->mpte_used_cell)
582 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
583 }
584 }
585
586 if (mpte->mpte_handshake_success) {
587 uint64_t cellbytes;
588 uint64_t allbytes;
589
590 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
591
592 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
593 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
594 }
595 break;
596 case MPTCP_SVCTYPE_INTERACTIVE:
597 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
598 tcpstat.tcps_mptcp_fp_interactive_attempt++;
599
600 if (mpte->mpte_handshake_success) {
601 tcpstat.tcps_mptcp_fp_interactive_success++;
602
603 if (!cell && mpte->mpte_used_cell)
604 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
605 }
606 } else {
607 tcpstat.tcps_mptcp_interactive_attempt++;
608
609 if (mpte->mpte_handshake_success) {
610 tcpstat.tcps_mptcp_interactive_success++;
611
612 if (!cell && mpte->mpte_used_cell)
613 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
614 }
615 }
616
617 if (mpte->mpte_handshake_success) {
618 uint64_t cellbytes;
619 uint64_t allbytes;
620
621 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
622
623 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
624 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
625 }
626 break;
627 case MPTCP_SVCTYPE_AGGREGATE:
628 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
629 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
630
631 if (mpte->mpte_handshake_success)
632 tcpstat.tcps_mptcp_fp_aggregate_success++;
633 } else {
634 tcpstat.tcps_mptcp_aggregate_attempt++;
635
636 if (mpte->mpte_handshake_success) {
637 tcpstat.tcps_mptcp_aggregate_success++;
638 }
639 }
640
641 if (mpte->mpte_handshake_success) {
642 uint64_t cellbytes;
643 uint64_t allbytes;
644
645 mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
646
647 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
648 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
649 }
650 break;
651 }
652
653 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
654 tcpstat.tcps_mptcp_back_to_wifi++;
655
656 if (mpte->mpte_triggered_cell)
657 tcpstat.tcps_mptcp_triggered_cell++;
658 }
659
660 /*
661 * Destroy an MPTCP session.
662 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	/* Record final per-service-type statistics before teardown. */
	mptcpstats_session_wrapup(mpte);

	/* Drop this session's claim on the cellular-usage indicator. */
	mptcp_unset_cellicon();

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	/* All subflows must already have been torn down at this point. */
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	/* Only free itfinfo if it outgrew the embedded array. */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
		_FREE(mpte->mpte_itfinfo, M_TEMP);

	mpte->mpte_itfinfo = NULL;

	/* Release any mbufs still queued for reinjection. */
	m_freem_list(mpte->mpte_reinjectq);

	/*
	 * MPTCP Protocol Control Block section
	 */
	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}
696
697 static boolean_t
698 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
699 {
700 return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
701 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
702 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
703 }
704
/*
 * Synthesize an IPv6 address from a NAT64 prefix and an IPv4 address,
 * following the RFC 6052 embedding layout: the IPv4 octets are placed
 * after the prefix, skipping byte 8 (bits 64-71 are reserved and must
 * be zero), hence the split copies around offset 9 for the shorter
 * prefix lengths. The result is written in place into *addr (which on
 * entry holds the prefix). Returns 0 on success, -1 if addrv4 is in a
 * range that must not be translated.
 */
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00},
	};
	char buf[MAX_IPv6_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	char *ptr = (char *)addr;

	/* Non-global IPv4 ranges are never eligible for translation. */
	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return (-1);
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		/* RFC 6052 §3.1: private ranges may not use 64:ff9b::/96 */
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) // 100.64.0.0/10 Shared Address Space
			return (-1);
	}

	/* Embed the IPv4 octets at the RFC 6052 offset for this prefix. */
	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u\n", len);
	}

	os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
	    __func__, len,
	    inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));

	return (0);
}
767
/*
 * Ask the baseband (via NECP) to bring up the cellular interface on
 * behalf of this connection. On success, mark the session so that the
 * assertion is accounted for (and can be released later).
 */
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		/*
		 * NECP may block/call out; drop the MPTCP lock across the
		 * call and retake it afterwards. State may have changed in
		 * between - callers must tolerate that.
		 */
		mpte_unlock(mpte);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		mpte_lock(mpte);

		if (err == 0)
			mpte->mpte_triggered_cell = 1;

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__);
	}
}
792
793
/*
 * Walk the known usable interfaces and create a subflow on every one
 * that doesn't already carry a live subflow for this session. Handles
 * handover-mode policy (only add a cellular subflow when WiFi looks
 * bad), NAT64 destination synthesis for v4-only destinations on
 * v6-only interfaces, and asks for a cellular bring-up when no viable
 * cellular interface was seen but one is wanted.
 * Called with the MPTCP socket lock held (mpte_lock) - the TAILQ walks
 * and mpte field accesses below rely on it.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp))
		return;

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;
		int found = 0;

		info = &mpte->mpte_itfinfo[i];

		if (info->no_mptcp_support)
			continue;

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE)
			continue;

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL)
			continue;

		if (IFNET_IS_CELLULAR(ifp))
			cellular_viable = TRUE;

		/* Does a usable subflow already exist on this interface? */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

			if (subifp == NULL)
				continue;

			/*
			 * In Handover mode, only create cell subflow if
			 * 1. Wi-Fi Assist is active
			 * 2. Symptoms marked WiFi as weak
			 * 3. We are experiencing RTOs or we are not sending data.
			 *
			 * This covers the scenario, where:
			 * 1. We send and get retransmission timeouts (thus,
			 *    we confirmed that WiFi is indeed bad).
			 * 2. We are not sending and the server tries to send.
			 *    Establishing a cell-subflow gives the server a
			 *    chance to send us some data over cell if WiFi
			 *    is dead. We establish the subflow with the
			 *    backup-bit set, so the server is not allowed to
			 *    send on this subflow as long as WiFi is providing
			 *    good performance.
			 */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
			    !IFNET_IS_CELLULAR(subifp) &&
			    !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
			    (mptcp_is_wifi_unusable(mpte) == 0 ||
			     (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 &&
			      ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) {
				os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n",
				    __func__, mptcp_is_wifi_unusable(mpte),
				    sototcpcb(mpts->mpts_socket)->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index);
				found = 1;

				/* We found a proper subflow on WiFi - no need for cell */
				want_cellular = FALSE;
				break;
			} else {
				os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n",
				    __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
				    mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc);

			}

			if (subifp->if_index == ifindex &&
			    !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
			    sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = 1;
				break;
			}
		}

		/*
		 * Third-party apps without the entitlement must get
		 * Symptoms' blessing before we add subflows for them.
		 */
		if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		if (!found) {
			struct sockaddr *dst = &mpte->mpte_dst;
			struct sockaddr_in6 nat64pre;

			/*
			 * v4 destination but this interface only has NAT64
			 * connectivity: synthesize a v6 destination from the
			 * interface's NAT64 prefix.
			 */
			if (mpte->mpte_dst.sa_family == AF_INET &&
			    !info->has_v4_conn && info->has_nat64_conn) {
				struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
				int error, j;

				bzero(&nat64pre, sizeof(struct sockaddr_in6));

				error = ifnet_get_nat64prefix(ifp, nat64prefixes);
				if (error) {
					os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n",
						     __func__, ifp->if_name, error);
					continue;
				}

				/* Use the first non-empty prefix entry. */
				for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
					if (nat64prefixes[j].prefix_len != 0)
						break;
				}

				VERIFY(j < NAT64_MAX_NUM_PREFIXES);

				error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
				    nat64prefixes[j].prefix_len,
				    &mpte->__mpte_dst_v4.sin_addr);
				if (error != 0) {
					os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n",
					    __func__);
					continue;
				}

				memcpy(&nat64pre.sin6_addr,
				    &nat64prefixes[j].ipv6_prefix,
				    sizeof(nat64pre.sin6_addr));
				nat64pre.sin6_len = sizeof(struct sockaddr_in6);
				nat64pre.sin6_family = AF_INET6;
				nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
				nat64pre.sin6_flowinfo = 0;
				nat64pre.sin6_scope_id = 0;

				dst = (struct sockaddr *)&nat64pre;
			}

			/* Initial subflow started on a NAT64'd address? */
			if (mpte->mpte_dst.sa_family == AF_INET6 &&
			    mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
				dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
			}

			if (dst->sa_family == AF_INET && !info->has_v4_conn)
				continue;
			if (dst->sa_family == AF_INET6 && !info->has_v6_conn)
				continue;

			mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
		}
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
964
965 /*
966 * Based on the MPTCP Service-type and the state of the subflows, we
967 * will destroy subflows here.
968 */
969 static void
970 mptcp_check_subflows_and_remove(struct mptses *mpte)
971 {
972 struct mptsub *mpts, *tmpts;
973 int found_working_subflow = 0, removed_some = 0;
974 int wifi_unusable = mptcp_is_wifi_unusable(mpte);
975
976 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
977 return;
978
979 /*
980 * Look for a subflow that is on a non-cellular interface
981 * and actually works (aka, no retransmission timeout).
982 */
983 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
984 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
985 struct socket *so;
986 struct tcpcb *tp;
987
988 if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
989 continue;
990
991 so = mpts->mpts_socket;
992 tp = sototcpcb(so);
993
994 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
995 tp->t_state != TCPS_ESTABLISHED)
996 continue;
997
998 /* Is this subflow in good condition? */
999 if (tp->t_rxtshift == 0)
1000 found_working_subflow = 1;
1001
1002 /* Or WiFi is fine */
1003 if (!wifi_unusable)
1004 found_working_subflow = 1;
1005 }
1006
1007 /*
1008 * Couldn't find a working subflow, let's not remove those on a cellular
1009 * interface.
1010 */
1011 if (!found_working_subflow)
1012 return;
1013
1014 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1015 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1016
1017 /* Only remove cellular subflows */
1018 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp))
1019 continue;
1020
1021 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1022 removed_some = 1;
1023 }
1024
1025 if (removed_some)
1026 mptcp_unset_cellicon();
1027 }
1028
1029 static void
1030 mptcp_remove_subflows(struct mptses *mpte)
1031 {
1032 struct mptsub *mpts, *tmpts;
1033
1034 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1035 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1036 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1037
1038 soevent(mpts->mpts_socket,
1039 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1040 }
1041 }
1042 }
1043
/*
 * Timer callback (scheduled from mptcp_sched_create_subflows via timeout())
 * that walks all MPTCP connections and runs subflow creation/removal for
 * those marked with MPP_CREATE_SUBFLOWS.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
		mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
			 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		/* Unflagged connections don't need any work */
		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
			continue;

		mpp_lock(mpp);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		/* The scheduler took a use-count on our behalf; it must still be held */
		VERIFY(mp_so->so_usecount > 0);

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1086
1087 /*
1088 * We need this because we are coming from an NECP-event. This event gets posted
1089 * while holding NECP-locks. The creation of the subflow however leads us back
1090 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1091 * So, we would deadlock there as we already hold the NECP-lock.
1092 *
1093 * So, let's schedule this separately. It also gives NECP the chance to make
1094 * progress, without having to wait for MPTCP to finish its subflow creation.
1095 */
1096 void
1097 mptcp_sched_create_subflows(struct mptses *mpte)
1098 {
1099 struct mppcb *mpp = mpte->mpte_mppcb;
1100 struct mptcb *mp_tp = mpte->mpte_mptcb;
1101 struct socket *mp_so = mpp->mpp_socket;
1102
1103 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1104 mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
1105 __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
1106 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1107 return;
1108 }
1109
1110 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1111 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1112 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1113 }
1114
1115 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled))
1116 return;
1117
1118 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1119 timeout(mptcp_create_subflows, NULL, hz/10);
1120 }
1121
1122 /*
1123 * Allocate an MPTCP socket option structure.
1124 */
1125 struct mptopt *
1126 mptcp_sopt_alloc(int how)
1127 {
1128 struct mptopt *mpo;
1129
1130 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
1131 zalloc_noblock(mptopt_zone);
1132 if (mpo != NULL) {
1133 bzero(mpo, mptopt_zone_size);
1134 }
1135
1136 return (mpo);
1137 }
1138
1139 /*
1140 * Free an MPTCP socket option structure.
1141 */
1142 void
1143 mptcp_sopt_free(struct mptopt *mpo)
1144 {
1145 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1146
1147 zfree(mptopt_zone, mpo);
1148 }
1149
1150 /*
1151 * Add a socket option to the MPTCP socket option list.
1152 */
1153 void
1154 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1155 {
1156 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1157 mpo->mpo_flags |= MPOF_ATTACHED;
1158 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1159 }
1160
1161 /*
1162 * Remove a socket option from the MPTCP socket option list.
1163 */
1164 void
1165 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1166 {
1167 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1168 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1169 mpo->mpo_flags &= ~MPOF_ATTACHED;
1170 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1171 }
1172
1173 /*
1174 * Search for an existing <sopt_level,sopt_name> socket option.
1175 */
1176 struct mptopt *
1177 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1178 {
1179 struct mptopt *mpo;
1180
1181 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1182
1183 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1184 if (mpo->mpo_level == sopt->sopt_level &&
1185 mpo->mpo_name == sopt->sopt_name)
1186 break;
1187 }
1188 return (mpo);
1189 }
1190
1191 /*
1192 * Allocate a MPTCP subflow structure.
1193 */
1194 static struct mptsub *
1195 mptcp_subflow_alloc(void)
1196 {
1197 struct mptsub *mpts = zalloc(mptsub_zone);
1198
1199 if (mpts == NULL)
1200 return (NULL);
1201
1202 bzero(mpts, mptsub_zone_size);
1203 return (mpts);
1204 }
1205
1206 /*
1207 * Deallocate a subflow structure, called when all of the references held
1208 * on it have been released. This implies that the subflow has been deleted.
1209 */
1210 static void
1211 mptcp_subflow_free(struct mptsub *mpts)
1212 {
1213 VERIFY(mpts->mpts_refcnt == 0);
1214 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
1215 VERIFY(mpts->mpts_mpte == NULL);
1216 VERIFY(mpts->mpts_socket == NULL);
1217
1218 if (mpts->mpts_src != NULL) {
1219 FREE(mpts->mpts_src, M_SONAME);
1220 mpts->mpts_src = NULL;
1221 }
1222
1223 zfree(mptsub_zone, mpts);
1224 }
1225
1226 static void
1227 mptcp_subflow_addref(struct mptsub *mpts)
1228 {
1229 if (++mpts->mpts_refcnt == 0)
1230 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
1231 /* NOTREACHED */
1232 }
1233
/*
 * Drop a reference on a subflow structure; the last reference frees it.
 * Panics on reference-count underflow.
 */
static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0)
		return;

	/* Last reference is gone; hand the structure to mptcp_subflow_free */
	mptcp_subflow_free(mpts);
}
1247
/*
 * Link a freshly-created subflow socket 'so' to the MPTCP session 'mpte':
 * mark it as an MP subflow, insert it into the session's subflow list and
 * wire up the mutual back-pointers.  Takes two references on the subflow
 * (list membership and socket association) and one use-count on the MP
 * socket.
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);	/* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);	/* for subflow socket */
}
1277
/*
 * NECP callback for subflow sockets.  When NECP declares the flow
 * non-viable (or the interface entered low-power mode), flag the subflow
 * for closure and schedule the creation of replacement subflows.  For the
 * handover service-type, report back to NECP that the MPTCP connection as
 * a whole remains viable.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
		      __unused uint32_t interface_index,
		      uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* Low-power is treated the same as a non-viable flow */
	if (low_power)
		action = NECP_CLIENT_CBACTION_NONVIABLE;

	if (action != NECP_CLIENT_CBACTION_NONVIABLE)
		return;

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (so->so_usecount == 0)
		return;

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (so->so_usecount == 0)
		goto out;

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u",
		     __func__, mpts->mpts_ifscope, low_power);

	/* Actual teardown happens later in mptcp_remove_subflows() */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL)
		*viable = 1;

out:
	socket_unlock(so, 1);
}
1324
1325 /*
1326 * Create an MPTCP subflow socket.
1327 */
1328 static int
1329 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1330 struct socket **so)
1331 {
1332 lck_mtx_t *subflow_mtx;
1333 struct mptopt smpo, *mpo, *tmpo;
1334 struct proc *p;
1335 struct socket *mp_so;
1336 int error;
1337
1338 *so = NULL;
1339 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1340 mp_so = mptetoso(mpte);
1341
1342 p = proc_find(mp_so->last_pid);
1343 if (p == PROC_NULL) {
1344 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1345 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1346
1347 return (ESRCH);
1348 }
1349
1350 /*
1351 * Create the subflow socket (multipath subflow, non-blocking.)
1352 *
1353 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1354 * socket; it will be cleared when the socket is peeled off or closed.
1355 * It also indicates to the underlying TCP to handle MPTCP options.
1356 * A multipath subflow socket implies SS_NOFDREF state.
1357 */
1358
1359 /*
1360 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1361 * the ipi-lock. We cannot hold the socket-lock at that point.
1362 */
1363 mpte_unlock(mpte);
1364 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1365 SOCF_ASYNC, PROC_NULL);
1366 mpte_lock(mpte);
1367 if (error) {
1368 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
1369 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
1370 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1371
1372 proc_rele(p);
1373
1374 mptcp_subflow_free(mpts);
1375 return (error);
1376 }
1377
1378 /*
1379 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1380 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1381 * Which is why we also need to get the lock with pr_getlock, as after
1382 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1383 */
1384 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1385 lck_mtx_lock(subflow_mtx);
1386
1387 /*
1388 * Must be the first thing we do, to make sure all pointers for this
1389 * subflow are set.
1390 */
1391 mptcp_subflow_attach(mpte, mpts, *so);
1392
1393 /*
1394 * A multipath subflow socket is used internally in the kernel,
1395 * therefore it does not have a file desciptor associated by
1396 * default.
1397 */
1398 (*so)->so_state |= SS_NOFDREF;
1399
1400 lck_mtx_unlock(subflow_mtx);
1401
1402 /* prevent the socket buffers from being compressed */
1403 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1404 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1405
1406 /* Inherit preconnect and TFO data flags */
1407 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
1408 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1409 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
1410 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1411
1412 /* Inherit uuid and create the related flow. */
1413 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1414 struct mptcb *mp_tp = mpte->mpte_mptcb;
1415
1416 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1417
1418 /*
1419 * A note on the unlock: With MPTCP, we do multiple times a
1420 * necp_client_register_socket_flow. This is problematic,
1421 * because now the lock-ordering guarantee (first necp-locks,
1422 * then socket-locks) is no more respected. So, we need to
1423 * unlock here.
1424 */
1425 mpte_unlock(mpte);
1426 error = necp_client_register_socket_flow(mp_so->last_pid,
1427 mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
1428 mpte_lock(mpte);
1429
1430 if (error)
1431 goto out_err;
1432
1433 /* Possible state-change during the unlock above */
1434 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1435 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
1436 goto out_err;
1437
1438 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
1439 } else {
1440 mptcplog((LOG_NOTICE, "%s: uuid is not set!\n"),
1441 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1442 }
1443
1444 /* inherit the other socket options */
1445 bzero(&smpo, sizeof (smpo));
1446 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1447 smpo.mpo_level = SOL_SOCKET;
1448 smpo.mpo_intval = 1;
1449
1450 /* disable SIGPIPE */
1451 smpo.mpo_name = SO_NOSIGPIPE;
1452 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1453 goto out_err;
1454
1455 /* find out if the subflow's source address goes away */
1456 smpo.mpo_name = SO_NOADDRERR;
1457 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1458 goto out_err;
1459
1460 /* enable keepalive */
1461 smpo.mpo_name = SO_KEEPALIVE;
1462 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1463 goto out_err;
1464
1465 smpo.mpo_level = IPPROTO_TCP;
1466 smpo.mpo_intval = mptcp_subflow_keeptime;
1467 smpo.mpo_name = TCP_KEEPALIVE;
1468 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1469 goto out_err;
1470
1471 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1472 /*
1473 * On secondary subflows we might need to set the cell-fallback
1474 * flag (see conditions in mptcp_subflow_sosetopt).
1475 */
1476 smpo.mpo_level = SOL_SOCKET;
1477 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1478 smpo.mpo_intval = 1;
1479 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
1480 goto out_err;
1481 }
1482
1483 /* replay setsockopt(2) on the subflow sockets for eligible options */
1484 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1485 int interim;
1486
1487 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
1488 continue;
1489
1490 /*
1491 * Skip those that are handled internally; these options
1492 * should not have been recorded and marked with the
1493 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1494 */
1495 if (mpo->mpo_level == SOL_SOCKET &&
1496 (mpo->mpo_name == SO_NOSIGPIPE ||
1497 mpo->mpo_name == SO_NOADDRERR ||
1498 mpo->mpo_name == SO_KEEPALIVE))
1499 continue;
1500
1501 interim = (mpo->mpo_flags & MPOF_INTERIM);
1502 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1503 mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
1504 " sopt %s val %d interim record removed\n", __func__,
1505 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1506 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1507 mpo->mpo_intval),
1508 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1509 mptcp_sopt_remove(mpte, mpo);
1510 mptcp_sopt_free(mpo);
1511 continue;
1512 }
1513 }
1514
1515 /*
1516 * We need to receive everything that the subflow socket has,
1517 * so use a customized socket receive function. We will undo
1518 * this when the socket is peeled off or closed.
1519 */
1520 switch (dom) {
1521 case PF_INET:
1522 (*so)->so_proto = &mptcp_subflow_protosw;
1523 break;
1524 #if INET6
1525 case PF_INET6:
1526 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1527 break;
1528 #endif /* INET6 */
1529 default:
1530 VERIFY(0);
1531 /* NOTREACHED */
1532 }
1533
1534 proc_rele(p);
1535
1536 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1537 int, dom, int, error);
1538
1539 return (0);
1540
1541 out_err:
1542 mptcp_subflow_abort(mpts, error);
1543
1544 proc_rele(p);
1545
1546 mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
1547 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1548
1549 return (error);
1550 }
1551
1552 /*
1553 * Close an MPTCP subflow socket.
1554 *
1555 * Note that this may be called on an embryonic subflow, and the only
1556 * thing that is guaranteed valid is the protocol-user request.
1557 */
1558 static void
1559 mptcp_subflow_soclose(struct mptsub *mpts)
1560 {
1561 struct socket *so = mpts->mpts_socket;
1562
1563 if (mpts->mpts_flags & MPTSF_CLOSED)
1564 return;
1565
1566 VERIFY(so != NULL);
1567 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1568 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1569
1570 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1571 struct socket *, so,
1572 struct sockbuf *, &so->so_rcv,
1573 struct sockbuf *, &so->so_snd,
1574 struct mptses *, mpts->mpts_mpte);
1575
1576 mpts->mpts_flags |= MPTSF_CLOSED;
1577
1578 if (so->so_retaincnt == 0) {
1579 soclose_locked(so);
1580
1581 return;
1582 } else {
1583 VERIFY(so->so_usecount > 0);
1584 so->so_usecount--;
1585 }
1586
1587 return;
1588 }
1589
1590 /*
1591 * Connect an MPTCP subflow socket.
1592 *
1593 * Note that in the pending connect case, the subflow socket may have been
1594 * bound to an interface and/or a source IP address which may no longer be
1595 * around by the time this routine is called; in that case the connect attempt
1596 * will most likely fail.
1597 */
1598 static int
1599 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1600 {
1601 char dbuf[MAX_IPv6_STR_LEN];
1602 struct socket *mp_so, *so;
1603 struct mptcb *mp_tp;
1604 struct sockaddr *dst;
1605 struct proc *p;
1606 int af, error, dport;
1607
1608 mp_so = mptetoso(mpte);
1609 mp_tp = mpte->mpte_mptcb;
1610 so = mpts->mpts_socket;
1611 af = mpts->mpts_dst.sa_family;
1612 dst = &mpts->mpts_dst;
1613
1614 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1615 VERIFY(mpts->mpts_socket != NULL);
1616 VERIFY(af == AF_INET || af == AF_INET6);
1617
1618 if (af == AF_INET) {
1619 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof (dbuf));
1620 dport = ntohs(SIN(dst)->sin_port);
1621 } else {
1622 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof (dbuf));
1623 dport = ntohs(SIN6(dst)->sin6_port);
1624 }
1625
1626 os_log_info(mptcp_log_handle,
1627 "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
1628 dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1629
1630 p = proc_find(mp_so->last_pid);
1631 if (p == PROC_NULL) {
1632 mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
1633 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1634
1635 return (ESRCH);
1636 }
1637
1638 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1639
1640 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1641
1642 /* connect the subflow socket */
1643 error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1644 p, mpts->mpts_ifscope,
1645 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1646
1647 mpts->mpts_iss = sototcpcb(so)->iss;
1648
1649 /* See tcp_connect_complete */
1650 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1651 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1652 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1653 }
1654
1655 /* Allocate a unique address id per subflow */
1656 mpte->mpte_addrid_last++;
1657 if (mpte->mpte_addrid_last == 0)
1658 mpte->mpte_addrid_last++;
1659
1660 proc_rele(p);
1661
1662 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1663 struct mptsub *, mpts, int, error);
1664 if (error)
1665 mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
1666 __func__, error, mpts->mpts_ifscope),
1667 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1668
1669 return (error);
1670 }
1671
1672 /*
1673 * MPTCP subflow socket receive routine, derived from soreceive().
1674 */
1675 static int
1676 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1677 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1678 {
1679 #pragma unused(uio)
1680 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1681 int flags, error = 0;
1682 struct proc *p = current_proc();
1683 struct mbuf *m, **mp = mp0;
1684 boolean_t proc_held = FALSE;
1685
1686 mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
1687 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1688
1689 #ifdef MORE_LOCKING_DEBUG
1690 if (so->so_usecount == 1) {
1691 panic("%s: so=%x no other reference on socket\n", __func__, so);
1692 /* NOTREACHED */
1693 }
1694 #endif
1695 /*
1696 * We return all that is there in the subflow's socket receive buffer
1697 * to the MPTCP layer, so we require that the caller passes in the
1698 * expected parameters.
1699 */
1700 if (mp == NULL || controlp != NULL)
1701 return (EINVAL);
1702
1703 *mp = NULL;
1704 if (psa != NULL)
1705 *psa = NULL;
1706 if (flagsp != NULL)
1707 flags = *flagsp &~ MSG_EOR;
1708 else
1709 flags = 0;
1710
1711 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
1712 return (EOPNOTSUPP);
1713
1714 flags |= (MSG_DONTWAIT|MSG_NBIO);
1715
1716 /*
1717 * If a recv attempt is made on a previously-accepted socket
1718 * that has been marked as inactive (disconnected), reject
1719 * the request.
1720 */
1721 if (so->so_flags & SOF_DEFUNCT) {
1722 struct sockbuf *sb = &so->so_rcv;
1723
1724 error = ENOTCONN;
1725 /*
1726 * This socket should have been disconnected and flushed
1727 * prior to being returned from sodefunct(); there should
1728 * be no data on its receive list, so panic otherwise.
1729 */
1730 if (so->so_state & SS_DEFUNCT)
1731 sb_empty_assert(sb, __func__);
1732 return (error);
1733 }
1734
1735 /*
1736 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1737 * and if so just return to the caller. This could happen when
1738 * soreceive() is called by a socket upcall function during the
1739 * time the socket is freed. The socket buffer would have been
1740 * locked across the upcall, therefore we cannot put this thread
1741 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1742 * we may livelock), because the lock on the socket buffer will
1743 * only be released when the upcall routine returns to its caller.
1744 * Because the socket has been officially closed, there can be
1745 * no further read on it.
1746 *
1747 * A multipath subflow socket would have its SS_NOFDREF set by
1748 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1749 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1750 */
1751 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1752 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
1753 return (0);
1754
1755 /*
1756 * For consistency with soreceive() semantics, we need to obey
1757 * SB_LOCK in case some other code path has locked the buffer.
1758 */
1759 error = sblock(&so->so_rcv, 0);
1760 if (error != 0)
1761 return (error);
1762
1763 m = so->so_rcv.sb_mb;
1764 if (m == NULL) {
1765 /*
1766 * Panic if we notice inconsistencies in the socket's
1767 * receive list; both sb_mb and sb_cc should correctly
1768 * reflect the contents of the list, otherwise we may
1769 * end up with false positives during select() or poll()
1770 * which could put the application in a bad state.
1771 */
1772 SB_MB_CHECK(&so->so_rcv);
1773
1774 if (so->so_error != 0) {
1775 error = so->so_error;
1776 so->so_error = 0;
1777 goto release;
1778 }
1779
1780 if (so->so_state & SS_CANTRCVMORE) {
1781 goto release;
1782 }
1783
1784 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1785 error = ENOTCONN;
1786 goto release;
1787 }
1788
1789 /*
1790 * MSG_DONTWAIT is implicitly defined and this routine will
1791 * never block, so return EWOULDBLOCK when there is nothing.
1792 */
1793 error = EWOULDBLOCK;
1794 goto release;
1795 }
1796
1797 mptcp_update_last_owner(so, mp_so);
1798
1799 if (mp_so->last_pid != proc_pid(p)) {
1800 p = proc_find(mp_so->last_pid);
1801 if (p == PROC_NULL) {
1802 p = current_proc();
1803 } else {
1804 proc_held = TRUE;
1805 }
1806 }
1807
1808 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1809 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1810 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1811
1812 while (m != NULL) {
1813 int dlen = 0, dfin = 0, error_out = 0;
1814 struct mbuf *start = m;
1815 uint64_t dsn;
1816 uint32_t sseq;
1817 uint16_t orig_dlen;
1818 uint16_t csum;
1819
1820 VERIFY(m->m_nextpkt == NULL);
1821
1822 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1823 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
1824 dsn = m->m_pkthdr.mp_dsn;
1825 sseq = m->m_pkthdr.mp_rseq;
1826 csum = m->m_pkthdr.mp_csum;
1827 } else {
1828 /* We did fallback */
1829 mptcp_adj_rmap(so, m, 0, 0, 0, 0);
1830
1831 sbfree(&so->so_rcv, m);
1832
1833 if (mp != NULL) {
1834 *mp = m;
1835 mp = &m->m_next;
1836 so->so_rcv.sb_mb = m = m->m_next;
1837 *mp = NULL;
1838
1839 }
1840
1841 if (m != NULL) {
1842 so->so_rcv.sb_lastrecord = m;
1843 } else {
1844 SB_EMPTY_FIXUP(&so->so_rcv);
1845 }
1846
1847 continue;
1848 }
1849
1850 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)
1851 dfin = 1;
1852
1853 /*
1854 * Check if the full mapping is now present
1855 */
1856 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
1857 mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
1858 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
1859 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1860
1861 if (*mp0 == NULL)
1862 error = EWOULDBLOCK;
1863 goto release;
1864 }
1865
1866 /* Now, get the full mapping */
1867 while (dlen > 0) {
1868 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
1869 error_out = 1;
1870 error = EIO;
1871 dlen = 0;
1872 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1873 break;
1874 }
1875
1876 dlen -= m->m_len;
1877 sbfree(&so->so_rcv, m);
1878
1879 if (mp != NULL) {
1880 *mp = m;
1881 mp = &m->m_next;
1882 so->so_rcv.sb_mb = m = m->m_next;
1883 *mp = NULL;
1884 }
1885
1886 if (dlen - dfin == 0)
1887 dlen = 0;
1888
1889 VERIFY(dlen <= 0 || m);
1890 }
1891
1892 VERIFY(dlen == 0);
1893
1894 if (m != NULL) {
1895 so->so_rcv.sb_lastrecord = m;
1896 } else {
1897 SB_EMPTY_FIXUP(&so->so_rcv);
1898 }
1899
1900 if (error_out)
1901 goto release;
1902
1903
1904 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
1905 error = EIO;
1906 *mp0 = NULL;
1907 goto release;
1908 }
1909
1910 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1911 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1912 }
1913
1914 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1915 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1916
1917 if (flagsp != NULL)
1918 *flagsp |= flags;
1919
1920 release:
1921 sbunlock(&so->so_rcv, TRUE);
1922
1923 if (proc_held)
1924 proc_rele(p);
1925
1926 return (error);
1927
1928 }
1929
1930 /*
1931 * MPTCP subflow socket send routine, derived from sosend().
1932 */
1933 static int
1934 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1935 struct mbuf *top, struct mbuf *control, int flags)
1936 {
1937 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1938 struct proc *p = current_proc();
1939 boolean_t en_tracing = FALSE, proc_held = FALSE;
1940 int en_tracing_val;
1941 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
1942 int error;
1943
1944 VERIFY(control == NULL);
1945 VERIFY(addr == NULL);
1946 VERIFY(uio == NULL);
1947 VERIFY(flags == 0);
1948 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
1949
1950 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
1951 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
1952
1953 /*
1954 * trace if tracing & network (vs. unix) sockets & and
1955 * non-loopback
1956 */
1957 if (ENTR_SHOULDTRACE &&
1958 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1959 struct inpcb *inp = sotoinpcb(so);
1960 if (inp->inp_last_outifp != NULL &&
1961 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1962 en_tracing = TRUE;
1963 en_tracing_val = top->m_pkthdr.len;
1964 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1965 VM_KERNEL_ADDRPERM(so),
1966 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1967 (int64_t)en_tracing_val);
1968 }
1969 }
1970
1971 mptcp_update_last_owner(so, mp_so);
1972
1973 if (mp_so->last_pid != proc_pid(p)) {
1974 p = proc_find(mp_so->last_pid);
1975 if (p == PROC_NULL) {
1976 p = current_proc();
1977 } else {
1978 proc_held = TRUE;
1979 }
1980 }
1981
1982 #if NECP
1983 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
1984 #endif /* NECP */
1985
1986 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1987
1988 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
1989 if (error)
1990 goto out;
1991
1992 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
1993 top = NULL;
1994
1995 out:
1996 if (top != NULL)
1997 m_freem(top);
1998
1999 if (proc_held)
2000 proc_rele(p);
2001
2002 soclearfastopen(so);
2003
2004 if (en_tracing) {
2005 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2006 VM_KERNEL_ADDRPERM(so),
2007 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2008 (int64_t)en_tracing_val);
2009 }
2010
2011 return (error);
2012
2013 }
2014
2015 /*
2016 * Establish an initial MPTCP connection (if first subflow and not yet
2017 * connected), or add a subflow to an existing MPTCP connection.
2018 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	struct socket *mp_so, *so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		error = ENOTCONN;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		error = ENOMEM;
		goto out_err;
	}

	/*
	 * Source address is optional; when given, keep a private copy owned
	 * by the subflow (freed via mptcp_subflow_free on the error path).
	 */
	if (src != NULL) {
		int len = src->sa_len;

		MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
		    M_WAITOK | M_ZERO);
		if (mpts->mpts_src == NULL) {
			mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
			error = ENOMEM;
			goto out_err;
		}
		bcopy(src, mpts->mpts_src, len);
	}

	/* Destination is mandatory and stored inline in the subflow. */
	memcpy(&mpts->mpts_dst, dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return (error);

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY)
		mpte->mpte_connid_last++;

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one. Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA)
			mpts->mpts_flags |= MPTSF_TFO_REQD;
	} else {
		/* Joins must wait until the peer is ready to accept them. */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	if (af == AF_INET || af == AF_INET6) {
		char dbuf[MAX_IPv6_STR_LEN];

		mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
		    "mp_so 0x%llx dst %s[%d] cid %d "
		    "[pending %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
		    (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(&mpts->mpts_dst)->sin_port) :
		    ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
	}

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
		error = mptcp_subflow_soconnectx(mpte, mpts);

	if (error)
		goto out_err_close;

	if (pcid)
		*pcid = mpts->mpts_connid;

	return (0);

out_err_close:
	/* Subflow socket exists by now; abort tears it down via the event path. */
	mptcp_subflow_abort(mpts, error);

	return (error);

out_err:
	/* No socket yet; only the mpts allocation needs to be undone. */
	if (mpts)
		mptcp_subflow_free(mpts);

	return (error);
}
2186
2187 void
2188 mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
2189 {
2190 int index = mptcp_get_statsindex(stats, mpts);
2191
2192 if (index != -1) {
2193 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2194
2195 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2196 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2197 }
2198 }
2199
2200 /*
2201 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
2202 * will no longer be accessible after a subflow is deleted, thus this
2203 * should occur only after the subflow socket has been disconnected.
2204 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
	    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
	    mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
	    mpts->mpts_flags, mp_so->so_error),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	/* Capture final traffic accounting before the subflow goes away. */
	mptcpstats_update(mpte->mpte_itfstats, mpts);
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

	/* Unlink the subflow from the session's list. */
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts)
		mpte->mpte_active_sub = NULL;

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	/* Sever the mpts <-> subflow socket linkage, then drop both refs. */
	mp_so->so_usecount--;		/* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts);		/* for MPTCP subflow list */
	mptcp_subflow_remref(mpts);		/* for subflow socket */

	/* Finally, detach the TCP pcb from the MPTCP machinery. */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2254
2255 void
2256 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2257 {
2258 struct socket *so = mpts->mpts_socket;
2259 struct mptcb *mp_tp = mpte->mpte_mptcb;
2260 int send_dfin = 0;
2261
2262 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2263 send_dfin = 1;
2264
2265 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2266 (so->so_state & SS_ISCONNECTED)) {
2267 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2268 __func__, mpts->mpts_connid, send_dfin),
2269 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2270
2271 if (send_dfin)
2272 mptcp_send_dfin(so);
2273 soshutdownlock(so, SHUT_WR);
2274 }
2275
2276 }
2277
2278 static void
2279 mptcp_subflow_abort(struct mptsub *mpts, int error)
2280 {
2281 struct socket *so = mpts->mpts_socket;
2282 struct tcpcb *tp = sototcpcb(so);
2283
2284 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2285 return;
2286
2287 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2288 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2289
2290 if (tp->t_state != TCPS_CLOSED)
2291 tcp_drop(tp, error);
2292
2293 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2294 }
2295
2296 /*
2297 * Disconnect a subflow socket.
2298 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_socket != NULL);

	/* Idempotent: a disconnect already in progress wins. */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
		return;

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	/* Past CLOSE_WAIT, a connection-level DATA_FIN goes out as well. */
	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
		send_dfin = 1;

	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
		    __func__, mpts->mpts_connid, send_dfin),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		if (send_dfin)
			mptcp_send_dfin(so);
		/* Shut both directions, then disconnect the subflow socket. */
		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}
	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2340
2341 /*
2342 * Called when the associated subflow socket posted a read event.
2343 */
static void
mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
{
#pragma unused(so, waitf)
	struct mptsub *mpts = arg, *tmpts;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	/*
	 * If upcalls must be deferred right now (see
	 * mptcp_should_defer_upcall), just record that a read wakeup is
	 * owed and bail; it is replayed later.
	 */
	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
	/* Drain every subflow, not just the one that posted the event. */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Pin both the subflow and its socket across the input call. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);		/* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
}
2379
2380 /*
2381 * Subflow socket input.
2382 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct mbuf *m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	/* Guard against re-entering the input path. */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	if (!(mpts->mpts_flags & MPTSF_CONNECTED))
		goto out;

	so = mpts->mpts_socket;

	/* Pull whatever the subflow has buffered into mbuf chain m. */
	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		mptcplog((LOG_ERR, "%s: cid %d error %d\n",
		    __func__, mpts->mpts_connid, error),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			wakeup = 1;
			goto out;
		}
	} else if (error == 0) {
		mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
		    __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		/* Track which interface class carried data (cell-icon bookkeeping). */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;

			mpte->mpte_used_cell = 1;
		} else {
			mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;

			mpte->mpte_used_wifi = 1;
		}

		/* Hand the data up to MPTCP-level reassembly. */
		mptcp_input(mpte, m);
	}

	/* notify protocol that we drained all the data */
	if (error == 0 && m != NULL &&
	    (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);

out:
	if (wakeup)
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2456
2457 /*
2458 * Subflow socket write upcall.
2459 *
 * Called when the associated subflow socket posted a write event.
2461 */
2462 static void
2463 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2464 {
2465 #pragma unused(so, waitf)
2466 struct mptsub *mpts = arg;
2467 struct mptses *mpte = mpts->mpts_mpte;
2468
2469 VERIFY(mpte != NULL);
2470
2471 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2472 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
2473 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2474 return;
2475 }
2476
2477 mptcp_output(mpte);
2478 }
2479
2480 static boolean_t
2481 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2482 {
2483 struct mbuf *so_m = so->so_snd.sb_mb;
2484 uint64_t dsn = m->m_pkthdr.mp_dsn;
2485
2486 while (so_m) {
2487 VERIFY(so_m->m_flags & M_PKTHDR);
2488 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2489
2490 /* Part of the segment is covered, don't reinject here */
2491 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2492 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn)
2493 return TRUE;
2494
2495 so_m = so_m->m_next;
2496 }
2497
2498 return FALSE;
2499 }
2500
2501 /*
2502 * Subflow socket output.
2503 *
2504 * Called for sending data from MPTCP to the underlying subflow socket.
2505 */
int
mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
	struct socket *mp_so, *so;
	struct tcpcb *tp;
	uint64_t mpt_dsn = 0, off = 0;
	int sb_cc = 0, error = 0, wakeup = 0;
	uint32_t dss_csum;
	uint16_t tot_sent = 0;
	boolean_t reinjected = FALSE;

	mpte_lock_assert_held(mpte);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;
	tp = sototcpcb(so);

	/* Guard against re-entering the output path. */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;

	VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
	VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
	    (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
	    (mpts->mpts_flags & MPTSF_TFO_REQD));
	VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);

	mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
	    __func__, mpts->mpts_flags, mpte->mpte_flags,
	    mptcp_subflow_cwnd_space(so)),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* Remove Addr Option is not sent reliably as per I-D */
	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
		tp->t_rem_aid = mpte->mpte_lost_aid;
		tp->t_mpflags |= TMPF_SND_REM_ADDR;
		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
	}

	/*
	 * The mbuf chains containing the metadata (as well as pointing to
	 * the user data sitting at the MPTCP output queue) would then be
	 * sent down to the subflow socket.
	 *
	 * Some notes on data sequencing:
	 *
	 *   a. Each mbuf must be a M_PKTHDR.
	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
	 *	in the mbuf pkthdr structure.
	 *   c. Each mbuf containing the MPTCP metadata must have its
	 *	pkt_flags marked with the PKTF_MPTCP flag.
	 */

	/* Reinject queue has priority over fresh data in the send buffer. */
	if (mpte->mpte_reinjectq)
		sb_mb = mpte->mpte_reinjectq;
	else
		sb_mb = mp_so->so_snd.sb_mb;

	if (sb_mb == NULL) {
		mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
		    __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
		    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);

		/* Fix it to prevent looping */
		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		goto out;
	}

	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/* TFO with no payload yet: issue a zero-length write to trigger SYN. */
	if (sb_mb->m_pkthdr.mp_rlen == 0 &&
	    !(so->so_state & SS_ISCONNECTED) &&
	    (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		tp->t_mpflags |= TMPF_TFO_REQUEST;
		goto zero_len_write;
	}

	mpt_dsn = sb_mb->m_pkthdr.mp_dsn;

	/* First, drop acknowledged data */
	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
		mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
		    "dsn %u suna %u reinject? %u\n",
		    __func__, (uint32_t)mpt_dsn,
		    (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		if (mpte->mpte_reinjectq) {
			mptcp_clean_reinjectq(mpte);
		} else {
			uint64_t len = 0;
			len = mp_tp->mpt_snduna - mpt_dsn;
			sbdrop(&mp_so->so_snd, (int)len);
			wakeup = 1;
		}
	}

	/* Check again because of above sbdrop */
	if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
		mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		goto out;
	}

	/*
	 * In degraded mode, we don't receive data acks, so force free
	 * mbufs less than snd_nxt
	 */
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
	    mp_so->so_snd.sb_mb) {
		mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
		if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
			uint64_t len = 0;
			len = mp_tp->mpt_snduna - mpt_dsn;
			sbdrop(&mp_so->so_snd, (int)len);
			wakeup = 1;

			mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
			    __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		}
	}

	/* First output after fallback: mark the one-time sync state. */
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
	}

	/*
	 * Adjust the top level notion of next byte used for retransmissions
	 * and sending FINs.
	 */
	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	/* Now determine the offset from which to start transmitting data */
	if (mpte->mpte_reinjectq)
		sb_mb = mpte->mpte_reinjectq;
	else
		/*
		 * Note: dont_reinject is also the jump target from the
		 * reinject branch below, when the candidate segment is
		 * already covered in this subflow's send buffer.
		 */
dont_reinject:
		sb_mb = mp_so->so_snd.sb_mb;
	if (sb_mb == NULL) {
		mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
		goto out;
	}

	if (sb_mb == mpte->mpte_reinjectq) {
		/* Reinjection: send the whole mapping from its start. */
		sb_cc = sb_mb->m_pkthdr.mp_rlen;
		off = 0;

		if (mptcp_search_seq_in_sub(sb_mb, so)) {
			if (mptcp_can_send_more(mp_tp, TRUE)) {
				goto dont_reinject;
			}

			error = ECANCELED;
			goto out;
		}

		reinjected = TRUE;
	} else if (flags & MPTCP_SUBOUT_PROBING) {
		/* Probe: resend the head mapping as-is. */
		sb_cc = sb_mb->m_pkthdr.mp_rlen;
		off = 0;
	} else {
		/* Regular transmit: bounded by send buffer and send window. */
		sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);

		/*
		 * With TFO, there might be no data at all, thus still go into this
		 * code-path here.
		 */
		if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
		    MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
			off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
			sb_cc -= off;
		} else {
			mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
			    __func__, (uint32_t)mp_tp->mpt_sndnxt,
			    (uint32_t)mp_tp->mpt_sndmax),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);

			goto out;
		}
	}

	/* Also bounded by the subflow's congestion window. */
	sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
	if (sb_cc <= 0) {
		mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
		    __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
		    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
		    mptcp_subflow_cwnd_space(so)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
	}

	/* tot_sent is uint16_t; clamp the mapping length accordingly. */
	sb_cc = min(sb_cc, UINT16_MAX);

	/*
	 * Create a DSN mapping for the data we are about to send. It all
	 * has the same mapping.
	 */
	if (reinjected)
		mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
	else
		mpt_dsn = mp_tp->mpt_snduna + off;

	/* Skip whole mappings that lie entirely before the transmit offset. */
	mpt_mbuf = sb_mb;
	while (mpt_mbuf && reinjected == FALSE &&
	    (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
	     mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
		off -= mpt_mbuf->m_pkthdr.mp_rlen;
		mpt_mbuf = mpt_mbuf->m_next;
	}
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
		mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
		    __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
		    mpts->mpts_probecnt),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));

	head = tail = NULL;

	/* Copy up to sb_cc bytes into a fresh chain carrying one DSN mapping. */
	while (tot_sent < sb_cc) {
		ssize_t mlen;

		mlen = mpt_mbuf->m_len;
		mlen -= off;
		mlen = min(mlen, sb_cc - tot_sent);

		if (mlen < 0) {
			mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
			    __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
			    (uint32_t)off, sb_cc, tot_sent),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
			goto out;
		}

		if (mlen == 0)
			goto next;

		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
		    M_COPYM_MUST_COPY_HDR);
		if (m == NULL) {
			mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
			error = ENOBUFS;
			break;
		}

		/* Create a DSN mapping for the data (m_copym does it) */
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_next == NULL);

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
		m->m_pkthdr.mp_dsn = mpt_dsn;
		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
		m->m_pkthdr.len = mlen;

		if (head == NULL) {
			head = tail = m;
		} else {
			tail->m_next = m;
			tail = m;
		}

		tot_sent += mlen;
		off = 0;
next:
		mpt_mbuf = mpt_mbuf->m_next;
	}

	if (reinjected) {
		if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
			/* Partially sent: trim the reinject entry to the remainder. */
			struct mbuf *n = sb_mb;

			while (n) {
				n->m_pkthdr.mp_dsn += sb_cc;
				n->m_pkthdr.mp_rlen -= sb_cc;
				n = n->m_next;
			}
			m_adj(sb_mb, sb_cc);
		} else {
			/* Fully sent: pop it off the reinject queue. */
			mpte->mpte_reinjectq = sb_mb->m_nextpkt;
			m_freem(sb_mb);
		}
	}

	mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
	    __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
	    tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
		dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
		    tot_sent);
	}

	/* Now, let's update rel-seq and the data-level length */
	mpts->mpts_rel_seq += tot_sent;
	m = head;
	while (m) {
		if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
			m->m_pkthdr.mp_csum = dss_csum;
		m->m_pkthdr.mp_rlen = tot_sent;
		m = m->m_next;
	}

	if (head != NULL) {
		if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
		    (tp->t_tfo_stats == 0))
			tp->t_mpflags |= TMPF_TFO_REQUEST;

		/* Hand the mapped chain down to the subflow socket. */
		error = sock_sendmbuf(so, NULL, head, 0, NULL);

		DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
		    struct sockbuf *, &so->so_rcv,
		    struct sockbuf *, &so->so_snd,
		    struct mptses *, mpte, struct mptsub *, mpts,
		    size_t, tot_sent);
	}

done_sending:
	/* EWOULDBLOCK with a pending TFO request still counts as progress. */
	if (error == 0 ||
	    (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
		uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;

		if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
			tcpstat.tcps_mp_num_probes++;
			if ((uint32_t)tot_sent < mpts->mpts_maxseg)
				mpts->mpts_probecnt += 1;
			else
				mpts->mpts_probecnt +=
				    tot_sent/mpts->mpts_maxseg;
		}

		/* Only regular transmissions advance the data-level sndnxt. */
		if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
			if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
			mp_tp->mpt_sndnxt = new_sndnxt;
		}

		mptcp_cancel_timer(mp_tp, MPTT_REXMT);

		/* Must be here as mptcp_can_send_more() checks for this */
		soclearfastopen(mp_so);

		if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
		    (mpts->mpts_probesoon != 0))
			mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
			    __func__, mpts->mpts_connid,
			    !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
			    tot_sent, (int) sb_cc, mpts->mpts_probecnt,
			    (tcp_now - mpts->mpts_probesoon)),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

		/* Track which interface class carried data (cell-icon bookkeeping). */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;

			mpte->mpte_used_cell = 1;
		} else {
			mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;

			mpte->mpte_used_wifi = 1;
		}

		/*
		 * Don't propagate EWOULDBLOCK - it's already taken care of
		 * in mptcp_usr_send for TFO.
		 */
		error = 0;
	} else {
		mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
		    __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
	}
out:

	if (wakeup)
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
	return (error);

zero_len_write:
	/* Opting to call pru_send as no mbuf at subflow level */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
	    NULL, current_proc());

	goto done_sending;
}
2903
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
	    __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
	    m->m_pkthdr.mp_rseq),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
			break;

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
			    __func__, n->m_pkthdr.mp_rlen),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
			    __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

			m->m_nextpkt = NULL;
			if (prev == NULL)
				mpte->mpte_reinjectq = tmp;
			else
				prev->m_nextpkt = tmp;

			m_freem(n);
			n = tmp;
		}

	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue */
		/*
		 * NOTE(review): the right-hand side uses m_pkthdr.len here,
		 * while every other coverage check in this function uses
		 * m_pkthdr.mp_rlen -- confirm this asymmetry is intentional.
		 */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
			    __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			goto dont_queue;
		}
	}

	/* Splice m into the DSN-sorted queue between prev and n. */
	if (prev == NULL)
		mpte->mpte_reinjectq = m;
	else
		prev->m_nextpkt = m;

	m->m_nextpkt = n;

	return;

dont_queue:
	/* Redundant segment: drop instead of queueing. */
	m_freem(m);
	return;
}
2982
2983 static struct mbuf *
2984 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
2985 {
2986 struct socket *mp_so = mptetoso(mpte);
2987 struct mbuf *m;
2988
2989 m = mp_so->so_snd.sb_mb;
2990
2991 while (m) {
2992 /* If this segment covers what we are looking for, return it. */
2993 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
2994 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
2995 break;
2996
2997
2998 /* Segment is no more in the queue */
2999 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
3000 return NULL;
3001
3002 m = m->m_next;
3003 }
3004
3005 return m;
3006 }
3007
3008 static struct mbuf *
3009 mptcp_copy_mbuf_list(struct mbuf *m, int len)
3010 {
3011 struct mbuf *top = NULL, *tail = NULL;
3012 uint64_t dsn;
3013 uint32_t dlen, rseq;
3014
3015 dsn = m->m_pkthdr.mp_dsn;
3016 dlen = m->m_pkthdr.mp_rlen;
3017 rseq = m->m_pkthdr.mp_rseq;
3018
3019 while (len > 0) {
3020 struct mbuf *n;
3021
3022 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3023
3024 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3025 if (n == NULL) {
3026 mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
3027 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
3028 goto err;
3029 }
3030
3031 VERIFY(n->m_flags & M_PKTHDR);
3032 VERIFY(n->m_next == NULL);
3033 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3034 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3035 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3036 VERIFY(n->m_len == m->m_len);
3037
3038 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3039
3040 if (top == NULL)
3041 top = n;
3042
3043 if (tail != NULL)
3044 tail->m_next = n;
3045
3046 tail = n;
3047
3048 len -= m->m_len;
3049 m = m->m_next;
3050 }
3051
3052 return top;
3053
3054 err:
3055 if (top)
3056 m_freem(top);
3057
3058 return NULL;
3059 }
3060
3061 static void
3062 mptcp_reinject_mbufs(struct socket *so)
3063 {
3064 struct tcpcb *tp = sototcpcb(so);
3065 struct mptsub *mpts = tp->t_mpsub;
3066 struct mptcb *mp_tp = tptomptp(tp);
3067 struct mptses *mpte = mp_tp->mpt_mpte;;
3068 struct sockbuf *sb = &so->so_snd;
3069 struct mbuf *m;
3070
3071 m = sb->sb_mb;
3072 while (m) {
3073 struct mbuf *n = m->m_next, *orig = m;
3074
3075 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3076 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3077 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3078 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3079
3080 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3081
3082 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
3083 goto next;
3084
3085 /* Has it all already been acknowledged at the data-level? */
3086 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
3087 goto next;
3088
3089 /* Part of this has already been acknowledged - lookup in the
3090 * MPTCP-socket for the segment.
3091 */
3092 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3093 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3094 if (m == NULL)
3095 goto next;
3096 }
3097
3098 /* Copy the mbuf with headers (aka, DSN-numbers) */
3099 m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
3100 if (m == NULL)
3101 break;
3102
3103 VERIFY(m->m_nextpkt == NULL);
3104
3105 /* Now, add to the reinject-queue, eliminating overlapping
3106 * segments
3107 */
3108 mptcp_add_reinjectq(mpte, m);
3109
3110 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3111
3112 next:
3113 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3114 while (n) {
3115 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3116
3117 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
3118 break;
3119
3120 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3121 n = n->m_next;
3122 }
3123
3124 m = n;
3125 }
3126 }
3127
/*
 * Free, from the head of the MPTCP-level reinject queue, every segment
 * that is now fully acknowledged at the data-level (mpt_snduna has
 * moved past the end of the segment's DSN mapping).
 */
void
mptcp_clean_reinjectq(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	mpte_lock_assert_held(mpte);

	while (mpte->mpte_reinjectq) {
		struct mbuf *m = mpte->mpte_reinjectq;

		/*
		 * Stop at the first segment that is not fully covered by
		 * mpt_snduna; everything from here on is still in flight.
		 * (MPTCP_SEQ_* handle 64-bit DSN wraparound.)
		 */
		if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
			break;

		/* Unlink the packet from the queue before freeing it */
		mpte->mpte_reinjectq = m->m_nextpkt;
		m->m_nextpkt = NULL;
		m_freem(m);
	}
}
3147
3148 /*
3149 * Subflow socket control event upcall.
3150 */
3151 static void
3152 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
3153 {
3154 #pragma unused(so)
3155 struct mptsub *mpts = arg;
3156 struct mptses *mpte = mpts->mpts_mpte;
3157
3158 VERIFY(mpte != NULL);
3159 mpte_lock_assert_held(mpte);
3160
3161 if ((mpts->mpts_evctl & events) == events)
3162 return;
3163
3164 mpts->mpts_evctl |= events;
3165
3166 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3167 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3168 return;
3169 }
3170
3171 mptcp_subflow_workloop(mpte);
3172 }
3173
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 * Dispatches each pending event bit to its handler from
 * mpsub_ev_entry_tbl, clearing the bit first, and combines the
 * handlers' return values into a single verdict for the caller.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint)
{
	ev_ret_t ret = MPTS_EVRET_OK;
	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
	    sizeof(mpsub_ev_entry_tbl[0]);

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	/* bail if there's nothing to process */
	if (!mpts->mpts_evctl)
		return (ret);

	/* Any of these fatal subflow events also triggers a failover check */
	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
	    SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
	    SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
	    SO_FILT_HINT_DISCONNECTED)) {
		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

	mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
	    mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);

	/*
	 * Process all the socket filter hints and reset the hint
	 * once it is handled
	 */
	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
		/*
		 * Always execute the DISCONNECTED event, because it will wakeup
		 * the app.
		 */
		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
		    (ret >= MPTS_EVRET_OK ||
		     mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
			ev_ret_t error =
			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
			/* Results below MPTS_EVRET_OK win outright; otherwise keep the max */
			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
		}
	}

	/*
	 * We should be getting only events specified via sock_catchevents(),
	 * so loudly complain if we have any unprocessed one(s).
	 */
	if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
		mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
		    (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
		    mpts->mpts_connid,
		    mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
	else
		mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
		    mpts->mpts_evctl, SO_FILT_HINT_BITS),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);

	return (ret);
}
3243
3244 static ev_ret_t
3245 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3246 uint64_t *p_mpsofilt_hint, uint64_t event)
3247 {
3248 struct socket *mp_so, *so;
3249 struct mptcb *mp_tp;
3250
3251 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3252 VERIFY(mpte->mpte_mppcb != NULL);
3253 mp_so = mptetoso(mpte);
3254 mp_tp = mpte->mpte_mptcb;
3255 so = mpts->mpts_socket;
3256
3257 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3258 mpts->mpts_connid, event),
3259 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3260
3261 /*
3262 * We got an event for this subflow that might need to be propagated,
3263 * based on the state of the MPTCP connection.
3264 */
3265 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3266 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3267 mp_so->so_error = so->so_error;
3268 *p_mpsofilt_hint |= event;
3269 }
3270
3271 return (MPTS_EVRET_OK);
3272 }
3273
3274 /*
3275 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3276 */
3277 static ev_ret_t
3278 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3279 uint64_t *p_mpsofilt_hint, uint64_t event)
3280 {
3281 #pragma unused(p_mpsofilt_hint, event)
3282 struct socket *mp_so;
3283 struct tcpcb *tp;
3284
3285 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3286
3287 VERIFY(mpte->mpte_mppcb != NULL);
3288 mp_so = mptetoso(mpte);
3289 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3290
3291 /*
3292 * This overwrites any previous mpte_lost_aid to avoid storing
3293 * too much state when the typical case has only two subflows.
3294 */
3295 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3296 mpte->mpte_lost_aid = tp->t_local_aid;
3297
3298 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3299 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3300
3301 /*
3302 * The subflow connection has lost its source address.
3303 */
3304 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3305
3306 if (mp_so->so_flags & SOF_NOADDRAVAIL)
3307 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3308
3309 return (MPTS_EVRET_DELETE);
3310 }
3311
3312 /*
3313 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3314 * indicates that the remote side sent a Data FIN
3315 */
3316 static ev_ret_t
3317 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3318 uint64_t *p_mpsofilt_hint, uint64_t event)
3319 {
3320 #pragma unused(event)
3321 struct mptcb *mp_tp;
3322
3323 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3324 mp_tp = mpte->mpte_mptcb;
3325
3326 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3327 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3328
3329 /*
3330 * We got a Data FIN for the MPTCP connection.
3331 * The FIN may arrive with data. The data is handed up to the
3332 * mptcp socket and the user is notified so that it may close
3333 * the socket if needed.
3334 */
3335 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
3336 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3337
3338 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3339 }
3340
/*
 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
 *
 * Requeues unacknowledged data for reinjection, then tries to switch
 * the active subflow to an eligible alternate one.
 */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct mptsub *mpts_alt = NULL;
	struct socket *alt_so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	mpte_lock_assert_held(mpte);
	mp_so = mptetoso(mpte);
	mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

	/* Queue unacked data so it can be retransmitted on another subflow */
	mptcp_reinject_mbufs(mpts->mpts_socket);

	mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
	/*
	 * If there is no alternate eligible subflow, ignore the
	 * failover hint.
	 */
	if (mpts_alt == NULL) {
		mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

		goto done;
	}

	altpath_exists = 1;
	alt_so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		/* All data acknowledged and no RTT spike */
		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
	}

	if (altpath_exists) {
		/* Promote the alternate subflow; demote the failing one */
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;

		mpte->mpte_active_sub = mpts_alt;
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;

		mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
		    __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

		mptcpstats_inc_switch(mpte, mpts);

		sowwakeup(alt_so);
	} else {
		mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
		    mpts->mpts_connid),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
		/*
		 * NOTE: the 'done' label sits inside this else branch on
		 * purpose — the no-alternate early exit above also clears
		 * SOF_MP_TRYFAILOVER here.
		 */
done:
		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
	}

	return (MPTS_EVRET_OK);
}
3410
3411 /*
3412 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3413 */
3414 static ev_ret_t
3415 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3416 uint64_t *p_mpsofilt_hint, uint64_t event)
3417 {
3418 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3419 VERIFY(mpte->mpte_mppcb != NULL);
3420
3421 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3422 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3423
3424 /*
3425 * The subflow connection cannot use the outgoing interface, let's
3426 * close this subflow.
3427 */
3428 mptcp_subflow_abort(mpts, EPERM);
3429
3430 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3431
3432 return (MPTS_EVRET_DELETE);
3433 }
3434
3435 /*
3436 * https://tools.ietf.org/html/rfc6052#section-2
3437 * https://tools.ietf.org/html/rfc6147#section-5.2
3438 */
3439 static boolean_t
3440 mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3441 const struct ipv6_prefix *prefix,
3442 struct in_addr *addrv4)
3443 {
3444 char buf[MAX_IPv4_STR_LEN];
3445 char *ptrv4 = (char *)addrv4;
3446 const char *ptr = (const char *)addr;
3447
3448 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0)
3449 return false;
3450
3451 switch (prefix->prefix_len) {
3452 case NAT64_PREFIX_LEN_96:
3453 memcpy(ptrv4, ptr + 12, 4);
3454 break;
3455 case NAT64_PREFIX_LEN_64:
3456 memcpy(ptrv4, ptr + 9, 4);
3457 break;
3458 case NAT64_PREFIX_LEN_56:
3459 memcpy(ptrv4, ptr + 7, 1);
3460 memcpy(ptrv4 + 1, ptr + 9, 3);
3461 break;
3462 case NAT64_PREFIX_LEN_48:
3463 memcpy(ptrv4, ptr + 6, 2);
3464 memcpy(ptrv4 + 2, ptr + 9, 2);
3465 break;
3466 case NAT64_PREFIX_LEN_40:
3467 memcpy(ptrv4, ptr + 5, 3);
3468 memcpy(ptrv4 + 3, ptr + 9, 1);
3469 break;
3470 case NAT64_PREFIX_LEN_32:
3471 memcpy(ptrv4, ptr + 4, 4);
3472 break;
3473 default:
3474 panic("NAT64-prefix len is wrong: %u\n",
3475 prefix->prefix_len);
3476 }
3477
3478 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3479 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3480
3481 return true;
3482 }
3483
3484 static void
3485 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3486 {
3487 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3488 struct socket *so = mpts->mpts_socket;
3489 struct ifnet *ifp;
3490 int j;
3491
3492 ifp = sotoinpcb(so)->inp_last_outifp;
3493
3494 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3495 mptcp_ask_for_nat64(ifp);
3496 return;
3497 }
3498
3499
3500 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3501 int success;
3502
3503 if (nat64prefixes[j].prefix_len == 0)
3504 continue;
3505
3506 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3507 &nat64prefixes[j],
3508 &mpte->mpte_dst_v4_nat64.sin_addr);
3509 if (success) {
3510 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3511 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3512 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3513 break;
3514 }
3515 }
3516 }
3517
/*
 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mptcb *mp_tp;
	int af;
	boolean_t mpok = FALSE;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	tp = sototcpcb(so);
	af = mpts->mpts_dst.sa_family;

	/* Nothing to do if this subflow was already marked connected */
	if (mpts->mpts_flags & MPTSF_CONNECTED)
		return (MPTS_EVRET_OK);

	/*
	 * A disconnect raced with the connect: shut the subflow socket
	 * down if TCP still considers it connected, and bail out.
	 */
	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
		    (so->so_state & SS_ISCONNECTED)) {
			mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
			    __func__, mpts->mpts_connid),
			    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
			(void) soshutdownlock(so, SHUT_RD);
			(void) soshutdownlock(so, SHUT_WR);
			(void) sodisconnectlocked(so);
		}
		return (MPTS_EVRET_OK);
	}

	/*
	 * The subflow connection has been connected.  Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
	 *
	 *   a. If MPTCP connection is not yet established, then this must be
	 *      the first subflow connection.  If MPTCP failed to negotiate,
	 *      fallback to regular TCP by degrading this subflow.
	 *
	 *   b. If MPTCP connection has been established, then this must be
	 *      one of the subsequent subflow connections.  If MPTCP failed
	 *      to negotiate, disconnect the connection.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state.  So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		return (MPTS_EVRET_OK);
	}

	if (mpts->mpts_flags & MPTSF_TFO_REQD)
		mptcp_drop_tfo_data(mpte, mpts);

	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
	mpts->mpts_flags |= MPTSF_CONNECTED;

	if (tp->t_mpflags & TMPF_MPTCP_TRUE)
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;

	tp->t_mpflags &= ~TMPF_TFO_REQUEST;

	/* get/verify the outbound interface */
	inp = sotoinpcb(so);

	mpts->mpts_maxseg = tp->t_maxseg;

	mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
	    ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
	    ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
	    (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);

	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* This is the first subflow — data-level is now established */
		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
		mpte->mpte_associd = mpts->mpts_connid;
		DTRACE_MPTCP2(state__change,
		    struct mptcb *, mp_tp,
		    uint32_t, 0 /* event */);

		if (SOCK_DOM(so) == AF_INET) {
			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
		} else {
			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
		}

		mpts->mpts_flags |= MPTSF_ACTIVE;

		/* case (a) above */
		if (!mpok) {
			tcpstat.tcps_mpcap_fallback++;

			tp->t_mpflags |= TMPF_INFIN_SENT;
			mptcp_notify_mpfail(so);
		} else {
			/* Cell subflows are backup unless service is aggregate */
			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
			    mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			} else {
				mpts->mpts_flags |= MPTSF_PREFERRED;
			}
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;

			if (SOCK_DOM(so) == AF_INET6)
				mptcp_handle_ipv6_connection(mpte, mpts);

			mptcp_check_subflows_and_add(mpte);

			if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
				mpte->mpte_initial_cell = 1;

			mpte->mpte_handshake_success = 1;
		}

		mp_tp->mpt_sndwnd = tp->snd_wnd;
		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
		soisconnected(mp_so);

		mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
		    MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
	} else if (mpok) {
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake.  TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			mpts->mpts_flags &= ~MPTSF_PREFERRED;
		} else {
			mpts->mpts_flags |= MPTSF_PREFERRED;
		}

		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpte->mpte_nummpcapflows++;

		mpts->mpts_rel_seq = 1;

		mptcp_check_subflows_and_remove(mpte);
	} else {
		/* Join failed — this subflow came up as plain TCP */
		unsigned int i;

		/* Should we try the alternate port? */
		if (mpte->mpte_alternate_port &&
		    inp->inp_fport != mpte->mpte_alternate_port) {
			union sockaddr_in_4_6 dst;
			struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;

			memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);

			dst_in->sin_port = mpte->mpte_alternate_port;

			mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
			    mpts->mpts_ifscope , NULL);
		} else { /* Else, we tried all we could, mark this interface as non-MPTCP */
			for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
				struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];

				if (inp->inp_last_outifp->if_index == info->ifindex) {
					info->no_mptcp_support = 1;
					break;
				}
			}
		}

		tcpstat.tcps_join_fallback++;
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
			tcpstat.tcps_mptcp_cell_proxy++;
		else
			tcpstat.tcps_mptcp_wifi_proxy++;

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

		return (MPTS_EVRET_OK);
	}

	/* This call, just to "book" an entry in the stats-table for this ifindex */
	mptcp_get_statsindex(mpte->mpte_itfstats, mpts);

	mptcp_output(mpte);

	return (MPTS_EVRET_OK); /* keep the subflow socket around */
}
3726
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
	    __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
	    !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
	    !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

	/* Only process the first DISCONNECTED notification per subflow */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
		return (MPTS_EVRET_DELETE);

	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/* The subflow connection has been disconnected. */

	/* Undo this subflow's contribution to the MP-capable accounting */
	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
			mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
			    __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	}

	/*
	 * Drop the whole MPTCP connection when this subflow takes it down:
	 * not yet established at the data-level, fallen back to TCP on the
	 * active subflow, or a FASTCLOSE was received.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
	    (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
		mptcp_drop(mpte, mp_tp, so->so_error);
	}

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
	    MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
	    MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);

	return (MPTS_EVRET_DELETE);
}
3783
/*
 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
 *
 * Mirrors the subflow TCP's MPTCP negotiation flags into mpts_flags
 * and derives the connection-wide consequences (fallback to TCP, or
 * readiness to connect additional subflows).
 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
    uint64_t *p_mpsofilt_hint, uint64_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	ev_ret_t ret = MPTS_EVRET_OK;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	else
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* Already degraded — nothing more to derive, skip to exit */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
			goto done;
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
		mpts->mpts_flags |= MPTSF_MP_READY;
	else
		mpts->mpts_flags &= ~MPTSF_MP_READY;

	/* A degraded subflow degrades the whole MPTCP connection */
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;

		/* No data-level reinjection once fallen back to plain TCP */
		m_freem_list(mpte->mpte_reinjectq);
		mpte->mpte_reinjectq = NULL;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
	    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
	    mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
	    mpts->mpts_flags, MPTSF_BITS),
	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

done:
	return (ret);
}
3845
3846 /*
3847 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3848 */
3849 static ev_ret_t
3850 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3851 uint64_t *p_mpsofilt_hint, uint64_t event)
3852 {
3853 #pragma unused(event)
3854 struct socket *mp_so, *so;
3855 struct mptcb *mp_tp;
3856 boolean_t is_fastclose;
3857
3858 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3859 VERIFY(mpte->mpte_mppcb != NULL);
3860 mp_so = mptetoso(mpte);
3861 mp_tp = mpte->mpte_mptcb;
3862 so = mpts->mpts_socket;
3863
3864 /* We got an invalid option or a fast close */
3865 struct tcptemp *t_template;
3866 struct inpcb *inp = sotoinpcb(so);
3867 struct tcpcb *tp = NULL;
3868
3869 tp = intotcpcb(inp);
3870 so->so_error = ECONNABORTED;
3871
3872 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3873
3874 t_template = tcp_maketemplate(tp);
3875 if (t_template) {
3876 struct tcp_respond_args tra;
3877
3878 bzero(&tra, sizeof(tra));
3879 if (inp->inp_flags & INP_BOUND_IF)
3880 tra.ifscope = inp->inp_boundifp->if_index;
3881 else
3882 tra.ifscope = IFSCOPE_NONE;
3883 tra.awdl_unrestricted = 1;
3884
3885 tcp_respond(tp, t_template->tt_ipgen,
3886 &t_template->tt_t, (struct mbuf *)NULL,
3887 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
3888 (void) m_free(dtom(t_template));
3889 mptcplog((LOG_DEBUG, "MPTCP Events: "
3890 "%s: mp_so 0x%llx cid %d \n",
3891 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3892 so, mpts->mpts_connid),
3893 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3894 }
3895 mptcp_subflow_abort(mpts, ECONNABORTED);
3896
3897 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3898 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
3899
3900 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3901 mp_so->so_error = ECONNABORTED;
3902 else
3903 mp_so->so_error = ECONNRESET;
3904
3905 /*
3906 * mptcp_drop is being called after processing the events, to fully
3907 * close the MPTCP connection
3908 */
3909 }
3910
3911 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3912 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
3913
3914 return (MPTS_EVRET_DELETE);
3915 }
3916
3917 static ev_ret_t
3918 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3919 uint64_t *p_mpsofilt_hint, uint64_t event)
3920 {
3921 #pragma unused(event)
3922 bool found_active = false;
3923
3924 mpts->mpts_flags |= MPTSF_READ_STALL;
3925
3926 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3927 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3928
3929 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3930 TCPS_HAVERCVDFIN2(tp->t_state))
3931 continue;
3932
3933 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
3934 found_active = true;
3935 break;
3936 }
3937 }
3938
3939 if (!found_active)
3940 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
3941
3942 return (MPTS_EVRET_OK);
3943 }
3944
3945 static ev_ret_t
3946 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3947 uint64_t *p_mpsofilt_hint, uint64_t event)
3948 {
3949 #pragma unused(event)
3950 bool found_active = false;
3951
3952 mpts->mpts_flags |= MPTSF_WRITE_STALL;
3953
3954 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3955 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3956
3957 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3958 tp->t_state > TCPS_CLOSE_WAIT)
3959 continue;
3960
3961 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
3962 found_active = true;
3963 break;
3964 }
3965 }
3966
3967 if (!found_active)
3968 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
3969
3970 return (MPTS_EVRET_OK);
3971 }
3972
3973 static const char *
3974 mptcp_evret2str(ev_ret_t ret)
3975 {
3976 const char *c = "UNKNOWN";
3977
3978 switch (ret) {
3979 case MPTS_EVRET_DELETE:
3980 c = "MPTS_EVRET_DELETE";
3981 break;
3982 case MPTS_EVRET_CONNECT_PENDING:
3983 c = "MPTS_EVRET_CONNECT_PENDING";
3984 break;
3985 case MPTS_EVRET_DISCONNECT_FALLBACK:
3986 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3987 break;
3988 case MPTS_EVRET_OK:
3989 c = "MPTS_EVRET_OK";
3990 break;
3991 default:
3992 break;
3993 }
3994 return (c);
3995 }
3996
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * Returns 0 on success (including the cases where SO_MARK_CELLFALLBACK is
 * deliberately skipped), or the sosetoptlock() error otherwise.
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	mpte_lock_assert_held(mpte);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	/*
	 * SO_MARK_CELLFALLBACK on an established connection is only applied
	 * to subflows that actually run over cellular; the early returns
	 * below skip the option (returning 0) in all other cases.
	 */
	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

		mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
		    __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte),
		    sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
		    mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY)
			return (0);

		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
			return (0);

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
		    !IFNET_IS_CELLULAR(ifp))
			return (0);
	}

	mpo->mpo_flags &= ~MPOF_INTERIM;

	/* Issue the option on the subflow socket as the kernel */
	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);
	if (error == 0) {
		mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
		    "val %d set successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
	} else {
		mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
		    "val %d set error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	}
	return (error);
}
4078
4079 /*
4080 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4081 * caller must ensure that the option can be issued on subflow sockets, via
4082 * MPOF_SUBFLOW_OK flag.
4083 */
4084 int
4085 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4086 struct mptopt *mpo)
4087 {
4088 struct socket *mp_so;
4089 struct sockopt sopt;
4090 int error;
4091
4092 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4093 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4094 mp_so = mptetoso(mpte);
4095
4096 bzero(&sopt, sizeof (sopt));
4097 sopt.sopt_dir = SOPT_GET;
4098 sopt.sopt_level = mpo->mpo_level;
4099 sopt.sopt_name = mpo->mpo_name;
4100 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4101 sopt.sopt_valsize = sizeof (int);
4102 sopt.sopt_p = kernproc;
4103
4104 error = sogetoptlock(so, &sopt, 0); /* already locked */
4105 if (error == 0) {
4106 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4107 "%s: mp_so 0x%llx sopt %s "
4108 "val %d get successful\n", __func__,
4109 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4110 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4111 mpo->mpo_intval),
4112 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4113 } else {
4114 mptcplog((LOG_ERR, "MPTCP Socket: "
4115 "%s: mp_so 0x%llx sopt %s get error %d\n",
4116 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4117 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
4118 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
4119 }
4120 return (error);
4121 }
4122
4123
4124 /*
4125 * MPTCP garbage collector.
4126 *
4127 * This routine is called by the MP domain on-demand, periodic callout,
4128 * which is triggered when a MPTCP socket is closed. The callout will
4129 * repeat as long as this routine returns a non-zero value.
4130 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	/* Count of PCBs not yet collectible; non-zero re-arms the callout. */
	uint32_t active = 0;

	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mp_so = mpp->mpp_socket;
		VERIFY(mp_so != NULL);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		mptcplog((LOG_DEBUG, "MPTCP Socket: "
		    "%s: mp_so 0x%llx found "
		    "(u=%d,r=%d,s=%d)\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
		    mp_so->so_retaincnt, mpp->mpp_state),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * MP socket lock is contended; keep the PCB alive and retry
		 * on the next GC pass rather than blocking here.
		 */
		if (!mpte_try_lock(mpte)) {
			mptcplog((LOG_DEBUG, "MPTCP Socket: "
			    "%s: mp_so 0x%llx skipped lock "
			    "(u=%d,r=%d)\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			active++;
			continue;
		}

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			mptcplog((LOG_DEBUG, "MPTCP Socket: "
			    "%s: mp_so 0x%llx skipped usecount "
			    "[u=%d,r=%d] %d %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mp_tp->mpt_gc_ticks,
			    mp_tp->mpt_state),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

			/*
			 * Socket is still referenced but shutting down; count
			 * down the grace-period ticks and, once expired, nudge
			 * every subflow with a DISCONNECTED event to release
			 * the remaining references.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0)
					mp_tp->mpt_gc_ticks--;
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			mpte_unlock(mpte);
			active++;
			continue;
		}

		/* usecount reached zero; PCB must already be marked dead */
		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
			mptcp_close(mpte, mp_tp);

		mptcp_session_destroy(mpte);

		mptcplog((LOG_DEBUG, "MPTCP Socket: "
		    "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mp_so->so_usecount, mp_so->so_retaincnt),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		/* Final teardown: dispose of the PCB and free the socket. */
		mp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return (active);
}
4233
4234 /*
4235 * Drop a MPTCP connection, reporting the specified error.
4236 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
{
	struct socket *mp_so;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mptetoso(mpte);

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, 0 /* event */);

	/* Prefer a previously recorded soft error over a generic timeout. */
	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
		errno = mp_tp->mpt_softerror;
	mp_so->so_error = errno;

	/* Tear down the MPTCP control block; always returns NULL. */
	return (mptcp_close(mpte, mp_tp));
}
4255
4256 /*
4257 * Close a MPTCP control block.
4258 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct socket *mp_so = NULL;
	struct mptsub *mpts = NULL, *tmpts = NULL;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mptetoso(mpte);

	mp_tp->mpt_state = MPTCPS_TERMINATE;

	/* Release any segments still queued at the MPTCP level. */
	mptcp_freeq(mp_tp);

	soisdisconnected(mp_so);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		mptcp_subflow_disconnect(mpte, mpts);
	}

	return (NULL);
}
4282
/* Post a locked DISCONNECTED event on the given socket. */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
4288
4289 /*
4290 * MPTCP workloop.
4291 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	/* Accumulated event hints to post on the MP socket after the loop. */
	uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;

	mpte_lock_assert_held(mpte);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mptetoso(mpte);
	VERIFY(mp_so != NULL);

	/* Pass 1: process pending events on each subflow. */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold the subflow and its socket across event handling. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING)
			mptcp_subflow_disconnect(mpte, mpts);

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			mptcplog((LOG_DEBUG,
			    "MPTCP Socket: %s: mptcp_subflow_events "
			    "returned invalid value: %d\n", __func__,
			    ret),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			break;
		}
		mptcp_subflow_remref(mpts);	/* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Deliver accumulated hints (if any beyond the LOCKED marker). */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback)
		return;

	/* Pass 2: apply fallback degradation or kick off pending joins. */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
				continue;

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
			    MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
				continue;

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback. This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			if (mpts->mpts_flags & MPTSF_ACTIVE) {
				continue;
			}
			/* Non-active subflow: force a reset to drop it. */
			tp->t_mpflags |= TMPF_RESET;
			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error)
					mptcp_subflow_abort(mpts, error);
			}
		}
	}
}
4416
4417 /*
4418 * Protocol pr_lock callback.
4419 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* Record the caller's return address for lock-history debugging. */
	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock(mpp);

	/* A negative usecount indicates refcounting corruption. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0)
		mp_so->so_usecount++;
	/* Log this acquisition in the circular lock-history buffer. */
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return (0);
}
4451
4452 /*
4453 * Protocol pr_unlock callback.
4454 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* Record the caller's return address for lock-history debugging. */
	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock_assert_held(mpp);

	if (refcount != 0)
		mp_so->so_usecount--;

	/* Dropping below zero means an unbalanced lock/unlock pair. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Log this release in the circular unlock-history buffer. */
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	mpp_unlock(mpp);

	return (0);
}
4488
4489 /*
4490 * Protocol pr_getlock callback.
4491 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Sanity-check refcounting before handing out the mutex. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return (mpp_getlock(mpp, flags));
}
4509
4510 /*
4511 * MPTCP Join support
4512 */
4513
/*
 * Assign an address ID to a subflow and register its authentication entry
 * (local/remote address IDs and nonces) on the MPTCP control block.
 */
static void
mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
    uint8_t addr_id)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/*
	 * The address ID of the first flow is implicitly 0.
	 */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		tp->t_local_aid = 0;
	} else {
		/* Secondary subflow: will join via MP_JOIN. */
		tp->t_local_aid = addr_id;
		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
		so->so_flags |= SOF_MP_SEC_SUBFLOW;
	}
	sauth_entry = zalloc(mpt_subauth_zone);
	sauth_entry->msae_laddr_id = tp->t_local_aid;
	sauth_entry->msae_raddr_id = 0;
	sauth_entry->msae_raddr_rand = 0;
try_again:
	/* Local nonce must be non-zero; 0 marks "unset" elsewhere. */
	sauth_entry->msae_laddr_rand = RandomULong();
	if (sauth_entry->msae_laddr_rand == 0)
		goto try_again;
	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
}
4542
4543 static void
4544 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4545 {
4546 struct mptcp_subf_auth_entry *sauth_entry;
4547 struct tcpcb *tp = NULL;
4548 int found = 0;
4549
4550 tp = sototcpcb(so);
4551 if (tp == NULL)
4552 return;
4553
4554 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4555 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4556 found = 1;
4557 break;
4558 }
4559 }
4560 if (found) {
4561 LIST_REMOVE(sauth_entry, msae_next);
4562 }
4563
4564 if (found)
4565 zfree(mpt_subauth_zone, sauth_entry);
4566 }
4567
4568 void
4569 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4570 u_int32_t *rrand)
4571 {
4572 struct mptcp_subf_auth_entry *sauth_entry;
4573 mpte_lock_assert_held(mp_tp->mpt_mpte);
4574
4575 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4576 if (sauth_entry->msae_laddr_id == addr_id) {
4577 if (lrand)
4578 *lrand = sauth_entry->msae_laddr_rand;
4579 if (rrand)
4580 *rrand = sauth_entry->msae_raddr_rand;
4581 break;
4582 }
4583 }
4584 }
4585
/*
 * Record the remote address ID and remote nonce (from a received MP_JOIN
 * SYN/ACK) on the auth entry matching the given local address ID. Conflicting
 * duplicates are logged and ignored.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* A different remote ID was already recorded: reject. */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
				    " address ids %d %d \n", __func__, raddr_id,
				    sauth_entry->msae_raddr_id),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* A different nonce means a duplicate SYN/ACK: reject. */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				mptcplog((LOG_ERR, "MPTCP Socket: "
				    "%s: dup SYN_ACK %d %d \n",
				    __func__, raddr_rand,
				    sauth_entry->msae_raddr_rand),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
4618
4619 /*
4620 * SHA1 support for MPTCP
4621 */
4622 static void
4623 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
4624 {
4625 SHA1_CTX sha1ctxt;
4626 const unsigned char *sha1_base;
4627 int sha1_size;
4628
4629 sha1_base = (const unsigned char *) key;
4630 sha1_size = sizeof (mptcp_key_t);
4631 SHA1Init(&sha1ctxt);
4632 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4633 SHA1Final(sha_digest, &sha1ctxt);
4634 }
4635
/*
 * HMAC-SHA1 over the two 32-bit nonces, keyed with the concatenation of the
 * two 64-bit MPTCP keys (RFC 2104 construction). The 16-byte key fits inside
 * one 64-byte SHA1 block, so no key pre-hashing is needed; the remaining pad
 * words are XORed against zero. Result (SHA1_RESULTLEN bytes) in `digest`.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
4680
4681 /*
4682 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4683 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4684 */
4685 void
4686 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
4687 {
4688 uint32_t lrand, rrand;
4689
4690 mpte_lock_assert_held(mp_tp->mpt_mpte);
4691
4692 lrand = rrand = 0;
4693 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4694 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
4695 digest);
4696 }
4697
4698 /*
4699 * Authentication data generation
4700 */
/* Derive the 32-bit MPTCP token: the most significant 32 bits of the key's
 * SHA1 digest. */
static void
mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
    int token_len)
{
	VERIFY(token_len == sizeof (u_int32_t));
	VERIFY(sha_digest_len == SHA1_RESULTLEN);

	/* Most significant 32 bits of the SHA1 hash */
	bcopy(sha_digest, token, sizeof (u_int32_t));
	return;
}
4712
4713 static void
4714 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4715 int idsn_len)
4716 {
4717 VERIFY(idsn_len == sizeof (u_int64_t));
4718 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4719
4720 /*
4721 * Least significant 64 bits of the SHA1 hash
4722 */
4723
4724 idsn[7] = sha_digest[12];
4725 idsn[6] = sha_digest[13];
4726 idsn[5] = sha_digest[14];
4727 idsn[4] = sha_digest[15];
4728 idsn[3] = sha_digest[16];
4729 idsn[2] = sha_digest[17];
4730 idsn[1] = sha_digest[18];
4731 idsn[0] = sha_digest[19];
4732 return;
4733 }
4734
/* Initialize per-connection MPTCP properties: protocol version, DSS-checksum
 * flag, receive window, and garbage-collection grace ticks. */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* There is only Version 0 at this time */
	mp_tp->mpt_version = MPTCP_STD_VERSION_0;

	/* Set DSS checksum flag */
	if (mptcp_dss_csum)
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;

	/* Set up receive window */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
4751
/*
 * Generate the local key and derive the local token and IDSN from its SHA1
 * digest, then initialize the send sequence space and connection properties.
 */
static void
mptcp_init_local_parms(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[SHA1_RESULTLEN];

	/* Random 64-bit local key; token/IDSN are derived from its hash. */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);

	mptcp_generate_token(key_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));

	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
4772
/*
 * Derive the remote token and remote IDSN from the peer's key, and set the
 * initial receive sequence. Returns 0 on success, -1 for an unsupported
 * MPTCP version.
 */
int
mptcp_init_remote_parms(struct mptcb *mp_tp)
{
	char remote_digest[SHA1_RESULTLEN];
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/* Only Version 0 is supported for auth purposes */
	if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
		return (-1);

	/* Setup local and remote tokens and Initial DSNs */
	mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
	mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
	mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
	/* Peer's SYN consumes the first data-sequence number. */
	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;

	return (0);
}
4793
4794 static void
4795 mptcp_send_dfin(struct socket *so)
4796 {
4797 struct tcpcb *tp = NULL;
4798 struct inpcb *inp = NULL;
4799
4800 inp = sotoinpcb(so);
4801 if (!inp)
4802 return;
4803
4804 tp = intotcpcb(inp);
4805 if (!tp)
4806 return;
4807
4808 if (!(tp->t_mpflags & TMPF_RESET))
4809 tp->t_mpflags |= TMPF_SEND_DFIN;
4810 }
4811
4812 /*
4813 * Data Sequence Mapping routines
4814 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL)
		return;

	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/*
	 * Stamp each packet in the chain with the next data sequence number
	 * and its length, advancing mpt_sndmax as we go.
	 */
	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		m->m_pkthdr.mp_rlen = m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
}
4835
/*
 * In fallback mode there are no DATA_ACKs from the peer; infer the
 * MPTCP-level acknowledgment from the subflow bytes being dropped from the
 * send buffer and feed it to mptcp_data_ack_rcvd().
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	if (!m || len == 0)
		return;

	/*
	 * Walk the acked span; optimistically assume each traversed mapping
	 * is fully acked (dsn + rlen). Guard above guarantees at least one
	 * iteration, so data_ack/dsn are always set.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
	mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
}
4885
/*
 * Advance the DSN mappings of mbufs about to be dropped from a send buffer,
 * so remaining data keeps a consistent mapping. Normally a no-op for MPTCP
 * subflows (guarded by SOF_MP_SUBFLOW) except when rewinding after TFO.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* Whole mapping consumed; advance and keep walking. */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			/* Partial drop; when rewinding, sequence numbers stay put. */
			if (rewinding == 0)
				m->m_pkthdr.mp_dsn += len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0)
					m->m_pkthdr.mp_rseq += len;
			}
			mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
			    __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
			    m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
4940
4941 /* Obtain the DSN mapping stored in the mbuf */
4942 void
4943 mptcp_output_getm_dsnmap32(struct socket *so, int off,
4944 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
4945 {
4946 u_int64_t dsn64;
4947
4948 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
4949 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4950 }
4951
/*
 * Walk the send buffer to the mbuf covering byte offset `off` and return
 * that mbuf's DSN mapping (64-bit DSN, relative subflow sequence, mapping
 * length, and DSS checksum).
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;
	int off_orig = off;

	VERIFY(off >= 0);

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	/* Caller must pass an offset within the send buffer. */
	VERIFY(m);
	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;

	mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
	    __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
}
4994
4995 /*
4996 * Note that this is called only from tcp_input() via mptcp_input_preproc()
4997 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4998 * When it trims data tcp_input calls m_adj() which does not remove the
4999 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5000 * The dsn map insertion cannot be delayed after trim, because data can be in
5001 * the reassembly queue for a while and the DSN option info in tp will be
5002 * overwritten for every new packet received.
5003 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5004 * with mptcp_adj_rmap()
5005 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		/* Copy the pending DSS mapping from the tcpcb into the mbuf. */
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin)
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		/* Mapping consumed; schedule a DATA_ACK. */
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		/* In fallback mode a TCP FIN doubles as the DATA_FIN. */
		if (th->th_flags & TH_FIN)
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
	}
}
5029
/*
 * Adjust an mbuf's receive-side DSS mapping by `off` bytes before it is
 * appended to the subflow socket buffer. Returns 0 on success, -1 if a
 * conflicting second mapping was received.
 */
int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	if (m_pktlen(m) == 0)
		return (0);

	if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
		/* A non-zero offset must refer to the same mapping. */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen)) {
			mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
			    __func__, dsn, m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen),
			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
			return (-1);
		}
		m->m_pkthdr.mp_dsn += off;
		m->m_pkthdr.mp_rseq += off;
		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
	} else {
		if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
			/* data arrived without an DSS option mapping */

			/* initial subflow can fallback right after SYN handshake */
			mptcp_notify_mpfail(so);
		}
	}

	/* First mapped data confirms the subflow as MPTCP-capable. */
	mpts->mpts_flags |= MPTSF_CONFIRMED;

	return (0);
}
5066
5067 /*
5068 * Following routines help with failure detection and failover of data
5069 * transfer from one subflow to another.
5070 */
5071 void
5072 mptcp_act_on_txfail(struct socket *so)
5073 {
5074 struct tcpcb *tp = NULL;
5075 struct inpcb *inp = sotoinpcb(so);
5076
5077 if (inp == NULL)
5078 return;
5079
5080 tp = intotcpcb(inp);
5081 if (tp == NULL)
5082 return;
5083
5084 if (so->so_flags & SOF_MP_TRYFAILOVER)
5085 return;
5086
5087 so->so_flags |= SOF_MP_TRYFAILOVER;
5088 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5089 }
5090
5091 /*
5092 * Support for MP_FAIL option
5093 */
/*
 * Map a failed data sequence number back to its subflow TCP sequence number
 * by scanning the send buffer's DSS mappings. Returns 0 with *tcp_seq set on
 * success, -1 if no mapping covers dsn_fail.
 */
int
mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	u_int64_t dsn;
	int off = 0;
	u_int32_t datalen;

	if (m == NULL)
		return (-1);

	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		/* Does this mapping's [dsn, dsn+datalen] span cover dsn_fail? */
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			off = dsn_fail - dsn;
			*tcp_seq = m->m_pkthdr.mp_rseq + off;
			mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
			    dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
			return (0);
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */

	mptcplog((LOG_ERR, "MPTCP Sender: "
	    "%s: %llu not found \n", __func__, dsn_fail),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
	return (-1);
}
5132
5133 /*
5134 * Support for sending contiguous MPTCP bytes in subflow
5135 * Also for preventing sending data with ACK in 3-way handshake
5136 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* DSS mapping covering send-buffer offset `off`. */
	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST)
		mdss_subflow_off--;

	/* Diagnostic dump before the invariant check below. */
	if (off < mdss_subflow_off)
		printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
		    off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
	VERIFY(off >= mdss_subflow_off);

	mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
	    __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
	    mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	/* Bytes of the current mapping that remain to be sent. */
	return (mdss_data_len - (off - mdss_subflow_off));
}
5174
5175 static uint32_t
5176 mptcp_get_maxseg(struct mptses *mpte)
5177 {
5178 struct mptsub *mpts;
5179 uint32_t maxseg = 0;
5180
5181 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5182 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5183
5184 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5185 TCPS_HAVERCVDFIN2(tp->t_state))
5186 continue;
5187
5188 if (tp->t_maxseg > maxseg)
5189 maxseg = tp->t_maxseg;
5190 }
5191
5192 return (maxseg);
5193 }
5194
5195 static uint8_t
5196 mptcp_get_rcvscale(struct mptses *mpte)
5197 {
5198 struct mptsub *mpts;
5199 uint8_t rcvscale = UINT8_MAX;
5200
5201 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5202 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5203
5204 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5205 TCPS_HAVERCVDFIN2(tp->t_state))
5206 continue;
5207
5208 if (tp->rcv_scale < rcvscale)
5209 rcvscale = tp->rcv_scale;
5210 }
5211
5212 return (rcvscale);
5213 }
5214
5215 /* Similar to tcp_sbrcv_reserve */
5216 static void
5217 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5218 u_int32_t newsize, u_int32_t idealsize)
5219 {
5220 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5221
5222 /* newsize should not exceed max */
5223 newsize = min(newsize, tcp_autorcvbuf_max);
5224
5225 /* The receive window scale negotiated at the
5226 * beginning of the connection will also set a
5227 * limit on the socket buffer size
5228 */
5229 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5230
5231 /* Set new socket buffer size */
5232 if (newsize > sbrcv->sb_hiwat &&
5233 (sbreserve(sbrcv, newsize) == 1)) {
5234 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5235 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5236
5237 /* Again check the limit set by the advertised
5238 * window scale
5239 */
5240 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5241 TCP_MAXWIN << rcvscale);
5242 }
5243 }
5244
5245 void
5246 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5247 {
5248 struct mptses *mpte = mp_tp->mpt_mpte;
5249 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5250 struct sockbuf *sbrcv = &mp_so->so_rcv;
5251 uint32_t hiwat_sum = 0;
5252 uint32_t ideal_sum = 0;
5253 struct mptsub *mpts;
5254
5255 /*
5256 * Do not grow the receive socket buffer if
5257 * - auto resizing is disabled, globally or on this socket
5258 * - the high water mark already reached the maximum
5259 * - the stream is in background and receive side is being
5260 * throttled
5261 * - if there are segments in reassembly queue indicating loss,
5262 * do not need to increase recv window during recovery as more
5263 * data is not going to be sent. A duplicate ack sent during
5264 * recovery should not change the receive window
5265 */
5266 if (tcp_do_autorcvbuf == 0 ||
5267 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5268 tcp_cansbgrow(sbrcv) == 0 ||
5269 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5270 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5271 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5272 /* Can not resize the socket buffer, just return */
5273 return;
5274 }
5275
5276 /*
5277 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5278 *
5279 * But, for this we first need accurate receiver-RTT estimations, which
5280 * we currently don't have.
5281 *
5282 * Let's use a dummy algorithm for now, just taking the sum of all
5283 * subflow's receive-buffers. It's too low, but that's all we can get
5284 * for now.
5285 */
5286
5287 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5288 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5289 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5290 }
5291
5292 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5293 }
5294
5295 /*
5296 * Determine if we can grow the recieve socket buffer to avoid sending
5297 * a zero window update to the peer. We allow even socket buffers that
5298 * have fixed size (set by the application) to grow if the resource
5299 * constraints are met. They will also be trimmed after the application
5300 * reads data.
5301 *
5302 * Similar to tcp_sbrcv_grow_rwin
5303 */
5304 static void
5305 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5306 {
5307 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5308 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5309 u_int32_t rcvbuf = sb->sb_hiwat;
5310
5311 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
5312 return;
5313
5314 if (tcp_do_autorcvbuf == 1 &&
5315 tcp_cansbgrow(sb) &&
5316 /* Diff to tcp_sbrcv_grow_rwin */
5317 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5318 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5319 rcvbuf < tcp_autorcvbuf_max &&
5320 (sb->sb_idealsize > 0 &&
5321 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5322 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5323 }
5324 }
5325
5326 /* Similar to tcp_sbspace */
5327 int32_t
5328 mptcp_sbspace(struct mptcb *mp_tp)
5329 {
5330 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5331 uint32_t rcvbuf;
5332 int32_t space;
5333 int32_t pending = 0;
5334
5335 mpte_lock_assert_held(mp_tp->mpt_mpte);
5336
5337 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5338
5339 /* hiwat might have changed */
5340 rcvbuf = sb->sb_hiwat;
5341
5342 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5343 (sb->sb_mbmax - sb->sb_mbcnt)));
5344 if (space < 0)
5345 space = 0;
5346
5347 #if CONTENT_FILTER
5348 /* Compensate for data being processed by content filters */
5349 pending = cfil_sock_data_space(sb);
5350 #endif /* CONTENT_FILTER */
5351 if (pending > space)
5352 space = 0;
5353 else
5354 space -= pending;
5355
5356 return (space);
5357 }
5358
5359 /*
5360 * Support Fallback to Regular TCP
5361 */
5362 void
5363 mptcp_notify_mpready(struct socket *so)
5364 {
5365 struct tcpcb *tp = NULL;
5366
5367 if (so == NULL)
5368 return;
5369
5370 tp = intotcpcb(sotoinpcb(so));
5371
5372 if (tp == NULL)
5373 return;
5374
5375 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5376 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5377 struct tcpcb *, tp);
5378
5379 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5380 return;
5381
5382 if (tp->t_mpflags & TMPF_MPTCP_READY)
5383 return;
5384
5385 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5386 tp->t_mpflags |= TMPF_MPTCP_READY;
5387
5388 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5389 }
5390
5391 void
5392 mptcp_notify_mpfail(struct socket *so)
5393 {
5394 struct tcpcb *tp = NULL;
5395
5396 if (so == NULL)
5397 return;
5398
5399 tp = intotcpcb(sotoinpcb(so));
5400
5401 if (tp == NULL)
5402 return;
5403
5404 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5405 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5406 struct tcpcb *, tp);
5407
5408 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5409 return;
5410
5411 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5412 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5413
5414 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5415 }
5416
5417 /*
5418 * Keepalive helper function
5419 */
5420 boolean_t
5421 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5422 {
5423 boolean_t ret = 1;
5424 mpte_lock_assert_held(mp_tp->mpt_mpte);
5425
5426 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5427 ret = 0;
5428 }
5429 return (ret);
5430 }
5431
/*
 * MPTCP t_maxseg adjustment function
 *
 * Returns the number of bytes by which the subflow's MSS should be
 * lowered to leave room for the most common MPTCP option (DSS+ACK) on
 * data segments, or 0 if the tcpcb has no MPTCP session attached or no
 * adjustment applies.  'mtudisc' selects the MTU-discovery flavor of the
 * computation.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

	/*
	 * Option overhead: the DSS+ACK option size, plus 2 bytes either
	 * for the DSS checksum or — when checksums are off — for 32-bit
	 * alignment + EOL.  Both branches currently add the same amount.
	 */
#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
		mss_lower += 2; \
	else \
		/* adjust to 32-bit boundary + EOL */ \
		mss_lower += 2; \
}
	if (mp_tp == NULL)
		return (0);

	mpte_lock_assert_held(mp_tp->mpt_mpte);

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		/* Established first subflow (not a JOINed one) */
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		/* Additional subflow that has sent MP_JOIN */
		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return (mss_lower);
}
5477
5478 /*
5479 * Update the pid, upid, uuid of the subflow so, based on parent so
5480 */
5481 void
5482 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5483 {
5484 if (so->last_pid != mp_so->last_pid ||
5485 so->last_upid != mp_so->last_upid) {
5486 so->last_upid = mp_so->last_upid;
5487 so->last_pid = mp_so->last_pid;
5488 uuid_copy(so->last_uuid, mp_so->last_uuid);
5489 }
5490 so_update_policy(so);
5491 }
5492
5493 static void
5494 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5495 {
5496 struct inpcb *inp;
5497
5498 tcp_getconninfo(so, &flow->flow_ci);
5499 inp = sotoinpcb(so);
5500 #if INET6
5501 if ((inp->inp_vflag & INP_IPV6) != 0) {
5502 flow->flow_src.ss_family = AF_INET6;
5503 flow->flow_dst.ss_family = AF_INET6;
5504 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5505 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5506 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5507 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5508 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5509 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5510 } else
5511 #endif
5512 if ((inp->inp_vflag & INP_IPV4) != 0) {
5513 flow->flow_src.ss_family = AF_INET;
5514 flow->flow_dst.ss_family = AF_INET;
5515 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5516 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5517 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5518 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5519 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5520 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5521 }
5522 flow->flow_len = sizeof(*flow);
5523 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5524 flow->flow_flags = mpts->mpts_flags;
5525 flow->flow_cid = mpts->mpts_connid;
5526 flow->flow_relseq = mpts->mpts_rel_seq;
5527 flow->flow_soerror = mpts->mpts_socket->so_error;
5528 flow->flow_probecnt = mpts->mpts_probecnt;
5529 }
5530
/*
 * sysctl handler: dumps every MPTCP connection as one conninfo_mptcp_t
 * followed by one mptcp_flow_t per subflow.  Read-only — any attempt to
 * write is rejected with EPERM.  When called only for a size estimate
 * (oldptr == NULL) it returns a padded upper bound without walking the
 * PCB list.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL)
		return (EPERM);

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		/* Overestimate: +12.5% connections, 4 flows each */
		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n/8) * sizeof(mptcp_flow_t);
		return (0);
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		mpp_lock(mpp);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mpte_lock_assert_held(mpte);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		/* Snapshot connection-level state under the PCB lock */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
			if (flows == NULL) {
				mpp_unlock(mpp);
				break;
			}
			/*
			 * The header already embeds one mptcp_flow_t, so
			 * total length counts nflows - 1 extra flows, and
			 * the header copy is shortened by one flow (the
			 * flows are emitted separately below).
			 */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			mpp_unlock(mpp);
			/*
			 * NOTE(review): flows may still be NULL here (the
			 * numflows == 0 path) — relies on FREE(NULL) being
			 * a no-op; confirm against _FREE().
			 */
			FREE(flows, M_TEMP);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		mpp_unlock(mpp);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			FREE(flows, M_TEMP);
			if (error)
				break;
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return (error);
}
5626
/* sysctl net.inet.mptcp.pcblist — read-only dump served by mptcp_pcblist() */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
5630
5631 /*
5632 * Set notsent lowat mark on the MPTCB
5633 */
5634 int
5635 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5636 {
5637 struct mptcb *mp_tp = NULL;
5638 int error = 0;
5639
5640 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5641 mp_tp = mpte->mpte_mptcb;
5642
5643 if (mp_tp)
5644 mp_tp->mpt_notsent_lowat = optval;
5645 else
5646 error = EINVAL;
5647
5648 return (error);
5649 }
5650
5651 u_int32_t
5652 mptcp_get_notsent_lowat(struct mptses *mpte)
5653 {
5654 struct mptcb *mp_tp = NULL;
5655
5656 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5657 mp_tp = mpte->mpte_mptcb;
5658
5659 if (mp_tp)
5660 return (mp_tp->mpt_notsent_lowat);
5661 else
5662 return (0);
5663 }
5664
/*
 * Check whether the amount of unsent data is below the configured
 * notsent-lowat mark, i.e. whether a writability event should be
 * delivered to the application.  Returns 1 when the socket should be
 * considered writable, 0 otherwise.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return (0);
	}

	mpte = mptompte(mpp);
	mpte_lock_assert_held(mpte);
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/* Writable when empty, or when unsent data is at/below lowat */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		mptcplog((LOG_DEBUG, "MPTCP Sender: "
		    "lowat %d notsent %d actual %d \n",
		    mp_tp->mpt_notsent_lowat, notsent,
		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
		    MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
		return (1);
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 *
	 * Note: the loop below returns on the FIRST active subflow it
	 * finds; remaining subflows are not considered.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			/* Data not yet handed to this subflow's TCP */
			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			/* Sub-MSS residue with Nagle on: report writable */
			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
			    " nodelay false \n",
			    mp_tp->mpt_notsent_lowat, notsent),
			    MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
			return (retval);
		}
	}
	return (0);
}
5723
/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;	/* kernel-control handle, set by mptcp_control_register() */
static uint32_t mptcp_kern_skt_inuse = 0;	/* number of connected Symptoms clients (0 or 1 expected) */
static uint32_t mptcp_kern_skt_unit;		/* unit of the currently connected Symptoms client */
symptoms_advisory_t mptcp_advisory;		/* latest advisory state received from Symptoms */
5729
/*
 * Connect handler for the Symptoms kernel-control socket.  Records the
 * client's unit so mptcp_ask_symptoms() can reach it, and logs an error
 * if a client connects while another is already attached.
 */
static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	/* Only one concurrent Symptoms client is expected */
	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
		os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__);

	mptcp_kern_skt_unit = sac->sc_unit;

	return (0);
}
5743
/*
 * Symptoms granted the app with this UUID permission to use its
 * preferred interfaces: walk every MPTCP connection owned (or
 * delegated) by that app and re-evaluate its subflows with the
 * MPTE_ACCESS_GRANTED flag temporarily set.
 */
static void
mptcp_allow_uuid(uuid_t uuid)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		mpp_lock(mpp);

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		/*
		 * Match against the effective UUID for delegated sockets,
		 * else against the last owner's UUID (uuid_compare
		 * returns non-zero on mismatch).
		 */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid))
			goto next;
		else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid))
			goto next;

		/* Grant access only for the duration of the re-evaluation */
		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;

next:
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
5782
5783 static void
5784 mptcp_wifi_status_changed(void)
5785 {
5786 struct mppcb *mpp;
5787
5788 /* Iterate over all MPTCP connections */
5789
5790 lck_mtx_lock(&mtcbinfo.mppi_lock);
5791
5792 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5793 struct mptses *mpte;
5794 struct socket *mp_so;
5795
5796 mpp_lock(mpp);
5797
5798 mpte = mpp->mpp_pcbe;
5799 mp_so = mpp->mpp_socket;
5800
5801 /* Only handover-mode is purely driven by Symptom's Wi-Fi status */
5802 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
5803 goto next;
5804
5805 mptcp_check_subflows_and_add(mpte);
5806 mptcp_check_subflows_and_remove(mpte);
5807
5808 next:
5809 mpp_unlock(mpp);
5810 }
5811
5812 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5813 }
5814
/*
 * Ask the Symptoms daemon (via the kernel-control socket) whether the
 * app owning this MPTCP connection may use its preferred interfaces.
 * Sends an MPTCP_SYMPTOMS_ASK_UUID message carrying the app's UUID and
 * task priority; the daemon answers asynchronously through
 * mptcp_symptoms_ctl_send() (SYMPTOMS_ADVISORY_USEAPP).
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p;
	int pid, prio, err;

	/* No Symptoms client connected yet — nobody to ask */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__);
		return;
	}

	mp_so = mptetoso(mpte);

	/* For delegated sockets, ask about the effective (delegating) app */
	if (mp_so->so_flags & SOF_DELEGATED)
		pid = mp_so->e_pid;
	else
		pid = mp_so->last_pid;

	/* proc_find takes a reference; released via proc_rele below */
	p = proc_find(pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid);
		return;
	}

	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	if (mp_so->so_flags & SOF_DELEGATED)
		uuid_copy(ask.uuid, mp_so->e_uuid);
	else
		uuid_copy(ask.uuid, mp_so->last_uuid);

	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	else if (prio == TASK_FOREGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	else
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n",
	    __func__, pid, ask.priority, err);


	proc_rele(p);
}
5866
/*
 * Disconnect handler for the Symptoms kernel-control socket: drop the
 * in-use count taken in mptcp_symptoms_ctl_connect().
 */
static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return (0);
}
5877
5878 static errno_t
5879 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5880 mbuf_t m, int flags)
5881 {
5882 #pragma unused(kctlref, unitinfo, flags)
5883 symptoms_advisory_t *sa = NULL;
5884
5885 if (kcunit != mptcp_kern_skt_unit)
5886 os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n",
5887 __func__, kcunit, mptcp_kern_skt_unit);
5888
5889 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5890 mbuf_freem(m);
5891 return (EINVAL);
5892 }
5893
5894 if (mbuf_len(m) < sizeof(*sa)) {
5895 mbuf_freem(m);
5896 return (EINVAL);
5897 }
5898
5899 sa = mbuf_data(m);
5900
5901 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
5902 sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
5903 uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
5904
5905 mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
5906 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
5907 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5908
5909 if ((sa->sa_wifi_status &
5910 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5911 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
5912 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5913
5914 if (old_wifi_status != mptcp_advisory.sa_wifi_status)
5915 mptcp_wifi_status_changed();
5916 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
5917 mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
5918 mptcp_advisory.sa_wifi_status),
5919 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5920 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
5921 uuid_t uuid;
5922
5923 mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
5924 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5925
5926 uuid_copy(uuid, (unsigned char *)(sa + 1));
5927
5928 mptcp_allow_uuid(uuid);
5929 }
5930
5931 mbuf_freem(m);
5932 return (0);
5933 }
5934
5935 void
5936 mptcp_control_register(void)
5937 {
5938 /* Set up the advisory control socket */
5939 struct kern_ctl_reg mptcp_kern_ctl;
5940
5941 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5942 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5943 sizeof(mptcp_kern_ctl.ctl_name));
5944 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5945 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5946 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5947 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5948
5949 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5950 }
5951
5952 /*
5953 * Three return-values:
5954 * 1 : WiFi is bad
5955 * 0 : WiFi is good
5956 * -1 : WiFi-state is unknown, use subflow-only heuristics
5957 */
5958 int
5959 mptcp_is_wifi_unusable(struct mptses *mpte)
5960 {
5961 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
5962 if (mptcp_advisory.sa_wifi_status)
5963 return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
5964
5965 /*
5966 * If it's a first-party app and we don't have any info
5967 * about the Wi-Fi state, let's be pessimistic.
5968 */
5969 return (-1);
5970 }
5971
5972 return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
5973 }
5974
5975 boolean_t
5976 mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts)
5977 {
5978 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5979 int fail_thresh = mptcp_fail_thresh;
5980
5981 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
5982 fail_thresh *= 2;
5983
5984 return (tp->t_rxtshift >= fail_thresh &&
5985 (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq));
5986 }
5987
/* If TFO data is succesfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes in flight at MPTCP level vs. bytes the TCP SYN-data ack covered */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		/* Advance the subflow's initial send sequence past the acked SYN-data */
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Drop the acked bytes from the MPTCP-level send buffer */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %d mptcp len %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_connid, tcp_droplen, mp_droplen),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	}
}
6031
6032 int
6033 mptcp_freeq(struct mptcb *mp_tp)
6034 {
6035 struct tseg_qent *q;
6036 int rv = 0;
6037
6038 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6039 LIST_REMOVE(q, tqe_q);
6040 m_freem(q->tqe_m);
6041 zfree(tcp_reass_zone, q);
6042 rv = 1;
6043 }
6044 mp_tp->mpt_reassqlen = 0;
6045 return (rv);
6046 }
6047
6048 static int
6049 mptcp_post_event(u_int32_t event_code, int value)
6050 {
6051 struct kev_mptcp_data event_data;
6052 struct kev_msg ev_msg;
6053
6054 memset(&ev_msg, 0, sizeof(ev_msg));
6055
6056 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6057 ev_msg.kev_class = KEV_NETWORK_CLASS;
6058 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6059 ev_msg.event_code = event_code;
6060
6061 event_data.value = value;
6062
6063 ev_msg.dv[0].data_ptr = &event_data;
6064 ev_msg.dv[0].data_length = sizeof(event_data);
6065
6066 return kev_post_msg(&ev_msg);
6067 }
6068
6069 void
6070 mptcp_set_cellicon(struct mptses *mpte)
6071 {
6072 int error;
6073
6074 /* First-party apps (Siri) don't flip the cellicon */
6075 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
6076 return;
6077
6078 /* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
6079 mptcp_last_cellicon_set = tcp_now;
6080
6081 /* If cellicon is already set, get out of here! */
6082 if (OSTestAndSet(7, &mptcp_cellicon_is_set))
6083 return;
6084
6085 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
6086
6087 if (error)
6088 mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
6089 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
6090 else
6091 mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
6092 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
6093 }
6094
/*
 * Clear the status-bar cell icon, unless it was set again within the
 * last MPTCP_CELLICON_TOGGLE_RATE ticks (rate-limits icon flapping).
 */
void
mptcp_unset_cellicon(void)
{
	int error;

	/* If cellicon is already unset, get out of here! */
	if (OSTestAndClear(7, &mptcp_cellicon_is_set))
		return;

	/*
	 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
	 * explicitly set the cellicon (see mptcp_set_cellicon()), then we unset
	 * it again.
	 */
	if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
	    tcp_now)) {
		/* Too soon — re-arm the flag and keep the icon */
		OSTestAndSet(7, &mptcp_cellicon_is_set);
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);

	if (error)
		mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
		    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	else
		mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}
6124
6125 void
6126 mptcp_reset_rexmit_state(struct tcpcb *tp)
6127 {
6128 struct mptsub *mpts;
6129 struct inpcb *inp;
6130 struct socket *so;
6131
6132 inp = tp->t_inpcb;
6133 if (inp == NULL)
6134 return;
6135
6136 so = inp->inp_socket;
6137 if (so == NULL)
6138 return;
6139
6140 if (!(so->so_flags & SOF_MP_SUBFLOW))
6141 return;
6142
6143 mpts = tp->t_mpsub;
6144
6145 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6146 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6147 }
6148
6149 void
6150 mptcp_reset_keepalive(struct tcpcb *tp)
6151 {
6152 struct mptsub *mpts = tp->t_mpsub;
6153
6154 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6155 }
6156