/*
 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/locks.h>
#include <kern/policy_internal.h>
#include <kern/zalloc.h>

#include <mach/sdt.h>

#include <sys/domain.h>
#include <sys/kdebug.h>
#include <sys/kern_control.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <net/content_filter.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_timer.h>
#include <libkern/crypto/sha1.h>
#if INET6
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#endif /* INET6 */
#include <dev/random/randomdev.h>

/*
 * Notes on MPTCP implementation.
 *
 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
 * communication domain. The structure mtcbinfo describes the MPTCP instance
 * of a Multipath protocol in that domain. It is used to keep track of all
 * MPTCP PCB instances in the system, and is protected by the global lock
 * mppi_lock.
 *
 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
 * allocated from the same memory block, and each structure has a pointer
 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
 * PCB (mppcb) as well as the MPTCP Session (mptses).
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets. Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure. Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow. This gets decremented prior to the subflow's destruction.
 *
 * To handle events (read, write, control) from the subflows, we install
 * direct upcalls into the specific handler functions.
 *
 * The whole MPTCP connection is protected by a single lock, the MPTCP
 * socket's lock. Incoming data on a subflow also ends up taking this single
 * lock. To achieve the latter, tcp_lock/unlock has been changed to use the
 * MPTCP socket's lock instead.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector. This process will take place once all
 * of the subflows have been destroyed.
 */
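
/*
 * For orientation (illustrative only, not part of the kernel build): a
 * user-space client reaches the code in this file roughly as sketched
 * below. The sa_endpoints_t/connectx(2) details follow the public macOS
 * SDK and are an assumption here, as are the variable names.
 *
 *	int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	sa_endpoints_t eps = {
 *		.sae_dstaddr = (struct sockaddr *)&dst,	// destination address
 *		.sae_dstaddrlen = sizeof (dst),
 *	};
 *	sae_connid_t cid;
 *	// The first connectx(2) creates the session's initial subflow.
 *	int err = connectx(fd, &eps, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid);
 */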

static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);

static uint32_t mptcp_gc(struct mppcbinfo *);
static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
    struct uio *, struct mbuf **, struct mbuf **, int *);
static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
    struct uio *, struct mbuf *, struct mbuf *, int);
static void mptcp_subflow_rupcall(struct socket *, void *, int);
static void mptcp_subflow_input(struct mptses *, struct mptsub *);
static void mptcp_subflow_wupcall(struct socket *, void *, int);
static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);

static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);

/*
 * Possible return values for subflow event handlers. Note that success
 * values must be greater than or equal to MPTS_EVRET_OK. Values less than
 * that indicate errors or actions which require immediate attention; they
 * will prevent the rest of the handlers from processing their respective
 * events until the next round of event processing.
 */
typedef enum {
	MPTS_EVRET_DELETE		= 1,	/* delete this subflow */
	MPTS_EVRET_OK			= 2,	/* OK */
	MPTS_EVRET_CONNECT_PENDING	= 3,	/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,	/* abort all but preferred */
} ev_ret_t;

static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);

static const char *mptcp_evret2str(ev_ret_t);

static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_init_local_parms(struct mptses *);

static unsigned int mptsub_zone_size;		/* size of mptsub */
static struct zone *mptsub_zone;		/* zone for mptsub */

static unsigned int mptopt_zone_size;		/* size of mptopt */
static struct zone *mptopt_zone;		/* zone for mptopt */

static unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
static struct zone *mpt_subauth_zone;		/* zone of subf auth entry */

struct mppcbinfo mtcbinfo;

#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_dbg_area = 31;		/* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
    &mptcp_dbg_area, 0, "MPTCP debug area");

uint32_t mptcp_dbg_level = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_level, 0, "MPTCP debug level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");

static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
#if INET6
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;
#endif /* INET6 */

static uint8_t mptcp_create_subflows_scheduled;

typedef struct mptcp_subflow_event_entry {
	uint64_t	sofilt_hint_mask;
	ev_ret_t	(*sofilt_hint_ev_hdlr)(
			    struct mptses *mpte,
			    struct mptsub *mpts,
			    uint64_t *p_mpsofilt_hint,
			    uint64_t event);
} mptsub_ev_entry_t;

static uint8_t mptcp_cellicon_is_set;
static uint32_t mptcp_last_cellicon_set;
#define	MPTCP_CELLICON_TOGGLE_RATE	(5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */

/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};

os_log_t mptcp_log_handle;

/*
 * Protocol pr_init callback.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone\n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	mptcp_last_cellicon_set = tcp_now;

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}

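/*
 * Map a subflow to its slot in the per-interface stats array: return the
 * index of the entry matching the subflow's last outbound interface,
 * claiming a free slot (ifindex == IFSCOPE_NONE) for it if needed, or -1
 * if the subflow has no interface or the array is full.
 */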
int
mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
{
	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
	int i, index = -1;

	if (ifp == NULL) {
		mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		return (-1);
	}

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (stats[i].ifindex == IFSCOPE_NONE) {
			if (index < 0)
				index = i;
			continue;
		}

		if (stats[i].ifindex == ifp->if_index) {
			index = i;
			return (index);
		}
	}

	if (index != -1) {
		stats[index].ifindex = ifp->if_index;
		if (stats[index].is_expensive == 0)
			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
	}

	return (index);
}

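/*
 * Account for a scheduler switch to the given subflow, both in the global
 * TCP stats and in the subflow's per-interface stats slot.
 */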
void
mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
{
	int index;

	tcpstat.tcps_mp_switches++;
	mpte->mpte_subflow_switches++;

	index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);

	if (index != -1)
		mpte->mpte_itfstats[index].switches++;
}

/*
 * Flushes all recorded socket options from an MP socket.
 */
static void
mptcp_flush_sopts(struct mptses *mpte)
{
	struct mptopt *mpo, *tmpo;

	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		mptcp_sopt_remove(mpte, mpo);
		mptcp_sopt_free(mpo);
	}
	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}

/*
 * Create an MPTCP session, called as a result of opening an MPTCP socket.
 */
int
mptcp_sescreate(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	if (mptcp_alternate_port)
		mpte->mpte_alternate_port = htons(mptcp_alternate_port);

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return (0);
}

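/*
 * Sum up the session's traffic counters: *allbytes gets the total tx+rx
 * bytes across all interfaces, *cellbytes the share carried over expensive
 * (cellular) interfaces. If the initial subflow ran over cell, its initial
 * bytes are excluded from the cellular share.
 */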
static void
mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
    uint64_t *cellbytes, uint64_t *allbytes)
{
	int64_t mycellbytes = 0;
	uint64_t myallbytes = 0;
	int i;

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (mpte->mpte_itfstats[i].is_expensive) {
			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
		}

		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
	}

	if (initial_cell) {
		mycellbytes -= mpte->mpte_init_txbytes;
		mycellbytes -= mpte->mpte_init_rxbytes;
	}

	if (mycellbytes < 0) {
		mptcplog((LOG_ERR, "%s cellbytes is %lld\n", __func__, mycellbytes),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		*cellbytes = 0;
		*allbytes = 0;
	} else {
		*cellbytes = mycellbytes;
		*allbytes = myallbytes;
	}
}

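/*
 * Record the session's final statistics before it is destroyed:
 * attempt/success counters per service type (handover, interactive,
 * aggregate), transitions between Wi-Fi and cell, and cell vs. total
 * byte counts.
 */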
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success)
				tcpstat.tcps_mptcp_fp_aggregate_success++;
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
		tcpstat.tcps_mptcp_back_to_wifi++;
}

/*
 * Destroy an MPTCP session.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	mptcpstats_session_wrapup(mpte);

	mptcp_unset_cellicon();

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
		_FREE(mpte->mpte_itfinfo, M_TEMP);

	mpte->mpte_itfinfo = NULL;

	m_freem_list(mpte->mpte_reinjectq);

	/*
	 * MPTCP Protocol Control Block section
	 */
	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}

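/*
 * New subflows may only be created while the MP connection is established,
 * not yet closing, and has not fallen back to plain TCP.
 */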
static boolean_t
mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
{
	return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
	    mp_tp->mpt_state < MPTCPS_TIME_WAIT &&
	    !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
}

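/*
 * Embed an IPv4 destination into an IPv6 NAT64 prefix of the given length,
 * following the byte layouts of RFC 6052. Returns -1 for IPv4 addresses
 * that must not be synthesized (zeronet, loopback, link-local, multicast,
 * etc., and, under the well-known prefix, private/shared address space).
 *
 * Worked example (assuming the 96-bit well-known prefix 64:ff9b::/96):
 * 203.0.113.5 is 0xcb 0x00 0x71 0x05, placed in bytes 12-15 of the
 * address, yielding 64:ff9b::cb00:7105.
 */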
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00},
	};
	char buf[MAX_IPv6_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	char *ptr = (char *)addr;

	if (IN_ZERONET(addrv4->s_addr) ||	// 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(addrv4->s_addr) ||	// 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(addrv4->s_addr) ||	// 169.254.0.0/16 Link Local
	    IN_DS_LITE(addrv4->s_addr) ||	// 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(addrv4->s_addr) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(addrv4->s_addr) ||	// 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return (-1);
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(addrv4->s_addr) ||	// 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(addrv4->s_addr)) // 100.64.0.0/10 Shared Address Space
			return (-1);
	}

	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u\n", len);
	}

	os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
	    __func__, len,
	    inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));

	return (0);
}

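/*
 * Walk the session's interface list and create a subflow on every usable
 * interface that does not have a live one yet. In handover mode, a
 * cellular subflow is only added once Wi-Fi looks unusable; towards an
 * IPv4 destination on a v6-only interface, a NAT64 address is synthesized
 * first (see mptcp_synthesize_nat64 above).
 */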
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp))
		return;

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info;
		struct mptsub *mpts;
		uint32_t ifindex;
		int found = 0;

		info = &mpte->mpte_itfinfo[i];

		if (info->no_mptcp_support)
			continue;

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE)
			continue;

		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

			if (ifp == NULL)
				continue;

			if (ifp->if_index == ifindex &&
			    !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
			    sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = 1;
				break;
			}

			/*
			 * In Handover mode, only create cell subflow if
			 * 1. Wi-Fi Assist is active
			 * 2. Symptoms marked WiFi as weak
			 * 3. We are experiencing RTOs or we are not sending data.
			 *
			 * This covers the scenario where:
			 * 1. We send and get retransmission timeouts (thus,
			 *    we confirmed that WiFi is indeed bad).
			 * 2. We are not sending and the server tries to send.
			 *    Establishing a cell-subflow gives the server a
			 *    chance to send us some data over cell if WiFi
			 *    is dead. We establish the subflow with the
			 *    backup-bit set, so the server is not allowed to
			 *    send on this subflow as long as WiFi is providing
			 *    good performance.
			 */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
			    !IFNET_IS_CELLULAR(ifp) &&
			    !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
			    (!mptcp_is_wifi_unusable() ||
			    (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh &&
			    mptetoso(mpte)->so_snd.sb_cc))) {
				mptcplog((LOG_DEBUG, "%s handover, wifi state %u rxt %u ifindex %u this %u\n",
				    __func__, mptcp_is_wifi_unusable(), sototcpcb(mpts->mpts_socket)->t_rxtshift, ifindex,
				    ifp->if_index),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
				found = 1;
				break;
			}
		}

		if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		if (!found) {
			struct sockaddr *dst = &mpte->mpte_dst;
			struct sockaddr_in6 nat64pre;

			if (mpte->mpte_dst.sa_family == AF_INET &&
			    !info->has_v4_conn && info->has_v6_conn) {
				struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
				struct ifnet *ifp;
				int error, j;

				bzero(&nat64pre, sizeof(struct sockaddr_in6));

				ifnet_head_lock_shared();
				ifp = ifindex2ifnet[ifindex];
				ifnet_head_done();

				error = ifnet_get_nat64prefix(ifp, nat64prefixes);
				if (error) {
					mptcplog((LOG_ERR, "%s: no NAT64-prefix on itf %s, error %d\n",
					    __func__, ifp->if_name, error),
					    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
					continue;
				}

				for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
					if (nat64prefixes[j].prefix_len != 0)
						break;
				}

				VERIFY(j < NAT64_MAX_NUM_PREFIXES);

				error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
				    nat64prefixes[j].prefix_len,
				    &mpte->__mpte_dst_v4.sin_addr);
				if (error != 0) {
					mptcplog((LOG_INFO, "%s: cannot synthesize this addr\n", __func__),
					    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
					continue;
				}

				memcpy(&nat64pre.sin6_addr,
				    &nat64prefixes[j].ipv6_prefix,
				    sizeof(nat64pre.sin6_addr));
				nat64pre.sin6_len = sizeof(struct sockaddr_in6);
				nat64pre.sin6_family = AF_INET6;
				nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
				nat64pre.sin6_flowinfo = 0;
				nat64pre.sin6_scope_id = 0;

				dst = (struct sockaddr *)&nat64pre;
			}

			/* Initial subflow started on a NAT64'd address? */
			if (mpte->mpte_dst.sa_family == AF_INET6 &&
			    mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
				dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
			}

			if (dst->sa_family == AF_INET && !info->has_v4_conn)
				continue;
			if (dst->sa_family == AF_INET6 && !info->has_v6_conn)
				continue;

			mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
		}
	}
}

/*
 * Based on the MPTCP Service-type and the state of the subflows, we
 * will destroy subflows here.
 */
static void
mptcp_check_subflows_and_remove(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;
	int found_working_subflow = 0, removed_some = 0;
	int wifi_unusable = mptcp_is_wifi_unusable();

	if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
		return;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
			continue;

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED)
			continue;

		/* Either this subflow is in good condition while we try to send */
		if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc)
			found_working_subflow = 1;

		/* Or WiFi is fine */
		if (!wifi_unusable)
			found_working_subflow = 1;
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow)
		return;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Only remove cellular subflows */
		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp))
			continue;

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
		removed_some = 1;
	}

	if (removed_some)
		mptcp_unset_cellicon();
}

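/*
 * Tear down every subflow that was flagged MPTSF_CLOSE_REQD (e.g., by the
 * NECP callback when its flow became non-viable) by posting a NOSRCADDR
 * event on its socket.
 */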
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}

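/*
 * Deferred-work callback, scheduled via mptcp_sched_create_subflows():
 * walk all MPTCP PCBs flagged MPP_CREATE_SUBFLOWS and add/remove subflows
 * on each of them.
 */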
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
		mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
			continue;

		mpp_lock(mpp);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		VERIFY(mp_so->so_usecount > 0);

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}

/*
 * We need this because we are coming from an NECP-event. This event gets
 * posted while holding NECP-locks. The creation of the subflow, however,
 * leads us back into NECP (e.g., to add the necp_cb and also from
 * tcp_connect). So, we would deadlock there as we already hold the
 * NECP-lock.
 *
 * So, let's schedule this separately. It also gives NECP the chance to make
 * progress without having to wait for MPTCP to finish its subflow creation.
 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
		    __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
		return;
	}

	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled))
		return;

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz/10);
}

/*
 * Allocate an MPTCP socket option structure.
 */
struct mptopt *
mptcp_sopt_alloc(int how)
{
	struct mptopt *mpo;

	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
	    zalloc_noblock(mptopt_zone);
	if (mpo != NULL) {
		bzero(mpo, mptopt_zone_size);
	}

	return (mpo);
}

/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}

/*
 * Add a socket option to the MPTCP socket option list.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Remove a socket option from the MPTCP socket option list.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Search for an existing <sopt_level,sopt_name> socket option.
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
		if (mpo->mpo_level == sopt->sopt_level &&
		    mpo->mpo_name == sopt->sopt_name)
			break;
	}
	return (mpo);
}

/*
 * Allocate an MPTCP subflow structure.
 */
static struct mptsub *
mptcp_subflow_alloc(void)
{
	struct mptsub *mpts = zalloc(mptsub_zone);

	if (mpts == NULL)
		return (NULL);

	bzero(mpts, mptsub_zone_size);
	return (mpts);
}

/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released. This implies that the subflow has been deleted.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	if (mpts->mpts_src != NULL) {
		FREE(mpts->mpts_src, M_SONAME);
		mpts->mpts_src = NULL;
	}

	zfree(mptsub_zone, mpts);
}

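/*
 * Subflow reference counting: mptcp_subflow_addref() takes a reference and
 * panics on wraparound; mptcp_subflow_remref() drops one and frees the
 * subflow when the last reference goes away.
 */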
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0)
		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
		/* NOTREACHED */
}

static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0)
		return;

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}

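/*
 * Link a freshly created subflow socket into the session: mark it as an
 * MPTCP subflow, bump the MP socket's use count, insert it into the
 * session's subflow list and take the two references noted below.
 */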
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);	/* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);	/* for subflow socket */
}

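/*
 * NECP callback, invoked when a subflow's flow becomes non-viable: flag
 * the subflow for removal and schedule the creation of replacements. In
 * handover mode the flow is reported viable again so that the session can
 * keep using it until a replacement is up.
 */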
static void
mptcp_subflow_necp_cb(void *handle, int action,
    struct necp_client_flow *flow)
{
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	if (action != NECP_CLIENT_CBACTION_NONVIABLE)
		return;

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (so->so_usecount == 0)
		return;

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (so->so_usecount == 0)
		goto out;

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	mptcplog((LOG_DEBUG, "%s: Subflow became non-viable", __func__),
	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);

	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
		flow->viable = 1;

out:
	socket_unlock(so, 1);
}

/*
 * Create an MPTCP subflow socket.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	int error;

	*so = NULL;
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mp_so = mptetoso(mpte);

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		return (ESRCH);
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	mpte_unlock(mpte);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_ASYNC, PROC_NULL);
	mpte_lock(mpte);
	if (error) {
		mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return (error);
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we call
		 * necp_client_register_socket_flow multiple times. This is
		 * problematic, because now the lock-ordering guarantee (first
		 * necp-locks, then socket-locks) is no longer respected. So,
		 * we need to unlock here.
		 */
		mpte_unlock(mpte);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
		mpte_lock(mpte);

		if (error)
			goto out_err;

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
			goto out_err;

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
	} else {
		mptcplog((LOG_NOTICE, "%s: uuid is not set!\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	/* enable keepalive */
	smpo.mpo_name = SO_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = mptcp_subflow_keeptime;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
			goto out_err;
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
			continue;

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE))
			continue;

		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
			    " sopt %s val %d interim record removed\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function. We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
#if INET6
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return (0);

out_err:
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
	    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	return (error);
}

/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 */
static void
mptcp_subflow_soclose(struct mptsub *mpts)
{
	struct socket *so = mpts->mpts_socket;

	if (mpts->mpts_flags & MPTSF_CLOSED)
		return;

	VERIFY(so != NULL);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	mpts->mpts_flags |= MPTSF_CLOSED;

	if (so->so_retaincnt == 0) {
		soclose_locked(so);

		return;
	} else {
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
	}

	return;
}

/*
 * Connect an MPTCP subflow socket.
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof (dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof (dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log_info(mptcp_log_handle,
	    "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
	    dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		return (ESRCH);
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error)
		mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
		    __func__, error, mpts->mpts_ifscope),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	return (error);
}
1614
1615 /*
1616 * MPTCP subflow socket receive routine, derived from soreceive().
1617 */
1618 static int
1619 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1620 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1621 {
1622 #pragma unused(uio)
1623 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1624 int flags, error = 0;
1625 struct proc *p = current_proc();
1626 struct mbuf *m, **mp = mp0;
1627 boolean_t proc_held = FALSE;
1628
1629 mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
1630 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1631
1632 #ifdef MORE_LOCKING_DEBUG
1633 if (so->so_usecount == 1) {
1634 panic("%s: so=%x no other reference on socket\n", __func__, so);
1635 /* NOTREACHED */
1636 }
1637 #endif
1638 /*
1639 * We return all that is there in the subflow's socket receive buffer
1640 * to the MPTCP layer, so we require that the caller passes in the
1641 * expected parameters.
1642 */
1643 if (mp == NULL || controlp != NULL)
1644 return (EINVAL);
1645
1646 *mp = NULL;
1647 if (psa != NULL)
1648 *psa = NULL;
1649 if (flagsp != NULL)
1650 flags = *flagsp &~ MSG_EOR;
1651 else
1652 flags = 0;
1653
1654 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
1655 return (EOPNOTSUPP);
1656
1657 flags |= (MSG_DONTWAIT|MSG_NBIO);
1658
1659 /*
1660 * If a recv attempt is made on a previously-accepted socket
1661 * that has been marked as inactive (disconnected), reject
1662 * the request.
1663 */
1664 if (so->so_flags & SOF_DEFUNCT) {
1665 struct sockbuf *sb = &so->so_rcv;
1666
1667 error = ENOTCONN;
1668 /*
1669 * This socket should have been disconnected and flushed
1670 * prior to being returned from sodefunct(); there should
1671 * be no data on its receive list, so panic otherwise.
1672 */
1673 if (so->so_state & SS_DEFUNCT)
1674 sb_empty_assert(sb, __func__);
1675 return (error);
1676 }
1677
1678 /*
1679 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1680 * and if so just return to the caller. This could happen when
1681 * soreceive() is called by a socket upcall function during the
1682 * time the socket is freed. The socket buffer would have been
1683 * locked across the upcall, therefore we cannot put this thread
1684 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1685 * we may livelock), because the lock on the socket buffer will
1686 * only be released when the upcall routine returns to its caller.
1687 * Because the socket has been officially closed, there can be
1688 * no further read on it.
1689 *
1690 * A multipath subflow socket would have its SS_NOFDREF set by
1691 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1692 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1693 */
1694 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1695 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
1696 return (0);
1697
1698 /*
1699 * For consistency with soreceive() semantics, we need to obey
1700 * SB_LOCK in case some other code path has locked the buffer.
1701 */
1702 error = sblock(&so->so_rcv, 0);
1703 if (error != 0)
1704 return (error);
1705
1706 m = so->so_rcv.sb_mb;
1707 if (m == NULL) {
1708 /*
1709 * Panic if we notice inconsistencies in the socket's
1710 * receive list; both sb_mb and sb_cc should correctly
1711 * reflect the contents of the list, otherwise we may
1712 * end up with false positives during select() or poll()
1713 * which could put the application in a bad state.
1714 */
1715 SB_MB_CHECK(&so->so_rcv);
1716
1717 if (so->so_error != 0) {
1718 error = so->so_error;
1719 so->so_error = 0;
1720 goto release;
1721 }
1722
1723 if (so->so_state & SS_CANTRCVMORE) {
1724 goto release;
1725 }
1726
1727 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1728 error = ENOTCONN;
1729 goto release;
1730 }
1731
1732 /*
1733 * MSG_DONTWAIT is implicitly defined and this routine will
1734 * never block, so return EWOULDBLOCK when there is nothing.
1735 */
1736 error = EWOULDBLOCK;
1737 goto release;
1738 }
1739
1740 mptcp_update_last_owner(so, mp_so);
1741
1742 if (mp_so->last_pid != proc_pid(p)) {
1743 p = proc_find(mp_so->last_pid);
1744 if (p == PROC_NULL) {
1745 p = current_proc();
1746 } else {
1747 proc_held = TRUE;
1748 }
1749 }
1750
1751 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1752 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1753 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1754
1755 while (m != NULL) {
1756 int dlen = 0, dfin = 0, error_out = 0;
1757 struct mbuf *start = m;
1758 uint64_t dsn;
1759 uint32_t sseq;
1760 uint16_t orig_dlen;
1761 uint16_t csum;
1762
1763 VERIFY(m->m_nextpkt == NULL);
1764
1765 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1766 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
1767 dsn = m->m_pkthdr.mp_dsn;
1768 sseq = m->m_pkthdr.mp_rseq;
1769 csum = m->m_pkthdr.mp_csum;
1770 } else {
1771 /* We did fallback */
1772 mptcp_adj_rmap(so, m, 0, 0, 0, 0);
1773
1774 sbfree(&so->so_rcv, m);
1775
1776 if (mp != NULL) {
1777 *mp = m;
1778 mp = &m->m_next;
1779 so->so_rcv.sb_mb = m = m->m_next;
1780 *mp = NULL;
1781
1782 }
1783
1784 if (m != NULL) {
1785 so->so_rcv.sb_lastrecord = m;
1786 } else {
1787 SB_EMPTY_FIXUP(&so->so_rcv);
1788 }
1789
1790 continue;
1791 }
1792
1793 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)
1794 dfin = 1;
1795
1796 /*
1797 * Check if the full mapping is now present
1798 */
1799 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
1800 mptcplog((LOG_INFO, "%s not enough data (%u) need %u\n",
1801 __func__, so->so_rcv.sb_cc, dlen),
1802 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1803
1804 if (*mp0 == NULL)
1805 error = EWOULDBLOCK;
1806 goto release;
1807 }
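/*
 * Note: dfin is subtracted above because a DATA_FIN consumes one
 * byte of data-level sequence space without carrying payload, so a
 * mapping of dlen bytes ending in a DATA_FIN needs only dlen - 1
 * bytes of actual data in the receive buffer.
 */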
1808
1809 /* Now, get the full mapping */
1810 while (dlen > 0) {
1811 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
1812 error_out = 1;
1813 error = EIO;
1814 dlen = 0;
1815 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1816 break;
1817 }
1818
1819 dlen -= m->m_len;
1820 sbfree(&so->so_rcv, m);
1821
1822 if (mp != NULL) {
1823 *mp = m;
1824 mp = &m->m_next;
1825 so->so_rcv.sb_mb = m = m->m_next;
1826 *mp = NULL;
1827 }
1828
1829 if (dlen - dfin == 0)
1830 dlen = 0;
1831
1832 VERIFY(dlen <= 0 || m);
1833 }
1834
1835 VERIFY(dlen == 0);
1836
1837 if (m != NULL) {
1838 so->so_rcv.sb_lastrecord = m;
1839 } else {
1840 SB_EMPTY_FIXUP(&so->so_rcv);
1841 }
1842
1843 if (error_out)
1844 goto release;
1845
1846
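/*
 * The DSS checksum is computed over the entire mapping, so it can
 * only be validated now that the full mapping has been pulled off
 * the receive queue above.
 */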
1847 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
1848 error = EIO;
1849 *mp0 = NULL;
1850 goto release;
1851 }
1852
1853 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1854 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1855 }
1856
1857 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1858 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1859
1860 if (flagsp != NULL)
1861 *flagsp |= flags;
1862
1863 release:
1864 sbunlock(&so->so_rcv, TRUE);
1865
1866 if (proc_held)
1867 proc_rele(p);
1868
1869 return (error);
1870
1871 }
1872
1873 /*
1874 * MPTCP subflow socket send routine, derived from sosend().
1875 */
1876 static int
1877 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1878 struct mbuf *top, struct mbuf *control, int flags)
1879 {
1880 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1881 struct proc *p = current_proc();
1882 boolean_t en_tracing = FALSE, proc_held = FALSE;
1883 int en_tracing_val;
1884 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
1885 int error;
1886
1887 VERIFY(control == NULL);
1888 VERIFY(addr == NULL);
1889 VERIFY(uio == NULL);
1890 VERIFY(flags == 0);
1891 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
1892
1893 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
1894 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
1895
1896 /*
1897 * Trace if tracing is enabled, this is a network (vs. unix)
1898 * socket, and it is non-loopback.
1899 */
1900 if (ENTR_SHOULDTRACE &&
1901 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1902 struct inpcb *inp = sotoinpcb(so);
1903 if (inp->inp_last_outifp != NULL &&
1904 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1905 en_tracing = TRUE;
1906 en_tracing_val = top->m_pkthdr.len;
1907 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1908 VM_KERNEL_ADDRPERM(so),
1909 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1910 (int64_t)en_tracing_val);
1911 }
1912 }
1913
1914 mptcp_update_last_owner(so, mp_so);
1915
1916 if (mp_so->last_pid != proc_pid(p)) {
1917 p = proc_find(mp_so->last_pid);
1918 if (p == PROC_NULL) {
1919 p = current_proc();
1920 } else {
1921 proc_held = TRUE;
1922 }
1923 }
1924
1925 #if NECP
1926 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
1927 #endif /* NECP */
1928
1929 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1930
1931 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
1932 if (error)
1933 goto out;
1934
1935 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
1936 top = NULL;
1937
1938 out:
1939 if (top != NULL)
1940 m_freem(top);
1941
1942 if (proc_held)
1943 proc_rele(p);
1944
1945 soclearfastopen(so);
1946
1947 if (en_tracing) {
1948 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
1949 VM_KERNEL_ADDRPERM(so),
1950 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
1951 (int64_t)en_tracing_val);
1952 }
1953
1954 return (error);
1955
1956 }
1957
1958 /*
1959 * Establish an initial MPTCP connection (if first subflow and not yet
1960 * connected), or add a subflow to an existing MPTCP connection.
1961 */
1962 int
1963 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
1964 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
1965 {
1966 struct socket *mp_so, *so = NULL;
1967 struct mptcb *mp_tp;
1968 struct mptsub *mpts = NULL;
1969 int af, error = 0;
1970
1971 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1972 mp_so = mptetoso(mpte);
1973 mp_tp = mpte->mpte_mptcb;
1974
1975 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1976 /* If the remote end sends Data FIN, refuse subflow adds */
1977 mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
1978 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1979 error = ENOTCONN;
1980 goto out_err;
1981 }
1982
1983 mpts = mptcp_subflow_alloc();
1984 if (mpts == NULL) {
1985 mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
1986 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1987 error = ENOMEM;
1988 goto out_err;
1989 }
1990
1991 if (src != NULL) {
1992 int len = src->sa_len;
1993
1994 MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
1995 M_WAITOK | M_ZERO);
1996 if (mpts->mpts_src == NULL) {
1997 mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
1998 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1999 error = ENOMEM;
2000 goto out_err;
2001 }
2002 bcopy(src, mpts->mpts_src, len);
2003 }
2004
2005 memcpy(&mpts->mpts_dst, dst, dst->sa_len);
2006
2007 af = mpts->mpts_dst.sa_family;
2008
2009 mpts->mpts_ifscope = ifscope;
2010
2011 /* create the subflow socket */
2012 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
2013 /*
2014 * Return (error) without cleaning up, because all we have
2015 * done up to this point is allocate mpts.
2016 *
2017 * By contract, the call to mptcp_subflow_socreate() takes
2018 * over ownership of mpts.
2019 */
2020 return (error);
2021
2022 /*
2023 * We may be called from within the kernel. The traffic still needs
2024 * to be accounted to the real application.
2025 */
2026 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2027
2028 /*
2029 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2030 * -1 (SAE_CONNID_ALL).
2031 */
2032 mpte->mpte_connid_last++;
2033 while (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2034 mpte->mpte_connid_last == SAE_CONNID_ANY)
2035 mpte->mpte_connid_last++;
2036
2037 mpts->mpts_connid = mpte->mpte_connid_last;
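/*
 * Illustrative (hypothetical) sequence: connids run 1, 2, ...,
 * 0xfffffffe, then skip 0xffffffff (SAE_CONNID_ALL) and 0
 * (SAE_CONNID_ANY) and continue at 1.
 */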
2038
2039 mpts->mpts_rel_seq = 1;
2040
2041 /* Allocate a unique address id per subflow */
2042 mpte->mpte_addrid_last++;
2043 if (mpte->mpte_addrid_last == 0)
2044 mpte->mpte_addrid_last++;
2045
2046 /* register for subflow socket read/write events */
2047 sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);
2048
2049 /* Register for subflow socket control events */
2050 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2051 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2052 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2053 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2054 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2055 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2056 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2057 SO_FILT_HINT_ADAPTIVE_WTIMO);
2058
2059 /* sanity check */
2060 VERIFY(!(mpts->mpts_flags &
2061 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
2062
2063 /*
2064 * Indicate to the TCP subflow whether or not it should establish
2065 * the initial MPTCP connection, or join an existing one. Fill
2066 * in the connection request structure with additional info needed
2067 * by the underlying TCP (to be used in the TCP options, etc.)
2068 */
2069 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2070 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2071
2072 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2073 mptcp_init_local_parms(mpte);
2074 }
2075 soisconnecting(mp_so);
2076
2077 /* If fastopen is requested, set state in mpts */
2078 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2079 mpts->mpts_flags |= MPTSF_TFO_REQD;
2080 } else {
2081 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
2082 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2083 }
2084
2085 mpts->mpts_flags |= MPTSF_CONNECTING;
2086
2087 if (af == AF_INET || af == AF_INET6) {
2088 char dbuf[MAX_IPv6_STR_LEN];
2089
2090 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
2091 "mp_so 0x%llx dst %s[%d] cid %d "
2092 "[pending %s]\n", __func__,
2093 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2094 inet_ntop(af, ((af == AF_INET) ?
2095 (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
2096 (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
2097 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
2098 ntohs(SIN(&mpts->mpts_dst)->sin_port) :
2099 ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
2100 mpts->mpts_connid,
2101 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
2102 "YES" : "NO")),
2103 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2104 }
2105
2106 /* connect right away if first attempt, or if join can be done now */
2107 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
2108 error = mptcp_subflow_soconnectx(mpte, mpts);
2109
2110 if (error)
2111 goto out_err_close;
2112
2113 if (pcid)
2114 *pcid = mpts->mpts_connid;
2115
2116 return (0);
2117
2118 out_err_close:
2119 mptcp_subflow_abort(mpts, error);
2120
2121 return (error);
2122
2123 out_err:
2124 if (mpts)
2125 mptcp_subflow_free(mpts);
2126
2127 return (error);
2128 }
2129
2130 void
2131 mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
2132 {
2133 int index = mptcp_get_statsindex(stats, mpts);
2134
2135 if (index != -1) {
2136 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2137
2138 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2139 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2140 }
2141 }
2142
2143 /*
2144 * Delete/remove a subflow from an MPTCP session. The underlying subflow
2145 * socket will no longer be accessible after the subflow is deleted, so
2146 * this should occur only after the subflow socket has been disconnected.
2147 */
2148 void
2149 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2150 {
2151 struct socket *mp_so = mptetoso(mpte);
2152 struct socket *so = mpts->mpts_socket;
2153 struct tcpcb *tp = sototcpcb(so);
2154
2155 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2156 VERIFY(mpts->mpts_mpte == mpte);
2157 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2158 VERIFY(mpte->mpte_numflows != 0);
2159 VERIFY(mp_so->so_usecount > 0);
2160
2161 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
2162 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2163 mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
2164 mpts->mpts_flags, mp_so->so_error),
2165 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2166
2167 mptcpstats_update(mpte->mpte_itfstats, mpts);
2168 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2169 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2170
2171 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2172 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2173 mpte->mpte_numflows--;
2174 if (mpte->mpte_active_sub == mpts)
2175 mpte->mpte_active_sub = NULL;
2176
2177 /*
2178 * Drop references held by this subflow socket; there
2179 * will be no further upcalls made from this point.
2180 */
2181 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2182 sock_catchevents_locked(so, NULL, NULL, 0);
2183
2184 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2185
2186 mp_so->so_usecount--; /* for subflow socket */
2187 mpts->mpts_mpte = NULL;
2188 mpts->mpts_socket = NULL;
2189
2190 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2191 mptcp_subflow_remref(mpts); /* for subflow socket */
2192
2193 so->so_flags &= ~SOF_MP_SUBFLOW;
2194 tp->t_mptcb = NULL;
2195 tp->t_mpsub = NULL;
2196 }
2197
2198 void
2199 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2200 {
2201 struct socket *so = mpts->mpts_socket;
2202 struct mptcb *mp_tp = mpte->mpte_mptcb;
2203 int send_dfin = 0;
2204
2205 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2206 send_dfin = 1;
2207
2208 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2209 (so->so_state & SS_ISCONNECTED)) {
2210 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2211 __func__, mpts->mpts_connid, send_dfin),
2212 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2213
2214 if (send_dfin)
2215 mptcp_send_dfin(so);
2216 soshutdownlock(so, SHUT_WR);
2217 }
2218
2219 }
2220
2221 static void
2222 mptcp_subflow_abort(struct mptsub *mpts, int error)
2223 {
2224 struct socket *so = mpts->mpts_socket;
2225 struct tcpcb *tp = sototcpcb(so);
2226
2227 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2228 return;
2229
2230 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2231 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2232
2233 if (tp->t_state != TCPS_CLOSED)
2234 tcp_drop(tp, error);
2235
2236 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2237 }
2238
2239 /*
2240 * Disconnect a subflow socket.
2241 */
2242 void
2243 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2244 {
2245 struct socket *so;
2246 struct mptcb *mp_tp;
2247 int send_dfin = 0;
2248
2249 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2250
2251 VERIFY(mpts->mpts_mpte == mpte);
2252 VERIFY(mpts->mpts_socket != NULL);
2253
2254 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
2255 return;
2256
2257 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2258
2259 so = mpts->mpts_socket;
2260 mp_tp = mpte->mpte_mptcb;
2261 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2262 send_dfin = 1;
2263
2264 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2265 (so->so_state & SS_ISCONNECTED)) {
2266 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2267 __func__, mpts->mpts_connid, send_dfin),
2268 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2269
2270 if (send_dfin)
2271 mptcp_send_dfin(so);
2272 (void) soshutdownlock(so, SHUT_RD);
2273 (void) soshutdownlock(so, SHUT_WR);
2274 (void) sodisconnectlocked(so);
2275 }
2276 /*
2277 * Generate a disconnect event for this subflow socket, in case
2278 * the lower layer doesn't do it; this is needed because the
2279 * subflow socket deletion relies on it.
2280 */
2281 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2282 }
2283
2284 /*
2285 * Called when the associated subflow socket posted a read event.
2286 */
2287 static void
2288 mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
2289 {
2290 #pragma unused(so, waitf)
2291 struct mptsub *mpts = arg, *tmpts;
2292 struct mptses *mpte = mpts->mpts_mpte;
2293
2294 VERIFY(mpte != NULL);
2295
2296 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2297 if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
2298 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2299 return;
2300 }
2301
2302 mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
2303 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2304 if (mpts->mpts_socket->so_usecount == 0) {
2305 /* Will be removed soon by tcp_garbage_collect */
2306 continue;
2307 }
2308
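/*
 * Hold an mpts reference and a socket use-count across
 * mptcp_subflow_input(), so that neither the subflow nor its
 * socket can go away underneath us.
 */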
2309 mptcp_subflow_addref(mpts);
2310 mpts->mpts_socket->so_usecount++;
2311
2312 mptcp_subflow_input(mpte, mpts);
2313
2314 mptcp_subflow_remref(mpts); /* ours */
2315
2316 VERIFY(mpts->mpts_socket->so_usecount != 0);
2317 mpts->mpts_socket->so_usecount--;
2318 }
2319
2320 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
2321 }
2322
2323 /*
2324 * Subflow socket input.
2325 */
2326 static void
2327 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2328 {
2329 struct socket *mp_so = mptetoso(mpte);
2330 struct mbuf *m = NULL;
2331 struct socket *so;
2332 int error, wakeup = 0;
2333
2334 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2335 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2336
2337 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2338 struct mptsub *, mpts);
2339
2340 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
2341 goto out;
2342
2343 so = mpts->mpts_socket;
2344
2345 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2346 if (error != 0 && error != EWOULDBLOCK) {
2347 mptcplog((LOG_ERR, "%s: cid %d error %d\n",
2348 __func__, mpts->mpts_connid, error),
2349 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
2350 if (error == ENODATA) {
2351 /*
2352 * Don't swallow ENODATA: surfacing it helps us
2353 * discover nasty middleboxes.
2354 */
2355 mp_so->so_error = ENODATA;
2356
2357 wakeup = 1;
2358 goto out;
2359 }
2360 } else if (error == 0) {
2361 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
2362 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2363 }
2364
2365 /* In fallback, accept data only on the active subflow; drop it on the others */
2366 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2367 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2368 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2369 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2370 m_freem(m);
2371 goto out;
2372 }
2373
2374 if (m != NULL) {
2375 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2376 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
2377
2378 mpte->mpte_used_cell = 1;
2379 } else {
2380 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2381
2382 mpte->mpte_used_wifi = 1;
2383 }
2384
2385 mptcp_input(mpte, m);
2386 }
2387
2388 /* notify protocol that we drained all the data */
2389 if (error == 0 && m != NULL &&
2390 (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
2391 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
2392
2393 out:
2394 if (wakeup)
2395 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2396
2397 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2398 }
2399
2400 /*
2401 * Subflow socket write upcall.
2402 *
2403 * Called when the associated subflow socket posted a write event.
2404 */
2405 static void
2406 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2407 {
2408 #pragma unused(so, waitf)
2409 struct mptsub *mpts = arg;
2410 struct mptses *mpte = mpts->mpts_mpte;
2411
2412 VERIFY(mpte != NULL);
2413
2414 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2415 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
2416 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2417 return;
2418 }
2419
2420 mptcp_output(mpte);
2421 }
2422
2423 static boolean_t
2424 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2425 {
2426 struct mbuf *so_m = so->so_snd.sb_mb;
2427 uint64_t dsn = m->m_pkthdr.mp_dsn;
2428
2429 while (so_m) {
2430 VERIFY(so_m->m_flags & M_PKTHDR);
2431 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2432
2433 /* Part of the segment is covered, don't reinject here */
2434 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2435 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn)
2436 return TRUE;
2437
2438 so_m = so_m->m_next;
2439 }
2440
2441 return FALSE;
2442 }
2443
2444 /*
2445 * Subflow socket output.
2446 *
2447 * Called for sending data from MPTCP to the underlying subflow socket.
2448 */
2449 int
2450 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2451 {
2452 struct mptcb *mp_tp = mpte->mpte_mptcb;
2453 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2454 struct socket *mp_so, *so;
2455 struct tcpcb *tp;
2456 uint64_t mpt_dsn = 0, off = 0;
2457 int sb_cc = 0, error = 0, wakeup = 0;
2458 uint32_t dss_csum;
2459 uint16_t tot_sent = 0;
2460 boolean_t reinjected = FALSE;
2461
2462 mpte_lock_assert_held(mpte);
2463
2464 mp_so = mptetoso(mpte);
2465 so = mpts->mpts_socket;
2466 tp = sototcpcb(so);
2467
2468 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2469 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
2470
2471 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2472 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2473 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2474 (mpts->mpts_flags & MPTSF_TFO_REQD));
2475 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
2476
2477 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2478 __func__, mpts->mpts_flags, mpte->mpte_flags,
2479 mptcp_subflow_cwnd_space(so)),
2480 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2481 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2482 struct mptsub *, mpts);
2483
2484 /* The REMOVE_ADDR option is not sent reliably, as per the I-D */
2485 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
2486 tp->t_rem_aid = mpte->mpte_lost_aid;
2487 tp->t_mpflags |= TMPF_SND_REM_ADDR;
2488 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2489 }
2490
2491 /*
2492 * The mbuf chains containing the metadata (and pointing to the
2493 * user data sitting in the MPTCP output queue) are then sent
2494 * down to the subflow socket.
2495 *
2496 * Some notes on data sequencing:
2497 *
2498 * a. Each mbuf must be a M_PKTHDR.
2499 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2500 * in the mbuf pkthdr structure.
2501 * c. Each mbuf containing the MPTCP metadata must have its
2502 * pkt_flags marked with the PKTF_MPTCP flag.
2503 */
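/*
 * Illustrative sketch (hypothetical values): a 3000-byte mapping
 * starting at DSN 1000 with subflow-relative sequence number 1 is
 * stamped onto every mbuf of the chain as mp_dsn = 1000,
 * mp_rseq = 1, mp_rlen = 3000, with PKTF_MPTCP set in pkt_flags.
 */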
2504
2505 if (mpte->mpte_reinjectq)
2506 sb_mb = mpte->mpte_reinjectq;
2507 else
2508 sb_mb = mp_so->so_snd.sb_mb;
2509
2510 if (sb_mb == NULL) {
2511 mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
2512 __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
2513 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
2514 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2515
2516 /* Fix it to prevent looping */
2517 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
2518 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2519 goto out;
2520 }
2521
2522 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
2523
2524 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
2525 !(so->so_state & SS_ISCONNECTED) &&
2526 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2527 tp->t_mpflags |= TMPF_TFO_REQUEST;
2528 goto zero_len_write;
2529 }
2530
2531 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2532
2533 /* First, drop acknowledged data */
2534 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2535 mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
2536 "dsn %u suna %u reinject? %u\n",
2537 __func__, (uint32_t)mpt_dsn,
2538 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
2539 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2540 if (mpte->mpte_reinjectq) {
2541 mptcp_clean_reinjectq(mpte);
2542 } else {
2543 uint64_t len = 0;
2544 len = mp_tp->mpt_snduna - mpt_dsn;
2545 sbdrop(&mp_so->so_snd, (int)len);
2546 wakeup = 1;
2547 }
2548 }
2549
2550 /* Check again because of above sbdrop */
2551 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
2552 mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
2553 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2554 goto out;
2555 }
2556
2557 /*
2558 * In degraded mode, we don't receive data acks, so force-free
2559 * mbufs below snd_una
2560 */
2561 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2562 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
2563 mp_so->so_snd.sb_mb) {
2564 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
2565 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2566 uint64_t len = 0;
2567 len = mp_tp->mpt_snduna - mpt_dsn;
2568 sbdrop(&mp_so->so_snd, (int)len);
2569 wakeup = 1;
2570
2571 mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
2572 __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
2573 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2574 }
2575 }
2576
2577 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2578 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
2579 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
2580 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
2581 }
2582
2583 /*
2584 * Adjust the top level notion of next byte used for retransmissions
2585 * and sending FINs.
2586 */
2587 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
2588 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2589
2590 /* Now determine the offset from which to start transmitting data */
2591 if (mpte->mpte_reinjectq)
2592 sb_mb = mpte->mpte_reinjectq;
2593 else
2594 dont_reinject:
2595 sb_mb = mp_so->so_snd.sb_mb;
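/*
 * Note that the dont_reinject label deliberately sits on the else
 * branch: the mptcp_search_seq_in_sub() handling below uses it to
 * fall back from the reinject queue to the regular send buffer.
 */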
2596 if (sb_mb == NULL) {
2597 mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
2598 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2599 goto out;
2600 }
2601
2602 if (sb_mb == mpte->mpte_reinjectq) {
2603 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2604 off = 0;
2605
2606 if (mptcp_search_seq_in_sub(sb_mb, so)) {
2607 if (mptcp_can_send_more(mp_tp, TRUE)) {
2608 goto dont_reinject;
2609 }
2610
2611 error = ECANCELED;
2612 goto out;
2613 }
2614
2615 reinjected = TRUE;
2616 } else if (flags & MPTCP_SUBOUT_PROBING) {
2617 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2618 off = 0;
2619 } else {
2620 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
2621
2622 /*
2623 * With TFO, there might be no data at all, but we still need to
2624 * go through this code-path here.
2625 */
2626 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
2627 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
2628 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
2629 sb_cc -= off;
2630 } else {
2631 mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
2632 __func__, (uint32_t)mp_tp->mpt_sndnxt,
2633 (uint32_t)mp_tp->mpt_sndmax),
2634 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2635
2636 goto out;
2637 }
2638 }
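/*
 * Worked example (hypothetical values): with snduna = 1000,
 * sndnxt = 1400 and 1000 bytes sitting in the send buffer, the
 * offset is 400 and at most 600 bytes remain eligible for this
 * transmission round (before the cwnd clamp below).
 */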
2639
2640 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
2641 if (sb_cc <= 0) {
2642 mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
2643 __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
2644 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
2645 mptcp_subflow_cwnd_space(so)),
2646 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2647 }
2648
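/*
 * The data-level length field of a DSS mapping is only 16 bits
 * wide, so a single mapping can cover at most 64KB.
 */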
2649 sb_cc = min(sb_cc, UINT16_MAX);
2650
2651 /*
2652 * Create a DSN mapping for the data we are about to send. It all
2653 * has the same mapping.
2654 */
2655 if (reinjected)
2656 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2657 else
2658 mpt_dsn = mp_tp->mpt_snduna + off;
2659
2660 mpt_mbuf = sb_mb;
2661 while (mpt_mbuf && reinjected == FALSE &&
2662 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
2663 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
2664 off -= mpt_mbuf->m_pkthdr.mp_rlen;
2665 mpt_mbuf = mpt_mbuf->m_next;
2666 }
2667 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
2668 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
2669 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
2670 mpts->mpts_probecnt),
2671 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2672
2673 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
2674
2675 head = tail = NULL;
2676
2677 while (tot_sent < sb_cc) {
2678 ssize_t mlen;
2679
2680 mlen = mpt_mbuf->m_len;
2681 mlen -= off;
2682 mlen = min(mlen, sb_cc - tot_sent);
2683
2684 if (mlen < 0) {
2685 mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
2686 __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
2687 (uint32_t)off, sb_cc, tot_sent),
2688 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2689 goto out;
2690 }
2691
2692 if (mlen == 0)
2693 goto next;
2694
2695 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
2696 M_COPYM_MUST_COPY_HDR);
2697 if (m == NULL) {
2698 mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
2699 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2700 error = ENOBUFS;
2701 break;
2702 }
2703
2704 /* Stamp the DSN mapping onto the copy (m_copym copied the pkthdr) */
2705 VERIFY(m->m_flags & M_PKTHDR);
2706 VERIFY(m->m_next == NULL);
2707
2708 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2709 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
2710 m->m_pkthdr.mp_dsn = mpt_dsn;
2711 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
2712 m->m_pkthdr.len = mlen;
2713
2714 if (head == NULL) {
2715 head = tail = m;
2716 } else {
2717 tail->m_next = m;
2718 tail = m;
2719 }
2720
2721 tot_sent += mlen;
2722 off = 0;
2723 next:
2724 mpt_mbuf = mpt_mbuf->m_next;
2725 }
2726
2727 if (reinjected) {
2728 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
2729 struct mbuf *n = sb_mb;
2730
2731 while (n) {
2732 n->m_pkthdr.mp_dsn += sb_cc;
2733 n->m_pkthdr.mp_rlen -= sb_cc;
2734 n = n->m_next;
2735 }
2736 m_adj(sb_mb, sb_cc);
2737 } else {
2738 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
2739 m_freem(sb_mb);
2740 }
2741 }
2742
2743 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
2744 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
2745 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2746
2747 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
2748 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
2749 tot_sent);
2750 }
2751
2752 /* Now, let's update rel-seq and the data-level length */
2753 mpts->mpts_rel_seq += tot_sent;
2754 m = head;
2755 while (m) {
2756 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
2757 m->m_pkthdr.mp_csum = dss_csum;
2758 m->m_pkthdr.mp_rlen = tot_sent;
2759 m = m->m_next;
2760 }
2761
2762 if (head != NULL) {
2763 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
2764 (tp->t_tfo_stats == 0))
2765 tp->t_mpflags |= TMPF_TFO_REQUEST;
2766
2767 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2768
2769 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
2770 struct sockbuf *, &so->so_rcv,
2771 struct sockbuf *, &so->so_snd,
2772 struct mptses *, mpte, struct mptsub *, mpts,
2773 size_t, tot_sent);
2774 }
2775
2776 done_sending:
2777 if (error == 0 ||
2778 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
2779 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
2780
2781 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2782 tcpstat.tcps_mp_num_probes++;
2783 if ((uint32_t)tot_sent < mpts->mpts_maxseg)
2784 mpts->mpts_probecnt += 1;
2785 else
2786 mpts->mpts_probecnt +=
2787 tot_sent/mpts->mpts_maxseg;
2788 }
2789
2790 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
2791 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
2792 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2793 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2794 mp_tp->mpt_sndnxt = new_sndnxt;
2795 }
2796
2797 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
2798
2799 /* Must be here as mptcp_can_send_more() checks for this */
2800 soclearfastopen(mp_so);
2801
2802 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2803 (mpts->mpts_probesoon != 0))
2804 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
2805 __func__, mpts->mpts_connid,
2806 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
2807 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
2808 (tcp_now - mpts->mpts_probesoon)),
2809 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2810
2811 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2812 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
2813
2814 mpte->mpte_used_cell = 1;
2815 } else {
2816 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2817
2818 mpte->mpte_used_wifi = 1;
2819 }
2820
2821 /*
2822 * Don't propagate EWOULDBLOCK - it's already taken care of
2823 * in mptcp_usr_send for TFO.
2824 */
2825 error = 0;
2826 } else {
2827 mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
2828 __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
2829 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2830 }
2831 out:
2832
2833 if (wakeup)
2834 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2835
2836 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
2837 return (error);
2838
2839 zero_len_write:
2840 /* Call pru_send directly, as there is no mbuf at the subflow level */
2841 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
2842 NULL, current_proc());
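/*
 * The NULL-mbuf pru_send above still drives the subflow through
 * tcp_output(), so the connection can make progress (sending the
 * SYN with the TFO request) even though no data-level mapping
 * exists yet.
 */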
2843
2844 goto done_sending;
2845 }
2846
2847 static void
2848 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
2849 {
2850 struct mbuf *n, *prev = NULL;
2851
2852 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
2853 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2854 m->m_pkthdr.mp_rseq),
2855 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2856
2857 n = mpte->mpte_reinjectq;
2858
2859 /* First, look for an mbuf n whose data-sequence-number is greater
2860 * than or equal to m's sequence number.
2861 */
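/*
 * Example (hypothetical DSNs): with queued segments at DSN 100 and
 * DSN 300, inserting m with DSN 200 stops the walk at n = 300 and
 * links m in between, keeping the queue sorted by DSN.
 */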
2862 while (n) {
2863 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
2864 break;
2865
2866 prev = n;
2867
2868 n = n->m_nextpkt;
2869 }
2870
2871 if (n) {
2872 /* m is already fully covered by the next mbuf in the queue */
2873 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
2874 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
2875 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
2876 __func__, n->m_pkthdr.mp_rlen),
2877 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2878 goto dont_queue;
2879 }
2880
2881 /* m is covering the next mbuf entirely, thus we remove this guy */
2882 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
2883 struct mbuf *tmp = n->m_nextpkt;
2884
2885 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
2886 __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2887 n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
2888 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2889
2890 m->m_nextpkt = NULL;
2891 if (prev == NULL)
2892 mpte->mpte_reinjectq = tmp;
2893 else
2894 prev->m_nextpkt = tmp;
2895
2896 m_freem(n);
2897 n = tmp;
2898 }
2899
2900 }
2901
2902 if (prev) {
2903 /* m is already fully covered by the previous mbuf in the queue */
2904 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
2905 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
2906 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
2907 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2908 goto dont_queue;
2909 }
2910 }
2911
2912 if (prev == NULL)
2913 mpte->mpte_reinjectq = m;
2914 else
2915 prev->m_nextpkt = m;
2916
2917 m->m_nextpkt = n;
2918
2919 return;
2920
2921 dont_queue:
2922 m_freem(m);
2923 return;
2924 }
2925
2926 static struct mbuf *
2927 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
2928 {
2929 struct socket *mp_so = mptetoso(mpte);
2930 struct mbuf *m;
2931
2932 m = mp_so->so_snd.sb_mb;
2933
2934 while (m) {
2935 /* If this segment covers what we are looking for, return it. */
2936 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
2937 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
2938 break;
2939
2940
2941 /* Segment is no longer in the queue */
2942 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
2943 return NULL;
2944
2945 m = m->m_next;
2946 }
2947
2948 return m;
2949 }
2950
2951 static struct mbuf *
2952 mptcp_copy_mbuf_list(struct mbuf *m, int len)
2953 {
2954 struct mbuf *top = NULL, *tail = NULL;
2955 uint64_t dsn;
2956 uint32_t dlen, rseq;
2957
2958 dsn = m->m_pkthdr.mp_dsn;
2959 dlen = m->m_pkthdr.mp_rlen;
2960 rseq = m->m_pkthdr.mp_rseq;
2961
2962 while (len > 0) {
2963 struct mbuf *n;
2964
2965 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
2966
2967 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
2968 if (n == NULL) {
2969 mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
2970 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2971 goto err;
2972 }
2973
2974 VERIFY(n->m_flags & M_PKTHDR);
2975 VERIFY(n->m_next == NULL);
2976 VERIFY(n->m_pkthdr.mp_dsn == dsn);
2977 VERIFY(n->m_pkthdr.mp_rlen == dlen);
2978 VERIFY(n->m_pkthdr.mp_rseq == rseq);
2979 VERIFY(n->m_len == m->m_len);
2980
2981 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
2982
2983 if (top == NULL)
2984 top = n;
2985
2986 if (tail != NULL)
2987 tail->m_next = n;
2988
2989 tail = n;
2990
2991 len -= m->m_len;
2992 m = m->m_next;
2993 }
2994
2995 return top;
2996
2997 err:
2998 if (top)
2999 m_freem(top);
3000
3001 return NULL;
3002 }
3003
3004 static void
3005 mptcp_reinject_mbufs(struct socket *so)
3006 {
3007 struct tcpcb *tp = sototcpcb(so);
3008 struct mptsub *mpts = tp->t_mpsub;
3009 struct mptcb *mp_tp = tptomptp(tp);
3010 struct mptses *mpte = mp_tp->mpt_mpte;
3011 struct sockbuf *sb = &so->so_snd;
3012 struct mbuf *m;
3013
3014 m = sb->sb_mb;
3015 while (m) {
3016 struct mbuf *n = m->m_next, *orig = m;
3017
3018 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3019 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3020 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3021 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3022
3023 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3024
3025 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
3026 goto next;
3027
3028 /* Has it all already been acknowledged at the data-level? */
3029 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
3030 goto next;
3031
3032 /* Part of this has already been acknowledged - look up the
3033 * corresponding segment in the MPTCP socket.
3034 */
3035 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3036 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3037 if (m == NULL)
3038 goto next;
3039 }
3040
3041 /* Copy the mbuf with headers (aka, DSN-numbers) */
3042 m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
3043 if (m == NULL)
3044 break;
3045
3046 VERIFY(m->m_nextpkt == NULL);
3047
3048 /* Now, add to the reinject-queue, eliminating overlapping
3049 * segments
3050 */
3051 mptcp_add_reinjectq(mpte, m);
3052
3053 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3054
3055 next:
3056 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3057 while (n) {
3058 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3059
3060 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
3061 break;
3062
3063 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3064 n = n->m_next;
3065 }
3066
3067 m = n;
3068 }
3069 }
3070
3071 void
3072 mptcp_clean_reinjectq(struct mptses *mpte)
3073 {
3074 struct mptcb *mp_tp = mpte->mpte_mptcb;
3075
3076 mpte_lock_assert_held(mpte);
3077
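/*
 * Free segments from the head of the reinject queue as long as they
 * lie entirely below snduna; stop at the first segment that still
 * contains unacknowledged data.
 */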
3078 while (mpte->mpte_reinjectq) {
3079 struct mbuf *m = mpte->mpte_reinjectq;
3080
3081 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3082 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
3083 break;
3084
3085 mpte->mpte_reinjectq = m->m_nextpkt;
3086 m->m_nextpkt = NULL;
3087 m_freem(m);
3088 }
3089 }
3090
3091 /*
3092 * Subflow socket control event upcall.
3093 */
3094 static void
3095 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
3096 {
3097 #pragma unused(so)
3098 struct mptsub *mpts = arg;
3099 struct mptses *mpte = mpts->mpts_mpte;
3100
3101 VERIFY(mpte != NULL);
3102 mpte_lock_assert_held(mpte);
3103
3104 if ((mpts->mpts_evctl & events) == events)
3105 return;
3106
3107 mpts->mpts_evctl |= events;
3108
3109 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3110 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3111 return;
3112 }
3113
3114 mptcp_subflow_workloop(mpte);
3115 }
3116
3117 /*
3118 * Subflow socket control events.
3119 *
3120 * Called for handling events related to the underlying subflow socket.
3121 */
3122 static ev_ret_t
3123 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3124 uint64_t *p_mpsofilt_hint)
3125 {
3126 ev_ret_t ret = MPTS_EVRET_OK;
3127 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3128 sizeof(mpsub_ev_entry_tbl[0]);
3129
3130 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3131
3132 /* bail if there's nothing to process */
3133 if (!mpts->mpts_evctl)
3134 return (ret);
3135
3136 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
3137 SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
3138 SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
3139 SO_FILT_HINT_DISCONNECTED)) {
3140 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3141 }
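/*
 * Any of the fatal hints above means this subflow may no longer be
 * able to carry traffic, so also attempt a failover to an
 * alternate subflow.
 */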
3142
3143 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3144 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3145
3146 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3147 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3148 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3149
3150 /*
3151 * Process all the socket filter hints and reset the hint
3152 * once it is handled
3153 */
3154 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3155 /*
3156 * Always execute the DISCONNECTED event, because it will wakeup
3157 * the app.
3158 */
3159 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3160 (ret >= MPTS_EVRET_OK ||
3161 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3162 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3163 ev_ret_t error =
3164 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3165 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3166 }
3167 }
3168
3169 /*
3170 * We should be getting only events specified via sock_catchevents(),
3171 * so loudly complain if we have any unprocessed one(s).
3172 */
3173 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
3174 mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
3175 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3176 mpts->mpts_connid,
3177 mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3178 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3179 else
3180 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3181 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3182 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3183
3184 return (ret);
3185 }
3186
3187 static ev_ret_t
3188 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3189 uint64_t *p_mpsofilt_hint, uint64_t event)
3190 {
3191 struct socket *mp_so, *so;
3192 struct mptcb *mp_tp;
3193
3194 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3195 VERIFY(mpte->mpte_mppcb != NULL);
3196 mp_so = mptetoso(mpte);
3197 mp_tp = mpte->mpte_mptcb;
3198 so = mpts->mpts_socket;
3199
3200 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3201 mpts->mpts_connid, event),
3202 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3203
3204 /*
3205 * We got an event for this subflow that might need to be propagated,
3206 * based on the state of the MPTCP connection.
3207 */
3208 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3209 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3210 mp_so->so_error = so->so_error;
3211 *p_mpsofilt_hint |= event;
3212 }
3213
3214 return (MPTS_EVRET_OK);
3215 }
3216
3217 /*
3218 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3219 */
3220 static ev_ret_t
3221 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3222 uint64_t *p_mpsofilt_hint, uint64_t event)
3223 {
3224 #pragma unused(p_mpsofilt_hint, event)
3225 struct socket *mp_so;
3226 struct tcpcb *tp;
3227
3228 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3229
3230 VERIFY(mpte->mpte_mppcb != NULL);
3231 mp_so = mptetoso(mpte);
3232 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3233
3234 /*
3235 * This overwrites any previous mpte_lost_aid to avoid storing
3236 * too much state when the typical case has only two subflows.
3237 */
3238 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3239 mpte->mpte_lost_aid = tp->t_local_aid;
3240
3241 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3242 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3243
3244 /*
3245 * The subflow connection has lost its source address.
3246 */
3247 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3248
3249 if (mp_so->so_flags & SOF_NOADDRAVAIL)
3250 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3251
3252 return (MPTS_EVRET_DELETE);
3253 }
3254
3255 /*
3256 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3257 * indicates that the remote side sent a Data FIN
3258 */
3259 static ev_ret_t
3260 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3261 uint64_t *p_mpsofilt_hint, uint64_t event)
3262 {
3263 #pragma unused(event)
3264 struct mptcb *mp_tp;
3265
3266 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3267 mp_tp = mpte->mpte_mptcb;
3268
3269 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3270 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3271
3272 /*
3273 * We got a Data FIN for the MPTCP connection.
3274 * The FIN may arrive with data. The data is handed up to the
3275 * mptcp socket and the user is notified so that it may close
3276 * the socket if needed.
3277 */
3278 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
3279 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3280
3281 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3282 }
3283
3284 /*
3285 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3286 */
3287 static ev_ret_t
3288 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3289 uint64_t *p_mpsofilt_hint, uint64_t event)
3290 {
3291 #pragma unused(event, p_mpsofilt_hint)
3292 struct mptsub *mpts_alt = NULL;
3293 struct socket *alt_so = NULL;
3294 struct socket *mp_so;
3295 int altpath_exists = 0;
3296
3297 mpte_lock_assert_held(mpte);
3298 mp_so = mptetoso(mpte);
3299 mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
3300 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
3301 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3302
3303 mptcp_reinject_mbufs(mpts->mpts_socket);
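/*
 * Re-inject the subflow's un-acked data at the MPTCP level, so an
 * alternate subflow can retransmit it.
 */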
3304
3305 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
3306 /*
3307 * If there is no alternate eligible subflow, ignore the
3308 * failover hint.
3309 */
3310 if (mpts_alt == NULL) {
3311 mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
3312 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3313
3314 goto done;
3315 }
3316
3317 altpath_exists = 1;
3318 alt_so = mpts_alt->mpts_socket;
3319 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3320 /* All data acknowledged and no RTT spike */
3321 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3322 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3323 } else {
3324 /* no alternate path available */
3325 altpath_exists = 0;
3326 }
3327 }
3328
3329 if (altpath_exists) {
3330 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3331
3332 mpte->mpte_active_sub = mpts_alt;
3333 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3334 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3335
3336 mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
3337 __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
3338 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3339
3340 mptcpstats_inc_switch(mpte, mpts);
3341
3342 sowwakeup(alt_so);
3343 } else {
3344 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3345 mpts->mpts_connid),
3346 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3347 done:
3348 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3349 }
3350
3351 return (MPTS_EVRET_OK);
3352 }
3353
3354 /*
3355 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3356 */
3357 static ev_ret_t
3358 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3359 uint64_t *p_mpsofilt_hint, uint64_t event)
3360 {
3361 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3362 VERIFY(mpte->mpte_mppcb != NULL);
3363
3364 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3365 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3366
3367 /*
3368 * The subflow connection cannot use the outgoing interface; let's
3369 * close this subflow.
3370 */
3371 mptcp_subflow_abort(mpts, EPERM);
3372
3373 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3374
3375 return (MPTS_EVRET_DELETE);
3376 }
3377
3378 /*
3379 * https://tools.ietf.org/html/rfc6052#section-2
3380 * https://tools.ietf.org/html/rfc6147#section-5.2
3381 */
3382 static boolean_t
3383 mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3384 const struct ipv6_prefix *prefix,
3385 struct in_addr *addrv4)
3386 {
3387 char buf[MAX_IPv4_STR_LEN];
3388 char *ptrv4 = (char *)addrv4;
3389 const char *ptr = (const char *)addr;
3390
3391 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0)
3392 return false;
3393
3394 switch (prefix->prefix_len) {
3395 case NAT64_PREFIX_LEN_96:
3396 memcpy(ptrv4, ptr + 12, 4);
3397 break;
3398 case NAT64_PREFIX_LEN_64:
3399 memcpy(ptrv4, ptr + 9, 4);
3400 break;
3401 case NAT64_PREFIX_LEN_56:
3402 memcpy(ptrv4, ptr + 7, 1);
3403 memcpy(ptrv4 + 1, ptr + 9, 3);
3404 break;
3405 case NAT64_PREFIX_LEN_48:
3406 memcpy(ptrv4, ptr + 6, 2);
3407 memcpy(ptrv4 + 2, ptr + 9, 2);
3408 break;
3409 case NAT64_PREFIX_LEN_40:
3410 memcpy(ptrv4, ptr + 5, 3);
3411 memcpy(ptrv4 + 3, ptr + 9, 1);
3412 break;
3413 case NAT64_PREFIX_LEN_32:
3414 memcpy(ptrv4, ptr + 4, 4);
3415 break;
3416 default:
3417 panic("NAT64-prefix len is wrong: %u\n",
3418 prefix->prefix_len);
3419 }
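/*
 * Example per RFC 6052 (documentation addresses): with the /96
 * well-known prefix 64:ff9b::/96, the synthesized address
 * 64:ff9b::198.51.100.1 desynthesizes to 198.51.100.1, taken from
 * bytes 12-15 of the IPv6 address.
 */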
3420
3421 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3422 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3423
3424 return true;
3425 }
3426
3427 static void
3428 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3429 {
3430 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3431 struct socket *so = mpts->mpts_socket;
3432 struct ifnet *ifp;
3433 int j;
3434
3435 ifp = sotoinpcb(so)->inp_last_outifp;
3436
3437 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3438 mptcp_ask_for_nat64(ifp);
3439 return;
3440 }
3441
3442
3443 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3444 int success;
3445
3446 if (nat64prefixes[j].prefix_len == 0)
3447 continue;
3448
3449 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3450 &nat64prefixes[j],
3451 &mpte->mpte_dst_v4_nat64.sin_addr);
3452 if (success) {
3453 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3454 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3455 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3456 break;
3457 }
3458 }
3459 }
3460
3461 /*
3462 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3463 */
3464 static ev_ret_t
3465 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
3466 uint64_t *p_mpsofilt_hint, uint64_t event)
3467 {
3468 #pragma unused(event, p_mpsofilt_hint)
3469 struct socket *mp_so, *so;
3470 struct inpcb *inp;
3471 struct tcpcb *tp;
3472 struct mptcb *mp_tp;
3473 int af;
3474 boolean_t mpok = FALSE;
3475
3476 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3477 VERIFY(mpte->mpte_mppcb != NULL);
3478
3479 mp_so = mptetoso(mpte);
3480 mp_tp = mpte->mpte_mptcb;
3481 so = mpts->mpts_socket;
3482 tp = sototcpcb(so);
3483 af = mpts->mpts_dst.sa_family;
3484
3485 if (mpts->mpts_flags & MPTSF_CONNECTED)
3486 return (MPTS_EVRET_OK);
3487
3488 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
3489 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
3490 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
3491 (so->so_state & SS_ISCONNECTED)) {
3492 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
3493 __func__, mpts->mpts_connid),
3494 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3495 (void) soshutdownlock(so, SHUT_RD);
3496 (void) soshutdownlock(so, SHUT_WR);
3497 (void) sodisconnectlocked(so);
3498 }
3499 return (MPTS_EVRET_OK);
3500 }
3501
3502 /*
3503 * The subflow connection has been connected. Find out whether it
3504 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
3505 *
3506 * a. If MPTCP connection is not yet established, then this must be
3507 * the first subflow connection. If MPTCP failed to negotiate,
3508 * fallback to regular TCP by degrading this subflow.
3509 *
3510 * b. If MPTCP connection has been established, then this must be
3511 * one of the subsequent subflow connections. If MPTCP failed
3512 * to negotiate, disconnect the connection.
3513 *
3514 * Right now, we simply unblock any waiters at the MPTCP socket layer
3515 * if the MPTCP connection has not been established.
3516 */
3517
3518 if (so->so_state & SS_ISDISCONNECTED) {
3519 /*
3520 * With MPTCP joins, a connection is connected at the subflow
3521 * level, but the 4th ACK from the server elevates the MPTCP
3522 * subflow to connected state. So there is a small window
3523 * where the subflow could get disconnected before the
3524 * connected event is processed.
3525 */
3526 return (MPTS_EVRET_OK);
3527 }
3528
3529 if (mpts->mpts_flags & MPTSF_TFO_REQD)
3530 mptcp_drop_tfo_data(mpte, mpts);
3531
3532 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
3533 mpts->mpts_flags |= MPTSF_CONNECTED;
3534
3535 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
3536 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3537
3538 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
3539
3540 /* get/verify the outbound interface */
3541 inp = sotoinpcb(so);
3542
3543 mpts->mpts_maxseg = tp->t_maxseg;
3544
3545 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
3546 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
3547 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
3548 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
3549
3550 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
3551
3552 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
3553 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
3554 mpte->mpte_associd = mpts->mpts_connid;
3555 DTRACE_MPTCP2(state__change,
3556 struct mptcb *, mp_tp,
3557 uint32_t, 0 /* event */);
3558
3559 if (SOCK_DOM(so) == AF_INET) {
3560 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
3561 } else {
3562 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
3563 }
3564
3565 mpts->mpts_flags |= MPTSF_ACTIVE;
3566
3567 /* case (a) above */
3568 if (!mpok) {
3569 tcpstat.tcps_mpcap_fallback++;
3570
3571 tp->t_mpflags |= TMPF_INFIN_SENT;
3572 mptcp_notify_mpfail(so);
3573 } else {
3574 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3575 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3576 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3577 } else {
3578 mpts->mpts_flags |= MPTSF_PREFERRED;
3579 }
3580 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3581 mpte->mpte_nummpcapflows++;
3582
3583 if (SOCK_DOM(so) == AF_INET6)
3584 mptcp_handle_ipv6_connection(mpte, mpts);
3585
3586 mptcp_check_subflows_and_add(mpte);
3587
3588 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3589 mpte->mpte_initial_cell = 1;
3590
3591 mpte->mpte_handshake_success = 1;
3592 }
3593
3594 mp_tp->mpt_sndwnd = tp->snd_wnd;
3595 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
3596 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
3597 soisconnected(mp_so);
3598
3599 mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
3600 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
3601 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
3602 } else if (mpok) {
3603 /*
3604 * case (b) above
3605 * In case of additional flows, the MPTCP socket is not
3606 * MPTSF_MP_CAPABLE until an ACK is received from server
3607 * for 3-way handshake. TCP would have guaranteed that this
3608 * is an MPTCP subflow.
3609 */
3610 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3611 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
3612 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3613 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3614 mpts->mpts_flags &= ~MPTSF_PREFERRED;
3615 } else {
3616 mpts->mpts_flags |= MPTSF_PREFERRED;
3617 }
3618
3619 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3620 mpte->mpte_nummpcapflows++;
3621
3622 mpts->mpts_rel_seq = 1;
3623
3624 mptcp_check_subflows_and_remove(mpte);
3625 } else {
3626 unsigned int i;
3627
3628 /* Should we try the alternate port? */
3629 if (mpte->mpte_alternate_port &&
3630 inp->inp_fport != mpte->mpte_alternate_port) {
3631 union sockaddr_in_4_6 dst;
3632 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3633
3634 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3635
3636 dst_in->sin_port = mpte->mpte_alternate_port;
3637
3638 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
3639 mpts->mpts_ifscope, NULL);
3640 } else { /* We tried all we could; mark this interface as non-MPTCP */
3641 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3642 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3643
3644 if (inp->inp_last_outifp->if_index == info->ifindex) {
3645 info->no_mptcp_support = 1;
3646 break;
3647 }
3648 }
3649 }
3650
3651 tcpstat.tcps_join_fallback++;
3652 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3653 tcpstat.tcps_mptcp_cell_proxy++;
3654 else
3655 tcpstat.tcps_mptcp_wifi_proxy++;
3656
3657 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3658
3659 return (MPTS_EVRET_OK);
3660 }
3661
3662 /* This call, just to "book" an entry in the stats-table for this ifindex */
3663 mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
3664
3665 mptcp_output(mpte);
3666
3667 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3668 }
3669
3670 /*
3671 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
3672 */
3673 static ev_ret_t
3674 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
3675 uint64_t *p_mpsofilt_hint, uint64_t event)
3676 {
3677 #pragma unused(event, p_mpsofilt_hint)
3678 struct socket *mp_so, *so;
3679 struct mptcb *mp_tp;
3680
3681 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3682 VERIFY(mpte->mpte_mppcb != NULL);
3683 mp_so = mptetoso(mpte);
3684 mp_tp = mpte->mpte_mptcb;
3685 so = mpts->mpts_socket;
3686
3687 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
3688 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
3689 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
3690 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
3691 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3692
3693 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3694 return (MPTS_EVRET_DELETE);
3695
3696 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3697
3698 /* The subflow connection has been disconnected. */
3699
3700 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3701 mpte->mpte_nummpcapflows--;
3702 if (mpte->mpte_active_sub == mpts) {
3703 mpte->mpte_active_sub = NULL;
3704 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
3705 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3706 }
3707 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3708 }
3709
3710 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3711 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
3712 (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
3713 mptcp_drop(mpte, mp_tp, so->so_error);
3714 }
3715
3716 /*
3717 * Clear flags that are used by getconninfo to return state.
3718 * Retain flags like MPTSF_DELETEOK for internal purposes.
3719 */
3720 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3721 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3722 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);
3723
3724 return (MPTS_EVRET_DELETE);
3725 }
3726
3727 /*
3728 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3729 */
3730 static ev_ret_t
3731 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3732 uint64_t *p_mpsofilt_hint, uint64_t event)
3733 {
3734 #pragma unused(event, p_mpsofilt_hint)
3735 struct socket *mp_so, *so;
3736 struct mptcb *mp_tp;
3737 ev_ret_t ret = MPTS_EVRET_OK;
3738
3739 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3740 VERIFY(mpte->mpte_mppcb != NULL);
3741 mp_so = mptetoso(mpte);
3742 mp_tp = mpte->mpte_mptcb;
3743 so = mpts->mpts_socket;
3744
3745 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3746 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3747 else
3748 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3749
3750 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3751 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3752 goto done;
3753 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3754 } else {
3755 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3756 }
3757
3758 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3759 mpts->mpts_flags |= MPTSF_MP_READY;
3760 else
3761 mpts->mpts_flags &= ~MPTSF_MP_READY;
3762
3763 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3764 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3765 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3766 }
3767
3768 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3769 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3770 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3771 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3772 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3773 ret = MPTS_EVRET_CONNECT_PENDING;
3774 }
3775
3776 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
3777 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3778 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3779 mpts->mpts_flags, MPTSF_BITS),
3780 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3781
3782 done:
3783 return (ret);
3784 }
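
/*
 * To summarize the mapping above: the subflow's TMPF_* state is
 * mirrored into the MPTSF_* flags, and the return value drives the
 * workloop. A fallback to TCP yields MPTS_EVRET_DISCONNECT_FALLBACK
 * (the workloop then resets every subflow except the active one),
 * while a join-ready subflow yields MPTS_EVRET_CONNECT_PENDING (the
 * workloop then connects subflows still in MPTSF_CONNECT_PENDING).
 */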
3785
3786 /*
3787 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3788 */
3789 static ev_ret_t
3790 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3791 uint64_t *p_mpsofilt_hint, uint64_t event)
3792 {
3793 #pragma unused(event)
3794 struct socket *mp_so, *so;
3795 struct mptcb *mp_tp;
3796 boolean_t is_fastclose;
3797
3798 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3799 VERIFY(mpte->mpte_mppcb != NULL);
3800 mp_so = mptetoso(mpte);
3801 mp_tp = mpte->mpte_mptcb;
3802 so = mpts->mpts_socket;
3803
3804 /* We got an invalid option or a fast close */
3805 struct tcptemp *t_template;
3806 struct inpcb *inp = sotoinpcb(so);
3807 struct tcpcb *tp = NULL;
3808
3809 tp = intotcpcb(inp);
3810 so->so_error = ECONNABORTED;
3811
3812 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3813
3814 t_template = tcp_maketemplate(tp);
3815 if (t_template) {
3816 struct tcp_respond_args tra;
3817
3818 bzero(&tra, sizeof(tra));
3819 if (inp->inp_flags & INP_BOUND_IF)
3820 tra.ifscope = inp->inp_boundifp->if_index;
3821 else
3822 tra.ifscope = IFSCOPE_NONE;
3823 tra.awdl_unrestricted = 1;
3824
3825 tcp_respond(tp, t_template->tt_ipgen,
3826 &t_template->tt_t, (struct mbuf *)NULL,
3827 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
3828 (void) m_free(dtom(t_template));
3829 mptcplog((LOG_DEBUG, "MPTCP Events: "
3830 "%s: mp_so 0x%llx cid %d \n",
3831 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3832 mpts->mpts_connid),
3833 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3834 }
3835 mptcp_subflow_abort(mpts, ECONNABORTED);
3836
3837 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3838 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
3839
3840 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3841 mp_so->so_error = ECONNABORTED;
3842 else
3843 mp_so->so_error = ECONNRESET;
3844
3845 /*
3846 * mptcp_drop will be called after the events have been processed,
3847 * to fully close the MPTCP connection.
3848 */
3849 }
3850
3851 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3852 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
3853
3854 return (MPTS_EVRET_DELETE);
3855 }
3856
3857 static ev_ret_t
3858 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3859 uint64_t *p_mpsofilt_hint, uint64_t event)
3860 {
3861 #pragma unused(event)
3862 bool found_active = false;
3863
3864 mpts->mpts_flags |= MPTSF_READ_STALL;
3865
3866 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3867 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3868
3869 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3870 TCPS_HAVERCVDFIN2(tp->t_state))
3871 continue;
3872
3873 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
3874 found_active = true;
3875 break;
3876 }
3877 }
3878
3879 if (!found_active)
3880 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
3881
3882 return (MPTS_EVRET_OK);
3883 }
3884
3885 static ev_ret_t
3886 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3887 uint64_t *p_mpsofilt_hint, uint64_t event)
3888 {
3889 #pragma unused(event)
3890 bool found_active = false;
3891
3892 mpts->mpts_flags |= MPTSF_WRITE_STALL;
3893
3894 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3895 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3896
3897 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3898 tp->t_state > TCPS_CLOSE_WAIT)
3899 continue;
3900
3901 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
3902 found_active = true;
3903 break;
3904 }
3905 }
3906
3907 if (!found_active)
3908 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
3909
3910 return (MPTS_EVRET_OK);
3911 }
3912
3913 static const char *
3914 mptcp_evret2str(ev_ret_t ret)
3915 {
3916 const char *c = "UNKNOWN";
3917
3918 switch (ret) {
3919 case MPTS_EVRET_DELETE:
3920 c = "MPTS_EVRET_DELETE";
3921 break;
3922 case MPTS_EVRET_CONNECT_PENDING:
3923 c = "MPTS_EVRET_CONNECT_PENDING";
3924 break;
3925 case MPTS_EVRET_DISCONNECT_FALLBACK:
3926 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3927 break;
3928 case MPTS_EVRET_OK:
3929 c = "MPTS_EVRET_OK";
3930 break;
3931 default:
3932 break;
3933 }
3934 return (c);
3935 }
3936
3937 /*
3938 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3939 * caller must ensure that the option can be issued on subflow sockets, via
3940 * MPOF_SUBFLOW_OK flag.
3941 */
3942 int
3943 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
3944 {
3945 struct socket *mp_so, *so;
3946 struct sockopt sopt;
3947 int error;
3948
3949 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3950 mpte_lock_assert_held(mpte);
3951
3952 mp_so = mptetoso(mpte);
3953 so = mpts->mpts_socket;
3954
3955 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
3956 mpo->mpo_level == SOL_SOCKET &&
3957 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
3958 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %u lastcell? %d boundcell? %d\n",
3959 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(),
3960 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
3961 mpts->mpts_ifscope != IFSCOPE_NONE ? IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]) : -1),
3962 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3963
3964 /*
3965 * When we open a new subflow, mark it as a cell fallback if
3966 * this subflow goes over cell.
3967 *
3968 * (except for first-party apps)
3969 */
3970
3971 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
3972 return (0);
3973
3974 if (sotoinpcb(so)->inp_last_outifp &&
3975 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
3976 return (0);
3977
3978 /*
3979 * This is an OR: if the app is not binding to the
3980 * interface, then it definitely is not a cell-fallback
3981 * connection.
3982 */
3983 if (mpts->mpts_ifscope == IFSCOPE_NONE ||
3984 !IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]))
3985 return (0);
3986 }
3987
3988 mpo->mpo_flags &= ~MPOF_INTERIM;
3989
3990 bzero(&sopt, sizeof (sopt));
3991 sopt.sopt_dir = SOPT_SET;
3992 sopt.sopt_level = mpo->mpo_level;
3993 sopt.sopt_name = mpo->mpo_name;
3994 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3995 sopt.sopt_valsize = sizeof (int);
3996 sopt.sopt_p = kernproc;
3997
3998 error = sosetoptlock(so, &sopt, 0);
3999 if (error == 0) {
4000 mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
4001 "val %d set successful\n", __func__,
4002 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4003 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4004 mpo->mpo_intval),
4005 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4006 } else {
4007 mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
4008 "val %d set error %d\n", __func__,
4009 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4010 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4011 mpo->mpo_intval, error),
4012 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
4013 }
4014 return (error);
4015 }
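
/*
 * Usage sketch (hypothetical caller, assuming the MP socket lock is
 * held and mpo is a queued option with MPOF_SUBFLOW_OK set, per the
 * contract above):
 *
 *	struct mptsub *mpts;
 *
 *	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry)
 *		(void) mptcp_subflow_sosetopt(mpte, mpts, mpo);
 */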
4016
4017 /*
4018 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4019 * caller must ensure that the option can be issued on subflow sockets, via
4020 * MPOF_SUBFLOW_OK flag.
4021 */
4022 int
4023 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4024 struct mptopt *mpo)
4025 {
4026 struct socket *mp_so;
4027 struct sockopt sopt;
4028 int error;
4029
4030 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4031 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4032 mp_so = mptetoso(mpte);
4033
4034 bzero(&sopt, sizeof (sopt));
4035 sopt.sopt_dir = SOPT_GET;
4036 sopt.sopt_level = mpo->mpo_level;
4037 sopt.sopt_name = mpo->mpo_name;
4038 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4039 sopt.sopt_valsize = sizeof (int);
4040 sopt.sopt_p = kernproc;
4041
4042 error = sogetoptlock(so, &sopt, 0); /* already locked */
4043 if (error == 0) {
4044 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4045 "%s: mp_so 0x%llx sopt %s "
4046 "val %d get successful\n", __func__,
4047 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4048 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4049 mpo->mpo_intval),
4050 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4051 } else {
4052 mptcplog((LOG_ERR, "MPTCP Socket: "
4053 "%s: mp_so 0x%llx sopt %s get error %d\n",
4054 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4055 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
4056 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
4057 }
4058 return (error);
4059 }
4060
4061
4062 /*
4063 * MPTCP garbage collector.
4064 *
4065 * This routine is called by the MP domain's on-demand, periodic callout,
4066 * which is triggered when an MPTCP socket is closed. The callout will
4067 * repeat as long as this routine returns a non-zero value.
4068 */
4069 static uint32_t
4070 mptcp_gc(struct mppcbinfo *mppi)
4071 {
4072 struct mppcb *mpp, *tmpp;
4073 uint32_t active = 0;
4074
4075 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4076
4077 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4078 struct socket *mp_so;
4079 struct mptses *mpte;
4080 struct mptcb *mp_tp;
4081
4082 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4083 mp_so = mpp->mpp_socket;
4084 VERIFY(mp_so != NULL);
4085 mpte = mptompte(mpp);
4086 VERIFY(mpte != NULL);
4087 mp_tp = mpte->mpte_mptcb;
4088 VERIFY(mp_tp != NULL);
4089
4090 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4091 "%s: mp_so 0x%llx found "
4092 "(u=%d,r=%d,s=%d)\n", __func__,
4093 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
4094 mp_so->so_retaincnt, mpp->mpp_state),
4095 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4096
4097 if (!mpte_try_lock(mpte)) {
4098 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4099 "%s: mp_so 0x%llx skipped lock "
4100 "(u=%d,r=%d)\n", __func__,
4101 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4102 mp_so->so_usecount, mp_so->so_retaincnt),
4103 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4104 active++;
4105 continue;
4106 }
4107
4108 /* check again under the lock */
4109 if (mp_so->so_usecount > 0) {
4110 boolean_t wakeup = FALSE;
4111 struct mptsub *mpts, *tmpts;
4112
4113 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4114 "%s: mp_so 0x%llx skipped usecount "
4115 "[u=%d,r=%d] %d %d\n", __func__,
4116 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4117 mp_so->so_usecount, mp_so->so_retaincnt,
4118 mp_tp->mpt_gc_ticks,
4119 mp_tp->mpt_state),
4120 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4121
4122 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4123 if (mp_tp->mpt_gc_ticks > 0)
4124 mp_tp->mpt_gc_ticks--;
4125 if (mp_tp->mpt_gc_ticks == 0) {
4126 wakeup = TRUE;
4127 }
4128 }
4129 if (wakeup) {
4130 TAILQ_FOREACH_SAFE(mpts,
4131 &mpte->mpte_subflows, mpts_entry, tmpts) {
4132 mptcp_subflow_eupcall1(mpts->mpts_socket,
4133 mpts, SO_FILT_HINT_DISCONNECTED);
4134 }
4135 }
4136 mpte_unlock(mpte);
4137 active++;
4138 continue;
4139 }
4140
4141 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4142 panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
4143 "[u=%d,r=%d,s=%d]\n", __func__,
4144 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4145 mp_so->so_usecount, mp_so->so_retaincnt,
4146 mpp->mpp_state);
4147 }
4148
4149 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
4150 mptcp_close(mpte, mp_tp);
4151
4152 mptcp_session_destroy(mpte);
4153
4154 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4155 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
4156 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4157 mp_so->so_usecount, mp_so->so_retaincnt),
4158 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4159
4160 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4161 struct sockbuf *, &mp_so->so_rcv,
4162 struct sockbuf *, &mp_so->so_snd,
4163 struct mppcb *, mpp);
4164
4165 mp_pcbdispose(mpp);
4166 sodealloc(mp_so);
4167 }
4168
4169 return (active);
4170 }
4171
4172 /*
4173 * Drop an MPTCP connection, reporting the specified error.
4174 */
4175 struct mptses *
4176 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4177 {
4178 struct socket *mp_so;
4179
4180 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4181 VERIFY(mpte->mpte_mptcb == mp_tp);
4182 mp_so = mptetoso(mpte);
4183
4184 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4185 uint32_t, 0 /* event */);
4186
4187 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
4188 errno = mp_tp->mpt_softerror;
4189 mp_so->so_error = errno;
4190
4191 return (mptcp_close(mpte, mp_tp));
4192 }
4193
4194 /*
4195 * Close an MPTCP control block.
4196 */
4197 struct mptses *
4198 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4199 {
4200 struct socket *mp_so = NULL;
4201 struct mptsub *mpts = NULL, *tmpts = NULL;
4202
4203 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4204 VERIFY(mpte->mpte_mptcb == mp_tp);
4205 mp_so = mptetoso(mpte);
4206
4207 mp_tp->mpt_state = MPTCPS_TERMINATE;
4208
4209 mptcp_freeq(mp_tp);
4210
4211 soisdisconnected(mp_so);
4212
4213 /* Clean up all subflows */
4214 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4215 mptcp_subflow_disconnect(mpte, mpts);
4216 }
4217
4218 return (NULL);
4219 }
4220
4221 void
4222 mptcp_notify_close(struct socket *so)
4223 {
4224 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4225 }
4226
4227 /*
4228 * MPTCP workloop.
4229 */
4230 void
4231 mptcp_subflow_workloop(struct mptses *mpte)
4232 {
4233 struct socket *mp_so;
4234 struct mptsub *mpts, *tmpts;
4235 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4236 uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4237
4238 mpte_lock_assert_held(mpte);
4239 VERIFY(mpte->mpte_mppcb != NULL);
4240 mp_so = mptetoso(mpte);
4241 VERIFY(mp_so != NULL);
4242
4243 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4244 ev_ret_t ret;
4245
4246 if (mpts->mpts_socket->so_usecount == 0) {
4247 /* Will be removed soon by tcp_garbage_collect */
4248 continue;
4249 }
4250
4251 mptcp_subflow_addref(mpts);
4252 mpts->mpts_socket->so_usecount++;
4253
4254 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4255
4256 /*
4257 * If the MPTCP socket is closed, disconnect all subflows.
4258 * This will generate a disconnect event which will
4259 * be handled during the next iteration, causing a
4260 * non-zero error to be returned above.
4261 */
4262 if (mp_so->so_flags & SOF_PCBCLEARING)
4263 mptcp_subflow_disconnect(mpte, mpts);
4264
4265 switch (ret) {
4266 case MPTS_EVRET_OK:
4267 /* nothing to do */
4268 break;
4269 case MPTS_EVRET_DELETE:
4270 mptcp_subflow_soclose(mpts);
4271 break;
4272 case MPTS_EVRET_CONNECT_PENDING:
4273 connect_pending = TRUE;
4274 break;
4275 case MPTS_EVRET_DISCONNECT_FALLBACK:
4276 disconnect_fallback = TRUE;
4277 break;
4278 default:
4279 mptcplog((LOG_DEBUG,
4280 "MPTCP Socket: %s: mptcp_subflow_events "
4281 "returned invalid value: %d\n", __func__,
4282 ret),
4283 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4284 break;
4285 }
4286 mptcp_subflow_remref(mpts); /* ours */
4287
4288 VERIFY(mpts->mpts_socket->so_usecount != 0);
4289 mpts->mpts_socket->so_usecount--;
4290 }
4291
4292 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4293 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4294
4295 soevent(mp_so, mpsofilt_hint_mask);
4296 }
4297
4298 if (!connect_pending && !disconnect_fallback)
4299 return;
4300
4301 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4302 if (disconnect_fallback) {
4303 struct socket *so = NULL;
4304 struct inpcb *inp = NULL;
4305 struct tcpcb *tp = NULL;
4306
4307 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
4308 continue;
4309
4310 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4311
4312 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
4313 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
4314 continue;
4315
4316 so = mpts->mpts_socket;
4317
4318 /*
4319 * The MPTCP connection has degraded to a fallback
4320 * mode, so there is no point in keeping this subflow
4321 * regardless of its MPTCP-readiness state, unless it
4322 * is the primary one which we use for fallback. This
4323 * assumes that the subflow used for fallback is the
4324 * ACTIVE one.
4325 */
4326
4327 inp = sotoinpcb(so);
4328 tp = intotcpcb(inp);
4329 tp->t_mpflags &=
4330 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4331 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4332
4333 if (mpts->mpts_flags & MPTSF_ACTIVE) {
4334 continue;
4335 }
4336 tp->t_mpflags |= TMPF_RESET;
4337 soevent(so, SO_FILT_HINT_MUSTRST);
4338 } else if (connect_pending) {
4339 /*
4340 * The MPTCP connection has progressed to a state
4341 * where it supports full multipath semantics; allow
4342 * additional joins to be attempted for all subflows
4343 * that are in the PENDING state.
4344 */
4345 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4346 int error = mptcp_subflow_soconnectx(mpte, mpts);
4347
4348 if (error)
4349 mptcp_subflow_abort(mpts, error);
4350 }
4351 }
4352 }
4353 }
4354
4355 /*
4356 * Protocol pr_lock callback.
4357 */
4358 int
4359 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4360 {
4361 struct mppcb *mpp = mpsotomppcb(mp_so);
4362 void *lr_saved;
4363
4364 if (lr == NULL)
4365 lr_saved = __builtin_return_address(0);
4366 else
4367 lr_saved = lr;
4368
4369 if (mpp == NULL) {
4370 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4371 mp_so, lr_saved, solockhistory_nr(mp_so));
4372 /* NOTREACHED */
4373 }
4374 mpp_lock(mpp);
4375
4376 if (mp_so->so_usecount < 0) {
4377 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4378 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4379 solockhistory_nr(mp_so));
4380 /* NOTREACHED */
4381 }
4382 if (refcount != 0)
4383 mp_so->so_usecount++;
4384 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4385 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4386
4387 return (0);
4388 }
4389
4390 /*
4391 * Protocol pr_unlock callback.
4392 */
4393 int
4394 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4395 {
4396 struct mppcb *mpp = mpsotomppcb(mp_so);
4397 void *lr_saved;
4398
4399 if (lr == NULL)
4400 lr_saved = __builtin_return_address(0);
4401 else
4402 lr_saved = lr;
4403
4404 if (mpp == NULL) {
4405 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4406 mp_so, mp_so->so_usecount, lr_saved,
4407 solockhistory_nr(mp_so));
4408 /* NOTREACHED */
4409 }
4410 mpp_lock_assert_held(mpp);
4411
4412 if (refcount != 0)
4413 mp_so->so_usecount--;
4414
4415 if (mp_so->so_usecount < 0) {
4416 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4417 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4418 /* NOTREACHED */
4419 }
4420 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4421 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4422 mpp_unlock(mpp);
4423
4424 return (0);
4425 }
4426
4427 /*
4428 * Protocol pr_getlock callback.
4429 */
4430 lck_mtx_t *
4431 mptcp_getlock(struct socket *mp_so, int flags)
4432 {
4433 struct mppcb *mpp = mpsotomppcb(mp_so);
4434
4435 if (mpp == NULL) {
4436 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4437 solockhistory_nr(mp_so));
4438 /* NOTREACHED */
4439 }
4440 if (mp_so->so_usecount < 0) {
4441 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4442 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4443 /* NOTREACHED */
4444 }
4445 return (mpp_getlock(mpp, flags));
4446 }
4447
4448 /*
4449 * MPTCP Join support
4450 */
4451
4452 static void
4453 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
4454 uint8_t addr_id)
4455 {
4456 struct tcpcb *tp = sototcpcb(so);
4457 struct mptcp_subf_auth_entry *sauth_entry;
4458 mpte_lock_assert_held(mp_tp->mpt_mpte);
4459
4460 /*
4461 * The address ID of the first flow is implicitly 0.
4462 */
4463 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4464 tp->t_local_aid = 0;
4465 } else {
4466 tp->t_local_aid = addr_id;
4467 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4468 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4469 }
4470 sauth_entry = zalloc(mpt_subauth_zone);
4471 sauth_entry->msae_laddr_id = tp->t_local_aid;
4472 sauth_entry->msae_raddr_id = 0;
4473 sauth_entry->msae_raddr_rand = 0;
4474 try_again:
4475 sauth_entry->msae_laddr_rand = RandomULong();
4476 if (sauth_entry->msae_laddr_rand == 0)
4477 goto try_again;
4478 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4479 }
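
/*
 * Note: the try_again loop above re-draws msae_laddr_rand until it is
 * non-zero, presumably because a zero random is treated as "unset"
 * (compare the msae_raddr_rand checks in mptcp_set_raddr_rand()).
 */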
4480
4481 static void
4482 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4483 {
4484 struct mptcp_subf_auth_entry *sauth_entry;
4485 struct tcpcb *tp = NULL;
4486 int found = 0;
4487
4488 tp = sototcpcb(so);
4489 if (tp == NULL)
4490 return;
4491
4492 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4493 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4494 found = 1;
4495 break;
4496 }
4497 }
4498 if (found) {
4499 LIST_REMOVE(sauth_entry, msae_next);
4500 zfree(mpt_subauth_zone, sauth_entry);
4501 }
4504 }
4505
4506 void
4507 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4508 u_int32_t *rrand)
4509 {
4510 struct mptcp_subf_auth_entry *sauth_entry;
4511 mpte_lock_assert_held(mp_tp->mpt_mpte);
4512
4513 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4514 if (sauth_entry->msae_laddr_id == addr_id) {
4515 if (lrand)
4516 *lrand = sauth_entry->msae_laddr_rand;
4517 if (rrand)
4518 *rrand = sauth_entry->msae_raddr_rand;
4519 break;
4520 }
4521 }
4522 }
4523
4524 void
4525 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4526 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4527 {
4528 struct mptcp_subf_auth_entry *sauth_entry;
4529 mpte_lock_assert_held(mp_tp->mpt_mpte);
4530
4531 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4532 if (sauth_entry->msae_laddr_id == laddr_id) {
4533 if ((sauth_entry->msae_raddr_id != 0) &&
4534 (sauth_entry->msae_raddr_id != raddr_id)) {
4535 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
4536 " address ids %d %d \n", __func__, raddr_id,
4537 sauth_entry->msae_raddr_id),
4538 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4539 return;
4540 }
4541 sauth_entry->msae_raddr_id = raddr_id;
4542 if ((sauth_entry->msae_raddr_rand != 0) &&
4543 (sauth_entry->msae_raddr_rand != raddr_rand)) {
4544 mptcplog((LOG_ERR, "MPTCP Socket: "
4545 "%s: dup SYN_ACK %d %d \n",
4546 __func__, raddr_rand,
4547 sauth_entry->msae_raddr_rand),
4548 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4549 return;
4550 }
4551 sauth_entry->msae_raddr_rand = raddr_rand;
4552 return;
4553 }
4554 }
4555 }
4556
4557 /*
4558 * SHA1 support for MPTCP
4559 */
4560 static void
4561 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
4562 {
4563 SHA1_CTX sha1ctxt;
4564 const unsigned char *sha1_base;
4565 int sha1_size;
4566
4567 sha1_base = (const unsigned char *) key;
4568 sha1_size = sizeof (mptcp_key_t);
4569 SHA1Init(&sha1ctxt);
4570 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4571 SHA1Final(sha_digest, &sha1ctxt);
4572 }
4573
4574 void
4575 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4576 u_int32_t rand1, u_int32_t rand2, u_char *digest)
4577 {
4578 SHA1_CTX sha1ctxt;
4579 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4580 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4581 u_int32_t data[2];
4582 int i;
4583
4584 bzero(digest, SHA1_RESULTLEN);
4585
4586 /* Set up the Key for HMAC */
4587 key_ipad[0] = key1;
4588 key_ipad[1] = key2;
4589
4590 key_opad[0] = key1;
4591 key_opad[1] = key2;
4592
4593 /* Set up the message for HMAC */
4594 data[0] = rand1;
4595 data[1] = rand2;
4596
4597 /* Key fits within one 512-bit block, so no need to hash it first */
4598
4599 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4600
4601 for (i = 0; i < 8; i++) {
4602 key_ipad[i] ^= 0x3636363636363636;
4603 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4604 }
4605
4606 /* Perform inner SHA1 */
4607 SHA1Init(&sha1ctxt);
4608 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4609 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4610 SHA1Final(digest, &sha1ctxt);
4611
4612 /* Perform outer SHA1 */
4613 SHA1Init(&sha1ctxt);
4614 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4615 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4616 SHA1Final(digest, &sha1ctxt);
4617 }
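
/*
 * The above is the standard HMAC construction (RFC 2104) over one
 * 64-byte block: digest = SHA1((K ^ opad) || SHA1((K ^ ipad) || msg)),
 * where K is the 16-byte key pair zero-padded to 64 bytes and msg the
 * 8-byte concatenation of the two randoms, as RFC 6824 prescribes for
 * MP_JOIN authentication. A userland sketch of the same computation
 * (assuming OpenSSL is available) would be:
 *
 *	unsigned char md[SHA_DIGEST_LENGTH];
 *	HMAC(EVP_sha1(), key, 16, msg, 8, md, NULL);
 */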
4618
4619 /*
4620 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4621 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4622 */
4623 void
4624 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
4625 {
4626 uint32_t lrand, rrand;
4627
4628 mpte_lock_assert_held(mp_tp->mpt_mpte);
4629
4630 lrand = rrand = 0;
4631 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4632 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
4633 digest);
4634 }
4635
4636 /*
4637 * Authentication data generation
4638 */
4639 static void
4640 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4641 int token_len)
4642 {
4643 VERIFY(token_len == sizeof (u_int32_t));
4644 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4645
4646 /* Most significant 32 bits of the SHA1 hash */
4647 bcopy(sha_digest, token, sizeof (u_int32_t));
4648 return;
4649 }
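
/*
 * Example (hypothetical digest): if SHA1(key) begins with the bytes
 * 9f 7c 01 22 ..., the token is those four bytes copied byte-for-byte,
 * i.e. the digest's most significant word kept in network byte order.
 */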
4650
4651 static void
4652 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4653 int idsn_len)
4654 {
4655 VERIFY(idsn_len == sizeof (u_int64_t));
4656 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4657
4658 /*
4659 * Least significant 64 bits of the SHA1 hash
4660 */
4661
4662 idsn[7] = sha_digest[12];
4663 idsn[6] = sha_digest[13];
4664 idsn[5] = sha_digest[14];
4665 idsn[4] = sha_digest[15];
4666 idsn[3] = sha_digest[16];
4667 idsn[2] = sha_digest[17];
4668 idsn[1] = sha_digest[18];
4669 idsn[0] = sha_digest[19];
4670 return;
4671 }
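
/*
 * The byte reversal above makes the host-endian 64-bit IDSN equal to
 * the digest's last eight bytes read in network (big-endian) order,
 * assuming a little-endian host; e.g. digest bytes 12..19 =
 * 00 11 22 33 44 55 66 77 yield idsn = 0x0011223344556677.
 */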
4672
4673 static void
4674 mptcp_conn_properties(struct mptcb *mp_tp)
4675 {
4676 /* There is only Version 0 at this time */
4677 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4678
4679 /* Set DSS checksum flag */
4680 if (mptcp_dss_csum)
4681 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4682
4683 /* Set up receive window */
4684 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4685
4686 /* Set up gc ticks */
4687 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4688 }
4689
4690 static void
4691 mptcp_init_local_parms(struct mptses *mpte)
4692 {
4693 struct mptcb *mp_tp = mpte->mpte_mptcb;
4694 char key_digest[SHA1_RESULTLEN];
4695
4696 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
4697 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
4698
4699 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
4700 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4701 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
4702 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4703
4704 /* The subflow SYN is also the first MPTCP byte */
4705 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4706 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4707
4708 mptcp_conn_properties(mp_tp);
4709 }
4710
4711 int
4712 mptcp_init_remote_parms(struct mptcb *mp_tp)
4713 {
4714 char remote_digest[SHA1_RESULTLEN];
4715 mpte_lock_assert_held(mp_tp->mpt_mpte);
4716
4717 /* Only Version 0 is supported for auth purposes */
4718 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
4719 return (-1);
4720
4721 /* Setup local and remote tokens and Initial DSNs */
4722 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
4723 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
4724 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
4725 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4726 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
4727 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
4728
4729 return (0);
4730 }
4731
4732 static void
4733 mptcp_send_dfin(struct socket *so)
4734 {
4735 struct tcpcb *tp = NULL;
4736 struct inpcb *inp = NULL;
4737
4738 inp = sotoinpcb(so);
4739 if (!inp)
4740 return;
4741
4742 tp = intotcpcb(inp);
4743 if (!tp)
4744 return;
4745
4746 if (!(tp->t_mpflags & TMPF_RESET))
4747 tp->t_mpflags |= TMPF_SEND_DFIN;
4748 }
4749
4750 /*
4751 * Data Sequence Mapping routines
4752 */
4753 void
4754 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4755 {
4756 struct mptcb *mp_tp;
4757
4758 if (m == NULL)
4759 return;
4760
4761 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
4762 mpte_lock_assert_held(mp_tp->mpt_mpte);
4763
4764 while (m) {
4765 VERIFY(m->m_flags & M_PKTHDR);
4766 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4767 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4768 m->m_pkthdr.mp_rlen = m_pktlen(m);
4769 mp_tp->mpt_sndmax += m_pktlen(m);
4770 m = m->m_next;
4771 }
4772 }
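
/*
 * Example (hypothetical lengths): with mpt_sndmax = 1000, a chain of
 * three packets of 100, 200 and 50 bytes receives the mappings
 * (dsn 1000, rlen 100), (dsn 1100, rlen 200) and (dsn 1300, rlen 50),
 * leaving mpt_sndmax at 1350.
 */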
4773
4774 void
4775 mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
4776 {
4777 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
4778 uint64_t data_ack;
4779 uint64_t dsn;
4780
4781 if (!m || len == 0)
4782 return;
4783
4784 while (m && len > 0) {
4785 VERIFY(m->m_flags & M_PKTHDR);
4786 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4787
4788 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4789 dsn = m->m_pkthdr.mp_dsn;
4790
4791 len -= m->m_len;
4792 m = m->m_next;
4793 }
4794
4795 if (m && len == 0) {
4796 /*
4797 * If there is one more mbuf in the chain, it automatically means
4798 * that up to m->mp_dsn has been ack'ed.
4799 *
4800 * This means, we actually correct data_ack back down (compared
4801 * to what we set inside the loop - dsn + data_len). Because in
4802 * the loop we are "optimistic" and assume that the full mapping
4803 * will be acked. If that's not the case and we get out of the
4804 * loop with m != NULL, it means only up to m->mp_dsn has been
4805 * really acked.
4806 */
4807 data_ack = m->m_pkthdr.mp_dsn;
4808 }
4809
4810 if (len < 0) {
4811 /*
4812 * If len is negative, meaning we acked in the middle of an mbuf,
4813 * only up to this mbuf's data-sequence number has been acked
4814 * at the MPTCP-level.
4815 */
4816 data_ack = dsn;
4817 }
4818
4819 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
4820 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4821 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
4822 }
4823
4824 void
4825 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
4826 {
4827 int rewinding = 0;
4828
4829 /* TFO makes things complicated. */
4830 if (so->so_flags1 & SOF1_TFO_REWIND) {
4831 rewinding = 1;
4832 so->so_flags1 &= ~SOF1_TFO_REWIND;
4833 }
4834
4835 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
4836 u_int32_t sub_len;
4837 VERIFY(m->m_flags & M_PKTHDR);
4838 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4839
4840 sub_len = m->m_pkthdr.mp_rlen;
4841
4842 if (sub_len < len) {
4843 m->m_pkthdr.mp_dsn += sub_len;
4844 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4845 m->m_pkthdr.mp_rseq += sub_len;
4846 }
4847 m->m_pkthdr.mp_rlen = 0;
4848 len -= sub_len;
4849 } else {
4850 /* sub_len >= len */
4851 if (rewinding == 0)
4852 m->m_pkthdr.mp_dsn += len;
4853 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4854 if (rewinding == 0)
4855 m->m_pkthdr.mp_rseq += len;
4856 }
4857 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
4858 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
4859 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
4860 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4861 m->m_pkthdr.mp_rlen -= len;
4862 break;
4863 }
4864 m = m->m_next;
4865 }
4866
4867 if (so->so_flags & SOF_MP_SUBFLOW &&
4868 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
4869 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
4870 /*
4871 * Received an ack without receiving a DATA_ACK.
4872 * Need to fallback to regular TCP (or destroy this subflow).
4873 */
4874 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
4875 mptcp_notify_mpfail(so);
4876 }
4877 }
4878
4879 /* Obtain the DSN mapping stored in the mbuf */
4880 void
4881 mptcp_output_getm_dsnmap32(struct socket *so, int off,
4882 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
4883 {
4884 u_int64_t dsn64;
4885
4886 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
4887 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4888 }
4889
4890 void
4891 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
4892 uint32_t *relseq, uint16_t *data_len,
4893 uint16_t *dss_csum)
4894 {
4895 struct mbuf *m = so->so_snd.sb_mb;
4896 int off_orig = off;
4897
4898 VERIFY(off >= 0);
4899
4900 /*
4901 * In the subflow socket, the DSN sequencing can be discontiguous,
4902 * but the subflow sequence mapping is contiguous. Use the subflow
4903 * sequence property to find the right mbuf and corresponding dsn
4904 * mapping.
4905 */
4906
4907 while (m) {
4908 VERIFY(m->m_flags & M_PKTHDR);
4909 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4910
4911 if (off >= m->m_len) {
4912 off -= m->m_len;
4913 m = m->m_next;
4914 } else {
4915 break;
4916 }
4917 }
4918
4919 VERIFY(m);
4920 VERIFY(off >= 0);
4921 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
4922
4923 *dsn = m->m_pkthdr.mp_dsn;
4924 *relseq = m->m_pkthdr.mp_rseq;
4925 *data_len = m->m_pkthdr.mp_rlen;
4926 *dss_csum = m->m_pkthdr.mp_csum;
4927
4928 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
4929 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
4930 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4931 }
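
/*
 * Example (hypothetical buffer): for off = 250 into a send buffer
 * holding mbufs of 100 and 200 bytes, the loop above lands in the
 * second mbuf (off becomes 150) and that mbuf's (mp_dsn, mp_rseq,
 * mp_rlen, mp_csum) tuple is returned.
 */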
4932
4933 /*
4934 * Note that this is called only from tcp_input() via mptcp_input_preproc().
4935 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4936 * When it trims data, tcp_input() calls m_adj(), which does not remove the
4937 * m_pkthdr even if m_len becomes 0 as a result of trimming the mbuf.
4938 * The dsn map insertion cannot be delayed until after the trim, because data
4939 * can sit in the reassembly queue for a while and the DSN option info in tp
4940 * would be overwritten for every new packet received.
4941 * The dsn map will be adjusted just prior to appending to the subflow sockbuf,
4942 * with mptcp_adj_rmap().
4943 */
4944 void
4945 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
4946 {
4947 VERIFY(m->m_flags & M_PKTHDR);
4948 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4949
4950 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4951 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4952 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4953 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4954 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
4955 if (tp->t_rcv_map.mpt_dfin)
4956 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
4957
4958 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4959
4960 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4961 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4962 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
4963 if (th->th_flags & TH_FIN)
4964 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
4965 }
4966 }
4967
4968 int
4969 mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
4970 uint32_t rseq, uint16_t dlen)
4971 {
4972 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
4973
4974 if (m_pktlen(m) == 0)
4975 return (0);
4976
4977 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
4978 if (off && (dsn != m->m_pkthdr.mp_dsn ||
4979 rseq != m->m_pkthdr.mp_rseq ||
4980 dlen != m->m_pkthdr.mp_rlen)) {
4981 mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu, %u - %u, %u - %u\n",
4982 __func__, dsn, m->m_pkthdr.mp_dsn,
4983 rseq, m->m_pkthdr.mp_rseq,
4984 dlen, m->m_pkthdr.mp_rlen),
4985 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
4986 return (-1);
4987 }
4988 m->m_pkthdr.mp_dsn += off;
4989 m->m_pkthdr.mp_rseq += off;
4990 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4991 } else {
4992 if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
4993 /* data arrived without a DSS option mapping */
4994
4995 /* initial subflow can fallback right after SYN handshake */
4996 mptcp_notify_mpfail(so);
4997 }
4998 }
4999
5000 mpts->mpts_flags |= MPTSF_CONFIRMED;
5001
5002 return (0);
5003 }
5004
5005 /*
5006 * Following routines help with failure detection and failover of data
5007 * transfer from one subflow to another.
5008 */
5009 void
5010 mptcp_act_on_txfail(struct socket *so)
5011 {
5012 struct tcpcb *tp = NULL;
5013 struct inpcb *inp = sotoinpcb(so);
5014
5015 if (inp == NULL)
5016 return;
5017
5018 tp = intotcpcb(inp);
5019 if (tp == NULL)
5020 return;
5021
5022 if (so->so_flags & SOF_MP_TRYFAILOVER)
5023 return;
5024
5025 so->so_flags |= SOF_MP_TRYFAILOVER;
5026 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5027 }
5028
5029 /*
5030 * Support for MP_FAIL option
5031 */
5032 int
5033 mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
5034 {
5035 struct mbuf *m = so->so_snd.sb_mb;
5036 u_int64_t dsn;
5037 int off = 0;
5038 u_int32_t datalen;
5039
5040 if (m == NULL)
5041 return (-1);
5042
5043 while (m != NULL) {
5044 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5045 VERIFY(m->m_flags & M_PKTHDR);
5046 dsn = m->m_pkthdr.mp_dsn;
5047 datalen = m->m_pkthdr.mp_rlen;
5048 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5049 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5050 off = dsn_fail - dsn;
5051 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5052 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
5053 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5054 return (0);
5055 }
5056
5057 m = m->m_next;
5058 }
5059
5060 /*
5061 * If there was no mbuf data and a fallback to TCP occurred, there's
5062 * not much else to do.
5063 */
5064
5065 mptcplog((LOG_ERR, "MPTCP Sender: "
5066 "%s: %llu not found \n", __func__, dsn_fail),
5067 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5068 return (-1);
5069 }
5070
5071 /*
5072 * Support for sending contiguous MPTCP bytes in subflow
5073 * Also for preventing sending data with ACK in 3-way handshake
5074 */
5075 int32_t
5076 mptcp_adj_sendlen(struct socket *so, int32_t off)
5077 {
5078 struct tcpcb *tp = sototcpcb(so);
5079 struct mptsub *mpts = tp->t_mpsub;
5080 uint64_t mdss_dsn;
5081 uint32_t mdss_subflow_seq;
5082 int mdss_subflow_off;
5083 uint16_t mdss_data_len;
5084 uint16_t dss_csum;
5085
5086 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5087 &mdss_data_len, &dss_csum);
5088
5089 /*
5090 * We need to compute how much of the mapping still remains.
5091 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5092 */
5093 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5094
5095 /*
5096 * When TFO is used, we are sending the mpts->mpts_iss although the relative
5097 * seq has been set to 1 (while it should be 0).
5098 */
5099 if (tp->t_mpflags & TMPF_TFO_REQUEST)
5100 mdss_subflow_off--;
5101
5102 if (off < mdss_subflow_off)
5103 printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
5104 off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
5105 VERIFY(off >= mdss_subflow_off);
5106
5107 mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
5108 __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
5109 mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5110 return (mdss_data_len - (off - mdss_subflow_off));
5111 }
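
/*
 * Worked example (hypothetical numbers, no TFO): with
 * mdss_subflow_seq = 10, mpts_iss = 1000 and snd_una = 1005, the
 * mapping starts at mdss_subflow_off = 5; for off = 20 and
 * mdss_data_len = 100, the mapping still has 100 - (20 - 5) = 85
 * contiguous bytes left to send.
 */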
5112
5113 static uint32_t
5114 mptcp_get_maxseg(struct mptses *mpte)
5115 {
5116 struct mptsub *mpts;
5117 uint32_t maxseg = 0;
5118
5119 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5120 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5121
5122 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5123 TCPS_HAVERCVDFIN2(tp->t_state))
5124 continue;
5125
5126 if (tp->t_maxseg > maxseg)
5127 maxseg = tp->t_maxseg;
5128 }
5129
5130 return (maxseg);
5131 }
5132
5133 static uint8_t
5134 mptcp_get_rcvscale(struct mptses *mpte)
5135 {
5136 struct mptsub *mpts;
5137 uint8_t rcvscale = UINT8_MAX;
5138
5139 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5140 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5141
5142 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5143 TCPS_HAVERCVDFIN2(tp->t_state))
5144 continue;
5145
5146 if (tp->rcv_scale < rcvscale)
5147 rcvscale = tp->rcv_scale;
5148 }
5149
5150 return (rcvscale);
5151 }
5152
5153 /* Similar to tcp_sbrcv_reserve */
5154 static void
5155 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5156 u_int32_t newsize, u_int32_t idealsize)
5157 {
5158 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5159
5160 /* newsize should not exceed max */
5161 newsize = min(newsize, tcp_autorcvbuf_max);
5162
5163 /* The receive window scale negotiated at the
5164 * beginning of the connection will also set a
5165 * limit on the socket buffer size
5166 */
5167 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5168
5169 /* Set new socket buffer size */
5170 if (newsize > sbrcv->sb_hiwat &&
5171 (sbreserve(sbrcv, newsize) == 1)) {
5172 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5173 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5174
5175 /* Again check the limit set by the advertised
5176 * window scale
5177 */
5178 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5179 TCP_MAXWIN << rcvscale);
5180 }
5181 }
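
/*
 * Example (hypothetical scale): if the smallest subflow rcv_scale is
 * 3, newsize is clamped to min(newsize, tcp_autorcvbuf_max,
 * TCP_MAXWIN << 3), i.e. to at most 524280 bytes by the window-scale
 * limit.
 */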
5182
5183 void
5184 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5185 {
5186 struct mptses *mpte = mp_tp->mpt_mpte;
5187 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5188 struct sockbuf *sbrcv = &mp_so->so_rcv;
5189 uint32_t hiwat_sum = 0;
5190 uint32_t ideal_sum = 0;
5191 struct mptsub *mpts;
5192
5193 /*
5194 * Do not grow the receive socket buffer if
5195 * - auto resizing is disabled, globally or on this socket
5196 * - the high water mark already reached the maximum
5197 * - the stream is in background and receive side is being
5198 * throttled
5199 * - if there are segments in reassembly queue indicating loss,
5200 * do not need to increase recv window during recovery as more
5201 * data is not going to be sent. A duplicate ack sent during
5202 * recovery should not change the receive window
5203 */
5204 if (tcp_do_autorcvbuf == 0 ||
5205 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5206 tcp_cansbgrow(sbrcv) == 0 ||
5207 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5208 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5209 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5210 /* Can not resize the socket buffer, just return */
5211 return;
5212 }
5213
5214 /*
5215 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5216 *
5217 * But, for this we first need accurate receiver-RTT estimations, which
5218 * we currently don't have.
5219 *
5220 * Let's use a dummy algorithm for now, just taking the sum of all
5221 * subflows' receive-buffers. It's too low, but that's all we can get
5222 * for now.
5223 */
5224
5225 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5226 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5227 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5228 }
5229
5230 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5231 }
5232
5233 /*
5234 * Determine if we can grow the receive socket buffer to avoid sending
5235 * a zero window update to the peer. We allow even socket buffers that
5236 * have fixed size (set by the application) to grow if the resource
5237 * constraints are met. They will also be trimmed after the application
5238 * reads data.
5239 *
5240 * Similar to tcp_sbrcv_grow_rwin
5241 */
5242 static void
5243 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5244 {
5245 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5246 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5247 u_int32_t rcvbuf = sb->sb_hiwat;
5248
5249 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
5250 return;
5251
5252 if (tcp_do_autorcvbuf == 1 &&
5253 tcp_cansbgrow(sb) &&
5254 /* Diff to tcp_sbrcv_grow_rwin */
5255 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5256 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5257 rcvbuf < tcp_autorcvbuf_max &&
5258 (sb->sb_idealsize > 0 &&
5259 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5260 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5261 }
5262 }
5263
5264 /* Similar to tcp_sbspace */
5265 int32_t
5266 mptcp_sbspace(struct mptcb *mp_tp)
5267 {
5268 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5269 uint32_t rcvbuf;
5270 int32_t space;
5271 int32_t pending = 0;
5272
5273 mpte_lock_assert_held(mp_tp->mpt_mpte);
5274
5275 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5276
5277 /* hiwat might have changed */
5278 rcvbuf = sb->sb_hiwat;
5279
5280 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5281 (sb->sb_mbmax - sb->sb_mbcnt)));
5282 if (space < 0)
5283 space = 0;
5284
5285 #if CONTENT_FILTER
5286 /* Compensate for data being processed by content filters */
5287 pending = cfil_sock_data_space(sb);
5288 #endif /* CONTENT_FILTER */
5289 if (pending > space)
5290 space = 0;
5291 else
5292 space -= pending;
5293
5294 return (space);
5295 }
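
/*
 * Example (hypothetical sizes): with rcvbuf = 64 KB, sb_cc = 16 KB
 * and ample mbuf quota, space is 48 KB, further reduced by any bytes
 * still being held by content filters.
 */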
5296
5297 /*
5298 * Support Fallback to Regular TCP
5299 */
5300 void
5301 mptcp_notify_mpready(struct socket *so)
5302 {
5303 struct tcpcb *tp = NULL;
5304
5305 if (so == NULL)
5306 return;
5307
5308 tp = intotcpcb(sotoinpcb(so));
5309
5310 if (tp == NULL)
5311 return;
5312
5313 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5314 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5315 struct tcpcb *, tp);
5316
5317 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5318 return;
5319
5320 if (tp->t_mpflags & TMPF_MPTCP_READY)
5321 return;
5322
5323 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5324 tp->t_mpflags |= TMPF_MPTCP_READY;
5325
5326 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5327 }
5328
5329 void
5330 mptcp_notify_mpfail(struct socket *so)
5331 {
5332 struct tcpcb *tp = NULL;
5333
5334 if (so == NULL)
5335 return;
5336
5337 tp = intotcpcb(sotoinpcb(so));
5338
5339 if (tp == NULL)
5340 return;
5341
5342 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5343 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5344 struct tcpcb *, tp);
5345
5346 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5347 return;
5348
5349 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5350 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5351
5352 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5353 }
5354
5355 /*
5356 * Keepalive helper function
5357 */
5358 boolean_t
5359 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5360 {
5361 boolean_t ret = 1;
5362 mpte_lock_assert_held(mp_tp->mpt_mpte);
5363
5364 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5365 ret = 0;
5366 }
5367 return (ret);
5368 }
5369
5370 /*
5371 * MPTCP t_maxseg adjustment function
5372 */
5373 int
5374 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5375 {
5376 int mss_lower = 0;
5377 struct mptcb *mp_tp = tptomptp(tp);
5378
5379 #define MPTCP_COMPUTE_LEN { \
5380 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5381 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5382 mss_lower += 2; \
5383 else \
5384 /* adjust to 32-bit boundary + EOL */ \
5385 mss_lower += 2; \
5386 }
5387 if (mp_tp == NULL)
5388 return (0);
5389
5390 mpte_lock_assert_held(mp_tp->mpt_mpte);
5391
5392 /*
5393 * For the first subflow and subsequent subflows, adjust mss for
5394 * most common MPTCP option size, for case where tcp_mss is called
5395 * during option processing and MTU discovery.
5396 */
5397 if (!mtudisc) {
5398 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5399 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5400 MPTCP_COMPUTE_LEN;
5401 }
5402
5403 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5404 tp->t_mpflags & TMPF_SENT_JOIN) {
5405 MPTCP_COMPUTE_LEN;
5406 }
5407 } else {
5408 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5409 MPTCP_COMPUTE_LEN;
5410 }
5411 }
5412
5413 return (mss_lower);
5414 }
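
/*
 * Both branches of MPTCP_COMPUTE_LEN above lower the MSS by
 * sizeof(struct mptcp_dss_ack_opt) + 2: the two extra bytes carry the
 * DSS checksum when MPTCPF_CHECKSUM is set, and otherwise pad the
 * option out to a 32-bit boundary plus EOL.
 */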
5415
5416 /*
5417 * Update the pid, upid, uuid of the subflow so, based on parent so
5418 */
5419 void
5420 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5421 {
5422 if (so->last_pid != mp_so->last_pid ||
5423 so->last_upid != mp_so->last_upid) {
5424 so->last_upid = mp_so->last_upid;
5425 so->last_pid = mp_so->last_pid;
5426 uuid_copy(so->last_uuid, mp_so->last_uuid);
5427 }
5428 so_update_policy(so);
5429 }
5430
5431 static void
5432 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5433 {
5434 struct inpcb *inp;
5435
5436 tcp_getconninfo(so, &flow->flow_ci);
5437 inp = sotoinpcb(so);
5438 #if INET6
5439 if ((inp->inp_vflag & INP_IPV6) != 0) {
5440 flow->flow_src.ss_family = AF_INET6;
5441 flow->flow_dst.ss_family = AF_INET6;
5442 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5443 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5444 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5445 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5446 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5447 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5448 } else
5449 #endif
5450 if ((inp->inp_vflag & INP_IPV4) != 0) {
5451 flow->flow_src.ss_family = AF_INET;
5452 flow->flow_dst.ss_family = AF_INET;
5453 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5454 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5455 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5456 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5457 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5458 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5459 }
5460 flow->flow_len = sizeof(*flow);
5461 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5462 flow->flow_flags = mpts->mpts_flags;
5463 flow->flow_cid = mpts->mpts_connid;
5464 flow->flow_relseq = mpts->mpts_rel_seq;
5465 flow->flow_soerror = mpts->mpts_socket->so_error;
5466 flow->flow_probecnt = mpts->mpts_probecnt;
5467 }
5468
5469 static int
5470 mptcp_pcblist SYSCTL_HANDLER_ARGS
5471 {
5472 #pragma unused(oidp, arg1, arg2)
5473 int error = 0, f;
5474 size_t len;
5475 struct mppcb *mpp;
5476 struct mptses *mpte;
5477 struct mptcb *mp_tp;
5478 struct mptsub *mpts;
5479 struct socket *so;
5480 conninfo_mptcp_t mptcpci;
5481 mptcp_flow_t *flows = NULL;
5482
5483 if (req->newptr != USER_ADDR_NULL)
5484 return (EPERM);
5485
5486 lck_mtx_lock(&mtcbinfo.mppi_lock);
5487 if (req->oldptr == USER_ADDR_NULL) {
5488 size_t n = mtcbinfo.mppi_count;
5489 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5490 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
5491 4 * (n + n/8) * sizeof(mptcp_flow_t);
5492 return (0);
5493 }
5494 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5495 flows = NULL;
5496 mpp_lock(mpp);
5497 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5498 mpte = mptompte(mpp);
5499 VERIFY(mpte != NULL);
5500 mpte_lock_assert_held(mpte);
5501 mp_tp = mpte->mpte_mptcb;
5502 VERIFY(mp_tp != NULL);
5503
5504 bzero(&mptcpci, sizeof(mptcpci));
5505 mptcpci.mptcpci_state = mp_tp->mpt_state;
5506 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5507 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5508 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5509 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5510 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5511 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5512 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5513 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5514 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5515 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5516 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
5517 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5518 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5519
5520 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
5521 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5522 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5523 mptcpci.mptcpci_flow_offset =
5524 offsetof(conninfo_mptcp_t, mptcpci_flows);
5525
5526 len = sizeof(*flows) * mpte->mpte_numflows;
5527 if (mpte->mpte_numflows != 0) {
5528 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5529 if (flows == NULL) {
5530 mpp_unlock(mpp);
5531 break;
5532 }
5533 mptcpci.mptcpci_len = sizeof(mptcpci) +
5534 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5535 error = SYSCTL_OUT(req, &mptcpci,
5536 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5537 } else {
5538 mptcpci.mptcpci_len = sizeof(mptcpci);
5539 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
5540 }
5541 if (error) {
5542 mpp_unlock(mpp);
5543 FREE(flows, M_TEMP);
5544 break;
5545 }
5546 f = 0;
5547 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5548 so = mpts->mpts_socket;
5549 fill_mptcp_subflow(so, &flows[f], mpts);
5550 f++;
5551 }
5552 mpp_unlock(mpp);
5553 if (flows) {
5554 error = SYSCTL_OUT(req, flows, len);
5555 FREE(flows, M_TEMP);
5556 if (error)
5557 break;
5558 }
5559 }
5560 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5561
5562 return (error);
5563 }
5564
5565 SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
5566 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
5567 "List of active MPTCP connections");
5568
5569 /*
5570 * Set notsent lowat mark on the MPTCB
5571 */
5572 int
5573 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5574 {
5575 struct mptcb *mp_tp = NULL;
5576 int error = 0;
5577
5578 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5579 mp_tp = mpte->mpte_mptcb;
5580
5581 if (mp_tp)
5582 mp_tp->mpt_notsent_lowat = optval;
5583 else
5584 error = EINVAL;
5585
5586 return (error);
5587 }
5588
5589 u_int32_t
5590 mptcp_get_notsent_lowat(struct mptses *mpte)
5591 {
5592 struct mptcb *mp_tp = NULL;
5593
5594 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5595 mp_tp = mpte->mpte_mptcb;
5596
5597 if (mp_tp)
5598 return (mp_tp->mpt_notsent_lowat);
5599 else
5600 return (0);
5601 }
5602
5603 int
5604 mptcp_notsent_lowat_check(struct socket *so)
5605 {
5606 struct mptses *mpte;
5607 struct mppcb *mpp;
5608 struct mptcb *mp_tp;
5609 struct mptsub *mpts;
5610
5611 int notsent = 0;
5612
5613 mpp = mpsotomppcb(so);
5614 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5615 return (0);
5616 }
5617
5618 mpte = mptompte(mpp);
5619 mpte_lock_assert_held(mpte);
5620 mp_tp = mpte->mpte_mptcb;
5621
5622 notsent = so->so_snd.sb_cc;
5623
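/*
 * Bytes still unsent at the MPTCP level are what sits in the send buffer
 * minus what was already sent on subflows but not yet acked
 * (mpt_sndnxt - mpt_snduna). E.g. with 64KB in so_snd, 56KB in flight
 * and a lowat of 16KB, 8KB remain unsent, so the writer should be woken.
 */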
5624 if ((notsent == 0) ||
5625 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5626 mp_tp->mpt_notsent_lowat)) {
5627 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5628 "lowat %d notsent %d actual %d \n",
5629 mp_tp->mpt_notsent_lowat, notsent,
5630 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5631 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5632 return (1);
5633 }
5634
5635 /*
5636 * When Nagle's algorithm is not disabled, it is better to wake up
5637 * the client even before there is at least one maxseg of data to write.
5638 */
5639 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5640 int retval = 0;
5641 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5642 struct socket *subf_so = mpts->mpts_socket;
5643 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
5644
5645 notsent = so->so_snd.sb_cc -
5646 (tp->snd_nxt - tp->snd_una);
5647
5648 if ((tp->t_flags & TF_NODELAY) == 0 &&
5649 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5650 retval = 1;
5651 }
5652 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
5653 " nodelay false \n",
5654 mp_tp->mpt_notsent_lowat, notsent),
5655 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5656 return (retval);
5657 }
5658 }
5659 return (0);
5660 }
5661
5662 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5663 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5664 static uint32_t mptcp_kern_skt_inuse = 0;
5665 static uint32_t mptcp_kern_skt_unit;
5666 symptoms_advisory_t mptcp_advisory;
5667
5668 static errno_t
5669 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5670 void **unitinfo)
5671 {
5672 #pragma unused(kctlref, sac, unitinfo)
5673
5674 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
5675 mptcplog((LOG_ERR, "%s MPTCP kernel-control socket already open!\n", __func__),
5676 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5677
5678 mptcp_kern_skt_unit = sac->sc_unit;
5679
5680 return (0);
5681 }
5682
5683 static void
5684 mptcp_allow_uuid(uuid_t uuid)
5685 {
5686 struct mppcb *mpp;
5687
5688 /* Iterate over all MPTCP connections */
5689
5690 lck_mtx_lock(&mtcbinfo.mppi_lock);
5691
5692 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5693 struct mptses *mpte;
5694 struct socket *mp_so;
5695
5696 mpp_lock(mpp);
5697
5698 mpte = mpp->mpp_pcbe;
5699 mp_so = mpp->mpp_socket;
5700
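/*
 * uuid_compare() returns 0 on a match; a non-zero result means this
 * connection belongs to a different application, so skip it.
 */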
5701 if (mp_so->so_flags & SOF_DELEGATED &&
5702 uuid_compare(uuid, mp_so->e_uuid))
5703 goto next;
5704 else if (!(mp_so->so_flags & SOF_DELEGATED) &&
5705 uuid_compare(uuid, mp_so->last_uuid))
5706 goto next;
5707
5708 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
5709
5710 mptcp_check_subflows_and_add(mpte);
5711 mptcp_remove_subflows(mpte);
5712
5713 mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;
5714
5715 next:
5716 mpp_unlock(mpp);
5717 }
5718
5719 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5720 }
5721
5722 static void
5723 mptcp_wifi_status_changed(void)
5724 {
5725 struct mppcb *mpp;
5726
5727 /* Iterate over all MPTCP connections */
5728
5729 lck_mtx_lock(&mtcbinfo.mppi_lock);
5730
5731 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5732 struct mptses *mpte;
5733 struct socket *mp_so;
5734
5735 mpp_lock(mpp);
5736
5737 mpte = mpp->mpp_pcbe;
5738 mp_so = mpp->mpp_socket;
5739
5740 /* Only handover-mode is purely driven by Symptom's Wi-Fi status */
5741 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
5742 goto next;
5743
5744 mptcp_check_subflows_and_add(mpte);
5745 mptcp_check_subflows_and_remove(mpte);
5746
5747 next:
5748 mpp_unlock(mpp);
5749 }
5750
5751 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5752 }
5753
5754 void
5755 mptcp_ask_symptoms(struct mptses *mpte)
5756 {
5757 struct mptcp_symptoms_ask_uuid ask;
5758 struct socket *mp_so;
5759 struct proc *p;
5760 int pid, prio, err;
5761
5762 if (mptcp_kern_skt_unit == 0) {
5763 mptcplog((LOG_ERR, "%s skt_unit is still 0\n", __func__),
5764 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5765 return;
5766 }
5767
5768 mp_so = mptetoso(mpte);
5769
5770 if (mp_so->so_flags & SOF_DELEGATED)
5771 pid = mp_so->e_pid;
5772 else
5773 pid = mp_so->last_pid;
5774
5775 p = proc_find(pid);
5776 if (p == PROC_NULL) {
5777 mptcplog((LOG_ERR, "%s Couldn't find proc for pid %u\n", __func__,
5778 pid), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5779 return;
5780 }
5781
5782 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
5783
5784 if (mp_so->so_flags & SOF_DELEGATED)
5785 uuid_copy(ask.uuid, mp_so->e_uuid);
5786 else
5787 uuid_copy(ask.uuid, mp_so->last_uuid);
5788
5789 prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
5790
5791 if (prio == TASK_BACKGROUND_APPLICATION)
5792 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
5793 else if (prio == TASK_FOREGROUND_APPLICATION)
5794 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
5795 else
5796 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
5797
5798 mptcplog((LOG_DEBUG, "%s ask symptoms about pid %u, prio %u\n", __func__,
5799 pid, ask.priority), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5800
5801 err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
5802 &ask, sizeof(ask), CTL_DATA_EOR);
5803 if (err)
5804 mptcplog((LOG_ERR, "%s ctl_enqueuedata failed %d\n", __func__, err),
5805 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5806
5807 proc_rele(p);
5808 }
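/*
 * Note on the protocol implied above: the kernel sends a struct
 * mptcp_symptoms_ask_uuid (cmd, uuid, priority) up the control socket;
 * the advisory daemon answers through mptcp_symptoms_ctl_send() with a
 * symptoms_advisory_t, and for SYMPTOMS_ADVISORY_USEAPP the granted UUID
 * is expected to immediately follow the advisory (see the sa + 1 access
 * there).
 */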
5809
5810 static errno_t
5811 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5812 void *unitinfo)
5813 {
5814 #pragma unused(kctlref, kcunit, unitinfo)
5815
5816 OSDecrementAtomic(&mptcp_kern_skt_inuse);
5817
5818 return (0);
5819 }
5820
5821 static errno_t
5822 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5823 mbuf_t m, int flags)
5824 {
5825 #pragma unused(kctlref, unitinfo, flags)
5826 symptoms_advisory_t *sa = NULL;
5827
5828 if (kcunit != mptcp_kern_skt_unit)
5829 mptcplog((LOG_ERR, "%s kcunit %u is different from expected one %u\n",
5830 __func__, kcunit, mptcp_kern_skt_unit),
5831 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
5832
5833 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5834 mbuf_freem(m);
5835 return (EINVAL);
5836 }
5837
5838 if (mbuf_len(m) >= sizeof(*sa)) {
5839 sa = mbuf_data(m);
5840 } else {
5841 /* don't leak the mbuf: free it here too, as the short-pkthdr path above does */
mbuf_freem(m);
return (EINVAL);
}
5842
5843 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
5844 sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
5845 uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;
5846
5847 mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
5848 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
5849 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5850
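/*
 * A report that flags Wi-Fi as both good and bad at once is
 * contradictory; only a consistent report updates the advisory.
 */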
5851 if ((sa->sa_wifi_status &
5852 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5853 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
5854 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5855
5856 if (old_wifi_status != mptcp_advisory.sa_wifi_status)
5857 mptcp_wifi_status_changed();
5858 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
5859 mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
5860 mptcp_advisory.sa_wifi_status),
5861 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
5862 } else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
5863 uuid_t uuid;
5864
5865 mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
5866 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
5867
/* the UUID must immediately follow the advisory; don't read past the mbuf */
if (mbuf_len(m) < sizeof(*sa) + sizeof(uuid_t)) {
mbuf_freem(m);
return (EINVAL);
}
5868 uuid_copy(uuid, (unsigned char *)(sa + 1));
5869
5870 mptcp_allow_uuid(uuid);
5871 }
5872
/* the ctl_send callback is responsible for releasing the mbuf chain */
mbuf_freem(m);
5873 return (0);
5874 }
5875
5876 void
5877 mptcp_control_register(void)
5878 {
5879 /* Set up the advisory control socket */
5880 struct kern_ctl_reg mptcp_kern_ctl;
5881
5882 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5883 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5884 sizeof(mptcp_kern_ctl.ctl_name));
5885 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5886 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5887 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5888 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5889
5890 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5891 }
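/*
 * Connection sketch (illustrative): a privileged userspace daemon attaches
 * to this control through the standard kernel-control handshake:
 *
 *	struct ctl_info info = { .ctl_id = 0 };
 *	struct sockaddr_ctl addr = { .sc_len = sizeof(addr) };
 *	int fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
 *	strlcpy(info.ctl_name, MPTCP_KERN_CTL_NAME, sizeof(info.ctl_name));
 *	ioctl(fd, CTLIOCGINFO, &info);	(resolves the name to info.ctl_id)
 *	addr.sc_family = AF_SYSTEM;
 *	addr.ss_sysaddr = AF_SYS_CONTROL;
 *	addr.sc_id = info.ctl_id;
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 */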
5892
5893 int
5894 mptcp_is_wifi_unusable(void)
5895 {
5896 /* A zero return value indicates that there is no info or that Wi-Fi is OK */
5897 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5898 }
5899
5900 /* If TFO data is successfully acked, it must be dropped from the MPTCP socket */
5901 static void
5902 mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
5903 {
5904 struct socket *mp_so = mptetoso(mpte);
5905 struct socket *so = mpts->mpts_socket;
5906 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5907 struct mptcb *mp_tp = mpte->mpte_mptcb;
5908
5909 /* If data was sent with SYN, rewind state */
5910 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5911 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
5912 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
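/*
 * The SYN itself consumes one sequence number, hence the "- 1" above:
 * tcp_droplen counts only the payload bytes that were carried on the
 * SYN and have now been acked.
 */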
5913
5914 VERIFY(mp_droplen <= (UINT_MAX));
5915 VERIFY(mp_droplen >= tcp_droplen);
5916
5917 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5918 mpts->mpts_iss += tcp_droplen;
5919 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5920
5921 if (mp_droplen > tcp_droplen) {
5922 /* handle partial TCP ack */
5923 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5924 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
5925 mp_droplen = tcp_droplen;
5926 } else {
5927 /* all data on SYN was acked */
5928 mpts->mpts_rel_seq = 1;
5929 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5930 }
5931 mp_tp->mpt_sndmax -= tcp_droplen;
5932
5933 if (mp_droplen != 0) {
5934 VERIFY(mp_so->so_snd.sb_mb != NULL);
5935 sbdrop(&mp_so->so_snd, (int)mp_droplen);
5936 }
5937 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %u mptcp len %llu\n",
5938 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
5939 mpts->mpts_connid, tcp_droplen, mp_droplen),
5940 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5941 }
5942 }
5943
5944 int
5945 mptcp_freeq(struct mptcb *mp_tp)
5946 {
5947 struct tseg_qent *q;
5948 int rv = 0;
5949
5950 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
5951 LIST_REMOVE(q, tqe_q);
5952 m_freem(q->tqe_m);
5953 zfree(tcp_reass_zone, q);
5954 rv = 1;
5955 }
5956 mp_tp->mpt_reassqlen = 0;
5957 return (rv);
5958 }
5959
5960 static int
5961 mptcp_post_event(u_int32_t event_code, int value)
5962 {
5963 struct kev_mptcp_data event_data;
5964 struct kev_msg ev_msg;
5965
5966 memset(&ev_msg, 0, sizeof(ev_msg));
5967
5968 ev_msg.vendor_code = KEV_VENDOR_APPLE;
5969 ev_msg.kev_class = KEV_NETWORK_CLASS;
5970 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
5971 ev_msg.event_code = event_code;
5972
5973 event_data.value = value;
5974
5975 ev_msg.dv[0].data_ptr = &event_data;
5976 ev_msg.dv[0].data_length = sizeof(event_data);
5977
5978 return kev_post_msg(&ev_msg);
5979 }
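/*
 * Delivery sketch (illustrative): interested userspace processes receive
 * these messages over a kernel-event socket filtered on the MPTCP
 * subclass:
 *
 *	struct kev_request req = {
 *		.vendor_code = KEV_VENDOR_APPLE,
 *		.kev_class = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_MPTCP_SUBCLASS,
 *	};
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	ioctl(fd, SIOCSKEVFILT, &req);
 *	(read(2) now yields struct kern_event_msg records carrying
 *	 KEV_MPTCP_CELLUSE plus the kev_mptcp_data payload)
 */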
5980
5981 void
5982 mptcp_set_cellicon(struct mptses *mpte)
5983 {
5984 int error;
5985
5986 /* First-party apps (Siri) don't flip the cellicon */
5987 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
5988 return;
5989
5990 /* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
5991 mptcp_last_cellicon_set = tcp_now;
5992
5993 /* If cellicon is already set, get out of here! */
5994 if (OSTestAndSet(7, &mptcp_cellicon_is_set))
5995 return;
5996
5997 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);
5998
5999 if (error)
6000 mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
6001 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
6002 else
6003 mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
6004 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
6005 }
6006
6007 void
6008 mptcp_unset_cellicon(void)
6009 {
6010 int error;
6011
6012 /* If cellicon is already unset, get out of here! */
6013 if (OSTestAndClear(7, &mptcp_cellicon_is_set))
6014 return;
6015
6016 /*
6017 * Only unset the cellicon if it was last set (see mptcp_set_cellicon())
6018 * more than MPTCP_CELLICON_TOGGLE_RATE seconds ago; otherwise restore
6019 * the flag and skip the event, so that the icon does not flap.
6020 */
6021 if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
6022 tcp_now)) {
6023 OSTestAndSet(7, &mptcp_cellicon_is_set);
6024 return;
6025 }
6026
6027 error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6028
6029 if (error)
6030 mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
6031 __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
6032 else
6033 mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
6034 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
6035 }
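/*
 * Example of the rate limiting above: if the cellicon was set at time T,
 * an unset request arriving before T + MPTCP_CELLICON_TOGGLE_RATE merely
 * restores the flag; only a request arriving after that window posts
 * KEV_MPTCP_CELLUSE with value 0.
 */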
6036
6037 void
6038 mptcp_reset_rexmit_state(struct tcpcb *tp)
6039 {
6040 struct mptsub *mpts;
6041 struct inpcb *inp;
6042 struct socket *so;
6043
6044 inp = tp->t_inpcb;
6045 if (inp == NULL)
6046 return;
6047
6048 so = inp->inp_socket;
6049 if (so == NULL)
6050 return;
6051
6052 if (!(so->so_flags & SOF_MP_SUBFLOW))
6053 return;
6054
6055 mpts = tp->t_mpsub;
6056
6057 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6058 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6059 }
6060
6061 void
6062 mptcp_reset_keepalive(struct tcpcb *tp)
6063 {
6064 struct mptsub *mpts = tp->t_mpsub;
6065
6066 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6067 }
6068